xref: /illumos-gate/usr/src/uts/common/fs/sockfs/socktpi.c (revision 257873cfc1dd3337766407f80397db60a56f2f5a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/t_lock.h>
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/buf.h>
32 #include <sys/conf.h>
33 #include <sys/cred.h>
34 #include <sys/kmem.h>
35 #include <sys/sysmacros.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/debug.h>
39 #include <sys/errno.h>
40 #include <sys/time.h>
41 #include <sys/file.h>
42 #include <sys/open.h>
43 #include <sys/user.h>
44 #include <sys/termios.h>
45 #include <sys/stream.h>
46 #include <sys/strsubr.h>
47 #include <sys/strsun.h>
48 #include <sys/ddi.h>
49 #include <sys/esunddi.h>
50 #include <sys/flock.h>
51 #include <sys/modctl.h>
52 #include <sys/vtrace.h>
53 #include <sys/cmn_err.h>
54 #include <sys/pathname.h>
55 
56 #include <sys/socket.h>
57 #include <sys/socketvar.h>
58 #include <sys/sockio.h>
59 #include <sys/sodirect.h>
60 #include <netinet/in.h>
61 #include <sys/un.h>
62 #include <sys/strsun.h>
63 
64 #include <sys/tiuser.h>
65 #define	_SUN_TPI_VERSION	2
66 #include <sys/tihdr.h>
67 #include <sys/timod.h>		/* TI_GETMYNAME, TI_GETPEERNAME */
68 
69 #include <c2/audit.h>
70 
71 #include <inet/common.h>
72 #include <inet/ip.h>
73 #include <inet/ip6.h>
74 #include <inet/tcp.h>
75 #include <inet/udp_impl.h>
76 
77 #include <sys/zone.h>
78 
79 #include <fs/sockfs/nl7c.h>
80 #include <fs/sockfs/nl7curi.h>
81 
82 #include <inet/kssl/ksslapi.h>
83 
84 /*
85  * Possible failures when memory can't be allocated. The documented behavior:
86  *
87  * 		5.5:			4.X:		XNET:
88  * accept:	ENOMEM/ENOSR/EINTR	- (EINTR)	ENOMEM/ENOBUFS/ENOSR/
89  *							EINTR
90  *	(4.X does not document EINTR but returns it)
91  * bind:	ENOSR			-		ENOBUFS/ENOSR
92  * connect: 	EINTR			EINTR		ENOBUFS/ENOSR/EINTR
93  * getpeername:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
94  * getsockname:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
95  *	(4.X getpeername and getsockname do not fail in practice)
96  * getsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
97  * listen:	-			-		ENOBUFS
98  * recv:	ENOMEM/ENOSR/EINTR	EINTR		ENOBUFS/ENOMEM/ENOSR/
99  *							EINTR
100  * send:	ENOMEM/ENOSR/EINTR	ENOBUFS/EINTR	ENOBUFS/ENOMEM/ENOSR/
101  *							EINTR
102  * setsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
103  * shutdown:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
104  * socket:	ENOMEM/ENOSR		ENOBUFS		ENOBUFS/ENOMEM/ENOSR
105  * socketpair:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
106  *
107  * Resolution. When allocation fails:
108  *	recv: return EINTR
109  *	send: return EINTR
110  *	connect, accept: EINTR
111  *	bind, listen, shutdown (unbind, unix_close, disconnect): sleep
112  *	socket, socketpair: ENOBUFS
113  *	getpeername, getsockname: sleep
114  *	getsockopt, setsockopt: sleep
115  */
116 
/*
 * Knobs that let sockfs deviate from strict TPI for the AF_INET transports.
 * In a SOCK_TEST build they are writable globals; otherwise they are
 * compile-time constants with the same values.
 *
 * solisten_tpi_tcp:
 *	TCP accepts an O_T_BIND_REQ carrying a larger backlog even when the
 *	transport is already bound.  Relying on that avoids losing the port
 *	number, which could happen if listen() issued a T_UNBIND_REQ followed
 *	by an O_T_BIND_REQ.
 *
 * soconnect_tpi_udp:
 *	UDP and ICMP accept a T_CONN_REQ.  That is what makes connect()
 *	followed by getsockname() report the local IP address used to reach
 *	the connected destination.
 *
 * soconnect_tpi_tcp:
 *	TCP accepts a T_CONN_REQ without a preceding O_T_BIND_REQ, which is
 *	a performance optimization.  Set to non-zero to send strictly TPI
 *	conformant messages to TCP instead.
 *
 * soaccept_tpi_tcp:
 *	TCP accepts a T_CONN_REQ without the acceptor being bound; a
 *	performance optimization that XTI has also adopted.
 *
 * soaccept_tpi_multioptions:
 *	For AF_INET{,6}, pass the SOL_SOCKET options inherited from the
 *	listener to the accepting socket in one message.
 */
#ifdef SOCK_TEST
int solisten_tpi_tcp = 0;
int soconnect_tpi_udp = 0;
int soconnect_tpi_tcp = 0;
int soaccept_tpi_tcp = 0;
int soaccept_tpi_multioptions = 1;

/* Test-only variables defined outside this file. */
extern int do_useracc;
extern clock_t sock_test_timelimit;
#else /* SOCK_TEST */
#define	solisten_tpi_tcp		0
#define	soconnect_tpi_udp		0
#define	soconnect_tpi_tcp		0
#define	soaccept_tpi_tcp		0
#define	soaccept_tpi_multioptions	1
#endif /* SOCK_TEST */
164 
165 /*
166  * Some X/Open added checks might have to be backed out to keep SunOS 4.X
167  * applications working. Turn on this flag to disable these checks.
168  */
169 int xnet_skip_checks = 0;
170 int xnet_check_print = 0;
171 int xnet_truncate_print = 0;
172 
173 extern	void sigintr(k_sigset_t *, int);
174 extern	void sigunintr(k_sigset_t *);
175 
176 extern	void *nl7c_lookup_addr(void *, t_uscalar_t);
177 extern	void *nl7c_add_addr(void *, t_uscalar_t);
178 extern	void nl7c_listener_addr(void *, struct sonode *);
179 
180 /* Sockets acting as an in-kernel SSL proxy */
181 extern mblk_t	*strsock_kssl_input(vnode_t *, mblk_t *, strwakeup_t *,
182 		    strsigset_t *, strsigset_t *, strpollset_t *);
183 extern mblk_t	*strsock_kssl_output(vnode_t *, mblk_t *, strwakeup_t *,
184 		    strsigset_t *, strsigset_t *, strpollset_t *);
185 
186 static int	sotpi_unbind(struct sonode *, int);
187 
188 extern int	sodput(sodirect_t *, mblk_t *);
189 extern void	sodwakeup(sodirect_t *);
190 
191 /* TPI sockfs sonode operations */
192 static int	sotpi_accept(struct sonode *, int, struct sonode **);
193 static int	sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
194 		    int);
195 static int	sotpi_connect(struct sonode *, const struct sockaddr *,
196 		    socklen_t, int, int);
197 static int	sotpi_listen(struct sonode *, int);
198 static int	sotpi_sendmsg(struct sonode *, struct nmsghdr *,
199 		    struct uio *);
200 static int	sotpi_shutdown(struct sonode *, int);
201 static int	sotpi_getsockname(struct sonode *);
202 static int	sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
203 		    struct uio *, void *, t_uscalar_t, int);
204 static int	sodgram_direct(struct sonode *, struct sockaddr *,
205 		    socklen_t, struct uio *, int);
206 
207 sonodeops_t sotpi_sonodeops = {
208 	sotpi_accept,		/* sop_accept		*/
209 	sotpi_bind,		/* sop_bind		*/
210 	sotpi_listen,		/* sop_listen		*/
211 	sotpi_connect,		/* sop_connect		*/
212 	sotpi_recvmsg,		/* sop_recvmsg		*/
213 	sotpi_sendmsg,		/* sop_sendmsg		*/
214 	sotpi_getpeername,	/* sop_getpeername	*/
215 	sotpi_getsockname,	/* sop_getsockname	*/
216 	sotpi_shutdown,		/* sop_shutdown		*/
217 	sotpi_getsockopt,	/* sop_getsockopt	*/
218 	sotpi_setsockopt	/* sop_setsockopt	*/
219 };
220 
221 /*
222  * Common create code for socket and accept. If tso is set the values
223  * from that node is used instead of issuing a T_INFO_REQ.
224  *
225  * Assumes that the caller has a VN_HOLD on accessvp.
226  * The VN_RELE will occur either when sotpi_create() fails or when
227  * the returned sonode is freed.
228  */
229 struct sonode *
230 sotpi_create(vnode_t *accessvp, int domain, int type, int protocol, int version,
231     struct sonode *tso, int *errorp)
232 {
233 	struct sonode	*so;
234 	vnode_t		*vp;
235 	int		flags, error;
236 
237 	ASSERT(accessvp != NULL);
238 	vp = makesockvp(accessvp, domain, type, protocol);
239 	ASSERT(vp != NULL);
240 	so = VTOSO(vp);
241 
242 	flags = FREAD|FWRITE;
243 
244 	if ((type == SOCK_STREAM || type == SOCK_DGRAM) &&
245 	    (domain == AF_INET || domain == AF_INET6) &&
246 	    (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP ||
247 	    protocol == IPPROTO_IP)) {
248 		/* Tell tcp or udp that it's talking to sockets */
249 		flags |= SO_SOCKSTR;
250 
251 		/*
252 		 * Here we indicate to socktpi_open() our attempt to
253 		 * make direct calls between sockfs and transport.
254 		 * The final decision is left to socktpi_open().
255 		 */
256 		so->so_state |= SS_DIRECT;
257 
258 		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
259 		if (so->so_type == SOCK_STREAM && tso != NULL) {
260 			if (tso->so_state & SS_DIRECT) {
261 				/*
262 				 * Inherit SS_DIRECT from listener and pass
263 				 * SO_ACCEPTOR open flag to tcp, indicating
264 				 * that this is an accept fast-path instance.
265 				 */
266 				flags |= SO_ACCEPTOR;
267 			} else {
268 				/*
269 				 * SS_DIRECT is not set on listener, meaning
270 				 * that the listener has been converted from
271 				 * a socket to a stream.  Ensure that the
272 				 * acceptor inherits these settings.
273 				 */
274 				so->so_state &= ~SS_DIRECT;
275 				flags &= ~SO_SOCKSTR;
276 			}
277 		}
278 	}
279 
280 	/*
281 	 * Tell local transport that it is talking to sockets.
282 	 */
283 	if (so->so_family == AF_UNIX) {
284 		flags |= SO_SOCKSTR;
285 	}
286 
287 	/* Initialize the kernel SSL proxy fields */
288 	so->so_kssl_type = KSSL_NO_PROXY;
289 	so->so_kssl_ent = NULL;
290 	so->so_kssl_ctx = NULL;
291 
292 	if (error = socktpi_open(&vp, flags, CRED(), NULL)) {
293 		VN_RELE(vp);
294 		*errorp = error;
295 		return (NULL);
296 	}
297 
298 	if (error = so_strinit(so, tso)) {
299 		(void) VOP_CLOSE(vp, 0, 1, 0, CRED(), NULL);
300 		VN_RELE(vp);
301 		*errorp = error;
302 		return (NULL);
303 	}
304 
305 	if (version == SOV_DEFAULT)
306 		version = so_default_version;
307 
308 	so->so_version = (short)version;
309 
310 	return (so);
311 }
312 
313 /*
314  * Bind the socket to an unspecified address in sockfs only.
315  * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
316  * required in all cases.
317  */
318 static void
319 so_automatic_bind(struct sonode *so)
320 {
321 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
322 
323 	ASSERT(MUTEX_HELD(&so->so_lock));
324 	ASSERT(!(so->so_state & SS_ISBOUND));
325 	ASSERT(so->so_unbind_mp);
326 
327 	ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
328 	bzero(so->so_laddr_sa, so->so_laddr_len);
329 	so->so_laddr_sa->sa_family = so->so_family;
330 	so->so_state |= SS_ISBOUND;
331 }
332 
333 
334 /*
335  * bind the socket.
336  *
337  * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
338  * are passed in we allow rebinding. Note that for backwards compatibility
339  * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
340  * Thus the rebinding code is currently not executed.
341  *
342  * The constraints for rebinding are:
343  * - it is a SOCK_DGRAM, or
344  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
345  *   and no listen() has been done.
346  * This rebinding code was added based on some language in the XNET book
347  * about not returning EINVAL if the protocol allows rebinding. However,
348  * this language is not present in the Posix socket draft. Thus maybe the
349  * rebinding logic should be deleted from the source.
350  *
351  * A null "name" can be used to unbind the socket if:
352  * - it is a SOCK_DGRAM, or
353  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
354  *   and no listen() has been done.
355  */
356 static int
357 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
358     socklen_t namelen, int backlog, int flags)
359 {
360 	struct T_bind_req	bind_req;
361 	struct T_bind_ack	*bind_ack;
362 	int			error = 0;
363 	mblk_t			*mp;
364 	void			*addr;
365 	t_uscalar_t		addrlen;
366 	int			unbind_on_err = 1;
367 	boolean_t		clear_acceptconn_on_err = B_FALSE;
368 	boolean_t		restore_backlog_on_err = B_FALSE;
369 	int			save_so_backlog;
370 	t_scalar_t		PRIM_type = O_T_BIND_REQ;
371 	boolean_t		tcp_udp_xport;
372 	void			*nl7c = NULL;
373 
374 	dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
375 	    (void *)so, (void *)name, namelen, backlog, flags,
376 	    pr_state(so->so_state, so->so_mode)));
377 
378 	tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
379 
380 	if (!(flags & _SOBIND_LOCK_HELD)) {
381 		mutex_enter(&so->so_lock);
382 		so_lock_single(so);	/* Set SOLOCKED */
383 	} else {
384 		ASSERT(MUTEX_HELD(&so->so_lock));
385 		ASSERT(so->so_flag & SOLOCKED);
386 	}
387 
388 	/*
389 	 * Make sure that there is a preallocated unbind_req message
390 	 * before binding. This message is allocated when the socket is
391 	 * created but it might have been consumed.
392 	 */
393 	if (so->so_unbind_mp == NULL) {
394 		dprintso(so, 1, ("sobind: allocating unbind_req\n"));
395 		/* NOTE: holding so_lock while sleeping */
396 		so->so_unbind_mp =
397 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP);
398 	}
399 
400 	if (flags & _SOBIND_REBIND) {
401 		/*
402 		 * Called from solisten after doing an sotpi_unbind() or
403 		 * potentially without the unbind (latter for AF_INET{,6}).
404 		 */
405 		ASSERT(name == NULL && namelen == 0);
406 
407 		if (so->so_family == AF_UNIX) {
408 			ASSERT(so->so_ux_bound_vp);
409 			addr = &so->so_ux_laddr;
410 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
411 			dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
412 			    "addr 0x%p, vp %p\n",
413 			    addrlen,
414 			    (void *)((struct so_ux_addr *)addr)->soua_vp,
415 			    (void *)so->so_ux_bound_vp));
416 		} else {
417 			addr = so->so_laddr_sa;
418 			addrlen = (t_uscalar_t)so->so_laddr_len;
419 		}
420 	} else if (flags & _SOBIND_UNSPEC) {
421 		ASSERT(name == NULL && namelen == 0);
422 
423 		/*
424 		 * The caller checked SS_ISBOUND but not necessarily
425 		 * under so_lock
426 		 */
427 		if (so->so_state & SS_ISBOUND) {
428 			/* No error */
429 			goto done;
430 		}
431 
432 		/* Set an initial local address */
433 		switch (so->so_family) {
434 		case AF_UNIX:
435 			/*
436 			 * Use an address with same size as struct sockaddr
437 			 * just like BSD.
438 			 */
439 			so->so_laddr_len =
440 			    (socklen_t)sizeof (struct sockaddr);
441 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
442 			bzero(so->so_laddr_sa, so->so_laddr_len);
443 			so->so_laddr_sa->sa_family = so->so_family;
444 
445 			/*
446 			 * Pass down an address with the implicit bind
447 			 * magic number and the rest all zeros.
448 			 * The transport will return a unique address.
449 			 */
450 			so->so_ux_laddr.soua_vp = NULL;
451 			so->so_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
452 			addr = &so->so_ux_laddr;
453 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
454 			break;
455 
456 		case AF_INET:
457 		case AF_INET6:
458 			/*
459 			 * An unspecified bind in TPI has a NULL address.
460 			 * Set the address in sockfs to have the sa_family.
461 			 */
462 			so->so_laddr_len = (so->so_family == AF_INET) ?
463 			    (socklen_t)sizeof (sin_t) :
464 			    (socklen_t)sizeof (sin6_t);
465 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
466 			bzero(so->so_laddr_sa, so->so_laddr_len);
467 			so->so_laddr_sa->sa_family = so->so_family;
468 			addr = NULL;
469 			addrlen = 0;
470 			break;
471 
472 		default:
473 			/*
474 			 * An unspecified bind in TPI has a NULL address.
475 			 * Set the address in sockfs to be zero length.
476 			 *
477 			 * Can not assume there is a sa_family for all
478 			 * protocol families. For example, AF_X25 does not
479 			 * have a family field.
480 			 */
481 			bzero(so->so_laddr_sa, so->so_laddr_len);
482 			so->so_laddr_len = 0;	/* XXX correct? */
483 			addr = NULL;
484 			addrlen = 0;
485 			break;
486 		}
487 
488 	} else {
489 		if (so->so_state & SS_ISBOUND) {
490 			/*
491 			 * If it is ok to rebind the socket, first unbind
492 			 * with the transport. A rebind to the NULL address
493 			 * is interpreted as an unbind.
494 			 * Note that a bind to NULL in BSD does unbind the
495 			 * socket but it fails with EINVAL.
496 			 * Note that regular sockets set SOV_SOCKBSD i.e.
497 			 * _SOBIND_SOCKBSD gets set here hence no type of
498 			 * socket does currently allow rebinding.
499 			 *
500 			 * If the name is NULL just do an unbind.
501 			 */
502 			if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
503 			    name != NULL) {
504 				error = EINVAL;
505 				unbind_on_err = 0;
506 				eprintsoline(so, error);
507 				goto done;
508 			}
509 			if ((so->so_mode & SM_CONNREQUIRED) &&
510 			    (so->so_state & SS_CANTREBIND)) {
511 				error = EINVAL;
512 				unbind_on_err = 0;
513 				eprintsoline(so, error);
514 				goto done;
515 			}
516 			error = sotpi_unbind(so, 0);
517 			if (error) {
518 				eprintsoline(so, error);
519 				goto done;
520 			}
521 			ASSERT(!(so->so_state & SS_ISBOUND));
522 			if (name == NULL) {
523 				so->so_state &=
524 				    ~(SS_ISCONNECTED|SS_ISCONNECTING);
525 				goto done;
526 			}
527 		}
528 		/* X/Open requires this check */
529 		if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
530 			if (xnet_check_print) {
531 				printf("sockfs: X/Open bind state check "
532 				    "caused EINVAL\n");
533 			}
534 			error = EINVAL;
535 			goto done;
536 		}
537 
538 		switch (so->so_family) {
539 		case AF_UNIX:
540 			/*
541 			 * All AF_UNIX addresses are nul terminated
542 			 * when copied (copyin_name) in so the minimum
543 			 * length is 3 bytes.
544 			 */
545 			if (name == NULL ||
546 			    (ssize_t)namelen <= sizeof (short) + 1) {
547 				error = EISDIR;
548 				eprintsoline(so, error);
549 				goto done;
550 			}
551 			/*
552 			 * Verify so_family matches the bound family.
553 			 * BSD does not check this for AF_UNIX resulting
554 			 * in funny mknods.
555 			 */
556 			if (name->sa_family != so->so_family) {
557 				error = EAFNOSUPPORT;
558 				goto done;
559 			}
560 			break;
561 		case AF_INET:
562 			if (name == NULL) {
563 				error = EINVAL;
564 				eprintsoline(so, error);
565 				goto done;
566 			}
567 			if ((size_t)namelen != sizeof (sin_t)) {
568 				error = name->sa_family != so->so_family ?
569 				    EAFNOSUPPORT : EINVAL;
570 				eprintsoline(so, error);
571 				goto done;
572 			}
573 			if ((flags & _SOBIND_XPG4_2) &&
574 			    (name->sa_family != so->so_family)) {
575 				/*
576 				 * This check has to be made for X/Open
577 				 * sockets however application failures have
578 				 * been observed when it is applied to
579 				 * all sockets.
580 				 */
581 				error = EAFNOSUPPORT;
582 				eprintsoline(so, error);
583 				goto done;
584 			}
585 			/*
586 			 * Force a zero sa_family to match so_family.
587 			 *
588 			 * Some programs like inetd(1M) don't set the
589 			 * family field. Other programs leave
590 			 * sin_family set to garbage - SunOS 4.X does
591 			 * not check the family field on a bind.
592 			 * We use the family field that
593 			 * was passed in to the socket() call.
594 			 */
595 			name->sa_family = so->so_family;
596 			break;
597 
598 		case AF_INET6: {
599 #ifdef DEBUG
600 			sin6_t *sin6 = (sin6_t *)name;
601 #endif /* DEBUG */
602 
603 			if (name == NULL) {
604 				error = EINVAL;
605 				eprintsoline(so, error);
606 				goto done;
607 			}
608 			if ((size_t)namelen != sizeof (sin6_t)) {
609 				error = name->sa_family != so->so_family ?
610 				    EAFNOSUPPORT : EINVAL;
611 				eprintsoline(so, error);
612 				goto done;
613 			}
614 			if (name->sa_family != so->so_family) {
615 				/*
616 				 * With IPv6 we require the family to match
617 				 * unlike in IPv4.
618 				 */
619 				error = EAFNOSUPPORT;
620 				eprintsoline(so, error);
621 				goto done;
622 			}
623 #ifdef DEBUG
624 			/*
625 			 * Verify that apps don't forget to clear
626 			 * sin6_scope_id etc
627 			 */
628 			if (sin6->sin6_scope_id != 0 &&
629 			    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
630 				zcmn_err(getzoneid(), CE_WARN,
631 				    "bind with uninitialized sin6_scope_id "
632 				    "(%d) on socket. Pid = %d\n",
633 				    (int)sin6->sin6_scope_id,
634 				    (int)curproc->p_pid);
635 			}
636 			if (sin6->__sin6_src_id != 0) {
637 				zcmn_err(getzoneid(), CE_WARN,
638 				    "bind with uninitialized __sin6_src_id "
639 				    "(%d) on socket. Pid = %d\n",
640 				    (int)sin6->__sin6_src_id,
641 				    (int)curproc->p_pid);
642 			}
643 #endif /* DEBUG */
644 			break;
645 		}
646 		default:
647 			/*
648 			 * Don't do any length or sa_family check to allow
649 			 * non-sockaddr style addresses.
650 			 */
651 			if (name == NULL) {
652 				error = EINVAL;
653 				eprintsoline(so, error);
654 				goto done;
655 			}
656 			break;
657 		}
658 
659 		if (namelen > (t_uscalar_t)so->so_laddr_maxlen) {
660 			error = ENAMETOOLONG;
661 			eprintsoline(so, error);
662 			goto done;
663 		}
664 		/*
665 		 * Save local address.
666 		 */
667 		so->so_laddr_len = (socklen_t)namelen;
668 		ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
669 		bcopy(name, so->so_laddr_sa, namelen);
670 
671 		addr = so->so_laddr_sa;
672 		addrlen = (t_uscalar_t)so->so_laddr_len;
673 		switch (so->so_family) {
674 		case AF_INET6:
675 		case AF_INET:
676 			break;
677 		case AF_UNIX: {
678 			struct sockaddr_un *soun =
679 			    (struct sockaddr_un *)so->so_laddr_sa;
680 			struct vnode *vp, *rvp;
681 			struct vattr vattr;
682 
683 			ASSERT(so->so_ux_bound_vp == NULL);
684 			/*
685 			 * Create vnode for the specified path name.
686 			 * Keep vnode held with a reference in so_ux_bound_vp.
687 			 * Use the vnode pointer as the address used in the
688 			 * bind with the transport.
689 			 *
690 			 * Use the same mode as in BSD. In particular this does
691 			 * not observe the umask.
692 			 */
693 			/* MAXPATHLEN + soun_family + nul termination */
694 			if (so->so_laddr_len >
695 			    (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
696 				error = ENAMETOOLONG;
697 				eprintsoline(so, error);
698 				goto done;
699 			}
700 			vattr.va_type = VSOCK;
701 			vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
702 			vattr.va_mask = AT_TYPE|AT_MODE;
703 			/* NOTE: holding so_lock */
704 			error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
705 			    EXCL, 0, &vp, CRMKNOD, 0, 0);
706 			if (error) {
707 				if (error == EEXIST)
708 					error = EADDRINUSE;
709 				eprintsoline(so, error);
710 				goto done;
711 			}
712 			/*
713 			 * Establish pointer from the underlying filesystem
714 			 * vnode to the socket node.
715 			 * so_ux_bound_vp and v_stream->sd_vnode form the
716 			 * cross-linkage between the underlying filesystem
717 			 * node and the socket node.
718 			 */
719 
720 			if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
721 				VN_HOLD(rvp);
722 				VN_RELE(vp);
723 				vp = rvp;
724 			}
725 
726 			ASSERT(SOTOV(so)->v_stream);
727 			mutex_enter(&vp->v_lock);
728 			vp->v_stream = SOTOV(so)->v_stream;
729 			so->so_ux_bound_vp = vp;
730 			mutex_exit(&vp->v_lock);
731 
732 			/*
733 			 * Use the vnode pointer value as a unique address
734 			 * (together with the magic number to avoid conflicts
735 			 * with implicit binds) in the transport provider.
736 			 */
737 			so->so_ux_laddr.soua_vp = (void *)so->so_ux_bound_vp;
738 			so->so_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
739 			addr = &so->so_ux_laddr;
740 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
741 			dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
742 			    addrlen,
743 			    ((struct so_ux_addr *)addr)->soua_vp));
744 			break;
745 		}
746 		} /* end switch (so->so_family) */
747 	}
748 
749 	/*
750 	 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
751 	 * the transport can start passing up T_CONN_IND messages
752 	 * as soon as it receives the bind req and strsock_proto()
753 	 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
754 	 */
755 	if (flags & _SOBIND_LISTEN) {
756 		if ((so->so_state & SS_ACCEPTCONN) == 0)
757 			clear_acceptconn_on_err = B_TRUE;
758 		save_so_backlog = so->so_backlog;
759 		restore_backlog_on_err = B_TRUE;
760 		so->so_state |= SS_ACCEPTCONN;
761 		so->so_backlog = backlog;
762 	}
763 
764 	/*
765 	 * If NL7C addr(s) have been configured check for addr/port match,
766 	 * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
767 	 *
768 	 * NL7C supports the TCP transport only so check AF_INET and AF_INET6
769 	 * family sockets only. If match mark as such.
770 	 */
771 	if (nl7c_enabled && ((addr != NULL &&
772 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
773 	    (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
774 	    so->so_nl7c_flags == NL7C_AF_NCA)) {
775 		/*
776 		 * NL7C is not supported in non-global zones,
777 		 * we enforce this restriction here.
778 		 */
779 		if (so->so_zoneid == GLOBAL_ZONEID) {
780 			/* An NL7C socket, mark it */
781 			so->so_nl7c_flags |= NL7C_ENABLED;
782 			if (nl7c == NULL) {
783 				/*
784 				 * Was an AF_NCA bind() so add it to the
785 				 * addr list for reporting purposes.
786 				 */
787 				nl7c = nl7c_add_addr(addr, addrlen);
788 			}
789 		} else
790 			nl7c = NULL;
791 	}
792 	/*
793 	 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
794 	 * for other transports we will send in a O_T_BIND_REQ.
795 	 */
796 	if (tcp_udp_xport &&
797 	    (so->so_family == AF_INET || so->so_family == AF_INET6))
798 		PRIM_type = T_BIND_REQ;
799 
800 	bind_req.PRIM_type = PRIM_type;
801 	bind_req.ADDR_length = addrlen;
802 	bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
803 	bind_req.CONIND_number = backlog;
804 	/* NOTE: holding so_lock while sleeping */
805 	mp = soallocproto2(&bind_req, sizeof (bind_req),
806 	    addr, addrlen, 0, _ALLOC_SLEEP);
807 	so->so_state &= ~SS_LADDR_VALID;
808 
809 	/* Done using so_laddr_sa - can drop the lock */
810 	mutex_exit(&so->so_lock);
811 
812 	/*
813 	 * Intercept the bind_req message here to check if this <address/port>
814 	 * was configured as an SSL proxy server, or if another endpoint was
815 	 * already configured to act as a proxy for us.
816 	 *
817 	 * Note, only if NL7C not enabled for this socket.
818 	 */
819 	if (nl7c == NULL &&
820 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
821 	    so->so_type == SOCK_STREAM) {
822 
823 		if (so->so_kssl_ent != NULL) {
824 			kssl_release_ent(so->so_kssl_ent, so, so->so_kssl_type);
825 			so->so_kssl_ent = NULL;
826 		}
827 
828 		so->so_kssl_type = kssl_check_proxy(mp, so, &so->so_kssl_ent);
829 		switch (so->so_kssl_type) {
830 		case KSSL_NO_PROXY:
831 			break;
832 
833 		case KSSL_HAS_PROXY:
834 			mutex_enter(&so->so_lock);
835 			goto skip_transport;
836 
837 		case KSSL_IS_PROXY:
838 			break;
839 		}
840 	}
841 
842 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
843 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
844 	if (error) {
845 		eprintsoline(so, error);
846 		mutex_enter(&so->so_lock);
847 		goto done;
848 	}
849 
850 	mutex_enter(&so->so_lock);
851 	error = sowaitprim(so, PRIM_type, T_BIND_ACK,
852 	    (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
853 	if (error) {
854 		eprintsoline(so, error);
855 		goto done;
856 	}
857 skip_transport:
858 	ASSERT(mp);
859 	/*
860 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
861 	 * strsock_proto while the lock was dropped above, the bind
862 	 * is allowed to complete.
863 	 */
864 
865 	/* Mark as bound. This will be undone if we detect errors below. */
866 	if (flags & _SOBIND_NOXLATE) {
867 		ASSERT(so->so_family == AF_UNIX);
868 		so->so_state |= SS_FADDR_NOXLATE;
869 	}
870 	ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
871 	so->so_state |= SS_ISBOUND;
872 	ASSERT(so->so_unbind_mp);
873 
874 	/* note that we've already set SS_ACCEPTCONN above */
875 
876 	/*
877 	 * Recompute addrlen - an unspecified bind sent down an
878 	 * address of length zero but we expect the appropriate length
879 	 * in return.
880 	 */
881 	addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
882 	    sizeof (so->so_ux_laddr) : so->so_laddr_len);
883 
884 	bind_ack = (struct T_bind_ack *)mp->b_rptr;
885 	/*
886 	 * The alignment restriction is really too strict but
887 	 * we want enough alignment to inspect the fields of
888 	 * a sockaddr_in.
889 	 */
890 	addr = sogetoff(mp, bind_ack->ADDR_offset,
891 	    bind_ack->ADDR_length,
892 	    __TPI_ALIGN_SIZE);
893 	if (addr == NULL) {
894 		freemsg(mp);
895 		error = EPROTO;
896 		eprintsoline(so, error);
897 		goto done;
898 	}
899 	if (!(flags & _SOBIND_UNSPEC)) {
900 		/*
901 		 * Verify that the transport didn't return something we
902 		 * did not want e.g. an address other than what we asked for.
903 		 *
904 		 * NOTE: These checks would go away if/when we switch to
905 		 * using the new TPI (in which the transport would fail
906 		 * the request instead of assigning a different address).
907 		 *
908 		 * NOTE2: For protocols that we don't know (i.e. any
909 		 * other than AF_INET6, AF_INET and AF_UNIX), we
910 		 * cannot know if the transport should be expected to
911 		 * return the same address as that requested.
912 		 *
913 		 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
914 		 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
915 		 *
916 		 * For example, in the case of netatalk it may be
917 		 * inappropriate for the transport to return the
918 		 * requested address (as it may have allocated a local
919 		 * port number in behaviour similar to that of an
920 		 * AF_INET bind request with a port number of zero).
921 		 *
922 		 * Given the definition of O_T_BIND_REQ, where the
923 		 * transport may bind to an address other than the
924 		 * requested address, it's not possible to determine
925 		 * whether a returned address that differs from the
926 		 * requested address is a reason to fail (because the
927 		 * requested address was not available) or succeed
928 		 * (because the transport allocated an appropriate
929 		 * address and/or port).
930 		 *
931 		 * sockfs currently requires that the transport return
932 		 * the requested address in the T_BIND_ACK, unless
933 		 * there is code here to allow for any discrepancy.
934 		 * Such code exists for AF_INET and AF_INET6.
935 		 *
936 		 * Netatalk chooses to return the requested address
937 		 * rather than the (correct) allocated address.  This
938 		 * means that netatalk violates the TPI specification
939 		 * (and would not function correctly if used from a
940 		 * TLI application), but it does mean that it works
941 		 * with sockfs.
942 		 *
943 		 * As noted above, using the newer XTI bind primitive
944 		 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
945 		 * allow sockfs to be more sure about whether or not
946 		 * the bind request had succeeded (as transports are
947 		 * not permitted to bind to a different address than
948 		 * that requested - they must return failure).
949 		 * Unfortunately, support for T_BIND_REQ may not be
950 		 * present in all transport implementations (netatalk,
951 		 * for example, doesn't have it), making the
952 		 * transition difficult.
953 		 */
954 		if (bind_ack->ADDR_length != addrlen) {
955 			/* Assumes that the requested address was in use */
956 			freemsg(mp);
957 			error = EADDRINUSE;
958 			eprintsoline(so, error);
959 			goto done;
960 		}
961 
962 		switch (so->so_family) {
963 		case AF_INET6:
964 		case AF_INET: {
965 			sin_t *rname, *aname;
966 
967 			rname = (sin_t *)addr;
968 			aname = (sin_t *)so->so_laddr_sa;
969 
970 			/*
971 			 * Take advantage of the alignment
972 			 * of sin_port and sin6_port which fall
973 			 * in the same place in their data structures.
974 			 * Just use sin_port for either address family.
975 			 *
976 			 * This may become a problem if (heaven forbid)
977 			 * there's a separate ipv6port_reserved... :-P
978 			 *
979 			 * Binding to port 0 has the semantics of letting
980 			 * the transport bind to any port.
981 			 *
982 			 * If the transport is TCP or UDP since we had sent
983 			 * a T_BIND_REQ we would not get a port other than
984 			 * what we asked for.
985 			 */
986 			if (tcp_udp_xport) {
987 				/*
988 				 * Pick up the new port number if we bound to
989 				 * port 0.
990 				 */
991 				if (aname->sin_port == 0)
992 					aname->sin_port = rname->sin_port;
993 				so->so_state |= SS_LADDR_VALID;
994 				break;
995 			}
996 			if (aname->sin_port != 0 &&
997 			    aname->sin_port != rname->sin_port) {
998 				freemsg(mp);
999 				error = EADDRINUSE;
1000 				eprintsoline(so, error);
1001 				goto done;
1002 			}
1003 			/*
1004 			 * Pick up the new port number if we bound to port 0.
1005 			 */
1006 			aname->sin_port = rname->sin_port;
1007 
1008 			/*
1009 			 * Unfortunately, addresses aren't _quite_ the same.
1010 			 */
1011 			if (so->so_family == AF_INET) {
1012 				if (aname->sin_addr.s_addr !=
1013 				    rname->sin_addr.s_addr) {
1014 					freemsg(mp);
1015 					error = EADDRNOTAVAIL;
1016 					eprintsoline(so, error);
1017 					goto done;
1018 				}
1019 			} else {
1020 				sin6_t *rname6 = (sin6_t *)rname;
1021 				sin6_t *aname6 = (sin6_t *)aname;
1022 
1023 				if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1024 				    &rname6->sin6_addr)) {
1025 					freemsg(mp);
1026 					error = EADDRNOTAVAIL;
1027 					eprintsoline(so, error);
1028 					goto done;
1029 				}
1030 			}
1031 			break;
1032 		}
1033 		case AF_UNIX:
1034 			if (bcmp(addr, &so->so_ux_laddr, addrlen) != 0) {
1035 				freemsg(mp);
1036 				error = EADDRINUSE;
1037 				eprintsoline(so, error);
1038 				eprintso(so,
1039 				    ("addrlen %d, addr 0x%x, vp %p\n",
1040 				    addrlen, *((int *)addr),
1041 				    (void *)so->so_ux_bound_vp));
1042 				goto done;
1043 			}
1044 			so->so_state |= SS_LADDR_VALID;
1045 			break;
1046 		default:
1047 			/*
1048 			 * NOTE: This assumes that addresses can be
1049 			 * byte-compared for equivalence.
1050 			 */
1051 			if (bcmp(addr, so->so_laddr_sa, addrlen) != 0) {
1052 				freemsg(mp);
1053 				error = EADDRINUSE;
1054 				eprintsoline(so, error);
1055 				goto done;
1056 			}
1057 			/*
1058 			 * Don't mark SS_LADDR_VALID, as we cannot be
1059 			 * sure that the returned address is the real
1060 			 * bound address when talking to an unknown
1061 			 * transport.
1062 			 */
1063 			break;
1064 		}
1065 	} else {
1066 		/*
1067 		 * Save for returned address for getsockname.
1068 		 * Needed for unspecific bind unless transport supports
1069 		 * the TI_GETMYNAME ioctl.
1070 		 * Do this for AF_INET{,6} even though they do, as
1071 		 * caching info here is much better performance than
1072 		 * a TPI/STREAMS trip to the transport for getsockname.
1073 		 * Any which can't for some reason _must_ _not_ set
1074 		 * LADDR_VALID here for the caching version of getsockname
1075 		 * to not break;
1076 		 */
1077 		switch (so->so_family) {
1078 		case AF_UNIX:
1079 			/*
1080 			 * Record the address bound with the transport
1081 			 * for use by socketpair.
1082 			 */
1083 			bcopy(addr, &so->so_ux_laddr, addrlen);
1084 			so->so_state |= SS_LADDR_VALID;
1085 			break;
1086 		case AF_INET:
1087 		case AF_INET6:
1088 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
1089 			bcopy(addr, so->so_laddr_sa, so->so_laddr_len);
1090 			so->so_state |= SS_LADDR_VALID;
1091 			break;
1092 		default:
1093 			/*
1094 			 * Don't mark SS_LADDR_VALID, as we cannot be
1095 			 * sure that the returned address is the real
1096 			 * bound address when talking to an unknown
1097 			 * transport.
1098 			 */
1099 			break;
1100 		}
1101 	}
1102 
1103 	if (nl7c != NULL) {
1104 		/* Register listen()er sonode pointer with NL7C */
1105 		nl7c_listener_addr(nl7c, so);
1106 	}
1107 
1108 	freemsg(mp);
1109 
1110 done:
1111 	if (error) {
1112 		/* reset state & backlog to values held on entry */
1113 		if (clear_acceptconn_on_err == B_TRUE)
1114 			so->so_state &= ~SS_ACCEPTCONN;
1115 		if (restore_backlog_on_err == B_TRUE)
1116 			so->so_backlog = save_so_backlog;
1117 
1118 		if (unbind_on_err && so->so_state & SS_ISBOUND) {
1119 			int err;
1120 
1121 			err = sotpi_unbind(so, 0);
1122 			/* LINTED - statement has no consequent: if */
1123 			if (err) {
1124 				eprintsoline(so, error);
1125 			} else {
1126 				ASSERT(!(so->so_state & SS_ISBOUND));
1127 			}
1128 		}
1129 	}
1130 	if (!(flags & _SOBIND_LOCK_HELD)) {
1131 		so_unlock_single(so, SOLOCKED);
1132 		mutex_exit(&so->so_lock);
1133 	} else {
1134 		/* If the caller held the lock don't release it here */
1135 		ASSERT(MUTEX_HELD(&so->so_lock));
1136 		ASSERT(so->so_flag & SOLOCKED);
1137 	}
1138 	return (error);
1139 }
1140 
1141 /* bind the socket */
1142 static int
1143 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1144     int flags)
1145 {
1146 	if ((flags & _SOBIND_SOCKETPAIR) == 0)
1147 		return (sotpi_bindlisten(so, name, namelen, 0, flags));
1148 
1149 	flags &= ~_SOBIND_SOCKETPAIR;
1150 	return (sotpi_bindlisten(so, name, namelen, 1, flags));
1151 }
1152 
1153 /*
1154  * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1155  * address, or when listen needs to unbind and bind.
1156  * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1157  * so that a sobind can pick them up.
1158  */
1159 static int
1160 sotpi_unbind(struct sonode *so, int flags)
1161 {
1162 	struct T_unbind_req	unbind_req;
1163 	int			error = 0;
1164 	mblk_t			*mp;
1165 
1166 	dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1167 	    (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1168 
1169 	ASSERT(MUTEX_HELD(&so->so_lock));
1170 	ASSERT(so->so_flag & SOLOCKED);
1171 
1172 	if (!(so->so_state & SS_ISBOUND)) {
1173 		error = EINVAL;
1174 		eprintsoline(so, error);
1175 		goto done;
1176 	}
1177 
1178 	mutex_exit(&so->so_lock);
1179 
1180 	/*
1181 	 * Flush the read and write side (except stream head read queue)
1182 	 * and send down T_UNBIND_REQ.
1183 	 */
1184 	(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1185 
1186 	unbind_req.PRIM_type = T_UNBIND_REQ;
1187 	mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1188 	    0, _ALLOC_SLEEP);
1189 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1190 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1191 	mutex_enter(&so->so_lock);
1192 	if (error) {
1193 		eprintsoline(so, error);
1194 		goto done;
1195 	}
1196 
1197 	error = sowaitokack(so, T_UNBIND_REQ);
1198 	if (error) {
1199 		eprintsoline(so, error);
1200 		goto done;
1201 	}
1202 
1203 	/*
1204 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1205 	 * strsock_proto while the lock was dropped above, the unbind
1206 	 * is allowed to complete.
1207 	 */
1208 	if (!(flags & _SOUNBIND_REBIND)) {
1209 		/*
1210 		 * Clear out bound address.
1211 		 */
1212 		vnode_t *vp;
1213 
1214 		if ((vp = so->so_ux_bound_vp) != NULL) {
1215 
1216 			/* Undo any SSL proxy setup */
1217 			if ((so->so_family == AF_INET ||
1218 			    so->so_family == AF_INET6) &&
1219 			    (so->so_type == SOCK_STREAM) &&
1220 			    (so->so_kssl_ent != NULL)) {
1221 				kssl_release_ent(so->so_kssl_ent, so,
1222 				    so->so_kssl_type);
1223 				so->so_kssl_ent = NULL;
1224 				so->so_kssl_type = KSSL_NO_PROXY;
1225 			}
1226 
1227 			so->so_ux_bound_vp = NULL;
1228 			vn_rele_stream(vp);
1229 		}
1230 		/* Clear out address */
1231 		so->so_laddr_len = 0;
1232 	}
1233 	so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN|SS_LADDR_VALID);
1234 
1235 done:
1236 
1237 	/* If the caller held the lock don't release it here */
1238 	ASSERT(MUTEX_HELD(&so->so_lock));
1239 	ASSERT(so->so_flag & SOLOCKED);
1240 
1241 	return (error);
1242 }
1243 
1244 /*
1245  * listen on the socket.
1246  * For TPI conforming transports this has to first unbind with the transport
1247  * and then bind again using the new backlog.
1248  */
1249 int
1250 sotpi_listen(struct sonode *so, int backlog)
1251 {
1252 	int		error = 0;
1253 
1254 	dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1255 	    (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1256 
1257 	if (so->so_serv_type == T_CLTS)
1258 		return (EOPNOTSUPP);
1259 
1260 	/*
1261 	 * If the socket is ready to accept connections already, then
1262 	 * return without doing anything.  This avoids a problem where
1263 	 * a second listen() call fails if a connection is pending and
1264 	 * leaves the socket unbound. Only when we are not unbinding
1265 	 * with the transport can we safely increase the backlog.
1266 	 */
1267 	if (so->so_state & SS_ACCEPTCONN &&
1268 	    !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1269 	    /*CONSTCOND*/
1270 	    !solisten_tpi_tcp))
1271 		return (0);
1272 
1273 	if (so->so_state & SS_ISCONNECTED)
1274 		return (EINVAL);
1275 
1276 	mutex_enter(&so->so_lock);
1277 	so_lock_single(so);	/* Set SOLOCKED */
1278 
1279 	if (backlog < 0)
1280 		backlog = 0;
1281 	/*
1282 	 * Use the same qlimit as in BSD. BSD checks the qlimit
1283 	 * before queuing the next connection implying that a
1284 	 * listen(sock, 0) allows one connection to be queued.
1285 	 * BSD also uses 1.5 times the requested backlog.
1286 	 *
1287 	 * XNS Issue 4 required a strict interpretation of the backlog.
1288 	 * This has been waived subsequently for Issue 4 and the change
1289 	 * incorporated in XNS Issue 5. So we aren't required to do
1290 	 * anything special for XPG apps.
1291 	 */
1292 	if (backlog >= (INT_MAX - 1) / 3)
1293 		backlog = INT_MAX;
1294 	else
1295 		backlog = backlog * 3 / 2 + 1;
1296 
1297 	/*
1298 	 * If the listen doesn't change the backlog we do nothing.
1299 	 * This avoids an EPROTO error from the transport.
1300 	 */
1301 	if ((so->so_state & SS_ACCEPTCONN) &&
1302 	    so->so_backlog == backlog)
1303 		goto done;
1304 
1305 	if (!(so->so_state & SS_ISBOUND)) {
1306 		/*
1307 		 * Must have been explicitly bound in the UNIX domain.
1308 		 */
1309 		if (so->so_family == AF_UNIX) {
1310 			error = EINVAL;
1311 			goto done;
1312 		}
1313 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1314 		    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN);
1315 	} else if (backlog > 0) {
1316 		/*
1317 		 * AF_INET{,6} hack to avoid losing the port.
1318 		 * Assumes that all AF_INET{,6} transports can handle a
1319 		 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1320 		 * has already bound thus it is possible to avoid the unbind.
1321 		 */
1322 		if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1323 		    /*CONSTCOND*/
1324 		    !solisten_tpi_tcp)) {
1325 			error = sotpi_unbind(so, _SOUNBIND_REBIND);
1326 			if (error)
1327 				goto done;
1328 		}
1329 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1330 		    _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN);
1331 	} else {
1332 		so->so_state |= SS_ACCEPTCONN;
1333 		so->so_backlog = backlog;
1334 	}
1335 	if (error)
1336 		goto done;
1337 	ASSERT(so->so_state & SS_ACCEPTCONN);
1338 done:
1339 	so_unlock_single(so, SOLOCKED);
1340 	mutex_exit(&so->so_lock);
1341 	return (error);
1342 }
1343 
1344 /*
1345  * Disconnect either a specified seqno or all (-1).
1346  * The former is used on listening sockets only.
1347  *
1348  * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1349  * the current use of sodisconnect(seqno == -1) is only for shutdown
1350  * so there is no point (and potentially incorrect) to unbind.
1351  */
1352 int
1353 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1354 {
1355 	struct T_discon_req	discon_req;
1356 	int			error = 0;
1357 	mblk_t			*mp;
1358 
1359 	dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1360 	    (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1361 
1362 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1363 		mutex_enter(&so->so_lock);
1364 		so_lock_single(so);	/* Set SOLOCKED */
1365 	} else {
1366 		ASSERT(MUTEX_HELD(&so->so_lock));
1367 		ASSERT(so->so_flag & SOLOCKED);
1368 	}
1369 
1370 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1371 		error = EINVAL;
1372 		eprintsoline(so, error);
1373 		goto done;
1374 	}
1375 
1376 	mutex_exit(&so->so_lock);
1377 	/*
1378 	 * Flush the write side (unless this is a listener)
1379 	 * and then send down a T_DISCON_REQ.
1380 	 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1381 	 * and other messages.)
1382 	 */
1383 	if (!(so->so_state & SS_ACCEPTCONN))
1384 		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1385 
1386 	discon_req.PRIM_type = T_DISCON_REQ;
1387 	discon_req.SEQ_number = seqno;
1388 	mp = soallocproto1(&discon_req, sizeof (discon_req),
1389 	    0, _ALLOC_SLEEP);
1390 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1391 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1392 	mutex_enter(&so->so_lock);
1393 	if (error) {
1394 		eprintsoline(so, error);
1395 		goto done;
1396 	}
1397 
1398 	error = sowaitokack(so, T_DISCON_REQ);
1399 	if (error) {
1400 		eprintsoline(so, error);
1401 		goto done;
1402 	}
1403 	/*
1404 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1405 	 * strsock_proto while the lock was dropped above, the disconnect
1406 	 * is allowed to complete. However, it is not possible to
1407 	 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1408 	 */
1409 	so->so_state &=
1410 	    ~(SS_ISCONNECTED|SS_ISCONNECTING|SS_LADDR_VALID|SS_FADDR_VALID);
1411 done:
1412 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1413 		so_unlock_single(so, SOLOCKED);
1414 		mutex_exit(&so->so_lock);
1415 	} else {
1416 		/* If the caller held the lock don't release it here */
1417 		ASSERT(MUTEX_HELD(&so->so_lock));
1418 		ASSERT(so->so_flag & SOLOCKED);
1419 	}
1420 	return (error);
1421 }
1422 
1423 int
1424 sotpi_accept(struct sonode *so, int fflag, struct sonode **nsop)
1425 {
1426 	struct T_conn_ind	*conn_ind;
1427 	struct T_conn_res	*conn_res;
1428 	int			error = 0;
1429 	mblk_t			*mp, *ctxmp, *ack_mp;
1430 	struct sonode		*nso;
1431 	vnode_t			*nvp;
1432 	void			*src;
1433 	t_uscalar_t		srclen;
1434 	void			*opt;
1435 	t_uscalar_t		optlen;
1436 	t_scalar_t		PRIM_type;
1437 	t_scalar_t		SEQ_number;
1438 	size_t			sinlen;
1439 
1440 	dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1441 	    (void *)so, fflag, (void *)nsop,
1442 	    pr_state(so->so_state, so->so_mode)));
1443 
1444 	/*
1445 	 * Defer single-threading the accepting socket until
1446 	 * the T_CONN_IND has been received and parsed and the
1447 	 * new sonode has been opened.
1448 	 */
1449 
1450 	/* Check that we are not already connected */
1451 	if ((so->so_state & SS_ACCEPTCONN) == 0)
1452 		goto conn_bad;
1453 again:
1454 	if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1455 		goto e_bad;
1456 
1457 	ASSERT(mp);
1458 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1459 	ctxmp = mp->b_cont;
1460 
1461 	/*
1462 	 * Save SEQ_number for error paths.
1463 	 */
1464 	SEQ_number = conn_ind->SEQ_number;
1465 
1466 	srclen = conn_ind->SRC_length;
1467 	src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1468 	if (src == NULL) {
1469 		error = EPROTO;
1470 		freemsg(mp);
1471 		eprintsoline(so, error);
1472 		goto disconnect_unlocked;
1473 	}
1474 	optlen = conn_ind->OPT_length;
1475 	switch (so->so_family) {
1476 	case AF_INET:
1477 	case AF_INET6:
1478 		if ((optlen == sizeof (intptr_t)) &&
1479 		    ((so->so_state & SS_DIRECT) != 0)) {
1480 			bcopy(mp->b_rptr + conn_ind->OPT_offset,
1481 			    &opt, conn_ind->OPT_length);
1482 		} else {
1483 			/*
1484 			 * The transport (in this case TCP) hasn't sent up
1485 			 * a pointer to an instance for the accept fast-path.
1486 			 * Disable fast-path completely because the call to
1487 			 * sotpi_create() below would otherwise create an
1488 			 * incomplete TCP instance, which would lead to
1489 			 * problems when sockfs sends a normal T_CONN_RES
1490 			 * message down the new stream.
1491 			 */
1492 			if (so->so_state & SS_DIRECT) {
1493 				int rval;
1494 				/*
1495 				 * For consistency we inform tcp to disable
1496 				 * direct interface on the listener, though
1497 				 * we can certainly live without doing this
1498 				 * because no data will ever travel upstream
1499 				 * on the listening socket.
1500 				 */
1501 				so->so_state &= ~SS_DIRECT;
1502 				(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1503 				    0, 0, K_TO_K, CRED(), &rval);
1504 			}
1505 			opt = NULL;
1506 			optlen = 0;
1507 		}
1508 		break;
1509 	case AF_UNIX:
1510 	default:
1511 		if (optlen != 0) {
1512 			opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1513 			    __TPI_ALIGN_SIZE);
1514 			if (opt == NULL) {
1515 				error = EPROTO;
1516 				freemsg(mp);
1517 				eprintsoline(so, error);
1518 				goto disconnect_unlocked;
1519 			}
1520 		}
1521 		if (so->so_family == AF_UNIX) {
1522 			if (!(so->so_state & SS_FADDR_NOXLATE)) {
1523 				src = NULL;
1524 				srclen = 0;
1525 			}
1526 			/* Extract src address from options */
1527 			if (optlen != 0)
1528 				so_getopt_srcaddr(opt, optlen, &src, &srclen);
1529 		}
1530 		break;
1531 	}
1532 
1533 	/*
1534 	 * Create the new socket.
1535 	 */
1536 	VN_HOLD(so->so_accessvp);
1537 	nso = sotpi_create(so->so_accessvp, so->so_family, so->so_type,
1538 	    so->so_protocol, so->so_version, so, &error);
1539 	if (nso == NULL) {
1540 		ASSERT(error != 0);
1541 		/*
1542 		 * Accept can not fail with ENOBUFS. sotpi_create
1543 		 * sleeps waiting for memory until a signal is caught
1544 		 * so return EINTR.
1545 		 */
1546 		freemsg(mp);
1547 		if (error == ENOBUFS)
1548 			error = EINTR;
1549 		goto e_disc_unl;
1550 	}
1551 	nvp = SOTOV(nso);
1552 
1553 	/*
1554 	 * If the transport sent up an SSL connection context, then attach
1555 	 * it the new socket, and set the (sd_wputdatafunc)() and
1556 	 * (sd_rputdatafunc)() stream head hooks to intercept and process
1557 	 * SSL records.
1558 	 */
1559 	if (ctxmp != NULL) {
1560 		/*
1561 		 * This kssl_ctx_t is already held for us by the transport.
1562 		 * So, we don't need to do a kssl_hold_ctx() here.
1563 		 */
1564 		nso->so_kssl_ctx = *((kssl_ctx_t *)ctxmp->b_rptr);
1565 		freemsg(ctxmp);
1566 		mp->b_cont = NULL;
1567 		strsetrwputdatahooks(nvp, strsock_kssl_input,
1568 		    strsock_kssl_output);
1569 
1570 		/* Disable sodirect if any */
1571 		if (nso->so_direct != NULL) {
1572 			mutex_enter(nso->so_direct->sod_lockp);
1573 			SOD_DISABLE(nso->so_direct);
1574 			mutex_exit(nso->so_direct->sod_lockp);
1575 			nso->so_direct = NULL;
1576 		}
1577 	}
1578 #ifdef DEBUG
1579 	/*
1580 	 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1581 	 * it's inherited early to allow debugging of the accept code itself.
1582 	 */
1583 	nso->so_options |= so->so_options & SO_DEBUG;
1584 #endif /* DEBUG */
1585 
1586 	/*
1587 	 * Save the SRC address from the T_CONN_IND
1588 	 * for getpeername to work on AF_UNIX and on transports that do not
1589 	 * support TI_GETPEERNAME.
1590 	 *
1591 	 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1592 	 * copyin_name().
1593 	 */
1594 	if (srclen > (t_uscalar_t)nso->so_faddr_maxlen) {
1595 		error = EINVAL;
1596 		freemsg(mp);
1597 		eprintsoline(so, error);
1598 		goto disconnect_vp_unlocked;
1599 	}
1600 	nso->so_faddr_len = (socklen_t)srclen;
1601 	ASSERT(so->so_faddr_len <= so->so_faddr_maxlen);
1602 	bcopy(src, nso->so_faddr_sa, srclen);
1603 	nso->so_state |= SS_FADDR_VALID;
1604 
1605 	if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1606 	    (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1607 		cred_t *cr;
1608 
1609 		if ((cr = DB_CRED(mp)) != NULL) {
1610 			crhold(cr);
1611 			nso->so_peercred = cr;
1612 			nso->so_cpid = DB_CPID(mp);
1613 		}
1614 		freemsg(mp);
1615 
1616 		mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1617 		    sizeof (intptr_t), 0, _ALLOC_INTR);
1618 		if (mp == NULL) {
1619 			/*
1620 			 * Accept can not fail with ENOBUFS.
1621 			 * A signal was caught so return EINTR.
1622 			 */
1623 			error = EINTR;
1624 			eprintsoline(so, error);
1625 			goto disconnect_vp_unlocked;
1626 		}
1627 		conn_res = (struct T_conn_res *)mp->b_rptr;
1628 	} else {
1629 		nso->so_peercred = DB_CRED(mp);
1630 		nso->so_cpid = DB_CPID(mp);
1631 		DB_CRED(mp) = NULL;
1632 
1633 		mp->b_rptr = DB_BASE(mp);
1634 		conn_res = (struct T_conn_res *)mp->b_rptr;
1635 		mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1636 	}
1637 
1638 	/*
1639 	 * New socket must be bound at least in sockfs and, except for AF_INET,
1640 	 * (or AF_INET6) it also has to be bound in the transport provider.
1641 	 * We set the local address in the sonode from the T_OK_ACK of the
1642 	 * T_CONN_RES. For this reason the address we bind to here isn't
1643 	 * important.
1644 	 */
1645 	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1646 	    /*CONSTCOND*/
1647 	    nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1648 		/*
1649 		 * Optimization for AF_INET{,6} transports
1650 		 * that can handle a T_CONN_RES without being bound.
1651 		 */
1652 		mutex_enter(&nso->so_lock);
1653 		so_automatic_bind(nso);
1654 		mutex_exit(&nso->so_lock);
1655 	} else {
1656 		/* Perform NULL bind with the transport provider. */
1657 		if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC)) != 0) {
1658 			ASSERT(error != ENOBUFS);
1659 			freemsg(mp);
1660 			eprintsoline(nso, error);
1661 			goto disconnect_vp_unlocked;
1662 		}
1663 	}
1664 
1665 	/*
1666 	 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1667 	 * so that any data arriving on the new socket will cause the
1668 	 * appropriate signals to be delivered for the new socket.
1669 	 *
1670 	 * No other thread (except strsock_proto and strsock_misc)
1671 	 * can access the new socket thus we relax the locking.
1672 	 */
1673 	nso->so_pgrp = so->so_pgrp;
1674 	nso->so_state |= so->so_state & (SS_ASYNC|SS_FADDR_NOXLATE);
1675 
1676 	if (nso->so_pgrp != 0) {
1677 		if ((error = so_set_events(nso, nvp, CRED())) != 0) {
1678 			eprintsoline(nso, error);
1679 			error = 0;
1680 			nso->so_pgrp = 0;
1681 		}
1682 	}
1683 
1684 	/*
1685 	 * Make note of the socket level options. TCP and IP level options
1686 	 * are already inherited. We could do all this after accept is
1687 	 * successful but doing it here simplifies code and no harm done
1688 	 * for error case.
1689 	 */
1690 	nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1691 	    SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1692 	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1693 	nso->so_sndbuf = so->so_sndbuf;
1694 	nso->so_rcvbuf = so->so_rcvbuf;
1695 	if (nso->so_options & SO_LINGER)
1696 		nso->so_linger = so->so_linger;
1697 
1698 	if ((so->so_state & SS_DIRECT) != 0) {
1699 
1700 		ASSERT(opt != NULL);
1701 
1702 		conn_res->OPT_length = optlen;
1703 		conn_res->OPT_offset = MBLKL(mp);
1704 		bcopy(&opt, mp->b_wptr, optlen);
1705 		mp->b_wptr += optlen;
1706 		conn_res->PRIM_type = T_CONN_RES;
1707 		conn_res->ACCEPTOR_id = 0;
1708 		PRIM_type = T_CONN_RES;
1709 
1710 		/* Send down the T_CONN_RES on acceptor STREAM */
1711 		error = kstrputmsg(SOTOV(nso), mp, NULL,
1712 		    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1713 		if (error) {
1714 			mutex_enter(&so->so_lock);
1715 			so_lock_single(so);
1716 			eprintsoline(so, error);
1717 			goto disconnect_vp;
1718 		}
1719 		mutex_enter(&nso->so_lock);
1720 		error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1721 		    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1722 		if (error) {
1723 			mutex_exit(&nso->so_lock);
1724 			mutex_enter(&so->so_lock);
1725 			so_lock_single(so);
1726 			eprintsoline(so, error);
1727 			goto disconnect_vp;
1728 		}
1729 		if (nso->so_family == AF_INET) {
1730 			sin_t *sin;
1731 
1732 			sin = (sin_t *)(ack_mp->b_rptr +
1733 			    sizeof (struct T_ok_ack));
1734 			bcopy(sin, nso->so_laddr_sa, sizeof (sin_t));
1735 			nso->so_laddr_len = sizeof (sin_t);
1736 		} else {
1737 			sin6_t *sin6;
1738 
1739 			sin6 = (sin6_t *)(ack_mp->b_rptr +
1740 			    sizeof (struct T_ok_ack));
1741 			bcopy(sin6, nso->so_laddr_sa, sizeof (sin6_t));
1742 			nso->so_laddr_len = sizeof (sin6_t);
1743 		}
1744 		freemsg(ack_mp);
1745 
1746 		nso->so_state |= SS_ISCONNECTED | SS_LADDR_VALID;
1747 		nso->so_priv = opt;
1748 
1749 		if (so->so_nl7c_flags & NL7C_ENABLED) {
1750 			/*
1751 			 * A NL7C marked listen()er so the new socket
1752 			 * inherits the listen()er's NL7C state, except
1753 			 * for NL7C_POLLIN.
1754 			 *
1755 			 * Only call NL7C to process the new socket if
1756 			 * the listen socket allows blocking i/o.
1757 			 */
1758 			nso->so_nl7c_flags = so->so_nl7c_flags & (~NL7C_POLLIN);
1759 			if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) {
1760 				/*
1761 				 * Nonblocking accept() just make it
1762 				 * persist to defer processing to the
1763 				 * read-side syscall (e.g. read).
1764 				 */
1765 				nso->so_nl7c_flags |= NL7C_SOPERSIST;
1766 			} else if (nl7c_process(nso, B_FALSE)) {
1767 				/*
1768 				 * NL7C has completed processing on the
1769 				 * socket, close the socket and back to
1770 				 * the top to await the next T_CONN_IND.
1771 				 */
1772 				mutex_exit(&nso->so_lock);
1773 				(void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
1774 				    CRED(), NULL);
1775 				VN_RELE(nvp);
1776 				goto again;
1777 			}
1778 			/* Pass the new socket out */
1779 		}
1780 
1781 		mutex_exit(&nso->so_lock);
1782 
1783 		/*
1784 		 * It's possible, through the use of autopush for example,
1785 		 * that the acceptor stream may not support SS_DIRECT
1786 		 * semantics. If the new socket does not support SS_DIRECT
1787 		 * we issue a _SIOCSOCKFALLBACK to inform the transport
1788 		 * as we would in the I_PUSH case.
1789 		 */
1790 		if (!(nso->so_state & SS_DIRECT)) {
1791 			int	rval;
1792 
1793 			if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
1794 			    0, 0, K_TO_K, CRED(), &rval)) != 0) {
1795 				mutex_enter(&so->so_lock);
1796 				so_lock_single(so);
1797 				eprintsoline(so, error);
1798 				goto disconnect_vp;
1799 			}
1800 		}
1801 
1802 		/*
1803 		 * Pass out new socket.
1804 		 */
1805 		if (nsop != NULL)
1806 			*nsop = nso;
1807 
1808 		return (0);
1809 	}
1810 
1811 	/*
1812 	 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
1813 	 * which don't support the FireEngine accept fast-path. It is also
1814 	 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
1815 	 * again. Neither sockfs nor TCP attempt to find out if some other
1816 	 * random module has been inserted in between (in which case we
1817 	 * should follow TLI accept behaviour). We blindly assume the worst
1818 	 * case and revert back to old behaviour i.e. TCP will not send us
1819 	 * any option (eager) and the accept should happen on the listener
1820 	 * queue. Any queued T_conn_ind have already got their options removed
1821 	 * by so_sock2_stream() when "sockmod" was I_POP'd.
1822 	 */
1823 	/*
1824 	 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
1825 	 */
1826 	if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
1827 #ifdef	_ILP32
1828 		queue_t	*q;
1829 
1830 		/*
1831 		 * Find read queue in driver
1832 		 * Can safely do this since we "own" nso/nvp.
1833 		 */
1834 		q = strvp2wq(nvp)->q_next;
1835 		while (SAMESTR(q))
1836 			q = q->q_next;
1837 		q = RD(q);
1838 		conn_res->ACCEPTOR_id = (t_uscalar_t)q;
1839 #else
1840 		conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
1841 #endif	/* _ILP32 */
1842 		conn_res->PRIM_type = O_T_CONN_RES;
1843 		PRIM_type = O_T_CONN_RES;
1844 	} else {
1845 		conn_res->ACCEPTOR_id = nso->so_acceptor_id;
1846 		conn_res->PRIM_type = T_CONN_RES;
1847 		PRIM_type = T_CONN_RES;
1848 	}
1849 	conn_res->SEQ_number = SEQ_number;
1850 	conn_res->OPT_length = 0;
1851 	conn_res->OPT_offset = 0;
1852 
1853 	mutex_enter(&so->so_lock);
1854 	so_lock_single(so);	/* Set SOLOCKED */
1855 	mutex_exit(&so->so_lock);
1856 
1857 	error = kstrputmsg(SOTOV(so), mp, NULL,
1858 	    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1859 	mutex_enter(&so->so_lock);
1860 	if (error) {
1861 		eprintsoline(so, error);
1862 		goto disconnect_vp;
1863 	}
1864 	error = sowaitprim(so, PRIM_type, T_OK_ACK,
1865 	    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1866 	if (error) {
1867 		eprintsoline(so, error);
1868 		goto disconnect_vp;
1869 	}
1870 	/*
1871 	 * If there is a sin/sin6 appended onto the T_OK_ACK use
1872 	 * that to set the local address. If this is not present
1873 	 * then we zero out the address and don't set the
1874 	 * SS_LADDR_VALID bit. For AF_UNIX endpoints we copy over
1875 	 * the pathname from the listening socket.
1876 	 */
1877 	sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
1878 	if ((nso->so_family == AF_INET) || (nso->so_family == AF_INET6) &&
1879 	    MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
1880 		ack_mp->b_rptr += sizeof (struct T_ok_ack);
1881 		bcopy(ack_mp->b_rptr, nso->so_laddr_sa, sinlen);
1882 		nso->so_laddr_len = sinlen;
1883 		nso->so_state |= SS_LADDR_VALID;
1884 	} else if (nso->so_family == AF_UNIX) {
1885 		ASSERT(so->so_family == AF_UNIX);
1886 		nso->so_laddr_len = so->so_laddr_len;
1887 		ASSERT(nso->so_laddr_len <= nso->so_laddr_maxlen);
1888 		bcopy(so->so_laddr_sa, nso->so_laddr_sa, nso->so_laddr_len);
1889 		nso->so_state |= SS_LADDR_VALID;
1890 	} else {
1891 		nso->so_laddr_len = so->so_laddr_len;
1892 		ASSERT(nso->so_laddr_len <= nso->so_laddr_maxlen);
1893 		bzero(nso->so_laddr_sa, nso->so_addr_size);
1894 		nso->so_laddr_sa->sa_family = nso->so_family;
1895 	}
1896 	freemsg(ack_mp);
1897 
1898 	so_unlock_single(so, SOLOCKED);
1899 	mutex_exit(&so->so_lock);
1900 
1901 	nso->so_state |= SS_ISCONNECTED;
1902 
1903 	/*
1904 	 * Pass out new socket.
1905 	 */
1906 	if (nsop != NULL)
1907 		*nsop = nso;
1908 
1909 	return (0);
1910 
1911 
1912 eproto_disc_unl:
1913 	error = EPROTO;
1914 e_disc_unl:
1915 	eprintsoline(so, error);
1916 	goto disconnect_unlocked;
1917 
1918 pr_disc_vp_unl:
1919 	eprintsoline(so, error);
1920 disconnect_vp_unlocked:
1921 	(void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL);
1922 	VN_RELE(nvp);
1923 disconnect_unlocked:
1924 	(void) sodisconnect(so, SEQ_number, 0);
1925 	return (error);
1926 
1927 pr_disc_vp:
1928 	eprintsoline(so, error);
1929 disconnect_vp:
1930 	(void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
1931 	so_unlock_single(so, SOLOCKED);
1932 	mutex_exit(&so->so_lock);
1933 	(void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL);
1934 	VN_RELE(nvp);
1935 	return (error);
1936 
1937 conn_bad:	/* Note: SunOS 4/BSD unconditionally returns EINVAL here */
1938 	error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
1939 	    ? EOPNOTSUPP : EINVAL;
1940 e_bad:
1941 	eprintsoline(so, error);
1942 	return (error);
1943 }
1944 
1945 /*
1946  * connect a socket.
1947  *
1948  * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
1949  * unconnect (by specifying a null address).
1950  */
1951 int
1952 sotpi_connect(struct sonode *so,
1953 	const struct sockaddr *name,
1954 	socklen_t namelen,
1955 	int fflag,
1956 	int flags)
1957 {
1958 	struct T_conn_req	conn_req;
1959 	int			error = 0;
1960 	mblk_t			*mp;
1961 	void			*src;
1962 	socklen_t		srclen;
1963 	void			*addr;
1964 	socklen_t		addrlen;
1965 	boolean_t		need_unlock;
1966 
1967 	dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
1968 	    (void *)so, (void *)name, namelen, fflag, flags,
1969 	    pr_state(so->so_state, so->so_mode)));
1970 
1971 	/*
1972 	 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
1973 	 * avoid sleeping for memory with SOLOCKED held.
1974 	 * We know that the T_CONN_REQ can't be larger than 2 * so_faddr_maxlen
1975 	 * + sizeof (struct T_opthdr).
1976 	 * (the AF_UNIX so_ux_addr_xlate() does not make the address
1977 	 * exceed so_faddr_maxlen).
1978 	 */
1979 	mp = soallocproto(sizeof (struct T_conn_req) +
1980 	    2 * so->so_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR);
1981 	if (mp == NULL) {
1982 		/*
1983 		 * Connect can not fail with ENOBUFS. A signal was
1984 		 * caught so return EINTR.
1985 		 */
1986 		error = EINTR;
1987 		eprintsoline(so, error);
1988 		return (error);
1989 	}
1990 
1991 	mutex_enter(&so->so_lock);
1992 	/*
1993 	 * Make sure there is a preallocated T_unbind_req message
1994 	 * before any binding. This message is allocated when the
1995 	 * socket is created. Since another thread can consume
1996 	 * so_unbind_mp by the time we return from so_lock_single(),
1997 	 * we should check the availability of so_unbind_mp after
1998 	 * we return from so_lock_single().
1999 	 */
2000 
2001 	so_lock_single(so);	/* Set SOLOCKED */
2002 	need_unlock = B_TRUE;
2003 
2004 	if (so->so_unbind_mp == NULL) {
2005 		dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
2006 		/* NOTE: holding so_lock while sleeping */
2007 		so->so_unbind_mp =
2008 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR);
2009 		if (so->so_unbind_mp == NULL) {
2010 			error = EINTR;
2011 			goto done;
2012 		}
2013 	}
2014 
2015 	/*
2016 	 * Can't have done a listen before connecting.
2017 	 */
2018 	if (so->so_state & SS_ACCEPTCONN) {
2019 		error = EOPNOTSUPP;
2020 		goto done;
2021 	}
2022 
2023 	/*
2024 	 * Must be bound with the transport
2025 	 */
2026 	if (!(so->so_state & SS_ISBOUND)) {
2027 		if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2028 		    /*CONSTCOND*/
2029 		    so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2030 			/*
2031 			 * Optimization for AF_INET{,6} transports
2032 			 * that can handle a T_CONN_REQ without being bound.
2033 			 */
2034 			so_automatic_bind(so);
2035 		} else {
2036 			error = sotpi_bind(so, NULL, 0,
2037 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD);
2038 			if (error)
2039 				goto done;
2040 		}
2041 		ASSERT(so->so_state & SS_ISBOUND);
2042 		flags |= _SOCONNECT_DID_BIND;
2043 	}
2044 
2045 	/*
2046 	 * Handle a connect to a name parameter of type AF_UNSPEC like a
2047 	 * connect to a null address. This is the portable method to
2048 	 * unconnect a socket.
2049 	 */
2050 	if ((namelen >= sizeof (sa_family_t)) &&
2051 	    (name->sa_family == AF_UNSPEC)) {
2052 		name = NULL;
2053 		namelen = 0;
2054 	}
2055 
2056 	/*
2057 	 * Check that we are not already connected.
2058 	 * A connection-oriented socket cannot be reconnected.
2059 	 * A connected connection-less socket can be
2060 	 * - connected to a different address by a subsequent connect
2061 	 * - "unconnected" by a connect to the NULL address
2062 	 */
2063 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2064 		ASSERT(!(flags & _SOCONNECT_DID_BIND));
2065 		if (so->so_mode & SM_CONNREQUIRED) {
2066 			/* Connection-oriented socket */
2067 			error = so->so_state & SS_ISCONNECTED ?
2068 			    EISCONN : EALREADY;
2069 			goto done;
2070 		}
2071 		/* Connection-less socket */
2072 		if (name == NULL) {
2073 			/*
2074 			 * Remove the connected state and clear SO_DGRAM_ERRIND
2075 			 * since it was set when the socket was connected.
2076 			 * If this is UDP also send down a T_DISCON_REQ.
2077 			 */
2078 			int val;
2079 
2080 			if ((so->so_family == AF_INET ||
2081 			    so->so_family == AF_INET6) &&
2082 			    (so->so_type == SOCK_DGRAM ||
2083 			    so->so_type == SOCK_RAW) &&
2084 			    /*CONSTCOND*/
2085 			    !soconnect_tpi_udp) {
2086 				/* XXX What about implicitly unbinding here? */
2087 				error = sodisconnect(so, -1,
2088 				    _SODISCONNECT_LOCK_HELD);
2089 			} else {
2090 				so->so_state &=
2091 				    ~(SS_ISCONNECTED | SS_ISCONNECTING |
2092 				    SS_FADDR_VALID);
2093 				so->so_faddr_len = 0;
2094 			}
2095 
2096 			so_unlock_single(so, SOLOCKED);
2097 			mutex_exit(&so->so_lock);
2098 
2099 			val = 0;
2100 			(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2101 			    &val, (t_uscalar_t)sizeof (val));
2102 
2103 			mutex_enter(&so->so_lock);
2104 			so_lock_single(so);	/* Set SOLOCKED */
2105 			goto done;
2106 		}
2107 	}
2108 	ASSERT(so->so_state & SS_ISBOUND);
2109 
2110 	if (name == NULL || namelen == 0) {
2111 		error = EINVAL;
2112 		goto done;
2113 	}
2114 	/*
2115 	 * Mark the socket if so_faddr_sa represents the transport level
2116 	 * address.
2117 	 */
2118 	if (flags & _SOCONNECT_NOXLATE) {
2119 		struct sockaddr_ux	*soaddr_ux;
2120 
2121 		ASSERT(so->so_family == AF_UNIX);
2122 		if (namelen != sizeof (struct sockaddr_ux)) {
2123 			error = EINVAL;
2124 			goto done;
2125 		}
2126 		soaddr_ux = (struct sockaddr_ux *)name;
2127 		name = (struct sockaddr *)&soaddr_ux->sou_addr;
2128 		namelen = sizeof (soaddr_ux->sou_addr);
2129 		so->so_state |= SS_FADDR_NOXLATE;
2130 	}
2131 
2132 	/*
2133 	 * Length and family checks.
2134 	 */
2135 	error = so_addr_verify(so, name, namelen);
2136 	if (error)
2137 		goto bad;
2138 
2139 	/*
2140 	 * Save foreign address. Needed for AF_UNIX as well as
2141 	 * transport providers that do not support TI_GETPEERNAME.
2142 	 * Also used for cached foreign address for TCP and UDP.
2143 	 */
2144 	if (namelen > (t_uscalar_t)so->so_faddr_maxlen) {
2145 		error = EINVAL;
2146 		goto done;
2147 	}
2148 	so->so_faddr_len = (socklen_t)namelen;
2149 	ASSERT(so->so_faddr_len <= so->so_faddr_maxlen);
2150 	bcopy(name, so->so_faddr_sa, namelen);
2151 	so->so_state |= SS_FADDR_VALID;
2152 
2153 	if (so->so_family == AF_UNIX) {
2154 		if (so->so_state & SS_FADDR_NOXLATE) {
2155 			/*
2156 			 * Already have a transport internal address. Do not
2157 			 * pass any (transport internal) source address.
2158 			 */
2159 			addr = so->so_faddr_sa;
2160 			addrlen = (t_uscalar_t)so->so_faddr_len;
2161 			src = NULL;
2162 			srclen = 0;
2163 		} else {
2164 			/*
2165 			 * Pass the sockaddr_un source address as an option
2166 			 * and translate the remote address.
2167 			 * Holding so_lock thus so_laddr_sa can not change.
2168 			 */
2169 			src = so->so_laddr_sa;
2170 			srclen = (t_uscalar_t)so->so_laddr_len;
2171 			dprintso(so, 1,
2172 			    ("sotpi_connect UNIX: srclen %d, src %p\n",
2173 			    srclen, src));
2174 			error = so_ux_addr_xlate(so,
2175 			    so->so_faddr_sa, (socklen_t)so->so_faddr_len,
2176 			    (flags & _SOCONNECT_XPG4_2),
2177 			    &addr, &addrlen);
2178 			if (error)
2179 				goto bad;
2180 		}
2181 	} else {
2182 		addr = so->so_faddr_sa;
2183 		addrlen = (t_uscalar_t)so->so_faddr_len;
2184 		src = NULL;
2185 		srclen = 0;
2186 	}
2187 	/*
2188 	 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2189 	 * option which asks the transport provider to send T_UDERR_IND
2190 	 * messages. These T_UDERR_IND messages are used to return connected
2191 	 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2192 	 *
2193 	 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2194 	 * we send down a T_CONN_REQ. This is needed to let the
2195 	 * transport assign a local address that is consistent with
2196 	 * the remote address. Applications depend on a getsockname()
2197 	 * after a connect() to retrieve the "source" IP address for
2198 	 * the connected socket.  Invalidate the cached local address
2199 	 * to force getsockname() to enquire of the transport.
2200 	 */
2201 	if (!(so->so_mode & SM_CONNREQUIRED)) {
2202 		/*
2203 		 * Datagram socket.
2204 		 */
2205 		int32_t val;
2206 
2207 		so_unlock_single(so, SOLOCKED);
2208 		mutex_exit(&so->so_lock);
2209 
2210 		val = 1;
2211 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2212 		    &val, (t_uscalar_t)sizeof (val));
2213 
2214 		mutex_enter(&so->so_lock);
2215 		so_lock_single(so);	/* Set SOLOCKED */
2216 		if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2217 		    (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2218 		    soconnect_tpi_udp) {
2219 			soisconnected(so);
2220 			goto done;
2221 		}
2222 		/*
2223 		 * Send down T_CONN_REQ etc.
2224 		 * Clear fflag to avoid returning EWOULDBLOCK.
2225 		 */
2226 		fflag = 0;
2227 		ASSERT(so->so_family != AF_UNIX);
2228 		so->so_state &= ~SS_LADDR_VALID;
2229 	} else if (so->so_laddr_len != 0) {
2230 		/*
2231 		 * If the local address or port was "any" then it may be
2232 		 * changed by the transport as a result of the
2233 		 * connect.  Invalidate the cached version if we have one.
2234 		 */
2235 		switch (so->so_family) {
2236 		case AF_INET:
2237 			ASSERT(so->so_laddr_len == (socklen_t)sizeof (sin_t));
2238 			if (((sin_t *)so->so_laddr_sa)->sin_addr.s_addr ==
2239 			    INADDR_ANY ||
2240 			    ((sin_t *)so->so_laddr_sa)->sin_port == 0)
2241 				so->so_state &= ~SS_LADDR_VALID;
2242 			break;
2243 
2244 		case AF_INET6:
2245 			ASSERT(so->so_laddr_len == (socklen_t)sizeof (sin6_t));
2246 			if (IN6_IS_ADDR_UNSPECIFIED(
2247 			    &((sin6_t *)so->so_laddr_sa) ->sin6_addr) ||
2248 			    IN6_IS_ADDR_V4MAPPED_ANY(
2249 			    &((sin6_t *)so->so_laddr_sa)->sin6_addr) ||
2250 			    ((sin6_t *)so->so_laddr_sa)->sin6_port == 0)
2251 				so->so_state &= ~SS_LADDR_VALID;
2252 			break;
2253 
2254 		default:
2255 			break;
2256 		}
2257 	}
2258 
2259 	/*
2260 	 * Check for failure of an earlier call
2261 	 */
2262 	if (so->so_error != 0)
2263 		goto so_bad;
2264 
2265 	/*
2266 	 * Send down T_CONN_REQ. Message was allocated above.
2267 	 */
2268 	conn_req.PRIM_type = T_CONN_REQ;
2269 	conn_req.DEST_length = addrlen;
2270 	conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2271 	if (srclen == 0) {
2272 		conn_req.OPT_length = 0;
2273 		conn_req.OPT_offset = 0;
2274 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2275 		soappendmsg(mp, addr, addrlen);
2276 	} else {
2277 		/*
2278 		 * There is a AF_UNIX sockaddr_un to include as a source
2279 		 * address option.
2280 		 */
2281 		struct T_opthdr toh;
2282 
2283 		toh.level = SOL_SOCKET;
2284 		toh.name = SO_SRCADDR;
2285 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2286 		toh.status = 0;
2287 		conn_req.OPT_length =
2288 		    (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2289 		conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2290 		    _TPI_ALIGN_TOPT(addrlen));
2291 
2292 		soappendmsg(mp, &conn_req, sizeof (conn_req));
2293 		soappendmsg(mp, addr, addrlen);
2294 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2295 		soappendmsg(mp, &toh, sizeof (toh));
2296 		soappendmsg(mp, src, srclen);
2297 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2298 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2299 	}
2300 	/*
2301 	 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2302 	 * in order to have the right state when the T_CONN_CON shows up.
2303 	 */
2304 	soisconnecting(so);
2305 	mutex_exit(&so->so_lock);
2306 
2307 	if (audit_active)
2308 		audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2309 
2310 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2311 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2312 	mp = NULL;
2313 	mutex_enter(&so->so_lock);
2314 	if (error != 0)
2315 		goto bad;
2316 
2317 	if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2318 		goto bad;
2319 
2320 	/* Allow other threads to access the socket */
2321 	so_unlock_single(so, SOLOCKED);
2322 	need_unlock = B_FALSE;
2323 
2324 	/*
2325 	 * Wait until we get a T_CONN_CON or an error
2326 	 */
2327 	if ((error = sowaitconnected(so, fflag, 0)) != 0) {
2328 		so_lock_single(so);	/* Set SOLOCKED */
2329 		need_unlock = B_TRUE;
2330 	}
2331 
2332 done:
2333 	freemsg(mp);
2334 	switch (error) {
2335 	case EINPROGRESS:
2336 	case EALREADY:
2337 	case EISCONN:
2338 	case EINTR:
2339 		/* Non-fatal errors */
2340 		so->so_state &= ~SS_LADDR_VALID;
2341 		/* FALLTHRU */
2342 	case 0:
2343 		break;
2344 
2345 	case EHOSTUNREACH:
2346 		if (flags & _SOCONNECT_XPG4_2) {
2347 			/*
2348 			 * X/Open specification contains a requirement that
2349 			 * ENETUNREACH be returned but does not require
2350 			 * EHOSTUNREACH. In order to keep the test suite
2351 			 * happy we mess with the errno here.
2352 			 */
2353 			error = ENETUNREACH;
2354 		}
2355 		/* FALLTHRU */
2356 
2357 	default:
2358 		ASSERT(need_unlock);
2359 		/*
2360 		 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2361 		 * and invalidate local-address cache
2362 		 */
2363 		so->so_state &= ~(SS_ISCONNECTING | SS_LADDR_VALID);
2364 		/* A discon_ind might have already unbound us */
2365 		if ((flags & _SOCONNECT_DID_BIND) &&
2366 		    (so->so_state & SS_ISBOUND)) {
2367 			int err;
2368 
2369 			err = sotpi_unbind(so, 0);
2370 			/* LINTED - statement has no conseq */
2371 			if (err) {
2372 				eprintsoline(so, err);
2373 			}
2374 		}
2375 		break;
2376 	}
2377 	if (need_unlock)
2378 		so_unlock_single(so, SOLOCKED);
2379 	mutex_exit(&so->so_lock);
2380 	return (error);
2381 
2382 so_bad:	error = sogeterr(so);
2383 bad:	eprintsoline(so, error);
2384 	goto done;
2385 }
2386 
2387 int
2388 sotpi_shutdown(struct sonode *so, int how)
2389 {
2390 	struct T_ordrel_req	ordrel_req;
2391 	mblk_t			*mp;
2392 	uint_t			old_state, state_change;
2393 	int			error = 0;
2394 
2395 	dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2396 	    (void *)so, how, pr_state(so->so_state, so->so_mode)));
2397 
2398 	mutex_enter(&so->so_lock);
2399 	so_lock_single(so);	/* Set SOLOCKED */
2400 
2401 	/*
2402 	 * SunOS 4.X has no check for datagram sockets.
2403 	 * 5.X checks that it is connected (ENOTCONN)
2404 	 * X/Open requires that we check the connected state.
2405 	 */
2406 	if (!(so->so_state & SS_ISCONNECTED)) {
2407 		if (!xnet_skip_checks) {
2408 			error = ENOTCONN;
2409 			if (xnet_check_print) {
2410 				printf("sockfs: X/Open shutdown check "
2411 				    "caused ENOTCONN\n");
2412 			}
2413 		}
2414 		goto done;
2415 	}
2416 	/*
2417 	 * Record the current state and then perform any state changes.
2418 	 * Then use the difference between the old and new states to
2419 	 * determine which messages need to be sent.
2420 	 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2421 	 * duplicate calls to shutdown().
2422 	 */
2423 	old_state = so->so_state;
2424 
2425 	switch (how) {
2426 	case 0:
2427 		socantrcvmore(so);
2428 		break;
2429 	case 1:
2430 		socantsendmore(so);
2431 		break;
2432 	case 2:
2433 		socantsendmore(so);
2434 		socantrcvmore(so);
2435 		break;
2436 	default:
2437 		error = EINVAL;
2438 		goto done;
2439 	}
2440 
2441 	/*
2442 	 * Assumes that the SS_CANT* flags are never cleared in the above code.
2443 	 */
2444 	state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2445 	    (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2446 	ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2447 
2448 	switch (state_change) {
2449 	case 0:
2450 		dprintso(so, 1,
2451 		    ("sotpi_shutdown: nothing to send in state 0x%x\n",
2452 		    so->so_state));
2453 		goto done;
2454 
2455 	case SS_CANTRCVMORE:
2456 		mutex_exit(&so->so_lock);
2457 		strseteof(SOTOV(so), 1);
2458 		/*
2459 		 * strseteof takes care of read side wakeups,
2460 		 * pollwakeups, and signals.
2461 		 */
2462 		/*
2463 		 * Get the read lock before flushing data to avoid problems
2464 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2465 		 */
2466 		mutex_enter(&so->so_lock);
2467 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2468 		mutex_exit(&so->so_lock);
2469 
2470 		/* Flush read side queue */
2471 		strflushrq(SOTOV(so), FLUSHALL);
2472 
2473 		mutex_enter(&so->so_lock);
2474 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2475 		break;
2476 
2477 	case SS_CANTSENDMORE:
2478 		mutex_exit(&so->so_lock);
2479 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2480 		mutex_enter(&so->so_lock);
2481 		break;
2482 
2483 	case SS_CANTSENDMORE|SS_CANTRCVMORE:
2484 		mutex_exit(&so->so_lock);
2485 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2486 		strseteof(SOTOV(so), 1);
2487 		/*
2488 		 * strseteof takes care of read side wakeups,
2489 		 * pollwakeups, and signals.
2490 		 */
2491 		/*
2492 		 * Get the read lock before flushing data to avoid problems
2493 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2494 		 */
2495 		mutex_enter(&so->so_lock);
2496 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2497 		mutex_exit(&so->so_lock);
2498 
2499 		/* Flush read side queue */
2500 		strflushrq(SOTOV(so), FLUSHALL);
2501 
2502 		mutex_enter(&so->so_lock);
2503 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2504 		break;
2505 	}
2506 
2507 	ASSERT(MUTEX_HELD(&so->so_lock));
2508 
2509 	/*
2510 	 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2511 	 * was set due to this call and the new state has both of them set:
2512 	 *	Send the AF_UNIX close indication
2513 	 *	For T_COTS send a discon_ind
2514 	 *
2515 	 * If cantsend was set due to this call:
2516 	 *	For T_COTSORD send an ordrel_ind
2517 	 *
2518 	 * Note that for T_CLTS there is no message sent here.
2519 	 */
2520 	if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2521 	    (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2522 		/*
2523 		 * For SunOS 4.X compatibility we tell the other end
2524 		 * that we are unable to receive at this point.
2525 		 */
2526 		if (so->so_family == AF_UNIX && so->so_serv_type != T_CLTS)
2527 			so_unix_close(so);
2528 
2529 		if (so->so_serv_type == T_COTS)
2530 			error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2531 	}
2532 	if ((state_change & SS_CANTSENDMORE) &&
2533 	    (so->so_serv_type == T_COTS_ORD)) {
2534 		/* Send an orderly release */
2535 		ordrel_req.PRIM_type = T_ORDREL_REQ;
2536 
2537 		mutex_exit(&so->so_lock);
2538 		mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2539 		    0, _ALLOC_SLEEP);
2540 		/*
2541 		 * Send down the T_ORDREL_REQ even if there is flow control.
2542 		 * This prevents shutdown from blocking.
2543 		 * Note that there is no T_OK_ACK for ordrel_req.
2544 		 */
2545 		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2546 		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2547 		mutex_enter(&so->so_lock);
2548 		if (error) {
2549 			eprintsoline(so, error);
2550 			goto done;
2551 		}
2552 	}
2553 
2554 done:
2555 	so_unlock_single(so, SOLOCKED);
2556 	mutex_exit(&so->so_lock);
2557 	return (error);
2558 }
2559 
2560 /*
2561  * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2562  * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2563  * that we have closed.
2564  * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2565  * T_UNITDATA_REQ containing the same option.
2566  *
2567  * For SOCK_DGRAM half-connections (somebody connected to this end
2568  * but this end is not connect) we don't know where to send any
2569  * SO_UNIX_CLOSE.
2570  *
2571  * We have to ignore stream head errors just in case there has been
2572  * a shutdown(output).
2573  * Ignore any flow control to try to get the message more quickly to the peer.
2574  * While locally ignoring flow control solves the problem when there
2575  * is only the loopback transport on the stream it would not provide
2576  * the correct AF_UNIX socket semantics when one or more modules have
2577  * been pushed.
2578  */
2579 void
2580 so_unix_close(struct sonode *so)
2581 {
2582 	int		error;
2583 	struct T_opthdr	toh;
2584 	mblk_t		*mp;
2585 
2586 	ASSERT(MUTEX_HELD(&so->so_lock));
2587 
2588 	ASSERT(so->so_family == AF_UNIX);
2589 
2590 	if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2591 	    (SS_ISCONNECTED|SS_ISBOUND))
2592 		return;
2593 
2594 	dprintso(so, 1, ("so_unix_close(%p) %s\n",
2595 	    (void *)so, pr_state(so->so_state, so->so_mode)));
2596 
2597 	toh.level = SOL_SOCKET;
2598 	toh.name = SO_UNIX_CLOSE;
2599 
2600 	/* zero length + header */
2601 	toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2602 	toh.status = 0;
2603 
2604 	if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2605 		struct T_optdata_req tdr;
2606 
2607 		tdr.PRIM_type = T_OPTDATA_REQ;
2608 		tdr.DATA_flag = 0;
2609 
2610 		tdr.OPT_length = (t_scalar_t)sizeof (toh);
2611 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2612 
2613 		/* NOTE: holding so_lock while sleeping */
2614 		mp = soallocproto2(&tdr, sizeof (tdr),
2615 		    &toh, sizeof (toh), 0, _ALLOC_SLEEP);
2616 	} else {
2617 		struct T_unitdata_req	tudr;
2618 		void			*addr;
2619 		socklen_t		addrlen;
2620 		void			*src;
2621 		socklen_t		srclen;
2622 		struct T_opthdr		toh2;
2623 		t_scalar_t		size;
2624 
2625 		/* Connecteded DGRAM socket */
2626 
2627 		/*
2628 		 * For AF_UNIX the destination address is translated to
2629 		 * an internal name and the source address is passed as
2630 		 * an option.
2631 		 */
2632 		/*
2633 		 * Length and family checks.
2634 		 */
2635 		error = so_addr_verify(so, so->so_faddr_sa,
2636 		    (t_uscalar_t)so->so_faddr_len);
2637 		if (error) {
2638 			eprintsoline(so, error);
2639 			return;
2640 		}
2641 		if (so->so_state & SS_FADDR_NOXLATE) {
2642 			/*
2643 			 * Already have a transport internal address. Do not
2644 			 * pass any (transport internal) source address.
2645 			 */
2646 			addr = so->so_faddr_sa;
2647 			addrlen = (t_uscalar_t)so->so_faddr_len;
2648 			src = NULL;
2649 			srclen = 0;
2650 		} else {
2651 			/*
2652 			 * Pass the sockaddr_un source address as an option
2653 			 * and translate the remote address.
2654 			 * Holding so_lock thus so_laddr_sa can not change.
2655 			 */
2656 			src = so->so_laddr_sa;
2657 			srclen = (socklen_t)so->so_laddr_len;
2658 			dprintso(so, 1,
2659 			    ("so_ux_close: srclen %d, src %p\n",
2660 			    srclen, src));
2661 			error = so_ux_addr_xlate(so,
2662 			    so->so_faddr_sa,
2663 			    (socklen_t)so->so_faddr_len, 0,
2664 			    &addr, &addrlen);
2665 			if (error) {
2666 				eprintsoline(so, error);
2667 				return;
2668 			}
2669 		}
2670 		tudr.PRIM_type = T_UNITDATA_REQ;
2671 		tudr.DEST_length = addrlen;
2672 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2673 		if (srclen == 0) {
2674 			tudr.OPT_length = (t_scalar_t)sizeof (toh);
2675 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2676 			    _TPI_ALIGN_TOPT(addrlen));
2677 
2678 			size = tudr.OPT_offset + tudr.OPT_length;
2679 			/* NOTE: holding so_lock while sleeping */
2680 			mp = soallocproto2(&tudr, sizeof (tudr),
2681 			    addr, addrlen, size, _ALLOC_SLEEP);
2682 			mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2683 			soappendmsg(mp, &toh, sizeof (toh));
2684 		} else {
2685 			/*
2686 			 * There is a AF_UNIX sockaddr_un to include as a
2687 			 * source address option.
2688 			 */
2689 			tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2690 			    _TPI_ALIGN_TOPT(srclen));
2691 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2692 			    _TPI_ALIGN_TOPT(addrlen));
2693 
2694 			toh2.level = SOL_SOCKET;
2695 			toh2.name = SO_SRCADDR;
2696 			toh2.len = (t_uscalar_t)(srclen +
2697 			    sizeof (struct T_opthdr));
2698 			toh2.status = 0;
2699 
2700 			size = tudr.OPT_offset + tudr.OPT_length;
2701 
2702 			/* NOTE: holding so_lock while sleeping */
2703 			mp = soallocproto2(&tudr, sizeof (tudr),
2704 			    addr, addrlen, size, _ALLOC_SLEEP);
2705 			mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2706 			soappendmsg(mp, &toh, sizeof (toh));
2707 			soappendmsg(mp, &toh2, sizeof (toh2));
2708 			soappendmsg(mp, src, srclen);
2709 			mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2710 		}
2711 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2712 	}
2713 	mutex_exit(&so->so_lock);
2714 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2715 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2716 	mutex_enter(&so->so_lock);
2717 }
2718 
2719 /*
2720  * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK.
2721  */
2722 int
2723 sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags)
2724 {
2725 	mblk_t		*mp, *nmp;
2726 	int		error;
2727 
2728 	dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n",
2729 	    (void *)so, (void *)msg, flags));
2730 
2731 	/*
2732 	 * There is never any oob data with addresses or control since
2733 	 * the T_EXDATA_IND does not carry any options.
2734 	 */
2735 	msg->msg_controllen = 0;
2736 	msg->msg_namelen = 0;
2737 
2738 	mutex_enter(&so->so_lock);
2739 	ASSERT(so_verify_oobstate(so));
2740 	if ((so->so_options & SO_OOBINLINE) ||
2741 	    (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
2742 		dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
2743 		mutex_exit(&so->so_lock);
2744 		return (EINVAL);
2745 	}
2746 	if (!(so->so_state & SS_HAVEOOBDATA)) {
2747 		dprintso(so, 1, ("sorecvoob: no data yet\n"));
2748 		mutex_exit(&so->so_lock);
2749 		return (EWOULDBLOCK);
2750 	}
2751 	ASSERT(so->so_oobmsg != NULL);
2752 	mp = so->so_oobmsg;
2753 	if (flags & MSG_PEEK) {
2754 		/*
2755 		 * Since recv* can not return ENOBUFS we can not use dupmsg.
2756 		 * Instead we revert to the consolidation private
2757 		 * allocb_wait plus bcopy.
2758 		 */
2759 		mblk_t *mp1;
2760 
2761 		mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
2762 		ASSERT(mp1);
2763 
2764 		while (mp != NULL) {
2765 			ssize_t size;
2766 
2767 			size = MBLKL(mp);
2768 			bcopy(mp->b_rptr, mp1->b_wptr, size);
2769 			mp1->b_wptr += size;
2770 			ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
2771 			mp = mp->b_cont;
2772 		}
2773 		mp = mp1;
2774 	} else {
2775 		/*
2776 		 * Update the state indicating that the data has been consumed.
2777 		 * Keep SS_OOBPEND set until data is consumed past the mark.
2778 		 */
2779 		so->so_oobmsg = NULL;
2780 		so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
2781 	}
2782 	dprintso(so, 1,
2783 	    ("after recvoob(%p): counts %d/%d state %s\n",
2784 	    (void *)so, so->so_oobsigcnt,
2785 	    so->so_oobcnt, pr_state(so->so_state, so->so_mode)));
2786 	ASSERT(so_verify_oobstate(so));
2787 	mutex_exit(&so->so_lock);
2788 
2789 	error = 0;
2790 	nmp = mp;
2791 	while (nmp != NULL && uiop->uio_resid > 0) {
2792 		ssize_t n = MBLKL(nmp);
2793 
2794 		n = MIN(n, uiop->uio_resid);
2795 		if (n > 0)
2796 			error = uiomove(nmp->b_rptr, n,
2797 			    UIO_READ, uiop);
2798 		if (error)
2799 			break;
2800 		nmp = nmp->b_cont;
2801 	}
2802 	freemsg(mp);
2803 	return (error);
2804 }
2805 
2806 /*
2807  * Called by sotpi_recvmsg when reading a non-zero amount of data.
2808  * In addition, the caller typically verifies that there is some
2809  * potential state to clear by checking
2810  *	if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2811  * before calling this routine.
2812  * Note that such a check can be made without holding so_lock since
2813  * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2814  * decrements so_oobsigcnt.
2815  *
2816  * When data is read *after* the point that all pending
2817  * oob data has been consumed the oob indication is cleared.
2818  *
2819  * This logic keeps select/poll returning POLLRDBAND and
2820  * SIOCATMARK returning true until we have read past
2821  * the mark.
2822  */
2823 static void
2824 sorecv_update_oobstate(struct sonode *so)
2825 {
2826 	mutex_enter(&so->so_lock);
2827 	ASSERT(so_verify_oobstate(so));
2828 	dprintso(so, 1,
2829 	    ("sorecv_update_oobstate: counts %d/%d state %s\n",
2830 	    so->so_oobsigcnt,
2831 	    so->so_oobcnt, pr_state(so->so_state, so->so_mode)));
2832 	if (so->so_oobsigcnt == 0) {
2833 		/* No more pending oob indications */
2834 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2835 		freemsg(so->so_oobmsg);
2836 		so->so_oobmsg = NULL;
2837 	}
2838 	ASSERT(so_verify_oobstate(so));
2839 	mutex_exit(&so->so_lock);
2840 }
2841 
2842 /*
2843  * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
2844  */
2845 static int
2846 nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
2847 {
2848 	int	error = 0;
2849 	mblk_t *tmp = NULL;
2850 	mblk_t *pmp = NULL;
2851 	mblk_t *nmp = so->so_nl7c_rcv_mp;
2852 
2853 	ASSERT(nmp != NULL);
2854 
2855 	while (nmp != NULL && uiop->uio_resid > 0) {
2856 		ssize_t n;
2857 
2858 		if (DB_TYPE(nmp) == M_DATA) {
2859 			/*
2860 			 * We have some data, uiomove up to resid bytes.
2861 			 */
2862 			n = MIN(MBLKL(nmp), uiop->uio_resid);
2863 			if (n > 0)
2864 				error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
2865 			nmp->b_rptr += n;
2866 			if (nmp->b_rptr == nmp->b_wptr) {
2867 				pmp = nmp;
2868 				nmp = nmp->b_cont;
2869 			}
2870 			if (error)
2871 				break;
2872 		} else {
2873 			/*
2874 			 * We only handle data, save for caller to handle.
2875 			 */
2876 			if (pmp != NULL) {
2877 				pmp->b_cont = nmp->b_cont;
2878 			}
2879 			nmp->b_cont = NULL;
2880 			if (*rmp == NULL) {
2881 				*rmp = nmp;
2882 			} else {
2883 				tmp->b_cont = nmp;
2884 			}
2885 			nmp = nmp->b_cont;
2886 			tmp = nmp;
2887 		}
2888 	}
2889 	if (pmp != NULL) {
2890 		/* Free any mblk_t(s) which we have consumed */
2891 		pmp->b_cont = NULL;
2892 		freemsg(so->so_nl7c_rcv_mp);
2893 	}
2894 	if ((so->so_nl7c_rcv_mp = nmp) == NULL) {
2895 		/* Last mblk_t so return the saved kstrgetmsg() rval/error */
2896 		if (error == 0) {
2897 			rval_t	*p = (rval_t *)&so->so_nl7c_rcv_rval;
2898 
2899 			error = p->r_v.r_v2;
2900 			p->r_v.r_v2 = 0;
2901 		}
2902 		rp->r_vals = so->so_nl7c_rcv_rval;
2903 		so->so_nl7c_rcv_rval = 0;
2904 	} else {
2905 		/* More mblk_t(s) to process so no rval to return */
2906 		rp->r_vals = 0;
2907 	}
2908 	return (error);
2909 }
2910 
2911 /*
2912  * Receive the next message on the queue.
2913  * If msg_controllen is non-zero when called the caller is interested in
2914  * any received control info (options).
2915  * If msg_namelen is non-zero when called the caller is interested in
2916  * any received source address.
2917  * The routine returns with msg_control and msg_name pointing to
2918  * kmem_alloc'ed memory which the caller has to free.
2919  */
2920 int
2921 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
2922 {
	/*
	 * Receive data through kstrgetmsg() (or nl7c_sorecv() when NL7C has
	 * saved mblks) and translate a leading TPI primitive, if one is
	 * returned, into msg_name, msg_control and msg_flags.
	 * Returns 0 or an errno value. Once so_lock_read_intr() succeeds the
	 * read lock is held until so_unlock_read() at the common exit.
	 */
2923 	union T_primitives	*tpr;	/* TPI header at mp->b_rptr */
2924 	mblk_t			*mp;	/* non-data part returned, if any */
2925 	uchar_t			pri;	/* kstrgetmsg priority in/out arg */
2926 	int			pflag, opflag;	/* kstrgetmsg flags; opflag = initial value */
2927 	void			*control;
2928 	t_uscalar_t		controllen;	/* caller's msg_controllen, later cmsg size */
2929 	t_uscalar_t		namelen;	/* caller's msg_namelen */
2930 	int			so_state = so->so_state; /* Snapshot */
2931 	ssize_t			saved_resid;	/* uio_resid before each get */
2932 	rval_t			rval;	/* r_val1 carries MOREDATA/MORECTL */
2933 	int			flags;	/* caller's msg_flags on entry */
2934 	clock_t			timout;	/* kstrgetmsg timeout: 0 or -1 */
2935 	int			first;	/* cleared when a MSG_WAITALL retry starts */
2936 	int			error = 0;
2937 	struct uio		*suiop = NULL;	/* caller's uio while uioa is used */
2938 	sodirect_t		*sodp = so->so_direct;
2939 
2940 	flags = msg->msg_flags;
2941 	msg->msg_flags = 0;
2942 
2943 	dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
2944 	    (void *)so, (void *)msg, flags,
2945 	    pr_state(so->so_state, so->so_mode), so->so_error));
2946 
2947 	/*
2948 	 * If we are not connected because we have never been connected
2949 	 * we return ENOTCONN. If we have been connected (but are no longer
2950 	 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
2951 	 * the EOF.
2952 	 *
2953 	 * An alternative would be to post an ENOTCONN error in stream head
2954 	 * (read+write) and clear it when we're connected. However, that error
2955 	 * would cause incorrect poll/select behavior!
2956 	 */
2957 	if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
2958 	    (so->so_mode & SM_CONNREQUIRED)) {
2959 		return (ENOTCONN);
2960 	}
2961 
2962 	/*
2963 	 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
2964 	 * after checking that the read queue is empty) and returns zero.
2965 	 * This implementation will sleep (in kstrgetmsg) even if uio_resid
2966 	 * is zero.
2967 	 */
2968 
2969 	if (flags & MSG_OOB) {
2970 		/* Check that the transport supports OOB */
2971 		if (!(so->so_mode & SM_EXDATA))
2972 			return (EOPNOTSUPP);
2973 		return (sorecvoob(so, msg, uiop, flags));
2974 	}
2975 
2976 	/*
2977 	 * Set msg_controllen and msg_namelen to zero here to make it
2978 	 * simpler in the cases that no control or name is returned.
2979 	 */
2980 	controllen = msg->msg_controllen;
2981 	namelen = msg->msg_namelen;
2982 	msg->msg_controllen = 0;
2983 	msg->msg_namelen = 0;
2984 
2985 	dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
2986 	    namelen, controllen));
2987 
2988 	mutex_enter(&so->so_lock);
2989 	/*
2990 	 * If an NL7C enabled socket and not waiting for write data.
2991 	 */
2992 	if ((so->so_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
2993 	    NL7C_ENABLED) {
2994 		if (so->so_nl7c_uri) {
2995 			/* Close uri processing for a previous request */
2996 			nl7c_close(so);
2997 		}
2998 		if ((so_state & SS_CANTRCVMORE) && so->so_nl7c_rcv_mp == NULL) {
2999 			/* Nothing to process, EOF */
3000 			mutex_exit(&so->so_lock);
3001 			return (0);
3002 		} else if (so->so_nl7c_flags & NL7C_SOPERSIST) {
3003 			/* Persistent NL7C socket, try to process request */
3004 			boolean_t ret;
3005 
3006 			ret = nl7c_process(so,
3007 			    (so->so_state & (SS_NONBLOCK|SS_NDELAY)));
			/* The error, if any, is taken from the saved rval */
3008 			rval.r_vals = so->so_nl7c_rcv_rval;
3009 			error = rval.r_v.r_v2;
3010 			if (error) {
3011 				/* Error of some sort, return it */
3012 				mutex_exit(&so->so_lock);
3013 				return (error);
3014 			}
3015 			if (so->so_nl7c_flags &&
3016 			    ! (so->so_nl7c_flags & NL7C_WAITWRITE)) {
3017 				/*
3018 				 * Still an NL7C socket and no data
3019 				 * to pass up to the caller.
3020 				 */
3021 				mutex_exit(&so->so_lock);
3022 				if (ret) {
3023 					/* EOF */
3024 					return (0);
3025 				} else {
3026 					/* Need more data */
3027 					return (EAGAIN);
3028 				}
3029 			}
3030 		} else {
3031 			/*
3032 			 * Not persistent so no further NL7C processing.
3033 			 */
3034 			so->so_nl7c_flags = 0;
3035 		}
3036 	}
3037 	/*
3038 	 * Only one reader is allowed at any given time. This is needed
3039 	 * for T_EXDATA handling and, in the future, MSG_WAITALL.
3040 	 *
3041 	 * This is slightly different than BSD behavior in that it fails with
3042 	 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
3043 	 * is single-threaded using sblock(), which is dropped while waiting
3044 	 * for data to appear. The difference shows up e.g. if one
3045 	 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
3046 	 * does use nonblocking io and different threads are reading each
3047 	 * file descriptor. In BSD there would never be an EWOULDBLOCK error
3048 	 * in this case as long as the read queue doesn't get empty.
3049 	 * In this implementation the thread using nonblocking io can
3050 	 * get an EWOULDBLOCK error due to the blocking thread executing
3051 	 * e.g. in the uiomove in kstrgetmsg.
3052 	 * This difference is not believed to be significant.
3053 	 */
3054 	/* Set SOREADLOCKED */
3055 	error = so_lock_read_intr(so,
3056 	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3057 	mutex_exit(&so->so_lock);
3058 	if (error)
3059 		return (error);
3060 
3061 	/*
3062 	 * Tell kstrgetmsg to not inspect the stream head errors until all
3063 	 * queued data has been consumed.
3064 	 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3065 	 * Also, if uio_fmode indicates nonblocking kstrgetmsg will not block.
3066 	 *
3067 	 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3068 	 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3069 	 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3070 	 */
3071 	pflag = MSG_ANY | MSG_DELAYERROR;
3072 	if (flags & MSG_PEEK) {
3073 		pflag |= MSG_IPEEK;
3074 		flags &= ~MSG_WAITALL;
3075 	}
3076 	if (so->so_mode & SM_ATOMIC)
3077 		pflag |= MSG_DISCARDTAIL;
3078 
3079 	if (flags & MSG_DONTWAIT)
3080 		timout = 0;
3081 	else
3082 		timout = -1;
	/* Remember the initial flags; retries rebuild pflag from opflag */
3083 	opflag = pflag;
3084 	first = 1;
3085 
3086 	if (uiop->uio_resid >= uioasync.mincnt &&
3087 	    sodp != NULL && (sodp->sod_state & SOD_ENABLED) &&
3088 	    uioasync.enabled && !(flags & MSG_PEEK) &&
3089 	    !(so_state & SS_CANTRCVMORE)) {
3090 		/*
3091 		 * Big enough I/O for uioa min setup and an sodirect socket
3092 		 * and sodirect enabled and uioa enabled and I/O will be done
3093 		 * and not EOF so initialize the sodirect_t uioa_t with "uiop".
3094 		 */
3095 		mutex_enter(sodp->sod_lockp);
3096 		if (!uioainit(uiop, &sodp->sod_uioa)) {
3097 			/*
3098 			 * Successful uioainit() so the uio_t part of the
3099 			 * uioa_t will be used for all uio_t work to follow,
3100 			 * we save the original "uiop" in "suiop".
3101 			 */
3102 			suiop = uiop;
3103 			uiop = (uio_t *)&sodp->sod_uioa;
3104 			/*
3105 			 * Before returning to the caller the passed in uio_t
3106 			 * "uiop" will be updated via a call to uioafini()
3107 			 * below.
3108 			 *
3109 			 * Note, the uioa.uioa_state isn't set to UIOA_ENABLED
3110 			 * here as first we have to uioamove() any currently
3111 			 * queued M_DATA mblk_t(s) so it will be done in
3112 			 * kstrgetmsg().
3113 			 */
3114 		}
3115 		/*
3116 		 * In either uioainit() success or not case note the number
3117 		 * of uio bytes the caller wants for sod framework and/or
3118 		 * transport (e.g. TCP) strategy.
3119 		 */
3120 		sodp->sod_want = uiop->uio_resid;
3121 		mutex_exit(sodp->sod_lockp);
3122 	} else if (sodp != NULL && (sodp->sod_state & SOD_ENABLED)) {
3123 		/*
3124 		 * No uioa but still using sodirect so note the number of
3125 		 * uio bytes the caller wants for sodirect framework and/or
3126 		 * transport (e.g. TCP) strategy.
3127 		 *
3128 		 * Note, sod_lockp not held, only writer is in this function
3129 		 * and only one thread at a time so not needed just to init.
3130 		 */
3131 		sodp->sod_want = uiop->uio_resid;
3132 	}
	/*
	 * Each pass below makes one kstrgetmsg()/nl7c_sorecv() call. We come
	 * back here to continue a MSG_WAITALL receive and after a
	 * T_EXDATA_IND has been consumed.
	 */
3133 retry:
3134 	saved_resid = uiop->uio_resid;
3135 	pri = 0;
3136 	mp = NULL;
3137 	if (so->so_nl7c_rcv_mp != NULL) {
3138 		/* Already kstrgetmsg()ed saved mblk(s) from NL7C */
3139 		error = nl7c_sorecv(so, &mp, uiop, &rval);
3140 	} else {
3141 		error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3142 		    timout, &rval);
3143 	}
3144 	if (error) {
		/*
		 * "first" is only cleared after a MSG_WAITALL pass moved
		 * data, so when !first an interrupted, would-block or timed
		 * out wait is reported as success.
		 */
3145 		switch (error) {
3146 		case EINTR:
3147 		case EWOULDBLOCK:
3148 			if (!first)
3149 				error = 0;
3150 			break;
3151 		case ETIME:
3152 			/* Returned from kstrgetmsg when timeout expires */
3153 			if (!first)
3154 				error = 0;
3155 			else
3156 				error = EWOULDBLOCK;
3157 			break;
3158 		default:
3159 			eprintsoline(so, error);
3160 			break;
3161 		}
3162 		goto out;
3163 	}
3164 	/*
3165 	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3166 	 * For non-datagrams MOREDATA is used to set MSG_EOR.
3167 	 */
3168 	ASSERT(!(rval.r_val1 & MORECTL));
3169 	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3170 		msg->msg_flags |= MSG_TRUNC;
3171 
	/* No non-data part was returned: only data (or EOF) this pass */
3172 	if (mp == NULL) {
3173 		dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3174 		/*
3175 		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3176 		 * The draft Posix socket spec states that the mark should
3177 		 * not be cleared when peeking. We follow the latter.
3178 		 */
3179 		if ((so->so_state &
3180 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3181 		    (uiop->uio_resid != saved_resid) &&
3182 		    !(flags & MSG_PEEK)) {
3183 			sorecv_update_oobstate(so);
3184 		}
3185 
3186 		mutex_enter(&so->so_lock);
3187 		/* Set MSG_EOR based on MOREDATA */
3188 		if (!(rval.r_val1 & MOREDATA)) {
3189 			if (so->so_state & SS_SAVEDEOR) {
3190 				msg->msg_flags |= MSG_EOR;
3191 				so->so_state &= ~SS_SAVEDEOR;
3192 			}
3193 		}
3194 		/*
3195 		 * If some data was received (i.e. not EOF) and the
3196 		 * read/recv* has not been satisfied wait for some more.
3197 		 */
3198 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3199 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3200 			mutex_exit(&so->so_lock);
3201 			first = 0;
3202 			pflag = opflag | MSG_NOMARK;
3203 			goto retry;
3204 		}
3205 		goto out_locked;
3206 	}
3207 
3208 	/* strsock_proto has already verified length and alignment */
3209 	tpr = (union T_primitives *)mp->b_rptr;
3210 	dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3211 
3212 	switch (tpr->type) {
3213 	case T_DATA_IND: {
3214 		if ((so->so_state &
3215 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3216 		    (uiop->uio_resid != saved_resid) &&
3217 		    !(flags & MSG_PEEK)) {
3218 			sorecv_update_oobstate(so);
3219 		}
3220 
3221 		/*
3222 		 * Set msg_flags to MSG_EOR based on
3223 		 * MORE_flag and MOREDATA.
3224 		 */
3225 		mutex_enter(&so->so_lock);
3226 		so->so_state &= ~SS_SAVEDEOR;
3227 		if (!(tpr->data_ind.MORE_flag & 1)) {
			/*
			 * End of record: report it now if all of its data was
			 * read, otherwise remember it in SS_SAVEDEOR.
			 */
3228 			if (!(rval.r_val1 & MOREDATA))
3229 				msg->msg_flags |= MSG_EOR;
3230 			else
3231 				so->so_state |= SS_SAVEDEOR;
3232 		}
3233 		freemsg(mp);
3234 		/*
3235 		 * If some data was received (i.e. not EOF) and the
3236 		 * read/recv* has not been satisfied wait for some more.
3237 		 */
3238 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3239 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3240 			mutex_exit(&so->so_lock);
3241 			first = 0;
3242 			pflag = opflag | MSG_NOMARK;
3243 			goto retry;
3244 		}
3245 		goto out_locked;
3246 	}
3247 	case T_UNITDATA_IND: {
3248 		void *addr;
3249 		t_uscalar_t addrlen;
3250 		void *abuf;
3251 		t_uscalar_t optlen;
3252 		void *opt;
3253 
3254 		if ((so->so_state &
3255 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3256 		    (uiop->uio_resid != saved_resid) &&
3257 		    !(flags & MSG_PEEK)) {
3258 			sorecv_update_oobstate(so);
3259 		}
3260 
3261 		if (namelen != 0) {
3262 			/* Caller wants source address */
3263 			addrlen = tpr->unitdata_ind.SRC_length;
3264 			addr = sogetoff(mp,
3265 			    tpr->unitdata_ind.SRC_offset,
3266 			    addrlen, 1);
3267 			if (addr == NULL) {
3268 				freemsg(mp);
3269 				error = EPROTO;
3270 				eprintsoline(so, error);
3271 				goto out;
3272 			}
3273 			if (so->so_family == AF_UNIX) {
3274 				/*
3275 				 * Can not use the transport level address.
3276 				 * If there is a SO_SRCADDR option carrying
3277 				 * the socket level address it will be
3278 				 * extracted below.
3279 				 */
3280 				addr = NULL;
3281 				addrlen = 0;
3282 			}
3283 		}
3284 		optlen = tpr->unitdata_ind.OPT_length;
3285 		if (optlen != 0) {
3286 			t_uscalar_t ncontrollen;
3287 
3288 			/*
3289 			 * Extract any source address option.
3290 			 * Determine how large cmsg buffer is needed.
3291 			 */
3292 			opt = sogetoff(mp,
3293 			    tpr->unitdata_ind.OPT_offset,
3294 			    optlen, __TPI_ALIGN_SIZE);
3295 
3296 			if (opt == NULL) {
3297 				freemsg(mp);
3298 				error = EPROTO;
3299 				eprintsoline(so, error);
3300 				goto out;
3301 			}
3302 			if (so->so_family == AF_UNIX)
3303 				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3304 			ncontrollen = so_cmsglen(mp, opt, optlen,
3305 			    !(flags & MSG_XPG4_2));
			/*
			 * If the caller asked for control data switch to the
			 * length computed above; otherwise flag that control
			 * data is being dropped.
			 */
3306 			if (controllen != 0)
3307 				controllen = ncontrollen;
3308 			else if (ncontrollen != 0)
3309 				msg->msg_flags |= MSG_CTRUNC;
3310 		} else {
3311 			controllen = 0;
3312 		}
3313 
3314 		if (namelen != 0) {
3315 			/*
3316 			 * Return address to caller.
3317 			 * Caller handles truncation if length
3318 			 * exceeds msg_namelen.
3319 			 * NOTE: AF_UNIX NUL termination is ensured by
3320 			 * the sender's copyin_name().
3321 			 */
3322 			abuf = kmem_alloc(addrlen, KM_SLEEP);
3323 
3324 			bcopy(addr, abuf, addrlen);
3325 			msg->msg_name = abuf;
3326 			msg->msg_namelen = addrlen;
3327 		}
3328 
3329 		if (controllen != 0) {
3330 			/*
3331 			 * Return control msg to caller.
3332 			 * Caller handles truncation if length
3333 			 * exceeds msg_controllen.
3334 			 */
3335 			control = kmem_zalloc(controllen, KM_SLEEP);
3336 
3337 			error = so_opt2cmsg(mp, opt, optlen,
3338 			    !(flags & MSG_XPG4_2),
3339 			    control, controllen);
3340 			if (error) {
				/*
				 * Undo the name allocation made above.
				 * NOTE(review): msg_name/msg_namelen still
				 * refer to the freed buffer afterwards;
				 * presumably callers ignore them when an
				 * error is returned -- confirm.
				 */
3341 				freemsg(mp);
3342 				if (msg->msg_namelen != 0)
3343 					kmem_free(msg->msg_name,
3344 					    msg->msg_namelen);
3345 				kmem_free(control, controllen);
3346 				eprintsoline(so, error);
3347 				goto out;
3348 			}
3349 			msg->msg_control = control;
3350 			msg->msg_controllen = controllen;
3351 		}
3352 
3353 		freemsg(mp);
3354 		goto out;
3355 	}
3356 	case T_OPTDATA_IND: {
3357 		struct T_optdata_req *tdr;
3358 		void *opt;
3359 		t_uscalar_t optlen;
3360 
3361 		if ((so->so_state &
3362 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3363 		    (uiop->uio_resid != saved_resid) &&
3364 		    !(flags & MSG_PEEK)) {
3365 			sorecv_update_oobstate(so);
3366 		}
3367 
		/*
		 * NOTE(review): the indication is viewed through the
		 * T_optdata_req type here; this presumably relies on the
		 * request and indication sharing one layout -- confirm.
		 */
3368 		tdr = (struct T_optdata_req *)mp->b_rptr;
3369 		optlen = tdr->OPT_length;
3370 		if (optlen != 0) {
3371 			t_uscalar_t ncontrollen;
3372 			/*
3373 			 * Determine how large cmsg buffer is needed.
3374 			 */
3375 			opt = sogetoff(mp,
3376 			    tpr->optdata_ind.OPT_offset,
3377 			    optlen, __TPI_ALIGN_SIZE);
3378 
3379 			if (opt == NULL) {
3380 				freemsg(mp);
3381 				error = EPROTO;
3382 				eprintsoline(so, error);
3383 				goto out;
3384 			}
3385 
3386 			ncontrollen = so_cmsglen(mp, opt, optlen,
3387 			    !(flags & MSG_XPG4_2));
3388 			if (controllen != 0)
3389 				controllen = ncontrollen;
3390 			else if (ncontrollen != 0)
3391 				msg->msg_flags |= MSG_CTRUNC;
3392 		} else {
3393 			controllen = 0;
3394 		}
3395 
3396 		if (controllen != 0) {
3397 			/*
3398 			 * Return control msg to caller.
3399 			 * Caller handles truncation if length
3400 			 * exceeds msg_controllen.
3401 			 */
3402 			control = kmem_zalloc(controllen, KM_SLEEP);
3403 
3404 			error = so_opt2cmsg(mp, opt, optlen,
3405 			    !(flags & MSG_XPG4_2),
3406 			    control, controllen);
3407 			if (error) {
3408 				freemsg(mp);
3409 				kmem_free(control, controllen);
3410 				eprintsoline(so, error);
3411 				goto out;
3412 			}
3413 			msg->msg_control = control;
3414 			msg->msg_controllen = controllen;
3415 		}
3416 
3417 		/*
3418 		 * Set msg_flags to MSG_EOR based on
3419 		 * DATA_flag and MOREDATA.
3420 		 */
3421 		mutex_enter(&so->so_lock);
3422 		so->so_state &= ~SS_SAVEDEOR;
		/*
		 * NOTE(review): the flag is read through the data_ind view
		 * (MORE_flag) although the comment above names DATA_flag;
		 * presumably both occupy the same position -- confirm.
		 */
3423 		if (!(tpr->data_ind.MORE_flag & 1)) {
3424 			if (!(rval.r_val1 & MOREDATA))
3425 				msg->msg_flags |= MSG_EOR;
3426 			else
3427 				so->so_state |= SS_SAVEDEOR;
3428 		}
3429 		freemsg(mp);
3430 		/*
3431 		 * If some data was received (i.e. not EOF) and the
3432 		 * read/recv* has not been satisfied wait for some more.
3433 		 * Not possible to wait if control info was received.
3434 		 */
3435 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3436 		    controllen == 0 &&
3437 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3438 			mutex_exit(&so->so_lock);
3439 			first = 0;
3440 			pflag = opflag | MSG_NOMARK;
3441 			goto retry;
3442 		}
3443 		goto out_locked;
3444 	}
3445 	case T_EXDATA_IND: {
3446 		dprintso(so, 1,
3447 		    ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3448 		    "state %s\n",
3449 		    so->so_oobsigcnt, so->so_oobcnt,
3450 		    saved_resid - uiop->uio_resid,
3451 		    pr_state(so->so_state, so->so_mode)));
3452 		/*
3453 		 * kstrgetmsg handles MSGMARK so there is nothing to
3454 		 * inspect in the T_EXDATA_IND.
3455 		 * strsock_proto makes the stream head queue the T_EXDATA_IND
3456 		 * as a separate message with no M_DATA component. Furthermore,
3457 		 * the stream head does not consolidate M_DATA messages onto
3458 		 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3459 		 * remains a message by itself. This is needed since MSGMARK
3460 		 * marks both the whole message as well as the last byte
3461 		 * of the message.
3462 		 */
3463 		freemsg(mp);
3464 		ASSERT(uiop->uio_resid == saved_resid);	/* No data */
3465 		if (flags & MSG_PEEK) {
3466 			/*
3467 			 * Even though we are peeking we consume the
3468 			 * T_EXDATA_IND thereby moving the mark information
3469 			 * to SS_RCVATMARK. Then the oob code below will
3470 			 * retry the peeking kstrgetmsg.
3471 			 * Note that the stream head read queue is
3472 			 * never flushed without holding SOREADLOCKED
3473 			 * thus the T_EXDATA_IND can not disappear
3474 			 * underneath us.
3475 			 */
3476 			dprintso(so, 1,
3477 			    ("sotpi_recvmsg: consume EXDATA_IND "
3478 			    "counts %d/%d state %s\n",
3479 			    so->so_oobsigcnt,
3480 			    so->so_oobcnt,
3481 			    pr_state(so->so_state, so->so_mode)));
3482 
			/* Same flags as the initial setup but without MSG_IPEEK */
3483 			pflag = MSG_ANY | MSG_DELAYERROR;
3484 			if (so->so_mode & SM_ATOMIC)
3485 				pflag |= MSG_DISCARDTAIL;
3486 
3487 			pri = 0;
3488 			mp = NULL;
3489 
3490 			error = kstrgetmsg(SOTOV(so), &mp, uiop,
3491 			    &pri, &pflag, (clock_t)-1, &rval);
3492 			ASSERT(uiop->uio_resid == saved_resid);
3493 
3494 			if (error) {
3495 #ifdef SOCK_DEBUG
3496 				if (error != EWOULDBLOCK && error != EINTR) {
3497 					eprintsoline(so, error);
3498 				}
3499 #endif /* SOCK_DEBUG */
3500 				goto out;
3501 			}
3502 			ASSERT(mp);
3503 			tpr = (union T_primitives *)mp->b_rptr;
3504 			ASSERT(tpr->type == T_EXDATA_IND);
3505 			freemsg(mp);
3506 		} /* end "if (flags & MSG_PEEK)" */
3507 
3508 		/*
3509 		 * Decrement the number of queued and pending oob.
3510 		 *
3511 		 * SS_RCVATMARK is cleared when we read past a mark.
3512 		 * SS_HAVEOOBDATA is cleared when we've read past the
3513 		 * last mark.
3514 		 * SS_OOBPEND is cleared if we've read past the last
3515 		 * mark and no (new) SIGURG has been posted.
3516 		 */
3517 		mutex_enter(&so->so_lock);
3518 		ASSERT(so_verify_oobstate(so));
3519 		ASSERT(so->so_oobsigcnt >= so->so_oobcnt);
3520 		ASSERT(so->so_oobsigcnt > 0);
3521 		so->so_oobsigcnt--;
3522 		ASSERT(so->so_oobcnt > 0);
3523 		so->so_oobcnt--;
3524 		/*
3525 		 * Since the T_EXDATA_IND has been removed from the stream
3526 		 * head, but we have not read data past the mark,
3527 		 * sockfs needs to track that the socket is still at the mark.
3528 		 *
3529 		 * Since no data was received call kstrgetmsg again to wait
3530 		 * for data.
3531 		 */
3532 		so->so_state |= SS_RCVATMARK;
3533 		mutex_exit(&so->so_lock);
3534 		dprintso(so, 1,
3535 		    ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3536 		    so->so_oobsigcnt, so->so_oobcnt,
3537 		    pr_state(so->so_state, so->so_mode)));
		/* Restore the initial flags (including any MSG_IPEEK) */
3538 		pflag = opflag;
3539 		goto retry;
3540 	}
3541 	default:
3542 		ASSERT(0);
3543 		freemsg(mp);
3544 		error = EPROTO;
3545 		eprintsoline(so, error);
3546 		goto out;
3547 	}
3548 	/* NOTREACHED */
	/*
	 * "out" is entered without so_lock held; "out_locked" is entered
	 * with so_lock already held.
	 */
3549 out:
3550 	mutex_enter(&so->so_lock);
3551 out_locked:
3552 	if (sodp != NULL) {
3553 		/* Finish any sodirect and uioa processing */
3554 		mutex_enter(sodp->sod_lockp);
3555 		if (suiop != NULL) {
3556 			/* Finish any uioa_t processing */
3557 			int ret;
3558 
3559 			ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
3560 			ret = uioafini(suiop, (uioa_t *)uiop);
3561 			if (error == 0 && ret != 0) {
3562 				/* If no error yet, set it */
3563 				error = ret;
3564 			}
			/* Release any mblk chain still attached at sod_uioafh */
3565 			if ((mp = sodp->sod_uioafh) != NULL) {
3566 				sodp->sod_uioafh = NULL;
3567 				sodp->sod_uioaft = NULL;
3568 				freemsg(mp);
3569 			}
3570 		}
3571 		ASSERT(sodp->sod_uioafh == NULL);
3572 		if (!(sodp->sod_state & SOD_WAKE_NOT)) {
3573 			/* Awoke */
3574 			sodp->sod_state &= SOD_WAKE_CLR;
3575 			sodp->sod_state |= SOD_WAKE_NOT;
3576 		}
3577 		/* Last, clear sod_want value */
3578 		sodp->sod_want = 0;
3579 		mutex_exit(sodp->sod_lockp);
3580 	}
3581 	so_unlock_read(so);	/* Clear SOREADLOCKED */
3582 	mutex_exit(&so->so_lock);
3583 	return (error);
3584 }
3585 
3586 /*
3587  * Sending data with options on a datagram socket.
3588  * Assumes caller has verified that SS_ISBOUND etc. are set.
3589  */
3590 static int
3591 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3592     struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3593 {
3594 	struct T_unitdata_req	tudr;
3595 	mblk_t			*mp;
3596 	int			error;
3597 	void			*addr;
3598 	socklen_t		addrlen;
3599 	void			*src;
3600 	socklen_t		srclen;
3601 	ssize_t			len;
3602 	int			size;
3603 	struct T_opthdr		toh;
3604 	struct fdbuf		*fdbuf;
3605 	t_uscalar_t		optlen;
3606 	void			*fds;
3607 	int			fdlen;
3608 
3609 	ASSERT(name && namelen);
3610 	ASSERT(control && controllen);
3611 
3612 	len = uiop->uio_resid;
3613 	if (len > (ssize_t)so->so_tidu_size) {
3614 		return (EMSGSIZE);
3615 	}
3616 
3617 	/*
3618 	 * For AF_UNIX the destination address is translated to an internal
3619 	 * name and the source address is passed as an option.
3620 	 * Also, file descriptors are passed as file pointers in an
3621 	 * option.
3622 	 */
3623 
3624 	/*
3625 	 * Length and family checks.
3626 	 */
3627 	error = so_addr_verify(so, name, namelen);
3628 	if (error) {
3629 		eprintsoline(so, error);
3630 		return (error);
3631 	}
3632 	if (so->so_family == AF_UNIX) {
3633 		if (so->so_state & SS_FADDR_NOXLATE) {
3634 			/*
3635 			 * Already have a transport internal address. Do not
3636 			 * pass any (transport internal) source address.
3637 			 */
3638 			addr = name;
3639 			addrlen = namelen;
3640 			src = NULL;
3641 			srclen = 0;
3642 		} else {
3643 			/*
3644 			 * Pass the sockaddr_un source address as an option
3645 			 * and translate the remote address.
3646 			 *
3647 			 * Note that this code does not prevent so_laddr_sa
3648 			 * from changing while it is being used. Thus
3649 			 * if an unbind+bind occurs concurrently with this
3650 			 * send the peer might see a partially new and a
3651 			 * partially old "from" address.
3652 			 */
3653 			src = so->so_laddr_sa;
3654 			srclen = (t_uscalar_t)so->so_laddr_len;
3655 			dprintso(so, 1,
3656 			    ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3657 			    srclen, src));
3658 			error = so_ux_addr_xlate(so, name, namelen,
3659 			    (flags & MSG_XPG4_2),
3660 			    &addr, &addrlen);
3661 			if (error) {
3662 				eprintsoline(so, error);
3663 				return (error);
3664 			}
3665 		}
3666 	} else {
3667 		addr = name;
3668 		addrlen = namelen;
3669 		src = NULL;
3670 		srclen = 0;
3671 	}
3672 	optlen = so_optlen(control, controllen,
3673 	    !(flags & MSG_XPG4_2));
3674 	tudr.PRIM_type = T_UNITDATA_REQ;
3675 	tudr.DEST_length = addrlen;
3676 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3677 	if (srclen != 0)
3678 		tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3679 		    _TPI_ALIGN_TOPT(srclen));
3680 	else
3681 		tudr.OPT_length = optlen;
3682 	tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3683 	    _TPI_ALIGN_TOPT(addrlen));
3684 
3685 	size = tudr.OPT_offset + tudr.OPT_length;
3686 
3687 	/*
3688 	 * File descriptors only when SM_FDPASSING set.
3689 	 */
3690 	error = so_getfdopt(control, controllen,
3691 	    !(flags & MSG_XPG4_2), &fds, &fdlen);
3692 	if (error)
3693 		return (error);
3694 	if (fdlen != -1) {
3695 		if (!(so->so_mode & SM_FDPASSING))
3696 			return (EOPNOTSUPP);
3697 
3698 		error = fdbuf_create(fds, fdlen, &fdbuf);
3699 		if (error)
3700 			return (error);
3701 		mp = fdbuf_allocmsg(size, fdbuf);
3702 	} else {
3703 		mp = soallocproto(size, _ALLOC_INTR);
3704 		if (mp == NULL) {
3705 			/*
3706 			 * Caught a signal waiting for memory.
3707 			 * Let send* return EINTR.
3708 			 */
3709 			return (EINTR);
3710 		}
3711 	}
3712 	soappendmsg(mp, &tudr, sizeof (tudr));
3713 	soappendmsg(mp, addr, addrlen);
3714 	mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3715 
3716 	if (fdlen != -1) {
3717 		ASSERT(fdbuf != NULL);
3718 		toh.level = SOL_SOCKET;
3719 		toh.name = SO_FILEP;
3720 		toh.len = fdbuf->fd_size +
3721 		    (t_uscalar_t)sizeof (struct T_opthdr);
3722 		toh.status = 0;
3723 		soappendmsg(mp, &toh, sizeof (toh));
3724 		soappendmsg(mp, fdbuf, fdbuf->fd_size);
3725 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3726 	}
3727 	if (srclen != 0) {
3728 		/*
3729 		 * There is a AF_UNIX sockaddr_un to include as a source
3730 		 * address option.
3731 		 */
3732 		toh.level = SOL_SOCKET;
3733 		toh.name = SO_SRCADDR;
3734 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3735 		toh.status = 0;
3736 		soappendmsg(mp, &toh, sizeof (toh));
3737 		soappendmsg(mp, src, srclen);
3738 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3739 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3740 	}
3741 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3742 	so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3743 	/* At most 3 bytes left in the message */
3744 	ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3745 	ASSERT(MBLKL(mp) <= (ssize_t)size);
3746 
3747 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3748 	if (audit_active)
3749 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3750 
3751 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3752 #ifdef SOCK_DEBUG
3753 	if (error) {
3754 		eprintsoline(so, error);
3755 	}
3756 #endif /* SOCK_DEBUG */
3757 	return (error);
3758 }
3759 
3760 /*
3761  * Sending data with options on a connected stream socket.
3762  * Assumes caller has verified that SS_ISCONNECTED is set.
3763  */
3764 static int
3765 sosend_svccmsg(struct sonode *so,
3766 		struct uio *uiop,
3767 		int more,
3768 		void *control,
3769 		t_uscalar_t controllen,
3770 		int flags)
3771 {
3772 	struct T_optdata_req	tdr;
3773 	mblk_t			*mp;
3774 	int			error;
3775 	ssize_t			iosize;
3776 	int			first = 1;
3777 	int			size;
3778 	struct fdbuf		*fdbuf;
3779 	t_uscalar_t		optlen;
3780 	void			*fds;
3781 	int			fdlen;
3782 	struct T_opthdr		toh;
3783 
3784 	dprintso(so, 1,
3785 	    ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3786 
3787 	/*
3788 	 * Has to be bound and connected. However, since no locks are
3789 	 * held the state could have changed after sotpi_sendmsg checked it
3790 	 * thus it is not possible to ASSERT on the state.
3791 	 */
3792 
3793 	/* Options on connection-oriented only when SM_OPTDATA set. */
3794 	if (!(so->so_mode & SM_OPTDATA))
3795 		return (EOPNOTSUPP);
3796 
3797 	do {
3798 		/*
3799 		 * Set the MORE flag if uio_resid does not fit in this
3800 		 * message or if the caller passed in "more".
3801 		 * Error for transports with zero tidu_size.
3802 		 */
3803 		tdr.PRIM_type = T_OPTDATA_REQ;
3804 		iosize = so->so_tidu_size;
3805 		if (iosize <= 0)
3806 			return (EMSGSIZE);
3807 		if (uiop->uio_resid > iosize) {
3808 			tdr.DATA_flag = 1;
3809 		} else {
3810 			if (more)
3811 				tdr.DATA_flag = 1;
3812 			else
3813 				tdr.DATA_flag = 0;
3814 			iosize = uiop->uio_resid;
3815 		}
3816 		dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3817 		    tdr.DATA_flag, iosize));
3818 
3819 		optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3820 		tdr.OPT_length = optlen;
3821 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3822 
3823 		size = (int)sizeof (tdr) + optlen;
3824 		/*
3825 		 * File descriptors only when SM_FDPASSING set.
3826 		 */
3827 		error = so_getfdopt(control, controllen,
3828 		    !(flags & MSG_XPG4_2), &fds, &fdlen);
3829 		if (error)
3830 			return (error);
3831 		if (fdlen != -1) {
3832 			if (!(so->so_mode & SM_FDPASSING))
3833 				return (EOPNOTSUPP);
3834 
3835 			error = fdbuf_create(fds, fdlen, &fdbuf);
3836 			if (error)
3837 				return (error);
3838 			mp = fdbuf_allocmsg(size, fdbuf);
3839 		} else {
3840 			mp = soallocproto(size, _ALLOC_INTR);
3841 			if (mp == NULL) {
3842 				/*
3843 				 * Caught a signal waiting for memory.
3844 				 * Let send* return EINTR.
3845 				 */
3846 				return (first ? EINTR : 0);
3847 			}
3848 		}
3849 		soappendmsg(mp, &tdr, sizeof (tdr));
3850 
3851 		if (fdlen != -1) {
3852 			ASSERT(fdbuf != NULL);
3853 			toh.level = SOL_SOCKET;
3854 			toh.name = SO_FILEP;
3855 			toh.len = fdbuf->fd_size +
3856 			    (t_uscalar_t)sizeof (struct T_opthdr);
3857 			toh.status = 0;
3858 			soappendmsg(mp, &toh, sizeof (toh));
3859 			soappendmsg(mp, fdbuf, fdbuf->fd_size);
3860 			ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3861 		}
3862 		so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3863 		/* At most 3 bytes left in the message */
3864 		ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3865 		ASSERT(MBLKL(mp) <= (ssize_t)size);
3866 
3867 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3868 
3869 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3870 		    0, MSG_BAND, 0);
3871 		if (error) {
3872 			if (!first && error == EWOULDBLOCK)
3873 				return (0);
3874 			eprintsoline(so, error);
3875 			return (error);
3876 		}
3877 		control = NULL;
3878 		first = 0;
3879 		if (uiop->uio_resid > 0) {
3880 			/*
3881 			 * Recheck for fatal errors. Fail write even though
3882 			 * some data have been written. This is consistent
3883 			 * with strwrite semantics and BSD sockets semantics.
3884 			 */
3885 			if (so->so_state & SS_CANTSENDMORE) {
3886 				tsignal(curthread, SIGPIPE);
3887 				eprintsoline(so, error);
3888 				return (EPIPE);
3889 			}
3890 			if (so->so_error != 0) {
3891 				mutex_enter(&so->so_lock);
3892 				error = sogeterr(so);
3893 				mutex_exit(&so->so_lock);
3894 				if (error != 0) {
3895 					eprintsoline(so, error);
3896 					return (error);
3897 				}
3898 			}
3899 		}
3900 	} while (uiop->uio_resid > 0);
3901 	return (0);
3902 }
3903 
3904 /*
3905  * Sending data on a datagram socket.
3906  * Assumes caller has verified that SS_ISBOUND etc. are set.
3907  *
3908  * For AF_UNIX the destination address is translated to an internal
3909  * name and the source address is passed as an option.
3910  */
3911 int
3912 sosend_dgram(struct sonode *so, struct sockaddr	*name, socklen_t namelen,
3913     struct uio *uiop, int flags)
3914 {
3915 	struct T_unitdata_req	tudr;
3916 	mblk_t			*mp;
3917 	int			error;
3918 	void			*addr;
3919 	socklen_t		addrlen;
3920 	void			*src;
3921 	socklen_t		srclen;
3922 	ssize_t			len;
3923 
3924 	ASSERT(name != NULL && namelen != 0);
3925 
3926 	len = uiop->uio_resid;
3927 	if (len > so->so_tidu_size) {
3928 		error = EMSGSIZE;
3929 		goto done;
3930 	}
3931 
3932 	/* Length and family checks */
3933 	error = so_addr_verify(so, name, namelen);
3934 	if (error != 0)
3935 		goto done;
3936 
3937 	if (so->so_state & SS_DIRECT)
3938 		return (sodgram_direct(so, name, namelen, uiop, flags));
3939 
3940 	if (so->so_family == AF_UNIX) {
3941 		if (so->so_state & SS_FADDR_NOXLATE) {
3942 			/*
3943 			 * Already have a transport internal address. Do not
3944 			 * pass any (transport internal) source address.
3945 			 */
3946 			addr = name;
3947 			addrlen = namelen;
3948 			src = NULL;
3949 			srclen = 0;
3950 		} else {
3951 			/*
3952 			 * Pass the sockaddr_un source address as an option
3953 			 * and translate the remote address.
3954 			 *
3955 			 * Note that this code does not prevent so_laddr_sa
3956 			 * from changing while it is being used. Thus
3957 			 * if an unbind+bind occurs concurrently with this
3958 			 * send the peer might see a partially new and a
3959 			 * partially old "from" address.
3960 			 */
3961 			src = so->so_laddr_sa;
3962 			srclen = (socklen_t)so->so_laddr_len;
3963 			dprintso(so, 1,
3964 			    ("sosend_dgram UNIX: srclen %d, src %p\n",
3965 			    srclen, src));
3966 			error = so_ux_addr_xlate(so, name, namelen,
3967 			    (flags & MSG_XPG4_2),
3968 			    &addr, &addrlen);
3969 			if (error) {
3970 				eprintsoline(so, error);
3971 				goto done;
3972 			}
3973 		}
3974 	} else {
3975 		addr = name;
3976 		addrlen = namelen;
3977 		src = NULL;
3978 		srclen = 0;
3979 	}
3980 	tudr.PRIM_type = T_UNITDATA_REQ;
3981 	tudr.DEST_length = addrlen;
3982 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3983 	if (srclen == 0) {
3984 		tudr.OPT_length = 0;
3985 		tudr.OPT_offset = 0;
3986 
3987 		mp = soallocproto2(&tudr, sizeof (tudr),
3988 		    addr, addrlen, 0, _ALLOC_INTR);
3989 		if (mp == NULL) {
3990 			/*
3991 			 * Caught a signal waiting for memory.
3992 			 * Let send* return EINTR.
3993 			 */
3994 			error = EINTR;
3995 			goto done;
3996 		}
3997 	} else {
3998 		/*
3999 		 * There is a AF_UNIX sockaddr_un to include as a source
4000 		 * address option.
4001 		 */
4002 		struct T_opthdr toh;
4003 		ssize_t size;
4004 
4005 		tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
4006 		    _TPI_ALIGN_TOPT(srclen));
4007 		tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
4008 		    _TPI_ALIGN_TOPT(addrlen));
4009 
4010 		toh.level = SOL_SOCKET;
4011 		toh.name = SO_SRCADDR;
4012 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
4013 		toh.status = 0;
4014 
4015 		size = tudr.OPT_offset + tudr.OPT_length;
4016 		mp = soallocproto2(&tudr, sizeof (tudr),
4017 		    addr, addrlen, size, _ALLOC_INTR);
4018 		if (mp == NULL) {
4019 			/*
4020 			 * Caught a signal waiting for memory.
4021 			 * Let send* return EINTR.
4022 			 */
4023 			error = EINTR;
4024 			goto done;
4025 		}
4026 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
4027 		soappendmsg(mp, &toh, sizeof (toh));
4028 		soappendmsg(mp, src, srclen);
4029 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
4030 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
4031 	}
4032 
4033 	if (audit_active)
4034 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4035 
4036 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4037 done:
4038 #ifdef SOCK_DEBUG
4039 	if (error) {
4040 		eprintsoline(so, error);
4041 	}
4042 #endif /* SOCK_DEBUG */
4043 	return (error);
4044 }
4045 
4046 /*
4047  * Sending data on a connected stream socket.
4048  * Assumes caller has verified that SS_ISCONNECTED is set.
4049  */
4050 int
4051 sosend_svc(struct sonode *so,
4052 	struct uio *uiop,
4053 	t_scalar_t prim,
4054 	int more,
4055 	int sflag)
4056 {
4057 	struct T_data_req	tdr;
4058 	mblk_t			*mp;
4059 	int			error;
4060 	ssize_t			iosize;
4061 	int			first = 1;
4062 
4063 	dprintso(so, 1,
4064 	    ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
4065 	    (void *)so, uiop->uio_resid, prim, sflag));
4066 
4067 	/*
4068 	 * Has to be bound and connected. However, since no locks are
4069 	 * held the state could have changed after sotpi_sendmsg checked it
4070 	 * thus it is not possible to ASSERT on the state.
4071 	 */
4072 
4073 	do {
4074 		/*
4075 		 * Set the MORE flag if uio_resid does not fit in this
4076 		 * message or if the caller passed in "more".
4077 		 * Error for transports with zero tidu_size.
4078 		 */
4079 		tdr.PRIM_type = prim;
4080 		iosize = so->so_tidu_size;
4081 		if (iosize <= 0)
4082 			return (EMSGSIZE);
4083 		if (uiop->uio_resid > iosize) {
4084 			tdr.MORE_flag = 1;
4085 		} else {
4086 			if (more)
4087 				tdr.MORE_flag = 1;
4088 			else
4089 				tdr.MORE_flag = 0;
4090 			iosize = uiop->uio_resid;
4091 		}
4092 		dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4093 		    prim, tdr.MORE_flag, iosize));
4094 		mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR);
4095 		if (mp == NULL) {
4096 			/*
4097 			 * Caught a signal waiting for memory.
4098 			 * Let send* return EINTR.
4099 			 */
4100 			if (first)
4101 				return (EINTR);
4102 			else
4103 				return (0);
4104 		}
4105 
4106 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4107 		    0, sflag | MSG_BAND, 0);
4108 		if (error) {
4109 			if (!first && error == EWOULDBLOCK)
4110 				return (0);
4111 			eprintsoline(so, error);
4112 			return (error);
4113 		}
4114 		first = 0;
4115 		if (uiop->uio_resid > 0) {
4116 			/*
4117 			 * Recheck for fatal errors. Fail write even though
4118 			 * some data have been written. This is consistent
4119 			 * with strwrite semantics and BSD sockets semantics.
4120 			 */
4121 			if (so->so_state & SS_CANTSENDMORE) {
4122 				tsignal(curthread, SIGPIPE);
4123 				eprintsoline(so, error);
4124 				return (EPIPE);
4125 			}
4126 			if (so->so_error != 0) {
4127 				mutex_enter(&so->so_lock);
4128 				error = sogeterr(so);
4129 				mutex_exit(&so->so_lock);
4130 				if (error != 0) {
4131 					eprintsoline(so, error);
4132 					return (error);
4133 				}
4134 			}
4135 		}
4136 	} while (uiop->uio_resid > 0);
4137 	return (0);
4138 }
4139 
4140 /*
4141  * Check the state for errors and call the appropriate send function.
4142  *
4143  * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4144  * this function issues a setsockopt to toggle SO_DONTROUTE before and
4145  * after sending the message.
4146  */
4147 static int
4148 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
4149 {
4150 	int		so_state;
4151 	int		so_mode;
4152 	int		error;
4153 	struct sockaddr *name;
4154 	t_uscalar_t	namelen;
4155 	int		dontroute;
4156 	int		flags;
4157 
4158 	dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4159 	    (void *)so, (void *)msg, msg->msg_flags,
4160 	    pr_state(so->so_state, so->so_mode), so->so_error));
4161 
4162 	mutex_enter(&so->so_lock);
4163 	so_state = so->so_state;
4164 
4165 	if (so_state & SS_CANTSENDMORE) {
4166 		mutex_exit(&so->so_lock);
4167 		tsignal(curthread, SIGPIPE);
4168 		return (EPIPE);
4169 	}
4170 
4171 	if (so->so_error != 0) {
4172 		error = sogeterr(so);
4173 		if (error != 0) {
4174 			mutex_exit(&so->so_lock);
4175 			return (error);
4176 		}
4177 	}
4178 
4179 	name = (struct sockaddr *)msg->msg_name;
4180 	namelen = msg->msg_namelen;
4181 
4182 	so_mode = so->so_mode;
4183 
4184 	if (name == NULL) {
4185 		if (!(so_state & SS_ISCONNECTED)) {
4186 			mutex_exit(&so->so_lock);
4187 			if (so_mode & SM_CONNREQUIRED)
4188 				return (ENOTCONN);
4189 			else
4190 				return (EDESTADDRREQ);
4191 		}
4192 		if (so_mode & SM_CONNREQUIRED) {
4193 			name = NULL;
4194 			namelen = 0;
4195 		} else {
4196 			/*
4197 			 * Note that this code does not prevent so_faddr_sa
4198 			 * from changing while it is being used. Thus
4199 			 * if an "unconnect"+connect occurs concurrently with
4200 			 * this send the datagram might be delivered to a
4201 			 * garbaled address.
4202 			 */
4203 			ASSERT(so->so_faddr_sa);
4204 			name = so->so_faddr_sa;
4205 			namelen = (t_uscalar_t)so->so_faddr_len;
4206 		}
4207 	} else {
4208 		if (!(so_state & SS_ISCONNECTED) &&
4209 		    (so_mode & SM_CONNREQUIRED)) {
4210 			/* Required but not connected */
4211 			mutex_exit(&so->so_lock);
4212 			return (ENOTCONN);
4213 		}
4214 		/*
4215 		 * Ignore the address on connection-oriented sockets.
4216 		 * Just like BSD this code does not generate an error for
4217 		 * TCP (a CONNREQUIRED socket) when sending to an address
4218 		 * passed in with sendto/sendmsg. Instead the data is
4219 		 * delivered on the connection as if no address had been
4220 		 * supplied.
4221 		 */
4222 		if ((so_state & SS_ISCONNECTED) &&
4223 		    !(so_mode & SM_CONNREQUIRED)) {
4224 			mutex_exit(&so->so_lock);
4225 			return (EISCONN);
4226 		}
4227 		if (!(so_state & SS_ISBOUND)) {
4228 			so_lock_single(so);	/* Set SOLOCKED */
4229 			error = sotpi_bind(so, NULL, 0,
4230 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD);
4231 			so_unlock_single(so, SOLOCKED);
4232 			if (error) {
4233 				mutex_exit(&so->so_lock);
4234 				eprintsoline(so, error);
4235 				return (error);
4236 			}
4237 		}
4238 		/*
4239 		 * Handle delayed datagram errors. These are only queued
4240 		 * when the application sets SO_DGRAM_ERRIND.
4241 		 * Return the error if we are sending to the address
4242 		 * that was returned in the last T_UDERROR_IND.
4243 		 * If sending to some other address discard the delayed
4244 		 * error indication.
4245 		 */
4246 		if (so->so_delayed_error) {
4247 			struct T_uderror_ind	*tudi;
4248 			void			*addr;
4249 			t_uscalar_t		addrlen;
4250 			boolean_t		match = B_FALSE;
4251 
4252 			ASSERT(so->so_eaddr_mp);
4253 			error = so->so_delayed_error;
4254 			so->so_delayed_error = 0;
4255 			tudi = (struct T_uderror_ind *)so->so_eaddr_mp->b_rptr;
4256 			addrlen = tudi->DEST_length;
4257 			addr = sogetoff(so->so_eaddr_mp,
4258 			    tudi->DEST_offset,
4259 			    addrlen, 1);
4260 			ASSERT(addr);	/* Checked by strsock_proto */
4261 			switch (so->so_family) {
4262 			case AF_INET: {
4263 				/* Compare just IP address and port */
4264 				sin_t *sin1 = (sin_t *)name;
4265 				sin_t *sin2 = (sin_t *)addr;
4266 
4267 				if (addrlen == sizeof (sin_t) &&
4268 				    namelen == addrlen &&
4269 				    sin1->sin_port == sin2->sin_port &&
4270 				    sin1->sin_addr.s_addr ==
4271 				    sin2->sin_addr.s_addr)
4272 					match = B_TRUE;
4273 				break;
4274 			}
4275 			case AF_INET6: {
4276 				/* Compare just IP address and port. Not flow */
4277 				sin6_t *sin1 = (sin6_t *)name;
4278 				sin6_t *sin2 = (sin6_t *)addr;
4279 
4280 				if (addrlen == sizeof (sin6_t) &&
4281 				    namelen == addrlen &&
4282 				    sin1->sin6_port == sin2->sin6_port &&
4283 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4284 				    &sin2->sin6_addr))
4285 					match = B_TRUE;
4286 				break;
4287 			}
4288 			case AF_UNIX:
4289 			default:
4290 				if (namelen == addrlen &&
4291 				    bcmp(name, addr, namelen) == 0)
4292 					match = B_TRUE;
4293 			}
4294 			if (match) {
4295 				freemsg(so->so_eaddr_mp);
4296 				so->so_eaddr_mp = NULL;
4297 				mutex_exit(&so->so_lock);
4298 #ifdef DEBUG
4299 				dprintso(so, 0,
4300 				    ("sockfs delayed error %d for %s\n",
4301 				    error,
4302 				    pr_addr(so->so_family, name, namelen)));
4303 #endif /* DEBUG */
4304 				return (error);
4305 			}
4306 			freemsg(so->so_eaddr_mp);
4307 			so->so_eaddr_mp = NULL;
4308 		}
4309 	}
4310 	mutex_exit(&so->so_lock);
4311 
4312 	flags = msg->msg_flags;
4313 	dontroute = 0;
4314 	if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4315 		uint32_t	val;
4316 
4317 		val = 1;
4318 		error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4319 		    &val, (t_uscalar_t)sizeof (val));
4320 		if (error)
4321 			return (error);
4322 		dontroute = 1;
4323 	}
4324 
4325 	if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4326 		error = EOPNOTSUPP;
4327 		goto done;
4328 	}
4329 	if (msg->msg_controllen != 0) {
4330 		if (!(so_mode & SM_CONNREQUIRED)) {
4331 			error = sosend_dgramcmsg(so, name, namelen, uiop,
4332 			    msg->msg_control, msg->msg_controllen, flags);
4333 		} else {
4334 			if (flags & MSG_OOB) {
4335 				/* Can't generate T_EXDATA_REQ with options */
4336 				error = EOPNOTSUPP;
4337 				goto done;
4338 			}
4339 			error = sosend_svccmsg(so, uiop,
4340 			    !(flags & MSG_EOR),
4341 			    msg->msg_control, msg->msg_controllen,
4342 			    flags);
4343 		}
4344 		goto done;
4345 	}
4346 
4347 	if (!(so_mode & SM_CONNREQUIRED)) {
4348 		/*
4349 		 * If there is no SO_DONTROUTE to turn off return immediately
4350 		 * from send_dgram. This can allow tail-call optimizations.
4351 		 */
4352 		if (!dontroute) {
4353 			return (sosend_dgram(so, name, namelen, uiop, flags));
4354 		}
4355 		error = sosend_dgram(so, name, namelen, uiop, flags);
4356 	} else {
4357 		t_scalar_t prim;
4358 		int sflag;
4359 
4360 		/* Ignore msg_name in the connected state */
4361 		if (flags & MSG_OOB) {
4362 			prim = T_EXDATA_REQ;
4363 			/*
4364 			 * Send down T_EXDATA_REQ even if there is flow
4365 			 * control for data.
4366 			 */
4367 			sflag = MSG_IGNFLOW;
4368 		} else {
4369 			if (so_mode & SM_BYTESTREAM) {
4370 				/* Byte stream transport - use write */
4371 
4372 				dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4373 				/*
4374 				 * If there is no SO_DONTROUTE to turn off,
4375 				 * SS_DIRECT is on, and there is no flow
4376 				 * control, we can take the fast path.
4377 				 */
4378 				if (!dontroute &&
4379 				    (so_state & SS_DIRECT) &&
4380 				    canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4381 					return (sostream_direct(so, uiop,
4382 					    NULL, CRED()));
4383 				}
4384 				error = strwrite(SOTOV(so), uiop, CRED());
4385 				goto done;
4386 			}
4387 			prim = T_DATA_REQ;
4388 			sflag = 0;
4389 		}
4390 		/*
4391 		 * If there is no SO_DONTROUTE to turn off return immediately
4392 		 * from sosend_svc. This can allow tail-call optimizations.
4393 		 */
4394 		if (!dontroute)
4395 			return (sosend_svc(so, uiop, prim,
4396 			    !(flags & MSG_EOR), sflag));
4397 		error = sosend_svc(so, uiop, prim,
4398 		    !(flags & MSG_EOR), sflag);
4399 	}
4400 	ASSERT(dontroute);
4401 done:
4402 	if (dontroute) {
4403 		uint32_t	val;
4404 
4405 		val = 0;
4406 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4407 		    &val, (t_uscalar_t)sizeof (val));
4408 	}
4409 	return (error);
4410 }
4411 
4412 /*
4413  * Sending data on a datagram socket.
4414  * Assumes caller has verified that SS_ISBOUND etc. are set.
4415  */
4416 /* ARGSUSED */
4417 static int
4418 sodgram_direct(struct sonode *so, struct sockaddr *name,
4419     socklen_t namelen, struct uio *uiop, int flags)
4420 {
4421 	struct T_unitdata_req	tudr;
4422 	mblk_t			*mp = NULL;
4423 	int			error = 0;
4424 	void			*addr;
4425 	socklen_t		addrlen;
4426 	ssize_t			len;
4427 	struct stdata		*stp = SOTOV(so)->v_stream;
4428 	int			so_state;
4429 	queue_t			*udp_wq;
4430 	boolean_t		connected;
4431 	mblk_t			*mpdata = NULL;
4432 
4433 	ASSERT(name != NULL && namelen != 0);
4434 	ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4435 	ASSERT(!(so->so_mode & SM_EXDATA));
4436 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4437 	ASSERT(SOTOV(so)->v_type == VSOCK);
4438 
4439 	/* Caller checked for proper length */
4440 	len = uiop->uio_resid;
4441 	ASSERT(len <= so->so_tidu_size);
4442 
4443 	/* Length and family checks have been done by caller */
4444 	ASSERT(name->sa_family == so->so_family);
4445 	ASSERT(so->so_family == AF_INET ||
4446 	    (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4447 	ASSERT(so->so_family == AF_INET6 ||
4448 	    (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4449 
4450 	addr = name;
4451 	addrlen = namelen;
4452 
4453 	if (stp->sd_sidp != NULL &&
4454 	    (error = straccess(stp, JCWRITE)) != 0)
4455 		goto done;
4456 
4457 	so_state = so->so_state;
4458 
4459 	connected = so_state & SS_ISCONNECTED;
4460 	if (!connected) {
4461 		tudr.PRIM_type = T_UNITDATA_REQ;
4462 		tudr.DEST_length = addrlen;
4463 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4464 		tudr.OPT_length = 0;
4465 		tudr.OPT_offset = 0;
4466 
4467 		mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4468 		    _ALLOC_INTR);
4469 		if (mp == NULL) {
4470 			/*
4471 			 * Caught a signal waiting for memory.
4472 			 * Let send* return EINTR.
4473 			 */
4474 			error = EINTR;
4475 			goto done;
4476 		}
4477 	}
4478 
4479 	/*
4480 	 * For UDP we don't break up the copyin into smaller pieces
4481 	 * as in the TCP case.  That means if ENOMEM is returned by
4482 	 * mcopyinuio() then the uio vector has not been modified at
4483 	 * all and we fallback to either strwrite() or kstrputmsg()
4484 	 * below.  Note also that we never generate priority messages
4485 	 * from here.
4486 	 */
4487 	udp_wq = stp->sd_wrq->q_next;
4488 	if (canput(udp_wq) &&
4489 	    (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4490 		ASSERT(DB_TYPE(mpdata) == M_DATA);
4491 		ASSERT(uiop->uio_resid == 0);
4492 		if (!connected)
4493 			linkb(mp, mpdata);
4494 		else
4495 			mp = mpdata;
4496 		if (audit_active)
4497 			audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4498 
4499 		udp_wput(udp_wq, mp);
4500 		return (0);
4501 	}
4502 
4503 	ASSERT(mpdata == NULL);
4504 	if (error != 0 && error != ENOMEM) {
4505 		freemsg(mp);
4506 		return (error);
4507 	}
4508 
4509 	/*
4510 	 * For connected, let strwrite() handle the blocking case.
4511 	 * Otherwise we fall thru and use kstrputmsg().
4512 	 */
4513 	if (connected)
4514 		return (strwrite(SOTOV(so), uiop, CRED()));
4515 
4516 	if (audit_active)
4517 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4518 
4519 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4520 done:
4521 #ifdef SOCK_DEBUG
4522 	if (error != 0) {
4523 		eprintsoline(so, error);
4524 	}
4525 #endif /* SOCK_DEBUG */
4526 	return (error);
4527 }
4528 
4529 int
4530 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4531 {
4532 	struct stdata *stp = SOTOV(so)->v_stream;
4533 	ssize_t iosize, rmax, maxblk;
4534 	queue_t *tcp_wq = stp->sd_wrq->q_next;
4535 	mblk_t *newmp;
4536 	int error = 0, wflag = 0;
4537 
4538 	ASSERT(so->so_mode & SM_BYTESTREAM);
4539 	ASSERT(SOTOV(so)->v_type == VSOCK);
4540 
4541 	if (stp->sd_sidp != NULL &&
4542 	    (error = straccess(stp, JCWRITE)) != 0)
4543 		return (error);
4544 
4545 	if (uiop == NULL) {
4546 		/*
4547 		 * kstrwritemp() should have checked sd_flag and
4548 		 * flow-control before coming here.  If we end up
4549 		 * here it means that we can simply pass down the
4550 		 * data to tcp.
4551 		 */
4552 		ASSERT(mp != NULL);
4553 		if (stp->sd_wputdatafunc != NULL) {
4554 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4555 			    NULL, NULL, NULL);
4556 			if (newmp == NULL) {
4557 				/* The caller will free mp */
4558 				return (ECOMM);
4559 			}
4560 			mp = newmp;
4561 		}
4562 		tcp_wput(tcp_wq, mp);
4563 		return (0);
4564 	}
4565 
4566 	/* Fallback to strwrite() to do proper error handling */
4567 	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4568 		return (strwrite(SOTOV(so), uiop, cr));
4569 
4570 	rmax = stp->sd_qn_maxpsz;
4571 	ASSERT(rmax >= 0 || rmax == INFPSZ);
4572 	if (rmax == 0 || uiop->uio_resid <= 0)
4573 		return (0);
4574 
4575 	if (rmax == INFPSZ)
4576 		rmax = uiop->uio_resid;
4577 
4578 	maxblk = stp->sd_maxblk;
4579 
4580 	for (;;) {
4581 		iosize = MIN(uiop->uio_resid, rmax);
4582 
4583 		mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4584 		if (mp == NULL) {
4585 			/*
4586 			 * Fallback to strwrite() for ENOMEM; if this
4587 			 * is our first time in this routine and the uio
4588 			 * vector has not been modified, we will end up
4589 			 * calling strwrite() without any flag set.
4590 			 */
4591 			if (error == ENOMEM)
4592 				goto slow_send;
4593 			else
4594 				return (error);
4595 		}
4596 		ASSERT(uiop->uio_resid >= 0);
4597 		/*
4598 		 * If mp is non-NULL and ENOMEM is set, it means that
4599 		 * mcopyinuio() was able to break down some of the user
4600 		 * data into one or more mblks.  Send the partial data
4601 		 * to tcp and let the rest be handled in strwrite().
4602 		 */
4603 		ASSERT(error == 0 || error == ENOMEM);
4604 		if (stp->sd_wputdatafunc != NULL) {
4605 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4606 			    NULL, NULL, NULL);
4607 			if (newmp == NULL) {
4608 				/* The caller will free mp */
4609 				return (ECOMM);
4610 			}
4611 			mp = newmp;
4612 		}
4613 		tcp_wput(tcp_wq, mp);
4614 
4615 		wflag |= NOINTR;
4616 
4617 		if (uiop->uio_resid == 0) {	/* No more data; we're done */
4618 			ASSERT(error == 0);
4619 			break;
4620 		} else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4621 		    (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4622 slow_send:
4623 			/*
4624 			 * We were able to send down partial data using
4625 			 * the direct call interface, but are now relying
4626 			 * on strwrite() to handle the non-fastpath cases.
4627 			 * If the socket is blocking we will sleep in
4628 			 * strwaitq() until write is permitted, otherwise,
4629 			 * we will need to return the amount of bytes
4630 			 * written so far back to the app.  This is the
4631 			 * reason why we pass NOINTR flag to strwrite()
4632 			 * for non-blocking socket, because we don't want
4633 			 * to return EAGAIN when portion of the user data
4634 			 * has actually been sent down.
4635 			 */
4636 			return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4637 		}
4638 	}
4639 	return (0);
4640 }
4641 
4642 /*
4643  * Update so_faddr by asking the transport (unless AF_UNIX).
4644  */
4645 int
4646 sotpi_getpeername(struct sonode *so)
4647 {
4648 	struct strbuf	strbuf;
4649 	int		error = 0, res;
4650 	void		*addr;
4651 	t_uscalar_t	addrlen;
4652 	k_sigset_t	smask;
4653 
4654 	dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4655 	    (void *)so, pr_state(so->so_state, so->so_mode)));
4656 
4657 	mutex_enter(&so->so_lock);
4658 	so_lock_single(so);	/* Set SOLOCKED */
4659 	if (!(so->so_state & SS_ISCONNECTED)) {
4660 		error = ENOTCONN;
4661 		goto done;
4662 	}
4663 	/* Added this check for X/Open */
4664 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4665 		error = EINVAL;
4666 		if (xnet_check_print) {
4667 			printf("sockfs: X/Open getpeername check => EINVAL\n");
4668 		}
4669 		goto done;
4670 	}
4671 #ifdef DEBUG
4672 	dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
4673 	    pr_addr(so->so_family, so->so_faddr_sa,
4674 	    (t_uscalar_t)so->so_faddr_len)));
4675 #endif /* DEBUG */
4676 
4677 	if (so->so_family == AF_UNIX) {
4678 		/* Transport has different name space - return local info */
4679 		error = 0;
4680 		goto done;
4681 	}
4682 
4683 	ASSERT(so->so_faddr_sa);
4684 	/* Allocate local buffer to use with ioctl */
4685 	addrlen = (t_uscalar_t)so->so_faddr_maxlen;
4686 	mutex_exit(&so->so_lock);
4687 	addr = kmem_alloc(addrlen, KM_SLEEP);
4688 
4689 	/*
4690 	 * Issue TI_GETPEERNAME with signals masked.
4691 	 * Put the result in so_faddr_sa so that getpeername works after
4692 	 * a shutdown(output).
4693 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4694 	 * back to the socket.
4695 	 */
4696 	strbuf.buf = addr;
4697 	strbuf.maxlen = addrlen;
4698 	strbuf.len = 0;
4699 
4700 	sigintr(&smask, 0);
4701 	res = 0;
4702 	ASSERT(CRED());
4703 	error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
4704 	    0, K_TO_K, CRED(), &res);
4705 	sigunintr(&smask);
4706 
4707 	mutex_enter(&so->so_lock);
4708 	/*
4709 	 * If there is an error record the error in so_error put don't fail
4710 	 * the getpeername. Instead fallback on the recorded
4711 	 * so->so_faddr_sa.
4712 	 */
4713 	if (error) {
4714 		/*
4715 		 * Various stream head errors can be returned to the ioctl.
4716 		 * However, it is impossible to determine which ones of
4717 		 * these are really socket level errors that were incorrectly
4718 		 * consumed by the ioctl. Thus this code silently ignores the
4719 		 * error - to code explicitly does not reinstate the error
4720 		 * using soseterror().
4721 		 * Experiments have shows that at least this set of
4722 		 * errors are reported and should not be reinstated on the
4723 		 * socket:
4724 		 *	EINVAL	E.g. if an I_LINK was in effect when
4725 		 *		getpeername was called.
4726 		 *	EPIPE	The ioctl error semantics prefer the write
4727 		 *		side error over the read side error.
4728 		 *	ENOTCONN The transport just got disconnected but
4729 		 *		sockfs had not yet seen the T_DISCON_IND
4730 		 *		when issuing the ioctl.
4731 		 */
4732 		error = 0;
4733 	} else if (res == 0 && strbuf.len > 0 &&
4734 	    (so->so_state & SS_ISCONNECTED)) {
4735 		ASSERT(strbuf.len <= (int)so->so_faddr_maxlen);
4736 		so->so_faddr_len = (socklen_t)strbuf.len;
4737 		bcopy(addr, so->so_faddr_sa, so->so_faddr_len);
4738 		so->so_state |= SS_FADDR_VALID;
4739 	}
4740 	kmem_free(addr, addrlen);
4741 #ifdef DEBUG
4742 	dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
4743 	    pr_addr(so->so_family, so->so_faddr_sa,
4744 	    (t_uscalar_t)so->so_faddr_len)));
4745 #endif /* DEBUG */
4746 done:
4747 	so_unlock_single(so, SOLOCKED);
4748 	mutex_exit(&so->so_lock);
4749 	return (error);
4750 }
4751 
4752 /*
4753  * Update so_laddr by asking the transport (unless AF_UNIX).
4754  */
4755 int
4756 sotpi_getsockname(struct sonode *so)
4757 {
4758 	struct strbuf	strbuf;
4759 	int		error = 0, res;
4760 	void		*addr;
4761 	t_uscalar_t	addrlen;
4762 	k_sigset_t	smask;
4763 
4764 	dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
4765 	    (void *)so, pr_state(so->so_state, so->so_mode)));
4766 
4767 	mutex_enter(&so->so_lock);
4768 	so_lock_single(so);	/* Set SOLOCKED */
4769 	if (!(so->so_state & SS_ISBOUND) && so->so_family != AF_UNIX) {
4770 		/* Return an all zero address except for the family */
4771 		if (so->so_family == AF_INET)
4772 			so->so_laddr_len = (socklen_t)sizeof (sin_t);
4773 		else if (so->so_family == AF_INET6)
4774 			so->so_laddr_len = (socklen_t)sizeof (sin6_t);
4775 		ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
4776 		bzero(so->so_laddr_sa, so->so_laddr_len);
4777 		/*
4778 		 * Can not assume there is a sa_family for all
4779 		 * protocol families.
4780 		 */
4781 		if (so->so_family == AF_INET || so->so_family == AF_INET6)
4782 			so->so_laddr_sa->sa_family = so->so_family;
4783 	}
4784 #ifdef DEBUG
4785 	dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
4786 	    pr_addr(so->so_family, so->so_laddr_sa,
4787 	    (t_uscalar_t)so->so_laddr_len)));
4788 #endif /* DEBUG */
4789 	if (so->so_family == AF_UNIX) {
4790 		/* Transport has different name space - return local info */
4791 		error = 0;
4792 		goto done;
4793 	}
4794 	if (!(so->so_state & SS_ISBOUND)) {
4795 		/* If not bound, then nothing to return. */
4796 		error = 0;
4797 		goto done;
4798 	}
4799 	/* Allocate local buffer to use with ioctl */
4800 	addrlen = (t_uscalar_t)so->so_laddr_maxlen;
4801 	mutex_exit(&so->so_lock);
4802 	addr = kmem_alloc(addrlen, KM_SLEEP);
4803 
4804 	/*
4805 	 * Issue TI_GETMYNAME with signals masked.
4806 	 * Put the result in so_laddr_sa so that getsockname works after
4807 	 * a shutdown(output).
4808 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4809 	 * back to the socket.
4810 	 */
4811 	strbuf.buf = addr;
4812 	strbuf.maxlen = addrlen;
4813 	strbuf.len = 0;
4814 
4815 	sigintr(&smask, 0);
4816 	res = 0;
4817 	ASSERT(CRED());
4818 	error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
4819 	    0, K_TO_K, CRED(), &res);
4820 	sigunintr(&smask);
4821 
4822 	mutex_enter(&so->so_lock);
4823 	/*
4824 	 * If there is an error record the error in so_error put don't fail
4825 	 * the getsockname. Instead fallback on the recorded
4826 	 * so->so_laddr_sa.
4827 	 */
4828 	if (error) {
4829 		/*
4830 		 * Various stream head errors can be returned to the ioctl.
4831 		 * However, it is impossible to determine which ones of
4832 		 * these are really socket level errors that were incorrectly
4833 		 * consumed by the ioctl. Thus this code silently ignores the
4834 		 * error - to code explicitly does not reinstate the error
4835 		 * using soseterror().
4836 		 * Experiments have shows that at least this set of
4837 		 * errors are reported and should not be reinstated on the
4838 		 * socket:
4839 		 *	EINVAL	E.g. if an I_LINK was in effect when
4840 		 *		getsockname was called.
4841 		 *	EPIPE	The ioctl error semantics prefer the write
4842 		 *		side error over the read side error.
4843 		 */
4844 		error = 0;
4845 	} else if (res == 0 && strbuf.len > 0 &&
4846 	    (so->so_state & SS_ISBOUND)) {
4847 		ASSERT(strbuf.len <= (int)so->so_laddr_maxlen);
4848 		so->so_laddr_len = (socklen_t)strbuf.len;
4849 		bcopy(addr, so->so_laddr_sa, so->so_laddr_len);
4850 		so->so_state |= SS_LADDR_VALID;
4851 	}
4852 	kmem_free(addr, addrlen);
4853 #ifdef DEBUG
4854 	dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
4855 	    pr_addr(so->so_family, so->so_laddr_sa,
4856 	    (t_uscalar_t)so->so_laddr_len)));
4857 #endif /* DEBUG */
4858 done:
4859 	so_unlock_single(so, SOLOCKED);
4860 	mutex_exit(&so->so_lock);
4861 	return (error);
4862 }
4863 
4864 /*
4865  * Get socket options. For SOL_SOCKET options some options are handled
4866  * by the sockfs while others use the value recorded in the sonode as a
4867  * fallback should the T_SVR4_OPTMGMT_REQ fail.
4868  *
 * On return, at most *optlenp bytes are copied to optval.
4870  */
4871 int
4872 sotpi_getsockopt(struct sonode *so, int level, int option_name,
4873 		void *optval, socklen_t *optlenp, int flags)
4874 {
4875 	struct T_optmgmt_req	optmgmt_req;
4876 	struct T_optmgmt_ack	*optmgmt_ack;
4877 	struct opthdr		oh;
4878 	struct opthdr		*opt_res;
4879 	mblk_t			*mp = NULL;
4880 	int			error = 0;
4881 	void			*option = NULL;	/* Set if fallback value */
4882 	t_uscalar_t		maxlen = *optlenp;
4883 	t_uscalar_t		len;
4884 	uint32_t		value;
4885 
4886 	dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
4887 	    (void *)so, level, option_name, optval, (void *)optlenp,
4888 	    pr_state(so->so_state, so->so_mode)));
4889 
4890 	mutex_enter(&so->so_lock);
4891 	so_lock_single(so);	/* Set SOLOCKED */
4892 
4893 	/*
4894 	 * Check for SOL_SOCKET options.
4895 	 * Certain SOL_SOCKET options are returned directly whereas
4896 	 * others only provide a default (fallback) value should
4897 	 * the T_SVR4_OPTMGMT_REQ fail.
4898 	 */
4899 	if (level == SOL_SOCKET) {
4900 		/* Check parameters */
4901 		switch (option_name) {
4902 		case SO_TYPE:
4903 		case SO_ERROR:
4904 		case SO_DEBUG:
4905 		case SO_ACCEPTCONN:
4906 		case SO_REUSEADDR:
4907 		case SO_KEEPALIVE:
4908 		case SO_DONTROUTE:
4909 		case SO_BROADCAST:
4910 		case SO_USELOOPBACK:
4911 		case SO_OOBINLINE:
4912 		case SO_SNDBUF:
4913 		case SO_RCVBUF:
4914 #ifdef notyet
4915 		case SO_SNDLOWAT:
4916 		case SO_RCVLOWAT:
4917 		case SO_SNDTIMEO:
4918 		case SO_RCVTIMEO:
4919 #endif /* notyet */
4920 		case SO_DOMAIN:
4921 		case SO_DGRAM_ERRIND:
4922 			if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
4923 				error = EINVAL;
4924 				eprintsoline(so, error);
4925 				goto done2;
4926 			}
4927 			break;
4928 		case SO_LINGER:
4929 			if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
4930 				error = EINVAL;
4931 				eprintsoline(so, error);
4932 				goto done2;
4933 			}
4934 			break;
4935 		}
4936 
4937 		len = (t_uscalar_t)sizeof (uint32_t);	/* Default */
4938 
4939 		switch (option_name) {
4940 		case SO_TYPE:
4941 			value = so->so_type;
4942 			option = &value;
4943 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4944 
4945 		case SO_ERROR:
4946 			value = sogeterr(so);
4947 			option = &value;
4948 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4949 
4950 		case SO_ACCEPTCONN:
4951 			if (so->so_state & SS_ACCEPTCONN)
4952 				value = SO_ACCEPTCONN;
4953 			else
4954 				value = 0;
4955 #ifdef DEBUG
4956 			if (value) {
4957 				dprintso(so, 1,
4958 				    ("sotpi_getsockopt: 0x%x is set\n",
4959 				    option_name));
4960 			} else {
4961 				dprintso(so, 1,
4962 				    ("sotpi_getsockopt: 0x%x not set\n",
4963 				    option_name));
4964 			}
4965 #endif /* DEBUG */
4966 			option = &value;
4967 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4968 
4969 		case SO_DEBUG:
4970 		case SO_REUSEADDR:
4971 		case SO_KEEPALIVE:
4972 		case SO_DONTROUTE:
4973 		case SO_BROADCAST:
4974 		case SO_USELOOPBACK:
4975 		case SO_OOBINLINE:
4976 		case SO_DGRAM_ERRIND:
4977 			value = (so->so_options & option_name);
4978 #ifdef DEBUG
4979 			if (value) {
4980 				dprintso(so, 1,
4981 				    ("sotpi_getsockopt: 0x%x is set\n",
4982 				    option_name));
4983 			} else {
4984 				dprintso(so, 1,
4985 				    ("sotpi_getsockopt: 0x%x not set\n",
4986 				    option_name));
4987 			}
4988 #endif /* DEBUG */
4989 			option = &value;
4990 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4991 
4992 		/*
4993 		 * The following options are only returned by sockfs when the
4994 		 * T_SVR4_OPTMGMT_REQ fails.
4995 		 */
4996 		case SO_LINGER:
4997 			option = &so->so_linger;
4998 			len = (t_uscalar_t)sizeof (struct linger);
4999 			break;
5000 		case SO_SNDBUF: {
5001 			ssize_t lvalue;
5002 
5003 			/*
5004 			 * If the option has not been set then get a default
5005 			 * value from the read queue. This value is
5006 			 * returned if the transport fails
5007 			 * the T_SVR4_OPTMGMT_REQ.
5008 			 */
5009 			lvalue = so->so_sndbuf;
5010 			if (lvalue == 0) {
5011 				mutex_exit(&so->so_lock);
5012 				(void) strqget(strvp2wq(SOTOV(so))->q_next,
5013 				    QHIWAT, 0, &lvalue);
5014 				mutex_enter(&so->so_lock);
5015 				dprintso(so, 1,
5016 				    ("got SO_SNDBUF %ld from q\n", lvalue));
5017 			}
5018 			value = (int)lvalue;
5019 			option = &value;
5020 			len = (t_uscalar_t)sizeof (so->so_sndbuf);
5021 			break;
5022 		}
5023 		case SO_RCVBUF: {
5024 			ssize_t lvalue;
5025 
5026 			/*
5027 			 * If the option has not been set then get a default
5028 			 * value from the read queue. This value is
5029 			 * returned if the transport fails
5030 			 * the T_SVR4_OPTMGMT_REQ.
5031 			 *
5032 			 * XXX If SO_RCVBUF has been set and this is an
5033 			 * XPG 4.2 application then do not ask the transport
5034 			 * since the transport might adjust the value and not
5035 			 * return exactly what was set by the application.
5036 			 * For non-XPG 4.2 application we return the value
5037 			 * that the transport is actually using.
5038 			 */
5039 			lvalue = so->so_rcvbuf;
5040 			if (lvalue == 0) {
5041 				mutex_exit(&so->so_lock);
5042 				(void) strqget(RD(strvp2wq(SOTOV(so))),
5043 				    QHIWAT, 0, &lvalue);
5044 				mutex_enter(&so->so_lock);
5045 				dprintso(so, 1,
5046 				    ("got SO_RCVBUF %ld from q\n", lvalue));
5047 			} else if (flags & _SOGETSOCKOPT_XPG4_2) {
5048 				value = (int)lvalue;
5049 				option = &value;
5050 				goto copyout;	/* skip asking transport */
5051 			}
5052 			value = (int)lvalue;
5053 			option = &value;
5054 			len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5055 			break;
5056 		}
5057 		case SO_DOMAIN:
5058 			value = so->so_family;
5059 			option = &value;
5060 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5061 
5062 #ifdef notyet
5063 		/*
5064 		 * We do not implement the semantics of these options
5065 		 * thus we shouldn't implement the options either.
5066 		 */
5067 		case SO_SNDLOWAT:
5068 			value = so->so_sndlowat;
5069 			option = &value;
5070 			break;
5071 		case SO_RCVLOWAT:
5072 			value = so->so_rcvlowat;
5073 			option = &value;
5074 			break;
5075 		case SO_SNDTIMEO:
5076 			value = so->so_sndtimeo;
5077 			option = &value;
5078 			break;
5079 		case SO_RCVTIMEO:
5080 			value = so->so_rcvtimeo;
5081 			option = &value;
5082 			break;
5083 #endif /* notyet */
5084 		}
5085 	}
5086 
5087 	mutex_exit(&so->so_lock);
5088 
5089 	/* Send request */
5090 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5091 	optmgmt_req.MGMT_flags = T_CHECK;
5092 	optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5093 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5094 
5095 	oh.level = level;
5096 	oh.name = option_name;
5097 	oh.len = maxlen;
5098 
5099 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5100 	    &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP);
5101 	/* Let option management work in the presence of data flow control */
5102 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5103 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5104 	mp = NULL;
5105 	mutex_enter(&so->so_lock);
5106 	if (error) {
5107 		eprintsoline(so, error);
5108 		goto done2;
5109 	}
5110 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5111 	    (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5112 	if (error) {
5113 		if (option != NULL) {
5114 			/* We have a fallback value */
5115 			error = 0;
5116 			goto copyout;
5117 		}
5118 		eprintsoline(so, error);
5119 		goto done2;
5120 	}
5121 	ASSERT(mp);
5122 	optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5123 	opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5124 	    optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5125 	if (opt_res == NULL) {
5126 		if (option != NULL) {
5127 			/* We have a fallback value */
5128 			error = 0;
5129 			goto copyout;
5130 		}
5131 		error = EPROTO;
5132 		eprintsoline(so, error);
5133 		goto done;
5134 	}
5135 	option = &opt_res[1];
5136 
5137 	/* check to ensure that the option is within bounds */
5138 	if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5139 	    (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5140 		if (option != NULL) {
5141 			/* We have a fallback value */
5142 			error = 0;
5143 			goto copyout;
5144 		}
5145 		error = EPROTO;
5146 		eprintsoline(so, error);
5147 		goto done;
5148 	}
5149 
5150 	len = opt_res->len;
5151 
5152 copyout: {
5153 		t_uscalar_t size = MIN(len, maxlen);
5154 		bcopy(option, optval, size);
5155 		bcopy(&size, optlenp, sizeof (size));
5156 	}
5157 done:
5158 	freemsg(mp);
5159 done2:
5160 	so_unlock_single(so, SOLOCKED);
5161 	mutex_exit(&so->so_lock);
5162 	return (error);
5163 }
5164 
5165 /*
5166  * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5167  * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5168  * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5169  * setsockopt has to work even if the transport does not support the option.
5170  */
5171 int
5172 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5173 	const void *optval, t_uscalar_t optlen)
5174 {
5175 	struct T_optmgmt_req	optmgmt_req;
5176 	struct opthdr		oh;
5177 	mblk_t			*mp;
5178 	int			error = 0;
5179 	boolean_t		handled = B_FALSE;
5180 
5181 	dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5182 	    (void *)so, level, option_name, optval, optlen,
5183 	    pr_state(so->so_state, so->so_mode)));
5184 
5185 
5186 	/* X/Open requires this check */
5187 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5188 		if (xnet_check_print)
5189 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
5190 		return (EINVAL);
5191 	}
5192 
5193 	/* Caller allocates aligned optval, or passes null */
5194 	ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
5195 	/* If optval is null optlen is 0, and vice-versa */
5196 	ASSERT(optval != NULL || optlen == 0);
5197 	ASSERT(optlen != 0 || optval == NULL);
5198 
5199 	mutex_enter(&so->so_lock);
5200 	so_lock_single(so);	/* Set SOLOCKED */
5201 	mutex_exit(&so->so_lock);
5202 
5203 	/*
5204 	 * For SOCKET or TCP level options, try to set it here itself
5205 	 * provided socket has not been popped and we know the tcp
5206 	 * structure (stored in so_priv).
5207 	 */
5208 	if ((level == SOL_SOCKET || level == IPPROTO_TCP) &&
5209 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
5210 	    (so->so_version == SOV_SOCKSTREAM) && (so->so_priv != NULL)) {
5211 		tcp_t		*tcp = so->so_priv;
5212 		boolean_t	onoff;
5213 
5214 #define	intvalue	(*(int32_t *)optval)
5215 
5216 		switch (level) {
5217 		case SOL_SOCKET:
5218 			switch (option_name) {		/* Check length param */
5219 			case SO_DEBUG:
5220 			case SO_REUSEADDR:
5221 			case SO_DONTROUTE:
5222 			case SO_BROADCAST:
5223 			case SO_USELOOPBACK:
5224 			case SO_OOBINLINE:
5225 			case SO_DGRAM_ERRIND:
5226 				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5227 					error = EINVAL;
5228 					eprintsoline(so, error);
5229 					mutex_enter(&so->so_lock);
5230 					goto done2;
5231 				}
5232 				ASSERT(optval);
5233 				onoff = intvalue != 0;
5234 				handled = B_TRUE;
5235 				break;
5236 			case SO_LINGER:
5237 				if (optlen !=
5238 				    (t_uscalar_t)sizeof (struct linger)) {
5239 					error = EINVAL;
5240 					eprintsoline(so, error);
5241 					mutex_enter(&so->so_lock);
5242 					goto done2;
5243 				}
5244 				ASSERT(optval);
5245 				handled = B_TRUE;
5246 				break;
5247 			}
5248 
5249 			switch (option_name) {			/* Do actions */
5250 			case SO_LINGER: {
5251 				struct linger *lgr = (struct linger *)optval;
5252 
5253 				if (lgr->l_onoff) {
5254 					tcp->tcp_linger = 1;
5255 					tcp->tcp_lingertime = lgr->l_linger;
5256 					so->so_linger.l_onoff = SO_LINGER;
5257 					so->so_options |= SO_LINGER;
5258 				} else {
5259 					tcp->tcp_linger = 0;
5260 					tcp->tcp_lingertime = 0;
5261 					so->so_linger.l_onoff = 0;
5262 					so->so_options &= ~SO_LINGER;
5263 				}
5264 				so->so_linger.l_linger = lgr->l_linger;
5265 				handled = B_TRUE;
5266 				break;
5267 			}
5268 			case SO_DEBUG:
5269 				tcp->tcp_debug = onoff;
5270 #ifdef SOCK_TEST
5271 				if (intvalue & 2)
5272 					sock_test_timelimit = 10 * hz;
5273 				else
5274 					sock_test_timelimit = 0;
5275 
5276 				if (intvalue & 4)
5277 					do_useracc = 0;
5278 				else
5279 					do_useracc = 1;
5280 #endif /* SOCK_TEST */
5281 				break;
5282 			case SO_DONTROUTE:
5283 				/*
5284 				 * SO_DONTROUTE, SO_USELOOPBACK and
5285 				 * SO_BROADCAST are only of interest to IP.
5286 				 * We track them here only so
5287 				 * that we can report their current value.
5288 				 */
5289 				tcp->tcp_dontroute = onoff;
5290 				if (onoff)
5291 					so->so_options |= option_name;
5292 				else
5293 					so->so_options &= ~option_name;
5294 				break;
5295 			case SO_USELOOPBACK:
5296 				tcp->tcp_useloopback = onoff;
5297 				if (onoff)
5298 					so->so_options |= option_name;
5299 				else
5300 					so->so_options &= ~option_name;
5301 				break;
5302 			case SO_BROADCAST:
5303 				tcp->tcp_broadcast = onoff;
5304 				if (onoff)
5305 					so->so_options |= option_name;
5306 				else
5307 					so->so_options &= ~option_name;
5308 				break;
5309 			case SO_REUSEADDR:
5310 				tcp->tcp_reuseaddr = onoff;
5311 				if (onoff)
5312 					so->so_options |= option_name;
5313 				else
5314 					so->so_options &= ~option_name;
5315 				break;
5316 			case SO_OOBINLINE:
5317 				tcp->tcp_oobinline = onoff;
5318 				if (onoff)
5319 					so->so_options |= option_name;
5320 				else
5321 					so->so_options &= ~option_name;
5322 				break;
5323 			case SO_DGRAM_ERRIND:
5324 				tcp->tcp_dgram_errind = onoff;
5325 				if (onoff)
5326 					so->so_options |= option_name;
5327 				else
5328 					so->so_options &= ~option_name;
5329 				break;
5330 			}
5331 			break;
5332 		case IPPROTO_TCP:
5333 			switch (option_name) {
5334 			case TCP_NODELAY:
5335 				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5336 					error = EINVAL;
5337 					eprintsoline(so, error);
5338 					mutex_enter(&so->so_lock);
5339 					goto done2;
5340 				}
5341 				ASSERT(optval);
5342 				tcp->tcp_naglim = intvalue ? 1 : tcp->tcp_mss;
5343 				handled = B_TRUE;
5344 				break;
5345 			}
5346 			break;
5347 		default:
5348 			handled = B_FALSE;
5349 			break;
5350 		}
5351 	}
5352 
5353 	if (handled) {
5354 		mutex_enter(&so->so_lock);
5355 		goto done2;
5356 	}
5357 
5358 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5359 	optmgmt_req.MGMT_flags = T_NEGOTIATE;
5360 	optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5361 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5362 
5363 	oh.level = level;
5364 	oh.name = option_name;
5365 	oh.len = optlen;
5366 
5367 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5368 	    &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP);
5369 	/* Let option management work in the presence of data flow control */
5370 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5371 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5372 	mp = NULL;
5373 	mutex_enter(&so->so_lock);
5374 	if (error) {
5375 		eprintsoline(so, error);
5376 		goto done;
5377 	}
5378 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5379 	    (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5380 	if (error) {
5381 		eprintsoline(so, error);
5382 		goto done;
5383 	}
5384 	ASSERT(mp);
5385 	/* No need to verify T_optmgmt_ack */
5386 	freemsg(mp);
5387 done:
5388 	/*
5389 	 * Check for SOL_SOCKET options and record their values.
5390 	 * If we know about a SOL_SOCKET parameter and the transport
5391 	 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5392 	 * EPROTO) we let the setsockopt succeed.
5393 	 */
5394 	if (level == SOL_SOCKET) {
5395 		/* Check parameters */
5396 		switch (option_name) {
5397 		case SO_DEBUG:
5398 		case SO_REUSEADDR:
5399 		case SO_KEEPALIVE:
5400 		case SO_DONTROUTE:
5401 		case SO_BROADCAST:
5402 		case SO_USELOOPBACK:
5403 		case SO_OOBINLINE:
5404 		case SO_SNDBUF:
5405 		case SO_RCVBUF:
5406 #ifdef notyet
5407 		case SO_SNDLOWAT:
5408 		case SO_RCVLOWAT:
5409 		case SO_SNDTIMEO:
5410 		case SO_RCVTIMEO:
5411 #endif /* notyet */
5412 		case SO_DGRAM_ERRIND:
5413 			if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5414 				error = EINVAL;
5415 				eprintsoline(so, error);
5416 				goto done2;
5417 			}
5418 			ASSERT(optval);
5419 			handled = B_TRUE;
5420 			break;
5421 		case SO_LINGER:
5422 			if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5423 				error = EINVAL;
5424 				eprintsoline(so, error);
5425 				goto done2;
5426 			}
5427 			ASSERT(optval);
5428 			handled = B_TRUE;
5429 			break;
5430 		}
5431 
5432 #define	intvalue	(*(int32_t *)optval)
5433 
5434 		switch (option_name) {
5435 		case SO_TYPE:
5436 		case SO_ERROR:
5437 		case SO_ACCEPTCONN:
5438 			/* Can't be set */
5439 			error = ENOPROTOOPT;
5440 			goto done2;
5441 		case SO_LINGER: {
5442 			struct linger *l = (struct linger *)optval;
5443 
5444 			so->so_linger.l_linger = l->l_linger;
5445 			if (l->l_onoff) {
5446 				so->so_linger.l_onoff = SO_LINGER;
5447 				so->so_options |= SO_LINGER;
5448 			} else {
5449 				so->so_linger.l_onoff = 0;
5450 				so->so_options &= ~SO_LINGER;
5451 			}
5452 			break;
5453 		}
5454 
5455 		case SO_DEBUG:
5456 #ifdef SOCK_TEST
5457 			if (intvalue & 2)
5458 				sock_test_timelimit = 10 * hz;
5459 			else
5460 				sock_test_timelimit = 0;
5461 
5462 			if (intvalue & 4)
5463 				do_useracc = 0;
5464 			else
5465 				do_useracc = 1;
5466 #endif /* SOCK_TEST */
5467 			/* FALLTHRU */
5468 		case SO_REUSEADDR:
5469 		case SO_KEEPALIVE:
5470 		case SO_DONTROUTE:
5471 		case SO_BROADCAST:
5472 		case SO_USELOOPBACK:
5473 		case SO_OOBINLINE:
5474 		case SO_DGRAM_ERRIND:
5475 			if (intvalue != 0) {
5476 				dprintso(so, 1,
5477 				    ("sotpi_setsockopt: setting 0x%x\n",
5478 				    option_name));
5479 				so->so_options |= option_name;
5480 			} else {
5481 				dprintso(so, 1,
5482 				    ("sotpi_setsockopt: clearing 0x%x\n",
5483 				    option_name));
5484 				so->so_options &= ~option_name;
5485 			}
5486 			break;
5487 		/*
5488 		 * The following options are only returned by us when the
5489 		 * T_SVR4_OPTMGMT_REQ fails.
5490 		 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5491 		 * since the transport might adjust the value and not
5492 		 * return exactly what was set by the application.
5493 		 */
5494 		case SO_SNDBUF:
5495 			so->so_sndbuf = intvalue;
5496 			break;
5497 		case SO_RCVBUF:
5498 			so->so_rcvbuf = intvalue;
5499 			break;
5500 #ifdef notyet
5501 		/*
5502 		 * We do not implement the semantics of these options
5503 		 * thus we shouldn't implement the options either.
5504 		 */
5505 		case SO_SNDLOWAT:
5506 			so->so_sndlowat = intvalue;
5507 			break;
5508 		case SO_RCVLOWAT:
5509 			so->so_rcvlowat = intvalue;
5510 			break;
5511 		case SO_SNDTIMEO:
5512 			so->so_sndtimeo = intvalue;
5513 			break;
5514 		case SO_RCVTIMEO:
5515 			so->so_rcvtimeo = intvalue;
5516 			break;
5517 #endif /* notyet */
5518 		}
5519 #undef	intvalue
5520 
5521 		if (error) {
5522 			if ((error == ENOPROTOOPT || error == EPROTO ||
5523 			    error == EINVAL) && handled) {
5524 				dprintso(so, 1,
5525 				    ("setsockopt: ignoring error %d for 0x%x\n",
5526 				    error, option_name));
5527 				error = 0;
5528 			}
5529 		}
5530 	}
5531 done2:
5532 ret:
5533 	so_unlock_single(so, SOLOCKED);
5534 	mutex_exit(&so->so_lock);
5535 	return (error);
5536 }
5537