xref: /illumos-gate/usr/src/uts/common/fs/sockfs/socktpi.c (revision 0bb073995ac5a95bd35f2dd790df1ea3d8c2d507)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/t_lock.h>
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/buf.h>
32 #include <sys/conf.h>
33 #include <sys/cred.h>
34 #include <sys/kmem.h>
35 #include <sys/sysmacros.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/debug.h>
39 #include <sys/errno.h>
40 #include <sys/time.h>
41 #include <sys/file.h>
42 #include <sys/open.h>
43 #include <sys/user.h>
44 #include <sys/termios.h>
45 #include <sys/stream.h>
46 #include <sys/strsubr.h>
47 #include <sys/strsun.h>
48 #include <sys/ddi.h>
49 #include <sys/esunddi.h>
50 #include <sys/flock.h>
51 #include <sys/modctl.h>
52 #include <sys/vtrace.h>
53 #include <sys/cmn_err.h>
54 #include <sys/pathname.h>
55 
56 #include <sys/socket.h>
57 #include <sys/socketvar.h>
58 #include <sys/sockio.h>
59 #include <sys/sodirect.h>
60 #include <netinet/in.h>
61 #include <sys/un.h>
62 #include <sys/strsun.h>
63 
64 #include <sys/tiuser.h>
65 #define	_SUN_TPI_VERSION	2
66 #include <sys/tihdr.h>
67 #include <sys/timod.h>		/* TI_GETMYNAME, TI_GETPEERNAME */
68 
69 #include <c2/audit.h>
70 
71 #include <inet/common.h>
72 #include <inet/ip.h>
73 #include <inet/ip6.h>
74 #include <inet/tcp.h>
75 #include <inet/udp_impl.h>
76 
77 #include <sys/zone.h>
78 
79 #include <fs/sockfs/nl7c.h>
80 #include <fs/sockfs/nl7curi.h>
81 
82 #include <inet/kssl/ksslapi.h>
83 
84 /*
85  * Possible failures when memory can't be allocated. The documented behavior:
86  *
87  * 		5.5:			4.X:		XNET:
88  * accept:	ENOMEM/ENOSR/EINTR	- (EINTR)	ENOMEM/ENOBUFS/ENOSR/
89  *							EINTR
90  *	(4.X does not document EINTR but returns it)
91  * bind:	ENOSR			-		ENOBUFS/ENOSR
92  * connect: 	EINTR			EINTR		ENOBUFS/ENOSR/EINTR
93  * getpeername:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
94  * getsockname:	ENOMEM/ENOSR		ENOBUFS (-)	ENOBUFS/ENOSR
95  *	(4.X getpeername and getsockname do not fail in practice)
96  * getsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
97  * listen:	-			-		ENOBUFS
98  * recv:	ENOMEM/ENOSR/EINTR	EINTR		ENOBUFS/ENOMEM/ENOSR/
99  *							EINTR
100  * send:	ENOMEM/ENOSR/EINTR	ENOBUFS/EINTR	ENOBUFS/ENOMEM/ENOSR/
101  *							EINTR
102  * setsockopt:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
103  * shutdown:	ENOMEM/ENOSR		-		ENOBUFS/ENOSR
104  * socket:	ENOMEM/ENOSR		ENOBUFS		ENOBUFS/ENOMEM/ENOSR
105  * socketpair:	ENOMEM/ENOSR		-		ENOBUFS/ENOMEM/ENOSR
106  *
107  * Resolution. When allocation fails:
108  *	recv: return EINTR
109  *	send: return EINTR
110  *	connect, accept: EINTR
111  *	bind, listen, shutdown (unbind, unix_close, disconnect): sleep
112  *	socket, socketpair: ENOBUFS
113  *	getpeername, getsockname: sleep
114  *	getsockopt, setsockopt: sleep
115  */
116 
/*
 * Knobs that make sockfs do something other than the standard TPI
 * for the AF_INET transports.  When SOCK_TEST is defined they are real
 * variables; otherwise they are compile-time constants.
 *
 * solisten_tpi_tcp:
 *	TCP can handle a O_T_BIND_REQ with an increased backlog even though
 *	the transport is already bound. This is needed to avoid losing the
 *	port number should listen() do a T_UNBIND_REQ followed by a
 *	O_T_BIND_REQ.
 *
 * soconnect_tpi_udp:
 *	UDP and ICMP can handle a T_CONN_REQ.
 *	This is needed to make the sequence of connect(), getsockname()
 *	return the local IP address used to send packets to the connected
 *	destination.
 *
 * soconnect_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
 *	Set this to non-zero to send TPI conformant messages to TCP in this
 *	respect. This is a performance optimization.
 *
 * soaccept_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without the acceptor being bound.
 *	This is a performance optimization that has been picked up in XTI.
 *
 * soaccept_tpi_multioptions:
 *	When inheriting SOL_SOCKET options from the listener to the accepting
 *	socket send them as a single message for AF_INET{,6}.
 */
#ifdef SOCK_TEST

int solisten_tpi_tcp = 0;
int soconnect_tpi_udp = 0;
int soconnect_tpi_tcp = 0;
int soaccept_tpi_tcp = 0;
int soaccept_tpi_multioptions = 1;

/* Additional test hooks that are defined outside of this file. */
extern int do_useracc;
extern clock_t sock_test_timelimit;

#else /* SOCK_TEST */

#define	solisten_tpi_tcp		0
#define	soconnect_tpi_udp		0
#define	soconnect_tpi_tcp		0
#define	soaccept_tpi_tcp		0
#define	soaccept_tpi_multioptions	1

#endif /* SOCK_TEST */
164 
/*
 * X/Open related tunables.
 *
 * xnet_skip_checks:
 *	Some X/Open added checks might have to be backed out to keep
 *	SunOS 4.X applications working. Set this to non-zero to disable
 *	these checks.
 *
 * xnet_check_print:
 *	When non-zero, a message is printed whenever one of the X/Open
 *	state checks (e.g. the one in bind) causes a failure.
 *
 * xnet_truncate_print:
 *	NOTE(review): not referenced in the part of the file examined here;
 *	presumably enables a similar diagnostic for truncation - confirm
 *	against its users.
 */
int xnet_skip_checks = 0;
int xnet_check_print = 0;
int xnet_truncate_print = 0;
172 
173 extern	void sigintr(k_sigset_t *, int);
174 extern	void sigunintr(k_sigset_t *);
175 
176 extern	void *nl7c_lookup_addr(void *, t_uscalar_t);
177 extern	void *nl7c_add_addr(void *, t_uscalar_t);
178 extern	void nl7c_listener_addr(void *, struct sonode *);
179 
180 /* Sockets acting as an in-kernel SSL proxy */
181 extern mblk_t	*strsock_kssl_input(vnode_t *, mblk_t *, strwakeup_t *,
182 		    strsigset_t *, strsigset_t *, strpollset_t *);
183 extern mblk_t	*strsock_kssl_output(vnode_t *, mblk_t *, strwakeup_t *,
184 		    strsigset_t *, strsigset_t *, strpollset_t *);
185 
186 static int	sotpi_unbind(struct sonode *, int);
187 
188 extern int	sodput(sodirect_t *, mblk_t *);
189 extern void	sodwakeup(sodirect_t *);
190 
191 /* TPI sockfs sonode operations */
192 static int	sotpi_accept(struct sonode *, int, struct sonode **);
193 static int	sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
194 		    int);
195 static int	sotpi_connect(struct sonode *, const struct sockaddr *,
196 		    socklen_t, int, int);
197 static int	sotpi_listen(struct sonode *, int);
198 static int	sotpi_sendmsg(struct sonode *, struct nmsghdr *,
199 		    struct uio *);
200 static int	sotpi_shutdown(struct sonode *, int);
201 static int	sotpi_getsockname(struct sonode *);
202 static int	sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
203 		    struct uio *, void *, t_uscalar_t, int);
204 static int	sodgram_direct(struct sonode *, struct sockaddr *,
205 		    socklen_t, struct uio *, int);
206 
207 sonodeops_t sotpi_sonodeops = {
208 	sotpi_accept,		/* sop_accept		*/
209 	sotpi_bind,		/* sop_bind		*/
210 	sotpi_listen,		/* sop_listen		*/
211 	sotpi_connect,		/* sop_connect		*/
212 	sotpi_recvmsg,		/* sop_recvmsg		*/
213 	sotpi_sendmsg,		/* sop_sendmsg		*/
214 	sotpi_getpeername,	/* sop_getpeername	*/
215 	sotpi_getsockname,	/* sop_getsockname	*/
216 	sotpi_shutdown,		/* sop_shutdown		*/
217 	sotpi_getsockopt,	/* sop_getsockopt	*/
218 	sotpi_setsockopt	/* sop_setsockopt	*/
219 };
220 
221 /*
222  * Common create code for socket and accept. If tso is set the values
223  * from that node is used instead of issuing a T_INFO_REQ.
224  *
225  * Assumes that the caller has a VN_HOLD on accessvp.
226  * The VN_RELE will occur either when sotpi_create() fails or when
227  * the returned sonode is freed.
228  */
229 struct sonode *
230 sotpi_create(vnode_t *accessvp, int domain, int type, int protocol, int version,
231     struct sonode *tso, int *errorp)
232 {
233 	struct sonode	*so;
234 	vnode_t		*vp;
235 	int		flags, error;
236 
237 	ASSERT(accessvp != NULL);
238 	vp = makesockvp(accessvp, domain, type, protocol);
239 	ASSERT(vp != NULL);
240 	so = VTOSO(vp);
241 
242 	flags = FREAD|FWRITE;
243 
244 	if ((type == SOCK_STREAM || type == SOCK_DGRAM) &&
245 	    (domain == AF_INET || domain == AF_INET6) &&
246 	    (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP ||
247 	    protocol == IPPROTO_IP)) {
248 		/* Tell tcp or udp that it's talking to sockets */
249 		flags |= SO_SOCKSTR;
250 
251 		/*
252 		 * Here we indicate to socktpi_open() our attempt to
253 		 * make direct calls between sockfs and transport.
254 		 * The final decision is left to socktpi_open().
255 		 */
256 		so->so_state |= SS_DIRECT;
257 
258 		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
259 		if (so->so_type == SOCK_STREAM && tso != NULL) {
260 			if (tso->so_state & SS_DIRECT) {
261 				/*
262 				 * Inherit SS_DIRECT from listener and pass
263 				 * SO_ACCEPTOR open flag to tcp, indicating
264 				 * that this is an accept fast-path instance.
265 				 */
266 				flags |= SO_ACCEPTOR;
267 			} else {
268 				/*
269 				 * SS_DIRECT is not set on listener, meaning
270 				 * that the listener has been converted from
271 				 * a socket to a stream.  Ensure that the
272 				 * acceptor inherits these settings.
273 				 */
274 				so->so_state &= ~SS_DIRECT;
275 				flags &= ~SO_SOCKSTR;
276 			}
277 		}
278 	}
279 
280 	/*
281 	 * Tell local transport that it is talking to sockets.
282 	 */
283 	if (so->so_family == AF_UNIX) {
284 		flags |= SO_SOCKSTR;
285 	}
286 
287 	/* Initialize the kernel SSL proxy fields */
288 	so->so_kssl_type = KSSL_NO_PROXY;
289 	so->so_kssl_ent = NULL;
290 	so->so_kssl_ctx = NULL;
291 
292 	if (error = socktpi_open(&vp, flags, CRED(), NULL)) {
293 		VN_RELE(vp);
294 		*errorp = error;
295 		return (NULL);
296 	}
297 
298 	if (error = so_strinit(so, tso)) {
299 		(void) VOP_CLOSE(vp, 0, 1, 0, CRED(), NULL);
300 		VN_RELE(vp);
301 		*errorp = error;
302 		return (NULL);
303 	}
304 
305 	if (version == SOV_DEFAULT)
306 		version = so_default_version;
307 
308 	so->so_version = (short)version;
309 
310 	return (so);
311 }
312 
313 /*
314  * Bind the socket to an unspecified address in sockfs only.
315  * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
316  * required in all cases.
317  */
318 static void
319 so_automatic_bind(struct sonode *so)
320 {
321 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
322 
323 	ASSERT(MUTEX_HELD(&so->so_lock));
324 	ASSERT(!(so->so_state & SS_ISBOUND));
325 	ASSERT(so->so_unbind_mp);
326 
327 	ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
328 	bzero(so->so_laddr_sa, so->so_laddr_len);
329 	so->so_laddr_sa->sa_family = so->so_family;
330 	so->so_state |= SS_ISBOUND;
331 }
332 
333 
334 /*
335  * bind the socket.
336  *
337  * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
338  * are passed in we allow rebinding. Note that for backwards compatibility
339  * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
340  * Thus the rebinding code is currently not executed.
341  *
342  * The constraints for rebinding are:
343  * - it is a SOCK_DGRAM, or
344  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
345  *   and no listen() has been done.
346  * This rebinding code was added based on some language in the XNET book
347  * about not returning EINVAL if the protocol allows rebinding. However,
348  * this language is not present in the Posix socket draft. Thus maybe the
349  * rebinding logic should be deleted from the source.
350  *
351  * A null "name" can be used to unbind the socket if:
352  * - it is a SOCK_DGRAM, or
353  * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
354  *   and no listen() has been done.
355  */
356 static int
357 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
358     socklen_t namelen, int backlog, int flags)
359 {
360 	struct T_bind_req	bind_req;
361 	struct T_bind_ack	*bind_ack;
362 	int			error = 0;
363 	mblk_t			*mp;
364 	void			*addr;
365 	t_uscalar_t		addrlen;
366 	int			unbind_on_err = 1;
367 	boolean_t		clear_acceptconn_on_err = B_FALSE;
368 	boolean_t		restore_backlog_on_err = B_FALSE;
369 	int			save_so_backlog;
370 	t_scalar_t		PRIM_type = O_T_BIND_REQ;
371 	boolean_t		tcp_udp_xport;
372 	void			*nl7c = NULL;
373 
374 	dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
375 	    (void *)so, (void *)name, namelen, backlog, flags,
376 	    pr_state(so->so_state, so->so_mode)));
377 
378 	tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
379 
380 	if (!(flags & _SOBIND_LOCK_HELD)) {
381 		mutex_enter(&so->so_lock);
382 		so_lock_single(so);	/* Set SOLOCKED */
383 	} else {
384 		ASSERT(MUTEX_HELD(&so->so_lock));
385 		ASSERT(so->so_flag & SOLOCKED);
386 	}
387 
388 	/*
389 	 * Make sure that there is a preallocated unbind_req message
390 	 * before binding. This message allocated when the socket is
391 	 * created  but it might be have been consumed.
392 	 */
393 	if (so->so_unbind_mp == NULL) {
394 		dprintso(so, 1, ("sobind: allocating unbind_req\n"));
395 		/* NOTE: holding so_lock while sleeping */
396 		so->so_unbind_mp =
397 		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP);
398 	}
399 
400 	if (flags & _SOBIND_REBIND) {
401 		/*
402 		 * Called from solisten after doing an sotpi_unbind() or
403 		 * potentially without the unbind (latter for AF_INET{,6}).
404 		 */
405 		ASSERT(name == NULL && namelen == 0);
406 
407 		if (so->so_family == AF_UNIX) {
408 			ASSERT(so->so_ux_bound_vp);
409 			addr = &so->so_ux_laddr;
410 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
411 			dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
412 			    "addr 0x%p, vp %p\n",
413 			    addrlen,
414 			    (void *)((struct so_ux_addr *)addr)->soua_vp,
415 			    (void *)so->so_ux_bound_vp));
416 		} else {
417 			addr = so->so_laddr_sa;
418 			addrlen = (t_uscalar_t)so->so_laddr_len;
419 		}
420 	} else if (flags & _SOBIND_UNSPEC) {
421 		ASSERT(name == NULL && namelen == 0);
422 
423 		/*
424 		 * The caller checked SS_ISBOUND but not necessarily
425 		 * under so_lock
426 		 */
427 		if (so->so_state & SS_ISBOUND) {
428 			/* No error */
429 			goto done;
430 		}
431 
432 		/* Set an initial local address */
433 		switch (so->so_family) {
434 		case AF_UNIX:
435 			/*
436 			 * Use an address with same size as struct sockaddr
437 			 * just like BSD.
438 			 */
439 			so->so_laddr_len =
440 			    (socklen_t)sizeof (struct sockaddr);
441 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
442 			bzero(so->so_laddr_sa, so->so_laddr_len);
443 			so->so_laddr_sa->sa_family = so->so_family;
444 
445 			/*
446 			 * Pass down an address with the implicit bind
447 			 * magic number and the rest all zeros.
448 			 * The transport will return a unique address.
449 			 */
450 			so->so_ux_laddr.soua_vp = NULL;
451 			so->so_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
452 			addr = &so->so_ux_laddr;
453 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
454 			break;
455 
456 		case AF_INET:
457 		case AF_INET6:
458 			/*
459 			 * An unspecified bind in TPI has a NULL address.
460 			 * Set the address in sockfs to have the sa_family.
461 			 */
462 			so->so_laddr_len = (so->so_family == AF_INET) ?
463 			    (socklen_t)sizeof (sin_t) :
464 			    (socklen_t)sizeof (sin6_t);
465 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
466 			bzero(so->so_laddr_sa, so->so_laddr_len);
467 			so->so_laddr_sa->sa_family = so->so_family;
468 			addr = NULL;
469 			addrlen = 0;
470 			break;
471 
472 		default:
473 			/*
474 			 * An unspecified bind in TPI has a NULL address.
475 			 * Set the address in sockfs to be zero length.
476 			 *
477 			 * Can not assume there is a sa_family for all
478 			 * protocol families. For example, AF_X25 does not
479 			 * have a family field.
480 			 */
481 			bzero(so->so_laddr_sa, so->so_laddr_len);
482 			so->so_laddr_len = 0;	/* XXX correct? */
483 			addr = NULL;
484 			addrlen = 0;
485 			break;
486 		}
487 
488 	} else {
489 		if (so->so_state & SS_ISBOUND) {
490 			/*
491 			 * If it is ok to rebind the socket, first unbind
492 			 * with the transport. A rebind to the NULL address
493 			 * is interpreted as an unbind.
494 			 * Note that a bind to NULL in BSD does unbind the
495 			 * socket but it fails with EINVAL.
496 			 * Note that regular sockets set SOV_SOCKBSD i.e.
497 			 * _SOBIND_SOCKBSD gets set here hence no type of
498 			 * socket does currently allow rebinding.
499 			 *
500 			 * If the name is NULL just do an unbind.
501 			 */
502 			if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
503 			    name != NULL) {
504 				error = EINVAL;
505 				unbind_on_err = 0;
506 				eprintsoline(so, error);
507 				goto done;
508 			}
509 			if ((so->so_mode & SM_CONNREQUIRED) &&
510 			    (so->so_state & SS_CANTREBIND)) {
511 				error = EINVAL;
512 				unbind_on_err = 0;
513 				eprintsoline(so, error);
514 				goto done;
515 			}
516 			error = sotpi_unbind(so, 0);
517 			if (error) {
518 				eprintsoline(so, error);
519 				goto done;
520 			}
521 			ASSERT(!(so->so_state & SS_ISBOUND));
522 			if (name == NULL) {
523 				so->so_state &=
524 				    ~(SS_ISCONNECTED|SS_ISCONNECTING);
525 				goto done;
526 			}
527 		}
528 		/* X/Open requires this check */
529 		if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
530 			if (xnet_check_print) {
531 				printf("sockfs: X/Open bind state check "
532 				    "caused EINVAL\n");
533 			}
534 			error = EINVAL;
535 			goto done;
536 		}
537 
538 		switch (so->so_family) {
539 		case AF_UNIX:
540 			/*
541 			 * All AF_UNIX addresses are nul terminated
542 			 * when copied (copyin_name) in so the minimum
543 			 * length is 3 bytes.
544 			 */
545 			if (name == NULL ||
546 			    (ssize_t)namelen <= sizeof (short) + 1) {
547 				error = EISDIR;
548 				eprintsoline(so, error);
549 				goto done;
550 			}
551 			/*
552 			 * Verify so_family matches the bound family.
553 			 * BSD does not check this for AF_UNIX resulting
554 			 * in funny mknods.
555 			 */
556 			if (name->sa_family != so->so_family) {
557 				error = EAFNOSUPPORT;
558 				goto done;
559 			}
560 			break;
561 		case AF_INET:
562 			if (name == NULL) {
563 				error = EINVAL;
564 				eprintsoline(so, error);
565 				goto done;
566 			}
567 			if ((size_t)namelen != sizeof (sin_t)) {
568 				error = name->sa_family != so->so_family ?
569 				    EAFNOSUPPORT : EINVAL;
570 				eprintsoline(so, error);
571 				goto done;
572 			}
573 			if ((flags & _SOBIND_XPG4_2) &&
574 			    (name->sa_family != so->so_family)) {
575 				/*
576 				 * This check has to be made for X/Open
577 				 * sockets however application failures have
578 				 * been observed when it is applied to
579 				 * all sockets.
580 				 */
581 				error = EAFNOSUPPORT;
582 				eprintsoline(so, error);
583 				goto done;
584 			}
585 			/*
586 			 * Force a zero sa_family to match so_family.
587 			 *
588 			 * Some programs like inetd(1M) don't set the
589 			 * family field. Other programs leave
590 			 * sin_family set to garbage - SunOS 4.X does
591 			 * not check the family field on a bind.
592 			 * We use the family field that
593 			 * was passed in to the socket() call.
594 			 */
595 			name->sa_family = so->so_family;
596 			break;
597 
598 		case AF_INET6: {
599 #ifdef DEBUG
600 			sin6_t *sin6 = (sin6_t *)name;
601 #endif /* DEBUG */
602 
603 			if (name == NULL) {
604 				error = EINVAL;
605 				eprintsoline(so, error);
606 				goto done;
607 			}
608 			if ((size_t)namelen != sizeof (sin6_t)) {
609 				error = name->sa_family != so->so_family ?
610 				    EAFNOSUPPORT : EINVAL;
611 				eprintsoline(so, error);
612 				goto done;
613 			}
614 			if (name->sa_family != so->so_family) {
615 				/*
616 				 * With IPv6 we require the family to match
617 				 * unlike in IPv4.
618 				 */
619 				error = EAFNOSUPPORT;
620 				eprintsoline(so, error);
621 				goto done;
622 			}
623 #ifdef DEBUG
624 			/*
625 			 * Verify that apps don't forget to clear
626 			 * sin6_scope_id etc
627 			 */
628 			if (sin6->sin6_scope_id != 0 &&
629 			    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
630 				zcmn_err(getzoneid(), CE_WARN,
631 				    "bind with uninitialized sin6_scope_id "
632 				    "(%d) on socket. Pid = %d\n",
633 				    (int)sin6->sin6_scope_id,
634 				    (int)curproc->p_pid);
635 			}
636 			if (sin6->__sin6_src_id != 0) {
637 				zcmn_err(getzoneid(), CE_WARN,
638 				    "bind with uninitialized __sin6_src_id "
639 				    "(%d) on socket. Pid = %d\n",
640 				    (int)sin6->__sin6_src_id,
641 				    (int)curproc->p_pid);
642 			}
643 #endif /* DEBUG */
644 			break;
645 		}
646 		default:
647 			/*
648 			 * Don't do any length or sa_family check to allow
649 			 * non-sockaddr style addresses.
650 			 */
651 			if (name == NULL) {
652 				error = EINVAL;
653 				eprintsoline(so, error);
654 				goto done;
655 			}
656 			break;
657 		}
658 
659 		if (namelen > (t_uscalar_t)so->so_laddr_maxlen) {
660 			error = ENAMETOOLONG;
661 			eprintsoline(so, error);
662 			goto done;
663 		}
664 		/*
665 		 * Save local address.
666 		 */
667 		so->so_laddr_len = (socklen_t)namelen;
668 		ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
669 		bcopy(name, so->so_laddr_sa, namelen);
670 
671 		addr = so->so_laddr_sa;
672 		addrlen = (t_uscalar_t)so->so_laddr_len;
673 		switch (so->so_family) {
674 		case AF_INET6:
675 		case AF_INET:
676 			break;
677 		case AF_UNIX: {
678 			struct sockaddr_un *soun =
679 			    (struct sockaddr_un *)so->so_laddr_sa;
680 			struct vnode *vp;
681 			struct vattr vattr;
682 
683 			ASSERT(so->so_ux_bound_vp == NULL);
684 			/*
685 			 * Create vnode for the specified path name.
686 			 * Keep vnode held with a reference in so_ux_bound_vp.
687 			 * Use the vnode pointer as the address used in the
688 			 * bind with the transport.
689 			 *
690 			 * Use the same mode as in BSD. In particular this does
691 			 * not observe the umask.
692 			 */
693 			/* MAXPATHLEN + soun_family + nul termination */
694 			if (so->so_laddr_len >
695 			    (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
696 				error = ENAMETOOLONG;
697 				eprintsoline(so, error);
698 				goto done;
699 			}
700 			vattr.va_type = VSOCK;
701 			vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
702 			vattr.va_mask = AT_TYPE|AT_MODE;
703 			/* NOTE: holding so_lock */
704 			error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
705 			    EXCL, 0, &vp, CRMKNOD, 0, 0);
706 			if (error) {
707 				if (error == EEXIST)
708 					error = EADDRINUSE;
709 				eprintsoline(so, error);
710 				goto done;
711 			}
712 			/*
713 			 * Establish pointer from the underlying filesystem
714 			 * vnode to the socket node.
715 			 * so_ux_bound_vp and v_stream->sd_vnode form the
716 			 * cross-linkage between the underlying filesystem
717 			 * node and the socket node.
718 			 */
719 			ASSERT(SOTOV(so)->v_stream);
720 			mutex_enter(&vp->v_lock);
721 			vp->v_stream = SOTOV(so)->v_stream;
722 			so->so_ux_bound_vp = vp;
723 			mutex_exit(&vp->v_lock);
724 
725 			/*
726 			 * Use the vnode pointer value as a unique address
727 			 * (together with the magic number to avoid conflicts
728 			 * with implicit binds) in the transport provider.
729 			 */
730 			so->so_ux_laddr.soua_vp = (void *)so->so_ux_bound_vp;
731 			so->so_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
732 			addr = &so->so_ux_laddr;
733 			addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
734 			dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
735 			    addrlen,
736 			    ((struct so_ux_addr *)addr)->soua_vp));
737 			break;
738 		}
739 		} /* end switch (so->so_family) */
740 	}
741 
742 	/*
743 	 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
744 	 * the transport can start passing up T_CONN_IND messages
745 	 * as soon as it receives the bind req and strsock_proto()
746 	 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
747 	 */
748 	if (flags & _SOBIND_LISTEN) {
749 		if ((so->so_state & SS_ACCEPTCONN) == 0)
750 			clear_acceptconn_on_err = B_TRUE;
751 		save_so_backlog = so->so_backlog;
752 		restore_backlog_on_err = B_TRUE;
753 		so->so_state |= SS_ACCEPTCONN;
754 		so->so_backlog = backlog;
755 	}
756 
757 	/*
758 	 * If NL7C addr(s) have been configured check for addr/port match,
759 	 * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
760 	 *
761 	 * NL7C supports the TCP transport only so check AF_INET and AF_INET6
762 	 * family sockets only. If match mark as such.
763 	 */
764 	if (nl7c_enabled && ((addr != NULL &&
765 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
766 	    (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
767 	    so->so_nl7c_flags == NL7C_AF_NCA)) {
768 		/*
769 		 * NL7C is not supported in non-global zones,
770 		 * we enforce this restriction here.
771 		 */
772 		if (so->so_zoneid == GLOBAL_ZONEID) {
773 			/* An NL7C socket, mark it */
774 			so->so_nl7c_flags |= NL7C_ENABLED;
775 			if (nl7c == NULL) {
776 				/*
777 				 * Was an AF_NCA bind() so add it to the
778 				 * addr list for reporting purposes.
779 				 */
780 				nl7c = nl7c_add_addr(addr, addrlen);
781 			}
782 		} else
783 			nl7c = NULL;
784 	}
785 	/*
786 	 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
787 	 * for other transports we will send in a O_T_BIND_REQ.
788 	 */
789 	if (tcp_udp_xport &&
790 	    (so->so_family == AF_INET || so->so_family == AF_INET6))
791 		PRIM_type = T_BIND_REQ;
792 
793 	bind_req.PRIM_type = PRIM_type;
794 	bind_req.ADDR_length = addrlen;
795 	bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
796 	bind_req.CONIND_number = backlog;
797 	/* NOTE: holding so_lock while sleeping */
798 	mp = soallocproto2(&bind_req, sizeof (bind_req),
799 	    addr, addrlen, 0, _ALLOC_SLEEP);
800 	so->so_state &= ~SS_LADDR_VALID;
801 
802 	/* Done using so_laddr_sa - can drop the lock */
803 	mutex_exit(&so->so_lock);
804 
805 	/*
806 	 * Intercept the bind_req message here to check if this <address/port>
807 	 * was configured as an SSL proxy server, or if another endpoint was
808 	 * already configured to act as a proxy for us.
809 	 *
810 	 * Note, only if NL7C not enabled for this socket.
811 	 */
812 	if (nl7c == NULL &&
813 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
814 	    so->so_type == SOCK_STREAM) {
815 
816 		if (so->so_kssl_ent != NULL) {
817 			kssl_release_ent(so->so_kssl_ent, so, so->so_kssl_type);
818 			so->so_kssl_ent = NULL;
819 		}
820 
821 		so->so_kssl_type = kssl_check_proxy(mp, so, &so->so_kssl_ent);
822 		switch (so->so_kssl_type) {
823 		case KSSL_NO_PROXY:
824 			break;
825 
826 		case KSSL_HAS_PROXY:
827 			mutex_enter(&so->so_lock);
828 			goto skip_transport;
829 
830 		case KSSL_IS_PROXY:
831 			break;
832 		}
833 	}
834 
835 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
836 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
837 	if (error) {
838 		eprintsoline(so, error);
839 		mutex_enter(&so->so_lock);
840 		goto done;
841 	}
842 
843 	mutex_enter(&so->so_lock);
844 	error = sowaitprim(so, PRIM_type, T_BIND_ACK,
845 	    (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
846 	if (error) {
847 		eprintsoline(so, error);
848 		goto done;
849 	}
850 skip_transport:
851 	ASSERT(mp);
852 	/*
853 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
854 	 * strsock_proto while the lock was dropped above, the bind
855 	 * is allowed to complete.
856 	 */
857 
858 	/* Mark as bound. This will be undone if we detect errors below. */
859 	if (flags & _SOBIND_NOXLATE) {
860 		ASSERT(so->so_family == AF_UNIX);
861 		so->so_state |= SS_FADDR_NOXLATE;
862 	}
863 	ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
864 	so->so_state |= SS_ISBOUND;
865 	ASSERT(so->so_unbind_mp);
866 
867 	/* note that we've already set SS_ACCEPTCONN above */
868 
869 	/*
870 	 * Recompute addrlen - an unspecied bind sent down an
871 	 * address of length zero but we expect the appropriate length
872 	 * in return.
873 	 */
874 	addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
875 	    sizeof (so->so_ux_laddr) : so->so_laddr_len);
876 
877 	bind_ack = (struct T_bind_ack *)mp->b_rptr;
878 	/*
879 	 * The alignment restriction is really too strict but
880 	 * we want enough alignment to inspect the fields of
881 	 * a sockaddr_in.
882 	 */
883 	addr = sogetoff(mp, bind_ack->ADDR_offset,
884 	    bind_ack->ADDR_length,
885 	    __TPI_ALIGN_SIZE);
886 	if (addr == NULL) {
887 		freemsg(mp);
888 		error = EPROTO;
889 		eprintsoline(so, error);
890 		goto done;
891 	}
892 	if (!(flags & _SOBIND_UNSPEC)) {
893 		/*
894 		 * Verify that the transport didn't return something we
895 		 * did not want e.g. an address other than what we asked for.
896 		 *
897 		 * NOTE: These checks would go away if/when we switch to
898 		 * using the new TPI (in which the transport would fail
899 		 * the request instead of assigning a different address).
900 		 *
901 		 * NOTE2: For protocols that we don't know (i.e. any
902 		 * other than AF_INET6, AF_INET and AF_UNIX), we
903 		 * cannot know if the transport should be expected to
904 		 * return the same address as that requested.
905 		 *
906 		 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
907 		 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
908 		 *
909 		 * For example, in the case of netatalk it may be
910 		 * inappropriate for the transport to return the
911 		 * requested address (as it may have allocated a local
912 		 * port number in behaviour similar to that of an
913 		 * AF_INET bind request with a port number of zero).
914 		 *
915 		 * Given the definition of O_T_BIND_REQ, where the
916 		 * transport may bind to an address other than the
917 		 * requested address, it's not possible to determine
918 		 * whether a returned address that differs from the
919 		 * requested address is a reason to fail (because the
920 		 * requested address was not available) or succeed
921 		 * (because the transport allocated an appropriate
922 		 * address and/or port).
923 		 *
924 		 * sockfs currently requires that the transport return
925 		 * the requested address in the T_BIND_ACK, unless
926 		 * there is code here to allow for any discrepancy.
927 		 * Such code exists for AF_INET and AF_INET6.
928 		 *
929 		 * Netatalk chooses to return the requested address
930 		 * rather than the (correct) allocated address.  This
931 		 * means that netatalk violates the TPI specification
932 		 * (and would not function correctly if used from a
933 		 * TLI application), but it does mean that it works
934 		 * with sockfs.
935 		 *
936 		 * As noted above, using the newer XTI bind primitive
937 		 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
938 		 * allow sockfs to be more sure about whether or not
939 		 * the bind request had succeeded (as transports are
940 		 * not permitted to bind to a different address than
941 		 * that requested - they must return failure).
942 		 * Unfortunately, support for T_BIND_REQ may not be
943 		 * present in all transport implementations (netatalk,
944 		 * for example, doesn't have it), making the
945 		 * transition difficult.
946 		 */
947 		if (bind_ack->ADDR_length != addrlen) {
948 			/* Assumes that the requested address was in use */
949 			freemsg(mp);
950 			error = EADDRINUSE;
951 			eprintsoline(so, error);
952 			goto done;
953 		}
954 
955 		switch (so->so_family) {
956 		case AF_INET6:
957 		case AF_INET: {
958 			sin_t *rname, *aname;
959 
960 			rname = (sin_t *)addr;
961 			aname = (sin_t *)so->so_laddr_sa;
962 
963 			/*
964 			 * Take advantage of the alignment
965 			 * of sin_port and sin6_port which fall
966 			 * in the same place in their data structures.
967 			 * Just use sin_port for either address family.
968 			 *
969 			 * This may become a problem if (heaven forbid)
970 			 * there's a separate ipv6port_reserved... :-P
971 			 *
972 			 * Binding to port 0 has the semantics of letting
973 			 * the transport bind to any port.
974 			 *
975 			 * If the transport is TCP or UDP since we had sent
976 			 * a T_BIND_REQ we would not get a port other than
977 			 * what we asked for.
978 			 */
979 			if (tcp_udp_xport) {
980 				/*
981 				 * Pick up the new port number if we bound to
982 				 * port 0.
983 				 */
984 				if (aname->sin_port == 0)
985 					aname->sin_port = rname->sin_port;
986 				so->so_state |= SS_LADDR_VALID;
987 				break;
988 			}
989 			if (aname->sin_port != 0 &&
990 			    aname->sin_port != rname->sin_port) {
991 				freemsg(mp);
992 				error = EADDRINUSE;
993 				eprintsoline(so, error);
994 				goto done;
995 			}
996 			/*
997 			 * Pick up the new port number if we bound to port 0.
998 			 */
999 			aname->sin_port = rname->sin_port;
1000 
1001 			/*
1002 			 * Unfortunately, addresses aren't _quite_ the same.
1003 			 */
1004 			if (so->so_family == AF_INET) {
1005 				if (aname->sin_addr.s_addr !=
1006 				    rname->sin_addr.s_addr) {
1007 					freemsg(mp);
1008 					error = EADDRNOTAVAIL;
1009 					eprintsoline(so, error);
1010 					goto done;
1011 				}
1012 			} else {
1013 				sin6_t *rname6 = (sin6_t *)rname;
1014 				sin6_t *aname6 = (sin6_t *)aname;
1015 
1016 				if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1017 				    &rname6->sin6_addr)) {
1018 					freemsg(mp);
1019 					error = EADDRNOTAVAIL;
1020 					eprintsoline(so, error);
1021 					goto done;
1022 				}
1023 			}
1024 			break;
1025 		}
1026 		case AF_UNIX:
1027 			if (bcmp(addr, &so->so_ux_laddr, addrlen) != 0) {
1028 				freemsg(mp);
1029 				error = EADDRINUSE;
1030 				eprintsoline(so, error);
1031 				eprintso(so,
1032 				    ("addrlen %d, addr 0x%x, vp %p\n",
1033 				    addrlen, *((int *)addr),
1034 				    (void *)so->so_ux_bound_vp));
1035 				goto done;
1036 			}
1037 			so->so_state |= SS_LADDR_VALID;
1038 			break;
1039 		default:
1040 			/*
1041 			 * NOTE: This assumes that addresses can be
1042 			 * byte-compared for equivalence.
1043 			 */
1044 			if (bcmp(addr, so->so_laddr_sa, addrlen) != 0) {
1045 				freemsg(mp);
1046 				error = EADDRINUSE;
1047 				eprintsoline(so, error);
1048 				goto done;
1049 			}
1050 			/*
1051 			 * Don't mark SS_LADDR_VALID, as we cannot be
1052 			 * sure that the returned address is the real
1053 			 * bound address when talking to an unknown
1054 			 * transport.
1055 			 */
1056 			break;
1057 		}
1058 	} else {
1059 		/*
1060 		 * Save for returned address for getsockname.
1061 		 * Needed for unspecific bind unless transport supports
1062 		 * the TI_GETMYNAME ioctl.
1063 		 * Do this for AF_INET{,6} even though they do, as
1064 		 * caching info here is much better performance than
1065 		 * a TPI/STREAMS trip to the transport for getsockname.
1066 		 * Any which can't for some reason _must_ _not_ set
1067 		 * LADDR_VALID here for the caching version of getsockname
1068 		 * to not break;
1069 		 */
1070 		switch (so->so_family) {
1071 		case AF_UNIX:
1072 			/*
1073 			 * Record the address bound with the transport
1074 			 * for use by socketpair.
1075 			 */
1076 			bcopy(addr, &so->so_ux_laddr, addrlen);
1077 			so->so_state |= SS_LADDR_VALID;
1078 			break;
1079 		case AF_INET:
1080 		case AF_INET6:
1081 			ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
1082 			bcopy(addr, so->so_laddr_sa, so->so_laddr_len);
1083 			so->so_state |= SS_LADDR_VALID;
1084 			break;
1085 		default:
1086 			/*
1087 			 * Don't mark SS_LADDR_VALID, as we cannot be
1088 			 * sure that the returned address is the real
1089 			 * bound address when talking to an unknown
1090 			 * transport.
1091 			 */
1092 			break;
1093 		}
1094 	}
1095 
1096 	if (nl7c != NULL) {
1097 		/* Register listen()er sonode pointer with NL7C */
1098 		nl7c_listener_addr(nl7c, so);
1099 	}
1100 
1101 	freemsg(mp);
1102 
1103 done:
1104 	if (error) {
1105 		/* reset state & backlog to values held on entry */
1106 		if (clear_acceptconn_on_err == B_TRUE)
1107 			so->so_state &= ~SS_ACCEPTCONN;
1108 		if (restore_backlog_on_err == B_TRUE)
1109 			so->so_backlog = save_so_backlog;
1110 
1111 		if (unbind_on_err && so->so_state & SS_ISBOUND) {
1112 			int err;
1113 
1114 			err = sotpi_unbind(so, 0);
1115 			/* LINTED - statement has no consequent: if */
1116 			if (err) {
1117 				eprintsoline(so, error);
1118 			} else {
1119 				ASSERT(!(so->so_state & SS_ISBOUND));
1120 			}
1121 		}
1122 	}
1123 	if (!(flags & _SOBIND_LOCK_HELD)) {
1124 		so_unlock_single(so, SOLOCKED);
1125 		mutex_exit(&so->so_lock);
1126 	} else {
1127 		/* If the caller held the lock don't release it here */
1128 		ASSERT(MUTEX_HELD(&so->so_lock));
1129 		ASSERT(so->so_flag & SOLOCKED);
1130 	}
1131 	return (error);
1132 }
1133 
1134 /* bind the socket */
1135 static int
1136 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1137     int flags)
1138 {
1139 	if ((flags & _SOBIND_SOCKETPAIR) == 0)
1140 		return (sotpi_bindlisten(so, name, namelen, 0, flags));
1141 
1142 	flags &= ~_SOBIND_SOCKETPAIR;
1143 	return (sotpi_bindlisten(so, name, namelen, 1, flags));
1144 }
1145 
1146 /*
1147  * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1148  * address, or when listen needs to unbind and bind.
1149  * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1150  * so that a sobind can pick them up.
1151  */
1152 static int
1153 sotpi_unbind(struct sonode *so, int flags)
1154 {
1155 	struct T_unbind_req	unbind_req;
1156 	int			error = 0;
1157 	mblk_t			*mp;
1158 
1159 	dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1160 	    (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1161 
1162 	ASSERT(MUTEX_HELD(&so->so_lock));
1163 	ASSERT(so->so_flag & SOLOCKED);
1164 
1165 	if (!(so->so_state & SS_ISBOUND)) {
1166 		error = EINVAL;
1167 		eprintsoline(so, error);
1168 		goto done;
1169 	}
1170 
1171 	mutex_exit(&so->so_lock);
1172 
1173 	/*
1174 	 * Flush the read and write side (except stream head read queue)
1175 	 * and send down T_UNBIND_REQ.
1176 	 */
1177 	(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1178 
1179 	unbind_req.PRIM_type = T_UNBIND_REQ;
1180 	mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1181 	    0, _ALLOC_SLEEP);
1182 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1183 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1184 	mutex_enter(&so->so_lock);
1185 	if (error) {
1186 		eprintsoline(so, error);
1187 		goto done;
1188 	}
1189 
1190 	error = sowaitokack(so, T_UNBIND_REQ);
1191 	if (error) {
1192 		eprintsoline(so, error);
1193 		goto done;
1194 	}
1195 
1196 	/*
1197 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1198 	 * strsock_proto while the lock was dropped above, the unbind
1199 	 * is allowed to complete.
1200 	 */
1201 	if (!(flags & _SOUNBIND_REBIND)) {
1202 		/*
1203 		 * Clear out bound address.
1204 		 */
1205 		vnode_t *vp;
1206 
1207 		if ((vp = so->so_ux_bound_vp) != NULL) {
1208 
1209 			/* Undo any SSL proxy setup */
1210 			if ((so->so_family == AF_INET ||
1211 			    so->so_family == AF_INET6) &&
1212 			    (so->so_type == SOCK_STREAM) &&
1213 			    (so->so_kssl_ent != NULL)) {
1214 				kssl_release_ent(so->so_kssl_ent, so,
1215 				    so->so_kssl_type);
1216 				so->so_kssl_ent = NULL;
1217 				so->so_kssl_type = KSSL_NO_PROXY;
1218 			}
1219 
1220 			so->so_ux_bound_vp = NULL;
1221 			vn_rele_stream(vp);
1222 		}
1223 		/* Clear out address */
1224 		so->so_laddr_len = 0;
1225 	}
1226 	so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN|SS_LADDR_VALID);
1227 
1228 done:
1229 
1230 	/* If the caller held the lock don't release it here */
1231 	ASSERT(MUTEX_HELD(&so->so_lock));
1232 	ASSERT(so->so_flag & SOLOCKED);
1233 
1234 	return (error);
1235 }
1236 
1237 /*
1238  * listen on the socket.
1239  * For TPI conforming transports this has to first unbind with the transport
1240  * and then bind again using the new backlog.
1241  */
1242 int
1243 sotpi_listen(struct sonode *so, int backlog)
1244 {
1245 	int		error = 0;
1246 
1247 	dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1248 	    (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1249 
1250 	if (so->so_serv_type == T_CLTS)
1251 		return (EOPNOTSUPP);
1252 
1253 	/*
1254 	 * If the socket is ready to accept connections already, then
1255 	 * return without doing anything.  This avoids a problem where
1256 	 * a second listen() call fails if a connection is pending and
1257 	 * leaves the socket unbound. Only when we are not unbinding
1258 	 * with the transport can we safely increase the backlog.
1259 	 */
1260 	if (so->so_state & SS_ACCEPTCONN &&
1261 	    !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1262 	    /*CONSTCOND*/
1263 	    !solisten_tpi_tcp))
1264 		return (0);
1265 
1266 	if (so->so_state & SS_ISCONNECTED)
1267 		return (EINVAL);
1268 
1269 	mutex_enter(&so->so_lock);
1270 	so_lock_single(so);	/* Set SOLOCKED */
1271 
1272 	if (backlog < 0)
1273 		backlog = 0;
1274 	/*
1275 	 * Use the same qlimit as in BSD. BSD checks the qlimit
1276 	 * before queuing the next connection implying that a
1277 	 * listen(sock, 0) allows one connection to be queued.
1278 	 * BSD also uses 1.5 times the requested backlog.
1279 	 *
1280 	 * XNS Issue 4 required a strict interpretation of the backlog.
1281 	 * This has been waived subsequently for Issue 4 and the change
1282 	 * incorporated in XNS Issue 5. So we aren't required to do
1283 	 * anything special for XPG apps.
1284 	 */
1285 	if (backlog >= (INT_MAX - 1) / 3)
1286 		backlog = INT_MAX;
1287 	else
1288 		backlog = backlog * 3 / 2 + 1;
1289 
1290 	/*
1291 	 * If the listen doesn't change the backlog we do nothing.
1292 	 * This avoids an EPROTO error from the transport.
1293 	 */
1294 	if ((so->so_state & SS_ACCEPTCONN) &&
1295 	    so->so_backlog == backlog)
1296 		goto done;
1297 
1298 	if (!(so->so_state & SS_ISBOUND)) {
1299 		/*
1300 		 * Must have been explicitly bound in the UNIX domain.
1301 		 */
1302 		if (so->so_family == AF_UNIX) {
1303 			error = EINVAL;
1304 			goto done;
1305 		}
1306 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1307 		    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN);
1308 	} else if (backlog > 0) {
1309 		/*
1310 		 * AF_INET{,6} hack to avoid losing the port.
1311 		 * Assumes that all AF_INET{,6} transports can handle a
1312 		 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1313 		 * has already bound thus it is possible to avoid the unbind.
1314 		 */
1315 		if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1316 		    /*CONSTCOND*/
1317 		    !solisten_tpi_tcp)) {
1318 			error = sotpi_unbind(so, _SOUNBIND_REBIND);
1319 			if (error)
1320 				goto done;
1321 		}
1322 		error = sotpi_bindlisten(so, NULL, 0, backlog,
1323 		    _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN);
1324 	} else {
1325 		so->so_state |= SS_ACCEPTCONN;
1326 		so->so_backlog = backlog;
1327 	}
1328 	if (error)
1329 		goto done;
1330 	ASSERT(so->so_state & SS_ACCEPTCONN);
1331 done:
1332 	so_unlock_single(so, SOLOCKED);
1333 	mutex_exit(&so->so_lock);
1334 	return (error);
1335 }
1336 
1337 /*
1338  * Disconnect either a specified seqno or all (-1).
1339  * The former is used on listening sockets only.
1340  *
1341  * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1342  * the current use of sodisconnect(seqno == -1) is only for shutdown
1343  * so there is no point (and potentially incorrect) to unbind.
1344  */
1345 int
1346 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1347 {
1348 	struct T_discon_req	discon_req;
1349 	int			error = 0;
1350 	mblk_t			*mp;
1351 
1352 	dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1353 	    (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1354 
1355 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1356 		mutex_enter(&so->so_lock);
1357 		so_lock_single(so);	/* Set SOLOCKED */
1358 	} else {
1359 		ASSERT(MUTEX_HELD(&so->so_lock));
1360 		ASSERT(so->so_flag & SOLOCKED);
1361 	}
1362 
1363 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1364 		error = EINVAL;
1365 		eprintsoline(so, error);
1366 		goto done;
1367 	}
1368 
1369 	mutex_exit(&so->so_lock);
1370 	/*
1371 	 * Flush the write side (unless this is a listener)
1372 	 * and then send down a T_DISCON_REQ.
1373 	 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1374 	 * and other messages.)
1375 	 */
1376 	if (!(so->so_state & SS_ACCEPTCONN))
1377 		(void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1378 
1379 	discon_req.PRIM_type = T_DISCON_REQ;
1380 	discon_req.SEQ_number = seqno;
1381 	mp = soallocproto1(&discon_req, sizeof (discon_req),
1382 	    0, _ALLOC_SLEEP);
1383 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1384 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1385 	mutex_enter(&so->so_lock);
1386 	if (error) {
1387 		eprintsoline(so, error);
1388 		goto done;
1389 	}
1390 
1391 	error = sowaitokack(so, T_DISCON_REQ);
1392 	if (error) {
1393 		eprintsoline(so, error);
1394 		goto done;
1395 	}
1396 	/*
1397 	 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1398 	 * strsock_proto while the lock was dropped above, the disconnect
1399 	 * is allowed to complete. However, it is not possible to
1400 	 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1401 	 */
1402 	so->so_state &=
1403 	    ~(SS_ISCONNECTED|SS_ISCONNECTING|SS_LADDR_VALID|SS_FADDR_VALID);
1404 done:
1405 	if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1406 		so_unlock_single(so, SOLOCKED);
1407 		mutex_exit(&so->so_lock);
1408 	} else {
1409 		/* If the caller held the lock don't release it here */
1410 		ASSERT(MUTEX_HELD(&so->so_lock));
1411 		ASSERT(so->so_flag & SOLOCKED);
1412 	}
1413 	return (error);
1414 }
1415 
1416 int
1417 sotpi_accept(struct sonode *so, int fflag, struct sonode **nsop)
1418 {
1419 	struct T_conn_ind	*conn_ind;
1420 	struct T_conn_res	*conn_res;
1421 	int			error = 0;
1422 	mblk_t			*mp, *ctxmp, *ack_mp;
1423 	struct sonode		*nso;
1424 	vnode_t			*nvp;
1425 	void			*src;
1426 	t_uscalar_t		srclen;
1427 	void			*opt;
1428 	t_uscalar_t		optlen;
1429 	t_scalar_t		PRIM_type;
1430 	t_scalar_t		SEQ_number;
1431 	size_t			sinlen;
1432 
1433 	dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1434 	    (void *)so, fflag, (void *)nsop,
1435 	    pr_state(so->so_state, so->so_mode)));
1436 
1437 	/*
1438 	 * Defer single-threading the accepting socket until
1439 	 * the T_CONN_IND has been received and parsed and the
1440 	 * new sonode has been opened.
1441 	 */
1442 
1443 	/* Check that we are not already connected */
1444 	if ((so->so_state & SS_ACCEPTCONN) == 0)
1445 		goto conn_bad;
1446 again:
1447 	if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1448 		goto e_bad;
1449 
1450 	ASSERT(mp);
1451 	conn_ind = (struct T_conn_ind *)mp->b_rptr;
1452 	ctxmp = mp->b_cont;
1453 
1454 	/*
1455 	 * Save SEQ_number for error paths.
1456 	 */
1457 	SEQ_number = conn_ind->SEQ_number;
1458 
1459 	srclen = conn_ind->SRC_length;
1460 	src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1461 	if (src == NULL) {
1462 		error = EPROTO;
1463 		freemsg(mp);
1464 		eprintsoline(so, error);
1465 		goto disconnect_unlocked;
1466 	}
1467 	optlen = conn_ind->OPT_length;
1468 	switch (so->so_family) {
1469 	case AF_INET:
1470 	case AF_INET6:
1471 		if ((optlen == sizeof (intptr_t)) &&
1472 		    ((so->so_state & SS_DIRECT) != 0)) {
1473 			bcopy(mp->b_rptr + conn_ind->OPT_offset,
1474 			    &opt, conn_ind->OPT_length);
1475 		} else {
1476 			/*
1477 			 * The transport (in this case TCP) hasn't sent up
1478 			 * a pointer to an instance for the accept fast-path.
1479 			 * Disable fast-path completely because the call to
1480 			 * sotpi_create() below would otherwise create an
1481 			 * incomplete TCP instance, which would lead to
1482 			 * problems when sockfs sends a normal T_CONN_RES
1483 			 * message down the new stream.
1484 			 */
1485 			if (so->so_state & SS_DIRECT) {
1486 				int rval;
1487 				/*
1488 				 * For consistency we inform tcp to disable
1489 				 * direct interface on the listener, though
1490 				 * we can certainly live without doing this
1491 				 * because no data will ever travel upstream
1492 				 * on the listening socket.
1493 				 */
1494 				so->so_state &= ~SS_DIRECT;
1495 				(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1496 				    0, 0, K_TO_K, CRED(), &rval);
1497 			}
1498 			opt = NULL;
1499 			optlen = 0;
1500 		}
1501 		break;
1502 	case AF_UNIX:
1503 	default:
1504 		if (optlen != 0) {
1505 			opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1506 			    __TPI_ALIGN_SIZE);
1507 			if (opt == NULL) {
1508 				error = EPROTO;
1509 				freemsg(mp);
1510 				eprintsoline(so, error);
1511 				goto disconnect_unlocked;
1512 			}
1513 		}
1514 		if (so->so_family == AF_UNIX) {
1515 			if (!(so->so_state & SS_FADDR_NOXLATE)) {
1516 				src = NULL;
1517 				srclen = 0;
1518 			}
1519 			/* Extract src address from options */
1520 			if (optlen != 0)
1521 				so_getopt_srcaddr(opt, optlen, &src, &srclen);
1522 		}
1523 		break;
1524 	}
1525 
1526 	/*
1527 	 * Create the new socket.
1528 	 */
1529 	VN_HOLD(so->so_accessvp);
1530 	nso = sotpi_create(so->so_accessvp, so->so_family, so->so_type,
1531 	    so->so_protocol, so->so_version, so, &error);
1532 	if (nso == NULL) {
1533 		ASSERT(error != 0);
1534 		/*
1535 		 * Accept can not fail with ENOBUFS. sotpi_create
1536 		 * sleeps waiting for memory until a signal is caught
1537 		 * so return EINTR.
1538 		 */
1539 		freemsg(mp);
1540 		if (error == ENOBUFS)
1541 			error = EINTR;
1542 		goto e_disc_unl;
1543 	}
1544 	nvp = SOTOV(nso);
1545 
1546 	/*
1547 	 * If the transport sent up an SSL connection context, then attach
1548 	 * it the new socket, and set the (sd_wputdatafunc)() and
1549 	 * (sd_rputdatafunc)() stream head hooks to intercept and process
1550 	 * SSL records.
1551 	 */
1552 	if (ctxmp != NULL) {
1553 		/*
1554 		 * This kssl_ctx_t is already held for us by the transport.
1555 		 * So, we don't need to do a kssl_hold_ctx() here.
1556 		 */
1557 		nso->so_kssl_ctx = *((kssl_ctx_t *)ctxmp->b_rptr);
1558 		freemsg(ctxmp);
1559 		mp->b_cont = NULL;
1560 		strsetrwputdatahooks(nvp, strsock_kssl_input,
1561 		    strsock_kssl_output);
1562 
1563 		/* Disable sodirect if any */
1564 		if (nso->so_direct != NULL) {
1565 			mutex_enter(nso->so_direct->sod_lockp);
1566 			SOD_DISABLE(nso->so_direct);
1567 			mutex_exit(nso->so_direct->sod_lockp);
1568 			nso->so_direct = NULL;
1569 		}
1570 	}
1571 #ifdef DEBUG
1572 	/*
1573 	 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1574 	 * it's inherited early to allow debugging of the accept code itself.
1575 	 */
1576 	nso->so_options |= so->so_options & SO_DEBUG;
1577 #endif /* DEBUG */
1578 
1579 	/*
1580 	 * Save the SRC address from the T_CONN_IND
1581 	 * for getpeername to work on AF_UNIX and on transports that do not
1582 	 * support TI_GETPEERNAME.
1583 	 *
1584 	 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1585 	 * copyin_name().
1586 	 */
1587 	if (srclen > (t_uscalar_t)nso->so_faddr_maxlen) {
1588 		error = EINVAL;
1589 		freemsg(mp);
1590 		eprintsoline(so, error);
1591 		goto disconnect_vp_unlocked;
1592 	}
1593 	nso->so_faddr_len = (socklen_t)srclen;
1594 	ASSERT(so->so_faddr_len <= so->so_faddr_maxlen);
1595 	bcopy(src, nso->so_faddr_sa, srclen);
1596 	nso->so_state |= SS_FADDR_VALID;
1597 
1598 	if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1599 	    (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1600 		cred_t *cr;
1601 
1602 		if ((cr = DB_CRED(mp)) != NULL) {
1603 			crhold(cr);
1604 			nso->so_peercred = cr;
1605 			nso->so_cpid = DB_CPID(mp);
1606 		}
1607 		freemsg(mp);
1608 
1609 		mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1610 		    sizeof (intptr_t), 0, _ALLOC_INTR);
1611 		if (mp == NULL) {
1612 			/*
1613 			 * Accept can not fail with ENOBUFS.
1614 			 * A signal was caught so return EINTR.
1615 			 */
1616 			error = EINTR;
1617 			eprintsoline(so, error);
1618 			goto disconnect_vp_unlocked;
1619 		}
1620 		conn_res = (struct T_conn_res *)mp->b_rptr;
1621 	} else {
1622 		nso->so_peercred = DB_CRED(mp);
1623 		nso->so_cpid = DB_CPID(mp);
1624 		DB_CRED(mp) = NULL;
1625 
1626 		mp->b_rptr = DB_BASE(mp);
1627 		conn_res = (struct T_conn_res *)mp->b_rptr;
1628 		mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1629 	}
1630 
1631 	/*
1632 	 * New socket must be bound at least in sockfs and, except for AF_INET,
1633 	 * (or AF_INET6) it also has to be bound in the transport provider.
1634 	 * We set the local address in the sonode from the T_OK_ACK of the
1635 	 * T_CONN_RES. For this reason the address we bind to here isn't
1636 	 * important.
1637 	 */
1638 	if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1639 	    /*CONSTCOND*/
1640 	    nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1641 		/*
1642 		 * Optimization for AF_INET{,6} transports
1643 		 * that can handle a T_CONN_RES without being bound.
1644 		 */
1645 		mutex_enter(&nso->so_lock);
1646 		so_automatic_bind(nso);
1647 		mutex_exit(&nso->so_lock);
1648 	} else {
1649 		/* Perform NULL bind with the transport provider. */
1650 		if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC)) != 0) {
1651 			ASSERT(error != ENOBUFS);
1652 			freemsg(mp);
1653 			eprintsoline(nso, error);
1654 			goto disconnect_vp_unlocked;
1655 		}
1656 	}
1657 
1658 	/*
1659 	 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1660 	 * so that any data arriving on the new socket will cause the
1661 	 * appropriate signals to be delivered for the new socket.
1662 	 *
1663 	 * No other thread (except strsock_proto and strsock_misc)
1664 	 * can access the new socket thus we relax the locking.
1665 	 */
1666 	nso->so_pgrp = so->so_pgrp;
1667 	nso->so_state |= so->so_state & (SS_ASYNC|SS_FADDR_NOXLATE);
1668 
1669 	if (nso->so_pgrp != 0) {
1670 		if ((error = so_set_events(nso, nvp, CRED())) != 0) {
1671 			eprintsoline(nso, error);
1672 			error = 0;
1673 			nso->so_pgrp = 0;
1674 		}
1675 	}
1676 
1677 	/*
1678 	 * Make note of the socket level options. TCP and IP level options
1679 	 * are already inherited. We could do all this after accept is
1680 	 * successful but doing it here simplifies code and no harm done
1681 	 * for error case.
1682 	 */
1683 	nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1684 	    SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1685 	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1686 	nso->so_sndbuf = so->so_sndbuf;
1687 	nso->so_rcvbuf = so->so_rcvbuf;
1688 	if (nso->so_options & SO_LINGER)
1689 		nso->so_linger = so->so_linger;
1690 
1691 	if ((so->so_state & SS_DIRECT) != 0) {
1692 
1693 		ASSERT(opt != NULL);
1694 
1695 		conn_res->OPT_length = optlen;
1696 		conn_res->OPT_offset = MBLKL(mp);
1697 		bcopy(&opt, mp->b_wptr, optlen);
1698 		mp->b_wptr += optlen;
1699 		conn_res->PRIM_type = T_CONN_RES;
1700 		conn_res->ACCEPTOR_id = 0;
1701 		PRIM_type = T_CONN_RES;
1702 
1703 		/* Send down the T_CONN_RES on acceptor STREAM */
1704 		error = kstrputmsg(SOTOV(nso), mp, NULL,
1705 		    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1706 		if (error) {
1707 			mutex_enter(&so->so_lock);
1708 			so_lock_single(so);
1709 			eprintsoline(so, error);
1710 			goto disconnect_vp;
1711 		}
1712 		mutex_enter(&nso->so_lock);
1713 		error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1714 		    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1715 		if (error) {
1716 			mutex_exit(&nso->so_lock);
1717 			mutex_enter(&so->so_lock);
1718 			so_lock_single(so);
1719 			eprintsoline(so, error);
1720 			goto disconnect_vp;
1721 		}
1722 		if (nso->so_family == AF_INET) {
1723 			sin_t *sin;
1724 
1725 			sin = (sin_t *)(ack_mp->b_rptr +
1726 			    sizeof (struct T_ok_ack));
1727 			bcopy(sin, nso->so_laddr_sa, sizeof (sin_t));
1728 			nso->so_laddr_len = sizeof (sin_t);
1729 		} else {
1730 			sin6_t *sin6;
1731 
1732 			sin6 = (sin6_t *)(ack_mp->b_rptr +
1733 			    sizeof (struct T_ok_ack));
1734 			bcopy(sin6, nso->so_laddr_sa, sizeof (sin6_t));
1735 			nso->so_laddr_len = sizeof (sin6_t);
1736 		}
1737 		freemsg(ack_mp);
1738 
1739 		nso->so_state |= SS_ISCONNECTED | SS_LADDR_VALID;
1740 		nso->so_priv = opt;
1741 
1742 		if (so->so_nl7c_flags & NL7C_ENABLED) {
1743 			/*
1744 			 * A NL7C marked listen()er so the new socket
1745 			 * inherits the listen()er's NL7C state, except
1746 			 * for NL7C_POLLIN.
1747 			 *
1748 			 * Only call NL7C to process the new socket if
1749 			 * the listen socket allows blocking i/o.
1750 			 */
1751 			nso->so_nl7c_flags = so->so_nl7c_flags & (~NL7C_POLLIN);
1752 			if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) {
1753 				/*
1754 				 * Nonblocking accept() just make it
1755 				 * persist to defer processing to the
1756 				 * read-side syscall (e.g. read).
1757 				 */
1758 				nso->so_nl7c_flags |= NL7C_SOPERSIST;
1759 			} else if (nl7c_process(nso, B_FALSE)) {
1760 				/*
1761 				 * NL7C has completed processing on the
1762 				 * socket, close the socket and back to
1763 				 * the top to await the next T_CONN_IND.
1764 				 */
1765 				mutex_exit(&nso->so_lock);
1766 				(void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
1767 				    CRED(), NULL);
1768 				VN_RELE(nvp);
1769 				goto again;
1770 			}
1771 			/* Pass the new socket out */
1772 		}
1773 
1774 		mutex_exit(&nso->so_lock);
1775 
1776 		/*
1777 		 * It's possible, through the use of autopush for example,
1778 		 * that the acceptor stream may not support SS_DIRECT
1779 		 * semantics. If the new socket does not support SS_DIRECT
1780 		 * we issue a _SIOCSOCKFALLBACK to inform the transport
1781 		 * as we would in the I_PUSH case.
1782 		 */
1783 		if (!(nso->so_state & SS_DIRECT)) {
1784 			int	rval;
1785 
1786 			if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
1787 			    0, 0, K_TO_K, CRED(), &rval)) != 0) {
1788 				mutex_enter(&so->so_lock);
1789 				so_lock_single(so);
1790 				eprintsoline(so, error);
1791 				goto disconnect_vp;
1792 			}
1793 		}
1794 
1795 		/*
1796 		 * Pass out new socket.
1797 		 */
1798 		if (nsop != NULL)
1799 			*nsop = nso;
1800 
1801 		return (0);
1802 	}
1803 
1804 	/*
1805 	 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
1806 	 * which don't support the FireEngine accept fast-path. It is also
1807 	 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
1808 	 * again. Neither sockfs nor TCP attempt to find out if some other
1809 	 * random module has been inserted in between (in which case we
1810 	 * should follow TLI accept behaviour). We blindly assume the worst
1811 	 * case and revert back to old behaviour i.e. TCP will not send us
1812 	 * any option (eager) and the accept should happen on the listener
1813 	 * queue. Any queued T_conn_ind have already got their options removed
1814 	 * by so_sock2_stream() when "sockmod" was I_POP'd.
1815 	 */
1816 	/*
1817 	 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
1818 	 */
1819 	if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
1820 #ifdef	_ILP32
1821 		queue_t	*q;
1822 
1823 		/*
1824 		 * Find read queue in driver
1825 		 * Can safely do this since we "own" nso/nvp.
1826 		 */
1827 		q = strvp2wq(nvp)->q_next;
1828 		while (SAMESTR(q))
1829 			q = q->q_next;
1830 		q = RD(q);
1831 		conn_res->ACCEPTOR_id = (t_uscalar_t)q;
1832 #else
1833 		conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
1834 #endif	/* _ILP32 */
1835 		conn_res->PRIM_type = O_T_CONN_RES;
1836 		PRIM_type = O_T_CONN_RES;
1837 	} else {
1838 		conn_res->ACCEPTOR_id = nso->so_acceptor_id;
1839 		conn_res->PRIM_type = T_CONN_RES;
1840 		PRIM_type = T_CONN_RES;
1841 	}
1842 	conn_res->SEQ_number = SEQ_number;
1843 	conn_res->OPT_length = 0;
1844 	conn_res->OPT_offset = 0;
1845 
1846 	mutex_enter(&so->so_lock);
1847 	so_lock_single(so);	/* Set SOLOCKED */
1848 	mutex_exit(&so->so_lock);
1849 
1850 	error = kstrputmsg(SOTOV(so), mp, NULL,
1851 	    0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1852 	mutex_enter(&so->so_lock);
1853 	if (error) {
1854 		eprintsoline(so, error);
1855 		goto disconnect_vp;
1856 	}
1857 	error = sowaitprim(so, PRIM_type, T_OK_ACK,
1858 	    (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1859 	if (error) {
1860 		eprintsoline(so, error);
1861 		goto disconnect_vp;
1862 	}
1863 	/*
1864 	 * If there is a sin/sin6 appended onto the T_OK_ACK use
1865 	 * that to set the local address. If this is not present
1866 	 * then we zero out the address and don't set the
1867 	 * SS_LADDR_VALID bit. For AF_UNIX endpoints we copy over
1868 	 * the pathname from the listening socket.
1869 	 */
1870 	sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
1871 	if ((nso->so_family == AF_INET) || (nso->so_family == AF_INET6) &&
1872 	    MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
1873 		ack_mp->b_rptr += sizeof (struct T_ok_ack);
1874 		bcopy(ack_mp->b_rptr, nso->so_laddr_sa, sinlen);
1875 		nso->so_laddr_len = sinlen;
1876 		nso->so_state |= SS_LADDR_VALID;
1877 	} else if (nso->so_family == AF_UNIX) {
1878 		ASSERT(so->so_family == AF_UNIX);
1879 		nso->so_laddr_len = so->so_laddr_len;
1880 		ASSERT(nso->so_laddr_len <= nso->so_laddr_maxlen);
1881 		bcopy(so->so_laddr_sa, nso->so_laddr_sa, nso->so_laddr_len);
1882 		nso->so_state |= SS_LADDR_VALID;
1883 	} else {
1884 		nso->so_laddr_len = so->so_laddr_len;
1885 		ASSERT(nso->so_laddr_len <= nso->so_laddr_maxlen);
1886 		bzero(nso->so_laddr_sa, nso->so_addr_size);
1887 		nso->so_laddr_sa->sa_family = nso->so_family;
1888 	}
1889 	freemsg(ack_mp);
1890 
1891 	so_unlock_single(so, SOLOCKED);
1892 	mutex_exit(&so->so_lock);
1893 
1894 	nso->so_state |= SS_ISCONNECTED;
1895 
1896 	/*
1897 	 * Pass out new socket.
1898 	 */
1899 	if (nsop != NULL)
1900 		*nsop = nso;
1901 
1902 	return (0);
1903 
1904 
1905 eproto_disc_unl:
1906 	error = EPROTO;
1907 e_disc_unl:
1908 	eprintsoline(so, error);
1909 	goto disconnect_unlocked;
1910 
1911 pr_disc_vp_unl:
1912 	eprintsoline(so, error);
1913 disconnect_vp_unlocked:
1914 	(void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL);
1915 	VN_RELE(nvp);
1916 disconnect_unlocked:
1917 	(void) sodisconnect(so, SEQ_number, 0);
1918 	return (error);
1919 
1920 pr_disc_vp:
1921 	eprintsoline(so, error);
1922 disconnect_vp:
1923 	(void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
1924 	so_unlock_single(so, SOLOCKED);
1925 	mutex_exit(&so->so_lock);
1926 	(void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL);
1927 	VN_RELE(nvp);
1928 	return (error);
1929 
1930 conn_bad:	/* Note: SunOS 4/BSD unconditionally returns EINVAL here */
1931 	error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
1932 	    ? EOPNOTSUPP : EINVAL;
1933 e_bad:
1934 	eprintsoline(so, error);
1935 	return (error);
1936 }
1937 
/*
 * connect a socket.
 *
 * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
 * unconnect (by specifying a null address).
 *
 * Arguments:
 *	so	- the socket to connect.
 *	name	- peer address. NULL, or an address whose sa_family is
 *		  AF_UNSPEC, is treated as a request to "unconnect".
 *	namelen	- length of name in bytes.
 *	fflag	- passed to sowaitconnected(); forced to 0 for AF_INET{,6}
 *		  SOCK_DGRAM/SOCK_RAW sockets before the T_CONN_REQ is sent.
 *	flags	- _SOCONNECT_* flags:
 *		  _SOCONNECT_NOXLATE: name is a struct sockaddr_ux whose
 *		  sou_addr member is used as the address (AF_UNIX only).
 *		  _SOCONNECT_XPG4_2: passed on to so_ux_addr_xlate() and causes
 *		  EHOSTUNREACH to be reported as ENETUNREACH.
 *		  _SOCONNECT_DID_BIND: set locally when this call had to bind
 *		  the socket, so that a fatal error can undo the bind.
 *
 * Returns 0 on success, otherwise an errno value.
 *
 * Locking: so_lock and SOLOCKED are acquired and released here. They are
 * dropped temporarily around sotpi_setsockopt(), and so_lock is dropped
 * around kstrputmsg().
 */
int
sotpi_connect(struct sonode *so,
	const struct sockaddr *name,
	socklen_t namelen,
	int fflag,
	int flags)
{
	struct T_conn_req	conn_req;
	int			error = 0;
	mblk_t			*mp;
	void			*src;
	socklen_t		srclen;
	void			*addr;
	socklen_t		addrlen;
	/* B_TRUE while this thread holds SOLOCKED and must drop it at done */
	boolean_t		need_unlock;

	dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
	    (void *)so, (void *)name, namelen, fflag, flags,
	    pr_state(so->so_state, so->so_mode)));

	/*
	 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
	 * avoid sleeping for memory with SOLOCKED held.
	 * We know that the T_CONN_REQ can't be larger than 2 * so_faddr_maxlen
	 * + sizeof (struct T_opthdr).
	 * (the AF_UNIX so_ux_addr_xlate() does not make the address
	 * exceed so_faddr_maxlen).
	 */
	mp = soallocproto(sizeof (struct T_conn_req) +
	    2 * so->so_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR);
	if (mp == NULL) {
		/*
		 * Connect can not fail with ENOBUFS. A signal was
		 * caught so return EINTR.
		 */
		error = EINTR;
		eprintsoline(so, error);
		return (error);
	}

	mutex_enter(&so->so_lock);
	/*
	 * Make sure there is a preallocated T_unbind_req message
	 * before any binding. This message is allocated when the
	 * socket is created. Since another thread can consume
	 * so_unbind_mp by the time we return from so_lock_single(),
	 * we should check the availability of so_unbind_mp after
	 * we return from so_lock_single().
	 */

	so_lock_single(so);	/* Set SOLOCKED */
	need_unlock = B_TRUE;

	if (so->so_unbind_mp == NULL) {
		dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
		/* NOTE: holding so_lock while sleeping */
		so->so_unbind_mp =
		    soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR);
		if (so->so_unbind_mp == NULL) {
			error = EINTR;
			goto done;
		}
	}

	/*
	 * Can't have done a listen before connecting.
	 */
	if (so->so_state & SS_ACCEPTCONN) {
		error = EOPNOTSUPP;
		goto done;
	}

	/*
	 * Must be bound with the transport
	 */
	if (!(so->so_state & SS_ISBOUND)) {
		if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
		    /*CONSTCOND*/
		    so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
			/*
			 * Optimization for AF_INET{,6} transports
			 * that can handle a T_CONN_REQ without being bound.
			 */
			so_automatic_bind(so);
		} else {
			error = sotpi_bind(so, NULL, 0,
			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD);
			if (error)
				goto done;
		}
		ASSERT(so->so_state & SS_ISBOUND);
		/* Remember the implicit bind so a fatal error can undo it */
		flags |= _SOCONNECT_DID_BIND;
	}

	/*
	 * Handle a connect to a name parameter of type AF_UNSPEC like a
	 * connect to a null address. This is the portable method to
	 * unconnect a socket.
	 */
	if ((namelen >= sizeof (sa_family_t)) &&
	    (name->sa_family == AF_UNSPEC)) {
		name = NULL;
		namelen = 0;
	}

	/*
	 * Check that we are not already connected.
	 * A connection-oriented socket cannot be reconnected.
	 * A connected connection-less socket can be
	 * - connected to a different address by a subsequent connect
	 * - "unconnected" by a connect to the NULL address
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
		ASSERT(!(flags & _SOCONNECT_DID_BIND));
		if (so->so_mode & SM_CONNREQUIRED) {
			/* Connection-oriented socket */
			error = so->so_state & SS_ISCONNECTED ?
			    EISCONN : EALREADY;
			goto done;
		}
		/* Connection-less socket */
		if (name == NULL) {
			/*
			 * Remove the connected state and clear SO_DGRAM_ERRIND
			 * since it was set when the socket was connected.
			 * If this is UDP also send down a T_DISCON_REQ.
			 */
			int val;

			if ((so->so_family == AF_INET ||
			    so->so_family == AF_INET6) &&
			    (so->so_type == SOCK_DGRAM ||
			    so->so_type == SOCK_RAW) &&
			    /*CONSTCOND*/
			    !soconnect_tpi_udp) {
				/* XXX What about implicitly unbinding here? */
				error = sodisconnect(so, -1,
				    _SODISCONNECT_LOCK_HELD);
			} else {
				so->so_state &=
				    ~(SS_ISCONNECTED | SS_ISCONNECTING |
				    SS_FADDR_VALID);
				so->so_faddr_len = 0;
			}

			/*
			 * Drop SOLOCKED and so_lock around the setsockopt
			 * call and re-acquire both before the common exit.
			 */
			so_unlock_single(so, SOLOCKED);
			mutex_exit(&so->so_lock);

			val = 0;
			(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
			    &val, (t_uscalar_t)sizeof (val));

			mutex_enter(&so->so_lock);
			so_lock_single(so);	/* Set SOLOCKED */
			goto done;
		}
	}
	ASSERT(so->so_state & SS_ISBOUND);

	/* An unconnect request on a socket that is not connected */
	if (name == NULL || namelen == 0) {
		error = EINVAL;
		goto done;
	}
	/*
	 * Mark the socket if so_faddr_sa represents the transport level
	 * address.
	 */
	if (flags & _SOCONNECT_NOXLATE) {
		struct sockaddr_ux	*soaddr_ux;

		ASSERT(so->so_family == AF_UNIX);
		if (namelen != sizeof (struct sockaddr_ux)) {
			error = EINVAL;
			goto done;
		}
		soaddr_ux = (struct sockaddr_ux *)name;
		name = (struct sockaddr *)&soaddr_ux->sou_addr;
		namelen = sizeof (soaddr_ux->sou_addr);
		so->so_state |= SS_FADDR_NOXLATE;
	}

	/*
	 * Length and family checks.
	 */
	error = so_addr_verify(so, name, namelen);
	if (error)
		goto bad;

	/*
	 * Save foreign address. Needed for AF_UNIX as well as
	 * transport providers that do not support TI_GETPEERNAME.
	 * Also used for cached foreign address for TCP and UDP.
	 */
	if (namelen > (t_uscalar_t)so->so_faddr_maxlen) {
		error = EINVAL;
		goto done;
	}
	so->so_faddr_len = (socklen_t)namelen;
	ASSERT(so->so_faddr_len <= so->so_faddr_maxlen);
	bcopy(name, so->so_faddr_sa, namelen);
	so->so_state |= SS_FADDR_VALID;

	/*
	 * Select the destination address (addr/addrlen) to place in the
	 * T_CONN_REQ and the optional source address (src/srclen) that is
	 * sent as an SO_SRCADDR option.
	 */
	if (so->so_family == AF_UNIX) {
		if (so->so_state & SS_FADDR_NOXLATE) {
			/*
			 * Already have a transport internal address. Do not
			 * pass any (transport internal) source address.
			 */
			addr = so->so_faddr_sa;
			addrlen = (t_uscalar_t)so->so_faddr_len;
			src = NULL;
			srclen = 0;
		} else {
			/*
			 * Pass the sockaddr_un source address as an option
			 * and translate the remote address.
			 * Holding so_lock thus so_laddr_sa can not change.
			 */
			src = so->so_laddr_sa;
			srclen = (t_uscalar_t)so->so_laddr_len;
			dprintso(so, 1,
			    ("sotpi_connect UNIX: srclen %d, src %p\n",
			    srclen, src));
			error = so_ux_addr_xlate(so,
			    so->so_faddr_sa, (socklen_t)so->so_faddr_len,
			    (flags & _SOCONNECT_XPG4_2),
			    &addr, &addrlen);
			if (error)
				goto bad;
		}
	} else {
		addr = so->so_faddr_sa;
		addrlen = (t_uscalar_t)so->so_faddr_len;
		src = NULL;
		srclen = 0;
	}
	/*
	 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
	 * option which asks the transport provider to send T_UDERR_IND
	 * messages. These T_UDERR_IND messages are used to return connected
	 * style errors (e.g. ECONNRESET) for connected datagram sockets.
	 *
	 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
	 * we send down a T_CONN_REQ. This is needed to let the
	 * transport assign a local address that is consistent with
	 * the remote address. Applications depend on a getsockname()
	 * after a connect() to retrieve the "source" IP address for
	 * the connected socket.  Invalidate the cached local address
	 * to force getsockname() to enquire of the transport.
	 */
	if (!(so->so_mode & SM_CONNREQUIRED)) {
		/*
		 * Datagram socket.
		 */
		int32_t val;

		/* Drop both locks around the setsockopt call */
		so_unlock_single(so, SOLOCKED);
		mutex_exit(&so->so_lock);

		val = 1;
		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
		    &val, (t_uscalar_t)sizeof (val));

		mutex_enter(&so->so_lock);
		so_lock_single(so);	/* Set SOLOCKED */
		if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
		    (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
		    soconnect_tpi_udp) {
			/* No T_CONN_REQ is sent; just mark it connected */
			soisconnected(so);
			goto done;
		}
		/*
		 * Send down T_CONN_REQ etc.
		 * Clear fflag to avoid returning EWOULDBLOCK.
		 */
		fflag = 0;
		ASSERT(so->so_family != AF_UNIX);
		so->so_state &= ~SS_LADDR_VALID;
	} else if (so->so_laddr_len != 0) {
		/*
		 * If the local address or port was "any" then it may be
		 * changed by the transport as a result of the
		 * connect.  Invalidate the cached version if we have one.
		 */
		switch (so->so_family) {
		case AF_INET:
			ASSERT(so->so_laddr_len == (socklen_t)sizeof (sin_t));
			if (((sin_t *)so->so_laddr_sa)->sin_addr.s_addr ==
			    INADDR_ANY ||
			    ((sin_t *)so->so_laddr_sa)->sin_port == 0)
				so->so_state &= ~SS_LADDR_VALID;
			break;

		case AF_INET6:
			ASSERT(so->so_laddr_len == (socklen_t)sizeof (sin6_t));
			if (IN6_IS_ADDR_UNSPECIFIED(
			    &((sin6_t *)so->so_laddr_sa) ->sin6_addr) ||
			    IN6_IS_ADDR_V4MAPPED_ANY(
			    &((sin6_t *)so->so_laddr_sa)->sin6_addr) ||
			    ((sin6_t *)so->so_laddr_sa)->sin6_port == 0)
				so->so_state &= ~SS_LADDR_VALID;
			break;

		default:
			break;
		}
	}

	/*
	 * Check for failure of an earlier call
	 */
	if (so->so_error != 0)
		goto so_bad;

	/*
	 * Send down T_CONN_REQ. Message was allocated above.
	 */
	conn_req.PRIM_type = T_CONN_REQ;
	conn_req.DEST_length = addrlen;
	conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
	if (srclen == 0) {
		conn_req.OPT_length = 0;
		conn_req.OPT_offset = 0;
		soappendmsg(mp, &conn_req, sizeof (conn_req));
		soappendmsg(mp, addr, addrlen);
	} else {
		/*
		 * There is an AF_UNIX sockaddr_un to include as a source
		 * address option.
		 */
		struct T_opthdr toh;

		toh.level = SOL_SOCKET;
		toh.name = SO_SRCADDR;
		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
		toh.status = 0;
		conn_req.OPT_length =
		    (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
		conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
		    _TPI_ALIGN_TOPT(addrlen));

		/*
		 * Layout: T_conn_req, destination address, padding to option
		 * alignment, T_opthdr, source address, padding.
		 */
		soappendmsg(mp, &conn_req, sizeof (conn_req));
		soappendmsg(mp, addr, addrlen);
		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
		soappendmsg(mp, &toh, sizeof (toh));
		soappendmsg(mp, src, srclen);
		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
	}
	/*
	 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
	 * in order to have the right state when the T_CONN_CON shows up.
	 */
	soisconnecting(so);
	/* so_lock is dropped for the put; SOLOCKED remains set */
	mutex_exit(&so->so_lock);

	if (audit_active)
		audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);

	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
	/*
	 * mp is cleared whether or not the put succeeded, so the freemsg()
	 * at done never sees this message again.
	 * NOTE(review): presumably kstrputmsg() always consumes the message,
	 * even on error - confirm.
	 */
	mp = NULL;
	mutex_enter(&so->so_lock);
	if (error != 0)
		goto bad;

	if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
		goto bad;

	/* Allow other threads to access the socket */
	so_unlock_single(so, SOLOCKED);
	need_unlock = B_FALSE;

	/*
	 * Wait until we get a T_CONN_CON or an error
	 */
	if ((error = sowaitconnected(so, fflag, 0)) != 0) {
		/* The error handling at done may need SOLOCKED again */
		so_lock_single(so);	/* Set SOLOCKED */
		need_unlock = B_TRUE;
	}

done:
	/*
	 * Common exit: so_lock is held here on every path. Release the
	 * preallocated message if it is still ours, then classify the error.
	 */
	freemsg(mp);
	switch (error) {
	case EINPROGRESS:
	case EALREADY:
	case EISCONN:
	case EINTR:
		/* Non-fatal errors */
		so->so_state &= ~SS_LADDR_VALID;
		/* FALLTHRU */
	case 0:
		break;

	case EHOSTUNREACH:
		if (flags & _SOCONNECT_XPG4_2) {
			/*
			 * X/Open specification contains a requirement that
			 * ENETUNREACH be returned but does not require
			 * EHOSTUNREACH. In order to keep the test suite
			 * happy we mess with the errno here.
			 */
			error = ENETUNREACH;
		}
		/* FALLTHRU */

	default:
		ASSERT(need_unlock);
		/*
		 * Fatal errors: clear SS_ISCONNECTING in case it was set,
		 * and invalidate local-address cache
		 */
		so->so_state &= ~(SS_ISCONNECTING | SS_LADDR_VALID);
		/* A discon_ind might have already unbound us */
		if ((flags & _SOCONNECT_DID_BIND) &&
		    (so->so_state & SS_ISBOUND)) {
			int err;

			err = sotpi_unbind(so, 0);
			/* LINTED - statement has no conseq */
			if (err) {
				eprintsoline(so, err);
			}
		}
		break;
	}
	if (need_unlock)
		so_unlock_single(so, SOLOCKED);
	mutex_exit(&so->so_lock);
	return (error);

	/*
	 * Error entry points: so_bad fetches the pending socket error via
	 * sogeterr(); bad logs the error already in 'error'. Both continue
	 * at done.
	 */
so_bad:	error = sogeterr(so);
bad:	eprintsoline(so, error);
	goto done;
}
2379 
2380 int
2381 sotpi_shutdown(struct sonode *so, int how)
2382 {
2383 	struct T_ordrel_req	ordrel_req;
2384 	mblk_t			*mp;
2385 	uint_t			old_state, state_change;
2386 	int			error = 0;
2387 
2388 	dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2389 	    (void *)so, how, pr_state(so->so_state, so->so_mode)));
2390 
2391 	mutex_enter(&so->so_lock);
2392 	so_lock_single(so);	/* Set SOLOCKED */
2393 
2394 	/*
2395 	 * SunOS 4.X has no check for datagram sockets.
2396 	 * 5.X checks that it is connected (ENOTCONN)
2397 	 * X/Open requires that we check the connected state.
2398 	 */
2399 	if (!(so->so_state & SS_ISCONNECTED)) {
2400 		if (!xnet_skip_checks) {
2401 			error = ENOTCONN;
2402 			if (xnet_check_print) {
2403 				printf("sockfs: X/Open shutdown check "
2404 				    "caused ENOTCONN\n");
2405 			}
2406 		}
2407 		goto done;
2408 	}
2409 	/*
2410 	 * Record the current state and then perform any state changes.
2411 	 * Then use the difference between the old and new states to
2412 	 * determine which messages need to be sent.
2413 	 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2414 	 * duplicate calls to shutdown().
2415 	 */
2416 	old_state = so->so_state;
2417 
2418 	switch (how) {
2419 	case 0:
2420 		socantrcvmore(so);
2421 		break;
2422 	case 1:
2423 		socantsendmore(so);
2424 		break;
2425 	case 2:
2426 		socantsendmore(so);
2427 		socantrcvmore(so);
2428 		break;
2429 	default:
2430 		error = EINVAL;
2431 		goto done;
2432 	}
2433 
2434 	/*
2435 	 * Assumes that the SS_CANT* flags are never cleared in the above code.
2436 	 */
2437 	state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2438 	    (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2439 	ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2440 
2441 	switch (state_change) {
2442 	case 0:
2443 		dprintso(so, 1,
2444 		    ("sotpi_shutdown: nothing to send in state 0x%x\n",
2445 		    so->so_state));
2446 		goto done;
2447 
2448 	case SS_CANTRCVMORE:
2449 		mutex_exit(&so->so_lock);
2450 		strseteof(SOTOV(so), 1);
2451 		/*
2452 		 * strseteof takes care of read side wakeups,
2453 		 * pollwakeups, and signals.
2454 		 */
2455 		/*
2456 		 * Get the read lock before flushing data to avoid problems
2457 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2458 		 */
2459 		mutex_enter(&so->so_lock);
2460 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2461 		mutex_exit(&so->so_lock);
2462 
2463 		/* Flush read side queue */
2464 		strflushrq(SOTOV(so), FLUSHALL);
2465 
2466 		mutex_enter(&so->so_lock);
2467 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2468 		break;
2469 
2470 	case SS_CANTSENDMORE:
2471 		mutex_exit(&so->so_lock);
2472 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2473 		mutex_enter(&so->so_lock);
2474 		break;
2475 
2476 	case SS_CANTSENDMORE|SS_CANTRCVMORE:
2477 		mutex_exit(&so->so_lock);
2478 		strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2479 		strseteof(SOTOV(so), 1);
2480 		/*
2481 		 * strseteof takes care of read side wakeups,
2482 		 * pollwakeups, and signals.
2483 		 */
2484 		/*
2485 		 * Get the read lock before flushing data to avoid problems
2486 		 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2487 		 */
2488 		mutex_enter(&so->so_lock);
2489 		(void) so_lock_read(so, 0);	/* Set SOREADLOCKED */
2490 		mutex_exit(&so->so_lock);
2491 
2492 		/* Flush read side queue */
2493 		strflushrq(SOTOV(so), FLUSHALL);
2494 
2495 		mutex_enter(&so->so_lock);
2496 		so_unlock_read(so);		/* Clear SOREADLOCKED */
2497 		break;
2498 	}
2499 
2500 	ASSERT(MUTEX_HELD(&so->so_lock));
2501 
2502 	/*
2503 	 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2504 	 * was set due to this call and the new state has both of them set:
2505 	 *	Send the AF_UNIX close indication
2506 	 *	For T_COTS send a discon_ind
2507 	 *
2508 	 * If cantsend was set due to this call:
2509 	 *	For T_COTSORD send an ordrel_ind
2510 	 *
2511 	 * Note that for T_CLTS there is no message sent here.
2512 	 */
2513 	if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2514 	    (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2515 		/*
2516 		 * For SunOS 4.X compatibility we tell the other end
2517 		 * that we are unable to receive at this point.
2518 		 */
2519 		if (so->so_family == AF_UNIX && so->so_serv_type != T_CLTS)
2520 			so_unix_close(so);
2521 
2522 		if (so->so_serv_type == T_COTS)
2523 			error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2524 	}
2525 	if ((state_change & SS_CANTSENDMORE) &&
2526 	    (so->so_serv_type == T_COTS_ORD)) {
2527 		/* Send an orderly release */
2528 		ordrel_req.PRIM_type = T_ORDREL_REQ;
2529 
2530 		mutex_exit(&so->so_lock);
2531 		mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2532 		    0, _ALLOC_SLEEP);
2533 		/*
2534 		 * Send down the T_ORDREL_REQ even if there is flow control.
2535 		 * This prevents shutdown from blocking.
2536 		 * Note that there is no T_OK_ACK for ordrel_req.
2537 		 */
2538 		error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2539 		    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2540 		mutex_enter(&so->so_lock);
2541 		if (error) {
2542 			eprintsoline(so, error);
2543 			goto done;
2544 		}
2545 	}
2546 
2547 done:
2548 	so_unlock_single(so, SOLOCKED);
2549 	mutex_exit(&so->so_lock);
2550 	return (error);
2551 }
2552 
2553 /*
2554  * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2555  * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2556  * that we have closed.
2557  * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2558  * T_UNITDATA_REQ containing the same option.
2559  *
2560  * For SOCK_DGRAM half-connections (somebody connected to this end
2561  * but this end is not connect) we don't know where to send any
2562  * SO_UNIX_CLOSE.
2563  *
2564  * We have to ignore stream head errors just in case there has been
2565  * a shutdown(output).
2566  * Ignore any flow control to try to get the message more quickly to the peer.
2567  * While locally ignoring flow control solves the problem when there
2568  * is only the loopback transport on the stream it would not provide
2569  * the correct AF_UNIX socket semantics when one or more modules have
2570  * been pushed.
2571  */
2572 void
2573 so_unix_close(struct sonode *so)
2574 {
2575 	int		error;
2576 	struct T_opthdr	toh;
2577 	mblk_t		*mp;
2578 
2579 	ASSERT(MUTEX_HELD(&so->so_lock));
2580 
2581 	ASSERT(so->so_family == AF_UNIX);
2582 
2583 	if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2584 	    (SS_ISCONNECTED|SS_ISBOUND))
2585 		return;
2586 
2587 	dprintso(so, 1, ("so_unix_close(%p) %s\n",
2588 	    (void *)so, pr_state(so->so_state, so->so_mode)));
2589 
2590 	toh.level = SOL_SOCKET;
2591 	toh.name = SO_UNIX_CLOSE;
2592 
2593 	/* zero length + header */
2594 	toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2595 	toh.status = 0;
2596 
2597 	if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2598 		struct T_optdata_req tdr;
2599 
2600 		tdr.PRIM_type = T_OPTDATA_REQ;
2601 		tdr.DATA_flag = 0;
2602 
2603 		tdr.OPT_length = (t_scalar_t)sizeof (toh);
2604 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2605 
2606 		/* NOTE: holding so_lock while sleeping */
2607 		mp = soallocproto2(&tdr, sizeof (tdr),
2608 		    &toh, sizeof (toh), 0, _ALLOC_SLEEP);
2609 	} else {
2610 		struct T_unitdata_req	tudr;
2611 		void			*addr;
2612 		socklen_t		addrlen;
2613 		void			*src;
2614 		socklen_t		srclen;
2615 		struct T_opthdr		toh2;
2616 		t_scalar_t		size;
2617 
2618 		/* Connecteded DGRAM socket */
2619 
2620 		/*
2621 		 * For AF_UNIX the destination address is translated to
2622 		 * an internal name and the source address is passed as
2623 		 * an option.
2624 		 */
2625 		/*
2626 		 * Length and family checks.
2627 		 */
2628 		error = so_addr_verify(so, so->so_faddr_sa,
2629 		    (t_uscalar_t)so->so_faddr_len);
2630 		if (error) {
2631 			eprintsoline(so, error);
2632 			return;
2633 		}
2634 		if (so->so_state & SS_FADDR_NOXLATE) {
2635 			/*
2636 			 * Already have a transport internal address. Do not
2637 			 * pass any (transport internal) source address.
2638 			 */
2639 			addr = so->so_faddr_sa;
2640 			addrlen = (t_uscalar_t)so->so_faddr_len;
2641 			src = NULL;
2642 			srclen = 0;
2643 		} else {
2644 			/*
2645 			 * Pass the sockaddr_un source address as an option
2646 			 * and translate the remote address.
2647 			 * Holding so_lock thus so_laddr_sa can not change.
2648 			 */
2649 			src = so->so_laddr_sa;
2650 			srclen = (socklen_t)so->so_laddr_len;
2651 			dprintso(so, 1,
2652 			    ("so_ux_close: srclen %d, src %p\n",
2653 			    srclen, src));
2654 			error = so_ux_addr_xlate(so,
2655 			    so->so_faddr_sa,
2656 			    (socklen_t)so->so_faddr_len, 0,
2657 			    &addr, &addrlen);
2658 			if (error) {
2659 				eprintsoline(so, error);
2660 				return;
2661 			}
2662 		}
2663 		tudr.PRIM_type = T_UNITDATA_REQ;
2664 		tudr.DEST_length = addrlen;
2665 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2666 		if (srclen == 0) {
2667 			tudr.OPT_length = (t_scalar_t)sizeof (toh);
2668 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2669 			    _TPI_ALIGN_TOPT(addrlen));
2670 
2671 			size = tudr.OPT_offset + tudr.OPT_length;
2672 			/* NOTE: holding so_lock while sleeping */
2673 			mp = soallocproto2(&tudr, sizeof (tudr),
2674 			    addr, addrlen, size, _ALLOC_SLEEP);
2675 			mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2676 			soappendmsg(mp, &toh, sizeof (toh));
2677 		} else {
2678 			/*
2679 			 * There is a AF_UNIX sockaddr_un to include as a
2680 			 * source address option.
2681 			 */
2682 			tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2683 			    _TPI_ALIGN_TOPT(srclen));
2684 			tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2685 			    _TPI_ALIGN_TOPT(addrlen));
2686 
2687 			toh2.level = SOL_SOCKET;
2688 			toh2.name = SO_SRCADDR;
2689 			toh2.len = (t_uscalar_t)(srclen +
2690 			    sizeof (struct T_opthdr));
2691 			toh2.status = 0;
2692 
2693 			size = tudr.OPT_offset + tudr.OPT_length;
2694 
2695 			/* NOTE: holding so_lock while sleeping */
2696 			mp = soallocproto2(&tudr, sizeof (tudr),
2697 			    addr, addrlen, size, _ALLOC_SLEEP);
2698 			mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2699 			soappendmsg(mp, &toh, sizeof (toh));
2700 			soappendmsg(mp, &toh2, sizeof (toh2));
2701 			soappendmsg(mp, src, srclen);
2702 			mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2703 		}
2704 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2705 	}
2706 	mutex_exit(&so->so_lock);
2707 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2708 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2709 	mutex_enter(&so->so_lock);
2710 }
2711 
2712 /*
2713  * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK.
2714  */
2715 int
2716 sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags)
2717 {
2718 	mblk_t		*mp, *nmp;
2719 	int		error;
2720 
2721 	dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n",
2722 	    (void *)so, (void *)msg, flags));
2723 
2724 	/*
2725 	 * There is never any oob data with addresses or control since
2726 	 * the T_EXDATA_IND does not carry any options.
2727 	 */
2728 	msg->msg_controllen = 0;
2729 	msg->msg_namelen = 0;
2730 
2731 	mutex_enter(&so->so_lock);
2732 	ASSERT(so_verify_oobstate(so));
2733 	if ((so->so_options & SO_OOBINLINE) ||
2734 	    (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
2735 		dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
2736 		mutex_exit(&so->so_lock);
2737 		return (EINVAL);
2738 	}
2739 	if (!(so->so_state & SS_HAVEOOBDATA)) {
2740 		dprintso(so, 1, ("sorecvoob: no data yet\n"));
2741 		mutex_exit(&so->so_lock);
2742 		return (EWOULDBLOCK);
2743 	}
2744 	ASSERT(so->so_oobmsg != NULL);
2745 	mp = so->so_oobmsg;
2746 	if (flags & MSG_PEEK) {
2747 		/*
2748 		 * Since recv* can not return ENOBUFS we can not use dupmsg.
2749 		 * Instead we revert to the consolidation private
2750 		 * allocb_wait plus bcopy.
2751 		 */
2752 		mblk_t *mp1;
2753 
2754 		mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
2755 		ASSERT(mp1);
2756 
2757 		while (mp != NULL) {
2758 			ssize_t size;
2759 
2760 			size = MBLKL(mp);
2761 			bcopy(mp->b_rptr, mp1->b_wptr, size);
2762 			mp1->b_wptr += size;
2763 			ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
2764 			mp = mp->b_cont;
2765 		}
2766 		mp = mp1;
2767 	} else {
2768 		/*
2769 		 * Update the state indicating that the data has been consumed.
2770 		 * Keep SS_OOBPEND set until data is consumed past the mark.
2771 		 */
2772 		so->so_oobmsg = NULL;
2773 		so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
2774 	}
2775 	dprintso(so, 1,
2776 	    ("after recvoob(%p): counts %d/%d state %s\n",
2777 	    (void *)so, so->so_oobsigcnt,
2778 	    so->so_oobcnt, pr_state(so->so_state, so->so_mode)));
2779 	ASSERT(so_verify_oobstate(so));
2780 	mutex_exit(&so->so_lock);
2781 
2782 	error = 0;
2783 	nmp = mp;
2784 	while (nmp != NULL && uiop->uio_resid > 0) {
2785 		ssize_t n = MBLKL(nmp);
2786 
2787 		n = MIN(n, uiop->uio_resid);
2788 		if (n > 0)
2789 			error = uiomove(nmp->b_rptr, n,
2790 			    UIO_READ, uiop);
2791 		if (error)
2792 			break;
2793 		nmp = nmp->b_cont;
2794 	}
2795 	freemsg(mp);
2796 	return (error);
2797 }
2798 
2799 /*
2800  * Called by sotpi_recvmsg when reading a non-zero amount of data.
2801  * In addition, the caller typically verifies that there is some
2802  * potential state to clear by checking
2803  *	if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2804  * before calling this routine.
2805  * Note that such a check can be made without holding so_lock since
2806  * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2807  * decrements so_oobsigcnt.
2808  *
2809  * When data is read *after* the point that all pending
2810  * oob data has been consumed the oob indication is cleared.
2811  *
2812  * This logic keeps select/poll returning POLLRDBAND and
2813  * SIOCATMARK returning true until we have read past
2814  * the mark.
2815  */
2816 static void
2817 sorecv_update_oobstate(struct sonode *so)
2818 {
2819 	mutex_enter(&so->so_lock);
2820 	ASSERT(so_verify_oobstate(so));
2821 	dprintso(so, 1,
2822 	    ("sorecv_update_oobstate: counts %d/%d state %s\n",
2823 	    so->so_oobsigcnt,
2824 	    so->so_oobcnt, pr_state(so->so_state, so->so_mode)));
2825 	if (so->so_oobsigcnt == 0) {
2826 		/* No more pending oob indications */
2827 		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2828 		freemsg(so->so_oobmsg);
2829 		so->so_oobmsg = NULL;
2830 	}
2831 	ASSERT(so_verify_oobstate(so));
2832 	mutex_exit(&so->so_lock);
2833 }
2834 
2835 /*
2836  * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
2837  */
2838 static int
2839 nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
2840 {
2841 	int	error = 0;
2842 	mblk_t *tmp = NULL;
2843 	mblk_t *pmp = NULL;
2844 	mblk_t *nmp = so->so_nl7c_rcv_mp;
2845 
2846 	ASSERT(nmp != NULL);
2847 
2848 	while (nmp != NULL && uiop->uio_resid > 0) {
2849 		ssize_t n;
2850 
2851 		if (DB_TYPE(nmp) == M_DATA) {
2852 			/*
2853 			 * We have some data, uiomove up to resid bytes.
2854 			 */
2855 			n = MIN(MBLKL(nmp), uiop->uio_resid);
2856 			if (n > 0)
2857 				error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
2858 			nmp->b_rptr += n;
2859 			if (nmp->b_rptr == nmp->b_wptr) {
2860 				pmp = nmp;
2861 				nmp = nmp->b_cont;
2862 			}
2863 			if (error)
2864 				break;
2865 		} else {
2866 			/*
2867 			 * We only handle data, save for caller to handle.
2868 			 */
2869 			if (pmp != NULL) {
2870 				pmp->b_cont = nmp->b_cont;
2871 			}
2872 			nmp->b_cont = NULL;
2873 			if (*rmp == NULL) {
2874 				*rmp = nmp;
2875 			} else {
2876 				tmp->b_cont = nmp;
2877 			}
2878 			nmp = nmp->b_cont;
2879 			tmp = nmp;
2880 		}
2881 	}
2882 	if (pmp != NULL) {
2883 		/* Free any mblk_t(s) which we have consumed */
2884 		pmp->b_cont = NULL;
2885 		freemsg(so->so_nl7c_rcv_mp);
2886 	}
2887 	if ((so->so_nl7c_rcv_mp = nmp) == NULL) {
2888 		/* Last mblk_t so return the saved kstrgetmsg() rval/error */
2889 		if (error == 0) {
2890 			rval_t	*p = (rval_t *)&so->so_nl7c_rcv_rval;
2891 
2892 			error = p->r_v.r_v2;
2893 			p->r_v.r_v2 = 0;
2894 		}
2895 		rp->r_vals = so->so_nl7c_rcv_rval;
2896 		so->so_nl7c_rcv_rval = 0;
2897 	} else {
2898 		/* More mblk_t(s) to process so no rval to return */
2899 		rp->r_vals = 0;
2900 	}
2901 	return (error);
2902 }
2903 
2904 /*
2905  * Receive the next message on the queue.
2906  * If msg_controllen is non-zero when called the caller is interested in
2907  * any received control info (options).
2908  * If msg_namelen is non-zero when called the caller is interested in
2909  * any received source address.
2910  * The routine returns with msg_control and msg_name pointing to
2911  * kmem_alloc'ed memory which the caller has to free.
2912  */
2913 int
2914 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
2915 {
2916 	union T_primitives	*tpr;
2917 	mblk_t			*mp;
2918 	uchar_t			pri;
2919 	int			pflag, opflag;	/* opflag: initial pflag kept for retries */
2920 	void			*control;
2921 	t_uscalar_t		controllen;
2922 	t_uscalar_t		namelen;
2923 	int			so_state = so->so_state; /* Snapshot */
2924 	ssize_t			saved_resid;
2925 	rval_t			rval;
2926 	int			flags;
2927 	clock_t			timout;
2928 	int			first;	/* zeroed when MSG_WAITALL retries after data */
2929 	int			error = 0;
2930 	struct uio		*suiop = NULL;	/* caller's uio while uioa is in use */
2931 	sodirect_t		*sodp = so->so_direct;
2932 
	/*
	 * The caller's request flags are taken from msg_flags; the field is
	 * then reset so it can carry the result flags (MSG_TRUNC, MSG_EOR,
	 * MSG_CTRUNC) back to the caller.
	 */
2933 	flags = msg->msg_flags;
2934 	msg->msg_flags = 0;
2935 
2936 	dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
2937 	    (void *)so, (void *)msg, flags,
2938 	    pr_state(so->so_state, so->so_mode), so->so_error));
2939 
2940 	/*
2941 	 * If we are not connected because we have never been connected
2942 	 * we return ENOTCONN. If we have been connected (but are no longer
2943 	 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
2944 	 * the EOF.
2945 	 *
2946 	 * An alternative would be to post an ENOTCONN error in stream head
2947 	 * (read+write) and clear it when we're connected. However, that error
2948 	 * would cause incorrect poll/select behavior!
2949 	 */
2950 	if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
2951 	    (so->so_mode & SM_CONNREQUIRED)) {
2952 		return (ENOTCONN);
2953 	}
2954 
2955 	/*
2956 	 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
2957 	 * after checking that the read queue is empty) and returns zero.
2958 	 * This implementation will sleep (in kstrgetmsg) even if uio_resid
2959 	 * is zero.
2960 	 */
2961 
2962 	if (flags & MSG_OOB) {
2963 		/* Check that the transport supports OOB */
2964 		if (!(so->so_mode & SM_EXDATA))
2965 			return (EOPNOTSUPP);
2966 		return (sorecvoob(so, msg, uiop, flags));
2967 	}
2968 
2969 	/*
2970 	 * Set msg_controllen and msg_namelen to zero here to make it
2971 	 * simpler in the cases that no control or name is returned.
2972 	 * The caller's original values are kept in the locals controllen
2973 	 * and namelen; non-zero means the caller wants that item.
2974 	 */
2973 	controllen = msg->msg_controllen;
2974 	namelen = msg->msg_namelen;
2975 	msg->msg_controllen = 0;
2976 	msg->msg_namelen = 0;
2977 
2978 	dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
2979 	    namelen, controllen));
2980 
2981 	mutex_enter(&so->so_lock);
2982 	/*
2983 	 * If an NL7C enabled socket and not waiting for write data.
2984 	 */
2985 	if ((so->so_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
2986 	    NL7C_ENABLED) {
2987 		if (so->so_nl7c_uri) {
2988 			/* Close uri processing for a previous request */
2989 			nl7c_close(so);
2990 		}
2991 		if ((so_state & SS_CANTRCVMORE) && so->so_nl7c_rcv_mp == NULL) {
2992 			/* Nothing to process, EOF */
2993 			mutex_exit(&so->so_lock);
2994 			return (0);
2995 		} else if (so->so_nl7c_flags & NL7C_SOPERSIST) {
2996 			/* Persistent NL7C socket, try to process request */
2997 			boolean_t ret;
2998 
2999 			ret = nl7c_process(so,
3000 			    (so->so_state & (SS_NONBLOCK|SS_NDELAY)));
			/* The errno from NL7C is carried in r_v2 of the saved rval */
3001 			rval.r_vals = so->so_nl7c_rcv_rval;
3002 			error = rval.r_v.r_v2;
3003 			if (error) {
3004 				/* Error of some sort, return it */
3005 				mutex_exit(&so->so_lock);
3006 				return (error);
3007 			}
3008 			if (so->so_nl7c_flags &&
3009 			    ! (so->so_nl7c_flags & NL7C_WAITWRITE)) {
3010 				/*
3011 				 * Still an NL7C socket and no data
3012 				 * to pass up to the caller.
3013 				 */
3014 				mutex_exit(&so->so_lock);
3015 				if (ret) {
3016 					/* EOF */
3017 					return (0);
3018 				} else {
3019 					/* Need more data */
3020 					return (EAGAIN);
3021 				}
3022 			}
3023 		} else {
3024 			/*
3025 			 * Not persistent so no further NL7C processing.
3026 			 */
3027 			so->so_nl7c_flags = 0;
3028 		}
3029 	}
3030 	/*
3031 	 * Only one reader is allowed at any given time. This is needed
3032 	 * for T_EXDATA handling and, in the future, MSG_WAITALL.
3033 	 *
3034 	 * This is slightly different than BSD behavior in that it fails with
3035 	 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
3036 	 * is single-threaded using sblock(), which is dropped while waiting
3037 	 * for data to appear. The difference shows up e.g. if one
3038 	 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
3039 	 * does use nonblocking io and different threads are reading each
3040 	 * file descriptor. In BSD there would never be an EWOULDBLOCK error
3041 	 * in this case as long as the read queue doesn't get empty.
3042 	 * In this implementation the thread using nonblocking io can
3043 	 * get an EWOULDBLOCK error due to the blocking thread executing
3044 	 * e.g. in the uiomove in kstrgetmsg.
3045 	 * This difference is not believed to be significant.
3046 	 */
3047 	/* Set SOREADLOCKED */
3048 	error = so_lock_read_intr(so,
3049 	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3050 	mutex_exit(&so->so_lock);
3051 	if (error)
3052 		return (error);
3053 
3054 	/*
3055 	 * Tell kstrgetmsg to not inspect the stream head errors until all
3056 	 * queued data has been consumed.
3057 	 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3058 	 * Also, if uio_fmode indicates nonblocking kstrgetmsg will not block.
3059 	 *
3060 	 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3061 	 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3062 	 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3063 	 */
3064 	pflag = MSG_ANY | MSG_DELAYERROR;
3065 	if (flags & MSG_PEEK) {
3066 		pflag |= MSG_IPEEK;
3067 		flags &= ~MSG_WAITALL;
3068 	}
3069 	if (so->so_mode & SM_ATOMIC)
3070 		pflag |= MSG_DISCARDTAIL;
3071 
3072 	if (flags & MSG_DONTWAIT)
3073 		timout = 0;
3074 	else
3075 		timout = -1;
	/*
	 * pflag is handed to kstrgetmsg by reference, so keep the initial
	 * value in opflag for rebuilding pflag on each retry.
	 */
3076 	opflag = pflag;
3077 	first = 1;
3078 
3079 	if (uiop->uio_resid >= uioasync.mincnt &&
3080 	    sodp != NULL && (sodp->sod_state & SOD_ENABLED) &&
3081 	    uioasync.enabled && !(flags & MSG_PEEK) &&
3082 	    !(so_state & SS_CANTRCVMORE)) {
3083 		/*
3084 		 * Big enough I/O for uioa min setup and an sodirect socket
3085 		 * and sodirect enabled and uioa enabled and I/O will be done
3086 		 * and not EOF so initialize the sodirect_t uioa_t with "uiop".
3087 		 */
3088 		mutex_enter(sodp->sod_lockp);
3089 		if (!uioainit(uiop, &sodp->sod_uioa)) {
3090 			/*
3091 			 * Successful uioainit() so the uio_t part of the
3092 			 * uioa_t will be used for all uio_t work to follow,
3093 			 * we save the original "uiop" in "suiop".
3094 			 */
3095 			suiop = uiop;
3096 			uiop = (uio_t *)&sodp->sod_uioa;
3097 			/*
3098 			 * Before returning to the caller the passed in uio_t
3099 			 * "uiop" will be updated via a call to uioafini()
3100 			 * below.
3101 			 *
3102 			 * Note, the uioa.uioa_state isn't set to UIOA_ENABLED
3103 			 * here as first we have to uioamove() any currently
3104 			 * queued M_DATA mblk_t(s) so it will be done in
3105 			 * kstrgetmsg().
3106 			 */
3107 		}
3108 		/*
3109 		 * In either uioainit() success or not case note the number
3110 		 * of uio bytes the caller wants for sod framework and/or
3111 		 * transport (e.g. TCP) strategy.
3112 		 */
3113 		sodp->sod_want = uiop->uio_resid;
3114 		mutex_exit(sodp->sod_lockp);
3115 	} else if (sodp != NULL && (sodp->sod_state & SOD_ENABLED)) {
3116 		/*
3117 		 * No uioa but still using sodirect so note the number of
3118 		 * uio bytes the caller wants for sodirect framework and/or
3119 		 * transport (e.g. TCP) strategy.
3120 		 *
3121 		 * Note, sod_lockp not held, only writer is in this function
3122 		 * and only one thread at a time so not needed just to init.
3123 		 */
3124 		sodp->sod_want = uiop->uio_resid;
3125 	}
	/*
	 * Each pass fetches one message, either from the mblks NL7C saved
	 * earlier or from the stream head. The MSG_WAITALL and T_EXDATA_IND
	 * paths below jump back here. saved_resid records uio_resid at the
	 * start of the pass so later code can tell whether this pass moved
	 * any data.
	 */
3126 retry:
3127 	saved_resid = uiop->uio_resid;
3128 	pri = 0;
3129 	mp = NULL;
3130 	if (so->so_nl7c_rcv_mp != NULL) {
3131 		/* Already kstrgetmsg()ed saved mblk(s) from NL7C */
3132 		error = nl7c_sorecv(so, &mp, uiop, &rval);
3133 	} else {
3134 		error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3135 		    timout, &rval);
3136 	}
3137 	if (error) {
		/*
		 * "first" is only cleared by a MSG_WAITALL retry, which
		 * happens after some data was received. In that case
		 * interruption, would-block and timeout are not reported:
		 * error is reset to 0.
		 */
3138 		switch (error) {
3139 		case EINTR:
3140 		case EWOULDBLOCK:
3141 			if (!first)
3142 				error = 0;
3143 			break;
3144 		case ETIME:
3145 			/* Returned from kstrgetmsg when timeout expires */
3146 			if (!first)
3147 				error = 0;
3148 			else
3149 				error = EWOULDBLOCK;
3150 			break;
3151 		default:
3152 			eprintsoline(so, error);
3153 			break;
3154 		}
3155 		goto out;
3156 	}
3157 	/*
3158 	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3159 	 * For non-datagrams MOREDATA is used to set MSG_EOR.
3160 	 */
3161 	ASSERT(!(rval.r_val1 & MORECTL));
3162 	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3163 		msg->msg_flags |= MSG_TRUNC;
3164 
	/* No control part was returned: the message was plain data */
3165 	if (mp == NULL) {
3166 		dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3167 		/*
3168 		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3169 		 * The draft Posix socket spec states that the mark should
3170 		 * not be cleared when peeking. We follow the latter.
3171 		 */
3172 		if ((so->so_state &
3173 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3174 		    (uiop->uio_resid != saved_resid) &&
3175 		    !(flags & MSG_PEEK)) {
3176 			sorecv_update_oobstate(so);
3177 		}
3178 
3179 		mutex_enter(&so->so_lock);
3180 		/* Set MSG_EOR based on MOREDATA */
3181 		if (!(rval.r_val1 & MOREDATA)) {
3182 			if (so->so_state & SS_SAVEDEOR) {
3183 				msg->msg_flags |= MSG_EOR;
3184 				so->so_state &= ~SS_SAVEDEOR;
3185 			}
3186 		}
3187 		/*
3188 		 * If some data was received (i.e. not EOF) and the
3189 		 * read/recv* has not been satisfied wait for some more.
3190 		 */
3191 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3192 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3193 			mutex_exit(&so->so_lock);
3194 			first = 0;
3195 			pflag = opflag | MSG_NOMARK;
3196 			goto retry;
3197 		}
3198 		goto out_locked;
3199 	}
3200 
3201 	/* strsock_proto has already verified length and alignment */
3202 	tpr = (union T_primitives *)mp->b_rptr;
3203 	dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3204 
3205 	switch (tpr->type) {
3206 	case T_DATA_IND: {
3207 		if ((so->so_state &
3208 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3209 		    (uiop->uio_resid != saved_resid) &&
3210 		    !(flags & MSG_PEEK)) {
3211 			sorecv_update_oobstate(so);
3212 		}
3213 
3214 		/*
3215 		 * Set msg_flags to MSG_EOR based on
3216 		 * MORE_flag and MOREDATA.
3217 		 * If the record ended but the caller's buffer did not take
3218 		 * all of it (MOREDATA), remember the EOR in SS_SAVEDEOR so
3219 		 * that a later call can report it.
3220 		 */
3218 		mutex_enter(&so->so_lock);
3219 		so->so_state &= ~SS_SAVEDEOR;
3220 		if (!(tpr->data_ind.MORE_flag & 1)) {
3221 			if (!(rval.r_val1 & MOREDATA))
3222 				msg->msg_flags |= MSG_EOR;
3223 			else
3224 				so->so_state |= SS_SAVEDEOR;
3225 		}
3226 		freemsg(mp);
3227 		/*
3228 		 * If some data was received (i.e. not EOF) and the
3229 		 * read/recv* has not been satisfied wait for some more.
3230 		 */
3231 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3232 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3233 			mutex_exit(&so->so_lock);
3234 			first = 0;
3235 			pflag = opflag | MSG_NOMARK;
3236 			goto retry;
3237 		}
3238 		goto out_locked;
3239 	}
3240 	case T_UNITDATA_IND: {
3241 		void *addr;
3242 		t_uscalar_t addrlen;
3243 		void *abuf;
3244 		t_uscalar_t optlen;
3245 		void *opt;
3246 
3247 		if ((so->so_state &
3248 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3249 		    (uiop->uio_resid != saved_resid) &&
3250 		    !(flags & MSG_PEEK)) {
3251 			sorecv_update_oobstate(so);
3252 		}
3253 
3254 		if (namelen != 0) {
3255 			/* Caller wants source address */
3256 			addrlen = tpr->unitdata_ind.SRC_length;
3257 			addr = sogetoff(mp,
3258 			    tpr->unitdata_ind.SRC_offset,
3259 			    addrlen, 1);
3260 			if (addr == NULL) {
3261 				freemsg(mp);
3262 				error = EPROTO;
3263 				eprintsoline(so, error);
3264 				goto out;
3265 			}
3266 			if (so->so_family == AF_UNIX) {
3267 				/*
3268 				 * Can not use the transport level address.
3269 				 * If there is a SO_SRCADDR option carrying
3270 				 * the socket level address it will be
3271 				 * extracted below.
3272 				 */
3273 				addr = NULL;
3274 				addrlen = 0;
3275 			}
3276 		}
3277 		optlen = tpr->unitdata_ind.OPT_length;
3278 		if (optlen != 0) {
3279 			t_uscalar_t ncontrollen;
3280 
3281 			/*
3282 			 * Extract any source address option.
3283 			 * Determine how large cmsg buffer is needed.
3284 			 */
3285 			opt = sogetoff(mp,
3286 			    tpr->unitdata_ind.OPT_offset,
3287 			    optlen, __TPI_ALIGN_SIZE);
3288 
3289 			if (opt == NULL) {
3290 				freemsg(mp);
3291 				error = EPROTO;
3292 				eprintsoline(so, error);
3293 				goto out;
3294 			}
3295 			if (so->so_family == AF_UNIX)
3296 				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3297 			ncontrollen = so_cmsglen(mp, opt, optlen,
3298 			    !(flags & MSG_XPG4_2));
			/*
			 * If the caller asked for control data, size the
			 * buffer to what is needed; otherwise flag that
			 * control data existed but was not delivered.
			 */
3299 			if (controllen != 0)
3300 				controllen = ncontrollen;
3301 			else if (ncontrollen != 0)
3302 				msg->msg_flags |= MSG_CTRUNC;
3303 		} else {
3304 			controllen = 0;
3305 		}
3306 
3307 		if (namelen != 0) {
3308 			/*
3309 			 * Return address to caller.
3310 			 * Caller handles truncation if length
3311 			 * exceeds msg_namelen.
3312 			 * NOTE: AF_UNIX NUL termination is ensured by
3313 			 * the sender's copyin_name().
3314 			 */
3315 			abuf = kmem_alloc(addrlen, KM_SLEEP);
3316 
3317 			bcopy(addr, abuf, addrlen);
3318 			msg->msg_name = abuf;
3319 			msg->msg_namelen = addrlen;
3320 		}
3321 
3322 		if (controllen != 0) {
3323 			/*
3324 			 * Return control msg to caller.
3325 			 * Caller handles truncation if length
3326 			 * exceeds msg_controllen.
3327 			 */
3328 			control = kmem_zalloc(controllen, KM_SLEEP);
3329 
3330 			error = so_opt2cmsg(mp, opt, optlen,
3331 			    !(flags & MSG_XPG4_2),
3332 			    control, controllen);
3333 			if (error) {
				/*
				 * NOTE(review): msg_name is freed here but
				 * msg->msg_name and msg->msg_namelen are not
				 * reset; presumably callers ignore them when
				 * an error is returned - confirm.
				 */
3334 				freemsg(mp);
3335 				if (msg->msg_namelen != 0)
3336 					kmem_free(msg->msg_name,
3337 					    msg->msg_namelen);
3338 				kmem_free(control, controllen);
3339 				eprintsoline(so, error);
3340 				goto out;
3341 			}
3342 			msg->msg_control = control;
3343 			msg->msg_controllen = controllen;
3344 		}
3345 
3346 		freemsg(mp);
3347 		goto out;
3348 	}
3349 	case T_OPTDATA_IND: {
3350 		struct T_optdata_req *tdr;
3351 		void *opt;
3352 		t_uscalar_t optlen;
3353 
3354 		if ((so->so_state &
3355 		    (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3356 		    (uiop->uio_resid != saved_resid) &&
3357 		    !(flags & MSG_PEEK)) {
3358 			sorecv_update_oobstate(so);
3359 		}
3360 
		/*
		 * NOTE(review): the indication is read through the request
		 * structure (T_optdata_req) here, and further down through
		 * tpr->data_ind.MORE_flag; presumably the structures share
		 * their leading layout - confirm.
		 */
3361 		tdr = (struct T_optdata_req *)mp->b_rptr;
3362 		optlen = tdr->OPT_length;
3363 		if (optlen != 0) {
3364 			t_uscalar_t ncontrollen;
3365 			/*
3366 			 * Determine how large cmsg buffer is needed.
3367 			 */
3368 			opt = sogetoff(mp,
3369 			    tpr->optdata_ind.OPT_offset,
3370 			    optlen, __TPI_ALIGN_SIZE);
3371 
3372 			if (opt == NULL) {
3373 				freemsg(mp);
3374 				error = EPROTO;
3375 				eprintsoline(so, error);
3376 				goto out;
3377 			}
3378 
3379 			ncontrollen = so_cmsglen(mp, opt, optlen,
3380 			    !(flags & MSG_XPG4_2));
3381 			if (controllen != 0)
3382 				controllen = ncontrollen;
3383 			else if (ncontrollen != 0)
3384 				msg->msg_flags |= MSG_CTRUNC;
3385 		} else {
3386 			controllen = 0;
3387 		}
3388 
3389 		if (controllen != 0) {
3390 			/*
3391 			 * Return control msg to caller.
3392 			 * Caller handles truncation if length
3393 			 * exceeds msg_controllen.
3394 			 */
3395 			control = kmem_zalloc(controllen, KM_SLEEP);
3396 
3397 			error = so_opt2cmsg(mp, opt, optlen,
3398 			    !(flags & MSG_XPG4_2),
3399 			    control, controllen);
3400 			if (error) {
3401 				freemsg(mp);
3402 				kmem_free(control, controllen);
3403 				eprintsoline(so, error);
3404 				goto out;
3405 			}
3406 			msg->msg_control = control;
3407 			msg->msg_controllen = controllen;
3408 		}
3409 
3410 		/*
3411 		 * Set msg_flags to MSG_EOR based on
3412 		 * DATA_flag and MOREDATA.
3413 		 */
3414 		mutex_enter(&so->so_lock);
3415 		so->so_state &= ~SS_SAVEDEOR;
3416 		if (!(tpr->data_ind.MORE_flag & 1)) {
3417 			if (!(rval.r_val1 & MOREDATA))
3418 				msg->msg_flags |= MSG_EOR;
3419 			else
3420 				so->so_state |= SS_SAVEDEOR;
3421 		}
3422 		freemsg(mp);
3423 		/*
3424 		 * If some data was received (i.e. not EOF) and the
3425 		 * read/recv* has not been satisfied wait for some more.
3426 		 * Not possible to wait if control info was received.
3427 		 */
3428 		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3429 		    controllen == 0 &&
3430 		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3431 			mutex_exit(&so->so_lock);
3432 			first = 0;
3433 			pflag = opflag | MSG_NOMARK;
3434 			goto retry;
3435 		}
3436 		goto out_locked;
3437 	}
3438 	case T_EXDATA_IND: {
3439 		dprintso(so, 1,
3440 		    ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3441 		    "state %s\n",
3442 		    so->so_oobsigcnt, so->so_oobcnt,
3443 		    saved_resid - uiop->uio_resid,
3444 		    pr_state(so->so_state, so->so_mode)));
3445 		/*
3446 		 * kstrgetmsg handles MSGMARK so there is nothing to
3447 		 * inspect in the T_EXDATA_IND.
3448 		 * strsock_proto makes the stream head queue the T_EXDATA_IND
3449 		 * as a separate message with no M_DATA component. Furthermore,
3450 		 * the stream head does not consolidate M_DATA messages onto
3451 		 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3452 		 * remains a message by itself. This is needed since MSGMARK
3453 		 * marks both the whole message as well as the last byte
3454 		 * of the message.
3455 		 */
3456 		freemsg(mp);
3457 		ASSERT(uiop->uio_resid == saved_resid);	/* No data */
3458 		if (flags & MSG_PEEK) {
3459 			/*
3460 			 * Even though we are peeking we consume the
3461 			 * T_EXDATA_IND thereby moving the mark information
3462 			 * to SS_RCVATMARK. Then the oob code below will
3463 			 * retry the peeking kstrgetmsg.
3464 			 * Note that the stream head read queue is
3465 			 * never flushed without holding SOREADLOCKED
3466 			 * thus the T_EXDATA_IND can not disappear
3467 			 * underneath us.
3468 			 */
3469 			dprintso(so, 1,
3470 			    ("sotpi_recvmsg: consume EXDATA_IND "
3471 			    "counts %d/%d state %s\n",
3472 			    so->so_oobsigcnt,
3473 			    so->so_oobcnt,
3474 			    pr_state(so->so_state, so->so_mode)));
3475 
			/* Same flags as the initial pflag, minus MSG_IPEEK */
3476 			pflag = MSG_ANY | MSG_DELAYERROR;
3477 			if (so->so_mode & SM_ATOMIC)
3478 				pflag |= MSG_DISCARDTAIL;
3479 
3480 			pri = 0;
3481 			mp = NULL;
3482 
3483 			error = kstrgetmsg(SOTOV(so), &mp, uiop,
3484 			    &pri, &pflag, (clock_t)-1, &rval);
3485 			ASSERT(uiop->uio_resid == saved_resid);
3486 
3487 			if (error) {
3488 #ifdef SOCK_DEBUG
3489 				if (error != EWOULDBLOCK && error != EINTR) {
3490 					eprintsoline(so, error);
3491 				}
3492 #endif /* SOCK_DEBUG */
3493 				goto out;
3494 			}
3495 			ASSERT(mp);
3496 			tpr = (union T_primitives *)mp->b_rptr;
3497 			ASSERT(tpr->type == T_EXDATA_IND);
3498 			freemsg(mp);
3499 		} /* end "if (flags & MSG_PEEK)" */
3500 
3501 		/*
3502 		 * Decrement the number of queued and pending oob.
3503 		 *
3504 		 * SS_RCVATMARK is cleared when we read past a mark.
3505 		 * SS_HAVEOOBDATA is cleared when we've read past the
3506 		 * last mark.
3507 		 * SS_OOBPEND is cleared if we've read past the last
3508 		 * mark and no (new) SIGURG has been posted.
3509 		 */
3510 		mutex_enter(&so->so_lock);
3511 		ASSERT(so_verify_oobstate(so));
3512 		ASSERT(so->so_oobsigcnt >= so->so_oobcnt);
3513 		ASSERT(so->so_oobsigcnt > 0);
3514 		so->so_oobsigcnt--;
3515 		ASSERT(so->so_oobcnt > 0);
3516 		so->so_oobcnt--;
3517 		/*
3518 		 * Since the T_EXDATA_IND has been removed from the stream
3519 		 * head, but we have not read data past the mark,
3520 		 * sockfs needs to track that the socket is still at the mark.
3521 		 *
3522 		 * Since no data was received call kstrgetmsg again to wait
3523 		 * for data.
3524 		 */
3525 		so->so_state |= SS_RCVATMARK;
3526 		mutex_exit(&so->so_lock);
3527 		dprintso(so, 1,
3528 		    ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3529 		    so->so_oobsigcnt, so->so_oobcnt,
3530 		    pr_state(so->so_state, so->so_mode)));
3531 		pflag = opflag;
3532 		goto retry;
3533 	}
3534 	default:
3535 		ASSERT(0);
3536 		freemsg(mp);
3537 		error = EPROTO;
3538 		eprintsoline(so, error);
3539 		goto out;
3540 	}
3541 	/* NOTREACHED */
	/*
	 * Common exit. "out" is reached without so_lock held and takes it;
	 * "out_locked" is reached with so_lock already held.
	 */
3542 out:
3543 	mutex_enter(&so->so_lock);
3544 out_locked:
3545 	if (sodp != NULL) {
3546 		/* Finish any sodirect and uioa processing */
3547 		mutex_enter(sodp->sod_lockp);
3548 		if (suiop != NULL) {
3549 			/* Finish any uioa_t processing */
3550 			int ret;
3551 
3552 			ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
3553 			ret = uioafini(suiop, (uioa_t *)uiop);
3554 			if (error == 0 && ret != 0) {
3555 				/* If no error yet, set it */
3556 				error = ret;
3557 			}
			/* Release any mblks still held on the uioa free list */
3558 			if ((mp = sodp->sod_uioafh) != NULL) {
3559 				sodp->sod_uioafh = NULL;
3560 				sodp->sod_uioaft = NULL;
3561 				freemsg(mp);
3562 			}
3563 		}
3564 		ASSERT(sodp->sod_uioafh == NULL);
3565 		if (!(sodp->sod_state & SOD_WAKE_NOT)) {
3566 			/* Awoke */
3567 			sodp->sod_state &= SOD_WAKE_CLR;
3568 			sodp->sod_state |= SOD_WAKE_NOT;
3569 		}
3570 		/* Last, clear sod_want value */
3571 		sodp->sod_want = 0;
3572 		mutex_exit(sodp->sod_lockp);
3573 	}
3574 	so_unlock_read(so);	/* Clear SOREADLOCKED */
3575 	mutex_exit(&so->so_lock);
3576 	return (error);
3577 }
3578 
3579 /*
3580  * Sending data with options on a datagram socket.
3581  * Assumes caller has verified that SS_ISBOUND etc. are set.
3582  */
3583 static int
3584 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3585     struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3586 {
3587 	struct T_unitdata_req	tudr;
3588 	mblk_t			*mp;
3589 	int			error;
3590 	void			*addr;
3591 	socklen_t		addrlen;
3592 	void			*src;
3593 	socklen_t		srclen;
3594 	ssize_t			len;
3595 	int			size;
3596 	struct T_opthdr		toh;
3597 	struct fdbuf		*fdbuf;
3598 	t_uscalar_t		optlen;
3599 	void			*fds;
3600 	int			fdlen;
3601 
3602 	ASSERT(name && namelen);
3603 	ASSERT(control && controllen);
3604 
3605 	len = uiop->uio_resid;
3606 	if (len > (ssize_t)so->so_tidu_size) {
3607 		return (EMSGSIZE);
3608 	}
3609 
3610 	/*
3611 	 * For AF_UNIX the destination address is translated to an internal
3612 	 * name and the source address is passed as an option.
3613 	 * Also, file descriptors are passed as file pointers in an
3614 	 * option.
3615 	 */
3616 
3617 	/*
3618 	 * Length and family checks.
3619 	 */
3620 	error = so_addr_verify(so, name, namelen);
3621 	if (error) {
3622 		eprintsoline(so, error);
3623 		return (error);
3624 	}
3625 	if (so->so_family == AF_UNIX) {
3626 		if (so->so_state & SS_FADDR_NOXLATE) {
3627 			/*
3628 			 * Already have a transport internal address. Do not
3629 			 * pass any (transport internal) source address.
3630 			 */
3631 			addr = name;
3632 			addrlen = namelen;
3633 			src = NULL;
3634 			srclen = 0;
3635 		} else {
3636 			/*
3637 			 * Pass the sockaddr_un source address as an option
3638 			 * and translate the remote address.
3639 			 *
3640 			 * Note that this code does not prevent so_laddr_sa
3641 			 * from changing while it is being used. Thus
3642 			 * if an unbind+bind occurs concurrently with this
3643 			 * send the peer might see a partially new and a
3644 			 * partially old "from" address.
3645 			 */
3646 			src = so->so_laddr_sa;
3647 			srclen = (t_uscalar_t)so->so_laddr_len;
3648 			dprintso(so, 1,
3649 			    ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3650 			    srclen, src));
3651 			error = so_ux_addr_xlate(so, name, namelen,
3652 			    (flags & MSG_XPG4_2),
3653 			    &addr, &addrlen);
3654 			if (error) {
3655 				eprintsoline(so, error);
3656 				return (error);
3657 			}
3658 		}
3659 	} else {
3660 		addr = name;
3661 		addrlen = namelen;
3662 		src = NULL;
3663 		srclen = 0;
3664 	}
3665 	optlen = so_optlen(control, controllen,
3666 	    !(flags & MSG_XPG4_2));
3667 	tudr.PRIM_type = T_UNITDATA_REQ;
3668 	tudr.DEST_length = addrlen;
3669 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3670 	if (srclen != 0)
3671 		tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3672 		    _TPI_ALIGN_TOPT(srclen));
3673 	else
3674 		tudr.OPT_length = optlen;
3675 	tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3676 	    _TPI_ALIGN_TOPT(addrlen));
3677 
3678 	size = tudr.OPT_offset + tudr.OPT_length;
3679 
3680 	/*
3681 	 * File descriptors only when SM_FDPASSING set.
3682 	 */
3683 	error = so_getfdopt(control, controllen,
3684 	    !(flags & MSG_XPG4_2), &fds, &fdlen);
3685 	if (error)
3686 		return (error);
3687 	if (fdlen != -1) {
3688 		if (!(so->so_mode & SM_FDPASSING))
3689 			return (EOPNOTSUPP);
3690 
3691 		error = fdbuf_create(fds, fdlen, &fdbuf);
3692 		if (error)
3693 			return (error);
3694 		mp = fdbuf_allocmsg(size, fdbuf);
3695 	} else {
3696 		mp = soallocproto(size, _ALLOC_INTR);
3697 		if (mp == NULL) {
3698 			/*
3699 			 * Caught a signal waiting for memory.
3700 			 * Let send* return EINTR.
3701 			 */
3702 			return (EINTR);
3703 		}
3704 	}
3705 	soappendmsg(mp, &tudr, sizeof (tudr));
3706 	soappendmsg(mp, addr, addrlen);
3707 	mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3708 
3709 	if (fdlen != -1) {
3710 		ASSERT(fdbuf != NULL);
3711 		toh.level = SOL_SOCKET;
3712 		toh.name = SO_FILEP;
3713 		toh.len = fdbuf->fd_size +
3714 		    (t_uscalar_t)sizeof (struct T_opthdr);
3715 		toh.status = 0;
3716 		soappendmsg(mp, &toh, sizeof (toh));
3717 		soappendmsg(mp, fdbuf, fdbuf->fd_size);
3718 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3719 	}
3720 	if (srclen != 0) {
3721 		/*
3722 		 * There is a AF_UNIX sockaddr_un to include as a source
3723 		 * address option.
3724 		 */
3725 		toh.level = SOL_SOCKET;
3726 		toh.name = SO_SRCADDR;
3727 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3728 		toh.status = 0;
3729 		soappendmsg(mp, &toh, sizeof (toh));
3730 		soappendmsg(mp, src, srclen);
3731 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3732 		ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3733 	}
3734 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3735 	so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3736 	/* At most 3 bytes left in the message */
3737 	ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3738 	ASSERT(MBLKL(mp) <= (ssize_t)size);
3739 
3740 	ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3741 	if (audit_active)
3742 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3743 
3744 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3745 #ifdef SOCK_DEBUG
3746 	if (error) {
3747 		eprintsoline(so, error);
3748 	}
3749 #endif /* SOCK_DEBUG */
3750 	return (error);
3751 }
3752 
3753 /*
3754  * Sending data with options on a connected stream socket.
3755  * Assumes caller has verified that SS_ISCONNECTED is set.
3756  */
3757 static int
3758 sosend_svccmsg(struct sonode *so,
3759 		struct uio *uiop,
3760 		int more,
3761 		void *control,
3762 		t_uscalar_t controllen,
3763 		int flags)
3764 {
3765 	struct T_optdata_req	tdr;
3766 	mblk_t			*mp;
3767 	int			error;
3768 	ssize_t			iosize;
3769 	int			first = 1;
3770 	int			size;
3771 	struct fdbuf		*fdbuf;
3772 	t_uscalar_t		optlen;
3773 	void			*fds;
3774 	int			fdlen;
3775 	struct T_opthdr		toh;
3776 
3777 	dprintso(so, 1,
3778 	    ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3779 
3780 	/*
3781 	 * Has to be bound and connected. However, since no locks are
3782 	 * held the state could have changed after sotpi_sendmsg checked it
3783 	 * thus it is not possible to ASSERT on the state.
3784 	 */
3785 
3786 	/* Options on connection-oriented only when SM_OPTDATA set. */
3787 	if (!(so->so_mode & SM_OPTDATA))
3788 		return (EOPNOTSUPP);
3789 
3790 	do {
3791 		/*
3792 		 * Set the MORE flag if uio_resid does not fit in this
3793 		 * message or if the caller passed in "more".
3794 		 * Error for transports with zero tidu_size.
3795 		 */
3796 		tdr.PRIM_type = T_OPTDATA_REQ;
3797 		iosize = so->so_tidu_size;
3798 		if (iosize <= 0)
3799 			return (EMSGSIZE);
3800 		if (uiop->uio_resid > iosize) {
3801 			tdr.DATA_flag = 1;
3802 		} else {
3803 			if (more)
3804 				tdr.DATA_flag = 1;
3805 			else
3806 				tdr.DATA_flag = 0;
3807 			iosize = uiop->uio_resid;
3808 		}
3809 		dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3810 		    tdr.DATA_flag, iosize));
3811 
3812 		optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3813 		tdr.OPT_length = optlen;
3814 		tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3815 
3816 		size = (int)sizeof (tdr) + optlen;
3817 		/*
3818 		 * File descriptors only when SM_FDPASSING set.
3819 		 */
3820 		error = so_getfdopt(control, controllen,
3821 		    !(flags & MSG_XPG4_2), &fds, &fdlen);
3822 		if (error)
3823 			return (error);
3824 		if (fdlen != -1) {
3825 			if (!(so->so_mode & SM_FDPASSING))
3826 				return (EOPNOTSUPP);
3827 
3828 			error = fdbuf_create(fds, fdlen, &fdbuf);
3829 			if (error)
3830 				return (error);
3831 			mp = fdbuf_allocmsg(size, fdbuf);
3832 		} else {
3833 			mp = soallocproto(size, _ALLOC_INTR);
3834 			if (mp == NULL) {
3835 				/*
3836 				 * Caught a signal waiting for memory.
3837 				 * Let send* return EINTR.
3838 				 */
3839 				return (first ? EINTR : 0);
3840 			}
3841 		}
3842 		soappendmsg(mp, &tdr, sizeof (tdr));
3843 
3844 		if (fdlen != -1) {
3845 			ASSERT(fdbuf != NULL);
3846 			toh.level = SOL_SOCKET;
3847 			toh.name = SO_FILEP;
3848 			toh.len = fdbuf->fd_size +
3849 			    (t_uscalar_t)sizeof (struct T_opthdr);
3850 			toh.status = 0;
3851 			soappendmsg(mp, &toh, sizeof (toh));
3852 			soappendmsg(mp, fdbuf, fdbuf->fd_size);
3853 			ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3854 		}
3855 		so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3856 		/* At most 3 bytes left in the message */
3857 		ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3858 		ASSERT(MBLKL(mp) <= (ssize_t)size);
3859 
3860 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3861 
3862 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3863 		    0, MSG_BAND, 0);
3864 		if (error) {
3865 			if (!first && error == EWOULDBLOCK)
3866 				return (0);
3867 			eprintsoline(so, error);
3868 			return (error);
3869 		}
3870 		control = NULL;
3871 		first = 0;
3872 		if (uiop->uio_resid > 0) {
3873 			/*
3874 			 * Recheck for fatal errors. Fail write even though
3875 			 * some data have been written. This is consistent
3876 			 * with strwrite semantics and BSD sockets semantics.
3877 			 */
3878 			if (so->so_state & SS_CANTSENDMORE) {
3879 				tsignal(curthread, SIGPIPE);
3880 				eprintsoline(so, error);
3881 				return (EPIPE);
3882 			}
3883 			if (so->so_error != 0) {
3884 				mutex_enter(&so->so_lock);
3885 				error = sogeterr(so);
3886 				mutex_exit(&so->so_lock);
3887 				if (error != 0) {
3888 					eprintsoline(so, error);
3889 					return (error);
3890 				}
3891 			}
3892 		}
3893 	} while (uiop->uio_resid > 0);
3894 	return (0);
3895 }
3896 
3897 /*
3898  * Sending data on a datagram socket.
3899  * Assumes caller has verified that SS_ISBOUND etc. are set.
3900  *
3901  * For AF_UNIX the destination address is translated to an internal
3902  * name and the source address is passed as an option.
3903  */
3904 int
3905 sosend_dgram(struct sonode *so, struct sockaddr	*name, socklen_t namelen,
3906     struct uio *uiop, int flags)
3907 {
3908 	struct T_unitdata_req	tudr;
3909 	mblk_t			*mp;
3910 	int			error;
3911 	void			*addr;
3912 	socklen_t		addrlen;
3913 	void			*src;
3914 	socklen_t		srclen;
3915 	ssize_t			len;
3916 
3917 	ASSERT(name != NULL && namelen != 0);
3918 
3919 	len = uiop->uio_resid;
3920 	if (len > so->so_tidu_size) {
3921 		error = EMSGSIZE;
3922 		goto done;
3923 	}
3924 
3925 	/* Length and family checks */
3926 	error = so_addr_verify(so, name, namelen);
3927 	if (error != 0)
3928 		goto done;
3929 
3930 	if (so->so_state & SS_DIRECT)
3931 		return (sodgram_direct(so, name, namelen, uiop, flags));
3932 
3933 	if (so->so_family == AF_UNIX) {
3934 		if (so->so_state & SS_FADDR_NOXLATE) {
3935 			/*
3936 			 * Already have a transport internal address. Do not
3937 			 * pass any (transport internal) source address.
3938 			 */
3939 			addr = name;
3940 			addrlen = namelen;
3941 			src = NULL;
3942 			srclen = 0;
3943 		} else {
3944 			/*
3945 			 * Pass the sockaddr_un source address as an option
3946 			 * and translate the remote address.
3947 			 *
3948 			 * Note that this code does not prevent so_laddr_sa
3949 			 * from changing while it is being used. Thus
3950 			 * if an unbind+bind occurs concurrently with this
3951 			 * send the peer might see a partially new and a
3952 			 * partially old "from" address.
3953 			 */
3954 			src = so->so_laddr_sa;
3955 			srclen = (socklen_t)so->so_laddr_len;
3956 			dprintso(so, 1,
3957 			    ("sosend_dgram UNIX: srclen %d, src %p\n",
3958 			    srclen, src));
3959 			error = so_ux_addr_xlate(so, name, namelen,
3960 			    (flags & MSG_XPG4_2),
3961 			    &addr, &addrlen);
3962 			if (error) {
3963 				eprintsoline(so, error);
3964 				goto done;
3965 			}
3966 		}
3967 	} else {
3968 		addr = name;
3969 		addrlen = namelen;
3970 		src = NULL;
3971 		srclen = 0;
3972 	}
3973 	tudr.PRIM_type = T_UNITDATA_REQ;
3974 	tudr.DEST_length = addrlen;
3975 	tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3976 	if (srclen == 0) {
3977 		tudr.OPT_length = 0;
3978 		tudr.OPT_offset = 0;
3979 
3980 		mp = soallocproto2(&tudr, sizeof (tudr),
3981 		    addr, addrlen, 0, _ALLOC_INTR);
3982 		if (mp == NULL) {
3983 			/*
3984 			 * Caught a signal waiting for memory.
3985 			 * Let send* return EINTR.
3986 			 */
3987 			error = EINTR;
3988 			goto done;
3989 		}
3990 	} else {
3991 		/*
3992 		 * There is a AF_UNIX sockaddr_un to include as a source
3993 		 * address option.
3994 		 */
3995 		struct T_opthdr toh;
3996 		ssize_t size;
3997 
3998 		tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
3999 		    _TPI_ALIGN_TOPT(srclen));
4000 		tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
4001 		    _TPI_ALIGN_TOPT(addrlen));
4002 
4003 		toh.level = SOL_SOCKET;
4004 		toh.name = SO_SRCADDR;
4005 		toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
4006 		toh.status = 0;
4007 
4008 		size = tudr.OPT_offset + tudr.OPT_length;
4009 		mp = soallocproto2(&tudr, sizeof (tudr),
4010 		    addr, addrlen, size, _ALLOC_INTR);
4011 		if (mp == NULL) {
4012 			/*
4013 			 * Caught a signal waiting for memory.
4014 			 * Let send* return EINTR.
4015 			 */
4016 			error = EINTR;
4017 			goto done;
4018 		}
4019 		mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
4020 		soappendmsg(mp, &toh, sizeof (toh));
4021 		soappendmsg(mp, src, srclen);
4022 		mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
4023 		ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
4024 	}
4025 
4026 	if (audit_active)
4027 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4028 
4029 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4030 done:
4031 #ifdef SOCK_DEBUG
4032 	if (error) {
4033 		eprintsoline(so, error);
4034 	}
4035 #endif /* SOCK_DEBUG */
4036 	return (error);
4037 }
4038 
4039 /*
4040  * Sending data on a connected stream socket.
4041  * Assumes caller has verified that SS_ISCONNECTED is set.
4042  */
4043 int
4044 sosend_svc(struct sonode *so,
4045 	struct uio *uiop,
4046 	t_scalar_t prim,
4047 	int more,
4048 	int sflag)
4049 {
4050 	struct T_data_req	tdr;
4051 	mblk_t			*mp;
4052 	int			error;
4053 	ssize_t			iosize;
4054 	int			first = 1;
4055 
4056 	dprintso(so, 1,
4057 	    ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
4058 	    (void *)so, uiop->uio_resid, prim, sflag));
4059 
4060 	/*
4061 	 * Has to be bound and connected. However, since no locks are
4062 	 * held the state could have changed after sotpi_sendmsg checked it
4063 	 * thus it is not possible to ASSERT on the state.
4064 	 */
4065 
4066 	do {
4067 		/*
4068 		 * Set the MORE flag if uio_resid does not fit in this
4069 		 * message or if the caller passed in "more".
4070 		 * Error for transports with zero tidu_size.
4071 		 */
4072 		tdr.PRIM_type = prim;
4073 		iosize = so->so_tidu_size;
4074 		if (iosize <= 0)
4075 			return (EMSGSIZE);
4076 		if (uiop->uio_resid > iosize) {
4077 			tdr.MORE_flag = 1;
4078 		} else {
4079 			if (more)
4080 				tdr.MORE_flag = 1;
4081 			else
4082 				tdr.MORE_flag = 0;
4083 			iosize = uiop->uio_resid;
4084 		}
4085 		dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4086 		    prim, tdr.MORE_flag, iosize));
4087 		mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR);
4088 		if (mp == NULL) {
4089 			/*
4090 			 * Caught a signal waiting for memory.
4091 			 * Let send* return EINTR.
4092 			 */
4093 			if (first)
4094 				return (EINTR);
4095 			else
4096 				return (0);
4097 		}
4098 
4099 		error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4100 		    0, sflag | MSG_BAND, 0);
4101 		if (error) {
4102 			if (!first && error == EWOULDBLOCK)
4103 				return (0);
4104 			eprintsoline(so, error);
4105 			return (error);
4106 		}
4107 		first = 0;
4108 		if (uiop->uio_resid > 0) {
4109 			/*
4110 			 * Recheck for fatal errors. Fail write even though
4111 			 * some data have been written. This is consistent
4112 			 * with strwrite semantics and BSD sockets semantics.
4113 			 */
4114 			if (so->so_state & SS_CANTSENDMORE) {
4115 				tsignal(curthread, SIGPIPE);
4116 				eprintsoline(so, error);
4117 				return (EPIPE);
4118 			}
4119 			if (so->so_error != 0) {
4120 				mutex_enter(&so->so_lock);
4121 				error = sogeterr(so);
4122 				mutex_exit(&so->so_lock);
4123 				if (error != 0) {
4124 					eprintsoline(so, error);
4125 					return (error);
4126 				}
4127 			}
4128 		}
4129 	} while (uiop->uio_resid > 0);
4130 	return (0);
4131 }
4132 
4133 /*
4134  * Check the state for errors and call the appropriate send function.
4135  *
4136  * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4137  * this function issues a setsockopt to toggle SO_DONTROUTE before and
4138  * after sending the message.
4139  */
4140 static int
4141 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
4142 {
4143 	int		so_state;
4144 	int		so_mode;
4145 	int		error;
4146 	struct sockaddr *name;
4147 	t_uscalar_t	namelen;
4148 	int		dontroute;
4149 	int		flags;
4150 
4151 	dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4152 	    (void *)so, (void *)msg, msg->msg_flags,
4153 	    pr_state(so->so_state, so->so_mode), so->so_error));
4154 
4155 	mutex_enter(&so->so_lock);
4156 	so_state = so->so_state;
4157 
4158 	if (so_state & SS_CANTSENDMORE) {
4159 		mutex_exit(&so->so_lock);
4160 		tsignal(curthread, SIGPIPE);
4161 		return (EPIPE);
4162 	}
4163 
4164 	if (so->so_error != 0) {
4165 		error = sogeterr(so);
4166 		if (error != 0) {
4167 			mutex_exit(&so->so_lock);
4168 			return (error);
4169 		}
4170 	}
4171 
4172 	name = (struct sockaddr *)msg->msg_name;
4173 	namelen = msg->msg_namelen;
4174 
4175 	so_mode = so->so_mode;
4176 
4177 	if (name == NULL) {
4178 		if (!(so_state & SS_ISCONNECTED)) {
4179 			mutex_exit(&so->so_lock);
4180 			if (so_mode & SM_CONNREQUIRED)
4181 				return (ENOTCONN);
4182 			else
4183 				return (EDESTADDRREQ);
4184 		}
4185 		if (so_mode & SM_CONNREQUIRED) {
4186 			name = NULL;
4187 			namelen = 0;
4188 		} else {
4189 			/*
4190 			 * Note that this code does not prevent so_faddr_sa
4191 			 * from changing while it is being used. Thus
4192 			 * if an "unconnect"+connect occurs concurrently with
4193 			 * this send the datagram might be delivered to a
4194 			 * garbaled address.
4195 			 */
4196 			ASSERT(so->so_faddr_sa);
4197 			name = so->so_faddr_sa;
4198 			namelen = (t_uscalar_t)so->so_faddr_len;
4199 		}
4200 	} else {
4201 		if (!(so_state & SS_ISCONNECTED) &&
4202 		    (so_mode & SM_CONNREQUIRED)) {
4203 			/* Required but not connected */
4204 			mutex_exit(&so->so_lock);
4205 			return (ENOTCONN);
4206 		}
4207 		/*
4208 		 * Ignore the address on connection-oriented sockets.
4209 		 * Just like BSD this code does not generate an error for
4210 		 * TCP (a CONNREQUIRED socket) when sending to an address
4211 		 * passed in with sendto/sendmsg. Instead the data is
4212 		 * delivered on the connection as if no address had been
4213 		 * supplied.
4214 		 */
4215 		if ((so_state & SS_ISCONNECTED) &&
4216 		    !(so_mode & SM_CONNREQUIRED)) {
4217 			mutex_exit(&so->so_lock);
4218 			return (EISCONN);
4219 		}
4220 		if (!(so_state & SS_ISBOUND)) {
4221 			so_lock_single(so);	/* Set SOLOCKED */
4222 			error = sotpi_bind(so, NULL, 0,
4223 			    _SOBIND_UNSPEC|_SOBIND_LOCK_HELD);
4224 			so_unlock_single(so, SOLOCKED);
4225 			if (error) {
4226 				mutex_exit(&so->so_lock);
4227 				eprintsoline(so, error);
4228 				return (error);
4229 			}
4230 		}
4231 		/*
4232 		 * Handle delayed datagram errors. These are only queued
4233 		 * when the application sets SO_DGRAM_ERRIND.
4234 		 * Return the error if we are sending to the address
4235 		 * that was returned in the last T_UDERROR_IND.
4236 		 * If sending to some other address discard the delayed
4237 		 * error indication.
4238 		 */
4239 		if (so->so_delayed_error) {
4240 			struct T_uderror_ind	*tudi;
4241 			void			*addr;
4242 			t_uscalar_t		addrlen;
4243 			boolean_t		match = B_FALSE;
4244 
4245 			ASSERT(so->so_eaddr_mp);
4246 			error = so->so_delayed_error;
4247 			so->so_delayed_error = 0;
4248 			tudi = (struct T_uderror_ind *)so->so_eaddr_mp->b_rptr;
4249 			addrlen = tudi->DEST_length;
4250 			addr = sogetoff(so->so_eaddr_mp,
4251 			    tudi->DEST_offset,
4252 			    addrlen, 1);
4253 			ASSERT(addr);	/* Checked by strsock_proto */
4254 			switch (so->so_family) {
4255 			case AF_INET: {
4256 				/* Compare just IP address and port */
4257 				sin_t *sin1 = (sin_t *)name;
4258 				sin_t *sin2 = (sin_t *)addr;
4259 
4260 				if (addrlen == sizeof (sin_t) &&
4261 				    namelen == addrlen &&
4262 				    sin1->sin_port == sin2->sin_port &&
4263 				    sin1->sin_addr.s_addr ==
4264 				    sin2->sin_addr.s_addr)
4265 					match = B_TRUE;
4266 				break;
4267 			}
4268 			case AF_INET6: {
4269 				/* Compare just IP address and port. Not flow */
4270 				sin6_t *sin1 = (sin6_t *)name;
4271 				sin6_t *sin2 = (sin6_t *)addr;
4272 
4273 				if (addrlen == sizeof (sin6_t) &&
4274 				    namelen == addrlen &&
4275 				    sin1->sin6_port == sin2->sin6_port &&
4276 				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4277 				    &sin2->sin6_addr))
4278 					match = B_TRUE;
4279 				break;
4280 			}
4281 			case AF_UNIX:
4282 			default:
4283 				if (namelen == addrlen &&
4284 				    bcmp(name, addr, namelen) == 0)
4285 					match = B_TRUE;
4286 			}
4287 			if (match) {
4288 				freemsg(so->so_eaddr_mp);
4289 				so->so_eaddr_mp = NULL;
4290 				mutex_exit(&so->so_lock);
4291 #ifdef DEBUG
4292 				dprintso(so, 0,
4293 				    ("sockfs delayed error %d for %s\n",
4294 				    error,
4295 				    pr_addr(so->so_family, name, namelen)));
4296 #endif /* DEBUG */
4297 				return (error);
4298 			}
4299 			freemsg(so->so_eaddr_mp);
4300 			so->so_eaddr_mp = NULL;
4301 		}
4302 	}
4303 	mutex_exit(&so->so_lock);
4304 
4305 	flags = msg->msg_flags;
4306 	dontroute = 0;
4307 	if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4308 		uint32_t	val;
4309 
4310 		val = 1;
4311 		error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4312 		    &val, (t_uscalar_t)sizeof (val));
4313 		if (error)
4314 			return (error);
4315 		dontroute = 1;
4316 	}
4317 
4318 	if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4319 		error = EOPNOTSUPP;
4320 		goto done;
4321 	}
4322 	if (msg->msg_controllen != 0) {
4323 		if (!(so_mode & SM_CONNREQUIRED)) {
4324 			error = sosend_dgramcmsg(so, name, namelen, uiop,
4325 			    msg->msg_control, msg->msg_controllen, flags);
4326 		} else {
4327 			if (flags & MSG_OOB) {
4328 				/* Can't generate T_EXDATA_REQ with options */
4329 				error = EOPNOTSUPP;
4330 				goto done;
4331 			}
4332 			error = sosend_svccmsg(so, uiop,
4333 			    !(flags & MSG_EOR),
4334 			    msg->msg_control, msg->msg_controllen,
4335 			    flags);
4336 		}
4337 		goto done;
4338 	}
4339 
4340 	if (!(so_mode & SM_CONNREQUIRED)) {
4341 		/*
4342 		 * If there is no SO_DONTROUTE to turn off return immediately
4343 		 * from send_dgram. This can allow tail-call optimizations.
4344 		 */
4345 		if (!dontroute) {
4346 			return (sosend_dgram(so, name, namelen, uiop, flags));
4347 		}
4348 		error = sosend_dgram(so, name, namelen, uiop, flags);
4349 	} else {
4350 		t_scalar_t prim;
4351 		int sflag;
4352 
4353 		/* Ignore msg_name in the connected state */
4354 		if (flags & MSG_OOB) {
4355 			prim = T_EXDATA_REQ;
4356 			/*
4357 			 * Send down T_EXDATA_REQ even if there is flow
4358 			 * control for data.
4359 			 */
4360 			sflag = MSG_IGNFLOW;
4361 		} else {
4362 			if (so_mode & SM_BYTESTREAM) {
4363 				/* Byte stream transport - use write */
4364 
4365 				dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4366 				/*
4367 				 * If there is no SO_DONTROUTE to turn off,
4368 				 * SS_DIRECT is on, and there is no flow
4369 				 * control, we can take the fast path.
4370 				 */
4371 				if (!dontroute &&
4372 				    (so_state & SS_DIRECT) &&
4373 				    canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4374 					return (sostream_direct(so, uiop,
4375 					    NULL, CRED()));
4376 				}
4377 				error = strwrite(SOTOV(so), uiop, CRED());
4378 				goto done;
4379 			}
4380 			prim = T_DATA_REQ;
4381 			sflag = 0;
4382 		}
4383 		/*
4384 		 * If there is no SO_DONTROUTE to turn off return immediately
4385 		 * from sosend_svc. This can allow tail-call optimizations.
4386 		 */
4387 		if (!dontroute)
4388 			return (sosend_svc(so, uiop, prim,
4389 			    !(flags & MSG_EOR), sflag));
4390 		error = sosend_svc(so, uiop, prim,
4391 		    !(flags & MSG_EOR), sflag);
4392 	}
4393 	ASSERT(dontroute);
4394 done:
4395 	if (dontroute) {
4396 		uint32_t	val;
4397 
4398 		val = 0;
4399 		(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4400 		    &val, (t_uscalar_t)sizeof (val));
4401 	}
4402 	return (error);
4403 }
4404 
4405 /*
4406  * Sending data on a datagram socket.
4407  * Assumes caller has verified that SS_ISBOUND etc. are set.
4408  */
4409 /* ARGSUSED */
4410 static int
4411 sodgram_direct(struct sonode *so, struct sockaddr *name,
4412     socklen_t namelen, struct uio *uiop, int flags)
4413 {
4414 	struct T_unitdata_req	tudr;
4415 	mblk_t			*mp = NULL;
4416 	int			error = 0;
4417 	void			*addr;
4418 	socklen_t		addrlen;
4419 	ssize_t			len;
4420 	struct stdata		*stp = SOTOV(so)->v_stream;
4421 	int			so_state;
4422 	queue_t			*udp_wq;
4423 	boolean_t		connected;
4424 	mblk_t			*mpdata = NULL;
4425 
4426 	ASSERT(name != NULL && namelen != 0);
4427 	ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4428 	ASSERT(!(so->so_mode & SM_EXDATA));
4429 	ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4430 	ASSERT(SOTOV(so)->v_type == VSOCK);
4431 
4432 	/* Caller checked for proper length */
4433 	len = uiop->uio_resid;
4434 	ASSERT(len <= so->so_tidu_size);
4435 
4436 	/* Length and family checks have been done by caller */
4437 	ASSERT(name->sa_family == so->so_family);
4438 	ASSERT(so->so_family == AF_INET ||
4439 	    (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4440 	ASSERT(so->so_family == AF_INET6 ||
4441 	    (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4442 
4443 	addr = name;
4444 	addrlen = namelen;
4445 
4446 	if (stp->sd_sidp != NULL &&
4447 	    (error = straccess(stp, JCWRITE)) != 0)
4448 		goto done;
4449 
4450 	so_state = so->so_state;
4451 
4452 	connected = so_state & SS_ISCONNECTED;
4453 	if (!connected) {
4454 		tudr.PRIM_type = T_UNITDATA_REQ;
4455 		tudr.DEST_length = addrlen;
4456 		tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4457 		tudr.OPT_length = 0;
4458 		tudr.OPT_offset = 0;
4459 
4460 		mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4461 		    _ALLOC_INTR);
4462 		if (mp == NULL) {
4463 			/*
4464 			 * Caught a signal waiting for memory.
4465 			 * Let send* return EINTR.
4466 			 */
4467 			error = EINTR;
4468 			goto done;
4469 		}
4470 	}
4471 
4472 	/*
4473 	 * For UDP we don't break up the copyin into smaller pieces
4474 	 * as in the TCP case.  That means if ENOMEM is returned by
4475 	 * mcopyinuio() then the uio vector has not been modified at
4476 	 * all and we fallback to either strwrite() or kstrputmsg()
4477 	 * below.  Note also that we never generate priority messages
4478 	 * from here.
4479 	 */
4480 	udp_wq = stp->sd_wrq->q_next;
4481 	if (canput(udp_wq) &&
4482 	    (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4483 		ASSERT(DB_TYPE(mpdata) == M_DATA);
4484 		ASSERT(uiop->uio_resid == 0);
4485 		if (!connected)
4486 			linkb(mp, mpdata);
4487 		else
4488 			mp = mpdata;
4489 		if (audit_active)
4490 			audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4491 
4492 		udp_wput(udp_wq, mp);
4493 		return (0);
4494 	}
4495 
4496 	ASSERT(mpdata == NULL);
4497 	if (error != 0 && error != ENOMEM) {
4498 		freemsg(mp);
4499 		return (error);
4500 	}
4501 
4502 	/*
4503 	 * For connected, let strwrite() handle the blocking case.
4504 	 * Otherwise we fall thru and use kstrputmsg().
4505 	 */
4506 	if (connected)
4507 		return (strwrite(SOTOV(so), uiop, CRED()));
4508 
4509 	if (audit_active)
4510 		audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4511 
4512 	error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4513 done:
4514 #ifdef SOCK_DEBUG
4515 	if (error != 0) {
4516 		eprintsoline(so, error);
4517 	}
4518 #endif /* SOCK_DEBUG */
4519 	return (error);
4520 }
4521 
4522 int
4523 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4524 {
4525 	struct stdata *stp = SOTOV(so)->v_stream;
4526 	ssize_t iosize, rmax, maxblk;
4527 	queue_t *tcp_wq = stp->sd_wrq->q_next;
4528 	mblk_t *newmp;
4529 	int error = 0, wflag = 0;
4530 
4531 	ASSERT(so->so_mode & SM_BYTESTREAM);
4532 	ASSERT(SOTOV(so)->v_type == VSOCK);
4533 
4534 	if (stp->sd_sidp != NULL &&
4535 	    (error = straccess(stp, JCWRITE)) != 0)
4536 		return (error);
4537 
4538 	if (uiop == NULL) {
4539 		/*
4540 		 * kstrwritemp() should have checked sd_flag and
4541 		 * flow-control before coming here.  If we end up
4542 		 * here it means that we can simply pass down the
4543 		 * data to tcp.
4544 		 */
4545 		ASSERT(mp != NULL);
4546 		if (stp->sd_wputdatafunc != NULL) {
4547 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4548 			    NULL, NULL, NULL);
4549 			if (newmp == NULL) {
4550 				/* The caller will free mp */
4551 				return (ECOMM);
4552 			}
4553 			mp = newmp;
4554 		}
4555 		tcp_wput(tcp_wq, mp);
4556 		return (0);
4557 	}
4558 
4559 	/* Fallback to strwrite() to do proper error handling */
4560 	if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4561 		return (strwrite(SOTOV(so), uiop, cr));
4562 
4563 	rmax = stp->sd_qn_maxpsz;
4564 	ASSERT(rmax >= 0 || rmax == INFPSZ);
4565 	if (rmax == 0 || uiop->uio_resid <= 0)
4566 		return (0);
4567 
4568 	if (rmax == INFPSZ)
4569 		rmax = uiop->uio_resid;
4570 
4571 	maxblk = stp->sd_maxblk;
4572 
4573 	for (;;) {
4574 		iosize = MIN(uiop->uio_resid, rmax);
4575 
4576 		mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4577 		if (mp == NULL) {
4578 			/*
4579 			 * Fallback to strwrite() for ENOMEM; if this
4580 			 * is our first time in this routine and the uio
4581 			 * vector has not been modified, we will end up
4582 			 * calling strwrite() without any flag set.
4583 			 */
4584 			if (error == ENOMEM)
4585 				goto slow_send;
4586 			else
4587 				return (error);
4588 		}
4589 		ASSERT(uiop->uio_resid >= 0);
4590 		/*
4591 		 * If mp is non-NULL and ENOMEM is set, it means that
4592 		 * mcopyinuio() was able to break down some of the user
4593 		 * data into one or more mblks.  Send the partial data
4594 		 * to tcp and let the rest be handled in strwrite().
4595 		 */
4596 		ASSERT(error == 0 || error == ENOMEM);
4597 		if (stp->sd_wputdatafunc != NULL) {
4598 			newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4599 			    NULL, NULL, NULL);
4600 			if (newmp == NULL) {
4601 				/* The caller will free mp */
4602 				return (ECOMM);
4603 			}
4604 			mp = newmp;
4605 		}
4606 		tcp_wput(tcp_wq, mp);
4607 
4608 		wflag |= NOINTR;
4609 
4610 		if (uiop->uio_resid == 0) {	/* No more data; we're done */
4611 			ASSERT(error == 0);
4612 			break;
4613 		} else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4614 		    (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4615 slow_send:
4616 			/*
4617 			 * We were able to send down partial data using
4618 			 * the direct call interface, but are now relying
4619 			 * on strwrite() to handle the non-fastpath cases.
4620 			 * If the socket is blocking we will sleep in
4621 			 * strwaitq() until write is permitted, otherwise,
4622 			 * we will need to return the amount of bytes
4623 			 * written so far back to the app.  This is the
4624 			 * reason why we pass NOINTR flag to strwrite()
4625 			 * for non-blocking socket, because we don't want
4626 			 * to return EAGAIN when portion of the user data
4627 			 * has actually been sent down.
4628 			 */
4629 			return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4630 		}
4631 	}
4632 	return (0);
4633 }
4634 
4635 /*
4636  * Update so_faddr by asking the transport (unless AF_UNIX).
4637  */
4638 int
4639 sotpi_getpeername(struct sonode *so)
4640 {
4641 	struct strbuf	strbuf;
4642 	int		error = 0, res;
4643 	void		*addr;
4644 	t_uscalar_t	addrlen;
4645 	k_sigset_t	smask;
4646 
4647 	dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4648 	    (void *)so, pr_state(so->so_state, so->so_mode)));
4649 
4650 	mutex_enter(&so->so_lock);
4651 	so_lock_single(so);	/* Set SOLOCKED */
4652 	if (!(so->so_state & SS_ISCONNECTED)) {
4653 		error = ENOTCONN;
4654 		goto done;
4655 	}
4656 	/* Added this check for X/Open */
4657 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4658 		error = EINVAL;
4659 		if (xnet_check_print) {
4660 			printf("sockfs: X/Open getpeername check => EINVAL\n");
4661 		}
4662 		goto done;
4663 	}
4664 #ifdef DEBUG
4665 	dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
4666 	    pr_addr(so->so_family, so->so_faddr_sa,
4667 	    (t_uscalar_t)so->so_faddr_len)));
4668 #endif /* DEBUG */
4669 
4670 	if (so->so_family == AF_UNIX) {
4671 		/* Transport has different name space - return local info */
4672 		error = 0;
4673 		goto done;
4674 	}
4675 
4676 	ASSERT(so->so_faddr_sa);
4677 	/* Allocate local buffer to use with ioctl */
4678 	addrlen = (t_uscalar_t)so->so_faddr_maxlen;
4679 	mutex_exit(&so->so_lock);
4680 	addr = kmem_alloc(addrlen, KM_SLEEP);
4681 
4682 	/*
4683 	 * Issue TI_GETPEERNAME with signals masked.
4684 	 * Put the result in so_faddr_sa so that getpeername works after
4685 	 * a shutdown(output).
4686 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4687 	 * back to the socket.
4688 	 */
4689 	strbuf.buf = addr;
4690 	strbuf.maxlen = addrlen;
4691 	strbuf.len = 0;
4692 
4693 	sigintr(&smask, 0);
4694 	res = 0;
4695 	ASSERT(CRED());
4696 	error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
4697 	    0, K_TO_K, CRED(), &res);
4698 	sigunintr(&smask);
4699 
4700 	mutex_enter(&so->so_lock);
4701 	/*
4702 	 * If there is an error record the error in so_error put don't fail
4703 	 * the getpeername. Instead fallback on the recorded
4704 	 * so->so_faddr_sa.
4705 	 */
4706 	if (error) {
4707 		/*
4708 		 * Various stream head errors can be returned to the ioctl.
4709 		 * However, it is impossible to determine which ones of
4710 		 * these are really socket level errors that were incorrectly
4711 		 * consumed by the ioctl. Thus this code silently ignores the
4712 		 * error - to code explicitly does not reinstate the error
4713 		 * using soseterror().
4714 		 * Experiments have shows that at least this set of
4715 		 * errors are reported and should not be reinstated on the
4716 		 * socket:
4717 		 *	EINVAL	E.g. if an I_LINK was in effect when
4718 		 *		getpeername was called.
4719 		 *	EPIPE	The ioctl error semantics prefer the write
4720 		 *		side error over the read side error.
4721 		 *	ENOTCONN The transport just got disconnected but
4722 		 *		sockfs had not yet seen the T_DISCON_IND
4723 		 *		when issuing the ioctl.
4724 		 */
4725 		error = 0;
4726 	} else if (res == 0 && strbuf.len > 0 &&
4727 	    (so->so_state & SS_ISCONNECTED)) {
4728 		ASSERT(strbuf.len <= (int)so->so_faddr_maxlen);
4729 		so->so_faddr_len = (socklen_t)strbuf.len;
4730 		bcopy(addr, so->so_faddr_sa, so->so_faddr_len);
4731 		so->so_state |= SS_FADDR_VALID;
4732 	}
4733 	kmem_free(addr, addrlen);
4734 #ifdef DEBUG
4735 	dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
4736 	    pr_addr(so->so_family, so->so_faddr_sa,
4737 	    (t_uscalar_t)so->so_faddr_len)));
4738 #endif /* DEBUG */
4739 done:
4740 	so_unlock_single(so, SOLOCKED);
4741 	mutex_exit(&so->so_lock);
4742 	return (error);
4743 }
4744 
4745 /*
4746  * Update so_laddr by asking the transport (unless AF_UNIX).
4747  */
4748 int
4749 sotpi_getsockname(struct sonode *so)
4750 {
4751 	struct strbuf	strbuf;
4752 	int		error = 0, res;
4753 	void		*addr;
4754 	t_uscalar_t	addrlen;
4755 	k_sigset_t	smask;
4756 
4757 	dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
4758 	    (void *)so, pr_state(so->so_state, so->so_mode)));
4759 
4760 	mutex_enter(&so->so_lock);
4761 	so_lock_single(so);	/* Set SOLOCKED */
4762 	if (!(so->so_state & SS_ISBOUND) && so->so_family != AF_UNIX) {
4763 		/* Return an all zero address except for the family */
4764 		if (so->so_family == AF_INET)
4765 			so->so_laddr_len = (socklen_t)sizeof (sin_t);
4766 		else if (so->so_family == AF_INET6)
4767 			so->so_laddr_len = (socklen_t)sizeof (sin6_t);
4768 		ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
4769 		bzero(so->so_laddr_sa, so->so_laddr_len);
4770 		/*
4771 		 * Can not assume there is a sa_family for all
4772 		 * protocol families.
4773 		 */
4774 		if (so->so_family == AF_INET || so->so_family == AF_INET6)
4775 			so->so_laddr_sa->sa_family = so->so_family;
4776 	}
4777 #ifdef DEBUG
4778 	dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
4779 	    pr_addr(so->so_family, so->so_laddr_sa,
4780 	    (t_uscalar_t)so->so_laddr_len)));
4781 #endif /* DEBUG */
4782 	if (so->so_family == AF_UNIX) {
4783 		/* Transport has different name space - return local info */
4784 		error = 0;
4785 		goto done;
4786 	}
4787 	if (!(so->so_state & SS_ISBOUND)) {
4788 		/* If not bound, then nothing to return. */
4789 		error = 0;
4790 		goto done;
4791 	}
4792 	/* Allocate local buffer to use with ioctl */
4793 	addrlen = (t_uscalar_t)so->so_laddr_maxlen;
4794 	mutex_exit(&so->so_lock);
4795 	addr = kmem_alloc(addrlen, KM_SLEEP);
4796 
4797 	/*
4798 	 * Issue TI_GETMYNAME with signals masked.
4799 	 * Put the result in so_laddr_sa so that getsockname works after
4800 	 * a shutdown(output).
4801 	 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4802 	 * back to the socket.
4803 	 */
4804 	strbuf.buf = addr;
4805 	strbuf.maxlen = addrlen;
4806 	strbuf.len = 0;
4807 
4808 	sigintr(&smask, 0);
4809 	res = 0;
4810 	ASSERT(CRED());
4811 	error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
4812 	    0, K_TO_K, CRED(), &res);
4813 	sigunintr(&smask);
4814 
4815 	mutex_enter(&so->so_lock);
4816 	/*
4817 	 * If there is an error record the error in so_error put don't fail
4818 	 * the getsockname. Instead fallback on the recorded
4819 	 * so->so_laddr_sa.
4820 	 */
4821 	if (error) {
4822 		/*
4823 		 * Various stream head errors can be returned to the ioctl.
4824 		 * However, it is impossible to determine which ones of
4825 		 * these are really socket level errors that were incorrectly
4826 		 * consumed by the ioctl. Thus this code silently ignores the
4827 		 * error - to code explicitly does not reinstate the error
4828 		 * using soseterror().
4829 		 * Experiments have shows that at least this set of
4830 		 * errors are reported and should not be reinstated on the
4831 		 * socket:
4832 		 *	EINVAL	E.g. if an I_LINK was in effect when
4833 		 *		getsockname was called.
4834 		 *	EPIPE	The ioctl error semantics prefer the write
4835 		 *		side error over the read side error.
4836 		 */
4837 		error = 0;
4838 	} else if (res == 0 && strbuf.len > 0 &&
4839 	    (so->so_state & SS_ISBOUND)) {
4840 		ASSERT(strbuf.len <= (int)so->so_laddr_maxlen);
4841 		so->so_laddr_len = (socklen_t)strbuf.len;
4842 		bcopy(addr, so->so_laddr_sa, so->so_laddr_len);
4843 		so->so_state |= SS_LADDR_VALID;
4844 	}
4845 	kmem_free(addr, addrlen);
4846 #ifdef DEBUG
4847 	dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
4848 	    pr_addr(so->so_family, so->so_laddr_sa,
4849 	    (t_uscalar_t)so->so_laddr_len)));
4850 #endif /* DEBUG */
4851 done:
4852 	so_unlock_single(so, SOLOCKED);
4853 	mutex_exit(&so->so_lock);
4854 	return (error);
4855 }
4856 
4857 /*
4858  * Get socket options. For SOL_SOCKET options some options are handled
4859  * by the sockfs while others use the value recorded in the sonode as a
4860  * fallback should the T_SVR4_OPTMGMT_REQ fail.
4861  *
 * On return at most *optlenp bytes are copied to optval.
4863  */
4864 int
4865 sotpi_getsockopt(struct sonode *so, int level, int option_name,
4866 		void *optval, socklen_t *optlenp, int flags)
4867 {
4868 	struct T_optmgmt_req	optmgmt_req;
4869 	struct T_optmgmt_ack	*optmgmt_ack;
4870 	struct opthdr		oh;
4871 	struct opthdr		*opt_res;
4872 	mblk_t			*mp = NULL;
4873 	int			error = 0;
4874 	void			*option = NULL;	/* Set if fallback value */
4875 	t_uscalar_t		maxlen = *optlenp;
4876 	t_uscalar_t		len;
4877 	uint32_t		value;
4878 
4879 	dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
4880 	    (void *)so, level, option_name, optval, (void *)optlenp,
4881 	    pr_state(so->so_state, so->so_mode)));
4882 
4883 	mutex_enter(&so->so_lock);
4884 	so_lock_single(so);	/* Set SOLOCKED */
4885 
4886 	/*
4887 	 * Check for SOL_SOCKET options.
4888 	 * Certain SOL_SOCKET options are returned directly whereas
4889 	 * others only provide a default (fallback) value should
4890 	 * the T_SVR4_OPTMGMT_REQ fail.
4891 	 */
4892 	if (level == SOL_SOCKET) {
4893 		/* Check parameters */
4894 		switch (option_name) {
4895 		case SO_TYPE:
4896 		case SO_ERROR:
4897 		case SO_DEBUG:
4898 		case SO_ACCEPTCONN:
4899 		case SO_REUSEADDR:
4900 		case SO_KEEPALIVE:
4901 		case SO_DONTROUTE:
4902 		case SO_BROADCAST:
4903 		case SO_USELOOPBACK:
4904 		case SO_OOBINLINE:
4905 		case SO_SNDBUF:
4906 		case SO_RCVBUF:
4907 #ifdef notyet
4908 		case SO_SNDLOWAT:
4909 		case SO_RCVLOWAT:
4910 		case SO_SNDTIMEO:
4911 		case SO_RCVTIMEO:
4912 #endif /* notyet */
4913 		case SO_DOMAIN:
4914 		case SO_DGRAM_ERRIND:
4915 			if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
4916 				error = EINVAL;
4917 				eprintsoline(so, error);
4918 				goto done2;
4919 			}
4920 			break;
4921 		case SO_LINGER:
4922 			if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
4923 				error = EINVAL;
4924 				eprintsoline(so, error);
4925 				goto done2;
4926 			}
4927 			break;
4928 		}
4929 
4930 		len = (t_uscalar_t)sizeof (uint32_t);	/* Default */
4931 
4932 		switch (option_name) {
4933 		case SO_TYPE:
4934 			value = so->so_type;
4935 			option = &value;
4936 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4937 
4938 		case SO_ERROR:
4939 			value = sogeterr(so);
4940 			option = &value;
4941 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4942 
4943 		case SO_ACCEPTCONN:
4944 			if (so->so_state & SS_ACCEPTCONN)
4945 				value = SO_ACCEPTCONN;
4946 			else
4947 				value = 0;
4948 #ifdef DEBUG
4949 			if (value) {
4950 				dprintso(so, 1,
4951 				    ("sotpi_getsockopt: 0x%x is set\n",
4952 				    option_name));
4953 			} else {
4954 				dprintso(so, 1,
4955 				    ("sotpi_getsockopt: 0x%x not set\n",
4956 				    option_name));
4957 			}
4958 #endif /* DEBUG */
4959 			option = &value;
4960 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4961 
4962 		case SO_DEBUG:
4963 		case SO_REUSEADDR:
4964 		case SO_KEEPALIVE:
4965 		case SO_DONTROUTE:
4966 		case SO_BROADCAST:
4967 		case SO_USELOOPBACK:
4968 		case SO_OOBINLINE:
4969 		case SO_DGRAM_ERRIND:
4970 			value = (so->so_options & option_name);
4971 #ifdef DEBUG
4972 			if (value) {
4973 				dprintso(so, 1,
4974 				    ("sotpi_getsockopt: 0x%x is set\n",
4975 				    option_name));
4976 			} else {
4977 				dprintso(so, 1,
4978 				    ("sotpi_getsockopt: 0x%x not set\n",
4979 				    option_name));
4980 			}
4981 #endif /* DEBUG */
4982 			option = &value;
4983 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
4984 
4985 		/*
4986 		 * The following options are only returned by sockfs when the
4987 		 * T_SVR4_OPTMGMT_REQ fails.
4988 		 */
4989 		case SO_LINGER:
4990 			option = &so->so_linger;
4991 			len = (t_uscalar_t)sizeof (struct linger);
4992 			break;
4993 		case SO_SNDBUF: {
4994 			ssize_t lvalue;
4995 
4996 			/*
4997 			 * If the option has not been set then get a default
4998 			 * value from the read queue. This value is
4999 			 * returned if the transport fails
5000 			 * the T_SVR4_OPTMGMT_REQ.
5001 			 */
5002 			lvalue = so->so_sndbuf;
5003 			if (lvalue == 0) {
5004 				mutex_exit(&so->so_lock);
5005 				(void) strqget(strvp2wq(SOTOV(so))->q_next,
5006 				    QHIWAT, 0, &lvalue);
5007 				mutex_enter(&so->so_lock);
5008 				dprintso(so, 1,
5009 				    ("got SO_SNDBUF %ld from q\n", lvalue));
5010 			}
5011 			value = (int)lvalue;
5012 			option = &value;
5013 			len = (t_uscalar_t)sizeof (so->so_sndbuf);
5014 			break;
5015 		}
5016 		case SO_RCVBUF: {
5017 			ssize_t lvalue;
5018 
5019 			/*
5020 			 * If the option has not been set then get a default
5021 			 * value from the read queue. This value is
5022 			 * returned if the transport fails
5023 			 * the T_SVR4_OPTMGMT_REQ.
5024 			 *
5025 			 * XXX If SO_RCVBUF has been set and this is an
5026 			 * XPG 4.2 application then do not ask the transport
5027 			 * since the transport might adjust the value and not
5028 			 * return exactly what was set by the application.
5029 			 * For non-XPG 4.2 application we return the value
5030 			 * that the transport is actually using.
5031 			 */
5032 			lvalue = so->so_rcvbuf;
5033 			if (lvalue == 0) {
5034 				mutex_exit(&so->so_lock);
5035 				(void) strqget(RD(strvp2wq(SOTOV(so))),
5036 				    QHIWAT, 0, &lvalue);
5037 				mutex_enter(&so->so_lock);
5038 				dprintso(so, 1,
5039 				    ("got SO_RCVBUF %ld from q\n", lvalue));
5040 			} else if (flags & _SOGETSOCKOPT_XPG4_2) {
5041 				value = (int)lvalue;
5042 				option = &value;
5043 				goto copyout;	/* skip asking transport */
5044 			}
5045 			value = (int)lvalue;
5046 			option = &value;
5047 			len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5048 			break;
5049 		}
5050 		case SO_DOMAIN:
5051 			value = so->so_family;
5052 			option = &value;
5053 			goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5054 
5055 #ifdef notyet
5056 		/*
5057 		 * We do not implement the semantics of these options
5058 		 * thus we shouldn't implement the options either.
5059 		 */
5060 		case SO_SNDLOWAT:
5061 			value = so->so_sndlowat;
5062 			option = &value;
5063 			break;
5064 		case SO_RCVLOWAT:
5065 			value = so->so_rcvlowat;
5066 			option = &value;
5067 			break;
5068 		case SO_SNDTIMEO:
5069 			value = so->so_sndtimeo;
5070 			option = &value;
5071 			break;
5072 		case SO_RCVTIMEO:
5073 			value = so->so_rcvtimeo;
5074 			option = &value;
5075 			break;
5076 #endif /* notyet */
5077 		}
5078 	}
5079 
5080 	mutex_exit(&so->so_lock);
5081 
5082 	/* Send request */
5083 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5084 	optmgmt_req.MGMT_flags = T_CHECK;
5085 	optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5086 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5087 
5088 	oh.level = level;
5089 	oh.name = option_name;
5090 	oh.len = maxlen;
5091 
5092 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5093 	    &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP);
5094 	/* Let option management work in the presence of data flow control */
5095 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5096 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5097 	mp = NULL;
5098 	mutex_enter(&so->so_lock);
5099 	if (error) {
5100 		eprintsoline(so, error);
5101 		goto done2;
5102 	}
5103 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5104 	    (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5105 	if (error) {
5106 		if (option != NULL) {
5107 			/* We have a fallback value */
5108 			error = 0;
5109 			goto copyout;
5110 		}
5111 		eprintsoline(so, error);
5112 		goto done2;
5113 	}
5114 	ASSERT(mp);
5115 	optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5116 	opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5117 	    optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5118 	if (opt_res == NULL) {
5119 		if (option != NULL) {
5120 			/* We have a fallback value */
5121 			error = 0;
5122 			goto copyout;
5123 		}
5124 		error = EPROTO;
5125 		eprintsoline(so, error);
5126 		goto done;
5127 	}
5128 	option = &opt_res[1];
5129 
5130 	/* check to ensure that the option is within bounds */
5131 	if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5132 	    (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5133 		if (option != NULL) {
5134 			/* We have a fallback value */
5135 			error = 0;
5136 			goto copyout;
5137 		}
5138 		error = EPROTO;
5139 		eprintsoline(so, error);
5140 		goto done;
5141 	}
5142 
5143 	len = opt_res->len;
5144 
5145 copyout: {
5146 		t_uscalar_t size = MIN(len, maxlen);
5147 		bcopy(option, optval, size);
5148 		bcopy(&size, optlenp, sizeof (size));
5149 	}
5150 done:
5151 	freemsg(mp);
5152 done2:
5153 	so_unlock_single(so, SOLOCKED);
5154 	mutex_exit(&so->so_lock);
5155 	return (error);
5156 }
5157 
5158 /*
5159  * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5160  * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5161  * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5162  * setsockopt has to work even if the transport does not support the option.
5163  */
5164 int
5165 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5166 	const void *optval, t_uscalar_t optlen)
5167 {
5168 	struct T_optmgmt_req	optmgmt_req;
5169 	struct opthdr		oh;
5170 	mblk_t			*mp;
5171 	int			error = 0;
5172 	boolean_t		handled = B_FALSE;
5173 
5174 	dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5175 	    (void *)so, level, option_name, optval, optlen,
5176 	    pr_state(so->so_state, so->so_mode)));
5177 
5178 
5179 	/* X/Open requires this check */
5180 	if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5181 		if (xnet_check_print)
5182 			printf("sockfs: X/Open setsockopt check => EINVAL\n");
5183 		return (EINVAL);
5184 	}
5185 
5186 	/* Caller allocates aligned optval, or passes null */
5187 	ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
5188 	/* If optval is null optlen is 0, and vice-versa */
5189 	ASSERT(optval != NULL || optlen == 0);
5190 	ASSERT(optlen != 0 || optval == NULL);
5191 
5192 	mutex_enter(&so->so_lock);
5193 	so_lock_single(so);	/* Set SOLOCKED */
5194 	mutex_exit(&so->so_lock);
5195 
5196 	/*
5197 	 * For SOCKET or TCP level options, try to set it here itself
5198 	 * provided socket has not been popped and we know the tcp
5199 	 * structure (stored in so_priv).
5200 	 */
5201 	if ((level == SOL_SOCKET || level == IPPROTO_TCP) &&
5202 	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
5203 	    (so->so_version == SOV_SOCKSTREAM) && (so->so_priv != NULL)) {
5204 		tcp_t		*tcp = so->so_priv;
5205 		boolean_t	onoff;
5206 
5207 #define	intvalue	(*(int32_t *)optval)
5208 
5209 		switch (level) {
5210 		case SOL_SOCKET:
5211 			switch (option_name) {		/* Check length param */
5212 			case SO_DEBUG:
5213 			case SO_REUSEADDR:
5214 			case SO_DONTROUTE:
5215 			case SO_BROADCAST:
5216 			case SO_USELOOPBACK:
5217 			case SO_OOBINLINE:
5218 			case SO_DGRAM_ERRIND:
5219 				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5220 					error = EINVAL;
5221 					eprintsoline(so, error);
5222 					mutex_enter(&so->so_lock);
5223 					goto done2;
5224 				}
5225 				ASSERT(optval);
5226 				onoff = intvalue != 0;
5227 				handled = B_TRUE;
5228 				break;
5229 			case SO_LINGER:
5230 				if (optlen !=
5231 				    (t_uscalar_t)sizeof (struct linger)) {
5232 					error = EINVAL;
5233 					eprintsoline(so, error);
5234 					mutex_enter(&so->so_lock);
5235 					goto done2;
5236 				}
5237 				ASSERT(optval);
5238 				handled = B_TRUE;
5239 				break;
5240 			}
5241 
5242 			switch (option_name) {			/* Do actions */
5243 			case SO_LINGER: {
5244 				struct linger *lgr = (struct linger *)optval;
5245 
5246 				if (lgr->l_onoff) {
5247 					tcp->tcp_linger = 1;
5248 					tcp->tcp_lingertime = lgr->l_linger;
5249 					so->so_linger.l_onoff = SO_LINGER;
5250 					so->so_options |= SO_LINGER;
5251 				} else {
5252 					tcp->tcp_linger = 0;
5253 					tcp->tcp_lingertime = 0;
5254 					so->so_linger.l_onoff = 0;
5255 					so->so_options &= ~SO_LINGER;
5256 				}
5257 				so->so_linger.l_linger = lgr->l_linger;
5258 				handled = B_TRUE;
5259 				break;
5260 			}
5261 			case SO_DEBUG:
5262 				tcp->tcp_debug = onoff;
5263 #ifdef SOCK_TEST
5264 				if (intvalue & 2)
5265 					sock_test_timelimit = 10 * hz;
5266 				else
5267 					sock_test_timelimit = 0;
5268 
5269 				if (intvalue & 4)
5270 					do_useracc = 0;
5271 				else
5272 					do_useracc = 1;
5273 #endif /* SOCK_TEST */
5274 				break;
5275 			case SO_DONTROUTE:
5276 				/*
5277 				 * SO_DONTROUTE, SO_USELOOPBACK and
5278 				 * SO_BROADCAST are only of interest to IP.
5279 				 * We track them here only so
5280 				 * that we can report their current value.
5281 				 */
5282 				tcp->tcp_dontroute = onoff;
5283 				if (onoff)
5284 					so->so_options |= option_name;
5285 				else
5286 					so->so_options &= ~option_name;
5287 				break;
5288 			case SO_USELOOPBACK:
5289 				tcp->tcp_useloopback = onoff;
5290 				if (onoff)
5291 					so->so_options |= option_name;
5292 				else
5293 					so->so_options &= ~option_name;
5294 				break;
5295 			case SO_BROADCAST:
5296 				tcp->tcp_broadcast = onoff;
5297 				if (onoff)
5298 					so->so_options |= option_name;
5299 				else
5300 					so->so_options &= ~option_name;
5301 				break;
5302 			case SO_REUSEADDR:
5303 				tcp->tcp_reuseaddr = onoff;
5304 				if (onoff)
5305 					so->so_options |= option_name;
5306 				else
5307 					so->so_options &= ~option_name;
5308 				break;
5309 			case SO_OOBINLINE:
5310 				tcp->tcp_oobinline = onoff;
5311 				if (onoff)
5312 					so->so_options |= option_name;
5313 				else
5314 					so->so_options &= ~option_name;
5315 				break;
5316 			case SO_DGRAM_ERRIND:
5317 				tcp->tcp_dgram_errind = onoff;
5318 				if (onoff)
5319 					so->so_options |= option_name;
5320 				else
5321 					so->so_options &= ~option_name;
5322 				break;
5323 			}
5324 			break;
5325 		case IPPROTO_TCP:
5326 			switch (option_name) {
5327 			case TCP_NODELAY:
5328 				if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5329 					error = EINVAL;
5330 					eprintsoline(so, error);
5331 					mutex_enter(&so->so_lock);
5332 					goto done2;
5333 				}
5334 				ASSERT(optval);
5335 				tcp->tcp_naglim = intvalue ? 1 : tcp->tcp_mss;
5336 				handled = B_TRUE;
5337 				break;
5338 			}
5339 			break;
5340 		default:
5341 			handled = B_FALSE;
5342 			break;
5343 		}
5344 	}
5345 
5346 	if (handled) {
5347 		mutex_enter(&so->so_lock);
5348 		goto done2;
5349 	}
5350 
5351 	optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5352 	optmgmt_req.MGMT_flags = T_NEGOTIATE;
5353 	optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5354 	optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5355 
5356 	oh.level = level;
5357 	oh.name = option_name;
5358 	oh.len = optlen;
5359 
5360 	mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5361 	    &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP);
5362 	/* Let option management work in the presence of data flow control */
5363 	error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5364 	    MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5365 	mp = NULL;
5366 	mutex_enter(&so->so_lock);
5367 	if (error) {
5368 		eprintsoline(so, error);
5369 		goto done;
5370 	}
5371 	error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5372 	    (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5373 	if (error) {
5374 		eprintsoline(so, error);
5375 		goto done;
5376 	}
5377 	ASSERT(mp);
5378 	/* No need to verify T_optmgmt_ack */
5379 	freemsg(mp);
5380 done:
5381 	/*
5382 	 * Check for SOL_SOCKET options and record their values.
5383 	 * If we know about a SOL_SOCKET parameter and the transport
5384 	 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5385 	 * EPROTO) we let the setsockopt succeed.
5386 	 */
5387 	if (level == SOL_SOCKET) {
5388 		/* Check parameters */
5389 		switch (option_name) {
5390 		case SO_DEBUG:
5391 		case SO_REUSEADDR:
5392 		case SO_KEEPALIVE:
5393 		case SO_DONTROUTE:
5394 		case SO_BROADCAST:
5395 		case SO_USELOOPBACK:
5396 		case SO_OOBINLINE:
5397 		case SO_SNDBUF:
5398 		case SO_RCVBUF:
5399 #ifdef notyet
5400 		case SO_SNDLOWAT:
5401 		case SO_RCVLOWAT:
5402 		case SO_SNDTIMEO:
5403 		case SO_RCVTIMEO:
5404 #endif /* notyet */
5405 		case SO_DGRAM_ERRIND:
5406 			if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5407 				error = EINVAL;
5408 				eprintsoline(so, error);
5409 				goto done2;
5410 			}
5411 			ASSERT(optval);
5412 			handled = B_TRUE;
5413 			break;
5414 		case SO_LINGER:
5415 			if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5416 				error = EINVAL;
5417 				eprintsoline(so, error);
5418 				goto done2;
5419 			}
5420 			ASSERT(optval);
5421 			handled = B_TRUE;
5422 			break;
5423 		}
5424 
5425 #define	intvalue	(*(int32_t *)optval)
5426 
5427 		switch (option_name) {
5428 		case SO_TYPE:
5429 		case SO_ERROR:
5430 		case SO_ACCEPTCONN:
5431 			/* Can't be set */
5432 			error = ENOPROTOOPT;
5433 			goto done2;
5434 		case SO_LINGER: {
5435 			struct linger *l = (struct linger *)optval;
5436 
5437 			so->so_linger.l_linger = l->l_linger;
5438 			if (l->l_onoff) {
5439 				so->so_linger.l_onoff = SO_LINGER;
5440 				so->so_options |= SO_LINGER;
5441 			} else {
5442 				so->so_linger.l_onoff = 0;
5443 				so->so_options &= ~SO_LINGER;
5444 			}
5445 			break;
5446 		}
5447 
5448 		case SO_DEBUG:
5449 #ifdef SOCK_TEST
5450 			if (intvalue & 2)
5451 				sock_test_timelimit = 10 * hz;
5452 			else
5453 				sock_test_timelimit = 0;
5454 
5455 			if (intvalue & 4)
5456 				do_useracc = 0;
5457 			else
5458 				do_useracc = 1;
5459 #endif /* SOCK_TEST */
5460 			/* FALLTHRU */
5461 		case SO_REUSEADDR:
5462 		case SO_KEEPALIVE:
5463 		case SO_DONTROUTE:
5464 		case SO_BROADCAST:
5465 		case SO_USELOOPBACK:
5466 		case SO_OOBINLINE:
5467 		case SO_DGRAM_ERRIND:
5468 			if (intvalue != 0) {
5469 				dprintso(so, 1,
5470 				    ("sotpi_setsockopt: setting 0x%x\n",
5471 				    option_name));
5472 				so->so_options |= option_name;
5473 			} else {
5474 				dprintso(so, 1,
5475 				    ("sotpi_setsockopt: clearing 0x%x\n",
5476 				    option_name));
5477 				so->so_options &= ~option_name;
5478 			}
5479 			break;
5480 		/*
5481 		 * The following options are only returned by us when the
5482 		 * T_SVR4_OPTMGMT_REQ fails.
5483 		 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5484 		 * since the transport might adjust the value and not
5485 		 * return exactly what was set by the application.
5486 		 */
5487 		case SO_SNDBUF:
5488 			so->so_sndbuf = intvalue;
5489 			break;
5490 		case SO_RCVBUF:
5491 			so->so_rcvbuf = intvalue;
5492 			break;
5493 #ifdef notyet
5494 		/*
5495 		 * We do not implement the semantics of these options
5496 		 * thus we shouldn't implement the options either.
5497 		 */
5498 		case SO_SNDLOWAT:
5499 			so->so_sndlowat = intvalue;
5500 			break;
5501 		case SO_RCVLOWAT:
5502 			so->so_rcvlowat = intvalue;
5503 			break;
5504 		case SO_SNDTIMEO:
5505 			so->so_sndtimeo = intvalue;
5506 			break;
5507 		case SO_RCVTIMEO:
5508 			so->so_rcvtimeo = intvalue;
5509 			break;
5510 #endif /* notyet */
5511 		}
5512 #undef	intvalue
5513 
5514 		if (error) {
5515 			if ((error == ENOPROTOOPT || error == EPROTO ||
5516 			    error == EINVAL) && handled) {
5517 				dprintso(so, 1,
5518 				    ("setsockopt: ignoring error %d for 0x%x\n",
5519 				    error, option_name));
5520 				error = 0;
5521 			}
5522 		}
5523 	}
5524 done2:
5525 ret:
5526 	so_unlock_single(so, SOLOCKED);
5527 	mutex_exit(&so->so_lock);
5528 	return (error);
5529 }
5530