xref: /illumos-gate/usr/src/uts/common/inet/sockmods/sockmod_pfp.c (revision 44bc9120699af80bb18366ca474cb2c618608ca9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2015 Joyent, Inc. All rights reserved.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/stropts.h>
31 #include <sys/socket.h>
32 #include <sys/socketvar.h>
33 #include <sys/socket_proto.h>
34 #include <sys/sockio.h>
35 #include <sys/strsun.h>
36 #include <sys/kstat.h>
37 #include <sys/modctl.h>
38 #include <sys/policy.h>
39 #include <sys/priv_const.h>
40 #include <sys/tihdr.h>
41 #include <sys/zone.h>
42 #include <sys/time.h>
43 #include <sys/ethernet.h>
44 #include <sys/llc1.h>
45 #include <fs/sockfs/sockcommon.h>
46 #include <net/if.h>
47 #include <inet/ip_arp.h>
48 
49 #include <sys/dls.h>
50 #include <sys/mac.h>
51 #include <sys/mac_client.h>
52 #include <sys/mac_provider.h>
53 #include <sys/mac_client_priv.h>
54 
55 #include <netpacket/packet.h>
56 
57 static void pfp_close(mac_handle_t, mac_client_handle_t);
58 static int pfp_dl_to_arphrd(int);
59 static int pfp_getpacket_sockopt(sock_lower_handle_t, int, void *,
60     socklen_t *);
61 static int pfp_ifreq_getlinkid(intptr_t, struct ifreq *, datalink_id_t *, int);
62 static int pfp_lifreq_getlinkid(intptr_t, struct lifreq *, datalink_id_t *,
63     int);
64 static int pfp_open_index(int, mac_handle_t *, mac_client_handle_t *,
65     cred_t *);
66 static void pfp_packet(void *, mac_resource_handle_t, mblk_t *, boolean_t);
67 static void pfp_release_bpf(struct pfpsock *);
68 static int pfp_set_promisc(struct pfpsock *, mac_client_promisc_type_t);
69 static int pfp_setsocket_sockopt(sock_lower_handle_t, int, const void *,
70     socklen_t);
71 static int pfp_setpacket_sockopt(sock_lower_handle_t, int, const void *,
72     socklen_t);
73 
74 /*
75  * PFP sockfs operations
76  * Most are currently no-ops because they have no meaning for a connectionless
77  * socket.
78  */
79 static void sdpfp_activate(sock_lower_handle_t, sock_upper_handle_t,
80     sock_upcalls_t *, int, struct cred *);
81 static int sdpfp_bind(sock_lower_handle_t, struct sockaddr *, socklen_t,
82     struct cred *);
83 static int sdpfp_close(sock_lower_handle_t, int, struct cred *);
84 static void sdpfp_clr_flowctrl(sock_lower_handle_t);
85 static int sdpfp_getsockopt(sock_lower_handle_t, int, int, void *,
86     socklen_t *, struct cred *);
87 static int sdpfp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *,
88     struct cred *);
89 static int sdpfp_senduio(sock_lower_handle_t, struct uio *, struct nmsghdr *,
90     struct cred *);
91 static int sdpfp_setsockopt(sock_lower_handle_t, int, int, const void *,
92     socklen_t, struct cred *);
93 
94 static sock_lower_handle_t sockpfp_create(int, int, int, sock_downcalls_t **,
95     uint_t *, int *, int, cred_t *);
96 
97 static int sockpfp_init(void);
98 static void sockpfp_fini(void);
99 
/*
 * Module-wide statistics.
 *
 * pfp_ksp    - handle returned by kstat_create() in sockpfp_init(); it stays
 *              NULL when creation fails, and sockpfp_fini() checks for that.
 * ks_stats   - the live counters; sockpfp_init() points ks_data at this and
 *              the packet/send paths increment its members directly.
 * pfp_kstats - the name/type template that sockpfp_init() copies over
 *              ks_stats. The order of the entries must match the member
 *              order of pfp_kstats_t (declared elsewhere).
 */
100 static kstat_t *pfp_ksp;
101 static pfp_kstats_t ks_stats;
102 static pfp_kstats_t pfp_kstats = {
103 	/*
104 	 * Each one of these kstats is a different return path in handling
105 	 * a packet received from the mac layer.
106 	 */
107 	{ "recvMacHeaderFail",	KSTAT_DATA_UINT64 },
108 	{ "recvBadProtocol",	KSTAT_DATA_UINT64 },
109 	{ "recvAllocbFail",	KSTAT_DATA_UINT64 },
110 	{ "recvOk",		KSTAT_DATA_UINT64 },
111 	{ "recvFail",		KSTAT_DATA_UINT64 },
112 	{ "recvFiltered",	KSTAT_DATA_UINT64 },
113 	{ "recvFlowControl",	KSTAT_DATA_UINT64 },
114 	/*
115 	 * A global set of counters is maintained to track the behaviour
116 	 * of the system (kernel & applications) in sending packets.
117 	 */
118 	{ "sendUnbound",	KSTAT_DATA_UINT64 },
119 	{ "sendFailed",		KSTAT_DATA_UINT64 },
120 	{ "sendTooBig",		KSTAT_DATA_UINT64 },
121 	{ "sendAllocFail",	KSTAT_DATA_UINT64 },
122 	{ "sendUiomoveFail",	KSTAT_DATA_UINT64 },
123 	{ "sendNoMemory",	KSTAT_DATA_UINT64 },
124 	{ "sendOpenFail",	KSTAT_DATA_UINT64 },
125 	{ "sendWrongFamily",	KSTAT_DATA_UINT64 },
126 	{ "sendShortMsg",	KSTAT_DATA_UINT64 },
127 	{ "sendOk",		KSTAT_DATA_UINT64 }
128 };
129 
/*
 * Downcall vector that sockpfp_create() hands back to its caller through
 * the sock_downcalls out-parameter. Only the sdpfp_* entries are implemented
 * in this file; the sock_*_notsupp entries are stubs defined elsewhere
 * (judging by their names they reject the operation) and one slot is left
 * NULL. The initialiser is positional, so the order must follow the member
 * order of sock_downcalls_t.
 */
130 sock_downcalls_t pfp_downcalls = {
131 	sdpfp_activate,
132 	sock_accept_notsupp,
133 	sdpfp_bind,
134 	sock_listen_notsupp,
135 	sock_connect_notsupp,
136 	sock_getpeername_notsupp,
137 	sock_getsockname_notsupp,
138 	sdpfp_getsockopt,
139 	sdpfp_setsockopt,
140 	sock_send_notsupp,
141 	sdpfp_senduio,
142 	NULL,
143 	sock_poll_notsupp,
144 	sock_shutdown_notsupp,
145 	sdpfp_clr_flowctrl,
146 	sdpfp_ioctl,
147 	sdpfp_close,
148 };
149 
/*
 * Socket module registration record, referenced from modlsockmod below.
 * It carries the module name "sockpfp", the version constants and the
 * sockpfp_create() entry point. NOTE(review): the meaning of the trailing
 * NULL slot depends on the smod_reg_t layout, which is not visible here.
 */
150 static smod_reg_t sinfo = {
151 	SOCKMOD_VERSION,
152 	"sockpfp",
153 	SOCK_UC_VERSION,
154 	SOCK_DC_VERSION,
155 	sockpfp_create,
156 	NULL
157 };
158 
/*
 * Protocol translation table used by sockpfp_create(). Column 0 is the
 * protocol number supplied at socket creation; column 1 is the value that
 * is stored in ps_proto and later compared against the SAP of each received
 * packet in pfp_packet(). A stored value of 0 disables that comparison.
 */
159 static int accepted_protos[3][2] = {
160 	{ ETH_P_ALL,	0 },
161 	{ ETH_P_802_2,	LLC_SNAP_SAP },
162 	{ ETH_P_803_3,	0 },
163 };
164 
165 /*
166  * This sets an upper bound on the size of the receive buffer for a PF_PACKET
167  * socket. More properly, this should be controlled through ipadm, a la TCP,
168  * UDP, SCTP, etc. Until that's done, this provides a hard cap of 4 MB
169  * (4194304 bytes) and allows an opportunity for it to be changed, should it
 * be needed. The variable is deliberately not static so that it remains
 * visible outside this file.
170  */
171 int sockmod_pfp_rcvbuf_max = 1024 * 1024 * 4;
172 
173 /*
174  * Module linkage information for the kernel.
 *
 * modlsockmod describes this socket module (its description string and the
 * sinfo registration record); modlinkage wraps it in the NULL-terminated
 * list that _init(), _fini() and _info() pass to mod_install(),
 * mod_remove() and mod_info().
175  */
176 static struct modlsockmod modlsockmod = {
177 	&mod_sockmodops, "PF Packet socket module", &sinfo
178 };
179 
180 static struct modlinkage modlinkage = {
181 	MODREV_1,
182 	&modlsockmod,
183 	NULL
184 };
185 
186 int
187 _init(void)
188 {
189 	int error;
190 
191 	error = sockpfp_init();
192 	if (error != 0)
193 		return (error);
194 
195 	error = mod_install(&modlinkage);
196 	if (error != 0)
197 		sockpfp_fini();
198 
199 	return (error);
200 }
201 
202 int
203 _fini(void)
204 {
205 	int error;
206 
207 	error = mod_remove(&modlinkage);
208 	if (error == 0)
209 		sockpfp_fini();
210 
211 	return (error);
212 }
213 
214 int
215 _info(struct modinfo *modinfop)
216 {
217 	return (mod_info(&modlinkage, modinfop));
218 }
219 
220 /*
221  * sockpfp_init: called as part of the initialisation of the module when
222  * loaded into the kernel.
223  *
224  * Being able to create and record the kstats data in the kernel is not
225  * considered to be vital to the operation of this kernel module, thus
226  * its failure is tolerated.
227  */
228 static int
229 sockpfp_init(void)
230 {
231 	(void) memset(&ks_stats, 0, sizeof (ks_stats));
232 
233 	(void) memcpy(&ks_stats, &pfp_kstats, sizeof (pfp_kstats));
234 
235 	pfp_ksp = kstat_create("pfpacket", 0, "global", "misc",
236 	    KSTAT_TYPE_NAMED, sizeof (pfp_kstats) / sizeof (kstat_named_t),
237 	    KSTAT_FLAG_VIRTUAL);
238 	if (pfp_ksp != NULL) {
239 		pfp_ksp->ks_data = &ks_stats;
240 		kstat_install(pfp_ksp);
241 	}
242 
243 	return (0);
244 }
245 
246 /*
247  * sockpfp_fini: called when the operating system wants to unload the
248  * socket module from the kernel.
249  */
250 static void
251 sockpfp_fini(void)
252 {
253 	if (pfp_ksp != NULL)
254 		kstat_delete(pfp_ksp);
255 }
256 
257 /*
258  * Due to sockets being created read-write by default, all PF_PACKET sockets
259  * therefore require the NET_RAWACCESS priviliege, even if the socket is only
260  * being used for reading packets from.
261  *
262  * This create function enforces this module only being used with PF_PACKET
263  * sockets and the policy that we support via the config file in sock2path.d:
264  * PF_PACKET sockets must be either SOCK_DGRAM or SOCK_RAW.
265  */
266 /* ARGSUSED */
267 static sock_lower_handle_t
268 sockpfp_create(int family, int type, int proto,
269     sock_downcalls_t **sock_downcalls, uint_t *smodep, int *errorp,
270     int sflags, cred_t *cred)
271 {
272 	struct pfpsock *ps;
273 	int kmflags;
274 	int newproto;
275 	int i;
276 
277 	if (secpolicy_net_rawaccess(cred) != 0) {
278 		*errorp = EACCES;
279 		return (NULL);
280 	}
281 
282 	if (family != AF_PACKET) {
283 		*errorp = EAFNOSUPPORT;
284 		return (NULL);
285 	}
286 
287 	if ((type != SOCK_RAW) && (type != SOCK_DGRAM)) {
288 		*errorp = ESOCKTNOSUPPORT;
289 		return (NULL);
290 	}
291 
292 	/*
293 	 * First check to see if the protocol number passed in via the socket
294 	 * creation should be mapped to a different number for internal use.
295 	 */
296 	for (i = 0, newproto = -1;
297 	    i < sizeof (accepted_protos)/ sizeof (accepted_protos[0]); i++) {
298 		if (accepted_protos[i][0] == proto) {
299 			newproto = accepted_protos[i][1];
300 			break;
301 		}
302 	}
303 
304 	/*
305 	 * If the mapping of the protocol that was under 0x800 failed to find
306 	 * a local equivalent then fail the socket creation. If the protocol
307 	 * for the socket is over 0x800 and it was not found in the mapping
308 	 * table above, then use the value as is.
309 	 */
310 	if (newproto == -1) {
311 		if (proto < 0x800) {
312 			*errorp = ENOPROTOOPT;
313 			return (NULL);
314 		}
315 		newproto = proto;
316 	}
317 	proto = newproto;
318 
319 	kmflags = (sflags & SOCKET_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
320 	ps = kmem_zalloc(sizeof (*ps), kmflags);
321 	if (ps == NULL) {
322 		*errorp = ENOMEM;
323 		return (NULL);
324 	}
325 
326 	ps->ps_type = type;
327 	ps->ps_proto = proto;
328 	rw_init(&ps->ps_bpflock, NULL, RW_DRIVER, NULL);
329 	mutex_init(&ps->ps_lock, NULL, MUTEX_DRIVER, NULL);
330 
331 	*sock_downcalls = &pfp_downcalls;
332 	/*
333 	 * Setting this causes bytes from a packet that do not fit into the
334 	 * destination user buffer to be discarded. Thus the API is one
335 	 * packet per receive and callers are required to use a buffer large
336 	 * enough for the biggest packet that the interface can provide.
337 	 */
338 	*smodep = SM_ATOMIC;
339 
340 	return ((sock_lower_handle_t)ps);
341 }
342 
343 /* ************************************************************************* */
344 
345 /*
346  * pfp_packet is the callback function that is given to the mac layer for
347  * PF_PACKET to receive packets with. One packet at a time is passed into
348  * this function from the mac layer. Each packet is a private copy given
349  * to PF_PACKET to modify or free as it wishes and does not harm the original
350  * packet from which it was cloned.
351  */
352 /* ARGSUSED */
353 static void
354 pfp_packet(void *arg, mac_resource_handle_t mrh, mblk_t *mp, boolean_t flag)
355 {
356 	struct T_unitdata_ind *tunit;
357 	struct sockaddr_ll *sll;
358 	struct sockaddr_ll *sol;
359 	mac_header_info_t hdr;
360 	struct pfpsock *ps;
361 	size_t tusz;
362 	mblk_t *mp0;
363 	int error;
364 
365 	if (mp == NULL)
366 		return;
367 
368 	ps = arg;
369 	if (ps->ps_flow_ctrld) {
370 		ps->ps_flow_ctrl_drops++;
371 		ps->ps_stats.tp_drops++;
372 		ks_stats.kp_recv_flow_cntrld.value.ui64++;
373 		freemsg(mp);
374 		return;
375 	}
376 
377 	if (mac_header_info(ps->ps_mh, mp, &hdr) != 0) {
378 		/*
379 		 * Can't decode the packet header information so drop it.
380 		 */
381 		ps->ps_stats.tp_drops++;
382 		ks_stats.kp_recv_mac_hdr_fail.value.ui64++;
383 		freemsg(mp);
384 		return;
385 	}
386 
387 	if (mac_type(ps->ps_mh) == DL_ETHER &&
388 	    hdr.mhi_bindsap == ETHERTYPE_VLAN) {
389 		struct ether_vlan_header *evhp;
390 		struct ether_vlan_header evh;
391 
392 		hdr.mhi_hdrsize = sizeof (struct ether_vlan_header);
393 		hdr.mhi_istagged = B_TRUE;
394 
395 		if (MBLKL(mp) >= sizeof (*evhp)) {
396 			evhp = (struct ether_vlan_header *)mp->b_rptr;
397 		} else {
398 			int sz = sizeof (*evhp);
399 			char *s = (char *)&evh;
400 			mblk_t *tmp;
401 			int len;
402 
403 			for (tmp = mp; sz > 0 && tmp != NULL;
404 			    tmp = tmp->b_cont) {
405 				len = min(sz, MBLKL(tmp));
406 				bcopy(tmp->b_rptr, s, len);
407 				sz -= len;
408 			}
409 			evhp = &evh;
410 		}
411 		hdr.mhi_tci = ntohs(evhp->ether_tci);
412 		hdr.mhi_bindsap = ntohs(evhp->ether_type);
413 	}
414 
415 	if ((ps->ps_proto != 0) && (ps->ps_proto != hdr.mhi_bindsap)) {
416 		/*
417 		 * The packet is not of interest to this socket so
418 		 * drop it on the floor. Here the SAP is being used
419 		 * as a very course filter.
420 		 */
421 		ps->ps_stats.tp_drops++;
422 		ks_stats.kp_recv_bad_proto.value.ui64++;
423 		freemsg(mp);
424 		return;
425 	}
426 
427 	/*
428 	 * This field is not often set, even for ethernet,
429 	 * by mac_header_info, so compute it if it is 0.
430 	 */
431 	if (hdr.mhi_pktsize == 0)
432 		hdr.mhi_pktsize = msgdsize(mp);
433 
434 	/*
435 	 * If a BPF filter is present, pass the raw packet into that.
436 	 * A failed match will result in zero being returned, indicating
437 	 * that this socket is not interested in the packet.
438 	 */
439 	if (ps->ps_bpf.bf_len != 0) {
440 		uchar_t *buffer;
441 		int buflen;
442 
443 		buflen = MBLKL(mp);
444 		if (hdr.mhi_pktsize == buflen) {
445 			buffer = mp->b_rptr;
446 		} else {
447 			buflen = 0;
448 			buffer = (uchar_t *)mp;
449 		}
450 		rw_enter(&ps->ps_bpflock, RW_READER);
451 		if (bpf_filter(ps->ps_bpf.bf_insns, buffer,
452 		    hdr.mhi_pktsize, buflen) == 0) {
453 			rw_exit(&ps->ps_bpflock);
454 			ps->ps_stats.tp_drops++;
455 			ks_stats.kp_recv_filtered.value.ui64++;
456 			freemsg(mp);
457 			return;
458 		}
459 		rw_exit(&ps->ps_bpflock);
460 	}
461 
462 	if (ps->ps_type == SOCK_DGRAM) {
463 		/*
464 		 * SOCK_DGRAM socket expect a "layer 3" packet, so advance
465 		 * past the link layer header.
466 		 */
467 		mp->b_rptr += hdr.mhi_hdrsize;
468 		hdr.mhi_pktsize -= hdr.mhi_hdrsize;
469 	}
470 
471 	tusz = sizeof (struct T_unitdata_ind) + sizeof (struct sockaddr_ll);
472 	if (ps->ps_auxdata) {
473 		tusz += _TPI_ALIGN_TOPT(sizeof (struct tpacket_auxdata));
474 		tusz += _TPI_ALIGN_TOPT(sizeof (struct T_opthdr));
475 	}
476 
477 	/*
478 	 * It is tempting to think that this could be optimised by having
479 	 * the base mblk_t allocated and hung off the pfpsock structure,
480 	 * except that then another one would need to be allocated for the
481 	 * sockaddr_ll that is included. Even creating a template to copy
482 	 * from is of questionable value, as read-write from one structure
483 	 * to the other is going to be slower than all of the initialisation.
484 	 */
485 	mp0 = allocb(tusz, BPRI_HI);
486 	if (mp0 == NULL) {
487 		ps->ps_stats.tp_drops++;
488 		ks_stats.kp_recv_alloc_fail.value.ui64++;
489 		freemsg(mp);
490 		return;
491 	}
492 
493 	(void) memset(mp0->b_rptr, 0, tusz);
494 
495 	mp0->b_datap->db_type = M_PROTO;
496 	mp0->b_wptr = mp0->b_rptr + tusz;
497 
498 	tunit = (struct T_unitdata_ind *)mp0->b_rptr;
499 	tunit->PRIM_type = T_UNITDATA_IND;
500 	tunit->SRC_length = sizeof (struct sockaddr);
501 	tunit->SRC_offset = sizeof (*tunit);
502 
503 	sol = (struct sockaddr_ll *)&ps->ps_sock;
504 	sll = (struct sockaddr_ll *)(mp0->b_rptr + sizeof (*tunit));
505 	sll->sll_ifindex = sol->sll_ifindex;
506 	sll->sll_hatype = (uint16_t)hdr.mhi_origsap;
507 	sll->sll_halen = sol->sll_halen;
508 	if (hdr.mhi_saddr != NULL)
509 		(void) memcpy(sll->sll_addr, hdr.mhi_saddr, sll->sll_halen);
510 
511 	switch (hdr.mhi_dsttype) {
512 	case MAC_ADDRTYPE_MULTICAST :
513 		sll->sll_pkttype = PACKET_MULTICAST;
514 		break;
515 	case MAC_ADDRTYPE_BROADCAST :
516 		sll->sll_pkttype = PACKET_BROADCAST;
517 		break;
518 	case MAC_ADDRTYPE_UNICAST :
519 		if (memcmp(sol->sll_addr, hdr.mhi_daddr, sol->sll_halen) == 0)
520 			sll->sll_pkttype = PACKET_HOST;
521 		else
522 			sll->sll_pkttype = PACKET_OTHERHOST;
523 		break;
524 	}
525 
526 	if (ps->ps_auxdata) {
527 		struct tpacket_auxdata *aux;
528 		struct T_opthdr *topt;
529 
530 		tunit->OPT_offset = _TPI_ALIGN_TOPT(tunit->SRC_offset +
531 		    sizeof (struct sockaddr_ll));
532 		tunit->OPT_length = _TPI_ALIGN_TOPT(sizeof (struct T_opthdr)) +
533 		    _TPI_ALIGN_TOPT(sizeof (struct tpacket_auxdata));
534 
535 		topt = (struct T_opthdr *)(mp0->b_rptr + tunit->OPT_offset);
536 		aux = (struct tpacket_auxdata *)
537 		    ((char *)topt + _TPI_ALIGN_TOPT(sizeof (*topt)));
538 
539 		topt->len = tunit->OPT_length;
540 		topt->level = SOL_PACKET;
541 		topt->name = PACKET_AUXDATA;
542 		topt->status = 0;
543 		/*
544 		 * libpcap doesn't seem to use any other field,
545 		 * so it isn't clear how they should be filled in.
546 		 */
547 		aux->tp_vlan_vci = hdr.mhi_tci;
548 	}
549 
550 	linkb(mp0, mp);
551 
552 	(void) gethrestime(&ps->ps_timestamp);
553 
554 	ps->ps_upcalls->su_recv(ps->ps_upper, mp0, hdr.mhi_pktsize, 0,
555 	    &error, NULL);
556 
557 	if (error == 0) {
558 		ps->ps_stats.tp_packets++;
559 		ks_stats.kp_recv_ok.value.ui64++;
560 	} else {
561 		mutex_enter(&ps->ps_lock);
562 		if (error == ENOSPC) {
563 			ps->ps_upcalls->su_recv(ps->ps_upper, NULL, 0, 0,
564 			    &error, NULL);
565 			if (error == ENOSPC)
566 				ps->ps_flow_ctrld = B_TRUE;
567 		}
568 		mutex_exit(&ps->ps_lock);
569 		ps->ps_stats.tp_drops++;
570 		ks_stats.kp_recv_fail.value.ui64++;
571 	}
572 }
573 
574 /*
575  * Bind a PF_PACKET socket to a network interface.
576  *
577  * The default operation of this bind() is to place the socket (and thus the
578  * network interface) into promiscuous mode. It is then up to the application
579  * to turn that down by issuing the relevant ioctls, if desired.
580  */
581 /* ARGSUSED */
582 static int
583 sdpfp_bind(sock_lower_handle_t handle, struct sockaddr *addr,
584     socklen_t addrlen, struct cred *cred)
585 {
586 	struct sockaddr_ll *addr_ll, *sol;
587 	mac_client_handle_t mch;
588 	struct pfpsock *ps;
589 	mac_handle_t mh;
590 	int error;
591 
592 	ps = (struct pfpsock *)handle;
593 	if (ps->ps_bound)
594 		return (EINVAL);
595 
596 	addr_ll = (struct sockaddr_ll *)addr;
597 
598 	error = pfp_open_index(addr_ll->sll_ifindex, &mh, &mch, cred);
599 	if (error != 0)
600 		return (error);
601 	/*
602 	 * Ensure that each socket is only bound once.
603 	 */
604 	mutex_enter(&ps->ps_lock);
605 	if (ps->ps_mh != 0) {
606 		mutex_exit(&ps->ps_lock);
607 		pfp_close(mh, mch);
608 		return (EADDRINUSE);
609 	}
610 	ps->ps_mh = mh;
611 	ps->ps_mch = mch;
612 	mutex_exit(&ps->ps_lock);
613 
614 	/*
615 	 * Cache all of the information from bind so that it's in an easy
616 	 * place to get at when packets are received.
617 	 */
618 	sol = (struct sockaddr_ll *)&ps->ps_sock;
619 	sol->sll_family = AF_PACKET;
620 	sol->sll_ifindex = addr_ll->sll_ifindex;
621 	sol->sll_protocol = addr_ll->sll_protocol;
622 	sol->sll_halen = mac_addr_len(ps->ps_mh);
623 	mac_unicast_primary_get(ps->ps_mh, sol->sll_addr);
624 	mac_sdu_get(ps->ps_mh, NULL, &ps->ps_max_sdu);
625 	ps->ps_linkid = addr_ll->sll_ifindex;
626 
627 	error = mac_promisc_add(ps->ps_mch, MAC_CLIENT_PROMISC_ALL,
628 	    pfp_packet, ps, &ps->ps_phd, MAC_PROMISC_FLAGS_VLAN_TAG_STRIP);
629 	if (error == 0) {
630 		ps->ps_promisc = MAC_CLIENT_PROMISC_ALL;
631 		ps->ps_bound = B_TRUE;
632 	}
633 
634 	return (error);
635 }
636 
637 /* ARGSUSED */
638 static void
639 sdpfp_activate(sock_lower_handle_t lower, sock_upper_handle_t upper,
640     sock_upcalls_t *upcalls, int flags, cred_t *cred)
641 {
642 	struct pfpsock *ps;
643 
644 	ps = (struct pfpsock *)lower;
645 	ps->ps_upper = upper;
646 	ps->ps_upcalls = upcalls;
647 }
648 
649 /*
650  * This module only implements getting socket options for the new socket
651  * option level (SOL_PACKET) that it introduces. All other requests are
652  * passed back to the sockfs layer.
653  */
654 /* ARGSUSED */
655 static int
656 sdpfp_getsockopt(sock_lower_handle_t handle, int level, int option_name,
657     void *optval, socklen_t *optlenp, struct cred *cred)
658 {
659 	struct pfpsock *ps;
660 	int error = 0;
661 
662 	ps = (struct pfpsock *)handle;
663 
664 	switch (level) {
665 	case SOL_PACKET :
666 		error = pfp_getpacket_sockopt(handle, option_name, optval,
667 		    optlenp);
668 		break;
669 
670 	case SOL_SOCKET :
671 		if (option_name == SO_RCVBUF) {
672 			if (*optlenp < sizeof (int32_t))
673 				return (EINVAL);
674 			*((int32_t *)optval) = ps->ps_rcvbuf;
675 			*optlenp = sizeof (int32_t);
676 		} else {
677 			error = ENOPROTOOPT;
678 		}
679 		break;
680 
681 	default :
682 		/*
683 		 * If sockfs code receives this error in return from the
684 		 * getsockopt downcall it handles the option locally, if
685 		 * it can.
686 		 */
687 		error = ENOPROTOOPT;
688 		break;
689 	}
690 
691 	return (error);
692 }
693 
694 /*
695  * PF_PACKET supports setting socket options at only two levels:
696  * SOL_SOCKET and SOL_PACKET.
697  */
698 /* ARGSUSED */
699 static int
700 sdpfp_setsockopt(sock_lower_handle_t handle, int level, int option_name,
701     const void *optval, socklen_t optlen, struct cred *cred)
702 {
703 	int error = 0;
704 
705 	switch (level) {
706 	case SOL_SOCKET :
707 		error = pfp_setsocket_sockopt(handle, option_name, optval,
708 		    optlen);
709 		break;
710 	case SOL_PACKET :
711 		error = pfp_setpacket_sockopt(handle, option_name, optval,
712 		    optlen);
713 		break;
714 	default :
715 		error = EINVAL;
716 		break;
717 	}
718 
719 	return (error);
720 }
721 
722 /*
723  * This function is incredibly inefficient for sending any packet that
724  * comes with a msghdr asking to be sent to an interface to which the
725  * socket has not been bound. Some possibilities here are keeping a
726  * cache of all open mac's and mac_client's, for the purpose of sending,
727  * and closing them after some amount of inactivity. Clearly, applications
728  * should not be written to use one socket for multiple interfaces if
729  * performance is desired with the code as is.
730  */
731 /* ARGSUSED */
732 static int
733 sdpfp_senduio(sock_lower_handle_t handle, struct uio *uiop,
734     struct nmsghdr *msg, struct cred *cred)
735 {
736 	struct sockaddr_ll *sol;
737 	mac_client_handle_t mch;
738 	struct pfpsock *ps;
739 	boolean_t new_open;
740 	mac_handle_t mh;
741 	size_t mpsize;
742 	uint_t maxsdu;
743 	mblk_t *mp0;
744 	mblk_t *mp;
745 	int error;
746 
747 	mp = NULL;
748 	mp0 = NULL;
749 	new_open = B_FALSE;
750 	ps = (struct pfpsock *)handle;
751 	mh = ps->ps_mh;
752 	mch = ps->ps_mch;
753 	maxsdu = ps->ps_max_sdu;
754 
755 	sol = (struct sockaddr_ll *)msg->msg_name;
756 	if (sol == NULL) {
757 		/*
758 		 * If no sockaddr_ll has been provided with the send call,
759 		 * use the one constructed when the socket was bound to an
760 		 * interface and fail if it hasn't been bound.
761 		 */
762 		if (!ps->ps_bound) {
763 			ks_stats.kp_send_unbound.value.ui64++;
764 			return (EPROTO);
765 		}
766 		sol = (struct sockaddr_ll *)&ps->ps_sock;
767 	} else {
768 		/*
769 		 * Verify the sockaddr_ll message passed down before using
770 		 * it to send a packet out with. If it refers to an interface
771 		 * that has not been bound, it is necessary to open it.
772 		 */
773 		struct sockaddr_ll *sll;
774 
775 		if (msg->msg_namelen < sizeof (struct sockaddr_ll)) {
776 			ks_stats.kp_send_short_msg.value.ui64++;
777 			return (EINVAL);
778 		}
779 
780 		if (sol->sll_family != AF_PACKET) {
781 			ks_stats.kp_send_wrong_family.value.ui64++;
782 			return (EAFNOSUPPORT);
783 		}
784 
785 		sll = (struct sockaddr_ll *)&ps->ps_sock;
786 		if (sol->sll_ifindex != sll->sll_ifindex) {
787 			error = pfp_open_index(sol->sll_ifindex, &mh, &mch,
788 			    cred);
789 			if (error != 0) {
790 				ks_stats.kp_send_open_fail.value.ui64++;
791 				return (error);
792 			}
793 			mac_sdu_get(mh, NULL, &maxsdu);
794 			new_open = B_TRUE;
795 		}
796 	}
797 
798 	mpsize = uiop->uio_resid;
799 	if (mpsize > maxsdu) {
800 		ks_stats.kp_send_too_big.value.ui64++;
801 		error = EMSGSIZE;
802 		goto done;
803 	}
804 
805 	if ((mp = allocb(mpsize, BPRI_HI)) == NULL) {
806 		ks_stats.kp_send_alloc_fail.value.ui64++;
807 		error = ENOBUFS;
808 		goto done;
809 	}
810 
811 	mp->b_wptr = mp->b_rptr + mpsize;
812 	error = uiomove(mp->b_rptr, mpsize, UIO_WRITE, uiop);
813 	if (error != 0) {
814 		ks_stats.kp_send_uiomove_fail.value.ui64++;
815 		goto done;
816 	}
817 
818 	if (ps->ps_type == SOCK_DGRAM) {
819 		mp0 = mac_header(mh, sol->sll_addr, sol->sll_protocol, mp, 0);
820 		if (mp0 == NULL) {
821 			ks_stats.kp_send_no_memory.value.ui64++;
822 			error = ENOBUFS;
823 			goto done;
824 		}
825 		linkb(mp0, mp);
826 		mp = mp0;
827 	}
828 
829 	/*
830 	 * As this is sending datagrams and no promise is made about
831 	 * how or if a packet will be sent/delivered, no effort is to
832 	 * be expended in recovering from a situation where the packet
833 	 * cannot be sent - it is just dropped.
834 	 */
835 	error = mac_tx(mch, mp, 0, MAC_DROP_ON_NO_DESC, NULL);
836 	if (error == 0) {
837 		mp = NULL;
838 		ks_stats.kp_send_ok.value.ui64++;
839 	} else {
840 		ks_stats.kp_send_failed.value.ui64++;
841 	}
842 
843 done:
844 
845 	if (new_open) {
846 		ASSERT(mch != ps->ps_mch);
847 		ASSERT(mh != ps->ps_mh);
848 		pfp_close(mh, mch);
849 	}
850 	if (mp != NULL)
851 		freemsg(mp);
852 
853 	return (error);
854 
855 }
856 
857 /*
858  * There's no use of a lock here, or at the bottom of pfp_packet() where
859  * ps_flow_ctrld is set to true, because in a situation where these two
860  * are racing to set the flag one way or the other, the end result is
861  * going to be ultimately determined by the scheduler anyway - which of
862  * the two threads gets the lock first? In such an operational environment,
863  * we've got packets arriving too fast to be delt with so packets are going
864  * to be dropped. Grabbing a lock just makes the drop more expensive.
865  */
866 static void
867 sdpfp_clr_flowctrl(sock_lower_handle_t handle)
868 {
869 	struct pfpsock *ps;
870 
871 	ps = (struct pfpsock *)handle;
872 
873 	mutex_enter(&ps->ps_lock);
874 	ps->ps_flow_ctrld = B_FALSE;
875 	mutex_exit(&ps->ps_lock);
876 }
877 
878 /*
879  * The implementation of this ioctl() handler is intended to function
880  * in the absence of a bind() being made before it is called. Thus the
881  * function calls mac_open() itself to provide a handle
882  * This function is structured like this:
883  * - determine the linkid for the interface being targetted
884  * - open the interface with said linkid
885  * - perform ioctl
886  * - copy results back to caller
887  *
888  * The ioctls that interact with interface flags have been implented below
889  * to assume that the interface is always up and running (IFF_RUNNING) and
890  * to use the state of this socket to determine whether or not the network
891  * interface is in promiscuous mode. Thus an ioctl to get the interface flags
892  * of an interface that has been put in promiscuous mode by another socket
893  * (in the same program or different), will not report that status.
894  */
895 /* ARGSUSED */
896 static int
897 sdpfp_ioctl(sock_lower_handle_t handle, int cmd, intptr_t arg, int mod,
898     int32_t *rval, struct cred *cr)
899 {
900 	struct timeval tival;
901 	mac_client_promisc_type_t mtype;
902 	struct sockaddr_dl *sock;
903 	datalink_id_t linkid;
904 	struct lifreq lifreq;
905 	struct ifreq ifreq;
906 	struct pfpsock *ps;
907 	mac_handle_t mh;
908 	int error;
909 
910 	ps = (struct pfpsock *)handle;
911 
912 	switch (cmd) {
913 	/*
914 	 * ioctls that work on "struct lifreq"
915 	 */
916 	case SIOCSLIFFLAGS :
917 	case SIOCGLIFINDEX :
918 	case SIOCGLIFFLAGS :
919 	case SIOCGLIFMTU :
920 	case SIOCGLIFHWADDR :
921 		error = pfp_lifreq_getlinkid(arg, &lifreq, &linkid, mod);
922 		if (error != 0)
923 			return (error);
924 		break;
925 
926 	/*
927 	 * ioctls that work on "struct ifreq".
928 	 * Not all of these have a "struct lifreq" partner, for example
929 	 * SIOCGIFHWADDR, for the simple reason that the logical interface
930 	 * does not have a hardware address.
931 	 */
932 	case SIOCSIFFLAGS :
933 	case SIOCGIFINDEX :
934 	case SIOCGIFFLAGS :
935 	case SIOCGIFMTU :
936 	case SIOCGIFHWADDR :
937 		error = pfp_ifreq_getlinkid(arg, &ifreq, &linkid, mod);
938 		if (error != 0)
939 			return (error);
940 		break;
941 
942 	case SIOCGSTAMP :
943 		tival.tv_sec = (time_t)ps->ps_timestamp.tv_sec;
944 		tival.tv_usec = ps->ps_timestamp.tv_nsec / 1000;
945 		if (get_udatamodel() == DATAMODEL_NATIVE) {
946 			error = ddi_copyout(&tival, (void *)arg,
947 			    sizeof (tival), mod);
948 		}
949 #ifdef _SYSCALL32_IMPL
950 		else {
951 			struct timeval32 tv32;
952 			TIMEVAL_TO_TIMEVAL32(&tv32, &tival);
953 			error = ddi_copyout(&tv32, (void *)arg,
954 			    sizeof (tv32), mod);
955 		}
956 #endif
957 		return (error);
958 	}
959 
960 	error =  mac_open_by_linkid(linkid, &mh);
961 	if (error != 0)
962 		return (error);
963 
964 	switch (cmd) {
965 	case SIOCGLIFINDEX :
966 		lifreq.lifr_index = linkid;
967 		break;
968 
969 	case SIOCGIFINDEX :
970 		ifreq.ifr_index = linkid;
971 		break;
972 
973 	case SIOCGIFFLAGS :
974 		ifreq.ifr_flags = IFF_RUNNING;
975 		if (ps->ps_promisc == MAC_CLIENT_PROMISC_ALL)
976 			ifreq.ifr_flags |= IFF_PROMISC;
977 		break;
978 
979 	case SIOCGLIFFLAGS :
980 		lifreq.lifr_flags = IFF_RUNNING;
981 		if (ps->ps_promisc == MAC_CLIENT_PROMISC_ALL)
982 			lifreq.lifr_flags |= IFF_PROMISC;
983 		break;
984 
985 	case SIOCSIFFLAGS :
986 		if (linkid != ps->ps_linkid) {
987 			error = EINVAL;
988 		} else {
989 			if ((ifreq.ifr_flags & IFF_PROMISC) != 0)
990 				mtype = MAC_CLIENT_PROMISC_ALL;
991 			else
992 				mtype = MAC_CLIENT_PROMISC_FILTERED;
993 			error = pfp_set_promisc(ps, mtype);
994 		}
995 		break;
996 
997 	case SIOCSLIFFLAGS :
998 		if (linkid != ps->ps_linkid) {
999 			error = EINVAL;
1000 		} else {
1001 			if ((lifreq.lifr_flags & IFF_PROMISC) != 0)
1002 				mtype = MAC_CLIENT_PROMISC_ALL;
1003 			else
1004 				mtype = MAC_CLIENT_PROMISC_FILTERED;
1005 			error = pfp_set_promisc(ps, mtype);
1006 		}
1007 		break;
1008 
1009 	case SIOCGIFMTU :
1010 		mac_sdu_get(mh, NULL, &ifreq.ifr_mtu);
1011 		break;
1012 
1013 	case SIOCGLIFMTU :
1014 		mac_sdu_get(mh, NULL, &lifreq.lifr_mtu);
1015 		break;
1016 
1017 	case SIOCGIFHWADDR :
1018 		if (mac_addr_len(mh) > sizeof (ifreq.ifr_addr.sa_data)) {
1019 			error = EPFNOSUPPORT;
1020 			break;
1021 		}
1022 
1023 		if (mac_addr_len(mh) == 0) {
1024 			(void) memset(ifreq.ifr_addr.sa_data, 0,
1025 			    sizeof (ifreq.ifr_addr.sa_data));
1026 		} else {
1027 			mac_unicast_primary_get(mh,
1028 			    (uint8_t *)ifreq.ifr_addr.sa_data);
1029 		}
1030 
1031 		/*
1032 		 * The behaviour here in setting sa_family is consistent
1033 		 * with what applications such as tcpdump would expect
1034 		 * for a Linux PF_PACKET socket.
1035 		 */
1036 		ifreq.ifr_addr.sa_family = pfp_dl_to_arphrd(mac_type(mh));
1037 		break;
1038 
1039 	case SIOCGLIFHWADDR :
1040 		lifreq.lifr_type = 0;
1041 		sock = (struct sockaddr_dl *)&lifreq.lifr_addr;
1042 
1043 		if (mac_addr_len(mh) > sizeof (sock->sdl_data)) {
1044 			error = EPFNOSUPPORT;
1045 			break;
1046 		}
1047 
1048 		/*
1049 		 * Fill in the sockaddr_dl with link layer details. Of note,
1050 		 * the index is returned as 0 for a couple of reasons:
1051 		 * (1) there is no public API that uses or requires it
1052 		 * (2) the MAC index is currently 32bits and sdl_index is 16.
1053 		 */
1054 		sock->sdl_family = AF_LINK;
1055 		sock->sdl_index = 0;
1056 		sock->sdl_type = mac_type(mh);
1057 		sock->sdl_nlen = 0;
1058 		sock->sdl_alen = mac_addr_len(mh);
1059 		sock->sdl_slen = 0;
1060 		if (mac_addr_len(mh) == 0) {
1061 			(void) memset(sock->sdl_data, 0,
1062 			    sizeof (sock->sdl_data));
1063 		} else {
1064 			mac_unicast_primary_get(mh, (uint8_t *)sock->sdl_data);
1065 		}
1066 		break;
1067 
1068 	default :
1069 		break;
1070 	}
1071 
1072 	mac_close(mh);
1073 
1074 	if (error == 0) {
1075 		/*
1076 		 * Only the "GET" ioctls need to copy data back to userace.
1077 		 */
1078 		switch (cmd) {
1079 		case SIOCGLIFINDEX :
1080 		case SIOCGLIFFLAGS :
1081 		case SIOCGLIFMTU :
1082 		case SIOCGLIFHWADDR :
1083 			error = ddi_copyout(&lifreq, (void *)arg,
1084 			    sizeof (lifreq), mod);
1085 			break;
1086 
1087 		case SIOCGIFINDEX :
1088 		case SIOCGIFFLAGS :
1089 		case SIOCGIFMTU :
1090 		case SIOCGIFHWADDR :
1091 			error = ddi_copyout(&ifreq, (void *)arg,
1092 			    sizeof (ifreq), mod);
1093 			break;
1094 		default :
1095 			break;
1096 		}
1097 	}
1098 
1099 	return (error);
1100 }
1101 
1102 /*
1103  * Closing the socket requires that all open references to network
1104  * interfaces be closed.
1105  */
1106 /* ARGSUSED */
1107 static int
1108 sdpfp_close(sock_lower_handle_t handle, int flag, struct cred *cr)
1109 {
1110 	struct pfpsock *ps = (struct pfpsock *)handle;
1111 
1112 	if (ps->ps_phd != 0) {
1113 		mac_promisc_remove(ps->ps_phd);
1114 		ps->ps_phd = 0;
1115 	}
1116 
1117 	if (ps->ps_mch != 0) {
1118 		mac_client_close(ps->ps_mch, 0);
1119 		ps->ps_mch = 0;
1120 	}
1121 
1122 	if (ps->ps_mh != 0) {
1123 		mac_close(ps->ps_mh);
1124 		ps->ps_mh = 0;
1125 	}
1126 
1127 	kmem_free(ps, sizeof (*ps));
1128 
1129 	return (0);
1130 }
1131 
1132 /* ************************************************************************* */
1133 
1134 /*
1135  * Given a pointer (arg) to a "struct ifreq" (potentially in user space),
1136  * determine the linkid for the interface name stored in that structure.
1137  * name is used as a buffer so that we can ensure a trailing \0 is appended
1138  * to the name safely.
1139  */
1140 static int
1141 pfp_ifreq_getlinkid(intptr_t arg, struct ifreq *ifreqp,
1142     datalink_id_t *linkidp, int mode)
1143 {
1144 	char name[IFNAMSIZ + 1];
1145 	int error;
1146 
1147 	if (ddi_copyin((void *)arg, ifreqp, sizeof (*ifreqp), mode) != 0)
1148 		return (EFAULT);
1149 
1150 	(void) strlcpy(name, ifreqp->ifr_name, sizeof (name));
1151 
1152 	error = dls_mgmt_get_linkid(name, linkidp);
1153 	if (error != 0)
1154 		error = dls_devnet_macname2linkid(name, linkidp);
1155 
1156 	return (error);
1157 }
1158 
1159 /*
1160  * Given a pointer (arg) to a "struct lifreq" (potentially in user space),
1161  * determine the linkid for the interface name stored in that structure.
1162  * name is used as a buffer so that we can ensure a trailing \0 is appended
1163  * to the name safely.
1164  */
1165 static int
1166 pfp_lifreq_getlinkid(intptr_t arg, struct lifreq *lifreqp,
1167     datalink_id_t *linkidp, int mode)
1168 {
1169 	char name[LIFNAMSIZ + 1];
1170 	int error;
1171 
1172 	if (ddi_copyin((void *)arg, lifreqp, sizeof (*lifreqp), mode) != 0)
1173 		return (EFAULT);
1174 
1175 	(void) strlcpy(name, lifreqp->lifr_name, sizeof (name));
1176 
1177 	error = dls_mgmt_get_linkid(name, linkidp);
1178 	if (error != 0)
1179 		error = dls_devnet_macname2linkid(name, linkidp);
1180 
1181 	return (error);
1182 }
1183 
1184 /*
1185  * Although there are several new SOL_PACKET options that can be set and
1186  * are specific to this implementation of PF_PACKET, the current API does
1187  * not support doing a get on them to retrieve accompanying status. Thus
1188  * it is only currently possible to use SOL_PACKET with getsockopt to
1189  * retrieve statistical information. This remains consistant with the
1190  * Linux API at the time of writing.
1191  */
1192 static int
1193 pfp_getpacket_sockopt(sock_lower_handle_t handle, int option_name,
1194     void *optval, socklen_t *optlenp)
1195 {
1196 	struct pfpsock *ps;
1197 	struct tpacket_stats_short tpss;
1198 	int error = 0;
1199 
1200 	ps = (struct pfpsock *)handle;
1201 
1202 	switch (option_name) {
1203 	case PACKET_STATISTICS :
1204 		if (*optlenp < sizeof (ps->ps_stats)) {
1205 			error = EINVAL;
1206 			break;
1207 		}
1208 		*optlenp = sizeof (ps->ps_stats);
1209 		bcopy(&ps->ps_stats, optval, sizeof (ps->ps_stats));
1210 		break;
1211 	case PACKET_STATISTICS_SHORT :
1212 		if (*optlenp < sizeof (tpss)) {
1213 			error = EINVAL;
1214 			break;
1215 		}
1216 		*optlenp = sizeof (tpss);
1217 		tpss.tp_packets = ps->ps_stats.tp_packets;
1218 		tpss.tp_drops = ps->ps_stats.tp_drops;
1219 		bcopy(&tpss, optval, sizeof (tpss));
1220 		break;
1221 	default :
1222 		error = EINVAL;
1223 		break;
1224 	}
1225 
1226 	return (error);
1227 }
1228 
1229 /*
1230  * The SOL_PACKET level for socket options supports three options,
1231  * PACKET_ADD_MEMBERSHIP, PACKET_DROP_MEMBERSHIP and PACKET_AUXDATA.
1232  * This function is responsible for mapping the two socket options
1233  * that manage multicast membership into the appropriate internal
1234  * function calls to bring the option into effect. Whilst direct
1235  * changes to the multicast membership (ADD/DROP) groups is handled
1236  * by calls directly into the mac module, changes to the promiscuos
1237  * mode are vectored through pfp_set_promisc() so that the logic for
1238  * managing the promiscuous mode is in one place.
1239  */
1240 /* ARGSUSED */
1241 static int
1242 pfp_setpacket_sockopt(sock_lower_handle_t handle, int option_name,
1243     const void *optval, socklen_t optlen)
1244 {
1245 	struct packet_mreq mreq;
1246 	struct pfpsock *ps;
1247 	int error = 0;
1248 	int opt;
1249 
1250 	ps = (struct pfpsock *)handle;
1251 	if (!ps->ps_bound)
1252 		return (EPROTO);
1253 
1254 	if ((option_name == PACKET_ADD_MEMBERSHIP) ||
1255 	    (option_name == PACKET_DROP_MEMBERSHIP)) {
1256 		if (!ps->ps_bound)
1257 			return (EPROTO);
1258 		bcopy(optval, &mreq, sizeof (mreq));
1259 		if (ps->ps_linkid != mreq.mr_ifindex)
1260 			return (EINVAL);
1261 	}
1262 
1263 	switch (option_name) {
1264 	case PACKET_ADD_MEMBERSHIP :
1265 		switch (mreq.mr_type) {
1266 		case PACKET_MR_MULTICAST :
1267 			if (mreq.mr_alen !=
1268 			    ((struct sockaddr_ll *)&ps->ps_sock)->sll_halen)
1269 				return (EINVAL);
1270 
1271 			error = mac_multicast_add(ps->ps_mch, mreq.mr_address);
1272 			break;
1273 
1274 		case PACKET_MR_PROMISC :
1275 			error = pfp_set_promisc(ps, MAC_CLIENT_PROMISC_ALL);
1276 			break;
1277 
1278 		case PACKET_MR_ALLMULTI :
1279 			error = pfp_set_promisc(ps, MAC_CLIENT_PROMISC_MULTI);
1280 			break;
1281 		}
1282 		break;
1283 
1284 	case PACKET_DROP_MEMBERSHIP :
1285 		switch (mreq.mr_type) {
1286 		case PACKET_MR_MULTICAST :
1287 			if (mreq.mr_alen !=
1288 			    ((struct sockaddr_ll *)&ps->ps_sock)->sll_halen)
1289 				return (EINVAL);
1290 
1291 			mac_multicast_remove(ps->ps_mch, mreq.mr_address);
1292 			break;
1293 
1294 		case PACKET_MR_PROMISC :
1295 			if (ps->ps_promisc != MAC_CLIENT_PROMISC_ALL)
1296 				return (EINVAL);
1297 			error = pfp_set_promisc(ps,
1298 			    MAC_CLIENT_PROMISC_FILTERED);
1299 			break;
1300 
1301 		case PACKET_MR_ALLMULTI :
1302 			if (ps->ps_promisc != MAC_CLIENT_PROMISC_MULTI)
1303 				return (EINVAL);
1304 			error = pfp_set_promisc(ps,
1305 			    MAC_CLIENT_PROMISC_FILTERED);
1306 			break;
1307 		}
1308 		break;
1309 
1310 	case PACKET_AUXDATA :
1311 		if (optlen == sizeof (int)) {
1312 			opt = *(int *)optval;
1313 			ps->ps_auxdata = (opt != 0);
1314 		} else {
1315 			error = EINVAL;
1316 		}
1317 		break;
1318 	default :
1319 		error = EINVAL;
1320 		break;
1321 	}
1322 
1323 	return (error);
1324 }
1325 
1326 /*
1327  * There are only two special setsockopt's for SOL_SOCKET with PF_PACKET:
1328  * SO_ATTACH_FILTER and SO_DETACH_FILTER.
1329  *
1330  * Both of these setsockopt values are candidates for being handled by the
1331  * socket layer itself in future, however this requires understanding how
1332  * they would interact with all other sockets.
1333  */
1334 static int
1335 pfp_setsocket_sockopt(sock_lower_handle_t handle, int option_name,
1336     const void *optval, socklen_t optlen)
1337 {
1338 	struct bpf_program prog;
1339 	struct bpf_insn *fcode;
1340 	struct pfpsock *ps;
1341 	struct sock_proto_props sopp;
1342 	int error = 0;
1343 	int size;
1344 
1345 	ps = (struct pfpsock *)handle;
1346 
1347 	switch (option_name) {
1348 	case SO_ATTACH_FILTER :
1349 #ifdef _LP64
1350 		if (optlen == sizeof (struct bpf_program32)) {
1351 			struct bpf_program32 prog32;
1352 
1353 			bcopy(optval, &prog32, sizeof (prog32));
1354 			prog.bf_len = prog32.bf_len;
1355 			prog.bf_insns = (void *)(uint64_t)prog32.bf_insns;
1356 		} else
1357 #endif
1358 		if (optlen == sizeof (struct bpf_program)) {
1359 			bcopy(optval, &prog, sizeof (prog));
1360 		} else if (optlen != sizeof (struct bpf_program)) {
1361 			return (EINVAL);
1362 		}
1363 		if (prog.bf_len > BPF_MAXINSNS)
1364 			return (EINVAL);
1365 
1366 		size = prog.bf_len * sizeof (*prog.bf_insns);
1367 		fcode = kmem_alloc(size, KM_SLEEP);
1368 		if (ddi_copyin(prog.bf_insns, fcode, size, 0) != 0) {
1369 			kmem_free(fcode, size);
1370 			return (EFAULT);
1371 		}
1372 
1373 		if (bpf_validate(fcode, (int)prog.bf_len)) {
1374 			rw_enter(&ps->ps_bpflock, RW_WRITER);
1375 			pfp_release_bpf(ps);
1376 			ps->ps_bpf.bf_insns = fcode;
1377 			ps->ps_bpf.bf_len = size;
1378 			rw_exit(&ps->ps_bpflock);
1379 
1380 			return (0);
1381 		}
1382 		kmem_free(fcode, size);
1383 		error = EINVAL;
1384 		break;
1385 
1386 	case SO_DETACH_FILTER :
1387 		pfp_release_bpf(ps);
1388 		break;
1389 
1390 	case SO_RCVBUF :
1391 		size = *(int32_t *)optval;
1392 		if (size > sockmod_pfp_rcvbuf_max || size < 0)
1393 			return (ENOBUFS);
1394 		sopp.sopp_flags = SOCKOPT_RCVHIWAT;
1395 		sopp.sopp_rxhiwat = size;
1396 		ps->ps_upcalls->su_set_proto_props(ps->ps_upper, &sopp);
1397 		ps->ps_rcvbuf = size;
1398 		break;
1399 
1400 	default :
1401 		error = ENOPROTOOPT;
1402 		break;
1403 	}
1404 
1405 	return (error);
1406 }
1407 
1408 /*
1409  * pfp_open_index is an internal function used to open a MAC device by
1410  * its index. Both a mac_handle_t and mac_client_handle_t are acquired
1411  * because some of the interfaces provided by the mac layer require either
1412  * only the mac_handle_t or both it and mac_handle_t.
1413  *
1414  * Whilst inside the kernel we can access data structures supporting any
1415  * zone, access to interfaces from non-global zones is restricted to those
1416  * interfaces (if any) that are exclusively assigned to a zone.
1417  */
1418 static int
1419 pfp_open_index(int index, mac_handle_t *mhp, mac_client_handle_t *mcip,
1420     cred_t *cred)
1421 {
1422 	mac_client_handle_t mch;
1423 	zoneid_t ifzoneid;
1424 	mac_handle_t mh;
1425 	zoneid_t zoneid;
1426 	int error;
1427 
1428 	mh = 0;
1429 	mch = 0;
1430 	error = mac_open_by_linkid(index, &mh);
1431 	if (error != 0)
1432 		goto bad_open;
1433 
1434 	error = mac_client_open(mh, &mch, NULL,
1435 	    MAC_OPEN_FLAGS_USE_DATALINK_NAME);
1436 	if (error != 0)
1437 		goto bad_open;
1438 
1439 	zoneid = crgetzoneid(cred);
1440 	if (zoneid != GLOBAL_ZONEID) {
1441 		mac_perim_handle_t perim;
1442 
1443 		mac_perim_enter_by_mh(mh, &perim);
1444 		error = dls_link_getzid(mac_name(mh), &ifzoneid);
1445 		mac_perim_exit(perim);
1446 		if (error != 0)
1447 			goto bad_open;
1448 		if (ifzoneid != zoneid) {
1449 			error = EACCES;
1450 			goto bad_open;
1451 		}
1452 	}
1453 
1454 	*mcip = mch;
1455 	*mhp = mh;
1456 
1457 	return (0);
1458 bad_open:
1459 	if (mch != 0)
1460 		mac_client_close(mch, 0);
1461 	if (mh != 0)
1462 		mac_close(mh);
1463 	return (error);
1464 }
1465 
1466 static void
1467 pfp_close(mac_handle_t mh, mac_client_handle_t mch)
1468 {
1469 	mac_client_close(mch, 0);
1470 	mac_close(mh);
1471 }
1472 
1473 /*
1474  * The purpose of this function is to provide a single place where we free
1475  * the loaded BPF program and reset all pointers/counters associated with
1476  * it.
1477  */
1478 static void
1479 pfp_release_bpf(struct pfpsock *ps)
1480 {
1481 	if (ps->ps_bpf.bf_len != 0) {
1482 		kmem_free(ps->ps_bpf.bf_insns, ps->ps_bpf.bf_len);
1483 		ps->ps_bpf.bf_len = 0;
1484 		ps->ps_bpf.bf_insns = NULL;
1485 	}
1486 }
1487 
1488 /*
1489  * Set the promiscuous mode of a network interface.
1490  * This function only calls the mac layer when there is a change to the
1491  * status of a network interface's promiscous mode. Tracking of how many
1492  * sockets have the network interface in promiscuous mode, and thus the
1493  * control over the physical device's status, is left to the mac layer.
1494  */
1495 static int
1496 pfp_set_promisc(struct pfpsock *ps, mac_client_promisc_type_t turnon)
1497 {
1498 	int error = 0;
1499 	int flags;
1500 
1501 	/*
1502 	 * There are 4 combinations of turnon/ps_promisc.
1503 	 * This if handles 2 (both false, both true) and the if() below
1504 	 * handles the remaining one - when change is required.
1505 	 */
1506 	if (turnon == ps->ps_promisc)
1507 		return (error);
1508 
1509 	if (ps->ps_phd != 0) {
1510 		mac_promisc_remove(ps->ps_phd);
1511 		ps->ps_phd = 0;
1512 
1513 		/*
1514 		 * ps_promisc is set here in case the call to mac_promisc_add
1515 		 * fails: leaving it to indicate that the interface is still
1516 		 * in some sort of promiscuous mode is false.
1517 		 */
1518 		if (ps->ps_promisc != MAC_CLIENT_PROMISC_FILTERED) {
1519 			ps->ps_promisc = MAC_CLIENT_PROMISC_FILTERED;
1520 			flags = MAC_PROMISC_FLAGS_NO_PHYS;
1521 		} else {
1522 			flags = 0;
1523 		}
1524 		flags |= MAC_PROMISC_FLAGS_VLAN_TAG_STRIP;
1525 	}
1526 
1527 	error = mac_promisc_add(ps->ps_mch, turnon, pfp_packet, ps,
1528 	    &ps->ps_phd, flags);
1529 	if (error == 0)
1530 		ps->ps_promisc = turnon;
1531 
1532 	return (error);
1533 }
1534 
1535 /*
1536  * This table maps the MAC types in Solaris to the ARPHRD_* values used
1537  * on Linux. This is used with the SIOCGIFHWADDR/SIOCGLIFHWADDR ioctl.
1538  *
1539  * The symbols in this table are *not* pulled in from <net/if_arp.h>,
1540  * they are pulled from <netpacket/packet.h>, thus it acts as a source
1541  * of supplementary information to the ARP table.
1542  */
1543 static uint_t arphrd_to_dl[][2] = {
1544 	{ ARPHRD_IEEE80211,	DL_WIFI },
1545 	{ ARPHRD_TUNNEL,	DL_IPV4 },
1546 	{ ARPHRD_TUNNEL,	DL_IPV6 },
1547 	{ ARPHRD_TUNNEL,	DL_6TO4 },
1548 	{ ARPHRD_AX25,		DL_X25 },
1549 	{ ARPHRD_ATM,		DL_ATM },
1550 	{ 0,			0 }
1551 };
1552 
1553 static int
1554 pfp_dl_to_arphrd(int dltype)
1555 {
1556 	int i;
1557 
1558 	for (i = 0; arphrd_to_dl[i][0] != 0; i++)
1559 		if (arphrd_to_dl[i][1] == dltype)
1560 			return (arphrd_to_dl[i][0]);
1561 	return (arp_hw_type(dltype));
1562 }
1563