/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD$
 */

/*
 * This file implements multiple network backends (tap, netmap, ...),
 * to be used by network frontends such as virtio-net and e1000.
 * The API to access the backend (e.g. send/receive packets, negotiate
 * features) is exported by net_backends.h.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>		/* u_short etc */
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <net/if.h>
#include <net/netmap.h>
#include <net/netmap_virt.h>
#define NETMAP_WITH_LIBS
#include <net/netmap_user.h>

#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sysexits.h>
#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>
#include <poll.h>

#ifdef NETGRAPH
#include <sys/param.h>
#include <sys/sysctl.h>
#include <netgraph.h>
#endif

#include "config.h"
#include "debug.h"
#include "iov.h"
#include "mevent.h"
#include "net_backends.h"
#include "pci_emul.h"

#include <sys/linker_set.h>

/*
 * Each network backend registers a set of function pointers that are
 * used to implement the net backends API.
 * This might need to be exposed if we implement backends in separate files.
 */
struct net_backend {
	const char *prefix;	/* prefix matching this backend */

	/*
	 * Routines used to initialize and cleanup the resources needed
	 * by a backend. The cleanup function is used internally,
	 * and should not be called by the frontend.
	 */
	int (*init)(struct net_backend *be, const char *devname,
	    nvlist_t *nvl, net_be_rxeof_t cb, void *param);
	void (*cleanup)(struct net_backend *be);

	/*
	 * Called to serve a guest transmit request. The scatter-gather
	 * vector provided by the caller has 'iovcnt' elements and contains
	 * the packet to send.
	 */
	ssize_t (*send)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Get the length of the next packet that can be received from
	 * the backend. If no packets are currently available, this
	 * function returns 0.
	 */
	ssize_t (*peek_recvlen)(struct net_backend *be);

	/*
	 * Called to receive a packet from the backend. When the function
	 * returns a positive value 'len', the scatter-gather vector
	 * provided by the caller contains a packet of that length.
	 * The function returns 0 if the backend doesn't have a new packet to
	 * receive.
	 */
	ssize_t (*recv)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Ask the backend to enable or disable receive operation in the
	 * backend. On return from a disable operation, it is guaranteed
	 * that the receive callback won't be called until receive is
	 * enabled again. Note however that it is up to the caller to make
	 * sure that netbe_recv() is not currently being executed by another
	 * thread.
	 */
	void (*recv_enable)(struct net_backend *be);
	void (*recv_disable)(struct net_backend *be);

	/*
	 * Ask the backend for the virtio-net features it is able to
	 * support. Possible features are TSO, UFO and checksum offloading
	 * in both rx and tx direction and for both IPv4 and IPv6.
	 */
	uint64_t (*get_cap)(struct net_backend *be);

	/*
	 * Tell the backend to enable/disable the specified virtio-net
	 * features (capabilities).
	 */
	int (*set_cap)(struct net_backend *be, uint64_t features,
	    unsigned int vnet_hdr_len);

	struct pci_vtnet_softc *sc;
	int fd;

	/*
	 * Length of the virtio-net header used by the backend and the
	 * frontend, respectively. A zero value means that the header
	 * is not used.
	 */
	unsigned int be_vnet_hdr_len;
	unsigned int fe_vnet_hdr_len;

	/* Size of backend-specific private data. */
	size_t priv_size;

	/* Room for backend-specific data. */
	char opaque[];
};

SET_DECLARE(net_backend_set, struct net_backend);

#define VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)

#define WPRINTF(params) PRINTLN params

/*
 * The tap backend
 */

struct tap_priv {
	struct mevent *mevp;
	/*
	 * A bounce buffer that allows us to implement the peek_recvlen
	 * callback. In the future we may get the same information from
	 * the kevent data.
	 */
	char bbuf[1 << 16];
	ssize_t bbuflen;
};
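
/*
 * How the bounce buffer is used (illustrative sketch): the frontend first
 * calls the peek_recvlen callback, which pulls the next packet into 'bbuf'
 * and records its length in 'bbuflen'; the subsequent recv callback then
 * drains the bounce buffer instead of reading from the tap device again:
 *
 *	len = netbe_peek_recvlen(be);		// packet lands in priv->bbuf
 *	if (len > 0)
 *		netbe_recv(be, iov, iovcnt);	// copied out of priv->bbuf
 */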

static void
tap_cleanup(struct net_backend *be)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;

	if (priv->mevp) {
		mevent_delete(priv->mevp);
	}
	if (be->fd != -1) {
		close(be->fd);
		be->fd = -1;
	}
}

static int
tap_init(struct net_backend *be, const char *devname,
	 nvlist_t *nvl, net_be_rxeof_t cb, void *param)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;
	char tbuf[80];
	int opt = 1;
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	if (cb == NULL) {
		WPRINTF(("TAP backend requires non-NULL callback"));
		return (-1);
	}

	strcpy(tbuf, "/dev/");
	strlcat(tbuf, devname, sizeof(tbuf));

	be->fd = open(tbuf, O_RDWR);
	if (be->fd == -1) {
		WPRINTF(("open of tap device %s failed", tbuf));
		goto error;
	}

	/*
	 * Set non-blocking and register for read
	 * notifications with the event loop.
	 */
	if (ioctl(be->fd, FIONBIO, &opt) < 0) {
		WPRINTF(("tap device FIONBIO failed"));
		goto error;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	memset(priv->bbuf, 0, sizeof(priv->bbuf));
	priv->bbuflen = 0;

	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->mevp == NULL) {
		WPRINTF(("Could not register event"));
		goto error;
	}

	return (0);

error:
	tap_cleanup(be);
	return (-1);
}

/*
 * Called to send a buffer chain out to the tap device
 */
static ssize_t
tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	return (writev(be->fd, iov, iovcnt));
}

static ssize_t
tap_peek_recvlen(struct net_backend *be)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;
	ssize_t ret;

	if (priv->bbuflen > 0) {
		/*
		 * We already have a packet in the bounce buffer.
		 * Just return its length.
		 */
		return (priv->bbuflen);
	}

	/*
	 * Read the next packet (if any) into the bounce buffer, so
	 * that we get to know its length and we can return that
	 * to the caller.
	 */
	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
	if (ret < 0 && errno == EWOULDBLOCK) {
		return (0);
	}

	if (ret > 0)
		priv->bbuflen = ret;

	return (ret);
}

static ssize_t
tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;
	ssize_t ret;

	if (priv->bbuflen > 0) {
		/*
		 * A packet is available in the bounce buffer, so
		 * we read it from there.
		 */
		ret = buf_to_iov(priv->bbuf, priv->bbuflen,
		    iov, iovcnt, 0);

		/* Mark the bounce buffer as empty. */
		priv->bbuflen = 0;

		return (ret);
	}

	ret = readv(be->fd, iov, iovcnt);
	if (ret < 0 && errno == EWOULDBLOCK) {
		return (0);
	}

	return (ret);
}

static void
tap_recv_enable(struct net_backend *be)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;

	mevent_enable(priv->mevp);
}

static void
tap_recv_disable(struct net_backend *be)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;

	mevent_disable(priv->mevp);
}

static uint64_t
tap_get_cap(struct net_backend *be)
{

	return (0); /* no capabilities for now */
}

static int
tap_set_cap(struct net_backend *be, uint64_t features,
		unsigned vnet_hdr_len)
{

	return ((features || vnet_hdr_len) ? -1 : 0);
}

static struct net_backend tap_backend = {
	.prefix = "tap",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

/* A clone of the tap backend, with a different prefix. */
static struct net_backend vmnet_backend = {
	.prefix = "vmnet",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

DATA_SET(net_backend_set, tap_backend);
DATA_SET(net_backend_set, vmnet_backend);
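
/*
 * Illustrative example (not part of the build): with the templates above
 * registered in net_backend_set, a command line such as
 *
 *	bhyve -s 2:0,virtio-net,tap0 ...
 *
 * makes netbe_init() below match the "tap0" backend name against the
 * "tap" prefix and instantiate the tap backend, which then opens
 * /dev/tap0. The slot and device numbers here are hypothetical.
 */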

#ifdef NETGRAPH

/*
 * Netgraph backend
 */

#define NG_SBUF_MAX_SIZE (4 * 1024 * 1024)

static int
ng_init(struct net_backend *be, const char *devname,
	 nvlist_t *nvl, net_be_rxeof_t cb, void *param)
{
	struct tap_priv *p = (struct tap_priv *)be->opaque;
	struct ngm_connect ngc;
	const char *value, *nodename;
	int sbsz;
	int ctrl_sock;
	int flags;
	unsigned long maxsbsz;
	size_t msbsz;
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	if (cb == NULL) {
		WPRINTF(("Netgraph backend requires non-NULL callback"));
		return (-1);
	}

	be->fd = -1;

	memset(&ngc, 0, sizeof(ngc));

	value = get_config_value_node(nvl, "path");
	if (value == NULL) {
		WPRINTF(("path must be provided"));
		return (-1);
	}
	strncpy(ngc.path, value, NG_PATHSIZ - 1);

	value = get_config_value_node(nvl, "hook");
	if (value == NULL)
		value = "vmlink";
	strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1);

	value = get_config_value_node(nvl, "peerhook");
	if (value == NULL) {
		WPRINTF(("peer hook must be provided"));
		return (-1);
	}
	strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1);

	nodename = get_config_value_node(nvl, "socket");
	if (NgMkSockNode(nodename, &ctrl_sock, &be->fd) < 0) {
		WPRINTF(("can't get Netgraph sockets"));
		return (-1);
	}

	if (NgSendMsg(ctrl_sock, ".", NGM_GENERIC_COOKIE,
	    NGM_CONNECT, &ngc, sizeof(ngc)) < 0) {
		WPRINTF(("can't connect to node"));
		close(ctrl_sock);
		goto error;
	}

	close(ctrl_sock);

	flags = fcntl(be->fd, F_GETFL);

	if (flags < 0) {
		WPRINTF(("can't get socket flags"));
		goto error;
	}

	if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) {
		WPRINTF(("can't set O_NONBLOCK flag"));
		goto error;
	}

	/*
	 * The default ng_socket(4) buffer size is too low.
	 * Calculate the minimum value between NG_SBUF_MAX_SIZE
	 * and kern.ipc.maxsockbuf.
	 */
	msbsz = sizeof(maxsbsz);
	if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz,
	    NULL, 0) < 0) {
		WPRINTF(("can't get 'kern.ipc.maxsockbuf' value"));
		goto error;
	}

	/*
	 * We can't set the socket buffer size to kern.ipc.maxsockbuf value,
	 * as it takes into account the mbuf(9) overhead.
	 */
	maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES);

	sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz);

	if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz,
	    sizeof(sbsz)) < 0) {
		WPRINTF(("can't set TX buffer size"));
		goto error;
	}

	if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz,
	    sizeof(sbsz)) < 0) {
		WPRINTF(("can't set RX buffer size"));
		goto error;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	memset(p->bbuf, 0, sizeof(p->bbuf));
	p->bbuflen = 0;

	p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (p->mevp == NULL) {
		WPRINTF(("Could not register event"));
		goto error;
	}

	return (0);

error:
	tap_cleanup(be);
	return (-1);
}
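
/*
 * Example configuration for this backend (illustrative; the node and
 * hook names below are hypothetical):
 *
 *	-s 2:0,virtio-net,netgraph,path=vmbridge:,peerhook=link2
 *
 * "hook" defaults to "vmlink", and "socket" optionally names the
 * ng_socket(4) node created by NgMkSockNode().
 */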

static struct net_backend ng_backend = {
	.prefix = "netgraph",
	.priv_size = sizeof(struct tap_priv),
	.init = ng_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

DATA_SET(net_backend_set, ng_backend);

#endif /* NETGRAPH */

/*
 * The netmap backend
 */

/* The virtio-net features supported by netmap. */
#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
		VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
		VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
		VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)

struct netmap_priv {
	char ifname[IFNAMSIZ];
	struct nm_desc *nmd;
	uint16_t memid;
	struct netmap_ring *rx;
	struct netmap_ring *tx;
	struct mevent *mevp;
	net_be_rxeof_t cb;
	void *cb_param;
};

static void
nmreq_init(struct nmreq *req, char *ifname)
{

	memset(req, 0, sizeof(*req));
	strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
	req->nr_version = NETMAP_API;
}

static int
netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
{
	int err;
	struct nmreq req;
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	nmreq_init(&req, priv->ifname);
	req.nr_cmd = NETMAP_BDG_VNET_HDR;
	req.nr_arg1 = vnet_hdr_len;
	err = ioctl(be->fd, NIOCREGIF, &req);
	if (err) {
		WPRINTF(("Unable to set vnet header length %d",
		    vnet_hdr_len));
		return (err);
	}

	be->be_vnet_hdr_len = vnet_hdr_len;

	return (0);
}
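
/*
 * Check whether the backend can use the requested virtio-net header
 * length: probe by trying to set it, and on success restore the previous
 * length so that the probe has no lasting effect.
 */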

static int
netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
{
	int prev_hdr_len = be->be_vnet_hdr_len;
	int ret;

	if (vnet_hdr_len == prev_hdr_len) {
		return (1);
	}

	ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
	if (ret) {
		return (0);
	}

	netmap_set_vnet_hdr_len(be, prev_hdr_len);

	return (1);
}

static uint64_t
netmap_get_cap(struct net_backend *be)
{

	return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
	    NETMAP_FEATURES : 0);
}

static int
netmap_set_cap(struct net_backend *be, uint64_t features,
	       unsigned vnet_hdr_len)
{

	return (netmap_set_vnet_hdr_len(be, vnet_hdr_len));
}

static int
netmap_init(struct net_backend *be, const char *devname,
	    nvlist_t *nvl, net_be_rxeof_t cb, void *param)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	strlcpy(priv->ifname, devname, sizeof(priv->ifname));
	priv->ifname[sizeof(priv->ifname) - 1] = '\0';

	priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
	if (priv->nmd == NULL) {
		WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)",
			devname, strerror(errno)));
		return (-1);
	}

	priv->memid = priv->nmd->req.nr_arg2;
	priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
	priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
	priv->cb = cb;
	priv->cb_param = param;
	be->fd = priv->nmd->fd;

	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->mevp == NULL) {
		WPRINTF(("Could not register event"));
		return (-1);
	}

	return (0);
}

static void
netmap_cleanup(struct net_backend *be)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	if (priv->mevp) {
		mevent_delete(priv->mevp);
	}
	if (priv->nmd) {
		nm_close(priv->nmd);
	}
	be->fd = -1;
}

static ssize_t
netmap_send(struct net_backend *be, const struct iovec *iov,
	    int iovcnt)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
	struct netmap_ring *ring;
	ssize_t totlen = 0;
	int nm_buf_size;
	int nm_buf_len;
	uint32_t head;
	void *nm_buf;
	int j;

	ring = priv->tx;
	head = ring->head;
	if (head == ring->tail) {
		WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt)));
		goto txsync;
	}
	nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
	nm_buf_size = ring->nr_buf_size;
	nm_buf_len = 0;

	for (j = 0; j < iovcnt; j++) {
		int iov_frag_size = iov[j].iov_len;
		void *iov_frag_buf = iov[j].iov_base;

		totlen += iov_frag_size;

		/*
		 * Split each iovec fragment over more netmap slots, if
		 * necessary.
		 */
		for (;;) {
			int copylen;

			copylen = iov_frag_size < nm_buf_size ?
			    iov_frag_size : nm_buf_size;
			memcpy(nm_buf, iov_frag_buf, copylen);

			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			nm_buf += copylen;
			nm_buf_size -= copylen;
			nm_buf_len += copylen;

			if (iov_frag_size == 0) {
				break;
			}

			ring->slot[head].len = nm_buf_len;
			ring->slot[head].flags = NS_MOREFRAG;
			head = nm_ring_next(ring, head);
			if (head == ring->tail) {
				/*
				 * We ran out of netmap slots while
				 * splitting the iovec fragments.
				 */
				WPRINTF(("No space, drop %zu bytes",
				   count_iov(iov, iovcnt)));
				goto txsync;
			}
			nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
			nm_buf_size = ring->nr_buf_size;
			nm_buf_len = 0;
		}
	}

	/* Complete the last slot, which must not have NS_MOREFRAG set. */
	ring->slot[head].len = nm_buf_len;
	ring->slot[head].flags = 0;
	head = nm_ring_next(ring, head);

	/* Now update ring->head and ring->cur. */
	ring->head = ring->cur = head;
txsync:
	ioctl(be->fd, NIOCTXSYNC, NULL);

	return (totlen);
}
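
/*
 * Worked example for the slot chaining above (sizes are hypothetical):
 * with 2048-byte netmap buffers, a 3000-byte packet spans two slots,
 *
 *	slot[head]   : len = 2048, flags = NS_MOREFRAG
 *	slot[head+1] : len = 952,  flags = 0
 *
 * and only the final slot of a packet leaves NS_MOREFRAG clear.
 * netmap_peek_recvlen() below relies on the same convention to sum the
 * lengths of a multi-slot packet.
 */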

static ssize_t
netmap_peek_recvlen(struct net_backend *be)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
	struct netmap_ring *ring = priv->rx;
	uint32_t head = ring->head;
	ssize_t totlen = 0;

	while (head != ring->tail) {
		struct netmap_slot *slot = ring->slot + head;

		totlen += slot->len;
		if ((slot->flags & NS_MOREFRAG) == 0)
			break;
		head = nm_ring_next(ring, head);
	}

	return (totlen);
}

static ssize_t
netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
	struct netmap_slot *slot = NULL;
	struct netmap_ring *ring;
	void *iov_frag_buf;
	int iov_frag_size;
	ssize_t totlen = 0;
	uint32_t head;

	assert(iovcnt);

	ring = priv->rx;
	head = ring->head;
	iov_frag_buf = iov->iov_base;
	iov_frag_size = iov->iov_len;

	do {
		int nm_buf_len;
		void *nm_buf;

		if (head == ring->tail) {
			return (0);
		}

		slot = ring->slot + head;
		nm_buf = NETMAP_BUF(ring, slot->buf_idx);
		nm_buf_len = slot->len;

		for (;;) {
			int copylen = nm_buf_len < iov_frag_size ?
			    nm_buf_len : iov_frag_size;

			memcpy(iov_frag_buf, nm_buf, copylen);
			nm_buf += copylen;
			nm_buf_len -= copylen;
			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			totlen += copylen;

			if (nm_buf_len == 0) {
				break;
			}

			iov++;
			iovcnt--;
			if (iovcnt == 0) {
				/* No space to receive. */
				WPRINTF(("Short iov, drop %zd bytes",
				    totlen));
				return (-ENOSPC);
			}
			iov_frag_buf = iov->iov_base;
			iov_frag_size = iov->iov_len;
		}

		head = nm_ring_next(ring, head);

	} while (slot->flags & NS_MOREFRAG);

	/* Release slots to netmap. */
	ring->head = ring->cur = head;

	return (totlen);
}

static void
netmap_recv_enable(struct net_backend *be)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	mevent_enable(priv->mevp);
}

static void
netmap_recv_disable(struct net_backend *be)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	mevent_disable(priv->mevp);
}

static struct net_backend netmap_backend = {
	.prefix = "netmap",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

/* A clone of the netmap backend, with a different prefix. */
static struct net_backend vale_backend = {
	.prefix = "vale",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

DATA_SET(net_backend_set, netmap_backend);
DATA_SET(net_backend_set, vale_backend);

int
netbe_legacy_config(nvlist_t *nvl, const char *opts)
{
	char *backend, *cp;

	if (opts == NULL)
		return (0);

	cp = strchr(opts, ',');
	if (cp == NULL) {
		set_config_value_node(nvl, "backend", opts);
		return (0);
	}
	backend = strndup(opts, cp - opts);
	set_config_value_node(nvl, "backend", backend);
	free(backend);
	return (pci_parse_legacy_config(nvl, cp + 1));
}
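
/*
 * Example (illustrative): for opts "tap0,mac=00:a0:98:e6:0f:2a" the
 * function above stores backend=tap0 in the nvlist and hands the
 * remainder "mac=00:a0:98:e6:0f:2a" to pci_parse_legacy_config();
 * a bare "tap0" just sets the backend value. The MAC address shown
 * is made up.
 */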

/*
 * Initialize a backend and attach to the frontend.
 * This is called during frontend initialization.
 *  @ret is a pointer to the backend to be initialized
 *  @devname is the backend-name as supplied on the command line,
 * 	e.g. -s 2:0,frontend-name,backend-name[,other-args]
 *  @cb is the receive callback supplied by the frontend,
 *	and it is invoked in the event loop when a receive
 *	event is generated in the hypervisor.
 *  @param is a pointer to the frontend, and normally used as
 *	the argument for the callback.
 */
int
netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb,
    void *param)
{
	struct net_backend **pbe, *nbe, *tbe = NULL;
	const char *value;
	char *devname;
	int err;

	value = get_config_value_node(nvl, "backend");
	if (value == NULL) {
		return (-1);
	}
	devname = strdup(value);

	/*
	 * Find the network backend that matches the user-provided
	 * device name. net_backend_set is built using a linker set.
	 */
	SET_FOREACH(pbe, net_backend_set) {
		if (strncmp(devname, (*pbe)->prefix,
		    strlen((*pbe)->prefix)) == 0) {
			tbe = *pbe;
			assert(tbe->init != NULL);
			assert(tbe->cleanup != NULL);
			assert(tbe->send != NULL);
			assert(tbe->recv != NULL);
			assert(tbe->get_cap != NULL);
			assert(tbe->set_cap != NULL);
			break;
		}
	}

	*ret = NULL;
	if (tbe == NULL) {
		free(devname);
		return (EINVAL);
	}

	nbe = calloc(1, sizeof(*nbe) + tbe->priv_size);
	if (nbe == NULL) {
		free(devname);
		return (ENOMEM);
	}
	*nbe = *tbe;	/* copy the template */
	nbe->fd = -1;
	nbe->sc = param;
	nbe->be_vnet_hdr_len = 0;
	nbe->fe_vnet_hdr_len = 0;

	/* Initialize the backend. */
	err = nbe->init(nbe, devname, nvl, cb, param);
	if (err) {
		free(devname);
		free(nbe);
		return (err);
	}

	*ret = nbe;
	free(devname);

	return (0);
}
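
/*
 * Typical frontend usage (illustrative sketch; error handling and the
 * frontend softc are elided):
 *
 *	struct net_backend *be;
 *
 *	if (netbe_init(&be, nvl, my_rx_callback, sc) != 0)
 *		return (error);
 *	netbe_set_cap(be, netbe_get_cap(be) & wanted, hdr_len);
 *	netbe_rx_enable(be);
 *
 * 'my_rx_callback', 'wanted' and 'hdr_len' are hypothetical names.
 */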

void
netbe_cleanup(struct net_backend *be)
{

	if (be != NULL) {
		be->cleanup(be);
		free(be);
	}
}

uint64_t
netbe_get_cap(struct net_backend *be)
{

	assert(be != NULL);
	return (be->get_cap(be));
}

int
netbe_set_cap(struct net_backend *be, uint64_t features,
	      unsigned vnet_hdr_len)
{
	int ret;

	assert(be != NULL);

	/*
	 * There are only three valid lengths, i.e., 0, 10 and 12:
	 * 0 means no header, 12 is the full virtio-net header
	 * (struct virtio_net_rxhdr) and 10 is the same header without
	 * the trailing num_buffers field, used when the guest does not
	 * negotiate VIRTIO_NET_F_MRG_RXBUF.
	 */
	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
		&& vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
		return (-1);

	be->fe_vnet_hdr_len = vnet_hdr_len;

	ret = be->set_cap(be, features, vnet_hdr_len);
	assert(be->be_vnet_hdr_len == 0 ||
	       be->be_vnet_hdr_len == be->fe_vnet_hdr_len);

	return (ret);
}

ssize_t
netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
{

	return (be->send(be, iov, iovcnt));
}

ssize_t
netbe_peek_recvlen(struct net_backend *be)
{

	return (be->peek_recvlen(be));
}

/*
 * Try to read a packet from the backend, without blocking.
 * If no packets are available, return 0. In case of success, return
 * the length of the packet just read. Return -1 in case of errors.
 */
ssize_t
netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{

	return (be->recv(be, iov, iovcnt));
}

/*
 * Read a packet from the backend and discard it.
 * Returns the size of the discarded packet or zero if no packet was available.
 * A negative error code is returned in case of read error.
 */
ssize_t
netbe_rx_discard(struct net_backend *be)
{
	/*
	 * MP note: the dummybuf is only used to discard frames,
	 * so there is no need for it to be per-vtnet or locked.
	 * We only make it large enough for a TSO-sized segment.
	 */
	static uint8_t dummybuf[65536 + 64];
	struct iovec iov;

	iov.iov_base = dummybuf;
	iov.iov_len = sizeof(dummybuf);

	return (netbe_recv(be, &iov, 1));
}

void
netbe_rx_disable(struct net_backend *be)
{

	be->recv_disable(be);
}

void
netbe_rx_enable(struct net_backend *be)
{

	be->recv_enable(be);
}

size_t
netbe_get_vnet_hdr_len(struct net_backend *be)
{

	return (be->be_vnet_hdr_len);
}
1095