/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2013 Pluribus Networks Inc.
 * Copyright 2018 Joyent, Inc.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/linker_set.h>
#include <sys/select.h>
#include <sys/uio.h>
#include <sys/ioctl.h>
#include <net/ethernet.h>
#ifdef __FreeBSD__
#ifndef NETMAP_WITH_LIBS
#define NETMAP_WITH_LIBS
#endif
#include <net/netmap_user.h>
#endif

#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <assert.h>
#include <md5.h>
#include <pthread.h>
#include <pthread_np.h>
#include <sysexits.h>
#ifndef __FreeBSD__
#include <poll.h>
#include <libdlpi.h>
#endif

#include "bhyverun.h"
#include "config.h"
#include "debug.h"
#include "pci_emul.h"
#ifdef __FreeBSD__
#include "mevent.h"
#endif
#include "virtio.h"
#include "net_utils.h"

#define	VTNET_RINGSZ	1024

#define	VTNET_MAXSEGS	256

/*
 * Host capabilities.  Note that we only offer a few of these.
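 * The set actually advertised to the guest is VTNET_S_HOSTCAPS,
 * defined after this list.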
 */
#define	VIRTIO_NET_F_CSUM	(1 << 0)  /* host handles partial cksum */
#define	VIRTIO_NET_F_GUEST_CSUM	(1 << 1)  /* guest handles partial cksum */
#define	VIRTIO_NET_F_MAC	(1 << 5)  /* host supplies MAC */
#define	VIRTIO_NET_F_GSO_DEPREC	(1 << 6)  /* deprecated: host handles GSO */
#define	VIRTIO_NET_F_GUEST_TSO4	(1 << 7)  /* guest can rcv TSOv4 */
#define	VIRTIO_NET_F_GUEST_TSO6	(1 << 8)  /* guest can rcv TSOv6 */
#define	VIRTIO_NET_F_GUEST_ECN	(1 << 9)  /* guest can rcv TSO with ECN */
#define	VIRTIO_NET_F_GUEST_UFO	(1 << 10) /* guest can rcv UFO */
#define	VIRTIO_NET_F_HOST_TSO4	(1 << 11) /* host can rcv TSOv4 */
#define	VIRTIO_NET_F_HOST_TSO6	(1 << 12) /* host can rcv TSOv6 */
#define	VIRTIO_NET_F_HOST_ECN	(1 << 13) /* host can rcv TSO with ECN */
#define	VIRTIO_NET_F_HOST_UFO	(1 << 14) /* host can rcv UFO */
#define	VIRTIO_NET_F_MRG_RXBUF	(1 << 15) /* host can merge RX buffers */
#define	VIRTIO_NET_F_STATUS	(1 << 16) /* config status field available */
#define	VIRTIO_NET_F_CTRL_VQ	(1 << 17) /* control channel available */
#define	VIRTIO_NET_F_CTRL_RX	(1 << 18) /* control channel RX mode support */
#define	VIRTIO_NET_F_CTRL_VLAN	(1 << 19) /* control channel VLAN filtering */
#define	VIRTIO_NET_F_GUEST_ANNOUNCE \
				(1 << 21) /* guest can send gratuitous pkts */

#define	VTNET_S_HOSTCAPS \
	(VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \
	VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC)

/*
 * PCI config-space "registers"
 */
struct virtio_net_config {
	uint8_t		mac[6];
	uint16_t	status;
	uint16_t	max_virtqueue_pairs;
	uint16_t	mtu;
} __packed;

/*
 * Queue definitions.
 */
#define	VTNET_RXQ	0
#define	VTNET_TXQ	1
#define	VTNET_CTLQ	2	/* NB: not yet supported */

#define	VTNET_MAXQ	3

/*
 * Fixed network header size
 */
struct virtio_net_rxhdr {
	uint8_t		vrh_flags;
	uint8_t		vrh_gso_type;
	uint16_t	vrh_hdr_len;
	uint16_t	vrh_gso_size;
	uint16_t	vrh_csum_start;
	uint16_t	vrh_csum_offset;
	uint16_t	vrh_bufs;
} __packed;

/*
 * Debug printf
 */
static int pci_vtnet_debug;
#define	DPRINTF(params) if (pci_vtnet_debug) PRINTLN params
#define	WPRINTF(params) PRINTLN params

/*
 * Per-device softc
 */
struct pci_vtnet_softc {
	struct virtio_softc vsc_vs;
	struct vqueue_info vsc_queues[VTNET_MAXQ - 1];
	pthread_mutex_t vsc_mtx;
	struct mevent	*vsc_mevp;

#ifdef __FreeBSD__
	int		vsc_tapfd;
#else
	dlpi_handle_t	vsc_dhp;
	int		vsc_dlpifd;
#endif
	struct nm_desc	*vsc_nmd;

	int		vsc_rx_ready;
	bool		features_negotiated;	/* protected by rx_mtx */
	int		resetting;		/* protected by tx_mtx */

	uint64_t	vsc_features;		/* negotiated features */

	struct virtio_net_config vsc_config;
	struct virtio_consts vsc_consts;

	pthread_mutex_t	rx_mtx;
	int		rx_vhdrlen;
	int		rx_merge;		/* merged rx bufs in use */

	pthread_t	tx_tid;
	pthread_mutex_t	tx_mtx;
	pthread_cond_t	tx_cond;
	int		tx_in_progress;

	void (*pci_vtnet_rx)(struct pci_vtnet_softc *sc);
	void (*pci_vtnet_tx)(struct pci_vtnet_softc *sc, struct iovec *iov,
	    int iovcnt, int len);
};

static void pci_vtnet_reset(void *);
/* static void pci_vtnet_notify(void *, struct vqueue_info *); */
static int pci_vtnet_cfgread(void *, int, int, uint32_t *);
static int pci_vtnet_cfgwrite(void *, int, int, uint32_t);
static void pci_vtnet_neg_features(void *, uint64_t);

static struct virtio_consts vtnet_vi_consts = {
	"vtnet",			/* our name */
	VTNET_MAXQ - 1,			/* we currently support 2 virtqueues */
	sizeof(struct virtio_net_config), /* config reg size */
	pci_vtnet_reset,		/* reset */
	NULL,				/* device-wide qnotify -- not used */
	pci_vtnet_cfgread,		/* read PCI config */
	pci_vtnet_cfgwrite,		/* write PCI config */
	pci_vtnet_neg_features,		/* apply negotiated features */
	VTNET_S_HOSTCAPS,		/* our capabilities */
};

static void
pci_vtnet_reset(void *vsc)
{
	struct pci_vtnet_softc *sc = vsc;

	DPRINTF(("vtnet: device reset requested!"));

	/* Acquire the RX lock to block RX processing. */
	pthread_mutex_lock(&sc->rx_mtx);

	sc->features_negotiated = false;

	/* Set sc->resetting and give the TX thread a chance to stop. */
	pthread_mutex_lock(&sc->tx_mtx);
	sc->resetting = 1;
	while (sc->tx_in_progress) {
		pthread_mutex_unlock(&sc->tx_mtx);
		usleep(10000);
		pthread_mutex_lock(&sc->tx_mtx);
	}

	sc->vsc_rx_ready = 0;
	sc->rx_merge = 1;
	sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);

	/*
	 * Now reset rings, MSI-X vectors, and negotiated capabilities.
	 * Do that with the TX lock held, since we need to reset
	 * sc->resetting.
	 */
	vi_reset_dev(&sc->vsc_vs);

	sc->resetting = 0;
	pthread_mutex_unlock(&sc->tx_mtx);
	pthread_mutex_unlock(&sc->rx_mtx);
}

/*
 * Called to send a buffer chain out to the tap device
 */
#ifdef __FreeBSD__
static void
pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
    int len)
{
	static char pad[60]; /* all zero bytes */

	if (sc->vsc_tapfd == -1)
		return;

	/*
	 * If the length is < 60, pad out to that and add the
	 * extra zero'd segment to the iov.  It is guaranteed that
	 * there is always an extra iov available by the caller.
	 */
	if (len < 60) {
		iov[iovcnt].iov_base = pad;
		iov[iovcnt].iov_len = 60 - len;
		iovcnt++;
	}
	(void) writev(sc->vsc_tapfd, iov, iovcnt);
}
#else
static void
pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
    int len)
{
	int i;

	for (i = 0; i < iovcnt; i++) {
		(void) dlpi_send(sc->vsc_dhp, NULL, 0,
		    iov[i].iov_base, iov[i].iov_len, NULL);
	}
}
#endif /* __FreeBSD__ */

#ifdef __FreeBSD__
/*
 * Called when there is read activity on the tap file descriptor.
 * Each buffer posted by the guest is assumed to be able to contain
 * an entire ethernet frame + rx header.
 * MP note: the dummybuf is only used for discarding frames, so there
 * is no need for it to be per-vtnet or locked.
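 * At 2048 bytes it is large enough to hold a standard Ethernet frame.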
 */
static uint8_t dummybuf[2048];
#endif /* __FreeBSD__ */

static __inline struct iovec *
rx_iov_trim(struct iovec *iov, int *niov, int tlen)
{
	struct iovec *riov;

	/* XXX short-cut: assume first segment is >= tlen */
	assert(iov[0].iov_len >= tlen);

	iov[0].iov_len -= tlen;
	if (iov[0].iov_len == 0) {
		assert(*niov > 1);
		*niov -= 1;
		riov = &iov[1];
	} else {
		iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
		riov = &iov[0];
	}

	return (riov);
}

static void
pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
{
	struct iovec iov[VTNET_MAXSEGS], *riov;
	struct vqueue_info *vq;
	void *vrx;
	int n;
#ifdef __FreeBSD__
	int len;
#else
	size_t len;
	int ret;
#endif
	uint16_t idx;

	/*
	 * Should never be called without a valid tap fd
	 */
#ifdef __FreeBSD__
	assert(sc->vsc_tapfd != -1);
#else
	assert(sc->vsc_dlpifd != -1);
#endif

	/* Features must be negotiated */
	if (!sc->features_negotiated) {
		return;
	}

	/*
	 * This may, however, be called before the rx ring
	 * has been set up.
	 */
	if (!sc->vsc_rx_ready) {
#ifdef __FreeBSD__
		/*
		 * Drop the packet and try later.
		 */
		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
#endif
		return;
	}

	/*
	 * Check for available rx buffers
	 */
	vq = &sc->vsc_queues[VTNET_RXQ];
	if (!vq_has_descs(vq)) {
		/*
		 * Drop the packet and try later.  Interrupt on
		 * empty, if that's negotiated.
		 */
#ifdef __FreeBSD__
		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
#endif
		vq_endchains(vq, 1);
		return;
	}

	do {
		/*
		 * Get descriptor chain
		 */
		n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
		assert(n >= 1 && n <= VTNET_MAXSEGS);

		/*
		 * Get a pointer to the rx header, and use the
		 * data immediately following it for the packet buffer.
		 */
		vrx = iov[0].iov_base;
		riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
#ifdef __FreeBSD__
		len = readv(sc->vsc_tapfd, riov, n);
#else
		len = riov[0].iov_len;
		ret = dlpi_recv(sc->vsc_dhp, NULL, NULL,
		    (uint8_t *)riov[0].iov_base, &len, 0, NULL);
		if (ret != DLPI_SUCCESS) {
			errno = EWOULDBLOCK;
			len = 0;
		}
#endif
		if (len <= 0 && errno == EWOULDBLOCK) {
			/*
			 * No more packets, but still some avail ring
			 * entries.  Interrupt if needed/appropriate.
			 */
			vq_retchains(vq, 1);
			vq_endchains(vq, 0);
			return;
		}

		/*
		 * The only valid field in the rx packet header is the
		 * number of buffers if merged rx bufs were negotiated.
		 */
		memset(vrx, 0, sc->rx_vhdrlen);

		if (sc->rx_merge) {
			struct virtio_net_rxhdr *vrxh;

			vrxh = vrx;
			vrxh->vrh_bufs = 1;
		}

		/*
		 * Release this chain and handle more chains.
		 */
		vq_relchain(vq, idx, len + sc->rx_vhdrlen);
	} while (vq_has_descs(vq));

	/*
	 * Interrupt if needed, including for NOTIFY_ON_EMPTY.
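	 * The second argument to vq_endchains() indicates that all avail
	 * descriptors were consumed, so an interrupt is also raised when
	 * the guest negotiated VIRTIO_F_NOTIFY_ON_EMPTY.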
	 */
	vq_endchains(vq, 1);
}

#ifdef __FreeBSD__
static __inline int
pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
{
	int r, i;
	int len = 0;

	for (r = nmd->cur_tx_ring; ; ) {
		struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r);
		uint32_t cur, idx;
		char *buf;

		if (nm_ring_empty(ring)) {
			r++;
			if (r > nmd->last_tx_ring)
				r = nmd->first_tx_ring;
			if (r == nmd->cur_tx_ring)
				break;
			continue;
		}
		cur = ring->cur;
		idx = ring->slot[cur].buf_idx;
		buf = NETMAP_BUF(ring, idx);

		for (i = 0; i < iovcnt; i++) {
			if (len + iov[i].iov_len > 2048)
				break;
			memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len);
			len += iov[i].iov_len;
		}
		ring->slot[cur].len = len;
		ring->head = ring->cur = nm_ring_next(ring, cur);
		nmd->cur_tx_ring = r;
		ioctl(nmd->fd, NIOCTXSYNC, NULL);
		break;
	}

	return (len);
}

static __inline int
pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
{
	int len = 0;
	int i = 0;
	int r;

	for (r = nmd->cur_rx_ring; ; ) {
		struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r);
		uint32_t cur, idx;
		char *buf;
		size_t left;

		if (nm_ring_empty(ring)) {
			r++;
			if (r > nmd->last_rx_ring)
				r = nmd->first_rx_ring;
			if (r == nmd->cur_rx_ring)
				break;
			continue;
		}
		cur = ring->cur;
		idx = ring->slot[cur].buf_idx;
		buf = NETMAP_BUF(ring, idx);
		left = ring->slot[cur].len;

		for (i = 0; i < iovcnt && left > 0; i++) {
			if (iov[i].iov_len > left)
				iov[i].iov_len = left;
			memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len);
			len += iov[i].iov_len;
			left -= iov[i].iov_len;
		}
		ring->head = ring->cur = nm_ring_next(ring, cur);
		nmd->cur_rx_ring = r;
		ioctl(nmd->fd, NIOCRXSYNC, NULL);
		break;
	}
	for (; i < iovcnt; i++)
		iov[i].iov_len = 0;

	return (len);
}

/*
 * Called to send a buffer chain out to the vale port
 */
static void
pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
    int len)
{
	static char pad[60]; /* all zero bytes */

	if (sc->vsc_nmd == NULL)
		return;

	/*
	 * If the length is < 60, pad out to that and add the
	 * extra zero'd segment to the iov.  It is guaranteed that
	 * there is always an extra iov available by the caller.
	 */
	if (len < 60) {
		iov[iovcnt].iov_base = pad;
		iov[iovcnt].iov_len = 60 - len;
		iovcnt++;
	}
	(void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt);
}

static void
pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc)
{
	struct iovec iov[VTNET_MAXSEGS], *riov;
	struct vqueue_info *vq;
	void *vrx;
	int len, n;
	uint16_t idx;

	/*
	 * Should never be called without a valid netmap descriptor
	 */
	assert(sc->vsc_nmd != NULL);

	/* Features must be negotiated */
	if (!sc->features_negotiated) {
		return;
	}

	/*
	 * This may, however, be called before the rx ring
	 * has been set up.
	 */
	if (!sc->vsc_rx_ready) {
		/*
		 * Drop the packet and try later.
		 */
		(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
		return;
	}

	/*
	 * Check for available rx buffers
	 */
	vq = &sc->vsc_queues[VTNET_RXQ];
	if (!vq_has_descs(vq)) {
		/*
		 * Drop the packet and try later.  Interrupt on
		 * empty, if that's negotiated.
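		 * nm_nextpkt() consumes and discards one pending frame so
		 * that the netmap ring can continue to make progress.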
		 */
		(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
		vq_endchains(vq, 1);
		return;
	}

	do {
		/*
		 * Get descriptor chain.
		 */
		n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
		assert(n >= 1 && n <= VTNET_MAXSEGS);

		/*
		 * Get a pointer to the rx header, and use the
		 * data immediately following it for the packet buffer.
		 */
		vrx = iov[0].iov_base;
		riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);

		len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n);

		if (len == 0) {
			/*
			 * No more packets, but still some avail ring
			 * entries.  Interrupt if needed/appropriate.
			 */
			vq_retchain(vq);
			vq_endchains(vq, 0);
			return;
		}

		/*
		 * The only valid field in the rx packet header is the
		 * number of buffers if merged rx bufs were negotiated.
		 */
		memset(vrx, 0, sc->rx_vhdrlen);

		if (sc->rx_merge) {
			struct virtio_net_rxhdr *vrxh;

			vrxh = vrx;
			vrxh->vrh_bufs = 1;
		}

		/*
		 * Release this chain and handle more chains.
		 */
		vq_relchain(vq, idx, len + sc->rx_vhdrlen);
	} while (vq_has_descs(vq));

	/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
	vq_endchains(vq, 1);
}
#endif /* __FreeBSD__ */

#ifdef __FreeBSD__
static void
pci_vtnet_rx_callback(int fd, enum ev_type type, void *param)
{
	struct pci_vtnet_softc *sc = param;

	pthread_mutex_lock(&sc->rx_mtx);
	sc->pci_vtnet_rx(sc);
	pthread_mutex_unlock(&sc->rx_mtx);
}
#else
static void *
pci_vtnet_poll_thread(void *param)
{
	struct pci_vtnet_softc *sc = param;
	pollfd_t pollset;

	pollset.fd = sc->vsc_dlpifd;
	pollset.events = POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND;

	for (;;) {
		if (poll(&pollset, 1, -1) < 0) {
			if (errno == EINTR)
				continue;
			fprintf(stderr, "pci_vtnet_poll_thread poll() error %d\n", errno);
			continue;
		}
		pthread_mutex_lock(&sc->vsc_mtx);
		pci_vtnet_tap_rx(sc);
		pthread_mutex_unlock(&sc->vsc_mtx);
	}

	return (NULL);
}
#endif /* __FreeBSD__ */

static void
pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq)
{
	struct pci_vtnet_softc *sc = vsc;

	/*
	 * A qnotify means that the rx process can now begin.
	 * Enable RX only if features are negotiated.
	 */
	pthread_mutex_lock(&sc->rx_mtx);
	if (sc->vsc_rx_ready == 0 && sc->features_negotiated) {
		sc->vsc_rx_ready = 1;
		vq_kick_disable(vq);
	}
	pthread_mutex_unlock(&sc->rx_mtx);
}

static void
pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq)
{
	struct iovec iov[VTNET_MAXSEGS + 1];
	int i, n;
	int plen, tlen;
	uint16_t idx;

	/*
	 * Obtain chain of descriptors.  The first one is
	 * really the header descriptor, so we need to sum
	 * up two lengths: packet length and transfer length.
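	 * plen covers only the payload handed to the backend, while tlen
	 * also includes the virtio header, so that vq_relchain() reports
	 * the full number of bytes consumed.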
	 */
	n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
	assert(n >= 1 && n <= VTNET_MAXSEGS);
	plen = 0;
	tlen = iov[0].iov_len;
	for (i = 1; i < n; i++) {
		plen += iov[i].iov_len;
		tlen += iov[i].iov_len;
	}

	DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n));
	sc->pci_vtnet_tx(sc, &iov[1], n - 1, plen);

	/* chain is processed, release it and set tlen */
	vq_relchain(vq, idx, tlen);
}

static void
pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq)
{
	struct pci_vtnet_softc *sc = vsc;

	/*
	 * Any ring entries to process?
	 */
	if (!vq_has_descs(vq))
		return;

	/* Signal the tx thread for processing */
	pthread_mutex_lock(&sc->tx_mtx);
	vq_kick_disable(vq);
	if (sc->tx_in_progress == 0)
		pthread_cond_signal(&sc->tx_cond);
	pthread_mutex_unlock(&sc->tx_mtx);
}

/*
 * Thread which will handle processing of TX desc
 */
static void *
pci_vtnet_tx_thread(void *param)
{
	struct pci_vtnet_softc *sc = param;
	struct vqueue_info *vq;
	int error;

	vq = &sc->vsc_queues[VTNET_TXQ];

	/*
	 * Wait until the tx queue pointers have been initialised and
	 * the first tx has been signaled.
	 */
	pthread_mutex_lock(&sc->tx_mtx);
	error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
	assert(error == 0);

	for (;;) {
		/* note - tx mutex is locked here */
		while (sc->resetting || !vq_has_descs(vq)) {
			vq_kick_enable(vq);
			if (!sc->resetting && vq_has_descs(vq))
				break;

			sc->tx_in_progress = 0;
			error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
			assert(error == 0);
		}
		vq_kick_disable(vq);
		sc->tx_in_progress = 1;
		pthread_mutex_unlock(&sc->tx_mtx);

		do {
			/*
			 * Run through entries, placing them into
			 * iovecs and sending when an end-of-packet
			 * is found
			 */
			pci_vtnet_proctx(sc, vq);
		} while (vq_has_descs(vq));

		/*
		 * Generate an interrupt if needed.
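		 * Afterwards, reacquire the TX lock and go back to
		 * waiting for the guest to kick the queue again.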
		 */
		vq_endchains(vq, 1);

		pthread_mutex_lock(&sc->tx_mtx);
	}
	return (NULL);
}

#ifdef __FreeBSD__
static void
pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq)
{

	DPRINTF(("vtnet: control qnotify!"));
}
#endif /* __FreeBSD__ */

static void
pci_vtnet_tap_setup(struct pci_vtnet_softc *sc, const char *devname)
{
	char tbuf[80];
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif
#ifndef __FreeBSD__
	uchar_t physaddr[DLPI_PHYSADDR_MAX];
	size_t physaddrlen = DLPI_PHYSADDR_MAX;
	int error;
#endif

	strcpy(tbuf, "/dev/");
	strlcat(tbuf, devname, sizeof(tbuf));

	sc->pci_vtnet_rx = pci_vtnet_tap_rx;
	sc->pci_vtnet_tx = pci_vtnet_tap_tx;
#ifdef __FreeBSD__
	sc->vsc_tapfd = open(tbuf, O_RDWR);
	if (sc->vsc_tapfd == -1) {
		WPRINTF(("open of tap device %s failed\n", tbuf));
		return;
	}

	/*
	 * Set non-blocking and register for read
	 * notifications with the event loop
	 */
	int opt = 1;
	if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
		WPRINTF(("tap device O_NONBLOCK failed\n"));
		close(sc->vsc_tapfd);
		sc->vsc_tapfd = -1;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(sc->vsc_tapfd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
				  EVF_READ,
				  pci_vtnet_rx_callback,
				  sc);
	if (sc->vsc_mevp == NULL) {
		WPRINTF(("Could not register event\n"));
		close(sc->vsc_tapfd);
		sc->vsc_tapfd = -1;
	}
#else
	if (dlpi_open(devname, &sc->vsc_dhp, DLPI_RAW) != DLPI_SUCCESS) {
		WPRINTF(("open of vnic device %s failed\n", devname));
	}

	if (dlpi_get_physaddr(sc->vsc_dhp, DL_CURR_PHYS_ADDR, physaddr,
	    &physaddrlen) != DLPI_SUCCESS) {
		WPRINTF(("read MAC address of vnic device %s failed\n",
		    devname));
	}
	if (physaddrlen != ETHERADDRL) {
		WPRINTF(("bad MAC address len %zu on vnic device %s\n",
		    physaddrlen, devname));
	}
	memcpy(sc->vsc_config.mac, physaddr, ETHERADDRL);

	if (dlpi_bind(sc->vsc_dhp, DLPI_ANY_SAP, NULL) != DLPI_SUCCESS) {
		WPRINTF(("bind of vnic device %s failed\n", devname));
	}

	if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_PHYS) != DLPI_SUCCESS) {
		WPRINTF(("enable promiscuous mode (physical) of vnic device "
		    "%s failed\n", devname));
	}
	if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_SAP) != DLPI_SUCCESS) {
		WPRINTF(("enable promiscuous mode (SAP) of vnic device %s "
		    "failed\n", devname));
	}

	sc->vsc_dlpifd = dlpi_fd(sc->vsc_dhp);

	if (fcntl(sc->vsc_dlpifd, F_SETFL, O_NONBLOCK) < 0) {
		WPRINTF(("enable O_NONBLOCK of vnic device %s failed\n",
		    devname));
		dlpi_close(sc->vsc_dhp);
		sc->vsc_dlpifd = -1;
	}

	error = pthread_create(NULL, NULL, pci_vtnet_poll_thread, sc);
	assert(error == 0);
#endif
}

#ifdef __FreeBSD__
static void
pci_vtnet_netmap_setup(struct pci_vtnet_softc *sc, char *ifname)
{
	sc->pci_vtnet_rx = pci_vtnet_netmap_rx;
	sc->pci_vtnet_tx = pci_vtnet_netmap_tx;

	sc->vsc_nmd = nm_open(ifname, NULL, 0, 0);
	if (sc->vsc_nmd == NULL) {
		WPRINTF(("open of netmap device %s failed\n", ifname));
		return;
	}

	sc->vsc_mevp = mevent_add(sc->vsc_nmd->fd,
				  EVF_READ,
				  pci_vtnet_rx_callback,
				  sc);
	if (sc->vsc_mevp == NULL) {
		WPRINTF(("Could not register event\n"));
		nm_close(sc->vsc_nmd);
		sc->vsc_nmd = NULL;
	}
}
#endif /* __FreeBSD__ */

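/*
 * Device model initialization: set up the softc and virtqueues, parse
 * the device configuration, attach the backend, and initialize the
 * virtio and PCI config-space state.
 */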
static int
pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
{
	struct pci_vtnet_softc *sc;
	const char *value;
	char tname[MAXCOMLEN + 1];
#ifdef __FreeBSD__
	unsigned long mtu = ETHERMTU;
#else
	int use_msix = 1;
#endif
	int err;

	/*
	 * Allocate data structures for further virtio initializations.
	 * sc also contains a copy of vtnet_vi_consts, since capabilities
	 * change depending on the backend.
	 */
	sc = calloc(1, sizeof(struct pci_vtnet_softc));

	sc->vsc_consts = vtnet_vi_consts;
	pthread_mutex_init(&sc->vsc_mtx, NULL);

	sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ;
	sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq;
	sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ;
	sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq;
#ifdef notyet
	sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ;
	sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq;
#endif

	value = get_config_value_node(nvl, "mac");
	if (value != NULL) {
		err = net_parsemac(value, sc->vsc_config.mac);
		if (err) {
			free(sc);
			return (err);
		}
	} else
		net_genmac(pi, sc->vsc_config.mac);

#ifdef __FreeBSD__
	value = get_config_value_node(nvl, "mtu");
	if (value != NULL) {
		err = net_parsemtu(value, &mtu);
		if (err) {
			free(sc);
			return (err);
		}
		if (mtu < VTNET_MIN_MTU || mtu > VTNET_MAX_MTU) {
			err = EINVAL;
			errno = EINVAL;
			free(sc);
			return (err);
		}
		sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MTU;
	}
#endif

	/* Permit interfaces without a configured backend. */
	if (get_config_value_node(nvl, "backend") != NULL) {
#ifdef __FreeBSD__
		err = netbe_init(&sc->vsc_be, nvl, pci_vtnet_rx_callback, sc);
		if (err) {
			free(sc);
			return (err);
		}
#else
		pci_vtnet_tap_setup(sc, get_config_value_node(nvl, "backend"));
#endif
	}

#ifdef __FreeBSD__
	sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MRG_RXBUF |
	    netbe_get_cap(sc->vsc_be);
#endif

	/*
	 * Since we do not actually support multiqueue,
	 * set the maximum virtqueue pairs to 1.
	 */
	sc->vsc_config.max_virtqueue_pairs = 1;

	/* initialize config space */
	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_ID_NETWORK);
	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);

	/* Link is always up. */
	sc->vsc_config.status = 1;

	/* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */
	if (vi_intr_init(&sc->vsc_vs, 1, use_msix))
		return (1);

	/* use BAR 0 to map config regs in IO space */
	vi_set_io_bar(&sc->vsc_vs, 0);

	sc->resetting = 0;

	sc->rx_merge = 1;
	sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);
	pthread_mutex_init(&sc->rx_mtx, NULL);

	/*
	 * Initialize tx semaphore & spawn TX processing thread.
	 * As of now, only one thread for TX desc processing is
	 * spawned.
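	 * The thread sleeps on tx_cond and is woken by
	 * pci_vtnet_ping_txq() whenever the guest kicks the TX queue.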
	 */
	sc->tx_in_progress = 0;
	pthread_mutex_init(&sc->tx_mtx, NULL);
	pthread_cond_init(&sc->tx_cond, NULL);
	pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc);
	snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot,
	    pi->pi_func);
	pthread_set_name_np(sc->tx_tid, tname);

	return (0);
}

static int
pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value)
{
	struct pci_vtnet_softc *sc = vsc;
	void *ptr;

	if (offset < 6) {
		assert(offset + size <= 6);
		/*
		 * The driver is allowed to change the MAC address
		 */
		ptr = &sc->vsc_config.mac[offset];
		memcpy(ptr, &value, size);
	} else {
		/* silently ignore other writes */
		DPRINTF(("vtnet: write to readonly reg %d", offset));
	}

	return (0);
}

static int
pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval)
{
	struct pci_vtnet_softc *sc = vsc;
	void *ptr;

	ptr = (uint8_t *)&sc->vsc_config + offset;
	memcpy(retval, ptr, size);
	return (0);
}

static void
pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features)
{
	struct pci_vtnet_softc *sc = vsc;

	sc->vsc_features = negotiated_features;

	if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) {
		sc->rx_merge = 0;
		/* non-merge rx header is 2 bytes shorter */
		sc->rx_vhdrlen -= 2;
	}

	pthread_mutex_lock(&sc->rx_mtx);
	sc->features_negotiated = true;
	pthread_mutex_unlock(&sc->rx_mtx);
}

#ifndef __FreeBSD__
static int
pci_vtnet_legacy_config(nvlist_t *nvl, const char *opt)
{
	char *config, *name, *tofree, *value;

	if (opt == NULL)
		return (0);

	config = tofree = strdup(opt);
	while ((name = strsep(&config, ",")) != NULL) {
		value = strchr(name, '=');
		if (value != NULL) {
			*value++ = '\0';
			set_config_value_node(nvl, name, value);
		} else {
			set_config_value_node(nvl, "backend", name);
		}
	}
	free(tofree);
	return (0);
}
#endif

struct pci_devemu pci_de_vnet = {
	.pe_emu =	"virtio-net",
	.pe_init =	pci_vtnet_init,
#ifdef __FreeBSD__
	.pe_legacy_config = netbe_legacy_config,
#else
	.pe_legacy_config = pci_vtnet_legacy_config,
#endif
	.pe_barwrite =	vi_pci_write,
	.pe_barread =	vi_pci_read
};
PCI_EMUL_SET(pci_de_vnet);