/* A network driver using virtio.
 *
 * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
//#define DEBUG
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_net.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <linux/scatterlist.h>
#include <linux/if_vlan.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/average.h>
#include <linux/filter.h>
#include <linux/pci.h>
#include <net/route.h>
#include <net/xdp.h>
#include <net/net_failover.h>

static int napi_weight = NAPI_POLL_WEIGHT;
module_param(napi_weight, int, 0444);

static bool csum = true, gso = true, napi_tx;
module_param(csum, bool, 0444);
module_param(gso, bool, 0444);
module_param(napi_tx, bool, 0644);

/* FIXME: MTU in config. */
#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
#define GOOD_COPY_LEN	128

#define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

/* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
#define VIRTIO_XDP_HEADROOM 256

/* Separating two types of XDP xmit */
#define VIRTIO_XDP_TX		BIT(0)
#define VIRTIO_XDP_REDIR	BIT(1)

/* RX packet size EWMA. The average packet size is used to determine the packet
 * buffer size when refilling RX rings. As the entire RX ring may be refilled
 * at once, the weight is chosen so that the EWMA will be insensitive to short-
 * term, transient changes in packet size.
 */
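/* For reference: DECLARE_EWMA(pkt_len, 0, 64) from <linux/average.h>
 * generates struct ewma_pkt_len and its init/add/read helpers as used
 * below. A weight of 64 means, roughly, avg += (sample - avg) / 64, so
 * even a full ring's worth of unusually small or large packets moves
 * the average only slightly.
 */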
DECLARE_EWMA(pkt_len, 0, 64)

#define VIRTNET_DRIVER_VERSION "1.0.0"

static const unsigned long guest_offloads[] = {
	VIRTIO_NET_F_GUEST_TSO4,
	VIRTIO_NET_F_GUEST_TSO6,
	VIRTIO_NET_F_GUEST_ECN,
	VIRTIO_NET_F_GUEST_UFO
};

struct virtnet_stat_desc {
	char desc[ETH_GSTRING_LEN];
	size_t offset;
};

struct virtnet_sq_stats {
	struct u64_stats_sync syncp;
	u64 packets;
	u64 bytes;
};

struct virtnet_rq_stats {
	struct u64_stats_sync syncp;
	u64 packets;
	u64 bytes;
};

#define VIRTNET_SQ_STAT(m)	offsetof(struct virtnet_sq_stats, m)
#define VIRTNET_RQ_STAT(m)	offsetof(struct virtnet_rq_stats, m)

static const struct virtnet_stat_desc virtnet_sq_stats_desc[] = {
	{ "packets", VIRTNET_SQ_STAT(packets) },
	{ "bytes", VIRTNET_SQ_STAT(bytes) },
};

static const struct virtnet_stat_desc virtnet_rq_stats_desc[] = {
	{ "packets", VIRTNET_RQ_STAT(packets) },
	{ "bytes", VIRTNET_RQ_STAT(bytes) },
};

#define VIRTNET_SQ_STATS_LEN	ARRAY_SIZE(virtnet_sq_stats_desc)
#define VIRTNET_RQ_STATS_LEN	ARRAY_SIZE(virtnet_rq_stats_desc)

/* Internal representation of a send virtqueue */
struct send_queue {
	/* Virtqueue associated with this send_queue */
	struct virtqueue *vq;

	/* TX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Name of the send queue: output.$index */
	char name[40];

	struct virtnet_sq_stats stats;

	struct napi_struct napi;
};

/* Internal representation of a receive virtqueue */
struct receive_queue {
	/* Virtqueue associated with this receive_queue */
	struct virtqueue *vq;

	struct napi_struct napi;

	struct bpf_prog __rcu *xdp_prog;

	struct virtnet_rq_stats stats;

	/* Chain pages by the private ptr. */
	struct page *pages;

	/* Average packet length for mergeable receive buffers. */
	struct ewma_pkt_len mrg_avg_pkt_len;

	/* Page frag for packet buffer allocation. */
	struct page_frag alloc_frag;

	/* RX: fragments + linear part + virtio header */
	struct scatterlist sg[MAX_SKB_FRAGS + 2];

	/* Min single buffer size for mergeable buffers case. */
	unsigned int min_buf_len;

	/* Name of this receive queue: input.$index */
	char name[40];

	struct xdp_rxq_info xdp_rxq;
};

/* Control VQ buffers: protected by the rtnl lock */
struct control_buf {
	struct virtio_net_ctrl_hdr hdr;
	virtio_net_ctrl_ack status;
	struct virtio_net_ctrl_mq mq;
	u8 promisc;
	u8 allmulti;
	__virtio16 vid;
	__virtio64 offloads;
};
struct virtnet_info {
	struct virtio_device *vdev;
	struct virtqueue *cvq;
	struct net_device *dev;
	struct send_queue *sq;
	struct receive_queue *rq;
	unsigned int status;

	/* Max # of queue pairs supported by the device */
	u16 max_queue_pairs;

	/* # of queue pairs currently used by the driver */
	u16 curr_queue_pairs;

	/* # of XDP queue pairs currently used by the driver */
	u16 xdp_queue_pairs;

	/* I like... big packets and I cannot lie! */
	bool big_packets;

	/* Host will merge rx buffers for big packets (shake it! shake it!) */
	bool mergeable_rx_bufs;

	/* Has control virtqueue */
	bool has_cvq;

	/* Host can handle any s/g split between our header and packet data */
	bool any_header_sg;

	/* Packet virtio header size */
	u8 hdr_len;

	/* Work struct for refilling if we run low on memory. */
	struct delayed_work refill;

	/* Work struct for config space updates */
	struct work_struct config_work;

	/* Is the affinity hint set for virtqueues? */
	bool affinity_hint_set;

	/* CPU hotplug instances for online & dead */
	struct hlist_node node;
	struct hlist_node node_dead;

	struct control_buf *ctrl;

	/* Ethtool settings */
	u8 duplex;
	u32 speed;

	unsigned long guest_offloads;

	/* failover when STANDBY feature enabled */
	struct failover *failover;
};

struct padded_vnet_hdr {
	struct virtio_net_hdr_mrg_rxbuf hdr;
	/*
	 * hdr is in a separate sg buffer, and data sg buffer shares same page
	 * with this header sg. This padding makes next sg 16 byte aligned
	 * after the header.
	 */
	char padding[4];
};

/* Converting between virtqueue no. and kernel tx/rx queue no.
 * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 */
static int vq2txq(struct virtqueue *vq)
{
	return (vq->index - 1) / 2;
}

static int txq2vq(int txq)
{
	return txq * 2 + 1;
}

static int vq2rxq(struct virtqueue *vq)
{
	return vq->index / 2;
}

static int rxq2vq(int rxq)
{
	return rxq * 2;
}

static inline struct virtio_net_hdr_mrg_rxbuf *skb_vnet_hdr(struct sk_buff *skb)
{
	return (struct virtio_net_hdr_mrg_rxbuf *)skb->cb;
}

/*
 * private is used to chain pages for big packets, put the whole
 * most recently used list at the beginning for reuse
 */
static void give_pages(struct receive_queue *rq, struct page *page)
{
	struct page *end;

	/* Find end of list, sew whole thing into vi->rq.pages. */
	for (end = page; end->private; end = (struct page *)end->private);
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
}

static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
{
	struct page *p = rq->pages;

	if (p) {
		rq->pages = (struct page *)p->private;
		/* clear private here, it is used to chain pages */
		p->private = 0;
	} else
		p = alloc_page(gfp_mask);
	return p;
}

static void virtqueue_napi_schedule(struct napi_struct *napi,
				    struct virtqueue *vq)
{
	if (napi_schedule_prep(napi)) {
		virtqueue_disable_cb(vq);
		__napi_schedule(napi);
	}
}

static void virtqueue_napi_complete(struct napi_struct *napi,
				    struct virtqueue *vq, int processed)
{
	int opaque;

	opaque = virtqueue_enable_cb_prepare(vq);
	if (napi_complete_done(napi, processed)) {
		if (unlikely(virtqueue_poll(vq, opaque)))
			virtqueue_napi_schedule(napi, vq);
	} else {
		virtqueue_disable_cb(vq);
	}
}

static void skb_xmit_done(struct virtqueue *vq)
{
	struct virtnet_info *vi = vq->vdev->priv;
	struct napi_struct *napi = &vi->sq[vq2txq(vq)].napi;

	/* Suppress further interrupts. */
	virtqueue_disable_cb(vq);

	if (napi->weight)
		virtqueue_napi_schedule(napi, vq);
	else
		/* We were probably waiting for more output buffers. */
		netif_wake_subqueue(vi->dev, vq2txq(vq));
}
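/* The mergeable-buffer context packs two values into one pointer-sized
 * word: the buffer headroom in the bits above MRG_CTX_HEADER_SHIFT and
 * the truesize in the low 22 bits. As an illustrative example,
 * truesize = 1536 with headroom = 256 encodes to (256 << 22) | 1536;
 * the helpers below recover each field.
 */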
#define MRG_CTX_HEADER_SHIFT 22
static void *mergeable_len_to_ctx(unsigned int truesize,
				  unsigned int headroom)
{
	return (void *)(unsigned long)((headroom << MRG_CTX_HEADER_SHIFT) | truesize);
}

static unsigned int mergeable_ctx_to_headroom(void *mrg_ctx)
{
	return (unsigned long)mrg_ctx >> MRG_CTX_HEADER_SHIFT;
}

static unsigned int mergeable_ctx_to_truesize(void *mrg_ctx)
{
	return (unsigned long)mrg_ctx & ((1 << MRG_CTX_HEADER_SHIFT) - 1);
}

/* Called from bottom half context */
static struct sk_buff *page_to_skb(struct virtnet_info *vi,
				   struct receive_queue *rq,
				   struct page *page, unsigned int offset,
				   unsigned int len, unsigned int truesize)
{
	struct sk_buff *skb;
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	unsigned int copy, hdr_len, hdr_padded_len;
	char *p;

	p = page_address(page) + offset;

	/* copy small packet so we can reuse these pages for small data */
	skb = napi_alloc_skb(&rq->napi, GOOD_COPY_LEN);
	if (unlikely(!skb))
		return NULL;

	hdr = skb_vnet_hdr(skb);

	hdr_len = vi->hdr_len;
	if (vi->mergeable_rx_bufs)
		hdr_padded_len = sizeof(*hdr);
	else
		hdr_padded_len = sizeof(struct padded_vnet_hdr);

	memcpy(hdr, p, hdr_len);

	len -= hdr_len;
	offset += hdr_padded_len;
	p += hdr_padded_len;

	copy = len;
	if (copy > skb_tailroom(skb))
		copy = skb_tailroom(skb);
	skb_put_data(skb, p, copy);

	len -= copy;
	offset += copy;

	if (vi->mergeable_rx_bufs) {
		if (len)
			skb_add_rx_frag(skb, 0, page, offset, len, truesize);
		else
			put_page(page);
		return skb;
	}

	/*
	 * Verify that we can indeed put this data into a skb.
	 * This is here to handle cases when the device erroneously
	 * tries to receive more than is possible. This is usually
	 * the case of a broken device.
	 */
	if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
		net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
		dev_kfree_skb(skb);
		return NULL;
	}
	BUG_ON(offset >= PAGE_SIZE);
	while (len) {
		unsigned int frag_size = min((unsigned)PAGE_SIZE - offset, len);
		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset,
				frag_size, truesize);
		len -= frag_size;
		page = (struct page *)page->private;
		offset = 0;
	}

	if (page)
		give_pages(rq, page);

	return skb;
}
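/* For XDP transmission the virtio-net header is written into the frame's
 * own headroom, immediately before xdpf->data, so a frame laid out as
 *
 *     [ headroom | data ... data_end ]
 *
 * goes out as a single scatterlist entry covering
 *
 *     [ vnet hdr | data ... data_end ]
 *
 * which is why __virtnet_xdp_xmit_one() below insists on at least
 * vi->hdr_len bytes of headroom.
 */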
static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
				  struct send_queue *sq,
				  struct xdp_frame *xdpf)
{
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	int err;

	/* virtqueue wants to use the data area in front of the packet */
	if (unlikely(xdpf->metasize > 0))
		return -EOPNOTSUPP;

	if (unlikely(xdpf->headroom < vi->hdr_len))
		return -EOVERFLOW;

	/* Make room for virtqueue hdr (also change xdpf->headroom?) */
	xdpf->data -= vi->hdr_len;
	/* Zero header and leave csum up to XDP layers */
	hdr = xdpf->data;
	memset(hdr, 0, vi->hdr_len);
	xdpf->len += vi->hdr_len;

	sg_init_one(sq->sg, xdpf->data, xdpf->len);

	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdpf, GFP_ATOMIC);
	if (unlikely(err))
		return -ENOSPC; /* Caller handles free/refcnt */

	return 0;
}

static int __virtnet_xdp_tx_xmit(struct virtnet_info *vi,
				 struct xdp_frame *xdpf)
{
	struct xdp_frame *xdpf_sent;
	struct send_queue *sq;
	unsigned int len;
	unsigned int qp;

	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
	sq = &vi->sq[qp];

	/* Free up any pending old buffers before queueing new ones. */
	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
		xdp_return_frame(xdpf_sent);

	return __virtnet_xdp_xmit_one(vi, sq, xdpf);
}

static int virtnet_xdp_xmit(struct net_device *dev,
			    int n, struct xdp_frame **frames, u32 flags)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct receive_queue *rq = vi->rq;
	struct xdp_frame *xdpf_sent;
	struct bpf_prog *xdp_prog;
	struct send_queue *sq;
	unsigned int len;
	unsigned int qp;
	int drops = 0;
	int err;
	int i;

	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
		return -EINVAL;

	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
	sq = &vi->sq[qp];

	/* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
	 * indicates XDP resources have been successfully allocated.
	 */
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (!xdp_prog)
		return -ENXIO;

	/* Free up any pending old buffers before queueing new ones. */
	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
		xdp_return_frame(xdpf_sent);

	for (i = 0; i < n; i++) {
		struct xdp_frame *xdpf = frames[i];

		err = __virtnet_xdp_xmit_one(vi, sq, xdpf);
		if (err) {
			xdp_return_frame_rx_napi(xdpf);
			drops++;
		}
	}

	if (flags & XDP_XMIT_FLUSH)
		virtqueue_kick(sq->vq);

	return n - drops;
}

static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
{
	return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
}
/* We copy the packet for XDP in the following cases:
 *
 * 1) Packet is scattered across multiple rx buffers.
 * 2) Headroom space is insufficient.
 *
 * This is inefficient but it's a temporary condition that
 * we hit right after XDP is enabled and until queue is refilled
 * with large buffers with sufficient headroom - so it should affect
 * at most queue size packets. Afterwards, the conditions to enable
 * XDP should preclude the underlying device from sending packets
 * across multiple buffers (num_buf > 1), and we make sure buffers
 * have enough headroom.
 */
static struct page *xdp_linearize_page(struct receive_queue *rq,
				       u16 *num_buf,
				       struct page *p,
				       int offset,
				       int page_off,
				       unsigned int *len)
{
	struct page *page = alloc_page(GFP_ATOMIC);

	if (!page)
		return NULL;

	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
	page_off += *len;

	while (--*num_buf) {
		int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		unsigned int buflen;
		void *buf;
		int off;

		buf = virtqueue_get_buf(rq->vq, &buflen);
		if (unlikely(!buf))
			goto err_buf;

		p = virt_to_head_page(buf);
		off = buf - page_address(p);

		/* guard against a misconfigured or uncooperative backend that
		 * is sending packets larger than the MTU.
		 */
		if ((page_off + buflen + tailroom) > PAGE_SIZE) {
			put_page(p);
			goto err_buf;
		}

		memcpy(page_address(page) + page_off,
		       page_address(p) + off, buflen);
		page_off += buflen;
		put_page(p);
	}

	/* Headroom does not contribute to packet length */
	*len = page_off - VIRTIO_XDP_HEADROOM;
	return page;
err_buf:
	__free_pages(page, 0);
	return NULL;
}
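/* receive_small() below handles the single-buffer layout: the context
 * stored at refill time (see add_recvbuf_small()) is simply the headroom
 * that was reserved in front of the packet, cast to a pointer and back.
 */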
static struct sk_buff *receive_small(struct net_device *dev,
				     struct virtnet_info *vi,
				     struct receive_queue *rq,
				     void *buf, void *ctx,
				     unsigned int len,
				     unsigned int *xdp_xmit)
{
	struct sk_buff *skb;
	struct bpf_prog *xdp_prog;
	unsigned int xdp_headroom = (unsigned long)ctx;
	unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
	unsigned int headroom = vi->hdr_len + header_offset;
	unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	struct page *page = virt_to_head_page(buf);
	unsigned int delta = 0;
	struct page *xdp_page;
	int err;

	len -= vi->hdr_len;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
		struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
		struct xdp_frame *xdpf;
		struct xdp_buff xdp;
		void *orig_data;
		u32 act;

		if (unlikely(hdr->hdr.gso_type))
			goto err_xdp;

		if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) {
			int offset = buf - page_address(page) + header_offset;
			unsigned int tlen = len + vi->hdr_len;
			u16 num_buf = 1;

			xdp_headroom = virtnet_get_headroom(vi);
			header_offset = VIRTNET_RX_PAD + xdp_headroom;
			headroom = vi->hdr_len + header_offset;
			buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
				 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
			xdp_page = xdp_linearize_page(rq, &num_buf, page,
						      offset, header_offset,
						      &tlen);
			if (!xdp_page)
				goto err_xdp;

			buf = page_address(xdp_page);
			put_page(page);
			page = xdp_page;
		}

		xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
		xdp.data = xdp.data_hard_start + xdp_headroom;
		xdp_set_data_meta_invalid(&xdp);
		xdp.data_end = xdp.data + len;
		xdp.rxq = &rq->xdp_rxq;
		orig_data = xdp.data;
		act = bpf_prog_run_xdp(xdp_prog, &xdp);

		switch (act) {
		case XDP_PASS:
			/* Recalculate length in case bpf program changed it */
			delta = orig_data - xdp.data;
			len = xdp.data_end - xdp.data;
			break;
		case XDP_TX:
			xdpf = convert_to_xdp_frame(&xdp);
			if (unlikely(!xdpf))
				goto err_xdp;
			err = __virtnet_xdp_tx_xmit(vi, xdpf);
			if (unlikely(err)) {
				trace_xdp_exception(vi->dev, xdp_prog, act);
				goto err_xdp;
			}
			*xdp_xmit |= VIRTIO_XDP_TX;
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
			err = xdp_do_redirect(dev, &xdp, xdp_prog);
			if (err)
				goto err_xdp;
			*xdp_xmit |= VIRTIO_XDP_REDIR;
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(act);
			/* fall through */
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
			/* fall through */
		case XDP_DROP:
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	skb = build_skb(buf, buflen);
	if (!skb) {
		put_page(page);
		goto err;
	}
	skb_reserve(skb, headroom - delta);
	skb_put(skb, len);
	if (!delta) {
		buf += header_offset;
		memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
	} /* keep zeroed vnet hdr since packet was changed by bpf */

err:
	return skb;

err_xdp:
	rcu_read_unlock();
	dev->stats.rx_dropped++;
	put_page(page);
xdp_xmit:
	return NULL;
}

static struct sk_buff *receive_big(struct net_device *dev,
				   struct virtnet_info *vi,
				   struct receive_queue *rq,
				   void *buf,
				   unsigned int len)
{
	struct page *page = buf;
	struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);

	if (unlikely(!skb))
		goto err;

	return skb;

err:
	dev->stats.rx_dropped++;
	give_pages(rq, page);
	return NULL;
}
static struct sk_buff *receive_mergeable(struct net_device *dev,
					 struct virtnet_info *vi,
					 struct receive_queue *rq,
					 void *buf,
					 void *ctx,
					 unsigned int len,
					 unsigned int *xdp_xmit)
{
	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
	struct page *page = virt_to_head_page(buf);
	int offset = buf - page_address(page);
	struct sk_buff *head_skb, *curr_skb;
	struct bpf_prog *xdp_prog;
	unsigned int truesize;
	unsigned int headroom = mergeable_ctx_to_headroom(ctx);
	int err;

	head_skb = NULL;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (xdp_prog) {
		struct xdp_frame *xdpf;
		struct page *xdp_page;
		struct xdp_buff xdp;
		void *data;
		u32 act;

		/* Transient failure which in theory could occur if
		 * in-flight packets from before XDP was enabled reach
		 * the receive path after XDP is loaded.
		 */
		if (unlikely(hdr->hdr.gso_type))
			goto err_xdp;

		/* This happens when rx buffer size is underestimated
		 * or headroom is not enough because the buffer was
		 * refilled before XDP was set. This should only
		 * happen for the first several packets, so we don't
		 * care much about its performance.
		 */
		if (unlikely(num_buf > 1 ||
			     headroom < virtnet_get_headroom(vi))) {
			/* linearize data for XDP */
			xdp_page = xdp_linearize_page(rq, &num_buf,
						      page, offset,
						      VIRTIO_XDP_HEADROOM,
						      &len);
			if (!xdp_page)
				goto err_xdp;
			offset = VIRTIO_XDP_HEADROOM;
		} else {
			xdp_page = page;
		}

		/* Allow consuming headroom but reserve enough space to push
		 * the descriptor on if we get an XDP_TX return code.
		 */
		data = page_address(xdp_page) + offset;
		xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
		xdp.data = data + vi->hdr_len;
		xdp_set_data_meta_invalid(&xdp);
		xdp.data_end = xdp.data + (len - vi->hdr_len);
		xdp.rxq = &rq->xdp_rxq;

		act = bpf_prog_run_xdp(xdp_prog, &xdp);

		switch (act) {
		case XDP_PASS:
			/* recalculate offset to account for any header
			 * adjustments. Note other cases do not build an
			 * skb and avoid using offset
			 */
			offset = xdp.data -
					page_address(xdp_page) - vi->hdr_len;

			/* recalculate len if xdp.data or xdp.data_end were
			 * adjusted
			 */
			len = xdp.data_end - xdp.data + vi->hdr_len;
			/* We can only create skb based on xdp_page. */
			if (unlikely(xdp_page != page)) {
				rcu_read_unlock();
				put_page(page);
				head_skb = page_to_skb(vi, rq, xdp_page,
						       offset, len, PAGE_SIZE);
				return head_skb;
			}
			break;
		case XDP_TX:
			xdpf = convert_to_xdp_frame(&xdp);
			if (unlikely(!xdpf))
				goto err_xdp;
			err = __virtnet_xdp_tx_xmit(vi, xdpf);
			if (unlikely(err)) {
				trace_xdp_exception(vi->dev, xdp_prog, act);
				if (unlikely(xdp_page != page))
					put_page(xdp_page);
				goto err_xdp;
			}
			*xdp_xmit |= VIRTIO_XDP_TX;
			if (unlikely(xdp_page != page))
				put_page(page);
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
			err = xdp_do_redirect(dev, &xdp, xdp_prog);
			if (err) {
				if (unlikely(xdp_page != page))
					put_page(xdp_page);
				goto err_xdp;
			}
			*xdp_xmit |= VIRTIO_XDP_REDIR;
			if (unlikely(xdp_page != page))
				put_page(page);
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(act);
			/* fall through */
		case XDP_ABORTED:
			trace_xdp_exception(vi->dev, xdp_prog, act);
			/* fall through */
		case XDP_DROP:
			if (unlikely(xdp_page != page))
				__free_pages(xdp_page, 0);
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	truesize = mergeable_ctx_to_truesize(ctx);
	if (unlikely(len > truesize)) {
		pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
			 dev->name, len, (unsigned long)ctx);
		dev->stats.rx_length_errors++;
		goto err_skb;
	}

	head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
	curr_skb = head_skb;

	if (unlikely(!curr_skb))
		goto err_skb;
	while (--num_buf) {
		int num_skb_frags;

		buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx);
		if (unlikely(!buf)) {
			pr_debug("%s: rx error: %d buffers out of %d missing\n",
				 dev->name, num_buf,
				 virtio16_to_cpu(vi->vdev,
						 hdr->num_buffers));
			dev->stats.rx_length_errors++;
			goto err_buf;
		}

		page = virt_to_head_page(buf);

		truesize = mergeable_ctx_to_truesize(ctx);
		if (unlikely(len > truesize)) {
			pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
				 dev->name, len, (unsigned long)ctx);
			dev->stats.rx_length_errors++;
			goto err_skb;
		}

		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);

			if (unlikely(!nskb))
				goto err_skb;
			if (curr_skb == head_skb)
				skb_shinfo(curr_skb)->frag_list = nskb;
			else
				curr_skb->next = nskb;
			curr_skb = nskb;
			head_skb->truesize += nskb->truesize;
			num_skb_frags = 0;
		}
		if (curr_skb != head_skb) {
			head_skb->data_len += len;
			head_skb->len += len;
			head_skb->truesize += truesize;
		}
		offset = buf - page_address(page);
		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
			put_page(page);
			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
					     len, truesize);
		} else {
			skb_add_rx_frag(curr_skb, num_skb_frags, page,
					offset, len, truesize);
		}
	}

	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
	return head_skb;

err_xdp:
	rcu_read_unlock();
err_skb:
	put_page(page);
	while (num_buf-- > 1) {
		buf = virtqueue_get_buf(rq->vq, &len);
		if (unlikely(!buf)) {
			pr_debug("%s: rx error: %d buffers missing\n",
				 dev->name, num_buf);
			dev->stats.rx_length_errors++;
			break;
		}
		page = virt_to_head_page(buf);
		put_page(page);
	}
err_buf:
	dev->stats.rx_dropped++;
	dev_kfree_skb(head_skb);
xdp_xmit:
	return NULL;
}
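/* receive_buf() dispatches to one of the three receive paths above and
 * returns the number of bytes received (0 on drop/error); the caller
 * accumulates that into the per-queue byte counter.
 */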
static int receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
		       void *buf, unsigned int len, void **ctx,
		       unsigned int *xdp_xmit)
{
	struct net_device *dev = vi->dev;
	struct sk_buff *skb;
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	int ret;

	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
		pr_debug("%s: short packet %i\n", dev->name, len);
		dev->stats.rx_length_errors++;
		if (vi->mergeable_rx_bufs) {
			put_page(virt_to_head_page(buf));
		} else if (vi->big_packets) {
			give_pages(rq, buf);
		} else {
			put_page(virt_to_head_page(buf));
		}
		return 0;
	}

	if (vi->mergeable_rx_bufs)
		skb = receive_mergeable(dev, vi, rq, buf, ctx, len, xdp_xmit);
	else if (vi->big_packets)
		skb = receive_big(dev, vi, rq, buf, len);
	else
		skb = receive_small(dev, vi, rq, buf, ctx, len, xdp_xmit);

	if (unlikely(!skb))
		return 0;

	hdr = skb_vnet_hdr(skb);

	ret = skb->len;

	if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID)
		skb->ip_summed = CHECKSUM_UNNECESSARY;

	if (virtio_net_hdr_to_skb(skb, &hdr->hdr,
				  virtio_is_little_endian(vi->vdev))) {
		net_warn_ratelimited("%s: bad gso: type: %u, size: %u\n",
				     dev->name, hdr->hdr.gso_type,
				     hdr->hdr.gso_size);
		goto frame_err;
	}

	skb->protocol = eth_type_trans(skb, dev);
	pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
		 ntohs(skb->protocol), skb->len, skb->pkt_type);

	napi_gro_receive(&rq->napi, skb);
	return ret;

frame_err:
	dev->stats.rx_frame_errors++;
	dev_kfree_skb(skb);
	return 0;
}
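/* Sketch of the small-buffer layout allocated below (field widths are
 * configuration dependent):
 *
 *   +----------------+--------------+----------+-------------+
 *   | VIRTNET_RX_PAD | xdp headroom | vnet hdr | packet data |
 *   +----------------+--------------+----------+-------------+
 *                                   ^ start of the sg entry
 *
 * followed by enough tailroom for struct skb_shared_info, as
 * build_skb() in receive_small() expects.
 */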
/* Unlike mergeable buffers, all buffers are allocated to the
 * same size, except for the headroom. For this reason we do
 * not need to use mergeable_len_to_ctx here - it is enough
 * to store the headroom as the context ignoring the truesize.
 */
static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
			     gfp_t gfp)
{
	struct page_frag *alloc_frag = &rq->alloc_frag;
	char *buf;
	unsigned int xdp_headroom = virtnet_get_headroom(vi);
	void *ctx = (void *)(unsigned long)xdp_headroom;
	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
	int err;

	len = SKB_DATA_ALIGN(len) +
	      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
		return -ENOMEM;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	get_page(alloc_frag->page);
	alloc_frag->offset += len;
	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
		    vi->hdr_len + GOOD_PACKET_LEN);
	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
	if (err < 0)
		put_page(virt_to_head_page(buf));
	return err;
}

static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
			   gfp_t gfp)
{
	struct page *first, *list = NULL;
	char *p;
	int i, err, offset;

	sg_init_table(rq->sg, MAX_SKB_FRAGS + 2);

	/* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
	for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
		first = get_a_page(rq, gfp);
		if (!first) {
			if (list)
				give_pages(rq, list);
			return -ENOMEM;
		}
		sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);

		/* chain new page in list head to match sg */
		first->private = (unsigned long)list;
		list = first;
	}

	first = get_a_page(rq, gfp);
	if (!first) {
		give_pages(rq, list);
		return -ENOMEM;
	}
	p = page_address(first);

	/* rq->sg[0], rq->sg[1] share the same page */
	/* a separate rq->sg[0] for header - required in case !any_header_sg */
	sg_set_buf(&rq->sg[0], p, vi->hdr_len);

	/* rq->sg[1] for data packet, from offset */
	offset = sizeof(struct padded_vnet_hdr);
	sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);

	/* chain first in list head */
	first->private = (unsigned long)list;
	err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
				  first, gfp);
	if (err < 0)
		give_pages(rq, first);

	return err;
}

static unsigned int get_mergeable_buf_len(struct receive_queue *rq,
					  struct ewma_pkt_len *avg_pkt_len,
					  unsigned int room)
{
	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	unsigned int len;

	if (room)
		return PAGE_SIZE - room;

	len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
				rq->min_buf_len, PAGE_SIZE - hdr_len);

	return ALIGN(len, L1_CACHE_BYTES);
}
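/* Illustrative numbers only: with no XDP headroom (room == 0), an EWMA
 * packet length of 1400 and L1_CACHE_BYTES == 64, get_mergeable_buf_len()
 * above returns ALIGN(12 + 1400, 64) = 1472 bytes, where 12 is
 * sizeof(struct virtio_net_hdr_mrg_rxbuf).
 */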
static int add_recvbuf_mergeable(struct virtnet_info *vi,
				 struct receive_queue *rq, gfp_t gfp)
{
	struct page_frag *alloc_frag = &rq->alloc_frag;
	unsigned int headroom = virtnet_get_headroom(vi);
	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
	unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
	char *buf;
	void *ctx;
	int err;
	unsigned int len, hole;

	/* Extra tailroom is needed to satisfy XDP's assumption. This
	 * means rx frags coalescing won't work, but since we've
	 * disabled GSO for XDP, it won't be a big issue.
	 */
	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
	if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
		return -ENOMEM;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	buf += headroom; /* advance address leaving hole at front of pkt */
	get_page(alloc_frag->page);
	alloc_frag->offset += len + room;
	hole = alloc_frag->size - alloc_frag->offset;
	if (hole < len + room) {
		/* To avoid internal fragmentation, if there is very likely not
		 * enough space for another buffer, add the remaining space to
		 * the current buffer.
		 */
		len += hole;
		alloc_frag->offset += hole;
	}

	sg_init_one(rq->sg, buf, len);
	ctx = mergeable_len_to_ctx(len, headroom);
	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
	if (err < 0)
		put_page(virt_to_head_page(buf));

	return err;
}

/*
 * Returns false if we couldn't fill entirely (OOM).
 *
 * Normally run in the receive path, but can also be run from ndo_open
 * before we're receiving packets, or from refill_work which is
 * careful to disable receiving (using napi_disable).
 */
static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
			  gfp_t gfp)
{
	int err;
	bool oom;

	do {
		if (vi->mergeable_rx_bufs)
			err = add_recvbuf_mergeable(vi, rq, gfp);
		else if (vi->big_packets)
			err = add_recvbuf_big(vi, rq, gfp);
		else
			err = add_recvbuf_small(vi, rq, gfp);

		oom = err == -ENOMEM;
		if (err)
			break;
	} while (rq->vq->num_free);
	virtqueue_kick(rq->vq);
	return !oom;
}

static void skb_recv_done(struct virtqueue *rvq)
{
	struct virtnet_info *vi = rvq->vdev->priv;
	struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];

	virtqueue_napi_schedule(&rq->napi, rvq);
}

static void virtnet_napi_enable(struct virtqueue *vq, struct napi_struct *napi)
{
	napi_enable(napi);

	/* If all buffers were filled by the other side before napi was
	 * enabled, we won't get another interrupt, so process any
	 * outstanding packets now. Call local_bh_enable after to trigger
	 * softIRQ processing.
	 */
	local_bh_disable();
	virtqueue_napi_schedule(napi, vq);
	local_bh_enable();
}

static void virtnet_napi_tx_enable(struct virtnet_info *vi,
				   struct virtqueue *vq,
				   struct napi_struct *napi)
{
	if (!napi->weight)
		return;

	/* Tx napi touches cachelines on the cpu handling tx interrupts. Only
	 * enable the feature if this is likely affine with the transmit path.
	 */
	if (!vi->affinity_hint_set) {
		napi->weight = 0;
		return;
	}

	return virtnet_napi_enable(vq, napi);
}

static void virtnet_napi_tx_disable(struct napi_struct *napi)
{
	if (napi->weight)
		napi_disable(napi);
}
static void refill_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, refill.work);
	bool still_empty;
	int i;

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct receive_queue *rq = &vi->rq[i];

		napi_disable(&rq->napi);
		still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
		virtnet_napi_enable(rq->vq, &rq->napi);

		/* In theory, this can happen: if we don't get any buffers in
		 * we will *never* try to fill again.
		 */
		if (still_empty)
			schedule_delayed_work(&vi->refill, HZ/2);
	}
}

static int virtnet_receive(struct receive_queue *rq, int budget,
			   unsigned int *xdp_xmit)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	unsigned int len, received = 0, bytes = 0;
	void *buf;

	if (!vi->big_packets || vi->mergeable_rx_bufs) {
		void *ctx;

		while (received < budget &&
		       (buf = virtqueue_get_buf_ctx(rq->vq, &len, &ctx))) {
			bytes += receive_buf(vi, rq, buf, len, ctx, xdp_xmit);
			received++;
		}
	} else {
		while (received < budget &&
		       (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
			bytes += receive_buf(vi, rq, buf, len, NULL, xdp_xmit);
			received++;
		}
	}

	if (rq->vq->num_free > virtqueue_get_vring_size(rq->vq) / 2) {
		if (!try_fill_recv(vi, rq, GFP_ATOMIC))
			schedule_delayed_work(&vi->refill, 0);
	}

	u64_stats_update_begin(&rq->stats.syncp);
	rq->stats.bytes += bytes;
	rq->stats.packets += received;
	u64_stats_update_end(&rq->stats.syncp);

	return received;
}

static void free_old_xmit_skbs(struct send_queue *sq)
{
	struct sk_buff *skb;
	unsigned int len;
	unsigned int packets = 0;
	unsigned int bytes = 0;

	while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
		pr_debug("Sent skb %p\n", skb);

		bytes += skb->len;
		packets++;

		dev_consume_skb_any(skb);
	}

	/* Avoid overhead when no packets have been processed; this
	 * happens when called speculatively from start_xmit.
	 */
	if (!packets)
		return;

	u64_stats_update_begin(&sq->stats.syncp);
	sq->stats.bytes += bytes;
	sq->stats.packets += packets;
	u64_stats_update_end(&sq->stats.syncp);
}

static void virtnet_poll_cleantx(struct receive_queue *rq)
{
	struct virtnet_info *vi = rq->vq->vdev->priv;
	unsigned int index = vq2rxq(rq->vq);
	struct send_queue *sq = &vi->sq[index];
	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, index);

	if (!sq->napi.weight)
		return;

	if (__netif_tx_trylock(txq)) {
		free_old_xmit_skbs(sq);
		__netif_tx_unlock(txq);
	}

	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
		netif_tx_wake_queue(txq);
}

static int virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rq =
		container_of(napi, struct receive_queue, napi);
	struct virtnet_info *vi = rq->vq->vdev->priv;
	struct send_queue *sq;
	unsigned int received, qp;
	unsigned int xdp_xmit = 0;

	virtnet_poll_cleantx(rq);

	received = virtnet_receive(rq, budget, &xdp_xmit);

	/* Out of packets? */
	if (received < budget)
		virtqueue_napi_complete(napi, rq->vq, received);

	if (xdp_xmit & VIRTIO_XDP_REDIR)
		xdp_do_flush_map();

	if (xdp_xmit & VIRTIO_XDP_TX) {
		qp = vi->curr_queue_pairs - vi->xdp_queue_pairs +
		     smp_processor_id();
		sq = &vi->sq[qp];
		virtqueue_kick(sq->vq);
	}

	return received;
}
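/* virtnet_open() below, for each queue pair: fills the rx ring (deferring
 * to the refill worker on OOM), registers the queue's xdp_rxq_info, then
 * enables rx and tx NAPI.
 */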
static int virtnet_open(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i, err;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		if (i < vi->curr_queue_pairs)
			/* Make sure we have some buffers: if oom use wq. */
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);

		err = xdp_rxq_info_reg(&vi->rq[i].xdp_rxq, dev, i);
		if (err < 0)
			return err;

		err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq,
						 MEM_TYPE_PAGE_SHARED, NULL);
		if (err < 0) {
			xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
			return err;
		}

		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
		virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
	}

	return 0;
}

static int virtnet_poll_tx(struct napi_struct *napi, int budget)
{
	struct send_queue *sq = container_of(napi, struct send_queue, napi);
	struct virtnet_info *vi = sq->vq->vdev->priv;
	struct netdev_queue *txq = netdev_get_tx_queue(vi->dev, vq2txq(sq->vq));

	__netif_tx_lock(txq, raw_smp_processor_id());
	free_old_xmit_skbs(sq);
	__netif_tx_unlock(txq);

	virtqueue_napi_complete(napi, sq->vq, 0);

	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
		netif_tx_wake_queue(txq);

	return 0;
}

static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
{
	struct virtio_net_hdr_mrg_rxbuf *hdr;
	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
	struct virtnet_info *vi = sq->vq->vdev->priv;
	int num_sg;
	unsigned hdr_len = vi->hdr_len;
	bool can_push;

	pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);

	can_push = vi->any_header_sg &&
		!((unsigned long)skb->data & (__alignof__(*hdr) - 1)) &&
		!skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len;
	/* Even if we can, don't push here yet as this would skew
	 * csum_start offset below. */
	if (can_push)
		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(skb->data - hdr_len);
	else
		hdr = skb_vnet_hdr(skb);

	if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
				    virtio_is_little_endian(vi->vdev), false,
				    0))
		BUG();

	if (vi->mergeable_rx_bufs)
		hdr->num_buffers = 0;

	sg_init_table(sq->sg, skb_shinfo(skb)->nr_frags + (can_push ? 1 : 2));
	if (can_push) {
		__skb_push(skb, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
		if (unlikely(num_sg < 0))
			return num_sg;
		/* Pull header back to avoid skew in tx bytes calculations. */
		__skb_pull(skb, hdr_len);
	} else {
		sg_set_buf(sq->sg, hdr, hdr_len);
		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
		if (unlikely(num_sg < 0))
			return num_sg;
		num_sg++;
	}
	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
}
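/* xmit_skb() above produces one of two scatterlist layouts:
 *
 *   can_push:  [ vnet hdr | linear data ][ frag 0 ] ... [ frag N ]
 *              (header pushed into existing skb headroom, so header
 *               and linear data share one sg entry)
 *   otherwise: [ vnet hdr from skb->cb ][ linear data ][ frag 0 ] ...
 */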
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int qnum = skb_get_queue_mapping(skb);
	struct send_queue *sq = &vi->sq[qnum];
	int err;
	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
	bool kick = !skb->xmit_more;
	bool use_napi = sq->napi.weight;

	/* Free up any pending old buffers before queueing new ones. */
	free_old_xmit_skbs(sq);

	if (use_napi && kick)
		virtqueue_enable_cb_delayed(sq->vq);

	/* timestamp packet in software */
	skb_tx_timestamp(skb);

	/* Try to transmit */
	err = xmit_skb(sq, skb);

	/* This should not happen! */
	if (unlikely(err)) {
		dev->stats.tx_fifo_errors++;
		if (net_ratelimit())
			dev_warn(&dev->dev,
				 "Unexpected TXQ (%d) queue failure: %d\n",
				 qnum, err);
		dev->stats.tx_dropped++;
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}

	/* Don't wait up for transmitted skbs to be freed. */
	if (!use_napi) {
		skb_orphan(skb);
		nf_reset(skb);
	}

	/* If running out of space, stop queue to avoid getting packets that we
	 * are then unable to transmit.
	 * An alternative would be to force queuing layer to requeue the skb by
	 * returning NETDEV_TX_BUSY. However, NETDEV_TX_BUSY should not be
	 * returned in a normal path of operation: it means that driver is not
	 * maintaining the TX queue stop/start state properly, and causes
	 * the stack to do a non-trivial amount of useless work.
	 * Since most packets only take 1 or 2 ring slots, stopping the queue
	 * early means 16 slots are typically wasted.
	 */
	if (sq->vq->num_free < 2 + MAX_SKB_FRAGS) {
		netif_stop_subqueue(dev, qnum);
		if (!use_napi &&
		    unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
			/* More just got used, free them then recheck. */
			free_old_xmit_skbs(sq);
			if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) {
				netif_start_subqueue(dev, qnum);
				virtqueue_disable_cb(sq->vq);
			}
		}
	}

	if (kick || netif_xmit_stopped(txq))
		virtqueue_kick(sq->vq);

	return NETDEV_TX_OK;
}

/*
 * Send command via the control virtqueue and check status. Commands
 * supported by the hypervisor, as indicated by feature bits, should
 * never fail unless improperly formatted.
 */
static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
				 struct scatterlist *out)
{
	struct scatterlist *sgs[4], hdr, stat;
	unsigned out_num = 0, tmp;

	/* Caller should know better */
	BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));

	vi->ctrl->status = ~0;
	vi->ctrl->hdr.class = class;
	vi->ctrl->hdr.cmd = cmd;
	/* Add header */
	sg_init_one(&hdr, &vi->ctrl->hdr, sizeof(vi->ctrl->hdr));
	sgs[out_num++] = &hdr;

	if (out)
		sgs[out_num++] = out;

	/* Add return status. */
	sg_init_one(&stat, &vi->ctrl->status, sizeof(vi->ctrl->status));
	sgs[out_num] = &stat;

	BUG_ON(out_num + 1 > ARRAY_SIZE(sgs));
	virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, vi, GFP_ATOMIC);

	if (unlikely(!virtqueue_kick(vi->cvq)))
		return vi->ctrl->status == VIRTIO_NET_OK;

	/* Spin for a response, the kick causes an ioport write, trapping
	 * into the hypervisor, so the request should be handled immediately.
	 */
	while (!virtqueue_get_buf(vi->cvq, &tmp) &&
	       !virtqueue_is_broken(vi->cvq))
		cpu_relax();

	return vi->ctrl->status == VIRTIO_NET_OK;
}

static int virtnet_set_mac_address(struct net_device *dev, void *p)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;
	int ret;
	struct sockaddr *addr;
	struct scatterlist sg;

	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
		return -EOPNOTSUPP;

	addr = kmemdup(p, sizeof(*addr), GFP_KERNEL);
	if (!addr)
		return -ENOMEM;

	ret = eth_prepare_mac_addr_change(dev, addr);
	if (ret)
		goto out;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR)) {
		sg_init_one(&sg, addr->sa_data, dev->addr_len);
		if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
					  VIRTIO_NET_CTRL_MAC_ADDR_SET, &sg)) {
			dev_warn(&vdev->dev,
				 "Failed to set mac address by vq command.\n");
			ret = -EINVAL;
			goto out;
		}
	} else if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC) &&
		   !virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
		unsigned int i;

		/* Naturally, this has an atomicity problem. */
		for (i = 0; i < dev->addr_len; i++)
			virtio_cwrite8(vdev,
				       offsetof(struct virtio_net_config, mac) +
				       i, addr->sa_data[i]);
	}

	eth_commit_mac_addr_change(dev, p);
	ret = 0;

out:
	kfree(addr);
	return ret;
}

static void virtnet_stats(struct net_device *dev,
			  struct rtnl_link_stats64 *tot)
{
	struct virtnet_info *vi = netdev_priv(dev);
	unsigned int start;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		u64 tpackets, tbytes, rpackets, rbytes;
		struct receive_queue *rq = &vi->rq[i];
		struct send_queue *sq = &vi->sq[i];

		do {
			start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
			tpackets = sq->stats.packets;
			tbytes = sq->stats.bytes;
		} while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));

		do {
			start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
			rpackets = rq->stats.packets;
			rbytes = rq->stats.bytes;
		} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));

		tot->rx_packets += rpackets;
		tot->tx_packets += tpackets;
		tot->rx_bytes += rbytes;
		tot->tx_bytes += tbytes;
	}

	tot->tx_dropped = dev->stats.tx_dropped;
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
	tot->rx_dropped = dev->stats.rx_dropped;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;
}

#ifdef CONFIG_NET_POLL_CONTROLLER
static void virtnet_netpoll(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	for (i = 0; i < vi->curr_queue_pairs; i++)
		napi_schedule(&vi->rq[i].napi);
}
#endif

static void virtnet_ack_link_announce(struct virtnet_info *vi)
{
	rtnl_lock();
	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
				  VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL))
		dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
	rtnl_unlock();
}
static int _virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	struct scatterlist sg;
	struct net_device *dev = vi->dev;

	if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
		return 0;

	vi->ctrl->mq.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs);
	sg_init_one(&sg, &vi->ctrl->mq, sizeof(vi->ctrl->mq));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
				  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg)) {
		dev_warn(&dev->dev, "Failed to set the number of queue pairs to %d\n",
			 queue_pairs);
		return -EINVAL;
	} else {
		vi->curr_queue_pairs = queue_pairs;
		/* virtnet_open() will refill when the device goes up. */
		if (dev->flags & IFF_UP)
			schedule_delayed_work(&vi->refill, 0);
	}

	return 0;
}

static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
{
	int err;

	rtnl_lock();
	err = _virtnet_set_queues(vi, queue_pairs);
	rtnl_unlock();
	return err;
}

static int virtnet_close(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int i;

	/* Make sure refill_work doesn't re-enable napi! */
	cancel_delayed_work_sync(&vi->refill);

	for (i = 0; i < vi->max_queue_pairs; i++) {
		xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
		napi_disable(&vi->rq[i].napi);
		virtnet_napi_tx_disable(&vi->sq[i].napi);
	}

	return 0;
}

static void virtnet_set_rx_mode(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg[2];
	struct virtio_net_ctrl_mac *mac_data;
	struct netdev_hw_addr *ha;
	int uc_count;
	int mc_count;
	void *buf;
	int i;

	/* We can't dynamically set ndo_set_rx_mode, so return gracefully */
	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_RX))
		return;

	vi->ctrl->promisc = ((dev->flags & IFF_PROMISC) != 0);
	vi->ctrl->allmulti = ((dev->flags & IFF_ALLMULTI) != 0);

	sg_init_one(sg, &vi->ctrl->promisc, sizeof(vi->ctrl->promisc));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_PROMISC, sg))
		dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
			 vi->ctrl->promisc ? "en" : "dis");

	sg_init_one(sg, &vi->ctrl->allmulti, sizeof(vi->ctrl->allmulti));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
				  VIRTIO_NET_CTRL_RX_ALLMULTI, sg))
		dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
			 vi->ctrl->allmulti ? "en" : "dis");

	uc_count = netdev_uc_count(dev);
	mc_count = netdev_mc_count(dev);
	/* MAC filter - use one buffer for both lists */
	buf = kzalloc(((uc_count + mc_count) * ETH_ALEN) +
		      (2 * sizeof(mac_data->entries)), GFP_ATOMIC);
	mac_data = buf;
	if (!buf)
		return;

	sg_init_table(sg, 2);

	/* Store the unicast list and count in the front of the buffer */
	mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count);
	i = 0;
	netdev_for_each_uc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

	sg_set_buf(&sg[0], mac_data,
		   sizeof(mac_data->entries) + (uc_count * ETH_ALEN));

	/* multicast list and count fill the end */
	mac_data = (void *)&mac_data->macs[uc_count][0];

	mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count);
	i = 0;
	netdev_for_each_mc_addr(ha, dev)
		memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN);

	sg_set_buf(&sg[1], mac_data,
		   sizeof(mac_data->entries) + (mc_count * ETH_ALEN));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
				  VIRTIO_NET_CTRL_MAC_TABLE_SET, sg))
		dev_warn(&dev->dev, "Failed to set MAC filter table.\n");

	kfree(buf);
}

static int virtnet_vlan_rx_add_vid(struct net_device *dev,
				   __be16 proto, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

	vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_ADD, &sg))
		dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
	return 0;
}

static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
				    __be16 proto, u16 vid)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct scatterlist sg;

	vi->ctrl->vid = cpu_to_virtio16(vi->vdev, vid);
	sg_init_one(&sg, &vi->ctrl->vid, sizeof(vi->ctrl->vid));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
				  VIRTIO_NET_CTRL_VLAN_DEL, &sg))
		dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
	return 0;
}
static void virtnet_clean_affinity(struct virtnet_info *vi, long hcpu)
{
	int i;

	if (vi->affinity_hint_set) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			virtqueue_set_affinity(vi->rq[i].vq, -1);
			virtqueue_set_affinity(vi->sq[i].vq, -1);
		}

		vi->affinity_hint_set = false;
	}
}

static void virtnet_set_affinity(struct virtnet_info *vi)
{
	int i;
	int cpu;

	/* In multiqueue mode, when the number of cpus equals the number of
	 * queue pairs, we let each queue pair be private to one cpu by
	 * setting the affinity hint to eliminate the contention.
	 */
	if (vi->curr_queue_pairs == 1 ||
	    vi->max_queue_pairs != num_online_cpus()) {
		virtnet_clean_affinity(vi, -1);
		return;
	}

	i = 0;
	for_each_online_cpu(cpu) {
		virtqueue_set_affinity(vi->rq[i].vq, cpu);
		virtqueue_set_affinity(vi->sq[i].vq, cpu);
		netif_set_xps_queue(vi->dev, cpumask_of(cpu), i);
		i++;
	}

	vi->affinity_hint_set = true;
}

static int virtnet_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);
	virtnet_set_affinity(vi);
	return 0;
}

static int virtnet_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node_dead);
	virtnet_set_affinity(vi);
	return 0;
}

static int virtnet_cpu_down_prep(unsigned int cpu, struct hlist_node *node)
{
	struct virtnet_info *vi = hlist_entry_safe(node, struct virtnet_info,
						   node);

	virtnet_clean_affinity(vi, cpu);
	return 0;
}

static enum cpuhp_state virtionet_online;

static int virtnet_cpu_notif_add(struct virtnet_info *vi)
{
	int ret;

	ret = cpuhp_state_add_instance_nocalls(virtionet_online, &vi->node);
	if (ret)
		return ret;
	ret = cpuhp_state_add_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					       &vi->node_dead);
	if (!ret)
		return ret;
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	return ret;
}

static void virtnet_cpu_notif_remove(struct virtnet_info *vi)
{
	cpuhp_state_remove_instance_nocalls(virtionet_online, &vi->node);
	cpuhp_state_remove_instance_nocalls(CPUHP_VIRT_NET_DEAD,
					    &vi->node_dead);
}

static void virtnet_get_ringparam(struct net_device *dev,
				  struct ethtool_ringparam *ring)
{
	struct virtnet_info *vi = netdev_priv(dev);

	ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
	ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
	ring->rx_pending = ring->rx_max_pending;
	ring->tx_pending = ring->tx_max_pending;
}

static void virtnet_get_drvinfo(struct net_device *dev,
				struct ethtool_drvinfo *info)
{
	struct virtnet_info *vi = netdev_priv(dev);
	struct virtio_device *vdev = vi->vdev;

	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
	strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
	strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));
}
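/* The queue pair count is exposed to ethtool as "combined" channels only;
 * e.g. "ethtool -L <iface> combined 4" (hypothetical invocation) requests
 * 4 queue pairs via virtnet_set_channels() below.
 */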
/* TODO: Eliminate OOO packets during switching */
static int virtnet_set_channels(struct net_device *dev,
				struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u16 queue_pairs = channels->combined_count;
	int err;

	/* We don't support separate rx/tx channels.
	 * We don't allow setting 'other' channels.
	 */
	if (channels->rx_count || channels->tx_count || channels->other_count)
		return -EINVAL;

	if (queue_pairs > vi->max_queue_pairs || queue_pairs == 0)
		return -EINVAL;

	/* For now we don't support modifying channels while XDP is loaded;
	 * also, when XDP is loaded all RX queues have XDP programs, so we
	 * only need to check a single RX queue.
	 */
	if (vi->rq[0].xdp_prog)
		return -EINVAL;

	get_online_cpus();
	err = _virtnet_set_queues(vi, queue_pairs);
	if (!err) {
		netif_set_real_num_tx_queues(dev, queue_pairs);
		netif_set_real_num_rx_queues(dev, queue_pairs);

		virtnet_set_affinity(vi);
	}
	put_online_cpus();

	return err;
}

static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
{
	struct virtnet_info *vi = netdev_priv(dev);
	char *p = (char *)data;
	unsigned int i, j;

	switch (stringset) {
	case ETH_SS_STATS:
		for (i = 0; i < vi->curr_queue_pairs; i++) {
			for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) {
				snprintf(p, ETH_GSTRING_LEN, "rx_queue_%u_%s",
					 i, virtnet_rq_stats_desc[j].desc);
				p += ETH_GSTRING_LEN;
			}
		}

		for (i = 0; i < vi->curr_queue_pairs; i++) {
			for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++) {
				snprintf(p, ETH_GSTRING_LEN, "tx_queue_%u_%s",
					 i, virtnet_sq_stats_desc[j].desc);
				p += ETH_GSTRING_LEN;
			}
		}
		break;
	}
}

static int virtnet_get_sset_count(struct net_device *dev, int sset)
{
	struct virtnet_info *vi = netdev_priv(dev);

	switch (sset) {
	case ETH_SS_STATS:
		return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
					       VIRTNET_SQ_STATS_LEN);
	default:
		return -EOPNOTSUPP;
	}
}

static void virtnet_get_ethtool_stats(struct net_device *dev,
				      struct ethtool_stats *stats, u64 *data)
{
	struct virtnet_info *vi = netdev_priv(dev);
	unsigned int idx = 0, start, i, j;
	const u8 *stats_base;
	size_t offset;

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct receive_queue *rq = &vi->rq[i];

		stats_base = (u8 *)&rq->stats;
		do {
			start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
			for (j = 0; j < VIRTNET_RQ_STATS_LEN; j++) {
				offset = virtnet_rq_stats_desc[j].offset;
				data[idx + j] = *(u64 *)(stats_base + offset);
			}
		} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));
		idx += VIRTNET_RQ_STATS_LEN;
	}

	for (i = 0; i < vi->curr_queue_pairs; i++) {
		struct send_queue *sq = &vi->sq[i];

		stats_base = (u8 *)&sq->stats;
		do {
			start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
			for (j = 0; j < VIRTNET_SQ_STATS_LEN; j++) {
				offset = virtnet_sq_stats_desc[j].offset;
				data[idx + j] = *(u64 *)(stats_base + offset);
			}
		} while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));
		idx += VIRTNET_SQ_STATS_LEN;
	}
}

static void virtnet_get_channels(struct net_device *dev,
				 struct ethtool_channels *channels)
{
	struct virtnet_info *vi = netdev_priv(dev);

	channels->combined_count = vi->curr_queue_pairs;
	channels->max_combined = vi->max_queue_pairs;
	channels->max_other = 0;
	channels->rx_count = 0;
	channels->tx_count = 0;
	channels->other_count = 0;
}

/* Check if the user is trying to change anything besides speed/duplex */
static bool
virtnet_validate_ethtool_cmd(const struct ethtool_link_ksettings *cmd)
{
	struct ethtool_link_ksettings diff1 = *cmd;
	struct ethtool_link_ksettings diff2 = {};

	/* cmd is always set, so we need to clear it; validate the port type
	 * and, since autonegotiation is off, ignore advertising.
	 */
	diff1.base.speed = 0;
	diff2.base.port = PORT_OTHER;

/* Check if the user is trying to change anything besides speed/duplex */
static bool
virtnet_validate_ethtool_cmd(const struct ethtool_link_ksettings *cmd)
{
	struct ethtool_link_ksettings diff1 = *cmd;
	struct ethtool_link_ksettings diff2 = {};

	/* cmd is always set, so we need to clear it; we validate the port
	 * type, and since there is no autonegotiation we can ignore the
	 * advertising masks.
	 */
	diff1.base.speed = 0;
	diff2.base.port = PORT_OTHER;
	ethtool_link_ksettings_zero_link_mode(&diff1, advertising);
	diff1.base.duplex = 0;
	diff1.base.cmd = 0;
	diff1.base.link_mode_masks_nwords = 0;

	return !memcmp(&diff1.base, &diff2.base, sizeof(diff1.base)) &&
		bitmap_empty(diff1.link_modes.supported,
			     __ETHTOOL_LINK_MODE_MASK_NBITS) &&
		bitmap_empty(diff1.link_modes.advertising,
			     __ETHTOOL_LINK_MODE_MASK_NBITS) &&
		bitmap_empty(diff1.link_modes.lp_advertising,
			     __ETHTOOL_LINK_MODE_MASK_NBITS);
}

static int virtnet_set_link_ksettings(struct net_device *dev,
				      const struct ethtool_link_ksettings *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);
	u32 speed;

	speed = cmd->base.speed;
	/* don't allow custom speed and duplex */
	if (!ethtool_validate_speed(speed) ||
	    !ethtool_validate_duplex(cmd->base.duplex) ||
	    !virtnet_validate_ethtool_cmd(cmd))
		return -EINVAL;
	vi->speed = speed;
	vi->duplex = cmd->base.duplex;

	return 0;
}

static int virtnet_get_link_ksettings(struct net_device *dev,
				      struct ethtool_link_ksettings *cmd)
{
	struct virtnet_info *vi = netdev_priv(dev);

	cmd->base.speed = vi->speed;
	cmd->base.duplex = vi->duplex;
	cmd->base.port = PORT_OTHER;

	return 0;
}

static void virtnet_init_settings(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);

	vi->speed = SPEED_UNKNOWN;
	vi->duplex = DUPLEX_UNKNOWN;
}

static void virtnet_update_settings(struct virtnet_info *vi)
{
	u32 speed;
	u8 duplex;

	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_SPEED_DUPLEX))
		return;

	speed = virtio_cread32(vi->vdev, offsetof(struct virtio_net_config,
						  speed));
	if (ethtool_validate_speed(speed))
		vi->speed = speed;
	duplex = virtio_cread8(vi->vdev, offsetof(struct virtio_net_config,
						  duplex));
	if (ethtool_validate_duplex(duplex))
		vi->duplex = duplex;
}

static const struct ethtool_ops virtnet_ethtool_ops = {
	.get_drvinfo = virtnet_get_drvinfo,
	.get_link = ethtool_op_get_link,
	.get_ringparam = virtnet_get_ringparam,
	.get_strings = virtnet_get_strings,
	.get_sset_count = virtnet_get_sset_count,
	.get_ethtool_stats = virtnet_get_ethtool_stats,
	.set_channels = virtnet_set_channels,
	.get_channels = virtnet_get_channels,
	.get_ts_info = ethtool_op_get_ts_info,
	.get_link_ksettings = virtnet_get_link_ksettings,
	.set_link_ksettings = virtnet_set_link_ksettings,
};

static void virtnet_freeze_down(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int i;

	/* Make sure no work handler is accessing the device */
	flush_work(&vi->config_work);

	netif_device_detach(vi->dev);
	netif_tx_disable(vi->dev);
	cancel_delayed_work_sync(&vi->refill);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->max_queue_pairs; i++) {
			napi_disable(&vi->rq[i].napi);
			virtnet_napi_tx_disable(&vi->sq[i].napi);
		}
	}
}

static int init_vqs(struct virtnet_info *vi);
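
/* Resume counterpart of virtnet_freeze_down(): the virtqueues are re-created
 * first, the device is marked ready before any RX buffers are posted, and
 * only then are the rings refilled and NAPI re-enabled.
 */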

static int virtnet_restore_up(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err, i;

	err = init_vqs(vi);
	if (err)
		return err;

	virtio_device_ready(vdev);

	if (netif_running(vi->dev)) {
		for (i = 0; i < vi->curr_queue_pairs; i++)
			if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
				schedule_delayed_work(&vi->refill, 0);

		for (i = 0; i < vi->max_queue_pairs; i++) {
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
			virtnet_napi_tx_enable(vi, vi->sq[i].vq,
					       &vi->sq[i].napi);
		}
	}

	netif_device_attach(vi->dev);
	return err;
}

static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads)
{
	struct scatterlist sg;

	vi->ctrl->offloads = cpu_to_virtio64(vi->vdev, offloads);

	sg_init_one(&sg, &vi->ctrl->offloads, sizeof(vi->ctrl->offloads));

	if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_GUEST_OFFLOADS,
				  VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET, &sg)) {
		dev_warn(&vi->dev->dev, "Failed to set guest offloads.\n");
		return -EINVAL;
	}

	return 0;
}

static int virtnet_clear_guest_offloads(struct virtnet_info *vi)
{
	u64 offloads = 0;

	if (!vi->guest_offloads)
		return 0;

	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))
		offloads = 1ULL << VIRTIO_NET_F_GUEST_CSUM;

	return virtnet_set_guest_offloads(vi, offloads);
}

static int virtnet_restore_guest_offloads(struct virtnet_info *vi)
{
	u64 offloads = vi->guest_offloads;

	if (!vi->guest_offloads)
		return 0;
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))
		offloads |= 1ULL << VIRTIO_NET_F_GUEST_CSUM;

	return virtnet_set_guest_offloads(vi, offloads);
}
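
/* A note on the two helpers above: receive offloads such as GUEST_TSO4/6,
 * GUEST_ECN and GUEST_UFO let the host hand the guest large, coalesced
 * packets that the XDP fast path cannot process. When a program is attached,
 * virtnet_xdp_set() therefore clears every offload except checksumming, and
 * restores the set saved in vi->guest_offloads once the program is removed.
 */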

static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
			   struct netlink_ext_ack *extack)
{
	unsigned long max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
	struct virtnet_info *vi = netdev_priv(dev);
	struct bpf_prog *old_prog;
	u16 xdp_qp = 0, curr_qp;
	int i, err;

	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)
	    && (virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO4) ||
		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
		virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO))) {
		NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing LRO, disable LRO first");
		return -EOPNOTSUPP;
	}

	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
		NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required");
		return -EINVAL;
	}

	if (dev->mtu > max_sz) {
		NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP");
		netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
		return -EINVAL;
	}

	curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
	if (prog)
		xdp_qp = nr_cpu_ids;

	/* XDP requires extra queues for XDP_TX */
	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
		NL_SET_ERR_MSG_MOD(extack, "Too few free TX rings available");
		netdev_warn(dev, "request %i queues but max is %i\n",
			    curr_qp + xdp_qp, vi->max_queue_pairs);
		return -ENOMEM;
	}

	if (prog) {
		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
		if (IS_ERR(prog))
			return PTR_ERR(prog);
	}

	/* Make sure NAPI is not using any XDP TX queues for RX. */
	if (netif_running(dev))
		for (i = 0; i < vi->max_queue_pairs; i++)
			napi_disable(&vi->rq[i].napi);

	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
	err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
	if (err)
		goto err;
	vi->xdp_queue_pairs = xdp_qp;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
		if (i == 0) {
			if (!old_prog)
				virtnet_clear_guest_offloads(vi);
			if (!prog)
				virtnet_restore_guest_offloads(vi);
		}
		if (old_prog)
			bpf_prog_put(old_prog);
		if (netif_running(dev))
			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
	}

	return 0;

err:
	for (i = 0; i < vi->max_queue_pairs; i++)
		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
	if (prog)
		bpf_prog_sub(prog, vi->max_queue_pairs - 1);
	return err;
}

static u32 virtnet_xdp_query(struct net_device *dev)
{
	struct virtnet_info *vi = netdev_priv(dev);
	const struct bpf_prog *xdp_prog;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		xdp_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		if (xdp_prog)
			return xdp_prog->aux->id;
	}
	return 0;
}

static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return virtnet_xdp_set(dev, xdp->prog, xdp->extack);
	case XDP_QUERY_PROG:
		xdp->prog_id = virtnet_xdp_query(dev);
		xdp->prog_attached = !!xdp->prog_id;
		return 0;
	default:
		return -EINVAL;
	}
}

static int virtnet_get_phys_port_name(struct net_device *dev, char *buf,
				      size_t len)
{
	struct virtnet_info *vi = netdev_priv(dev);
	int ret;

	if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_STANDBY))
		return -EOPNOTSUPP;

	ret = snprintf(buf, len, "sby");
	if (ret >= len)
		return -EOPNOTSUPP;

	return 0;
}

static const struct net_device_ops virtnet_netdev = {
	.ndo_open            = virtnet_open,
	.ndo_stop            = virtnet_close,
	.ndo_start_xmit      = start_xmit,
	.ndo_validate_addr   = eth_validate_addr,
	.ndo_set_mac_address = virtnet_set_mac_address,
	.ndo_set_rx_mode     = virtnet_set_rx_mode,
	.ndo_get_stats64     = virtnet_stats,
	.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller = virtnet_netpoll,
#endif
	.ndo_bpf             = virtnet_xdp,
	.ndo_xdp_xmit        = virtnet_xdp_xmit,
	.ndo_features_check  = passthru_features_check,
	.ndo_get_phys_port_name = virtnet_get_phys_port_name,
};
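
/* Programs reach the .ndo_bpf hook above through the usual netlink path; for
 * example, with iproute2 (interface and object file names are placeholders):
 *
 *	ip link set dev eth0 xdp obj xdp_prog.o
 *
 * which virtnet_xdp() sees as XDP_SETUP_PROG and hands to virtnet_xdp_set().
 */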

static void virtnet_config_changed_work(struct work_struct *work)
{
	struct virtnet_info *vi =
		container_of(work, struct virtnet_info, config_work);
	u16 v;

	if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
				 struct virtio_net_config, status, &v) < 0)
		return;

	if (v & VIRTIO_NET_S_ANNOUNCE) {
		netdev_notify_peers(vi->dev);
		virtnet_ack_link_announce(vi);
	}

	/* Ignore unknown (future) status bits */
	v &= VIRTIO_NET_S_LINK_UP;

	if (vi->status == v)
		return;

	vi->status = v;

	if (vi->status & VIRTIO_NET_S_LINK_UP) {
		virtnet_update_settings(vi);
		netif_carrier_on(vi->dev);
		netif_tx_wake_all_queues(vi->dev);
	} else {
		netif_carrier_off(vi->dev);
		netif_tx_stop_all_queues(vi->dev);
	}
}

static void virtnet_config_changed(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	schedule_work(&vi->config_work);
}

static void virtnet_free_queues(struct virtnet_info *vi)
{
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		napi_hash_del(&vi->rq[i].napi);
		netif_napi_del(&vi->rq[i].napi);
		netif_napi_del(&vi->sq[i].napi);
	}

	/* Since we called napi_hash_del() before netif_napi_del(), we need
	 * to respect an RCU grace period before freeing vi->rq.
	 */
	synchronize_net();

	kfree(vi->rq);
	kfree(vi->sq);
	kfree(vi->ctrl);
}

static void _free_receive_bufs(struct virtnet_info *vi)
{
	struct bpf_prog *old_prog;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		while (vi->rq[i].pages)
			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);

		old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
		if (old_prog)
			bpf_prog_put(old_prog);
	}
}

static void free_receive_bufs(struct virtnet_info *vi)
{
	rtnl_lock();
	_free_receive_bufs(vi);
	rtnl_unlock();
}

static void free_receive_page_frags(struct virtnet_info *vi)
{
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++)
		if (vi->rq[i].alloc_frag.page)
			put_page(vi->rq[i].alloc_frag.page);
}

static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
{
	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
		return false;
	else if (q < vi->curr_queue_pairs)
		return true;
	else
		return false;
}

static void free_unused_bufs(struct virtnet_info *vi)
{
	void *buf;
	int i;

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->sq[i].vq;

		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (!is_xdp_raw_buffer_queue(vi, i))
				dev_kfree_skb(buf);
			else
				put_page(virt_to_head_page(buf));
		}
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		struct virtqueue *vq = vi->rq[i].vq;

		while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
			if (vi->mergeable_rx_bufs) {
				put_page(virt_to_head_page(buf));
			} else if (vi->big_packets) {
				give_pages(&vi->rq[i], buf);
			} else {
				put_page(virt_to_head_page(buf));
			}
		}
	}
}

static void virtnet_del_vqs(struct virtnet_info *vi)
{
	struct virtio_device *vdev = vi->vdev;

	virtnet_clean_affinity(vi, -1);

	vdev->config->del_vqs(vdev);

	virtnet_free_queues(vi);
}

/* How large should a single buffer be so a queue full of these can fit at
 * least one full packet?
 * Logic below assumes the mergeable buffer header is used.
 */
static unsigned int mergeable_min_buf_len(struct virtnet_info *vi, struct virtqueue *vq)
{
	const unsigned int hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	unsigned int rq_size = virtqueue_get_vring_size(vq);
	unsigned int packet_len = vi->big_packets ? IP_MAX_MTU : vi->dev->max_mtu;
	unsigned int buf_len = hdr_len + ETH_HLEN + VLAN_HLEN + packet_len;
	unsigned int min_buf_len = DIV_ROUND_UP(buf_len, rq_size);

	return max(max(min_buf_len, hdr_len) - hdr_len,
		   (unsigned int)GOOD_PACKET_LEN);
}

static int virtnet_find_vqs(struct virtnet_info *vi)
{
	vq_callback_t **callbacks;
	struct virtqueue **vqs;
	int ret = -ENOMEM;
	int i, total_vqs;
	const char **names;
	bool *ctx;

	/* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
	 * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
	 * possible control vq.
	 */
	total_vqs = vi->max_queue_pairs * 2 +
		    virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);

	/* Allocate space for find_vqs parameters */
	vqs = kcalloc(total_vqs, sizeof(*vqs), GFP_KERNEL);
	if (!vqs)
		goto err_vq;
	callbacks = kmalloc_array(total_vqs, sizeof(*callbacks), GFP_KERNEL);
	if (!callbacks)
		goto err_callback;
	names = kmalloc_array(total_vqs, sizeof(*names), GFP_KERNEL);
	if (!names)
		goto err_names;
	if (!vi->big_packets || vi->mergeable_rx_bufs) {
		ctx = kcalloc(total_vqs, sizeof(*ctx), GFP_KERNEL);
		if (!ctx)
			goto err_ctx;
	} else {
		ctx = NULL;
	}

	/* Parameters for control virtqueue, if any */
	if (vi->has_cvq) {
		callbacks[total_vqs - 1] = NULL;
		names[total_vqs - 1] = "control";
	}

	/* Allocate/initialize parameters for send/receive virtqueues */
	for (i = 0; i < vi->max_queue_pairs; i++) {
		callbacks[rxq2vq(i)] = skb_recv_done;
		callbacks[txq2vq(i)] = skb_xmit_done;
		sprintf(vi->rq[i].name, "input.%d", i);
		sprintf(vi->sq[i].name, "output.%d", i);
		names[rxq2vq(i)] = vi->rq[i].name;
		names[txq2vq(i)] = vi->sq[i].name;
		if (ctx)
			ctx[rxq2vq(i)] = true;
	}

	ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
					 names, ctx, NULL);
	if (ret)
		goto err_find;

	if (vi->has_cvq) {
		vi->cvq = vqs[total_vqs - 1];
		if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
			vi->dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
	}

	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].vq = vqs[rxq2vq(i)];
		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
		vi->sq[i].vq = vqs[txq2vq(i)];
	}

	/* On success (ret == 0) we deliberately fall through: the arrays
	 * below were only needed to pass parameters to find_vqs() and are
	 * freed on both the success and the error paths.
	 */

err_find:
	kfree(ctx);
err_ctx:
	kfree(names);
err_names:
	kfree(callbacks);
err_callback:
	kfree(vqs);
err_vq:
	return ret;
}
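
/* Resulting virtqueue layout, assuming the rxq2vq()/txq2vq() helpers defined
 * earlier in this file map queue pair i to vq indices 2*i and 2*i + 1:
 *
 *	vq 0:  input.0    vq 1:  output.0
 *	vq 2:  input.1    vq 3:  output.1
 *	...
 *	vq 2N: control    (only when VIRTIO_NET_F_CTRL_VQ was negotiated)
 */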

static int virtnet_alloc_queues(struct virtnet_info *vi)
{
	int i;

	vi->ctrl = kzalloc(sizeof(*vi->ctrl), GFP_KERNEL);
	if (!vi->ctrl)
		goto err_ctrl;
	vi->sq = kcalloc(vi->max_queue_pairs, sizeof(*vi->sq), GFP_KERNEL);
	if (!vi->sq)
		goto err_sq;
	vi->rq = kcalloc(vi->max_queue_pairs, sizeof(*vi->rq), GFP_KERNEL);
	if (!vi->rq)
		goto err_rq;

	INIT_DELAYED_WORK(&vi->refill, refill_work);
	for (i = 0; i < vi->max_queue_pairs; i++) {
		vi->rq[i].pages = NULL;
		netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
			       napi_weight);
		netif_tx_napi_add(vi->dev, &vi->sq[i].napi, virtnet_poll_tx,
				  napi_tx ? napi_weight : 0);

		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
		ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));

		u64_stats_init(&vi->rq[i].stats.syncp);
		u64_stats_init(&vi->sq[i].stats.syncp);
	}

	return 0;

err_rq:
	kfree(vi->sq);
err_sq:
	kfree(vi->ctrl);
err_ctrl:
	return -ENOMEM;
}

static int init_vqs(struct virtnet_info *vi)
{
	int ret;

	/* Allocate send & receive queues */
	ret = virtnet_alloc_queues(vi);
	if (ret)
		goto err;

	ret = virtnet_find_vqs(vi);
	if (ret)
		goto err_free;

	get_online_cpus();
	virtnet_set_affinity(vi);
	put_online_cpus();

	return 0;

err_free:
	virtnet_free_queues(vi);
err:
	return ret;
}

#ifdef CONFIG_SYSFS
static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
					     char *buf)
{
	struct virtnet_info *vi = netdev_priv(queue->dev);
	unsigned int queue_index = get_netdev_rx_queue_index(queue);
	unsigned int headroom = virtnet_get_headroom(vi);
	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
	struct ewma_pkt_len *avg;

	BUG_ON(queue_index >= vi->max_queue_pairs);
	avg = &vi->rq[queue_index].mrg_avg_pkt_len;
	return sprintf(buf, "%u\n",
		       get_mergeable_buf_len(&vi->rq[queue_index], avg,
					     SKB_DATA_ALIGN(headroom + tailroom)));
}

static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
	__ATTR_RO(mergeable_rx_buffer_size);

static struct attribute *virtio_net_mrg_rx_attrs[] = {
	&mergeable_rx_buffer_size_attribute.attr,
	NULL
};

static const struct attribute_group virtio_net_mrg_rx_group = {
	.name = "virtio_net",
	.attrs = virtio_net_mrg_rx_attrs
};
#endif
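
/* The attribute group above is attached to each RX queue's kobject when
 * mergeable buffers are in use (see virtnet_probe()), so the current buffer
 * size estimate can be read from userspace, e.g. for queue 0 of an interface
 * named eth0 (placeholder name):
 *
 *	cat /sys/class/net/eth0/queues/rx-0/virtio_net/mergeable_rx_buffer_size
 */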

static bool virtnet_fail_on_feature(struct virtio_device *vdev,
				    unsigned int fbit,
				    const char *fname, const char *dname)
{
	if (!virtio_has_feature(vdev, fbit))
		return false;

	dev_err(&vdev->dev, "device advertises feature %s but not %s",
		fname, dname);

	return true;
}

#define VIRTNET_FAIL_ON(vdev, fbit, dbit)			\
	virtnet_fail_on_feature(vdev, fbit, #fbit, dbit)

static bool virtnet_validate_features(struct virtio_device *vdev)
{
	if (!virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) &&
	    (VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_RX,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_VLAN,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE,
			     "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_MQ, "VIRTIO_NET_F_CTRL_VQ") ||
	     VIRTNET_FAIL_ON(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR,
			     "VIRTIO_NET_F_CTRL_VQ"))) {
		return false;
	}

	return true;
}

#define MIN_MTU ETH_MIN_MTU
#define MAX_MTU ETH_MAX_MTU

static int virtnet_validate(struct virtio_device *vdev)
{
	if (!vdev->config->get) {
		dev_err(&vdev->dev, "%s failure: config access disabled\n",
			__func__);
		return -EINVAL;
	}

	if (!virtnet_validate_features(vdev))
		return -EINVAL;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		int mtu = virtio_cread16(vdev,
					 offsetof(struct virtio_net_config,
						  mtu));
		if (mtu < MIN_MTU)
			__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
	}

	return 0;
}

static int virtnet_probe(struct virtio_device *vdev)
{
	int i, err = -ENOMEM;
	struct net_device *dev;
	struct virtnet_info *vi;
	u16 max_queue_pairs;
	int mtu;

	/* Find if host supports multiqueue virtio_net device */
	err = virtio_cread_feature(vdev, VIRTIO_NET_F_MQ,
				   struct virtio_net_config,
				   max_virtqueue_pairs, &max_queue_pairs);

	/* We need at least 2 queues */
	if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
	    max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
	    !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		max_queue_pairs = 1;

	/* Allocate ourselves a network device with room for our info */
	dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
	if (!dev)
		return -ENOMEM;

	/* Set up network device as normal. */
	dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
	dev->netdev_ops = &virtnet_netdev;
	dev->features = NETIF_F_HIGHDMA;

	dev->ethtool_ops = &virtnet_ethtool_ops;
	SET_NETDEV_DEV(dev, &vdev->dev);

	/* Do we support "hardware" checksums? */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
		/* This opens up the world of extra features. */
		dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
		if (csum)
			dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;

		if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
			dev->hw_features |= NETIF_F_TSO
				| NETIF_F_TSO_ECN | NETIF_F_TSO6;
		}
		/* Individual feature bits: what can host handle? */
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
			dev->hw_features |= NETIF_F_TSO;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
			dev->hw_features |= NETIF_F_TSO6;
		if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
			dev->hw_features |= NETIF_F_TSO_ECN;

		dev->features |= NETIF_F_GSO_ROBUST;

		if (gso)
			dev->features |= dev->hw_features & NETIF_F_ALL_TSO;
		/* (!csum && gso) case will be fixed by register_netdev() */
	}
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM))
		dev->features |= NETIF_F_RXCSUM;

	dev->vlan_features = dev->features;

	/* MTU range: 68 - 65535 */
	dev->min_mtu = MIN_MTU;
	dev->max_mtu = MAX_MTU;

	/* Configuration may specify what MAC to use. Otherwise random. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
		virtio_cread_bytes(vdev,
				   offsetof(struct virtio_net_config, mac),
				   dev->dev_addr, dev->addr_len);
	else
		eth_hw_addr_random(dev);

	/* Set up our device-specific information */
	vi = netdev_priv(dev);
	vi->dev = dev;
	vi->vdev = vdev;
	vdev->priv = vi;

	INIT_WORK(&vi->config_work, virtnet_config_changed_work);
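
	/* The next few feature checks pick one of the driver's three RX
	 * buffer schemes: mergeable buffers when VIRTIO_NET_F_MRG_RXBUF was
	 * negotiated, chained "big" page buffers when the device may deliver
	 * GSO packets without merging, and simple single-page buffers
	 * otherwise.
	 */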

	/* If we can receive ANY GSO packets, we must allocate large ones. */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
		vi->big_packets = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
		vi->mergeable_rx_bufs = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	else
		vi->hdr_len = sizeof(struct virtio_net_hdr);

	if (virtio_has_feature(vdev, VIRTIO_F_ANY_LAYOUT) ||
	    virtio_has_feature(vdev, VIRTIO_F_VERSION_1))
		vi->any_header_sg = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
		vi->has_cvq = true;

	if (virtio_has_feature(vdev, VIRTIO_NET_F_MTU)) {
		mtu = virtio_cread16(vdev,
				     offsetof(struct virtio_net_config,
					      mtu));
		if (mtu < dev->min_mtu) {
			/* Should never trigger: MTU was previously validated
			 * in virtnet_validate.
			 */
			dev_err(&vdev->dev,
				"device MTU appears to have changed, it is now %d < %d",
				mtu, dev->min_mtu);
			goto free;
		}

		dev->mtu = mtu;
		dev->max_mtu = mtu;

		/* TODO: size buffers correctly in this case. */
		if (dev->mtu > ETH_DATA_LEN)
			vi->big_packets = true;
	}

	if (vi->any_header_sg)
		dev->needed_headroom = vi->hdr_len;

	/* Enable multiqueue by default */
	if (num_online_cpus() >= max_queue_pairs)
		vi->curr_queue_pairs = max_queue_pairs;
	else
		vi->curr_queue_pairs = num_online_cpus();
	vi->max_queue_pairs = max_queue_pairs;

	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
	err = init_vqs(vi);
	if (err)
		goto free;

#ifdef CONFIG_SYSFS
	if (vi->mergeable_rx_bufs)
		dev->sysfs_rx_queue_group = &virtio_net_mrg_rx_group;
#endif
	netif_set_real_num_tx_queues(dev, vi->curr_queue_pairs);
	netif_set_real_num_rx_queues(dev, vi->curr_queue_pairs);

	virtnet_init_settings(dev);

	if (virtio_has_feature(vdev, VIRTIO_NET_F_STANDBY)) {
		vi->failover = net_failover_create(vi->dev);
		if (IS_ERR(vi->failover)) {
			err = PTR_ERR(vi->failover);
			goto free_vqs;
		}
	}

	err = register_netdev(dev);
	if (err) {
		pr_debug("virtio_net: registering device failed\n");
		goto free_failover;
	}

	virtio_device_ready(vdev);

	err = virtnet_cpu_notif_add(vi);
	if (err) {
		pr_debug("virtio_net: registering cpu notifier failed\n");
		goto free_unregister_netdev;
	}

	virtnet_set_queues(vi, vi->curr_queue_pairs);

	/* Assume link up if device can't report link status,
	 * otherwise get link status from config.
	 */
	netif_carrier_off(dev);
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
		schedule_work(&vi->config_work);
	} else {
		vi->status = VIRTIO_NET_S_LINK_UP;
		virtnet_update_settings(vi);
		netif_carrier_on(dev);
	}
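
	/* Record which of the driver's recognized guest offloads the device
	 * actually offers; virtnet_clear_guest_offloads() and
	 * virtnet_restore_guest_offloads() use this saved set when an XDP
	 * program is attached and later removed.
	 */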

	for (i = 0; i < ARRAY_SIZE(guest_offloads); i++)
		if (virtio_has_feature(vi->vdev, guest_offloads[i]))
			set_bit(guest_offloads[i], &vi->guest_offloads);

	pr_debug("virtnet: registered device %s with %d RX and TX vqs\n",
		 dev->name, max_queue_pairs);

	return 0;

free_unregister_netdev:
	vi->vdev->config->reset(vdev);

	unregister_netdev(dev);
free_failover:
	net_failover_destroy(vi->failover);
free_vqs:
	cancel_delayed_work_sync(&vi->refill);
	free_receive_page_frags(vi);
	virtnet_del_vqs(vi);
free:
	free_netdev(dev);
	return err;
}

static void remove_vq_common(struct virtnet_info *vi)
{
	vi->vdev->config->reset(vi->vdev);

	/* Free unused buffers in both send and recv, if any. */
	free_unused_bufs(vi);

	free_receive_bufs(vi);

	free_receive_page_frags(vi);

	virtnet_del_vqs(vi);
}

static void virtnet_remove(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);

	/* Make sure no work handler is accessing the device. */
	flush_work(&vi->config_work);

	unregister_netdev(vi->dev);

	net_failover_destroy(vi->failover);

	remove_vq_common(vi);

	free_netdev(vi->dev);
}

static __maybe_unused int virtnet_freeze(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;

	virtnet_cpu_notif_remove(vi);
	virtnet_freeze_down(vdev);
	remove_vq_common(vi);

	return 0;
}

static __maybe_unused int virtnet_restore(struct virtio_device *vdev)
{
	struct virtnet_info *vi = vdev->priv;
	int err;

	err = virtnet_restore_up(vdev);
	if (err)
		return err;
	virtnet_set_queues(vi, vi->curr_queue_pairs);

	err = virtnet_cpu_notif_add(vi);
	if (err)
		return err;

	return 0;
}

static struct virtio_device_id id_table[] = {
	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
	{ 0 },
};

#define VIRTNET_FEATURES \
	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
	VIRTIO_NET_F_MAC, \
	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
	VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \
	VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \
	VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \
	VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \
	VIRTIO_NET_F_CTRL_MAC_ADDR, \
	VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
	VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY

static unsigned int features[] = {
	VIRTNET_FEATURES,
};

static unsigned int features_legacy[] = {
	VIRTNET_FEATURES,
	VIRTIO_NET_F_GSO,
	VIRTIO_F_ANY_LAYOUT,
};
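
/* The legacy table differs only in also offering VIRTIO_NET_F_GSO and
 * VIRTIO_F_ANY_LAYOUT; modern (VIRTIO 1.0+) devices get any-layout semantics
 * implicitly via VIRTIO_F_VERSION_1, so those bits are negotiated only with
 * transitional/legacy devices.
 */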

static struct virtio_driver virtio_net_driver = {
	.feature_table = features,
	.feature_table_size = ARRAY_SIZE(features),
	.feature_table_legacy = features_legacy,
	.feature_table_size_legacy = ARRAY_SIZE(features_legacy),
	.driver.name = KBUILD_MODNAME,
	.driver.owner = THIS_MODULE,
	.id_table = id_table,
	.validate = virtnet_validate,
	.probe = virtnet_probe,
	.remove = virtnet_remove,
	.config_changed = virtnet_config_changed,
#ifdef CONFIG_PM_SLEEP
	.freeze = virtnet_freeze,
	.restore = virtnet_restore,
#endif
};

static __init int virtio_net_driver_init(void)
{
	int ret;

	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "virtio/net:online",
				      virtnet_cpu_online,
				      virtnet_cpu_down_prep);
	if (ret < 0)
		goto out;
	virtionet_online = ret;
	ret = cpuhp_setup_state_multi(CPUHP_VIRT_NET_DEAD, "virtio/net:dead",
				      NULL, virtnet_cpu_dead);
	if (ret)
		goto err_dead;

	ret = register_virtio_driver(&virtio_net_driver);
	if (ret)
		goto err_virtio;
	return 0;
err_virtio:
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
err_dead:
	cpuhp_remove_multi_state(virtionet_online);
out:
	return ret;
}
module_init(virtio_net_driver_init);

static __exit void virtio_net_driver_exit(void)
{
	unregister_virtio_driver(&virtio_net_driver);
	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
	cpuhp_remove_multi_state(virtionet_online);
}
module_exit(virtio_net_driver_exit);

MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_DESCRIPTION("Virtio network driver");
MODULE_LICENSE("GPL");