1 /* 2 * drivers/net/veth.c 3 * 4 * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc 5 * 6 * Author: Pavel Emelianov <xemul@openvz.org> 7 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com> 8 * 9 */ 10 11 #include <linux/netdevice.h> 12 #include <linux/slab.h> 13 #include <linux/ethtool.h> 14 #include <linux/etherdevice.h> 15 #include <linux/u64_stats_sync.h> 16 17 #include <net/rtnetlink.h> 18 #include <net/dst.h> 19 #include <net/xfrm.h> 20 #include <net/xdp.h> 21 #include <linux/veth.h> 22 #include <linux/module.h> 23 #include <linux/bpf.h> 24 #include <linux/filter.h> 25 #include <linux/ptr_ring.h> 26 #include <linux/bpf_trace.h> 27 28 #define DRV_NAME "veth" 29 #define DRV_VERSION "1.0" 30 31 #define VETH_XDP_FLAG BIT(0) 32 #define VETH_RING_SIZE 256 33 #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN) 34 35 /* Separating two types of XDP xmit */ 36 #define VETH_XDP_TX BIT(0) 37 #define VETH_XDP_REDIR BIT(1) 38 39 struct pcpu_vstats { 40 u64 packets; 41 u64 bytes; 42 struct u64_stats_sync syncp; 43 }; 44 45 struct veth_rq { 46 struct napi_struct xdp_napi; 47 struct net_device *dev; 48 struct bpf_prog __rcu *xdp_prog; 49 struct xdp_mem_info xdp_mem; 50 bool rx_notify_masked; 51 struct ptr_ring xdp_ring; 52 struct xdp_rxq_info xdp_rxq; 53 }; 54 55 struct veth_priv { 56 struct net_device __rcu *peer; 57 atomic64_t dropped; 58 struct bpf_prog *_xdp_prog; 59 struct veth_rq *rq; 60 unsigned int requested_headroom; 61 }; 62 63 /* 64 * ethtool interface 65 */ 66 67 static struct { 68 const char string[ETH_GSTRING_LEN]; 69 } ethtool_stats_keys[] = { 70 { "peer_ifindex" }, 71 }; 72 73 static int veth_get_link_ksettings(struct net_device *dev, 74 struct ethtool_link_ksettings *cmd) 75 { 76 cmd->base.speed = SPEED_10000; 77 cmd->base.duplex = DUPLEX_FULL; 78 cmd->base.port = PORT_TP; 79 cmd->base.autoneg = AUTONEG_DISABLE; 80 return 0; 81 } 82 83 static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) 84 { 85 strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); 86 strlcpy(info->version, DRV_VERSION, sizeof(info->version)); 87 } 88 89 static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) 90 { 91 switch(stringset) { 92 case ETH_SS_STATS: 93 memcpy(buf, ðtool_stats_keys, sizeof(ethtool_stats_keys)); 94 break; 95 } 96 } 97 98 static int veth_get_sset_count(struct net_device *dev, int sset) 99 { 100 switch (sset) { 101 case ETH_SS_STATS: 102 return ARRAY_SIZE(ethtool_stats_keys); 103 default: 104 return -EOPNOTSUPP; 105 } 106 } 107 108 static void veth_get_ethtool_stats(struct net_device *dev, 109 struct ethtool_stats *stats, u64 *data) 110 { 111 struct veth_priv *priv = netdev_priv(dev); 112 struct net_device *peer = rtnl_dereference(priv->peer); 113 114 data[0] = peer ? peer->ifindex : 0; 115 } 116 117 static const struct ethtool_ops veth_ethtool_ops = { 118 .get_drvinfo = veth_get_drvinfo, 119 .get_link = ethtool_op_get_link, 120 .get_strings = veth_get_strings, 121 .get_sset_count = veth_get_sset_count, 122 .get_ethtool_stats = veth_get_ethtool_stats, 123 .get_link_ksettings = veth_get_link_ksettings, 124 }; 125 126 /* general routines */ 127 128 static bool veth_is_xdp_frame(void *ptr) 129 { 130 return (unsigned long)ptr & VETH_XDP_FLAG; 131 } 132 133 static void *veth_ptr_to_xdp(void *ptr) 134 { 135 return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG); 136 } 137 138 static void *veth_xdp_to_ptr(void *ptr) 139 { 140 return (void *)((unsigned long)ptr | VETH_XDP_FLAG); 141 } 142 143 static void veth_ptr_free(void *ptr) 144 { 145 if (veth_is_xdp_frame(ptr)) 146 xdp_return_frame(veth_ptr_to_xdp(ptr)); 147 else 148 kfree_skb(ptr); 149 } 150 151 static void __veth_xdp_flush(struct veth_rq *rq) 152 { 153 /* Write ptr_ring before reading rx_notify_masked */ 154 smp_mb(); 155 if (!rq->rx_notify_masked) { 156 rq->rx_notify_masked = true; 157 napi_schedule(&rq->xdp_napi); 158 } 159 } 160 161 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) 162 { 163 if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { 164 dev_kfree_skb_any(skb); 165 return NET_RX_DROP; 166 } 167 168 return NET_RX_SUCCESS; 169 } 170 171 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, 172 struct veth_rq *rq, bool xdp) 173 { 174 return __dev_forward_skb(dev, skb) ?: xdp ? 175 veth_xdp_rx(rq, skb) : 176 netif_rx(skb); 177 } 178 179 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) 180 { 181 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 182 struct veth_rq *rq = NULL; 183 struct net_device *rcv; 184 int length = skb->len; 185 bool rcv_xdp = false; 186 int rxq; 187 188 rcu_read_lock(); 189 rcv = rcu_dereference(priv->peer); 190 if (unlikely(!rcv)) { 191 kfree_skb(skb); 192 goto drop; 193 } 194 195 rcv_priv = netdev_priv(rcv); 196 rxq = skb_get_queue_mapping(skb); 197 if (rxq < rcv->real_num_rx_queues) { 198 rq = &rcv_priv->rq[rxq]; 199 rcv_xdp = rcu_access_pointer(rq->xdp_prog); 200 if (rcv_xdp) 201 skb_record_rx_queue(skb, rxq); 202 } 203 204 if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) { 205 struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats); 206 207 u64_stats_update_begin(&stats->syncp); 208 stats->bytes += length; 209 stats->packets++; 210 u64_stats_update_end(&stats->syncp); 211 } else { 212 drop: 213 atomic64_inc(&priv->dropped); 214 } 215 216 if (rcv_xdp) 217 __veth_xdp_flush(rq); 218 219 rcu_read_unlock(); 220 221 return NETDEV_TX_OK; 222 } 223 224 static u64 veth_stats_one(struct pcpu_vstats *result, struct net_device *dev) 225 { 226 struct veth_priv *priv = netdev_priv(dev); 227 int cpu; 228 229 result->packets = 0; 230 result->bytes = 0; 231 for_each_possible_cpu(cpu) { 232 struct pcpu_vstats *stats = per_cpu_ptr(dev->vstats, cpu); 233 u64 packets, bytes; 234 unsigned int start; 235 236 do { 237 start = u64_stats_fetch_begin_irq(&stats->syncp); 238 packets = stats->packets; 239 bytes = stats->bytes; 240 } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); 241 result->packets += packets; 242 result->bytes += bytes; 243 } 244 return atomic64_read(&priv->dropped); 245 } 246 247 static void veth_get_stats64(struct net_device *dev, 248 struct rtnl_link_stats64 *tot) 249 { 250 struct veth_priv *priv = netdev_priv(dev); 251 struct net_device *peer; 252 struct pcpu_vstats one; 253 254 tot->tx_dropped = veth_stats_one(&one, dev); 255 tot->tx_bytes = one.bytes; 256 tot->tx_packets = one.packets; 257 258 rcu_read_lock(); 259 peer = rcu_dereference(priv->peer); 260 if (peer) { 261 tot->rx_dropped = veth_stats_one(&one, peer); 262 tot->rx_bytes = one.bytes; 263 tot->rx_packets = one.packets; 264 } 265 rcu_read_unlock(); 266 } 267 268 /* fake multicast ability */ 269 static void veth_set_multicast_list(struct net_device *dev) 270 { 271 } 272 273 static struct sk_buff *veth_build_skb(void *head, int headroom, int len, 274 int buflen) 275 { 276 struct sk_buff *skb; 277 278 if (!buflen) { 279 buflen = SKB_DATA_ALIGN(headroom + len) + 280 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 281 } 282 skb = build_skb(head, buflen); 283 if (!skb) 284 return NULL; 285 286 skb_reserve(skb, headroom); 287 skb_put(skb, len); 288 289 return skb; 290 } 291 292 static int veth_select_rxq(struct net_device *dev) 293 { 294 return smp_processor_id() % dev->real_num_rx_queues; 295 } 296 297 static int veth_xdp_xmit(struct net_device *dev, int n, 298 struct xdp_frame **frames, u32 flags) 299 { 300 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 301 struct net_device *rcv; 302 unsigned int max_len; 303 struct veth_rq *rq; 304 int i, drops = 0; 305 306 if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) 307 return -EINVAL; 308 309 rcv = rcu_dereference(priv->peer); 310 if (unlikely(!rcv)) 311 return -ENXIO; 312 313 rcv_priv = netdev_priv(rcv); 314 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 315 /* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive 316 * side. This means an XDP program is loaded on the peer and the peer 317 * device is up. 318 */ 319 if (!rcu_access_pointer(rq->xdp_prog)) 320 return -ENXIO; 321 322 max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN; 323 324 spin_lock(&rq->xdp_ring.producer_lock); 325 for (i = 0; i < n; i++) { 326 struct xdp_frame *frame = frames[i]; 327 void *ptr = veth_xdp_to_ptr(frame); 328 329 if (unlikely(frame->len > max_len || 330 __ptr_ring_produce(&rq->xdp_ring, ptr))) { 331 xdp_return_frame_rx_napi(frame); 332 drops++; 333 } 334 } 335 spin_unlock(&rq->xdp_ring.producer_lock); 336 337 if (flags & XDP_XMIT_FLUSH) 338 __veth_xdp_flush(rq); 339 340 return n - drops; 341 } 342 343 static void veth_xdp_flush(struct net_device *dev) 344 { 345 struct veth_priv *rcv_priv, *priv = netdev_priv(dev); 346 struct net_device *rcv; 347 struct veth_rq *rq; 348 349 rcu_read_lock(); 350 rcv = rcu_dereference(priv->peer); 351 if (unlikely(!rcv)) 352 goto out; 353 354 rcv_priv = netdev_priv(rcv); 355 rq = &rcv_priv->rq[veth_select_rxq(rcv)]; 356 /* xdp_ring is initialized on receive side? */ 357 if (unlikely(!rcu_access_pointer(rq->xdp_prog))) 358 goto out; 359 360 __veth_xdp_flush(rq); 361 out: 362 rcu_read_unlock(); 363 } 364 365 static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) 366 { 367 struct xdp_frame *frame = convert_to_xdp_frame(xdp); 368 369 if (unlikely(!frame)) 370 return -EOVERFLOW; 371 372 return veth_xdp_xmit(dev, 1, &frame, 0); 373 } 374 375 static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq, 376 struct xdp_frame *frame, 377 unsigned int *xdp_xmit) 378 { 379 void *hard_start = frame->data - frame->headroom; 380 void *head = hard_start - sizeof(struct xdp_frame); 381 int len = frame->len, delta = 0; 382 struct xdp_frame orig_frame; 383 struct bpf_prog *xdp_prog; 384 unsigned int headroom; 385 struct sk_buff *skb; 386 387 rcu_read_lock(); 388 xdp_prog = rcu_dereference(rq->xdp_prog); 389 if (likely(xdp_prog)) { 390 struct xdp_buff xdp; 391 u32 act; 392 393 xdp.data_hard_start = hard_start; 394 xdp.data = frame->data; 395 xdp.data_end = frame->data + frame->len; 396 xdp.data_meta = frame->data - frame->metasize; 397 xdp.rxq = &rq->xdp_rxq; 398 399 act = bpf_prog_run_xdp(xdp_prog, &xdp); 400 401 switch (act) { 402 case XDP_PASS: 403 delta = frame->data - xdp.data; 404 len = xdp.data_end - xdp.data; 405 break; 406 case XDP_TX: 407 orig_frame = *frame; 408 xdp.data_hard_start = head; 409 xdp.rxq->mem = frame->mem; 410 if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) { 411 trace_xdp_exception(rq->dev, xdp_prog, act); 412 frame = &orig_frame; 413 goto err_xdp; 414 } 415 *xdp_xmit |= VETH_XDP_TX; 416 rcu_read_unlock(); 417 goto xdp_xmit; 418 case XDP_REDIRECT: 419 orig_frame = *frame; 420 xdp.data_hard_start = head; 421 xdp.rxq->mem = frame->mem; 422 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) { 423 frame = &orig_frame; 424 goto err_xdp; 425 } 426 *xdp_xmit |= VETH_XDP_REDIR; 427 rcu_read_unlock(); 428 goto xdp_xmit; 429 default: 430 bpf_warn_invalid_xdp_action(act); 431 case XDP_ABORTED: 432 trace_xdp_exception(rq->dev, xdp_prog, act); 433 case XDP_DROP: 434 goto err_xdp; 435 } 436 } 437 rcu_read_unlock(); 438 439 headroom = sizeof(struct xdp_frame) + frame->headroom - delta; 440 skb = veth_build_skb(head, headroom, len, 0); 441 if (!skb) { 442 xdp_return_frame(frame); 443 goto err; 444 } 445 446 xdp_scrub_frame(frame); 447 skb->protocol = eth_type_trans(skb, rq->dev); 448 err: 449 return skb; 450 err_xdp: 451 rcu_read_unlock(); 452 xdp_return_frame(frame); 453 xdp_xmit: 454 return NULL; 455 } 456 457 static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb, 458 unsigned int *xdp_xmit) 459 { 460 u32 pktlen, headroom, act, metalen; 461 void *orig_data, *orig_data_end; 462 struct bpf_prog *xdp_prog; 463 int mac_len, delta, off; 464 struct xdp_buff xdp; 465 466 rcu_read_lock(); 467 xdp_prog = rcu_dereference(rq->xdp_prog); 468 if (unlikely(!xdp_prog)) { 469 rcu_read_unlock(); 470 goto out; 471 } 472 473 mac_len = skb->data - skb_mac_header(skb); 474 pktlen = skb->len + mac_len; 475 headroom = skb_headroom(skb) - mac_len; 476 477 if (skb_shared(skb) || skb_head_is_locked(skb) || 478 skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) { 479 struct sk_buff *nskb; 480 int size, head_off; 481 void *head, *start; 482 struct page *page; 483 484 size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) + 485 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 486 if (size > PAGE_SIZE) 487 goto drop; 488 489 page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); 490 if (!page) 491 goto drop; 492 493 head = page_address(page); 494 start = head + VETH_XDP_HEADROOM; 495 if (skb_copy_bits(skb, -mac_len, start, pktlen)) { 496 page_frag_free(head); 497 goto drop; 498 } 499 500 nskb = veth_build_skb(head, 501 VETH_XDP_HEADROOM + mac_len, skb->len, 502 PAGE_SIZE); 503 if (!nskb) { 504 page_frag_free(head); 505 goto drop; 506 } 507 508 skb_copy_header(nskb, skb); 509 head_off = skb_headroom(nskb) - skb_headroom(skb); 510 skb_headers_offset_update(nskb, head_off); 511 if (skb->sk) 512 skb_set_owner_w(nskb, skb->sk); 513 consume_skb(skb); 514 skb = nskb; 515 } 516 517 xdp.data_hard_start = skb->head; 518 xdp.data = skb_mac_header(skb); 519 xdp.data_end = xdp.data + pktlen; 520 xdp.data_meta = xdp.data; 521 xdp.rxq = &rq->xdp_rxq; 522 orig_data = xdp.data; 523 orig_data_end = xdp.data_end; 524 525 act = bpf_prog_run_xdp(xdp_prog, &xdp); 526 527 switch (act) { 528 case XDP_PASS: 529 break; 530 case XDP_TX: 531 get_page(virt_to_page(xdp.data)); 532 consume_skb(skb); 533 xdp.rxq->mem = rq->xdp_mem; 534 if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) { 535 trace_xdp_exception(rq->dev, xdp_prog, act); 536 goto err_xdp; 537 } 538 *xdp_xmit |= VETH_XDP_TX; 539 rcu_read_unlock(); 540 goto xdp_xmit; 541 case XDP_REDIRECT: 542 get_page(virt_to_page(xdp.data)); 543 consume_skb(skb); 544 xdp.rxq->mem = rq->xdp_mem; 545 if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) 546 goto err_xdp; 547 *xdp_xmit |= VETH_XDP_REDIR; 548 rcu_read_unlock(); 549 goto xdp_xmit; 550 default: 551 bpf_warn_invalid_xdp_action(act); 552 case XDP_ABORTED: 553 trace_xdp_exception(rq->dev, xdp_prog, act); 554 case XDP_DROP: 555 goto drop; 556 } 557 rcu_read_unlock(); 558 559 delta = orig_data - xdp.data; 560 off = mac_len + delta; 561 if (off > 0) 562 __skb_push(skb, off); 563 else if (off < 0) 564 __skb_pull(skb, -off); 565 skb->mac_header -= delta; 566 off = xdp.data_end - orig_data_end; 567 if (off != 0) 568 __skb_put(skb, off); 569 skb->protocol = eth_type_trans(skb, rq->dev); 570 571 metalen = xdp.data - xdp.data_meta; 572 if (metalen) 573 skb_metadata_set(skb, metalen); 574 out: 575 return skb; 576 drop: 577 rcu_read_unlock(); 578 kfree_skb(skb); 579 return NULL; 580 err_xdp: 581 rcu_read_unlock(); 582 page_frag_free(xdp.data); 583 xdp_xmit: 584 return NULL; 585 } 586 587 static int veth_xdp_rcv(struct veth_rq *rq, int budget, unsigned int *xdp_xmit) 588 { 589 int i, done = 0; 590 591 for (i = 0; i < budget; i++) { 592 void *ptr = __ptr_ring_consume(&rq->xdp_ring); 593 struct sk_buff *skb; 594 595 if (!ptr) 596 break; 597 598 if (veth_is_xdp_frame(ptr)) { 599 skb = veth_xdp_rcv_one(rq, veth_ptr_to_xdp(ptr), 600 xdp_xmit); 601 } else { 602 skb = veth_xdp_rcv_skb(rq, ptr, xdp_xmit); 603 } 604 605 if (skb) 606 napi_gro_receive(&rq->xdp_napi, skb); 607 608 done++; 609 } 610 611 return done; 612 } 613 614 static int veth_poll(struct napi_struct *napi, int budget) 615 { 616 struct veth_rq *rq = 617 container_of(napi, struct veth_rq, xdp_napi); 618 unsigned int xdp_xmit = 0; 619 int done; 620 621 xdp_set_return_frame_no_direct(); 622 done = veth_xdp_rcv(rq, budget, &xdp_xmit); 623 624 if (done < budget && napi_complete_done(napi, done)) { 625 /* Write rx_notify_masked before reading ptr_ring */ 626 smp_store_mb(rq->rx_notify_masked, false); 627 if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) { 628 rq->rx_notify_masked = true; 629 napi_schedule(&rq->xdp_napi); 630 } 631 } 632 633 if (xdp_xmit & VETH_XDP_TX) 634 veth_xdp_flush(rq->dev); 635 if (xdp_xmit & VETH_XDP_REDIR) 636 xdp_do_flush_map(); 637 xdp_clear_return_frame_no_direct(); 638 639 return done; 640 } 641 642 static int veth_napi_add(struct net_device *dev) 643 { 644 struct veth_priv *priv = netdev_priv(dev); 645 int err, i; 646 647 for (i = 0; i < dev->real_num_rx_queues; i++) { 648 struct veth_rq *rq = &priv->rq[i]; 649 650 err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL); 651 if (err) 652 goto err_xdp_ring; 653 } 654 655 for (i = 0; i < dev->real_num_rx_queues; i++) { 656 struct veth_rq *rq = &priv->rq[i]; 657 658 netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); 659 napi_enable(&rq->xdp_napi); 660 } 661 662 return 0; 663 err_xdp_ring: 664 for (i--; i >= 0; i--) 665 ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free); 666 667 return err; 668 } 669 670 static void veth_napi_del(struct net_device *dev) 671 { 672 struct veth_priv *priv = netdev_priv(dev); 673 int i; 674 675 for (i = 0; i < dev->real_num_rx_queues; i++) { 676 struct veth_rq *rq = &priv->rq[i]; 677 678 napi_disable(&rq->xdp_napi); 679 napi_hash_del(&rq->xdp_napi); 680 } 681 synchronize_net(); 682 683 for (i = 0; i < dev->real_num_rx_queues; i++) { 684 struct veth_rq *rq = &priv->rq[i]; 685 686 netif_napi_del(&rq->xdp_napi); 687 rq->rx_notify_masked = false; 688 ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free); 689 } 690 } 691 692 static int veth_enable_xdp(struct net_device *dev) 693 { 694 struct veth_priv *priv = netdev_priv(dev); 695 int err, i; 696 697 if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) { 698 for (i = 0; i < dev->real_num_rx_queues; i++) { 699 struct veth_rq *rq = &priv->rq[i]; 700 701 err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i); 702 if (err < 0) 703 goto err_rxq_reg; 704 705 err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq, 706 MEM_TYPE_PAGE_SHARED, 707 NULL); 708 if (err < 0) 709 goto err_reg_mem; 710 711 /* Save original mem info as it can be overwritten */ 712 rq->xdp_mem = rq->xdp_rxq.mem; 713 } 714 715 err = veth_napi_add(dev); 716 if (err) 717 goto err_rxq_reg; 718 } 719 720 for (i = 0; i < dev->real_num_rx_queues; i++) 721 rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); 722 723 return 0; 724 err_reg_mem: 725 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 726 err_rxq_reg: 727 for (i--; i >= 0; i--) 728 xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq); 729 730 return err; 731 } 732 733 static void veth_disable_xdp(struct net_device *dev) 734 { 735 struct veth_priv *priv = netdev_priv(dev); 736 int i; 737 738 for (i = 0; i < dev->real_num_rx_queues; i++) 739 rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); 740 veth_napi_del(dev); 741 for (i = 0; i < dev->real_num_rx_queues; i++) { 742 struct veth_rq *rq = &priv->rq[i]; 743 744 rq->xdp_rxq.mem = rq->xdp_mem; 745 xdp_rxq_info_unreg(&rq->xdp_rxq); 746 } 747 } 748 749 static int veth_open(struct net_device *dev) 750 { 751 struct veth_priv *priv = netdev_priv(dev); 752 struct net_device *peer = rtnl_dereference(priv->peer); 753 int err; 754 755 if (!peer) 756 return -ENOTCONN; 757 758 if (priv->_xdp_prog) { 759 err = veth_enable_xdp(dev); 760 if (err) 761 return err; 762 } 763 764 if (peer->flags & IFF_UP) { 765 netif_carrier_on(dev); 766 netif_carrier_on(peer); 767 } 768 769 return 0; 770 } 771 772 static int veth_close(struct net_device *dev) 773 { 774 struct veth_priv *priv = netdev_priv(dev); 775 struct net_device *peer = rtnl_dereference(priv->peer); 776 777 netif_carrier_off(dev); 778 if (peer) 779 netif_carrier_off(peer); 780 781 if (priv->_xdp_prog) 782 veth_disable_xdp(dev); 783 784 return 0; 785 } 786 787 static int is_valid_veth_mtu(int mtu) 788 { 789 return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU; 790 } 791 792 static int veth_dev_init(struct net_device *dev) 793 { 794 dev->vstats = netdev_alloc_pcpu_stats(struct pcpu_vstats); 795 if (!dev->vstats) 796 return -ENOMEM; 797 return 0; 798 } 799 800 static void veth_dev_free(struct net_device *dev) 801 { 802 free_percpu(dev->vstats); 803 } 804 805 #ifdef CONFIG_NET_POLL_CONTROLLER 806 static void veth_poll_controller(struct net_device *dev) 807 { 808 /* veth only receives frames when its peer sends one 809 * Since it has nothing to do with disabling irqs, we are guaranteed 810 * never to have pending data when we poll for it so 811 * there is nothing to do here. 812 * 813 * We need this though so netpoll recognizes us as an interface that 814 * supports polling, which enables bridge devices in virt setups to 815 * still use netconsole 816 */ 817 } 818 #endif /* CONFIG_NET_POLL_CONTROLLER */ 819 820 static int veth_get_iflink(const struct net_device *dev) 821 { 822 struct veth_priv *priv = netdev_priv(dev); 823 struct net_device *peer; 824 int iflink; 825 826 rcu_read_lock(); 827 peer = rcu_dereference(priv->peer); 828 iflink = peer ? peer->ifindex : 0; 829 rcu_read_unlock(); 830 831 return iflink; 832 } 833 834 static netdev_features_t veth_fix_features(struct net_device *dev, 835 netdev_features_t features) 836 { 837 struct veth_priv *priv = netdev_priv(dev); 838 struct net_device *peer; 839 840 peer = rtnl_dereference(priv->peer); 841 if (peer) { 842 struct veth_priv *peer_priv = netdev_priv(peer); 843 844 if (peer_priv->_xdp_prog) 845 features &= ~NETIF_F_GSO_SOFTWARE; 846 } 847 848 return features; 849 } 850 851 static void veth_set_rx_headroom(struct net_device *dev, int new_hr) 852 { 853 struct veth_priv *peer_priv, *priv = netdev_priv(dev); 854 struct net_device *peer; 855 856 if (new_hr < 0) 857 new_hr = 0; 858 859 rcu_read_lock(); 860 peer = rcu_dereference(priv->peer); 861 if (unlikely(!peer)) 862 goto out; 863 864 peer_priv = netdev_priv(peer); 865 priv->requested_headroom = new_hr; 866 new_hr = max(priv->requested_headroom, peer_priv->requested_headroom); 867 dev->needed_headroom = new_hr; 868 peer->needed_headroom = new_hr; 869 870 out: 871 rcu_read_unlock(); 872 } 873 874 static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog, 875 struct netlink_ext_ack *extack) 876 { 877 struct veth_priv *priv = netdev_priv(dev); 878 struct bpf_prog *old_prog; 879 struct net_device *peer; 880 unsigned int max_mtu; 881 int err; 882 883 old_prog = priv->_xdp_prog; 884 priv->_xdp_prog = prog; 885 peer = rtnl_dereference(priv->peer); 886 887 if (prog) { 888 if (!peer) { 889 NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached"); 890 err = -ENOTCONN; 891 goto err; 892 } 893 894 max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM - 895 peer->hard_header_len - 896 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 897 if (peer->mtu > max_mtu) { 898 NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP"); 899 err = -ERANGE; 900 goto err; 901 } 902 903 if (dev->real_num_rx_queues < peer->real_num_tx_queues) { 904 NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues"); 905 err = -ENOSPC; 906 goto err; 907 } 908 909 if (dev->flags & IFF_UP) { 910 err = veth_enable_xdp(dev); 911 if (err) { 912 NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed"); 913 goto err; 914 } 915 } 916 917 if (!old_prog) { 918 peer->hw_features &= ~NETIF_F_GSO_SOFTWARE; 919 peer->max_mtu = max_mtu; 920 } 921 } 922 923 if (old_prog) { 924 if (!prog) { 925 if (dev->flags & IFF_UP) 926 veth_disable_xdp(dev); 927 928 if (peer) { 929 peer->hw_features |= NETIF_F_GSO_SOFTWARE; 930 peer->max_mtu = ETH_MAX_MTU; 931 } 932 } 933 bpf_prog_put(old_prog); 934 } 935 936 if ((!!old_prog ^ !!prog) && peer) 937 netdev_update_features(peer); 938 939 return 0; 940 err: 941 priv->_xdp_prog = old_prog; 942 943 return err; 944 } 945 946 static u32 veth_xdp_query(struct net_device *dev) 947 { 948 struct veth_priv *priv = netdev_priv(dev); 949 const struct bpf_prog *xdp_prog; 950 951 xdp_prog = priv->_xdp_prog; 952 if (xdp_prog) 953 return xdp_prog->aux->id; 954 955 return 0; 956 } 957 958 static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) 959 { 960 switch (xdp->command) { 961 case XDP_SETUP_PROG: 962 return veth_xdp_set(dev, xdp->prog, xdp->extack); 963 case XDP_QUERY_PROG: 964 xdp->prog_id = veth_xdp_query(dev); 965 return 0; 966 default: 967 return -EINVAL; 968 } 969 } 970 971 static const struct net_device_ops veth_netdev_ops = { 972 .ndo_init = veth_dev_init, 973 .ndo_open = veth_open, 974 .ndo_stop = veth_close, 975 .ndo_start_xmit = veth_xmit, 976 .ndo_get_stats64 = veth_get_stats64, 977 .ndo_set_rx_mode = veth_set_multicast_list, 978 .ndo_set_mac_address = eth_mac_addr, 979 #ifdef CONFIG_NET_POLL_CONTROLLER 980 .ndo_poll_controller = veth_poll_controller, 981 #endif 982 .ndo_get_iflink = veth_get_iflink, 983 .ndo_fix_features = veth_fix_features, 984 .ndo_features_check = passthru_features_check, 985 .ndo_set_rx_headroom = veth_set_rx_headroom, 986 .ndo_bpf = veth_xdp, 987 .ndo_xdp_xmit = veth_xdp_xmit, 988 }; 989 990 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ 991 NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \ 992 NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ 993 NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \ 994 NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX ) 995 996 static void veth_setup(struct net_device *dev) 997 { 998 ether_setup(dev); 999 1000 dev->priv_flags &= ~IFF_TX_SKB_SHARING; 1001 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 1002 dev->priv_flags |= IFF_NO_QUEUE; 1003 dev->priv_flags |= IFF_PHONY_HEADROOM; 1004 1005 dev->netdev_ops = &veth_netdev_ops; 1006 dev->ethtool_ops = &veth_ethtool_ops; 1007 dev->features |= NETIF_F_LLTX; 1008 dev->features |= VETH_FEATURES; 1009 dev->vlan_features = dev->features & 1010 ~(NETIF_F_HW_VLAN_CTAG_TX | 1011 NETIF_F_HW_VLAN_STAG_TX | 1012 NETIF_F_HW_VLAN_CTAG_RX | 1013 NETIF_F_HW_VLAN_STAG_RX); 1014 dev->needs_free_netdev = true; 1015 dev->priv_destructor = veth_dev_free; 1016 dev->max_mtu = ETH_MAX_MTU; 1017 1018 dev->hw_features = VETH_FEATURES; 1019 dev->hw_enc_features = VETH_FEATURES; 1020 dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; 1021 } 1022 1023 /* 1024 * netlink interface 1025 */ 1026 1027 static int veth_validate(struct nlattr *tb[], struct nlattr *data[], 1028 struct netlink_ext_ack *extack) 1029 { 1030 if (tb[IFLA_ADDRESS]) { 1031 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 1032 return -EINVAL; 1033 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 1034 return -EADDRNOTAVAIL; 1035 } 1036 if (tb[IFLA_MTU]) { 1037 if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU]))) 1038 return -EINVAL; 1039 } 1040 return 0; 1041 } 1042 1043 static int veth_alloc_queues(struct net_device *dev) 1044 { 1045 struct veth_priv *priv = netdev_priv(dev); 1046 1047 priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL); 1048 if (!priv->rq) 1049 return -ENOMEM; 1050 1051 return 0; 1052 } 1053 1054 static void veth_free_queues(struct net_device *dev) 1055 { 1056 struct veth_priv *priv = netdev_priv(dev); 1057 1058 kfree(priv->rq); 1059 } 1060 1061 static struct rtnl_link_ops veth_link_ops; 1062 1063 static int veth_newlink(struct net *src_net, struct net_device *dev, 1064 struct nlattr *tb[], struct nlattr *data[], 1065 struct netlink_ext_ack *extack) 1066 { 1067 int err, i; 1068 struct net_device *peer; 1069 struct veth_priv *priv; 1070 char ifname[IFNAMSIZ]; 1071 struct nlattr *peer_tb[IFLA_MAX + 1], **tbp; 1072 unsigned char name_assign_type; 1073 struct ifinfomsg *ifmp; 1074 struct net *net; 1075 1076 /* 1077 * create and register peer first 1078 */ 1079 if (data != NULL && data[VETH_INFO_PEER] != NULL) { 1080 struct nlattr *nla_peer; 1081 1082 nla_peer = data[VETH_INFO_PEER]; 1083 ifmp = nla_data(nla_peer); 1084 err = rtnl_nla_parse_ifla(peer_tb, 1085 nla_data(nla_peer) + sizeof(struct ifinfomsg), 1086 nla_len(nla_peer) - sizeof(struct ifinfomsg), 1087 NULL); 1088 if (err < 0) 1089 return err; 1090 1091 err = veth_validate(peer_tb, NULL, extack); 1092 if (err < 0) 1093 return err; 1094 1095 tbp = peer_tb; 1096 } else { 1097 ifmp = NULL; 1098 tbp = tb; 1099 } 1100 1101 if (ifmp && tbp[IFLA_IFNAME]) { 1102 nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); 1103 name_assign_type = NET_NAME_USER; 1104 } else { 1105 snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); 1106 name_assign_type = NET_NAME_ENUM; 1107 } 1108 1109 net = rtnl_link_get_net(src_net, tbp); 1110 if (IS_ERR(net)) 1111 return PTR_ERR(net); 1112 1113 peer = rtnl_create_link(net, ifname, name_assign_type, 1114 &veth_link_ops, tbp); 1115 if (IS_ERR(peer)) { 1116 put_net(net); 1117 return PTR_ERR(peer); 1118 } 1119 1120 err = veth_alloc_queues(peer); 1121 if (err) { 1122 put_net(net); 1123 goto err_peer_alloc_queues; 1124 } 1125 1126 if (!ifmp || !tbp[IFLA_ADDRESS]) 1127 eth_hw_addr_random(peer); 1128 1129 if (ifmp && (dev->ifindex != 0)) 1130 peer->ifindex = ifmp->ifi_index; 1131 1132 peer->gso_max_size = dev->gso_max_size; 1133 peer->gso_max_segs = dev->gso_max_segs; 1134 1135 err = register_netdevice(peer); 1136 put_net(net); 1137 net = NULL; 1138 if (err < 0) 1139 goto err_register_peer; 1140 1141 netif_carrier_off(peer); 1142 1143 err = rtnl_configure_link(peer, ifmp); 1144 if (err < 0) 1145 goto err_configure_peer; 1146 1147 /* 1148 * register dev last 1149 * 1150 * note, that since we've registered new device the dev's name 1151 * should be re-allocated 1152 */ 1153 1154 err = veth_alloc_queues(dev); 1155 if (err) 1156 goto err_alloc_queues; 1157 1158 if (tb[IFLA_ADDRESS] == NULL) 1159 eth_hw_addr_random(dev); 1160 1161 if (tb[IFLA_IFNAME]) 1162 nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); 1163 else 1164 snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); 1165 1166 err = register_netdevice(dev); 1167 if (err < 0) 1168 goto err_register_dev; 1169 1170 netif_carrier_off(dev); 1171 1172 /* 1173 * tie the deviced together 1174 */ 1175 1176 priv = netdev_priv(dev); 1177 for (i = 0; i < dev->real_num_rx_queues; i++) 1178 priv->rq[i].dev = dev; 1179 rcu_assign_pointer(priv->peer, peer); 1180 1181 priv = netdev_priv(peer); 1182 for (i = 0; i < peer->real_num_rx_queues; i++) 1183 priv->rq[i].dev = peer; 1184 rcu_assign_pointer(priv->peer, dev); 1185 1186 return 0; 1187 1188 err_register_dev: 1189 veth_free_queues(dev); 1190 err_alloc_queues: 1191 /* nothing to do */ 1192 err_configure_peer: 1193 unregister_netdevice(peer); 1194 return err; 1195 1196 err_register_peer: 1197 veth_free_queues(peer); 1198 err_peer_alloc_queues: 1199 free_netdev(peer); 1200 return err; 1201 } 1202 1203 static void veth_dellink(struct net_device *dev, struct list_head *head) 1204 { 1205 struct veth_priv *priv; 1206 struct net_device *peer; 1207 1208 priv = netdev_priv(dev); 1209 peer = rtnl_dereference(priv->peer); 1210 1211 /* Note : dellink() is called from default_device_exit_batch(), 1212 * before a rcu_synchronize() point. The devices are guaranteed 1213 * not being freed before one RCU grace period. 1214 */ 1215 RCU_INIT_POINTER(priv->peer, NULL); 1216 unregister_netdevice_queue(dev, head); 1217 1218 if (peer) { 1219 priv = netdev_priv(peer); 1220 RCU_INIT_POINTER(priv->peer, NULL); 1221 unregister_netdevice_queue(peer, head); 1222 } 1223 } 1224 1225 static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = { 1226 [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, 1227 }; 1228 1229 static struct net *veth_get_link_net(const struct net_device *dev) 1230 { 1231 struct veth_priv *priv = netdev_priv(dev); 1232 struct net_device *peer = rtnl_dereference(priv->peer); 1233 1234 return peer ? dev_net(peer) : dev_net(dev); 1235 } 1236 1237 static struct rtnl_link_ops veth_link_ops = { 1238 .kind = DRV_NAME, 1239 .priv_size = sizeof(struct veth_priv), 1240 .setup = veth_setup, 1241 .validate = veth_validate, 1242 .newlink = veth_newlink, 1243 .dellink = veth_dellink, 1244 .policy = veth_policy, 1245 .maxtype = VETH_INFO_MAX, 1246 .get_link_net = veth_get_link_net, 1247 }; 1248 1249 /* 1250 * init/fini 1251 */ 1252 1253 static __init int veth_init(void) 1254 { 1255 return rtnl_link_register(&veth_link_ops); 1256 } 1257 1258 static __exit void veth_exit(void) 1259 { 1260 rtnl_link_unregister(&veth_link_ops); 1261 } 1262 1263 module_init(veth_init); 1264 module_exit(veth_exit); 1265 1266 MODULE_DESCRIPTION("Virtual Ethernet Tunnel"); 1267 MODULE_LICENSE("GPL v2"); 1268 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 1269