xref: /linux/drivers/net/veth.c (revision 307797159ac25fe5a2048bf5c6a5718298edca57)
/*
 *  drivers/net/veth.c
 *
 *  Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc
 *
 * Author: Pavel Emelianov <xemul@openvz.org>
 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com>
 *
 */

#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/ethtool.h>
#include <linux/etherdevice.h>
#include <linux/u64_stats_sync.h>

#include <net/rtnetlink.h>
#include <net/dst.h>
#include <net/xfrm.h>
#include <net/xdp.h>
#include <linux/veth.h>
#include <linux/module.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ptr_ring.h>
#include <linux/bpf_trace.h>

#define DRV_NAME	"veth"
#define DRV_VERSION	"1.0"

#define VETH_XDP_FLAG		BIT(0)
#define VETH_RING_SIZE		256
#define VETH_XDP_HEADROOM	(XDP_PACKET_HEADROOM + NET_IP_ALIGN)

/* Separating two types of XDP xmit */
#define VETH_XDP_TX		BIT(0)
#define VETH_XDP_REDIR		BIT(1)

struct pcpu_vstats {
	u64			packets;
	u64			bytes;
	struct u64_stats_sync	syncp;
};

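/* Per-RX-queue state. The ptr_ring, NAPI context and rxq info below are only
 * initialized while an XDP program is attached to this device (see
 * veth_enable_xdp()); without XDP, received skbs go straight to netif_rx().
 */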
struct veth_rq {
	struct napi_struct	xdp_napi;
	struct net_device	*dev;
	struct bpf_prog __rcu	*xdp_prog;
	struct xdp_mem_info	xdp_mem;
	bool			rx_notify_masked;
	struct ptr_ring		xdp_ring;
	struct xdp_rxq_info	xdp_rxq;
};

struct veth_priv {
	struct net_device __rcu	*peer;
	atomic64_t		dropped;
	struct bpf_prog		*_xdp_prog;
	struct veth_rq		*rq;
	unsigned int		requested_headroom;
};

/*
 * ethtool interface
 */

static struct {
	const char string[ETH_GSTRING_LEN];
} ethtool_stats_keys[] = {
	{ "peer_ifindex" },
};

static int veth_get_link_ksettings(struct net_device *dev,
				   struct ethtool_link_ksettings *cmd)
{
	cmd->base.speed		= SPEED_10000;
	cmd->base.duplex	= DUPLEX_FULL;
	cmd->base.port		= PORT_TP;
	cmd->base.autoneg	= AUTONEG_DISABLE;
	return 0;
}

static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
{
	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
}

static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
{
	switch(stringset) {
	case ETH_SS_STATS:
		memcpy(buf, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
		break;
	}
}

static int veth_get_sset_count(struct net_device *dev, int sset)
{
	switch (sset) {
	case ETH_SS_STATS:
		return ARRAY_SIZE(ethtool_stats_keys);
	default:
		return -EOPNOTSUPP;
	}
}

static void veth_get_ethtool_stats(struct net_device *dev,
		struct ethtool_stats *stats, u64 *data)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);

	data[0] = peer ? peer->ifindex : 0;
}

static const struct ethtool_ops veth_ethtool_ops = {
	.get_drvinfo		= veth_get_drvinfo,
	.get_link		= ethtool_op_get_link,
	.get_strings		= veth_get_strings,
	.get_sset_count		= veth_get_sset_count,
	.get_ethtool_stats	= veth_get_ethtool_stats,
	.get_link_ksettings	= veth_get_link_ksettings,
};

/* general routines */

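/* Both sk_buffs queued by veth_xmit() and xdp_frames queued by
 * veth_xdp_xmit() travel through the same per-queue ptr_ring. xdp_frame
 * pointers are tagged with VETH_XDP_FLAG in bit 0 (always clear in a real
 * kernel pointer) so the consumer can tell the two apart.
 */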
static bool veth_is_xdp_frame(void *ptr)
{
	return (unsigned long)ptr & VETH_XDP_FLAG;
}

static void *veth_ptr_to_xdp(void *ptr)
{
	return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG);
}

static void *veth_xdp_to_ptr(void *ptr)
{
	return (void *)((unsigned long)ptr | VETH_XDP_FLAG);
}

static void veth_ptr_free(void *ptr)
{
	if (veth_is_xdp_frame(ptr))
		xdp_return_frame(veth_ptr_to_xdp(ptr));
	else
		kfree_skb(ptr);
}

static void __veth_xdp_flush(struct veth_rq *rq)
{
	/* Write ptr_ring before reading rx_notify_masked */
	smp_mb();
	if (!rq->rx_notify_masked) {
		rq->rx_notify_masked = true;
		napi_schedule(&rq->xdp_napi);
	}
}

static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
{
	if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
		dev_kfree_skb_any(skb);
		return NET_RX_DROP;
	}

	return NET_RX_SUCCESS;
}

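/* Hand an skb to the receiving device: if the receiver has an XDP program
 * attached, queue the skb on its per-queue xdp_ring so veth_poll() handles
 * it in NAPI context; otherwise use the traditional netif_rx() path.
 */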
static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
			    struct veth_rq *rq, bool xdp)
{
	return __dev_forward_skb(dev, skb) ?: xdp ?
		veth_xdp_rx(rq, skb) :
		netif_rx(skb);
}

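/* ndo_start_xmit: deliver the skb to the peer under RCU. When the peer has
 * an XDP program, the rq matching the skb's queue mapping is used and its
 * NAPI is kicked after the skb is queued. Per-cpu counters track successful
 * transmissions; drops are accounted in priv->dropped.
 */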
static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	struct veth_rq *rq = NULL;
	struct net_device *rcv;
	int length = skb->len;
	bool rcv_xdp = false;
	int rxq;

	rcu_read_lock();
	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv)) {
		kfree_skb(skb);
		goto drop;
	}

	rcv_priv = netdev_priv(rcv);
	rxq = skb_get_queue_mapping(skb);
	if (rxq < rcv->real_num_rx_queues) {
		rq = &rcv_priv->rq[rxq];
		rcv_xdp = rcu_access_pointer(rq->xdp_prog);
		if (rcv_xdp)
			skb_record_rx_queue(skb, rxq);
	}

	if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) {
		struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);

		u64_stats_update_begin(&stats->syncp);
		stats->bytes += length;
		stats->packets++;
		u64_stats_update_end(&stats->syncp);
	} else {
drop:
		atomic64_inc(&priv->dropped);
	}

	if (rcv_xdp)
		__veth_xdp_flush(rq);

	rcu_read_unlock();

	return NETDEV_TX_OK;
}

static u64 veth_stats_one(struct pcpu_vstats *result, struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int cpu;

	result->packets = 0;
	result->bytes = 0;
	for_each_possible_cpu(cpu) {
		struct pcpu_vstats *stats = per_cpu_ptr(dev->vstats, cpu);
		u64 packets, bytes;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			packets = stats->packets;
			bytes = stats->bytes;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
		result->packets += packets;
		result->bytes += bytes;
	}
	return atomic64_read(&priv->dropped);
}

static void veth_get_stats64(struct net_device *dev,
			     struct rtnl_link_stats64 *tot)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;
	struct pcpu_vstats one;

	tot->tx_dropped = veth_stats_one(&one, dev);
	tot->tx_bytes = one.bytes;
	tot->tx_packets = one.packets;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (peer) {
		tot->rx_dropped = veth_stats_one(&one, peer);
		tot->rx_bytes = one.bytes;
		tot->rx_packets = one.packets;
	}
	rcu_read_unlock();
}

/* fake multicast ability */
static void veth_set_multicast_list(struct net_device *dev)
{
}

static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
				      int buflen)
{
	struct sk_buff *skb;

	if (!buflen) {
		buflen = SKB_DATA_ALIGN(headroom + len) +
			 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	}
	skb = build_skb(head, buflen);
	if (!skb)
		return NULL;

	skb_reserve(skb, headroom);
	skb_put(skb, len);

	return skb;
}

static int veth_select_rxq(struct net_device *dev)
{
	return smp_processor_id() % dev->real_num_rx_queues;
}

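/* ndo_xdp_xmit: target side of XDP_REDIRECT (and of veth's own XDP_TX).
 * Frames are pushed straight onto the peer's xdp_ring as tagged pointers.
 * The peer must have an XDP program attached, which guarantees the ring and
 * NAPI exist; frames exceeding the peer's MTU plus link headers, or that do
 * not fit in the ring, are dropped. Returns the number of frames accepted.
 */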
static int veth_xdp_xmit(struct net_device *dev, int n,
			 struct xdp_frame **frames, u32 flags)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	struct net_device *rcv;
	unsigned int max_len;
	struct veth_rq *rq;
	int i, drops = 0;

	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
		return -EINVAL;

	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv))
		return -ENXIO;

	rcv_priv = netdev_priv(rcv);
	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
	/* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive
	 * side. This means an XDP program is loaded on the peer and the peer
	 * device is up.
	 */
	if (!rcu_access_pointer(rq->xdp_prog))
		return -ENXIO;

	max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN;

	spin_lock(&rq->xdp_ring.producer_lock);
	for (i = 0; i < n; i++) {
		struct xdp_frame *frame = frames[i];
		void *ptr = veth_xdp_to_ptr(frame);

		if (unlikely(frame->len > max_len ||
			     __ptr_ring_produce(&rq->xdp_ring, ptr))) {
			xdp_return_frame_rx_napi(frame);
			drops++;
		}
	}
	spin_unlock(&rq->xdp_ring.producer_lock);

	if (flags & XDP_XMIT_FLUSH)
		__veth_xdp_flush(rq);

	return n - drops;
}

static void veth_xdp_flush(struct net_device *dev)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	struct net_device *rcv;
	struct veth_rq *rq;

	rcu_read_lock();
	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv))
		goto out;

	rcv_priv = netdev_priv(rcv);
	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
	/* A non-NULL xdp_prog guarantees xdp_ring is initialized on receive side */
	if (unlikely(!rcu_access_pointer(rq->xdp_prog)))
		goto out;

	__veth_xdp_flush(rq);
out:
	rcu_read_unlock();
}

static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
{
	struct xdp_frame *frame = convert_to_xdp_frame(xdp);

	if (unlikely(!frame))
		return -EOVERFLOW;

	return veth_xdp_xmit(dev, 1, &frame, 0);
}

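/* Process one xdp_frame taken off the ring. The buffer's headroom still holds
 * the struct xdp_frame itself, so "head" points at the start of the original
 * buffer. Run the local XDP program on it; on XDP_PASS, rebuild an sk_buff
 * around the same buffer with veth_build_skb().
 */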
static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq,
					struct xdp_frame *frame,
					unsigned int *xdp_xmit)
{
	void *hard_start = frame->data - frame->headroom;
	void *head = hard_start - sizeof(struct xdp_frame);
	int len = frame->len, delta = 0;
	struct xdp_frame orig_frame;
	struct bpf_prog *xdp_prog;
	unsigned int headroom;
	struct sk_buff *skb;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (likely(xdp_prog)) {
		struct xdp_buff xdp;
		u32 act;

		xdp.data_hard_start = hard_start;
		xdp.data = frame->data;
		xdp.data_end = frame->data + frame->len;
		xdp.data_meta = frame->data - frame->metasize;
		xdp.rxq = &rq->xdp_rxq;

		act = bpf_prog_run_xdp(xdp_prog, &xdp);

		switch (act) {
		case XDP_PASS:
			delta = frame->data - xdp.data;
			len = xdp.data_end - xdp.data;
			break;
		case XDP_TX:
			orig_frame = *frame;
			xdp.data_hard_start = head;
			xdp.rxq->mem = frame->mem;
			if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) {
				trace_xdp_exception(rq->dev, xdp_prog, act);
				frame = &orig_frame;
				goto err_xdp;
			}
			*xdp_xmit |= VETH_XDP_TX;
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
			orig_frame = *frame;
			xdp.data_hard_start = head;
			xdp.rxq->mem = frame->mem;
			if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
				frame = &orig_frame;
				goto err_xdp;
			}
			*xdp_xmit |= VETH_XDP_REDIR;
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(act);
		case XDP_ABORTED:
			trace_xdp_exception(rq->dev, xdp_prog, act);
		case XDP_DROP:
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	headroom = sizeof(struct xdp_frame) + frame->headroom - delta;
	skb = veth_build_skb(head, headroom, len, 0);
	if (!skb) {
		xdp_return_frame(frame);
		goto err;
	}

	xdp_scrub_frame(frame);
	skb->protocol = eth_type_trans(skb, rq->dev);
err:
	return skb;
err_xdp:
	rcu_read_unlock();
	xdp_return_frame(frame);
xdp_xmit:
	return NULL;
}

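/* Run the XDP program on an skb that the peer queued via veth_xmit(). If the
 * skb is shared, has a locked (cloned) head, is nonlinear, or lacks
 * XDP_PACKET_HEADROOM of headroom, copy it into a freshly allocated page
 * first so the program can safely adjust head and tail room.
 */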
static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb,
					unsigned int *xdp_xmit)
{
	u32 pktlen, headroom, act, metalen;
	void *orig_data, *orig_data_end;
	struct bpf_prog *xdp_prog;
	int mac_len, delta, off;
	struct xdp_buff xdp;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (unlikely(!xdp_prog)) {
		rcu_read_unlock();
		goto out;
	}

	mac_len = skb->data - skb_mac_header(skb);
	pktlen = skb->len + mac_len;
	headroom = skb_headroom(skb) - mac_len;

	if (skb_shared(skb) || skb_head_is_locked(skb) ||
	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
		struct sk_buff *nskb;
		int size, head_off;
		void *head, *start;
		struct page *page;

		size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) +
		       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		if (size > PAGE_SIZE)
			goto drop;

		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
		if (!page)
			goto drop;

		head = page_address(page);
		start = head + VETH_XDP_HEADROOM;
		if (skb_copy_bits(skb, -mac_len, start, pktlen)) {
			page_frag_free(head);
			goto drop;
		}

		nskb = veth_build_skb(head,
				      VETH_XDP_HEADROOM + mac_len, skb->len,
				      PAGE_SIZE);
		if (!nskb) {
			page_frag_free(head);
			goto drop;
		}

		skb_copy_header(nskb, skb);
		head_off = skb_headroom(nskb) - skb_headroom(skb);
		skb_headers_offset_update(nskb, head_off);
		if (skb->sk)
			skb_set_owner_w(nskb, skb->sk);
		consume_skb(skb);
		skb = nskb;
	}

	xdp.data_hard_start = skb->head;
	xdp.data = skb_mac_header(skb);
	xdp.data_end = xdp.data + pktlen;
	xdp.data_meta = xdp.data;
	xdp.rxq = &rq->xdp_rxq;
	orig_data = xdp.data;
	orig_data_end = xdp.data_end;

	act = bpf_prog_run_xdp(xdp_prog, &xdp);

	switch (act) {
	case XDP_PASS:
		break;
	case XDP_TX:
		get_page(virt_to_page(xdp.data));
		consume_skb(skb);
		xdp.rxq->mem = rq->xdp_mem;
		if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) {
			trace_xdp_exception(rq->dev, xdp_prog, act);
			goto err_xdp;
		}
		*xdp_xmit |= VETH_XDP_TX;
		rcu_read_unlock();
		goto xdp_xmit;
	case XDP_REDIRECT:
		get_page(virt_to_page(xdp.data));
		consume_skb(skb);
		xdp.rxq->mem = rq->xdp_mem;
		if (xdp_do_redirect(rq->dev, &xdp, xdp_prog))
			goto err_xdp;
		*xdp_xmit |= VETH_XDP_REDIR;
		rcu_read_unlock();
		goto xdp_xmit;
	default:
		bpf_warn_invalid_xdp_action(act);
	case XDP_ABORTED:
		trace_xdp_exception(rq->dev, xdp_prog, act);
	case XDP_DROP:
		goto drop;
	}
	rcu_read_unlock();

	delta = orig_data - xdp.data;
	off = mac_len + delta;
	if (off > 0)
		__skb_push(skb, off);
	else if (off < 0)
		__skb_pull(skb, -off);
	skb->mac_header -= delta;
	off = xdp.data_end - orig_data_end;
	if (off != 0)
		__skb_put(skb, off);
	skb->protocol = eth_type_trans(skb, rq->dev);

	metalen = xdp.data - xdp.data_meta;
	if (metalen)
		skb_metadata_set(skb, metalen);
out:
	return skb;
drop:
	rcu_read_unlock();
	kfree_skb(skb);
	return NULL;
err_xdp:
	rcu_read_unlock();
	page_frag_free(xdp.data);
xdp_xmit:
	return NULL;
}

static int veth_xdp_rcv(struct veth_rq *rq, int budget, unsigned int *xdp_xmit)
{
	int i, done = 0;

	for (i = 0; i < budget; i++) {
		void *ptr = __ptr_ring_consume(&rq->xdp_ring);
		struct sk_buff *skb;

		if (!ptr)
			break;

		if (veth_is_xdp_frame(ptr)) {
			skb = veth_xdp_rcv_one(rq, veth_ptr_to_xdp(ptr),
					       xdp_xmit);
		} else {
			skb = veth_xdp_rcv_skb(rq, ptr, xdp_xmit);
		}

		if (skb)
			napi_gro_receive(&rq->xdp_napi, skb);

		done++;
	}

	return done;
}

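/* NAPI handler. Consume up to "budget" ring entries, then re-open the notify
 * window with smp_store_mb() (paired with the barrier in __veth_xdp_flush())
 * and re-check the ring to avoid missing a wakeup from a concurrent producer.
 * XDP_TX and XDP_REDIRECT work gathered during the run is flushed at the end.
 */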
static int veth_poll(struct napi_struct *napi, int budget)
{
	struct veth_rq *rq =
		container_of(napi, struct veth_rq, xdp_napi);
	unsigned int xdp_xmit = 0;
	int done;

	xdp_set_return_frame_no_direct();
	done = veth_xdp_rcv(rq, budget, &xdp_xmit);

	if (done < budget && napi_complete_done(napi, done)) {
		/* Write rx_notify_masked before reading ptr_ring */
		smp_store_mb(rq->rx_notify_masked, false);
		if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) {
			rq->rx_notify_masked = true;
			napi_schedule(&rq->xdp_napi);
		}
	}

	if (xdp_xmit & VETH_XDP_TX)
		veth_xdp_flush(rq->dev);
	if (xdp_xmit & VETH_XDP_REDIR)
		xdp_do_flush_map();
	xdp_clear_return_frame_no_direct();

	return done;
}

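/* Allocate the per-queue ptr_rings, then register and enable NAPI for each RX
 * queue; on allocation failure, unwind the rings created so far.
 */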
static int veth_napi_add(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
		if (err)
			goto err_xdp_ring;
	}

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
		napi_enable(&rq->xdp_napi);
	}

	return 0;
err_xdp_ring:
	for (i--; i >= 0; i--)
		ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free);

	return err;
}

static void veth_napi_del(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		napi_disable(&rq->xdp_napi);
		napi_hash_del(&rq->xdp_napi);
	}
	synchronize_net();

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		netif_napi_del(&rq->xdp_napi);
		rq->rx_notify_masked = false;
		ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
	}
}

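/* Called at open time and when a program is attached while the device is up.
 * The rxq info, rings and NAPI are registered only once, guarded by
 * xdp_rxq_info_is_reg() on queue 0; afterwards the program is published to
 * every queue under RCU.
 */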
static int veth_enable_xdp(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) {
		for (i = 0; i < dev->real_num_rx_queues; i++) {
			struct veth_rq *rq = &priv->rq[i];

			err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i);
			if (err < 0)
				goto err_rxq_reg;

			err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
							 MEM_TYPE_PAGE_SHARED,
							 NULL);
			if (err < 0)
				goto err_reg_mem;

			/* Save original mem info as it can be overwritten */
			rq->xdp_mem = rq->xdp_rxq.mem;
		}

		err = veth_napi_add(dev);
		if (err)
			goto err_rxq_reg;
	}

	for (i = 0; i < dev->real_num_rx_queues; i++)
		rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog);

	return 0;
err_reg_mem:
	xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
err_rxq_reg:
	for (i--; i >= 0; i--)
		xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);

	return err;
}

static void veth_disable_xdp(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	for (i = 0; i < dev->real_num_rx_queues; i++)
		rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);
	veth_napi_del(dev);
	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		rq->xdp_rxq.mem = rq->xdp_mem;
		xdp_rxq_info_unreg(&rq->xdp_rxq);
	}
}

static int veth_open(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);
	int err;

	if (!peer)
		return -ENOTCONN;

	if (priv->_xdp_prog) {
		err = veth_enable_xdp(dev);
		if (err)
			return err;
	}

	if (peer->flags & IFF_UP) {
		netif_carrier_on(dev);
		netif_carrier_on(peer);
	}

	return 0;
}

static int veth_close(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);

	netif_carrier_off(dev);
	if (peer)
		netif_carrier_off(peer);

	if (priv->_xdp_prog)
		veth_disable_xdp(dev);

	return 0;
}

static int is_valid_veth_mtu(int mtu)
{
	return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU;
}

static int veth_dev_init(struct net_device *dev)
{
	dev->vstats = netdev_alloc_pcpu_stats(struct pcpu_vstats);
	if (!dev->vstats)
		return -ENOMEM;
	return 0;
}

static void veth_dev_free(struct net_device *dev)
{
	free_percpu(dev->vstats);
}

#ifdef CONFIG_NET_POLL_CONTROLLER
static void veth_poll_controller(struct net_device *dev)
{
	/* A veth device only receives frames when its peer sends one.
	 * Since receiving has nothing to do with disabling irqs, we are
	 * guaranteed never to have pending data when we poll for it, so
	 * there is nothing to do here.
	 *
	 * We need this though so netpoll recognizes us as an interface that
	 * supports polling, which enables bridge devices in virt setups to
	 * still use netconsole.
	 */
}
#endif	/* CONFIG_NET_POLL_CONTROLLER */

static int veth_get_iflink(const struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;
	int iflink;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	iflink = peer ? peer->ifindex : 0;
	rcu_read_unlock();

	return iflink;
}

static netdev_features_t veth_fix_features(struct net_device *dev,
					   netdev_features_t features)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;

	peer = rtnl_dereference(priv->peer);
	if (peer) {
		struct veth_priv *peer_priv = netdev_priv(peer);

		if (peer_priv->_xdp_prog)
			features &= ~NETIF_F_GSO_SOFTWARE;
	}

	return features;
}

static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
{
	struct veth_priv *peer_priv, *priv = netdev_priv(dev);
	struct net_device *peer;

	if (new_hr < 0)
		new_hr = 0;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (unlikely(!peer))
		goto out;

	peer_priv = netdev_priv(peer);
	priv->requested_headroom = new_hr;
	new_hr = max(priv->requested_headroom, peer_priv->requested_headroom);
	dev->needed_headroom = new_hr;
	peer->needed_headroom = new_hr;

out:
	rcu_read_unlock();
}

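/* XDP_SETUP_PROG handler. Attaching requires a peer, a peer MTU small enough
 * that XDP headroom, the packet and skb_shared_info all fit in one page, and
 * at least as many local RX queues as the peer has TX queues. While a program
 * is attached, the peer's software GSO features are masked and its max_mtu is
 * capped; both are restored when the program is removed.
 */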
static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
			struct netlink_ext_ack *extack)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct bpf_prog *old_prog;
	struct net_device *peer;
	unsigned int max_mtu;
	int err;

	old_prog = priv->_xdp_prog;
	priv->_xdp_prog = prog;
	peer = rtnl_dereference(priv->peer);

	if (prog) {
		if (!peer) {
			NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached");
			err = -ENOTCONN;
			goto err;
		}

		max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM -
			  peer->hard_header_len -
			  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		if (peer->mtu > max_mtu) {
			NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP");
			err = -ERANGE;
			goto err;
		}

		if (dev->real_num_rx_queues < peer->real_num_tx_queues) {
			NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues");
			err = -ENOSPC;
			goto err;
		}

		if (dev->flags & IFF_UP) {
			err = veth_enable_xdp(dev);
			if (err) {
				NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed");
				goto err;
			}
		}

		if (!old_prog) {
			peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
			peer->max_mtu = max_mtu;
		}
	}

	if (old_prog) {
		if (!prog) {
			if (dev->flags & IFF_UP)
				veth_disable_xdp(dev);

			if (peer) {
				peer->hw_features |= NETIF_F_GSO_SOFTWARE;
				peer->max_mtu = ETH_MAX_MTU;
			}
		}
		bpf_prog_put(old_prog);
	}

	if ((!!old_prog ^ !!prog) && peer)
		netdev_update_features(peer);

	return 0;
err:
	priv->_xdp_prog = old_prog;

	return err;
}

static u32 veth_xdp_query(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	const struct bpf_prog *xdp_prog;

	xdp_prog = priv->_xdp_prog;
	if (xdp_prog)
		return xdp_prog->aux->id;

	return 0;
}

static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return veth_xdp_set(dev, xdp->prog, xdp->extack);
	case XDP_QUERY_PROG:
		xdp->prog_id = veth_xdp_query(dev);
		return 0;
	default:
		return -EINVAL;
	}
}

static const struct net_device_ops veth_netdev_ops = {
	.ndo_init            = veth_dev_init,
	.ndo_open            = veth_open,
	.ndo_stop            = veth_close,
	.ndo_start_xmit      = veth_xmit,
	.ndo_get_stats64     = veth_get_stats64,
	.ndo_set_rx_mode     = veth_set_multicast_list,
	.ndo_set_mac_address = eth_mac_addr,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller	= veth_poll_controller,
#endif
	.ndo_get_iflink		= veth_get_iflink,
	.ndo_fix_features	= veth_fix_features,
	.ndo_features_check	= passthru_features_check,
	.ndo_set_rx_headroom	= veth_set_rx_headroom,
	.ndo_bpf		= veth_xdp,
	.ndo_xdp_xmit		= veth_xdp_xmit,
};

#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
		       NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \
		       NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \
		       NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \
		       NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX )

static void veth_setup(struct net_device *dev)
{
	ether_setup(dev);

	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
	dev->priv_flags |= IFF_NO_QUEUE;
	dev->priv_flags |= IFF_PHONY_HEADROOM;

	dev->netdev_ops = &veth_netdev_ops;
	dev->ethtool_ops = &veth_ethtool_ops;
	dev->features |= NETIF_F_LLTX;
	dev->features |= VETH_FEATURES;
	dev->vlan_features = dev->features &
			     ~(NETIF_F_HW_VLAN_CTAG_TX |
			       NETIF_F_HW_VLAN_STAG_TX |
			       NETIF_F_HW_VLAN_CTAG_RX |
			       NETIF_F_HW_VLAN_STAG_RX);
	dev->needs_free_netdev = true;
	dev->priv_destructor = veth_dev_free;
	dev->max_mtu = ETH_MAX_MTU;

	dev->hw_features = VETH_FEATURES;
	dev->hw_enc_features = VETH_FEATURES;
	dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
}

/*
 * netlink interface
 */

static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}
	if (tb[IFLA_MTU]) {
		if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU])))
			return -EINVAL;
	}
	return 0;
}

static int veth_alloc_queues(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);

	priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL);
	if (!priv->rq)
		return -ENOMEM;

	return 0;
}

static void veth_free_queues(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);

	kfree(priv->rq);
}

static struct rtnl_link_ops veth_link_ops;

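/* rtnl newlink handler: create and register the peer device first (possibly
 * in a different netns and with attributes from the VETH_INFO_PEER block),
 * then register this device, and finally point the two priv->peer pointers
 * at each other.
 */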
static int veth_newlink(struct net *src_net, struct net_device *dev,
			struct nlattr *tb[], struct nlattr *data[],
			struct netlink_ext_ack *extack)
{
	int err, i;
	struct net_device *peer;
	struct veth_priv *priv;
	char ifname[IFNAMSIZ];
	struct nlattr *peer_tb[IFLA_MAX + 1], **tbp;
	unsigned char name_assign_type;
	struct ifinfomsg *ifmp;
	struct net *net;

	/*
	 * create and register peer first
	 */
	if (data != NULL && data[VETH_INFO_PEER] != NULL) {
		struct nlattr *nla_peer;

		nla_peer = data[VETH_INFO_PEER];
		ifmp = nla_data(nla_peer);
		err = rtnl_nla_parse_ifla(peer_tb,
					  nla_data(nla_peer) + sizeof(struct ifinfomsg),
					  nla_len(nla_peer) - sizeof(struct ifinfomsg),
					  NULL);
		if (err < 0)
			return err;

		err = veth_validate(peer_tb, NULL, extack);
		if (err < 0)
			return err;

		tbp = peer_tb;
	} else {
		ifmp = NULL;
		tbp = tb;
	}

	if (ifmp && tbp[IFLA_IFNAME]) {
		nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
		name_assign_type = NET_NAME_USER;
	} else {
		snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");
		name_assign_type = NET_NAME_ENUM;
	}

	net = rtnl_link_get_net(src_net, tbp);
	if (IS_ERR(net))
		return PTR_ERR(net);

	peer = rtnl_create_link(net, ifname, name_assign_type,
				&veth_link_ops, tbp);
	if (IS_ERR(peer)) {
		put_net(net);
		return PTR_ERR(peer);
	}

	err = veth_alloc_queues(peer);
	if (err) {
		put_net(net);
		goto err_peer_alloc_queues;
	}

	if (!ifmp || !tbp[IFLA_ADDRESS])
		eth_hw_addr_random(peer);

	if (ifmp && (dev->ifindex != 0))
		peer->ifindex = ifmp->ifi_index;

	peer->gso_max_size = dev->gso_max_size;
	peer->gso_max_segs = dev->gso_max_segs;

	err = register_netdevice(peer);
	put_net(net);
	net = NULL;
	if (err < 0)
		goto err_register_peer;

	netif_carrier_off(peer);

	err = rtnl_configure_link(peer, ifmp);
	if (err < 0)
		goto err_configure_peer;

	/*
	 * register dev last
	 *
	 * note that, since we've registered a new device, the dev's name
	 * should be re-allocated
	 */

	err = veth_alloc_queues(dev);
	if (err)
		goto err_alloc_queues;

	if (tb[IFLA_ADDRESS] == NULL)
		eth_hw_addr_random(dev);

	if (tb[IFLA_IFNAME])
		nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
	else
		snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d");

	err = register_netdevice(dev);
	if (err < 0)
		goto err_register_dev;

	netif_carrier_off(dev);

	/*
	 * tie the devices together
	 */

	priv = netdev_priv(dev);
	for (i = 0; i < dev->real_num_rx_queues; i++)
		priv->rq[i].dev = dev;
	rcu_assign_pointer(priv->peer, peer);

	priv = netdev_priv(peer);
	for (i = 0; i < peer->real_num_rx_queues; i++)
		priv->rq[i].dev = peer;
	rcu_assign_pointer(priv->peer, dev);

	return 0;

err_register_dev:
	veth_free_queues(dev);
err_alloc_queues:
	/* nothing to do */
err_configure_peer:
	unregister_netdevice(peer);
	return err;

err_register_peer:
	veth_free_queues(peer);
err_peer_alloc_queues:
	free_netdev(peer);
	return err;
}

static void veth_dellink(struct net_device *dev, struct list_head *head)
{
	struct veth_priv *priv;
	struct net_device *peer;

	priv = netdev_priv(dev);
	peer = rtnl_dereference(priv->peer);

	/* Note: dellink() is called from default_device_exit_batch(),
	 * before a synchronize_rcu() point. The devices are guaranteed
	 * not to be freed before one RCU grace period.
	 */
	RCU_INIT_POINTER(priv->peer, NULL);
	unregister_netdevice_queue(dev, head);

	if (peer) {
		priv = netdev_priv(peer);
		RCU_INIT_POINTER(priv->peer, NULL);
		unregister_netdevice_queue(peer, head);
	}
}

static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = {
	[VETH_INFO_PEER]	= { .len = sizeof(struct ifinfomsg) },
};

static struct net *veth_get_link_net(const struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);

	return peer ? dev_net(peer) : dev_net(dev);
}

static struct rtnl_link_ops veth_link_ops = {
	.kind		= DRV_NAME,
	.priv_size	= sizeof(struct veth_priv),
	.setup		= veth_setup,
	.validate	= veth_validate,
	.newlink	= veth_newlink,
	.dellink	= veth_dellink,
	.policy		= veth_policy,
	.maxtype	= VETH_INFO_MAX,
	.get_link_net	= veth_get_link_net,
};

/*
 * init/fini
 */

static __init int veth_init(void)
{
	return rtnl_link_register(&veth_link_ops);
}

static __exit void veth_exit(void)
{
	rtnl_link_unregister(&veth_link_ops);
}

module_init(veth_init);
module_exit(veth_exit);

MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_RTNL_LINK(DRV_NAME);