xref: /linux/net/ipv6/ip6_output.c (revision a13d7201d7deedcbb6ac6efa94a1a7d34d3d79ec)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	:	frag id should be in NBO (network byte order)
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 
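/* Final transmit step: resolve the L2 neighbour for the route's next
 * hop and hand the packet to it. Multicast packets may additionally be
 * looped back to local listeners, and node-local-scoped multicast is
 * dropped on non-loopback devices.
 */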
59 static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb)
60 {
61 	struct dst_entry *dst = skb_dst(skb);
62 	struct net_device *dev = dst->dev;
63 	struct neighbour *neigh;
64 	struct in6_addr *nexthop;
65 	int ret;
66 
67 	skb->protocol = htons(ETH_P_IPV6);
68 	skb->dev = dev;
69 
70 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72 
73 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74 		    ((mroute6_socket(dev_net(dev), skb) &&
75 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77 					 &ipv6_hdr(skb)->saddr))) {
78 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79 
80 			/* Do not check for IFF_ALLMULTI; multicast routing
81 			   is not supported in any case.
82 			 */
83 			if (newskb)
84 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85 					sk, newskb, NULL, newskb->dev,
86 					dev_loopback_xmit);
87 
88 			if (ipv6_hdr(skb)->hop_limit == 0) {
89 				IP6_INC_STATS(dev_net(dev), idev,
90 					      IPSTATS_MIB_OUTDISCARDS);
91 				kfree_skb(skb);
92 				return 0;
93 			}
94 		}
95 
96 		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
97 				skb->len);
98 
99 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
100 		    IPV6_ADDR_SCOPE_NODELOCAL &&
101 		    !(dev->flags & IFF_LOOPBACK)) {
102 			kfree_skb(skb);
103 			return 0;
104 		}
105 	}
106 
107 	rcu_read_lock_bh();
108 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
109 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
110 	if (unlikely(!neigh))
111 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
112 	if (!IS_ERR(neigh)) {
113 		ret = dst_neigh_output(dst, neigh, skb);
114 		rcu_read_unlock_bh();
115 		return ret;
116 	}
117 	rcu_read_unlock_bh();
118 
119 	IP6_INC_STATS(dev_net(dst->dev),
120 		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
121 	kfree_skb(skb);
122 	return -EINVAL;
123 }
124 
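/* Fragment on output when the packet exceeds the path MTU and is not
 * GSO, when the route demands fragmentation of all packets
 * (dst_allfrag), or when the packet exceeds the frag_max_size recorded
 * by netfilter defrag on input; otherwise transmit as-is.
 */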
125 static int ip6_finish_output(struct sock *sk, struct sk_buff *skb)
126 {
127 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
128 	    dst_allfrag(skb_dst(skb)) ||
129 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
130 		return ip6_fragment(sk, skb, ip6_finish_output2);
131 	else
132 		return ip6_finish_output2(sk, skb);
133 }
134 
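/* Entry point for locally generated packets after routing: drop if
 * IPv6 is administratively disabled on the output device, otherwise
 * traverse the NF_INET_POST_ROUTING hook (skipped for packets marked
 * IP6SKB_REROUTED) and finish output.
 */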
135 int ip6_output(struct sock *sk, struct sk_buff *skb)
136 {
137 	struct net_device *dev = skb_dst(skb)->dev;
138 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
139 	if (unlikely(idev->cnf.disable_ipv6)) {
140 		IP6_INC_STATS(dev_net(dev), idev,
141 			      IPSTATS_MIB_OUTDISCARDS);
142 		kfree_skb(skb);
143 		return 0;
144 	}
145 
146 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, sk, skb,
147 			    NULL, dev,
148 			    ip6_finish_output,
149 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
150 }
151 
152 /*
153  *	xmit an sk_buff (used by TCP, SCTP and DCCP)
154  */
155 
156 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
157 	     struct ipv6_txoptions *opt, int tclass)
158 {
159 	struct net *net = sock_net(sk);
160 	struct ipv6_pinfo *np = inet6_sk(sk);
161 	struct in6_addr *first_hop = &fl6->daddr;
162 	struct dst_entry *dst = skb_dst(skb);
163 	struct ipv6hdr *hdr;
164 	u8  proto = fl6->flowi6_proto;
165 	int seg_len = skb->len;
166 	int hlimit = -1;
167 	u32 mtu;
168 
169 	if (opt) {
170 		unsigned int head_room;
171 
172 		/* First: exthdrs may take lots of space (~8K for now);
173 		   MAX_HEADER is not enough.
174 		 */
175 		head_room = opt->opt_nflen + opt->opt_flen;
176 		seg_len += head_room;
177 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
178 
179 		if (skb_headroom(skb) < head_room) {
180 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
181 			if (!skb2) {
182 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
183 					      IPSTATS_MIB_OUTDISCARDS);
184 				kfree_skb(skb);
185 				return -ENOBUFS;
186 			}
187 			consume_skb(skb);
188 			skb = skb2;
189 			skb_set_owner_w(skb, sk);
190 		}
191 		if (opt->opt_flen)
192 			ipv6_push_frag_opts(skb, opt, &proto);
193 		if (opt->opt_nflen)
194 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
195 	}
196 
197 	skb_push(skb, sizeof(struct ipv6hdr));
198 	skb_reset_network_header(skb);
199 	hdr = ipv6_hdr(skb);
200 
201 	/*
202 	 *	Fill in the IPv6 header
203 	 */
204 	if (np)
205 		hlimit = np->hop_limit;
206 	if (hlimit < 0)
207 		hlimit = ip6_dst_hoplimit(dst);
208 
209 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
210 						     np->autoflowlabel));
211 
212 	hdr->payload_len = htons(seg_len);
213 	hdr->nexthdr = proto;
214 	hdr->hop_limit = hlimit;
215 
216 	hdr->saddr = fl6->saddr;
217 	hdr->daddr = *first_hop;
218 
219 	skb->protocol = htons(ETH_P_IPV6);
220 	skb->priority = sk->sk_priority;
221 	skb->mark = sk->sk_mark;
222 
223 	mtu = dst_mtu(dst);
224 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
225 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
226 			      IPSTATS_MIB_OUT, skb->len);
227 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb,
228 			       NULL, dst->dev, dst_output_sk);
229 	}
230 
231 	skb->dev = dst->dev;
232 	ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
233 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
234 	kfree_skb(skb);
235 	return -EMSGSIZE;
236 }
237 EXPORT_SYMBOL(ip6_xmit);
238 
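/* Deliver a Router Alert packet to every raw socket registered for the
 * matching RA selector (and, if bound, the right device). Each matching
 * listener but the last gets a clone; returns 1 if the skb was
 * consumed.
 */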
239 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
240 {
241 	struct ip6_ra_chain *ra;
242 	struct sock *last = NULL;
243 
244 	read_lock(&ip6_ra_lock);
245 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
246 		struct sock *sk = ra->sk;
247 		if (sk && ra->sel == sel &&
248 		    (!sk->sk_bound_dev_if ||
249 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
250 			if (last) {
251 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
252 				if (skb2)
253 					rawv6_rcv(last, skb2);
254 			}
255 			last = sk;
256 		}
257 	}
258 
259 	if (last) {
260 		rawv6_rcv(last, skb);
261 		read_unlock(&ip6_ra_lock);
262 		return 1;
263 	}
264 	read_unlock(&ip6_ra_lock);
265 	return 0;
266 }
267 
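/* Decide what to do with a packet arriving for a proxied neighbour
 * entry: 1 means hand unicast NDISC messages to local input, -1 means
 * reject (link-local destination), 0 means forward normally.
 */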
268 static int ip6_forward_proxy_check(struct sk_buff *skb)
269 {
270 	struct ipv6hdr *hdr = ipv6_hdr(skb);
271 	u8 nexthdr = hdr->nexthdr;
272 	__be16 frag_off;
273 	int offset;
274 
275 	if (ipv6_ext_hdr(nexthdr)) {
276 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
277 		if (offset < 0)
278 			return 0;
279 	} else
280 		offset = sizeof(struct ipv6hdr);
281 
282 	if (nexthdr == IPPROTO_ICMPV6) {
283 		struct icmp6hdr *icmp6;
284 
285 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
286 					 offset + 1 - skb->data)))
287 			return 0;
288 
289 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
290 
291 		switch (icmp6->icmp6_type) {
292 		case NDISC_ROUTER_SOLICITATION:
293 		case NDISC_ROUTER_ADVERTISEMENT:
294 		case NDISC_NEIGHBOUR_SOLICITATION:
295 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
296 		case NDISC_REDIRECT:
297 			/* Unicast neighbour discovery messages destined
298 			 * to the proxied address are passed to the input
299 			 * function.
300 			 */
301 			return 1;
302 		default:
303 			break;
304 		}
305 	}
306 
307 	/*
308 	 * The proxying router can't forward traffic sent to a link-local
309 	 * address, so signal the sender and discard the packet. This
310 	 * behavior is clarified by the MIPv6 specification.
311 	 */
312 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
313 		dst_link_failure(skb);
314 		return -1;
315 	}
316 
317 	return 0;
318 }
319 
320 static inline int ip6_forward_finish(struct sock *sk, struct sk_buff *skb)
321 {
322 	skb_sender_cpu_clear(skb);
323 	return dst_output_sk(sk, skb);
324 }
325 
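/* MTU to honour when forwarding: a locked RTAX_MTU metric wins,
 * otherwise fall back to the inet6 device's configured mtu6 (or
 * IPV6_MIN_MTU when no inet6 device exists).
 */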
326 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
327 {
328 	unsigned int mtu;
329 	struct inet6_dev *idev;
330 
331 	if (dst_metric_locked(dst, RTAX_MTU)) {
332 		mtu = dst_metric_raw(dst, RTAX_MTU);
333 		if (mtu)
334 			return mtu;
335 	}
336 
337 	mtu = IPV6_MIN_MTU;
338 	rcu_read_lock();
339 	idev = __in6_dev_get(dst->dev);
340 	if (idev)
341 		mtu = idev->cnf.mtu6;
342 	rcu_read_unlock();
343 
344 	return mtu;
345 }
346 
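/* Should this packet trigger ICMPV6_PKT_TOOBIG instead of being
 * forwarded? skbs with ignore_df set may pass (conntrack defrag sets
 * it together with frag_max_size, which is checked first), and GSO
 * skbs pass as long as the resegmented packets will fit the MTU.
 */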
347 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
348 {
349 	if (skb->len <= mtu)
350 		return false;
351 
352 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
353 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
354 		return true;
355 
356 	if (skb->ignore_df)
357 		return false;
358 
359 	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
360 		return false;
361 
362 	return true;
363 }
364 
365 int ip6_forward(struct sk_buff *skb)
366 {
367 	struct dst_entry *dst = skb_dst(skb);
368 	struct ipv6hdr *hdr = ipv6_hdr(skb);
369 	struct inet6_skb_parm *opt = IP6CB(skb);
370 	struct net *net = dev_net(dst->dev);
371 	u32 mtu;
372 
373 	if (net->ipv6.devconf_all->forwarding == 0)
374 		goto error;
375 
376 	if (skb->pkt_type != PACKET_HOST)
377 		goto drop;
378 
379 	if (skb_warn_if_lro(skb))
380 		goto drop;
381 
382 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
383 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
384 				 IPSTATS_MIB_INDISCARDS);
385 		goto drop;
386 	}
387 
388 	skb_forward_csum(skb);
389 
390 	/*
391 	 *	We do NOT do any processing on RA packets;
392 	 *	we push them to user level AS IS, with no
393 	 *	guarantee that the application will be able
394 	 *	to interpret them. The reason is that we
395 	 *	cannot do anything clever here.
396 	 *
397 	 *	We are not an end node, so if the packet contains
398 	 *	AH/ESP we cannot do anything.
399 	 *	Defragmentation would also be a mistake: RA packets
400 	 *	cannot be fragmented, because there is no guarantee
401 	 *	that different fragments will go along one path. --ANK
402 	 */
403 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
404 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
405 			return 0;
406 	}
407 
408 	/*
409 	 *	check and decrement ttl
410 	 */
411 	if (hdr->hop_limit <= 1) {
412 		/* Force the output device to be used for source address selection */
413 		skb->dev = dst->dev;
414 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
415 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
416 				 IPSTATS_MIB_INHDRERRORS);
417 
418 		kfree_skb(skb);
419 		return -ETIMEDOUT;
420 	}
421 
422 	/* XXX: idev->cnf.proxy_ndp? */
423 	if (net->ipv6.devconf_all->proxy_ndp &&
424 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
425 		int proxied = ip6_forward_proxy_check(skb);
426 		if (proxied > 0)
427 			return ip6_input(skb);
428 		else if (proxied < 0) {
429 			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
430 					 IPSTATS_MIB_INDISCARDS);
431 			goto drop;
432 		}
433 	}
434 
435 	if (!xfrm6_route_forward(skb)) {
436 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
437 				 IPSTATS_MIB_INDISCARDS);
438 		goto drop;
439 	}
440 	dst = skb_dst(skb);
441 
442 	/* IPv6 specs say nothing about it, but it is clear that we cannot
443 	   send redirects to source-routed frames.
444 	   We don't send redirects to frames decapsulated from IPsec.
445 	 */
446 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
447 		struct in6_addr *target = NULL;
448 		struct inet_peer *peer;
449 		struct rt6_info *rt;
450 
451 		/*
452 		 *	incoming and outgoing devices are the same;
453 		 *	send a redirect.
454 		 */
455 
456 		rt = (struct rt6_info *) dst;
457 		if (rt->rt6i_flags & RTF_GATEWAY)
458 			target = &rt->rt6i_gateway;
459 		else
460 			target = &hdr->daddr;
461 
462 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
463 
464 		/* Limit redirects both by destination (here)
465 		   and by source (inside ndisc_send_redirect)
466 		 */
467 		if (inet_peer_xrlim_allow(peer, 1*HZ))
468 			ndisc_send_redirect(skb, target);
469 		if (peer)
470 			inet_putpeer(peer);
471 	} else {
472 		int addrtype = ipv6_addr_type(&hdr->saddr);
473 
474 		/* This check is security critical. */
475 		if (addrtype == IPV6_ADDR_ANY ||
476 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
477 			goto error;
478 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
479 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
480 				    ICMPV6_NOT_NEIGHBOUR, 0);
481 			goto error;
482 		}
483 	}
484 
485 	mtu = ip6_dst_mtu_forward(dst);
486 	if (mtu < IPV6_MIN_MTU)
487 		mtu = IPV6_MIN_MTU;
488 
489 	if (ip6_pkt_too_big(skb, mtu)) {
490 		/* Again, force the output device to be used for source address selection */
491 		skb->dev = dst->dev;
492 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
493 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
494 				 IPSTATS_MIB_INTOOBIGERRORS);
495 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
496 				 IPSTATS_MIB_FRAGFAILS);
497 		kfree_skb(skb);
498 		return -EMSGSIZE;
499 	}
500 
501 	if (skb_cow(skb, dst->dev->hard_header_len)) {
502 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
503 				 IPSTATS_MIB_OUTDISCARDS);
504 		goto drop;
505 	}
506 
507 	hdr = ipv6_hdr(skb);
508 
509 	/* Mangling hops number delayed to point after skb COW */
510 
511 	hdr->hop_limit--;
512 
513 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
514 	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
515 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb,
516 		       skb->dev, dst->dev,
517 		       ip6_forward_finish);
518 
519 error:
520 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
521 drop:
522 	kfree_skb(skb);
523 	return -EINVAL;
524 }
525 
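/* Propagate per-packet metadata (type, priority, dst, device, mark,
 * tc index, netfilter and security state) from the original skb to a
 * freshly allocated fragment.
 */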
526 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
527 {
528 	to->pkt_type = from->pkt_type;
529 	to->priority = from->priority;
530 	to->protocol = from->protocol;
531 	skb_dst_drop(to);
532 	skb_dst_set(to, dst_clone(skb_dst(from)));
533 	to->dev = from->dev;
534 	to->mark = from->mark;
535 
536 #ifdef CONFIG_NET_SCHED
537 	to->tc_index = from->tc_index;
538 #endif
539 	nf_copy(to, from);
540 	skb_copy_secmark(to, from);
541 }
542 
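/* Fragment an IPv6 packet, emitting each piece through @output. The
 * fast path reuses an existing frag_list when every element already
 * has fragment-sized, 8-byte-aligned geometry and enough headroom;
 * the slow path allocates fresh skbs and copies the payload in
 * MTU-sized, 8-byte-aligned chunks.
 */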
543 int ip6_fragment(struct sock *sk, struct sk_buff *skb,
544 		 int (*output)(struct sock *, struct sk_buff *))
545 {
546 	struct sk_buff *frag;
547 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
548 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
549 				inet6_sk(skb->sk) : NULL;
550 	struct ipv6hdr *tmp_hdr;
551 	struct frag_hdr *fh;
552 	unsigned int mtu, hlen, left, len;
553 	int hroom, troom;
554 	__be32 frag_id;
555 	int ptr, offset = 0, err = 0;
556 	u8 *prevhdr, nexthdr = 0;
557 	struct net *net = dev_net(skb_dst(skb)->dev);
558 
559 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
560 	nexthdr = *prevhdr;
561 
562 	mtu = ip6_skb_dst_mtu(skb);
563 
564 	/* We must not fragment if the socket is set to force MTU discovery
565 	 * or if the skb was not generated by a local socket.
566 	 */
567 	if (unlikely(!skb->ignore_df && skb->len > mtu))
568 		goto fail_toobig;
569 
570 	if (IP6CB(skb)->frag_max_size) {
571 		if (IP6CB(skb)->frag_max_size > mtu)
572 			goto fail_toobig;
573 
574 		/* don't send fragments larger than what we received */
575 		mtu = IP6CB(skb)->frag_max_size;
576 		if (mtu < IPV6_MIN_MTU)
577 			mtu = IPV6_MIN_MTU;
578 	}
579 
580 	if (np && np->frag_size < mtu) {
581 		if (np->frag_size)
582 			mtu = np->frag_size;
583 	}
584 	mtu -= hlen + sizeof(struct frag_hdr);
585 
586 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
587 				    &ipv6_hdr(skb)->saddr);
588 
589 	if (skb_has_frag_list(skb)) {
590 		int first_len = skb_pagelen(skb);
591 		struct sk_buff *frag2;
592 
593 		if (first_len - hlen > mtu ||
594 		    ((first_len - hlen) & 7) ||
595 		    skb_cloned(skb))
596 			goto slow_path;
597 
598 		skb_walk_frags(skb, frag) {
599 			/* Correct geometry. */
600 			if (frag->len > mtu ||
601 			    ((frag->len & 7) && frag->next) ||
602 			    skb_headroom(frag) < hlen)
603 				goto slow_path_clean;
604 
605 			/* Partially cloned skb? */
606 			if (skb_shared(frag))
607 				goto slow_path_clean;
608 
609 			BUG_ON(frag->sk);
610 			if (skb->sk) {
611 				frag->sk = skb->sk;
612 				frag->destructor = sock_wfree;
613 			}
614 			skb->truesize -= frag->truesize;
615 		}
616 
617 		err = 0;
618 		offset = 0;
619 		frag = skb_shinfo(skb)->frag_list;
620 		skb_frag_list_init(skb);
621 		/* BUILD HEADER */
622 
623 		*prevhdr = NEXTHDR_FRAGMENT;
624 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
625 		if (!tmp_hdr) {
626 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
627 				      IPSTATS_MIB_FRAGFAILS);
628 			return -ENOMEM;
629 		}
630 
631 		__skb_pull(skb, hlen);
632 		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
633 		__skb_push(skb, hlen);
634 		skb_reset_network_header(skb);
635 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
636 
637 		fh->nexthdr = nexthdr;
638 		fh->reserved = 0;
639 		fh->frag_off = htons(IP6_MF);
640 		fh->identification = frag_id;
641 
642 		first_len = skb_pagelen(skb);
643 		skb->data_len = first_len - skb_headlen(skb);
644 		skb->len = first_len;
645 		ipv6_hdr(skb)->payload_len = htons(first_len -
646 						   sizeof(struct ipv6hdr));
647 
648 		dst_hold(&rt->dst);
649 
650 		for (;;) {
651 			/* Prepare header of the next frame,
652 			 * before the previous one goes down. */
653 			if (frag) {
654 				frag->ip_summed = CHECKSUM_NONE;
655 				skb_reset_transport_header(frag);
656 				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
657 				__skb_push(frag, hlen);
658 				skb_reset_network_header(frag);
659 				memcpy(skb_network_header(frag), tmp_hdr,
660 				       hlen);
661 				offset += skb->len - hlen - sizeof(struct frag_hdr);
662 				fh->nexthdr = nexthdr;
663 				fh->reserved = 0;
664 				fh->frag_off = htons(offset);
665 				if (frag->next)
666 					fh->frag_off |= htons(IP6_MF);
667 				fh->identification = frag_id;
668 				ipv6_hdr(frag)->payload_len =
669 						htons(frag->len -
670 						      sizeof(struct ipv6hdr));
671 				ip6_copy_metadata(frag, skb);
672 			}
673 
674 			err = output(sk, skb);
675 			if (!err)
676 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
677 					      IPSTATS_MIB_FRAGCREATES);
678 
679 			if (err || !frag)
680 				break;
681 
682 			skb = frag;
683 			frag = skb->next;
684 			skb->next = NULL;
685 		}
686 
687 		kfree(tmp_hdr);
688 
689 		if (err == 0) {
690 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
691 				      IPSTATS_MIB_FRAGOKS);
692 			ip6_rt_put(rt);
693 			return 0;
694 		}
695 
696 		kfree_skb_list(frag);
697 
698 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
699 			      IPSTATS_MIB_FRAGFAILS);
700 		ip6_rt_put(rt);
701 		return err;
702 
703 slow_path_clean:
704 		skb_walk_frags(skb, frag2) {
705 			if (frag2 == frag)
706 				break;
707 			frag2->sk = NULL;
708 			frag2->destructor = NULL;
709 			skb->truesize += frag2->truesize;
710 		}
711 	}
712 
713 slow_path:
714 	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
715 	    skb_checksum_help(skb))
716 		goto fail;
717 
718 	left = skb->len - hlen;		/* Space per frame */
719 	ptr = hlen;			/* Where to start from */
720 
721 	/*
722 	 *	Fragment the datagram.
723 	 */
724 
725 	*prevhdr = NEXTHDR_FRAGMENT;
726 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
727 	troom = rt->dst.dev->needed_tailroom;
728 
729 	/*
730 	 *	Keep copying data until we run out.
731 	 */
732 	while (left > 0)	{
733 		len = left;
734 		/* IF: it doesn't fit, use 'mtu' - the data space left */
735 		if (len > mtu)
736 			len = mtu;
737 		/* IF: we are not sending up to and including the packet end
738 		   then align the next start on an eight-byte boundary */
739 		if (len < left)	{
740 			len &= ~7;
741 		}
742 
743 		/* Allocate buffer */
744 		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
745 				 hroom + troom, GFP_ATOMIC);
746 		if (!frag) {
747 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
748 				      IPSTATS_MIB_FRAGFAILS);
749 			err = -ENOMEM;
750 			goto fail;
751 		}
752 
753 		/*
754 		 *	Set up data on packet
755 		 */
756 
757 		ip6_copy_metadata(frag, skb);
758 		skb_reserve(frag, hroom);
759 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
760 		skb_reset_network_header(frag);
761 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
762 		frag->transport_header = (frag->network_header + hlen +
763 					  sizeof(struct frag_hdr));
764 
765 		/*
766 		 *	Charge the memory for the fragment to any owner
767 		 *	it might possess
768 		 */
769 		if (skb->sk)
770 			skb_set_owner_w(frag, skb->sk);
771 
772 		/*
773 		 *	Copy the packet header into the new buffer.
774 		 */
775 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
776 
777 		/*
778 		 *	Build fragment header.
779 		 */
780 		fh->nexthdr = nexthdr;
781 		fh->reserved = 0;
782 		fh->identification = frag_id;
783 
784 		/*
785 		 *	Copy a block of the IP datagram.
786 		 */
787 		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
788 				     len));
789 		left -= len;
790 
791 		fh->frag_off = htons(offset);
792 		if (left > 0)
793 			fh->frag_off |= htons(IP6_MF);
794 		ipv6_hdr(frag)->payload_len = htons(frag->len -
795 						    sizeof(struct ipv6hdr));
796 
797 		ptr += len;
798 		offset += len;
799 
800 		/*
801 		 *	Put this fragment into the sending queue.
802 		 */
803 		err = output(sk, frag);
804 		if (err)
805 			goto fail;
806 
807 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
808 			      IPSTATS_MIB_FRAGCREATES);
809 	}
810 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
811 		      IPSTATS_MIB_FRAGOKS);
812 	consume_skb(skb);
813 	return err;
814 
815 fail_toobig:
816 	if (skb->sk && dst_allfrag(skb_dst(skb)))
817 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
818 
819 	skb->dev = skb_dst(skb)->dev;
820 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
821 	err = -EMSGSIZE;
822 
823 fail:
824 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
825 		      IPSTATS_MIB_FRAGFAILS);
826 	kfree_skb(skb);
827 	return err;
828 }
829 
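/* Returns nonzero when the cached route does NOT match @fl_addr, i.e.
 * the key is not a host route for that address and the address is not
 * the one remembered in @addr_cache.
 */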
830 static inline int ip6_rt_check(const struct rt6key *rt_key,
831 			       const struct in6_addr *fl_addr,
832 			       const struct in6_addr *addr_cache)
833 {
834 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
835 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
836 }
837 
838 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
839 					  struct dst_entry *dst,
840 					  const struct flowi6 *fl6)
841 {
842 	struct ipv6_pinfo *np = inet6_sk(sk);
843 	struct rt6_info *rt;
844 
845 	if (!dst)
846 		goto out;
847 
848 	if (dst->ops->family != AF_INET6) {
849 		dst_release(dst);
850 		return NULL;
851 	}
852 
853 	rt = (struct rt6_info *)dst;
854 	/* Yes, checking route validity in the not-connected
855 	 * case is not very simple. Take into account
856 	 * that we do not support routing by source, TOS,
857 	 * and MSG_DONTROUTE		--ANK (980726)
858 	 *
859 	 * 1. ip6_rt_check(): If the route was a host route,
860 	 *    check that the cached destination is current.
861 	 *    If it is a network route, we still may
862 	 *    check its validity using a saved pointer
863 	 *    to the last used address: daddr_cache.
864 	 *    We do not want to save the whole address now
865 	 *    (because the main consumer of this service
866 	 *    is TCP, which does not have this problem),
867 	 *    so this last trick works only on connected
868 	 *    sockets.
869 	 * 2. oif also should be the same.
870 	 */
871 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
872 #ifdef CONFIG_IPV6_SUBTREES
873 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
874 #endif
875 	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
876 		dst_release(dst);
877 		dst = NULL;
878 	}
879 
880 out:
881 	return dst;
882 }
883 
884 static int ip6_dst_lookup_tail(struct sock *sk,
885 			       struct dst_entry **dst, struct flowi6 *fl6)
886 {
887 	struct net *net = sock_net(sk);
888 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
889 	struct neighbour *n;
890 	struct rt6_info *rt;
891 #endif
892 	int err;
893 
894 	/* The correct way to handle this would be to do
895 	 * ip6_route_get_saddr, and then ip6_route_output; however,
896 	 * the route-specific preferred source forces the
897 	 * ip6_route_output call _before_ ip6_route_get_saddr.
898 	 *
899 	 * In source-specific routing (no src=any default route),
900 	 * ip6_route_output will fail given a src=any saddr, though,
901 	 * which is why we try it again later.
902 	 */
903 	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
904 		struct rt6_info *rt;
905 		bool had_dst = *dst != NULL;
906 
907 		if (!had_dst)
908 			*dst = ip6_route_output(net, sk, fl6);
909 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
910 		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
911 					  sk ? inet6_sk(sk)->srcprefs : 0,
912 					  &fl6->saddr);
913 		if (err)
914 			goto out_err_release;
915 
916 		/* If we had an erroneous initial result, pretend it
917 		 * never existed and let the SA-enabled version take
918 		 * over.
919 		 */
920 		if (!had_dst && (*dst)->error) {
921 			dst_release(*dst);
922 			*dst = NULL;
923 		}
924 	}
925 
926 	if (!*dst)
927 		*dst = ip6_route_output(net, sk, fl6);
928 
929 	err = (*dst)->error;
930 	if (err)
931 		goto out_err_release;
932 
933 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
934 	/*
935 	 * If the dst entry we've looked up has a
936 	 * neighbour entry that is in the INCOMPLETE state
937 	 * and the src address from the flow is marked
938 	 * as OPTIMISTIC, we release the found dst entry
939 	 * and replace it with the dst entry of the
940 	 * nexthop router.
941 	 */
942 	rt = (struct rt6_info *) *dst;
943 	rcu_read_lock_bh();
944 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
945 				      rt6_nexthop(rt, &fl6->daddr));
946 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
947 	rcu_read_unlock_bh();
948 
949 	if (err) {
950 		struct inet6_ifaddr *ifp;
951 		struct flowi6 fl_gw6;
952 		int redirect;
953 
954 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
955 				      (*dst)->dev, 1);
956 
957 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
958 		if (ifp)
959 			in6_ifa_put(ifp);
960 
961 		if (redirect) {
962 			/*
963 			 * We need to get the dst entry for the
964 			 * default router instead
965 			 */
966 			dst_release(*dst);
967 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
968 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
969 			*dst = ip6_route_output(net, sk, &fl_gw6);
970 			err = (*dst)->error;
971 			if (err)
972 				goto out_err_release;
973 		}
974 	}
975 #endif
976 
977 	return 0;
978 
979 out_err_release:
980 	if (err == -ENETUNREACH)
981 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
982 	dst_release(*dst);
983 	*dst = NULL;
984 	return err;
985 }
986 
987 /**
988  *	ip6_dst_lookup - perform route lookup on flow
989  *	@sk: socket which provides route info
990  *	@dst: pointer to dst_entry * for result
991  *	@fl6: flow to lookup
992  *
993  *	This function performs a route lookup on the given flow.
994  *
995  *	It returns zero on success, or a standard errno code on error.
996  */
997 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
998 {
999 	*dst = NULL;
1000 	return ip6_dst_lookup_tail(sk, dst, fl6);
1001 }
1002 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1003 
1004 /**
1005  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1006  *	@sk: socket which provides route info
1007  *	@fl6: flow to lookup
1008  *	@final_dst: final destination address for ipsec lookup
1009  *
1010  *	This function performs a route lookup on the given flow.
1011  *
1012  *	It returns a valid dst pointer on success, or a pointer-encoded
1013  *	error code.
1014  */
1015 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1016 				      const struct in6_addr *final_dst)
1017 {
1018 	struct dst_entry *dst = NULL;
1019 	int err;
1020 
1021 	err = ip6_dst_lookup_tail(sk, &dst, fl6);
1022 	if (err)
1023 		return ERR_PTR(err);
1024 	if (final_dst)
1025 		fl6->daddr = *final_dst;
1026 
1027 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1028 }
1029 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1030 
1031 /**
1032  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1033  *	@sk: socket which provides the dst cache and route info
1034  *	@fl6: flow to lookup
1035  *	@final_dst: final destination address for ipsec lookup
1036  *
1037  *	This function performs a route lookup on the given flow with the
1038  *	possibility of using the cached route in the socket if it is valid.
1039  *	It will take the socket dst lock when operating on the dst cache.
1040  *	As a result, this function can only be used in process context.
1041  *
1042  *	It returns a valid dst pointer on success, or a pointer-encoded
1043  *	error code.
1044  */
1045 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1046 					 const struct in6_addr *final_dst)
1047 {
1048 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1049 	int err;
1050 
1051 	dst = ip6_sk_dst_check(sk, dst, fl6);
1052 
1053 	err = ip6_dst_lookup_tail(sk, &dst, fl6);
1054 	if (err)
1055 		return ERR_PTR(err);
1056 	if (final_dst)
1057 		fl6->daddr = *final_dst;
1058 
1059 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1060 }
1061 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1062 
1063 static inline int ip6_ufo_append_data(struct sock *sk,
1064 			struct sk_buff_head *queue,
1065 			int getfrag(void *from, char *to, int offset, int len,
1066 			int odd, struct sk_buff *skb),
1067 			void *from, int length, int hh_len, int fragheaderlen,
1068 			int transhdrlen, int mtu, unsigned int flags,
1069 			const struct flowi6 *fl6)
1070 
1071 {
1072 	struct sk_buff *skb;
1073 	int err;
1074 
1075 	/* The network device supports UDP large send offload,
1076 	 * so create one single skb containing the complete
1077 	 * UDP datagram.
1078 	 */
1079 	skb = skb_peek_tail(queue);
1080 	if (!skb) {
1081 		skb = sock_alloc_send_skb(sk,
1082 			hh_len + fragheaderlen + transhdrlen + 20,
1083 			(flags & MSG_DONTWAIT), &err);
1084 		if (!skb)
1085 			return err;
1086 
1087 		/* reserve space for Hardware header */
1088 		skb_reserve(skb, hh_len);
1089 
1090 		/* create space for UDP/IP header */
1091 		skb_put(skb, fragheaderlen + transhdrlen);
1092 
1093 		/* initialize network header pointer */
1094 		skb_reset_network_header(skb);
1095 
1096 		/* initialize protocol header pointer */
1097 		skb->transport_header = skb->network_header + fragheaderlen;
1098 
1099 		skb->protocol = htons(ETH_P_IPV6);
1100 		skb->csum = 0;
1101 
1102 		__skb_queue_tail(queue, skb);
1103 	} else if (skb_is_gso(skb)) {
1104 		goto append;
1105 	}
1106 
1107 	skb->ip_summed = CHECKSUM_PARTIAL;
1108 	/* Specify the length of each IPv6 datagram fragment.
1109 	 * It has to be a multiple of 8.
1110 	 */
1111 	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1112 				     sizeof(struct frag_hdr)) & ~7;
1113 	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1114 	skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
1115 							 &fl6->daddr,
1116 							 &fl6->saddr);
1117 
1118 append:
1119 	return skb_append_datato_frags(sk, skb, getfrag, from,
1120 				       (length - transhdrlen));
1121 }
1122 
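/* Duplicate an extension header. hdrlen counts 8-octet units beyond
 * the first 8 octets (RFC 2460), hence (hdrlen + 1) * 8 bytes total.
 */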
1123 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1124 					       gfp_t gfp)
1125 {
1126 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1127 }
1128 
1129 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1130 						gfp_t gfp)
1131 {
1132 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1133 }
1134 
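/* Recompute the usable mtu/maxfraglen while appending: the first
 * fragment must reserve the route's header_len, while later fragments
 * may use that space for data. XFRM tunnel routes keep the cork mtu
 * as-is.
 */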
1135 static void ip6_append_data_mtu(unsigned int *mtu,
1136 				int *maxfraglen,
1137 				unsigned int fragheaderlen,
1138 				struct sk_buff *skb,
1139 				struct rt6_info *rt,
1140 				unsigned int orig_mtu)
1141 {
1142 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1143 		if (!skb) {
1144 			/* first fragment, reserve header_len */
1145 			*mtu = orig_mtu - rt->dst.header_len;
1146 
1147 		} else {
1148 			/*
1149 			 * this fragment is not the first; the header
1150 			 * space is regarded as data space.
1151 			 */
1152 			*mtu = orig_mtu;
1153 		}
1154 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1155 			      + fragheaderlen - sizeof(struct frag_hdr);
1156 	}
1157 }
1158 
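/* Begin corking: take a private copy of the tx options, pin the route,
 * and record the hop limit, traffic class and fragment size (the path
 * MTU, capped by the socket's IPV6_MTU/frag_size setting) for the
 * pending queue.
 */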
1159 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1160 			  struct inet6_cork *v6_cork,
1161 			  int hlimit, int tclass, struct ipv6_txoptions *opt,
1162 			  struct rt6_info *rt, struct flowi6 *fl6)
1163 {
1164 	struct ipv6_pinfo *np = inet6_sk(sk);
1165 	unsigned int mtu;
1166 
1167 	/*
1168 	 * setup for corking
1169 	 */
1170 	if (opt) {
1171 		if (WARN_ON(v6_cork->opt))
1172 			return -EINVAL;
1173 
1174 		v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1175 		if (unlikely(!v6_cork->opt))
1176 			return -ENOBUFS;
1177 
1178 		v6_cork->opt->tot_len = opt->tot_len;
1179 		v6_cork->opt->opt_flen = opt->opt_flen;
1180 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1181 
1182 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1183 						    sk->sk_allocation);
1184 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1185 			return -ENOBUFS;
1186 
1187 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1188 						    sk->sk_allocation);
1189 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1190 			return -ENOBUFS;
1191 
1192 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1193 						   sk->sk_allocation);
1194 		if (opt->hopopt && !v6_cork->opt->hopopt)
1195 			return -ENOBUFS;
1196 
1197 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1198 						    sk->sk_allocation);
1199 		if (opt->srcrt && !v6_cork->opt->srcrt)
1200 			return -ENOBUFS;
1201 
1202 		/* need source address above --miyazawa */
1203 	}
1204 	dst_hold(&rt->dst);
1205 	cork->base.dst = &rt->dst;
1206 	cork->fl.u.ip6 = *fl6;
1207 	v6_cork->hop_limit = hlimit;
1208 	v6_cork->tclass = tclass;
1209 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1210 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1211 		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
1212 	else
1213 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1214 		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1215 	if (np->frag_size < mtu) {
1216 		if (np->frag_size)
1217 			mtu = np->frag_size;
1218 	}
1219 	cork->base.fragsize = mtu;
1220 	if (dst_allfrag(rt->dst.path))
1221 		cork->base.flags |= IPCORK_ALLFRAG;
1222 	cork->base.length = 0;
1223 
1224 	return 0;
1225 }
1226 
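/* Core of ip6_append_data(): grow the pending queue with getfrag()'d
 * data, starting a new MTU-sized buffer whenever the current one is
 * full and keeping every fragment boundary 8-byte aligned. Large UDP
 * sends may be diverted to the UFO path instead.
 */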
1227 static int __ip6_append_data(struct sock *sk,
1228 			     struct flowi6 *fl6,
1229 			     struct sk_buff_head *queue,
1230 			     struct inet_cork *cork,
1231 			     struct inet6_cork *v6_cork,
1232 			     struct page_frag *pfrag,
1233 			     int getfrag(void *from, char *to, int offset,
1234 					 int len, int odd, struct sk_buff *skb),
1235 			     void *from, int length, int transhdrlen,
1236 			     unsigned int flags, int dontfrag)
1237 {
1238 	struct sk_buff *skb, *skb_prev = NULL;
1239 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1240 	int exthdrlen = 0;
1241 	int dst_exthdrlen = 0;
1242 	int hh_len;
1243 	int copy;
1244 	int err;
1245 	int offset = 0;
1246 	__u8 tx_flags = 0;
1247 	u32 tskey = 0;
1248 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1249 	struct ipv6_txoptions *opt = v6_cork->opt;
1250 	int csummode = CHECKSUM_NONE;
1251 
1252 	skb = skb_peek_tail(queue);
1253 	if (!skb) {
1254 		exthdrlen = opt ? opt->opt_flen : 0;
1255 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1256 	}
1257 
1258 	mtu = cork->fragsize;
1259 	orig_mtu = mtu;
1260 
1261 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1262 
1263 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1264 			(opt ? opt->opt_nflen : 0);
1265 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1266 		     sizeof(struct frag_hdr);
1267 
1268 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1269 		unsigned int maxnonfragsize, headersize;
1270 
1271 		headersize = sizeof(struct ipv6hdr) +
1272 			     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1273 			     (dst_allfrag(&rt->dst) ?
1274 			      sizeof(struct frag_hdr) : 0) +
1275 			     rt->rt6i_nfheader_len;
1276 
1277 		if (ip6_sk_ignore_df(sk))
1278 			maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1279 		else
1280 			maxnonfragsize = mtu;
1281 
1282 		/* dontfrag active */
1283 		if ((cork->length + length > mtu - headersize) && dontfrag &&
1284 		    (sk->sk_protocol == IPPROTO_UDP ||
1285 		     sk->sk_protocol == IPPROTO_RAW)) {
1286 			ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1287 						   sizeof(struct ipv6hdr));
1288 			goto emsgsize;
1289 		}
1290 
1291 		if (cork->length + length > maxnonfragsize - headersize) {
1292 emsgsize:
1293 			ipv6_local_error(sk, EMSGSIZE, fl6,
1294 					 mtu - headersize +
1295 					 sizeof(struct ipv6hdr));
1296 			return -EMSGSIZE;
1297 		}
1298 	}
1299 
1300 	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1301 		sock_tx_timestamp(sk, &tx_flags);
1302 		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1303 		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1304 			tskey = sk->sk_tskey++;
1305 	}
1306 
1307 	/* If this is the first and only packet and the device
1308 	 * supports checksum offloading, let's use it.
1309 	 * Use transhdrlen, same as IPv4, because partial
1310 	 * sums only work when transhdrlen is set.
1311 	 */
1312 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1313 	    length + fragheaderlen < mtu &&
1314 	    rt->dst.dev->features & NETIF_F_V6_CSUM &&
1315 	    !exthdrlen)
1316 		csummode = CHECKSUM_PARTIAL;
1317 	/*
1318 	 * Let's try using as much space as possible.
1319 	 * Use MTU if total length of the message fits into the MTU.
1320 	 * Otherwise, we need to reserve fragment header and
1321 	 * fragment alignment (= 8-15 octets, in total).
1322 	 *
1323 	 * Note that we may need to "move" the data from the tail
1324 	 * of the buffer to the new fragment when we split
1325 	 * the message.
1326 	 *
1327 	 * FIXME: It may be fragmented into multiple chunks
1328 	 *        at once if non-fragmentable extension headers
1329 	 *        are too large.
1330 	 * --yoshfuji
1331 	 */
1332 
1333 	cork->length += length;
1334 	if (((length > mtu) ||
1335 	     (skb && skb_is_gso(skb))) &&
1336 	    (sk->sk_protocol == IPPROTO_UDP) &&
1337 	    (rt->dst.dev->features & NETIF_F_UFO) &&
1338 	    (sk->sk_type == SOCK_DGRAM)) {
1339 		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1340 					  hh_len, fragheaderlen,
1341 					  transhdrlen, mtu, flags, fl6);
1342 		if (err)
1343 			goto error;
1344 		return 0;
1345 	}
1346 
1347 	if (!skb)
1348 		goto alloc_new_skb;
1349 
1350 	while (length > 0) {
1351 		/* Check if the remaining data fits into current packet. */
1352 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1353 		if (copy < length)
1354 			copy = maxfraglen - skb->len;
1355 
1356 		if (copy <= 0) {
1357 			char *data;
1358 			unsigned int datalen;
1359 			unsigned int fraglen;
1360 			unsigned int fraggap;
1361 			unsigned int alloclen;
1362 alloc_new_skb:
1363 			/* There's no room in the current skb */
1364 			if (skb)
1365 				fraggap = skb->len - maxfraglen;
1366 			else
1367 				fraggap = 0;
1368 			/* update mtu and maxfraglen if necessary */
1369 			if (!skb || !skb_prev)
1370 				ip6_append_data_mtu(&mtu, &maxfraglen,
1371 						    fragheaderlen, skb, rt,
1372 						    orig_mtu);
1373 
1374 			skb_prev = skb;
1375 
1376 			/*
1377 			 * If remaining data exceeds the mtu,
1378 			 * we know we need more fragment(s).
1379 			 */
1380 			datalen = length + fraggap;
1381 
1382 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1383 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1384 			if ((flags & MSG_MORE) &&
1385 			    !(rt->dst.dev->features&NETIF_F_SG))
1386 				alloclen = mtu;
1387 			else
1388 				alloclen = datalen + fragheaderlen;
1389 
1390 			alloclen += dst_exthdrlen;
1391 
1392 			if (datalen != length + fraggap) {
1393 				/*
1394 				 * this is not the last fragment; the trailer
1395 				 * space is regarded as data space.
1396 				 */
1397 				datalen += rt->dst.trailer_len;
1398 			}
1399 
1400 			alloclen += rt->dst.trailer_len;
1401 			fraglen = datalen + fragheaderlen;
1402 
1403 			/*
1404 			 * We just reserve space for the fragment header.
1405 			 * Note: this may be overallocation if the message
1406 			 * (without MSG_MORE) fits into the MTU.
1407 			 */
1408 			alloclen += sizeof(struct frag_hdr);
1409 
1410 			if (transhdrlen) {
1411 				skb = sock_alloc_send_skb(sk,
1412 						alloclen + hh_len,
1413 						(flags & MSG_DONTWAIT), &err);
1414 			} else {
1415 				skb = NULL;
1416 				if (atomic_read(&sk->sk_wmem_alloc) <=
1417 				    2 * sk->sk_sndbuf)
1418 					skb = sock_wmalloc(sk,
1419 							   alloclen + hh_len, 1,
1420 							   sk->sk_allocation);
1421 				if (unlikely(!skb))
1422 					err = -ENOBUFS;
1423 			}
1424 			if (!skb)
1425 				goto error;
1426 			/*
1427 			 *	Fill in the control structures
1428 			 */
1429 			skb->protocol = htons(ETH_P_IPV6);
1430 			skb->ip_summed = csummode;
1431 			skb->csum = 0;
1432 			/* reserve for fragmentation and ipsec header */
1433 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1434 				    dst_exthdrlen);
1435 
1436 			/* Only the initial fragment is time stamped */
1437 			skb_shinfo(skb)->tx_flags = tx_flags;
1438 			tx_flags = 0;
1439 			skb_shinfo(skb)->tskey = tskey;
1440 			tskey = 0;
1441 
1442 			/*
1443 			 *	Find where to start putting bytes
1444 			 */
1445 			data = skb_put(skb, fraglen);
1446 			skb_set_network_header(skb, exthdrlen);
1447 			data += fragheaderlen;
1448 			skb->transport_header = (skb->network_header +
1449 						 fragheaderlen);
1450 			if (fraggap) {
1451 				skb->csum = skb_copy_and_csum_bits(
1452 					skb_prev, maxfraglen,
1453 					data + transhdrlen, fraggap, 0);
1454 				skb_prev->csum = csum_sub(skb_prev->csum,
1455 							  skb->csum);
1456 				data += fraggap;
1457 				pskb_trim_unique(skb_prev, maxfraglen);
1458 			}
1459 			copy = datalen - transhdrlen - fraggap;
1460 
1461 			if (copy < 0) {
1462 				err = -EINVAL;
1463 				kfree_skb(skb);
1464 				goto error;
1465 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1466 				err = -EFAULT;
1467 				kfree_skb(skb);
1468 				goto error;
1469 			}
1470 
1471 			offset += copy;
1472 			length -= datalen - fraggap;
1473 			transhdrlen = 0;
1474 			exthdrlen = 0;
1475 			dst_exthdrlen = 0;
1476 
1477 			/*
1478 			 * Put the packet on the pending queue
1479 			 */
1480 			__skb_queue_tail(queue, skb);
1481 			continue;
1482 		}
1483 
1484 		if (copy > length)
1485 			copy = length;
1486 
1487 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1488 			unsigned int off;
1489 
1490 			off = skb->len;
1491 			if (getfrag(from, skb_put(skb, copy),
1492 						offset, copy, off, skb) < 0) {
1493 				__skb_trim(skb, off);
1494 				err = -EFAULT;
1495 				goto error;
1496 			}
1497 		} else {
1498 			int i = skb_shinfo(skb)->nr_frags;
1499 
1500 			err = -ENOMEM;
1501 			if (!sk_page_frag_refill(sk, pfrag))
1502 				goto error;
1503 
1504 			if (!skb_can_coalesce(skb, i, pfrag->page,
1505 					      pfrag->offset)) {
1506 				err = -EMSGSIZE;
1507 				if (i == MAX_SKB_FRAGS)
1508 					goto error;
1509 
1510 				__skb_fill_page_desc(skb, i, pfrag->page,
1511 						     pfrag->offset, 0);
1512 				skb_shinfo(skb)->nr_frags = ++i;
1513 				get_page(pfrag->page);
1514 			}
1515 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1516 			if (getfrag(from,
1517 				    page_address(pfrag->page) + pfrag->offset,
1518 				    offset, copy, skb->len, skb) < 0)
1519 				goto error_efault;
1520 
1521 			pfrag->offset += copy;
1522 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1523 			skb->len += copy;
1524 			skb->data_len += copy;
1525 			skb->truesize += copy;
1526 			atomic_add(copy, &sk->sk_wmem_alloc);
1527 		}
1528 		offset += copy;
1529 		length -= copy;
1530 	}
1531 
1532 	return 0;
1533 
1534 error_efault:
1535 	err = -EFAULT;
1536 error:
1537 	cork->length -= length;
1538 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1539 	return err;
1540 }
1541 
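/* Typical datagram usage, as in the UDP/ICMP send paths: cork, append,
 * then push (or flush on error). A minimal sketch, assuming the caller
 * has prepared fl6, opt and rt, that getfrag copies from the message,
 * and that corkreq reflects a pending MSG_MORE/IPV6_CORK request
 * (names here are illustrative, not part of this file):
 *
 *	err = ip6_append_data(sk, getfrag, msg, len,
 *			      sizeof(struct udphdr), hlimit, tclass,
 *			      opt, &fl6, rt, msg->msg_flags, dontfrag);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!corkreq)
 *		err = ip6_push_pending_frames(sk);
 */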
1542 int ip6_append_data(struct sock *sk,
1543 		    int getfrag(void *from, char *to, int offset, int len,
1544 				int odd, struct sk_buff *skb),
1545 		    void *from, int length, int transhdrlen, int hlimit,
1546 		    int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1547 		    struct rt6_info *rt, unsigned int flags, int dontfrag)
1548 {
1549 	struct inet_sock *inet = inet_sk(sk);
1550 	struct ipv6_pinfo *np = inet6_sk(sk);
1551 	int exthdrlen;
1552 	int err;
1553 
1554 	if (flags&MSG_PROBE)
1555 		return 0;
1556 	if (skb_queue_empty(&sk->sk_write_queue)) {
1557 		/*
1558 		 * setup for corking
1559 		 */
1560 		err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
1561 				     tclass, opt, rt, fl6);
1562 		if (err)
1563 			return err;
1564 
1565 		exthdrlen = (opt ? opt->opt_flen : 0);
1566 		length += exthdrlen;
1567 		transhdrlen += exthdrlen;
1568 	} else {
1569 		fl6 = &inet->cork.fl.u.ip6;
1570 		transhdrlen = 0;
1571 	}
1572 
1573 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1574 				 &np->cork, sk_page_frag(sk), getfrag,
1575 				 from, length, transhdrlen, flags, dontfrag);
1576 }
1577 EXPORT_SYMBOL_GPL(ip6_append_data);
1578 
1579 static void ip6_cork_release(struct inet_cork_full *cork,
1580 			     struct inet6_cork *v6_cork)
1581 {
1582 	if (v6_cork->opt) {
1583 		kfree(v6_cork->opt->dst0opt);
1584 		kfree(v6_cork->opt->dst1opt);
1585 		kfree(v6_cork->opt->hopopt);
1586 		kfree(v6_cork->opt->srcrt);
1587 		kfree(v6_cork->opt);
1588 		v6_cork->opt = NULL;
1589 	}
1590 
1591 	if (cork->base.dst) {
1592 		dst_release(cork->base.dst);
1593 		cork->base.dst = NULL;
1594 		cork->base.flags &= ~IPCORK_ALLFRAG;
1595 	}
1596 	memset(&cork->fl, 0, sizeof(cork->fl));
1597 }
1598 
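/* Collapse the pending queue into a single skb: chain the follow-on
 * buffers onto the head's frag_list, push the extension headers and
 * the IPv6 header, and release the cork state.
 */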
1599 struct sk_buff *__ip6_make_skb(struct sock *sk,
1600 			       struct sk_buff_head *queue,
1601 			       struct inet_cork_full *cork,
1602 			       struct inet6_cork *v6_cork)
1603 {
1604 	struct sk_buff *skb, *tmp_skb;
1605 	struct sk_buff **tail_skb;
1606 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1607 	struct ipv6_pinfo *np = inet6_sk(sk);
1608 	struct net *net = sock_net(sk);
1609 	struct ipv6hdr *hdr;
1610 	struct ipv6_txoptions *opt = v6_cork->opt;
1611 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1612 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1613 	unsigned char proto = fl6->flowi6_proto;
1614 
1615 	skb = __skb_dequeue(queue);
1616 	if (!skb)
1617 		goto out;
1618 	tail_skb = &(skb_shinfo(skb)->frag_list);
1619 
1620 	/* move skb->data to ip header from ext header */
1621 	if (skb->data < skb_network_header(skb))
1622 		__skb_pull(skb, skb_network_offset(skb));
1623 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1624 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1625 		*tail_skb = tmp_skb;
1626 		tail_skb = &(tmp_skb->next);
1627 		skb->len += tmp_skb->len;
1628 		skb->data_len += tmp_skb->len;
1629 		skb->truesize += tmp_skb->truesize;
1630 		tmp_skb->destructor = NULL;
1631 		tmp_skb->sk = NULL;
1632 	}
1633 
1634 	/* Allow local fragmentation. */
1635 	skb->ignore_df = ip6_sk_ignore_df(sk);
1636 
1637 	*final_dst = fl6->daddr;
1638 	__skb_pull(skb, skb_network_header_len(skb));
1639 	if (opt && opt->opt_flen)
1640 		ipv6_push_frag_opts(skb, opt, &proto);
1641 	if (opt && opt->opt_nflen)
1642 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1643 
1644 	skb_push(skb, sizeof(struct ipv6hdr));
1645 	skb_reset_network_header(skb);
1646 	hdr = ipv6_hdr(skb);
1647 
1648 	ip6_flow_hdr(hdr, v6_cork->tclass,
1649 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1650 					np->autoflowlabel));
1651 	hdr->hop_limit = v6_cork->hop_limit;
1652 	hdr->nexthdr = proto;
1653 	hdr->saddr = fl6->saddr;
1654 	hdr->daddr = *final_dst;
1655 
1656 	skb->priority = sk->sk_priority;
1657 	skb->mark = sk->sk_mark;
1658 
1659 	skb_dst_set(skb, dst_clone(&rt->dst));
1660 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1661 	if (proto == IPPROTO_ICMPV6) {
1662 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1663 
1664 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1665 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1666 	}
1667 
1668 	ip6_cork_release(cork, v6_cork);
1669 out:
1670 	return skb;
1671 }
1672 
1673 int ip6_send_skb(struct sk_buff *skb)
1674 {
1675 	struct net *net = sock_net(skb->sk);
1676 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1677 	int err;
1678 
1679 	err = ip6_local_out(skb);
1680 	if (err) {
1681 		if (err > 0)
1682 			err = net_xmit_errno(err);
1683 		if (err)
1684 			IP6_INC_STATS(net, rt->rt6i_idev,
1685 				      IPSTATS_MIB_OUTDISCARDS);
1686 	}
1687 
1688 	return err;
1689 }
1690 
1691 int ip6_push_pending_frames(struct sock *sk)
1692 {
1693 	struct sk_buff *skb;
1694 
1695 	skb = ip6_finish_skb(sk);
1696 	if (!skb)
1697 		return 0;
1698 
1699 	return ip6_send_skb(skb);
1700 }
1701 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1702 
1703 static void __ip6_flush_pending_frames(struct sock *sk,
1704 				       struct sk_buff_head *queue,
1705 				       struct inet_cork_full *cork,
1706 				       struct inet6_cork *v6_cork)
1707 {
1708 	struct sk_buff *skb;
1709 
1710 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1711 		if (skb_dst(skb))
1712 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1713 				      IPSTATS_MIB_OUTDISCARDS);
1714 		kfree_skb(skb);
1715 	}
1716 
1717 	ip6_cork_release(cork, v6_cork);
1718 }
1719 
1720 void ip6_flush_pending_frames(struct sock *sk)
1721 {
1722 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1723 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1724 }
1725 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1726 
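/* Counterpart of ip6_append_data() + ip6_finish_skb() that builds the
 * whole datagram on a private queue and returns it ready for
 * ip6_send_skb(), without touching sk->sk_write_queue.
 */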
1727 struct sk_buff *ip6_make_skb(struct sock *sk,
1728 			     int getfrag(void *from, char *to, int offset,
1729 					 int len, int odd, struct sk_buff *skb),
1730 			     void *from, int length, int transhdrlen,
1731 			     int hlimit, int tclass,
1732 			     struct ipv6_txoptions *opt, struct flowi6 *fl6,
1733 			     struct rt6_info *rt, unsigned int flags,
1734 			     int dontfrag)
1735 {
1736 	struct inet_cork_full cork;
1737 	struct inet6_cork v6_cork;
1738 	struct sk_buff_head queue;
1739 	int exthdrlen = (opt ? opt->opt_flen : 0);
1740 	int err;
1741 
1742 	if (flags & MSG_PROBE)
1743 		return NULL;
1744 
1745 	__skb_queue_head_init(&queue);
1746 
1747 	cork.base.flags = 0;
1748 	cork.base.addr = 0;
1749 	cork.base.opt = NULL;
1750 	v6_cork.opt = NULL;
1751 	err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
1752 	if (err)
1753 		return ERR_PTR(err);
1754 
1755 	if (dontfrag < 0)
1756 		dontfrag = inet6_sk(sk)->dontfrag;
1757 
1758 	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1759 				&current->task_frag, getfrag, from,
1760 				length + exthdrlen, transhdrlen + exthdrlen,
1761 				flags, dontfrag);
1762 	if (err) {
1763 		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1764 		return ERR_PTR(err);
1765 	}
1766 
1767 	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1768 }
1769