xref: /linux/net/ipv6/route.c (revision 33619f0d3ff715a2a5499520967d526ad931d70d)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57 
58 #include <asm/uaccess.h>
59 
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63 
/*
 * Debug verbosity for this file.  The tracing helpers below only emit
 * output when RT6_DEBUG is 3 or higher.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
/* Tracing disabled: both helpers compile away. */
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
/* Tracing enabled: RDBG() takes a parenthesised printk argument list. */
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
74 
75 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
76 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
77 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
78 static unsigned int	 ip6_default_mtu(const struct dst_entry *dst);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void		ip6_dst_destroy(struct dst_entry *);
81 static void		ip6_dst_ifdown(struct dst_entry *,
82 				       struct net_device *dev, int how);
83 static int		 ip6_dst_gc(struct dst_ops *ops);
84 
85 static int		ip6_pkt_discard(struct sk_buff *skb);
86 static int		ip6_pkt_discard_out(struct sk_buff *skb);
87 static void		ip6_link_failure(struct sk_buff *skb);
88 static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89 
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92 					   struct in6_addr *prefix, int prefixlen,
93 					   struct in6_addr *gwaddr, int ifindex,
94 					   unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96 					   struct in6_addr *prefix, int prefixlen,
97 					   struct in6_addr *gwaddr, int ifindex);
98 #endif
99 
100 static struct dst_ops ip6_dst_ops_template = {
101 	.family			=	AF_INET6,
102 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
103 	.gc			=	ip6_dst_gc,
104 	.gc_thresh		=	1024,
105 	.check			=	ip6_dst_check,
106 	.default_advmss		=	ip6_default_advmss,
107 	.default_mtu		=	ip6_default_mtu,
108 	.destroy		=	ip6_dst_destroy,
109 	.ifdown			=	ip6_dst_ifdown,
110 	.negative_advice	=	ip6_negative_advice,
111 	.link_failure		=	ip6_link_failure,
112 	.update_pmtu		=	ip6_rt_update_pmtu,
113 	.local_out		=	__ip6_local_out,
114 };
115 
/* default_mtu callback for blackhole entries: always reports 0. */
static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
{
	/* @dst is intentionally ignored. */
	return 0;
}
120 
121 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
122 {
123 }
124 
125 static struct dst_ops ip6_dst_blackhole_ops = {
126 	.family			=	AF_INET6,
127 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
128 	.destroy		=	ip6_dst_destroy,
129 	.check			=	ip6_dst_check,
130 	.default_mtu		=	ip6_blackhole_default_mtu,
131 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
132 };
133 
134 static struct rt6_info ip6_null_entry_template = {
135 	.dst = {
136 		.__refcnt	= ATOMIC_INIT(1),
137 		.__use		= 1,
138 		.obsolete	= -1,
139 		.error		= -ENETUNREACH,
140 		.input		= ip6_pkt_discard,
141 		.output		= ip6_pkt_discard_out,
142 	},
143 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
144 	.rt6i_protocol  = RTPROT_KERNEL,
145 	.rt6i_metric	= ~(u32) 0,
146 	.rt6i_ref	= ATOMIC_INIT(1),
147 };
148 
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for a "prohibit" reject route: packets go to the prohibit
 * handlers and the entry carries -EACCES as its error.
 */
static struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt = ATOMIC_INIT(1),
		.__use = 1,
		.obsolete = -1,
		.error = -EACCES,
		.input = ip6_pkt_prohibit,
		.output = ip6_pkt_prohibit_out,
	},
	.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol = RTPROT_KERNEL,
	.rt6i_metric = ~(u32) 0,
	.rt6i_ref = ATOMIC_INIT(1),
};

/*
 * Template for a "blackhole" reject route: packets are passed to
 * dst_discard in both directions and the entry carries -EINVAL.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt = ATOMIC_INIT(1),
		.__use = 1,
		.obsolete = -1,
		.error = -EINVAL,
		.input = dst_discard,
		.output = dst_discard,
	},
	.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol = RTPROT_KERNEL,
	.rt6i_metric = ~(u32) 0,
	.rt6i_ref = ATOMIC_INIT(1),
};

#endif
185 
/*
 * Allocate a dst entry from @ops via dst_alloc() and hand it back typed
 * as an rt6_info.  Returns whatever dst_alloc() returned, cast.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *)dst_alloc(ops);
	return rt;
}
191 
192 static void ip6_dst_destroy(struct dst_entry *dst)
193 {
194 	struct rt6_info *rt = (struct rt6_info *)dst;
195 	struct inet6_dev *idev = rt->rt6i_idev;
196 	struct inet_peer *peer = rt->rt6i_peer;
197 
198 	if (idev != NULL) {
199 		rt->rt6i_idev = NULL;
200 		in6_dev_put(idev);
201 	}
202 	if (peer) {
203 		rt->rt6i_peer = NULL;
204 		inet_putpeer(peer);
205 	}
206 }
207 
208 void rt6_bind_peer(struct rt6_info *rt, int create)
209 {
210 	struct inet_peer *peer;
211 
212 	peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
213 	if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
214 		inet_putpeer(peer);
215 }
216 
217 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
218 			   int how)
219 {
220 	struct rt6_info *rt = (struct rt6_info *)dst;
221 	struct inet6_dev *idev = rt->rt6i_idev;
222 	struct net_device *loopback_dev =
223 		dev_net(dev)->loopback_dev;
224 
225 	if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
226 		struct inet6_dev *loopback_idev =
227 			in6_dev_get(loopback_dev);
228 		if (loopback_idev != NULL) {
229 			rt->rt6i_idev = loopback_idev;
230 			in6_dev_put(idev);
231 		}
232 	}
233 }
234 
235 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
236 {
237 	return (rt->rt6i_flags & RTF_EXPIRES) &&
238 		time_after(jiffies, rt->rt6i_expires);
239 }
240 
241 static inline int rt6_need_strict(struct in6_addr *daddr)
242 {
243 	return ipv6_addr_type(daddr) &
244 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
245 }
246 
247 /*
248  *	Route lookup. Any table->tb6_lock is implied.
249  */
250 
251 static inline struct rt6_info *rt6_device_match(struct net *net,
252 						    struct rt6_info *rt,
253 						    struct in6_addr *saddr,
254 						    int oif,
255 						    int flags)
256 {
257 	struct rt6_info *local = NULL;
258 	struct rt6_info *sprt;
259 
260 	if (!oif && ipv6_addr_any(saddr))
261 		goto out;
262 
263 	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
264 		struct net_device *dev = sprt->rt6i_dev;
265 
266 		if (oif) {
267 			if (dev->ifindex == oif)
268 				return sprt;
269 			if (dev->flags & IFF_LOOPBACK) {
270 				if (sprt->rt6i_idev == NULL ||
271 				    sprt->rt6i_idev->dev->ifindex != oif) {
272 					if (flags & RT6_LOOKUP_F_IFACE && oif)
273 						continue;
274 					if (local && (!oif ||
275 						      local->rt6i_idev->dev->ifindex == oif))
276 						continue;
277 				}
278 				local = sprt;
279 			}
280 		} else {
281 			if (ipv6_chk_addr(net, saddr, dev,
282 					  flags & RT6_LOOKUP_F_IFACE))
283 				return sprt;
284 		}
285 	}
286 
287 	if (oif) {
288 		if (local)
289 			return local;
290 
291 		if (flags & RT6_LOOKUP_F_IFACE)
292 			return net->ipv6.ip6_null_entry;
293 	}
294 out:
295 	return rt;
296 }
297 
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing.
 *
 * If the next-hop neighbour of @rt is not in a NUD_VALID state and the
 * last update of the neighbour entry is older than the interface's
 * rtr_probe_interval, send a Neighbour Solicitation to the solicited-node
 * multicast address of the neighbour.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one per
 * minute; the rate limit is implemented through neigh->updated.
 *
 * neigh->updated is modified here, so the neighbour lock is taken for
 * writing.  Holding it only for reading would let concurrent callers all
 * pass the time check and each send a probe.
 *
 * @rt may be NULL, in which case nothing happens.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;
	int send_probe = 0;

	/* Cheap unlocked pre-check; re-validated under the lock below. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		neigh->updated = jiffies;
		send_probe = 1;
	}
	write_unlock_bh(&neigh->lock);

	if (!send_probe)
		return;

	/* The solicitation itself is sent without holding the lock. */
	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Router preference support disabled: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
332 
333 /*
334  * Default Router Selection (RFC 2461 6.3.6)
335  */
336 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
337 {
338 	struct net_device *dev = rt->rt6i_dev;
339 	if (!oif || dev->ifindex == oif)
340 		return 2;
341 	if ((dev->flags & IFF_LOOPBACK) &&
342 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
343 		return 1;
344 	return 0;
345 }
346 
347 static inline int rt6_check_neigh(struct rt6_info *rt)
348 {
349 	struct neighbour *neigh = rt->rt6i_nexthop;
350 	int m;
351 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
352 	    !(rt->rt6i_flags & RTF_GATEWAY))
353 		m = 1;
354 	else if (neigh) {
355 		read_lock_bh(&neigh->lock);
356 		if (neigh->nud_state & NUD_VALID)
357 			m = 2;
358 #ifdef CONFIG_IPV6_ROUTER_PREF
359 		else if (neigh->nud_state & NUD_FAILED)
360 			m = 0;
361 #endif
362 		else
363 			m = 1;
364 		read_unlock_bh(&neigh->lock);
365 	} else
366 		m = 0;
367 	return m;
368 }
369 
370 static int rt6_score_route(struct rt6_info *rt, int oif,
371 			   int strict)
372 {
373 	int m, n;
374 
375 	m = rt6_check_dev(rt, oif);
376 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
377 		return -1;
378 #ifdef CONFIG_IPV6_ROUTER_PREF
379 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
380 #endif
381 	n = rt6_check_neigh(rt);
382 	if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
383 		return -1;
384 	return m;
385 }
386 
387 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
388 				   int *mpri, struct rt6_info *match)
389 {
390 	int m;
391 
392 	if (rt6_check_expired(rt))
393 		goto out;
394 
395 	m = rt6_score_route(rt, oif, strict);
396 	if (m < 0)
397 		goto out;
398 
399 	if (m > *mpri) {
400 		if (strict & RT6_LOOKUP_F_REACHABLE)
401 			rt6_probe(match);
402 		*mpri = m;
403 		match = rt;
404 	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
405 		rt6_probe(rt);
406 	}
407 
408 out:
409 	return match;
410 }
411 
412 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
413 				     struct rt6_info *rr_head,
414 				     u32 metric, int oif, int strict)
415 {
416 	struct rt6_info *rt, *match;
417 	int mpri = -1;
418 
419 	match = NULL;
420 	for (rt = rr_head; rt && rt->rt6i_metric == metric;
421 	     rt = rt->dst.rt6_next)
422 		match = find_match(rt, oif, strict, &mpri, match);
423 	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
424 	     rt = rt->dst.rt6_next)
425 		match = find_match(rt, oif, strict, &mpri, match);
426 
427 	return match;
428 }
429 
430 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
431 {
432 	struct rt6_info *match, *rt0;
433 	struct net *net;
434 
435 	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
436 		  __func__, fn->leaf, oif);
437 
438 	rt0 = fn->rr_ptr;
439 	if (!rt0)
440 		fn->rr_ptr = rt0 = fn->leaf;
441 
442 	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
443 
444 	if (!match &&
445 	    (strict & RT6_LOOKUP_F_REACHABLE)) {
446 		struct rt6_info *next = rt0->dst.rt6_next;
447 
448 		/* no entries matched; do round-robin */
449 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
450 			next = fn->leaf;
451 
452 		if (next != rt0)
453 			fn->rr_ptr = next;
454 	}
455 
456 	RT6_TRACE("%s() => %p\n",
457 		  __func__, match);
458 
459 	net = dev_net(rt0->rt6i_dev);
460 	return match ? match : net->ipv6.ip6_null_entry;
461 }
462 
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a received route information option.
 *
 * @dev:    device the option arrived on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router
 *
 * After validating the option, the matching route-info route is looked
 * up.  A zero lifetime deletes an existing route; a non-zero lifetime
 * creates the route if it is missing, or refreshes the preference bits
 * of an existing one.  The expiry is then set from the lifetime (or
 * cleared when the lifetime is not finite) and the reference on the
 * route is dropped.
 *
 * Returns 0 on success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr masked_prefix;
	struct in6_addr *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length != 3) {
		/* this function is safe */
		ipv6_addr_prefix(&masked_prefix,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &masked_prefix;
	} else {
		prefix = (struct in6_addr *)rinfo->prefix;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* Zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (rt) {
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
	} else if (lifetime) {
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev->ifindex, pref);
	}

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}
	dst_release(&rt->dst);

	return 0;
}
#endif
536 
/*
 * BACKTRACK(net, saddr) - climb the FIB tree after a failed selection.
 *
 * This macro is not self-contained.  It expects the expanding function
 * to provide:
 *   - a local "struct rt6_info *rt" holding the current lookup result,
 *   - a local "struct fib6_node *fn" holding the current node,
 *   - a label "restart" that retries the selection on the new fn,
 *   - a label "out" that finishes the lookup with the current rt.
 *
 * If rt is the namespace's null entry, the macro walks towards the root:
 * when the parent has a source subtree that is not the current node, the
 * subtree is searched for @saddr; otherwise the parent itself becomes
 * the current node.  The walk jumps to "restart" at the first node that
 * carries route information (RTN_RTINFO) and to "out" when the
 * top-level root is reached.
 *
 * Both macro arguments are parenthesised so that arbitrary expressions
 * can be passed safely.
 */
#define BACKTRACK(__net, saddr)			\
do { \
	if (rt == (__net)->ipv6.ip6_null_entry) {	\
		struct fib6_node *pn; \
		while (1) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, (saddr)); \
			else \
				fn = pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while(0)
554 
555 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
556 					     struct fib6_table *table,
557 					     struct flowi *fl, int flags)
558 {
559 	struct fib6_node *fn;
560 	struct rt6_info *rt;
561 
562 	read_lock_bh(&table->tb6_lock);
563 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
564 restart:
565 	rt = fn->leaf;
566 	rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
567 	BACKTRACK(net, &fl->fl6_src);
568 out:
569 	dst_use(&rt->dst, jiffies);
570 	read_unlock_bh(&table->tb6_lock);
571 	return rt;
572 
573 }
574 
575 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
576 			    const struct in6_addr *saddr, int oif, int strict)
577 {
578 	struct flowi fl = {
579 		.oif = oif,
580 		.fl6_dst = *daddr,
581 	};
582 	struct dst_entry *dst;
583 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
584 
585 	if (saddr) {
586 		memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
587 		flags |= RT6_LOOKUP_F_HAS_SADDR;
588 	}
589 
590 	dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
591 	if (dst->error == 0)
592 		return (struct rt6_info *) dst;
593 
594 	dst_release(dst);
595 
596 	return NULL;
597 }
598 
599 EXPORT_SYMBOL(rt6_lookup);
600 
601 /* ip6_ins_rt is called with FREE table->tb6_lock.
602    It takes new route entry, the addition fails by any reason the
603    route is freed. In any case, if caller does not hold it, it may
604    be destroyed.
605  */
606 
607 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
608 {
609 	int err;
610 	struct fib6_table *table;
611 
612 	table = rt->rt6i_table;
613 	write_lock_bh(&table->tb6_lock);
614 	err = fib6_add(&table->tb6_root, rt, info);
615 	write_unlock_bh(&table->tb6_lock);
616 
617 	return err;
618 }
619 
620 int ip6_ins_rt(struct rt6_info *rt)
621 {
622 	struct nl_info info = {
623 		.nl_net = dev_net(rt->rt6i_dev),
624 	};
625 	return __ip6_ins_rt(rt, &info);
626 }
627 
628 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
629 				      struct in6_addr *saddr)
630 {
631 	struct rt6_info *rt;
632 
633 	/*
634 	 *	Clone the route.
635 	 */
636 
637 	rt = ip6_rt_copy(ort);
638 
639 	if (rt) {
640 		struct neighbour *neigh;
641 		int attempts = !in_softirq();
642 
643 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
644 			if (rt->rt6i_dst.plen != 128 &&
645 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
646 				rt->rt6i_flags |= RTF_ANYCAST;
647 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
648 		}
649 
650 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
651 		rt->rt6i_dst.plen = 128;
652 		rt->rt6i_flags |= RTF_CACHE;
653 		rt->dst.flags |= DST_HOST;
654 
655 #ifdef CONFIG_IPV6_SUBTREES
656 		if (rt->rt6i_src.plen && saddr) {
657 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
658 			rt->rt6i_src.plen = 128;
659 		}
660 #endif
661 
662 	retry:
663 		neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
664 		if (IS_ERR(neigh)) {
665 			struct net *net = dev_net(rt->rt6i_dev);
666 			int saved_rt_min_interval =
667 				net->ipv6.sysctl.ip6_rt_gc_min_interval;
668 			int saved_rt_elasticity =
669 				net->ipv6.sysctl.ip6_rt_gc_elasticity;
670 
671 			if (attempts-- > 0) {
672 				net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
673 				net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
674 
675 				ip6_dst_gc(&net->ipv6.ip6_dst_ops);
676 
677 				net->ipv6.sysctl.ip6_rt_gc_elasticity =
678 					saved_rt_elasticity;
679 				net->ipv6.sysctl.ip6_rt_gc_min_interval =
680 					saved_rt_min_interval;
681 				goto retry;
682 			}
683 
684 			if (net_ratelimit())
685 				printk(KERN_WARNING
686 				       "ipv6: Neighbour table overflow.\n");
687 			dst_free(&rt->dst);
688 			return NULL;
689 		}
690 		rt->rt6i_nexthop = neigh;
691 
692 	}
693 
694 	return rt;
695 }
696 
697 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
698 {
699 	struct rt6_info *rt = ip6_rt_copy(ort);
700 	if (rt) {
701 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
702 		rt->rt6i_dst.plen = 128;
703 		rt->rt6i_flags |= RTF_CACHE;
704 		rt->dst.flags |= DST_HOST;
705 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
706 	}
707 	return rt;
708 }
709 
710 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
711 				      struct flowi *fl, int flags)
712 {
713 	struct fib6_node *fn;
714 	struct rt6_info *rt, *nrt;
715 	int strict = 0;
716 	int attempts = 3;
717 	int err;
718 	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
719 
720 	strict |= flags & RT6_LOOKUP_F_IFACE;
721 
722 relookup:
723 	read_lock_bh(&table->tb6_lock);
724 
725 restart_2:
726 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
727 
728 restart:
729 	rt = rt6_select(fn, oif, strict | reachable);
730 
731 	BACKTRACK(net, &fl->fl6_src);
732 	if (rt == net->ipv6.ip6_null_entry ||
733 	    rt->rt6i_flags & RTF_CACHE)
734 		goto out;
735 
736 	dst_hold(&rt->dst);
737 	read_unlock_bh(&table->tb6_lock);
738 
739 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
740 		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
741 	else
742 		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
743 
744 	dst_release(&rt->dst);
745 	rt = nrt ? : net->ipv6.ip6_null_entry;
746 
747 	dst_hold(&rt->dst);
748 	if (nrt) {
749 		err = ip6_ins_rt(nrt);
750 		if (!err)
751 			goto out2;
752 	}
753 
754 	if (--attempts <= 0)
755 		goto out2;
756 
757 	/*
758 	 * Race condition! In the gap, when table->tb6_lock was
759 	 * released someone could insert this route.  Relookup.
760 	 */
761 	dst_release(&rt->dst);
762 	goto relookup;
763 
764 out:
765 	if (reachable) {
766 		reachable = 0;
767 		goto restart_2;
768 	}
769 	dst_hold(&rt->dst);
770 	read_unlock_bh(&table->tb6_lock);
771 out2:
772 	rt->dst.lastuse = jiffies;
773 	rt->dst.__use++;
774 
775 	return rt;
776 }
777 
778 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
779 					    struct flowi *fl, int flags)
780 {
781 	return ip6_pol_route(net, table, fl->iif, fl, flags);
782 }
783 
784 void ip6_route_input(struct sk_buff *skb)
785 {
786 	struct ipv6hdr *iph = ipv6_hdr(skb);
787 	struct net *net = dev_net(skb->dev);
788 	int flags = RT6_LOOKUP_F_HAS_SADDR;
789 	struct flowi fl = {
790 		.iif = skb->dev->ifindex,
791 		.fl6_dst = iph->daddr,
792 		.fl6_src = iph->saddr,
793 		.fl6_flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
794 		.mark = skb->mark,
795 		.proto = iph->nexthdr,
796 	};
797 
798 	if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
799 		flags |= RT6_LOOKUP_F_IFACE;
800 
801 	skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
802 }
803 
804 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
805 					     struct flowi *fl, int flags)
806 {
807 	return ip6_pol_route(net, table, fl->oif, fl, flags);
808 }
809 
810 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
811 				    struct flowi *fl)
812 {
813 	int flags = 0;
814 
815 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
816 		flags |= RT6_LOOKUP_F_IFACE;
817 
818 	if (!ipv6_addr_any(&fl->fl6_src))
819 		flags |= RT6_LOOKUP_F_HAS_SADDR;
820 	else if (sk)
821 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
822 
823 	return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
824 }
825 
826 EXPORT_SYMBOL(ip6_route_output);
827 
828 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
829 {
830 	struct rt6_info *ort = (struct rt6_info *) *dstp;
831 	struct rt6_info *rt = (struct rt6_info *)
832 		dst_alloc(&ip6_dst_blackhole_ops);
833 	struct dst_entry *new = NULL;
834 
835 	if (rt) {
836 		new = &rt->dst;
837 
838 		atomic_set(&new->__refcnt, 1);
839 		new->__use = 1;
840 		new->input = dst_discard;
841 		new->output = dst_discard;
842 
843 		dst_copy_metrics(new, &ort->dst);
844 		new->dev = ort->dst.dev;
845 		if (new->dev)
846 			dev_hold(new->dev);
847 		rt->rt6i_idev = ort->rt6i_idev;
848 		if (rt->rt6i_idev)
849 			in6_dev_hold(rt->rt6i_idev);
850 		rt->rt6i_expires = 0;
851 
852 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
853 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
854 		rt->rt6i_metric = 0;
855 
856 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
857 #ifdef CONFIG_IPV6_SUBTREES
858 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
859 #endif
860 
861 		dst_free(new);
862 	}
863 
864 	dst_release(*dstp);
865 	*dstp = new;
866 	return new ? 0 : -ENOMEM;
867 }
868 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
869 
870 /*
871  *	Destination cache support functions
872  */
873 
874 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
875 {
876 	struct rt6_info *rt;
877 
878 	rt = (struct rt6_info *) dst;
879 
880 	if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
881 		return dst;
882 
883 	return NULL;
884 }
885 
886 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
887 {
888 	struct rt6_info *rt = (struct rt6_info *) dst;
889 
890 	if (rt) {
891 		if (rt->rt6i_flags & RTF_CACHE) {
892 			if (rt6_check_expired(rt)) {
893 				ip6_del_rt(rt);
894 				dst = NULL;
895 			}
896 		} else {
897 			dst_release(dst);
898 			dst = NULL;
899 		}
900 	}
901 	return dst;
902 }
903 
904 static void ip6_link_failure(struct sk_buff *skb)
905 {
906 	struct rt6_info *rt;
907 
908 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
909 
910 	rt = (struct rt6_info *) skb_dst(skb);
911 	if (rt) {
912 		if (rt->rt6i_flags&RTF_CACHE) {
913 			dst_set_expires(&rt->dst, 0);
914 			rt->rt6i_flags |= RTF_EXPIRES;
915 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
916 			rt->rt6i_node->fn_sernum = -1;
917 	}
918 }
919 
920 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
921 {
922 	struct rt6_info *rt6 = (struct rt6_info*)dst;
923 
924 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
925 		rt6->rt6i_flags |= RTF_MODIFIED;
926 		if (mtu < IPV6_MIN_MTU) {
927 			u32 features = dst_metric(dst, RTAX_FEATURES);
928 			mtu = IPV6_MIN_MTU;
929 			features |= RTAX_FEATURE_ALLFRAG;
930 			dst_metric_set(dst, RTAX_FEATURES, features);
931 		}
932 		dst_metric_set(dst, RTAX_MTU, mtu);
933 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
934 	}
935 }
936 
937 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
938 {
939 	struct net_device *dev = dst->dev;
940 	unsigned int mtu = dst_mtu(dst);
941 	struct net *net = dev_net(dev);
942 
943 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
944 
945 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
946 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
947 
948 	/*
949 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
950 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
951 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
952 	 * rely only on pmtu discovery"
953 	 */
954 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
955 		mtu = IPV6_MAXPLEN;
956 	return mtu;
957 }
958 
959 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
960 {
961 	unsigned int mtu = IPV6_MIN_MTU;
962 	struct inet6_dev *idev;
963 
964 	rcu_read_lock();
965 	idev = __in6_dev_get(dst->dev);
966 	if (idev)
967 		mtu = idev->cnf.mtu6;
968 	rcu_read_unlock();
969 
970 	return mtu;
971 }
972 
/*
 * Singly linked list (through dst->next) of the entries created by
 * icmp6_dst_alloc(); every access in this file happens under
 * icmp6_dst_lock.
 */
static struct dst_entry *icmp6_dst_gc_list;
static DEFINE_SPINLOCK(icmp6_dst_lock);
975 
976 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
977 				  struct neighbour *neigh,
978 				  const struct in6_addr *addr)
979 {
980 	struct rt6_info *rt;
981 	struct inet6_dev *idev = in6_dev_get(dev);
982 	struct net *net = dev_net(dev);
983 
984 	if (unlikely(idev == NULL))
985 		return NULL;
986 
987 	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
988 	if (unlikely(rt == NULL)) {
989 		in6_dev_put(idev);
990 		goto out;
991 	}
992 
993 	dev_hold(dev);
994 	if (neigh)
995 		neigh_hold(neigh);
996 	else {
997 		neigh = ndisc_get_neigh(dev, addr);
998 		if (IS_ERR(neigh))
999 			neigh = NULL;
1000 	}
1001 
1002 	rt->rt6i_dev	  = dev;
1003 	rt->rt6i_idev     = idev;
1004 	rt->rt6i_nexthop  = neigh;
1005 	atomic_set(&rt->dst.__refcnt, 1);
1006 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1007 	rt->dst.output  = ip6_output;
1008 
1009 #if 0	/* there's no chance to use these for ndisc */
1010 	rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
1011 				? DST_HOST
1012 				: 0;
1013 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1014 	rt->rt6i_dst.plen = 128;
1015 #endif
1016 
1017 	spin_lock_bh(&icmp6_dst_lock);
1018 	rt->dst.next = icmp6_dst_gc_list;
1019 	icmp6_dst_gc_list = &rt->dst;
1020 	spin_unlock_bh(&icmp6_dst_lock);
1021 
1022 	fib6_force_start_gc(net);
1023 
1024 out:
1025 	return &rt->dst;
1026 }
1027 
1028 int icmp6_dst_gc(void)
1029 {
1030 	struct dst_entry *dst, *next, **pprev;
1031 	int more = 0;
1032 
1033 	next = NULL;
1034 
1035 	spin_lock_bh(&icmp6_dst_lock);
1036 	pprev = &icmp6_dst_gc_list;
1037 
1038 	while ((dst = *pprev) != NULL) {
1039 		if (!atomic_read(&dst->__refcnt)) {
1040 			*pprev = dst->next;
1041 			dst_free(dst);
1042 		} else {
1043 			pprev = &dst->next;
1044 			++more;
1045 		}
1046 	}
1047 
1048 	spin_unlock_bh(&icmp6_dst_lock);
1049 
1050 	return more;
1051 }
1052 
1053 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1054 			    void *arg)
1055 {
1056 	struct dst_entry *dst, **pprev;
1057 
1058 	spin_lock_bh(&icmp6_dst_lock);
1059 	pprev = &icmp6_dst_gc_list;
1060 	while ((dst = *pprev) != NULL) {
1061 		struct rt6_info *rt = (struct rt6_info *) dst;
1062 		if (func(rt, arg)) {
1063 			*pprev = dst->next;
1064 			dst_free(dst);
1065 		} else {
1066 			pprev = &dst->next;
1067 		}
1068 	}
1069 	spin_unlock_bh(&icmp6_dst_lock);
1070 }
1071 
1072 static int ip6_dst_gc(struct dst_ops *ops)
1073 {
1074 	unsigned long now = jiffies;
1075 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1076 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1077 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1078 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1079 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1080 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1081 	int entries;
1082 
1083 	entries = dst_entries_get_fast(ops);
1084 	if (time_after(rt_last_gc + rt_min_interval, now) &&
1085 	    entries <= rt_max_size)
1086 		goto out;
1087 
1088 	net->ipv6.ip6_rt_gc_expire++;
1089 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1090 	net->ipv6.ip6_rt_last_gc = now;
1091 	entries = dst_entries_get_slow(ops);
1092 	if (entries < ops->gc_thresh)
1093 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1094 out:
1095 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1096 	return entries > rt_max_size;
1097 }
1098 
1099 /* Clean host part of a prefix. Not necessary in radix tree,
1100    but results in cleaner routing tables.
1101 
1102    Remove it only when all the things will work!
1103  */
1104 
1105 int ip6_dst_hoplimit(struct dst_entry *dst)
1106 {
1107 	int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1108 	if (hoplimit == 0) {
1109 		struct net_device *dev = dst->dev;
1110 		struct inet6_dev *idev;
1111 
1112 		rcu_read_lock();
1113 		idev = __in6_dev_get(dev);
1114 		if (idev)
1115 			hoplimit = idev->cnf.hop_limit;
1116 		else
1117 			hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1118 		rcu_read_unlock();
1119 	}
1120 	return hoplimit;
1121 }
1122 EXPORT_SYMBOL(ip6_dst_hoplimit);
1123 
1124 /*
1125  *
1126  */
1127 
1128 int ip6_route_add(struct fib6_config *cfg)
1129 {
1130 	int err;
1131 	struct net *net = cfg->fc_nlinfo.nl_net;
1132 	struct rt6_info *rt = NULL;
1133 	struct net_device *dev = NULL;
1134 	struct inet6_dev *idev = NULL;
1135 	struct fib6_table *table;
1136 	int addr_type;
1137 
1138 	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1139 		return -EINVAL;
1140 #ifndef CONFIG_IPV6_SUBTREES
1141 	if (cfg->fc_src_len)
1142 		return -EINVAL;
1143 #endif
1144 	if (cfg->fc_ifindex) {
1145 		err = -ENODEV;
1146 		dev = dev_get_by_index(net, cfg->fc_ifindex);
1147 		if (!dev)
1148 			goto out;
1149 		idev = in6_dev_get(dev);
1150 		if (!idev)
1151 			goto out;
1152 	}
1153 
1154 	if (cfg->fc_metric == 0)
1155 		cfg->fc_metric = IP6_RT_PRIO_USER;
1156 
1157 	table = fib6_new_table(net, cfg->fc_table);
1158 	if (table == NULL) {
1159 		err = -ENOBUFS;
1160 		goto out;
1161 	}
1162 
1163 	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1164 
1165 	if (rt == NULL) {
1166 		err = -ENOMEM;
1167 		goto out;
1168 	}
1169 
1170 	rt->dst.obsolete = -1;
1171 	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1172 				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1173 				0;
1174 
1175 	if (cfg->fc_protocol == RTPROT_UNSPEC)
1176 		cfg->fc_protocol = RTPROT_BOOT;
1177 	rt->rt6i_protocol = cfg->fc_protocol;
1178 
1179 	addr_type = ipv6_addr_type(&cfg->fc_dst);
1180 
1181 	if (addr_type & IPV6_ADDR_MULTICAST)
1182 		rt->dst.input = ip6_mc_input;
1183 	else if (cfg->fc_flags & RTF_LOCAL)
1184 		rt->dst.input = ip6_input;
1185 	else
1186 		rt->dst.input = ip6_forward;
1187 
1188 	rt->dst.output = ip6_output;
1189 
1190 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1191 	rt->rt6i_dst.plen = cfg->fc_dst_len;
1192 	if (rt->rt6i_dst.plen == 128)
1193 	       rt->dst.flags = DST_HOST;
1194 
1195 #ifdef CONFIG_IPV6_SUBTREES
1196 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1197 	rt->rt6i_src.plen = cfg->fc_src_len;
1198 #endif
1199 
1200 	rt->rt6i_metric = cfg->fc_metric;
1201 
1202 	/* We cannot add true routes via loopback here,
1203 	   they would result in kernel looping; promote them to reject routes
1204 	 */
1205 	if ((cfg->fc_flags & RTF_REJECT) ||
1206 	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1207 					      && !(cfg->fc_flags&RTF_LOCAL))) {
1208 		/* hold loopback dev/idev if we haven't done so. */
1209 		if (dev != net->loopback_dev) {
1210 			if (dev) {
1211 				dev_put(dev);
1212 				in6_dev_put(idev);
1213 			}
1214 			dev = net->loopback_dev;
1215 			dev_hold(dev);
1216 			idev = in6_dev_get(dev);
1217 			if (!idev) {
1218 				err = -ENODEV;
1219 				goto out;
1220 			}
1221 		}
1222 		rt->dst.output = ip6_pkt_discard_out;
1223 		rt->dst.input = ip6_pkt_discard;
1224 		rt->dst.error = -ENETUNREACH;
1225 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1226 		goto install_route;
1227 	}
1228 
1229 	if (cfg->fc_flags & RTF_GATEWAY) {
1230 		struct in6_addr *gw_addr;
1231 		int gwa_type;
1232 
1233 		gw_addr = &cfg->fc_gateway;
1234 		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1235 		gwa_type = ipv6_addr_type(gw_addr);
1236 
1237 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1238 			struct rt6_info *grt;
1239 
1240 			/* IPv6 strictly inhibits using not link-local
1241 			   addresses as nexthop address.
1242 			   Otherwise, router will not able to send redirects.
1243 			   It is very good, but in some (rare!) circumstances
1244 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1245 			   some exceptions. --ANK
1246 			 */
1247 			err = -EINVAL;
1248 			if (!(gwa_type&IPV6_ADDR_UNICAST))
1249 				goto out;
1250 
1251 			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1252 
1253 			err = -EHOSTUNREACH;
1254 			if (grt == NULL)
1255 				goto out;
1256 			if (dev) {
1257 				if (dev != grt->rt6i_dev) {
1258 					dst_release(&grt->dst);
1259 					goto out;
1260 				}
1261 			} else {
1262 				dev = grt->rt6i_dev;
1263 				idev = grt->rt6i_idev;
1264 				dev_hold(dev);
1265 				in6_dev_hold(grt->rt6i_idev);
1266 			}
1267 			if (!(grt->rt6i_flags&RTF_GATEWAY))
1268 				err = 0;
1269 			dst_release(&grt->dst);
1270 
1271 			if (err)
1272 				goto out;
1273 		}
1274 		err = -EINVAL;
1275 		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1276 			goto out;
1277 	}
1278 
1279 	err = -ENODEV;
1280 	if (dev == NULL)
1281 		goto out;
1282 
1283 	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1284 		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1285 		if (IS_ERR(rt->rt6i_nexthop)) {
1286 			err = PTR_ERR(rt->rt6i_nexthop);
1287 			rt->rt6i_nexthop = NULL;
1288 			goto out;
1289 		}
1290 	}
1291 
1292 	rt->rt6i_flags = cfg->fc_flags;
1293 
1294 install_route:
1295 	if (cfg->fc_mx) {
1296 		struct nlattr *nla;
1297 		int remaining;
1298 
1299 		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1300 			int type = nla_type(nla);
1301 
1302 			if (type) {
1303 				if (type > RTAX_MAX) {
1304 					err = -EINVAL;
1305 					goto out;
1306 				}
1307 
1308 				dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1309 			}
1310 		}
1311 	}
1312 
1313 	rt->dst.dev = dev;
1314 	rt->rt6i_idev = idev;
1315 	rt->rt6i_table = table;
1316 
1317 	cfg->fc_nlinfo.nl_net = dev_net(dev);
1318 
1319 	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1320 
1321 out:
1322 	if (dev)
1323 		dev_put(dev);
1324 	if (idev)
1325 		in6_dev_put(idev);
1326 	if (rt)
1327 		dst_free(&rt->dst);
1328 	return err;
1329 }
1330 
1331 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1332 {
1333 	int err;
1334 	struct fib6_table *table;
1335 	struct net *net = dev_net(rt->rt6i_dev);
1336 
1337 	if (rt == net->ipv6.ip6_null_entry)
1338 		return -ENOENT;
1339 
1340 	table = rt->rt6i_table;
1341 	write_lock_bh(&table->tb6_lock);
1342 
1343 	err = fib6_del(rt, info);
1344 	dst_release(&rt->dst);
1345 
1346 	write_unlock_bh(&table->tb6_lock);
1347 
1348 	return err;
1349 }
1350 
1351 int ip6_del_rt(struct rt6_info *rt)
1352 {
1353 	struct nl_info info = {
1354 		.nl_net = dev_net(rt->rt6i_dev),
1355 	};
1356 	return __ip6_del_rt(rt, &info);
1357 }
1358 
1359 static int ip6_route_del(struct fib6_config *cfg)
1360 {
1361 	struct fib6_table *table;
1362 	struct fib6_node *fn;
1363 	struct rt6_info *rt;
1364 	int err = -ESRCH;
1365 
1366 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1367 	if (table == NULL)
1368 		return err;
1369 
1370 	read_lock_bh(&table->tb6_lock);
1371 
1372 	fn = fib6_locate(&table->tb6_root,
1373 			 &cfg->fc_dst, cfg->fc_dst_len,
1374 			 &cfg->fc_src, cfg->fc_src_len);
1375 
1376 	if (fn) {
1377 		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1378 			if (cfg->fc_ifindex &&
1379 			    (rt->rt6i_dev == NULL ||
1380 			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1381 				continue;
1382 			if (cfg->fc_flags & RTF_GATEWAY &&
1383 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1384 				continue;
1385 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1386 				continue;
1387 			dst_hold(&rt->dst);
1388 			read_unlock_bh(&table->tb6_lock);
1389 
1390 			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1391 		}
1392 	}
1393 	read_unlock_bh(&table->tb6_lock);
1394 
1395 	return err;
1396 }
1397 
1398 /*
1399  *	Handle redirects
1400  */
/*
 * Flow key used for redirect lookups: a regular flowi plus the address
 * of the router that sent the redirect.
 */
struct ip6rd_flowi {
	/* Must stay first: the lookup casts between flowi and ip6rd_flowi. */
	struct flowi fl;
	/* Router address compared against each candidate route's gateway. */
	struct in6_addr gateway;
};
1405 
1406 static struct rt6_info *__ip6_route_redirect(struct net *net,
1407 					     struct fib6_table *table,
1408 					     struct flowi *fl,
1409 					     int flags)
1410 {
1411 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1412 	struct rt6_info *rt;
1413 	struct fib6_node *fn;
1414 
1415 	/*
1416 	 * Get the "current" route for this destination and
1417 	 * check if the redirect has come from approriate router.
1418 	 *
1419 	 * RFC 2461 specifies that redirects should only be
1420 	 * accepted if they come from the nexthop to the target.
1421 	 * Due to the way the routes are chosen, this notion
1422 	 * is a bit fuzzy and one might need to check all possible
1423 	 * routes.
1424 	 */
1425 
1426 	read_lock_bh(&table->tb6_lock);
1427 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1428 restart:
1429 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1430 		/*
1431 		 * Current route is on-link; redirect is always invalid.
1432 		 *
1433 		 * Seems, previous statement is not true. It could
1434 		 * be node, which looks for us as on-link (f.e. proxy ndisc)
1435 		 * But then router serving it might decide, that we should
1436 		 * know truth 8)8) --ANK (980726).
1437 		 */
1438 		if (rt6_check_expired(rt))
1439 			continue;
1440 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1441 			continue;
1442 		if (fl->oif != rt->rt6i_dev->ifindex)
1443 			continue;
1444 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1445 			continue;
1446 		break;
1447 	}
1448 
1449 	if (!rt)
1450 		rt = net->ipv6.ip6_null_entry;
1451 	BACKTRACK(net, &fl->fl6_src);
1452 out:
1453 	dst_hold(&rt->dst);
1454 
1455 	read_unlock_bh(&table->tb6_lock);
1456 
1457 	return rt;
1458 };
1459 
1460 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1461 					   struct in6_addr *src,
1462 					   struct in6_addr *gateway,
1463 					   struct net_device *dev)
1464 {
1465 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1466 	struct net *net = dev_net(dev);
1467 	struct ip6rd_flowi rdfl = {
1468 		.fl = {
1469 			.oif = dev->ifindex,
1470 			.fl6_dst = *dest,
1471 			.fl6_src = *src,
1472 		},
1473 	};
1474 
1475 	ipv6_addr_copy(&rdfl.gateway, gateway);
1476 
1477 	if (rt6_need_strict(dest))
1478 		flags |= RT6_LOOKUP_F_IFACE;
1479 
1480 	return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1481 						   flags, __ip6_route_redirect);
1482 }
1483 
1484 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1485 		  struct in6_addr *saddr,
1486 		  struct neighbour *neigh, u8 *lladdr, int on_link)
1487 {
1488 	struct rt6_info *rt, *nrt = NULL;
1489 	struct netevent_redirect netevent;
1490 	struct net *net = dev_net(neigh->dev);
1491 
1492 	rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1493 
1494 	if (rt == net->ipv6.ip6_null_entry) {
1495 		if (net_ratelimit())
1496 			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1497 			       "for redirect target\n");
1498 		goto out;
1499 	}
1500 
1501 	/*
1502 	 *	We have finally decided to accept it.
1503 	 */
1504 
1505 	neigh_update(neigh, lladdr, NUD_STALE,
1506 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1507 		     NEIGH_UPDATE_F_OVERRIDE|
1508 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1509 				     NEIGH_UPDATE_F_ISROUTER))
1510 		     );
1511 
1512 	/*
1513 	 * Redirect received -> path was valid.
1514 	 * Look, redirects are sent only in response to data packets,
1515 	 * so that this nexthop apparently is reachable. --ANK
1516 	 */
1517 	dst_confirm(&rt->dst);
1518 
1519 	/* Duplicate redirect: silently ignore. */
1520 	if (neigh == rt->dst.neighbour)
1521 		goto out;
1522 
1523 	nrt = ip6_rt_copy(rt);
1524 	if (nrt == NULL)
1525 		goto out;
1526 
1527 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1528 	if (on_link)
1529 		nrt->rt6i_flags &= ~RTF_GATEWAY;
1530 
1531 	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1532 	nrt->rt6i_dst.plen = 128;
1533 	nrt->dst.flags |= DST_HOST;
1534 
1535 	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1536 	nrt->rt6i_nexthop = neigh_clone(neigh);
1537 
1538 	if (ip6_ins_rt(nrt))
1539 		goto out;
1540 
1541 	netevent.old = &rt->dst;
1542 	netevent.new = &nrt->dst;
1543 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1544 
1545 	if (rt->rt6i_flags&RTF_CACHE) {
1546 		ip6_del_rt(rt);
1547 		return;
1548 	}
1549 
1550 out:
1551 	dst_release(&rt->dst);
1552 }
1553 
1554 /*
1555  *	Handle ICMP "packet too big" messages
1556  *	i.e. Path MTU discovery
1557  */
1558 
1559 static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1560 			     struct net *net, u32 pmtu, int ifindex)
1561 {
1562 	struct rt6_info *rt, *nrt;
1563 	int allfrag = 0;
1564 again:
1565 	rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1566 	if (rt == NULL)
1567 		return;
1568 
1569 	if (rt6_check_expired(rt)) {
1570 		ip6_del_rt(rt);
1571 		goto again;
1572 	}
1573 
1574 	if (pmtu >= dst_mtu(&rt->dst))
1575 		goto out;
1576 
1577 	if (pmtu < IPV6_MIN_MTU) {
1578 		/*
1579 		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1580 		 * MTU (1280) and a fragment header should always be included
1581 		 * after a node receiving Too Big message reporting PMTU is
1582 		 * less than the IPv6 Minimum Link MTU.
1583 		 */
1584 		pmtu = IPV6_MIN_MTU;
1585 		allfrag = 1;
1586 	}
1587 
1588 	/* New mtu received -> path was valid.
1589 	   They are sent only in response to data packets,
1590 	   so that this nexthop apparently is reachable. --ANK
1591 	 */
1592 	dst_confirm(&rt->dst);
1593 
1594 	/* Host route. If it is static, it would be better
1595 	   not to override it, but add new one, so that
1596 	   when cache entry will expire old pmtu
1597 	   would return automatically.
1598 	 */
1599 	if (rt->rt6i_flags & RTF_CACHE) {
1600 		dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1601 		if (allfrag) {
1602 			u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1603 			features |= RTAX_FEATURE_ALLFRAG;
1604 			dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1605 		}
1606 		dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1607 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1608 		goto out;
1609 	}
1610 
1611 	/* Network route.
1612 	   Two cases are possible:
1613 	   1. It is connected route. Action: COW
1614 	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1615 	 */
1616 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1617 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1618 	else
1619 		nrt = rt6_alloc_clone(rt, daddr);
1620 
1621 	if (nrt) {
1622 		dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1623 		if (allfrag) {
1624 			u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1625 			features |= RTAX_FEATURE_ALLFRAG;
1626 			dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1627 		}
1628 
1629 		/* According to RFC 1981, detecting PMTU increase shouldn't be
1630 		 * happened within 5 mins, the recommended timer is 10 mins.
1631 		 * Here this route expiration time is set to ip6_rt_mtu_expires
1632 		 * which is 10 mins. After 10 mins the decreased pmtu is expired
1633 		 * and detecting PMTU increase will be automatically happened.
1634 		 */
1635 		dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1636 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1637 
1638 		ip6_ins_rt(nrt);
1639 	}
1640 out:
1641 	dst_release(&rt->dst);
1642 }
1643 
1644 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1645 			struct net_device *dev, u32 pmtu)
1646 {
1647 	struct net *net = dev_net(dev);
1648 
1649 	/*
1650 	 * RFC 1981 states that a node "MUST reduce the size of the packets it
1651 	 * is sending along the path" that caused the Packet Too Big message.
1652 	 * Since it's not possible in the general case to determine which
1653 	 * interface was used to send the original packet, we update the MTU
1654 	 * on the interface that will be used to send future packets. We also
1655 	 * update the MTU on the interface that received the Packet Too Big in
1656 	 * case the original packet was forced out that interface with
1657 	 * SO_BINDTODEVICE or similar. This is the next best thing to the
1658 	 * correct behaviour, which would be to update the MTU on all
1659 	 * interfaces.
1660 	 */
1661 	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1662 	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1663 }
1664 
1665 /*
1666  *	Misc support functions
1667  */
1668 
1669 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1670 {
1671 	struct net *net = dev_net(ort->rt6i_dev);
1672 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1673 
1674 	if (rt) {
1675 		rt->dst.input = ort->dst.input;
1676 		rt->dst.output = ort->dst.output;
1677 
1678 		dst_copy_metrics(&rt->dst, &ort->dst);
1679 		rt->dst.error = ort->dst.error;
1680 		rt->dst.dev = ort->dst.dev;
1681 		if (rt->dst.dev)
1682 			dev_hold(rt->dst.dev);
1683 		rt->rt6i_idev = ort->rt6i_idev;
1684 		if (rt->rt6i_idev)
1685 			in6_dev_hold(rt->rt6i_idev);
1686 		rt->dst.lastuse = jiffies;
1687 		rt->rt6i_expires = 0;
1688 
1689 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1690 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1691 		rt->rt6i_metric = 0;
1692 
1693 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1694 #ifdef CONFIG_IPV6_SUBTREES
1695 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1696 #endif
1697 		rt->rt6i_table = ort->rt6i_table;
1698 	}
1699 	return rt;
1700 }
1701 
1702 #ifdef CONFIG_IPV6_ROUTE_INFO
1703 static struct rt6_info *rt6_get_route_info(struct net *net,
1704 					   struct in6_addr *prefix, int prefixlen,
1705 					   struct in6_addr *gwaddr, int ifindex)
1706 {
1707 	struct fib6_node *fn;
1708 	struct rt6_info *rt = NULL;
1709 	struct fib6_table *table;
1710 
1711 	table = fib6_get_table(net, RT6_TABLE_INFO);
1712 	if (table == NULL)
1713 		return NULL;
1714 
1715 	write_lock_bh(&table->tb6_lock);
1716 	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1717 	if (!fn)
1718 		goto out;
1719 
1720 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1721 		if (rt->rt6i_dev->ifindex != ifindex)
1722 			continue;
1723 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1724 			continue;
1725 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1726 			continue;
1727 		dst_hold(&rt->dst);
1728 		break;
1729 	}
1730 out:
1731 	write_unlock_bh(&table->tb6_lock);
1732 	return rt;
1733 }
1734 
1735 static struct rt6_info *rt6_add_route_info(struct net *net,
1736 					   struct in6_addr *prefix, int prefixlen,
1737 					   struct in6_addr *gwaddr, int ifindex,
1738 					   unsigned pref)
1739 {
1740 	struct fib6_config cfg = {
1741 		.fc_table	= RT6_TABLE_INFO,
1742 		.fc_metric	= IP6_RT_PRIO_USER,
1743 		.fc_ifindex	= ifindex,
1744 		.fc_dst_len	= prefixlen,
1745 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1746 				  RTF_UP | RTF_PREF(pref),
1747 		.fc_nlinfo.pid = 0,
1748 		.fc_nlinfo.nlh = NULL,
1749 		.fc_nlinfo.nl_net = net,
1750 	};
1751 
1752 	ipv6_addr_copy(&cfg.fc_dst, prefix);
1753 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1754 
1755 	/* We should treat it as a default route if prefix length is 0. */
1756 	if (!prefixlen)
1757 		cfg.fc_flags |= RTF_DEFAULT;
1758 
1759 	ip6_route_add(&cfg);
1760 
1761 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1762 }
1763 #endif
1764 
1765 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1766 {
1767 	struct rt6_info *rt;
1768 	struct fib6_table *table;
1769 
1770 	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1771 	if (table == NULL)
1772 		return NULL;
1773 
1774 	write_lock_bh(&table->tb6_lock);
1775 	for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1776 		if (dev == rt->rt6i_dev &&
1777 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1778 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1779 			break;
1780 	}
1781 	if (rt)
1782 		dst_hold(&rt->dst);
1783 	write_unlock_bh(&table->tb6_lock);
1784 	return rt;
1785 }
1786 
1787 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1788 				     struct net_device *dev,
1789 				     unsigned int pref)
1790 {
1791 	struct fib6_config cfg = {
1792 		.fc_table	= RT6_TABLE_DFLT,
1793 		.fc_metric	= IP6_RT_PRIO_USER,
1794 		.fc_ifindex	= dev->ifindex,
1795 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1796 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1797 		.fc_nlinfo.pid = 0,
1798 		.fc_nlinfo.nlh = NULL,
1799 		.fc_nlinfo.nl_net = dev_net(dev),
1800 	};
1801 
1802 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1803 
1804 	ip6_route_add(&cfg);
1805 
1806 	return rt6_get_dflt_router(gwaddr, dev);
1807 }
1808 
1809 void rt6_purge_dflt_routers(struct net *net)
1810 {
1811 	struct rt6_info *rt;
1812 	struct fib6_table *table;
1813 
1814 	/* NOTE: Keep consistent with rt6_get_dflt_router */
1815 	table = fib6_get_table(net, RT6_TABLE_DFLT);
1816 	if (table == NULL)
1817 		return;
1818 
1819 restart:
1820 	read_lock_bh(&table->tb6_lock);
1821 	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1822 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1823 			dst_hold(&rt->dst);
1824 			read_unlock_bh(&table->tb6_lock);
1825 			ip6_del_rt(rt);
1826 			goto restart;
1827 		}
1828 	}
1829 	read_unlock_bh(&table->tb6_lock);
1830 }
1831 
1832 static void rtmsg_to_fib6_config(struct net *net,
1833 				 struct in6_rtmsg *rtmsg,
1834 				 struct fib6_config *cfg)
1835 {
1836 	memset(cfg, 0, sizeof(*cfg));
1837 
1838 	cfg->fc_table = RT6_TABLE_MAIN;
1839 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1840 	cfg->fc_metric = rtmsg->rtmsg_metric;
1841 	cfg->fc_expires = rtmsg->rtmsg_info;
1842 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1843 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
1844 	cfg->fc_flags = rtmsg->rtmsg_flags;
1845 
1846 	cfg->fc_nlinfo.nl_net = net;
1847 
1848 	ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1849 	ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1850 	ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1851 }
1852 
1853 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1854 {
1855 	struct fib6_config cfg;
1856 	struct in6_rtmsg rtmsg;
1857 	int err;
1858 
1859 	switch(cmd) {
1860 	case SIOCADDRT:		/* Add a route */
1861 	case SIOCDELRT:		/* Delete a route */
1862 		if (!capable(CAP_NET_ADMIN))
1863 			return -EPERM;
1864 		err = copy_from_user(&rtmsg, arg,
1865 				     sizeof(struct in6_rtmsg));
1866 		if (err)
1867 			return -EFAULT;
1868 
1869 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1870 
1871 		rtnl_lock();
1872 		switch (cmd) {
1873 		case SIOCADDRT:
1874 			err = ip6_route_add(&cfg);
1875 			break;
1876 		case SIOCDELRT:
1877 			err = ip6_route_del(&cfg);
1878 			break;
1879 		default:
1880 			err = -EINVAL;
1881 		}
1882 		rtnl_unlock();
1883 
1884 		return err;
1885 	}
1886 
1887 	return -EINVAL;
1888 }
1889 
1890 /*
1891  *	Drop the packet on the floor
1892  */
1893 
1894 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1895 {
1896 	int type;
1897 	struct dst_entry *dst = skb_dst(skb);
1898 	switch (ipstats_mib_noroutes) {
1899 	case IPSTATS_MIB_INNOROUTES:
1900 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1901 		if (type == IPV6_ADDR_ANY) {
1902 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1903 				      IPSTATS_MIB_INADDRERRORS);
1904 			break;
1905 		}
1906 		/* FALLTHROUGH */
1907 	case IPSTATS_MIB_OUTNOROUTES:
1908 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1909 			      ipstats_mib_noroutes);
1910 		break;
1911 	}
1912 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1913 	kfree_skb(skb);
1914 	return 0;
1915 }
1916 
1917 static int ip6_pkt_discard(struct sk_buff *skb)
1918 {
1919 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1920 }
1921 
1922 static int ip6_pkt_discard_out(struct sk_buff *skb)
1923 {
1924 	skb->dev = skb_dst(skb)->dev;
1925 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1926 }
1927 
1928 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1929 
1930 static int ip6_pkt_prohibit(struct sk_buff *skb)
1931 {
1932 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1933 }
1934 
1935 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1936 {
1937 	skb->dev = skb_dst(skb)->dev;
1938 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1939 }
1940 
1941 #endif
1942 
1943 /*
1944  *	Allocate a dst for local (unicast / anycast) address.
1945  */
1946 
1947 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1948 				    const struct in6_addr *addr,
1949 				    int anycast)
1950 {
1951 	struct net *net = dev_net(idev->dev);
1952 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1953 	struct neighbour *neigh;
1954 
1955 	if (rt == NULL) {
1956 		if (net_ratelimit())
1957 			pr_warning("IPv6:  Maximum number of routes reached,"
1958 				   " consider increasing route/max_size.\n");
1959 		return ERR_PTR(-ENOMEM);
1960 	}
1961 
1962 	dev_hold(net->loopback_dev);
1963 	in6_dev_hold(idev);
1964 
1965 	rt->dst.flags = DST_HOST;
1966 	rt->dst.input = ip6_input;
1967 	rt->dst.output = ip6_output;
1968 	rt->rt6i_dev = net->loopback_dev;
1969 	rt->rt6i_idev = idev;
1970 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, -1);
1971 	rt->dst.obsolete = -1;
1972 
1973 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1974 	if (anycast)
1975 		rt->rt6i_flags |= RTF_ANYCAST;
1976 	else
1977 		rt->rt6i_flags |= RTF_LOCAL;
1978 	neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1979 	if (IS_ERR(neigh)) {
1980 		dst_free(&rt->dst);
1981 
1982 		/* We are casting this because that is the return
1983 		 * value type.  But an errno encoded pointer is the
1984 		 * same regardless of the underlying pointer type,
1985 		 * and that's what we are returning.  So this is OK.
1986 		 */
1987 		return (struct rt6_info *) neigh;
1988 	}
1989 	rt->rt6i_nexthop = neigh;
1990 
1991 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1992 	rt->rt6i_dst.plen = 128;
1993 	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1994 
1995 	atomic_set(&rt->dst.__refcnt, 1);
1996 
1997 	return rt;
1998 }
1999 
/* Argument bundle handed to fib6_ifdown() through the tree walkers. */
struct arg_dev_net {
	/* Device going down; NULL matches routes on every device. */
	struct net_device *dev;
	/* Namespace whose null entry must never be deleted. */
	struct net *net;
};
2004 
2005 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2006 {
2007 	const struct arg_dev_net *adn = arg;
2008 	const struct net_device *dev = adn->dev;
2009 
2010 	if ((rt->rt6i_dev == dev || dev == NULL) &&
2011 	    rt != adn->net->ipv6.ip6_null_entry) {
2012 		RT6_TRACE("deleted by ifdown %p\n", rt);
2013 		return -1;
2014 	}
2015 	return 0;
2016 }
2017 
2018 void rt6_ifdown(struct net *net, struct net_device *dev)
2019 {
2020 	struct arg_dev_net adn = {
2021 		.dev = dev,
2022 		.net = net,
2023 	};
2024 
2025 	fib6_clean_all(net, fib6_ifdown, 0, &adn);
2026 	icmp6_clean_all(fib6_ifdown, &adn);
2027 }
2028 
/* Argument bundle handed to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* its new MTU */
};
2034 
2035 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2036 {
2037 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2038 	struct inet6_dev *idev;
2039 
2040 	/* In IPv6 pmtu discovery is not optional,
2041 	   so that RTAX_MTU lock cannot disable it.
2042 	   We still use this lock to block changes
2043 	   caused by addrconf/ndisc.
2044 	*/
2045 
2046 	idev = __in6_dev_get(arg->dev);
2047 	if (idev == NULL)
2048 		return 0;
2049 
2050 	/* For administrative MTU increase, there is no way to discover
2051 	   IPv6 PMTU increase, so PMTU increase should be updated here.
2052 	   Since RFC 1981 doesn't include administrative MTU increase
2053 	   update PMTU increase is a MUST. (i.e. jumbo frame)
2054 	 */
2055 	/*
2056 	   If new MTU is less than route PMTU, this new MTU will be the
2057 	   lowest MTU in the path, update the route PMTU to reflect PMTU
2058 	   decreases; if new MTU is greater than route PMTU, and the
2059 	   old MTU is the lowest MTU in the path, update the route PMTU
2060 	   to reflect the increase. In this case if the other nodes' MTU
2061 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
2062 	   PMTU discouvery.
2063 	 */
2064 	if (rt->rt6i_dev == arg->dev &&
2065 	    !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2066 	    (dst_mtu(&rt->dst) >= arg->mtu ||
2067 	     (dst_mtu(&rt->dst) < arg->mtu &&
2068 	      dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2069 		dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2070 	}
2071 	return 0;
2072 }
2073 
2074 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2075 {
2076 	struct rt6_mtu_change_arg arg = {
2077 		.dev = dev,
2078 		.mtu = mtu,
2079 	};
2080 
2081 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2082 }
2083 
2084 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2085 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2086 	[RTA_OIF]               = { .type = NLA_U32 },
2087 	[RTA_IIF]		= { .type = NLA_U32 },
2088 	[RTA_PRIORITY]          = { .type = NLA_U32 },
2089 	[RTA_METRICS]           = { .type = NLA_NESTED },
2090 };
2091 
2092 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2093 			      struct fib6_config *cfg)
2094 {
2095 	struct rtmsg *rtm;
2096 	struct nlattr *tb[RTA_MAX+1];
2097 	int err;
2098 
2099 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2100 	if (err < 0)
2101 		goto errout;
2102 
2103 	err = -EINVAL;
2104 	rtm = nlmsg_data(nlh);
2105 	memset(cfg, 0, sizeof(*cfg));
2106 
2107 	cfg->fc_table = rtm->rtm_table;
2108 	cfg->fc_dst_len = rtm->rtm_dst_len;
2109 	cfg->fc_src_len = rtm->rtm_src_len;
2110 	cfg->fc_flags = RTF_UP;
2111 	cfg->fc_protocol = rtm->rtm_protocol;
2112 
2113 	if (rtm->rtm_type == RTN_UNREACHABLE)
2114 		cfg->fc_flags |= RTF_REJECT;
2115 
2116 	if (rtm->rtm_type == RTN_LOCAL)
2117 		cfg->fc_flags |= RTF_LOCAL;
2118 
2119 	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2120 	cfg->fc_nlinfo.nlh = nlh;
2121 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2122 
2123 	if (tb[RTA_GATEWAY]) {
2124 		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2125 		cfg->fc_flags |= RTF_GATEWAY;
2126 	}
2127 
2128 	if (tb[RTA_DST]) {
2129 		int plen = (rtm->rtm_dst_len + 7) >> 3;
2130 
2131 		if (nla_len(tb[RTA_DST]) < plen)
2132 			goto errout;
2133 
2134 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2135 	}
2136 
2137 	if (tb[RTA_SRC]) {
2138 		int plen = (rtm->rtm_src_len + 7) >> 3;
2139 
2140 		if (nla_len(tb[RTA_SRC]) < plen)
2141 			goto errout;
2142 
2143 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2144 	}
2145 
2146 	if (tb[RTA_OIF])
2147 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2148 
2149 	if (tb[RTA_PRIORITY])
2150 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2151 
2152 	if (tb[RTA_METRICS]) {
2153 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2154 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2155 	}
2156 
2157 	if (tb[RTA_TABLE])
2158 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2159 
2160 	err = 0;
2161 errout:
2162 	return err;
2163 }
2164 
2165 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2166 {
2167 	struct fib6_config cfg;
2168 	int err;
2169 
2170 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2171 	if (err < 0)
2172 		return err;
2173 
2174 	return ip6_route_del(&cfg);
2175 }
2176 
2177 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2178 {
2179 	struct fib6_config cfg;
2180 	int err;
2181 
2182 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2183 	if (err < 0)
2184 		return err;
2185 
2186 	return ip6_route_add(&cfg);
2187 }
2188 
2189 static inline size_t rt6_nlmsg_size(void)
2190 {
2191 	return NLMSG_ALIGN(sizeof(struct rtmsg))
2192 	       + nla_total_size(16) /* RTA_SRC */
2193 	       + nla_total_size(16) /* RTA_DST */
2194 	       + nla_total_size(16) /* RTA_GATEWAY */
2195 	       + nla_total_size(16) /* RTA_PREFSRC */
2196 	       + nla_total_size(4) /* RTA_TABLE */
2197 	       + nla_total_size(4) /* RTA_IIF */
2198 	       + nla_total_size(4) /* RTA_OIF */
2199 	       + nla_total_size(4) /* RTA_PRIORITY */
2200 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2201 	       + nla_total_size(sizeof(struct rta_cacheinfo));
2202 }
2203 
2204 static int rt6_fill_node(struct net *net,
2205 			 struct sk_buff *skb, struct rt6_info *rt,
2206 			 struct in6_addr *dst, struct in6_addr *src,
2207 			 int iif, int type, u32 pid, u32 seq,
2208 			 int prefix, int nowait, unsigned int flags)
2209 {
2210 	struct rtmsg *rtm;
2211 	struct nlmsghdr *nlh;
2212 	long expires;
2213 	u32 table;
2214 
2215 	if (prefix) {	/* user wants prefix routes only */
2216 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2217 			/* success since this is not a prefix route */
2218 			return 1;
2219 		}
2220 	}
2221 
2222 	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2223 	if (nlh == NULL)
2224 		return -EMSGSIZE;
2225 
2226 	rtm = nlmsg_data(nlh);
2227 	rtm->rtm_family = AF_INET6;
2228 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
2229 	rtm->rtm_src_len = rt->rt6i_src.plen;
2230 	rtm->rtm_tos = 0;
2231 	if (rt->rt6i_table)
2232 		table = rt->rt6i_table->tb6_id;
2233 	else
2234 		table = RT6_TABLE_UNSPEC;
2235 	rtm->rtm_table = table;
2236 	NLA_PUT_U32(skb, RTA_TABLE, table);
2237 	if (rt->rt6i_flags&RTF_REJECT)
2238 		rtm->rtm_type = RTN_UNREACHABLE;
2239 	else if (rt->rt6i_flags&RTF_LOCAL)
2240 		rtm->rtm_type = RTN_LOCAL;
2241 	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2242 		rtm->rtm_type = RTN_LOCAL;
2243 	else
2244 		rtm->rtm_type = RTN_UNICAST;
2245 	rtm->rtm_flags = 0;
2246 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2247 	rtm->rtm_protocol = rt->rt6i_protocol;
2248 	if (rt->rt6i_flags&RTF_DYNAMIC)
2249 		rtm->rtm_protocol = RTPROT_REDIRECT;
2250 	else if (rt->rt6i_flags & RTF_ADDRCONF)
2251 		rtm->rtm_protocol = RTPROT_KERNEL;
2252 	else if (rt->rt6i_flags&RTF_DEFAULT)
2253 		rtm->rtm_protocol = RTPROT_RA;
2254 
2255 	if (rt->rt6i_flags&RTF_CACHE)
2256 		rtm->rtm_flags |= RTM_F_CLONED;
2257 
2258 	if (dst) {
2259 		NLA_PUT(skb, RTA_DST, 16, dst);
2260 		rtm->rtm_dst_len = 128;
2261 	} else if (rtm->rtm_dst_len)
2262 		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2263 #ifdef CONFIG_IPV6_SUBTREES
2264 	if (src) {
2265 		NLA_PUT(skb, RTA_SRC, 16, src);
2266 		rtm->rtm_src_len = 128;
2267 	} else if (rtm->rtm_src_len)
2268 		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2269 #endif
2270 	if (iif) {
2271 #ifdef CONFIG_IPV6_MROUTE
2272 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2273 			int err = ip6mr_get_route(net, skb, rtm, nowait);
2274 			if (err <= 0) {
2275 				if (!nowait) {
2276 					if (err == 0)
2277 						return 0;
2278 					goto nla_put_failure;
2279 				} else {
2280 					if (err == -EMSGSIZE)
2281 						goto nla_put_failure;
2282 				}
2283 			}
2284 		} else
2285 #endif
2286 			NLA_PUT_U32(skb, RTA_IIF, iif);
2287 	} else if (dst) {
2288 		struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
2289 		struct in6_addr saddr_buf;
2290 		if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2291 				       dst, 0, &saddr_buf) == 0)
2292 			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2293 	}
2294 
2295 	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2296 		goto nla_put_failure;
2297 
2298 	if (rt->dst.neighbour)
2299 		NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2300 
2301 	if (rt->dst.dev)
2302 		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2303 
2304 	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2305 
2306 	if (!(rt->rt6i_flags & RTF_EXPIRES))
2307 		expires = 0;
2308 	else if (rt->rt6i_expires - jiffies < INT_MAX)
2309 		expires = rt->rt6i_expires - jiffies;
2310 	else
2311 		expires = INT_MAX;
2312 
2313 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2314 			       expires, rt->dst.error) < 0)
2315 		goto nla_put_failure;
2316 
2317 	return nlmsg_end(skb, nlh);
2318 
2319 nla_put_failure:
2320 	nlmsg_cancel(skb, nlh);
2321 	return -EMSGSIZE;
2322 }
2323 
2324 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2325 {
2326 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2327 	int prefix;
2328 
2329 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2330 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2331 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2332 	} else
2333 		prefix = 0;
2334 
2335 	return rt6_fill_node(arg->net,
2336 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2337 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2338 		     prefix, 0, NLM_F_MULTI);
2339 }
2340 
2341 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2342 {
2343 	struct net *net = sock_net(in_skb->sk);
2344 	struct nlattr *tb[RTA_MAX+1];
2345 	struct rt6_info *rt;
2346 	struct sk_buff *skb;
2347 	struct rtmsg *rtm;
2348 	struct flowi fl;
2349 	int err, iif = 0;
2350 
2351 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2352 	if (err < 0)
2353 		goto errout;
2354 
2355 	err = -EINVAL;
2356 	memset(&fl, 0, sizeof(fl));
2357 
2358 	if (tb[RTA_SRC]) {
2359 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2360 			goto errout;
2361 
2362 		ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2363 	}
2364 
2365 	if (tb[RTA_DST]) {
2366 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2367 			goto errout;
2368 
2369 		ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2370 	}
2371 
2372 	if (tb[RTA_IIF])
2373 		iif = nla_get_u32(tb[RTA_IIF]);
2374 
2375 	if (tb[RTA_OIF])
2376 		fl.oif = nla_get_u32(tb[RTA_OIF]);
2377 
2378 	if (iif) {
2379 		struct net_device *dev;
2380 		dev = __dev_get_by_index(net, iif);
2381 		if (!dev) {
2382 			err = -ENODEV;
2383 			goto errout;
2384 		}
2385 	}
2386 
2387 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2388 	if (skb == NULL) {
2389 		err = -ENOBUFS;
2390 		goto errout;
2391 	}
2392 
2393 	/* Reserve room for dummy headers, this skb can pass
2394 	   through good chunk of routing engine.
2395 	 */
2396 	skb_reset_mac_header(skb);
2397 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2398 
2399 	rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2400 	skb_dst_set(skb, &rt->dst);
2401 
2402 	err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2403 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2404 			    nlh->nlmsg_seq, 0, 0, 0);
2405 	if (err < 0) {
2406 		kfree_skb(skb);
2407 		goto errout;
2408 	}
2409 
2410 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2411 errout:
2412 	return err;
2413 }
2414 
2415 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2416 {
2417 	struct sk_buff *skb;
2418 	struct net *net = info->nl_net;
2419 	u32 seq;
2420 	int err;
2421 
2422 	err = -ENOBUFS;
2423 	seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2424 
2425 	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2426 	if (skb == NULL)
2427 		goto errout;
2428 
2429 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2430 				event, info->pid, seq, 0, 0, 0);
2431 	if (err < 0) {
2432 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2433 		WARN_ON(err == -EMSGSIZE);
2434 		kfree_skb(skb);
2435 		goto errout;
2436 	}
2437 	rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2438 		    info->nlh, gfp_any());
2439 	return;
2440 errout:
2441 	if (err < 0)
2442 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2443 }
2444 
2445 static int ip6_route_dev_notify(struct notifier_block *this,
2446 				unsigned long event, void *data)
2447 {
2448 	struct net_device *dev = (struct net_device *)data;
2449 	struct net *net = dev_net(dev);
2450 
2451 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2452 		net->ipv6.ip6_null_entry->dst.dev = dev;
2453 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2454 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2455 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2456 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2457 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2458 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2459 #endif
2460 	}
2461 
2462 	return NOTIFY_OK;
2463 }
2464 
2465 /*
2466  *	/proc
2467  */
2468 
2469 #ifdef CONFIG_PROC_FS
2470 
/*
 * Buffer bookkeeping for /proc output.
 * NOTE(review): nothing in the visible part of this file references
 * this structure (rt6_info_route() writes through a seq_file) - it
 * may be a leftover; confirm before relying on or removing it.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2479 
2480 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2481 {
2482 	struct seq_file *m = p_arg;
2483 
2484 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2485 
2486 #ifdef CONFIG_IPV6_SUBTREES
2487 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2488 #else
2489 	seq_puts(m, "00000000000000000000000000000000 00 ");
2490 #endif
2491 
2492 	if (rt->rt6i_nexthop) {
2493 		seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2494 	} else {
2495 		seq_puts(m, "00000000000000000000000000000000");
2496 	}
2497 	seq_printf(m, " %08x %08x %08x %08x %8s\n",
2498 		   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2499 		   rt->dst.__use, rt->rt6i_flags,
2500 		   rt->rt6i_dev ? rt->rt6i_dev->name : "");
2501 	return 0;
2502 }
2503 
2504 static int ipv6_route_show(struct seq_file *m, void *v)
2505 {
2506 	struct net *net = (struct net *)m->private;
2507 	fib6_clean_all(net, rt6_info_route, 0, m);
2508 	return 0;
2509 }
2510 
2511 static int ipv6_route_open(struct inode *inode, struct file *file)
2512 {
2513 	return single_open_net(inode, file, ipv6_route_show);
2514 }
2515 
2516 static const struct file_operations ipv6_route_proc_fops = {
2517 	.owner		= THIS_MODULE,
2518 	.open		= ipv6_route_open,
2519 	.read		= seq_read,
2520 	.llseek		= seq_lseek,
2521 	.release	= single_release_net,
2522 };
2523 
2524 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2525 {
2526 	struct net *net = (struct net *)seq->private;
2527 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2528 		   net->ipv6.rt6_stats->fib_nodes,
2529 		   net->ipv6.rt6_stats->fib_route_nodes,
2530 		   net->ipv6.rt6_stats->fib_rt_alloc,
2531 		   net->ipv6.rt6_stats->fib_rt_entries,
2532 		   net->ipv6.rt6_stats->fib_rt_cache,
2533 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2534 		   net->ipv6.rt6_stats->fib_discarded_routes);
2535 
2536 	return 0;
2537 }
2538 
2539 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2540 {
2541 	return single_open_net(inode, file, rt6_stats_seq_show);
2542 }
2543 
2544 static const struct file_operations rt6_stats_seq_fops = {
2545 	.owner	 = THIS_MODULE,
2546 	.open	 = rt6_stats_seq_open,
2547 	.read	 = seq_read,
2548 	.llseek	 = seq_lseek,
2549 	.release = single_release_net,
2550 };
2551 #endif	/* CONFIG_PROC_FS */
2552 
2553 #ifdef CONFIG_SYSCTL
2554 
2555 static
2556 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2557 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2558 {
2559 	struct net *net = current->nsproxy->net_ns;
2560 	int delay = net->ipv6.sysctl.flush_delay;
2561 	if (write) {
2562 		proc_dointvec(ctl, write, buffer, lenp, ppos);
2563 		fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2564 		return 0;
2565 	} else
2566 		return -EINVAL;
2567 }
2568 
2569 ctl_table ipv6_route_table_template[] = {
2570 	{
2571 		.procname	=	"flush",
2572 		.data		=	&init_net.ipv6.sysctl.flush_delay,
2573 		.maxlen		=	sizeof(int),
2574 		.mode		=	0200,
2575 		.proc_handler	=	ipv6_sysctl_rtcache_flush
2576 	},
2577 	{
2578 		.procname	=	"gc_thresh",
2579 		.data		=	&ip6_dst_ops_template.gc_thresh,
2580 		.maxlen		=	sizeof(int),
2581 		.mode		=	0644,
2582 		.proc_handler	=	proc_dointvec,
2583 	},
2584 	{
2585 		.procname	=	"max_size",
2586 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
2587 		.maxlen		=	sizeof(int),
2588 		.mode		=	0644,
2589 		.proc_handler	=	proc_dointvec,
2590 	},
2591 	{
2592 		.procname	=	"gc_min_interval",
2593 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2594 		.maxlen		=	sizeof(int),
2595 		.mode		=	0644,
2596 		.proc_handler	=	proc_dointvec_jiffies,
2597 	},
2598 	{
2599 		.procname	=	"gc_timeout",
2600 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2601 		.maxlen		=	sizeof(int),
2602 		.mode		=	0644,
2603 		.proc_handler	=	proc_dointvec_jiffies,
2604 	},
2605 	{
2606 		.procname	=	"gc_interval",
2607 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
2608 		.maxlen		=	sizeof(int),
2609 		.mode		=	0644,
2610 		.proc_handler	=	proc_dointvec_jiffies,
2611 	},
2612 	{
2613 		.procname	=	"gc_elasticity",
2614 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2615 		.maxlen		=	sizeof(int),
2616 		.mode		=	0644,
2617 		.proc_handler	=	proc_dointvec,
2618 	},
2619 	{
2620 		.procname	=	"mtu_expires",
2621 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2622 		.maxlen		=	sizeof(int),
2623 		.mode		=	0644,
2624 		.proc_handler	=	proc_dointvec_jiffies,
2625 	},
2626 	{
2627 		.procname	=	"min_adv_mss",
2628 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
2629 		.maxlen		=	sizeof(int),
2630 		.mode		=	0644,
2631 		.proc_handler	=	proc_dointvec,
2632 	},
2633 	{
2634 		.procname	=	"gc_min_interval_ms",
2635 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2636 		.maxlen		=	sizeof(int),
2637 		.mode		=	0644,
2638 		.proc_handler	=	proc_dointvec_ms_jiffies,
2639 	},
2640 	{ }
2641 };
2642 
2643 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2644 {
2645 	struct ctl_table *table;
2646 
2647 	table = kmemdup(ipv6_route_table_template,
2648 			sizeof(ipv6_route_table_template),
2649 			GFP_KERNEL);
2650 
2651 	if (table) {
2652 		table[0].data = &net->ipv6.sysctl.flush_delay;
2653 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2654 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2655 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2656 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2657 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2658 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2659 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2660 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2661 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2662 	}
2663 
2664 	return table;
2665 }
2666 #endif
2667 
2668 static int __net_init ip6_route_net_init(struct net *net)
2669 {
2670 	int ret = -ENOMEM;
2671 
2672 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2673 	       sizeof(net->ipv6.ip6_dst_ops));
2674 
2675 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2676 		goto out_ip6_dst_ops;
2677 
2678 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2679 					   sizeof(*net->ipv6.ip6_null_entry),
2680 					   GFP_KERNEL);
2681 	if (!net->ipv6.ip6_null_entry)
2682 		goto out_ip6_dst_entries;
2683 	net->ipv6.ip6_null_entry->dst.path =
2684 		(struct dst_entry *)net->ipv6.ip6_null_entry;
2685 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2686 	dst_metric_set(&net->ipv6.ip6_null_entry->dst, RTAX_HOPLIMIT, 255);
2687 
2688 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2689 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2690 					       sizeof(*net->ipv6.ip6_prohibit_entry),
2691 					       GFP_KERNEL);
2692 	if (!net->ipv6.ip6_prohibit_entry)
2693 		goto out_ip6_null_entry;
2694 	net->ipv6.ip6_prohibit_entry->dst.path =
2695 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2696 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2697 	dst_metric_set(&net->ipv6.ip6_prohibit_entry->dst, RTAX_HOPLIMIT, 255);
2698 
2699 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2700 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
2701 					       GFP_KERNEL);
2702 	if (!net->ipv6.ip6_blk_hole_entry)
2703 		goto out_ip6_prohibit_entry;
2704 	net->ipv6.ip6_blk_hole_entry->dst.path =
2705 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2706 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2707 	dst_metric_set(&net->ipv6.ip6_blk_hole_entry->dst, RTAX_HOPLIMIT, 255);
2708 #endif
2709 
2710 	net->ipv6.sysctl.flush_delay = 0;
2711 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
2712 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2713 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2714 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2715 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2716 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2717 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2718 
2719 #ifdef CONFIG_PROC_FS
2720 	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2721 	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2722 #endif
2723 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
2724 
2725 	ret = 0;
2726 out:
2727 	return ret;
2728 
2729 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2730 out_ip6_prohibit_entry:
2731 	kfree(net->ipv6.ip6_prohibit_entry);
2732 out_ip6_null_entry:
2733 	kfree(net->ipv6.ip6_null_entry);
2734 #endif
2735 out_ip6_dst_entries:
2736 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2737 out_ip6_dst_ops:
2738 	goto out;
2739 }
2740 
2741 static void __net_exit ip6_route_net_exit(struct net *net)
2742 {
2743 #ifdef CONFIG_PROC_FS
2744 	proc_net_remove(net, "ipv6_route");
2745 	proc_net_remove(net, "rt6_stats");
2746 #endif
2747 	kfree(net->ipv6.ip6_null_entry);
2748 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2749 	kfree(net->ipv6.ip6_prohibit_entry);
2750 	kfree(net->ipv6.ip6_blk_hole_entry);
2751 #endif
2752 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2753 }
2754 
2755 static struct pernet_operations ip6_route_net_ops = {
2756 	.init = ip6_route_net_init,
2757 	.exit = ip6_route_net_exit,
2758 };
2759 
2760 static struct notifier_block ip6_route_dev_notifier = {
2761 	.notifier_call = ip6_route_dev_notify,
2762 	.priority = 0,
2763 };
2764 
2765 int __init ip6_route_init(void)
2766 {
2767 	int ret;
2768 
2769 	ret = -ENOMEM;
2770 	ip6_dst_ops_template.kmem_cachep =
2771 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2772 				  SLAB_HWCACHE_ALIGN, NULL);
2773 	if (!ip6_dst_ops_template.kmem_cachep)
2774 		goto out;
2775 
2776 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
2777 	if (ret)
2778 		goto out_kmem_cache;
2779 
2780 	ret = register_pernet_subsys(&ip6_route_net_ops);
2781 	if (ret)
2782 		goto out_dst_entries;
2783 
2784 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2785 
2786 	/* Registering of the loopback is done before this portion of code,
2787 	 * the loopback reference in rt6_info will not be taken, do it
2788 	 * manually for init_net */
2789 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2790 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2791   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2792 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2793 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2794 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2795 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2796   #endif
2797 	ret = fib6_init();
2798 	if (ret)
2799 		goto out_register_subsys;
2800 
2801 	ret = xfrm6_init();
2802 	if (ret)
2803 		goto out_fib6_init;
2804 
2805 	ret = fib6_rules_init();
2806 	if (ret)
2807 		goto xfrm6_init;
2808 
2809 	ret = -ENOBUFS;
2810 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2811 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2812 	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2813 		goto fib6_rules_init;
2814 
2815 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2816 	if (ret)
2817 		goto fib6_rules_init;
2818 
2819 out:
2820 	return ret;
2821 
2822 fib6_rules_init:
2823 	fib6_rules_cleanup();
2824 xfrm6_init:
2825 	xfrm6_fini();
2826 out_fib6_init:
2827 	fib6_gc_cleanup();
2828 out_register_subsys:
2829 	unregister_pernet_subsys(&ip6_route_net_ops);
2830 out_dst_entries:
2831 	dst_entries_destroy(&ip6_dst_blackhole_ops);
2832 out_kmem_cache:
2833 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2834 	goto out;
2835 }
2836 
2837 void ip6_route_cleanup(void)
2838 {
2839 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
2840 	fib6_rules_cleanup();
2841 	xfrm6_fini();
2842 	fib6_gc_cleanup();
2843 	unregister_pernet_subsys(&ip6_route_net_ops);
2844 	dst_entries_destroy(&ip6_dst_blackhole_ops);
2845 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2846 }
2847