/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
 * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/mm.h>
#include <linux/nsproxy.h>
#include <linux/rculist_nulls.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netns/hash.h>

#define NF_CONNTRACK_VERSION	"0.5.0"

int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
				      enum nf_nat_manip_type manip,
				      const struct nlattr *attr) __read_mostly;
EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);

__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
EXPORT_SYMBOL_GPL(nf_conntrack_locks);

__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);

struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_hash);

struct conntrack_gc_work {
	struct delayed_work	dwork;
	u32			last_bucket;
	bool			exiting;
	long			next_gc_run;
};

static __read_mostly struct kmem_cache *nf_conntrack_cachep;
static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;

/* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */
#define GC_MAX_BUCKETS_DIV	128u
/* upper bound of full table scan */
#define GC_MAX_SCAN_JIFFIES	(16u * HZ)
/* desired ratio of entries found to be expired */
#define GC_EVICT_RATIO	50u

static struct conntrack_gc_work conntrack_gc_work;

void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
{
	spin_lock(lock);
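	/* If nf_conntrack_all_lock() currently owns the whole table, back
	 * off our bucket lock, wait for the global section to finish, then
	 * retake the bucket lock and re-check.
	 */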
	while (unlikely(nf_conntrack_locks_all)) {
		spin_unlock(lock);

		/*
		 * Order the 'nf_conntrack_locks_all' load vs. the
		 * spin_unlock_wait() loads below, to ensure
		 * that 'nf_conntrack_locks_all_lock' is indeed held:
		 */
		smp_rmb(); /* spin_lock(&nf_conntrack_locks_all_lock) */
		spin_unlock_wait(&nf_conntrack_locks_all_lock);
		spin_lock(lock);
	}
}
EXPORT_SYMBOL_GPL(nf_conntrack_lock);

static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	spin_unlock(&nf_conntrack_locks[h1]);
	if (h1 != h2)
		spin_unlock(&nf_conntrack_locks[h2]);
}

/* return true if we need to recompute hashes (in case hash table was resized) */
static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
				     unsigned int h2, unsigned int sequence)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	if (h1 <= h2) {
		nf_conntrack_lock(&nf_conntrack_locks[h1]);
		if (h1 != h2)
			spin_lock_nested(&nf_conntrack_locks[h2],
					 SINGLE_DEPTH_NESTING);
	} else {
		nf_conntrack_lock(&nf_conntrack_locks[h2]);
		spin_lock_nested(&nf_conntrack_locks[h1],
				 SINGLE_DEPTH_NESTING);
	}
	if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
		nf_conntrack_double_unlock(h1, h2);
		return true;
	}
	return false;
}

static void nf_conntrack_all_lock(void)
{
	int i;

	spin_lock(&nf_conntrack_locks_all_lock);
	nf_conntrack_locks_all = true;

	/*
	 * Order the above store of 'nf_conntrack_locks_all' against
	 * the spin_unlock_wait() loads below, such that if
	 * nf_conntrack_lock() observes 'nf_conntrack_locks_all'
	 * we must observe nf_conntrack_locks[] held:
	 */
	smp_mb(); /* spin_lock(&nf_conntrack_locks_all_lock) */

	for (i = 0; i < CONNTRACK_LOCKS; i++) {
		spin_unlock_wait(&nf_conntrack_locks[i]);
	}
}

static void nf_conntrack_all_unlock(void)
{
	/*
	 * All prior stores must be complete before we clear
	 * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
	 * might observe the false value but not the entire
	 * critical section:
	 */
	smp_store_release(&nf_conntrack_locks_all, false);
	spin_unlock(&nf_conntrack_locks_all_lock);
}

unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);

unsigned int nf_conntrack_max __read_mostly;
seqcount_t nf_conntrack_generation __read_mostly;

DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);

static unsigned int nf_conntrack_hash_rnd __read_mostly;

static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
			      const struct net *net)
{
	unsigned int n;
	u32 seed;

	get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));

	/* The direction must be ignored, so we hash everything up to the
	 * destination ports (which is a multiple of 4) and treat the last
	 * three bytes manually.
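	 * (Those last three bytes -- destination port and protocol number --
	 * are folded into the hash seed below instead of being hashed.)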
	 */
	seed = nf_conntrack_hash_rnd ^ net_hash_mix(net);
	n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
	return jhash2((u32 *)tuple, n, seed ^
		      (((__force __u16)tuple->dst.u.all << 16) |
		      tuple->dst.protonum));
}

static u32 scale_hash(u32 hash)
{
	return reciprocal_scale(hash, nf_conntrack_htable_size);
}

static u32 __hash_conntrack(const struct net *net,
			    const struct nf_conntrack_tuple *tuple,
			    unsigned int size)
{
	return reciprocal_scale(hash_conntrack_raw(tuple, net), size);
}

static u32 hash_conntrack(const struct net *net,
			  const struct nf_conntrack_tuple *tuple)
{
	return scale_hash(hash_conntrack_raw(tuple, net));
}

bool
nf_ct_get_tuple(const struct sk_buff *skb,
		unsigned int nhoff,
		unsigned int dataoff,
		u_int16_t l3num,
		u_int8_t protonum,
		struct net *net,
		struct nf_conntrack_tuple *tuple,
		const struct nf_conntrack_l3proto *l3proto,
		const struct nf_conntrack_l4proto *l4proto)
{
	memset(tuple, 0, sizeof(*tuple));

	tuple->src.l3num = l3num;
	if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
		return false;

	tuple->dst.protonum = protonum;
	tuple->dst.dir = IP_CT_DIR_ORIGINAL;

	return l4proto->pkt_to_tuple(skb, dataoff, net, tuple);
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuple);

bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
		       u_int16_t l3num,
		       struct net *net, struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_l3proto *l3proto;
	struct nf_conntrack_l4proto *l4proto;
	unsigned int protoff;
	u_int8_t protonum;
	int ret;

	rcu_read_lock();

	l3proto = __nf_ct_l3proto_find(l3num);
	ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum);
	if (ret != NF_ACCEPT) {
		rcu_read_unlock();
		return false;
	}

	l4proto = __nf_ct_l4proto_find(l3num, protonum);

	ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple,
			      l3proto, l4proto);

	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);

bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig,
		   const struct nf_conntrack_l3proto *l3proto,
		   const struct nf_conntrack_l4proto *l4proto)
{
	memset(inverse, 0, sizeof(*inverse));

	inverse->src.l3num = orig->src.l3num;
	if (l3proto->invert_tuple(inverse, orig) == 0)
		return false;

	inverse->dst.dir = !orig->dst.dir;

	inverse->dst.protonum = orig->dst.protonum;
	return l4proto->invert_tuple(inverse, orig);
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);

static void
clean_from_lists(struct nf_conn *ct)
{
	pr_debug("clean_from_lists(%p)\n", ct);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);

	/* Destroy all pending expectations */
	nf_ct_remove_expectations(ct);
}

/* must be called with local_bh_disable */
static void nf_ct_add_to_dying_list(struct nf_conn *ct)
{
	struct ct_pcpu *pcpu;

	/* add this conntrack to the (per cpu) dying list */
	ct->cpu = smp_processor_id();
	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);

	spin_lock(&pcpu->lock);
	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
			     &pcpu->dying);
	spin_unlock(&pcpu->lock);
}

/* must be called with local_bh_disable */
static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
{
	struct ct_pcpu *pcpu;

	/* add this conntrack to the (per cpu) unconfirmed list */
	ct->cpu = smp_processor_id();
	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);

	spin_lock(&pcpu->lock);
	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
			     &pcpu->unconfirmed);
	spin_unlock(&pcpu->lock);
}

/* must be called with local_bh_disable */
static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
{
	struct ct_pcpu *pcpu;

	/* We overload first tuple to link into unconfirmed or dying list.*/
	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);

	spin_lock(&pcpu->lock);
	BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	spin_unlock(&pcpu->lock);
}

/* Released via destroy_conntrack() */
struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
				 const struct nf_conntrack_zone *zone,
				 gfp_t flags)
{
	struct nf_conn *tmpl;

	tmpl = kzalloc(sizeof(*tmpl), flags);
	if (tmpl == NULL)
		return NULL;

	tmpl->status = IPS_TEMPLATE;
	write_pnet(&tmpl->ct_net, net);
	nf_ct_zone_add(tmpl, zone);
	atomic_set(&tmpl->ct_general.use, 0);

	return tmpl;
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);

void nf_ct_tmpl_free(struct nf_conn *tmpl)
{
	nf_ct_ext_destroy(tmpl);
	nf_ct_ext_free(tmpl);
	kfree(tmpl);
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
	struct nf_conn *ct = (struct nf_conn *)nfct;
	struct nf_conntrack_l4proto *l4proto;

	pr_debug("destroy_conntrack(%p)\n", ct);
	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);

	if (unlikely(nf_ct_is_template(ct))) {
		nf_ct_tmpl_free(ct);
		return;
	}
	rcu_read_lock();
	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
	if (l4proto->destroy)
		l4proto->destroy(ct);

	rcu_read_unlock();

	local_bh_disable();
	/* Expectations will have been removed in clean_from_lists,
	 * except TFTP can create an expectation on the first packet,
	 * before connection is in the list, so we need to clean here,
	 * too.
	 */
	nf_ct_remove_expectations(ct);

	nf_ct_del_from_dying_or_unconfirmed_list(ct);

	local_bh_enable();

	if (ct->master)
		nf_ct_put(ct->master);

	pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
	nf_conntrack_free(ct);
}

static void nf_ct_delete_from_lists(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	unsigned int sequence;

	nf_ct_helper_destroy(ct);

	local_bh_disable();
	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	clean_from_lists(ct);
	nf_conntrack_double_unlock(hash, reply_hash);

	nf_ct_add_to_dying_list(ct);

	local_bh_enable();
}

bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
	struct nf_conn_tstamp *tstamp;

	if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
		return false;

	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp && tstamp->stop == 0)
		tstamp->stop = ktime_get_real_ns();

	if (nf_conntrack_event_report(IPCT_DESTROY, ct,
				      portid, report) < 0) {
		/* destroy event was not delivered. nf_ct_put will
		 * be done by event cache worker on redelivery.
		 */
		nf_ct_delete_from_lists(ct);
		nf_conntrack_ecache_delayed_work(nf_ct_net(ct));
		return false;
	}

	nf_conntrack_ecache_work(nf_ct_net(ct));
	nf_ct_delete_from_lists(ct);
	nf_ct_put(ct);
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_delete);

static inline bool
nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
		const struct nf_conntrack_tuple *tuple,
		const struct nf_conntrack_zone *zone,
		const struct net *net)
{
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

	/* A conntrack can be recreated with the equal tuple,
	 * so we need to check that the conntrack is confirmed
	 */
	return nf_ct_tuple_equal(tuple, &h->tuple) &&
	       nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
	       nf_ct_is_confirmed(ct) &&
	       net_eq(net, nf_ct_net(ct));
}

/* caller must hold rcu readlock and none of the nf_conntrack_locks */
static void nf_ct_gc_expired(struct nf_conn *ct)
{
	if (!atomic_inc_not_zero(&ct->ct_general.use))
		return;

	if (nf_ct_should_gc(ct))
		nf_ct_kill(ct);

	nf_ct_put(ct);
}

/*
 * Warning :
 * - Caller must take a reference on returned object
 *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
 */
static struct nf_conntrack_tuple_hash *
____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	struct hlist_nulls_node *n;
	unsigned int bucket, hsize;

begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	bucket = reciprocal_scale(hash, hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
		struct nf_conn *ct;

		ct = nf_ct_tuplehash_to_ctrack(h);
		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_is_dying(ct))
			continue;

		if (nf_ct_key_equal(h, tuple, zone, net))
			return h;
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(n) != bucket) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	return NULL;
}

/* Find a connection corresponding to a tuple. */
static struct nf_conntrack_tuple_hash *
__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
			const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	rcu_read_lock();
begin:
	h = ____nf_conntrack_find(net, zone, tuple, hash);
	if (h) {
		ct = nf_ct_tuplehash_to_ctrack(h);
		if (unlikely(nf_ct_is_dying(ct) ||
			     !atomic_inc_not_zero(&ct->ct_general.use)))
			h = NULL;
		else {
			if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) {
				nf_ct_put(ct);
				goto begin;
			}
		}
	}
	rcu_read_unlock();

	return h;
}

struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple)
{
	return __nf_conntrack_find_get(net, zone, tuple,
				       hash_conntrack_raw(tuple, net));
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);

static void __nf_conntrack_hash_insert(struct nf_conn *ct,
				       unsigned int hash,
				       unsigned int reply_hash)
{
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &nf_conntrack_hash[hash]);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[reply_hash]);
}

int
nf_conntrack_hash_check_insert(struct nf_conn *ct)
{
	const struct nf_conntrack_zone *zone;
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int sequence;

	zone = nf_ct_zone(ct);

	local_bh_disable();
	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	/* See if there's one in the list already, including reverse */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;

	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;

	smp_wmb();
	/* The caller holds a reference to this object */
	atomic_set(&ct->ct_general.use, 2);
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert);
	local_bh_enable();
	return 0;

out:
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert_failed);
	local_bh_enable();
	return -EEXIST;
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);

static inline void nf_ct_acct_update(struct nf_conn *ct,
				     enum ip_conntrack_info ctinfo,
				     unsigned int len)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;

		atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
		atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes);
	}
}

static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
			     const struct nf_conn *loser_ct)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(loser_ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;
		unsigned int bytes;

		/* u32 should be fine since we must have seen one packet. */
		bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
		nf_ct_acct_update(ct, ctinfo, bytes);
	}
}

/* Resolve race on insertion if this protocol allows this. */
static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
			       enum ip_conntrack_info ctinfo,
			       struct nf_conntrack_tuple_hash *h)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	struct nf_conntrack_l4proto *l4proto;

	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
	if (l4proto->allow_clash &&
	    !nfct_nat(ct) &&
	    !nf_ct_is_dying(ct) &&
	    atomic_inc_not_zero(&ct->ct_general.use)) {
		nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct);
		nf_conntrack_put(skb->nfct);
		/* Assign conntrack already in hashes to this skbuff. Don't
		 * modify skb->nfctinfo to ensure consistent stateful filtering.
		 */
		skb->nfct = &ct->ct_general;
		return NF_ACCEPT;
	}
	NF_CT_STAT_INC(net, drop);
	return NF_DROP;
}

/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
	const struct nf_conntrack_zone *zone;
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct nf_conn_tstamp *tstamp;
	struct hlist_nulls_node *n;
	enum ip_conntrack_info ctinfo;
	struct net *net;
	unsigned int sequence;
	int ret = NF_DROP;

	ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(ct);

	/* ipt_REJECT uses nf_conntrack_attach to attach related
	   ICMP/TCP RST packets in other direction. Actual packet
	   which created connection will be IP_CT_NEW or for an
	   expected connection, IP_CT_RELATED. */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	zone = nf_ct_zone(ct);
	local_bh_disable();

	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		/* reuse the hash saved before */
		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
		hash = scale_hash(hash);
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	/* We're not in hash table, and we refuse to set up related
	 * connections for unconfirmed conns.  But packet copies and
	 * REJECT will give spurious warnings here.
	 */
	/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

	/* No external references means no one else could have
	 * confirmed us.
	 */
	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
	pr_debug("Confirming conntrack %p\n", ct);
	/* We have to check the DYING flag after unlink to prevent
	 * a race against nf_ct_get_next_corpse() possibly called from
	 * user context, else we insert an already 'dead' hash, blocking
	 * further use of that particular connection -JM.
	 */
	nf_ct_del_from_dying_or_unconfirmed_list(ct);

	if (unlikely(nf_ct_is_dying(ct))) {
		nf_ct_add_to_dying_list(ct);
		goto dying;
	}

	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost race. */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;

	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;

	/* Timer relative to confirmation time, not original
	   setting time, otherwise we'd get timer wrap in
	   weird delay cases. */
	ct->timeout += nfct_time_stamp;
	atomic_inc(&ct->ct_general.use);
	ct->status |= IPS_CONFIRMED;

	/* set conntrack timestamp, if enabled. */
	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp) {
		if (skb->tstamp == 0)
			__net_timestamp(skb);

		tstamp->start = ktime_to_ns(skb->tstamp);
	}
	/* Since the lookup is lockless, hash insertion must be done after
	 * starting the timer and setting the CONFIRMED bit. The RCU barriers
	 * guarantee that no other CPU can find the conntrack before the above
	 * stores are visible.
	 */
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();

	help = nfct_help(ct);
	if (help && help->helper)
		nf_conntrack_event_cache(IPCT_HELPER, ct);

	nf_conntrack_event_cache(master_ct(ct) ?
				 IPCT_RELATED : IPCT_NEW, ct);
	return NF_ACCEPT;

out:
	nf_ct_add_to_dying_list(ct);
	ret = nf_ct_resolve_clash(net, skb, ctinfo, h);
dying:
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert_failed);
	local_bh_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack)
{
	struct net *net = nf_ct_net(ignored_conntrack);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	unsigned int hash, hsize;
	struct hlist_nulls_node *n;
	struct nf_conn *ct;

	zone = nf_ct_zone(ignored_conntrack);

	rcu_read_lock();
begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	hash = __hash_conntrack(net, tuple, hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);

		if (ct == ignored_conntrack)
			continue;

		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_key_equal(h, tuple, zone, net)) {
			NF_CT_STAT_INC_ATOMIC(net, found);
			rcu_read_unlock();
			return 1;
		}
	}

	if (get_nulls_value(n) != hash) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	rcu_read_unlock();

	return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);

#define NF_CT_EVICTION_RANGE	8

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static unsigned int early_drop_list(struct net *net,
				    struct hlist_nulls_head *head)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int drops = 0;
	struct nf_conn *tmp;

	hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
		tmp = nf_ct_tuplehash_to_ctrack(h);

		if (nf_ct_is_expired(tmp)) {
			nf_ct_gc_expired(tmp);
			continue;
		}

		if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
		    !net_eq(nf_ct_net(tmp), net) ||
		    nf_ct_is_dying(tmp))
			continue;

		if (!atomic_inc_not_zero(&tmp->ct_general.use))
			continue;

		/* kill only if still in same netns -- might have moved due to
		 * SLAB_DESTROY_BY_RCU rules.
		 *
		 * We steal the timer reference.  If that fails timer has
		 * already fired or someone else deleted it. Just drop ref
		 * and move to next entry.
		 */
		if (net_eq(nf_ct_net(tmp), net) &&
		    nf_ct_is_confirmed(tmp) &&
		    nf_ct_delete(tmp, 0, 0))
			drops++;

		nf_ct_put(tmp);
	}

	return drops;
}

static noinline int early_drop(struct net *net, unsigned int _hash)
{
	unsigned int i;

	for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
		struct hlist_nulls_head *ct_hash;
		unsigned int hash, hsize, drops;

		rcu_read_lock();
		nf_conntrack_get_ht(&ct_hash, &hsize);
		hash = reciprocal_scale(_hash++, hsize);

		drops = early_drop_list(net, &ct_hash[hash]);
		rcu_read_unlock();

		if (drops) {
			NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
			return true;
		}
	}

	return false;
}

static void gc_worker(struct work_struct *work)
{
	unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
	unsigned int i, goal, buckets = 0, expired_count = 0;
	struct conntrack_gc_work *gc_work;
	unsigned int ratio, scanned = 0;
	unsigned long next_run;

	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);

	goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV;
	i = gc_work->last_bucket;

	do {
		struct nf_conntrack_tuple_hash *h;
		struct hlist_nulls_head *ct_hash;
		struct hlist_nulls_node *n;
		unsigned int hashsz;
		struct nf_conn *tmp;

		i++;
		rcu_read_lock();

		nf_conntrack_get_ht(&ct_hash, &hashsz);
		if (i >= hashsz)
			i = 0;

		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
			tmp = nf_ct_tuplehash_to_ctrack(h);

			scanned++;
			if (nf_ct_is_expired(tmp)) {
				nf_ct_gc_expired(tmp);
				expired_count++;
				continue;
			}
		}

		/* could check get_nulls_value() here and restart if ct
		 * was moved to another chain.  But given gc is best-effort
		 * we will just continue with next hash slot.
		 */
		rcu_read_unlock();
		cond_resched_rcu_qs();
	} while (++buckets < goal);

	if (gc_work->exiting)
		return;

	/*
	 * Eviction will normally happen from the packet path, and not
	 * from this gc worker.
	 *
	 * This worker is only here to reap expired entries when system went
	 * idle after a busy period.
	 *
	 * The heuristics below are supposed to balance conflicting goals:
	 *
	 * 1. Minimize time until we notice a stale entry
	 * 2. Maximize scan intervals to not waste cycles
	 *
	 * Normally, expire ratio will be close to 0.
	 *
	 * As soon as a sizeable fraction of the entries have expired
	 * increase scan frequency.
	 */
	ratio = scanned ? expired_count * 100 / scanned : 0;
	if (ratio > GC_EVICT_RATIO) {
		gc_work->next_gc_run = min_interval;
	} else {
		unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV;

		BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0);

		gc_work->next_gc_run += min_interval;
		if (gc_work->next_gc_run > max)
			gc_work->next_gc_run = max;
	}

	next_run = gc_work->next_gc_run;
	gc_work->last_bucket = i;
	queue_delayed_work(system_long_wq, &gc_work->dwork, next_run);
}

static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
{
	INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
	gc_work->next_gc_run = HZ;
	gc_work->exiting = false;
}

static struct nf_conn *
__nf_conntrack_alloc(struct net *net,
		     const struct nf_conntrack_zone *zone,
		     const struct nf_conntrack_tuple *orig,
		     const struct nf_conntrack_tuple *repl,
		     gfp_t gfp, u32 hash)
{
	struct nf_conn *ct;

	/* We don't want any race condition at early drop stage */
	atomic_inc(&net->ct.count);

	if (nf_conntrack_max &&
	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
		if (!early_drop(net, hash)) {
			atomic_dec(&net->ct.count);
			net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
			return ERR_PTR(-ENOMEM);
		}
	}

	/*
	 * Do not use kmem_cache_zalloc(), as this cache uses
	 * SLAB_DESTROY_BY_RCU.
	 */
	ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
	if (ct == NULL)
		goto out;

	spin_lock_init(&ct->lock);
	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
	ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
	/* save hash for reusing when confirming */
	*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
	ct->status = 0;
	write_pnet(&ct->ct_net, net);
	memset(&ct->__nfct_init_offset[0], 0,
	       offsetof(struct nf_conn, proto) -
	       offsetof(struct nf_conn, __nfct_init_offset[0]));

	nf_ct_zone_add(ct, zone);

	/* Because we use RCU lookups, we set ct_general.use to zero before
	 * this is inserted in any list.
	 */
	atomic_set(&ct->ct_general.use, 0);
	return ct;
out:
	atomic_dec(&net->ct.count);
	return ERR_PTR(-ENOMEM);
}

struct nf_conn *nf_conntrack_alloc(struct net *net,
				   const struct nf_conntrack_zone *zone,
				   const struct nf_conntrack_tuple *orig,
				   const struct nf_conntrack_tuple *repl,
				   gfp_t gfp)
{
	return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
}
EXPORT_SYMBOL_GPL(nf_conntrack_alloc);

void nf_conntrack_free(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);

	/* A freed object has refcnt == 0, that's
	 * the golden rule for SLAB_DESTROY_BY_RCU
	 */
	NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0);

	nf_ct_ext_destroy(ct);
	nf_ct_ext_free(ct);
	kmem_cache_free(nf_conntrack_cachep, ct);
	smp_mb__before_atomic();
	atomic_dec(&net->ct.count);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);


/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
	       const struct nf_conntrack_tuple *tuple,
	       struct nf_conntrack_l3proto *l3proto,
	       struct nf_conntrack_l4proto *l4proto,
	       struct sk_buff *skb,
	       unsigned int dataoff, u32 hash)
{
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct nf_conntrack_tuple repl_tuple;
	struct nf_conntrack_ecache *ecache;
	struct nf_conntrack_expect *exp = NULL;
	const struct nf_conntrack_zone *zone;
	struct nf_conn_timeout *timeout_ext;
	struct nf_conntrack_zone tmp;
	unsigned int *timeouts;

	if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
		pr_debug("Can't invert tuple.\n");
		return NULL;
	}

	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
	ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
				  hash);
	if (IS_ERR(ct))
		return (struct nf_conntrack_tuple_hash *)ct;

	if (!nf_ct_add_synproxy(ct, tmpl)) {
		nf_conntrack_free(ct);
		return ERR_PTR(-ENOMEM);
	}

	timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
	if (timeout_ext) {
		timeouts = nf_ct_timeout_data(timeout_ext);
		if (unlikely(!timeouts))
			timeouts = l4proto->get_timeouts(net);
	} else {
		timeouts = l4proto->get_timeouts(net);
	}

	if (!l4proto->new(ct, skb, dataoff, timeouts)) {
		nf_conntrack_free(ct);
		pr_debug("can't track with proto module\n");
		return NULL;
	}

	if (timeout_ext)
		nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
				      GFP_ATOMIC);

	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
	nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
	nf_ct_labels_ext_add(ct);

	ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
	nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
			     ecache ? ecache->expmask : 0,
			     GFP_ATOMIC);

	local_bh_disable();
	if (net->ct.expect_count) {
		spin_lock(&nf_conntrack_expect_lock);
		exp = nf_ct_find_expectation(net, zone, tuple);
		if (exp) {
			pr_debug("expectation arrives ct=%p exp=%p\n",
				 ct, exp);
			/* Welcome, Mr. Bond.  We've been expecting you... */
			__set_bit(IPS_EXPECTED_BIT, &ct->status);
			/* exp->master safe, refcnt bumped in nf_ct_find_expectation */
			ct->master = exp->master;
			if (exp->helper) {
				help = nf_ct_helper_ext_add(ct, exp->helper,
							    GFP_ATOMIC);
				if (help)
					rcu_assign_pointer(help->helper, exp->helper);
			}

#ifdef CONFIG_NF_CONNTRACK_MARK
			ct->mark = exp->master->mark;
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
			ct->secmark = exp->master->secmark;
#endif
			NF_CT_STAT_INC(net, expect_new);
		}
		spin_unlock(&nf_conntrack_expect_lock);
	}
	if (!exp)
		__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);

	/* Now it is inserted into the unconfirmed list, bump refcount */
	nf_conntrack_get(&ct->ct_general);
	nf_ct_add_to_unconfirmed_list(ct);

	local_bh_enable();

	if (exp) {
		if (exp->expectfn)
			exp->expectfn(ct, exp);
		nf_ct_expect_put(exp);
	}

	return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct nf_conn *
resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
		  struct sk_buff *skb,
		  unsigned int dataoff,
		  u_int16_t l3num,
		  u_int8_t protonum,
		  struct nf_conntrack_l3proto *l3proto,
		  struct nf_conntrack_l4proto *l4proto,
		  int *set_reply,
		  enum ip_conntrack_info *ctinfo)
{
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conntrack_zone tmp;
	struct nf_conn *ct;
	u32 hash;

	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
			     dataoff, l3num, protonum, net, &tuple, l3proto,
			     l4proto)) {
		pr_debug("Can't get tuple\n");
		return NULL;
	}

	/* look for tuple match */
	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
	hash = hash_conntrack_raw(&tuple, net);
	h = __nf_conntrack_find_get(net, zone, &tuple, hash);
	if (!h) {
		h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
				   skb, dataoff, hash);
		if (!h)
			return NULL;
		if (IS_ERR(h))
			return (void *)h;
	}
	ct = nf_ct_tuplehash_to_ctrack(h);

	/* It exists; we have (non-exclusive) reference. */
	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
		*ctinfo = IP_CT_ESTABLISHED_REPLY;
		/* Please set reply bit if this packet OK */
		*set_reply = 1;
	} else {
		/* Once we've had two way comms, always ESTABLISHED. */
		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
			pr_debug("normal packet for %p\n", ct);
			*ctinfo = IP_CT_ESTABLISHED;
		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
			pr_debug("related packet for %p\n", ct);
			*ctinfo = IP_CT_RELATED;
		} else {
			pr_debug("new packet for %p\n", ct);
			*ctinfo = IP_CT_NEW;
		}
		*set_reply = 0;
	}
	skb->nfct = &ct->ct_general;
	skb->nfctinfo = *ctinfo;
	return ct;
}

unsigned int
nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
		struct sk_buff *skb)
{
	struct nf_conn *ct, *tmpl = NULL;
	enum ip_conntrack_info ctinfo;
	struct nf_conntrack_l3proto *l3proto;
	struct nf_conntrack_l4proto *l4proto;
	unsigned int *timeouts;
	unsigned int dataoff;
	u_int8_t protonum;
	int set_reply = 0;
	int ret;

	if (skb->nfct) {
		/* Previously seen (loopback or untracked)?  Ignore. */
		tmpl = (struct nf_conn *)skb->nfct;
		if (!nf_ct_is_template(tmpl)) {
			NF_CT_STAT_INC_ATOMIC(net, ignore);
			return NF_ACCEPT;
		}
		skb->nfct = NULL;
	}

	/* rcu_read_lock()ed by nf_hook_thresh */
	l3proto = __nf_ct_l3proto_find(pf);
	ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
				   &dataoff, &protonum);
	if (ret <= 0) {
		pr_debug("not prepared to track yet or error occurred\n");
		NF_CT_STAT_INC_ATOMIC(net, error);
		NF_CT_STAT_INC_ATOMIC(net, invalid);
		ret = -ret;
		goto out;
	}

	l4proto = __nf_ct_l4proto_find(pf, protonum);

	/* It may be a special packet, error, unclean...
	 * inverse of the return code tells the netfilter
	 * core what to do with the packet. */
	if (l4proto->error != NULL) {
		ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo,
				     pf, hooknum);
		if (ret <= 0) {
			NF_CT_STAT_INC_ATOMIC(net, error);
			NF_CT_STAT_INC_ATOMIC(net, invalid);
			ret = -ret;
			goto out;
		}
		/* ICMP[v6] protocol trackers may assign one conntrack. */
		if (skb->nfct)
			goto out;
	}
repeat:
	ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
			       l3proto, l4proto, &set_reply, &ctinfo);
	if (!ct) {
		/* Not valid part of a connection */
		NF_CT_STAT_INC_ATOMIC(net, invalid);
		ret = NF_ACCEPT;
		goto out;
	}

	if (IS_ERR(ct)) {
		/* Too stressed to deal. */
		NF_CT_STAT_INC_ATOMIC(net, drop);
		ret = NF_DROP;
		goto out;
	}

	NF_CT_ASSERT(skb->nfct);

	/* Decide what timeout policy we want to apply to this flow. */
	timeouts = nf_ct_timeout_lookup(net, ct, l4proto);

	ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum, timeouts);
	if (ret <= 0) {
		/* Invalid: inverse of the return code tells
		 * the netfilter core what to do */
		pr_debug("nf_conntrack_in: Can't track with proto module\n");
		nf_conntrack_put(skb->nfct);
		skb->nfct = NULL;
		NF_CT_STAT_INC_ATOMIC(net, invalid);
		if (ret == -NF_DROP)
			NF_CT_STAT_INC_ATOMIC(net, drop);
		/* Special case: TCP tracker reports an attempt to reopen a
		 * closed/aborted connection. We have to go back and create a
		 * fresh conntrack.
		 */
		if (ret == -NF_REPEAT)
			goto repeat;
		ret = -ret;
		goto out;
	}

	if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
		nf_conntrack_event_cache(IPCT_REPLY, ct);
out:
	if (tmpl)
		nf_ct_put(tmpl);

	return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_in);

bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
			  const struct nf_conntrack_tuple *orig)
{
	bool ret;

	rcu_read_lock();
	ret = nf_ct_invert_tuple(inverse, orig,
				 __nf_ct_l3proto_find(orig->src.l3num),
				 __nf_ct_l4proto_find(orig->src.l3num,
						      orig->dst.protonum));
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr);
/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
   implicitly racy: see __nf_conntrack_confirm */
void nf_conntrack_alter_reply(struct nf_conn *ct,
			      const struct nf_conntrack_tuple *newreply)
{
	struct nf_conn_help *help = nfct_help(ct);

	/* Should be unconfirmed, so not in hash table yet */
	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));

	pr_debug("Altering reply tuple of %p to ", ct);
	nf_ct_dump_tuple(newreply);

	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
	if (ct->master || (help && !hlist_empty(&help->expectations)))
		return;

	rcu_read_lock();
	__nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);

/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
void __nf_ct_refresh_acct(struct nf_conn *ct,
			  enum ip_conntrack_info ctinfo,
			  const struct sk_buff *skb,
			  unsigned long extra_jiffies,
			  int do_acct)
{
	NF_CT_ASSERT(skb);

	/* Only update if this is not a fixed timeout */
	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
		goto acct;

	/* If not in hash table, timer will not be active yet */
	if (nf_ct_is_confirmed(ct))
		extra_jiffies += nfct_time_stamp;

	ct->timeout = extra_jiffies;
acct:
	if (do_acct)
		nf_ct_acct_update(ct, ctinfo, skb->len);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);

bool nf_ct_kill_acct(struct nf_conn *ct,
		     enum ip_conntrack_info ctinfo,
		     const struct sk_buff *skb)
{
	nf_ct_acct_update(ct, ctinfo, skb->len);

	return nf_ct_delete(ct, 0, 0);
}
EXPORT_SYMBOL_GPL(nf_ct_kill_acct);

#if IS_ENABLED(CONFIG_NF_CT_NETLINK)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
#include <linux/mutex.h>

/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
 * in ip_conntrack_core, since we don't want the protocols to autoload
 * or depend on ctnetlink */
int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
			       const struct nf_conntrack_tuple *tuple)
{
	if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) ||
	    nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port))
		goto nla_put_failure;
	return 0;

nla_put_failure:
	return -1;
}
EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);

const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
	[CTA_PROTO_SRC_PORT]  = { .type = NLA_U16 },
	[CTA_PROTO_DST_PORT]  = { .type = NLA_U16 },
};
EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);

int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
			       struct nf_conntrack_tuple *t)
{
	if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT])
		return -EINVAL;

	t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
	t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);

	return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);

int nf_ct_port_nlattr_tuple_size(void)
{
	return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
}
EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
#endif

/* Used by ipt_REJECT and ip6t_REJECT. */
static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;

	/* This ICMP is in reverse direction to the packet which caused it */
	ct = nf_ct_get(skb, &ctinfo);
	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
		ctinfo = IP_CT_RELATED_REPLY;
	else
		ctinfo = IP_CT_RELATED;

	/* Attach to new skbuff, and increment count */
	nskb->nfct = &ct->ct_general;
	nskb->nfctinfo = ctinfo;
	nf_conntrack_get(nskb->nfct);
}

/* Bring out ya dead! */
static struct nf_conn *
get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
		void *data, unsigned int *bucket)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct hlist_nulls_node *n;
	int cpu;
	spinlock_t *lockp;

	for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
		lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
		local_bh_disable();
		nf_conntrack_lock(lockp);
		if (*bucket < nf_conntrack_htable_size) {
			hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) {
				if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
					continue;
				ct = nf_ct_tuplehash_to_ctrack(h);
				if (net_eq(nf_ct_net(ct), net) &&
				    iter(ct, data))
					goto found;
			}
		}
		spin_unlock(lockp);
		local_bh_enable();
		cond_resched();
	}

	for_each_possible_cpu(cpu) {
		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);

		spin_lock_bh(&pcpu->lock);
		hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
			ct = nf_ct_tuplehash_to_ctrack(h);
			if (iter(ct, data))
				set_bit(IPS_DYING_BIT, &ct->status);
		}
		spin_unlock_bh(&pcpu->lock);
		cond_resched();
	}
	return NULL;
found:
	atomic_inc(&ct->ct_general.use);
	spin_unlock(lockp);
	local_bh_enable();
	return ct;
}

void nf_ct_iterate_cleanup(struct net *net,
			   int (*iter)(struct nf_conn *i, void *data),
			   void *data, u32 portid, int report)
{
	struct nf_conn *ct;
	unsigned int bucket = 0;

	might_sleep();

	if (atomic_read(&net->ct.count) == 0)
		return;

	while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
		/* Time to push up daisies... */
		nf_ct_delete(ct, portid, report);
		nf_ct_put(ct);
		cond_resched();
	}
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);

static int kill_all(struct nf_conn *i, void *data)
{
	return 1;
}

void nf_ct_free_hashtable(void *hash, unsigned int size)
{
	if (is_vmalloc_addr(hash))
		vfree(hash);
	else
		free_pages((unsigned long)hash,
			   get_order(sizeof(struct hlist_head) * size));
}
EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);

static int untrack_refs(void)
{
	int cnt = 0, cpu;

	for_each_possible_cpu(cpu) {
		struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);

		cnt += atomic_read(&ct->ct_general.use) - 1;
	}
	return cnt;
}

void nf_conntrack_cleanup_start(void)
{
	conntrack_gc_work.exiting = true;
	RCU_INIT_POINTER(ip_ct_attach, NULL);
}

void nf_conntrack_cleanup_end(void)
{
	RCU_INIT_POINTER(nf_ct_destroy, NULL);
	while (untrack_refs() > 0)
		schedule();

	cancel_delayed_work_sync(&conntrack_gc_work.dwork);
	nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);

	nf_conntrack_proto_fini();
	nf_conntrack_seqadj_fini();
	nf_conntrack_labels_fini();
	nf_conntrack_helper_fini();
	nf_conntrack_timeout_fini();
	nf_conntrack_ecache_fini();
	nf_conntrack_tstamp_fini();
	nf_conntrack_acct_fini();
	nf_conntrack_expect_fini();

	kmem_cache_destroy(nf_conntrack_cachep);
}

/*
 * Mishearing the voices in his head, our hero wonders how he's
 * supposed to kill the mall.
 */
void nf_conntrack_cleanup_net(struct net *net)
{
	LIST_HEAD(single);

	list_add(&net->exit_list, &single);
	nf_conntrack_cleanup_net_list(&single);
}

void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
{
	int busy;
	struct net *net;

	/*
	 * This makes sure all current packets have passed through
	 * netfilter framework.  Roll on, two-stage module
	 * delete...
	 */
	synchronize_net();
i_see_dead_people:
	busy = 0;
	list_for_each_entry(net, net_exit_list, exit_list) {
		nf_ct_iterate_cleanup(net, kill_all, NULL, 0, 0);
		if (atomic_read(&net->ct.count) != 0)
			busy = 1;
	}
	if (busy) {
		schedule();
		goto i_see_dead_people;
	}

	list_for_each_entry(net, net_exit_list, exit_list) {
		nf_conntrack_proto_pernet_fini(net);
		nf_conntrack_helper_pernet_fini(net);
		nf_conntrack_ecache_pernet_fini(net);
		nf_conntrack_tstamp_pernet_fini(net);
		nf_conntrack_acct_pernet_fini(net);
		nf_conntrack_expect_pernet_fini(net);
		free_percpu(net->ct.stat);
		free_percpu(net->ct.pcpu_lists);
	}
}

void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
{
	struct hlist_nulls_head *hash;
	unsigned int nr_slots, i;
	size_t sz;

	if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
		return NULL;

	BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
	nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));

	if (nr_slots > (UINT_MAX / sizeof(struct hlist_nulls_head)))
		return NULL;

	sz = nr_slots * sizeof(struct hlist_nulls_head);
	hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
					get_order(sz));
	if (!hash)
		hash = vzalloc(sz);

	if (hash && nulls)
		for (i = 0; i < nr_slots; i++)
			INIT_HLIST_NULLS_HEAD(&hash[i], i);

	return hash;
}
EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);

int nf_conntrack_hash_resize(unsigned int hashsize)
{
	int i, bucket;
	unsigned int old_size;
	struct hlist_nulls_head *hash, *old_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	if (!hashsize)
		return -EINVAL;

	hash = nf_ct_alloc_hashtable(&hashsize, 1);
	if (!hash)
		return -ENOMEM;

	old_size = nf_conntrack_htable_size;
	if (old_size == hashsize) {
		nf_ct_free_hashtable(hash, hashsize);
		return 0;
	}

	local_bh_disable();
	nf_conntrack_all_lock();
	write_seqcount_begin(&nf_conntrack_generation);

	/* Lookups in the old hash might happen in parallel, which means we
	 * might get false negatives during connection lookup. New connections
	 * created because of a false negative won't make it into the hash
	 * though since that required taking the locks.
	 */

	for (i = 0; i < nf_conntrack_htable_size; i++) {
		while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
			h = hlist_nulls_entry(nf_conntrack_hash[i].first,
					      struct nf_conntrack_tuple_hash, hnnode);
			ct = nf_ct_tuplehash_to_ctrack(h);
			hlist_nulls_del_rcu(&h->hnnode);
			bucket = __hash_conntrack(nf_ct_net(ct),
						  &h->tuple, hashsize);
			hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
		}
	}
	old_size = nf_conntrack_htable_size;
	old_hash = nf_conntrack_hash;

	nf_conntrack_hash = hash;
	nf_conntrack_htable_size = hashsize;

	write_seqcount_end(&nf_conntrack_generation);
	nf_conntrack_all_unlock();
	local_bh_enable();

	synchronize_net();
	nf_ct_free_hashtable(old_hash, old_size);
	return 0;
}

int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
{
	unsigned int hashsize;
	int rc;

	if (current->nsproxy->net_ns != &init_net)
		return -EOPNOTSUPP;

	/* On boot, we can set this without any fancy locking. */
	if (!nf_conntrack_htable_size)
		return param_set_uint(val, kp);

	rc = kstrtouint(val, 0, &hashsize);
	if (rc)
		return rc;

	return nf_conntrack_hash_resize(hashsize);
}
EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);

module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
		  &nf_conntrack_htable_size, 0600);

void nf_ct_untracked_status_or(unsigned long bits)
{
	int cpu;

	for_each_possible_cpu(cpu)
		per_cpu(nf_conntrack_untracked, cpu).status |= bits;
}
EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);

int nf_conntrack_init_start(void)
{
	int max_factor = 8;
	int ret = -ENOMEM;
	int i, cpu;

	seqcount_init(&nf_conntrack_generation);

	for (i = 0; i < CONNTRACK_LOCKS; i++)
		spin_lock_init(&nf_conntrack_locks[i]);

	if (!nf_conntrack_htable_size) {
		/* Idea from tcp.c: use 1/16384 of memory.
		 * On i386: 32MB machine has 512 buckets.
		 * >= 1GB machines have 16384 buckets.
		 * >= 4GB machines have 65536 buckets.
		 */
		nf_conntrack_htable_size
			= (((totalram_pages << PAGE_SHIFT) / 16384)
			   / sizeof(struct hlist_head));
		if (totalram_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
			nf_conntrack_htable_size = 65536;
		else if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
			nf_conntrack_htable_size = 16384;
		if (nf_conntrack_htable_size < 32)
			nf_conntrack_htable_size = 32;

		/* Use a max. factor of four by default to get the same max as
		 * with the old struct list_heads. When a table size is given
		 * we use the old value of 8 to avoid reducing the max.
		 * entries. */
		max_factor = 4;
	}

	nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
	if (!nf_conntrack_hash)
		return -ENOMEM;

	nf_conntrack_max = max_factor * nf_conntrack_htable_size;

	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
						sizeof(struct nf_conn), 0,
						SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
	if (!nf_conntrack_cachep)
		goto err_cachep;

	printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
	       NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
	       nf_conntrack_max);

	ret = nf_conntrack_expect_init();
	if (ret < 0)
		goto err_expect;

	ret = nf_conntrack_acct_init();
	if (ret < 0)
		goto err_acct;

	ret = nf_conntrack_tstamp_init();
	if (ret < 0)
		goto err_tstamp;

	ret = nf_conntrack_ecache_init();
	if (ret < 0)
		goto err_ecache;

	ret = nf_conntrack_timeout_init();
	if (ret < 0)
		goto err_timeout;

	ret = nf_conntrack_helper_init();
	if (ret < 0)
		goto err_helper;

	ret = nf_conntrack_labels_init();
	if (ret < 0)
		goto err_labels;

	ret = nf_conntrack_seqadj_init();
	if (ret < 0)
		goto err_seqadj;

	ret = nf_conntrack_proto_init();
	if (ret < 0)
		goto err_proto;

	/* Set up fake conntrack: to never be deleted, not in any hashes */
	for_each_possible_cpu(cpu) {
		struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
		write_pnet(&ct->ct_net, &init_net);
		atomic_set(&ct->ct_general.use, 1);
	}
	/* - and make it look like a confirmed connection */
	nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);

	conntrack_gc_work_init(&conntrack_gc_work);
	queue_delayed_work(system_long_wq, &conntrack_gc_work.dwork, HZ);

	return 0;

err_proto:
	nf_conntrack_seqadj_fini();
err_seqadj:
	nf_conntrack_labels_fini();
err_labels:
	nf_conntrack_helper_fini();
err_helper:
	nf_conntrack_timeout_fini();
err_timeout:
	nf_conntrack_ecache_fini();
err_ecache:
	nf_conntrack_tstamp_fini();
err_tstamp:
	nf_conntrack_acct_fini();
err_acct:
	nf_conntrack_expect_fini();
err_expect:
	kmem_cache_destroy(nf_conntrack_cachep);
err_cachep:
	nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
	return ret;
}

void nf_conntrack_init_end(void)
{
	/* For use by REJECT target */
	RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
	RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack);
}

/*
 * We need to use special "null" values, not used in hash table
 */
#define UNCONFIRMED_NULLS_VAL	((1<<30)+0)
#define DYING_NULLS_VAL		((1<<30)+1)
#define TEMPLATE_NULLS_VAL	((1<<30)+2)

int nf_conntrack_init_net(struct net *net)
{
	int ret = -ENOMEM;
	int cpu;

	atomic_set(&net->ct.count, 0);

	net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
	if (!net->ct.pcpu_lists)
		goto err_stat;

	for_each_possible_cpu(cpu) {
		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);

		spin_lock_init(&pcpu->lock);
		INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
		INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
	}

	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
	if (!net->ct.stat)
		goto err_pcpu_lists;

	ret = nf_conntrack_expect_pernet_init(net);
	if (ret < 0)
		goto err_expect;
	ret = nf_conntrack_acct_pernet_init(net);
	if (ret < 0)
		goto err_acct;
	ret = nf_conntrack_tstamp_pernet_init(net);
	if (ret < 0)
		goto err_tstamp;
	ret = nf_conntrack_ecache_pernet_init(net);
	if (ret < 0)
		goto err_ecache;
	ret = nf_conntrack_helper_pernet_init(net);
	if (ret < 0)
		goto err_helper;
	ret = nf_conntrack_proto_pernet_init(net);
	if (ret < 0)
		goto err_proto;
	return 0;

err_proto:
	nf_conntrack_helper_pernet_fini(net);
err_helper:
	nf_conntrack_ecache_pernet_fini(net);
err_ecache:
	nf_conntrack_tstamp_pernet_fini(net);
err_tstamp:
	nf_conntrack_acct_pernet_fini(net);
err_acct:
	nf_conntrack_expect_pernet_fini(net);
err_expect:
	free_percpu(net->ct.stat);
err_pcpu_lists:
	free_percpu(net->ct.pcpu_lists);
err_stat:
	return ret;
}