// SPDX-License-Identifier: GPL-2.0-only
/* Event cache for netfilter. */

/*
 * (C) 2005 Harald Welte <laforge@gnumonks.org>
 * (C) 2005 Patrick McHardy <kaber@trash.net>
 * (C) 2005-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2005 USAGI/WIDE Project <http://www.linux-ipv6.org>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/export.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_extend.h>

static DEFINE_MUTEX(nf_ct_ecache_mutex);

#define ECACHE_RETRY_WAIT (HZ/10)
#define ECACHE_STACK_ALLOC (256 / sizeof(void *))

enum retry_state {
	STATE_CONGESTED,
	STATE_RESTART,
	STATE_DONE,
};

static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu)
{
	struct nf_conn *refs[ECACHE_STACK_ALLOC];
	enum retry_state ret = STATE_DONE;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int evicted = 0;

	spin_lock(&pcpu->lock);

	hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) {
		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
		struct nf_conntrack_ecache *e;

		if (!nf_ct_is_confirmed(ct))
			continue;

		/* This ecache access is safe because the ct is on the
		 * pcpu dying list and we hold the spinlock -- the entry
		 * cannot be freed until after the lock is released.
		 *
		 * This is true even if ct has a refcount of 0: the
		 * cpu that is about to free the entry must remove it
		 * from the dying list and needs the lock to do so.
		 */
		e = nf_ct_ecache_find(ct);
		if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL)
			continue;

		/* ct is in NFCT_ECACHE_DESTROY_FAIL state, which means
		 * the worker owns this entry: the ct will remain valid
		 * until the worker puts its ct reference.
		 */
		if (nf_conntrack_event(IPCT_DESTROY, ct)) {
			ret = STATE_CONGESTED;
			break;
		}

		e->state = NFCT_ECACHE_DESTROY_SENT;
		refs[evicted] = ct;

		if (++evicted >= ARRAY_SIZE(refs)) {
			ret = STATE_RESTART;
			break;
		}
	}

	spin_unlock(&pcpu->lock);

	/* can't _put while holding lock */
	while (evicted)
		nf_ct_put(refs[--evicted]);

	return ret;
}
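/* Delayed work: retry delivery of destroy events that could not be sent
 * earlier. Walks every possible cpu's dying list; reschedules itself
 * immediately after a full batch (STATE_RESTART) and backs off for
 * ECACHE_RETRY_WAIT jiffies when the event listener is congested.
 */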
static void ecache_work(struct work_struct *work)
{
	struct nf_conntrack_net *cnet = container_of(work, struct nf_conntrack_net, ecache_dwork.work);
	struct netns_ct *ctnet = cnet->ct_net;
	int cpu, delay = -1;
	struct ct_pcpu *pcpu;

	local_bh_disable();

	for_each_possible_cpu(cpu) {
		enum retry_state ret;

		pcpu = per_cpu_ptr(ctnet->pcpu_lists, cpu);

		ret = ecache_work_evict_list(pcpu);

		switch (ret) {
		case STATE_CONGESTED:
			delay = ECACHE_RETRY_WAIT;
			goto out;
		case STATE_RESTART:
			delay = 0;
			break;
		case STATE_DONE:
			break;
		}
	}

out:
	local_bh_enable();

	ctnet->ecache_dwork_pending = delay > 0;
	if (delay >= 0)
		schedule_delayed_work(&cnet->ecache_dwork, delay);
}

static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e,
					   const u32 events,
					   const u32 missed,
					   const struct nf_ct_event *item)
{
	struct net *net = nf_ct_net(item->ct);
	struct nf_ct_event_notifier *notify;
	u32 old, want;
	int ret;

	if (!((events | missed) & e->ctmask))
		return 0;

	rcu_read_lock();

	notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
	if (!notify) {
		rcu_read_unlock();
		return 0;
	}

	ret = notify->ct_event(events | missed, item);
	rcu_read_unlock();

	if (likely(ret >= 0 && missed == 0))
		return 0;

	do {
		old = READ_ONCE(e->missed);
		if (ret < 0)
			want = old | events;
		else
			want = old & ~missed;
	} while (cmpxchg(&e->missed, old, want) != old);

	return ret;
}

int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct,
				  u32 portid, int report)
{
	struct nf_conntrack_ecache *e;
	struct nf_ct_event item;
	unsigned int missed;
	int ret;

	if (!nf_ct_is_confirmed(ct))
		return 0;

	e = nf_ct_ecache_find(ct);
	if (!e)
		return 0;

	memset(&item, 0, sizeof(item));

	item.ct = ct;
	item.portid = e->portid ? e->portid : portid;
	item.report = report;

	/* Is this a resend of a destroy event? If so, skip the missed events. */
	missed = e->portid ? 0 : e->missed;

	ret = __nf_conntrack_eventmask_report(e, events, missed, &item);
	if (unlikely(ret < 0 && (events & (1 << IPCT_DESTROY)))) {
		/* This is a destroy event that was triggered by a process;
		 * store the PORTID so it can be included in the retransmission.
		 */
		if (e->portid == 0 && portid != 0)
			e->portid = portid;
		e->state = NFCT_ECACHE_DESTROY_FAIL;
	}

	return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report);

/* Deliver cached events and clear the cache entry - must be called with
 * softirqs disabled locally.
 */
void nf_ct_deliver_cached_events(struct nf_conn *ct)
{
	struct nf_conntrack_ecache *e;
	struct nf_ct_event item;
	unsigned int events;

	if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct))
		return;

	e = nf_ct_ecache_find(ct);
	if (e == NULL)
		return;

	events = xchg(&e->cache, 0);

	item.ct = ct;
	item.portid = 0;
	item.report = 0;

	/* We read the missed event cache without taking the lock, so we
	 * may send missed events twice. However, this is harmless and
	 * happens very rarely.
	 */
	__nf_conntrack_eventmask_report(e, events, e->missed, &item);
}
EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
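/* Deliver an expectation event to the registered notifier, provided the
 * master conntrack subscribed to it via its ecache expmask.
 */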
void nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
			       struct nf_conntrack_expect *exp,
			       u32 portid, int report)
{
	struct net *net = nf_ct_exp_net(exp);
	struct nf_ct_event_notifier *notify;
	struct nf_conntrack_ecache *e;

	rcu_read_lock();
	notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
	if (!notify)
		goto out_unlock;

	e = nf_ct_ecache_find(exp->master);
	if (!e)
		goto out_unlock;

	if (e->expmask & (1 << event)) {
		struct nf_exp_event item = {
			.exp = exp,
			.portid = portid,
			.report = report
		};
		notify->exp_event(1 << event, &item);
	}
out_unlock:
	rcu_read_unlock();
}

void nf_conntrack_register_notifier(struct net *net,
				    const struct nf_ct_event_notifier *new)
{
	struct nf_ct_event_notifier *notify;

	mutex_lock(&nf_ct_ecache_mutex);
	notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb,
					   lockdep_is_held(&nf_ct_ecache_mutex));
	WARN_ON_ONCE(notify);
	rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new);
	mutex_unlock(&nf_ct_ecache_mutex);
}
EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);

void nf_conntrack_unregister_notifier(struct net *net)
{
	mutex_lock(&nf_ct_ecache_mutex);
	RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL);
	mutex_unlock(&nf_ct_ecache_mutex);
	/* synchronize_rcu() is called after netns pre_exit */
}
EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);
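/* Usage sketch for the notifier API above. Illustrative only (ctnetlink
 * is the in-tree user); my_ct_event and my_notifier are hypothetical
 * names. Only one notifier may be registered per netns, hence the
 * WARN_ON_ONCE() in nf_conntrack_register_notifier(). A negative return
 * from the callback means "congested"; the event is kept in the missed
 * mask and delivery is retried.
 *
 *	static int my_ct_event(unsigned int events,
 *			       const struct nf_ct_event *item)
 *	{
 *		if (events & (1 << IPCT_DESTROY))
 *			pr_debug("ct %p destroyed\n", item->ct);
 *		return 0;
 *	}
 *
 *	static const struct nf_ct_event_notifier my_notifier = {
 *		.ct_event	= my_ct_event,
 *	};
 *
 *	nf_conntrack_register_notifier(net, &my_notifier);
 *	...
 *	nf_conntrack_unregister_notifier(net);
 */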
/* Kick or quiesce the destroy-event retry worker: on a failed delivery,
 * schedule a retry in HZ jiffies (unless one is already pending); once
 * an event has been delivered, run any pending work immediately.
 */
void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);

	if (state == NFCT_ECACHE_DESTROY_FAIL &&
	    !delayed_work_pending(&cnet->ecache_dwork)) {
		schedule_delayed_work(&cnet->ecache_dwork, HZ);
		net->ct.ecache_dwork_pending = true;
	} else if (state == NFCT_ECACHE_DESTROY_SENT) {
		net->ct.ecache_dwork_pending = false;
		mod_delayed_work(system_wq, &cnet->ecache_dwork, 0);
	}
}

/* Event delivery is enabled by default; each netns copies this value
 * into its own sysctl_events knob at init time.
 */
#define NF_CT_EVENTS_DEFAULT 1
static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;

void nf_conntrack_ecache_pernet_init(struct net *net)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);

	net->ct.sysctl_events = nf_ct_events;
	cnet->ct_net = &net->ct;
	INIT_DELAYED_WORK(&cnet->ecache_dwork, ecache_work);

	BUILD_BUG_ON(__IPCT_MAX >= 16);	/* e->ctmask is u16 */
}

void nf_conntrack_ecache_pernet_fini(struct net *net)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);

	cancel_delayed_work_sync(&cnet->ecache_dwork);
}
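/* Note on the retry flow (a sketch based on the nf_conntrack_core.c call
 * sites): when nf_ct_delete() fails to deliver IPCT_DESTROY, it calls
 * nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_FAIL) so that the
 * delayed worker above re-walks the per-cpu dying lists; after a
 * successful delivery it passes NFCT_ECACHE_DESTROY_SENT, which lets any
 * pending work run immediately and drop the remaining ct references.
 */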