// SPDX-License-Identifier: GPL-2.0
/*
 * IPVS         An implementation of the IP virtual server support for the
 *              LINUX operating system.  IPVS is now implemented as a module
 *              over the NetFilter framework. IPVS can be used to build a
 *              high-performance and highly available server based on a
 *              cluster of servers.
 *
 * Version 1,   is capable of handling both version 0 and 1 messages.
 *              Version 0 is the plain old format.
 *              Note Version 0 receivers will just drop Ver 1 messages.
 *              Version 1 is capable of handling IPv6, Persistence data,
 *              time-outs, and firewall marks.
 *              In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order.
 *              Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
 *
 * Definitions  Message: is a complete datagram
 *              Sync_conn: is a part of a Message
 *              Param Data is an option to a Sync_conn.
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *
 * ip_vs_sync:  sync connection info from master load balancer to backups
 *              through multicast
 *
 * Changes:
 *	Alexandre Cassen	:	Added master & backup support at a time.
 *	Alexandre Cassen	:	Added SyncID support for incoming sync
 *					messages filtering.
 *	Justin Ossevoort	:	Fix endian problem on sync message size.
 *	Hans Schillstrom	:	Added Version 1: i.e. IPv6,
 *					Persistence support, fwmark and time-out.
 */

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/inetdevice.h>
#include <linux/net.h>
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/igmp.h>                 /* for ip_mc_join_group */
#include <linux/udp.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/kernel.h>

#include <asm/unaligned.h>		/* Used for ntoh_seq and hton_seq */

#include <net/ip.h>
#include <net/sock.h>

#include <net/ip_vs.h>

#define IP_VS_SYNC_GROUP 0xe0000051    /* multicast addr - 224.0.0.81 */
#define IP_VS_SYNC_PORT  8848          /* multicast port */

#define SYNC_PROTO_VER  1		/* Protocol version in header */

static struct lock_class_key __ipvs_sync_key;
/*
 *	IPVS sync connection entry
 *	Version 0, i.e. original version.
 */
struct ip_vs_sync_conn_v0 {
	__u8			reserved;

	/* Protocol, addresses and port numbers */
	__u8			protocol;       /* Which protocol (TCP/UDP) */
	__be16			cport;
	__be16                  vport;
	__be16                  dport;
	__be32                  caddr;          /* client address */
	__be32                  vaddr;          /* virtual address */
	__be32                  daddr;          /* destination address */

	/* Flags and state transition */
	__be16                  flags;          /* status flags */
	__be16                  state;          /* state info */

	/* The sequence options start here */
};

/*
 * Sequence data that may follow a sync connection entry.
 * Version 0 appends it directly after struct ip_vs_sync_conn_v0 (see
 * FULL_CONN_SIZE); Version 1 carries it as an IPVS_OPT_SEQ_DATA option.
 */
struct ip_vs_sync_conn_options {
	struct ip_vs_seq        in_seq;         /* incoming seq. struct */
	struct ip_vs_seq        out_seq;        /* outgoing seq. struct */
};

/*
  Sync Connection format (sync_conn)

       0                   1                   2                   3
       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |    Type       |    Protocol   | Ver.  |        Size           |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                             Flags                             |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |            State              |         cport                 |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |            vport              |         dport                 |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                             fwmark                            |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                             timeout (in sec.)                 |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                              ...                              |
      |                    IP-Addresses (v4 or v6)                    |
      |                              ...                              |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  Optional Parameters.
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      | Param. Type   | Param. Length |   Param. data                 |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+                               |
      |                              ...                              |
      |                               +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                               | Param Type    | Param. Length |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                           Param data                          |
      |         Last Param data should be padded for 32 bit alignment |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*/

/*
 *  Type 0, IPv4 sync connection format
 */
struct ip_vs_sync_v4 {
	__u8			type;
	__u8			protocol;	/* Which protocol (TCP/UDP) */
	__be16			ver_size;	/* Version msb 4 bits */
	/* Flags and state transition */
	__be32			flags;		/* status flags */
	__be16			state;		/* state info 	*/
	/* Protocol, addresses and port numbers */
	__be16			cport;
	__be16			vport;
	__be16			dport;
	__be32			fwmark;		/* Firewall mark from skb */
	__be32			timeout;	/* cp timeout */
	__be32			caddr;		/* client address */
	__be32			vaddr;		/* virtual address */
	__be32			daddr;		/* destination address */
	/* The sequence options start here */
	/* PE data padded to 32bit alignment after seq. options */
};
/*
 * Type 2 messages IPv6
 */
struct ip_vs_sync_v6 {
	__u8			type;
	__u8			protocol;	/* Which protocol (TCP/UDP) */
	__be16			ver_size;	/* Version msb 4 bits */
	/* Flags and state transition */
	__be32			flags;		/* status flags */
	__be16			state;		/* state info 	*/
	/* Protocol, addresses and port numbers */
	__be16			cport;
	__be16			vport;
	__be16			dport;
	__be32			fwmark;		/* Firewall mark from skb */
	__be32			timeout;	/* cp timeout */
	struct in6_addr		caddr;		/* client address */
	struct in6_addr		vaddr;		/* virtual address */
	struct in6_addr		daddr;		/* destination address */
	/* The sequence options start here */
	/* PE data padded to 32bit alignment after seq. options */
};

/*
 * One Version 1 sync connection entry, viewed as IPv4 or IPv6.
 * Both layouts share the same leading fields up to and including timeout,
 * so those can be accessed through either member.
 */
union ip_vs_sync_conn {
	struct ip_vs_sync_v4	v4;
	struct ip_vs_sync_v6	v6;
};

/* Bits in Type field in above */
#define STYPE_INET6		0
#define STYPE_F_INET6		(1 << STYPE_INET6)

#define SVER_SHIFT		12		/* Shift to get version */
#define SVER_MASK		0x0fff		/* Mask to strip version */

/* Option (parameter) type codes used in Version 1 sync connections */
#define IPVS_OPT_SEQ_DATA	1
#define IPVS_OPT_PE_DATA	2
#define IPVS_OPT_PE_NAME	3
#define IPVS_OPT_PARAM		7

/* Bit flags derived from the option type codes above: 1 << (code - 1) */
#define IPVS_OPT_F_SEQ_DATA	(1 << (IPVS_OPT_SEQ_DATA-1))
#define IPVS_OPT_F_PE_DATA	(1 << (IPVS_OPT_PE_DATA-1))
#define IPVS_OPT_F_PE_NAME	(1 << (IPVS_OPT_PE_NAME-1))
#define IPVS_OPT_F_PARAM	(1 << (IPVS_OPT_PARAM-1))

struct ip_vs_sync_thread_data {
	struct netns_ipvs *ipvs;
	struct socket *sock;
	char *buf;
	int id;
};

/* Version 0 definition of packet sizes */
#define SIMPLE_CONN_SIZE  (sizeof(struct ip_vs_sync_conn_v0))
#define FULL_CONN_SIZE  \
(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))


/*
  The master multicasts messages (Datagrams) to the backup load balancers
  in the following format.

 Version 1:
  Note, first byte should be Zero, so ver 0 receivers will drop the packet.

       0                   1                   2                   3
       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |      0        |    SyncID     |            Size               |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |  Count Conns  |    Version    |    Reserved, set to Zero      |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                                                               |
      |                    IPVS Sync Connection (1)                   |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                            .                                  |
      ~                            .                                  ~
      |                            .                                  |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                                                               |
      |                    IPVS Sync Connection (n)                   |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+

 Version 0 Header
       0                   1                   2                   3
       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |  Count Conns  |    SyncID     |            Size               |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                    IPVS Sync Connection (1)                   |
*/

#define SYNC_MESG_HEADER_LEN	4
#define MAX_CONNS_PER_SYNCBUFF	255 /* nr_conns in ip_vs_sync_mesg is 8 bit */

/* Version 0 header */
struct ip_vs_sync_mesg_v0 {
	__u8                    nr_conns;
	__u8                    syncid;
	__be16                  size;

	/* ip_vs_sync_conn entries start here */
};

/* Version 1 header */
struct ip_vs_sync_mesg {
	__u8			reserved;	/* must be zero */
	__u8			syncid;
	__be16			size;
	__u8			nr_conns;
	__s8			version;	/* SYNC_PROTO_VER  */
	__u16			spare;
	/* ip_vs_sync_conn entries start here */
};

union ipvs_sockaddr {
	struct sockaddr_in	in;
	struct sockaddr_in6	in6;
};

/*
 * One outgoing sync message under construction or queued for sending.
 * mesg points at the message header; head is where the next sync
 * connection is written and end marks the end of the allocated buffer.
 */
struct ip_vs_sync_buff {
	struct list_head        list;
	unsigned long           firstuse;	/* jiffies at creation */

	/* pointers for the message data */
	struct ip_vs_sync_mesg  *mesg;
	unsigned char           *head;
	unsigned char           *end;
};

/*
 *
Copy of struct ip_vs_seq 283 * From unaligned network order to aligned host order 284 */ 285 static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho) 286 { 287 memset(ho, 0, sizeof(*ho)); 288 ho->init_seq = get_unaligned_be32(&no->init_seq); 289 ho->delta = get_unaligned_be32(&no->delta); 290 ho->previous_delta = get_unaligned_be32(&no->previous_delta); 291 } 292 293 /* 294 * Copy of struct ip_vs_seq 295 * From Aligned host order to unaligned network order 296 */ 297 static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no) 298 { 299 put_unaligned_be32(ho->init_seq, &no->init_seq); 300 put_unaligned_be32(ho->delta, &no->delta); 301 put_unaligned_be32(ho->previous_delta, &no->previous_delta); 302 } 303 304 static inline struct ip_vs_sync_buff * 305 sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) 306 { 307 struct ip_vs_sync_buff *sb; 308 309 spin_lock_bh(&ipvs->sync_lock); 310 if (list_empty(&ms->sync_queue)) { 311 sb = NULL; 312 __set_current_state(TASK_INTERRUPTIBLE); 313 } else { 314 sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff, 315 list); 316 list_del(&sb->list); 317 ms->sync_queue_len--; 318 if (!ms->sync_queue_len) 319 ms->sync_queue_delay = 0; 320 } 321 spin_unlock_bh(&ipvs->sync_lock); 322 323 return sb; 324 } 325 326 /* 327 * Create a new sync buffer for Version 1 proto. 328 */ 329 static inline struct ip_vs_sync_buff * 330 ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len) 331 { 332 struct ip_vs_sync_buff *sb; 333 334 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) 335 return NULL; 336 337 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg), 338 ipvs->mcfg.sync_maxlen); 339 sb->mesg = kmalloc(len, GFP_ATOMIC); 340 if (!sb->mesg) { 341 kfree(sb); 342 return NULL; 343 } 344 sb->mesg->reserved = 0; /* old nr_conns i.e. 
must be zero now */ 345 sb->mesg->version = SYNC_PROTO_VER; 346 sb->mesg->syncid = ipvs->mcfg.syncid; 347 sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg)); 348 sb->mesg->nr_conns = 0; 349 sb->mesg->spare = 0; 350 sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); 351 sb->end = (unsigned char *)sb->mesg + len; 352 353 sb->firstuse = jiffies; 354 return sb; 355 } 356 357 static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) 358 { 359 kfree(sb->mesg); 360 kfree(sb); 361 } 362 363 static inline void sb_queue_tail(struct netns_ipvs *ipvs, 364 struct ipvs_master_sync_state *ms) 365 { 366 struct ip_vs_sync_buff *sb = ms->sync_buff; 367 368 spin_lock(&ipvs->sync_lock); 369 if (ipvs->sync_state & IP_VS_STATE_MASTER && 370 ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) { 371 if (!ms->sync_queue_len) 372 schedule_delayed_work(&ms->master_wakeup_work, 373 max(IPVS_SYNC_SEND_DELAY, 1)); 374 ms->sync_queue_len++; 375 list_add_tail(&sb->list, &ms->sync_queue); 376 if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE) 377 wake_up_process(ms->master_thread); 378 } else 379 ip_vs_sync_buff_release(sb); 380 spin_unlock(&ipvs->sync_lock); 381 } 382 383 /* 384 * Get the current sync buffer if it has been created for more 385 * than the specified time or the specified time is zero. 
386 */ 387 static inline struct ip_vs_sync_buff * 388 get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms, 389 unsigned long time) 390 { 391 struct ip_vs_sync_buff *sb; 392 393 spin_lock_bh(&ipvs->sync_buff_lock); 394 sb = ms->sync_buff; 395 if (sb && time_after_eq(jiffies - sb->firstuse, time)) { 396 ms->sync_buff = NULL; 397 __set_current_state(TASK_RUNNING); 398 } else 399 sb = NULL; 400 spin_unlock_bh(&ipvs->sync_buff_lock); 401 return sb; 402 } 403 404 static inline int 405 select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp) 406 { 407 return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask; 408 } 409 410 /* 411 * Create a new sync buffer for Version 0 proto. 412 */ 413 static inline struct ip_vs_sync_buff * 414 ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len) 415 { 416 struct ip_vs_sync_buff *sb; 417 struct ip_vs_sync_mesg_v0 *mesg; 418 419 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) 420 return NULL; 421 422 len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0), 423 ipvs->mcfg.sync_maxlen); 424 sb->mesg = kmalloc(len, GFP_ATOMIC); 425 if (!sb->mesg) { 426 kfree(sb); 427 return NULL; 428 } 429 mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; 430 mesg->nr_conns = 0; 431 mesg->syncid = ipvs->mcfg.syncid; 432 mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0)); 433 sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); 434 sb->end = (unsigned char *)mesg + len; 435 sb->firstuse = jiffies; 436 return sb; 437 } 438 439 /* Check if connection is controlled by persistence */ 440 static inline bool in_persistence(struct ip_vs_conn *cp) 441 { 442 for (cp = cp->control; cp; cp = cp->control) { 443 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 444 return true; 445 } 446 return false; 447 } 448 449 /* Check if conn should be synced. 450 * pkts: conn packets, use sysctl_sync_threshold to avoid packet check 451 * - (1) sync_refresh_period: reduce sync rate. 
Additionally, retry
 *	sync_retries times with period of sync_refresh_period/8
 * - (2) if both sync_refresh_period and sync_period are 0 send sync only
 *	for state changes or only once when pkts matches sync_threshold
 * - (3) templates: rate can be reduced only with sync_refresh_period or
 *	with (2)
 *
 * cp->sync_endtime packs two values: the two low bits hold the number of
 * retries already done, the remaining bits hold jiffies + timeout at the
 * time of the last accepted sync.
 *
 * Returns non-zero if the connection should be synced now, 0 otherwise.
 */
static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs,
				  struct ip_vs_conn *cp, int pkts)
{
	unsigned long orig = ACCESS_ONCE(cp->sync_endtime);
	unsigned long now = jiffies;
	/* New end time with the retry counter (low 2 bits) cleared */
	unsigned long n = (now + cp->timeout) & ~3UL;
	unsigned int sync_refresh_period;
	int sync_period;
	int force;

	/* Check if we sync in current state */
	if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE))
		force = 0;
	else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp)))
		return 0;
	else if (likely(cp->protocol == IPPROTO_TCP)) {
		/* Only the TCP states listed below are synced */
		if (!((1 << cp->state) &
		      ((1 << IP_VS_TCP_S_ESTABLISHED) |
		       (1 << IP_VS_TCP_S_FIN_WAIT) |
		       (1 << IP_VS_TCP_S_CLOSE) |
		       (1 << IP_VS_TCP_S_CLOSE_WAIT) |
		       (1 << IP_VS_TCP_S_TIME_WAIT))))
			return 0;
		/* A state change forces a sync regardless of the cmpxchg result */
		force = cp->state != cp->old_state;
		if (force && cp->state != IP_VS_TCP_S_ESTABLISHED)
			goto set;
	} else if (unlikely(cp->protocol == IPPROTO_SCTP)) {
		/* Only the SCTP states listed below are synced */
		if (!((1 << cp->state) &
		      ((1 << IP_VS_SCTP_S_ESTABLISHED) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) |
		       (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) |
		       (1 << IP_VS_SCTP_S_CLOSED))))
			return 0;
		force = cp->state != cp->old_state;
		if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED)
			goto set;
	} else {
		/* UDP or another protocol with single state */
		force = 0;
	}

	sync_refresh_period = sysctl_sync_refresh_period(ipvs);
	if (sync_refresh_period > 0) {
		long diff = n - orig;
		long min_diff = max(cp->timeout >> 1, 10UL * HZ);

		/* Avoid sync if difference is below sync_refresh_period
		 * and below the half timeout.
		 */
		if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) {
			/* Retry counter stored in the low bits of sync_endtime */
			int retries = orig & 3;

			if (retries >= sysctl_sync_retries(ipvs))
				return 0;
			if (time_before(now, orig - cp->timeout +
					(sync_refresh_period >> 3)))
				return 0;
			/* Record one more retry in the new value */
			n |= retries + 1;
		}
	}
	sync_period = sysctl_sync_period(ipvs);
	if (sync_period > 0) {
		if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) &&
		    pkts % sync_period != sysctl_sync_threshold(ipvs))
			return 0;
	} else if (!sync_refresh_period &&
		   pkts != sysctl_sync_threshold(ipvs))
		return 0;

set:
	cp->old_state = cp->state;
	/* Publish the new end time only if nobody changed it meanwhile;
	 * cmpxchg returns the previous value, so n == orig means we won.
	 */
	n = cmpxchg(&cp->sync_endtime, orig, n);
	return n == orig || force;
}

/*
 *      Version 0 , could be switched in by sys_ctl.
 *      Add an ip_vs_conn information into the current sync_buff.
 *
 *      Only AF_INET connections without IP_VS_CONN_F_ONE_PACKET are
 *      handled.  After the entry is written, the controlling connection
 *      (cp->control), if any, is passed to ip_vs_sync_conn().
 */
static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp,
			       int pkts)
{
	struct ip_vs_sync_mesg_v0 *m;
	struct ip_vs_sync_conn_v0 *s;
	struct ip_vs_sync_buff *buff;
	struct ipvs_master_sync_state *ms;
	int id;
	unsigned int len;

	if (unlikely(cp->af != AF_INET))
		return;
	/* Do not sync ONE PACKET */
	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
		return;

	if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
		return;

	spin_lock_bh(&ipvs->sync_buff_lock);
	if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
		spin_unlock_bh(&ipvs->sync_buff_lock);
		return;
	}

	id = select_master_thread_id(ipvs, cp);
	ms = &ipvs->ms[id];
	buff = ms->sync_buff;
	/* Entries with sequence data carry the options right after the conn */
	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
		SIMPLE_CONN_SIZE;
	if (buff) {
		m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
		/* Send buffer if it is for v1 */
		if (buff->head + len > buff->end || !m->nr_conns) {
			sb_queue_tail(ipvs, ms);
			ms->sync_buff = NULL;
			buff = NULL;
		}
	}
	if (!buff) {
		buff = ip_vs_sync_buff_create_v0(ipvs, len);
		if (!buff) {
			spin_unlock_bh(&ipvs->sync_buff_lock);
			pr_err("ip_vs_sync_buff_create failed.\n");
			return;
		}
		ms->sync_buff = buff;
	}

	m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
	s = (struct ip_vs_sync_conn_v0 *) buff->head;

	/* copy members */
	s->reserved = 0;
	s->protocol = cp->protocol;
	s->cport = cp->cport;
	s->vport = cp->vport;
	s->dport = cp->dport;
	s->caddr = cp->caddr.ip;
	s->vaddr = cp->vaddr.ip;
	s->daddr = cp->daddr.ip;
	s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
	s->state = htons(cp->state);
	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
		struct ip_vs_sync_conn_options *opt =
			(struct ip_vs_sync_conn_options *)&s[1];
		/* Copies sizeof(*opt) bytes starting at cp->in_seq without
		 * byte-order conversion.
		 * NOTE(review): presumably relies on out_seq directly
		 * following in_seq in struct ip_vs_conn - confirm.
		 */
		memcpy(opt, &cp->in_seq, sizeof(*opt));
	}

	m->nr_conns++;
	m->size = htons(ntohs(m->size) + len);
	buff->head += len;
	spin_unlock_bh(&ipvs->sync_buff_lock);

	/* synchronize its controller if it has */
	cp = cp->control;
	if (cp) {
		if (cp->flags & IP_VS_CONN_F_TEMPLATE)
			pkts = atomic_add_return(1, &cp->in_pkts);
		else
			pkts = sysctl_sync_threshold(ipvs);
		ip_vs_sync_conn(ipvs, cp, pkts);
	}
}

/*
 *      Add an ip_vs_conn information into the current sync_buff.
 *      Called by ip_vs_in.
626 * Sending Version 1 messages 627 */ 628 void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts) 629 { 630 struct ip_vs_sync_mesg *m; 631 union ip_vs_sync_conn *s; 632 struct ip_vs_sync_buff *buff; 633 struct ipvs_master_sync_state *ms; 634 int id; 635 __u8 *p; 636 unsigned int len, pe_name_len, pad; 637 638 /* Handle old version of the protocol */ 639 if (sysctl_sync_ver(ipvs) == 0) { 640 ip_vs_sync_conn_v0(ipvs, cp, pkts); 641 return; 642 } 643 /* Do not sync ONE PACKET */ 644 if (cp->flags & IP_VS_CONN_F_ONE_PACKET) 645 goto control; 646 sloop: 647 if (!ip_vs_sync_conn_needed(ipvs, cp, pkts)) 648 goto control; 649 650 /* Sanity checks */ 651 pe_name_len = 0; 652 if (cp->pe_data_len) { 653 if (!cp->pe_data || !cp->dest) { 654 IP_VS_ERR_RL("SYNC, connection pe_data invalid\n"); 655 return; 656 } 657 pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN); 658 } 659 660 spin_lock_bh(&ipvs->sync_buff_lock); 661 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 662 spin_unlock_bh(&ipvs->sync_buff_lock); 663 return; 664 } 665 666 id = select_master_thread_id(ipvs, cp); 667 ms = &ipvs->ms[id]; 668 669 #ifdef CONFIG_IP_VS_IPV6 670 if (cp->af == AF_INET6) 671 len = sizeof(struct ip_vs_sync_v6); 672 else 673 #endif 674 len = sizeof(struct ip_vs_sync_v4); 675 676 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) 677 len += sizeof(struct ip_vs_sync_conn_options) + 2; 678 679 if (cp->pe_data_len) 680 len += cp->pe_data_len + 2; /* + Param hdr field */ 681 if (pe_name_len) 682 len += pe_name_len + 2; 683 684 /* check if there is a space for this one */ 685 pad = 0; 686 buff = ms->sync_buff; 687 if (buff) { 688 m = buff->mesg; 689 pad = (4 - (size_t) buff->head) & 3; 690 /* Send buffer if it is for v0 */ 691 if (buff->head + len + pad > buff->end || m->reserved) { 692 sb_queue_tail(ipvs, ms); 693 ms->sync_buff = NULL; 694 buff = NULL; 695 pad = 0; 696 } 697 } 698 699 if (!buff) { 700 buff = ip_vs_sync_buff_create(ipvs, len); 701 if (!buff) { 702 
spin_unlock_bh(&ipvs->sync_buff_lock); 703 pr_err("ip_vs_sync_buff_create failed.\n"); 704 return; 705 } 706 ms->sync_buff = buff; 707 m = buff->mesg; 708 } 709 710 p = buff->head; 711 buff->head += pad + len; 712 m->size = htons(ntohs(m->size) + pad + len); 713 /* Add ev. padding from prev. sync_conn */ 714 while (pad--) 715 *(p++) = 0; 716 717 s = (union ip_vs_sync_conn *)p; 718 719 /* Set message type & copy members */ 720 s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0); 721 s->v4.ver_size = htons(len & SVER_MASK); /* Version 0 */ 722 s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED); 723 s->v4.state = htons(cp->state); 724 s->v4.protocol = cp->protocol; 725 s->v4.cport = cp->cport; 726 s->v4.vport = cp->vport; 727 s->v4.dport = cp->dport; 728 s->v4.fwmark = htonl(cp->fwmark); 729 s->v4.timeout = htonl(cp->timeout / HZ); 730 m->nr_conns++; 731 732 #ifdef CONFIG_IP_VS_IPV6 733 if (cp->af == AF_INET6) { 734 p += sizeof(struct ip_vs_sync_v6); 735 s->v6.caddr = cp->caddr.in6; 736 s->v6.vaddr = cp->vaddr.in6; 737 s->v6.daddr = cp->daddr.in6; 738 } else 739 #endif 740 { 741 p += sizeof(struct ip_vs_sync_v4); /* options ptr */ 742 s->v4.caddr = cp->caddr.ip; 743 s->v4.vaddr = cp->vaddr.ip; 744 s->v4.daddr = cp->daddr.ip; 745 } 746 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { 747 *(p++) = IPVS_OPT_SEQ_DATA; 748 *(p++) = sizeof(struct ip_vs_sync_conn_options); 749 hton_seq((struct ip_vs_seq *)p, &cp->in_seq); 750 p += sizeof(struct ip_vs_seq); 751 hton_seq((struct ip_vs_seq *)p, &cp->out_seq); 752 p += sizeof(struct ip_vs_seq); 753 } 754 /* Handle pe data */ 755 if (cp->pe_data_len && cp->pe_data) { 756 *(p++) = IPVS_OPT_PE_DATA; 757 *(p++) = cp->pe_data_len; 758 memcpy(p, cp->pe_data, cp->pe_data_len); 759 p += cp->pe_data_len; 760 if (pe_name_len) { 761 /* Add PE_NAME */ 762 *(p++) = IPVS_OPT_PE_NAME; 763 *(p++) = pe_name_len; 764 memcpy(p, cp->pe->name, pe_name_len); 765 p += pe_name_len; 766 } 767 } 768 769 spin_unlock_bh(&ipvs->sync_buff_lock); 770 771 
control: 772 /* synchronize its controller if it has */ 773 cp = cp->control; 774 if (!cp) 775 return; 776 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 777 pkts = atomic_add_return(1, &cp->in_pkts); 778 else 779 pkts = sysctl_sync_threshold(ipvs); 780 goto sloop; 781 } 782 783 /* 784 * fill_param used by version 1 785 */ 786 static inline int 787 ip_vs_conn_fill_param_sync(struct netns_ipvs *ipvs, int af, union ip_vs_sync_conn *sc, 788 struct ip_vs_conn_param *p, 789 __u8 *pe_data, unsigned int pe_data_len, 790 __u8 *pe_name, unsigned int pe_name_len) 791 { 792 #ifdef CONFIG_IP_VS_IPV6 793 if (af == AF_INET6) 794 ip_vs_conn_fill_param(ipvs, af, sc->v6.protocol, 795 (const union nf_inet_addr *)&sc->v6.caddr, 796 sc->v6.cport, 797 (const union nf_inet_addr *)&sc->v6.vaddr, 798 sc->v6.vport, p); 799 else 800 #endif 801 ip_vs_conn_fill_param(ipvs, af, sc->v4.protocol, 802 (const union nf_inet_addr *)&sc->v4.caddr, 803 sc->v4.cport, 804 (const union nf_inet_addr *)&sc->v4.vaddr, 805 sc->v4.vport, p); 806 /* Handle pe data */ 807 if (pe_data_len) { 808 if (pe_name_len) { 809 char buff[IP_VS_PENAME_MAXLEN+1]; 810 811 memcpy(buff, pe_name, pe_name_len); 812 buff[pe_name_len]=0; 813 p->pe = __ip_vs_pe_getbyname(buff); 814 if (!p->pe) { 815 IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", 816 buff); 817 return 1; 818 } 819 } else { 820 IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n"); 821 return 1; 822 } 823 824 p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC); 825 if (!p->pe_data) { 826 module_put(p->pe->module); 827 return -ENOMEM; 828 } 829 p->pe_data_len = pe_data_len; 830 } 831 return 0; 832 } 833 834 /* 835 * Connection Add / Update. 836 * Common for version 0 and 1 reception of backup sync_conns. 837 * Param: ... 838 * timeout is in sec. 
839 */ 840 static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *param, 841 unsigned int flags, unsigned int state, 842 unsigned int protocol, unsigned int type, 843 const union nf_inet_addr *daddr, __be16 dport, 844 unsigned long timeout, __u32 fwmark, 845 struct ip_vs_sync_conn_options *opt) 846 { 847 struct ip_vs_dest *dest; 848 struct ip_vs_conn *cp; 849 850 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 851 cp = ip_vs_conn_in_get(param); 852 if (cp && ((cp->dport != dport) || 853 !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) { 854 if (!(flags & IP_VS_CONN_F_INACTIVE)) { 855 ip_vs_conn_expire_now(cp); 856 __ip_vs_conn_put(cp); 857 cp = NULL; 858 } else { 859 /* This is the expiration message for the 860 * connection that was already replaced, so we 861 * just ignore it. 862 */ 863 __ip_vs_conn_put(cp); 864 kfree(param->pe_data); 865 return; 866 } 867 } 868 } else { 869 cp = ip_vs_ct_in_get(param); 870 } 871 872 if (cp) { 873 /* Free pe_data */ 874 kfree(param->pe_data); 875 876 dest = cp->dest; 877 spin_lock_bh(&cp->lock); 878 if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE && 879 !(flags & IP_VS_CONN_F_TEMPLATE) && dest) { 880 if (flags & IP_VS_CONN_F_INACTIVE) { 881 atomic_dec(&dest->activeconns); 882 atomic_inc(&dest->inactconns); 883 } else { 884 atomic_inc(&dest->activeconns); 885 atomic_dec(&dest->inactconns); 886 } 887 } 888 flags &= IP_VS_CONN_F_BACKUP_UPD_MASK; 889 flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK; 890 cp->flags = flags; 891 spin_unlock_bh(&cp->lock); 892 if (!dest) 893 ip_vs_try_bind_dest(cp); 894 } else { 895 /* 896 * Find the appropriate destination for the connection. 897 * If it is not found the connection will remain unbound 898 * but still handled. 899 */ 900 rcu_read_lock(); 901 /* This function is only invoked by the synchronization 902 * code. 
We do not currently support heterogeneous pools 903 * with synchronization, so we can make the assumption that 904 * the svc_af is the same as the dest_af 905 */ 906 dest = ip_vs_find_dest(ipvs, type, type, daddr, dport, 907 param->vaddr, param->vport, protocol, 908 fwmark, flags); 909 910 cp = ip_vs_conn_new(param, type, daddr, dport, flags, dest, 911 fwmark); 912 rcu_read_unlock(); 913 if (!cp) { 914 kfree(param->pe_data); 915 IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); 916 return; 917 } 918 if (!(flags & IP_VS_CONN_F_TEMPLATE)) 919 kfree(param->pe_data); 920 } 921 922 if (opt) { 923 cp->in_seq = opt->in_seq; 924 cp->out_seq = opt->out_seq; 925 } 926 atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs)); 927 cp->state = state; 928 cp->old_state = cp->state; 929 /* 930 * For Ver 0 messages style 931 * - Not possible to recover the right timeout for templates 932 * - can not find the right fwmark 933 * virtual service. If needed, we can do it for 934 * non-fwmark persistent services. 935 * Ver 1 messages style. 936 * - No problem. 
937 */ 938 if (timeout) { 939 if (timeout > MAX_SCHEDULE_TIMEOUT / HZ) 940 timeout = MAX_SCHEDULE_TIMEOUT / HZ; 941 cp->timeout = timeout*HZ; 942 } else { 943 struct ip_vs_proto_data *pd; 944 945 pd = ip_vs_proto_data_get(ipvs, protocol); 946 if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table) 947 cp->timeout = pd->timeout_table[state]; 948 else 949 cp->timeout = (3*60*HZ); 950 } 951 ip_vs_conn_put(cp); 952 } 953 954 /* 955 * Process received multicast message for Version 0 956 */ 957 static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer, 958 const size_t buflen) 959 { 960 struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer; 961 struct ip_vs_sync_conn_v0 *s; 962 struct ip_vs_sync_conn_options *opt; 963 struct ip_vs_protocol *pp; 964 struct ip_vs_conn_param param; 965 char *p; 966 int i; 967 968 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0); 969 for (i=0; i<m->nr_conns; i++) { 970 unsigned int flags, state; 971 972 if (p + SIMPLE_CONN_SIZE > buffer+buflen) { 973 IP_VS_ERR_RL("BACKUP v0, bogus conn\n"); 974 return; 975 } 976 s = (struct ip_vs_sync_conn_v0 *) p; 977 flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; 978 flags &= ~IP_VS_CONN_F_HASHED; 979 if (flags & IP_VS_CONN_F_SEQ_MASK) { 980 opt = (struct ip_vs_sync_conn_options *)&s[1]; 981 p += FULL_CONN_SIZE; 982 if (p > buffer+buflen) { 983 IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n"); 984 return; 985 } 986 } else { 987 opt = NULL; 988 p += SIMPLE_CONN_SIZE; 989 } 990 991 state = ntohs(s->state); 992 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 993 pp = ip_vs_proto_get(s->protocol); 994 if (!pp) { 995 IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n", 996 s->protocol); 997 continue; 998 } 999 if (state >= pp->num_states) { 1000 IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n", 1001 pp->name, state); 1002 continue; 1003 } 1004 } else { 1005 /* protocol in templates is not used for state/timeout */ 1006 if (state > 0) { 1007 IP_VS_DBG(2, 
"BACKUP v0, Invalid template state %u\n", 1008 state); 1009 state = 0; 1010 } 1011 } 1012 1013 ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol, 1014 (const union nf_inet_addr *)&s->caddr, 1015 s->cport, 1016 (const union nf_inet_addr *)&s->vaddr, 1017 s->vport, ¶m); 1018 1019 /* Send timeout as Zero */ 1020 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->protocol, AF_INET, 1021 (union nf_inet_addr *)&s->daddr, s->dport, 1022 0, 0, opt); 1023 } 1024 } 1025 1026 /* 1027 * Handle options 1028 */ 1029 static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen, 1030 __u32 *opt_flags, 1031 struct ip_vs_sync_conn_options *opt) 1032 { 1033 struct ip_vs_sync_conn_options *topt; 1034 1035 topt = (struct ip_vs_sync_conn_options *)p; 1036 1037 if (plen != sizeof(struct ip_vs_sync_conn_options)) { 1038 IP_VS_DBG(2, "BACKUP, bogus conn options length\n"); 1039 return -EINVAL; 1040 } 1041 if (*opt_flags & IPVS_OPT_F_SEQ_DATA) { 1042 IP_VS_DBG(2, "BACKUP, conn options found twice\n"); 1043 return -EINVAL; 1044 } 1045 ntoh_seq(&topt->in_seq, &opt->in_seq); 1046 ntoh_seq(&topt->out_seq, &opt->out_seq); 1047 *opt_flags |= IPVS_OPT_F_SEQ_DATA; 1048 return 0; 1049 } 1050 1051 static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len, 1052 __u8 **data, unsigned int maxlen, 1053 __u32 *opt_flags, __u32 flag) 1054 { 1055 if (plen > maxlen) { 1056 IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen); 1057 return -EINVAL; 1058 } 1059 if (*opt_flags & flag) { 1060 IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag); 1061 return -EINVAL; 1062 } 1063 *data_len = plen; 1064 *data = p; 1065 *opt_flags |= flag; 1066 return 0; 1067 } 1068 /* 1069 * Process a Version 1 sync. 
connection 1070 */ 1071 static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *msg_end) 1072 { 1073 struct ip_vs_sync_conn_options opt; 1074 union ip_vs_sync_conn *s; 1075 struct ip_vs_protocol *pp; 1076 struct ip_vs_conn_param param; 1077 __u32 flags; 1078 unsigned int af, state, pe_data_len=0, pe_name_len=0; 1079 __u8 *pe_data=NULL, *pe_name=NULL; 1080 __u32 opt_flags=0; 1081 int retc=0; 1082 1083 s = (union ip_vs_sync_conn *) p; 1084 1085 if (s->v6.type & STYPE_F_INET6) { 1086 #ifdef CONFIG_IP_VS_IPV6 1087 af = AF_INET6; 1088 p += sizeof(struct ip_vs_sync_v6); 1089 #else 1090 IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n"); 1091 retc = 10; 1092 goto out; 1093 #endif 1094 } else if (!s->v4.type) { 1095 af = AF_INET; 1096 p += sizeof(struct ip_vs_sync_v4); 1097 } else { 1098 return -10; 1099 } 1100 if (p > msg_end) 1101 return -20; 1102 1103 /* Process optional params check Type & Len. */ 1104 while (p < msg_end) { 1105 int ptype; 1106 int plen; 1107 1108 if (p+2 > msg_end) 1109 return -30; 1110 ptype = *(p++); 1111 plen = *(p++); 1112 1113 if (!plen || ((p + plen) > msg_end)) 1114 return -40; 1115 /* Handle seq option p = param data */ 1116 switch (ptype & ~IPVS_OPT_F_PARAM) { 1117 case IPVS_OPT_SEQ_DATA: 1118 if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt)) 1119 return -50; 1120 break; 1121 1122 case IPVS_OPT_PE_DATA: 1123 if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data, 1124 IP_VS_PEDATA_MAXLEN, &opt_flags, 1125 IPVS_OPT_F_PE_DATA)) 1126 return -60; 1127 break; 1128 1129 case IPVS_OPT_PE_NAME: 1130 if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name, 1131 IP_VS_PENAME_MAXLEN, &opt_flags, 1132 IPVS_OPT_F_PE_NAME)) 1133 return -70; 1134 break; 1135 1136 default: 1137 /* Param data mandatory ? 
*/ 1138 if (!(ptype & IPVS_OPT_F_PARAM)) { 1139 IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n", 1140 ptype & ~IPVS_OPT_F_PARAM); 1141 retc = 20; 1142 goto out; 1143 } 1144 } 1145 p += plen; /* Next option */ 1146 } 1147 1148 /* Get flags and Mask off unsupported */ 1149 flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK; 1150 flags |= IP_VS_CONN_F_SYNC; 1151 state = ntohs(s->v4.state); 1152 1153 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 1154 pp = ip_vs_proto_get(s->v4.protocol); 1155 if (!pp) { 1156 IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n", 1157 s->v4.protocol); 1158 retc = 30; 1159 goto out; 1160 } 1161 if (state >= pp->num_states) { 1162 IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n", 1163 pp->name, state); 1164 retc = 40; 1165 goto out; 1166 } 1167 } else { 1168 /* protocol in templates is not used for state/timeout */ 1169 if (state > 0) { 1170 IP_VS_DBG(3, "BACKUP, Invalid template state %u\n", 1171 state); 1172 state = 0; 1173 } 1174 } 1175 if (ip_vs_conn_fill_param_sync(ipvs, af, s, ¶m, pe_data, 1176 pe_data_len, pe_name, pe_name_len)) { 1177 retc = 50; 1178 goto out; 1179 } 1180 /* If only IPv4, just silent skip IPv6 */ 1181 if (af == AF_INET) 1182 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v4.protocol, af, 1183 (union nf_inet_addr *)&s->v4.daddr, s->v4.dport, 1184 ntohl(s->v4.timeout), ntohl(s->v4.fwmark), 1185 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) 1186 ); 1187 #ifdef CONFIG_IP_VS_IPV6 1188 else 1189 ip_vs_proc_conn(ipvs, ¶m, flags, state, s->v6.protocol, af, 1190 (union nf_inet_addr *)&s->v6.daddr, s->v6.dport, 1191 ntohl(s->v6.timeout), ntohl(s->v6.fwmark), 1192 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) 1193 ); 1194 #endif 1195 ip_vs_pe_put(param.pe); 1196 return 0; 1197 /* Error exit */ 1198 out: 1199 IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc); 1200 return retc; 1201 1202 } 1203 /* 1204 * Process received multicast message and create the corresponding 1205 * ip_vs_conn entries. 
1206 * Handles Version 0 & 1 1207 */ 1208 static void ip_vs_process_message(struct netns_ipvs *ipvs, __u8 *buffer, 1209 const size_t buflen) 1210 { 1211 struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer; 1212 __u8 *p, *msg_end; 1213 int i, nr_conns; 1214 1215 if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) { 1216 IP_VS_DBG(2, "BACKUP, message header too short\n"); 1217 return; 1218 } 1219 1220 if (buflen != ntohs(m2->size)) { 1221 IP_VS_DBG(2, "BACKUP, bogus message size\n"); 1222 return; 1223 } 1224 /* SyncID sanity check */ 1225 if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) { 1226 IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid); 1227 return; 1228 } 1229 /* Handle version 1 message */ 1230 if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0) 1231 && (m2->spare == 0)) { 1232 1233 msg_end = buffer + sizeof(struct ip_vs_sync_mesg); 1234 nr_conns = m2->nr_conns; 1235 1236 for (i=0; i<nr_conns; i++) { 1237 union ip_vs_sync_conn *s; 1238 unsigned int size; 1239 int retc; 1240 1241 p = msg_end; 1242 if (p + sizeof(s->v4) > buffer+buflen) { 1243 IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n"); 1244 return; 1245 } 1246 s = (union ip_vs_sync_conn *)p; 1247 size = ntohs(s->v4.ver_size) & SVER_MASK; 1248 msg_end = p + size; 1249 /* Basic sanity checks */ 1250 if (msg_end > buffer+buflen) { 1251 IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n"); 1252 return; 1253 } 1254 if (ntohs(s->v4.ver_size) >> SVER_SHIFT) { 1255 IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n", 1256 ntohs(s->v4.ver_size) >> SVER_SHIFT); 1257 return; 1258 } 1259 /* Process a single sync_conn */ 1260 retc = ip_vs_proc_sync_conn(ipvs, p, msg_end); 1261 if (retc < 0) { 1262 IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n", 1263 retc); 1264 return; 1265 } 1266 /* Make sure we have 32 bit alignment */ 1267 msg_end = p + ((size + 3) & ~3); 1268 } 1269 } else { 1270 /* Old type of message */ 1271 ip_vs_process_message_v0(ipvs, 
buffer, buflen); 1272 return; 1273 } 1274 } 1275 1276 1277 /* 1278 * Setup sndbuf (mode=1) or rcvbuf (mode=0) 1279 */ 1280 static void set_sock_size(struct sock *sk, int mode, int val) 1281 { 1282 /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */ 1283 /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */ 1284 lock_sock(sk); 1285 if (mode) { 1286 val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2, 1287 sysctl_wmem_max); 1288 sk->sk_sndbuf = val * 2; 1289 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 1290 } else { 1291 val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2, 1292 sysctl_rmem_max); 1293 sk->sk_rcvbuf = val * 2; 1294 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 1295 } 1296 release_sock(sk); 1297 } 1298 1299 /* 1300 * Setup loopback of outgoing multicasts on a sending socket 1301 */ 1302 static void set_mcast_loop(struct sock *sk, u_char loop) 1303 { 1304 struct inet_sock *inet = inet_sk(sk); 1305 1306 /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ 1307 lock_sock(sk); 1308 inet->mc_loop = loop ? 1 : 0; 1309 #ifdef CONFIG_IP_VS_IPV6 1310 if (sk->sk_family == AF_INET6) { 1311 struct ipv6_pinfo *np = inet6_sk(sk); 1312 1313 /* IPV6_MULTICAST_LOOP */ 1314 np->mc_loop = loop ? 
1 : 0; 1315 } 1316 #endif 1317 release_sock(sk); 1318 } 1319 1320 /* 1321 * Specify TTL for outgoing multicasts on a sending socket 1322 */ 1323 static void set_mcast_ttl(struct sock *sk, u_char ttl) 1324 { 1325 struct inet_sock *inet = inet_sk(sk); 1326 1327 /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ 1328 lock_sock(sk); 1329 inet->mc_ttl = ttl; 1330 #ifdef CONFIG_IP_VS_IPV6 1331 if (sk->sk_family == AF_INET6) { 1332 struct ipv6_pinfo *np = inet6_sk(sk); 1333 1334 /* IPV6_MULTICAST_HOPS */ 1335 np->mcast_hops = ttl; 1336 } 1337 #endif 1338 release_sock(sk); 1339 } 1340 1341 /* Control fragmentation of messages */ 1342 static void set_mcast_pmtudisc(struct sock *sk, int val) 1343 { 1344 struct inet_sock *inet = inet_sk(sk); 1345 1346 /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */ 1347 lock_sock(sk); 1348 inet->pmtudisc = val; 1349 #ifdef CONFIG_IP_VS_IPV6 1350 if (sk->sk_family == AF_INET6) { 1351 struct ipv6_pinfo *np = inet6_sk(sk); 1352 1353 /* IPV6_MTU_DISCOVER */ 1354 np->pmtudisc = val; 1355 } 1356 #endif 1357 release_sock(sk); 1358 } 1359 1360 /* 1361 * Specifiy default interface for outgoing multicasts 1362 */ 1363 static int set_mcast_if(struct sock *sk, char *ifname) 1364 { 1365 struct net_device *dev; 1366 struct inet_sock *inet = inet_sk(sk); 1367 struct net *net = sock_net(sk); 1368 1369 dev = __dev_get_by_name(net, ifname); 1370 if (!dev) 1371 return -ENODEV; 1372 1373 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1374 return -EINVAL; 1375 1376 lock_sock(sk); 1377 inet->mc_index = dev->ifindex; 1378 /* inet->mc_addr = 0; */ 1379 #ifdef CONFIG_IP_VS_IPV6 1380 if (sk->sk_family == AF_INET6) { 1381 struct ipv6_pinfo *np = inet6_sk(sk); 1382 1383 /* IPV6_MULTICAST_IF */ 1384 np->mcast_oif = dev->ifindex; 1385 } 1386 #endif 1387 release_sock(sk); 1388 1389 return 0; 1390 } 1391 1392 1393 /* 1394 * Join a multicast group. 
1395 * the group is specified by a class D multicast address 224.0.0.0/8 1396 * in the in_addr structure passed in as a parameter. 1397 */ 1398 static int 1399 join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) 1400 { 1401 struct net *net = sock_net(sk); 1402 struct ip_mreqn mreq; 1403 struct net_device *dev; 1404 int ret; 1405 1406 memset(&mreq, 0, sizeof(mreq)); 1407 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); 1408 1409 dev = __dev_get_by_name(net, ifname); 1410 if (!dev) 1411 return -ENODEV; 1412 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1413 return -EINVAL; 1414 1415 mreq.imr_ifindex = dev->ifindex; 1416 1417 lock_sock(sk); 1418 ret = ip_mc_join_group(sk, &mreq); 1419 release_sock(sk); 1420 1421 return ret; 1422 } 1423 1424 #ifdef CONFIG_IP_VS_IPV6 1425 static int join_mcast_group6(struct sock *sk, struct in6_addr *addr, 1426 char *ifname) 1427 { 1428 struct net *net = sock_net(sk); 1429 struct net_device *dev; 1430 int ret; 1431 1432 dev = __dev_get_by_name(net, ifname); 1433 if (!dev) 1434 return -ENODEV; 1435 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1436 return -EINVAL; 1437 1438 lock_sock(sk); 1439 ret = ipv6_sock_mc_join(sk, dev->ifindex, addr); 1440 release_sock(sk); 1441 1442 return ret; 1443 } 1444 #endif 1445 1446 static int bind_mcastif_addr(struct socket *sock, char *ifname) 1447 { 1448 struct net *net = sock_net(sock->sk); 1449 struct net_device *dev; 1450 __be32 addr; 1451 struct sockaddr_in sin; 1452 1453 dev = __dev_get_by_name(net, ifname); 1454 if (!dev) 1455 return -ENODEV; 1456 1457 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); 1458 if (!addr) 1459 pr_err("You probably need to specify IP address on " 1460 "multicast interface.\n"); 1461 1462 IP_VS_DBG(7, "binding socket with (%s) %pI4\n", 1463 ifname, &addr); 1464 1465 /* Now bind the socket with the address of multicast interface */ 1466 sin.sin_family = AF_INET; 1467 sin.sin_addr.s_addr = addr; 1468 
sin.sin_port = 0; 1469 1470 return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); 1471 } 1472 1473 static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, 1474 struct ipvs_sync_daemon_cfg *c, int id) 1475 { 1476 if (AF_INET6 == c->mcast_af) { 1477 sa->in6 = (struct sockaddr_in6) { 1478 .sin6_family = AF_INET6, 1479 .sin6_port = htons(c->mcast_port + id), 1480 }; 1481 sa->in6.sin6_addr = c->mcast_group.in6; 1482 *salen = sizeof(sa->in6); 1483 } else { 1484 sa->in = (struct sockaddr_in) { 1485 .sin_family = AF_INET, 1486 .sin_port = htons(c->mcast_port + id), 1487 }; 1488 sa->in.sin_addr = c->mcast_group.in; 1489 *salen = sizeof(sa->in); 1490 } 1491 } 1492 1493 /* 1494 * Set up sending multicast socket over UDP 1495 */ 1496 static struct socket *make_send_sock(struct netns_ipvs *ipvs, int id) 1497 { 1498 /* multicast addr */ 1499 union ipvs_sockaddr mcast_addr; 1500 struct socket *sock; 1501 int result, salen; 1502 1503 /* First create a socket */ 1504 result = sock_create_kern(ipvs->net, ipvs->mcfg.mcast_af, SOCK_DGRAM, 1505 IPPROTO_UDP, &sock); 1506 if (result < 0) { 1507 pr_err("Error during creation of socket; terminating\n"); 1508 return ERR_PTR(result); 1509 } 1510 result = set_mcast_if(sock->sk, ipvs->mcfg.mcast_ifn); 1511 if (result < 0) { 1512 pr_err("Error setting outbound mcast interface\n"); 1513 goto error; 1514 } 1515 1516 set_mcast_loop(sock->sk, 0); 1517 set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl); 1518 /* Allow fragmentation if MTU changes */ 1519 set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT); 1520 result = sysctl_sync_sock_size(ipvs); 1521 if (result > 0) 1522 set_sock_size(sock->sk, 1, result); 1523 1524 if (AF_INET == ipvs->mcfg.mcast_af) 1525 result = bind_mcastif_addr(sock, ipvs->mcfg.mcast_ifn); 1526 else 1527 result = 0; 1528 if (result < 0) { 1529 pr_err("Error binding address of the mcast interface\n"); 1530 goto error; 1531 } 1532 1533 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); 1534 result = 
sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, 1535 salen, 0); 1536 if (result < 0) { 1537 pr_err("Error connecting to the multicast addr\n"); 1538 goto error; 1539 } 1540 1541 return sock; 1542 1543 error: 1544 sock_release(sock); 1545 return ERR_PTR(result); 1546 } 1547 1548 1549 /* 1550 * Set up receiving multicast socket over UDP 1551 */ 1552 static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id, 1553 int ifindex) 1554 { 1555 /* multicast addr */ 1556 union ipvs_sockaddr mcast_addr; 1557 struct socket *sock; 1558 int result, salen; 1559 1560 /* First create a socket */ 1561 result = sock_create_kern(ipvs->net, ipvs->bcfg.mcast_af, SOCK_DGRAM, 1562 IPPROTO_UDP, &sock); 1563 if (result < 0) { 1564 pr_err("Error during creation of socket; terminating\n"); 1565 return ERR_PTR(result); 1566 } 1567 /* it is equivalent to the REUSEADDR option in user-space */ 1568 sock->sk->sk_reuse = SK_CAN_REUSE; 1569 result = sysctl_sync_sock_size(ipvs); 1570 if (result > 0) 1571 set_sock_size(sock->sk, 0, result); 1572 1573 get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); 1574 sock->sk->sk_bound_dev_if = ifindex; 1575 result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen); 1576 if (result < 0) { 1577 pr_err("Error binding to the multicast addr\n"); 1578 goto error; 1579 } 1580 1581 /* join the multicast group */ 1582 #ifdef CONFIG_IP_VS_IPV6 1583 if (ipvs->bcfg.mcast_af == AF_INET6) 1584 result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr, 1585 ipvs->bcfg.mcast_ifn); 1586 else 1587 #endif 1588 result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr, 1589 ipvs->bcfg.mcast_ifn); 1590 if (result < 0) { 1591 pr_err("Error joining to the multicast group\n"); 1592 goto error; 1593 } 1594 1595 return sock; 1596 1597 error: 1598 sock_release(sock); 1599 return ERR_PTR(result); 1600 } 1601 1602 1603 static int 1604 ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) 1605 { 1606 struct msghdr msg 
= {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; 1607 struct kvec iov; 1608 int len; 1609 1610 EnterFunction(7); 1611 iov.iov_base = (void *)buffer; 1612 iov.iov_len = length; 1613 1614 len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); 1615 1616 LeaveFunction(7); 1617 return len; 1618 } 1619 1620 static int 1621 ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) 1622 { 1623 int msize; 1624 int ret; 1625 1626 msize = ntohs(msg->size); 1627 1628 ret = ip_vs_send_async(sock, (char *)msg, msize); 1629 if (ret >= 0 || ret == -EAGAIN) 1630 return ret; 1631 pr_err("ip_vs_send_async error %d\n", ret); 1632 return 0; 1633 } 1634 1635 static int 1636 ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) 1637 { 1638 struct msghdr msg = {NULL,}; 1639 struct kvec iov; 1640 int len; 1641 1642 EnterFunction(7); 1643 1644 /* Receive a packet */ 1645 iov.iov_base = buffer; 1646 iov.iov_len = (size_t)buflen; 1647 1648 len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, MSG_DONTWAIT); 1649 1650 if (len < 0) 1651 return len; 1652 1653 LeaveFunction(7); 1654 return len; 1655 } 1656 1657 /* Wakeup the master thread for sending */ 1658 static void master_wakeup_work_handler(struct work_struct *work) 1659 { 1660 struct ipvs_master_sync_state *ms = 1661 container_of(work, struct ipvs_master_sync_state, 1662 master_wakeup_work.work); 1663 struct netns_ipvs *ipvs = ms->ipvs; 1664 1665 spin_lock_bh(&ipvs->sync_lock); 1666 if (ms->sync_queue_len && 1667 ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) { 1668 ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE; 1669 wake_up_process(ms->master_thread); 1670 } 1671 spin_unlock_bh(&ipvs->sync_lock); 1672 } 1673 1674 /* Get next buffer to send */ 1675 static inline struct ip_vs_sync_buff * 1676 next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) 1677 { 1678 struct ip_vs_sync_buff *sb; 1679 1680 sb = sb_dequeue(ipvs, ms); 1681 if (sb) 1682 return sb; 1683 /* Do not delay entries in buffer for more 
than 2 seconds */ 1684 return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME); 1685 } 1686 1687 static int sync_thread_master(void *data) 1688 { 1689 struct ip_vs_sync_thread_data *tinfo = data; 1690 struct netns_ipvs *ipvs = tinfo->ipvs; 1691 struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id]; 1692 struct sock *sk = tinfo->sock->sk; 1693 struct ip_vs_sync_buff *sb; 1694 1695 pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " 1696 "syncid = %d, id = %d\n", 1697 ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id); 1698 1699 for (;;) { 1700 sb = next_sync_buff(ipvs, ms); 1701 if (unlikely(kthread_should_stop())) 1702 break; 1703 if (!sb) { 1704 schedule_timeout(IPVS_SYNC_CHECK_PERIOD); 1705 continue; 1706 } 1707 while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) { 1708 /* (Ab)use interruptible sleep to avoid increasing 1709 * the load avg. 1710 */ 1711 __wait_event_interruptible(*sk_sleep(sk), 1712 sock_writeable(sk) || 1713 kthread_should_stop()); 1714 if (unlikely(kthread_should_stop())) 1715 goto done; 1716 } 1717 ip_vs_sync_buff_release(sb); 1718 } 1719 1720 done: 1721 __set_current_state(TASK_RUNNING); 1722 if (sb) 1723 ip_vs_sync_buff_release(sb); 1724 1725 /* clean up the sync_buff queue */ 1726 while ((sb = sb_dequeue(ipvs, ms))) 1727 ip_vs_sync_buff_release(sb); 1728 __set_current_state(TASK_RUNNING); 1729 1730 /* clean up the current sync_buff */ 1731 sb = get_curr_sync_buff(ipvs, ms, 0); 1732 if (sb) 1733 ip_vs_sync_buff_release(sb); 1734 1735 /* release the sending multicast socket */ 1736 sock_release(tinfo->sock); 1737 kfree(tinfo); 1738 1739 return 0; 1740 } 1741 1742 1743 static int sync_thread_backup(void *data) 1744 { 1745 struct ip_vs_sync_thread_data *tinfo = data; 1746 struct netns_ipvs *ipvs = tinfo->ipvs; 1747 int len; 1748 1749 pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " 1750 "syncid = %d, id = %d\n", 1751 ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id); 1752 1753 while (!kthread_should_stop()) 
{ 1754 wait_event_interruptible(*sk_sleep(tinfo->sock->sk), 1755 !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue) 1756 || kthread_should_stop()); 1757 1758 /* do we have data now? */ 1759 while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { 1760 len = ip_vs_receive(tinfo->sock, tinfo->buf, 1761 ipvs->bcfg.sync_maxlen); 1762 if (len <= 0) { 1763 if (len != -EAGAIN) 1764 pr_err("receiving message error\n"); 1765 break; 1766 } 1767 1768 ip_vs_process_message(ipvs, tinfo->buf, len); 1769 } 1770 } 1771 1772 /* release the sending multicast socket */ 1773 sock_release(tinfo->sock); 1774 kfree(tinfo->buf); 1775 kfree(tinfo); 1776 1777 return 0; 1778 } 1779 1780 1781 int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, 1782 int state) 1783 { 1784 struct ip_vs_sync_thread_data *tinfo; 1785 struct task_struct **array = NULL, *task; 1786 struct socket *sock; 1787 struct net_device *dev; 1788 char *name; 1789 int (*threadfn)(void *data); 1790 int id, count, hlen; 1791 int result = -ENOMEM; 1792 u16 mtu, min_mtu; 1793 1794 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 1795 IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %zd bytes\n", 1796 sizeof(struct ip_vs_sync_conn_v0)); 1797 1798 if (!ipvs->sync_state) { 1799 count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX); 1800 ipvs->threads_mask = count - 1; 1801 } else 1802 count = ipvs->threads_mask + 1; 1803 1804 if (c->mcast_af == AF_UNSPEC) { 1805 c->mcast_af = AF_INET; 1806 c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP); 1807 } 1808 if (!c->mcast_port) 1809 c->mcast_port = IP_VS_SYNC_PORT; 1810 if (!c->mcast_ttl) 1811 c->mcast_ttl = 1; 1812 1813 dev = __dev_get_by_name(ipvs->net, c->mcast_ifn); 1814 if (!dev) { 1815 pr_err("Unknown mcast interface: %s\n", c->mcast_ifn); 1816 return -ENODEV; 1817 } 1818 hlen = (AF_INET6 == c->mcast_af) ? 
1819 sizeof(struct ipv6hdr) + sizeof(struct udphdr) : 1820 sizeof(struct iphdr) + sizeof(struct udphdr); 1821 mtu = (state == IP_VS_STATE_BACKUP) ? 1822 clamp(dev->mtu, 1500U, 65535U) : 1500U; 1823 min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1; 1824 1825 if (c->sync_maxlen) 1826 c->sync_maxlen = clamp_t(unsigned int, 1827 c->sync_maxlen, min_mtu, 1828 65535 - hlen); 1829 else 1830 c->sync_maxlen = mtu - hlen; 1831 1832 if (state == IP_VS_STATE_MASTER) { 1833 if (ipvs->ms) 1834 return -EEXIST; 1835 1836 ipvs->mcfg = *c; 1837 name = "ipvs-m:%d:%d"; 1838 threadfn = sync_thread_master; 1839 } else if (state == IP_VS_STATE_BACKUP) { 1840 if (ipvs->backup_threads) 1841 return -EEXIST; 1842 1843 ipvs->bcfg = *c; 1844 name = "ipvs-b:%d:%d"; 1845 threadfn = sync_thread_backup; 1846 } else { 1847 return -EINVAL; 1848 } 1849 1850 if (state == IP_VS_STATE_MASTER) { 1851 struct ipvs_master_sync_state *ms; 1852 1853 ipvs->ms = kcalloc(count, sizeof(ipvs->ms[0]), GFP_KERNEL); 1854 if (!ipvs->ms) 1855 goto out; 1856 ms = ipvs->ms; 1857 for (id = 0; id < count; id++, ms++) { 1858 INIT_LIST_HEAD(&ms->sync_queue); 1859 ms->sync_queue_len = 0; 1860 ms->sync_queue_delay = 0; 1861 INIT_DELAYED_WORK(&ms->master_wakeup_work, 1862 master_wakeup_work_handler); 1863 ms->ipvs = ipvs; 1864 } 1865 } else { 1866 array = kcalloc(count, sizeof(struct task_struct *), 1867 GFP_KERNEL); 1868 if (!array) 1869 goto out; 1870 } 1871 1872 tinfo = NULL; 1873 for (id = 0; id < count; id++) { 1874 if (state == IP_VS_STATE_MASTER) 1875 sock = make_send_sock(ipvs, id); 1876 else 1877 sock = make_receive_sock(ipvs, id, dev->ifindex); 1878 if (IS_ERR(sock)) { 1879 result = PTR_ERR(sock); 1880 goto outtinfo; 1881 } 1882 tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); 1883 if (!tinfo) 1884 goto outsocket; 1885 tinfo->ipvs = ipvs; 1886 tinfo->sock = sock; 1887 if (state == IP_VS_STATE_BACKUP) { 1888 tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen, 1889 GFP_KERNEL); 1890 if (!tinfo->buf) 1891 goto outtinfo; 1892 } 
else { 1893 tinfo->buf = NULL; 1894 } 1895 tinfo->id = id; 1896 1897 task = kthread_run(threadfn, tinfo, name, ipvs->gen, id); 1898 if (IS_ERR(task)) { 1899 result = PTR_ERR(task); 1900 goto outtinfo; 1901 } 1902 tinfo = NULL; 1903 if (state == IP_VS_STATE_MASTER) 1904 ipvs->ms[id].master_thread = task; 1905 else 1906 array[id] = task; 1907 } 1908 1909 /* mark as active */ 1910 1911 if (state == IP_VS_STATE_BACKUP) 1912 ipvs->backup_threads = array; 1913 spin_lock_bh(&ipvs->sync_buff_lock); 1914 ipvs->sync_state |= state; 1915 spin_unlock_bh(&ipvs->sync_buff_lock); 1916 1917 /* increase the module use count */ 1918 ip_vs_use_count_inc(); 1919 1920 return 0; 1921 1922 outsocket: 1923 sock_release(sock); 1924 1925 outtinfo: 1926 if (tinfo) { 1927 sock_release(tinfo->sock); 1928 kfree(tinfo->buf); 1929 kfree(tinfo); 1930 } 1931 count = id; 1932 while (count-- > 0) { 1933 if (state == IP_VS_STATE_MASTER) 1934 kthread_stop(ipvs->ms[count].master_thread); 1935 else 1936 kthread_stop(array[count]); 1937 } 1938 kfree(array); 1939 1940 out: 1941 if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) { 1942 kfree(ipvs->ms); 1943 ipvs->ms = NULL; 1944 } 1945 return result; 1946 } 1947 1948 1949 int stop_sync_thread(struct netns_ipvs *ipvs, int state) 1950 { 1951 struct task_struct **array; 1952 int id; 1953 int retc = -EINVAL; 1954 1955 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 1956 1957 if (state == IP_VS_STATE_MASTER) { 1958 if (!ipvs->ms) 1959 return -ESRCH; 1960 1961 /* 1962 * The lock synchronizes with sb_queue_tail(), so that we don't 1963 * add sync buffers to the queue, when we are already in 1964 * progress of stopping the master sync daemon. 
1965 */ 1966 1967 spin_lock_bh(&ipvs->sync_buff_lock); 1968 spin_lock(&ipvs->sync_lock); 1969 ipvs->sync_state &= ~IP_VS_STATE_MASTER; 1970 spin_unlock(&ipvs->sync_lock); 1971 spin_unlock_bh(&ipvs->sync_buff_lock); 1972 1973 retc = 0; 1974 for (id = ipvs->threads_mask; id >= 0; id--) { 1975 struct ipvs_master_sync_state *ms = &ipvs->ms[id]; 1976 int ret; 1977 1978 pr_info("stopping master sync thread %d ...\n", 1979 task_pid_nr(ms->master_thread)); 1980 cancel_delayed_work_sync(&ms->master_wakeup_work); 1981 ret = kthread_stop(ms->master_thread); 1982 if (retc >= 0) 1983 retc = ret; 1984 } 1985 kfree(ipvs->ms); 1986 ipvs->ms = NULL; 1987 } else if (state == IP_VS_STATE_BACKUP) { 1988 if (!ipvs->backup_threads) 1989 return -ESRCH; 1990 1991 ipvs->sync_state &= ~IP_VS_STATE_BACKUP; 1992 array = ipvs->backup_threads; 1993 retc = 0; 1994 for (id = ipvs->threads_mask; id >= 0; id--) { 1995 int ret; 1996 1997 pr_info("stopping backup sync thread %d ...\n", 1998 task_pid_nr(array[id])); 1999 ret = kthread_stop(array[id]); 2000 if (retc >= 0) 2001 retc = ret; 2002 } 2003 kfree(array); 2004 ipvs->backup_threads = NULL; 2005 } 2006 2007 /* decrease the module use count */ 2008 ip_vs_use_count_dec(); 2009 2010 return retc; 2011 } 2012 2013 /* 2014 * Initialize data struct for each netns 2015 */ 2016 int __net_init ip_vs_sync_net_init(struct netns_ipvs *ipvs) 2017 { 2018 __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key); 2019 spin_lock_init(&ipvs->sync_lock); 2020 spin_lock_init(&ipvs->sync_buff_lock); 2021 return 0; 2022 } 2023 2024 void ip_vs_sync_net_cleanup(struct netns_ipvs *ipvs) 2025 { 2026 int retc; 2027 2028 mutex_lock(&ipvs->sync_mutex); 2029 retc = stop_sync_thread(ipvs, IP_VS_STATE_MASTER); 2030 if (retc && retc != -ESRCH) 2031 pr_err("Failed to stop Master Daemon\n"); 2032 2033 retc = stop_sync_thread(ipvs, IP_VS_STATE_BACKUP); 2034 if (retc && retc != -ESRCH) 2035 pr_err("Failed to stop Backup Daemon\n"); 2036 mutex_unlock(&ipvs->sync_mutex); 
2037 } 2038