xref: /linux/net/ipv4/tcp_ipv4.c (revision 14b9f27886ce69c5f11445d107dd020f6fc5754b)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50  *					to a single port at the same time.
51  */
52 
53 
54 #include <linux/bottom_half.h>
55 #include <linux/types.h>
56 #include <linux/fcntl.h>
57 #include <linux/module.h>
58 #include <linux/random.h>
59 #include <linux/cache.h>
60 #include <linux/jhash.h>
61 #include <linux/init.h>
62 #include <linux/times.h>
63 #include <linux/slab.h>
64 
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/transp_v6.h>
70 #include <net/ipv6.h>
71 #include <net/inet_common.h>
72 #include <net/timewait_sock.h>
73 #include <net/xfrm.h>
74 #include <net/netdma.h>
75 
76 #include <linux/inet.h>
77 #include <linux/ipv6.h>
78 #include <linux/stddef.h>
79 #include <linux/proc_fs.h>
80 #include <linux/seq_file.h>
81 
82 #include <linux/crypto.h>
83 #include <linux/scatterlist.h>
84 
85 int sysctl_tcp_tw_reuse __read_mostly;
86 int sysctl_tcp_low_latency __read_mostly;
87 EXPORT_SYMBOL(sysctl_tcp_low_latency);
88 
89 
90 #ifdef CONFIG_TCP_MD5SIG
91 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
92 						   __be32 addr);
93 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
94 			       __be32 daddr, __be32 saddr, struct tcphdr *th);
95 #else
96 static inline
97 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
98 {
99 	return NULL;
100 }
101 #endif
102 
103 struct inet_hashinfo tcp_hashinfo;
104 EXPORT_SYMBOL(tcp_hashinfo);
105 
106 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
107 {
108 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
109 					  ip_hdr(skb)->saddr,
110 					  tcp_hdr(skb)->dest,
111 					  tcp_hdr(skb)->source);
112 }
113 
114 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
115 {
116 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
117 	struct tcp_sock *tp = tcp_sk(sk);
118 
119 	/* With PAWS, it is safe from the viewpoint
120 	   of data integrity. Even without PAWS it is safe provided sequence
121 	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
122 
123 	   Actually, the idea is close to VJ's: the timestamp cache is held
124 	   not per host but per port pair, and the TW bucket is used as the
125 	   state holder.
126 
127 	   If the TW bucket has already been destroyed we fall back to VJ's
128 	   scheme and use the initial timestamp retrieved from the peer table.
129 	 */
130 	if (tcptw->tw_ts_recent_stamp &&
131 	    (twp == NULL || (sysctl_tcp_tw_reuse &&
132 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
133 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
134 		if (tp->write_seq == 0)
135 			tp->write_seq = 1;
136 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
137 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
138 		sock_hold(sktw);
139 		return 1;
140 	}
141 
142 	return 0;
143 }
144 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
145 
146 /* This will initiate an outgoing connection. */
147 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
148 {
149 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
150 	struct inet_sock *inet = inet_sk(sk);
151 	struct tcp_sock *tp = tcp_sk(sk);
152 	__be16 orig_sport, orig_dport;
153 	__be32 daddr, nexthop;
154 	struct flowi4 *fl4;
155 	struct rtable *rt;
156 	int err;
157 	struct ip_options_rcu *inet_opt;
158 
159 	if (addr_len < sizeof(struct sockaddr_in))
160 		return -EINVAL;
161 
162 	if (usin->sin_family != AF_INET)
163 		return -EAFNOSUPPORT;
164 
165 	nexthop = daddr = usin->sin_addr.s_addr;
166 	inet_opt = rcu_dereference_protected(inet->inet_opt,
167 					     sock_owned_by_user(sk));
168 	if (inet_opt && inet_opt->opt.srr) {
169 		if (!daddr)
170 			return -EINVAL;
171 		nexthop = inet_opt->opt.faddr;
172 	}
173 
174 	orig_sport = inet->inet_sport;
175 	orig_dport = usin->sin_port;
176 	fl4 = &inet->cork.fl.u.ip4;
177 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
178 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
179 			      IPPROTO_TCP,
180 			      orig_sport, orig_dport, sk, true);
181 	if (IS_ERR(rt)) {
182 		err = PTR_ERR(rt);
183 		if (err == -ENETUNREACH)
184 			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
185 		return err;
186 	}
187 
188 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
189 		ip_rt_put(rt);
190 		return -ENETUNREACH;
191 	}
192 
193 	if (!inet_opt || !inet_opt->opt.srr)
194 		daddr = fl4->daddr;
195 
196 	if (!inet->inet_saddr)
197 		inet->inet_saddr = fl4->saddr;
198 	inet->inet_rcv_saddr = inet->inet_saddr;
199 
200 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
201 		/* Reset inherited state */
202 		tp->rx_opt.ts_recent	   = 0;
203 		tp->rx_opt.ts_recent_stamp = 0;
204 		tp->write_seq		   = 0;
205 	}
206 
207 	if (tcp_death_row.sysctl_tw_recycle &&
208 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
209 		struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
210 		/*
211 		 * VJ's idea. We save the last timestamp seen from
212 		 * the destination in the peer table when entering state
213 		 * TIME-WAIT, and initialize rx_opt.ts_recent from it
214 		 * when trying a new connection.
215 		 */
216 		if (peer) {
217 			inet_peer_refcheck(peer);
218 			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
219 				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
220 				tp->rx_opt.ts_recent = peer->tcp_ts;
221 			}
222 		}
223 	}
224 
225 	inet->inet_dport = usin->sin_port;
226 	inet->inet_daddr = daddr;
227 
228 	inet_csk(sk)->icsk_ext_hdr_len = 0;
229 	if (inet_opt)
230 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
231 
232 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
233 
234 	/* Socket identity is still unknown (sport may be zero).
235 	 * However we set state to SYN-SENT and, without releasing the socket
236 	 * lock, select a source port, enter ourselves into the hash tables and
237 	 * complete initialization after this.
238 	 */
239 	tcp_set_state(sk, TCP_SYN_SENT);
240 	err = inet_hash_connect(&tcp_death_row, sk);
241 	if (err)
242 		goto failure;
243 
244 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
245 			       inet->inet_sport, inet->inet_dport, sk);
246 	if (IS_ERR(rt)) {
247 		err = PTR_ERR(rt);
248 		rt = NULL;
249 		goto failure;
250 	}
251 	/* OK, now commit destination to socket.  */
252 	sk->sk_gso_type = SKB_GSO_TCPV4;
253 	sk_setup_caps(sk, &rt->dst);
254 
255 	if (!tp->write_seq)
256 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
257 							   inet->inet_daddr,
258 							   inet->inet_sport,
259 							   usin->sin_port);
260 
261 	inet->inet_id = tp->write_seq ^ jiffies;
262 
263 	err = tcp_connect(sk);
264 	rt = NULL;
265 	if (err)
266 		goto failure;
267 
268 	return 0;
269 
270 failure:
271 	/*
272 	 * This unhashes the socket and releases the local port,
273 	 * if necessary.
274 	 */
275 	tcp_set_state(sk, TCP_CLOSE);
276 	ip_rt_put(rt);
277 	sk->sk_route_caps = 0;
278 	inet->inet_dport = 0;
279 	return err;
280 }
281 EXPORT_SYMBOL(tcp_v4_connect);
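/*
 * A minimal userspace sketch of the path that ends up in tcp_v4_connect()
 * above: a blocking connect() on an AF_INET stream socket. The address,
 * port and function name below are illustrative assumptions, not part of
 * the kernel sources.
 */
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int example_tcp_connect(void)
{
	struct sockaddr_in dst;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port   = htons(80);			/* example port */
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);	/* example address */

	/* The kernel selects a source port, hashes the socket and sends the SYN. */
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
		perror("connect");
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}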
282 
283 /*
284  * This routine does path mtu discovery as defined in RFC1191.
285  */
286 static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
287 {
288 	struct dst_entry *dst;
289 	struct inet_sock *inet = inet_sk(sk);
290 
291 	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
292 	 * sent out by Linux are always < 576 bytes so they should go through
293 	 * unfragmented).
294 	 */
295 	if (sk->sk_state == TCP_LISTEN)
296 		return;
297 
298 	/* We don't check in the dst entry whether pmtu discovery is forbidden
299 	 * on this route. We just assume that no packet-too-big packets
300 	 * are sent back when pmtu discovery is not active.
301 	 * There is a small race when the user changes this flag in the
302 	 * route, but I think that's acceptable.
303 	 */
304 	if ((dst = __sk_dst_check(sk, 0)) == NULL)
305 		return;
306 
307 	dst->ops->update_pmtu(dst, mtu);
308 
309 	/* Something is about to go wrong... Remember the soft error
310 	 * in case this connection is not able to recover.
311 	 */
312 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
313 		sk->sk_err_soft = EMSGSIZE;
314 
315 	mtu = dst_mtu(dst);
316 
317 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
318 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
319 		tcp_sync_mss(sk, mtu);
320 
321 		/* Resend the TCP packet because it's
322 		 * clear that the old packet has been
323 		 * dropped. This is the new "fast" path mtu
324 		 * discovery.
325 		 */
326 		tcp_simple_retransmit(sk);
327 	} /* else let the usual retransmit timer handle it */
328 }
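/*
 * Hedged userspace sketch: the inet->pmtudisc value consulted above is set
 * per socket with the IP_MTU_DISCOVER option; IP_PMTUDISC_DO forces DF on
 * outgoing packets, IP_PMTUDISC_DONT disables path MTU discovery. The
 * helper name is an illustrative assumption.
 */
#include <netinet/in.h>
#include <sys/socket.h>

static int request_pmtu_discovery(int fd)
{
	int val = IP_PMTUDISC_DO;

	/* Returns 0 on success, -1 with errno set on failure. */
	return setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
}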
329 
330 /*
331  * This routine is called by the ICMP module when it gets some
332  * sort of error condition.  If err < 0 then the socket should
333  * be closed and the error returned to the user.  If err > 0
334  * it's just the icmp type << 8 | icmp code.  After adjustment
335  * header points to the first 8 bytes of the tcp header.  We need
336  * to find the appropriate port.
337  *
338  * The locking strategy used here is very "optimistic". When
339  * someone else accesses the socket the ICMP is just dropped
340  * and for some paths there is no check at all.
341  * A more general error queue to queue errors for later handling
342  * is probably better.
343  *
344  */
345 
346 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
347 {
348 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
349 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
350 	struct inet_connection_sock *icsk;
351 	struct tcp_sock *tp;
352 	struct inet_sock *inet;
353 	const int type = icmp_hdr(icmp_skb)->type;
354 	const int code = icmp_hdr(icmp_skb)->code;
355 	struct sock *sk;
356 	struct sk_buff *skb;
357 	__u32 seq;
358 	__u32 remaining;
359 	int err;
360 	struct net *net = dev_net(icmp_skb->dev);
361 
362 	if (icmp_skb->len < (iph->ihl << 2) + 8) {
363 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
364 		return;
365 	}
366 
367 	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
368 			iph->saddr, th->source, inet_iif(icmp_skb));
369 	if (!sk) {
370 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
371 		return;
372 	}
373 	if (sk->sk_state == TCP_TIME_WAIT) {
374 		inet_twsk_put(inet_twsk(sk));
375 		return;
376 	}
377 
378 	bh_lock_sock(sk);
379 	/* If too many ICMPs get dropped on busy
380 	 * servers this needs to be solved differently.
381 	 */
382 	if (sock_owned_by_user(sk))
383 		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
384 
385 	if (sk->sk_state == TCP_CLOSE)
386 		goto out;
387 
388 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
389 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
390 		goto out;
391 	}
392 
393 	icsk = inet_csk(sk);
394 	tp = tcp_sk(sk);
395 	seq = ntohl(th->seq);
396 	if (sk->sk_state != TCP_LISTEN &&
397 	    !between(seq, tp->snd_una, tp->snd_nxt)) {
398 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
399 		goto out;
400 	}
401 
402 	switch (type) {
403 	case ICMP_SOURCE_QUENCH:
404 		/* Just silently ignore these. */
405 		goto out;
406 	case ICMP_PARAMETERPROB:
407 		err = EPROTO;
408 		break;
409 	case ICMP_DEST_UNREACH:
410 		if (code > NR_ICMP_UNREACH)
411 			goto out;
412 
413 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
414 			if (!sock_owned_by_user(sk))
415 				do_pmtu_discovery(sk, iph, info);
416 			goto out;
417 		}
418 
419 		err = icmp_err_convert[code].errno;
420 		/* check if icmp_skb allows revert of backoff
421 		 * (see draft-zimmermann-tcp-lcd) */
422 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
423 			break;
424 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
425 		    !icsk->icsk_backoff)
426 			break;
427 
428 		if (sock_owned_by_user(sk))
429 			break;
430 
431 		icsk->icsk_backoff--;
432 		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
433 					 icsk->icsk_backoff;
434 		tcp_bound_rto(sk);
435 
436 		skb = tcp_write_queue_head(sk);
437 		BUG_ON(!skb);
438 
439 		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
440 				tcp_time_stamp - TCP_SKB_CB(skb)->when);
441 
442 		if (remaining) {
443 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
444 						  remaining, TCP_RTO_MAX);
445 		} else {
446 			/* RTO revert clocked out retransmission.
447 			 * Will retransmit now */
448 			tcp_retransmit_timer(sk);
449 		}
450 
451 		break;
452 	case ICMP_TIME_EXCEEDED:
453 		err = EHOSTUNREACH;
454 		break;
455 	default:
456 		goto out;
457 	}
458 
459 	switch (sk->sk_state) {
460 		struct request_sock *req, **prev;
461 	case TCP_LISTEN:
462 		if (sock_owned_by_user(sk))
463 			goto out;
464 
465 		req = inet_csk_search_req(sk, &prev, th->dest,
466 					  iph->daddr, iph->saddr);
467 		if (!req)
468 			goto out;
469 
470 		/* ICMPs are not backlogged, hence we cannot get
471 		   an established socket here.
472 		 */
473 		WARN_ON(req->sk);
474 
475 		if (seq != tcp_rsk(req)->snt_isn) {
476 			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
477 			goto out;
478 		}
479 
480 		/*
481 		 * Still in SYN_RECV, just remove it silently.
482 		 * There is no good way to pass the error to the newly
483 		 * created socket, and POSIX does not want network
484 		 * errors returned from accept().
485 		 */
486 		inet_csk_reqsk_queue_drop(sk, req, prev);
487 		goto out;
488 
489 	case TCP_SYN_SENT:
490 	case TCP_SYN_RECV:  /* Cannot happen.
491 			       It can, f.e., happen if SYNs crossed.
492 			     */
493 		if (!sock_owned_by_user(sk)) {
494 			sk->sk_err = err;
495 
496 			sk->sk_error_report(sk);
497 
498 			tcp_done(sk);
499 		} else {
500 			sk->sk_err_soft = err;
501 		}
502 		goto out;
503 	}
504 
505 	/* If we've already connected we will keep trying
506 	 * until we time out, or the user gives up.
507 	 *
508 	 * RFC 1122 4.2.3.9 allows us to treat only PROTO_UNREACH and
509 	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
510 	 * but it is obsoleted by pmtu discovery).
511 	 *
512 	 * Note that on the modern internet, where routing is unreliable
513 	 * and broken firewalls sit in every dark corner, sending random
514 	 * errors ordered by their masters, even these two messages finally
515 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
516 	 *
517 	 * Now we are in compliance with RFCs.
518 	 *							--ANK (980905)
519 	 */
520 
521 	inet = inet_sk(sk);
522 	if (!sock_owned_by_user(sk) && inet->recverr) {
523 		sk->sk_err = err;
524 		sk->sk_error_report(sk);
525 	} else	{ /* Only an error on timeout */
526 		sk->sk_err_soft = err;
527 	}
528 
529 out:
530 	bh_unlock_sock(sk);
531 	sock_put(sk);
532 }
533 
534 static void __tcp_v4_send_check(struct sk_buff *skb,
535 				__be32 saddr, __be32 daddr)
536 {
537 	struct tcphdr *th = tcp_hdr(skb);
538 
539 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
540 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
541 		skb->csum_start = skb_transport_header(skb) - skb->head;
542 		skb->csum_offset = offsetof(struct tcphdr, check);
543 	} else {
544 		th->check = tcp_v4_check(skb->len, saddr, daddr,
545 					 csum_partial(th,
546 						      th->doff << 2,
547 						      skb->csum));
548 	}
549 }
550 
551 /* This routine computes an IPv4 TCP checksum. */
552 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
553 {
554 	struct inet_sock *inet = inet_sk(sk);
555 
556 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
557 }
558 EXPORT_SYMBOL(tcp_v4_send_check);
559 
560 int tcp_v4_gso_send_check(struct sk_buff *skb)
561 {
562 	const struct iphdr *iph;
563 	struct tcphdr *th;
564 
565 	if (!pskb_may_pull(skb, sizeof(*th)))
566 		return -EINVAL;
567 
568 	iph = ip_hdr(skb);
569 	th = tcp_hdr(skb);
570 
571 	th->check = 0;
572 	skb->ip_summed = CHECKSUM_PARTIAL;
573 	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
574 	return 0;
575 }
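/*
 * Hedged standalone sketch of the RFC 1071 arithmetic behind tcp_v4_check()
 * and csum_partial() used above: a 16-bit one's-complement sum over the
 * IPv4 pseudo-header (saddr, daddr, zero, protocol, TCP length) and the TCP
 * segment (with its checksum field zeroed), folded and inverted. The helper
 * names are illustrative assumptions, not kernel interfaces.
 */
#include <stddef.h>
#include <stdint.h>

static uint32_t ones_sum(uint32_t sum, const uint8_t *p, size_t len)
{
	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];	/* big-endian 16-bit words */
		p += 2;
		len -= 2;
	}
	if (len)					/* odd trailing byte */
		sum += (uint32_t)p[0] << 8;
	return sum;
}

static uint16_t tcp_csum_sketch(uint32_t saddr, uint32_t daddr,
				const uint8_t *segment, size_t len)
{
	uint8_t ph[12];
	uint32_t sum;

	/* Pseudo-header, serialized big-endian: src, dst, zero, proto, length. */
	ph[0] = saddr >> 24; ph[1] = saddr >> 16; ph[2] = saddr >> 8; ph[3] = saddr;
	ph[4] = daddr >> 24; ph[5] = daddr >> 16; ph[6] = daddr >> 8; ph[7] = daddr;
	ph[8] = 0;           ph[9] = 6;			/* IPPROTO_TCP */
	ph[10] = len >> 8;   ph[11] = len;

	sum = ones_sum(0, ph, sizeof(ph));
	sum = ones_sum(sum, segment, len);
	while (sum >> 16)				/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}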
576 
577 /*
578  *	This routine will send an RST to the other tcp.
579  *
580  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
581  *		      for the reset?
582  *	Answer: if a packet caused the RST, it is not for a socket
583  *		existing in our system; if it is matched to a socket,
584  *		it is just a duplicate segment or a bug in the other side's TCP.
585  *		So we build the reply based only on the parameters
586  *		that arrived with the segment.
587  *	Exception: precedence violation. We do not implement it in any case.
588  */
589 
590 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
591 {
592 	struct tcphdr *th = tcp_hdr(skb);
593 	struct {
594 		struct tcphdr th;
595 #ifdef CONFIG_TCP_MD5SIG
596 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
597 #endif
598 	} rep;
599 	struct ip_reply_arg arg;
600 #ifdef CONFIG_TCP_MD5SIG
601 	struct tcp_md5sig_key *key;
602 #endif
603 	struct net *net;
604 
605 	/* Never send a reset in response to a reset. */
606 	if (th->rst)
607 		return;
608 
609 	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
610 		return;
611 
612 	/* Swap the send and the receive. */
613 	memset(&rep, 0, sizeof(rep));
614 	rep.th.dest   = th->source;
615 	rep.th.source = th->dest;
616 	rep.th.doff   = sizeof(struct tcphdr) / 4;
617 	rep.th.rst    = 1;
618 
619 	if (th->ack) {
620 		rep.th.seq = th->ack_seq;
621 	} else {
622 		rep.th.ack = 1;
623 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
624 				       skb->len - (th->doff << 2));
625 	}
626 
627 	memset(&arg, 0, sizeof(arg));
628 	arg.iov[0].iov_base = (unsigned char *)&rep;
629 	arg.iov[0].iov_len  = sizeof(rep.th);
630 
631 #ifdef CONFIG_TCP_MD5SIG
632 	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
633 	if (key) {
634 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
635 				   (TCPOPT_NOP << 16) |
636 				   (TCPOPT_MD5SIG << 8) |
637 				   TCPOLEN_MD5SIG);
638 		/* Update length and the length the header thinks exists */
639 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
640 		rep.th.doff = arg.iov[0].iov_len / 4;
641 
642 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
643 				     key, ip_hdr(skb)->saddr,
644 				     ip_hdr(skb)->daddr, &rep.th);
645 	}
646 #endif
647 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
648 				      ip_hdr(skb)->saddr, /* XXX */
649 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
650 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
651 	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
652 
653 	net = dev_net(skb_dst(skb)->dev);
654 	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
655 		      &arg, arg.iov[0].iov_len);
656 
657 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
658 	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
659 }
660 
661 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
662    outside socket context, is certainly ugly. What can I do?
663  */
664 
665 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
666 			    u32 win, u32 ts, int oif,
667 			    struct tcp_md5sig_key *key,
668 			    int reply_flags)
669 {
670 	struct tcphdr *th = tcp_hdr(skb);
671 	struct {
672 		struct tcphdr th;
673 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
674 #ifdef CONFIG_TCP_MD5SIG
675 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
676 #endif
677 			];
678 	} rep;
679 	struct ip_reply_arg arg;
680 	struct net *net = dev_net(skb_dst(skb)->dev);
681 
682 	memset(&rep.th, 0, sizeof(struct tcphdr));
683 	memset(&arg, 0, sizeof(arg));
684 
685 	arg.iov[0].iov_base = (unsigned char *)&rep;
686 	arg.iov[0].iov_len  = sizeof(rep.th);
687 	if (ts) {
688 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
689 				   (TCPOPT_TIMESTAMP << 8) |
690 				   TCPOLEN_TIMESTAMP);
691 		rep.opt[1] = htonl(tcp_time_stamp);
692 		rep.opt[2] = htonl(ts);
693 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
694 	}
695 
696 	/* Swap the send and the receive. */
697 	rep.th.dest    = th->source;
698 	rep.th.source  = th->dest;
699 	rep.th.doff    = arg.iov[0].iov_len / 4;
700 	rep.th.seq     = htonl(seq);
701 	rep.th.ack_seq = htonl(ack);
702 	rep.th.ack     = 1;
703 	rep.th.window  = htons(win);
704 
705 #ifdef CONFIG_TCP_MD5SIG
706 	if (key) {
707 		int offset = (ts) ? 3 : 0;
708 
709 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
710 					  (TCPOPT_NOP << 16) |
711 					  (TCPOPT_MD5SIG << 8) |
712 					  TCPOLEN_MD5SIG);
713 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
714 		rep.th.doff = arg.iov[0].iov_len/4;
715 
716 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
717 				    key, ip_hdr(skb)->saddr,
718 				    ip_hdr(skb)->daddr, &rep.th);
719 	}
720 #endif
721 	arg.flags = reply_flags;
722 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
723 				      ip_hdr(skb)->saddr, /* XXX */
724 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
725 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
726 	if (oif)
727 		arg.bound_dev_if = oif;
728 
729 	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
730 		      &arg, arg.iov[0].iov_len);
731 
732 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
733 }
734 
735 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
736 {
737 	struct inet_timewait_sock *tw = inet_twsk(sk);
738 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
739 
740 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
741 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
742 			tcptw->tw_ts_recent,
743 			tw->tw_bound_dev_if,
744 			tcp_twsk_md5_key(tcptw),
745 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
746 			);
747 
748 	inet_twsk_put(tw);
749 }
750 
751 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
752 				  struct request_sock *req)
753 {
754 	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
755 			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
756 			req->ts_recent,
757 			0,
758 			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
759 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
760 }
761 
762 /*
763  *	Send a SYN-ACK after having received a SYN.
764  *	This still operates on a request_sock only, not on a big
765  *	socket.
766  */
767 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
768 			      struct request_sock *req,
769 			      struct request_values *rvp)
770 {
771 	const struct inet_request_sock *ireq = inet_rsk(req);
772 	struct flowi4 fl4;
773 	int err = -1;
774 	struct sk_buff * skb;
775 
776 	/* First, grab a route. */
777 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
778 		return -1;
779 
780 	skb = tcp_make_synack(sk, dst, req, rvp);
781 
782 	if (skb) {
783 		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
784 
785 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
786 					    ireq->rmt_addr,
787 					    ireq->opt);
788 		err = net_xmit_eval(err);
789 	}
790 
791 	dst_release(dst);
792 	return err;
793 }
794 
795 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
796 			      struct request_values *rvp)
797 {
798 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
799 	return tcp_v4_send_synack(sk, NULL, req, rvp);
800 }
801 
802 /*
803  *	IPv4 request_sock destructor.
804  */
805 static void tcp_v4_reqsk_destructor(struct request_sock *req)
806 {
807 	kfree(inet_rsk(req)->opt);
808 }
809 
810 static void syn_flood_warning(const struct sk_buff *skb)
811 {
812 	const char *msg;
813 
814 #ifdef CONFIG_SYN_COOKIES
815 	if (sysctl_tcp_syncookies)
816 		msg = "Sending cookies";
817 	else
818 #endif
819 		msg = "Dropping request";
820 
821 	pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
822 				ntohs(tcp_hdr(skb)->dest), msg);
823 }
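/*
 * Hedged sketch: the sysctl_tcp_syncookies flag tested above is the
 * net.ipv4.tcp_syncookies sysctl; a sufficiently privileged process can
 * enable it through procfs. The helper name is an illustrative assumption.
 */
#include <stdio.h>

static int enable_syncookies(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_syncookies", "w");

	if (!f)
		return -1;
	fputs("1\n", f);
	return fclose(f);
}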
824 
825 /*
826  * Save and compile IPv4 options into the request_sock if needed.
827  */
828 static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
829 						  struct sk_buff *skb)
830 {
831 	const struct ip_options *opt = &(IPCB(skb)->opt);
832 	struct ip_options_rcu *dopt = NULL;
833 
834 	if (opt && opt->optlen) {
835 		int opt_size = sizeof(*dopt) + opt->optlen;
836 
837 		dopt = kmalloc(opt_size, GFP_ATOMIC);
838 		if (dopt) {
839 			if (ip_options_echo(&dopt->opt, skb)) {
840 				kfree(dopt);
841 				dopt = NULL;
842 			}
843 		}
844 	}
845 	return dopt;
846 }
847 
848 #ifdef CONFIG_TCP_MD5SIG
849 /*
850  * RFC2385 MD5 checksumming requires a mapping of
851  * IP address->MD5 Key.
852  * We need to maintain these in the sk structure.
853  */
854 
855 /* Find the Key structure for an address.  */
856 static struct tcp_md5sig_key *
857 			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
858 {
859 	struct tcp_sock *tp = tcp_sk(sk);
860 	int i;
861 
862 	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
863 		return NULL;
864 	for (i = 0; i < tp->md5sig_info->entries4; i++) {
865 		if (tp->md5sig_info->keys4[i].addr == addr)
866 			return &tp->md5sig_info->keys4[i].base;
867 	}
868 	return NULL;
869 }
870 
871 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
872 					 struct sock *addr_sk)
873 {
874 	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
875 }
876 EXPORT_SYMBOL(tcp_v4_md5_lookup);
877 
878 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
879 						      struct request_sock *req)
880 {
881 	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
882 }
883 
884 /* This can be called on a newly created socket, from other files */
885 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
886 		      u8 *newkey, u8 newkeylen)
887 {
888 	/* Add Key to the list */
889 	struct tcp_md5sig_key *key;
890 	struct tcp_sock *tp = tcp_sk(sk);
891 	struct tcp4_md5sig_key *keys;
892 
893 	key = tcp_v4_md5_do_lookup(sk, addr);
894 	if (key) {
895 		/* Pre-existing entry - just update that one. */
896 		kfree(key->key);
897 		key->key = newkey;
898 		key->keylen = newkeylen;
899 	} else {
900 		struct tcp_md5sig_info *md5sig;
901 
902 		if (!tp->md5sig_info) {
903 			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
904 						  GFP_ATOMIC);
905 			if (!tp->md5sig_info) {
906 				kfree(newkey);
907 				return -ENOMEM;
908 			}
909 			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
910 		}
911 		if (tcp_alloc_md5sig_pool(sk) == NULL) {
912 			kfree(newkey);
913 			return -ENOMEM;
914 		}
915 		md5sig = tp->md5sig_info;
916 
917 		if (md5sig->alloced4 == md5sig->entries4) {
918 			keys = kmalloc((sizeof(*keys) *
919 					(md5sig->entries4 + 1)), GFP_ATOMIC);
920 			if (!keys) {
921 				kfree(newkey);
922 				tcp_free_md5sig_pool();
923 				return -ENOMEM;
924 			}
925 
926 			if (md5sig->entries4)
927 				memcpy(keys, md5sig->keys4,
928 				       sizeof(*keys) * md5sig->entries4);
929 
930 			/* Free old key list, and reference new one */
931 			kfree(md5sig->keys4);
932 			md5sig->keys4 = keys;
933 			md5sig->alloced4++;
934 		}
935 		md5sig->entries4++;
936 		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
937 		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
938 		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
939 	}
940 	return 0;
941 }
942 EXPORT_SYMBOL(tcp_v4_md5_do_add);
943 
944 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
945 			       u8 *newkey, u8 newkeylen)
946 {
947 	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
948 				 newkey, newkeylen);
949 }
950 
951 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
952 {
953 	struct tcp_sock *tp = tcp_sk(sk);
954 	int i;
955 
956 	for (i = 0; i < tp->md5sig_info->entries4; i++) {
957 		if (tp->md5sig_info->keys4[i].addr == addr) {
958 			/* Free the key */
959 			kfree(tp->md5sig_info->keys4[i].base.key);
960 			tp->md5sig_info->entries4--;
961 
962 			if (tp->md5sig_info->entries4 == 0) {
963 				kfree(tp->md5sig_info->keys4);
964 				tp->md5sig_info->keys4 = NULL;
965 				tp->md5sig_info->alloced4 = 0;
966 			} else if (tp->md5sig_info->entries4 != i) {
967 				/* Need to do some manipulation */
968 				memmove(&tp->md5sig_info->keys4[i],
969 					&tp->md5sig_info->keys4[i+1],
970 					(tp->md5sig_info->entries4 - i) *
971 					 sizeof(struct tcp4_md5sig_key));
972 			}
973 			tcp_free_md5sig_pool();
974 			return 0;
975 		}
976 	}
977 	return -ENOENT;
978 }
979 EXPORT_SYMBOL(tcp_v4_md5_do_del);
980 
981 static void tcp_v4_clear_md5_list(struct sock *sk)
982 {
983 	struct tcp_sock *tp = tcp_sk(sk);
984 
985 	/* Free each key, then the set of keys,
986 	 * the crypto element, and then decrement our
987 	 * hold on the last resort crypto.
988 	 */
989 	if (tp->md5sig_info->entries4) {
990 		int i;
991 		for (i = 0; i < tp->md5sig_info->entries4; i++)
992 			kfree(tp->md5sig_info->keys4[i].base.key);
993 		tp->md5sig_info->entries4 = 0;
994 		tcp_free_md5sig_pool();
995 	}
996 	if (tp->md5sig_info->keys4) {
997 		kfree(tp->md5sig_info->keys4);
998 		tp->md5sig_info->keys4 = NULL;
999 		tp->md5sig_info->alloced4  = 0;
1000 	}
1001 }
1002 
1003 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1004 				 int optlen)
1005 {
1006 	struct tcp_md5sig cmd;
1007 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1008 	u8 *newkey;
1009 
1010 	if (optlen < sizeof(cmd))
1011 		return -EINVAL;
1012 
1013 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1014 		return -EFAULT;
1015 
1016 	if (sin->sin_family != AF_INET)
1017 		return -EINVAL;
1018 
1019 	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1020 		if (!tcp_sk(sk)->md5sig_info)
1021 			return -ENOENT;
1022 		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1023 	}
1024 
1025 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1026 		return -EINVAL;
1027 
1028 	if (!tcp_sk(sk)->md5sig_info) {
1029 		struct tcp_sock *tp = tcp_sk(sk);
1030 		struct tcp_md5sig_info *p;
1031 
1032 		p = kzalloc(sizeof(*p), sk->sk_allocation);
1033 		if (!p)
1034 			return -EINVAL;
1035 
1036 		tp->md5sig_info = p;
1037 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1038 	}
1039 
1040 	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1041 	if (!newkey)
1042 		return -ENOMEM;
1043 	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1044 				 newkey, cmd.tcpm_keylen);
1045 }
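/*
 * Hedged userspace sketch of the setsockopt() call that lands in
 * tcp_v4_parse_md5_keys() above: install an RFC 2385 key for one peer.
 * Assumes struct tcp_md5sig and TCP_MD5SIG as exported by <linux/tcp.h>
 * of this era; the helper name and arguments are illustrative.
 */
#include <linux/tcp.h>		/* struct tcp_md5sig, TCP_MD5SIG */
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static int add_md5_key(int fd, const struct sockaddr_in *peer,
		       const void *key, int keylen)
{
	struct tcp_md5sig md5;

	if (keylen > TCP_MD5SIG_MAXKEYLEN)
		return -1;

	memset(&md5, 0, sizeof(md5));
	memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
	md5.tcpm_keylen = keylen;
	memcpy(md5.tcpm_key, key, keylen);

	/* A zero tcpm_keylen deletes the key for this peer instead. */
	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}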
1046 
1047 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1048 					__be32 daddr, __be32 saddr, int nbytes)
1049 {
1050 	struct tcp4_pseudohdr *bp;
1051 	struct scatterlist sg;
1052 
1053 	bp = &hp->md5_blk.ip4;
1054 
1055 	/*
1056 	 * 1. the TCP pseudo-header (in the order: source IP address,
1057 	 * destination IP address, zero-padded protocol number, and
1058 	 * segment length)
1059 	 */
1060 	bp->saddr = saddr;
1061 	bp->daddr = daddr;
1062 	bp->pad = 0;
1063 	bp->protocol = IPPROTO_TCP;
1064 	bp->len = cpu_to_be16(nbytes);
1065 
1066 	sg_init_one(&sg, bp, sizeof(*bp));
1067 	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1068 }
1069 
1070 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1071 			       __be32 daddr, __be32 saddr, struct tcphdr *th)
1072 {
1073 	struct tcp_md5sig_pool *hp;
1074 	struct hash_desc *desc;
1075 
1076 	hp = tcp_get_md5sig_pool();
1077 	if (!hp)
1078 		goto clear_hash_noput;
1079 	desc = &hp->md5_desc;
1080 
1081 	if (crypto_hash_init(desc))
1082 		goto clear_hash;
1083 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1084 		goto clear_hash;
1085 	if (tcp_md5_hash_header(hp, th))
1086 		goto clear_hash;
1087 	if (tcp_md5_hash_key(hp, key))
1088 		goto clear_hash;
1089 	if (crypto_hash_final(desc, md5_hash))
1090 		goto clear_hash;
1091 
1092 	tcp_put_md5sig_pool();
1093 	return 0;
1094 
1095 clear_hash:
1096 	tcp_put_md5sig_pool();
1097 clear_hash_noput:
1098 	memset(md5_hash, 0, 16);
1099 	return 1;
1100 }
1101 
1102 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1103 			struct sock *sk, struct request_sock *req,
1104 			struct sk_buff *skb)
1105 {
1106 	struct tcp_md5sig_pool *hp;
1107 	struct hash_desc *desc;
1108 	struct tcphdr *th = tcp_hdr(skb);
1109 	__be32 saddr, daddr;
1110 
1111 	if (sk) {
1112 		saddr = inet_sk(sk)->inet_saddr;
1113 		daddr = inet_sk(sk)->inet_daddr;
1114 	} else if (req) {
1115 		saddr = inet_rsk(req)->loc_addr;
1116 		daddr = inet_rsk(req)->rmt_addr;
1117 	} else {
1118 		const struct iphdr *iph = ip_hdr(skb);
1119 		saddr = iph->saddr;
1120 		daddr = iph->daddr;
1121 	}
1122 
1123 	hp = tcp_get_md5sig_pool();
1124 	if (!hp)
1125 		goto clear_hash_noput;
1126 	desc = &hp->md5_desc;
1127 
1128 	if (crypto_hash_init(desc))
1129 		goto clear_hash;
1130 
1131 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1132 		goto clear_hash;
1133 	if (tcp_md5_hash_header(hp, th))
1134 		goto clear_hash;
1135 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1136 		goto clear_hash;
1137 	if (tcp_md5_hash_key(hp, key))
1138 		goto clear_hash;
1139 	if (crypto_hash_final(desc, md5_hash))
1140 		goto clear_hash;
1141 
1142 	tcp_put_md5sig_pool();
1143 	return 0;
1144 
1145 clear_hash:
1146 	tcp_put_md5sig_pool();
1147 clear_hash_noput:
1148 	memset(md5_hash, 0, 16);
1149 	return 1;
1150 }
1151 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1152 
1153 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1154 {
1155 	/*
1156 	 * This gets called for each TCP segment that arrives
1157 	 * so we want to be efficient.
1158 	 * We have 3 drop cases:
1159 	 * o No MD5 hash and one expected.
1160 	 * o MD5 hash and we're not expecting one.
1161 	 * o MD5 hash and it's wrong.
1162 	 */
1163 	__u8 *hash_location = NULL;
1164 	struct tcp_md5sig_key *hash_expected;
1165 	const struct iphdr *iph = ip_hdr(skb);
1166 	struct tcphdr *th = tcp_hdr(skb);
1167 	int genhash;
1168 	unsigned char newhash[16];
1169 
1170 	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1171 	hash_location = tcp_parse_md5sig_option(th);
1172 
1173 	/* We've parsed the options - do we have a hash? */
1174 	if (!hash_expected && !hash_location)
1175 		return 0;
1176 
1177 	if (hash_expected && !hash_location) {
1178 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1179 		return 1;
1180 	}
1181 
1182 	if (!hash_expected && hash_location) {
1183 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1184 		return 1;
1185 	}
1186 
1187 	/* Okay, so this is hash_expected and hash_location -
1188 	 * so we need to calculate the checksum.
1189 	 */
1190 	genhash = tcp_v4_md5_hash_skb(newhash,
1191 				      hash_expected,
1192 				      NULL, NULL, skb);
1193 
1194 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1195 		if (net_ratelimit()) {
1196 			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1197 			       &iph->saddr, ntohs(th->source),
1198 			       &iph->daddr, ntohs(th->dest),
1199 			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
1200 		}
1201 		return 1;
1202 	}
1203 	return 0;
1204 }
1205 
1206 #endif
1207 
1208 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1209 	.family		=	PF_INET,
1210 	.obj_size	=	sizeof(struct tcp_request_sock),
1211 	.rtx_syn_ack	=	tcp_v4_rtx_synack,
1212 	.send_ack	=	tcp_v4_reqsk_send_ack,
1213 	.destructor	=	tcp_v4_reqsk_destructor,
1214 	.send_reset	=	tcp_v4_send_reset,
1215 	.syn_ack_timeout = 	tcp_syn_ack_timeout,
1216 };
1217 
1218 #ifdef CONFIG_TCP_MD5SIG
1219 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1220 	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1221 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1222 };
1223 #endif
1224 
1225 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1226 {
1227 	struct tcp_extend_values tmp_ext;
1228 	struct tcp_options_received tmp_opt;
1229 	u8 *hash_location;
1230 	struct request_sock *req;
1231 	struct inet_request_sock *ireq;
1232 	struct tcp_sock *tp = tcp_sk(sk);
1233 	struct dst_entry *dst = NULL;
1234 	__be32 saddr = ip_hdr(skb)->saddr;
1235 	__be32 daddr = ip_hdr(skb)->daddr;
1236 	__u32 isn = TCP_SKB_CB(skb)->when;
1237 #ifdef CONFIG_SYN_COOKIES
1238 	int want_cookie = 0;
1239 #else
1240 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1241 #endif
1242 
1243 	/* Never answer SYNs sent to broadcast or multicast */
1244 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1245 		goto drop;
1246 
1247 	/* TW buckets are converted to open requests without
1248 	 * limitations; they conserve resources and the peer is
1249 	 * evidently a real one.
1250 	 */
1251 	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1252 		if (net_ratelimit())
1253 			syn_flood_warning(skb);
1254 #ifdef CONFIG_SYN_COOKIES
1255 		if (sysctl_tcp_syncookies) {
1256 			want_cookie = 1;
1257 		} else
1258 #endif
1259 		goto drop;
1260 	}
1261 
1262 	/* The accept backlog is full. If we have already queued enough
1263 	 * warm entries in the SYN queue, drop the request. It is better than
1264 	 * clogging the SYN queue with openreqs with exponentially increasing
1265 	 * timeouts.
1266 	 */
1267 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1268 		goto drop;
1269 
1270 	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1271 	if (!req)
1272 		goto drop;
1273 
1274 #ifdef CONFIG_TCP_MD5SIG
1275 	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1276 #endif
1277 
1278 	tcp_clear_options(&tmp_opt);
1279 	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1280 	tmp_opt.user_mss  = tp->rx_opt.user_mss;
1281 	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1282 
1283 	if (tmp_opt.cookie_plus > 0 &&
1284 	    tmp_opt.saw_tstamp &&
1285 	    !tp->rx_opt.cookie_out_never &&
1286 	    (sysctl_tcp_cookie_size > 0 ||
1287 	     (tp->cookie_values != NULL &&
1288 	      tp->cookie_values->cookie_desired > 0))) {
1289 		u8 *c;
1290 		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1291 		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1292 
1293 		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1294 			goto drop_and_release;
1295 
1296 		/* Secret recipe starts with IP addresses */
1297 		*mess++ ^= (__force u32)daddr;
1298 		*mess++ ^= (__force u32)saddr;
1299 
1300 		/* plus variable length Initiator Cookie */
1301 		c = (u8 *)mess;
1302 		while (l-- > 0)
1303 			*c++ ^= *hash_location++;
1304 
1305 #ifdef CONFIG_SYN_COOKIES
1306 		want_cookie = 0;	/* not our kind of cookie */
1307 #endif
1308 		tmp_ext.cookie_out_never = 0; /* false */
1309 		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1310 	} else if (!tp->rx_opt.cookie_in_always) {
1311 		/* redundant indications, but ensure initialization. */
1312 		tmp_ext.cookie_out_never = 1; /* true */
1313 		tmp_ext.cookie_plus = 0;
1314 	} else {
1315 		goto drop_and_release;
1316 	}
1317 	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1318 
1319 	if (want_cookie && !tmp_opt.saw_tstamp)
1320 		tcp_clear_options(&tmp_opt);
1321 
1322 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1323 	tcp_openreq_init(req, &tmp_opt, skb);
1324 
1325 	ireq = inet_rsk(req);
1326 	ireq->loc_addr = daddr;
1327 	ireq->rmt_addr = saddr;
1328 	ireq->no_srccheck = inet_sk(sk)->transparent;
1329 	ireq->opt = tcp_v4_save_options(sk, skb);
1330 
1331 	if (security_inet_conn_request(sk, skb, req))
1332 		goto drop_and_free;
1333 
1334 	if (!want_cookie || tmp_opt.tstamp_ok)
1335 		TCP_ECN_create_request(req, tcp_hdr(skb));
1336 
1337 	if (want_cookie) {
1338 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1339 		req->cookie_ts = tmp_opt.tstamp_ok;
1340 	} else if (!isn) {
1341 		struct inet_peer *peer = NULL;
1342 		struct flowi4 fl4;
1343 
1344 		/* VJ's idea. We save the last timestamp seen
1345 		 * from the destination in the peer table when entering
1346 		 * state TIME-WAIT, and check against it before
1347 		 * accepting a new connection request.
1348 		 *
1349 		 * If "isn" is not zero, this request hit an alive
1350 		 * timewait bucket, so all the necessary checks
1351 		 * are made in the function processing the timewait state.
1352 		 */
1353 		if (tmp_opt.saw_tstamp &&
1354 		    tcp_death_row.sysctl_tw_recycle &&
1355 		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1356 		    fl4.daddr == saddr &&
1357 		    (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
1358 			inet_peer_refcheck(peer);
1359 			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1360 			    (s32)(peer->tcp_ts - req->ts_recent) >
1361 							TCP_PAWS_WINDOW) {
1362 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1363 				goto drop_and_release;
1364 			}
1365 		}
1366 		/* Kill the following clause, if you dislike this way. */
1367 		else if (!sysctl_tcp_syncookies &&
1368 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1369 			  (sysctl_max_syn_backlog >> 2)) &&
1370 			 (!peer || !peer->tcp_ts_stamp) &&
1371 			 (!dst || !dst_metric(dst, RTAX_RTT))) {
1372 			/* Without syncookies the last quarter of the
1373 			 * backlog is filled with destinations
1374 			 * proven to be alive.
1375 			 * It means that we continue to communicate
1376 			 * with destinations already remembered
1377 			 * at the moment of the synflood.
1378 			 */
1379 			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1380 				       &saddr, ntohs(tcp_hdr(skb)->source));
1381 			goto drop_and_release;
1382 		}
1383 
1384 		isn = tcp_v4_init_sequence(skb);
1385 	}
1386 	tcp_rsk(req)->snt_isn = isn;
1387 
1388 	if (tcp_v4_send_synack(sk, dst, req,
1389 			       (struct request_values *)&tmp_ext) ||
1390 	    want_cookie)
1391 		goto drop_and_free;
1392 
1393 	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1394 	return 0;
1395 
1396 drop_and_release:
1397 	dst_release(dst);
1398 drop_and_free:
1399 	reqsk_free(req);
1400 drop:
1401 	return 0;
1402 }
1403 EXPORT_SYMBOL(tcp_v4_conn_request);
1404 
1405 
1406 /*
1407  * The three way handshake has completed - we got a valid synack -
1408  * now create the new socket.
1409  */
1410 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1411 				  struct request_sock *req,
1412 				  struct dst_entry *dst)
1413 {
1414 	struct inet_request_sock *ireq;
1415 	struct inet_sock *newinet;
1416 	struct tcp_sock *newtp;
1417 	struct sock *newsk;
1418 #ifdef CONFIG_TCP_MD5SIG
1419 	struct tcp_md5sig_key *key;
1420 #endif
1421 	struct ip_options_rcu *inet_opt;
1422 
1423 	if (sk_acceptq_is_full(sk))
1424 		goto exit_overflow;
1425 
1426 	newsk = tcp_create_openreq_child(sk, req, skb);
1427 	if (!newsk)
1428 		goto exit_nonewsk;
1429 
1430 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1431 
1432 	newtp		      = tcp_sk(newsk);
1433 	newinet		      = inet_sk(newsk);
1434 	ireq		      = inet_rsk(req);
1435 	newinet->inet_daddr   = ireq->rmt_addr;
1436 	newinet->inet_rcv_saddr = ireq->loc_addr;
1437 	newinet->inet_saddr	      = ireq->loc_addr;
1438 	inet_opt	      = ireq->opt;
1439 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1440 	ireq->opt	      = NULL;
1441 	newinet->mc_index     = inet_iif(skb);
1442 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1443 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1444 	if (inet_opt)
1445 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1446 	newinet->inet_id = newtp->write_seq ^ jiffies;
1447 
1448 	if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
1449 		goto put_and_exit;
1450 
1451 	sk_setup_caps(newsk, dst);
1452 
1453 	tcp_mtup_init(newsk);
1454 	tcp_sync_mss(newsk, dst_mtu(dst));
1455 	newtp->advmss = dst_metric_advmss(dst);
1456 	if (tcp_sk(sk)->rx_opt.user_mss &&
1457 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1458 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1459 
1460 	tcp_initialize_rcv_mss(newsk);
1461 
1462 #ifdef CONFIG_TCP_MD5SIG
1463 	/* Copy over the MD5 key from the original socket */
1464 	key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1465 	if (key != NULL) {
1466 		/*
1467 		 * We're using one, so create a matching key
1468 		 * on the newsk structure. If we fail to get
1469 		 * memory, then we end up not copying the key
1470 		 * across. Shucks.
1471 		 */
1472 		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1473 		if (newkey != NULL)
1474 			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1475 					  newkey, key->keylen);
1476 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1477 	}
1478 #endif
1479 
1480 	if (__inet_inherit_port(sk, newsk) < 0)
1481 		goto put_and_exit;
1482 	__inet_hash_nolisten(newsk, NULL);
1483 
1484 	return newsk;
1485 
1486 exit_overflow:
1487 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1488 exit_nonewsk:
1489 	dst_release(dst);
1490 exit:
1491 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1492 	return NULL;
1493 put_and_exit:
1494 	sock_put(newsk);
1495 	goto exit;
1496 }
1497 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1498 
1499 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1500 {
1501 	struct tcphdr *th = tcp_hdr(skb);
1502 	const struct iphdr *iph = ip_hdr(skb);
1503 	struct sock *nsk;
1504 	struct request_sock **prev;
1505 	/* Find possible connection requests. */
1506 	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1507 						       iph->saddr, iph->daddr);
1508 	if (req)
1509 		return tcp_check_req(sk, skb, req, prev);
1510 
1511 	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1512 			th->source, iph->daddr, th->dest, inet_iif(skb));
1513 
1514 	if (nsk) {
1515 		if (nsk->sk_state != TCP_TIME_WAIT) {
1516 			bh_lock_sock(nsk);
1517 			return nsk;
1518 		}
1519 		inet_twsk_put(inet_twsk(nsk));
1520 		return NULL;
1521 	}
1522 
1523 #ifdef CONFIG_SYN_COOKIES
1524 	if (!th->syn)
1525 		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1526 #endif
1527 	return sk;
1528 }
1529 
1530 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1531 {
1532 	const struct iphdr *iph = ip_hdr(skb);
1533 
1534 	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1535 		if (!tcp_v4_check(skb->len, iph->saddr,
1536 				  iph->daddr, skb->csum)) {
1537 			skb->ip_summed = CHECKSUM_UNNECESSARY;
1538 			return 0;
1539 		}
1540 	}
1541 
1542 	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1543 				       skb->len, IPPROTO_TCP, 0);
1544 
1545 	if (skb->len <= 76) {
1546 		return __skb_checksum_complete(skb);
1547 	}
1548 	return 0;
1549 }
1550 
1551 
1552 /* The socket must have its spinlock held when we get
1553  * here.
1554  *
1555  * We have a potential double-lock case here, so even when
1556  * doing backlog processing we use the BH locking scheme.
1557  * This is because we cannot sleep with the original spinlock
1558  * held.
1559  */
1560 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1561 {
1562 	struct sock *rsk;
1563 #ifdef CONFIG_TCP_MD5SIG
1564 	/*
1565 	 * We really want to reject the packet as early as possible
1566 	 * if:
1567 	 *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1568 	 *  o There is an MD5 option and we're not expecting one
1569 	 */
1570 	if (tcp_v4_inbound_md5_hash(sk, skb))
1571 		goto discard;
1572 #endif
1573 
1574 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1575 		sock_rps_save_rxhash(sk, skb->rxhash);
1576 		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1577 			rsk = sk;
1578 			goto reset;
1579 		}
1580 		return 0;
1581 	}
1582 
1583 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1584 		goto csum_err;
1585 
1586 	if (sk->sk_state == TCP_LISTEN) {
1587 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1588 		if (!nsk)
1589 			goto discard;
1590 
1591 		if (nsk != sk) {
1592 			sock_rps_save_rxhash(nsk, skb->rxhash);
1593 			if (tcp_child_process(sk, nsk, skb)) {
1594 				rsk = nsk;
1595 				goto reset;
1596 			}
1597 			return 0;
1598 		}
1599 	} else
1600 		sock_rps_save_rxhash(sk, skb->rxhash);
1601 
1602 	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1603 		rsk = sk;
1604 		goto reset;
1605 	}
1606 	return 0;
1607 
1608 reset:
1609 	tcp_v4_send_reset(rsk, skb);
1610 discard:
1611 	kfree_skb(skb);
1612 	/* Be careful here. If this function gets more complicated and
1613 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1614 	 * might be destroyed here. This current version compiles correctly,
1615 	 * but you have been warned.
1616 	 */
1617 	return 0;
1618 
1619 csum_err:
1620 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1621 	goto discard;
1622 }
1623 EXPORT_SYMBOL(tcp_v4_do_rcv);
1624 
1625 /*
1626  *	From tcp_input.c
1627  */
1628 
1629 int tcp_v4_rcv(struct sk_buff *skb)
1630 {
1631 	const struct iphdr *iph;
1632 	struct tcphdr *th;
1633 	struct sock *sk;
1634 	int ret;
1635 	struct net *net = dev_net(skb->dev);
1636 
1637 	if (skb->pkt_type != PACKET_HOST)
1638 		goto discard_it;
1639 
1640 	/* Count it even if it's bad */
1641 	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1642 
1643 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1644 		goto discard_it;
1645 
1646 	th = tcp_hdr(skb);
1647 
1648 	if (th->doff < sizeof(struct tcphdr) / 4)
1649 		goto bad_packet;
1650 	if (!pskb_may_pull(skb, th->doff * 4))
1651 		goto discard_it;
1652 
1653 	/* An explanation is required here, I think.
1654 	 * Packet length and doff are validated by header prediction,
1655 	 * provided the case of th->doff==0 is eliminated.
1656 	 * So, we defer the checks. */
1657 	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1658 		goto bad_packet;
1659 
1660 	th = tcp_hdr(skb);
1661 	iph = ip_hdr(skb);
1662 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1663 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1664 				    skb->len - th->doff * 4);
1665 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1666 	TCP_SKB_CB(skb)->when	 = 0;
1667 	TCP_SKB_CB(skb)->flags	 = iph->tos;
1668 	TCP_SKB_CB(skb)->sacked	 = 0;
1669 
1670 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1671 	if (!sk)
1672 		goto no_tcp_socket;
1673 
1674 process:
1675 	if (sk->sk_state == TCP_TIME_WAIT)
1676 		goto do_time_wait;
1677 
1678 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1679 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1680 		goto discard_and_relse;
1681 	}
1682 
1683 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1684 		goto discard_and_relse;
1685 	nf_reset(skb);
1686 
1687 	if (sk_filter(sk, skb))
1688 		goto discard_and_relse;
1689 
1690 	skb->dev = NULL;
1691 
1692 	bh_lock_sock_nested(sk);
1693 	ret = 0;
1694 	if (!sock_owned_by_user(sk)) {
1695 #ifdef CONFIG_NET_DMA
1696 		struct tcp_sock *tp = tcp_sk(sk);
1697 		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1698 			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1699 		if (tp->ucopy.dma_chan)
1700 			ret = tcp_v4_do_rcv(sk, skb);
1701 		else
1702 #endif
1703 		{
1704 			if (!tcp_prequeue(sk, skb))
1705 				ret = tcp_v4_do_rcv(sk, skb);
1706 		}
1707 	} else if (unlikely(sk_add_backlog(sk, skb))) {
1708 		bh_unlock_sock(sk);
1709 		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1710 		goto discard_and_relse;
1711 	}
1712 	bh_unlock_sock(sk);
1713 
1714 	sock_put(sk);
1715 
1716 	return ret;
1717 
1718 no_tcp_socket:
1719 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1720 		goto discard_it;
1721 
1722 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1723 bad_packet:
1724 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1725 	} else {
1726 		tcp_v4_send_reset(NULL, skb);
1727 	}
1728 
1729 discard_it:
1730 	/* Discard frame. */
1731 	kfree_skb(skb);
1732 	return 0;
1733 
1734 discard_and_relse:
1735 	sock_put(sk);
1736 	goto discard_it;
1737 
1738 do_time_wait:
1739 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1740 		inet_twsk_put(inet_twsk(sk));
1741 		goto discard_it;
1742 	}
1743 
1744 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1745 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1746 		inet_twsk_put(inet_twsk(sk));
1747 		goto discard_it;
1748 	}
1749 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1750 	case TCP_TW_SYN: {
1751 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1752 							&tcp_hashinfo,
1753 							iph->daddr, th->dest,
1754 							inet_iif(skb));
1755 		if (sk2) {
1756 			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1757 			inet_twsk_put(inet_twsk(sk));
1758 			sk = sk2;
1759 			goto process;
1760 		}
1761 		/* Fall through to ACK */
1762 	}
1763 	case TCP_TW_ACK:
1764 		tcp_v4_timewait_ack(sk, skb);
1765 		break;
1766 	case TCP_TW_RST:
1767 		goto no_tcp_socket;
1768 	case TCP_TW_SUCCESS:;
1769 	}
1770 	goto discard_it;
1771 }
1772 
1773 struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1774 {
1775 	struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1776 	struct inet_sock *inet = inet_sk(sk);
1777 	struct inet_peer *peer;
1778 
1779 	if (!rt ||
1780 	    inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1781 		peer = inet_getpeer_v4(inet->inet_daddr, 1);
1782 		*release_it = true;
1783 	} else {
1784 		if (!rt->peer)
1785 			rt_bind_peer(rt, inet->inet_daddr, 1);
1786 		peer = rt->peer;
1787 		*release_it = false;
1788 	}
1789 
1790 	return peer;
1791 }
1792 EXPORT_SYMBOL(tcp_v4_get_peer);
1793 
1794 void *tcp_v4_tw_get_peer(struct sock *sk)
1795 {
1796 	struct inet_timewait_sock *tw = inet_twsk(sk);
1797 
1798 	return inet_getpeer_v4(tw->tw_daddr, 1);
1799 }
1800 EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1801 
1802 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1803 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1804 	.twsk_unique	= tcp_twsk_unique,
1805 	.twsk_destructor= tcp_twsk_destructor,
1806 	.twsk_getpeer	= tcp_v4_tw_get_peer,
1807 };
1808 
1809 const struct inet_connection_sock_af_ops ipv4_specific = {
1810 	.queue_xmit	   = ip_queue_xmit,
1811 	.send_check	   = tcp_v4_send_check,
1812 	.rebuild_header	   = inet_sk_rebuild_header,
1813 	.conn_request	   = tcp_v4_conn_request,
1814 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1815 	.get_peer	   = tcp_v4_get_peer,
1816 	.net_header_len	   = sizeof(struct iphdr),
1817 	.setsockopt	   = ip_setsockopt,
1818 	.getsockopt	   = ip_getsockopt,
1819 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1820 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1821 	.bind_conflict	   = inet_csk_bind_conflict,
1822 #ifdef CONFIG_COMPAT
1823 	.compat_setsockopt = compat_ip_setsockopt,
1824 	.compat_getsockopt = compat_ip_getsockopt,
1825 #endif
1826 };
1827 EXPORT_SYMBOL(ipv4_specific);
1828 
1829 #ifdef CONFIG_TCP_MD5SIG
1830 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1831 	.md5_lookup		= tcp_v4_md5_lookup,
1832 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1833 	.md5_add		= tcp_v4_md5_add_func,
1834 	.md5_parse		= tcp_v4_parse_md5_keys,
1835 };
1836 #endif
1837 
1838 /* NOTE: A lot of things are set to zero explicitly by the call to
1839  *       sk_alloc(), so they need not be done here.
1840  */
1841 static int tcp_v4_init_sock(struct sock *sk)
1842 {
1843 	struct inet_connection_sock *icsk = inet_csk(sk);
1844 	struct tcp_sock *tp = tcp_sk(sk);
1845 
1846 	skb_queue_head_init(&tp->out_of_order_queue);
1847 	tcp_init_xmit_timers(sk);
1848 	tcp_prequeue_init(tp);
1849 
1850 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1851 	tp->mdev = TCP_TIMEOUT_INIT;
1852 
1853 	/* So many TCP implementations out there (incorrectly) count the
1854 	 * initial SYN frame in their delayed-ACK and congestion control
1855 	 * algorithms that we must have the following bandaid to talk
1856 	 * efficiently to them.  -DaveM
1857 	 */
1858 	tp->snd_cwnd = 2;
1859 
1860 	/* See draft-stevens-tcpca-spec-01 for discussion of the
1861 	 * initialization of these values.
1862 	 */
1863 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1864 	tp->snd_cwnd_clamp = ~0;
1865 	tp->mss_cache = TCP_MSS_DEFAULT;
1866 
1867 	tp->reordering = sysctl_tcp_reordering;
1868 	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1869 
1870 	sk->sk_state = TCP_CLOSE;
1871 
1872 	sk->sk_write_space = sk_stream_write_space;
1873 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1874 
1875 	icsk->icsk_af_ops = &ipv4_specific;
1876 	icsk->icsk_sync_mss = tcp_sync_mss;
1877 #ifdef CONFIG_TCP_MD5SIG
1878 	tp->af_specific = &tcp_sock_ipv4_specific;
1879 #endif
1880 
1881 	/* TCP Cookie Transactions */
1882 	if (sysctl_tcp_cookie_size > 0) {
1883 		/* Default, cookies without s_data_payload. */
1884 		tp->cookie_values =
1885 			kzalloc(sizeof(*tp->cookie_values),
1886 				sk->sk_allocation);
1887 		if (tp->cookie_values != NULL)
1888 			kref_init(&tp->cookie_values->kref);
1889 	}
1890 	/* Presumed zeroed, in order of appearance:
1891 	 *	cookie_in_always, cookie_out_never,
1892 	 *	s_data_constant, s_data_in, s_data_out
1893 	 */
1894 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1895 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1896 
1897 	local_bh_disable();
1898 	percpu_counter_inc(&tcp_sockets_allocated);
1899 	local_bh_enable();
1900 
1901 	return 0;
1902 }
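/*
 * Editor's note -- sketch of how the ->init() hook above is reached; not
 * part of this file.  inet_create() in net/ipv4/af_inet.c calls
 * sk->sk_prot->init(sk) once the socket has been allocated, so every
 * socket(AF_INET, SOCK_STREAM, IPPROTO_TCP) call ends up in
 * tcp_v4_init_sock() via tcp_prot.init (see tcp_prot further down).
 * Roughly:
 */
#if 0
	if (sk->sk_prot->init) {
		err = sk->sk_prot->init(sk);
		if (err)
			sk_common_release(sk);
	}
#endif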
1903 
1904 void tcp_v4_destroy_sock(struct sock *sk)
1905 {
1906 	struct tcp_sock *tp = tcp_sk(sk);
1907 
1908 	tcp_clear_xmit_timers(sk);
1909 
1910 	tcp_cleanup_congestion_control(sk);
1911 
1912 	/* Clean up the write buffer. */
1913 	tcp_write_queue_purge(sk);
1914 
1915 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1916 	__skb_queue_purge(&tp->out_of_order_queue);
1917 
1918 #ifdef CONFIG_TCP_MD5SIG
1919 	/* Clean up the MD5 key list, if any */
1920 	if (tp->md5sig_info) {
1921 		tcp_v4_clear_md5_list(sk);
1922 		kfree(tp->md5sig_info);
1923 		tp->md5sig_info = NULL;
1924 	}
1925 #endif
1926 
1927 #ifdef CONFIG_NET_DMA
1928 	/* Cleans up our sk_async_wait_queue */
1929 	__skb_queue_purge(&sk->sk_async_wait_queue);
1930 #endif
1931 
1932 	/* Clean up the prequeue; it should already be empty. */
1933 	__skb_queue_purge(&tp->ucopy.prequeue);
1934 
1935 	/* Clean up a referenced TCP bind bucket. */
1936 	if (inet_csk(sk)->icsk_bind_hash)
1937 		inet_put_port(sk);
1938 
1939 	/*
1940 	 * If a cached sendmsg page exists, toss it.
1941 	 */
1942 	if (sk->sk_sndmsg_page) {
1943 		__free_page(sk->sk_sndmsg_page);
1944 		sk->sk_sndmsg_page = NULL;
1945 	}
1946 
1947 	/* TCP Cookie Transactions */
1948 	if (tp->cookie_values != NULL) {
1949 		kref_put(&tp->cookie_values->kref,
1950 			 tcp_cookie_values_release);
1951 		tp->cookie_values = NULL;
1952 	}
1953 
1954 	percpu_counter_dec(&tcp_sockets_allocated);
1955 }
1956 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1957 
1958 #ifdef CONFIG_PROC_FS
1959 /* Proc filesystem TCP sock list dumping. */
1960 
1961 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1962 {
1963 	return hlist_nulls_empty(head) ? NULL :
1964 		list_entry(head->first, struct inet_timewait_sock, tw_node);
1965 }
1966 
1967 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1968 {
1969 	return !is_a_nulls(tw->tw_node.next) ?
1970 		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1971 }
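/*
 * Editor's note -- background sketch, not part of the original file.  The
 * two helpers above walk hlist_nulls chains: instead of a plain NULL, each
 * chain is terminated by a "nulls" marker (an odd pointer value encoding
 * the bucket), which lets lockless RCU readers detect that they have been
 * moved onto a different chain mid-walk.  is_a_nulls(), used in tw_next(),
 * is essentially just that bit test:
 */
#if 0
/* roughly what include/linux/list_nulls.h provides */
static inline int is_a_nulls(const struct hlist_nulls_node *ptr)
{
	return ((unsigned long)ptr & 1);
}
#endif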
1972 
1973 /*
1974  * Get the next listener socket following cur.  If cur is NULL, get the
1975  * first socket starting from the bucket given in st->bucket; when st->bucket
1976  * is zero, the very first socket in the hash table is returned.
1977  */
1978 static void *listening_get_next(struct seq_file *seq, void *cur)
1979 {
1980 	struct inet_connection_sock *icsk;
1981 	struct hlist_nulls_node *node;
1982 	struct sock *sk = cur;
1983 	struct inet_listen_hashbucket *ilb;
1984 	struct tcp_iter_state *st = seq->private;
1985 	struct net *net = seq_file_net(seq);
1986 
1987 	if (!sk) {
1988 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1989 		spin_lock_bh(&ilb->lock);
1990 		sk = sk_nulls_head(&ilb->head);
1991 		st->offset = 0;
1992 		goto get_sk;
1993 	}
1994 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1995 	++st->num;
1996 	++st->offset;
1997 
1998 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1999 		struct request_sock *req = cur;
2000 
2001 		icsk = inet_csk(st->syn_wait_sk);
2002 		req = req->dl_next;
2003 		while (1) {
2004 			while (req) {
2005 				if (req->rsk_ops->family == st->family) {
2006 					cur = req;
2007 					goto out;
2008 				}
2009 				req = req->dl_next;
2010 			}
2011 			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2012 				break;
2013 get_req:
2014 			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2015 		}
2016 		sk	  = sk_nulls_next(st->syn_wait_sk);
2017 		st->state = TCP_SEQ_STATE_LISTENING;
2018 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2019 	} else {
2020 		icsk = inet_csk(sk);
2021 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2022 		if (reqsk_queue_len(&icsk->icsk_accept_queue))
2023 			goto start_req;
2024 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2025 		sk = sk_nulls_next(sk);
2026 	}
2027 get_sk:
2028 	sk_nulls_for_each_from(sk, node) {
2029 		if (!net_eq(sock_net(sk), net))
2030 			continue;
2031 		if (sk->sk_family == st->family) {
2032 			cur = sk;
2033 			goto out;
2034 		}
2035 		icsk = inet_csk(sk);
2036 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2037 		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2038 start_req:
2039 			st->uid		= sock_i_uid(sk);
2040 			st->syn_wait_sk = sk;
2041 			st->state	= TCP_SEQ_STATE_OPENREQ;
2042 			st->sbucket	= 0;
2043 			goto get_req;
2044 		}
2045 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2046 	}
2047 	spin_unlock_bh(&ilb->lock);
2048 	st->offset = 0;
2049 	if (++st->bucket < INET_LHTABLE_SIZE) {
2050 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2051 		spin_lock_bh(&ilb->lock);
2052 		sk = sk_nulls_head(&ilb->head);
2053 		goto get_sk;
2054 	}
2055 	cur = NULL;
2056 out:
2057 	return cur;
2058 }
2059 
2060 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2061 {
2062 	struct tcp_iter_state *st = seq->private;
2063 	void *rc;
2064 
2065 	st->bucket = 0;
2066 	st->offset = 0;
2067 	rc = listening_get_next(seq, NULL);
2068 
2069 	while (rc && *pos) {
2070 		rc = listening_get_next(seq, rc);
2071 		--*pos;
2072 	}
2073 	return rc;
2074 }
2075 
2076 static inline int empty_bucket(struct tcp_iter_state *st)
2077 {
2078 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2079 		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2080 }
2081 
2082 /*
2083  * Get the first established socket, starting from the bucket given in
2084  * st->bucket.  If st->bucket is zero, the very first socket in the hash is returned.
2085  */
2086 static void *established_get_first(struct seq_file *seq)
2087 {
2088 	struct tcp_iter_state *st = seq->private;
2089 	struct net *net = seq_file_net(seq);
2090 	void *rc = NULL;
2091 
2092 	st->offset = 0;
2093 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2094 		struct sock *sk;
2095 		struct hlist_nulls_node *node;
2096 		struct inet_timewait_sock *tw;
2097 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2098 
2099 		/* Lockless fast path for the common case of empty buckets */
2100 		if (empty_bucket(st))
2101 			continue;
2102 
2103 		spin_lock_bh(lock);
2104 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2105 			if (sk->sk_family != st->family ||
2106 			    !net_eq(sock_net(sk), net)) {
2107 				continue;
2108 			}
2109 			rc = sk;
2110 			goto out;
2111 		}
2112 		st->state = TCP_SEQ_STATE_TIME_WAIT;
2113 		inet_twsk_for_each(tw, node,
2114 				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2115 			if (tw->tw_family != st->family ||
2116 			    !net_eq(twsk_net(tw), net)) {
2117 				continue;
2118 			}
2119 			rc = tw;
2120 			goto out;
2121 		}
2122 		spin_unlock_bh(lock);
2123 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2124 	}
2125 out:
2126 	return rc;
2127 }
2128 
2129 static void *established_get_next(struct seq_file *seq, void *cur)
2130 {
2131 	struct sock *sk = cur;
2132 	struct inet_timewait_sock *tw;
2133 	struct hlist_nulls_node *node;
2134 	struct tcp_iter_state *st = seq->private;
2135 	struct net *net = seq_file_net(seq);
2136 
2137 	++st->num;
2138 	++st->offset;
2139 
2140 	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2141 		tw = cur;
2142 		tw = tw_next(tw);
2143 get_tw:
2144 		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2145 			tw = tw_next(tw);
2146 		}
2147 		if (tw) {
2148 			cur = tw;
2149 			goto out;
2150 		}
2151 		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2152 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2153 
2154 		/* Look for the next non-empty bucket */
2155 		st->offset = 0;
2156 		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2157 				empty_bucket(st))
2158 			;
2159 		if (st->bucket > tcp_hashinfo.ehash_mask)
2160 			return NULL;
2161 
2162 		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2163 		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2164 	} else
2165 		sk = sk_nulls_next(sk);
2166 
2167 	sk_nulls_for_each_from(sk, node) {
2168 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2169 			goto found;
2170 	}
2171 
2172 	st->state = TCP_SEQ_STATE_TIME_WAIT;
2173 	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2174 	goto get_tw;
2175 found:
2176 	cur = sk;
2177 out:
2178 	return cur;
2179 }
2180 
2181 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2182 {
2183 	struct tcp_iter_state *st = seq->private;
2184 	void *rc;
2185 
2186 	st->bucket = 0;
2187 	rc = established_get_first(seq);
2188 
2189 	while (rc && pos) {
2190 		rc = established_get_next(seq, rc);
2191 		--pos;
2192 	}
2193 	return rc;
2194 }
2195 
2196 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2197 {
2198 	void *rc;
2199 	struct tcp_iter_state *st = seq->private;
2200 
2201 	st->state = TCP_SEQ_STATE_LISTENING;
2202 	rc	  = listening_get_idx(seq, &pos);
2203 
2204 	if (!rc) {
2205 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2206 		rc	  = established_get_idx(seq, pos);
2207 	}
2208 
2209 	return rc;
2210 }
2211 
2212 static void *tcp_seek_last_pos(struct seq_file *seq)
2213 {
2214 	struct tcp_iter_state *st = seq->private;
2215 	int offset = st->offset;
2216 	int orig_num = st->num;
2217 	void *rc = NULL;
2218 
2219 	switch (st->state) {
2220 	case TCP_SEQ_STATE_OPENREQ:
2221 	case TCP_SEQ_STATE_LISTENING:
2222 		if (st->bucket >= INET_LHTABLE_SIZE)
2223 			break;
2224 		st->state = TCP_SEQ_STATE_LISTENING;
2225 		rc = listening_get_next(seq, NULL);
2226 		while (offset-- && rc)
2227 			rc = listening_get_next(seq, rc);
2228 		if (rc)
2229 			break;
2230 		st->bucket = 0;
2231 		/* Fallthrough */
2232 	case TCP_SEQ_STATE_ESTABLISHED:
2233 	case TCP_SEQ_STATE_TIME_WAIT:
2234 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2235 		if (st->bucket > tcp_hashinfo.ehash_mask)
2236 			break;
2237 		rc = established_get_first(seq);
2238 		while (offset-- && rc)
2239 			rc = established_get_next(seq, rc);
2240 	}
2241 
2242 	st->num = orig_num;
2243 
2244 	return rc;
2245 }
2246 
2247 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2248 {
2249 	struct tcp_iter_state *st = seq->private;
2250 	void *rc;
2251 
2252 	if (*pos && *pos == st->last_pos) {
2253 		rc = tcp_seek_last_pos(seq);
2254 		if (rc)
2255 			goto out;
2256 	}
2257 
2258 	st->state = TCP_SEQ_STATE_LISTENING;
2259 	st->num = 0;
2260 	st->bucket = 0;
2261 	st->offset = 0;
2262 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2263 
2264 out:
2265 	st->last_pos = *pos;
2266 	return rc;
2267 }
2268 
2269 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2270 {
2271 	struct tcp_iter_state *st = seq->private;
2272 	void *rc = NULL;
2273 
2274 	if (v == SEQ_START_TOKEN) {
2275 		rc = tcp_get_idx(seq, 0);
2276 		goto out;
2277 	}
2278 
2279 	switch (st->state) {
2280 	case TCP_SEQ_STATE_OPENREQ:
2281 	case TCP_SEQ_STATE_LISTENING:
2282 		rc = listening_get_next(seq, v);
2283 		if (!rc) {
2284 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2285 			st->bucket = 0;
2286 			st->offset = 0;
2287 			rc	  = established_get_first(seq);
2288 		}
2289 		break;
2290 	case TCP_SEQ_STATE_ESTABLISHED:
2291 	case TCP_SEQ_STATE_TIME_WAIT:
2292 		rc = established_get_next(seq, v);
2293 		break;
2294 	}
2295 out:
2296 	++*pos;
2297 	st->last_pos = *pos;
2298 	return rc;
2299 }
2300 
2301 static void tcp_seq_stop(struct seq_file *seq, void *v)
2302 {
2303 	struct tcp_iter_state *st = seq->private;
2304 
2305 	switch (st->state) {
2306 	case TCP_SEQ_STATE_OPENREQ:
2307 		if (v) {
2308 			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2309 			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2310 		}
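		/* Fall through: the listening-hash bucket lock is still held. */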
2311 	case TCP_SEQ_STATE_LISTENING:
2312 		if (v != SEQ_START_TOKEN)
2313 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2314 		break;
2315 	case TCP_SEQ_STATE_TIME_WAIT:
2316 	case TCP_SEQ_STATE_ESTABLISHED:
2317 		if (v)
2318 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2319 		break;
2320 	}
2321 }
2322 
2323 static int tcp_seq_open(struct inode *inode, struct file *file)
2324 {
2325 	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2326 	struct tcp_iter_state *s;
2327 	int err;
2328 
2329 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2330 			  sizeof(struct tcp_iter_state));
2331 	if (err < 0)
2332 		return err;
2333 
2334 	s = ((struct seq_file *)file->private_data)->private;
2335 	s->family		= afinfo->family;
2336 	s->last_pos 		= 0;
2337 	return 0;
2338 }
2339 
2340 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2341 {
2342 	int rc = 0;
2343 	struct proc_dir_entry *p;
2344 
2345 	afinfo->seq_fops.open		= tcp_seq_open;
2346 	afinfo->seq_fops.read		= seq_read;
2347 	afinfo->seq_fops.llseek		= seq_lseek;
2348 	afinfo->seq_fops.release	= seq_release_net;
2349 
2350 	afinfo->seq_ops.start		= tcp_seq_start;
2351 	afinfo->seq_ops.next		= tcp_seq_next;
2352 	afinfo->seq_ops.stop		= tcp_seq_stop;
2353 
2354 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2355 			     &afinfo->seq_fops, afinfo);
2356 	if (!p)
2357 		rc = -ENOMEM;
2358 	return rc;
2359 }
2360 EXPORT_SYMBOL(tcp_proc_register);
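/*
 * Editor's note -- illustrative sketch of the registration pattern this
 * helper expects; the names below are hypothetical and the field values
 * abbreviated.  tcp4_seq_afinfo/tcp4_proc_init_net() further down in this
 * file (and tcp6_seq_afinfo in net/ipv6/tcp_ipv6.c) are the real users.
 */
#if 0
static struct tcp_seq_afinfo example_seq_afinfo = {
	.name		= "tcp_example",	/* appears as /proc/net/tcp_example */
	.family		= AF_INET,
	.seq_fops	= { .owner = THIS_MODULE },
	.seq_ops	= { .show = tcp4_seq_show },
};

static int __net_init example_proc_init_net(struct net *net)
{
	return tcp_proc_register(net, &example_seq_afinfo);
}
#endif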
2361 
2362 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2363 {
2364 	proc_net_remove(net, afinfo->name);
2365 }
2366 EXPORT_SYMBOL(tcp_proc_unregister);
2367 
2368 static void get_openreq4(struct sock *sk, struct request_sock *req,
2369 			 struct seq_file *f, int i, int uid, int *len)
2370 {
2371 	const struct inet_request_sock *ireq = inet_rsk(req);
2372 	int ttd = req->expires - jiffies;
2373 
2374 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2375 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2376 		i,
2377 		ireq->loc_addr,
2378 		ntohs(inet_sk(sk)->inet_sport),
2379 		ireq->rmt_addr,
2380 		ntohs(ireq->rmt_port),
2381 		TCP_SYN_RECV,
2382 		0, 0, /* could print option size, but that is af dependent. */
2383 		1,    /* timers active (only the expire timer) */
2384 		jiffies_to_clock_t(ttd),
2385 		req->retrans,
2386 		uid,
2387 		0,  /* non standard timer */
2388 		0, /* open_requests have no inode */
2389 		atomic_read(&sk->sk_refcnt),
2390 		req,
2391 		len);
2392 }
2393 
2394 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2395 {
2396 	int timer_active;
2397 	unsigned long timer_expires;
2398 	struct tcp_sock *tp = tcp_sk(sk);
2399 	const struct inet_connection_sock *icsk = inet_csk(sk);
2400 	struct inet_sock *inet = inet_sk(sk);
2401 	__be32 dest = inet->inet_daddr;
2402 	__be32 src = inet->inet_rcv_saddr;
2403 	__u16 destp = ntohs(inet->inet_dport);
2404 	__u16 srcp = ntohs(inet->inet_sport);
2405 	int rx_queue;
2406 
2407 	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2408 		timer_active	= 1;
2409 		timer_expires	= icsk->icsk_timeout;
2410 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2411 		timer_active	= 4;
2412 		timer_expires	= icsk->icsk_timeout;
2413 	} else if (timer_pending(&sk->sk_timer)) {
2414 		timer_active	= 2;
2415 		timer_expires	= sk->sk_timer.expires;
2416 	} else {
2417 		timer_active	= 0;
2418 		timer_expires = jiffies;
2419 	}
2420 
2421 	if (sk->sk_state == TCP_LISTEN)
2422 		rx_queue = sk->sk_ack_backlog;
2423 	else
2424 		/*
2425 		 * Because we don't lock the socket, we might find a transient negative value.
2426 		 */
2427 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2428 
2429 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2430 			"%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2431 		i, src, srcp, dest, destp, sk->sk_state,
2432 		tp->write_seq - tp->snd_una,
2433 		rx_queue,
2434 		timer_active,
2435 		jiffies_to_clock_t(timer_expires - jiffies),
2436 		icsk->icsk_retransmits,
2437 		sock_i_uid(sk),
2438 		icsk->icsk_probes_out,
2439 		sock_i_ino(sk),
2440 		atomic_read(&sk->sk_refcnt), sk,
2441 		jiffies_to_clock_t(icsk->icsk_rto),
2442 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2443 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2444 		tp->snd_cwnd,
2445 		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2446 		len);
2447 }
2448 
2449 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2450 			       struct seq_file *f, int i, int *len)
2451 {
2452 	__be32 dest, src;
2453 	__u16 destp, srcp;
2454 	int ttd = tw->tw_ttd - jiffies;
2455 
2456 	if (ttd < 0)
2457 		ttd = 0;
2458 
2459 	dest  = tw->tw_daddr;
2460 	src   = tw->tw_rcv_saddr;
2461 	destp = ntohs(tw->tw_dport);
2462 	srcp  = ntohs(tw->tw_sport);
2463 
2464 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2465 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2466 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2467 		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2468 		atomic_read(&tw->tw_refcnt), tw, len);
2469 }
2470 
2471 #define TMPSZ 150
2472 
2473 static int tcp4_seq_show(struct seq_file *seq, void *v)
2474 {
2475 	struct tcp_iter_state *st;
2476 	int len;
2477 
2478 	if (v == SEQ_START_TOKEN) {
2479 		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2480 			   "  sl  local_address rem_address   st tx_queue "
2481 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2482 			   "inode");
2483 		goto out;
2484 	}
2485 	st = seq->private;
2486 
2487 	switch (st->state) {
2488 	case TCP_SEQ_STATE_LISTENING:
2489 	case TCP_SEQ_STATE_ESTABLISHED:
2490 		get_tcp4_sock(v, seq, st->num, &len);
2491 		break;
2492 	case TCP_SEQ_STATE_OPENREQ:
2493 		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2494 		break;
2495 	case TCP_SEQ_STATE_TIME_WAIT:
2496 		get_timewait4_sock(v, seq, st->num, &len);
2497 		break;
2498 	}
2499 	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2500 out:
2501 	return 0;
2502 }
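/*
 * Editor's note -- worked example of the output format, with made-up
 * values.  Addresses are hex in network byte order and ports are hex in
 * host byte order, so "0100007F:0016" is 127.0.0.1:22 on a little-endian
 * box; "st" is the TCP state (0A == TCP_LISTEN) and "tr" is the timer
 * code used above (0 none, 1 retransmit, 2 sk_timer/keepalive,
 * 3 TIME_WAIT, 4 zero-window probe).
 *
 *   sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode
 *    0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 4711 1 ffff88003c1a2b00 300 0 0 2 -1
 */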
2503 
2504 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2505 	.name		= "tcp",
2506 	.family		= AF_INET,
2507 	.seq_fops	= {
2508 		.owner		= THIS_MODULE,
2509 	},
2510 	.seq_ops	= {
2511 		.show		= tcp4_seq_show,
2512 	},
2513 };
2514 
2515 static int __net_init tcp4_proc_init_net(struct net *net)
2516 {
2517 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2518 }
2519 
2520 static void __net_exit tcp4_proc_exit_net(struct net *net)
2521 {
2522 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2523 }
2524 
2525 static struct pernet_operations tcp4_net_ops = {
2526 	.init = tcp4_proc_init_net,
2527 	.exit = tcp4_proc_exit_net,
2528 };
2529 
2530 int __init tcp4_proc_init(void)
2531 {
2532 	return register_pernet_subsys(&tcp4_net_ops);
2533 }
2534 
2535 void tcp4_proc_exit(void)
2536 {
2537 	unregister_pernet_subsys(&tcp4_net_ops);
2538 }
2539 #endif /* CONFIG_PROC_FS */
2540 
2541 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2542 {
2543 	const struct iphdr *iph = skb_gro_network_header(skb);
2544 
2545 	switch (skb->ip_summed) {
2546 	case CHECKSUM_COMPLETE:
2547 		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2548 				  skb->csum)) {
2549 			skb->ip_summed = CHECKSUM_UNNECESSARY;
2550 			break;
2551 		}
2552 
2553 		/* fall through */
2554 	case CHECKSUM_NONE:
2555 		NAPI_GRO_CB(skb)->flush = 1;
2556 		return NULL;
2557 	}
2558 
2559 	return tcp_gro_receive(head, skb);
2560 }
2561 
2562 int tcp4_gro_complete(struct sk_buff *skb)
2563 {
2564 	const struct iphdr *iph = ip_hdr(skb);
2565 	struct tcphdr *th = tcp_hdr(skb);
2566 
2567 	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2568 				  iph->saddr, iph->daddr, 0);
2569 	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2570 
2571 	return tcp_gro_complete(skb);
2572 }
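/*
 * Editor's note -- sketch of what the checksum seeding above amounts to;
 * illustrative only.  tcp_v4_check() is csum_tcpudp_magic() over the TCP
 * pseudo-header, and storing its complement in th->check leaves the merged
 * packet in the same state as a locally generated CHECKSUM_PARTIAL/GSO
 * packet, so the normal GSO path can finish the checksum per segment.
 */
#if 0
	th->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
				       skb->len - skb_transport_offset(skb),
				       IPPROTO_TCP, 0);
#endif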
2573 
2574 struct proto tcp_prot = {
2575 	.name			= "TCP",
2576 	.owner			= THIS_MODULE,
2577 	.close			= tcp_close,
2578 	.connect		= tcp_v4_connect,
2579 	.disconnect		= tcp_disconnect,
2580 	.accept			= inet_csk_accept,
2581 	.ioctl			= tcp_ioctl,
2582 	.init			= tcp_v4_init_sock,
2583 	.destroy		= tcp_v4_destroy_sock,
2584 	.shutdown		= tcp_shutdown,
2585 	.setsockopt		= tcp_setsockopt,
2586 	.getsockopt		= tcp_getsockopt,
2587 	.recvmsg		= tcp_recvmsg,
2588 	.sendmsg		= tcp_sendmsg,
2589 	.sendpage		= tcp_sendpage,
2590 	.backlog_rcv		= tcp_v4_do_rcv,
2591 	.hash			= inet_hash,
2592 	.unhash			= inet_unhash,
2593 	.get_port		= inet_csk_get_port,
2594 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2595 	.sockets_allocated	= &tcp_sockets_allocated,
2596 	.orphan_count		= &tcp_orphan_count,
2597 	.memory_allocated	= &tcp_memory_allocated,
2598 	.memory_pressure	= &tcp_memory_pressure,
2599 	.sysctl_mem		= sysctl_tcp_mem,
2600 	.sysctl_wmem		= sysctl_tcp_wmem,
2601 	.sysctl_rmem		= sysctl_tcp_rmem,
2602 	.max_header		= MAX_TCP_HEADER,
2603 	.obj_size		= sizeof(struct tcp_sock),
2604 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2605 	.twsk_prot		= &tcp_timewait_sock_ops,
2606 	.rsk_prot		= &tcp_request_sock_ops,
2607 	.h.hashinfo		= &tcp_hashinfo,
2608 	.no_autobind		= true,
2609 #ifdef CONFIG_COMPAT
2610 	.compat_setsockopt	= compat_tcp_setsockopt,
2611 	.compat_getsockopt	= compat_tcp_getsockopt,
2612 #endif
2613 };
2614 EXPORT_SYMBOL(tcp_prot);
2615 
2616 
2617 static int __net_init tcp_sk_init(struct net *net)
2618 {
2619 	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2620 				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2621 }
2622 
2623 static void __net_exit tcp_sk_exit(struct net *net)
2624 {
2625 	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2626 }
2627 
2628 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2629 {
2630 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2631 }
2632 
2633 static struct pernet_operations __net_initdata tcp_sk_ops = {
2634        .init	   = tcp_sk_init,
2635        .exit	   = tcp_sk_exit,
2636        .exit_batch = tcp_sk_exit_batch,
2637 };
2638 
2639 void __init tcp_v4_init(void)
2640 {
2641 	inet_hashinfo_init(&tcp_hashinfo);
2642 	if (register_pernet_subsys(&tcp_sk_ops))
2643 		panic("Failed to create the TCP control socket.\n");
2644 }
2645