xref: /illumos-gate/usr/src/uts/common/inet/tcp/tcp_time_wait.c (revision dcbf3bd6a1f1360fc1afcee9e22c6dcff7844bf2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2016 Joyent, Inc.
25  */
26 
27 /*
28  * This file contains functions related to TCP time wait processing.  Also
29  * refer to the time wait handling comments in tcp_impl.h.
30  */
31 
32 #include <sys/types.h>
33 #include <sys/strsun.h>
34 #include <sys/squeue_impl.h>
35 #include <sys/squeue.h>
36 #include <sys/callo.h>
37 
38 #include <inet/common.h>
39 #include <inet/ip.h>
40 #include <inet/tcp.h>
41 #include <inet/tcp_impl.h>
42 #include <inet/tcp_cluster.h>
43 
/* Forward declaration; the definition appears later in this file. */
static void tcp_time_wait_purge(tcp_t *, tcp_squeue_priv_t *);

/*
 * Map a time value 't' (in ticks) to an index into the per-squeue
 * tcp_time_wait_bucket[] array.  Each bucket spans
 * MSEC_TO_TICK(TCP_TIME_WAIT_DELAY) ticks and the index wraps around after
 * TCP_TIME_WAIT_BUCKETS buckets.
 */
#define	TW_BUCKET(t)					\
	(((t) / MSEC_TO_TICK(TCP_TIME_WAIT_DELAY)) % TCP_TIME_WAIT_BUCKETS)

/* Index of the bucket following 'b', wrapping back to 0 past the last one. */
#define	TW_BUCKET_NEXT(b)	(((b) + 1) % TCP_TIME_WAIT_BUCKETS)
50 
51 
52 /*
53  * Remove a connection from the list of detached TIME_WAIT connections.
54  * It returns B_FALSE if it can't remove the connection from the list
55  * as the connection has already been removed from the list due to an
56  * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE.
57  */
58 boolean_t
59 tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tsp)
60 {
61 	boolean_t	locked = B_FALSE;
62 
63 	if (tsp == NULL) {
64 		tsp = *((tcp_squeue_priv_t **)
65 		    squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
66 		mutex_enter(&tsp->tcp_time_wait_lock);
67 		locked = B_TRUE;
68 	} else {
69 		ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock));
70 	}
71 
72 	/* 0 means that the tcp_t has not been added to the time wait list. */
73 	if (tcp->tcp_time_wait_expire == 0) {
74 		ASSERT(tcp->tcp_time_wait_next == NULL);
75 		ASSERT(tcp->tcp_time_wait_prev == NULL);
76 		if (locked)
77 			mutex_exit(&tsp->tcp_time_wait_lock);
78 		return (B_FALSE);
79 	}
80 	ASSERT(TCP_IS_DETACHED(tcp));
81 	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
82 	ASSERT(tsp->tcp_time_wait_cnt > 0);
83 
84 	if (tcp->tcp_time_wait_next != NULL) {
85 		tcp->tcp_time_wait_next->tcp_time_wait_prev =
86 		    tcp->tcp_time_wait_prev;
87 	}
88 	if (tcp->tcp_time_wait_prev != NULL) {
89 		tcp->tcp_time_wait_prev->tcp_time_wait_next =
90 		    tcp->tcp_time_wait_next;
91 	} else {
92 		unsigned int bucket;
93 
94 		bucket = TW_BUCKET(tcp->tcp_time_wait_expire);
95 		ASSERT(tsp->tcp_time_wait_bucket[bucket] == tcp);
96 		tsp->tcp_time_wait_bucket[bucket] = tcp->tcp_time_wait_next;
97 	}
98 	tcp->tcp_time_wait_next = NULL;
99 	tcp->tcp_time_wait_prev = NULL;
100 	tcp->tcp_time_wait_expire = 0;
101 	tsp->tcp_time_wait_cnt--;
102 
103 	if (locked)
104 		mutex_exit(&tsp->tcp_time_wait_lock);
105 	return (B_TRUE);
106 }
107 
/*
 * Constants used for fast checking of a localhost address.
 *
 * The two variants hold the same byte sequence (0x7f, 0x00, 0x00 followed by
 * a masked-out byte) laid out for each host endianness, so the comparison can
 * be made directly against the stored address without byte swapping.
 * NOTE(review): this presumes conn_laddr_v4 is kept in network byte order --
 * confirm against the conn_t definition.
 *
 * Because the mask covers three octets, the check matches 127.0.0.x only,
 * not the whole of 127.0.0.0/8.
 */
#if defined(_BIG_ENDIAN)
#define	IPv4_LOCALHOST	0x7f000000U
#define	IPv4_LH_MASK	0xffffff00U
#else
#define	IPv4_LOCALHOST	0x0000007fU
#define	IPv4_LH_MASK	0x00ffffffU
#endif

/*
 * Evaluates to non-zero when the local address of tcp_t 'x' is an IPv4
 * address matching the masked localhost pattern above, or the IPv6 loopback
 * address.  'x' is evaluated more than once, so it must be free of side
 * effects.  No use of this macro is visible in this file;
 * tcp_time_wait_append() tests tcp->tcp_loopback instead.
 */
#define	IS_LOCAL_HOST(x)	( \
	((x)->tcp_connp->conn_ipversion == IPV4_VERSION && \
	((x)->tcp_connp->conn_laddr_v4 & IPv4_LH_MASK) == IPv4_LOCALHOST) || \
	((x)->tcp_connp->conn_ipversion == IPV6_VERSION && \
	IN6_IS_ADDR_LOOPBACK(&(x)->tcp_connp->conn_laddr_v6)))
122 
123 
124 /*
125  * Add a connection to the list of detached TIME_WAIT connections
126  * and set its time to expire.
127  */
128 void
129 tcp_time_wait_append(tcp_t *tcp)
130 {
131 	tcp_stack_t	*tcps = tcp->tcp_tcps;
132 	squeue_t	*sqp = tcp->tcp_connp->conn_sqp;
133 	tcp_squeue_priv_t *tsp =
134 	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
135 	int64_t		now, schedule;
136 	unsigned int	bucket;
137 
138 	tcp_timers_stop(tcp);
139 
140 	/* Freed above */
141 	ASSERT(tcp->tcp_timer_tid == 0);
142 	ASSERT(tcp->tcp_ack_tid == 0);
143 
144 	/* must have happened at the time of detaching the tcp */
145 	ASSERT(TCP_IS_DETACHED(tcp));
146 	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
147 	ASSERT(tcp->tcp_ptpahn == NULL);
148 	ASSERT(tcp->tcp_flow_stopped == 0);
149 	ASSERT(tcp->tcp_time_wait_next == NULL);
150 	ASSERT(tcp->tcp_time_wait_prev == NULL);
151 	ASSERT(tcp->tcp_time_wait_expire == 0);
152 	ASSERT(tcp->tcp_listener == NULL);
153 
154 	TCP_DBGSTAT(tcps, tcp_time_wait);
155 	mutex_enter(&tsp->tcp_time_wait_lock);
156 
157 	/*
158 	 * Immediately expire loopback connections.  Since there is no worry
159 	 * about packets on the local host showing up after a long network
160 	 * delay, this is safe and allows much higher rates of connection churn
161 	 * for applications operating locally.
162 	 *
163 	 * This typically bypasses the tcp_free_list fast path due to squeue
164 	 * re-entry for the loopback close operation.
165 	 */
166 	if (tcp->tcp_loopback) {
167 		tcp_time_wait_purge(tcp, tsp);
168 		mutex_exit(&tsp->tcp_time_wait_lock);
169 		return;
170 	}
171 
172 	/*
173 	 * In order to reap TIME_WAITs reliably, we should use a source of time
174 	 * that is not adjustable by the user.  While it would be more accurate
175 	 * to grab this timestamp before (potentially) sleeping on the
176 	 * tcp_time_wait_lock, doing so complicates bucket addressing later.
177 	 */
178 	now = ddi_get_lbolt64();
179 
180 	/*
181 	 * Each squeue uses an arbitrary time offset when scheduling
182 	 * expiration timers.  This prevents the bucketing from forcing
183 	 * tcp_time_wait_collector to run in locksetup across squeues.
184 	 *
185 	 * This offset is (re)initialized when a new TIME_WAIT connection is
186 	 * added to an squeue which has no connections waiting to expire.
187 	 */
188 	if (tsp->tcp_time_wait_tid == 0) {
189 		ASSERT(tsp->tcp_time_wait_cnt == 0);
190 		tsp->tcp_time_wait_offset =
191 		    now % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
192 	}
193 	now -= tsp->tcp_time_wait_offset;
194 
195 	/*
196 	 * Use the netstack-defined timeout, rounded up to the minimum
197 	 * time_wait_collector interval.
198 	 */
199 	schedule = now + MSEC_TO_TICK(tcps->tcps_time_wait_interval);
200 	tcp->tcp_time_wait_expire = schedule;
201 
202 	/*
203 	 * Append the connection into the appropriate bucket.
204 	 */
205 	bucket = TW_BUCKET(tcp->tcp_time_wait_expire);
206 	tcp->tcp_time_wait_next = tsp->tcp_time_wait_bucket[bucket];
207 	tsp->tcp_time_wait_bucket[bucket] = tcp;
208 	if (tcp->tcp_time_wait_next != NULL) {
209 		ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == NULL);
210 		tcp->tcp_time_wait_next->tcp_time_wait_prev = tcp;
211 	}
212 	tsp->tcp_time_wait_cnt++;
213 
214 	/*
215 	 * Round delay up to the nearest bucket boundary.
216 	 */
217 	schedule += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
218 	schedule -= schedule % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
219 
220 	/*
221 	 * The newly inserted entry may require a tighter schedule for the
222 	 * expiration timer.
223 	 */
224 	if (schedule < tsp->tcp_time_wait_schedule) {
225 		callout_id_t old_tid = tsp->tcp_time_wait_tid;
226 
227 		tsp->tcp_time_wait_schedule = schedule;
228 		tsp->tcp_time_wait_tid =
229 		    timeout_generic(CALLOUT_NORMAL,
230 		    tcp_time_wait_collector, sqp,
231 		    TICK_TO_NSEC(schedule - now),
232 		    CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
233 
234 		/*
235 		 * It is possible for the timer to fire before the untimeout
236 		 * action is able to complete.  In that case, the exclusion
237 		 * offered by the tcp_time_wait_collector_active flag will
238 		 * prevent multiple collector threads from processing records
239 		 * simultaneously from the same squeue.
240 		 */
241 		mutex_exit(&tsp->tcp_time_wait_lock);
242 		(void) untimeout_default(old_tid, 0);
243 		return;
244 	}
245 
246 	/*
247 	 * Start a fresh timer if none exists.
248 	 */
249 	if (tsp->tcp_time_wait_schedule == 0) {
250 		ASSERT(tsp->tcp_time_wait_tid == 0);
251 
252 		tsp->tcp_time_wait_schedule = schedule;
253 		tsp->tcp_time_wait_tid =
254 		    timeout_generic(CALLOUT_NORMAL,
255 		    tcp_time_wait_collector, sqp,
256 		    TICK_TO_NSEC(schedule - now),
257 		    CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
258 	}
259 	mutex_exit(&tsp->tcp_time_wait_lock);
260 }
261 
262 /*
263  * Wrapper to call tcp_close_detached() via squeue to clean up TIME-WAIT
264  * tcp_t.  Used in tcp_time_wait_collector().
265  */
266 /* ARGSUSED */
267 static void
268 tcp_timewait_close(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
269 {
270 	conn_t	*connp = (conn_t *)arg;
271 	tcp_t	*tcp = connp->conn_tcp;
272 
273 	ASSERT(tcp != NULL);
274 	if (tcp->tcp_state == TCPS_CLOSED) {
275 		return;
276 	}
277 
278 	ASSERT((connp->conn_family == AF_INET &&
279 	    connp->conn_ipversion == IPV4_VERSION) ||
280 	    (connp->conn_family == AF_INET6 &&
281 	    (connp->conn_ipversion == IPV4_VERSION ||
282 	    connp->conn_ipversion == IPV6_VERSION)));
283 	ASSERT(!tcp->tcp_listener);
284 
285 	ASSERT(TCP_IS_DETACHED(tcp));
286 
287 	/*
288 	 * Because they have no upstream client to rebind or tcp_close()
289 	 * them later, we axe the connection here and now.
290 	 */
291 	tcp_close_detached(tcp);
292 }
293 
294 
295 static void
296 tcp_time_wait_purge(tcp_t *tcp, tcp_squeue_priv_t *tsp)
297 {
298 	mblk_t *mp;
299 	conn_t *connp = tcp->tcp_connp;
300 	kmutex_t *lock;
301 
302 	ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock));
303 	ASSERT(connp->conn_fanout != NULL);
304 
305 	lock = &connp->conn_fanout->connf_lock;
306 
307 	/*
308 	 * This is essentially a TIME_WAIT reclaim fast path optimization for
309 	 * performance where the connection is checked under the fanout lock
310 	 * (so that no one else can get access to the conn_t) that the refcnt
311 	 * is 2 (one each for TCP and the classifier hash list).  That is the
312 	 * case and clustering callbacks are not enabled, the conn can be
313 	 * removed under the fanout lock and avoid clean-up under the squeue.
314 	 *
315 	 * This optimization is forgone when clustering is enabled since the
316 	 * clustering callback must be made before setting the CONDEMNED flag
317 	 * and after dropping all locks
318 	 *
319 	 * See the comments in tcp_closei_local for additional information
320 	 * regarding the refcnt logic.
321 	 */
322 	if (mutex_tryenter(lock)) {
323 		mutex_enter(&connp->conn_lock);
324 		if (connp->conn_ref == 2 && cl_inet_disconnect == NULL) {
325 			ipcl_hash_remove_locked(connp, connp->conn_fanout);
326 			/*
327 			 * Set the CONDEMNED flag now itself so that the refcnt
328 			 * cannot increase due to any walker.
329 			 */
330 			connp->conn_state_flags |= CONN_CONDEMNED;
331 			mutex_exit(&connp->conn_lock);
332 			mutex_exit(lock);
333 			if (tsp->tcp_free_list_cnt < tcp_free_list_max_cnt) {
334 				/*
335 				 * Add to head of tcp_free_list
336 				 */
337 				tcp_cleanup(tcp);
338 				ASSERT(connp->conn_latch == NULL);
339 				ASSERT(connp->conn_policy == NULL);
340 				ASSERT(tcp->tcp_tcps == NULL);
341 				ASSERT(connp->conn_netstack == NULL);
342 
343 				tcp->tcp_time_wait_next = tsp->tcp_free_list;
344 				tcp->tcp_in_free_list = B_TRUE;
345 				tsp->tcp_free_list = tcp;
346 				tsp->tcp_free_list_cnt++;
347 			} else {
348 				/*
349 				 * Do not add to tcp_free_list
350 				 */
351 				tcp_bind_hash_remove(tcp);
352 				ixa_cleanup(tcp->tcp_connp->conn_ixa);
353 				tcp_ipsec_cleanup(tcp);
354 				CONN_DEC_REF(tcp->tcp_connp);
355 			}
356 
357 			/*
358 			 * With the fast-path complete, we can bail.
359 			 */
360 			return;
361 		} else {
362 			/*
363 			 * Fall back to slow path.
364 			 */
365 			CONN_INC_REF_LOCKED(connp);
366 			mutex_exit(&connp->conn_lock);
367 			mutex_exit(lock);
368 		}
369 	} else {
370 		CONN_INC_REF(connp);
371 	}
372 
373 	/*
374 	 * We can reuse the closemp here since conn has detached (otherwise we
375 	 * wouldn't even be in time_wait list). It is safe to change
376 	 * tcp_closemp_used without taking a lock as no other thread can
377 	 * concurrently access it at this point in the connection lifecycle.
378 	 */
379 	if (tcp->tcp_closemp.b_prev == NULL) {
380 		tcp->tcp_closemp_used = B_TRUE;
381 	} else {
382 		cmn_err(CE_PANIC,
383 		    "tcp_timewait_collector: concurrent use of tcp_closemp: "
384 		    "connp %p tcp %p\n", (void *)connp, (void *)tcp);
385 	}
386 
387 	TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
388 	mp = &tcp->tcp_closemp;
389 	mutex_exit(&tsp->tcp_time_wait_lock);
390 	SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timewait_close, connp, NULL,
391 	    SQ_FILL, SQTAG_TCP_TIMEWAIT);
392 	mutex_enter(&tsp->tcp_time_wait_lock);
393 }
394 
395 /*
396  * Purge any tcp_t instances associated with this squeue which have expired
397  * from the TIME_WAIT state.
398  */
399 void
400 tcp_time_wait_collector(void *arg)
401 {
402 	tcp_t *tcp;
403 	int64_t now, active_schedule, new_schedule;
404 	unsigned int idx;
405 
406 	squeue_t *sqp = (squeue_t *)arg;
407 	tcp_squeue_priv_t *tsp =
408 	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
409 
410 	mutex_enter(&tsp->tcp_time_wait_lock);
411 
412 	/*
413 	 * Because of timer scheduling complexity and the fact that the
414 	 * tcp_time_wait_lock is dropped during tcp_time_wait_purge, it is
415 	 * possible for multiple tcp_time_wait_collector threads to run against
416 	 * the same squeue.  This flag is used to exclude other collectors from
417 	 * the squeue during execution.
418 	 */
419 	if (tsp->tcp_time_wait_collector_active) {
420 		mutex_exit(&tsp->tcp_time_wait_lock);
421 		return;
422 	}
423 	tsp->tcp_time_wait_collector_active = B_TRUE;
424 
425 	/*
426 	 * Purge the free list if necessary
427 	 */
428 	if (tsp->tcp_free_list != NULL) {
429 		TCP_G_STAT(tcp_freelist_cleanup);
430 		while ((tcp = tsp->tcp_free_list) != NULL) {
431 			tsp->tcp_free_list = tcp->tcp_time_wait_next;
432 			tcp->tcp_time_wait_next = NULL;
433 			tsp->tcp_free_list_cnt--;
434 			ASSERT(tcp->tcp_tcps == NULL);
435 			CONN_DEC_REF(tcp->tcp_connp);
436 		}
437 		ASSERT(tsp->tcp_free_list_cnt == 0);
438 	}
439 
440 	/*
441 	 * If there are no connections pending, clear timer-related state to be
442 	 * reinitialized by the next caller.
443 	 */
444 	if (tsp->tcp_time_wait_cnt == 0) {
445 		tsp->tcp_time_wait_offset = 0;
446 		tsp->tcp_time_wait_schedule = 0;
447 		tsp->tcp_time_wait_tid = 0;
448 		tsp->tcp_time_wait_collector_active = B_FALSE;
449 		mutex_exit(&tsp->tcp_time_wait_lock);
450 		return;
451 	}
452 
453 	/*
454 	 * Grab the bucket which we were scheduled to cleanse.
455 	 */
456 	active_schedule = tsp->tcp_time_wait_schedule;
457 	idx = TW_BUCKET(active_schedule - 1);
458 	now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset;
459 retry:
460 	tcp = tsp->tcp_time_wait_bucket[idx];
461 
462 	while (tcp != NULL) {
463 		/*
464 		 * Since the bucket count is sized to prevent wrap-around
465 		 * during typical operation and timers are schedule to process
466 		 * buckets with only expired connections, there is only one
467 		 * reason to encounter a connection expiring in the future:
468 		 * The tcp_time_wait_collector thread has been so delayed in
469 		 * its processing that connections have wrapped around the
470 		 * timing wheel into this bucket.
471 		 *
472 		 * In that case, the remaining entires in the bucket can be
473 		 * ignored since, being appended sequentially, they should all
474 		 * expire in the future.
475 		 */
476 		if (now < tcp->tcp_time_wait_expire) {
477 			break;
478 		}
479 
480 		/*
481 		 * Pull the connection out of the bucket.
482 		 */
483 		VERIFY(tcp_time_wait_remove(tcp, tsp));
484 
485 		/*
486 		 * Purge the connection.
487 		 *
488 		 * While tcp_time_wait_lock will be temporarily dropped as part
489 		 * of the process, there is no risk of the timer being
490 		 * (re)scheduled while the collector is running since a value
491 		 * corresponding to the past is left in tcp_time_wait_schedule.
492 		 */
493 		tcp_time_wait_purge(tcp, tsp);
494 
495 		/*
496 		 * Because tcp_time_wait_remove clears the tcp_time_wait_next
497 		 * field, the next item must be grabbed directly from the
498 		 * bucket itself.
499 		 */
500 		tcp = tsp->tcp_time_wait_bucket[idx];
501 	}
502 
503 	if (tsp->tcp_time_wait_cnt == 0) {
504 		/*
505 		 * There is not a need for the collector to schedule a new
506 		 * timer if no pending items remain.  The timer state can be
507 		 * cleared only if it was untouched while the collector dropped
508 		 * its locks during tcp_time_wait_purge.
509 		 */
510 		if (tsp->tcp_time_wait_schedule == active_schedule) {
511 			tsp->tcp_time_wait_offset = 0;
512 			tsp->tcp_time_wait_schedule = 0;
513 			tsp->tcp_time_wait_tid = 0;
514 		}
515 		tsp->tcp_time_wait_collector_active = B_FALSE;
516 		mutex_exit(&tsp->tcp_time_wait_lock);
517 		return;
518 	} else {
519 		unsigned int nidx;
520 
521 		/*
522 		 * Locate the next bucket containing entries.
523 		 */
524 		new_schedule = active_schedule
525 		    + MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
526 		nidx = TW_BUCKET_NEXT(idx);
527 		while (tsp->tcp_time_wait_bucket[nidx] == NULL) {
528 			if (nidx == idx) {
529 				break;
530 			}
531 			nidx = TW_BUCKET_NEXT(nidx);
532 			new_schedule += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
533 		}
534 		ASSERT(tsp->tcp_time_wait_bucket[nidx] != NULL);
535 	}
536 
537 	/*
538 	 * It is possible that the system is under such dire load that between
539 	 * the timer scheduling and TIME_WAIT processing delay, execution
540 	 * overran the interval allocated to this bucket.
541 	 */
542 	now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset;
543 	if (new_schedule <= now) {
544 		/*
545 		 * Attempt to right the situation by immediately performing a
546 		 * purge on the next bucket.  This loop will continue as needed
547 		 * until the schedule can be pushed out ahead of the clock.
548 		 */
549 		idx = TW_BUCKET(new_schedule - 1);
550 		goto retry;
551 	}
552 
553 	/*
554 	 * Another thread may have snuck in to reschedule the timer while locks
555 	 * were dropped during tcp_time_wait_purge.  Defer to the running timer
556 	 * if that is the case.
557 	 */
558 	if (tsp->tcp_time_wait_schedule != active_schedule) {
559 		tsp->tcp_time_wait_collector_active = B_FALSE;
560 		mutex_exit(&tsp->tcp_time_wait_lock);
561 		return;
562 	}
563 
564 	/*
565 	 * Schedule the next timer.
566 	 */
567 	tsp->tcp_time_wait_schedule = new_schedule;
568 	tsp->tcp_time_wait_tid =
569 	    timeout_generic(CALLOUT_NORMAL,
570 	    tcp_time_wait_collector, sqp,
571 	    TICK_TO_NSEC(new_schedule - now),
572 	    CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
573 	tsp->tcp_time_wait_collector_active = B_FALSE;
574 	mutex_exit(&tsp->tcp_time_wait_lock);
575 }
576 
/*
 * tcp_time_wait_processing() handles processing of incoming packets when
 * the tcp_t is in the TIME_WAIT state.
 *
 * A TIME_WAIT tcp_t that has an associated open TCP end point (not in
 * detached state) is never put on the time wait list.
 *
 * Arguments:
 *	tcp	the TIME_WAIT connection the segment was classified to
 *	mp	the received message; freed at 'done' on every path except
 *		the one which hands it to tcp_reinput()
 *	seg_seq	segment sequence number (compared against tcp_rnxt)
 *	seg_ack	segment acknowledgment number (compared against tcp_suna)
 *	seg_len	segment length; trimmed locally to the acceptable portion
 *	tcpha	TCP header of the segment (flags, window, options)
 *	ira	receive attributes, passed through when a SYN is re-input
 *
 * Outline: validate timestamps (PAWS) when negotiated, trim data lying before
 * tcp_rnxt or beyond the receive window, restart the TIME_WAIT timer on a
 * duplicate FIN, possibly recycle the connection for an acceptable new SYN,
 * tear the connection down on an in-window RST, and otherwise answer with an
 * ACK (or a RST for an unacceptable SYN) as needed.
 */
void
tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq,
    uint32_t seg_ack, int seg_len, tcpha_t *tcpha, ip_recv_attr_t *ira)
{
	int32_t		bytes_acked;
	int32_t		gap;
	int32_t		rgap;
	tcp_opt_t	tcpopt;
	uint_t		flags;
	uint32_t	new_swnd = 0;
	conn_t		*nconnp;
	conn_t		*connp = tcp->tcp_connp;
	tcp_stack_t	*tcps = tcp->tcp_tcps;

	BUMP_LOCAL(tcp->tcp_ibsegs);
	DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);

	/*
	 * 'flags' starts as the low 8 bits of the header flags; TH_ACK_NEEDED
	 * is OR'ed in below as a local marker that an ACK must be sent.
	 */
	flags = (unsigned int)tcpha->tha_flags & 0xFF;
	/* The advertised window is left unscaled on a SYN segment. */
	new_swnd = ntohs(tcpha->tha_win) <<
	    ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws);

	/*
	 * With timestamps negotiated, a non-RST segment must carry a
	 * timestamp option and pass the PAWS check.  tcpopt is only filled in
	 * on this path; the tcp_ts_recent update further down is guarded by
	 * the same tcp_snd_ts_ok / not-RST condition.
	 */
	if (tcp->tcp_snd_ts_ok && !(tcpha->tha_flags & TH_RST)) {
		int options;
		if (tcp->tcp_snd_sack_ok)
			tcpopt.tcp = tcp;
		else
			tcpopt.tcp = NULL;
		options = tcp_parse_options(tcpha, &tcpopt);
		if (!(options & TCP_OPT_TSTAMP_PRESENT)) {
			DTRACE_TCP1(droppedtimestamp, tcp_t *, tcp);
			goto done;
		} else if (!tcp_paws_check(tcp, &tcpopt)) {
			tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt,
			    TH_ACK);
			goto done;
		}
	}
	/*
	 * gap < 0: the segment starts before tcp_rnxt (duplicate data).
	 * rgap < 0: the segment extends beyond the receive window.
	 */
	gap = seg_seq - tcp->tcp_rnxt;
	rgap = tcp->tcp_rwnd - (gap + seg_len);
	if (gap < 0) {
		TCPS_BUMP_MIB(tcps, tcpInDataDupSegs);
		TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes,
		    (seg_len > -gap ? -gap : seg_len));
		/* Discount the portion lying before tcp_rnxt. */
		seg_len += gap;
		if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) {
			if (flags & TH_RST) {
				goto done;
			}
			if ((flags & TH_FIN) && seg_len == -1) {
				/*
				 * When TCP receives a duplicate FIN in
				 * TIME_WAIT state, restart the 2 MSL timer.
				 * See page 73 in RFC 793. Make sure this TCP
				 * is already on the TIME_WAIT list. If not,
				 * just restart the timer.
				 */
				if (TCP_IS_DETACHED(tcp)) {
					if (tcp_time_wait_remove(tcp, NULL) ==
					    B_TRUE) {
						tcp_time_wait_append(tcp);
						TCP_DBGSTAT(tcps,
						    tcp_rput_time_wait);
					}
				} else {
					ASSERT(tcp != NULL);
					TCP_TIMER_RESTART(tcp,
					    tcps->tcps_time_wait_interval);
				}
				tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
				    tcp->tcp_rnxt, TH_ACK);
				goto done;
			}
			/* Entirely duplicate segment: just ACK it. */
			flags |=  TH_ACK_NEEDED;
			seg_len = 0;
			goto process_ack;
		}

		/* Fix seg_seq, and chew the gap off the front. */
		seg_seq = tcp->tcp_rnxt;
	}

	/*
	 * A SYN which starts beyond tcp_rnxt (gap > 0) and does not fit in
	 * the receive window (rgap < 0) is treated as a new connection
	 * attempt reusing this connection's addresses and ports.
	 */
	if ((flags & TH_SYN) && gap > 0 && rgap < 0) {
		/*
		 * Make sure that when we accept the connection, pick
		 * an ISS greater than (tcp_snxt + tcp_iss_incr/2) for the
		 * old connection.
		 *
		 * The next ISS generated is equal to tcp_iss_incr_extra
		 * + tcp_iss_incr/2 + other components depending on the
		 * value of tcp_strong_iss.  We pre-calculate the new
		 * ISS here and compare with tcp_snxt to determine if
		 * we need to make adjustment to tcp_iss_incr_extra.
		 *
		 * The above calculation is ugly and is a
		 * waste of CPU cycles...
		 */
		uint32_t new_iss = tcps->tcps_iss_incr_extra;
		int32_t adj;
		ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;

		switch (tcps->tcps_strong_iss) {
		case 2: {
			/* Add time and MD5 components. */
			uint32_t answer[4];
			struct {
				uint32_t ports;
				in6_addr_t src;
				in6_addr_t dst;
			} arg;
			MD5_CTX context;

			/* Work on a private copy of the keyed MD5 context. */
			mutex_enter(&tcps->tcps_iss_key_lock);
			context = tcps->tcps_iss_key;
			mutex_exit(&tcps->tcps_iss_key_lock);
			arg.ports = connp->conn_ports;
			/* We use MAPPED addresses in tcp_iss_init */
			arg.src = connp->conn_laddr_v6;
			arg.dst = connp->conn_faddr_v6;
			MD5Update(&context, (uchar_t *)&arg,
			    sizeof (arg));
			MD5Final((uchar_t *)answer, &context);
			/* Fold the digest words into a single 32-bit value. */
			answer[0] ^= answer[1] ^ answer[2] ^ answer[3];
			new_iss += (gethrtime() >> ISS_NSEC_SHT) + answer[0];
			break;
		}
		case 1:
			/* Add time component and min random (i.e. 1). */
			new_iss += (gethrtime() >> ISS_NSEC_SHT) + 1;
			break;
		default:
			/* Add only time component. */
			new_iss += (uint32_t)gethrestime_sec() *
			    tcps->tcps_iss_incr;
			break;
		}
		if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) {
			/*
			 * New ISS not guaranteed to be tcp_iss_incr/2
			 * ahead of the current tcp_snxt, so add the
			 * difference to tcp_iss_incr_extra.
			 */
			tcps->tcps_iss_incr_extra += adj;
		}
		/*
		 * If tcp_clean_death() can not perform the task now,
		 * drop the SYN packet and let the other side re-xmit.
		 * Otherwise pass the SYN packet back in, since the
		 * old tcp state has been cleaned up or freed.
		 */
		if (tcp_clean_death(tcp, 0) == -1)
			goto done;
		/*
		 * 'tcp' must not be touched past this point.  Only tcps and
		 * ipst, which were captured before the clean-up, are used.
		 */
		nconnp = ipcl_classify(mp, ira, ipst);
		if (nconnp != NULL) {
			TCP_STAT(tcps, tcp_time_wait_syn_success);
			/* Drops ref on nconnp */
			tcp_reinput(nconnp, mp, ira, ipst);
			/* mp was handed off above and is not freed here. */
			return;
		}
		goto done;
	}

	/*
	 * rgap is the amount of stuff received out of window.  A negative
	 * value is the amount out of window.
	 */
	if (rgap < 0) {
		TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs);
		TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap);
		/* Fix seg_len and make sure there is something left. */
		seg_len += rgap;
		if (seg_len <= 0) {
			if (flags & TH_RST) {
				goto done;
			}
			flags |=  TH_ACK_NEEDED;
			seg_len = 0;
			goto process_ack;
		}
	}
	/*
	 * Check whether we can update tcp_ts_recent. This test is from RFC
	 * 7323, section 5.3.
	 */
	if (tcp->tcp_snd_ts_ok && !(flags & TH_RST) &&
	    TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) &&
	    SEQ_LEQ(seg_seq, tcp->tcp_rack)) {
		tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
		tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64();
	}

	if (seg_seq != tcp->tcp_rnxt && seg_len > 0) {
		/* Always ack out of order packets */
		flags |= TH_ACK_NEEDED;
		seg_len = 0;
	} else if (seg_len > 0) {
		/* In-order data for a closed end point: counted only. */
		TCPS_BUMP_MIB(tcps, tcpInClosed);
		TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
		TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len);
	}
	/* An acceptable RST terminates the TIME_WAIT connection. */
	if (flags & TH_RST) {
		(void) tcp_clean_death(tcp, 0);
		goto done;
	}
	/* A SYN which did not qualify for recycling above is reset. */
	if (flags & TH_SYN) {
		tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1,
		    TH_RST|TH_ACK);
		/*
		 * Do not delete the TCP structure if it is in
		 * TIME_WAIT state.  Refer to RFC 1122, 4.2.2.13.
		 */
		goto done;
	}
process_ack:
	if (flags & TH_ACK) {
		/* Signed distance of the ACK from the oldest unacked byte. */
		bytes_acked = (int)(seg_ack - tcp->tcp_suna);
		if (bytes_acked <= 0) {
			if (bytes_acked == 0 && seg_len == 0 &&
			    new_swnd == tcp->tcp_swnd)
				TCPS_BUMP_MIB(tcps, tcpInDupAck);
		} else {
			/* Acks something not sent */
			flags |= TH_ACK_NEEDED;
		}
	}
	if (flags & TH_ACK_NEEDED) {
		/*
		 * Time to send an ack for some reason.
		 */
		tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
		    tcp->tcp_rnxt, TH_ACK);
	}
done:
	/* Common exit: the received message is released here. */
	freemsg(mp);
}
818