xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_ndp.c (revision e0731422366620894c16c1ee6515551c5f00733d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/types.h>
26 #include <sys/stream.h>
27 #include <sys/stropts.h>
28 #include <sys/strsun.h>
29 #include <sys/sysmacros.h>
30 #include <sys/errno.h>
31 #include <sys/dlpi.h>
32 #include <sys/socket.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/cmn_err.h>
36 #include <sys/debug.h>
37 #include <sys/vtrace.h>
38 #include <sys/kmem.h>
39 #include <sys/zone.h>
40 #include <sys/ethernet.h>
41 #include <sys/sdt.h>
42 #include <sys/mac.h>
43 
44 #include <net/if.h>
45 #include <net/if_types.h>
46 #include <net/if_dl.h>
47 #include <net/route.h>
48 #include <netinet/in.h>
49 #include <netinet/ip6.h>
50 #include <netinet/icmp6.h>
51 
52 #include <inet/common.h>
53 #include <inet/mi.h>
54 #include <inet/mib2.h>
55 #include <inet/nd.h>
56 #include <inet/ip.h>
57 #include <inet/ip_impl.h>
58 #include <inet/ipclassifier.h>
59 #include <inet/ip_if.h>
60 #include <inet/ip_ire.h>
61 #include <inet/ip_rts.h>
62 #include <inet/ip6.h>
63 #include <inet/ip_ndp.h>
64 #include <inet/sctp_ip.h>
65 #include <inet/ip_arp.h>
66 #include <inet/ip2mac_impl.h>
67 
/*
 * Announcement interval tunable for an address family: the NDP unsolicited
 * interval when isv6 is true, otherwise the ARP publish interval.
 * NOTE: expands to a reference to a variable named "ipst", which must be in
 * scope at the point of use.
 */
#define	ANNOUNCE_INTERVAL(isv6) \
	((isv6) ? ipst->ips_ip_ndp_unsolicit_interval : \
	ipst->ips_ip_arp_publish_interval)

/*
 * Address-defense interval tunable for an address family: the NDP value
 * when isv6 is true, otherwise the ARP value.  Like ANNOUNCE_INTERVAL, this
 * relies on a variable named "ipst" being in scope.
 */
#define	DEFENSE_INTERVAL(isv6) \
	((isv6) ? ipst->ips_ndp_defend_interval : \
	ipst->ips_arp_defend_interval)

/* Non-tunable probe interval, based on link capabilities */
#define	ILL_PROBE_INTERVAL(ill)	((ill)->ill_note_link ? 150 : 1500)

/*
 * The IPv4 Link Local address space is special; we do extra duplicate checking
 * there, as the entire assignment mechanism rests on random numbers.
 *
 * ptr is parenthesized before the cast so that pointer expressions such as
 * "p + 1" are evaluated in their own type before being viewed as bytes.
 */
#define	IS_IPV4_LL_SPACE(ptr)	(((uchar_t *)(ptr))[0] == 169 && \
				((uchar_t *)(ptr))[1] == 254)

/*
 * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed
 * in to the ncec*add* functions.
 *
 * NCE_F_AUTHORITY means that we ignore any incoming adverts for that
 * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means
 * that we will respond to requests for the protocol address.
 */
#define	NCE_EXTERNAL_FLAGS_MASK \
	(NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \
	NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \
	NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC)
98 
99 /*
100  * Lock ordering:
101  *
102  *	ndp_g_lock -> ill_lock -> ncec_lock
103  *
104  * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
105  * ncec_next.  ncec_lock protects the contents of the NCE (particularly
106  * ncec_refcnt).
107  */
108 
/*
 * Forward declarations.  Everything here is file-local (static) except
 * nce_lookup(), which is declared with external linkage.
 */
static	void	nce_cleanup_list(ncec_t *ncec);
static	void 	nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
static	ncec_t	*ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
    ncec_t *);
static	nce_t	*nce_lookup_addr(ill_t *, const in6_addr_t *);
static	int	nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
    uint16_t ncec_flags, nce_t **newnce);
static	int	nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
    uint16_t ncec_flags, nce_t **newnce);
static	boolean_t	ndp_xmit(ill_t *ill, uint32_t operation,
    uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
    const in6_addr_t *target, int flag);
static void	ncec_refhold_locked(ncec_t *);
static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
static	void	nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
static	int	nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
    uint16_t, uint16_t, nce_t **);
static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
static nce_t *nce_add(ill_t *, ncec_t *);
static void nce_inactive(nce_t *);
extern nce_t 	*nce_lookup(ill_t *, const in6_addr_t *);
static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
static int	nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
    uint16_t, uint16_t, nce_t **);
static int	nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
    uint16_t, uint16_t, nce_t **);
static int  nce_add_v6_postprocess(nce_t *);
static int  nce_add_v4_postprocess(nce_t *);
static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
static clock_t nce_fuzz_interval(clock_t, boolean_t);
static void nce_resolv_ipmp_ok(ncec_t *);
static void nce_walk_common(ill_t *, pfi_t, void *);
static void nce_start_timer(ncec_t *, uint_t);
static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
static void nce_fastpath_trigger(nce_t *);
static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);

#ifdef DEBUG
static void	ncec_trace_cleanup(const ncec_t *);
#endif

/*
 * Address of the hash bucket for an IPv4 address in the given stack's
 * IPv4 neighbor cache table (ips_ndp4).
 */
#define	NCE_HASH_PTR_V4(ipst, addr)					\
	(&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)]))

/*
 * Address of the hash bucket for an IPv6 address in the given stack's
 * IPv6 neighbor cache table (ips_ndp6).
 */
#define	NCE_HASH_PTR_V6(ipst, addr)				 \
	(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
		NCE_TABLE_SIZE)]))

/* kmem caches used for ncec_t and nce_t objects; defined outside this file */
extern kmem_cache_t *ncec_cache;
extern kmem_cache_t *nce_cache;
159 
160 /*
161  * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe
162  * If src_ill is not null, the ncec_addr is bound to src_ill. The
163  * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
164  * the probe is sent on the ncec_ill (in the non-IPMP case) or the
165  * IPMP cast_ill (in the IPMP case).
166  *
167  * Note that the probe interval is based on the src_ill for IPv6, and
168  * the ncec_xmit_interval for IPv4.
169  */
170 static void
171 nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
172 {
173 	boolean_t dropped;
174 	uint32_t probe_interval;
175 
176 	ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
177 	ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
178 	if (ncec->ncec_ipversion == IPV6_VERSION) {
179 		dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
180 		    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
181 		    &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
182 		probe_interval = ILL_PROBE_INTERVAL(src_ill);
183 	} else {
184 		/* IPv4 DAD delay the initial probe. */
185 		if (send_probe)
186 			dropped = arp_probe(ncec);
187 		else
188 			dropped = B_TRUE;
189 		probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
190 		    !send_probe);
191 	}
192 	if (!dropped) {
193 		mutex_enter(&ncec->ncec_lock);
194 		ncec->ncec_pcnt--;
195 		mutex_exit(&ncec->ncec_lock);
196 	}
197 	nce_restart_timer(ncec, probe_interval);
198 }
199 
200 /*
201  * Compute default flags to use for an advertisement of this ncec's address.
202  */
203 static int
204 nce_advert_flags(const ncec_t *ncec)
205 {
206 	int flag = 0;
207 
208 	if (ncec->ncec_flags & NCE_F_ISROUTER)
209 		flag |= NDP_ISROUTER;
210 	if (!(ncec->ncec_flags & NCE_F_ANYCAST))
211 		flag |= NDP_ORIDE;
212 
213 	return (flag);
214 }
215 
216 /*
217  * NDP Cache Entry creation routine.
218  * This routine must always be called with ndp6->ndp_g_lock held.
219  */
220 int
221 nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
222     const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
223 {
224 	int		err;
225 	nce_t		*nce;
226 
227 	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
228 	ASSERT(ill != NULL && ill->ill_isv6);
229 
230 	err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
231 	    &nce);
232 	if (err != 0)
233 		return (err);
234 	ASSERT(newnce != NULL);
235 	*newnce = nce;
236 	return (err);
237 }
238 
239 /*
240  * Post-processing routine to be executed after nce_add_v6(). This function
241  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
242  * and must be called without any locks held.
243  */
244 int
245 nce_add_v6_postprocess(nce_t *nce)
246 {
247 	ncec_t		*ncec = nce->nce_common;
248 	boolean_t	dropped = B_FALSE;
249 	uchar_t		*hw_addr = ncec->ncec_lladdr;
250 	uint_t		hw_addr_len = ncec->ncec_lladdr_length;
251 	ill_t		*ill = ncec->ncec_ill;
252 	int		err = 0;
253 	uint16_t	flags = ncec->ncec_flags;
254 	ip_stack_t	*ipst = ill->ill_ipst;
255 	boolean_t	trigger_fastpath = B_TRUE;
256 
257 	/*
258 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
259 	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
260 	 * We call nce_fastpath from nce_update if the link layer address of
261 	 * the peer changes from nce_update
262 	 */
263 	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
264 	    (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
265 		trigger_fastpath = B_FALSE;
266 
267 	if (trigger_fastpath)
268 		nce_fastpath_trigger(nce);
269 	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
270 		ill_t *hwaddr_ill;
271 		/*
272 		 * Unicast entry that needs DAD.
273 		 */
274 		if (IS_IPMP(ill)) {
275 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
276 			    hw_addr, hw_addr_len);
277 		} else {
278 			hwaddr_ill = ill;
279 		}
280 		nce_dad(ncec, hwaddr_ill, B_TRUE);
281 		err = EINPROGRESS;
282 	} else if (flags & NCE_F_UNSOL_ADV) {
283 		/*
284 		 * We account for the transmit below by assigning one
285 		 * less than the ndd variable. Subsequent decrements
286 		 * are done in nce_timer.
287 		 */
288 		mutex_enter(&ncec->ncec_lock);
289 		ncec->ncec_unsolicit_count =
290 		    ipst->ips_ip_ndp_unsolicit_count - 1;
291 		mutex_exit(&ncec->ncec_lock);
292 		dropped = ndp_xmit(ill,
293 		    ND_NEIGHBOR_ADVERT,
294 		    hw_addr,
295 		    hw_addr_len,
296 		    &ncec->ncec_addr,	/* Source and target of the adv */
297 		    &ipv6_all_hosts_mcast, /* Destination of the packet */
298 		    nce_advert_flags(ncec));
299 		mutex_enter(&ncec->ncec_lock);
300 		if (dropped)
301 			ncec->ncec_unsolicit_count++;
302 		else
303 			ncec->ncec_last_time_defended = ddi_get_lbolt();
304 		if (ncec->ncec_unsolicit_count != 0) {
305 			nce_start_timer(ncec,
306 			    ipst->ips_ip_ndp_unsolicit_interval);
307 		}
308 		mutex_exit(&ncec->ncec_lock);
309 	}
310 	return (err);
311 }
312 
/*
 * Atomically lookup and add (if needed) Neighbor Cache information for
 * an address.
 *
 * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec)) addresses
 * are always added pointing at the ipmp_ill. Thus, when the ill passed
 * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
 * entries will be created, both pointing at the same ncec_t. The nce_t
 * entries will have their nce_ill set to the ipmp_ill and the under_ill
 * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
 * Local addresses are always created on the ill passed to nce_add_v6.
 *
 * Return values produced here:
 *	0	a new entry was added and post-processed
 *	EEXIST	an nce for addr was already present on the ill
 *	ENXIO	the IPMP ill could not be held, or the under_ill is no longer
 *		in the same group as the IPMP ill
 *	EINVAL	nce_fastpath_create() failed for the under_ill
 * Any other value is whatever nce_add_v6(), nce_add_v6_postprocess() (e.g.
 * EINPROGRESS) or nce_set_multicast_v6() returned.
 *
 * When an nce is available at the end (including the EEXIST case) it is
 * handed to the caller through *newnce if newnce is non-NULL; otherwise the
 * reference is released here.
 */
int
nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
    const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
{
	int		err = 0;
	ip_stack_t	*ipst = ill->ill_ipst;
	nce_t		*nce, *upper_nce = NULL;
	ill_t		*in_ill = ill;	/* the ill the caller passed in */
	boolean_t	need_ill_refrele = B_FALSE;

	if (flags & NCE_F_MCAST) {
		/*
		 * hw_addr will be figured out in nce_set_multicast_v6;
		 * caller has to select the cast_ill
		 */
		ASSERT(hw_addr == NULL);
		ASSERT(!IS_IPMP(ill));
		err = nce_set_multicast_v6(ill, addr, flags, newnce);
		return (err);
	}
	ASSERT(ill->ill_isv6);
	/*
	 * Non-local addresses seen on an under_ill are recorded against the
	 * IPMP ill; from here on "ill" is the (held) IPMP ill in that case.
	 */
	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
		ill = ipmp_ill_hold_ipmp_ill(ill);
		if (ill == NULL)
			return (ENXIO);
		need_ill_refrele = B_TRUE;
	}

	/* Lookup and add happen under ndp_g_lock so they are atomic. */
	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
	nce = nce_lookup_addr(ill, addr);
	if (nce == NULL) {
		err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
		    &nce);
	} else {
		err = EEXIST;
	}
	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
	/* Post-processing must run without locks and only for new entries. */
	if (err == 0)
		err = nce_add_v6_postprocess(nce);
	if (in_ill != ill && nce != NULL) {
		nce_t *under_nce = NULL;

		/*
		 * in_ill was the under_ill. Try to create the under_nce.
		 * Hold the ill_g_lock to prevent changes to group membership
		 * until we are done.
		 */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
			DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
			    ill_t *, ill);
			rw_exit(&ipst->ips_ill_g_lock);
			err = ENXIO;
			nce_refrele(nce);
			nce = NULL;
			goto bail;
		}
		under_nce = nce_fastpath_create(in_ill, nce->nce_common);
		if (under_nce == NULL) {
			rw_exit(&ipst->ips_ill_g_lock);
			err = EINVAL;
			nce_refrele(nce);
			nce = NULL;
			goto bail;
		}
		rw_exit(&ipst->ips_ill_g_lock);
		/*
		 * Remember the IPMP-level nce so its reference is dropped at
		 * bail; the under_ill's nce is what the caller receives.
		 */
		upper_nce = nce;
		nce = under_nce; /* will be returned to caller */
		if (NCE_ISREACHABLE(nce->nce_common))
			nce_fastpath_trigger(under_nce);
	}
	/* nce_refrele is deferred until the lock is dropped  */
	if (nce != NULL) {
		if (newnce != NULL)
			*newnce = nce;
		else
			nce_refrele(nce);
	}
bail:
	if (upper_nce != NULL)
		nce_refrele(upper_nce);
	if (need_ill_refrele)
		ill_refrele(ill);
	return (err);
}
410 
411 /*
412  * Remove all the CONDEMNED nces from the appropriate hash table.
413  * We create a private list of NCEs, these may have ires pointing
414  * to them, so the list will be passed through to clean up dependent
415  * ires and only then we can do ncec_refrele() which can make NCE inactive.
416  */
417 static void
418 nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
419 {
420 	ncec_t *ncec1;
421 	ncec_t **ptpn;
422 
423 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
424 	ASSERT(ndp->ndp_g_walker == 0);
425 	for (; ncec; ncec = ncec1) {
426 		ncec1 = ncec->ncec_next;
427 		mutex_enter(&ncec->ncec_lock);
428 		if (NCE_ISCONDEMNED(ncec)) {
429 			ptpn = ncec->ncec_ptpn;
430 			ncec1 = ncec->ncec_next;
431 			if (ncec1 != NULL)
432 				ncec1->ncec_ptpn = ptpn;
433 			*ptpn = ncec1;
434 			ncec->ncec_ptpn = NULL;
435 			ncec->ncec_next = NULL;
436 			ncec->ncec_next = *free_nce_list;
437 			*free_nce_list = ncec;
438 		}
439 		mutex_exit(&ncec->ncec_lock);
440 	}
441 }
442 
/*
 * Delete an ncec.  The caller must hold a reference on it.
 *
 * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
 *    will return this NCE. Also no new timeouts will
 *    be started (See nce_restart_timer).
 * 2. Cancel any currently running timeouts.
 * 3. If there is an ndp walker, return. The walker will do the cleanup.
 *    This ensures that walkers see a consistent list of NCEs while walking.
 * 4. Otherwise remove the NCE from the list of NCEs
 *
 * If the entry is already CONDEMNED this is a no-op: another thread owns
 * the delete.
 */
void
ncec_delete(ncec_t *ncec)
{
	ncec_t	**ptpn;
	ncec_t	*ncec1;
	int	ipversion = ncec->ncec_ipversion;
	ndp_g_t *ndp;
	ip_stack_t	*ipst = ncec->ncec_ipst;

	/* Pick the per-stack table matching the entry's address family. */
	if (ipversion == IPV4_VERSION)
		ndp = ipst->ips_ndp4;
	else
		ndp = ipst->ips_ndp6;

	/* Serialize deletes */
	mutex_enter(&ncec->ncec_lock);
	if (NCE_ISCONDEMNED(ncec)) {
		/* Some other thread is doing the delete */
		mutex_exit(&ncec->ncec_lock);
		return;
	}
	/*
	 * Caller has a refhold. Also 1 ref for being in the list. Thus
	 * refcnt has to be >= 2
	 */
	ASSERT(ncec->ncec_refcnt >= 2);
	ncec->ncec_flags |= NCE_F_CONDEMNED;
	mutex_exit(&ncec->ncec_lock);

	/* Count how many condemned nces for kmem_cache callback */
	atomic_add_32(&ipst->ips_num_nce_condemned, 1);
	nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);

	/* Complete any waiting callbacks */
	ncec_cb_dispatch(ncec);

	/*
	 * Cancel any running timer. Timeout can't be restarted
	 * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
	 * Passing invalid timeout id is fine.
	 */
	if (ncec->ncec_timeout_id != 0) {
		(void) untimeout(ncec->ncec_timeout_id);
		ncec->ncec_timeout_id = 0;
	}

	mutex_enter(&ndp->ndp_g_lock);
	if (ncec->ncec_ptpn == NULL) {
		/*
		 * The last ndp walker has already removed this ncec from
		 * the list after we marked the ncec CONDEMNED and before
		 * we grabbed the global lock.
		 */
		mutex_exit(&ndp->ndp_g_lock);
		return;
	}
	if (ndp->ndp_g_walker > 0) {
		/*
		 * Can't unlink. The walker will clean up
		 */
		ndp->ndp_g_walker_cleanup = B_TRUE;
		mutex_exit(&ndp->ndp_g_lock);
		return;
	}

	/*
	 * Now remove the ncec from the list. nce_restart_timer won't restart
	 * the timer since it is marked CONDEMNED.
	 */
	ptpn = ncec->ncec_ptpn;
	ncec1 = ncec->ncec_next;
	if (ncec1 != NULL)
		ncec1->ncec_ptpn = ptpn;
	*ptpn = ncec1;
	ncec->ncec_ptpn = NULL;
	ncec->ncec_next = NULL;
	mutex_exit(&ndp->ndp_g_lock);

	/* Removed from ncec_ptpn/ncec_next list */
	ncec_refrele_notr(ncec);
}
533 
534 void
535 ncec_inactive(ncec_t *ncec)
536 {
537 	mblk_t		**mpp;
538 	ill_t		*ill = ncec->ncec_ill;
539 	ip_stack_t	*ipst = ncec->ncec_ipst;
540 
541 	ASSERT(ncec->ncec_refcnt == 0);
542 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
543 
544 	/* Count how many condemned nces for kmem_cache callback */
545 	if (NCE_ISCONDEMNED(ncec))
546 		atomic_add_32(&ipst->ips_num_nce_condemned, -1);
547 
548 	/* Free all allocated messages */
549 	mpp = &ncec->ncec_qd_mp;
550 	while (*mpp != NULL) {
551 		mblk_t  *mp;
552 
553 		mp = *mpp;
554 		*mpp = mp->b_next;
555 
556 		inet_freemsg(mp);
557 	}
558 	/*
559 	 * must have been cleaned up in ncec_delete
560 	 */
561 	ASSERT(list_is_empty(&ncec->ncec_cb));
562 	list_destroy(&ncec->ncec_cb);
563 	/*
564 	 * free the ncec_lladdr if one was allocated in nce_add_common()
565 	 */
566 	if (ncec->ncec_lladdr_length > 0)
567 		kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);
568 
569 #ifdef DEBUG
570 	ncec_trace_cleanup(ncec);
571 #endif
572 
573 	mutex_enter(&ill->ill_lock);
574 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
575 	    (char *), "ncec", (void *), ncec);
576 	ill->ill_ncec_cnt--;
577 	ncec->ncec_ill = NULL;
578 	/*
579 	 * If the number of ncec's associated with this ill have dropped
580 	 * to zero, check whether we need to restart any operation that
581 	 * is waiting for this to happen.
582 	 */
583 	if (ILL_DOWN_OK(ill)) {
584 		/* ipif_ill_refrele_tail drops the ill_lock */
585 		ipif_ill_refrele_tail(ill);
586 	} else {
587 		mutex_exit(&ill->ill_lock);
588 	}
589 
590 	mutex_destroy(&ncec->ncec_lock);
591 	kmem_cache_free(ncec_cache, ncec);
592 }
593 
594 /*
595  * ncec_walk routine.  Delete the ncec if it is associated with the ill
596  * that is going away.  Always called as a writer.
597  */
598 void
599 ncec_delete_per_ill(ncec_t *ncec, uchar_t *arg)
600 {
601 	if ((ncec != NULL) && ncec->ncec_ill == (ill_t *)arg) {
602 		ncec_delete(ncec);
603 	}
604 }
605 
606 /*
607  * Neighbor Cache cleanup logic for a list of ncec_t entries.
608  */
609 static void
610 nce_cleanup_list(ncec_t *ncec)
611 {
612 	ncec_t *ncec_next;
613 
614 	ASSERT(ncec != NULL);
615 	while (ncec != NULL) {
616 		ncec_next = ncec->ncec_next;
617 		ncec->ncec_next = NULL;
618 
619 		/*
620 		 * It is possible for the last ndp walker (this thread)
621 		 * to come here after ncec_delete has marked the ncec CONDEMNED
622 		 * and before it has removed the ncec from the fastpath list
623 		 * or called untimeout. So we need to do it here. It is safe
624 		 * for both ncec_delete and this thread to do it twice or
625 		 * even simultaneously since each of the threads has a
626 		 * reference on the ncec.
627 		 */
628 		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
629 		/*
630 		 * Cancel any running timer. Timeout can't be restarted
631 		 * since CONDEMNED is set. The ncec_lock can't be
632 		 * held across untimeout though passing invalid timeout
633 		 * id is fine.
634 		 */
635 		if (ncec->ncec_timeout_id != 0) {
636 			(void) untimeout(ncec->ncec_timeout_id);
637 			ncec->ncec_timeout_id = 0;
638 		}
639 		/* Removed from ncec_ptpn/ncec_next list */
640 		ncec_refrele_notr(ncec);
641 		ncec = ncec_next;
642 	}
643 }
644 
645 /*
646  * Restart DAD on given NCE.  Returns B_TRUE if DAD has been restarted.
647  */
648 boolean_t
649 nce_restart_dad(ncec_t *ncec)
650 {
651 	boolean_t started;
652 	ill_t *ill, *hwaddr_ill;
653 
654 	if (ncec == NULL)
655 		return (B_FALSE);
656 	ill = ncec->ncec_ill;
657 	mutex_enter(&ncec->ncec_lock);
658 	if (ncec->ncec_state == ND_PROBE) {
659 		mutex_exit(&ncec->ncec_lock);
660 		started = B_TRUE;
661 	} else if (ncec->ncec_state == ND_REACHABLE) {
662 		ASSERT(ncec->ncec_lladdr != NULL);
663 		ncec->ncec_state = ND_PROBE;
664 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
665 		/*
666 		 * Slight cheat here: we don't use the initial probe delay
667 		 * for IPv4 in this obscure case.
668 		 */
669 		mutex_exit(&ncec->ncec_lock);
670 		if (IS_IPMP(ill)) {
671 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
672 			    ncec->ncec_lladdr, ncec->ncec_lladdr_length);
673 		} else {
674 			hwaddr_ill = ill;
675 		}
676 		nce_dad(ncec, hwaddr_ill, B_TRUE);
677 		started = B_TRUE;
678 	} else {
679 		mutex_exit(&ncec->ncec_lock);
680 		started = B_FALSE;
681 	}
682 	return (started);
683 }
684 
685 /*
686  * IPv6 Cache entry lookup.  Try to find an ncec matching the parameters passed.
687  * If one is found, the refcnt on the ncec will be incremented.
688  */
689 ncec_t *
690 ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
691 {
692 	ncec_t		*ncec;
693 	ip_stack_t	*ipst = ill->ill_ipst;
694 
695 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
696 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
697 
698 	/* Get head of v6 hash table */
699 	ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
700 	ncec = ncec_lookup_illgrp(ill, addr, ncec);
701 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
702 	rw_exit(&ipst->ips_ill_g_lock);
703 	return (ncec);
704 }
705 /*
706  * IPv4 Cache entry lookup.  Try to find an ncec matching the parameters passed.
707  * If one is found, the refcnt on the ncec will be incremented.
708  */
709 ncec_t *
710 ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
711 {
712 	ncec_t	*ncec = NULL;
713 	in6_addr_t addr6;
714 	ip_stack_t *ipst = ill->ill_ipst;
715 
716 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
717 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
718 
719 	/* Get head of v4 hash table */
720 	ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
721 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
722 	ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
723 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
724 	rw_exit(&ipst->ips_ill_g_lock);
725 	return (ncec);
726 }
727 
728 /*
729  * Cache entry lookup.  Try to find an ncec matching the parameters passed.
730  * If an ncec is found, increment the hold count on that ncec.
731  * The caller passes in the start of the appropriate hash table, and must
732  * be holding the appropriate global lock (ndp_g_lock). In addition, since
733  * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
734  * must be held as reader.
735  *
736  * This function always matches across the ipmp group.
737  */
738 ncec_t *
739 ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
740 {
741 	ndp_g_t		*ndp;
742 	ip_stack_t	*ipst = ill->ill_ipst;
743 
744 	if (ill->ill_isv6)
745 		ndp = ipst->ips_ndp6;
746 	else
747 		ndp = ipst->ips_ndp4;
748 
749 	ASSERT(ill != NULL);
750 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
751 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
752 		return (NULL);
753 	for (; ncec != NULL; ncec = ncec->ncec_next) {
754 		if (ncec->ncec_ill == ill ||
755 		    IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
756 			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
757 				mutex_enter(&ncec->ncec_lock);
758 				if (!NCE_ISCONDEMNED(ncec)) {
759 					ncec_refhold_locked(ncec);
760 					mutex_exit(&ncec->ncec_lock);
761 					break;
762 				}
763 				mutex_exit(&ncec->ncec_lock);
764 			}
765 		}
766 	}
767 	return (ncec);
768 }
769 
770 /*
771  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
772  * entries for ill only, i.e., when ill is part of an ipmp group,
773  * nce_lookup_v4 will never try to match across the group.
774  */
775 nce_t *
776 nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
777 {
778 	nce_t *nce;
779 	in6_addr_t addr6;
780 	ip_stack_t *ipst = ill->ill_ipst;
781 
782 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
783 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
784 	nce = nce_lookup_addr(ill, &addr6);
785 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
786 	return (nce);
787 }
788 
789 /*
790  * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
791  * entries for ill only, i.e., when ill is part of an ipmp group,
792  * nce_lookup_v6 will never try to match across the group.
793  */
794 nce_t *
795 nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
796 {
797 	nce_t *nce;
798 	ip_stack_t *ipst = ill->ill_ipst;
799 
800 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
801 	nce = nce_lookup_addr(ill, addr6);
802 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
803 	return (nce);
804 }
805 
806 static nce_t *
807 nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
808 {
809 	nce_t *nce;
810 
811 	ASSERT(ill != NULL);
812 #ifdef DEBUG
813 	if (ill->ill_isv6)
814 		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
815 	else
816 		ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
817 #endif
818 	mutex_enter(&ill->ill_lock);
819 	nce = nce_lookup(ill, addr);
820 	mutex_exit(&ill->ill_lock);
821 	return (nce);
822 }
823 
824 
825 /*
826  * Router turned to host.  We need to make sure that cached copies of the ncec
827  * are not used for forwarding packets if they were derived from the default
828  * route, and that the default route itself is removed, as  required by
829  * section 7.2.5 of RFC 2461.
830  *
831  * Note that the ncec itself probably has valid link-layer information for the
832  * nexthop, so that there is no reason to delete the ncec, as long as the
833  * ISROUTER flag is turned off.
834  */
835 static void
836 ncec_router_to_host(ncec_t *ncec)
837 {
838 	ire_t		*ire;
839 	ip_stack_t	*ipst = ncec->ncec_ipst;
840 
841 	mutex_enter(&ncec->ncec_lock);
842 	ncec->ncec_flags &= ~NCE_F_ISROUTER;
843 	mutex_exit(&ncec->ncec_lock);
844 
845 	ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
846 	    &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
847 	    MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
848 	if (ire != NULL) {
849 		ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
850 		ire_delete(ire);
851 		ire_refrele(ire);
852 	}
853 }
854 
855 /*
856  * Process passed in parameters either from an incoming packet or via
857  * user ioctl.
858  */
859 void
860 nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
861 {
862 	ill_t	*ill = ncec->ncec_ill;
863 	uint32_t hw_addr_len = ill->ill_phys_addr_length;
864 	boolean_t ll_updated = B_FALSE;
865 	boolean_t ll_changed;
866 	nce_t	*nce;
867 
868 	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
869 	/*
870 	 * No updates of link layer address or the neighbor state is
871 	 * allowed, when the cache is in NONUD state.  This still
872 	 * allows for responding to reachability solicitation.
873 	 */
874 	mutex_enter(&ncec->ncec_lock);
875 	if (ncec->ncec_state == ND_INCOMPLETE) {
876 		if (hw_addr == NULL) {
877 			mutex_exit(&ncec->ncec_lock);
878 			return;
879 		}
880 		nce_set_ll(ncec, hw_addr);
881 		/*
882 		 * Update ncec state and send the queued packets
883 		 * back to ip this time ire will be added.
884 		 */
885 		if (flag & ND_NA_FLAG_SOLICITED) {
886 			nce_update(ncec, ND_REACHABLE, NULL);
887 		} else {
888 			nce_update(ncec, ND_STALE, NULL);
889 		}
890 		mutex_exit(&ncec->ncec_lock);
891 		nce = nce_fastpath(ncec, B_TRUE, NULL);
892 		nce_resolv_ok(ncec);
893 		if (nce != NULL)
894 			nce_refrele(nce);
895 		return;
896 	}
897 	ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
898 	if (!is_adv) {
899 		/* If this is a SOLICITATION request only */
900 		if (ll_changed)
901 			nce_update(ncec, ND_STALE, hw_addr);
902 		mutex_exit(&ncec->ncec_lock);
903 		ncec_cb_dispatch(ncec);
904 		return;
905 	}
906 	if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
907 		/* If in any other state than REACHABLE, ignore */
908 		if (ncec->ncec_state == ND_REACHABLE) {
909 			nce_update(ncec, ND_STALE, NULL);
910 		}
911 		mutex_exit(&ncec->ncec_lock);
912 		ncec_cb_dispatch(ncec);
913 		return;
914 	} else {
915 		if (ll_changed) {
916 			nce_update(ncec, ND_UNCHANGED, hw_addr);
917 			ll_updated = B_TRUE;
918 		}
919 		if (flag & ND_NA_FLAG_SOLICITED) {
920 			nce_update(ncec, ND_REACHABLE, NULL);
921 		} else {
922 			if (ll_updated) {
923 				nce_update(ncec, ND_STALE, NULL);
924 			}
925 		}
926 		mutex_exit(&ncec->ncec_lock);
927 		if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
928 		    NCE_F_ISROUTER)) {
929 			ncec_router_to_host(ncec);
930 		} else {
931 			ncec_cb_dispatch(ncec);
932 		}
933 	}
934 }
935 
/*
 * Pass arg1 to the pfi supplied, along with each ncec in existence.
 * ncec_walk() places a REFHOLD on the ncec and drops the lock when
 * walking the hash list.
 *
 *	ndp	table to walk (IPv4 or IPv6)
 *	ill	if non-NULL, only entries whose ncec_ill is this ill are
 *		visited; NULL visits every entry
 *	trace	selects the ncec_refhold()/ncec_refrele() pair when true,
 *		and the _notr variants otherwise
 *
 * The walker count is raised for the duration of the walk so that
 * ncec_delete leaves condemned entries linked; the last walker out removes
 * them and cleans them up after dropping ndp_g_lock.
 */
void
ncec_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
    boolean_t trace)
{
	ncec_t	*ncec;
	ncec_t	*ncec1;
	ncec_t	**ncep;
	ncec_t	*free_nce_list = NULL;

	mutex_enter(&ndp->ndp_g_lock);
	/* Prevent ncec_delete from unlink and free of NCE */
	ndp->ndp_g_walker++;
	mutex_exit(&ndp->ndp_g_lock);
	/* Walk every bucket without ndp_g_lock held. */
	for (ncep = ndp->nce_hash_tbl;
	    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
		for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
			/* Fetch the successor before calling out to pfi. */
			ncec1 = ncec->ncec_next;
			if (ill == NULL || ncec->ncec_ill == ill) {
				if (trace) {
					ncec_refhold(ncec);
					(*pfi)(ncec, arg1);
					ncec_refrele(ncec);
				} else {
					ncec_refhold_notr(ncec);
					(*pfi)(ncec, arg1);
					ncec_refrele_notr(ncec);
				}
			}
		}
	}
	mutex_enter(&ndp->ndp_g_lock);
	ndp->ndp_g_walker--;
	if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
		/* Time to delete condemned entries */
		for (ncep = ndp->nce_hash_tbl;
		    ncep < A_END(ndp->nce_hash_tbl); ncep++) {
			ncec = *ncep;
			if (ncec != NULL) {
				nce_remove(ndp, ncec, &free_nce_list);
			}
		}
		ndp->ndp_g_walker_cleanup = B_FALSE;
	}

	mutex_exit(&ndp->ndp_g_lock);

	/* Finish the unlinked entries now that ndp_g_lock is dropped. */
	if (free_nce_list != NULL) {
		nce_cleanup_list(free_nce_list);
	}
}
991 
992 /*
993  * Walk everything.
994  * Note that ill can be NULL hence can't derive the ipst from it.
995  */
996 void
997 ncec_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
998 {
999 	ncec_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
1000 	ncec_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
1001 }
1002 
1003 /*
1004  * For each interface an entry is added for the unspecified multicast group.
1005  * Here that mapping is used to form the multicast cache entry for a particular
1006  * multicast destination.
1007  */
1008 static int
1009 nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
1010     uint16_t flags, nce_t **newnce)
1011 {
1012 	uchar_t		*hw_addr;
1013 	int		err = 0;
1014 	ip_stack_t	*ipst = ill->ill_ipst;
1015 	nce_t		*nce;
1016 
1017 	ASSERT(ill != NULL);
1018 	ASSERT(ill->ill_isv6);
1019 	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
1020 
1021 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
1022 	nce = nce_lookup_addr(ill, dst);
1023 	if (nce != NULL) {
1024 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1025 		goto done;
1026 	}
1027 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
1028 		/*
1029 		 * For IRE_IF_RESOLVER a hardware mapping can be
1030 		 * generated.
1031 		 */
1032 		hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
1033 		if (hw_addr == NULL) {
1034 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1035 			return (ENOMEM);
1036 		}
1037 		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
1038 	} else {
1039 		/* No hw_addr is needed for IRE_IF_NORESOLVER. */
1040 		hw_addr = NULL;
1041 	}
1042 	ASSERT((flags & NCE_F_MCAST) != 0);
1043 	ASSERT((flags & NCE_F_NONUD) != 0);
1044 	/* nce_state will be computed by nce_add_common() */
1045 	err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1046 	    ND_UNCHANGED, &nce);
1047 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1048 	if (err == 0)
1049 		err = nce_add_v6_postprocess(nce);
1050 	if (hw_addr != NULL)
1051 		kmem_free(hw_addr, ill->ill_nd_lla_len);
1052 	if (err != 0) {
1053 		ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
1054 		return (err);
1055 	}
1056 done:
1057 	ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1058 	if (newnce != NULL)
1059 		*newnce = nce;
1060 	else
1061 		nce_refrele(nce);
1062 	return (0);
1063 }
1064 
1065 /*
1066  * Return the link layer address, and any flags of a ncec.
1067  */
1068 int
1069 ndp_query(ill_t *ill, struct lif_nd_req *lnr)
1070 {
1071 	ncec_t		*ncec;
1072 	in6_addr_t	*addr;
1073 	sin6_t		*sin6;
1074 
1075 	ASSERT(ill != NULL && ill->ill_isv6);
1076 	sin6 = (sin6_t *)&lnr->lnr_addr;
1077 	addr =  &sin6->sin6_addr;
1078 
1079 	/*
1080 	 * NOTE: if the ill is an IPMP interface, then match against the whole
1081 	 * illgrp.  This e.g. allows in.ndpd to retrieve the link layer
1082 	 * addresses for the data addresses on an IPMP interface even though
1083 	 * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
1084 	 */
1085 	ncec = ncec_lookup_illgrp_v6(ill, addr);
1086 	if (ncec == NULL)
1087 		return (ESRCH);
1088 	/* If no link layer address is available yet, return ESRCH */
1089 	if (!NCE_ISREACHABLE(ncec)) {
1090 		ncec_refrele(ncec);
1091 		return (ESRCH);
1092 	}
1093 	lnr->lnr_hdw_len = ill->ill_phys_addr_length;
1094 	bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
1095 	    lnr->lnr_hdw_len);
1096 	if (ncec->ncec_flags & NCE_F_ISROUTER)
1097 		lnr->lnr_flags = NDF_ISROUTER_ON;
1098 	if (ncec->ncec_flags & NCE_F_ANYCAST)
1099 		lnr->lnr_flags |= NDF_ANYCAST_ON;
1100 	ncec_refrele(ncec);
1101 	return (0);
1102 }
1103 
1104 /*
1105  * Finish setting up the Enable/Disable multicast for the driver.
1106  */
1107 mblk_t *
1108 ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
1109     uint32_t hw_addr_offset, mblk_t *mp)
1110 {
1111 	uchar_t		*hw_addr;
1112 	ipaddr_t	v4group;
1113 	uchar_t		*addr;
1114 
1115 	ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
1116 	if (IN6_IS_ADDR_V4MAPPED(v6group)) {
1117 		IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
1118 
1119 		ASSERT(CLASSD(v4group));
1120 		ASSERT(!(ill->ill_isv6));
1121 
1122 		addr = (uchar_t *)&v4group;
1123 	} else {
1124 		ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
1125 		ASSERT(ill->ill_isv6);
1126 
1127 		addr = (uchar_t *)v6group;
1128 	}
1129 	hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
1130 	if (hw_addr == NULL) {
1131 		ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
1132 		freemsg(mp);
1133 		return (NULL);
1134 	}
1135 
1136 	ip_mcast_mapping(ill, addr, hw_addr);
1137 	return (mp);
1138 }
1139 
1140 void
1141 ip_ndp_resolve(ncec_t *ncec)
1142 {
1143 	in_addr_t	sender4 = INADDR_ANY;
1144 	in6_addr_t	sender6 = ipv6_all_zeros;
1145 	ill_t		*src_ill;
1146 	uint32_t	ms;
1147 
1148 	src_ill = nce_resolve_src(ncec, &sender6);
1149 	if (src_ill == NULL) {
1150 		/* Make sure we try again later */
1151 		ms = ncec->ncec_ill->ill_reachable_retrans_time;
1152 		nce_restart_timer(ncec, (clock_t)ms);
1153 		return;
1154 	}
1155 	if (ncec->ncec_ipversion == IPV4_VERSION)
1156 		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
1157 	mutex_enter(&ncec->ncec_lock);
1158 	if (ncec->ncec_ipversion == IPV6_VERSION)
1159 		ms = ndp_solicit(ncec, sender6, src_ill);
1160 	else
1161 		ms = arp_request(ncec, sender4, src_ill);
1162 	mutex_exit(&ncec->ncec_lock);
1163 	if (ms == 0) {
1164 		if (ncec->ncec_state != ND_REACHABLE) {
1165 			if (ncec->ncec_ipversion == IPV6_VERSION)
1166 				ndp_resolv_failed(ncec);
1167 			else
1168 				arp_resolv_failed(ncec);
1169 			ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
1170 			nce_make_unreachable(ncec);
1171 			ncec_delete(ncec);
1172 		}
1173 	} else {
1174 		nce_restart_timer(ncec, (clock_t)ms);
1175 	}
1176 done:
1177 	ill_refrele(src_ill);
1178 }
1179 
1180 /*
1181  * Send an IPv6 neighbor solicitation.
1182  * Returns number of milliseconds after which we should either rexmit or abort.
1183  * Return of zero means we should abort.
1184  * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
1185  * The optional source address is used as a hint to ndp_solicit for
1186  * which source to use in the packet.
1187  *
1188  * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
1189  * the packet.
1190  */
1191 uint32_t
1192 ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
1193 {
1194 	in6_addr_t	dst;
1195 	boolean_t	dropped = B_FALSE;
1196 
1197 	ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
1198 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
1199 
1200 	if (ncec->ncec_rcnt == 0)
1201 		return (0);
1202 
1203 	dst = ncec->ncec_addr;
1204 	ncec->ncec_rcnt--;
1205 	mutex_exit(&ncec->ncec_lock);
1206 	dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
1207 	    ill->ill_phys_addr_length, &src, &dst, 0);
1208 	mutex_enter(&ncec->ncec_lock);
1209 	if (dropped)
1210 		ncec->ncec_rcnt++;
1211 	return (ncec->ncec_ill->ill_reachable_retrans_time);
1212 }
1213 
1214 /*
1215  * Attempt to recover an address on an interface that's been marked as a
1216  * duplicate.  Because NCEs are destroyed when the interface goes down, there's
1217  * no easy way to just probe the address and have the right thing happen if
1218  * it's no longer in use.  Instead, we just bring it up normally and allow the
1219  * regular interface start-up logic to probe for a remaining duplicate and take
1220  * us back down if necessary.
1221  * Neither DHCP nor temporary addresses arrive here; they're excluded by
1222  * ip_ndp_excl.
1223  */
1224 /* ARGSUSED */
1225 void
1226 ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1227 {
1228 	ill_t	*ill = rq->q_ptr;
1229 	ipif_t	*ipif;
1230 	in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
1231 	in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
1232 	boolean_t addr_equal;
1233 
1234 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
1235 		/*
1236 		 * We do not support recovery of proxy ARP'd interfaces,
1237 		 * because the system lacks a complete proxy ARP mechanism.
1238 		 */
1239 		if (ill->ill_isv6) {
1240 			addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
1241 			    addr6);
1242 		} else {
1243 			addr_equal = (ipif->ipif_lcl_addr == *addr4);
1244 		}
1245 
1246 		if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
1247 			continue;
1248 
1249 		/*
1250 		 * If we have already recovered or if the interface is going
1251 		 * away, then ignore.
1252 		 */
1253 		mutex_enter(&ill->ill_lock);
1254 		if (!(ipif->ipif_flags & IPIF_DUPLICATE) ||
1255 		    (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1256 			mutex_exit(&ill->ill_lock);
1257 			continue;
1258 		}
1259 
1260 		ipif->ipif_flags &= ~IPIF_DUPLICATE;
1261 		ill->ill_ipif_dup_count--;
1262 		mutex_exit(&ill->ill_lock);
1263 		ipif->ipif_was_dup = B_TRUE;
1264 
1265 		if (ill->ill_isv6) {
1266 			VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
1267 			(void) ipif_up_done_v6(ipif);
1268 		} else {
1269 			VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
1270 			    EINPROGRESS);
1271 			(void) ipif_up_done(ipif);
1272 		}
1273 	}
1274 	freeb(mp);
1275 }
1276 
1277 /*
1278  * Attempt to recover an IPv6 interface that's been shut down as a duplicate.
1279  * As long as someone else holds the address, the interface will stay down.
1280  * When that conflict goes away, the interface is brought back up.  This is
1281  * done so that accidental shutdowns of addresses aren't made permanent.  Your
1282  * server will recover from a failure.
1283  *
1284  * For DHCP and temporary addresses, recovery is not done in the kernel.
1285  * Instead, it's handled by user space processes (dhcpagent and in.ndpd).
1286  *
1287  * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
1288  */
1289 void
1290 ipif_dup_recovery(void *arg)
1291 {
1292 	ipif_t *ipif = arg;
1293 
1294 	ipif->ipif_recovery_id = 0;
1295 	if (!(ipif->ipif_flags & IPIF_DUPLICATE))
1296 		return;
1297 
1298 	/*
1299 	 * No lock, because this is just an optimization.
1300 	 */
1301 	if (ipif->ipif_state_flags & IPIF_CONDEMNED)
1302 		return;
1303 
1304 	/* If the link is down, we'll retry this later */
1305 	if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
1306 		return;
1307 
1308 	ipif_do_recovery(ipif);
1309 }
1310 
1311 /*
1312  * Perform interface recovery by forcing the duplicate interfaces up and
1313  * allowing the system to determine which ones should stay up.
1314  *
1315  * Called both by recovery timer expiry and link-up notification.
1316  */
1317 void
1318 ipif_do_recovery(ipif_t *ipif)
1319 {
1320 	ill_t *ill = ipif->ipif_ill;
1321 	mblk_t *mp;
1322 	ip_stack_t *ipst = ill->ill_ipst;
1323 	size_t mp_size;
1324 
1325 	if (ipif->ipif_isv6)
1326 		mp_size = sizeof (ipif->ipif_v6lcl_addr);
1327 	else
1328 		mp_size = sizeof (ipif->ipif_lcl_addr);
1329 	mp = allocb(mp_size, BPRI_MED);
1330 	if (mp == NULL) {
1331 		mutex_enter(&ill->ill_lock);
1332 		if (ipst->ips_ip_dup_recovery > 0 &&
1333 		    ipif->ipif_recovery_id == 0 &&
1334 		    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
1335 			ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1336 			    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1337 		}
1338 		mutex_exit(&ill->ill_lock);
1339 	} else {
1340 		/*
1341 		 * A recovery timer may still be running if we got here from
1342 		 * ill_restart_dad(); cancel that timer.
1343 		 */
1344 		if (ipif->ipif_recovery_id != 0)
1345 			(void) untimeout(ipif->ipif_recovery_id);
1346 		ipif->ipif_recovery_id = 0;
1347 
1348 		if (ipif->ipif_isv6) {
1349 			bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
1350 			    sizeof (ipif->ipif_v6lcl_addr));
1351 		} else  {
1352 			bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
1353 			    sizeof (ipif->ipif_lcl_addr));
1354 		}
1355 		ill_refhold(ill);
1356 		qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
1357 		    B_FALSE);
1358 	}
1359 }
1360 
1361 /*
1362  * Find the MAC and IP addresses in an NA/NS message.
1363  */
1364 static void
1365 ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
1366     in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
1367 {
1368 	icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1369 	nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
1370 	uchar_t *addr;
1371 	int alen;
1372 
1373 	/* icmp_inbound_v6 ensures this */
1374 	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1375 
1376 	addr = ira->ira_l2src;
1377 	alen = ill->ill_phys_addr_length;
1378 	if (alen > 0) {
1379 		*haddr = addr;
1380 		*haddrlenp = alen;
1381 	} else {
1382 		*haddr = NULL;
1383 		*haddrlenp = 0;
1384 	}
1385 
1386 	/* nd_ns_target and nd_na_target are at the same offset, so we cheat */
1387 	*targp = ns->nd_ns_target;
1388 }
1389 
1390 /*
1391  * This is for exclusive changes due to NDP duplicate address detection
1392  * failure.
1393  */
1394 /* ARGSUSED */
1395 static void
1396 ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
1397 {
1398 	ill_t	*ill = rq->q_ptr;
1399 	ipif_t	*ipif;
1400 	uchar_t	*haddr;
1401 	uint_t	haddrlen;
1402 	ip_stack_t *ipst = ill->ill_ipst;
1403 	in6_addr_t targ;
1404 	ip_recv_attr_t iras;
1405 	mblk_t	*attrmp;
1406 
1407 	attrmp = mp;
1408 	mp = mp->b_cont;
1409 	attrmp->b_cont = NULL;
1410 	if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
1411 		/* The ill or ip_stack_t disappeared on us */
1412 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1413 		ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
1414 		freemsg(mp);
1415 		ira_cleanup(&iras, B_TRUE);
1416 		return;
1417 	}
1418 
1419 	ASSERT(ill == iras.ira_rill);
1420 
1421 	ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
1422 	if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
1423 		/*
1424 		 * Ignore conflicts generated by misbehaving switches that
1425 		 * just reflect our own messages back to us.  For IPMP, we may
1426 		 * see reflections across any ill in the illgrp.
1427 		 *
1428 		 * RFC2462 and revisions tried to detect both the case
1429 		 * when a statically configured IPv6 address is a duplicate,
1430 		 * and the case when the L2 address itself is a duplicate. The
1431 		 * later is important because, with stateles address autoconf,
1432 		 * if the L2 address is a duplicate, the resulting IPv6
1433 		 * address(es) would also be duplicates. We rely on DAD of the
1434 		 * IPv6 address itself to detect the latter case.
1435 		 */
1436 		/* For an under ill_grp can change under lock */
1437 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1438 		if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
1439 		    IS_UNDER_IPMP(ill) &&
1440 		    ipmp_illgrp_find_ill(ill->ill_grp, haddr,
1441 		    haddrlen) != NULL) {
1442 			rw_exit(&ipst->ips_ill_g_lock);
1443 			goto ignore_conflict;
1444 		}
1445 		rw_exit(&ipst->ips_ill_g_lock);
1446 	}
1447 
1448 	/*
1449 	 * Look up the appropriate ipif.
1450 	 */
1451 	ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
1452 	if (ipif == NULL)
1453 		goto ignore_conflict;
1454 
1455 	/* Reload the ill to match the ipif */
1456 	ill = ipif->ipif_ill;
1457 
1458 	/* If it's already duplicate or ineligible, then don't do anything. */
1459 	if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
1460 		ipif_refrele(ipif);
1461 		goto ignore_conflict;
1462 	}
1463 
1464 	/*
1465 	 * If this is a failure during duplicate recovery, then don't
1466 	 * complain.  It may take a long time to recover.
1467 	 */
1468 	if (!ipif->ipif_was_dup) {
1469 		char ibuf[LIFNAMSIZ];
1470 		char hbuf[MAC_STR_LEN];
1471 		char sbuf[INET6_ADDRSTRLEN];
1472 
1473 		ipif_get_name(ipif, ibuf, sizeof (ibuf));
1474 		cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
1475 		    " disabled", ibuf,
1476 		    inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
1477 		    mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)));
1478 	}
1479 	mutex_enter(&ill->ill_lock);
1480 	ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
1481 	ipif->ipif_flags |= IPIF_DUPLICATE;
1482 	ill->ill_ipif_dup_count++;
1483 	mutex_exit(&ill->ill_lock);
1484 	(void) ipif_down(ipif, NULL, NULL);
1485 	(void) ipif_down_tail(ipif);
1486 	mutex_enter(&ill->ill_lock);
1487 	if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
1488 	    ill->ill_net_type == IRE_IF_RESOLVER &&
1489 	    !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
1490 	    ipst->ips_ip_dup_recovery > 0) {
1491 		ASSERT(ipif->ipif_recovery_id == 0);
1492 		ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
1493 		    ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
1494 	}
1495 	mutex_exit(&ill->ill_lock);
1496 	ipif_refrele(ipif);
1497 
1498 ignore_conflict:
1499 	freemsg(mp);
1500 	ira_cleanup(&iras, B_TRUE);
1501 }
1502 
1503 /*
1504  * Handle failure by tearing down the ipifs with the specified address.  Note
1505  * that tearing down the ipif also means deleting the ncec through ipif_down, so
1506  * it's not possible to do recovery by just restarting the ncec timer.  Instead,
1507  * we start a timer on the ipif.
1508  * Caller has to free mp;
1509  */
1510 static void
1511 ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
1512 {
1513 	const uchar_t	*haddr;
1514 	ill_t		*ill = ira->ira_rill;
1515 
1516 	/*
1517 	 * Ignore conflicts generated by misbehaving switches that just
1518 	 * reflect our own messages back to us.
1519 	 */
1520 
1521 	/* icmp_inbound_v6 ensures this */
1522 	ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
1523 	haddr = ira->ira_l2src;
1524 	if (haddr != NULL &&
1525 	    bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
1526 		return;
1527 	}
1528 
1529 	if ((mp = copymsg(mp)) != NULL) {
1530 		mblk_t	*attrmp;
1531 
1532 		attrmp = ip_recv_attr_to_mblk(ira);
1533 		if (attrmp == NULL) {
1534 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1535 			ip_drop_input("ipIfStatsInDiscards", mp, ill);
1536 			freemsg(mp);
1537 		} else {
1538 			ASSERT(attrmp->b_cont == NULL);
1539 			attrmp->b_cont = mp;
1540 			mp = attrmp;
1541 			ill_refhold(ill);
1542 			qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
1543 			    B_FALSE);
1544 		}
1545 	}
1546 }
1547 
1548 /*
1549  * Handle a discovered conflict: some other system is advertising that it owns
1550  * one of our IP addresses.  We need to defend ourselves, or just shut down the
1551  * interface.
1552  *
1553  * Handles both IPv4 and IPv6
1554  */
1555 boolean_t
1556 ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
1557 {
1558 	ipif_t		*ipif;
1559 	clock_t		now;
1560 	uint_t		maxdefense;
1561 	uint_t		defs;
1562 	ill_t		*ill = ira->ira_ill;
1563 	ip_stack_t	*ipst = ill->ill_ipst;
1564 	uint32_t	elapsed;
1565 	boolean_t	isv6 = ill->ill_isv6;
1566 	ipaddr_t	ncec_addr;
1567 
1568 	if (isv6) {
1569 		ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
1570 		    ipst);
1571 	} else {
1572 		if (arp_no_defense) {
1573 			/*
1574 			 * Yes, there is a conflict, but no, we do not
1575 			 * defend ourself.
1576 			 */
1577 			return (B_TRUE);
1578 		}
1579 		IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
1580 		ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
1581 		    ipst);
1582 	}
1583 	if (ipif == NULL)
1584 		return (B_FALSE);
1585 
1586 	/*
1587 	 * First, figure out if this address is disposable.
1588 	 */
1589 	if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY))
1590 		maxdefense = ipst->ips_ip_max_temp_defend;
1591 	else
1592 		maxdefense = ipst->ips_ip_max_defend;
1593 
1594 	/*
1595 	 * Now figure out how many times we've defended ourselves.  Ignore
1596 	 * defenses that happened long in the past.
1597 	 */
1598 	now = ddi_get_lbolt();
1599 	elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
1600 	mutex_enter(&ncec->ncec_lock);
1601 	if ((defs = ncec->ncec_defense_count) > 0 &&
1602 	    elapsed > ipst->ips_ip_defend_interval) {
1603 		/*
1604 		 * ip_defend_interval has elapsed.
1605 		 * reset the defense count.
1606 		 */
1607 		ncec->ncec_defense_count = defs = 0;
1608 	}
1609 	ncec->ncec_defense_count++;
1610 	ncec->ncec_last_time_defended = now;
1611 	mutex_exit(&ncec->ncec_lock);
1612 	ipif_refrele(ipif);
1613 
1614 	/*
1615 	 * If we've defended ourselves too many times already, then give up and
1616 	 * tear down the interface(s) using this address.
1617 	 * Otherwise, caller has to defend by sending out an announce.
1618 	 */
1619 	if (defs >= maxdefense) {
1620 		if (isv6)
1621 			ndp_failure(mp, ira);
1622 		else
1623 			arp_failure(mp, ira);
1624 	} else {
1625 		return (B_TRUE); /* caller must defend this address */
1626 	}
1627 	return (B_FALSE);
1628 }
1629 
1630 /*
1631  * Handle reception of Neighbor Solicitation messages.
1632  */
1633 static void
1634 ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
1635 {
1636 	ill_t		*ill = ira->ira_ill, *under_ill;
1637 	nd_neighbor_solicit_t *ns;
1638 	uint32_t	hlen = ill->ill_phys_addr_length;
1639 	uchar_t		*haddr = NULL;
1640 	icmp6_t		*icmp_nd;
1641 	ip6_t		*ip6h;
1642 	ncec_t		*our_ncec = NULL;
1643 	in6_addr_t	target;
1644 	in6_addr_t	src;
1645 	int		len;
1646 	int		flag = 0;
1647 	nd_opt_hdr_t	*opt = NULL;
1648 	boolean_t	bad_solicit = B_FALSE;
1649 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1650 	boolean_t	need_ill_refrele = B_FALSE;
1651 
1652 	ip6h = (ip6_t *)mp->b_rptr;
1653 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1654 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1655 	src = ip6h->ip6_src;
1656 	ns = (nd_neighbor_solicit_t *)icmp_nd;
1657 	target = ns->nd_ns_target;
1658 	if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1659 	    IN6_IS_ADDR_LOOPBACK(&target)) {
1660 		if (ip_debug > 2) {
1661 			/* ip1dbg */
1662 			pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1663 			    AF_INET6, &target);
1664 		}
1665 		bad_solicit = B_TRUE;
1666 		goto done;
1667 	}
1668 	if (len > sizeof (nd_neighbor_solicit_t)) {
1669 		/* Options present */
1670 		opt = (nd_opt_hdr_t *)&ns[1];
1671 		len -= sizeof (nd_neighbor_solicit_t);
1672 		if (!ndp_verify_optlen(opt, len)) {
1673 			ip1dbg(("ndp_input_solicit: Bad opt len\n"));
1674 			bad_solicit = B_TRUE;
1675 			goto done;
1676 		}
1677 	}
1678 	if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
1679 		/* Check to see if this is a valid DAD solicitation */
1680 		if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
1681 			if (ip_debug > 2) {
1682 				/* ip1dbg */
1683 				pr_addr_dbg("ndp_input_solicit: IPv6 "
1684 				    "Destination is not solicited node "
1685 				    "multicast %s\n", AF_INET6,
1686 				    &ip6h->ip6_dst);
1687 			}
1688 			bad_solicit = B_TRUE;
1689 			goto done;
1690 		}
1691 	}
1692 
1693 	/*
1694 	 * NOTE: with IPMP, it's possible the nominated multicast ill (which
1695 	 * received this packet if it's multicast) is not the ill tied to
1696 	 * e.g. the IPMP ill's data link-local.  So we match across the illgrp
1697 	 * to ensure we find the associated NCE.
1698 	 */
1699 	our_ncec = ncec_lookup_illgrp_v6(ill, &target);
1700 	/*
1701 	 * If this is a valid Solicitation for an address we are publishing,
1702 	 * then a PUBLISH entry should exist in the cache
1703 	 */
1704 	if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
1705 		ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
1706 		    "ifname=%s ", ill->ill_name));
1707 		if (ip_debug > 2) {
1708 			/* ip1dbg */
1709 			pr_addr_dbg(" dst %s\n", AF_INET6, &target);
1710 		}
1711 		if (our_ncec == NULL)
1712 			bad_solicit = B_TRUE;
1713 		goto done;
1714 	}
1715 
1716 	/* At this point we should have a verified NS per spec */
1717 	if (opt != NULL) {
1718 		opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR);
1719 		if (opt != NULL) {
1720 			haddr = (uchar_t *)&opt[1];
1721 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1722 			    hlen == 0) {
1723 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1724 				bad_solicit = B_TRUE;
1725 				goto done;
1726 			}
1727 		}
1728 	}
1729 
1730 	/* If sending directly to peer, set the unicast flag */
1731 	if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
1732 		flag |= NDP_UNICAST;
1733 
1734 	/*
1735 	 * Create/update the entry for the soliciting node on the ipmp_ill.
1736 	 * or respond to outstanding queries, don't if
1737 	 * the source is unspecified address.
1738 	 */
1739 	if (!IN6_IS_ADDR_UNSPECIFIED(&src)) {
1740 		int	err;
1741 		nce_t	*nnce;
1742 
1743 		ASSERT(ill->ill_isv6);
1744 		/*
1745 		 * Regular solicitations *must* include the Source Link-Layer
1746 		 * Address option.  Ignore messages that do not.
1747 		 */
1748 		if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
1749 			ip1dbg(("ndp_input_solicit: source link-layer address "
1750 			    "option missing with a specified source.\n"));
1751 			bad_solicit = B_TRUE;
1752 			goto done;
1753 		}
1754 
1755 		/*
1756 		 * This is a regular solicitation.  If we're still in the
1757 		 * process of verifying the address, then don't respond at all
1758 		 * and don't keep track of the sender.
1759 		 */
1760 		if (our_ncec->ncec_state == ND_PROBE)
1761 			goto done;
1762 
1763 		/*
1764 		 * If the solicitation doesn't have sender hardware address
1765 		 * (legal for unicast solicitation), then process without
1766 		 * installing the return NCE.  Either we already know it, or
1767 		 * we'll be forced to look it up when (and if) we reply to the
1768 		 * packet.
1769 		 */
1770 		if (haddr == NULL)
1771 			goto no_source;
1772 
1773 		under_ill = ill;
1774 		if (IS_UNDER_IPMP(under_ill)) {
1775 			ill = ipmp_ill_hold_ipmp_ill(under_ill);
1776 			if (ill == NULL)
1777 				ill = under_ill;
1778 			else
1779 				need_ill_refrele = B_TRUE;
1780 		}
1781 		err = nce_lookup_then_add_v6(ill,
1782 		    haddr, hlen,
1783 		    &src,	/* Soliciting nodes address */
1784 		    0,
1785 		    ND_STALE,
1786 		    &nnce);
1787 
1788 		if (need_ill_refrele) {
1789 			ill_refrele(ill);
1790 			ill = under_ill;
1791 			need_ill_refrele =  B_FALSE;
1792 		}
1793 		switch (err) {
1794 		case 0:
1795 			/* done with this entry */
1796 			nce_refrele(nnce);
1797 			break;
1798 		case EEXIST:
1799 			/*
1800 			 * B_FALSE indicates this is not an an advertisement.
1801 			 */
1802 			nce_process(nnce->nce_common, haddr, 0, B_FALSE);
1803 			nce_refrele(nnce);
1804 			break;
1805 		default:
1806 			ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
1807 			    err));
1808 			goto done;
1809 		}
1810 no_source:
1811 		flag |= NDP_SOLICITED;
1812 	} else {
1813 		/*
1814 		 * No source link layer address option should be present in a
1815 		 * valid DAD request.
1816 		 */
1817 		if (haddr != NULL) {
1818 			ip1dbg(("ndp_input_solicit: source link-layer address "
1819 			    "option present with an unspecified source.\n"));
1820 			bad_solicit = B_TRUE;
1821 			goto done;
1822 		}
1823 		if (our_ncec->ncec_state == ND_PROBE) {
1824 			/*
1825 			 * Internally looped-back probes will have
1826 			 * IRAF_L2SRC_LOOPBACK set so we can ignore our own
1827 			 * transmissions.
1828 			 */
1829 			if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
1830 				/*
1831 				 * If someone else is probing our address, then
1832 				 * we've crossed wires.  Declare failure.
1833 				 */
1834 				ndp_failure(mp, ira);
1835 			}
1836 			goto done;
1837 		}
1838 		/*
1839 		 * This is a DAD probe.  Multicast the advertisement to the
1840 		 * all-nodes address.
1841 		 */
1842 		src = ipv6_all_hosts_mcast;
1843 	}
1844 	flag |= nce_advert_flags(our_ncec);
1845 	(void) ndp_xmit(ill,
1846 	    ND_NEIGHBOR_ADVERT,
1847 	    our_ncec->ncec_lladdr,
1848 	    our_ncec->ncec_lladdr_length,
1849 	    &target,	/* Source and target of the advertisement pkt */
1850 	    &src,	/* IP Destination (source of original pkt) */
1851 	    flag);
1852 done:
1853 	if (bad_solicit)
1854 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
1855 	if (our_ncec != NULL)
1856 		ncec_refrele(our_ncec);
1857 }
1858 
1859 /*
1860  * Handle reception of Neighbor Solicitation messages
1861  */
1862 void
1863 ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
1864 {
1865 	ill_t		*ill = ira->ira_ill;
1866 	nd_neighbor_advert_t *na;
1867 	uint32_t	hlen = ill->ill_phys_addr_length;
1868 	uchar_t		*haddr = NULL;
1869 	icmp6_t		*icmp_nd;
1870 	ip6_t		*ip6h;
1871 	ncec_t		*dst_ncec = NULL;
1872 	in6_addr_t	target;
1873 	nd_opt_hdr_t	*opt = NULL;
1874 	int		len;
1875 	ip_stack_t	*ipst = ill->ill_ipst;
1876 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
1877 
1878 	ip6h = (ip6_t *)mp->b_rptr;
1879 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
1880 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
1881 	na = (nd_neighbor_advert_t *)icmp_nd;
1882 
1883 	if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
1884 	    (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
1885 		ip1dbg(("ndp_input_advert: Target is multicast but the "
1886 		    "solicited flag is not zero\n"));
1887 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1888 		return;
1889 	}
1890 	target = na->nd_na_target;
1891 	if (IN6_IS_ADDR_MULTICAST(&target) || IN6_IS_ADDR_V4MAPPED(&target) ||
1892 	    IN6_IS_ADDR_LOOPBACK(&target)) {
1893 		if (ip_debug > 2) {
1894 			/* ip1dbg */
1895 			pr_addr_dbg("ndp_input_solicit: Martian Target %s\n",
1896 			    AF_INET6, &target);
1897 		}
1898 		BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1899 		return;
1900 	}
1901 	if (len > sizeof (nd_neighbor_advert_t)) {
1902 		opt = (nd_opt_hdr_t *)&na[1];
1903 		if (!ndp_verify_optlen(opt,
1904 		    len - sizeof (nd_neighbor_advert_t))) {
1905 			ip1dbg(("ndp_input_advert: cannot verify SLLA\n"));
1906 			BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements);
1907 			return;
1908 		}
1909 		/* At this point we have a verified NA per spec */
1910 		len -= sizeof (nd_neighbor_advert_t);
1911 		opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR);
1912 		if (opt != NULL) {
1913 			haddr = (uchar_t *)&opt[1];
1914 			if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
1915 			    hlen == 0) {
1916 				ip1dbg(("ndp_input_advert: bad SLLA\n"));
1917 				BUMP_MIB(mib,
1918 				    ipv6IfIcmpInBadNeighborAdvertisements);
1919 				return;
1920 			}
1921 		}
1922 	}
1923 
1924 	/*
1925 	 * NOTE: we match across the illgrp since we need to do DAD for all of
1926 	 * our local addresses, and those are spread across all the active
1927 	 * ills in the group.
1928 	 */
1929 	if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
1930 		return;
1931 
1932 	if (NCE_PUBLISH(dst_ncec)) {
1933 		/*
1934 		 * Someone just advertised an addresses that we publish. First,
1935 		 * check it it was us -- if so, we can safely ignore it.
1936 		 * We don't get the haddr from the ira_l2src because, in the
1937 		 * case that the packet originated from us, on an IPMP group,
1938 		 * the ira_l2src may would be the link-layer address of the
1939 		 * cast_ill used to send the packet, which may not be the same
1940 		 * as the dst_ncec->ncec_lladdr of the address.
1941 		 */
1942 		if (haddr != NULL) {
1943 			if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
1944 				goto out;
1945 
1946 			if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
1947 				goto out;   /* from us -- no conflict */
1948 
1949 			/*
1950 			 * If we're in an IPMP group, check if this is an echo
1951 			 * from another ill in the group.  Use the double-
1952 			 * checked locking pattern to avoid grabbing
1953 			 * ill_g_lock in the non-IPMP case.
1954 			 */
1955 			if (IS_UNDER_IPMP(ill)) {
1956 				rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1957 				if (IS_UNDER_IPMP(ill) && ipmp_illgrp_find_ill(
1958 				    ill->ill_grp, haddr, hlen) != NULL) {
1959 					rw_exit(&ipst->ips_ill_g_lock);
1960 					goto out;
1961 				}
1962 				rw_exit(&ipst->ips_ill_g_lock);
1963 			}
1964 		}
1965 
1966 		/*
1967 		 * This appears to be a real conflict.  If we're trying to
1968 		 * configure this NCE (ND_PROBE), then shut it down.
1969 		 * Otherwise, handle the discovered conflict.
1970 		 */
1971 		if (dst_ncec->ncec_state == ND_PROBE) {
1972 			ndp_failure(mp, ira);
1973 		} else {
1974 			if (ip_nce_conflict(mp, ira, dst_ncec)) {
1975 				char hbuf[MAC_STR_LEN];
1976 				char sbuf[INET6_ADDRSTRLEN];
1977 
1978 				cmn_err(CE_WARN,
1979 				    "node '%s' is using %s on %s",
1980 				    inet_ntop(AF_INET6, &target, sbuf,
1981 				    sizeof (sbuf)),
1982 				    haddr == NULL ? "<none>" :
1983 				    mac_colon_addr(haddr, hlen, hbuf,
1984 				    sizeof (hbuf)), ill->ill_name);
1985 				/*
1986 				 * RFC 4862, Section 5.4.4 does not mandate
1987 				 * any specific behavior when an NA matches
1988 				 * a non-tentative address assigned to the
1989 				 * receiver. We make the choice of defending
1990 				 * our address, based on the assumption that
1991 				 * the sender has not detected the Duplicate.
1992 				 *
1993 				 * ncec_last_time_defended has been adjusted
1994 				 * in ip_nce_conflict()
1995 				 */
1996 				(void) ndp_announce(dst_ncec);
1997 			}
1998 		}
1999 	} else {
2000 		if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
2001 			dst_ncec->ncec_flags |= NCE_F_ISROUTER;
2002 
2003 		/* B_TRUE indicates this an advertisement */
2004 		nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
2005 	}
2006 out:
2007 	ncec_refrele(dst_ncec);
2008 }
2009 
2010 /*
2011  * Process NDP neighbor solicitation/advertisement messages.
2012  * The checksum has already checked o.k before reaching here.
2013  * Information about the datalink header is contained in ira_l2src, but
2014  * that should be ignored for loopback packets.
2015  */
2016 void
2017 ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
2018 {
2019 	ill_t		*ill = ira->ira_rill;
2020 	icmp6_t		*icmp_nd;
2021 	ip6_t		*ip6h;
2022 	int		len;
2023 	mib2_ipv6IfIcmpEntry_t	*mib = ill->ill_icmp6_mib;
2024 	ill_t		*orig_ill = NULL;
2025 
2026 	/*
2027 	 * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
2028 	 * and make it be the IPMP upper so avoid being confused by a packet
2029 	 * addressed to a unicast address on a different ill.
2030 	 */
2031 	if (IS_UNDER_IPMP(ill)) {
2032 		orig_ill = ill;
2033 		ill = ipmp_ill_hold_ipmp_ill(orig_ill);
2034 		if (ill == NULL) {
2035 			ill = orig_ill;
2036 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2037 			ip_drop_input("ipIfStatsInDiscards - IPMP ill",
2038 			    mp, ill);
2039 			freemsg(mp);
2040 			return;
2041 		}
2042 		ASSERT(ill != orig_ill);
2043 		orig_ill = ira->ira_ill;
2044 		ira->ira_ill = ill;
2045 		mib = ill->ill_icmp6_mib;
2046 	}
2047 	if (!pullupmsg(mp, -1)) {
2048 		ip1dbg(("ndp_input: pullupmsg failed\n"));
2049 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2050 		ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
2051 		goto done;
2052 	}
2053 	ip6h = (ip6_t *)mp->b_rptr;
2054 	if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
2055 		ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
2056 		ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
2057 		BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
2058 		goto done;
2059 	}
2060 	/*
2061 	 * NDP does not accept any extension headers between the
2062 	 * IP header and the ICMP header since e.g. a routing
2063 	 * header could be dangerous.
2064 	 * This assumes that any AH or ESP headers are removed
2065 	 * by ip prior to passing the packet to ndp_input.
2066 	 */
2067 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2068 		ip1dbg(("ndp_input: Wrong next header 0x%x\n",
2069 		    ip6h->ip6_nxt));
2070 		ip_drop_input("Wrong next header", mp, ill);
2071 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2072 		goto done;
2073 	}
2074 	icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
2075 	ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT ||
2076 	    icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
2077 	if (icmp_nd->icmp6_code != 0) {
2078 		ip1dbg(("ndp_input: icmp6 code != 0 \n"));
2079 		ip_drop_input("code non-zero", mp, ill);
2080 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2081 		goto done;
2082 	}
2083 	len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
2084 	/*
2085 	 * Make sure packet length is large enough for either
2086 	 * a NS or a NA icmp packet.
2087 	 */
2088 	if (len <  sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
2089 		ip1dbg(("ndp_input: packet too short\n"));
2090 		ip_drop_input("packet too short", mp, ill);
2091 		BUMP_MIB(mib, ipv6IfIcmpInErrors);
2092 		goto done;
2093 	}
2094 	if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
2095 		ndp_input_solicit(mp, ira);
2096 	} else {
2097 		ndp_input_advert(mp, ira);
2098 	}
2099 done:
2100 	freemsg(mp);
2101 	if (orig_ill != NULL) {
2102 		ill_refrele(ill);
2103 		ira->ira_ill = orig_ill;
2104 	}
2105 }
2106 
/*
 * ndp_xmit is called to form and transmit a ND solicitation or
 * advertisement ICMP packet.
 *
 * If the source address is unspecified and this isn't a probe (used for
 * duplicate address detection), an appropriate source address and link layer
 * address will be chosen here.  The link layer address option is included if
 * the source is specified (i.e., all non-probe packets), and omitted (per the
 * specification) otherwise.
 *
 * Arguments:
 *	ill		ill to send on; when it is under IPMP and `sender' is
 *			not a test address, the IPMP ill is used instead.
 *	operation	ND_NEIGHBOR_SOLICIT, otherwise treated as an
 *			advertisement.
 *	hw_addr,
 *	hw_addr_len	link-layer address for the SLLA/TLLA option; a zero
 *			hw_addr_len means no option space is addressed.
 *	sender		IPv6 source address (also the NA target).
 *	target		NS target / IPv6 destination.
 *	flag		NDP_PROBE, NDP_UNICAST, NDP_ISROUTER, NDP_SOLICITED,
 *			NDP_ORIDE.
 *
 * Returns B_TRUE only when the packet buffer could not be allocated.  In all
 * other cases the packet is handed to ip_output_simple() and B_FALSE is
 * returned; the result of ip_output_simple() itself is not examined.
 */
static boolean_t
ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
    const in6_addr_t *sender, const in6_addr_t *target, int flag)
{
	uint32_t	len;
	icmp6_t 	*icmp6;
	mblk_t		*mp;
	ip6_t		*ip6h;
	nd_opt_hdr_t	*opt;
	uint_t		plen;
	zoneid_t	zoneid = GLOBAL_ZONEID;
	ill_t		*hwaddr_ill = ill;
	ip_xmit_attr_t	ixas;
	ip_stack_t	*ipst = ill->ill_ipst;
	boolean_t	need_refrele = B_FALSE;
	boolean_t	probe = B_FALSE;

	if (IS_UNDER_IPMP(ill)) {
		probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
		/*
		 * We send non-probe packets on the upper IPMP interface.
		 * ip_output_simple() will use cast_ill for sending any
		 * multicast packets. Note that we can't follow the same
		 * logic for probe packets because all interfaces in the ipmp
		 * group may have failed, so that we really want to only try
		 * to send the ND packet on the ill corresponding to the src
		 * address.
		 */
		if (!probe) {
			ill = ipmp_ill_hold_ipmp_ill(ill);
			if (ill != NULL)
				need_refrele = B_TRUE;
			else
				ill = hwaddr_ill;	/* fall back to caller's ill */
		}
	}

	/*
	 * If we have a unspecified source(sender) address, select a
	 * proper source address for the solicitation here itself so
	 * that we can initialize the h/w address correctly.
	 *
	 * If the sender is specified then we use this address in order
	 * to lookup the zoneid before calling ip_output_v6(). This is to
	 * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
	 * by IP (we cannot guarantee that the global zone has an interface
	 * route to the destination).
	 *
	 * Note that the NA never comes here with the unspecified source
	 * address.
	 *
	 * NOTE(review): the code below transmits via ip_output_simple(); the
	 * ip_output_v6() reference above looks stale -- confirm.
	 */

	/*
	 * Probes will have unspec src at this point.
	 */
	if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
		zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
		/*
		 * It's possible for ipif_lookup_addr_zoneid_v6() to return
		 * ALL_ZONES if it cannot find a matching ipif for the address
		 * we are trying to use. In this case we err on the side of
		 * trying to send the packet by defaulting to the GLOBAL_ZONEID.
		 */
		if (zoneid == ALL_ZONES)
			zoneid = GLOBAL_ZONEID;
	}

	/*
	 * plen is the option length in units of 8 bytes (header plus address,
	 * rounded up).  The buffer is always sized for an
	 * nd_neighbor_advert_t followed by the option, whichever operation is
	 * being sent; the option space is trimmed again below if unused.
	 */
	plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
	len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
	mp = allocb(len,  BPRI_LO);
	if (mp == NULL) {
		if (need_refrele)
			ill_refrele(ill);
		return (B_TRUE);
	}

	/* Start from an all-zero packet; later checks rely on zeroed fields. */
	bzero((char *)mp->b_rptr, len);
	mp->b_wptr = mp->b_rptr + len;

	bzero(&ixas, sizeof (ixas));
	ixas.ixa_flags = IXAF_SET_ULP_CKSUM | IXAF_NO_HW_CKSUM;

	ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
	ixas.ixa_ipst = ipst;
	ixas.ixa_cred = kcred;
	ixas.ixa_cpid = NOPID;
	ixas.ixa_tsl = NULL;
	ixas.ixa_zoneid = zoneid;

	ip6h = (ip6_t *)mp->b_rptr;
	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
	ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
	ip6h->ip6_nxt = IPPROTO_ICMPV6;
	ip6h->ip6_hops = IPV6_MAX_HOPS;
	ixas.ixa_multicast_ttl = ip6h->ip6_hops;
	ip6h->ip6_dst = *target;
	icmp6 = (icmp6_t *)&ip6h[1];

	/* opt addresses the option area only when there is an address. */
	if (hw_addr_len != 0) {
		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
		    sizeof (nd_neighbor_advert_t));
	} else {
		opt = NULL;
	}
	if (operation == ND_NEIGHBOR_SOLICIT) {
		nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;

		if (opt != NULL && !(flag & NDP_PROBE)) {
			/*
			 * Note that we don't send out SLLA for ND probes
			 * per RFC 4862, even though we do send out the src
			 * haddr for IPv4 DAD probes, even though both IPv4
			 * and IPv6 go out with the unspecified/INADDR_ANY
			 * src IP addr.
			 */
			opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
		}
		ip6h->ip6_src = *sender;
		ns->nd_ns_target = *target;
		if (!(flag & NDP_UNICAST)) {
			/* Form multicast address of the target */
			ip6h->ip6_dst = ipv6_solicited_node_mcast;
			ip6h->ip6_dst.s6_addr32[3] |=
			    ns->nd_ns_target.s6_addr32[3];
		}
	} else {
		nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;

		ASSERT(!(flag & NDP_PROBE));
		if (opt != NULL)
			opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
		ip6h->ip6_src = *sender;
		/* An advertisement advertises the sender's own address. */
		na->nd_na_target = *sender;
		if (flag & NDP_ISROUTER)
			na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
		if (flag & NDP_SOLICITED)
			na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED;
		if (flag & NDP_ORIDE)
			na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
	}

	if (!(flag & NDP_PROBE)) {
		if (hw_addr != NULL && opt != NULL) {
			/* Fill in link layer address and option len */
			opt->nd_opt_len = (uint8_t)plen;
			bcopy(hw_addr, &opt[1], hw_addr_len);
		}
	}
	/*
	 * nd_opt_type is still zero (from the bzero above) when no option
	 * type was assigned, e.g. for a probe.
	 */
	if (opt != NULL && opt->nd_opt_type == 0) {
		/* If there's no link layer address option, then strip it. */
		len -= plen * 8;
		mp->b_wptr = mp->b_rptr + len;
		ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
	}

	icmp6->icmp6_type = (uint8_t)operation;
	icmp6->icmp6_code = 0;
	/*
	 * Prepare for checksum by putting icmp length in the icmp
	 * checksum field. The checksum is calculated in ip_output.c.
	 */
	icmp6->icmp6_cksum = ip6h->ip6_plen;

	(void) ip_output_simple(mp, &ixas);
	ixa_cleanup(&ixas);
	if (need_refrele)
		ill_refrele(ill);
	return (B_FALSE);
}
2289 
/*
 * Used to set ND_UNREACHABLE before ncec_delete sets it NCE_F_CONDEMNED.
 * The datapath uses this as an indication that there
 * is a problem (as opposed to a NCE that was just
 * reclaimed due to lack of memory).
 * Note that static ARP entries never become unreachable.
 *
 * The state change is made under ncec_lock, which this function acquires
 * and releases itself.
 */
void
nce_make_unreachable(ncec_t *ncec)
{
	mutex_enter(&ncec->ncec_lock);
	ncec->ncec_state = ND_UNREACHABLE;
	mutex_exit(&ncec->ncec_lock);
}
2304 
/*
 * NCE retransmit timer. Common to IPv4 and IPv6.
 * This timer goes off when:
 * a. It is time to retransmit a resolution for resolver.
 * b. It is time to send reachability probes.
 *
 * `arg' is the ncec_t the timeout was armed for.  The handler takes its own
 * reference on the ncec for the duration of the call, resolves a source
 * ill/address with nce_resolve_src(), and then acts on ncec_state:
 *
 *	ND_DELAY	move to ND_PROBE and send the first unicast probe.
 *	ND_PROBE	retransmit, give up and delete, or (for published
 *			entries) finish DAD and start announcing/defending.
 *	ND_INCOMPLETE	toss queued IPMP probe packets and retry resolution.
 *	ND_REACHABLE	send unsolicited announcements / defend the address.
 *
 * Every switch arm is entered with ncec_lock held and is responsible for
 * dropping it; note that the point at which it is dropped differs between
 * the IPv4 and IPv6 transmit paths.
 */
void
nce_timer(void *arg)
{
	ncec_t		*ncec = arg;
	ill_t		*ill = ncec->ncec_ill, *src_ill;
	char		addrbuf[INET6_ADDRSTRLEN];
	boolean_t	dropped = B_FALSE;
	ip_stack_t	*ipst = ncec->ncec_ipst;
	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
	in_addr_t	sender4 = INADDR_ANY;
	in6_addr_t	sender6 = ipv6_all_zeros;

	/*
	 * The timer has to be cancelled by ncec_delete before doing the final
	 * refrele. So the NCE is guaranteed to exist when the timer runs
	 * until it clears the timeout_id. Before clearing the timeout_id
	 * bump up the refcnt so that we can continue to use the ncec
	 */
	ASSERT(ncec != NULL);
	mutex_enter(&ncec->ncec_lock);
	ncec_refhold_locked(ncec);
	ncec->ncec_timeout_id = 0;
	mutex_exit(&ncec->ncec_lock);

	src_ill = nce_resolve_src(ncec, &sender6);
	/* if we could not find a sender address, return */
	if (src_ill == NULL) {
		if (!isv6) {
			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
			    &sender4, addrbuf, sizeof (addrbuf))));
		} else {
			ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
		}
		/* Try again after one retransmit interval. */
		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
		ncec_refrele(ncec);
		return;
	}
	/* For IPv4 the resolved source is carried as a v4-mapped address. */
	if (!isv6)
		IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);

	mutex_enter(&ncec->ncec_lock);
	/*
	 * Check the reachability state.
	 */
	switch (ncec->ncec_state) {
	case ND_DELAY:
		ASSERT(ncec->ncec_lladdr != NULL);
		ncec->ncec_state = ND_PROBE;
		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
		/*
		 * The IPv6 path drops ncec_lock before transmitting; the
		 * IPv4 path calls arp_request() with the lock still held.
		 */
		if (isv6) {
			mutex_exit(&ncec->ncec_lock);
			dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
			    src_ill->ill_phys_addr,
			    src_ill->ill_phys_addr_length,
			    &sender6, &ncec->ncec_addr,
			    NDP_UNICAST);
		} else {
			dropped = (arp_request(ncec, sender4, src_ill) == 0);
			mutex_exit(&ncec->ncec_lock);
		}
		/* Only count the probe if it was actually sent. */
		if (!dropped) {
			mutex_enter(&ncec->ncec_lock);
			ncec->ncec_pcnt--;
			mutex_exit(&ncec->ncec_lock);
		}
		if (ip_debug > 3) {
			/* ip2dbg */
			pr_addr_dbg("nce_timer: state for %s changed "
			    "to PROBE\n", AF_INET6, &ncec->ncec_addr);
		}
		nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
		break;
	case ND_PROBE:
		/* must be retransmit timer */
		ASSERT(ncec->ncec_pcnt >= -1);
		if (ncec->ncec_pcnt > 0) {
			/*
			 * As per RFC2461, the ncec gets deleted after
			 * MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
			 * Note that the first unicast solicitation is sent
			 * during the DELAY state.
			 */
			ip2dbg(("nce_timer: pcount=%x dst %s\n",
			    ncec->ncec_pcnt,
			    inet_ntop((isv6? AF_INET6 : AF_INET),
			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
			if (NCE_PUBLISH(ncec)) {
				mutex_exit(&ncec->ncec_lock);
				/*
				 * send out a probe; note that src_ill
				 * is ignored by nce_dad() for all
				 * DAD message types other than IPv6
				 * unicast probes
				 */
				nce_dad(ncec, src_ill, B_TRUE);
			} else {
				ASSERT(src_ill != NULL);
				if (isv6) {
					mutex_exit(&ncec->ncec_lock);
					dropped = ndp_xmit(src_ill,
					    ND_NEIGHBOR_SOLICIT,
					    src_ill->ill_phys_addr,
					    src_ill->ill_phys_addr_length,
					    &sender6, &ncec->ncec_addr,
					    NDP_UNICAST);
				} else {
					/*
					 * since the nce is REACHABLE,
					 * the ARP request will be sent out
					 * as a link-layer unicast.
					 */
					dropped = (arp_request(ncec, sender4,
					    src_ill) == 0);
					mutex_exit(&ncec->ncec_lock);
				}
				if (!dropped) {
					mutex_enter(&ncec->ncec_lock);
					ncec->ncec_pcnt--;
					mutex_exit(&ncec->ncec_lock);
				}
				nce_restart_timer(ncec,
				    ill->ill_reachable_retrans_time);
			}
		} else if (ncec->ncec_pcnt < 0) {
			/* No hope, delete the ncec */
			/* Tell datapath it went bad */
			ncec->ncec_state = ND_UNREACHABLE;
			mutex_exit(&ncec->ncec_lock);
			if (ip_debug > 2) {
				/* ip1dbg */
				pr_addr_dbg("nce_timer: Delete NCE for"
				    " dst %s\n", (isv6? AF_INET6: AF_INET),
				    &ncec->ncec_addr);
			}
			/* if static ARP can't delete. */
			if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
				ncec_delete(ncec);

		} else if (!NCE_PUBLISH(ncec)) {
			/*
			 * Probe count is 0 for a dynamic entry (one that we
			 * ourselves are not publishing). We should never get
			 * here if NONUD was requested, hence the ASSERT below.
			 */
			ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
			ip2dbg(("nce_timer: pcount=%x dst %s\n",
			    ncec->ncec_pcnt, inet_ntop(AF_INET6,
			    &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
			/* Goes to -1, so the next expiry takes the delete arm. */
			ncec->ncec_pcnt--;
			mutex_exit(&ncec->ncec_lock);
			/* Wait one interval before killing */
			nce_restart_timer(ncec,
			    ill->ill_reachable_retrans_time);
		} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
			ipif_t *ipif;
			ipaddr_t ncec_addr;

			/*
			 * We're done probing, and we can now declare this
			 * address to be usable.  Let IP know that it's ok to
			 * use.
			 */
			ncec->ncec_state = ND_REACHABLE;
			ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
			mutex_exit(&ncec->ncec_lock);
			if (isv6) {
				ipif = ipif_lookup_addr_exact_v6(
				    &ncec->ncec_addr, ill, ipst);
			} else {
				IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
				    ncec_addr);
				ipif = ipif_lookup_addr_exact(ncec_addr, ill,
				    ipst);
			}
			if (ipif != NULL) {
				if (ipif->ipif_was_dup) {
					char ibuf[LIFNAMSIZ];
					char sbuf[INET6_ADDRSTRLEN];

					ipif->ipif_was_dup = B_FALSE;
					(void) inet_ntop(AF_INET6,
					    &ipif->ipif_v6lcl_addr,
					    sbuf, sizeof (sbuf));
					ipif_get_name(ipif, ibuf,
					    sizeof (ibuf));
					cmn_err(CE_NOTE, "recovered address "
					    "%s on %s", sbuf, ibuf);
				}
				if ((ipif->ipif_flags & IPIF_UP) &&
				    !ipif->ipif_addr_ready)
					ipif_up_notify(ipif);
				ipif->ipif_addr_ready = 1;
				ipif_refrele(ipif);
			}
			if (!isv6 && arp_no_defense)
				break;
			/*
			 * Begin defending our new address
			 *
			 * NOTE(review): ncec_unsolicit_count and
			 * ncec_last_time_defended are updated here after
			 * ncec_lock was dropped above, whereas the
			 * ND_REACHABLE arm re-takes the lock for the same
			 * updates -- confirm whether this is intentional.
			 */
			if (ncec->ncec_unsolicit_count > 0) {
				ncec->ncec_unsolicit_count--;
				if (isv6) {
					dropped = ndp_announce(ncec);
				} else {
					dropped = arp_announce(ncec);
				}

				if (dropped)
					ncec->ncec_unsolicit_count++;
				else
					ncec->ncec_last_time_defended =
					    ddi_get_lbolt();
			}
			if (ncec->ncec_unsolicit_count > 0) {
				nce_restart_timer(ncec,
				    ANNOUNCE_INTERVAL(isv6));
			} else if (DEFENSE_INTERVAL(isv6) != 0) {
				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
			}
		} else {
			/*
			 * This is an address we're probing to be our own, but
			 * the ill is down.  Wait until it comes back before
			 * doing anything, but switch to reachable state so
			 * that the restart will work.
			 */
			ncec->ncec_state = ND_REACHABLE;
			mutex_exit(&ncec->ncec_lock);
		}
		break;
	case ND_INCOMPLETE: {
		mblk_t	*mp, *nextmp;
		mblk_t	**prevmpp;

		/*
		 * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
		 * for any IPMP probe packets, and toss them.  IPMP probe
		 * packets will always be at the head of ncec_qd_mp, so that
		 * we can stop at the first queued ND packet that is
		 * not a probe packet.
		 */
		prevmpp = &ncec->ncec_qd_mp;
		for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
			nextmp = mp->b_next;

			if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
				/* Unlink and free a queued probe packet. */
				inet_freemsg(mp);
				ncec->ncec_nprobes--;
				*prevmpp = nextmp;
			} else {
				prevmpp = &mp->b_next;
			}
		}

		/*
		 * Must be resolver's retransmit timer.
		 */
		mutex_exit(&ncec->ncec_lock);
		ip_ndp_resolve(ncec);
		break;
	}
	case ND_REACHABLE:
		if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
		    ncec->ncec_unsolicit_count != 0) ||
		    (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
			if (ncec->ncec_unsolicit_count > 0) {
				ncec->ncec_unsolicit_count--;
				mutex_exit(&ncec->ncec_lock);
				/*
				 * When we get to zero announcements left,
				 * switch to address defense
				 */
			} else {
				boolean_t rate_limit;

				mutex_exit(&ncec->ncec_lock);
				rate_limit = ill_defend_rate_limit(ill, ncec);
				if (rate_limit) {
					/* Skip this defense; retry later. */
					nce_restart_timer(ncec,
					    DEFENSE_INTERVAL(isv6));
					break;
				}
			}
			if (isv6) {
				dropped = ndp_announce(ncec);
			} else {
				dropped = arp_announce(ncec);
			}
			mutex_enter(&ncec->ncec_lock);
			if (dropped) {
				/* Not sent: give the announcement back. */
				ncec->ncec_unsolicit_count++;
			} else {
				ncec->ncec_last_time_defended =
				    ddi_get_lbolt();
			}
			mutex_exit(&ncec->ncec_lock);
			if (ncec->ncec_unsolicit_count != 0) {
				nce_restart_timer(ncec,
				    ANNOUNCE_INTERVAL(isv6));
			} else {
				nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
			}
		} else {
			mutex_exit(&ncec->ncec_lock);
		}
		break;
	default:
		mutex_exit(&ncec->ncec_lock);
		break;
	}
	/* NOTE(review): no goto targets this label within this function. */
done:
	ncec_refrele(ncec);
	ill_refrele(src_ill);
}
2625 
2626 /*
2627  * Set a link layer address from the ll_addr passed in.
2628  * Copy SAP from ill.
2629  */
2630 static void
2631 nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
2632 {
2633 	ill_t	*ill = ncec->ncec_ill;
2634 
2635 	ASSERT(ll_addr != NULL);
2636 	if (ill->ill_phys_addr_length > 0) {
2637 		/*
2638 		 * The bcopy() below used to be called for the physical address
2639 		 * length rather than the link layer address length. For
2640 		 * ethernet and many other media, the phys_addr and lla are
2641 		 * identical.
2642 		 *
2643 		 * The phys_addr and lla may not be the same for devices that
2644 		 * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
2645 		 * no known instances of these.
2646 		 *
2647 		 * For PPP or other interfaces with a zero length
2648 		 * physical address, don't do anything here.
2649 		 * The bcopy() with a zero phys_addr length was previously
2650 		 * a no-op for interfaces with a zero-length physical address.
2651 		 * Using the lla for them would change the way they operate.
2652 		 * Doing nothing in such cases preserves expected behavior.
2653 		 */
2654 		bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
2655 	}
2656 }
2657 
2658 boolean_t
2659 nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
2660     uint32_t ll_addr_len)
2661 {
2662 	ASSERT(ncec->ncec_lladdr != NULL);
2663 	if (ll_addr == NULL)
2664 		return (B_FALSE);
2665 	if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
2666 		return (B_TRUE);
2667 	return (B_FALSE);
2668 }
2669 
2670 /*
2671  * Updates the link layer address or the reachability state of
2672  * a cache entry.  Reset probe counter if needed.
2673  */
2674 void
2675 nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
2676 {
2677 	ill_t	*ill = ncec->ncec_ill;
2678 	boolean_t need_stop_timer = B_FALSE;
2679 	boolean_t need_fastpath_update = B_FALSE;
2680 	nce_t	*nce = NULL;
2681 	timeout_id_t tid;
2682 
2683 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2684 	/*
2685 	 * If this interface does not do NUD, there is no point
2686 	 * in allowing an update to the cache entry.  Although
2687 	 * we will respond to NS.
2688 	 * The only time we accept an update for a resolver when
2689 	 * NUD is turned off is when it has just been created.
2690 	 * Non-Resolvers will always be created as REACHABLE.
2691 	 */
2692 	if (new_state != ND_UNCHANGED) {
2693 		if ((ncec->ncec_flags & NCE_F_NONUD) &&
2694 		    (ncec->ncec_state != ND_INCOMPLETE))
2695 			return;
2696 		ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
2697 		ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
2698 		need_stop_timer = B_TRUE;
2699 		if (new_state == ND_REACHABLE)
2700 			ncec->ncec_last = TICK_TO_MSEC(ddi_get_lbolt64());
2701 		else {
2702 			/* We force NUD in this case */
2703 			ncec->ncec_last = 0;
2704 		}
2705 		ncec->ncec_state = new_state;
2706 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
2707 		ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
2708 		    new_state == ND_INCOMPLETE);
2709 	}
2710 	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2711 		tid = ncec->ncec_timeout_id;
2712 		ncec->ncec_timeout_id = 0;
2713 	}
2714 	/*
2715 	 * Re-trigger fastpath probe and
2716 	 * overwrite the DL_UNITDATA_REQ data, noting we'll lose
2717 	 * whatever packets that happens to be transmitting at the time.
2718 	 */
2719 	if (new_ll_addr != NULL) {
2720 		bcopy(new_ll_addr, ncec->ncec_lladdr,
2721 		    ill->ill_phys_addr_length);
2722 		need_fastpath_update = B_TRUE;
2723 	}
2724 	mutex_exit(&ncec->ncec_lock);
2725 	if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
2726 		if (tid != 0)
2727 			(void) untimeout(tid);
2728 	}
2729 	if (need_fastpath_update) {
2730 		/*
2731 		 * Delete any existing existing dlur_mp and fp_mp information.
2732 		 * For IPMP interfaces, all underlying ill's must be checked
2733 		 * and purged.
2734 		 */
2735 		nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
2736 		/*
2737 		 * add the new dlur_mp and fp_mp
2738 		 */
2739 		nce = nce_fastpath(ncec, B_TRUE, NULL);
2740 		if (nce != NULL)
2741 			nce_refrele(nce);
2742 	}
2743 	mutex_enter(&ncec->ncec_lock);
2744 }
2745 
2746 static void
2747 nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
2748 {
2749 	uint_t	count = 0;
2750 	mblk_t  **mpp, *tmp;
2751 
2752 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
2753 
2754 	for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
2755 		if (++count > ncec->ncec_ill->ill_max_buf) {
2756 			tmp = ncec->ncec_qd_mp->b_next;
2757 			ncec->ncec_qd_mp->b_next = NULL;
2758 			/*
2759 			 * if we never create data addrs on the under_ill
2760 			 * does this matter?
2761 			 */
2762 			BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
2763 			    ipIfStatsOutDiscards);
2764 			ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
2765 			    ncec->ncec_ill);
2766 			freemsg(ncec->ncec_qd_mp);
2767 			ncec->ncec_qd_mp = tmp;
2768 		}
2769 	}
2770 
2771 	if (head_insert) {
2772 		ncec->ncec_nprobes++;
2773 		mp->b_next = ncec->ncec_qd_mp;
2774 		ncec->ncec_qd_mp = mp;
2775 	} else {
2776 		*mpp = mp;
2777 	}
2778 }
2779 
/*
 * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
 * queued at the head or tail of the queue based on the input argument
 * 'head_insert'. The caller should specify this argument as B_TRUE if this
 * packet is an IPMP probe packet, in which case the following happens:
 *
 *   1. Insert it at the head of the ncec_qd_mp list.  Consider the normal
 *	(non-ipmp_probe) load-spreading case where the source address of the
 *	ND packet is not tied to ncec_ill. If the ill bound to the source
 *	address cannot receive, the response to the ND packet will not be
 *	received.  However, if ND packets for ncec_ill's probes are queued
 *	behind that ND packet, those probes will also fail to be sent, and
 *	thus in.mpathd will erroneously conclude that ncec_ill has also
 *	failed.
 *
 *   2. Drop the ipmp_probe packet in nce_timer() if the ND did not succeed
 *	on the first attempt.  This ensures that ND problems do not manifest
 *	as probe RTT spikes.
 *
 * We achieve this by inserting ipmp_probe() packets at the head of the
 * nce_queue.
 *
 * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
 * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
 *
 * The caller must hold ncec_lock.
 */
void
nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
	nce_queue_mp_common(ncec, mp, head_insert);
}
2810 
2811 /*
2812  * Called when address resolution failed due to a timeout.
2813  * Send an ICMP unreachable in response to all queued packets.
2814  */
2815 void
2816 ndp_resolv_failed(ncec_t *ncec)
2817 {
2818 	mblk_t	*mp, *nxt_mp;
2819 	char	buf[INET6_ADDRSTRLEN];
2820 	ill_t *ill = ncec->ncec_ill;
2821 	ip_recv_attr_t	iras;
2822 
2823 	bzero(&iras, sizeof (iras));
2824 	iras.ira_flags = 0;
2825 	/*
2826 	 * we are setting the ira_rill to the ipmp_ill (instead of
2827 	 * the actual ill on which the packet was received), but this
2828 	 * is ok because we don't actually need the real ira_rill.
2829 	 * to send the icmp unreachable to the sender.
2830 	 */
2831 	iras.ira_ill = iras.ira_rill = ill;
2832 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2833 	iras.ira_rifindex = iras.ira_ruifindex;
2834 
2835 	ip1dbg(("ndp_resolv_failed: dst %s\n",
2836 	    inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
2837 	mutex_enter(&ncec->ncec_lock);
2838 	mp = ncec->ncec_qd_mp;
2839 	ncec->ncec_qd_mp = NULL;
2840 	ncec->ncec_nprobes = 0;
2841 	mutex_exit(&ncec->ncec_lock);
2842 	while (mp != NULL) {
2843 		nxt_mp = mp->b_next;
2844 		mp->b_next = NULL;
2845 
2846 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2847 		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
2848 		    mp, ill);
2849 		icmp_unreachable_v6(mp,
2850 		    ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
2851 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2852 		mp = nxt_mp;
2853 	}
2854 	ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
2855 }
2856 
2857 /*
2858  * Handle the completion of NDP and ARP resolution.
2859  */
2860 void
2861 nce_resolv_ok(ncec_t *ncec)
2862 {
2863 	mblk_t *mp;
2864 	uint_t pkt_len;
2865 	iaflags_t ixaflags = IXAF_NO_TRACE;
2866 	nce_t *nce;
2867 	ill_t	*ill = ncec->ncec_ill;
2868 	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
2869 	ip_stack_t *ipst = ill->ill_ipst;
2870 
2871 	if (IS_IPMP(ncec->ncec_ill)) {
2872 		nce_resolv_ipmp_ok(ncec);
2873 		return;
2874 	}
2875 	/* non IPMP case */
2876 
2877 	mutex_enter(&ncec->ncec_lock);
2878 	ASSERT(ncec->ncec_nprobes == 0);
2879 	mp = ncec->ncec_qd_mp;
2880 	ncec->ncec_qd_mp = NULL;
2881 	mutex_exit(&ncec->ncec_lock);
2882 
2883 	while (mp != NULL) {
2884 		mblk_t *nxt_mp;
2885 
2886 		if (ill->ill_isv6) {
2887 			ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2888 
2889 			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
2890 		} else {
2891 			ipha_t *ipha = (ipha_t *)mp->b_rptr;
2892 
2893 			ixaflags |= IXAF_IS_IPV4;
2894 			pkt_len = ntohs(ipha->ipha_length);
2895 		}
2896 		nxt_mp = mp->b_next;
2897 		mp->b_next = NULL;
2898 		/*
2899 		 * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
2900 		 * longer available, but it's ok to drop this flag because TCP
2901 		 * has its own flow-control in effect, so TCP packets
2902 		 * are not likely to get here when flow-control is in effect.
2903 		 */
2904 		mutex_enter(&ill->ill_lock);
2905 		nce = nce_lookup(ill, &ncec->ncec_addr);
2906 		mutex_exit(&ill->ill_lock);
2907 
2908 		if (nce == NULL) {
2909 			if (isv6) {
2910 				BUMP_MIB(&ipst->ips_ip6_mib,
2911 				    ipIfStatsOutDiscards);
2912 			} else {
2913 				BUMP_MIB(&ipst->ips_ip_mib,
2914 				    ipIfStatsOutDiscards);
2915 			}
2916 			ip_drop_output("ipIfStatsOutDiscards - no nce",
2917 			    mp, NULL);
2918 			freemsg(mp);
2919 		} else {
2920 			/*
2921 			 * We don't know the zoneid, but
2922 			 * ip_xmit does not care since IXAF_NO_TRACE
2923 			 * is set. (We traced the packet the first
2924 			 * time through ip_xmit.)
2925 			 */
2926 			(void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
2927 			    ALL_ZONES, 0, NULL);
2928 			nce_refrele(nce);
2929 		}
2930 		mp = nxt_mp;
2931 	}
2932 
2933 	ncec_cb_dispatch(ncec); /* complete callbacks */
2934 }
2935 
2936 /*
2937  * Called by SIOCSNDP* ioctl to add/change an ncec entry
2938  * and the corresponding attributes.
2939  * Disallow states other than ND_REACHABLE or ND_STALE.
2940  */
2941 int
2942 ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
2943 {
2944 	sin6_t		*sin6;
2945 	in6_addr_t	*addr;
2946 	ncec_t		*ncec;
2947 	nce_t		*nce;
2948 	int		err = 0;
2949 	uint16_t	new_flags = 0;
2950 	uint16_t	old_flags = 0;
2951 	int		inflags = lnr->lnr_flags;
2952 	ip_stack_t	*ipst = ill->ill_ipst;
2953 	boolean_t	do_postprocess = B_FALSE;
2954 
2955 	ASSERT(ill->ill_isv6);
2956 	if ((lnr->lnr_state_create != ND_REACHABLE) &&
2957 	    (lnr->lnr_state_create != ND_STALE))
2958 		return (EINVAL);
2959 
2960 	sin6 = (sin6_t *)&lnr->lnr_addr;
2961 	addr = &sin6->sin6_addr;
2962 
2963 	mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
2964 	ASSERT(!IS_UNDER_IPMP(ill));
2965 	nce = nce_lookup_addr(ill, addr);
2966 	if (nce != NULL)
2967 		new_flags = nce->nce_common->ncec_flags;
2968 
2969 	switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
2970 	case NDF_ISROUTER_ON:
2971 		new_flags |= NCE_F_ISROUTER;
2972 		break;
2973 	case NDF_ISROUTER_OFF:
2974 		new_flags &= ~NCE_F_ISROUTER;
2975 		break;
2976 	case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
2977 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2978 		if (nce != NULL)
2979 			nce_refrele(nce);
2980 		return (EINVAL);
2981 	}
2982 	if (inflags & NDF_STATIC)
2983 		new_flags |= NCE_F_STATIC;
2984 
2985 	switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) {
2986 	case NDF_ANYCAST_ON:
2987 		new_flags |= NCE_F_ANYCAST;
2988 		break;
2989 	case NDF_ANYCAST_OFF:
2990 		new_flags &= ~NCE_F_ANYCAST;
2991 		break;
2992 	case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
2993 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
2994 		if (nce != NULL)
2995 			nce_refrele(nce);
2996 		return (EINVAL);
2997 	}
2998 
2999 	if (nce == NULL) {
3000 		err = nce_add_v6(ill,
3001 		    (uchar_t *)lnr->lnr_hdw_addr,
3002 		    ill->ill_phys_addr_length,
3003 		    addr,
3004 		    new_flags,
3005 		    lnr->lnr_state_create,
3006 		    &nce);
3007 		if (err != 0) {
3008 			mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3009 			ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
3010 			return (err);
3011 		} else {
3012 			do_postprocess = B_TRUE;
3013 		}
3014 	}
3015 	ncec = nce->nce_common;
3016 	old_flags = ncec->ncec_flags;
3017 	if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
3018 		ncec_router_to_host(ncec);
3019 		mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3020 		if (do_postprocess)
3021 			err = nce_add_v6_postprocess(nce);
3022 		nce_refrele(nce);
3023 		return (0);
3024 	}
3025 	mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
3026 
3027 	if (do_postprocess)
3028 		err = nce_add_v6_postprocess(nce);
3029 	/*
3030 	 * err cannot be anything other than 0 because we don't support
3031 	 * proxy arp of static addresses.
3032 	 */
3033 	ASSERT(err == 0);
3034 
3035 	mutex_enter(&ncec->ncec_lock);
3036 	ncec->ncec_flags = new_flags;
3037 	mutex_exit(&ncec->ncec_lock);
3038 	/*
3039 	 * Note that we ignore the state at this point, which
3040 	 * should be either STALE or REACHABLE.  Instead we let
3041 	 * the link layer address passed in to determine the state
3042 	 * much like incoming packets.
3043 	 */
3044 	nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
3045 	nce_refrele(nce);
3046 	return (0);
3047 }
3048 
3049 /*
3050  * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
3051  * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
3052  * be held to ensure that they are in the same group.
3053  */
3054 static nce_t *
3055 nce_fastpath_create(ill_t *ill, ncec_t *ncec)
3056 {
3057 
3058 	nce_t *nce;
3059 
3060 	nce = nce_ill_lookup_then_add(ill, ncec);
3061 
3062 	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3063 		return (nce);
3064 
3065 	/*
3066 	 * hold the ncec_lock to synchronize with nce_update() so that,
3067 	 * at the end of this function, the contents of nce_dlur_mp are
3068 	 * consistent with ncec->ncec_lladdr, even though some intermediate
3069 	 * packet may have been sent out with a mangled address, which would
3070 	 * only be a transient condition.
3071 	 */
3072 	mutex_enter(&ncec->ncec_lock);
3073 	if (ncec->ncec_lladdr != NULL) {
3074 		bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
3075 		    NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
3076 	} else {
3077 		nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
3078 		    ill->ill_sap_length);
3079 	}
3080 	mutex_exit(&ncec->ncec_lock);
3081 	return (nce);
3082 }
3083 
3084 /*
3085  * we make nce_fp_mp to have an M_DATA prepend.
3086  * The caller ensures there is hold on ncec for this function.
3087  * Note that since ill_fastpath_probe() copies the mblk there is
3088  * no need to hold the nce or ncec beyond this function.
3089  *
3090  * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
3091  * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3092  * and will be returned back by this function, so that no extra nce_refrele
3093  * is required for the caller. The calls from nce_add_common() use this
3094  * method. All other callers (that pass in NULL ncec_nce) will have to do a
3095  * nce_refrele of the returned nce (when it is non-null).
3096  */
3097 nce_t *
3098 nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3099 {
3100 	nce_t *nce;
3101 	ill_t *ill = ncec->ncec_ill;
3102 
3103 	ASSERT(ill != NULL);
3104 
3105 	if (IS_IPMP(ill) && trigger_fp_req) {
3106 		trigger_fp_req = B_FALSE;
3107 		ipmp_ncec_refresh_nce(ncec);
3108 	}
3109 
3110 	/*
3111 	 * If the caller already has the nce corresponding to the ill, use
3112 	 * that one. Otherwise we have to lookup/add the nce. Calls from
3113 	 * nce_add_common() fall in the former category, and have just done
3114 	 * the nce lookup/add that can be reused.
3115 	 */
3116 	if (ncec_nce == NULL)
3117 		nce = nce_fastpath_create(ill, ncec);
3118 	else
3119 		nce = ncec_nce;
3120 
3121 	if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
3122 		return (nce);
3123 
3124 	if (trigger_fp_req)
3125 		nce_fastpath_trigger(nce);
3126 	return (nce);
3127 }
3128 
3129 /*
3130  * Trigger fastpath on nce. No locks may be held.
3131  */
3132 static void
3133 nce_fastpath_trigger(nce_t *nce)
3134 {
3135 	int res;
3136 	ill_t *ill = nce->nce_ill;
3137 	ncec_t *ncec = nce->nce_common;
3138 
3139 	res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
3140 	/*
3141 	 * EAGAIN is an indication of a transient error
3142 	 * i.e. allocation failure etc. leave the ncec in the list it
3143 	 * will be updated when another probe happens for another ire
3144 	 * if not it will be taken out of the list when the ire is
3145 	 * deleted.
3146 	 */
3147 	if (res != 0 && res != EAGAIN && res != ENOTSUP)
3148 		nce_fastpath_list_delete(ill, ncec, NULL);
3149 }
3150 
3151 /*
3152  * Add ncec to the nce fastpath list on ill.
3153  */
3154 static nce_t *
3155 nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
3156 {
3157 	nce_t *nce = NULL;
3158 
3159 	ASSERT(MUTEX_HELD(&ill->ill_lock));
3160 	/*
3161 	 * Atomically ensure that the ill is not CONDEMNED and is not going
3162 	 * down, before adding the NCE.
3163 	 */
3164 	if (ill->ill_state_flags & ILL_CONDEMNED)
3165 		return (NULL);
3166 	mutex_enter(&ncec->ncec_lock);
3167 	/*
3168 	 * if ncec has not been deleted and
3169 	 * is not already in the list add it.
3170 	 */
3171 	if (!NCE_ISCONDEMNED(ncec)) {
3172 		nce = nce_lookup(ill, &ncec->ncec_addr);
3173 		if (nce != NULL)
3174 			goto done;
3175 		nce = nce_add(ill, ncec);
3176 	}
3177 done:
3178 	mutex_exit(&ncec->ncec_lock);
3179 	return (nce);
3180 }
3181 
3182 nce_t *
3183 nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3184 {
3185 	nce_t *nce;
3186 
3187 	mutex_enter(&ill->ill_lock);
3188 	nce = nce_ill_lookup_then_add_locked(ill, ncec);
3189 	mutex_exit(&ill->ill_lock);
3190 	return (nce);
3191 }
3192 
3193 
3194 /*
3195  * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3196  * nce is added to the 'dead' list, and the caller must nce_refrele() the
3197  * entry after all locks have been dropped.
3198  */
3199 void
3200 nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
3201 {
3202 	nce_t *nce;
3203 
3204 	ASSERT(ill != NULL);
3205 
3206 	/* delete any nces referencing the ncec from underlying ills */
3207 	if (IS_IPMP(ill))
3208 		ipmp_ncec_delete_nce(ncec);
3209 
3210 	/* now the ill itself */
3211 	mutex_enter(&ill->ill_lock);
3212 	for (nce = list_head(&ill->ill_nce); nce != NULL;
3213 	    nce = list_next(&ill->ill_nce, nce)) {
3214 		if (nce->nce_common == ncec) {
3215 			nce_refhold(nce);
3216 			nce_delete(nce);
3217 			break;
3218 		}
3219 	}
3220 	mutex_exit(&ill->ill_lock);
3221 	if (nce != NULL) {
3222 		if (dead == NULL)
3223 			nce_refrele(nce);
3224 		else
3225 			list_insert_tail(dead, nce);
3226 	}
3227 }
3228 
3229 /*
3230  * when the fastpath response does not fit in the datab
3231  * associated with the existing nce_fp_mp, we delete and
3232  * add the nce to retrigger fastpath based on the information
3233  * in the ncec_t.
3234  */
3235 static nce_t *
3236 nce_delete_then_add(nce_t *nce)
3237 {
3238 	ill_t		*ill = nce->nce_ill;
3239 	nce_t		*newnce = NULL;
3240 
3241 	ip0dbg(("nce_delete_then_add nce %p ill %s\n",
3242 	    (void *)nce, ill->ill_name));
3243 	mutex_enter(&ill->ill_lock);
3244 	mutex_enter(&nce->nce_common->ncec_lock);
3245 	nce_delete(nce);
3246 	/*
3247 	 * Make sure that ncec is not condemned before adding. We hold the
3248 	 * ill_lock and ncec_lock to synchronize with ncec_delete() and
3249 	 * ipmp_ncec_delete_nce()
3250 	 */
3251 	if (!NCE_ISCONDEMNED(nce->nce_common))
3252 		newnce = nce_add(ill, nce->nce_common);
3253 	mutex_exit(&nce->nce_common->ncec_lock);
3254 	mutex_exit(&ill->ill_lock);
3255 	nce_refrele(nce);
3256 	return (newnce); /* could be null if nomem */
3257 }
3258 
3259 typedef struct nce_fp_match_s {
3260 	nce_t	*nce_fp_match_res;
3261 	mblk_t	*nce_fp_match_ack_mp;
3262 } nce_fp_match_t;
3263 
3264 /* ARGSUSED */
3265 static int
3266 nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
3267 {
3268 	nce_fp_match_t	*nce_fp_marg = arg;
3269 	ncec_t		*ncec = nce->nce_common;
3270 	mblk_t		*mp = nce_fp_marg->nce_fp_match_ack_mp;
3271 	uchar_t	*mp_rptr, *ud_mp_rptr;
3272 	mblk_t		*ud_mp = nce->nce_dlur_mp;
3273 	ptrdiff_t	cmplen;
3274 
3275 	/*
3276 	 * mp is the mp associated with the fastpath ack.
3277 	 * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
3278 	 * under consideration. If the contents match, then the
3279 	 * fastpath ack is used to update the nce.
3280 	 */
3281 	if (ud_mp == NULL)
3282 		return (0);
3283 	mp_rptr = mp->b_rptr;
3284 	cmplen = mp->b_wptr - mp_rptr;
3285 	ASSERT(cmplen >= 0);
3286 
3287 	ud_mp_rptr = ud_mp->b_rptr;
3288 	/*
3289 	 * The ncec is locked here to prevent any other threads from accessing
3290 	 * and changing nce_dlur_mp when the address becomes resolved to an
3291 	 * lla while we're in the middle of looking at and comparing the
3292 	 * hardware address (lla). It is also locked to prevent multiple
3293 	 * threads in nce_fastpath() from examining nce_dlur_mp at the same
3294 	 * time.
3295 	 */
3296 	mutex_enter(&ncec->ncec_lock);
3297 	if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
3298 	    bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
3299 		nce_fp_marg->nce_fp_match_res = nce;
3300 		mutex_exit(&ncec->ncec_lock);
3301 		nce_refhold(nce);
3302 		return (1);
3303 	}
3304 	mutex_exit(&ncec->ncec_lock);
3305 	return (0);
3306 }
3307 
3308 /*
3309  * Update all NCE's that are not in fastpath mode and
3310  * have an nce_fp_mp that matches mp. mp->b_cont contains
3311  * the fastpath header.
3312  *
3313  * Returns TRUE if entry should be dequeued, or FALSE otherwise.
3314  */
3315 void
3316 nce_fastpath_update(ill_t *ill,  mblk_t *mp)
3317 {
3318 	nce_fp_match_t nce_fp_marg;
3319 	nce_t *nce;
3320 	mblk_t *nce_fp_mp, *fp_mp;
3321 
3322 	nce_fp_marg.nce_fp_match_res = NULL;
3323 	nce_fp_marg.nce_fp_match_ack_mp = mp;
3324 
3325 	nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);
3326 
3327 	if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
3328 		return;
3329 
3330 	mutex_enter(&nce->nce_lock);
3331 	nce_fp_mp = nce->nce_fp_mp;
3332 
3333 	if (nce_fp_mp != NULL) {
3334 		fp_mp = mp->b_cont;
3335 		if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
3336 		    nce_fp_mp->b_datap->db_lim) {
3337 			mutex_exit(&nce->nce_lock);
3338 			nce = nce_delete_then_add(nce);
3339 			if (nce == NULL) {
3340 				return;
3341 			}
3342 			mutex_enter(&nce->nce_lock);
3343 			nce_fp_mp = nce->nce_fp_mp;
3344 		}
3345 	}
3346 
3347 	/* Matched - install mp as the fastpath mp */
3348 	if (nce_fp_mp == NULL) {
3349 		fp_mp = dupb(mp->b_cont);
3350 		nce->nce_fp_mp = fp_mp;
3351 	} else {
3352 		fp_mp = mp->b_cont;
3353 		bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
3354 		nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
3355 		    + MBLKL(fp_mp);
3356 	}
3357 	mutex_exit(&nce->nce_lock);
3358 	nce_refrele(nce);
3359 }
3360 
3361 /*
3362  * Return a pointer to a given option in the packet.
3363  * Assumes that option part of the packet have already been validated.
3364  */
3365 nd_opt_hdr_t *
3366 ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type)
3367 {
3368 	while (optlen > 0) {
3369 		if (opt->nd_opt_type == opt_type)
3370 			return (opt);
3371 		optlen -= 8 * opt->nd_opt_len;
3372 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3373 	}
3374 	return (NULL);
3375 }
3376 
3377 /*
3378  * Verify all option lengths present are > 0, also check to see
3379  * if the option lengths and packet length are consistent.
3380  */
3381 boolean_t
3382 ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
3383 {
3384 	ASSERT(opt != NULL);
3385 	while (optlen > 0) {
3386 		if (opt->nd_opt_len == 0)
3387 			return (B_FALSE);
3388 		optlen -= 8 * opt->nd_opt_len;
3389 		if (optlen < 0)
3390 			return (B_FALSE);
3391 		opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len);
3392 	}
3393 	return (B_TRUE);
3394 }
3395 
3396 /*
3397  * ncec_walk function.
3398  * Free a fraction of the NCE cache entries.
3399  *
3400  * A possible optimization here would be to use ncec_last where possible, and
3401  * delete the least-frequently used entry, which would require more complex
3402  * computation as we walk through the ncec's (e.g., track ncec entries by
3403  * order of ncec_last and/or maintain state)
3404  */
3405 static void
3406 ncec_cache_reclaim(ncec_t *ncec, char *arg)
3407 {
3408 	ip_stack_t	*ipst = ncec->ncec_ipst;
3409 	uint_t		fraction = *(uint_t *)arg;
3410 	uint_t		rand;
3411 
3412 	if ((ncec->ncec_flags &
3413 	    (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
3414 		return;
3415 	}
3416 
3417 	rand = (uint_t)ddi_get_lbolt() +
3418 	    NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
3419 	if ((rand/fraction)*fraction == rand) {
3420 		IP_STAT(ipst, ip_nce_reclaim_deleted);
3421 		ncec_delete(ncec);
3422 	}
3423 }
3424 
3425 /*
3426  * kmem_cache callback to free up memory.
3427  *
3428  * For now we just delete a fixed fraction.
3429  */
3430 static void
3431 ip_nce_reclaim_stack(ip_stack_t *ipst)
3432 {
3433 	uint_t		fraction = ipst->ips_ip_nce_reclaim_fraction;
3434 
3435 	IP_STAT(ipst, ip_nce_reclaim_calls);
3436 
3437 	ncec_walk(NULL, (pfi_t)ncec_cache_reclaim, (uchar_t *)&fraction, ipst);
3438 
3439 	/*
3440 	 * Walk all CONNs that can have a reference on an ire, ncec or dce.
3441 	 * Get them to update any stale references to drop any refholds they
3442 	 * have.
3443 	 */
3444 	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
3445 }
3446 
3447 /*
3448  * Called by the memory allocator subsystem directly, when the system
3449  * is running low on memory.
3450  */
3451 /* ARGSUSED */
3452 void
3453 ip_nce_reclaim(void *args)
3454 {
3455 	netstack_handle_t nh;
3456 	netstack_t *ns;
3457 	ip_stack_t *ipst;
3458 
3459 	netstack_next_init(&nh);
3460 	while ((ns = netstack_next(&nh)) != NULL) {
3461 		/*
3462 		 * netstack_next() can return a netstack_t with a NULL
3463 		 * netstack_ip at boot time.
3464 		 */
3465 		if ((ipst = ns->netstack_ip) == NULL) {
3466 			netstack_rele(ns);
3467 			continue;
3468 		}
3469 		ip_nce_reclaim_stack(ipst);
3470 		netstack_rele(ns);
3471 	}
3472 	netstack_next_fini(&nh);
3473 }
3474 
#ifdef DEBUG
/*
 * Record a reference on ncec in the thread-trace facility. The caller must
 * hold ncec_lock. If th_trace_ref() fails, tracing is disabled for this
 * ncec from then on and the trace state is cleaned up.
 */
void
ncec_trace_ref(ncec_t *ncec)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	if (!ncec->ncec_trace_disable &&
	    !th_trace_ref(ncec, ncec->ncec_ipst)) {
		ncec->ncec_trace_disable = B_TRUE;
		ncec_trace_cleanup(ncec);
	}
}

/*
 * Drop a traced reference on ncec, unless tracing has been disabled for
 * it. The caller must hold ncec_lock.
 */
void
ncec_untrace_ref(ncec_t *ncec)
{
	ASSERT(MUTEX_HELD(&ncec->ncec_lock));

	if (ncec->ncec_trace_disable)
		return;
	th_trace_unref(ncec);
}

/*
 * Hand the ncec's trace state to th_trace_cleanup().
 */
static void
ncec_trace_cleanup(const ncec_t *ncec)
{
	th_trace_cleanup(ncec, ncec->ncec_trace_disable);
}
#endif
3505 
3506 /*
3507  * Called when address resolution fails due to a timeout.
3508  * Send an ICMP unreachable in response to all queued packets.
3509  */
3510 void
3511 arp_resolv_failed(ncec_t *ncec)
3512 {
3513 	mblk_t	*mp, *nxt_mp;
3514 	char	buf[INET6_ADDRSTRLEN];
3515 	struct in_addr ipv4addr;
3516 	ill_t *ill = ncec->ncec_ill;
3517 	ip_stack_t *ipst = ncec->ncec_ipst;
3518 	ip_recv_attr_t	iras;
3519 
3520 	bzero(&iras, sizeof (iras));
3521 	iras.ira_flags = IRAF_IS_IPV4;
3522 	/*
3523 	 * we are setting the ira_rill to the ipmp_ill (instead of
3524 	 * the actual ill on which the packet was received), but this
3525 	 * is ok because we don't actually need the real ira_rill.
3526 	 * to send the icmp unreachable to the sender.
3527 	 */
3528 	iras.ira_ill = iras.ira_rill = ill;
3529 	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3530 	iras.ira_rifindex = iras.ira_ruifindex;
3531 
3532 	IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
3533 	ip3dbg(("arp_resolv_failed: dst %s\n",
3534 	    inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
3535 	mutex_enter(&ncec->ncec_lock);
3536 	mp = ncec->ncec_qd_mp;
3537 	ncec->ncec_qd_mp = NULL;
3538 	ncec->ncec_nprobes = 0;
3539 	mutex_exit(&ncec->ncec_lock);
3540 	while (mp != NULL) {
3541 		nxt_mp = mp->b_next;
3542 		mp->b_next = NULL;
3543 
3544 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3545 		ip_drop_output("ipIfStatsOutDiscards - address unreachable",
3546 		    mp, ill);
3547 		if (ipst->ips_ip_arp_icmp_error) {
3548 			ip3dbg(("arp_resolv_failed: "
3549 			    "Calling icmp_unreachable\n"));
3550 			icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
3551 		} else {
3552 			freemsg(mp);
3553 		}
3554 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3555 		mp = nxt_mp;
3556 	}
3557 	ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
3558 }
3559 
3560 /*
3561  * if ill is an under_ill, translate it to the ipmp_ill and add the
3562  * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
3563  * one on the underlying in_ill) will be created for the
3564  * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
3565  */
3566 int
3567 nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3568     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3569 {
3570 	int	err;
3571 	in6_addr_t addr6;
3572 	ip_stack_t *ipst = ill->ill_ipst;
3573 	nce_t	*nce, *upper_nce = NULL;
3574 	ill_t	*in_ill = ill, *under = NULL;
3575 	boolean_t need_ill_refrele = B_FALSE;
3576 
3577 	if (flags & NCE_F_MCAST) {
3578 		/*
3579 		 * hw_addr will be figured out in nce_set_multicast_v4;
3580 		 * caller needs to pass in the cast_ill for ipmp
3581 		 */
3582 		ASSERT(hw_addr == NULL);
3583 		ASSERT(!IS_IPMP(ill));
3584 		err = nce_set_multicast_v4(ill, addr, flags, newnce);
3585 		return (err);
3586 	}
3587 
3588 	if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
3589 		ill = ipmp_ill_hold_ipmp_ill(ill);
3590 		if (ill == NULL)
3591 			return (ENXIO);
3592 		need_ill_refrele = B_TRUE;
3593 	}
3594 	if ((flags & NCE_F_BCAST) != 0) {
3595 		/*
3596 		 * IPv4 broadcast ncec: compute the hwaddr.
3597 		 */
3598 		if (IS_IPMP(ill)) {
3599 			under = ipmp_ill_hold_xmit_ill(ill, B_FALSE);
3600 			if (under == NULL)  {
3601 				if (need_ill_refrele)
3602 					ill_refrele(ill);
3603 				return (ENETDOWN);
3604 			}
3605 			hw_addr = under->ill_bcast_mp->b_rptr +
3606 			    NCE_LL_ADDR_OFFSET(under);
3607 			hw_addr_len = under->ill_phys_addr_length;
3608 		} else {
3609 			hw_addr = ill->ill_bcast_mp->b_rptr +
3610 			    NCE_LL_ADDR_OFFSET(ill),
3611 			    hw_addr_len = ill->ill_phys_addr_length;
3612 		}
3613 	}
3614 
3615 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3616 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3617 	nce = nce_lookup_addr(ill, &addr6);
3618 	if (nce == NULL) {
3619 		err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
3620 		    state, &nce);
3621 	} else {
3622 		err = EEXIST;
3623 	}
3624 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3625 	if (err == 0)
3626 		err = nce_add_v4_postprocess(nce);
3627 
3628 	if (in_ill != ill && nce != NULL) {
3629 		nce_t *under_nce = NULL;
3630 
3631 		/*
3632 		 * in_ill was the under_ill. Try to create the under_nce.
3633 		 * Hold the ill_g_lock to prevent changes to group membership
3634 		 * until we are done.
3635 		 */
3636 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3637 		if (!IS_IN_SAME_ILLGRP(in_ill, ill)) {
3638 			DTRACE_PROBE2(ill__not__in__group, nce_t *, nce,
3639 			    ill_t *, ill);
3640 			rw_exit(&ipst->ips_ill_g_lock);
3641 			err = ENXIO;
3642 			nce_refrele(nce);
3643 			nce = NULL;
3644 			goto bail;
3645 		}
3646 		under_nce = nce_fastpath_create(in_ill, nce->nce_common);
3647 		if (under_nce == NULL) {
3648 			rw_exit(&ipst->ips_ill_g_lock);
3649 			err = EINVAL;
3650 			nce_refrele(nce);
3651 			nce = NULL;
3652 			goto bail;
3653 		}
3654 		rw_exit(&ipst->ips_ill_g_lock);
3655 		upper_nce = nce;
3656 		nce = under_nce; /* will be returned to caller */
3657 		if (NCE_ISREACHABLE(nce->nce_common))
3658 			nce_fastpath_trigger(under_nce);
3659 	}
3660 	if (nce != NULL) {
3661 		if (newnce != NULL)
3662 			*newnce = nce;
3663 		else
3664 			nce_refrele(nce);
3665 	}
3666 bail:
3667 	if (under != NULL)
3668 		ill_refrele(under);
3669 	if (upper_nce != NULL)
3670 		nce_refrele(upper_nce);
3671 	if (need_ill_refrele)
3672 		ill_refrele(ill);
3673 
3674 	return (err);
3675 }
3676 
3677 /*
3678  * NDP Cache Entry creation routine for IPv4.
3679  * This routine must always be called with ndp4->ndp_g_lock held.
3680  * Prior to return, ncec_refcnt is incremented.
3681  *
3682  * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
3683  * are always added pointing at the ipmp_ill. Thus, when the ill passed
3684  * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
3685  * entries will be created, both pointing at the same ncec_t. The nce_t
3686  * entries will have their nce_ill set to the ipmp_ill and the under_ill
3687  * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
3688  * Local addresses are always created on the ill passed to nce_add_v4.
3689  */
3690 int
3691 nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
3692     const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
3693 {
3694 	int		err;
3695 	boolean_t	is_multicast = (flags & NCE_F_MCAST);
3696 	struct in6_addr	addr6;
3697 	nce_t		*nce;
3698 
3699 	ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
3700 	ASSERT(!ill->ill_isv6);
3701 	ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);
3702 
3703 	IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
3704 	err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
3705 	    &nce);
3706 	ASSERT(newnce != NULL);
3707 	*newnce = nce;
3708 	return (err);
3709 }
3710 
3711 /*
3712  * Post-processing routine to be executed after nce_add_v4(). This function
3713  * triggers fastpath (if appropriate) and DAD on the newly added nce entry
3714  * and must be called without any locks held.
3715  *
3716  * Always returns 0, but we return an int to keep this symmetric with the
3717  * IPv6 counter-part.
3718  */
3719 int
3720 nce_add_v4_postprocess(nce_t *nce)
3721 {
3722 	ncec_t		*ncec = nce->nce_common;
3723 	uint16_t	flags = ncec->ncec_flags;
3724 	boolean_t	ndp_need_dad = B_FALSE;
3725 	boolean_t	dropped;
3726 	clock_t		delay;
3727 	ip_stack_t	*ipst = ncec->ncec_ill->ill_ipst;
3728 	uchar_t		*hw_addr = ncec->ncec_lladdr;
3729 	boolean_t	trigger_fastpath = B_TRUE;
3730 
3731 	/*
3732 	 * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
3733 	 * we call nce_fastpath as soon as the ncec is resolved in nce_process.
3734 	 * We call nce_fastpath from nce_update if the link layer address of
3735 	 * the peer changes from nce_update
3736 	 */
3737 	if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
3738 	    ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
3739 		trigger_fastpath = B_FALSE;
3740 
3741 	if (trigger_fastpath)
3742 		nce_fastpath_trigger(nce);
3743 
3744 	if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
3745 		/*
3746 		 * Either the caller (by passing in ND_PROBE)
3747 		 * or nce_add_common() (by the internally computed state
3748 		 * based on ncec_addr and ill_net_type) has determined
3749 		 * that this unicast entry needs DAD. Trigger DAD.
3750 		 */
3751 		ndp_need_dad = B_TRUE;
3752 	} else if (flags & NCE_F_UNSOL_ADV) {
3753 		/*
3754 		 * We account for the transmit below by assigning one
3755 		 * less than the ndd variable. Subsequent decrements
3756 		 * are done in nce_timer.
3757 		 */
3758 		mutex_enter(&ncec->ncec_lock);
3759 		ncec->ncec_unsolicit_count =
3760 		    ipst->ips_ip_arp_publish_count - 1;
3761 		mutex_exit(&ncec->ncec_lock);
3762 		dropped = arp_announce(ncec);
3763 		mutex_enter(&ncec->ncec_lock);
3764 		if (dropped)
3765 			ncec->ncec_unsolicit_count++;
3766 		else
3767 			ncec->ncec_last_time_defended = ddi_get_lbolt();
3768 		if (ncec->ncec_unsolicit_count != 0) {
3769 			nce_start_timer(ncec,
3770 			    ipst->ips_ip_arp_publish_interval);
3771 		}
3772 		mutex_exit(&ncec->ncec_lock);
3773 	}
3774 
3775 	/*
3776 	 * If ncec_xmit_interval is 0, user has configured us to send the first
3777 	 * probe right away.  Do so, and set up for the subsequent probes.
3778 	 */
3779 	if (ndp_need_dad) {
3780 		mutex_enter(&ncec->ncec_lock);
3781 		if (ncec->ncec_pcnt == 0) {
3782 			/*
3783 			 * DAD probes and announce can be
3784 			 * administratively disabled by setting the
3785 			 * probe_count to zero. Restart the timer in
3786 			 * this case to mark the ipif as ready.
3787 			 */
3788 			ncec->ncec_unsolicit_count = 0;
3789 			mutex_exit(&ncec->ncec_lock);
3790 			nce_restart_timer(ncec, 0);
3791 		} else {
3792 			mutex_exit(&ncec->ncec_lock);
3793 			delay = ((ncec->ncec_flags & NCE_F_FAST) ?
3794 			    ipst->ips_arp_probe_delay :
3795 			    ipst->ips_arp_fastprobe_delay);
3796 			nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
3797 		}
3798 	}
3799 	return (0);
3800 }
3801 
3802 /*
3803  * ncec_walk routine to update all entries that have a given destination or
3804  * gateway address and cached link layer (MAC) address.  This is used when ARP
3805  * informs us that a network-to-link-layer mapping may have changed.
3806  */
3807 void
3808 nce_update_hw_changed(ncec_t *ncec, void *arg)
3809 {
3810 	nce_hw_map_t *hwm = arg;
3811 	ipaddr_t ncec_addr;
3812 
3813 	if (ncec->ncec_state != ND_REACHABLE)
3814 		return;
3815 
3816 	IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
3817 	if (ncec_addr != hwm->hwm_addr)
3818 		return;
3819 
3820 	mutex_enter(&ncec->ncec_lock);
3821 	if (hwm->hwm_flags != 0)
3822 		ncec->ncec_flags = hwm->hwm_flags;
3823 	nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
3824 	mutex_exit(&ncec->ncec_lock);
3825 }
3826 
3827 void
3828 ncec_refhold(ncec_t *ncec)
3829 {
3830 	mutex_enter(&(ncec)->ncec_lock);
3831 	(ncec)->ncec_refcnt++;
3832 	ASSERT((ncec)->ncec_refcnt != 0);
3833 #ifdef DEBUG
3834 	ncec_trace_ref(ncec);
3835 #endif
3836 	mutex_exit(&(ncec)->ncec_lock);
3837 }
3838 
3839 void
3840 ncec_refhold_notr(ncec_t *ncec)
3841 {
3842 	mutex_enter(&(ncec)->ncec_lock);
3843 	(ncec)->ncec_refcnt++;
3844 	ASSERT((ncec)->ncec_refcnt != 0);
3845 	mutex_exit(&(ncec)->ncec_lock);
3846 }
3847 
3848 static void
3849 ncec_refhold_locked(ncec_t *ncec)
3850 {
3851 	ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
3852 	(ncec)->ncec_refcnt++;
3853 #ifdef DEBUG
3854 	ncec_trace_ref(ncec);
3855 #endif
3856 }
3857 
3858 /* ncec_inactive destroys the mutex thus no mutex_exit is needed */
3859 void
3860 ncec_refrele(ncec_t *ncec)
3861 {
3862 	mutex_enter(&(ncec)->ncec_lock);
3863 #ifdef DEBUG
3864 	ncec_untrace_ref(ncec);
3865 #endif
3866 	ASSERT((ncec)->ncec_refcnt != 0);
3867 	if (--(ncec)->ncec_refcnt == 0) {
3868 		ncec_inactive(ncec);
3869 	} else {
3870 		mutex_exit(&(ncec)->ncec_lock);
3871 	}
3872 }
3873 
3874 void
3875 ncec_refrele_notr(ncec_t *ncec)
3876 {
3877 	mutex_enter(&(ncec)->ncec_lock);
3878 	ASSERT((ncec)->ncec_refcnt != 0);
3879 	if (--(ncec)->ncec_refcnt == 0) {
3880 		ncec_inactive(ncec);
3881 	} else {
3882 		mutex_exit(&(ncec)->ncec_lock);
3883 	}
3884 }
3885 
3886 /*
3887  * Common to IPv4 and IPv6.
3888  */
3889 void
3890 nce_restart_timer(ncec_t *ncec, uint_t ms)
3891 {
3892 	timeout_id_t tid;
3893 
3894 	ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));
3895 
3896 	/* First cancel any running timer */
3897 	mutex_enter(&ncec->ncec_lock);
3898 	tid = ncec->ncec_timeout_id;
3899 	ncec->ncec_timeout_id = 0;
3900 	if (tid != 0) {
3901 		mutex_exit(&ncec->ncec_lock);
3902 		(void) untimeout(tid);
3903 		mutex_enter(&ncec->ncec_lock);
3904 	}
3905 
3906 	/* Restart timer */
3907 	nce_start_timer(ncec, ms);
3908 	mutex_exit(&ncec->ncec_lock);
3909 }
3910 
3911 static void
3912 nce_start_timer(ncec_t *ncec, uint_t ms)
3913 {
3914 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
3915 	/*
3916 	 * Don't start the timer if the ncec has been deleted, or if the timer
3917 	 * is already running
3918 	 */
3919 	if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
3920 		ncec->ncec_timeout_id = timeout(nce_timer, ncec,
3921 		    MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
3922 	}
3923 }
3924 
3925 int
3926 nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
3927     uint16_t flags, nce_t **newnce)
3928 {
3929 	uchar_t		*hw_addr;
3930 	int		err = 0;
3931 	ip_stack_t	*ipst = ill->ill_ipst;
3932 	in6_addr_t	dst6;
3933 	nce_t		*nce;
3934 
3935 	ASSERT(!ill->ill_isv6);
3936 
3937 	IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
3938 	mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
3939 	if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
3940 		mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3941 		goto done;
3942 	}
3943 	if (ill->ill_net_type == IRE_IF_RESOLVER) {
3944 		/*
3945 		 * For IRE_IF_RESOLVER a hardware mapping can be
3946 		 * generated, for IRE_IF_NORESOLVER, resolution cookie
3947 		 * in the ill is copied in nce_add_v4().
3948 		 */
3949 		hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
3950 		if (hw_addr == NULL) {
3951 			mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3952 			return (ENOMEM);
3953 		}
3954 		ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
3955 	} else {
3956 		/*
3957 		 * IRE_IF_NORESOLVER type simply copies the resolution
3958 		 * cookie passed in.  So no hw_addr is needed.
3959 		 */
3960 		hw_addr = NULL;
3961 	}
3962 	ASSERT(flags & NCE_F_MCAST);
3963 	ASSERT(flags & NCE_F_NONUD);
3964 	/* nce_state will be computed by nce_add_common() */
3965 	err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
3966 	    ND_UNCHANGED, &nce);
3967 	mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3968 	if (err == 0)
3969 		err = nce_add_v4_postprocess(nce);
3970 	if (hw_addr != NULL)
3971 		kmem_free(hw_addr, ill->ill_phys_addr_length);
3972 	if (err != 0) {
3973 		ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
3974 		return (err);
3975 	}
3976 done:
3977 	if (newnce != NULL)
3978 		*newnce = nce;
3979 	else
3980 		nce_refrele(nce);
3981 	return (0);
3982 }
3983 
/*
 * This is used when scanning for "old" (least recently broadcast) NCEs.  We
 * don't want to have to walk the list for every single one, so we gather up
 * batches at a time.
 */
/* Capacity of one batch, i.e. the size of ncert_nces[]. */
#define	NCE_RESCHED_LIST_LEN	8

/*
 * Batch state handed to ncec_reschedule() as its walker argument.
 */
typedef struct {
	/* ill being scanned; ncec_reschedule() asserts each ncec is on it */
	ill_t	*ncert_ill;
	/* number of slots of ncert_nces[] currently filled */
	uint_t	ncert_num;
	/* the gathered ncecs; each filled slot carries an ncec_refhold() */
	ncec_t	*ncert_nces[NCE_RESCHED_LIST_LEN];
} nce_resched_t;
3996 
3997 /*
3998  * Pick the longest waiting NCEs for defense.
3999  */
4000 /* ARGSUSED */
4001 static int
4002 ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
4003 {
4004 	nce_resched_t *ncert = arg;
4005 	ncec_t **ncecs;
4006 	ncec_t **ncec_max;
4007 	ncec_t *ncec_temp;
4008 	ncec_t *ncec = nce->nce_common;
4009 
4010 	ASSERT(ncec->ncec_ill == ncert->ncert_ill);
4011 	/*
4012 	 * Only reachable entries that are ready for announcement are eligible.
4013 	 */
4014 	if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
4015 		return (0);
4016 	if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
4017 		ncec_refhold(ncec);
4018 		ncert->ncert_nces[ncert->ncert_num++] = ncec;
4019 	} else {
4020 		ncecs = ncert->ncert_nces;
4021 		ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
4022 		ncec_refhold(ncec);
4023 		for (; ncecs < ncec_max; ncecs++) {
4024 			ASSERT(ncec != NULL);
4025 			if ((*ncecs)->ncec_last_time_defended >
4026 			    ncec->ncec_last_time_defended) {
4027 				ncec_temp = *ncecs;
4028 				*ncecs = ncec;
4029 				ncec = ncec_temp;
4030 			}
4031 		}
4032 		ncec_refrele(ncec);
4033 	}
4034 	return (0);
4035 }
4036 
4037 /*
4038  * Reschedule the ARP defense of any long-waiting NCEs.  It's assumed that this
4039  * doesn't happen very often (if at all), and thus it needn't be highly
4040  * optimized.  (Note, though, that it's actually O(N) complexity, because the
4041  * outer loop is bounded by a constant rather than by the length of the list.)
4042  */
4043 static void
4044 nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
4045 {
4046 	ncec_t		*ncec;
4047 	ip_stack_t	*ipst = ill->ill_ipst;
4048 	uint_t		i, defend_rate;
4049 
4050 	i = ill->ill_defend_count;
4051 	ill->ill_defend_count = 0;
4052 	if (ill->ill_isv6)
4053 		defend_rate = ipst->ips_ndp_defend_rate;
4054 	else
4055 		defend_rate = ipst->ips_arp_defend_rate;
4056 	/* If none could be sitting around, then don't reschedule */
4057 	if (i < defend_rate) {
4058 		DTRACE_PROBE1(reschedule_none, ill_t *, ill);
4059 		return;
4060 	}
4061 	ncert->ncert_ill = ill;
4062 	while (ill->ill_defend_count < defend_rate) {
4063 		nce_walk_common(ill, ncec_reschedule, ncert);
4064 		for (i = 0; i < ncert->ncert_num; i++) {
4065 
4066 			ncec = ncert->ncert_nces[i];
4067 			mutex_enter(&ncec->ncec_lock);
4068 			ncec->ncec_flags |= NCE_F_DELAYED;
4069 			mutex_exit(&ncec->ncec_lock);
4070 			/*
4071 			 * we plan to schedule this ncec, so incr the
4072 			 * defend_count in anticipation.
4073 			 */
4074 			if (++ill->ill_defend_count >= defend_rate)
4075 				break;
4076 		}
4077 		if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
4078 			break;
4079 	}
4080 }
4081 
4082 /*
4083  * Check if the current rate-limiting parameters permit the sending
4084  * of another address defense announcement for both IPv4 and IPv6.
4085  * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
4086  * permitted), and B_FALSE otherwise. The `defend_rate' parameter
4087  * determines how many address defense announcements are permitted
4088  * in any `defense_perio' interval.
4089  */
4090 static boolean_t
4091 ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
4092 {
4093 	clock_t		now = ddi_get_lbolt();
4094 	ip_stack_t	*ipst = ill->ill_ipst;
4095 	clock_t		start = ill->ill_defend_start;
4096 	uint32_t	elapsed, defend_period, defend_rate;
4097 	nce_resched_t	ncert;
4098 	boolean_t	ret;
4099 	int		i;
4100 
4101 	if (ill->ill_isv6) {
4102 		defend_period = ipst->ips_ndp_defend_period;
4103 		defend_rate = ipst->ips_ndp_defend_rate;
4104 	} else {
4105 		defend_period = ipst->ips_arp_defend_period;
4106 		defend_rate = ipst->ips_arp_defend_rate;
4107 	}
4108 	if (defend_rate == 0)
4109 		return (B_TRUE);
4110 	bzero(&ncert, sizeof (ncert));
4111 	mutex_enter(&ill->ill_lock);
4112 	if (start > 0) {
4113 		elapsed = now - start;
4114 		if (elapsed > SEC_TO_TICK(defend_period)) {
4115 			ill->ill_defend_start = now;
4116 			/*
4117 			 * nce_ill_reschedule will attempt to
4118 			 * prevent starvation by reschduling the
4119 			 * oldest entries, which are marked with
4120 			 * the NCE_F_DELAYED flag.
4121 			 */
4122 			nce_ill_reschedule(ill, &ncert);
4123 		}
4124 	} else {
4125 		ill->ill_defend_start = now;
4126 	}
4127 	ASSERT(ill->ill_defend_count <= defend_rate);
4128 	mutex_enter(&ncec->ncec_lock);
4129 	if (ncec->ncec_flags & NCE_F_DELAYED) {
4130 		/*
4131 		 * This ncec was rescheduled as one of the really old
4132 		 * entries needing on-going defense. The
4133 		 * ill_defend_count was already incremented in
4134 		 * nce_ill_reschedule. Go ahead and send the announce.
4135 		 */
4136 		ncec->ncec_flags &= ~NCE_F_DELAYED;
4137 		mutex_exit(&ncec->ncec_lock);
4138 		ret = B_FALSE;
4139 		goto done;
4140 	}
4141 	mutex_exit(&ncec->ncec_lock);
4142 	if (ill->ill_defend_count < defend_rate)
4143 		ill->ill_defend_count++;
4144 	if (ill->ill_defend_count == defend_rate) {
4145 		/*
4146 		 * we are no longer allowed to send unbidden defense
4147 		 * messages. Wait for rescheduling.
4148 		 */
4149 		ret = B_TRUE;
4150 	} else {
4151 		ret = B_FALSE;
4152 	}
4153 done:
4154 	mutex_exit(&ill->ill_lock);
4155 	/*
4156 	 * After all the locks have been dropped we can restart nce timer,
4157 	 * and refrele the delayed ncecs
4158 	 */
4159 	for (i = 0; i < ncert.ncert_num; i++) {
4160 		clock_t	xmit_interval;
4161 		ncec_t	*tmp;
4162 
4163 		tmp = ncert.ncert_nces[i];
4164 		xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
4165 		    B_FALSE);
4166 		nce_restart_timer(tmp, xmit_interval);
4167 		ncec_refrele(tmp);
4168 	}
4169 	return (ret);
4170 }
4171 
/*
 * Send a neighbor advertisement (ND_NEIGHBOR_ADVERT) for the ncec via
 * ndp_xmit() on ncec_ill.  The ncec's own link-layer address and length,
 * ncec_addr, the ipv6_all_hosts_mcast address and the flags computed by
 * nce_advert_flags() are passed along.  The result of ndp_xmit() is
 * returned unchanged.
 */
boolean_t
ndp_announce(ncec_t *ncec)
{
	return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
	    ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
	    nce_advert_flags(ncec)));
}
4179 
4180 ill_t *
4181 nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
4182 {
4183 	mblk_t		*mp;
4184 	in6_addr_t	src6;
4185 	ipaddr_t	src4;
4186 	ill_t		*ill = ncec->ncec_ill;
4187 	ill_t		*src_ill = NULL;
4188 	ipif_t		*ipif = NULL;
4189 	boolean_t	is_myaddr = NCE_MYADDR(ncec);
4190 	boolean_t	isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4191 
4192 	ASSERT(src != NULL);
4193 	ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
4194 	src6 = *src;
4195 	if (is_myaddr) {
4196 		src6 = ncec->ncec_addr;
4197 		if (!isv6)
4198 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
4199 	} else {
4200 		/*
4201 		 * try to find one from the outgoing packet.
4202 		 */
4203 		mutex_enter(&ncec->ncec_lock);
4204 		mp = ncec->ncec_qd_mp;
4205 		if (mp != NULL) {
4206 			if (isv6) {
4207 				ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
4208 
4209 				src6 = ip6h->ip6_src;
4210 			} else {
4211 				ipha_t  *ipha = (ipha_t *)mp->b_rptr;
4212 
4213 				src4 = ipha->ipha_src;
4214 				IN6_IPADDR_TO_V4MAPPED(src4, &src6);
4215 			}
4216 		}
4217 		mutex_exit(&ncec->ncec_lock);
4218 	}
4219 
4220 	/*
4221 	 * For outgoing packets, if the src of outgoing packet is one
4222 	 * of the assigned interface addresses use it, otherwise we
4223 	 * will pick the source address below.
4224 	 * For local addresses (is_myaddr) doing DAD, NDP announce
4225 	 * messages are mcast. So we use the (IPMP) cast_ill or the
4226 	 * (non-IPMP) ncec_ill for these message types. The only case
4227 	 * of unicast DAD messages are for IPv6 ND probes, for which
4228 	 * we find the ipif_bound_ill corresponding to the ncec_addr.
4229 	 */
4230 	if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
4231 		if (isv6) {
4232 			ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
4233 			    ill->ill_ipst);
4234 		} else {
4235 			ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
4236 			    ill->ill_ipst);
4237 		}
4238 
4239 		/*
4240 		 * If no relevant ipif can be found, then it's not one of our
4241 		 * addresses.  Reset to :: and try to find a src for the NS or
4242 		 * ARP request using ipif_select_source_v[4,6]  below.
4243 		 * If an ipif can be found, but it's not yet done with
4244 		 * DAD verification, and we are not being invoked for
4245 		 * DAD (i.e., !is_myaddr), then just postpone this
4246 		 * transmission until later.
4247 		 */
4248 		if (ipif == NULL) {
4249 			src6 = ipv6_all_zeros;
4250 			src4 = INADDR_ANY;
4251 		} else if (!ipif->ipif_addr_ready && !is_myaddr) {
4252 			DTRACE_PROBE2(nce__resolve__ipif__not__ready,
4253 			    ncec_t *, ncec, ipif_t *, ipif);
4254 			ipif_refrele(ipif);
4255 			return (NULL);
4256 		}
4257 	}
4258 
4259 	if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
4260 		/*
4261 		 * Pick a source address for this solicitation, but
4262 		 * restrict the selection to addresses assigned to the
4263 		 * output interface.  We do this because the destination will
4264 		 * create a neighbor cache entry for the source address of
4265 		 * this packet, so the source address had better be a valid
4266 		 * neighbor.
4267 		 */
4268 		if (isv6) {
4269 			ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
4270 			    B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4271 			    B_FALSE, NULL);
4272 		} else {
4273 			ipaddr_t nce_addr;
4274 
4275 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
4276 			ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
4277 			    B_FALSE, NULL);
4278 		}
4279 		if (ipif == NULL && IS_IPMP(ill)) {
4280 			ill_t *send_ill = ipmp_ill_hold_xmit_ill(ill, B_TRUE);
4281 
4282 			if (send_ill != NULL) {
4283 				if (isv6) {
4284 					ipif = ipif_select_source_v6(send_ill,
4285 					    &ncec->ncec_addr, B_TRUE,
4286 					    IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
4287 					    B_FALSE, NULL);
4288 				} else {
4289 					IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
4290 					    src4);
4291 					ipif = ipif_select_source_v4(send_ill,
4292 					    src4, ALL_ZONES, B_TRUE, NULL);
4293 				}
4294 				ill_refrele(send_ill);
4295 			}
4296 		}
4297 
4298 		if (ipif == NULL) {
4299 			char buf[INET6_ADDRSTRLEN];
4300 
4301 			ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
4302 			    inet_ntop((isv6 ? AF_INET6 : AF_INET),
4303 			    (char *)&ncec->ncec_addr, buf, sizeof (buf))));
4304 			DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
4305 			return (NULL);
4306 		}
4307 		src6 = ipif->ipif_v6lcl_addr;
4308 	}
4309 	*src = src6;
4310 	if (ipif != NULL) {
4311 		src_ill = ipif->ipif_ill;
4312 		if (IS_IPMP(src_ill))
4313 			src_ill = ipmp_ipif_hold_bound_ill(ipif);
4314 		else
4315 			ill_refhold(src_ill);
4316 		ipif_refrele(ipif);
4317 		DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
4318 		    ill_t *, src_ill);
4319 	}
4320 	return (src_ill);
4321 }
4322 
4323 void
4324 ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
4325     uchar_t *hwaddr, int hwaddr_len, int flags)
4326 {
4327 	ill_t	*ill;
4328 	ncec_t	*ncec;
4329 	nce_t	*nce;
4330 	uint16_t new_state;
4331 
4332 	ill = (ipif ? ipif->ipif_ill : NULL);
4333 	if (ill != NULL) {
4334 		/*
4335 		 * only one ncec is possible
4336 		 */
4337 		nce = nce_lookup_v4(ill, addr);
4338 		if (nce != NULL) {
4339 			ncec = nce->nce_common;
4340 			mutex_enter(&ncec->ncec_lock);
4341 			if (NCE_ISREACHABLE(ncec))
4342 				new_state = ND_UNCHANGED;
4343 			else
4344 				new_state = ND_STALE;
4345 			ncec->ncec_flags = flags;
4346 			nce_update(ncec, new_state, hwaddr);
4347 			mutex_exit(&ncec->ncec_lock);
4348 			nce_refrele(nce);
4349 			return;
4350 		}
4351 	} else {
4352 		/*
4353 		 * ill is wildcard; clean up all ncec's and ire's
4354 		 * that match on addr.
4355 		 */
4356 		nce_hw_map_t hwm;
4357 
4358 		hwm.hwm_addr = *addr;
4359 		hwm.hwm_hwlen = hwaddr_len;
4360 		hwm.hwm_hwaddr = hwaddr;
4361 		hwm.hwm_flags = flags;
4362 
4363 		ncec_walk_common(ipst->ips_ndp4, NULL,
4364 		    (pfi_t)nce_update_hw_changed, (uchar_t *)&hwm, B_TRUE);
4365 	}
4366 }
4367 
4368 /*
4369  * Common function to add ncec entries.
4370  * we always add the ncec with ncec_ill == ill, and always create
4371  * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
4372  * ncec is !reachable.
4373  *
4374  * When the caller passes in an nce_state of ND_UNCHANGED,
4375  * nce_add_common() will determine the state of the created nce based
4376  * on the ill_net_type and nce_flags used. Otherwise, the nce will
4377  * be created with state set to the passed in nce_state.
4378  */
4379 static int
4380 nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
4381     const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
4382 {
4383 	static	ncec_t		nce_nil;
4384 	uchar_t			*template = NULL;
4385 	int			err;
4386 	ncec_t			*ncec;
4387 	ncec_t			**ncep;
4388 	ip_stack_t		*ipst = ill->ill_ipst;
4389 	uint16_t		state;
4390 	boolean_t		fastprobe = B_FALSE;
4391 	struct ndp_g_s		*ndp;
4392 	nce_t			*nce = NULL;
4393 	mblk_t			*dlur_mp = NULL;
4394 
4395 	if (ill->ill_isv6)
4396 		ndp = ill->ill_ipst->ips_ndp6;
4397 	else
4398 		ndp = ill->ill_ipst->ips_ndp4;
4399 
4400 	*retnce = NULL;
4401 
4402 	ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
4403 
4404 	if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
4405 		ip0dbg(("nce_add_common: no addr\n"));
4406 		return (EINVAL);
4407 	}
4408 	if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
4409 		ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
4410 		return (EINVAL);
4411 	}
4412 
4413 	if (ill->ill_isv6) {
4414 		ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
4415 	} else {
4416 		ipaddr_t v4addr;
4417 
4418 		IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
4419 		ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
4420 	}
4421 
4422 	/*
4423 	 * The caller has ensured that there is no nce on ill, but there could
4424 	 * still be an nce_common_t for the address, so that we find exisiting
4425 	 * ncec_t strucutures first, and atomically add a new nce_t if
4426 	 * one is found. The ndp_g_lock ensures that we don't cross threads
4427 	 * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
4428 	 * compare for matches across the illgrp because this function is
4429 	 * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
4430 	 * with the nce_lookup_then_add_v* passing in the ipmp_ill where
4431 	 * appropriate.
4432 	 */
4433 	ncec = *ncep;
4434 	for (; ncec != NULL; ncec = ncec->ncec_next) {
4435 		if (ncec->ncec_ill == ill) {
4436 			if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
4437 				/*
4438 				 * We should never find *retnce to be
4439 				 * MYADDR, since the caller may then
4440 				 * incorrectly restart a DAD timer that's
4441 				 * already running.  However, if we are in
4442 				 * forwarding mode, and the interface is
4443 				 * moving in/out of groups, the data
4444 				 * path ire lookup (e.g., ire_revalidate_nce)
4445 				 * may  have determined that some destination
4446 				 * is offlink while the control path is adding
4447 				 * that address as a local address.
4448 				 * Recover from  this case by failing the
4449 				 * lookup
4450 				 */
4451 				if (NCE_MYADDR(ncec))
4452 					return (ENXIO);
4453 				*retnce = nce_ill_lookup_then_add(ill, ncec);
4454 				if (*retnce != NULL)
4455 					break;
4456 			}
4457 		}
4458 	}
4459 	if (*retnce != NULL) /* caller must trigger fastpath on nce */
4460 		return (0);
4461 
4462 	ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
4463 	if (ncec == NULL)
4464 		return (ENOMEM);
4465 	*ncec = nce_nil;
4466 	ncec->ncec_ill = ill;
4467 	ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
4468 	ncec->ncec_flags = flags;
4469 	ncec->ncec_ipst = ipst;	/* No netstack_hold */
4470 
4471 	if (!ill->ill_isv6) {
4472 		ipaddr_t addr4;
4473 
4474 		/*
4475 		 * DAD probe interval and probe count are set based on
4476 		 * fast/slow probe settings. If the underlying link doesn't
4477 		 * have reliably up/down notifications or if we're working
4478 		 * with IPv4 169.254.0.0/16 Link Local Address space, then
4479 		 * don't use the fast timers.  Otherwise, use them.
4480 		 */
4481 		ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
4482 		IN6_V4MAPPED_TO_IPADDR(addr, addr4);
4483 		if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) {
4484 			fastprobe = B_TRUE;
4485 		} else if (IS_IPMP(ill) && NCE_PUBLISH(ncec) &&
4486 		    !IS_IPV4_LL_SPACE(&addr4)) {
4487 			ill_t *hwaddr_ill;
4488 
4489 			hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, hw_addr,
4490 			    hw_addr_len);
4491 			if (hwaddr_ill != NULL && hwaddr_ill->ill_note_link)
4492 				fastprobe = B_TRUE;
4493 		}
4494 		if (fastprobe) {
4495 			ncec->ncec_xmit_interval =
4496 			    ipst->ips_arp_fastprobe_interval;
4497 			ncec->ncec_pcnt =
4498 			    ipst->ips_arp_fastprobe_count;
4499 			ncec->ncec_flags |= NCE_F_FAST;
4500 		} else {
4501 			ncec->ncec_xmit_interval =
4502 			    ipst->ips_arp_probe_interval;
4503 			ncec->ncec_pcnt =
4504 			    ipst->ips_arp_probe_count;
4505 		}
4506 		if (NCE_PUBLISH(ncec)) {
4507 			ncec->ncec_unsolicit_count =
4508 			    ipst->ips_ip_arp_publish_count;
4509 		}
4510 	} else {
4511 		/*
4512 		 * probe interval is constant: ILL_PROBE_INTERVAL
4513 		 * probe count is constant: ND_MAX_UNICAST_SOLICIT
4514 		 */
4515 		ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
4516 		if (NCE_PUBLISH(ncec)) {
4517 			ncec->ncec_unsolicit_count =
4518 			    ipst->ips_ip_ndp_unsolicit_count;
4519 		}
4520 	}
4521 	ncec->ncec_rcnt = ill->ill_xmit_count;
4522 	ncec->ncec_addr = *addr;
4523 	ncec->ncec_qd_mp = NULL;
4524 	ncec->ncec_refcnt = 1; /* for ncec getting created */
4525 	mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
4526 	ncec->ncec_trace_disable = B_FALSE;
4527 
4528 	/*
4529 	 * ncec_lladdr holds link layer address
4530 	 */
4531 	if (hw_addr_len > 0) {
4532 		template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
4533 		if (template == NULL) {
4534 			err = ENOMEM;
4535 			goto err_ret;
4536 		}
4537 		ncec->ncec_lladdr = template;
4538 		ncec->ncec_lladdr_length = hw_addr_len;
4539 		bzero(ncec->ncec_lladdr, hw_addr_len);
4540 	}
4541 	if ((flags & NCE_F_BCAST) != 0) {
4542 		state = ND_REACHABLE;
4543 		ASSERT(hw_addr_len > 0);
4544 	} else if (ill->ill_net_type == IRE_IF_RESOLVER) {
4545 		state = ND_INITIAL;
4546 	} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
4547 		/*
4548 		 * NORESOLVER entries are always created in the REACHABLE
4549 		 * state.
4550 		 */
4551 		state = ND_REACHABLE;
4552 		if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
4553 		    ill->ill_mactype != DL_IPV4 &&
4554 		    ill->ill_mactype != DL_6TO4) {
4555 			/*
4556 			 * We create a nce_res_mp with the IP nexthop address
4557 			 * as the destination address if the physical length
4558 			 * is exactly 4 bytes for point-to-multipoint links
4559 			 * that do their own resolution from IP to link-layer
4560 			 * address (e.g. IP over X.25).
4561 			 */
4562 			bcopy((uchar_t *)addr,
4563 			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
4564 		}
4565 		if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
4566 		    ill->ill_mactype != DL_IPV6) {
4567 			/*
4568 			 * We create a nce_res_mp with the IP nexthop address
4569 			 * as the destination address if the physical legnth
4570 			 * is exactly 16 bytes for point-to-multipoint links
4571 			 * that do their own resolution from IP to link-layer
4572 			 * address.
4573 			 */
4574 			bcopy((uchar_t *)addr,
4575 			    ncec->ncec_lladdr, ill->ill_phys_addr_length);
4576 		}
4577 		/*
4578 		 * Since NUD is not part of the base IPv4 protocol definition,
4579 		 * IPv4 neighbor entries on NORESOLVER interfaces will never
4580 		 * age, and are marked NCE_F_NONUD.
4581 		 */
4582 		if (!ill->ill_isv6)
4583 			ncec->ncec_flags |= NCE_F_NONUD;
4584 	} else if (ill->ill_net_type == IRE_LOOPBACK) {
4585 		state = ND_REACHABLE;
4586 	}
4587 
4588 	if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
4589 		/*
4590 		 * We are adding an ncec with a deterministic hw_addr,
4591 		 * so the state can only be one of {REACHABLE, STALE, PROBE}.
4592 		 *
4593 		 * if we are adding a unicast ncec for the local address
4594 		 * it would be REACHABLE; we would be adding a ND_STALE entry
4595 		 * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
4596 		 * addresses are added in PROBE to trigger DAD.
4597 		 */
4598 		if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
4599 		    ill->ill_net_type == IRE_IF_NORESOLVER)
4600 			state = ND_REACHABLE;
4601 		else if (!NCE_PUBLISH(ncec))
4602 			state = ND_STALE;
4603 		else
4604 			state = ND_PROBE;
4605 		if (hw_addr != NULL)
4606 			nce_set_ll(ncec, hw_addr);
4607 	}
4608 	/* caller overrides internally computed state */
4609 	if (nce_state != ND_UNCHANGED)
4610 		state = nce_state;
4611 
4612 	if (state == ND_PROBE)
4613 		ncec->ncec_flags |= NCE_F_UNVERIFIED;
4614 
4615 	ncec->ncec_state = state;
4616 
4617 	if (state == ND_REACHABLE) {
4618 		ncec->ncec_last = ncec->ncec_init_time =
4619 		    TICK_TO_MSEC(ddi_get_lbolt64());
4620 	} else {
4621 		ncec->ncec_last = 0;
4622 		if (state == ND_INITIAL)
4623 			ncec->ncec_init_time = TICK_TO_MSEC(ddi_get_lbolt64());
4624 	}
4625 	list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
4626 	    offsetof(ncec_cb_t, ncec_cb_node));
4627 	/*
4628 	 * have all the memory allocations out of the way before taking locks
4629 	 * and adding the nce.
4630 	 */
4631 	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4632 	if (nce == NULL) {
4633 		err = ENOMEM;
4634 		goto err_ret;
4635 	}
4636 	if (ncec->ncec_lladdr != NULL ||
4637 	    ill->ill_net_type == IRE_IF_NORESOLVER) {
4638 		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4639 		    ill->ill_phys_addr_length, ill->ill_sap,
4640 		    ill->ill_sap_length);
4641 		if (dlur_mp == NULL) {
4642 			err = ENOMEM;
4643 			goto err_ret;
4644 		}
4645 	}
4646 
4647 	/*
4648 	 * Atomically ensure that the ill is not CONDEMNED, before
4649 	 * adding the NCE.
4650 	 */
4651 	mutex_enter(&ill->ill_lock);
4652 	if (ill->ill_state_flags & ILL_CONDEMNED) {
4653 		mutex_exit(&ill->ill_lock);
4654 		err = EINVAL;
4655 		goto err_ret;
4656 	}
4657 	if (!NCE_MYADDR(ncec) &&
4658 	    (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
4659 		mutex_exit(&ill->ill_lock);
4660 		DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
4661 		err = EINVAL;
4662 		goto err_ret;
4663 	}
4664 	/*
4665 	 * Acquire the ncec_lock even before adding the ncec to the list
4666 	 * so that it cannot get deleted after the ncec is added, but
4667 	 * before we add the nce.
4668 	 */
4669 	mutex_enter(&ncec->ncec_lock);
4670 	if ((ncec->ncec_next = *ncep) != NULL)
4671 		ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
4672 	*ncep = ncec;
4673 	ncec->ncec_ptpn = ncep;
4674 
4675 	/* Bump up the number of ncec's referencing this ill */
4676 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4677 	    (char *), "ncec", (void *), ncec);
4678 	ill->ill_ncec_cnt++;
4679 	/*
4680 	 * Since we hold the ncec_lock at this time, the ncec cannot be
4681 	 * condemned, and we can safely add the nce.
4682 	 */
4683 	*retnce = nce_add_impl(ill, ncec, nce, dlur_mp);
4684 	mutex_exit(&ncec->ncec_lock);
4685 	mutex_exit(&ill->ill_lock);
4686 
4687 	/* caller must trigger fastpath on *retnce */
4688 	return (0);
4689 
4690 err_ret:
4691 	if (ncec != NULL)
4692 		kmem_cache_free(ncec_cache, ncec);
4693 	if (nce != NULL)
4694 		kmem_cache_free(nce_cache, nce);
4695 	freemsg(dlur_mp);
4696 	if (template != NULL)
4697 		kmem_free(template, ill->ill_phys_addr_length);
4698 	return (err);
4699 }
4700 
/*
 * Take a reference on the nce: nce_refcnt is incremented while holding
 * nce_lock.  The ASSERT checks that the counter did not wrap around to zero.
 */
void
nce_refhold(nce_t *nce)
{
	mutex_enter(&nce->nce_lock);
	nce->nce_refcnt++;
	ASSERT((nce)->nce_refcnt != 0);
	mutex_exit(&nce->nce_lock);
}
4712 
4713 /*
4714  * release a ref on the nce; In general, this
4715  * cannot be called with locks held because nce_inactive
4716  * may result in nce_inactive which will take the ill_lock,
4717  * do ipif_ill_refrele_tail etc. Thus the one exception
4718  * where this can be called with locks held is when the caller
4719  * is certain that the nce_refcnt is sufficient to prevent
4720  * the invocation of nce_inactive.
4721  */
4722 void
4723 nce_refrele(nce_t *nce)
4724 {
4725 	ASSERT((nce)->nce_refcnt != 0);
4726 	mutex_enter(&nce->nce_lock);
4727 	if (--nce->nce_refcnt == 0)
4728 		nce_inactive(nce); /* destroys the mutex */
4729 	else
4730 		mutex_exit(&nce->nce_lock);
4731 }
4732 
4733 /*
4734  * free the nce after all refs have gone away.
4735  */
4736 static void
4737 nce_inactive(nce_t *nce)
4738 {
4739 	ill_t *ill = nce->nce_ill;
4740 
4741 	ASSERT(nce->nce_refcnt == 0);
4742 
4743 	ncec_refrele_notr(nce->nce_common);
4744 	nce->nce_common = NULL;
4745 	freemsg(nce->nce_fp_mp);
4746 	freemsg(nce->nce_dlur_mp);
4747 
4748 	mutex_enter(&ill->ill_lock);
4749 	DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
4750 	    (char *), "nce", (void *), nce);
4751 	ill->ill_nce_cnt--;
4752 	nce->nce_ill = NULL;
4753 	/*
4754 	 * If the number of ncec's associated with this ill have dropped
4755 	 * to zero, check whether we need to restart any operation that
4756 	 * is waiting for this to happen.
4757 	 */
4758 	if (ILL_DOWN_OK(ill)) {
4759 		/* ipif_ill_refrele_tail drops the ill_lock */
4760 		ipif_ill_refrele_tail(ill);
4761 	} else {
4762 		mutex_exit(&ill->ill_lock);
4763 	}
4764 
4765 	mutex_destroy(&nce->nce_lock);
4766 	kmem_cache_free(nce_cache, nce);
4767 }
4768 
4769 /*
4770  * Add an nce to the ill_nce list.
4771  */
4772 static nce_t *
4773 nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
4774 {
4775 	bzero(nce, sizeof (*nce));
4776 	mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
4777 	nce->nce_common = ncec;
4778 	nce->nce_addr = ncec->ncec_addr;
4779 	nce->nce_ill = ill;
4780 	DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4781 	    (char *), "nce", (void *), nce);
4782 	ill->ill_nce_cnt++;
4783 
4784 	nce->nce_refcnt = 1; /* for the thread */
4785 	ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
4786 	nce->nce_dlur_mp = dlur_mp;
4787 
4788 	/* add nce to the ill's fastpath list.  */
4789 	nce->nce_refcnt++; /* for the list */
4790 	list_insert_head(&ill->ill_nce, nce);
4791 	return (nce);
4792 }
4793 
4794 static nce_t *
4795 nce_add(ill_t *ill, ncec_t *ncec)
4796 {
4797 	nce_t	*nce;
4798 	mblk_t	*dlur_mp = NULL;
4799 
4800 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4801 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
4802 
4803 	nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4804 	if (nce == NULL)
4805 		return (NULL);
4806 	if (ncec->ncec_lladdr != NULL ||
4807 	    ill->ill_net_type == IRE_IF_NORESOLVER) {
4808 		dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4809 		    ill->ill_phys_addr_length, ill->ill_sap,
4810 		    ill->ill_sap_length);
4811 		if (dlur_mp == NULL) {
4812 			kmem_cache_free(nce_cache, nce);
4813 			return (NULL);
4814 		}
4815 	}
4816 	return (nce_add_impl(ill, ncec, nce, dlur_mp));
4817 }
4818 
4819 /*
4820  * remove the nce from the ill_faspath list
4821  */
4822 void
4823 nce_delete(nce_t *nce)
4824 {
4825 	ill_t	*ill = nce->nce_ill;
4826 
4827 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4828 
4829 	mutex_enter(&nce->nce_lock);
4830 	if (nce->nce_is_condemned) {
4831 		/*
4832 		 * some other thread has removed this nce from the ill_nce list
4833 		 */
4834 		mutex_exit(&nce->nce_lock);
4835 		return;
4836 	}
4837 	nce->nce_is_condemned = B_TRUE;
4838 	mutex_exit(&nce->nce_lock);
4839 
4840 	list_remove(&ill->ill_nce, nce);
4841 	/*
4842 	 * even though we are holding the ill_lock, it is ok to
4843 	 * call nce_refrele here because we know that we should have
4844 	 * at least 2 refs on the nce: one for the thread, and one
4845 	 * for the list. The refrele below will release the one for
4846 	 * the list.
4847 	 */
4848 	nce_refrele(nce);
4849 }
4850 
4851 nce_t *
4852 nce_lookup(ill_t *ill, const in6_addr_t *addr)
4853 {
4854 	nce_t *nce = NULL;
4855 
4856 	ASSERT(ill != NULL);
4857 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4858 
4859 	for (nce = list_head(&ill->ill_nce); nce != NULL;
4860 	    nce = list_next(&ill->ill_nce, nce)) {
4861 		if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
4862 			break;
4863 	}
4864 
4865 	/*
4866 	 * if we found the nce on the ill_nce list while holding
4867 	 * the ill_lock, then it cannot be condemned yet.
4868 	 */
4869 	if (nce != NULL) {
4870 		ASSERT(!nce->nce_is_condemned);
4871 		nce_refhold(nce);
4872 	}
4873 	return (nce);
4874 }
4875 
4876 /*
4877  * Walk the ill_nce list on ill. The callback function func() cannot perform
4878  * any destructive actions.
4879  */
4880 static void
4881 nce_walk_common(ill_t *ill, pfi_t func, void *arg)
4882 {
4883 	nce_t *nce = NULL, *nce_next;
4884 
4885 	ASSERT(MUTEX_HELD(&ill->ill_lock));
4886 	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4887 		nce_next = list_next(&ill->ill_nce, nce);
4888 		if (func(ill, nce, arg) != 0)
4889 			break;
4890 		nce = nce_next;
4891 	}
4892 }
4893 
/*
 * Locking wrapper around nce_walk_common(): acquires ill_lock, walks the
 * ill's nce list calling func(ill, nce, arg), and releases the lock.  func()
 * therefore runs with ill_lock held.
 */
void
nce_walk(ill_t *ill, pfi_t func, void *arg)
{
	mutex_enter(&ill->ill_lock);
	nce_walk_common(ill, func, arg);
	mutex_exit(&ill->ill_lock);
}
4901 
4902 void
4903 nce_flush(ill_t *ill, boolean_t flushall)
4904 {
4905 	nce_t *nce, *nce_next;
4906 	list_t dead;
4907 
4908 	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
4909 	mutex_enter(&ill->ill_lock);
4910 	for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
4911 		nce_next = list_next(&ill->ill_nce, nce);
4912 		if (!flushall && NCE_PUBLISH(nce->nce_common)) {
4913 			nce = nce_next;
4914 			continue;
4915 		}
4916 		/*
4917 		 * nce_delete requires that the caller should either not
4918 		 * be holding locks, or should hold a ref to ensure that
4919 		 * we wont hit ncec_inactive. So take a ref and clean up
4920 		 * after the list is flushed.
4921 		 */
4922 		nce_refhold(nce);
4923 		nce_delete(nce);
4924 		list_insert_tail(&dead, nce);
4925 		nce = nce_next;
4926 	}
4927 	mutex_exit(&ill->ill_lock);
4928 	while ((nce = list_head(&dead)) != NULL) {
4929 		list_remove(&dead, nce);
4930 		nce_refrele(nce);
4931 	}
4932 	ASSERT(list_is_empty(&dead));
4933 	list_destroy(&dead);
4934 }
4935 
4936 /* Return an interval that is anywhere in the [1 .. intv] range */
4937 static clock_t
4938 nce_fuzz_interval(clock_t intv, boolean_t initial_time)
4939 {
4940 	clock_t rnd, frac;
4941 
4942 	(void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
4943 	/* Note that clock_t is signed; must chop off bits */
4944 	rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
4945 	if (initial_time) {
4946 		if (intv <= 0)
4947 			intv = 1;
4948 		else
4949 			intv = (rnd % intv) + 1;
4950 	} else {
4951 		/* Compute 'frac' as 20% of the configured interval */
4952 		if ((frac = intv / 5) <= 1)
4953 			frac = 2;
4954 		/* Set intv randomly in the range [intv-frac .. intv+frac] */
4955 		if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
4956 		intv = 1;
4957 	}
4958 	return (intv);
4959 }
4960 
4961 void
4962 nce_resolv_ipmp_ok(ncec_t *ncec)
4963 {
4964 	mblk_t *mp;
4965 	uint_t pkt_len;
4966 	iaflags_t ixaflags = IXAF_NO_TRACE;
4967 	nce_t *under_nce;
4968 	ill_t	*ill = ncec->ncec_ill;
4969 	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
4970 	ipif_t *src_ipif = NULL;
4971 	ip_stack_t *ipst = ill->ill_ipst;
4972 	ill_t *send_ill;
4973 	uint_t nprobes;
4974 
4975 	ASSERT(IS_IPMP(ill));
4976 
4977 	mutex_enter(&ncec->ncec_lock);
4978 	nprobes = ncec->ncec_nprobes;
4979 	mp = ncec->ncec_qd_mp;
4980 	ncec->ncec_qd_mp = NULL;
4981 	ncec->ncec_nprobes = 0;
4982 	mutex_exit(&ncec->ncec_lock);
4983 
4984 	while (mp != NULL) {
4985 		mblk_t *nxt_mp;
4986 
4987 		nxt_mp = mp->b_next;
4988 		mp->b_next = NULL;
4989 		if (isv6) {
4990 			ip6_t *ip6h = (ip6_t *)mp->b_rptr;
4991 
4992 			pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
4993 			src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
4994 			    ill, ALL_ZONES, ipst);
4995 		} else {
4996 			ipha_t *ipha = (ipha_t *)mp->b_rptr;
4997 
4998 			ixaflags |= IXAF_IS_IPV4;
4999 			pkt_len = ntohs(ipha->ipha_length);
5000 			src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
5001 			    ill, ALL_ZONES, ipst);
5002 		}
5003 
5004 		/*
5005 		 * find a new nce based on an under_ill. The first IPMP probe
5006 		 * packet gets queued, so we could still find a src_ipif that
5007 		 * matches an IPMP test address.
5008 		 */
5009 		if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
5010 			/*
5011 			 * if src_ipif is null, this could be either a
5012 			 * forwarded packet or a probe whose src got deleted.
5013 			 * We identify the former case by looking for the
5014 			 * ncec_nprobes: the first ncec_nprobes packets are
5015 			 * probes;
5016 			 */
5017 			if (src_ipif == NULL && nprobes > 0)
5018 				goto drop_pkt;
5019 
5020 			/*
5021 			 * For forwarded packets, we use the ipmp rotor
5022 			 * to find send_ill.
5023 			 */
5024 			send_ill = ipmp_ill_hold_xmit_ill(ncec->ncec_ill,
5025 			    B_TRUE);
5026 		} else {
5027 			send_ill = src_ipif->ipif_ill;
5028 			ill_refhold(send_ill);
5029 		}
5030 
5031 		DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
5032 		    (ncec_t *), ncec, (ipif_t *),
5033 		    src_ipif, (ill_t *), send_ill);
5034 
5035 		if (send_ill == NULL) {
5036 			if (src_ipif != NULL)
5037 				ipif_refrele(src_ipif);
5038 			goto drop_pkt;
5039 		}
5040 		/* create an under_nce on send_ill */
5041 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
5042 		if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
5043 			under_nce = nce_fastpath_create(send_ill, ncec);
5044 		else
5045 			under_nce = NULL;
5046 		rw_exit(&ipst->ips_ill_g_lock);
5047 		if (under_nce != NULL && NCE_ISREACHABLE(ncec))
5048 			nce_fastpath_trigger(under_nce);
5049 
5050 		ill_refrele(send_ill);
5051 		if (src_ipif != NULL)
5052 			ipif_refrele(src_ipif);
5053 
5054 		if (under_nce != NULL) {
5055 			(void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
5056 			    ALL_ZONES, 0, NULL);
5057 			nce_refrele(under_nce);
5058 			if (nprobes > 0)
5059 				nprobes--;
5060 			mp = nxt_mp;
5061 			continue;
5062 		}
5063 drop_pkt:
5064 		if (isv6) {
5065 			BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
5066 		} else {
5067 			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
5068 		}
5069 		ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
5070 		freemsg(mp);
5071 		if (nprobes > 0)
5072 			nprobes--;
5073 		mp = nxt_mp;
5074 	}
5075 	ncec_cb_dispatch(ncec); /* complete callbacks */
5076 }
5077