xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_client.c (revision b6805bf78d2bbbeeaea8909a05623587b42d58b3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
27  *	All Rights Reserved
28  */
29 
30 #include <sys/param.h>
31 #include <sys/types.h>
32 #include <sys/systm.h>
33 #include <sys/thread.h>
34 #include <sys/t_lock.h>
35 #include <sys/time.h>
36 #include <sys/vnode.h>
37 #include <sys/vfs.h>
38 #include <sys/errno.h>
39 #include <sys/buf.h>
40 #include <sys/stat.h>
41 #include <sys/cred.h>
42 #include <sys/kmem.h>
43 #include <sys/debug.h>
44 #include <sys/dnlc.h>
45 #include <sys/vmsystm.h>
46 #include <sys/flock.h>
47 #include <sys/share.h>
48 #include <sys/cmn_err.h>
49 #include <sys/tiuser.h>
50 #include <sys/sysmacros.h>
51 #include <sys/callb.h>
52 #include <sys/acl.h>
53 #include <sys/kstat.h>
54 #include <sys/signal.h>
55 #include <sys/disp.h>
56 #include <sys/atomic.h>
57 #include <sys/list.h>
58 #include <sys/sdt.h>
59 
60 #include <rpc/types.h>
61 #include <rpc/xdr.h>
62 #include <rpc/auth.h>
63 #include <rpc/clnt.h>
64 
65 #include <nfs/nfs.h>
66 #include <nfs/nfs_clnt.h>
67 #include <nfs/nfs_acl.h>
68 
69 #include <nfs/nfs4.h>
70 #include <nfs/rnode4.h>
71 #include <nfs/nfs4_clnt.h>
72 
73 #include <vm/hat.h>
74 #include <vm/as.h>
75 #include <vm/page.h>
76 #include <vm/pvn.h>
77 #include <vm/seg.h>
78 #include <vm/seg_map.h>
79 #include <vm/seg_vn.h>
80 
81 #include <sys/ddi.h>
82 
83 /*
84  * Arguments to page-flush thread.
 *
 * One of these is allocated by nfs4_purge_caches() when it starts an
 * asynchronous page flush, and it is freed by nfs4_pgflush_thread().
 * The creator takes a hold on both members (VN_HOLD/crhold); the thread
 * drops those holds before it exits.
85  */
86 typedef struct {
87 	vnode_t *vp;	/* file whose cached pages are to be flushed */
88 	cred_t *cr;	/* credentials handed to the flush */
89 } pgflush_t;
90 
91 #ifdef DEBUG
/*
 * DEBUG-only globals.  NOTE(review): the three *_debug flags below are not
 * referenced in this part of the file; presumably they gate debug output
 * elsewhere -- confirm.
 */
92 int nfs4_client_lease_debug;
93 int nfs4_sharedfh_debug;
94 int nfs4_fname_debug;
95 
96 /* temporary: panic if v_type is inconsistent with r_attr va_type */
97 int nfs4_vtype_debug;
98 
/*
 * Thread-specific-data key.  nfs4_purge_stale_fh() asserts that the
 * current thread's slot is NULL as a check that the ..._end_op() call has
 * been done.
 */
99 uint_t nfs4_tsd_key;
100 #endif
101 
/*
 * NOTE(review): neither of these is referenced in this part of the file;
 * presumably they belong to the CPR (suspend/resume) callback declared
 * below -- confirm.
 */
102 static time_t	nfs4_client_resumed = 0;
103 static	callb_id_t cid = 0;
104 
/* Forward declarations for file-local routines. */
105 static int	nfs4renew(nfs4_server_t *);
106 static void	nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
107 static void	nfs4_pgflush_thread(pgflush_t *);
108 
109 static boolean_t nfs4_client_cpr_callb(void *, int);
110 
/*
 * Bookkeeping for the NFS v4 mounts in a zone.
 * NOTE(review): presumably one instance per zone, looked up through
 * mi4_list_key -- confirm against the zone key setup code.
 */
111 struct mi4_globals {
112 	kmutex_t	mig_lock;  /* lock protecting mig_list */
113 	list_t		mig_list;  /* list of NFS v4 mounts in zone */
	/* NOTE(review): set/checked outside this part of the file */
114 	boolean_t	mig_destructor_called;
115 };
116 
117 static zone_key_t mi4_list_key;
118 
119 /*
120  * Attributes caching:
121  *
122  * Attributes are cached in the rnode in struct vattr form.
123  * There is a time associated with the cached attributes (r_time_attr_inval)
124  * which tells whether the attributes are valid. The time is initialized
125  * to the difference between current time and the modify time of the vnode
126  * when new attributes are cached. This allows the attributes for
127  * files that have changed recently to be timed out sooner than for files
128  * that have not changed for a long time. There are minimum and maximum
129  * timeout values that can be set per mount point.
130  */
131 
132 /*
133  * If a cache purge is in progress, wait for it to finish.
134  *
135  * The current thread must not be in the middle of an
136  * nfs4_start_op/nfs4_end_op region.  Otherwise, there could be a deadlock
137  * between this thread, a recovery thread, and the page flush thread.
138  */
139 int
140 nfs4_waitfor_purge_complete(vnode_t *vp)
141 {
142 	rnode4_t *rp;
143 	k_sigset_t smask;
144 
145 	rp = VTOR4(vp);
146 	if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
147 	    ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
148 		mutex_enter(&rp->r_statelock);
149 		sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
150 		while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
151 		    ((rp->r_flags & R4PGFLUSH) &&
152 		    rp->r_pgflush != curthread)) {
153 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
154 				sigunintr(&smask);
155 				mutex_exit(&rp->r_statelock);
156 				return (EINTR);
157 			}
158 		}
159 		sigunintr(&smask);
160 		mutex_exit(&rp->r_statelock);
161 	}
162 	return (0);
163 }
164 
165 /*
166  * Validate caches by checking cached attributes. If they have timed out,
167  * then get new attributes from the server.  As a side effect, cache
168  * invalidation is done if the attributes have changed.
169  *
170  * If the attributes have not timed out and if there is a cache
171  * invalidation being done by some other thread, then wait until that
172  * thread has completed the cache invalidation.
173  */
174 int
175 nfs4_validate_caches(vnode_t *vp, cred_t *cr)
176 {
177 	int error;
178 	nfs4_ga_res_t gar;
179 
180 	if (ATTRCACHE4_VALID(vp)) {
181 		error = nfs4_waitfor_purge_complete(vp);
182 		if (error)
183 			return (error);
184 		return (0);
185 	}
186 
187 	gar.n4g_va.va_mask = AT_ALL;
188 	return (nfs4_getattr_otw(vp, &gar, cr, 0));
189 }
190 
191 /*
192  * Fill in attribute from the cache.
193  * If valid, then return 0 to indicate that no error occurred,
194  * otherwise return 1 to indicate that an error occurred.
195  */
196 static int
197 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
198 {
199 	rnode4_t *rp;
200 
201 	rp = VTOR4(vp);
202 	mutex_enter(&rp->r_statelock);
203 	mutex_enter(&rp->r_statev4_lock);
204 	if (ATTRCACHE4_VALID(vp)) {
205 		mutex_exit(&rp->r_statev4_lock);
206 		/*
207 		 * Cached attributes are valid
208 		 */
209 		*vap = rp->r_attr;
210 		mutex_exit(&rp->r_statelock);
211 		return (0);
212 	}
213 	mutex_exit(&rp->r_statev4_lock);
214 	mutex_exit(&rp->r_statelock);
215 	return (1);
216 }
217 
218 
219 /*
220  * If returned error is ESTALE flush all caches.  The nfs4_purge_caches()
221  * call is synchronous because all the pages were invalidated by the
222  * nfs4_invalidate_pages() call.
223  */
224 void
225 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
226 {
227 	struct rnode4 *rp = VTOR4(vp);
228 
229 	/* Ensure that the ..._end_op() call has been done */
230 	ASSERT(tsd_get(nfs4_tsd_key) == NULL);
231 
232 	if (errno != ESTALE)
233 		return;
234 
235 	mutex_enter(&rp->r_statelock);
236 	rp->r_flags |= R4STALE;
237 	if (!rp->r_error)
238 		rp->r_error = errno;
239 	mutex_exit(&rp->r_statelock);
240 	if (nfs4_has_pages(vp))
241 		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
242 	nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
243 }
244 
245 /*
246  * Purge all of the various NFS `data' caches.  If "asyncpg" is TRUE, the
247  * page purge is done asynchronously.
248  */
249 void
250 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
251 {
252 	rnode4_t *rp;
253 	char *contents;
254 	vnode_t *xattr;
255 	int size;
256 	int pgflush;			/* are we the page flush thread? */
257 
258 	/*
259 	 * Purge the DNLC for any entries which refer to this file.
260 	 */
261 	if (vp->v_count > 1 &&
262 	    (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
263 		dnlc_purge_vp(vp);
264 
265 	/*
266 	 * Clear any readdir state bits and purge the readlink response cache.
267 	 */
268 	rp = VTOR4(vp);
269 	mutex_enter(&rp->r_statelock);
270 	rp->r_flags &= ~R4LOOKUP;
271 	contents = rp->r_symlink.contents;
272 	size = rp->r_symlink.size;
273 	rp->r_symlink.contents = NULL;
274 
275 	xattr = rp->r_xattr_dir;
276 	rp->r_xattr_dir = NULL;
277 
278 	/*
279 	 * Purge pathconf cache too.
280 	 */
281 	rp->r_pathconf.pc4_xattr_valid = 0;
282 	rp->r_pathconf.pc4_cache_valid = 0;
283 
284 	pgflush = (curthread == rp->r_pgflush);
285 	mutex_exit(&rp->r_statelock);
286 
287 	if (contents != NULL) {
288 
289 		kmem_free((void *)contents, size);
290 	}
291 
292 	if (xattr != NULL)
293 		VN_RELE(xattr);
294 
295 	/*
296 	 * Flush the page cache.  If the current thread is the page flush
297 	 * thread, don't initiate a new page flush.  There's no need for
298 	 * it, and doing it correctly is hard.
299 	 */
300 	if (nfs4_has_pages(vp) && !pgflush) {
301 		if (!asyncpg) {
302 			(void) nfs4_waitfor_purge_complete(vp);
303 			nfs4_flush_pages(vp, cr);
304 		} else {
305 			pgflush_t *args;
306 
307 			/*
308 			 * We don't hold r_statelock while creating the
309 			 * thread, in case the call blocks.  So we use a
310 			 * flag to indicate that a page flush thread is
311 			 * active.
312 			 */
313 			mutex_enter(&rp->r_statelock);
314 			if (rp->r_flags & R4PGFLUSH) {
315 				mutex_exit(&rp->r_statelock);
316 			} else {
317 				rp->r_flags |= R4PGFLUSH;
318 				mutex_exit(&rp->r_statelock);
319 
320 				args = kmem_alloc(sizeof (pgflush_t),
321 				    KM_SLEEP);
322 				args->vp = vp;
323 				VN_HOLD(args->vp);
324 				args->cr = cr;
325 				crhold(args->cr);
326 				(void) zthread_create(NULL, 0,
327 				    nfs4_pgflush_thread, args, 0,
328 				    minclsyspri);
329 			}
330 		}
331 	}
332 
333 	/*
334 	 * Flush the readdir response cache.
335 	 */
336 	nfs4_purge_rddir_cache(vp);
337 }
338 
339 /*
340  * Invalidate all pages for the given file, after writing back the dirty
341  * ones.
342  */
343 
344 void
345 nfs4_flush_pages(vnode_t *vp, cred_t *cr)
346 {
347 	int error;
348 	rnode4_t *rp = VTOR4(vp);
349 
350 	error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
351 	if (error == ENOSPC || error == EDQUOT) {
352 		mutex_enter(&rp->r_statelock);
353 		if (!rp->r_error)
354 			rp->r_error = error;
355 		mutex_exit(&rp->r_statelock);
356 	}
357 }
358 
359 /*
360  * Page flush thread.
361  */
362 
363 static void
364 nfs4_pgflush_thread(pgflush_t *args)
365 {
366 	rnode4_t *rp = VTOR4(args->vp);
367 
368 	/* remember which thread we are, so we don't deadlock ourselves */
369 	mutex_enter(&rp->r_statelock);
370 	ASSERT(rp->r_pgflush == NULL);
371 	rp->r_pgflush = curthread;
372 	mutex_exit(&rp->r_statelock);
373 
374 	nfs4_flush_pages(args->vp, args->cr);
375 
376 	mutex_enter(&rp->r_statelock);
377 	rp->r_pgflush = NULL;
378 	rp->r_flags &= ~R4PGFLUSH;
379 	cv_broadcast(&rp->r_cv);
380 	mutex_exit(&rp->r_statelock);
381 
382 	VN_RELE(args->vp);
383 	crfree(args->cr);
384 	kmem_free(args, sizeof (pgflush_t));
385 	zthread_exit();
386 }
387 
388 /*
389  * Purge the readdir cache of all entries which are not currently
390  * being filled.
391  */
392 void
393 nfs4_purge_rddir_cache(vnode_t *vp)
394 {
395 	rnode4_t *rp;
396 
397 	rp = VTOR4(vp);
398 
399 	mutex_enter(&rp->r_statelock);
400 	rp->r_direof = NULL;
401 	rp->r_flags &= ~R4LOOKUP;
402 	rp->r_flags |= R4READDIRWATTR;
403 	rddir4_cache_purge(rp);
404 	mutex_exit(&rp->r_statelock);
405 }
406 
407 /*
408  * Set attributes cache for given vnode using virtual attributes.  There is
409  * no cache validation, but if the attributes are deemed to be stale, they
410  * are ignored.  This corresponds to nfs3_attrcache().
411  *
412  * Set the timeout value on the attribute cache and fill it
413  * with the passed in attributes.
414  */
415 void
416 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
417 {
418 	rnode4_t *rp = VTOR4(vp);
419 
420 	mutex_enter(&rp->r_statelock);
421 	if (rp->r_time_attr_saved <= t)
422 		nfs4_attrcache_va(vp, garp, FALSE);
423 	mutex_exit(&rp->r_statelock);
424 }
425 
426 /*
427  * Use the passed in virtual attributes to check to see whether the
428  * data and metadata caches are valid, cache the new attributes, and
429  * then do the cache invalidation if required.
430  *
431  * The cache validation and caching of the new attributes is done
432  * atomically via the use of the mutex, r_statelock.  If required,
433  * the cache invalidation is done atomically w.r.t. the cache
434  * validation and caching of the attributes via the pseudo lock,
435  * r_serial.
436  *
437  * This routine is used to do cache validation and attributes caching
438  * for operations with a single set of post operation attributes.
 *
 *   vp		- file the attributes belong to
 *   garp	- attributes returned by the server; va_fsid must match the
 *		  mount's vfs_dev (asserted below)
 *   t		- time stamp associated with the attributes; it is compared
 *		  with r_time_attr_saved to detect attributes that are older
 *		  than what is already cached
 *   cr		- credentials passed on to nfs4_purge_caches()
 *   async	- passed to nfs4_purge_caches() as its "asyncpg" argument;
 *		  forced to 1 when the caller is the recovery thread
 *   cinfo	- change information for a directory-modifying operation,
 *		  or NULL
 *
 * There is no return value.  The routine gives up without caching the
 * new attributes when: another thread owns r_serial and the caller is
 * the recovery thread; the wait for r_serial is interrupted by a signal;
 * a page flush thread other than the caller is active; or the cache has
 * been saved more recently than t.  In all but the signal case the
 * attribute cache is purged (for the last case only if the attributes
 * are inconsistent with what is cached).
439  */
440 
441 void
442 nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
443     hrtime_t t, cred_t *cr, int async,
444     change_info4 *cinfo)
445 {
446 	rnode4_t *rp;
447 	int mtime_changed = 0;	/* set => purge data caches */
448 	int ctime_changed = 0;	/* set => purge access and ACL caches */
449 	vsecattr_t *vsp;
450 	int was_serial, set_time_cache_inval, recov;
451 	vattr_t *vap = &garp->n4g_va;
452 	mntinfo4_t *mi = VTOMI4(vp);
453 	len_t preattr_rsize;
454 	boolean_t writemodify_set = B_FALSE;
455 	boolean_t cachepurge_set = B_FALSE;
456 
	/* The attributes must belong to this mount's file system. */
457 	ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);
458 
459 	/* Is curthread the recovery thread? */
460 	mutex_enter(&mi->mi_lock);
461 	recov = (VTOMI4(vp)->mi_recovthread == curthread);
462 	mutex_exit(&mi->mi_lock);
463 
464 	rp = VTOR4(vp);
465 	mutex_enter(&rp->r_statelock);
	/* Non-zero if this thread already owns the r_serial pseudo lock. */
466 	was_serial = (rp->r_serial == curthread);
467 	if (rp->r_serial && !was_serial) {
468 		klwp_t *lwp = ttolwp(curthread);
469 
470 		/*
471 		 * If we're the recovery thread, then purge current attrs
472 		 * and bail out to avoid potential deadlock between another
473 		 * thread caching attrs (r_serial thread), recov thread,
474 		 * and an async writer thread.
475 		 */
476 		if (recov) {
477 			PURGE_ATTRCACHE4_LOCKED(rp);
478 			mutex_exit(&rp->r_statelock);
479 			return;
480 		}
481 
		/*
		 * Wait for the current owner to release r_serial.  If the
		 * wait is interrupted by a signal, give up without caching
		 * anything.
		 * NOTE(review): lwp_nostop is raised around the wait,
		 * presumably so the lwp is not stopped while it is blocked
		 * here -- confirm.
		 */
482 		if (lwp != NULL)
483 			lwp->lwp_nostop++;
484 		while (rp->r_serial != NULL) {
485 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
486 				mutex_exit(&rp->r_statelock);
487 				if (lwp != NULL)
488 					lwp->lwp_nostop--;
489 				return;
490 			}
491 		}
492 		if (lwp != NULL)
493 			lwp->lwp_nostop--;
494 	}
495 
496 	/*
497 	 * If there is a page flush thread, the current thread needs to
498 	 * bail out, to prevent a possible deadlock between the current
499 	 * thread (which might be in a start_op/end_op region), the
500 	 * recovery thread, and the page flush thread.  Expire the
501 	 * attribute cache, so that any attributes the current thread was
502 	 * going to set are not lost.
503 	 */
504 	if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
505 		PURGE_ATTRCACHE4_LOCKED(rp);
506 		mutex_exit(&rp->r_statelock);
507 		return;
508 	}
509 
510 	if (rp->r_time_attr_saved > t) {
511 		/*
512 		 * Attributes have been cached since these attributes were
513 		 * probably made. If there is an inconsistency in what is
514 		 * cached, mark them invalid. If not, don't act on them.
515 		 */
516 		if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
517 			PURGE_ATTRCACHE4_LOCKED(rp);
518 		mutex_exit(&rp->r_statelock);
519 		return;
520 	}
521 	set_time_cache_inval = 0;
522 	if (cinfo) {
523 		/*
524 		 * Only directory modifying callers pass non-NULL cinfo.
525 		 */
526 		ASSERT(vp->v_type == VDIR);
527 		/*
528 		 * If the cache timeout either doesn't exist or hasn't expired,
529 		 * and dir didn't change on server before dirmod op
530 		 * and dir didn't change after dirmod op but before getattr
531 		 * then there's a chance that the client's cached data for
532 		 * this object is current (not stale).  No immediate cache
533 		 * flush is required.
534 		 *
535 		 */
536 		if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
537 		    cinfo->before == rp->r_change &&
538 		    (garp->n4g_change_valid &&
539 		    cinfo->after == garp->n4g_change)) {
540 
541 			/*
542 			 * If atomic isn't set, then the before/after info
543 			 * cannot be blindly trusted.  For this case, we tell
544 			 * nfs4_attrcache_va to cache the attrs but also
545 			 * establish an absolute maximum cache timeout.  When
546 			 * the timeout is reached, caches will be flushed.
547 			 */
548 			if (! cinfo->atomic)
549 				set_time_cache_inval = 1;
550 		} else {
551 
552 			/*
553 			 * We're not sure exactly what changed, but we know
554 			 * what to do.  flush all caches for dir.  remove the
555 			 * attr timeout.
556 			 *
557 			 * a) timeout expired.  flush all caches.
558 			 * b) r_change != cinfo.before.  flush all caches.
559 			 * c) r_change == cinfo.before, but cinfo.after !=
560 			 *    post-op getattr(change).  flush all caches.
561 			 * d) post-op getattr(change) not provided by server.
562 			 *    flush all caches.
563 			 */
564 			mtime_changed = 1;
565 			ctime_changed = 1;
566 			rp->r_time_cache_inval = 0;
567 		}
568 	} else {
569 		/*
570 		 * Write thread after writing data to file on remote server,
571 		 * will always set R4WRITEMODIFIED to indicate that file on
572 		 * remote server was modified with a WRITE operation and would
573 		 * have marked attribute cache as timed out. If R4WRITEMODIFIED
574 		 * is set, then do not check for mtime and ctime change.
575 		 */
576 		if (!(rp->r_flags & R4WRITEMODIFIED)) {
577 			if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
578 				mtime_changed = 1;
579 
580 			if (rp->r_attr.va_ctime.tv_sec !=
581 			    vap->va_ctime.tv_sec ||
582 			    rp->r_attr.va_ctime.tv_nsec !=
583 			    vap->va_ctime.tv_nsec)
584 				ctime_changed = 1;
585 		} else {
586 			writemodify_set = B_TRUE;
587 		}
588 	}
589 
	/*
	 * Remember the size so that a change made by nfs4_attrcache_va()
	 * can be detected below.
	 */
590 	preattr_rsize = rp->r_size;
591 
592 	nfs4_attrcache_va(vp, garp, set_time_cache_inval);
593 
594 	/*
595 	 * If we have updated filesize in nfs4_attrcache_va, as soon as we
596 	 * drop statelock we will be in transition of purging all
597 	 * our caches and updating them. It is possible for another
598 	 * thread to pick this new file size and read in zeroed data.
599 	 * stall other threads till cache purge is complete.
600 	 */
601 	if ((!cinfo) && (rp->r_size != preattr_rsize)) {
602 		/*
603 		 * If R4WRITEMODIFIED was set and we have updated the file
604 		 * size, Server's returned file size need not necessarily
605 		 * be because of this Client's WRITE. We need to purge
606 		 * all caches.
607 		 */
608 		if (writemodify_set)
609 			mtime_changed = 1;
610 
		/*
		 * cachepurge_set records that this invocation set
		 * R4INCACHEPURGE and is therefore the one to clear it.
		 */
611 		if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) {
612 			rp->r_flags |= R4INCACHEPURGE;
613 			cachepurge_set = B_TRUE;
614 		}
615 	}
616 
	/* Nothing changed: the new attributes are cached, no purge needed. */
617 	if (!mtime_changed && !ctime_changed) {
618 		mutex_exit(&rp->r_statelock);
619 		return;
620 	}
621 
	/*
	 * Take the r_serial pseudo lock so that other threads wait (see
	 * nfs4_waitfor_purge_complete()) while the caches are purged with
	 * r_statelock dropped.
	 */
622 	rp->r_serial = curthread;
623 
624 	mutex_exit(&rp->r_statelock);
625 
626 	/*
627 	 * If we're the recov thread, then force async nfs4_purge_caches
628 	 * to avoid potential deadlock.
629 	 */
630 	if (mtime_changed)
631 		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);
632 
	/* Purge finished: let threads stalled on R4INCACHEPURGE continue. */
633 	if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) {
634 		mutex_enter(&rp->r_statelock);
635 		rp->r_flags &= ~R4INCACHEPURGE;
636 		cv_broadcast(&rp->r_cv);
637 		mutex_exit(&rp->r_statelock);
638 		cachepurge_set = B_FALSE;
639 	}
640 
	/* A ctime change drops the cached access results and cached ACL. */
641 	if (ctime_changed) {
642 		(void) nfs4_access_purge_rp(rp);
643 		if (rp->r_secattr != NULL) {
644 			mutex_enter(&rp->r_statelock);
645 			vsp = rp->r_secattr;
646 			rp->r_secattr = NULL;
647 			mutex_exit(&rp->r_statelock);
648 			if (vsp != NULL)
649 				nfs4_acl_free_cache(vsp);
650 		}
651 	}
652 
	/*
	 * Release the pseudo lock only if this invocation acquired it; if
	 * the thread already owned it on entry, ownership is left alone.
	 */
653 	if (!was_serial) {
654 		mutex_enter(&rp->r_statelock);
655 		rp->r_serial = NULL;
656 		cv_broadcast(&rp->r_cv);
657 		mutex_exit(&rp->r_statelock);
658 	}
659 }
660 
661 /*
662  * Set attributes cache for given vnode using virtual attributes.
663  *
664  * Set the timeout value on the attribute cache and fill it
665  * with the passed in attributes.
666  *
667  * The caller must be holding r_statelock.
668  */
669 static void
670 nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
671 {
672 	rnode4_t *rp;
673 	mntinfo4_t *mi;
674 	hrtime_t delta;
675 	hrtime_t now;
676 	vattr_t *vap = &garp->n4g_va;
677 
678 	rp = VTOR4(vp);
679 
680 	ASSERT(MUTEX_HELD(&rp->r_statelock));
681 	ASSERT(vap->va_mask == AT_ALL);
682 
683 	/* Switch to master before checking v_flag */
684 	if (IS_SHADOW(vp, rp))
685 		vp = RTOV4(rp);
686 
687 	now = gethrtime();
688 
689 	mi = VTOMI4(vp);
690 
691 	/*
692 	 * Only establish a new cache timeout (if requested).  Never
693 	 * extend a timeout.  Never clear a timeout.  Clearing a timeout
694 	 * is done by nfs4_update_dircaches (ancestor in our call chain)
695 	 */
696 	if (set_cache_timeout && ! rp->r_time_cache_inval)
697 		rp->r_time_cache_inval = now + mi->mi_acdirmax;
698 
699 	/*
700 	 * Delta is the number of nanoseconds that we will
701 	 * cache the attributes of the file.  It is based on
702 	 * the number of nanoseconds since the last time that
703 	 * we detected a change.  The assumption is that files
704 	 * that changed recently are likely to change again.
705 	 * There is a minimum and a maximum for regular files
706 	 * and for directories which is enforced though.
707 	 *
708 	 * Using the time since last change was detected
709 	 * eliminates direct comparison or calculation
710 	 * using mixed client and server times.  NFS does
711 	 * not make any assumptions regarding the client
712 	 * and server clocks being synchronized.
713 	 */
714 	if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
715 	    vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
716 	    vap->va_size != rp->r_attr.va_size) {
717 		rp->r_time_attr_saved = now;
718 	}
719 
720 	if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
721 		delta = 0;
722 	else {
723 		delta = now - rp->r_time_attr_saved;
724 		if (vp->v_type == VDIR) {
725 			if (delta < mi->mi_acdirmin)
726 				delta = mi->mi_acdirmin;
727 			else if (delta > mi->mi_acdirmax)
728 				delta = mi->mi_acdirmax;
729 		} else {
730 			if (delta < mi->mi_acregmin)
731 				delta = mi->mi_acregmin;
732 			else if (delta > mi->mi_acregmax)
733 				delta = mi->mi_acregmax;
734 		}
735 	}
736 	rp->r_time_attr_inval = now + delta;
737 
738 	rp->r_attr = *vap;
739 	if (garp->n4g_change_valid)
740 		rp->r_change = garp->n4g_change;
741 
742 	/*
743 	 * The attributes that were returned may be valid and can
744 	 * be used, but they may not be allowed to be cached.
745 	 * Reset the timers to cause immediate invalidation and
746 	 * clear r_change so no VERIFY operations will suceed
747 	 */
748 	if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
749 		rp->r_time_attr_inval = now;
750 		rp->r_time_attr_saved = now;
751 		rp->r_change = 0;
752 	}
753 
754 	/*
755 	 * If mounted_on_fileid returned AND the object is a stub,
756 	 * then set object's va_nodeid to the mounted over fid
757 	 * returned by server.
758 	 *
759 	 * If mounted_on_fileid not provided/supported, then
760 	 * just set it to 0 for now.  Eventually it would be
761 	 * better to set it to a hashed version of FH.  This
762 	 * would probably be good enough to provide a unique
763 	 * fid/d_ino within a dir.
764 	 *
765 	 * We don't need to carry mounted_on_fileid in the
766 	 * rnode as long as the client never requests fileid
767 	 * without also requesting mounted_on_fileid.  For
768 	 * now, it stays.
769 	 */
770 	if (garp->n4g_mon_fid_valid) {
771 		rp->r_mntd_fid = garp->n4g_mon_fid;
772 
773 		if (RP_ISSTUB(rp))
774 			rp->r_attr.va_nodeid = rp->r_mntd_fid;
775 	}
776 
777 	/*
778 	 * Check to see if there are valid pathconf bits to
779 	 * cache in the rnode.
780 	 */
781 	if (garp->n4g_ext_res) {
782 		if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
783 			rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
784 		} else {
785 			if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
786 				rp->r_pathconf.pc4_xattr_valid = TRUE;
787 				rp->r_pathconf.pc4_xattr_exists =
788 				    garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
789 			}
790 		}
791 	}
792 	/*
793 	 * Update the size of the file if there is no cached data or if
794 	 * the cached data is clean and there is no data being written
795 	 * out.
796 	 */
797 	if (rp->r_size != vap->va_size &&
798 	    (!vn_has_cached_data(vp) ||
799 	    (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
800 		rp->r_size = vap->va_size;
801 	}
802 	nfs_setswaplike(vp, vap);
803 	rp->r_flags &= ~R4WRITEMODIFIED;
804 }
805 
806 /*
807  * Get attributes over-the-wire and update attributes cache
808  * if no error occurred in the over-the-wire operation.
809  * Return 0 if successful, otherwise error.
810  */
811 int
812 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
813 {
814 	mntinfo4_t *mi = VTOMI4(vp);
815 	hrtime_t t;
816 	nfs4_recov_state_t recov_state;
817 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
818 
819 	recov_state.rs_flags = 0;
820 	recov_state.rs_num_retry_despite_err = 0;
821 
822 	/* Save the original mount point security flavor */
823 	(void) save_mnt_secinfo(mi->mi_curr_serv);
824 
825 recov_retry:
826 
827 	if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
828 	    &recov_state, NULL))) {
829 		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
830 		return (e.error);
831 	}
832 
833 	t = gethrtime();
834 
835 	nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);
836 
837 	if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
838 		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
839 		    NULL, OP_GETATTR, NULL, NULL, NULL) == FALSE)  {
840 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
841 			    &recov_state, 1);
842 			goto recov_retry;
843 		}
844 	}
845 
846 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);
847 
848 	if (!e.error) {
849 		if (e.stat == NFS4_OK) {
850 			nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
851 		} else {
852 			e.error = geterrno4(e.stat);
853 
854 			nfs4_purge_stale_fh(e.error, vp, cr);
855 		}
856 	}
857 
858 	/*
859 	 * If getattr a node that is a stub for a crossed
860 	 * mount point, keep the original secinfo flavor for
861 	 * the current file system, not the crossed one.
862 	 */
863 	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
864 
865 	return (e.error);
866 }
867 
868 /*
869  * Generate a compound to get attributes over-the-wire.
870  */
871 void
872 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
873     nfs4_error_t *ep, cred_t *cr, int get_acl)
874 {
875 	COMPOUND4args_clnt args;
876 	COMPOUND4res_clnt res;
877 	int doqueue;
878 	rnode4_t *rp = VTOR4(vp);
879 	nfs_argop4 argop[2];
880 
881 	args.ctag = TAG_GETATTR;
882 
883 	args.array_len = 2;
884 	args.array = argop;
885 
886 	/* putfh */
887 	argop[0].argop = OP_CPUTFH;
888 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
889 
890 	/* getattr */
891 	/*
892 	 * Unlike nfs version 2 and 3, where getattr returns all the
893 	 * attributes, nfs version 4 returns only the ones explicitly
894 	 * asked for. This creates problems, as some system functions
895 	 * (e.g. cache check) require certain attributes and if the
896 	 * cached node lacks some attributes such as uid/gid, it can
897 	 * affect system utilities (e.g. "ls") that rely on the information
898 	 * to be there. This can lead to anything from system crashes to
899 	 * corrupted information processed by user apps.
900 	 * So to ensure that all bases are covered, request at least
901 	 * the AT_ALL attribute mask.
902 	 */
903 	argop[1].argop = OP_GETATTR;
904 	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
905 	if (get_acl)
906 		argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
907 	argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
908 
909 	doqueue = 1;
910 
911 	rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);
912 
913 	if (ep->error)
914 		return;
915 
916 	if (res.status != NFS4_OK) {
917 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
918 		return;
919 	}
920 
921 	*garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;
922 
923 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
924 }
925 
926 /*
927  * Return either cached or remote attributes. If get remote attr
928  * use them to check and invalidate caches, then cache the new attributes.
929  */
930 int
931 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
932 {
933 	int error;
934 	rnode4_t *rp;
935 	nfs4_ga_res_t gar;
936 
937 	ASSERT(nfs4_consistent_type(vp));
938 
939 	/*
940 	 * If we've got cached attributes, we're done, otherwise go
941 	 * to the server to get attributes, which will update the cache
942 	 * in the process. Either way, use the cached attributes for
943 	 * the caller's vattr_t.
944 	 *
945 	 * Note that we ignore the gar set by the OTW call: the attr caching
946 	 * code may make adjustments when storing to the rnode, and we want
947 	 * to see those changes here.
948 	 */
949 	rp = VTOR4(vp);
950 	error = 0;
951 	mutex_enter(&rp->r_statelock);
952 	if (!ATTRCACHE4_VALID(vp)) {
953 		mutex_exit(&rp->r_statelock);
954 		error = nfs4_getattr_otw(vp, &gar, cr, 0);
955 		mutex_enter(&rp->r_statelock);
956 	}
957 
958 	if (!error)
959 		*vap = rp->r_attr;
960 
961 	/* Return the client's view of file size */
962 	vap->va_size = rp->r_size;
963 
964 	mutex_exit(&rp->r_statelock);
965 
966 	ASSERT(nfs4_consistent_type(vp));
967 
968 	return (error);
969 }
970 
971 int
972 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
973     nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
974 {
975 	COMPOUND4args_clnt args;
976 	COMPOUND4res_clnt res;
977 	int doqueue;
978 	nfs_argop4 argop[2];
979 	mntinfo4_t *mi = VTOMI4(vp);
980 	bool_t needrecov = FALSE;
981 	nfs4_recov_state_t recov_state;
982 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
983 	nfs4_ga_ext_res_t *gerp;
984 
985 	recov_state.rs_flags = 0;
986 	recov_state.rs_num_retry_despite_err = 0;
987 
988 recov_retry:
989 	args.ctag = tag_type;
990 
991 	args.array_len = 2;
992 	args.array = argop;
993 
994 	e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
995 	if (e.error)
996 		return (e.error);
997 
998 	/* putfh */
999 	argop[0].argop = OP_CPUTFH;
1000 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
1001 
1002 	/* getattr */
1003 	argop[1].argop = OP_GETATTR;
1004 	argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
1005 	argop[1].nfs_argop4_u.opgetattr.mi = mi;
1006 
1007 	doqueue = 1;
1008 
1009 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1010 	    "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
1011 	    rnode4info(VTOR4(vp))));
1012 
1013 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1014 
1015 	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
1016 	if (!needrecov && e.error) {
1017 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1018 		    needrecov);
1019 		return (e.error);
1020 	}
1021 
1022 	if (needrecov) {
1023 		bool_t abort;
1024 
1025 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1026 		    "nfs4_attr_otw: initiating recovery\n"));
1027 
1028 		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
1029 		    NULL, OP_GETATTR, NULL, NULL, NULL);
1030 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1031 		    needrecov);
1032 		if (!e.error) {
1033 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1034 			e.error = geterrno4(res.status);
1035 		}
1036 		if (abort == FALSE)
1037 			goto recov_retry;
1038 		return (e.error);
1039 	}
1040 
1041 	if (res.status) {
1042 		e.error = geterrno4(res.status);
1043 	} else {
1044 		gerp = garp->n4g_ext_res;
1045 		bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
1046 		    garp, sizeof (nfs4_ga_res_t));
1047 		garp->n4g_ext_res = gerp;
1048 		if (garp->n4g_ext_res &&
1049 		    res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
1050 			bcopy(res.array[1].nfs_resop4_u.opgetattr.
1051 			    ga_res.n4g_ext_res,
1052 			    garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
1053 	}
1054 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1055 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1056 	    needrecov);
1057 	return (e.error);
1058 }
1059 
1060 /*
1061  * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
1062  * for the demand-based allocation of async threads per-mount.  The
1063  * nfs_async_timeout is the amount of time a thread will live after it
1064  * becomes idle, unless new I/O requests are received before the thread
 * dies.  See nfs4_async_putapage and nfs4_async_start.
1066  */
1067 
1068 static void	nfs4_async_start(struct vfs *);
1069 static void	nfs4_async_pgops_start(struct vfs *);
1070 static void	nfs4_async_common_start(struct vfs *, int);
1071 
1072 static void
1073 free_async_args4(struct nfs4_async_reqs *args)
1074 {
1075 	rnode4_t *rp;
1076 
1077 	if (args->a_io != NFS4_INACTIVE) {
1078 		rp = VTOR4(args->a_vp);
1079 		mutex_enter(&rp->r_statelock);
1080 		rp->r_count--;
1081 		if (args->a_io == NFS4_PUTAPAGE ||
1082 		    args->a_io == NFS4_PAGEIO)
1083 			rp->r_awcount--;
1084 		cv_broadcast(&rp->r_cv);
1085 		mutex_exit(&rp->r_statelock);
1086 		VN_RELE(args->a_vp);
1087 	}
1088 	crfree(args->a_cred);
1089 	kmem_free(args, sizeof (*args));
1090 }
1091 
1092 /*
1093  * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1094  * pageout(), running in the global zone, have legitimate reasons to do
1095  * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
 * use of a per-mount "asynchronous requests manager thread" which is
1097  * signaled by the various asynchronous work routines when there is
1098  * asynchronous work to be done.  It is responsible for creating new
1099  * worker threads if necessary, and notifying existing worker threads
1100  * that there is work to be done.
1101  *
1102  * In other words, it will "take the specifications from the customers and
1103  * give them to the engineers."
1104  *
1105  * Worker threads die off of their own accord if they are no longer
1106  * needed.
1107  *
1108  * This thread is killed when the zone is going away or the filesystem
1109  * is being unmounted.
1110  */
1111 void
1112 nfs4_async_manager(vfs_t *vfsp)
1113 {
1114 	callb_cpr_t cprinfo;
1115 	mntinfo4_t *mi;
1116 	uint_t max_threads;
1117 
1118 	mi = VFTOMI4(vfsp);
1119 
1120 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1121 	    "nfs4_async_manager");
1122 
1123 	mutex_enter(&mi->mi_async_lock);
1124 	/*
1125 	 * We want to stash the max number of threads that this mount was
1126 	 * allowed so we can use it later when the variable is set to zero as
1127 	 * part of the zone/mount going away.
1128 	 *
1129 	 * We want to be able to create at least one thread to handle
1130 	 * asynchronous inactive calls.
1131 	 */
1132 	max_threads = MAX(mi->mi_max_threads, 1);
1133 	/*
1134 	 * We don't want to wait for mi_max_threads to go to zero, since that
1135 	 * happens as part of a failed unmount, but this thread should only
1136 	 * exit when the mount is really going away.
1137 	 *
1138 	 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
1139 	 * attempted: the various _async_*() functions know to do things
1140 	 * inline if mi_max_threads == 0.  Henceforth we just drain out the
1141 	 * outstanding requests.
1142 	 *
1143 	 * Note that we still create zthreads even if we notice the zone is
1144 	 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
1145 	 * shutdown sequence to take slightly longer in some cases, but
1146 	 * doesn't violate the protocol, as all threads will exit as soon as
1147 	 * they're done processing the remaining requests.
1148 	 */
1149 	for (;;) {
1150 		while (mi->mi_async_req_count > 0) {
1151 			/*
1152 			 * Paranoia: If the mount started out having
1153 			 * (mi->mi_max_threads == 0), and the value was
1154 			 * later changed (via a debugger or somesuch),
1155 			 * we could be confused since we will think we
1156 			 * can't create any threads, and the calling
1157 			 * code (which looks at the current value of
1158 			 * mi->mi_max_threads, now non-zero) thinks we
1159 			 * can.
1160 			 *
1161 			 * So, because we're paranoid, we create threads
1162 			 * up to the maximum of the original and the
1163 			 * current value. This means that future
1164 			 * (debugger-induced) alterations of
1165 			 * mi->mi_max_threads are ignored for our
1166 			 * purposes, but who told them they could change
1167 			 * random values on a live kernel anyhow?
1168 			 */
1169 			if (mi->mi_threads[NFS4_ASYNC_QUEUE] <
1170 			    MAX(mi->mi_max_threads, max_threads)) {
1171 				mi->mi_threads[NFS4_ASYNC_QUEUE]++;
1172 				mutex_exit(&mi->mi_async_lock);
1173 				MI4_HOLD(mi);
1174 				VFS_HOLD(vfsp);	/* hold for new thread */
1175 				(void) zthread_create(NULL, 0, nfs4_async_start,
1176 				    vfsp, 0, minclsyspri);
1177 				mutex_enter(&mi->mi_async_lock);
1178 			} else if (mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] <
1179 			    NUM_ASYNC_PGOPS_THREADS) {
1180 				mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE]++;
1181 				mutex_exit(&mi->mi_async_lock);
1182 				MI4_HOLD(mi);
1183 				VFS_HOLD(vfsp); /* hold for new thread */
1184 				(void) zthread_create(NULL, 0,
1185 				    nfs4_async_pgops_start, vfsp, 0,
1186 				    minclsyspri);
1187 				mutex_enter(&mi->mi_async_lock);
1188 			}
1189 			NFS4_WAKE_ASYNC_WORKER(mi->mi_async_work_cv);
1190 			ASSERT(mi->mi_async_req_count != 0);
1191 			mi->mi_async_req_count--;
1192 		}
1193 
1194 		mutex_enter(&mi->mi_lock);
1195 		if (mi->mi_flags & MI4_ASYNC_MGR_STOP) {
1196 			mutex_exit(&mi->mi_lock);
1197 			break;
1198 		}
1199 		mutex_exit(&mi->mi_lock);
1200 
1201 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1202 		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1203 		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1204 	}
1205 
1206 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1207 	    "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
1208 	/*
1209 	 * Let everyone know we're done.
1210 	 */
1211 	mi->mi_manager_thread = NULL;
1212 	/*
1213 	 * Wake up the inactive thread.
1214 	 */
1215 	cv_broadcast(&mi->mi_inact_req_cv);
1216 	/*
1217 	 * Wake up anyone sitting in nfs4_async_manager_stop()
1218 	 */
1219 	cv_broadcast(&mi->mi_async_cv);
1220 	/*
1221 	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1222 	 * since CALLB_CPR_EXIT is actually responsible for releasing
1223 	 * 'mi_async_lock'.
1224 	 */
1225 	CALLB_CPR_EXIT(&cprinfo);
1226 	VFS_RELE(vfsp);	/* release thread's hold */
1227 	MI4_RELE(mi);
1228 	zthread_exit();
1229 }
1230 
1231 /*
1232  * Signal (and wait for) the async manager thread to clean up and go away.
1233  */
1234 void
1235 nfs4_async_manager_stop(vfs_t *vfsp)
1236 {
1237 	mntinfo4_t *mi = VFTOMI4(vfsp);
1238 
1239 	mutex_enter(&mi->mi_async_lock);
1240 	mutex_enter(&mi->mi_lock);
1241 	mi->mi_flags |= MI4_ASYNC_MGR_STOP;
1242 	mutex_exit(&mi->mi_lock);
1243 	cv_broadcast(&mi->mi_async_reqs_cv);
1244 	/*
1245 	 * Wait for the async manager thread to die.
1246 	 */
1247 	while (mi->mi_manager_thread != NULL)
1248 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1249 	mutex_exit(&mi->mi_async_lock);
1250 }
1251 
1252 int
1253 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1254     struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1255     u_offset_t, caddr_t, struct seg *, cred_t *))
1256 {
1257 	rnode4_t *rp;
1258 	mntinfo4_t *mi;
1259 	struct nfs4_async_reqs *args;
1260 
1261 	rp = VTOR4(vp);
1262 	ASSERT(rp->r_freef == NULL);
1263 
1264 	mi = VTOMI4(vp);
1265 
1266 	/*
1267 	 * If addr falls in a different segment, don't bother doing readahead.
1268 	 */
1269 	if (addr >= seg->s_base + seg->s_size)
1270 		return (-1);
1271 
1272 	/*
1273 	 * If we can't allocate a request structure, punt on the readahead.
1274 	 */
1275 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1276 		return (-1);
1277 
1278 	/*
1279 	 * If a lock operation is pending, don't initiate any new
1280 	 * readaheads.  Otherwise, bump r_count to indicate the new
1281 	 * asynchronous I/O.
1282 	 */
1283 	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1284 		kmem_free(args, sizeof (*args));
1285 		return (-1);
1286 	}
1287 	mutex_enter(&rp->r_statelock);
1288 	rp->r_count++;
1289 	mutex_exit(&rp->r_statelock);
1290 	nfs_rw_exit(&rp->r_lkserlock);
1291 
1292 	args->a_next = NULL;
1293 #ifdef DEBUG
1294 	args->a_queuer = curthread;
1295 #endif
1296 	VN_HOLD(vp);
1297 	args->a_vp = vp;
1298 	ASSERT(cr != NULL);
1299 	crhold(cr);
1300 	args->a_cred = cr;
1301 	args->a_io = NFS4_READ_AHEAD;
1302 	args->a_nfs4_readahead = readahead;
1303 	args->a_nfs4_blkoff = blkoff;
1304 	args->a_nfs4_seg = seg;
1305 	args->a_nfs4_addr = addr;
1306 
1307 	mutex_enter(&mi->mi_async_lock);
1308 
1309 	/*
1310 	 * If asyncio has been disabled, don't bother readahead.
1311 	 */
1312 	if (mi->mi_max_threads == 0) {
1313 		mutex_exit(&mi->mi_async_lock);
1314 		goto noasync;
1315 	}
1316 
1317 	/*
1318 	 * Link request structure into the async list and
1319 	 * wakeup async thread to do the i/o.
1320 	 */
1321 	if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
1322 		mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
1323 		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1324 	} else {
1325 		mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
1326 		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1327 	}
1328 
1329 	if (mi->mi_io_kstats) {
1330 		mutex_enter(&mi->mi_lock);
1331 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1332 		mutex_exit(&mi->mi_lock);
1333 	}
1334 
1335 	mi->mi_async_req_count++;
1336 	ASSERT(mi->mi_async_req_count != 0);
1337 	cv_signal(&mi->mi_async_reqs_cv);
1338 	mutex_exit(&mi->mi_async_lock);
1339 	return (0);
1340 
1341 noasync:
1342 	mutex_enter(&rp->r_statelock);
1343 	rp->r_count--;
1344 	cv_broadcast(&rp->r_cv);
1345 	mutex_exit(&rp->r_statelock);
1346 	VN_RELE(vp);
1347 	crfree(cr);
1348 	kmem_free(args, sizeof (*args));
1349 	return (-1);
1350 }
1351 
/*
 * Thread entry point handed to zthread_create() by nfs4_async_manager()
 * for workers counted in mi_threads[NFS4_ASYNC_QUEUE].
 */
static void
nfs4_async_start(struct vfs *vfsp)
{
	nfs4_async_common_start(vfsp, NFS4_ASYNC_QUEUE);
}
1357 
/*
 * Thread entry point handed to zthread_create() by nfs4_async_manager()
 * for workers counted in mi_threads[NFS4_ASYNC_PGOPS_QUEUE].
 */
static void
nfs4_async_pgops_start(struct vfs *vfsp)
{
	nfs4_async_common_start(vfsp, NFS4_ASYNC_PGOPS_QUEUE);
}
1363 
1364 /*
1365  * The async queues for each mounted file system are arranged as a
1366  * set of queues, one for each async i/o type.  Requests are taken
1367  * from the queues in a round-robin fashion.  A number of consecutive
1368  * requests are taken from each queue before moving on to the next
1369  * queue.  This functionality may allow the NFS Version 2 server to do
1370  * write clustering, even if the client is mixing writes and reads
1371  * because it will take multiple write requests from the queue
1372  * before processing any of the other async i/o types.
1373  *
1374  * XXX The nfs4_async_common_start thread is unsafe in the light of the present
1375  * model defined by cpr to suspend the system. Specifically over the
1376  * wire calls are cpr-unsafe. The thread should be reevaluated in
1377  * case of future updates to the cpr model.
1378  */
1379 static void
1380 nfs4_async_common_start(struct vfs *vfsp, int async_queue)
1381 {
1382 	struct nfs4_async_reqs *args;
1383 	mntinfo4_t *mi = VFTOMI4(vfsp);
1384 	clock_t time_left = 1;
1385 	callb_cpr_t cprinfo;
1386 	int i;
1387 	extern int nfs_async_timeout;
1388 	int async_types;
1389 	kcondvar_t *async_work_cv;
1390 
1391 	if (async_queue == NFS4_ASYNC_QUEUE) {
1392 		async_types = NFS4_ASYNC_TYPES;
1393 		async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_QUEUE];
1394 	} else {
1395 		async_types = NFS4_ASYNC_PGOPS_TYPES;
1396 		async_work_cv = &mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE];
1397 	}
1398 
1399 	/*
1400 	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
1401 	 * built in an implementation independent manner.
1402 	 */
1403 	if (nfs_async_timeout == -1)
1404 		nfs_async_timeout = NFS_ASYNC_TIMEOUT;
1405 
1406 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
1407 
1408 	mutex_enter(&mi->mi_async_lock);
1409 	for (;;) {
1410 		/*
1411 		 * Find the next queue containing an entry.  We start
1412 		 * at the current queue pointer and then round robin
1413 		 * through all of them until we either find a non-empty
1414 		 * queue or have looked through all of them.
1415 		 */
1416 		for (i = 0; i < async_types; i++) {
1417 			args = *mi->mi_async_curr[async_queue];
1418 			if (args != NULL)
1419 				break;
1420 			mi->mi_async_curr[async_queue]++;
1421 			if (mi->mi_async_curr[async_queue] ==
1422 			    &mi->mi_async_reqs[async_types]) {
1423 				mi->mi_async_curr[async_queue] =
1424 				    &mi->mi_async_reqs[0];
1425 			}
1426 		}
1427 		/*
1428 		 * If we didn't find a entry, then block until woken up
1429 		 * again and then look through the queues again.
1430 		 */
1431 		if (args == NULL) {
1432 			/*
1433 			 * Exiting is considered to be safe for CPR as well
1434 			 */
1435 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1436 
1437 			/*
1438 			 * Wakeup thread waiting to unmount the file
1439 			 * system only if all async threads are inactive.
1440 			 *
1441 			 * If we've timed-out and there's nothing to do,
1442 			 * then get rid of this thread.
1443 			 */
1444 			if (mi->mi_max_threads == 0 || time_left <= 0) {
1445 				--mi->mi_threads[async_queue];
1446 
1447 				if (mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
1448 				    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0)
1449 					cv_signal(&mi->mi_async_cv);
1450 				CALLB_CPR_EXIT(&cprinfo);
1451 				VFS_RELE(vfsp);	/* release thread's hold */
1452 				MI4_RELE(mi);
1453 				zthread_exit();
1454 				/* NOTREACHED */
1455 			}
1456 			time_left = cv_reltimedwait(async_work_cv,
1457 			    &mi->mi_async_lock, nfs_async_timeout,
1458 			    TR_CLOCK_TICK);
1459 
1460 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1461 
1462 			continue;
1463 		} else {
1464 			time_left = 1;
1465 		}
1466 
1467 		/*
1468 		 * Remove the request from the async queue and then
1469 		 * update the current async request queue pointer.  If
1470 		 * the current queue is empty or we have removed enough
1471 		 * consecutive entries from it, then reset the counter
1472 		 * for this queue and then move the current pointer to
1473 		 * the next queue.
1474 		 */
1475 		*mi->mi_async_curr[async_queue] = args->a_next;
1476 		if (*mi->mi_async_curr[async_queue] == NULL ||
1477 		    --mi->mi_async_clusters[args->a_io] == 0) {
1478 			mi->mi_async_clusters[args->a_io] =
1479 			    mi->mi_async_init_clusters;
1480 			mi->mi_async_curr[async_queue]++;
1481 			if (mi->mi_async_curr[async_queue] ==
1482 			    &mi->mi_async_reqs[async_types]) {
1483 				mi->mi_async_curr[async_queue] =
1484 				    &mi->mi_async_reqs[0];
1485 			}
1486 		}
1487 
1488 		if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
1489 			mutex_enter(&mi->mi_lock);
1490 			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1491 			mutex_exit(&mi->mi_lock);
1492 		}
1493 
1494 		mutex_exit(&mi->mi_async_lock);
1495 
1496 		/*
1497 		 * Obtain arguments from the async request structure.
1498 		 */
1499 		if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
1500 			(*args->a_nfs4_readahead)(args->a_vp,
1501 			    args->a_nfs4_blkoff, args->a_nfs4_addr,
1502 			    args->a_nfs4_seg, args->a_cred);
1503 		} else if (args->a_io == NFS4_PUTAPAGE) {
1504 			(void) (*args->a_nfs4_putapage)(args->a_vp,
1505 			    args->a_nfs4_pp, args->a_nfs4_off,
1506 			    args->a_nfs4_len, args->a_nfs4_flags,
1507 			    args->a_cred);
1508 		} else if (args->a_io == NFS4_PAGEIO) {
1509 			(void) (*args->a_nfs4_pageio)(args->a_vp,
1510 			    args->a_nfs4_pp, args->a_nfs4_off,
1511 			    args->a_nfs4_len, args->a_nfs4_flags,
1512 			    args->a_cred);
1513 		} else if (args->a_io == NFS4_READDIR) {
1514 			(void) ((*args->a_nfs4_readdir)(args->a_vp,
1515 			    args->a_nfs4_rdc, args->a_cred));
1516 		} else if (args->a_io == NFS4_COMMIT) {
1517 			(*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
1518 			    args->a_nfs4_offset, args->a_nfs4_count,
1519 			    args->a_cred);
1520 		} else if (args->a_io == NFS4_INACTIVE) {
1521 			nfs4_inactive_otw(args->a_vp, args->a_cred);
1522 		}
1523 
1524 		/*
1525 		 * Now, release the vnode and free the credentials
1526 		 * structure.
1527 		 */
1528 		free_async_args4(args);
1529 		/*
1530 		 * Reacquire the mutex because it will be needed above.
1531 		 */
1532 		mutex_enter(&mi->mi_async_lock);
1533 	}
1534 }
1535 
1536 /*
1537  * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
1538  * part of VOP_INACTIVE.
1539  */
1540 
1541 void
1542 nfs4_inactive_thread(mntinfo4_t *mi)
1543 {
1544 	struct nfs4_async_reqs *args;
1545 	callb_cpr_t cprinfo;
1546 	vfs_t *vfsp = mi->mi_vfsp;
1547 
1548 	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1549 	    "nfs4_inactive_thread");
1550 
1551 	for (;;) {
1552 		mutex_enter(&mi->mi_async_lock);
1553 		args = mi->mi_async_reqs[NFS4_INACTIVE];
1554 		if (args == NULL) {
1555 			mutex_enter(&mi->mi_lock);
1556 			/*
1557 			 * We don't want to exit until the async manager is done
1558 			 * with its work; hence the check for mi_manager_thread
1559 			 * being NULL.
1560 			 *
1561 			 * The async manager thread will cv_broadcast() on
1562 			 * mi_inact_req_cv when it's done, at which point we'll
1563 			 * wake up and exit.
1564 			 */
1565 			if (mi->mi_manager_thread == NULL)
1566 				goto die;
1567 			mi->mi_flags |= MI4_INACTIVE_IDLE;
1568 			mutex_exit(&mi->mi_lock);
1569 			cv_signal(&mi->mi_async_cv);
1570 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1571 			cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
1572 			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1573 			mutex_exit(&mi->mi_async_lock);
1574 		} else {
1575 			mutex_enter(&mi->mi_lock);
1576 			mi->mi_flags &= ~MI4_INACTIVE_IDLE;
1577 			mutex_exit(&mi->mi_lock);
1578 			mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
1579 			mutex_exit(&mi->mi_async_lock);
1580 			nfs4_inactive_otw(args->a_vp, args->a_cred);
1581 			crfree(args->a_cred);
1582 			kmem_free(args, sizeof (*args));
1583 		}
1584 	}
1585 die:
1586 	mutex_exit(&mi->mi_lock);
1587 	mi->mi_inactive_thread = NULL;
1588 	cv_signal(&mi->mi_async_cv);
1589 
1590 	/*
1591 	 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
1592 	 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
1593 	 */
1594 	CALLB_CPR_EXIT(&cprinfo);
1595 
1596 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
1597 	    "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));
1598 
1599 	MI4_RELE(mi);
1600 	zthread_exit();
1601 	/* NOTREACHED */
1602 }
1603 
1604 /*
1605  * nfs_async_stop:
1606  * Wait for all outstanding putpage operations and the inactive thread to
1607  * complete; nfs4_async_stop_sig() without interruptibility.
1608  */
1609 void
1610 nfs4_async_stop(struct vfs *vfsp)
1611 {
1612 	mntinfo4_t *mi = VFTOMI4(vfsp);
1613 
1614 	/*
1615 	 * Wait for all outstanding async operations to complete and for
1616 	 * worker threads to exit.
1617 	 */
1618 	mutex_enter(&mi->mi_async_lock);
1619 	mi->mi_max_threads = 0;
1620 	NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1621 	while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1622 	    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0)
1623 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1624 
1625 	/*
1626 	 * Wait for the inactive thread to finish doing what it's doing.  It
1627 	 * won't exit until the last reference to the vfs_t goes away.
1628 	 */
1629 	if (mi->mi_inactive_thread != NULL) {
1630 		mutex_enter(&mi->mi_lock);
1631 		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1632 		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1633 			mutex_exit(&mi->mi_lock);
1634 			cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1635 			mutex_enter(&mi->mi_lock);
1636 		}
1637 		mutex_exit(&mi->mi_lock);
1638 	}
1639 	mutex_exit(&mi->mi_async_lock);
1640 }
1641 
1642 /*
1643  * nfs_async_stop_sig:
1644  * Wait for all outstanding putpage operations and the inactive thread to
1645  * complete. If a signal is delivered we will abort and return non-zero;
1646  * otherwise return 0. Since this routine is called from nfs4_unmount, we
1647  * need to make it interruptible.
1648  */
1649 int
1650 nfs4_async_stop_sig(struct vfs *vfsp)
1651 {
1652 	mntinfo4_t *mi = VFTOMI4(vfsp);
1653 	ushort_t omax;
1654 	bool_t intr = FALSE;
1655 
1656 	/*
1657 	 * Wait for all outstanding putpage operations to complete and for
1658 	 * worker threads to exit.
1659 	 */
1660 	mutex_enter(&mi->mi_async_lock);
1661 	omax = mi->mi_max_threads;
1662 	mi->mi_max_threads = 0;
1663 	NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
1664 	while (mi->mi_threads[NFS4_ASYNC_QUEUE] != 0 ||
1665 	    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] != 0) {
1666 		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
1667 			intr = TRUE;
1668 			goto interrupted;
1669 		}
1670 	}
1671 
1672 	/*
1673 	 * Wait for the inactive thread to finish doing what it's doing.  It
1674 	 * won't exit until the a last reference to the vfs_t goes away.
1675 	 */
1676 	if (mi->mi_inactive_thread != NULL) {
1677 		mutex_enter(&mi->mi_lock);
1678 		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1679 		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1680 			mutex_exit(&mi->mi_lock);
1681 			if (!cv_wait_sig(&mi->mi_async_cv,
1682 			    &mi->mi_async_lock)) {
1683 				intr = TRUE;
1684 				goto interrupted;
1685 			}
1686 			mutex_enter(&mi->mi_lock);
1687 		}
1688 		mutex_exit(&mi->mi_lock);
1689 	}
1690 interrupted:
1691 	if (intr)
1692 		mi->mi_max_threads = omax;
1693 	mutex_exit(&mi->mi_async_lock);
1694 
1695 	return (intr);
1696 }
1697 
1698 int
1699 nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1700     int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1701     u_offset_t, size_t, int, cred_t *))
1702 {
1703 	rnode4_t *rp;
1704 	mntinfo4_t *mi;
1705 	struct nfs4_async_reqs *args;
1706 
1707 	ASSERT(flags & B_ASYNC);
1708 	ASSERT(vp->v_vfsp != NULL);
1709 
1710 	rp = VTOR4(vp);
1711 	ASSERT(rp->r_count > 0);
1712 
1713 	mi = VTOMI4(vp);
1714 
1715 	/*
1716 	 * If we can't allocate a request structure, do the putpage
1717 	 * operation synchronously in this thread's context.
1718 	 */
1719 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1720 		goto noasync;
1721 
1722 	args->a_next = NULL;
1723 #ifdef DEBUG
1724 	args->a_queuer = curthread;
1725 #endif
1726 	VN_HOLD(vp);
1727 	args->a_vp = vp;
1728 	ASSERT(cr != NULL);
1729 	crhold(cr);
1730 	args->a_cred = cr;
1731 	args->a_io = NFS4_PUTAPAGE;
1732 	args->a_nfs4_putapage = putapage;
1733 	args->a_nfs4_pp = pp;
1734 	args->a_nfs4_off = off;
1735 	args->a_nfs4_len = (uint_t)len;
1736 	args->a_nfs4_flags = flags;
1737 
1738 	mutex_enter(&mi->mi_async_lock);
1739 
1740 	/*
1741 	 * If asyncio has been disabled, then make a synchronous request.
1742 	 * This check is done a second time in case async io was diabled
1743 	 * while this thread was blocked waiting for memory pressure to
1744 	 * reduce or for the queue to drain.
1745 	 */
1746 	if (mi->mi_max_threads == 0) {
1747 		mutex_exit(&mi->mi_async_lock);
1748 
1749 		VN_RELE(vp);
1750 		crfree(cr);
1751 		kmem_free(args, sizeof (*args));
1752 		goto noasync;
1753 	}
1754 
1755 	/*
1756 	 * Link request structure into the async list and
1757 	 * wakeup async thread to do the i/o.
1758 	 */
1759 	if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
1760 		mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
1761 		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1762 	} else {
1763 		mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
1764 		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1765 	}
1766 
1767 	mutex_enter(&rp->r_statelock);
1768 	rp->r_count++;
1769 	rp->r_awcount++;
1770 	mutex_exit(&rp->r_statelock);
1771 
1772 	if (mi->mi_io_kstats) {
1773 		mutex_enter(&mi->mi_lock);
1774 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1775 		mutex_exit(&mi->mi_lock);
1776 	}
1777 
1778 	mi->mi_async_req_count++;
1779 	ASSERT(mi->mi_async_req_count != 0);
1780 	cv_signal(&mi->mi_async_reqs_cv);
1781 	mutex_exit(&mi->mi_async_lock);
1782 	return (0);
1783 
1784 noasync:
1785 
1786 	if (curproc == proc_pageout || curproc == proc_fsflush) {
1787 		/*
1788 		 * If we get here in the context of the pageout/fsflush,
1789 		 * or we have run out of memory or we're attempting to
1790 		 * unmount we refuse to do a sync write, because this may
1791 		 * hang pageout/fsflush and the machine. In this case,
1792 		 * we just re-mark the page as dirty and punt on the page.
1793 		 *
1794 		 * Make sure B_FORCE isn't set.  We can re-mark the
1795 		 * pages as dirty and unlock the pages in one swoop by
1796 		 * passing in B_ERROR to pvn_write_done().  However,
1797 		 * we should make sure B_FORCE isn't set - we don't
1798 		 * want the page tossed before it gets written out.
1799 		 */
1800 		if (flags & B_FORCE)
1801 			flags &= ~(B_INVAL | B_FORCE);
1802 		pvn_write_done(pp, flags | B_ERROR);
1803 		return (0);
1804 	}
1805 
1806 	if (nfs_zone() != mi->mi_zone) {
1807 		/*
1808 		 * So this was a cross-zone sync putpage.
1809 		 *
1810 		 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
1811 		 * as dirty and unlock them.
1812 		 *
1813 		 * We don't want to clear B_FORCE here as the caller presumably
1814 		 * knows what they're doing if they set it.
1815 		 */
1816 		pvn_write_done(pp, flags | B_ERROR);
1817 		return (EPERM);
1818 	}
1819 	return ((*putapage)(vp, pp, off, len, flags, cr));
1820 }
1821 
1822 int
1823 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1824     int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1825     size_t, int, cred_t *))
1826 {
1827 	rnode4_t *rp;
1828 	mntinfo4_t *mi;
1829 	struct nfs4_async_reqs *args;
1830 
1831 	ASSERT(flags & B_ASYNC);
1832 	ASSERT(vp->v_vfsp != NULL);
1833 
1834 	rp = VTOR4(vp);
1835 	ASSERT(rp->r_count > 0);
1836 
1837 	mi = VTOMI4(vp);
1838 
1839 	/*
1840 	 * If we can't allocate a request structure, do the pageio
1841 	 * request synchronously in this thread's context.
1842 	 */
1843 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1844 		goto noasync;
1845 
1846 	args->a_next = NULL;
1847 #ifdef DEBUG
1848 	args->a_queuer = curthread;
1849 #endif
1850 	VN_HOLD(vp);
1851 	args->a_vp = vp;
1852 	ASSERT(cr != NULL);
1853 	crhold(cr);
1854 	args->a_cred = cr;
1855 	args->a_io = NFS4_PAGEIO;
1856 	args->a_nfs4_pageio = pageio;
1857 	args->a_nfs4_pp = pp;
1858 	args->a_nfs4_off = io_off;
1859 	args->a_nfs4_len = (uint_t)io_len;
1860 	args->a_nfs4_flags = flags;
1861 
1862 	mutex_enter(&mi->mi_async_lock);
1863 
1864 	/*
1865 	 * If asyncio has been disabled, then make a synchronous request.
1866 	 * This check is done a second time in case async io was diabled
1867 	 * while this thread was blocked waiting for memory pressure to
1868 	 * reduce or for the queue to drain.
1869 	 */
1870 	if (mi->mi_max_threads == 0) {
1871 		mutex_exit(&mi->mi_async_lock);
1872 
1873 		VN_RELE(vp);
1874 		crfree(cr);
1875 		kmem_free(args, sizeof (*args));
1876 		goto noasync;
1877 	}
1878 
1879 	/*
1880 	 * Link request structure into the async list and
1881 	 * wakeup async thread to do the i/o.
1882 	 */
1883 	if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
1884 		mi->mi_async_reqs[NFS4_PAGEIO] = args;
1885 		mi->mi_async_tail[NFS4_PAGEIO] = args;
1886 	} else {
1887 		mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
1888 		mi->mi_async_tail[NFS4_PAGEIO] = args;
1889 	}
1890 
1891 	mutex_enter(&rp->r_statelock);
1892 	rp->r_count++;
1893 	rp->r_awcount++;
1894 	mutex_exit(&rp->r_statelock);
1895 
1896 	if (mi->mi_io_kstats) {
1897 		mutex_enter(&mi->mi_lock);
1898 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1899 		mutex_exit(&mi->mi_lock);
1900 	}
1901 
1902 	mi->mi_async_req_count++;
1903 	ASSERT(mi->mi_async_req_count != 0);
1904 	cv_signal(&mi->mi_async_reqs_cv);
1905 	mutex_exit(&mi->mi_async_lock);
1906 	return (0);
1907 
1908 noasync:
1909 	/*
1910 	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
1911 	 * the page list), for writes we do it synchronously, except for
1912 	 * proc_pageout/proc_fsflush as described below.
1913 	 */
1914 	if (flags & B_READ) {
1915 		pvn_read_done(pp, flags | B_ERROR);
1916 		return (0);
1917 	}
1918 
1919 	if (curproc == proc_pageout || curproc == proc_fsflush) {
1920 		/*
1921 		 * If we get here in the context of the pageout/fsflush,
1922 		 * we refuse to do a sync write, because this may hang
1923 		 * pageout/fsflush (and the machine). In this case, we just
1924 		 * re-mark the page as dirty and punt on the page.
1925 		 *
1926 		 * Make sure B_FORCE isn't set.  We can re-mark the
1927 		 * pages as dirty and unlock the pages in one swoop by
1928 		 * passing in B_ERROR to pvn_write_done().  However,
1929 		 * we should make sure B_FORCE isn't set - we don't
1930 		 * want the page tossed before it gets written out.
1931 		 */
1932 		if (flags & B_FORCE)
1933 			flags &= ~(B_INVAL | B_FORCE);
1934 		pvn_write_done(pp, flags | B_ERROR);
1935 		return (0);
1936 	}
1937 
1938 	if (nfs_zone() != mi->mi_zone) {
1939 		/*
1940 		 * So this was a cross-zone sync pageio.  We pass in B_ERROR
1941 		 * to pvn_write_done() to re-mark the pages as dirty and unlock
1942 		 * them.
1943 		 *
1944 		 * We don't want to clear B_FORCE here as the caller presumably
1945 		 * knows what they're doing if they set it.
1946 		 */
1947 		pvn_write_done(pp, flags | B_ERROR);
1948 		return (EPERM);
1949 	}
1950 	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1951 }
1952 
1953 void
1954 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
1955     int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
1956 {
1957 	rnode4_t *rp;
1958 	mntinfo4_t *mi;
1959 	struct nfs4_async_reqs *args;
1960 
1961 	rp = VTOR4(vp);
1962 	ASSERT(rp->r_freef == NULL);
1963 
1964 	mi = VTOMI4(vp);
1965 
1966 	/*
1967 	 * If we can't allocate a request structure, skip the readdir.
1968 	 */
1969 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1970 		goto noasync;
1971 
1972 	args->a_next = NULL;
1973 #ifdef DEBUG
1974 	args->a_queuer = curthread;
1975 #endif
1976 	VN_HOLD(vp);
1977 	args->a_vp = vp;
1978 	ASSERT(cr != NULL);
1979 	crhold(cr);
1980 	args->a_cred = cr;
1981 	args->a_io = NFS4_READDIR;
1982 	args->a_nfs4_readdir = readdir;
1983 	args->a_nfs4_rdc = rdc;
1984 
1985 	mutex_enter(&mi->mi_async_lock);
1986 
1987 	/*
1988 	 * If asyncio has been disabled, then skip this request
1989 	 */
1990 	if (mi->mi_max_threads == 0) {
1991 		mutex_exit(&mi->mi_async_lock);
1992 
1993 		VN_RELE(vp);
1994 		crfree(cr);
1995 		kmem_free(args, sizeof (*args));
1996 		goto noasync;
1997 	}
1998 
1999 	/*
2000 	 * Link request structure into the async list and
2001 	 * wakeup async thread to do the i/o.
2002 	 */
2003 	if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
2004 		mi->mi_async_reqs[NFS4_READDIR] = args;
2005 		mi->mi_async_tail[NFS4_READDIR] = args;
2006 	} else {
2007 		mi->mi_async_tail[NFS4_READDIR]->a_next = args;
2008 		mi->mi_async_tail[NFS4_READDIR] = args;
2009 	}
2010 
2011 	mutex_enter(&rp->r_statelock);
2012 	rp->r_count++;
2013 	mutex_exit(&rp->r_statelock);
2014 
2015 	if (mi->mi_io_kstats) {
2016 		mutex_enter(&mi->mi_lock);
2017 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2018 		mutex_exit(&mi->mi_lock);
2019 	}
2020 
2021 	mi->mi_async_req_count++;
2022 	ASSERT(mi->mi_async_req_count != 0);
2023 	cv_signal(&mi->mi_async_reqs_cv);
2024 	mutex_exit(&mi->mi_async_lock);
2025 	return;
2026 
2027 noasync:
2028 	mutex_enter(&rp->r_statelock);
2029 	rdc->entries = NULL;
2030 	/*
2031 	 * Indicate that no one is trying to fill this entry and
2032 	 * it still needs to be filled.
2033 	 */
2034 	rdc->flags &= ~RDDIR;
2035 	rdc->flags |= RDDIRREQ;
2036 	rddir4_cache_rele(rp, rdc);
2037 	mutex_exit(&rp->r_statelock);
2038 }
2039 
2040 void
2041 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
2042     cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
2043     cred_t *))
2044 {
2045 	rnode4_t *rp;
2046 	mntinfo4_t *mi;
2047 	struct nfs4_async_reqs *args;
2048 	page_t *pp;
2049 
2050 	rp = VTOR4(vp);
2051 	mi = VTOMI4(vp);
2052 
2053 	/*
2054 	 * If we can't allocate a request structure, do the commit
2055 	 * operation synchronously in this thread's context.
2056 	 */
2057 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
2058 		goto noasync;
2059 
2060 	args->a_next = NULL;
2061 #ifdef DEBUG
2062 	args->a_queuer = curthread;
2063 #endif
2064 	VN_HOLD(vp);
2065 	args->a_vp = vp;
2066 	ASSERT(cr != NULL);
2067 	crhold(cr);
2068 	args->a_cred = cr;
2069 	args->a_io = NFS4_COMMIT;
2070 	args->a_nfs4_commit = commit;
2071 	args->a_nfs4_plist = plist;
2072 	args->a_nfs4_offset = offset;
2073 	args->a_nfs4_count = count;
2074 
2075 	mutex_enter(&mi->mi_async_lock);
2076 
2077 	/*
2078 	 * If asyncio has been disabled, then make a synchronous request.
2079 	 * This check is done a second time in case async io was diabled
2080 	 * while this thread was blocked waiting for memory pressure to
2081 	 * reduce or for the queue to drain.
2082 	 */
2083 	if (mi->mi_max_threads == 0) {
2084 		mutex_exit(&mi->mi_async_lock);
2085 
2086 		VN_RELE(vp);
2087 		crfree(cr);
2088 		kmem_free(args, sizeof (*args));
2089 		goto noasync;
2090 	}
2091 
2092 	/*
2093 	 * Link request structure into the async list and
2094 	 * wakeup async thread to do the i/o.
2095 	 */
2096 	if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
2097 		mi->mi_async_reqs[NFS4_COMMIT] = args;
2098 		mi->mi_async_tail[NFS4_COMMIT] = args;
2099 	} else {
2100 		mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
2101 		mi->mi_async_tail[NFS4_COMMIT] = args;
2102 	}
2103 
2104 	mutex_enter(&rp->r_statelock);
2105 	rp->r_count++;
2106 	mutex_exit(&rp->r_statelock);
2107 
2108 	if (mi->mi_io_kstats) {
2109 		mutex_enter(&mi->mi_lock);
2110 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2111 		mutex_exit(&mi->mi_lock);
2112 	}
2113 
2114 	mi->mi_async_req_count++;
2115 	ASSERT(mi->mi_async_req_count != 0);
2116 	cv_signal(&mi->mi_async_reqs_cv);
2117 	mutex_exit(&mi->mi_async_lock);
2118 	return;
2119 
2120 noasync:
2121 	if (curproc == proc_pageout || curproc == proc_fsflush ||
2122 	    nfs_zone() != mi->mi_zone) {
2123 		while (plist != NULL) {
2124 			pp = plist;
2125 			page_sub(&plist, pp);
2126 			pp->p_fsdata = C_COMMIT;
2127 			page_unlock(pp);
2128 		}
2129 		return;
2130 	}
2131 	(*commit)(vp, plist, offset, count, cr);
2132 }
2133 
2134 /*
2135  * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread.  The
2136  * reference to the vnode is handed over to the thread; the caller should
2137  * no longer refer to the vnode.
2138  *
2139  * Unlike most of the async routines, this handoff is needed for
2140  * correctness reasons, not just performance.  So doing operations in the
2141  * context of the current thread is not an option.
2142  */
2143 void
2144 nfs4_async_inactive(vnode_t *vp, cred_t *cr)
2145 {
2146 	mntinfo4_t *mi;
2147 	struct nfs4_async_reqs *args;
2148 	boolean_t signal_inactive_thread = B_FALSE;
2149 
2150 	mi = VTOMI4(vp);
2151 
2152 	args = kmem_alloc(sizeof (*args), KM_SLEEP);
2153 	args->a_next = NULL;
2154 #ifdef DEBUG
2155 	args->a_queuer = curthread;
2156 #endif
2157 	args->a_vp = vp;
2158 	ASSERT(cr != NULL);
2159 	crhold(cr);
2160 	args->a_cred = cr;
2161 	args->a_io = NFS4_INACTIVE;
2162 
2163 	/*
2164 	 * Note that we don't check mi->mi_max_threads here, since we
2165 	 * *need* to get rid of this vnode regardless of whether someone
2166 	 * set nfs4_max_threads to zero in /etc/system.
2167 	 *
2168 	 * The manager thread knows about this and is willing to create
2169 	 * at least one thread to accommodate us.
2170 	 */
2171 	mutex_enter(&mi->mi_async_lock);
2172 	if (mi->mi_inactive_thread == NULL) {
2173 		rnode4_t *rp;
2174 		vnode_t *unldvp = NULL;
2175 		char *unlname;
2176 		cred_t *unlcred;
2177 
2178 		mutex_exit(&mi->mi_async_lock);
2179 		/*
2180 		 * We just need to free up the memory associated with the
2181 		 * vnode, which can be safely done from within the current
2182 		 * context.
2183 		 */
2184 		crfree(cr);	/* drop our reference */
2185 		kmem_free(args, sizeof (*args));
2186 		rp = VTOR4(vp);
2187 		mutex_enter(&rp->r_statelock);
2188 		if (rp->r_unldvp != NULL) {
2189 			unldvp = rp->r_unldvp;
2190 			rp->r_unldvp = NULL;
2191 			unlname = rp->r_unlname;
2192 			rp->r_unlname = NULL;
2193 			unlcred = rp->r_unlcred;
2194 			rp->r_unlcred = NULL;
2195 		}
2196 		mutex_exit(&rp->r_statelock);
2197 		/*
2198 		 * No need to explicitly throw away any cached pages.  The
2199 		 * eventual r4inactive() will attempt a synchronous
2200 		 * VOP_PUTPAGE() which will immediately fail since the request
2201 		 * is coming from the wrong zone, and then will proceed to call
2202 		 * nfs4_invalidate_pages() which will clean things up for us.
2203 		 *
2204 		 * Throw away the delegation here so rp4_addfree()'s attempt to
2205 		 * return any existing delegations becomes a no-op.
2206 		 */
2207 		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
2208 			(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
2209 			    FALSE);
2210 			(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
2211 			nfs_rw_exit(&mi->mi_recovlock);
2212 		}
2213 		nfs4_clear_open_streams(rp);
2214 
2215 		rp4_addfree(rp, cr);
2216 		if (unldvp != NULL) {
2217 			kmem_free(unlname, MAXNAMELEN);
2218 			VN_RELE(unldvp);
2219 			crfree(unlcred);
2220 		}
2221 		return;
2222 	}
2223 
2224 	if (mi->mi_manager_thread == NULL) {
2225 		/*
2226 		 * We want to talk to the inactive thread.
2227 		 */
2228 		signal_inactive_thread = B_TRUE;
2229 	}
2230 
2231 	/*
2232 	 * Enqueue the vnode and wake up either the special thread (empty
2233 	 * list) or an async thread.
2234 	 */
2235 	if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) {
2236 		mi->mi_async_reqs[NFS4_INACTIVE] = args;
2237 		mi->mi_async_tail[NFS4_INACTIVE] = args;
2238 		signal_inactive_thread = B_TRUE;
2239 	} else {
2240 		mi->mi_async_tail[NFS4_INACTIVE]->a_next = args;
2241 		mi->mi_async_tail[NFS4_INACTIVE] = args;
2242 	}
2243 	if (signal_inactive_thread) {
2244 		cv_signal(&mi->mi_inact_req_cv);
2245 	} else  {
2246 		mi->mi_async_req_count++;
2247 		ASSERT(mi->mi_async_req_count != 0);
2248 		cv_signal(&mi->mi_async_reqs_cv);
2249 	}
2250 
2251 	mutex_exit(&mi->mi_async_lock);
2252 }
2253 
2254 int
2255 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2256 {
2257 	int pagecreate;
2258 	int n;
2259 	int saved_n;
2260 	caddr_t saved_base;
2261 	u_offset_t offset;
2262 	int error;
2263 	int sm_error;
2264 	vnode_t *vp = RTOV(rp);
2265 
2266 	ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2267 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2268 	if (!vpm_enable) {
2269 		ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2270 	}
2271 
2272 	/*
2273 	 * Move bytes in at most PAGESIZE chunks. We must avoid
2274 	 * spanning pages in uiomove() because page faults may cause
2275 	 * the cache to be invalidated out from under us. The r_size is not
2276 	 * updated until after the uiomove. If we push the last page of a
2277 	 * file before r_size is correct, we will lose the data written past
2278 	 * the current (and invalid) r_size.
2279 	 */
2280 	do {
2281 		offset = uio->uio_loffset;
2282 		pagecreate = 0;
2283 
2284 		/*
2285 		 * n is the number of bytes required to satisfy the request
2286 		 *   or the number of bytes to fill out the page.
2287 		 */
2288 		n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2289 
2290 		/*
2291 		 * Check to see if we can skip reading in the page
2292 		 * and just allocate the memory.  We can do this
2293 		 * if we are going to rewrite the entire mapping
2294 		 * or if we are going to write to or beyond the current
2295 		 * end of file from the beginning of the mapping.
2296 		 *
2297 		 * The read of r_size is now protected by r_statelock.
2298 		 */
2299 		mutex_enter(&rp->r_statelock);
2300 		/*
2301 		 * When pgcreated is nonzero the caller has already done
2302 		 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2303 		 * segkpm this means we already have at least one page
2304 		 * created and mapped at base.
2305 		 */
2306 		pagecreate = pgcreated ||
2307 		    ((offset & PAGEOFFSET) == 0 &&
2308 		    (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2309 
2310 		mutex_exit(&rp->r_statelock);
2311 
2312 		if (!vpm_enable && pagecreate) {
2313 			/*
2314 			 * The last argument tells segmap_pagecreate() to
2315 			 * always lock the page, as opposed to sometimes
2316 			 * returning with the page locked. This way we avoid a
2317 			 * fault on the ensuing uiomove(), but also
2318 			 * more importantly (to fix bug 1094402) we can
2319 			 * call segmap_fault() to unlock the page in all
2320 			 * cases. An alternative would be to modify
2321 			 * segmap_pagecreate() to tell us when it is
2322 			 * locking a page, but that's a fairly major
2323 			 * interface change.
2324 			 */
2325 			if (pgcreated == 0)
2326 				(void) segmap_pagecreate(segkmap, base,
2327 				    (uint_t)n, 1);
2328 			saved_base = base;
2329 			saved_n = n;
2330 		}
2331 
2332 		/*
2333 		 * The number of bytes of data in the last page can not
2334 		 * be accurately be determined while page is being
2335 		 * uiomove'd to and the size of the file being updated.
2336 		 * Thus, inform threads which need to know accurately
2337 		 * how much data is in the last page of the file.  They
2338 		 * will not do the i/o immediately, but will arrange for
2339 		 * the i/o to happen later when this modify operation
2340 		 * will have finished.
2341 		 */
2342 		ASSERT(!(rp->r_flags & R4MODINPROGRESS));
2343 		mutex_enter(&rp->r_statelock);
2344 		rp->r_flags |= R4MODINPROGRESS;
2345 		rp->r_modaddr = (offset & MAXBMASK);
2346 		mutex_exit(&rp->r_statelock);
2347 
2348 		if (vpm_enable) {
2349 			/*
2350 			 * Copy data. If new pages are created, part of
2351 			 * the page that is not written will be initizliazed
2352 			 * with zeros.
2353 			 */
2354 			error = vpm_data_copy(vp, offset, n, uio,
2355 			    !pagecreate, NULL, 0, S_WRITE);
2356 		} else {
2357 			error = uiomove(base, n, UIO_WRITE, uio);
2358 		}
2359 
2360 		/*
2361 		 * r_size is the maximum number of
2362 		 * bytes known to be in the file.
2363 		 * Make sure it is at least as high as the
2364 		 * first unwritten byte pointed to by uio_loffset.
2365 		 */
2366 		mutex_enter(&rp->r_statelock);
2367 		if (rp->r_size < uio->uio_loffset)
2368 			rp->r_size = uio->uio_loffset;
2369 		rp->r_flags &= ~R4MODINPROGRESS;
2370 		rp->r_flags |= R4DIRTY;
2371 		mutex_exit(&rp->r_statelock);
2372 
2373 		/* n = # of bytes written */
2374 		n = (int)(uio->uio_loffset - offset);
2375 
2376 		if (!vpm_enable) {
2377 			base += n;
2378 		}
2379 
2380 		tcount -= n;
2381 		/*
2382 		 * If we created pages w/o initializing them completely,
2383 		 * we need to zero the part that wasn't set up.
2384 		 * This happens on a most EOF write cases and if
2385 		 * we had some sort of error during the uiomove.
2386 		 */
2387 		if (!vpm_enable && pagecreate) {
2388 			if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2389 				(void) kzero(base, PAGESIZE - n);
2390 
2391 			if (pgcreated) {
2392 				/*
2393 				 * Caller is responsible for this page,
2394 				 * it was not created in this loop.
2395 				 */
2396 				pgcreated = 0;
2397 			} else {
2398 				/*
2399 				 * For bug 1094402: segmap_pagecreate locks
2400 				 * page. Unlock it. This also unlocks the
2401 				 * pages allocated by page_create_va() in
2402 				 * segmap_pagecreate().
2403 				 */
2404 				sm_error = segmap_fault(kas.a_hat, segkmap,
2405 				    saved_base, saved_n,
2406 				    F_SOFTUNLOCK, S_WRITE);
2407 				if (error == 0)
2408 					error = sm_error;
2409 			}
2410 		}
2411 	} while (tcount > 0 && error == 0);
2412 
2413 	return (error);
2414 }
2415 
2416 int
2417 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2418 {
2419 	rnode4_t *rp;
2420 	page_t *pp;
2421 	u_offset_t eoff;
2422 	u_offset_t io_off;
2423 	size_t io_len;
2424 	int error;
2425 	int rdirty;
2426 	int err;
2427 
2428 	rp = VTOR4(vp);
2429 	ASSERT(rp->r_count > 0);
2430 
2431 	if (!nfs4_has_pages(vp))
2432 		return (0);
2433 
2434 	ASSERT(vp->v_type != VCHR);
2435 
2436 	/*
2437 	 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL
2438 	 * writes.  B_FORCE is set to force the VM system to actually
2439 	 * invalidate the pages, even if the i/o failed.  The pages
2440 	 * need to get invalidated because they can't be written out
2441 	 * because there isn't any space left on either the server's
2442 	 * file system or in the user's disk quota.  The B_FREE bit
2443 	 * is cleared to avoid confusion as to whether this is a
2444 	 * request to place the page on the freelist or to destroy
2445 	 * it.
2446 	 */
2447 	if ((rp->r_flags & R4OUTOFSPACE) ||
2448 	    (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2449 		flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2450 
2451 	if (len == 0) {
2452 		/*
2453 		 * If doing a full file synchronous operation, then clear
2454 		 * the R4DIRTY bit.  If a page gets dirtied while the flush
2455 		 * is happening, then R4DIRTY will get set again.  The
2456 		 * R4DIRTY bit must get cleared before the flush so that
2457 		 * we don't lose this information.
2458 		 *
2459 		 * If there are no full file async write operations
2460 		 * pending and RDIRTY bit is set, clear it.
2461 		 */
2462 		if (off == (u_offset_t)0 &&
2463 		    !(flags & B_ASYNC) &&
2464 		    (rp->r_flags & R4DIRTY)) {
2465 			mutex_enter(&rp->r_statelock);
2466 			rdirty = (rp->r_flags & R4DIRTY);
2467 			rp->r_flags &= ~R4DIRTY;
2468 			mutex_exit(&rp->r_statelock);
2469 		} else if (flags & B_ASYNC && off == (u_offset_t)0) {
2470 			mutex_enter(&rp->r_statelock);
2471 			if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) {
2472 				rdirty = (rp->r_flags & R4DIRTY);
2473 				rp->r_flags &= ~R4DIRTY;
2474 			}
2475 			mutex_exit(&rp->r_statelock);
2476 		} else
2477 			rdirty = 0;
2478 
2479 		/*
2480 		 * Search the entire vp list for pages >= off, and flush
2481 		 * the dirty pages.
2482 		 */
2483 		error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2484 		    flags, cr);
2485 
2486 		/*
2487 		 * If an error occurred and the file was marked as dirty
2488 		 * before and we aren't forcibly invalidating pages, then
2489 		 * reset the R4DIRTY flag.
2490 		 */
2491 		if (error && rdirty &&
2492 		    (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2493 			mutex_enter(&rp->r_statelock);
2494 			rp->r_flags |= R4DIRTY;
2495 			mutex_exit(&rp->r_statelock);
2496 		}
2497 	} else {
2498 		/*
2499 		 * Do a range from [off...off + len) looking for pages
2500 		 * to deal with.
2501 		 */
2502 		error = 0;
2503 		io_len = 0;
2504 		eoff = off + len;
2505 		mutex_enter(&rp->r_statelock);
2506 		for (io_off = off; io_off < eoff && io_off < rp->r_size;
2507 		    io_off += io_len) {
2508 			mutex_exit(&rp->r_statelock);
2509 			/*
2510 			 * If we are not invalidating, synchronously
2511 			 * freeing or writing pages use the routine
2512 			 * page_lookup_nowait() to prevent reclaiming
2513 			 * them from the free list.
2514 			 */
2515 			if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2516 				pp = page_lookup(vp, io_off,
2517 				    (flags & (B_INVAL | B_FREE)) ?
2518 				    SE_EXCL : SE_SHARED);
2519 			} else {
2520 				pp = page_lookup_nowait(vp, io_off,
2521 				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2522 			}
2523 
2524 			if (pp == NULL || !pvn_getdirty(pp, flags))
2525 				io_len = PAGESIZE;
2526 			else {
2527 				err = (*rp->r_putapage)(vp, pp, &io_off,
2528 				    &io_len, flags, cr);
2529 				if (!error)
2530 					error = err;
2531 				/*
2532 				 * "io_off" and "io_len" are returned as
2533 				 * the range of pages we actually wrote.
2534 				 * This allows us to skip ahead more quickly
2535 				 * since several pages may've been dealt
2536 				 * with by this iteration of the loop.
2537 				 */
2538 			}
2539 			mutex_enter(&rp->r_statelock);
2540 		}
2541 		mutex_exit(&rp->r_statelock);
2542 	}
2543 
2544 	return (error);
2545 }
2546 
2547 void
2548 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2549 {
2550 	rnode4_t *rp;
2551 
2552 	rp = VTOR4(vp);
2553 	if (IS_SHADOW(vp, rp))
2554 		vp = RTOV4(rp);
2555 	mutex_enter(&rp->r_statelock);
2556 	while (rp->r_flags & R4TRUNCATE)
2557 		cv_wait(&rp->r_cv, &rp->r_statelock);
2558 	rp->r_flags |= R4TRUNCATE;
2559 	if (off == (u_offset_t)0) {
2560 		rp->r_flags &= ~R4DIRTY;
2561 		if (!(rp->r_flags & R4STALE))
2562 			rp->r_error = 0;
2563 	}
2564 	rp->r_truncaddr = off;
2565 	mutex_exit(&rp->r_statelock);
2566 	(void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2567 	    B_INVAL | B_TRUNC, cr);
2568 	mutex_enter(&rp->r_statelock);
2569 	rp->r_flags &= ~R4TRUNCATE;
2570 	cv_broadcast(&rp->r_cv);
2571 	mutex_exit(&rp->r_statelock);
2572 }
2573 
2574 static int
2575 nfs4_mnt_kstat_update(kstat_t *ksp, int rw)
2576 {
2577 	mntinfo4_t *mi;
2578 	struct mntinfo_kstat *mik;
2579 	vfs_t *vfsp;
2580 
2581 	/* this is a read-only kstat. Bail out on a write */
2582 	if (rw == KSTAT_WRITE)
2583 		return (EACCES);
2584 
2585 
2586 	/*
2587 	 * We don't want to wait here as kstat_chain_lock could be held by
2588 	 * dounmount(). dounmount() takes vfs_reflock before the chain lock
2589 	 * and thus could lead to a deadlock.
2590 	 */
2591 	vfsp = (struct vfs *)ksp->ks_private;
2592 
2593 	mi = VFTOMI4(vfsp);
2594 	mik = (struct mntinfo_kstat *)ksp->ks_data;
2595 
2596 	(void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
2597 
2598 	mik->mik_vers = (uint32_t)mi->mi_vers;
2599 	mik->mik_flags = mi->mi_flags;
2600 	/*
2601 	 * The sv_secdata holds the flavor the client specifies.
2602 	 * If the client uses default and a security negotiation
2603 	 * occurs, sv_currsec will point to the current flavor
2604 	 * selected from the server flavor list.
2605 	 * sv_currsec is NULL if no security negotiation takes place.
2606 	 */
2607 	mik->mik_secmod = mi->mi_curr_serv->sv_currsec ?
2608 	    mi->mi_curr_serv->sv_currsec->secmod :
2609 	    mi->mi_curr_serv->sv_secdata->secmod;
2610 	mik->mik_curread = (uint32_t)mi->mi_curread;
2611 	mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
2612 	mik->mik_retrans = mi->mi_retrans;
2613 	mik->mik_timeo = mi->mi_timeo;
2614 	mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
2615 	mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
2616 	mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
2617 	mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
2618 	mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
2619 	mik->mik_failover = (uint32_t)mi->mi_failover;
2620 	mik->mik_remap = (uint32_t)mi->mi_remap;
2621 
2622 	(void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
2623 
2624 	return (0);
2625 }
2626 
2627 void
2628 nfs4_mnt_kstat_init(struct vfs *vfsp)
2629 {
2630 	mntinfo4_t *mi = VFTOMI4(vfsp);
2631 
2632 	/*
2633 	 * PSARC 2001/697 Contract Private Interface
2634 	 * All nfs kstats are under SunMC contract
2635 	 * Please refer to the PSARC listed above and contact
2636 	 * SunMC before making any changes!
2637 	 *
2638 	 * Changes must be reviewed by Solaris File Sharing
2639 	 * Changes must be communicated to contract-2001-697@sun.com
2640 	 *
2641 	 */
2642 
2643 	mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
2644 	    NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
2645 	if (mi->mi_io_kstats) {
2646 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2647 			kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
2648 		mi->mi_io_kstats->ks_lock = &mi->mi_lock;
2649 		kstat_install(mi->mi_io_kstats);
2650 	}
2651 
2652 	if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
2653 	    getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
2654 	    sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
2655 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2656 			kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
2657 		mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update;
2658 		mi->mi_ro_kstats->ks_private = (void *)vfsp;
2659 		kstat_install(mi->mi_ro_kstats);
2660 	}
2661 
2662 	nfs4_mnt_recov_kstat_init(vfsp);
2663 }
2664 
2665 void
2666 nfs4_write_error(vnode_t *vp, int error, cred_t *cr)
2667 {
2668 	mntinfo4_t *mi;
2669 	clock_t now = ddi_get_lbolt();
2670 
2671 	mi = VTOMI4(vp);
2672 	/*
2673 	 * In case of forced unmount, do not print any messages
2674 	 * since it can flood the console with error messages.
2675 	 */
2676 	if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
2677 		return;
2678 
2679 	/*
2680 	 * If the mount point is dead, not recoverable, do not
2681 	 * print error messages that can flood the console.
2682 	 */
2683 	if (mi->mi_flags & MI4_RECOV_FAIL)
2684 		return;
2685 
2686 	/*
2687 	 * No use in flooding the console with ENOSPC
2688 	 * messages from the same file system.
2689 	 */
2690 	if ((error != ENOSPC && error != EDQUOT) ||
2691 	    now - mi->mi_printftime > 0) {
2692 		zoneid_t zoneid = mi->mi_zone->zone_id;
2693 
2694 #ifdef DEBUG
2695 		nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2696 		    mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL);
2697 #else
2698 		nfs_perror(error, "NFS write error on host %s: %m.\n",
2699 		    VTOR4(vp)->r_server->sv_hostname, NULL);
2700 #endif
2701 		if (error == ENOSPC || error == EDQUOT) {
2702 			zcmn_err(zoneid, CE_CONT,
2703 			    "^File: userid=%d, groupid=%d\n",
2704 			    crgetuid(cr), crgetgid(cr));
2705 			if (crgetuid(curthread->t_cred) != crgetuid(cr) ||
2706 			    crgetgid(curthread->t_cred) != crgetgid(cr)) {
2707 				zcmn_err(zoneid, CE_CONT,
2708 				    "^User: userid=%d, groupid=%d\n",
2709 				    crgetuid(curthread->t_cred),
2710 				    crgetgid(curthread->t_cred));
2711 			}
2712 			mi->mi_printftime = now +
2713 			    nfs_write_error_interval * hz;
2714 		}
2715 		sfh4_printfhandle(VTOR4(vp)->r_fh);
2716 #ifdef DEBUG
2717 		if (error == EACCES) {
2718 			zcmn_err(zoneid, CE_CONT,
2719 			    "nfs_bio: cred is%s kcred\n",
2720 			    cr == kcred ? "" : " not");
2721 		}
2722 #endif
2723 	}
2724 }
2725 
2726 /*
2727  * Return non-zero if the given file can be safely memory mapped.  Locks
2728  * are safe if whole-file (length and offset are both zero).
2729  */
2730 
2731 #define	SAFE_LOCK(flk)	((flk).l_start == 0 && (flk).l_len == 0)
2732 
2733 static int
2734 nfs4_safemap(const vnode_t *vp)
2735 {
2736 	locklist_t	*llp, *next_llp;
2737 	int		safe = 1;
2738 	rnode4_t	*rp = VTOR4(vp);
2739 
2740 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2741 
2742 	NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: "
2743 	    "vp = %p", (void *)vp));
2744 
2745 	/*
2746 	 * Review all the locks for the vnode, both ones that have been
2747 	 * acquired and ones that are pending.  We assume that
2748 	 * flk_active_locks_for_vp() has merged any locks that can be
2749 	 * merged (so that if a process has the entire file locked, it is
2750 	 * represented as a single lock).
2751 	 *
2752 	 * Note that we can't bail out of the loop if we find a non-safe
2753 	 * lock, because we have to free all the elements in the llp list.
2754 	 * We might be able to speed up this code slightly by not looking
2755 	 * at each lock's l_start and l_len fields once we've found a
2756 	 * non-safe lock.
2757 	 */
2758 
2759 	llp = flk_active_locks_for_vp(vp);
2760 	while (llp) {
2761 		NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2762 		    "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")",
2763 		    llp->ll_flock.l_start, llp->ll_flock.l_len));
2764 		if (!SAFE_LOCK(llp->ll_flock)) {
2765 			safe = 0;
2766 			NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2767 			    "nfs4_safemap: unsafe active lock (%" PRId64
2768 			    ", %" PRId64 ")", llp->ll_flock.l_start,
2769 			    llp->ll_flock.l_len));
2770 		}
2771 		next_llp = llp->ll_next;
2772 		VN_RELE(llp->ll_vp);
2773 		kmem_free(llp, sizeof (*llp));
2774 		llp = next_llp;
2775 	}
2776 
2777 	NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s",
2778 	    safe ? "safe" : "unsafe"));
2779 	return (safe);
2780 }
2781 
2782 /*
2783  * Return whether there is a lost LOCK or LOCKU queued up for the given
2784  * file that would make an mmap request unsafe.  cf. nfs4_safemap().
2785  */
2786 
2787 bool_t
2788 nfs4_map_lost_lock_conflict(vnode_t *vp)
2789 {
2790 	bool_t conflict = FALSE;
2791 	nfs4_lost_rqst_t *lrp;
2792 	mntinfo4_t *mi = VTOMI4(vp);
2793 
2794 	mutex_enter(&mi->mi_lock);
2795 	for (lrp = list_head(&mi->mi_lost_state); lrp != NULL;
2796 	    lrp = list_next(&mi->mi_lost_state, lrp)) {
2797 		if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2798 			continue;
2799 		ASSERT(lrp->lr_vp != NULL);
2800 		if (!VOP_CMP(lrp->lr_vp, vp, NULL))
2801 			continue;	/* different file */
2802 		if (!SAFE_LOCK(*lrp->lr_flk)) {
2803 			conflict = TRUE;
2804 			break;
2805 		}
2806 	}
2807 
2808 	mutex_exit(&mi->mi_lock);
2809 	return (conflict);
2810 }
2811 
2812 /*
2813  * nfs_lockcompletion:
2814  *
2815  * If the vnode has a lock that makes it unsafe to cache the file, mark it
2816  * as non cachable (set VNOCACHE bit).
2817  */
2818 
2819 void
2820 nfs4_lockcompletion(vnode_t *vp, int cmd)
2821 {
2822 	rnode4_t *rp = VTOR4(vp);
2823 
2824 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2825 	ASSERT(!IS_SHADOW(vp, rp));
2826 
2827 	if (cmd == F_SETLK || cmd == F_SETLKW) {
2828 
2829 		if (!nfs4_safemap(vp)) {
2830 			mutex_enter(&vp->v_lock);
2831 			vp->v_flag |= VNOCACHE;
2832 			mutex_exit(&vp->v_lock);
2833 		} else {
2834 			mutex_enter(&vp->v_lock);
2835 			vp->v_flag &= ~VNOCACHE;
2836 			mutex_exit(&vp->v_lock);
2837 		}
2838 	}
2839 	/*
2840 	 * The cached attributes of the file are stale after acquiring
2841 	 * the lock on the file. They were updated when the file was
2842 	 * opened, but not updated when the lock was acquired. Therefore the
2843 	 * cached attributes are invalidated after the lock is obtained.
2844 	 */
2845 	PURGE_ATTRCACHE4(vp);
2846 }
2847 
2848 /* ARGSUSED */
2849 static void *
2850 nfs4_mi_init(zoneid_t zoneid)
2851 {
2852 	struct mi4_globals *mig;
2853 
2854 	mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2855 	mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2856 	list_create(&mig->mig_list, sizeof (mntinfo4_t),
2857 	    offsetof(mntinfo4_t, mi_zone_node));
2858 	mig->mig_destructor_called = B_FALSE;
2859 	return (mig);
2860 }
2861 
2862 /*
2863  * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
2864  * state and killing off threads.
2865  */
2866 /* ARGSUSED */
2867 static void
2868 nfs4_mi_shutdown(zoneid_t zoneid, void *data)
2869 {
2870 	struct mi4_globals *mig = data;
2871 	mntinfo4_t *mi;
2872 	nfs4_server_t *np;
2873 
2874 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2875 	    "nfs4_mi_shutdown zone %d\n", zoneid));
2876 	ASSERT(mig != NULL);
2877 	for (;;) {
2878 		mutex_enter(&mig->mig_lock);
2879 		mi = list_head(&mig->mig_list);
2880 		if (mi == NULL) {
2881 			mutex_exit(&mig->mig_lock);
2882 			break;
2883 		}
2884 
2885 		NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2886 		    "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
2887 		/*
2888 		 * purge the DNLC for this filesystem
2889 		 */
2890 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2891 		/*
2892 		 * Tell existing async worker threads to exit.
2893 		 */
2894 		mutex_enter(&mi->mi_async_lock);
2895 		mi->mi_max_threads = 0;
2896 		NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
2897 		/*
2898 		 * Set the appropriate flags, signal and wait for both the
2899 		 * async manager and the inactive thread to exit when they're
2900 		 * done with their current work.
2901 		 */
2902 		mutex_enter(&mi->mi_lock);
2903 		mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
2904 		mutex_exit(&mi->mi_lock);
2905 		mutex_exit(&mi->mi_async_lock);
2906 		if (mi->mi_manager_thread) {
2907 			nfs4_async_manager_stop(mi->mi_vfsp);
2908 		}
2909 		if (mi->mi_inactive_thread) {
2910 			mutex_enter(&mi->mi_async_lock);
2911 			cv_signal(&mi->mi_inact_req_cv);
2912 			/*
2913 			 * Wait for the inactive thread to exit.
2914 			 */
2915 			while (mi->mi_inactive_thread != NULL) {
2916 				cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2917 			}
2918 			mutex_exit(&mi->mi_async_lock);
2919 		}
2920 		/*
2921 		 * Wait for the recovery thread to complete, that is, it will
2922 		 * signal when it is done using the "mi" structure and about
2923 		 * to exit
2924 		 */
2925 		mutex_enter(&mi->mi_lock);
2926 		while (mi->mi_in_recovery > 0)
2927 			cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
2928 		mutex_exit(&mi->mi_lock);
2929 		/*
2930 		 * We're done when every mi has been done or the list is empty.
2931 		 * This one is done, remove it from the list.
2932 		 */
2933 		list_remove(&mig->mig_list, mi);
2934 		mutex_exit(&mig->mig_lock);
2935 		zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2936 
2937 		/*
2938 		 * Release hold on vfs and mi done to prevent race with zone
2939 		 * shutdown. This releases the hold in nfs4_mi_zonelist_add.
2940 		 */
2941 		VFS_RELE(mi->mi_vfsp);
2942 		MI4_RELE(mi);
2943 	}
2944 	/*
2945 	 * Tell each renew thread in the zone to exit
2946 	 */
2947 	mutex_enter(&nfs4_server_lst_lock);
2948 	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
2949 		mutex_enter(&np->s_lock);
2950 		if (np->zoneid == zoneid) {
2951 			/*
2952 			 * We add another hold onto the nfs4_server_t
2953 			 * because this will make sure tha the nfs4_server_t
2954 			 * stays around until nfs4_callback_fini_zone destroys
2955 			 * the zone. This way, the renew thread can
2956 			 * unconditionally release its holds on the
2957 			 * nfs4_server_t.
2958 			 */
2959 			np->s_refcnt++;
2960 			nfs4_mark_srv_dead(np);
2961 		}
2962 		mutex_exit(&np->s_lock);
2963 	}
2964 	mutex_exit(&nfs4_server_lst_lock);
2965 }
2966 
2967 static void
2968 nfs4_mi_free_globals(struct mi4_globals *mig)
2969 {
2970 	list_destroy(&mig->mig_list);	/* makes sure the list is empty */
2971 	mutex_destroy(&mig->mig_lock);
2972 	kmem_free(mig, sizeof (*mig));
2973 }
2974 
2975 /* ARGSUSED */
2976 static void
2977 nfs4_mi_destroy(zoneid_t zoneid, void *data)
2978 {
2979 	struct mi4_globals *mig = data;
2980 
2981 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2982 	    "nfs4_mi_destroy zone %d\n", zoneid));
2983 	ASSERT(mig != NULL);
2984 	mutex_enter(&mig->mig_lock);
2985 	if (list_head(&mig->mig_list) != NULL) {
2986 		/* Still waiting for VFS_FREEVFS() */
2987 		mig->mig_destructor_called = B_TRUE;
2988 		mutex_exit(&mig->mig_lock);
2989 		return;
2990 	}
2991 	nfs4_mi_free_globals(mig);
2992 }
2993 
2994 /*
2995  * Add an NFS mount to the per-zone list of NFS mounts.
2996  */
2997 void
2998 nfs4_mi_zonelist_add(mntinfo4_t *mi)
2999 {
3000 	struct mi4_globals *mig;
3001 
3002 	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3003 	mutex_enter(&mig->mig_lock);
3004 	list_insert_head(&mig->mig_list, mi);
3005 	/*
3006 	 * hold added to eliminate race with zone shutdown -this will be
3007 	 * released in mi_shutdown
3008 	 */
3009 	MI4_HOLD(mi);
3010 	VFS_HOLD(mi->mi_vfsp);
3011 	mutex_exit(&mig->mig_lock);
3012 }
3013 
3014 /*
3015  * Remove an NFS mount from the per-zone list of NFS mounts.
3016  */
3017 int
3018 nfs4_mi_zonelist_remove(mntinfo4_t *mi)
3019 {
3020 	struct mi4_globals *mig;
3021 	int ret = 0;
3022 
3023 	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
3024 	mutex_enter(&mig->mig_lock);
3025 	mutex_enter(&mi->mi_lock);
3026 	/* if this mi is marked dead, then the zone already released it */
3027 	if (!(mi->mi_flags & MI4_DEAD)) {
3028 		list_remove(&mig->mig_list, mi);
3029 		mutex_exit(&mi->mi_lock);
3030 
3031 		/* release the holds put on in zonelist_add(). */
3032 		VFS_RELE(mi->mi_vfsp);
3033 		MI4_RELE(mi);
3034 		ret = 1;
3035 	} else {
3036 		mutex_exit(&mi->mi_lock);
3037 	}
3038 
3039 	/*
3040 	 * We can be called asynchronously by VFS_FREEVFS() after the zone
3041 	 * shutdown/destroy callbacks have executed; if so, clean up the zone's
3042 	 * mi globals.
3043 	 */
3044 	if (list_head(&mig->mig_list) == NULL &&
3045 	    mig->mig_destructor_called == B_TRUE) {
3046 		nfs4_mi_free_globals(mig);
3047 		return (ret);
3048 	}
3049 	mutex_exit(&mig->mig_lock);
3050 	return (ret);
3051 }
3052 
3053 void
3054 nfs_free_mi4(mntinfo4_t *mi)
3055 {
3056 	nfs4_open_owner_t	*foop;
3057 	nfs4_oo_hash_bucket_t   *bucketp;
3058 	nfs4_debug_msg_t	*msgp;
3059 	int i;
3060 	servinfo4_t 		*svp;
3061 
3062 	/*
3063 	 * Code introduced here should be carefully evaluated to make
3064 	 * sure none of the freed resources are accessed either directly
3065 	 * or indirectly after freeing them. For eg: Introducing calls to
3066 	 * NFS4_DEBUG that use mntinfo4_t structure member after freeing
3067 	 * the structure members or other routines calling back into NFS
3068 	 * accessing freed mntinfo4_t structure member.
3069 	 */
3070 	mutex_enter(&mi->mi_lock);
3071 	ASSERT(mi->mi_recovthread == NULL);
3072 	ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP);
3073 	mutex_exit(&mi->mi_lock);
3074 	mutex_enter(&mi->mi_async_lock);
3075 	ASSERT(mi->mi_threads[NFS4_ASYNC_QUEUE] == 0 &&
3076 	    mi->mi_threads[NFS4_ASYNC_PGOPS_QUEUE] == 0);
3077 	ASSERT(mi->mi_manager_thread == NULL);
3078 	mutex_exit(&mi->mi_async_lock);
3079 	if (mi->mi_io_kstats) {
3080 		kstat_delete(mi->mi_io_kstats);
3081 		mi->mi_io_kstats = NULL;
3082 	}
3083 	if (mi->mi_ro_kstats) {
3084 		kstat_delete(mi->mi_ro_kstats);
3085 		mi->mi_ro_kstats = NULL;
3086 	}
3087 	if (mi->mi_recov_ksp) {
3088 		kstat_delete(mi->mi_recov_ksp);
3089 		mi->mi_recov_ksp = NULL;
3090 	}
3091 	mutex_enter(&mi->mi_msg_list_lock);
3092 	while (msgp = list_head(&mi->mi_msg_list)) {
3093 		list_remove(&mi->mi_msg_list, msgp);
3094 		nfs4_free_msg(msgp);
3095 	}
3096 	mutex_exit(&mi->mi_msg_list_lock);
3097 	list_destroy(&mi->mi_msg_list);
3098 	if (mi->mi_fname != NULL)
3099 		fn_rele(&mi->mi_fname);
3100 	if (mi->mi_rootfh != NULL)
3101 		sfh4_rele(&mi->mi_rootfh);
3102 	if (mi->mi_srvparentfh != NULL)
3103 		sfh4_rele(&mi->mi_srvparentfh);
3104 	svp = mi->mi_servers;
3105 	sv4_free(svp);
3106 	mutex_destroy(&mi->mi_lock);
3107 	mutex_destroy(&mi->mi_async_lock);
3108 	mutex_destroy(&mi->mi_msg_list_lock);
3109 	nfs_rw_destroy(&mi->mi_recovlock);
3110 	nfs_rw_destroy(&mi->mi_rename_lock);
3111 	nfs_rw_destroy(&mi->mi_fh_lock);
3112 	cv_destroy(&mi->mi_failover_cv);
3113 	cv_destroy(&mi->mi_async_reqs_cv);
3114 	cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE]);
3115 	cv_destroy(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE]);
3116 	cv_destroy(&mi->mi_async_cv);
3117 	cv_destroy(&mi->mi_inact_req_cv);
3118 	/*
3119 	 * Destroy the oo hash lists and mutexes for the cred hash table.
3120 	 */
3121 	for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
3122 		bucketp = &(mi->mi_oo_list[i]);
3123 		/* Destroy any remaining open owners on the list */
3124 		foop = list_head(&bucketp->b_oo_hash_list);
3125 		while (foop != NULL) {
3126 			list_remove(&bucketp->b_oo_hash_list, foop);
3127 			nfs4_destroy_open_owner(foop);
3128 			foop = list_head(&bucketp->b_oo_hash_list);
3129 		}
3130 		list_destroy(&bucketp->b_oo_hash_list);
3131 		mutex_destroy(&bucketp->b_lock);
3132 	}
3133 	/*
3134 	 * Empty and destroy the freed open owner list.
3135 	 */
3136 	foop = list_head(&mi->mi_foo_list);
3137 	while (foop != NULL) {
3138 		list_remove(&mi->mi_foo_list, foop);
3139 		nfs4_destroy_open_owner(foop);
3140 		foop = list_head(&mi->mi_foo_list);
3141 	}
3142 	list_destroy(&mi->mi_foo_list);
3143 	list_destroy(&mi->mi_bseqid_list);
3144 	list_destroy(&mi->mi_lost_state);
3145 	avl_destroy(&mi->mi_filehandles);
3146 	kmem_free(mi, sizeof (*mi));
3147 }
3148 void
3149 mi_hold(mntinfo4_t *mi)
3150 {
3151 	atomic_add_32(&mi->mi_count, 1);
3152 	ASSERT(mi->mi_count != 0);
3153 }
3154 
3155 void
3156 mi_rele(mntinfo4_t *mi)
3157 {
3158 	ASSERT(mi->mi_count != 0);
3159 	if (atomic_add_32_nv(&mi->mi_count, -1) == 0) {
3160 		nfs_free_mi4(mi);
3161 	}
3162 }
3163 
3164 vnode_t    nfs4_xattr_notsupp_vnode;
3165 
3166 void
3167 nfs4_clnt_init(void)
3168 {
3169 	nfs4_vnops_init();
3170 	(void) nfs4_rnode_init();
3171 	(void) nfs4_shadow_init();
3172 	(void) nfs4_acache_init();
3173 	(void) nfs4_subr_init();
3174 	nfs4_acl_init();
3175 	nfs_idmap_init();
3176 	nfs4_callback_init();
3177 	nfs4_secinfo_init();
3178 #ifdef	DEBUG
3179 	tsd_create(&nfs4_tsd_key, NULL);
3180 #endif
3181 
3182 	/*
3183 	 * Add a CPR callback so that we can update client
3184 	 * lease after a suspend and resume.
3185 	 */
3186 	cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");
3187 
3188 	zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
3189 	    nfs4_mi_destroy);
3190 
3191 	/*
3192 	 * Initialise the reference count of the notsupp xattr cache vnode to 1
3193 	 * so that it never goes away (VOP_INACTIVE isn't called on it).
3194 	 */
3195 	nfs4_xattr_notsupp_vnode.v_count = 1;
3196 }
3197 
3198 void
3199 nfs4_clnt_fini(void)
3200 {
3201 	(void) zone_key_delete(mi4_list_key);
3202 	nfs4_vnops_fini();
3203 	(void) nfs4_rnode_fini();
3204 	(void) nfs4_shadow_fini();
3205 	(void) nfs4_acache_fini();
3206 	(void) nfs4_subr_fini();
3207 	nfs_idmap_fini();
3208 	nfs4_callback_fini();
3209 	nfs4_secinfo_fini();
3210 #ifdef	DEBUG
3211 	tsd_destroy(&nfs4_tsd_key);
3212 #endif
3213 	if (cid)
3214 		(void) callb_delete(cid);
3215 }
3216 
3217 /*ARGSUSED*/
3218 static boolean_t
3219 nfs4_client_cpr_callb(void *arg, int code)
3220 {
3221 	/*
3222 	 * We get called for Suspend and Resume events.
3223 	 * For the suspend case we simply don't care!
3224 	 */
3225 	if (code == CB_CODE_CPR_CHKPT) {
3226 		return (B_TRUE);
3227 	}
3228 
3229 	/*
3230 	 * When we get to here we are in the process of
3231 	 * resuming the system from a previous suspend.
3232 	 */
3233 	nfs4_client_resumed = gethrestime_sec();
3234 	return (B_TRUE);
3235 }
3236 
3237 void
3238 nfs4_renew_lease_thread(nfs4_server_t *sp)
3239 {
3240 	int	error = 0;
3241 	time_t	tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
3242 	clock_t	tick_delay = 0;
3243 	clock_t time_left = 0;
3244 	callb_cpr_t cpr_info;
3245 	kmutex_t cpr_lock;
3246 
3247 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3248 	    "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
3249 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
3250 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");
3251 
3252 	mutex_enter(&sp->s_lock);
3253 	/* sp->s_lease_time is set via a GETATTR */
3254 	sp->last_renewal_time = gethrestime_sec();
3255 	sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
3256 	ASSERT(sp->s_refcnt >= 1);
3257 
3258 	for (;;) {
3259 		if (!sp->state_ref_count ||
3260 		    sp->lease_valid != NFS4_LEASE_VALID) {
3261 
3262 			kip_secs = MAX((sp->s_lease_time >> 1) -
3263 			    (3 * sp->propagation_delay.tv_sec), 1);
3264 
3265 			tick_delay = SEC_TO_TICK(kip_secs);
3266 
3267 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3268 			    "nfs4_renew_lease_thread: no renew : thread "
3269 			    "wait %ld secs", kip_secs));
3270 
3271 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3272 			    "nfs4_renew_lease_thread: no renew : "
3273 			    "state_ref_count %d, lease_valid %d",
3274 			    sp->state_ref_count, sp->lease_valid));
3275 
3276 			mutex_enter(&cpr_lock);
3277 			CALLB_CPR_SAFE_BEGIN(&cpr_info);
3278 			mutex_exit(&cpr_lock);
3279 			time_left = cv_reltimedwait(&sp->cv_thread_exit,
3280 			    &sp->s_lock, tick_delay, TR_CLOCK_TICK);
3281 			mutex_enter(&cpr_lock);
3282 			CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3283 			mutex_exit(&cpr_lock);
3284 
3285 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3286 			    "nfs4_renew_lease_thread: no renew: "
3287 			    "time left %ld", time_left));
3288 
3289 			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3290 				goto die;
3291 			continue;
3292 		}
3293 
3294 		tmp_last_renewal_time = sp->last_renewal_time;
3295 
3296 		tmp_time = gethrestime_sec() - sp->last_renewal_time +
3297 		    (3 * sp->propagation_delay.tv_sec);
3298 
3299 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3300 		    "nfs4_renew_lease_thread: tmp_time %ld, "
3301 		    "sp->last_renewal_time %ld", tmp_time,
3302 		    sp->last_renewal_time));
3303 
3304 		kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);
3305 
3306 		tick_delay = SEC_TO_TICK(kip_secs);
3307 
3308 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3309 		    "nfs4_renew_lease_thread: valid lease: sleep for %ld "
3310 		    "secs", kip_secs));
3311 
3312 		mutex_enter(&cpr_lock);
3313 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
3314 		mutex_exit(&cpr_lock);
3315 		time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock,
3316 		    tick_delay, TR_CLOCK_TICK);
3317 		mutex_enter(&cpr_lock);
3318 		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3319 		mutex_exit(&cpr_lock);
3320 
3321 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3322 		    "nfs4_renew_lease_thread: valid lease: time left %ld :"
3323 		    "sp last_renewal_time %ld, nfs4_client_resumed %ld, "
3324 		    "tmp_last_renewal_time %ld", time_left,
3325 		    sp->last_renewal_time, nfs4_client_resumed,
3326 		    tmp_last_renewal_time));
3327 
3328 		if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3329 			goto die;
3330 
3331 		if (tmp_last_renewal_time == sp->last_renewal_time ||
3332 		    (nfs4_client_resumed != 0 &&
3333 		    nfs4_client_resumed > sp->last_renewal_time)) {
3334 			/*
3335 			 * Issue RENEW op since we haven't renewed the lease
3336 			 * since we slept.
3337 			 */
3338 			tmp_now_time = gethrestime_sec();
3339 			error = nfs4renew(sp);
3340 			/*
3341 			 * Need to re-acquire sp's lock, nfs4renew()
3342 			 * relinqueshes it.
3343 			 */
3344 			mutex_enter(&sp->s_lock);
3345 
3346 			/*
3347 			 * See if someone changed s_thread_exit while we gave
3348 			 * up s_lock.
3349 			 */
3350 			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3351 				goto die;
3352 
3353 			if (!error) {
3354 				/*
3355 				 * check to see if we implicitly renewed while
3356 				 * we waited for a reply for our RENEW call.
3357 				 */
3358 				if (tmp_last_renewal_time ==
3359 				    sp->last_renewal_time) {
3360 					/* no implicit renew came */
3361 					sp->last_renewal_time = tmp_now_time;
3362 				} else {
3363 					NFS4_DEBUG(nfs4_client_lease_debug,
3364 					    (CE_NOTE, "renew_thread: did "
3365 					    "implicit renewal before reply "
3366 					    "from server for RENEW"));
3367 				}
3368 			} else {
3369 				/* figure out error */
3370 				NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3371 				    "renew_thread: nfs4renew returned error"
3372 				    " %d", error));
3373 			}
3374 
3375 		}
3376 	}
3377 
3378 die:
3379 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3380 	    "nfs4_renew_lease_thread: thread exiting"));
3381 
3382 	while (sp->s_otw_call_count != 0) {
3383 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3384 		    "nfs4_renew_lease_thread: waiting for outstanding "
3385 		    "otw calls to finish for sp 0x%p, current "
3386 		    "s_otw_call_count %d", (void *)sp,
3387 		    sp->s_otw_call_count));
3388 		mutex_enter(&cpr_lock);
3389 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
3390 		mutex_exit(&cpr_lock);
3391 		cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
3392 		mutex_enter(&cpr_lock);
3393 		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3394 		mutex_exit(&cpr_lock);
3395 	}
3396 	mutex_exit(&sp->s_lock);
3397 
3398 	nfs4_server_rele(sp);		/* free the thread's reference */
3399 	nfs4_server_rele(sp);		/* free the list's reference */
3400 	sp = NULL;
3401 
3402 done:
3403 	mutex_enter(&cpr_lock);
3404 	CALLB_CPR_EXIT(&cpr_info);	/* drops cpr_lock */
3405 	mutex_destroy(&cpr_lock);
3406 
3407 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3408 	    "nfs4_renew_lease_thread: renew thread exit officially"));
3409 
3410 	zthread_exit();
3411 	/* NOT REACHED */
3412 }
3413 
3414 /*
3415  * Send out a RENEW op to the server.
3416  * Assumes sp is locked down.
3417  */
3418 static int
3419 nfs4renew(nfs4_server_t *sp)
3420 {
3421 	COMPOUND4args_clnt args;
3422 	COMPOUND4res_clnt res;
3423 	nfs_argop4 argop[1];
3424 	int doqueue = 1;
3425 	int rpc_error;
3426 	cred_t *cr;
3427 	mntinfo4_t *mi;
3428 	timespec_t prop_time, after_time;
3429 	int needrecov = FALSE;
3430 	nfs4_recov_state_t recov_state;
3431 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3432 
3433 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew"));
3434 
3435 	recov_state.rs_flags = 0;
3436 	recov_state.rs_num_retry_despite_err = 0;
3437 
3438 recov_retry:
3439 	mi = sp->mntinfo4_list;
3440 	VFS_HOLD(mi->mi_vfsp);
3441 	mutex_exit(&sp->s_lock);
3442 	ASSERT(mi != NULL);
3443 
3444 	e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3445 	if (e.error) {
3446 		VFS_RELE(mi->mi_vfsp);
3447 		return (e.error);
3448 	}
3449 
3450 	/* Check to see if we're dealing with a marked-dead sp */
3451 	mutex_enter(&sp->s_lock);
3452 	if (sp->s_thread_exit == NFS4_THREAD_EXIT) {
3453 		mutex_exit(&sp->s_lock);
3454 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3455 		VFS_RELE(mi->mi_vfsp);
3456 		return (0);
3457 	}
3458 
3459 	/* Make sure mi hasn't changed on us */
3460 	if (mi != sp->mntinfo4_list) {
3461 		/* Must drop sp's lock to avoid a recursive mutex enter */
3462 		mutex_exit(&sp->s_lock);
3463 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3464 		VFS_RELE(mi->mi_vfsp);
3465 		mutex_enter(&sp->s_lock);
3466 		goto recov_retry;
3467 	}
3468 	mutex_exit(&sp->s_lock);
3469 
3470 	args.ctag = TAG_RENEW;
3471 
3472 	args.array_len = 1;
3473 	args.array = argop;
3474 
3475 	argop[0].argop = OP_RENEW;
3476 
3477 	mutex_enter(&sp->s_lock);
3478 	argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid;
3479 	cr = sp->s_cred;
3480 	crhold(cr);
3481 	mutex_exit(&sp->s_lock);
3482 
3483 	ASSERT(cr != NULL);
3484 
3485 	/* used to figure out RTT for sp */
3486 	gethrestime(&prop_time);
3487 
3488 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3489 	    "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first",
3490 	    (void*)sp));
3491 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ",
3492 	    prop_time.tv_sec, prop_time.tv_nsec));
3493 
3494 	DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp,
3495 	    mntinfo4_t *, mi);
3496 
3497 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3498 	crfree(cr);
3499 
3500 	DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp,
3501 	    mntinfo4_t *, mi);
3502 
3503 	gethrestime(&after_time);
3504 
3505 	mutex_enter(&sp->s_lock);
3506 	sp->propagation_delay.tv_sec =
3507 	    MAX(1, after_time.tv_sec - prop_time.tv_sec);
3508 	mutex_exit(&sp->s_lock);
3509 
3510 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ",
3511 	    after_time.tv_sec, after_time.tv_nsec));
3512 
3513 	if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) {
3514 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3515 		nfs4_delegreturn_all(sp);
3516 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3517 		VFS_RELE(mi->mi_vfsp);
3518 		/*
3519 		 * If the server returns CB_PATH_DOWN, it has renewed
3520 		 * the lease and informed us that the callback path is
3521 		 * down.  Since the lease is renewed, just return 0 and
3522 		 * let the renew thread proceed as normal.
3523 		 */
3524 		return (0);
3525 	}
3526 
3527 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3528 	if (!needrecov && e.error) {
3529 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3530 		VFS_RELE(mi->mi_vfsp);
3531 		return (e.error);
3532 	}
3533 
3534 	rpc_error = e.error;
3535 
3536 	if (needrecov) {
3537 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3538 		    "nfs4renew: initiating recovery\n"));
3539 
3540 		if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
3541 		    OP_RENEW, NULL, NULL, NULL) == FALSE) {
3542 			nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3543 			VFS_RELE(mi->mi_vfsp);
3544 			if (!e.error)
3545 				(void) xdr_free(xdr_COMPOUND4res_clnt,
3546 				    (caddr_t)&res);
3547 			mutex_enter(&sp->s_lock);
3548 			goto recov_retry;
3549 		}
3550 		/* fall through for res.status case */
3551 	}
3552 
3553 	if (res.status) {
3554 		if (res.status == NFS4ERR_LEASE_MOVED) {
3555 			/*EMPTY*/
3556 			/*
3557 			 * XXX need to try every mntinfo4 in sp->mntinfo4_list
3558 			 * to renew the lease on that server
3559 			 */
3560 		}
3561 		e.error = geterrno4(res.status);
3562 	}
3563 
3564 	if (!rpc_error)
3565 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3566 
3567 	nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3568 
3569 	VFS_RELE(mi->mi_vfsp);
3570 
3571 	return (e.error);
3572 }
3573 
3574 void
3575 nfs4_inc_state_ref_count(mntinfo4_t *mi)
3576 {
3577 	nfs4_server_t	*sp;
3578 
3579 	/* this locks down sp if it is found */
3580 	sp = find_nfs4_server(mi);
3581 
3582 	if (sp != NULL) {
3583 		nfs4_inc_state_ref_count_nolock(sp, mi);
3584 		mutex_exit(&sp->s_lock);
3585 		nfs4_server_rele(sp);
3586 	}
3587 }
3588 
3589 /*
3590  * Bump the number of OPEN files (ie: those with state) so we know if this
3591  * nfs4_server has any state to maintain a lease for or not.
3592  *
3593  * Also, marks the nfs4_server's lease valid if it hasn't been done so already.
3594  */
3595 void
3596 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3597 {
3598 	ASSERT(mutex_owned(&sp->s_lock));
3599 
3600 	sp->state_ref_count++;
3601 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3602 	    "nfs4_inc_state_ref_count: state_ref_count now %d",
3603 	    sp->state_ref_count));
3604 
3605 	if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED)
3606 		sp->lease_valid = NFS4_LEASE_VALID;
3607 
3608 	/*
3609 	 * If this call caused the lease to be marked valid and/or
3610 	 * took the state_ref_count from 0 to 1, then start the time
3611 	 * on lease renewal.
3612 	 */
3613 	if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1)
3614 		sp->last_renewal_time = gethrestime_sec();
3615 
3616 	/* update the number of open files for mi */
3617 	mi->mi_open_files++;
3618 }
3619 
3620 void
3621 nfs4_dec_state_ref_count(mntinfo4_t *mi)
3622 {
3623 	nfs4_server_t	*sp;
3624 
3625 	/* this locks down sp if it is found */
3626 	sp = find_nfs4_server_all(mi, 1);
3627 
3628 	if (sp != NULL) {
3629 		nfs4_dec_state_ref_count_nolock(sp, mi);
3630 		mutex_exit(&sp->s_lock);
3631 		nfs4_server_rele(sp);
3632 	}
3633 }
3634 
3635 /*
3636  * Decrement the number of OPEN files (ie: those with state) so we know if
3637  * this nfs4_server has any state to maintain a lease for or not.
3638  */
3639 void
3640 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3641 {
3642 	ASSERT(mutex_owned(&sp->s_lock));
3643 	ASSERT(sp->state_ref_count != 0);
3644 	sp->state_ref_count--;
3645 
3646 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3647 	    "nfs4_dec_state_ref_count: state ref count now %d",
3648 	    sp->state_ref_count));
3649 
3650 	mi->mi_open_files--;
3651 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3652 	    "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x",
3653 	    mi->mi_open_files, mi->mi_flags));
3654 
3655 	/* We don't have to hold the mi_lock to test mi_flags */
3656 	if (mi->mi_open_files == 0 &&
3657 	    (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) {
3658 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3659 		    "nfs4_dec_state_ref_count: remove mntinfo4 %p since "
3660 		    "we have closed the last open file", (void*)mi));
3661 		nfs4_remove_mi_from_server(mi, sp);
3662 	}
3663 }
3664 
3665 bool_t
3666 inlease(nfs4_server_t *sp)
3667 {
3668 	bool_t result;
3669 
3670 	ASSERT(mutex_owned(&sp->s_lock));
3671 
3672 	if (sp->lease_valid == NFS4_LEASE_VALID &&
3673 	    gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time)
3674 		result = TRUE;
3675 	else
3676 		result = FALSE;
3677 
3678 	return (result);
3679 }
3680 
3681 
3682 /*
3683  * Return non-zero if the given nfs4_server_t is going through recovery.
3684  */
3685 
3686 int
3687 nfs4_server_in_recovery(nfs4_server_t *sp)
3688 {
3689 	return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
3690 }
3691 
3692 /*
3693  * Compare two shared filehandle objects.  Returns -1, 0, or +1, if the
3694  * first is less than, equal to, or greater than the second.
3695  */
3696 
3697 int
3698 sfh4cmp(const void *p1, const void *p2)
3699 {
3700 	const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1;
3701 	const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2;
3702 
3703 	return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh));
3704 }
3705 
3706 /*
3707  * Create a table for shared filehandle objects.
3708  */
3709 
3710 void
3711 sfh4_createtab(avl_tree_t *tab)
3712 {
3713 	avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t),
3714 	    offsetof(nfs4_sharedfh_t, sfh_tree));
3715 }
3716 
3717 /*
3718  * Return a shared filehandle object for the given filehandle.  The caller
3719  * is responsible for eventually calling sfh4_rele().
3720  */
3721 
3722 nfs4_sharedfh_t *
3723 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key)
3724 {
3725 	nfs4_sharedfh_t *sfh, *nsfh;
3726 	avl_index_t where;
3727 	nfs4_sharedfh_t skey;
3728 
3729 	if (!key) {
3730 		skey.sfh_fh = *fh;
3731 		key = &skey;
3732 	}
3733 
3734 	nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP);
3735 	nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len;
3736 	/*
3737 	 * We allocate the largest possible filehandle size because it's
3738 	 * not that big, and it saves us from possibly having to resize the
3739 	 * buffer later.
3740 	 */
3741 	nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP);
3742 	bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len);
3743 	mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL);
3744 	nsfh->sfh_refcnt = 1;
3745 	nsfh->sfh_flags = SFH4_IN_TREE;
3746 	nsfh->sfh_mi = mi;
3747 	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)",
3748 	    (void *)nsfh));
3749 
3750 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3751 	sfh = avl_find(&mi->mi_filehandles, key, &where);
3752 	if (sfh != NULL) {
3753 		mutex_enter(&sfh->sfh_lock);
3754 		sfh->sfh_refcnt++;
3755 		mutex_exit(&sfh->sfh_lock);
3756 		nfs_rw_exit(&mi->mi_fh_lock);
3757 		/* free our speculative allocs */
3758 		kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3759 		kmem_free(nsfh, sizeof (nfs4_sharedfh_t));
3760 		return (sfh);
3761 	}
3762 
3763 	avl_insert(&mi->mi_filehandles, nsfh, where);
3764 	nfs_rw_exit(&mi->mi_fh_lock);
3765 
3766 	return (nsfh);
3767 }
3768 
3769 /*
3770  * Return a shared filehandle object for the given filehandle.  The caller
3771  * is responsible for eventually calling sfh4_rele().
3772  */
3773 
3774 nfs4_sharedfh_t *
3775 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi)
3776 {
3777 	nfs4_sharedfh_t *sfh;
3778 	nfs4_sharedfh_t key;
3779 
3780 	ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE);
3781 
3782 #ifdef DEBUG
3783 	if (nfs4_sharedfh_debug) {
3784 		nfs4_fhandle_t fhandle;
3785 
3786 		fhandle.fh_len = fh->nfs_fh4_len;
3787 		bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len);
3788 		zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:");
3789 		nfs4_printfhandle(&fhandle);
3790 	}
3791 #endif
3792 
3793 	/*
3794 	 * If there's already an object for the given filehandle, bump the
3795 	 * reference count and return it.  Otherwise, create a new object
3796 	 * and add it to the AVL tree.
3797 	 */
3798 
3799 	key.sfh_fh = *fh;
3800 
3801 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3802 	sfh = avl_find(&mi->mi_filehandles, &key, NULL);
3803 	if (sfh != NULL) {
3804 		mutex_enter(&sfh->sfh_lock);
3805 		sfh->sfh_refcnt++;
3806 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3807 		    "sfh4_get: found existing %p, new refcnt=%d",
3808 		    (void *)sfh, sfh->sfh_refcnt));
3809 		mutex_exit(&sfh->sfh_lock);
3810 		nfs_rw_exit(&mi->mi_fh_lock);
3811 		return (sfh);
3812 	}
3813 	nfs_rw_exit(&mi->mi_fh_lock);
3814 
3815 	return (sfh4_put(fh, mi, &key));
3816 }
3817 
3818 /*
3819  * Get a reference to the given shared filehandle object.
3820  */
3821 
3822 void
3823 sfh4_hold(nfs4_sharedfh_t *sfh)
3824 {
3825 	ASSERT(sfh->sfh_refcnt > 0);
3826 
3827 	mutex_enter(&sfh->sfh_lock);
3828 	sfh->sfh_refcnt++;
3829 	NFS4_DEBUG(nfs4_sharedfh_debug,
3830 	    (CE_NOTE, "sfh4_hold %p, new refcnt=%d",
3831 	    (void *)sfh, sfh->sfh_refcnt));
3832 	mutex_exit(&sfh->sfh_lock);
3833 }
3834 
3835 /*
3836  * Release a reference to the given shared filehandle object and null out
3837  * the given pointer.
3838  */
3839 
3840 void
3841 sfh4_rele(nfs4_sharedfh_t **sfhpp)
3842 {
3843 	mntinfo4_t *mi;
3844 	nfs4_sharedfh_t *sfh = *sfhpp;
3845 
3846 	ASSERT(sfh->sfh_refcnt > 0);
3847 
3848 	mutex_enter(&sfh->sfh_lock);
3849 	if (sfh->sfh_refcnt > 1) {
3850 		sfh->sfh_refcnt--;
3851 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3852 		    "sfh4_rele %p, new refcnt=%d",
3853 		    (void *)sfh, sfh->sfh_refcnt));
3854 		mutex_exit(&sfh->sfh_lock);
3855 		goto finish;
3856 	}
3857 	mutex_exit(&sfh->sfh_lock);
3858 
3859 	/*
3860 	 * Possibly the last reference, so get the lock for the table in
3861 	 * case it's time to remove the object from the table.
3862 	 */
3863 	mi = sfh->sfh_mi;
3864 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3865 	mutex_enter(&sfh->sfh_lock);
3866 	sfh->sfh_refcnt--;
3867 	if (sfh->sfh_refcnt > 0) {
3868 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3869 		    "sfh4_rele %p, new refcnt=%d",
3870 		    (void *)sfh, sfh->sfh_refcnt));
3871 		mutex_exit(&sfh->sfh_lock);
3872 		nfs_rw_exit(&mi->mi_fh_lock);
3873 		goto finish;
3874 	}
3875 
3876 	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3877 	    "sfh4_rele %p, last ref", (void *)sfh));
3878 	if (sfh->sfh_flags & SFH4_IN_TREE) {
3879 		avl_remove(&mi->mi_filehandles, sfh);
3880 		sfh->sfh_flags &= ~SFH4_IN_TREE;
3881 	}
3882 	mutex_exit(&sfh->sfh_lock);
3883 	nfs_rw_exit(&mi->mi_fh_lock);
3884 	mutex_destroy(&sfh->sfh_lock);
3885 	kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3886 	kmem_free(sfh, sizeof (nfs4_sharedfh_t));
3887 
3888 finish:
3889 	*sfhpp = NULL;
3890 }
3891 
3892 /*
3893  * Update the filehandle for the given shared filehandle object.
3894  */
3895 
3896 int nfs4_warn_dupfh = 0;	/* if set, always warn about dup fhs below */
3897 
3898 void
3899 sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
3900 {
3901 	mntinfo4_t *mi = sfh->sfh_mi;
3902 	nfs4_sharedfh_t *dupsfh;
3903 	avl_index_t where;
3904 	nfs4_sharedfh_t key;
3905 
3906 #ifdef DEBUG
3907 	mutex_enter(&sfh->sfh_lock);
3908 	ASSERT(sfh->sfh_refcnt > 0);
3909 	mutex_exit(&sfh->sfh_lock);
3910 #endif
3911 	ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);
3912 
3913 	/*
3914 	 * The basic plan is to remove the shared filehandle object from
3915 	 * the table, update it to have the new filehandle, then reinsert
3916 	 * it.
3917 	 */
3918 
3919 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3920 	mutex_enter(&sfh->sfh_lock);
3921 	if (sfh->sfh_flags & SFH4_IN_TREE) {
3922 		avl_remove(&mi->mi_filehandles, sfh);
3923 		sfh->sfh_flags &= ~SFH4_IN_TREE;
3924 	}
3925 	mutex_exit(&sfh->sfh_lock);
3926 	sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
3927 	bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
3928 	    sfh->sfh_fh.nfs_fh4_len);
3929 
3930 	/*
3931 	 * XXX If there is already a shared filehandle object with the new
3932 	 * filehandle, we're in trouble, because the rnode code assumes
3933 	 * that there is only one shared filehandle object for a given
3934 	 * filehandle.  So issue a warning (for read-write mounts only)
3935 	 * and don't try to re-insert the given object into the table.
3936 	 * Hopefully the given object will quickly go away and everyone
3937 	 * will use the new object.
3938 	 */
3939 	key.sfh_fh = *newfh;
3940 	dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
3941 	if (dupsfh != NULL) {
3942 		if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
3943 			zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
3944 			    "duplicate filehandle detected");
3945 			sfh4_printfhandle(dupsfh);
3946 		}
3947 	} else {
3948 		avl_insert(&mi->mi_filehandles, sfh, where);
3949 		mutex_enter(&sfh->sfh_lock);
3950 		sfh->sfh_flags |= SFH4_IN_TREE;
3951 		mutex_exit(&sfh->sfh_lock);
3952 	}
3953 	nfs_rw_exit(&mi->mi_fh_lock);
3954 }
3955 
3956 /*
3957  * Copy out the current filehandle for the given shared filehandle object.
3958  */
3959 
3960 void
3961 sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
3962 {
3963 	mntinfo4_t *mi = sfh->sfh_mi;
3964 
3965 	ASSERT(sfh->sfh_refcnt > 0);
3966 
3967 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3968 	fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
3969 	ASSERT(fhp->fh_len <= NFS4_FHSIZE);
3970 	bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
3971 	nfs_rw_exit(&mi->mi_fh_lock);
3972 }
3973 
3974 /*
3975  * Print out the filehandle for the given shared filehandle object.
3976  */
3977 
3978 void
3979 sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
3980 {
3981 	nfs4_fhandle_t fhandle;
3982 
3983 	sfh4_copyval(sfh, &fhandle);
3984 	nfs4_printfhandle(&fhandle);
3985 }
3986 
3987 /*
3988  * Compare 2 fnames.  Returns -1 if the first is "less" than the second, 0
3989  * if they're the same, +1 if the first is "greater" than the second.  The
3990  * caller (or whoever's calling the AVL package) is responsible for
3991  * handling locking issues.
3992  */
3993 
3994 static int
3995 fncmp(const void *p1, const void *p2)
3996 {
3997 	const nfs4_fname_t *f1 = p1;
3998 	const nfs4_fname_t *f2 = p2;
3999 	int res;
4000 
4001 	res = strcmp(f1->fn_name, f2->fn_name);
4002 	/*
4003 	 * The AVL package wants +/-1, not arbitrary positive or negative
4004 	 * integers.
4005 	 */
4006 	if (res > 0)
4007 		res = 1;
4008 	else if (res < 0)
4009 		res = -1;
4010 	return (res);
4011 }
4012 
4013 /*
4014  * Get or create an fname with the given name, as a child of the given
4015  * fname.  The caller is responsible for eventually releasing the reference
4016  * (fn_rele()).  parent may be NULL.
4017  */
4018 
4019 nfs4_fname_t *
4020 fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh)
4021 {
4022 	nfs4_fname_t key;
4023 	nfs4_fname_t *fnp;
4024 	avl_index_t where;
4025 
4026 	key.fn_name = name;
4027 
4028 	/*
4029 	 * If there's already an fname registered with the given name, bump
4030 	 * its reference count and return it.  Otherwise, create a new one
4031 	 * and add it to the parent's AVL tree.
4032 	 *
4033 	 * fname entries we are looking for should match both name
4034 	 * and sfh stored in the fname.
4035 	 */
4036 again:
4037 	if (parent != NULL) {
4038 		mutex_enter(&parent->fn_lock);
4039 		fnp = avl_find(&parent->fn_children, &key, &where);
4040 		if (fnp != NULL) {
4041 			/*
4042 			 * This hold on fnp is released below later,
4043 			 * in case this is not the fnp we want.
4044 			 */
4045 			fn_hold(fnp);
4046 
4047 			if (fnp->fn_sfh == sfh) {
4048 				/*
4049 				 * We have found our entry.
4050 				 * put an hold and return it.
4051 				 */
4052 				mutex_exit(&parent->fn_lock);
4053 				return (fnp);
4054 			}
4055 
4056 			/*
4057 			 * We have found an entry that has a mismatching
4058 			 * fn_sfh. This could be a stale entry due to
4059 			 * server side rename. We will remove this entry
4060 			 * and make sure no such entries exist.
4061 			 */
4062 			mutex_exit(&parent->fn_lock);
4063 			mutex_enter(&fnp->fn_lock);
4064 			if (fnp->fn_parent == parent) {
4065 				/*
4066 				 * Remove ourselves from parent's
4067 				 * fn_children tree.
4068 				 */
4069 				mutex_enter(&parent->fn_lock);
4070 				avl_remove(&parent->fn_children, fnp);
4071 				mutex_exit(&parent->fn_lock);
4072 				fn_rele(&fnp->fn_parent);
4073 			}
4074 			mutex_exit(&fnp->fn_lock);
4075 			fn_rele(&fnp);
4076 			goto again;
4077 		}
4078 	}
4079 
4080 	fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
4081 	mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
4082 	fnp->fn_parent = parent;
4083 	if (parent != NULL)
4084 		fn_hold(parent);
4085 	fnp->fn_len = strlen(name);
4086 	ASSERT(fnp->fn_len < MAXNAMELEN);
4087 	fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
4088 	(void) strcpy(fnp->fn_name, name);
4089 	fnp->fn_refcnt = 1;
4090 
4091 	/*
4092 	 * This hold on sfh is later released
4093 	 * when we do the final fn_rele() on this fname.
4094 	 */
4095 	sfh4_hold(sfh);
4096 	fnp->fn_sfh = sfh;
4097 
4098 	avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
4099 	    offsetof(nfs4_fname_t, fn_tree));
4100 	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4101 	    "fn_get %p:%s, a new nfs4_fname_t!",
4102 	    (void *)fnp, fnp->fn_name));
4103 	if (parent != NULL) {
4104 		avl_insert(&parent->fn_children, fnp, where);
4105 		mutex_exit(&parent->fn_lock);
4106 	}
4107 
4108 	return (fnp);
4109 }
4110 
4111 void
4112 fn_hold(nfs4_fname_t *fnp)
4113 {
4114 	atomic_add_32(&fnp->fn_refcnt, 1);
4115 	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4116 	    "fn_hold %p:%s, new refcnt=%d",
4117 	    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4118 }
4119 
4120 /*
4121  * Decrement the reference count of the given fname, and destroy it if its
4122  * reference count goes to zero.  Nulls out the given pointer.
4123  */
4124 
4125 void
4126 fn_rele(nfs4_fname_t **fnpp)
4127 {
4128 	nfs4_fname_t *parent;
4129 	uint32_t newref;
4130 	nfs4_fname_t *fnp;
4131 
4132 recur:
4133 	fnp = *fnpp;
4134 	*fnpp = NULL;
4135 
4136 	mutex_enter(&fnp->fn_lock);
4137 	parent = fnp->fn_parent;
4138 	if (parent != NULL)
4139 		mutex_enter(&parent->fn_lock);	/* prevent new references */
4140 	newref = atomic_add_32_nv(&fnp->fn_refcnt, -1);
4141 	if (newref > 0) {
4142 		NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4143 		    "fn_rele %p:%s, new refcnt=%d",
4144 		    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
4145 		if (parent != NULL)
4146 			mutex_exit(&parent->fn_lock);
4147 		mutex_exit(&fnp->fn_lock);
4148 		return;
4149 	}
4150 
4151 	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
4152 	    "fn_rele %p:%s, last reference, deleting...",
4153 	    (void *)fnp, fnp->fn_name));
4154 	if (parent != NULL) {
4155 		avl_remove(&parent->fn_children, fnp);
4156 		mutex_exit(&parent->fn_lock);
4157 	}
4158 	kmem_free(fnp->fn_name, fnp->fn_len + 1);
4159 	sfh4_rele(&fnp->fn_sfh);
4160 	mutex_destroy(&fnp->fn_lock);
4161 	avl_destroy(&fnp->fn_children);
4162 	kmem_free(fnp, sizeof (nfs4_fname_t));
4163 	/*
4164 	 * Recursivly fn_rele the parent.
4165 	 * Use goto instead of a recursive call to avoid stack overflow.
4166 	 */
4167 	if (parent != NULL) {
4168 		fnpp = &parent;
4169 		goto recur;
4170 	}
4171 }
4172 
4173 /*
4174  * Returns the single component name of the given fname, in a MAXNAMELEN
4175  * string buffer, which the caller is responsible for freeing.  Note that
4176  * the name may become invalid as a result of fn_move().
4177  */
4178 
4179 char *
4180 fn_name(nfs4_fname_t *fnp)
4181 {
4182 	char *name;
4183 
4184 	ASSERT(fnp->fn_len < MAXNAMELEN);
4185 	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
4186 	mutex_enter(&fnp->fn_lock);
4187 	(void) strcpy(name, fnp->fn_name);
4188 	mutex_exit(&fnp->fn_lock);
4189 
4190 	return (name);
4191 }
4192 
4193 
4194 /*
4195  * fn_path_realloc
4196  *
4197  * This function, used only by fn_path, constructs
4198  * a new string which looks like "prepend" + "/" + "current".
4199  * by allocating a new string and freeing the old one.
4200  */
4201 static void
4202 fn_path_realloc(char **curses, char *prepend)
4203 {
4204 	int len, curlen = 0;
4205 	char *news;
4206 
4207 	if (*curses == NULL) {
4208 		/*
4209 		 * Prime the pump, allocate just the
4210 		 * space for prepend and return that.
4211 		 */
4212 		len = strlen(prepend) + 1;
4213 		news = kmem_alloc(len, KM_SLEEP);
4214 		(void) strncpy(news, prepend, len);
4215 	} else {
4216 		/*
4217 		 * Allocate the space  for a new string
4218 		 * +1 +1 is for the "/" and the NULL
4219 		 * byte at the end of it all.
4220 		 */
4221 		curlen = strlen(*curses);
4222 		len = curlen + strlen(prepend) + 1 + 1;
4223 		news = kmem_alloc(len, KM_SLEEP);
4224 		(void) strncpy(news, prepend, len);
4225 		(void) strcat(news, "/");
4226 		(void) strcat(news, *curses);
4227 		kmem_free(*curses, curlen + 1);
4228 	}
4229 	*curses = news;
4230 }
4231 
4232 /*
4233  * Returns the path name (starting from the fs root) for the given fname.
4234  * The caller is responsible for freeing.  Note that the path may be or
4235  * become invalid as a result of fn_move().
4236  */
4237 
4238 char *
4239 fn_path(nfs4_fname_t *fnp)
4240 {
4241 	char *path;
4242 	nfs4_fname_t *nextfnp;
4243 
4244 	if (fnp == NULL)
4245 		return (NULL);
4246 
4247 	path = NULL;
4248 
4249 	/* walk up the tree constructing the pathname.  */
4250 
4251 	fn_hold(fnp);			/* adjust for later rele */
4252 	do {
4253 		mutex_enter(&fnp->fn_lock);
4254 		/*
4255 		 * Add fn_name in front of the current path
4256 		 */
4257 		fn_path_realloc(&path, fnp->fn_name);
4258 		nextfnp = fnp->fn_parent;
4259 		if (nextfnp != NULL)
4260 			fn_hold(nextfnp);
4261 		mutex_exit(&fnp->fn_lock);
4262 		fn_rele(&fnp);
4263 		fnp = nextfnp;
4264 	} while (fnp != NULL);
4265 
4266 	return (path);
4267 }
4268 
4269 /*
4270  * Return a reference to the parent of the given fname, which the caller is
4271  * responsible for eventually releasing.
4272  */
4273 
4274 nfs4_fname_t *
4275 fn_parent(nfs4_fname_t *fnp)
4276 {
4277 	nfs4_fname_t *parent;
4278 
4279 	mutex_enter(&fnp->fn_lock);
4280 	parent = fnp->fn_parent;
4281 	if (parent != NULL)
4282 		fn_hold(parent);
4283 	mutex_exit(&fnp->fn_lock);
4284 
4285 	return (parent);
4286 }
4287 
4288 /*
4289  * Update fnp so that its parent is newparent and its name is newname.
4290  */
4291 
4292 void
4293 fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
4294 {
4295 	nfs4_fname_t *parent, *tmpfnp;
4296 	ssize_t newlen;
4297 	nfs4_fname_t key;
4298 	avl_index_t where;
4299 
4300 	/*
4301 	 * This assert exists to catch the client trying to rename
4302 	 * a dir to be a child of itself.  This happened at a recent
4303 	 * bakeoff against a 3rd party (broken) server which allowed
4304 	 * the rename to succeed.  If it trips it means that:
4305 	 *	a) the code in nfs4rename that detects this case is broken
4306 	 *	b) the server is broken (since it allowed the bogus rename)
4307 	 *
4308 	 * For non-DEBUG kernels, prepare for a recursive mutex_enter
4309 	 * panic below from:  mutex_enter(&newparent->fn_lock);
4310 	 */
4311 	ASSERT(fnp != newparent);
4312 
4313 	/*
4314 	 * Remove fnp from its current parent, change its name, then add it
4315 	 * to newparent. It might happen that fnp was replaced by another
4316 	 * nfs4_fname_t with the same fn_name in parent->fn_children.
4317 	 * In such case, fnp->fn_parent is NULL and we skip the removal
4318 	 * of fnp from its current parent.
4319 	 */
4320 	mutex_enter(&fnp->fn_lock);
4321 	parent = fnp->fn_parent;
4322 	if (parent != NULL) {
4323 		mutex_enter(&parent->fn_lock);
4324 		avl_remove(&parent->fn_children, fnp);
4325 		mutex_exit(&parent->fn_lock);
4326 		fn_rele(&fnp->fn_parent);
4327 	}
4328 
4329 	newlen = strlen(newname);
4330 	if (newlen != fnp->fn_len) {
4331 		ASSERT(newlen < MAXNAMELEN);
4332 		kmem_free(fnp->fn_name, fnp->fn_len + 1);
4333 		fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
4334 		fnp->fn_len = newlen;
4335 	}
4336 	(void) strcpy(fnp->fn_name, newname);
4337 
4338 again:
4339 	mutex_enter(&newparent->fn_lock);
4340 	key.fn_name = fnp->fn_name;
4341 	tmpfnp = avl_find(&newparent->fn_children, &key, &where);
4342 	if (tmpfnp != NULL) {
4343 		/*
4344 		 * This could be due to a file that was unlinked while
4345 		 * open, or perhaps the rnode is in the free list.  Remove
4346 		 * it from newparent and let it go away on its own.  The
4347 		 * contorted code is to deal with lock order issues and
4348 		 * race conditions.
4349 		 */
4350 		fn_hold(tmpfnp);
4351 		mutex_exit(&newparent->fn_lock);
4352 		mutex_enter(&tmpfnp->fn_lock);
4353 		if (tmpfnp->fn_parent == newparent) {
4354 			mutex_enter(&newparent->fn_lock);
4355 			avl_remove(&newparent->fn_children, tmpfnp);
4356 			mutex_exit(&newparent->fn_lock);
4357 			fn_rele(&tmpfnp->fn_parent);
4358 		}
4359 		mutex_exit(&tmpfnp->fn_lock);
4360 		fn_rele(&tmpfnp);
4361 		goto again;
4362 	}
4363 	fnp->fn_parent = newparent;
4364 	fn_hold(newparent);
4365 	avl_insert(&newparent->fn_children, fnp, where);
4366 	mutex_exit(&newparent->fn_lock);
4367 	mutex_exit(&fnp->fn_lock);
4368 }
4369 
#ifdef DEBUG
/*
 * Sanity-check the file type recorded for the given vnode.  Returns
 * non-zero when the type information makes sense; panics when the
 * vnode's type and the type in the rnode's cached attributes are both
 * set (not VNON) and disagree.  The comparison is only made when
 * nfs4_vtype_debug is set.
 */
int
nfs4_consistent_type(vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);

	/* Checking is switched off. */
	if (!nfs4_vtype_debug)
		return (1);

	/* A type that has not been set yet cannot conflict. */
	if (vp->v_type == VNON || rp->r_attr.va_type == VNON)
		return (1);

	if (vp->v_type != rp->r_attr.va_type) {
		cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
		    "rnode attr type=%d", (void *)vp, vp->v_type,
		    rp->r_attr.va_type);
	}

	return (1);
}
#endif /* DEBUG */
4390