xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_client.c (revision 56f33205c9ed776c3c909e07d52e94610a675740)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
28  *	All Rights Reserved
29  */
30 
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/thread.h>
35 #include <sys/t_lock.h>
36 #include <sys/time.h>
37 #include <sys/vnode.h>
38 #include <sys/vfs.h>
39 #include <sys/errno.h>
40 #include <sys/buf.h>
41 #include <sys/stat.h>
42 #include <sys/cred.h>
43 #include <sys/kmem.h>
44 #include <sys/debug.h>
45 #include <sys/dnlc.h>
46 #include <sys/vmsystm.h>
47 #include <sys/flock.h>
48 #include <sys/share.h>
49 #include <sys/cmn_err.h>
50 #include <sys/tiuser.h>
51 #include <sys/sysmacros.h>
52 #include <sys/callb.h>
53 #include <sys/acl.h>
54 #include <sys/kstat.h>
55 #include <sys/signal.h>
56 #include <sys/disp.h>
57 #include <sys/atomic.h>
58 #include <sys/list.h>
59 #include <sys/sdt.h>
60 
61 #include <rpc/types.h>
62 #include <rpc/xdr.h>
63 #include <rpc/auth.h>
64 #include <rpc/clnt.h>
65 
66 #include <nfs/nfs.h>
67 #include <nfs/nfs_clnt.h>
68 #include <nfs/nfs_acl.h>
69 
70 #include <nfs/nfs4.h>
71 #include <nfs/rnode4.h>
72 #include <nfs/nfs4_clnt.h>
73 
74 #include <vm/hat.h>
75 #include <vm/as.h>
76 #include <vm/page.h>
77 #include <vm/pvn.h>
78 #include <vm/seg.h>
79 #include <vm/seg_map.h>
80 #include <vm/seg_vn.h>
81 
82 #include <sys/ddi.h>
83 
/*
 * Arguments to page-flush thread (nfs4_pgflush_thread).
 *
 * nfs4_purge_caches() allocates one of these and takes a hold on both
 * members before creating the thread; the thread drops both holds and
 * frees the structure when it is done.
 */
typedef struct {
	vnode_t *vp;	/* file whose pages are flushed; held via VN_HOLD */
	cred_t *cr;	/* credentials for the flush; held via crhold */
} pgflush_t;
91 
#ifdef DEBUG
/* Debug-only variables; they exist only when DEBUG is defined. */
int nfs4_client_lease_debug;
int nfs4_sharedfh_debug;
int nfs4_fname_debug;

/* temporary: panic if v_type is inconsistent with r_attr va_type */
int nfs4_vtype_debug;

/*
 * Thread-specific-data key.  nfs4_purge_stale_fh() asserts that the
 * value stored under this key is NULL, i.e. that the ..._end_op() call
 * has already been made.
 */
uint_t nfs4_tsd_key;
#endif
102 
/* NOTE(review): presumably the time of the last CPR resume -- confirm. */
static time_t	nfs4_client_resumed = 0;
/* NOTE(review): presumably the id of the registered CPR callback -- confirm. */
static	callb_id_t cid = 0;

/* Forward declarations of file-local functions. */
static int	nfs4renew(nfs4_server_t *);
static void	nfs4_attrcache_va(vnode_t *, nfs4_ga_res_t *, int);
static void	nfs4_pgflush_thread(pgflush_t *);

static boolean_t nfs4_client_cpr_callb(void *, int);
111 
/*
 * Bookkeeping for the NFS v4 mounts of one zone.
 */
struct mi4_globals {
	kmutex_t	mig_lock;  /* lock protecting mig_list */
	list_t		mig_list;  /* list of NFS v4 mounts in zone */
	boolean_t	mig_destructor_called;
};

/*
 * NOTE(review): presumably the zone key under which each zone's
 * struct mi4_globals is registered -- confirm against the zone_key_create
 * caller.
 */
static zone_key_t mi4_list_key;
119 
120 /*
121  * Attributes caching:
122  *
123  * Attributes are cached in the rnode in struct vattr form.
124  * There is a time associated with the cached attributes (r_time_attr_inval)
125  * which tells whether the attributes are valid. The time is initialized
126  * to the difference between current time and the modify time of the vnode
127  * when new attributes are cached. This allows the attributes for
128  * files that have changed recently to be timed out sooner than for files
129  * that have not changed for a long time. There are minimum and maximum
130  * timeout values that can be set per mount point.
131  */
132 
133 /*
134  * If a cache purge is in progress, wait for it to finish.
135  *
136  * The current thread must not be in the middle of an
137  * nfs4_start_op/nfs4_end_op region.  Otherwise, there could be a deadlock
138  * between this thread, a recovery thread, and the page flush thread.
139  */
140 int
141 nfs4_waitfor_purge_complete(vnode_t *vp)
142 {
143 	rnode4_t *rp;
144 	k_sigset_t smask;
145 
146 	rp = VTOR4(vp);
147 	if ((rp->r_serial != NULL && rp->r_serial != curthread) ||
148 	    ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread)) {
149 		mutex_enter(&rp->r_statelock);
150 		sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
151 		while ((rp->r_serial != NULL && rp->r_serial != curthread) ||
152 		    ((rp->r_flags & R4PGFLUSH) &&
153 		    rp->r_pgflush != curthread)) {
154 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
155 				sigunintr(&smask);
156 				mutex_exit(&rp->r_statelock);
157 				return (EINTR);
158 			}
159 		}
160 		sigunintr(&smask);
161 		mutex_exit(&rp->r_statelock);
162 	}
163 	return (0);
164 }
165 
166 /*
167  * Validate caches by checking cached attributes. If they have timed out,
168  * then get new attributes from the server.  As a side effect, cache
169  * invalidation is done if the attributes have changed.
170  *
171  * If the attributes have not timed out and if there is a cache
172  * invalidation being done by some other thread, then wait until that
173  * thread has completed the cache invalidation.
174  */
175 int
176 nfs4_validate_caches(vnode_t *vp, cred_t *cr)
177 {
178 	int error;
179 	nfs4_ga_res_t gar;
180 
181 	if (ATTRCACHE4_VALID(vp)) {
182 		error = nfs4_waitfor_purge_complete(vp);
183 		if (error)
184 			return (error);
185 		return (0);
186 	}
187 
188 	gar.n4g_va.va_mask = AT_ALL;
189 	return (nfs4_getattr_otw(vp, &gar, cr, 0));
190 }
191 
192 /*
193  * Fill in attribute from the cache.
194  * If valid, then return 0 to indicate that no error occurred,
195  * otherwise return 1 to indicate that an error occurred.
196  */
197 static int
198 nfs4_getattr_cache(vnode_t *vp, struct vattr *vap)
199 {
200 	rnode4_t *rp;
201 
202 	rp = VTOR4(vp);
203 	mutex_enter(&rp->r_statelock);
204 	mutex_enter(&rp->r_statev4_lock);
205 	if (ATTRCACHE4_VALID(vp)) {
206 		mutex_exit(&rp->r_statev4_lock);
207 		/*
208 		 * Cached attributes are valid
209 		 */
210 		*vap = rp->r_attr;
211 		mutex_exit(&rp->r_statelock);
212 		return (0);
213 	}
214 	mutex_exit(&rp->r_statev4_lock);
215 	mutex_exit(&rp->r_statelock);
216 	return (1);
217 }
218 
219 
220 /*
221  * If returned error is ESTALE flush all caches.  The nfs4_purge_caches()
222  * call is synchronous because all the pages were invalidated by the
223  * nfs4_invalidate_pages() call.
224  */
225 void
226 nfs4_purge_stale_fh(int errno, vnode_t *vp, cred_t *cr)
227 {
228 	struct rnode4 *rp = VTOR4(vp);
229 
230 	/* Ensure that the ..._end_op() call has been done */
231 	ASSERT(tsd_get(nfs4_tsd_key) == NULL);
232 
233 	if (errno != ESTALE)
234 		return;
235 
236 	mutex_enter(&rp->r_statelock);
237 	rp->r_flags |= R4STALE;
238 	if (!rp->r_error)
239 		rp->r_error = errno;
240 	mutex_exit(&rp->r_statelock);
241 	if (nfs4_has_pages(vp))
242 		nfs4_invalidate_pages(vp, (u_offset_t)0, cr);
243 	nfs4_purge_caches(vp, NFS4_PURGE_DNLC, cr, FALSE);
244 }
245 
246 /*
247  * Purge all of the various NFS `data' caches.  If "asyncpg" is TRUE, the
248  * page purge is done asynchronously.
249  */
250 void
251 nfs4_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr, int asyncpg)
252 {
253 	rnode4_t *rp;
254 	char *contents;
255 	vnode_t *xattr;
256 	int size;
257 	int pgflush;			/* are we the page flush thread? */
258 
259 	/*
260 	 * Purge the DNLC for any entries which refer to this file.
261 	 */
262 	if (vp->v_count > 1 &&
263 	    (vp->v_type == VDIR || purge_dnlc == NFS4_PURGE_DNLC))
264 		dnlc_purge_vp(vp);
265 
266 	/*
267 	 * Clear any readdir state bits and purge the readlink response cache.
268 	 */
269 	rp = VTOR4(vp);
270 	mutex_enter(&rp->r_statelock);
271 	rp->r_flags &= ~R4LOOKUP;
272 	contents = rp->r_symlink.contents;
273 	size = rp->r_symlink.size;
274 	rp->r_symlink.contents = NULL;
275 
276 	xattr = rp->r_xattr_dir;
277 	rp->r_xattr_dir = NULL;
278 
279 	/*
280 	 * Purge pathconf cache too.
281 	 */
282 	rp->r_pathconf.pc4_xattr_valid = 0;
283 	rp->r_pathconf.pc4_cache_valid = 0;
284 
285 	pgflush = (curthread == rp->r_pgflush);
286 	mutex_exit(&rp->r_statelock);
287 
288 	if (contents != NULL) {
289 
290 		kmem_free((void *)contents, size);
291 	}
292 
293 	if (xattr != NULL)
294 		VN_RELE(xattr);
295 
296 	/*
297 	 * Flush the page cache.  If the current thread is the page flush
298 	 * thread, don't initiate a new page flush.  There's no need for
299 	 * it, and doing it correctly is hard.
300 	 */
301 	if (nfs4_has_pages(vp) && !pgflush) {
302 		if (!asyncpg) {
303 			(void) nfs4_waitfor_purge_complete(vp);
304 			nfs4_flush_pages(vp, cr);
305 		} else {
306 			pgflush_t *args;
307 
308 			/*
309 			 * We don't hold r_statelock while creating the
310 			 * thread, in case the call blocks.  So we use a
311 			 * flag to indicate that a page flush thread is
312 			 * active.
313 			 */
314 			mutex_enter(&rp->r_statelock);
315 			if (rp->r_flags & R4PGFLUSH) {
316 				mutex_exit(&rp->r_statelock);
317 			} else {
318 				rp->r_flags |= R4PGFLUSH;
319 				mutex_exit(&rp->r_statelock);
320 
321 				args = kmem_alloc(sizeof (pgflush_t),
322 				    KM_SLEEP);
323 				args->vp = vp;
324 				VN_HOLD(args->vp);
325 				args->cr = cr;
326 				crhold(args->cr);
327 				(void) zthread_create(NULL, 0,
328 				    nfs4_pgflush_thread, args, 0,
329 				    minclsyspri);
330 			}
331 		}
332 	}
333 
334 	/*
335 	 * Flush the readdir response cache.
336 	 */
337 	nfs4_purge_rddir_cache(vp);
338 }
339 
340 /*
341  * Invalidate all pages for the given file, after writing back the dirty
342  * ones.
343  */
344 
345 void
346 nfs4_flush_pages(vnode_t *vp, cred_t *cr)
347 {
348 	int error;
349 	rnode4_t *rp = VTOR4(vp);
350 
351 	error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr, NULL);
352 	if (error == ENOSPC || error == EDQUOT) {
353 		mutex_enter(&rp->r_statelock);
354 		if (!rp->r_error)
355 			rp->r_error = error;
356 		mutex_exit(&rp->r_statelock);
357 	}
358 }
359 
360 /*
361  * Page flush thread.
362  */
363 
364 static void
365 nfs4_pgflush_thread(pgflush_t *args)
366 {
367 	rnode4_t *rp = VTOR4(args->vp);
368 
369 	/* remember which thread we are, so we don't deadlock ourselves */
370 	mutex_enter(&rp->r_statelock);
371 	ASSERT(rp->r_pgflush == NULL);
372 	rp->r_pgflush = curthread;
373 	mutex_exit(&rp->r_statelock);
374 
375 	nfs4_flush_pages(args->vp, args->cr);
376 
377 	mutex_enter(&rp->r_statelock);
378 	rp->r_pgflush = NULL;
379 	rp->r_flags &= ~R4PGFLUSH;
380 	cv_broadcast(&rp->r_cv);
381 	mutex_exit(&rp->r_statelock);
382 
383 	VN_RELE(args->vp);
384 	crfree(args->cr);
385 	kmem_free(args, sizeof (pgflush_t));
386 	zthread_exit();
387 }
388 
389 /*
390  * Purge the readdir cache of all entries which are not currently
391  * being filled.
392  */
393 void
394 nfs4_purge_rddir_cache(vnode_t *vp)
395 {
396 	rnode4_t *rp;
397 
398 	rp = VTOR4(vp);
399 
400 	mutex_enter(&rp->r_statelock);
401 	rp->r_direof = NULL;
402 	rp->r_flags &= ~R4LOOKUP;
403 	rp->r_flags |= R4READDIRWATTR;
404 	rddir4_cache_purge(rp);
405 	mutex_exit(&rp->r_statelock);
406 }
407 
408 /*
409  * Set attributes cache for given vnode using virtual attributes.  There is
410  * no cache validation, but if the attributes are deemed to be stale, they
411  * are ignored.  This corresponds to nfs3_attrcache().
412  *
413  * Set the timeout value on the attribute cache and fill it
414  * with the passed in attributes.
415  */
416 void
417 nfs4_attrcache_noinval(vnode_t *vp, nfs4_ga_res_t *garp, hrtime_t t)
418 {
419 	rnode4_t *rp = VTOR4(vp);
420 
421 	mutex_enter(&rp->r_statelock);
422 	if (rp->r_time_attr_saved <= t)
423 		nfs4_attrcache_va(vp, garp, FALSE);
424 	mutex_exit(&rp->r_statelock);
425 }
426 
/*
 * Use the passed in virtual attributes to check to see whether the
 * data and metadata caches are valid, cache the new attributes, and
 * then do the cache invalidation if required.
 *
 * The cache validation and caching of the new attributes is done
 * atomically via the use of the mutex, r_statelock.  If required,
 * the cache invalidation is done atomically w.r.t. the cache
 * validation and caching of the attributes via the pseudo lock,
 * r_serial.
 *
 * This routine is used to do cache validation and attributes caching
 * for operations with a single set of post operation attributes.
 *
 * Parameters:
 *	vp	vnode whose caches are being validated
 *	garp	attributes just obtained for vp
 *	t	hrtime compared against r_time_attr_saved; if attributes
 *		were saved after t, garp is not cached
 *	cr	credentials handed to nfs4_purge_caches()
 *	async	handed to nfs4_purge_caches() as "asyncpg"; forced to 1
 *		when the caller is the recovery thread
 *	cinfo	change info from a directory modifying operation, or NULL
 *
 * The routine returns without caching garp when:
 *  - another thread holds r_serial and the caller is the recovery
 *    thread (the attribute cache is purged instead),
 *  - the wait for r_serial is interrupted by a signal,
 *  - a page flush thread other than the caller is active (the attribute
 *    cache is purged instead), or
 *  - attributes were saved after t (the attribute cache is purged if
 *    garp disagrees with what is cached).
 */

void
nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp,
    hrtime_t t, cred_t *cr, int async,
    change_info4 *cinfo)
{
	rnode4_t *rp;
	int mtime_changed = 0;
	int ctime_changed = 0;
	vsecattr_t *vsp;
	int was_serial, set_time_cache_inval, recov;
	vattr_t *vap = &garp->n4g_va;
	mntinfo4_t *mi = VTOMI4(vp);
	len_t preattr_rsize;
	boolean_t writemodify_set = B_FALSE;
	boolean_t cachepurge_set = B_FALSE;

	ASSERT(mi->mi_vfsp->vfs_dev == garp->n4g_va.va_fsid);

	/* Is curthread the recovery thread? */
	mutex_enter(&mi->mi_lock);
	recov = (VTOMI4(vp)->mi_recovthread == curthread);
	mutex_exit(&mi->mi_lock);

	rp = VTOR4(vp);
	mutex_enter(&rp->r_statelock);
	/* Remember whether this thread already owned r_serial on entry. */
	was_serial = (rp->r_serial == curthread);
	if (rp->r_serial && !was_serial) {
		klwp_t *lwp = ttolwp(curthread);

		/*
		 * If we're the recovery thread, then purge current attrs
		 * and bail out to avoid potential deadlock between another
		 * thread caching attrs (r_serial thread), recov thread,
		 * and an async writer thread.
		 */
		if (recov) {
			PURGE_ATTRCACHE4_LOCKED(rp);
			mutex_exit(&rp->r_statelock);
			return;
		}

		/*
		 * Wait for the current r_serial owner to finish.
		 * lwp_nostop is raised for the duration of the wait.  If
		 * the wait is interrupted, give up without caching the
		 * new attributes.
		 */
		if (lwp != NULL)
			lwp->lwp_nostop++;
		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				mutex_exit(&rp->r_statelock);
				if (lwp != NULL)
					lwp->lwp_nostop--;
				return;
			}
		}
		if (lwp != NULL)
			lwp->lwp_nostop--;
	}

	/*
	 * If there is a page flush thread, the current thread needs to
	 * bail out, to prevent a possible deadlock between the current
	 * thread (which might be in a start_op/end_op region), the
	 * recovery thread, and the page flush thread.  Expire the
	 * attribute cache, so that any attributes the current thread was
	 * going to set are not lost.
	 */
	if ((rp->r_flags & R4PGFLUSH) && rp->r_pgflush != curthread) {
		PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}

	if (rp->r_time_attr_saved > t) {
		/*
		 * Attributes have been cached since these attributes were
		 * probably made. If there is an inconsistency in what is
		 * cached, mark them invalid. If not, don't act on them.
		 */
		if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
			PURGE_ATTRCACHE4_LOCKED(rp);
		mutex_exit(&rp->r_statelock);
		return;
	}
	set_time_cache_inval = 0;
	if (cinfo) {
		/*
		 * Only directory modifying callers pass non-NULL cinfo.
		 */
		ASSERT(vp->v_type == VDIR);
		/*
		 * If the cache timeout either doesn't exist or hasn't expired,
		 * and dir didn't changed on server before dirmod op
		 * and dir didn't change after dirmod op but before getattr
		 * then there's a chance that the client's cached data for
		 * this object is current (not stale).  No immediate cache
		 * flush is required.
		 *
		 */
		if ((! rp->r_time_cache_inval || t < rp->r_time_cache_inval) &&
		    cinfo->before == rp->r_change &&
		    (garp->n4g_change_valid &&
		    cinfo->after == garp->n4g_change)) {

			/*
			 * If atomic isn't set, then the before/after info
			 * cannot be blindly trusted.  For this case, we tell
			 * nfs4_attrcache_va to cache the attrs but also
			 * establish an absolute maximum cache timeout.  When
			 * the timeout is reached, caches will be flushed.
			 */
			if (! cinfo->atomic)
				set_time_cache_inval = 1;
		} else {

			/*
			 * We're not sure exactly what changed, but we know
			 * what to do.  flush all caches for dir.  remove the
			 * attr timeout.
			 *
			 * a) timeout expired.  flush all caches.
			 * b) r_change != cinfo.before.  flush all caches.
			 * c) r_change == cinfo.before, but cinfo.after !=
			 *    post-op getattr(change).  flush all caches.
			 * d) post-op getattr(change) not provided by server.
			 *    flush all caches.
			 */
			mtime_changed = 1;
			ctime_changed = 1;
			rp->r_time_cache_inval = 0;
		}
	} else {
		/*
		 * Write thread after writing data to file on remote server,
		 * will always set R4WRITEMODIFIED to indicate that file on
		 * remote server was modified with a WRITE operation and would
		 * have marked attribute cache as timed out. If R4WRITEMODIFIED
		 * is set, then do not check for mtime and ctime change.
		 */
		if (!(rp->r_flags & R4WRITEMODIFIED)) {
			if (!CACHE4_VALID(rp, vap->va_mtime, vap->va_size))
				mtime_changed = 1;

			if (rp->r_attr.va_ctime.tv_sec !=
			    vap->va_ctime.tv_sec ||
			    rp->r_attr.va_ctime.tv_nsec !=
			    vap->va_ctime.tv_nsec)
				ctime_changed = 1;
		} else {
			writemodify_set = B_TRUE;
		}
	}

	/*
	 * Remember r_size so that a size change made by nfs4_attrcache_va()
	 * can be detected below.
	 */
	preattr_rsize = rp->r_size;

	nfs4_attrcache_va(vp, garp, set_time_cache_inval);

	/*
	 * If we have updated filesize in nfs4_attrcache_va, as soon as we
	 * drop statelock we will be in transition of purging all
	 * our caches and updating them. It is possible for another
	 * thread to pick this new file size and read in zeroed data.
	 * stall other threads till cache purge is complete.
	 */
	if ((!cinfo) && (rp->r_size != preattr_rsize)) {
		/*
		 * If R4WRITEMODIFIED was set and we have updated the file
		 * size, Server's returned file size need not necessarily
		 * be because of this Client's WRITE. We need to purge
		 * all caches.
		 */
		if (writemodify_set)
			mtime_changed = 1;

		/*
		 * cachepurge_set records that this call set R4INCACHEPURGE,
		 * so that only this call clears it again below.
		 */
		if (mtime_changed && !(rp->r_flags & R4INCACHEPURGE)) {
			rp->r_flags |= R4INCACHEPURGE;
			cachepurge_set = B_TRUE;
		}
	}

	/* Nothing changed: the new attributes are cached and we are done. */
	if (!mtime_changed && !ctime_changed) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	/*
	 * Take the r_serial pseudo lock for the invalidation.  It is
	 * released at the end unless this thread already held it on entry.
	 */
	rp->r_serial = curthread;

	mutex_exit(&rp->r_statelock);

	/*
	 * If we're the recov thread, then force async nfs4_purge_caches
	 * to avoid potential deadlock.
	 */
	if (mtime_changed)
		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, recov ? 1 : async);

	if ((rp->r_flags & R4INCACHEPURGE) && cachepurge_set) {
		mutex_enter(&rp->r_statelock);
		rp->r_flags &= ~R4INCACHEPURGE;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
		cachepurge_set = B_FALSE;
	}

	/* A ctime change invalidates the access cache and the cached ACL. */
	if (ctime_changed) {
		(void) nfs4_access_purge_rp(rp);
		if (rp->r_secattr != NULL) {
			/* Detach under the lock, free after dropping it. */
			mutex_enter(&rp->r_statelock);
			vsp = rp->r_secattr;
			rp->r_secattr = NULL;
			mutex_exit(&rp->r_statelock);
			if (vsp != NULL)
				nfs4_acl_free_cache(vsp);
		}
	}

	if (!was_serial) {
		mutex_enter(&rp->r_statelock);
		rp->r_serial = NULL;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
	}
}
661 
/*
 * Set attributes cache for given vnode using virtual attributes.
 *
 * Set the timeout value on the attribute cache and fill it
 * with the passed in attributes.
 *
 * Parameters:
 *	vp			vnode being updated; a shadow vnode is
 *				replaced by its master before use
 *	garp			attributes to cache; n4g_va.va_mask must be
 *				AT_ALL
 *	set_cache_timeout	if non-zero and no cache invalidation
 *				timeout is set yet, establish one of
 *				now + mi_acdirmax
 *
 * Besides r_attr and the attribute timers this also updates r_change,
 * r_mntd_fid, r_pathconf and r_size when the corresponding data is
 * present, and always clears R4WRITEMODIFIED.
 *
 * The caller must be holding r_statelock.
 */
static void
nfs4_attrcache_va(vnode_t *vp, nfs4_ga_res_t *garp, int set_cache_timeout)
{
	rnode4_t *rp;
	mntinfo4_t *mi;
	hrtime_t delta;
	hrtime_t now;
	vattr_t *vap = &garp->n4g_va;

	rp = VTOR4(vp);

	ASSERT(MUTEX_HELD(&rp->r_statelock));
	ASSERT(vap->va_mask == AT_ALL);

	/* Switch to master before checking v_flag */
	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	now = gethrtime();

	mi = VTOMI4(vp);

	/*
	 * Only establish a new cache timeout (if requested).  Never
	 * extend a timeout.  Never clear a timeout.  Clearing a timeout
	 * is done by nfs4_update_dircaches (ancestor in our call chain)
	 */
	if (set_cache_timeout && ! rp->r_time_cache_inval)
		rp->r_time_cache_inval = now + mi->mi_acdirmax;

	/*
	 * Delta is the number of nanoseconds that we will
	 * cache the attributes of the file.  It is based on
	 * the number of nanoseconds since the last time that
	 * we detected a change.  The assumption is that files
	 * that changed recently are likely to change again.
	 * There is a minimum and a maximum for regular files
	 * and for directories which is enforced though.
	 *
	 * Using the time since last change was detected
	 * eliminates direct comparison or calculation
	 * using mixed client and server times.  NFS does
	 * not make any assumptions regarding the client
	 * and server clocks being synchronized.
	 */
	if (vap->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
	    vap->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
	    vap->va_size != rp->r_attr.va_size) {
		/* mtime or size differs from the cache: a change was seen */
		rp->r_time_attr_saved = now;
	}

	/* No attribute caching for noac mounts or VNOCACHE vnodes. */
	if ((mi->mi_flags & MI4_NOAC) || (vp->v_flag & VNOCACHE))
		delta = 0;
	else {
		/* Clamp delta to the per-mount min/max for this file type. */
		delta = now - rp->r_time_attr_saved;
		if (vp->v_type == VDIR) {
			if (delta < mi->mi_acdirmin)
				delta = mi->mi_acdirmin;
			else if (delta > mi->mi_acdirmax)
				delta = mi->mi_acdirmax;
		} else {
			if (delta < mi->mi_acregmin)
				delta = mi->mi_acregmin;
			else if (delta > mi->mi_acregmax)
				delta = mi->mi_acregmax;
		}
	}
	rp->r_time_attr_inval = now + delta;

	rp->r_attr = *vap;
	if (garp->n4g_change_valid)
		rp->r_change = garp->n4g_change;

	/*
	 * The attributes that were returned may be valid and can
	 * be used, but they may not be allowed to be cached.
	 * Reset the timers to cause immediate invalidation and
	 * clear r_change so no VERIFY operations will succeed
	 */
	if (garp->n4g_attrwhy == NFS4_GETATTR_NOCACHE_OK) {
		rp->r_time_attr_inval = now;
		rp->r_time_attr_saved = now;
		rp->r_change = 0;
	}

	/*
	 * If mounted_on_fileid returned AND the object is a stub,
	 * then set object's va_nodeid to the mounted over fid
	 * returned by server.
	 *
	 * If mounted_on_fileid not provided/supported, then
	 * just set it to 0 for now.  Eventually it would be
	 * better to set it to a hashed version of FH.  This
	 * would probably be good enough to provide a unique
	 * fid/d_ino within a dir.
	 *
	 * We don't need to carry mounted_on_fileid in the
	 * rnode as long as the client never requests fileid
	 * without also requesting mounted_on_fileid.  For
	 * now, it stays.
	 */
	if (garp->n4g_mon_fid_valid) {
		rp->r_mntd_fid = garp->n4g_mon_fid;

		if (RP_ISSTUB(rp))
			rp->r_attr.va_nodeid = rp->r_mntd_fid;
	}

	/*
	 * Check to see if there are valid pathconf bits to
	 * cache in the rnode.
	 */
	if (garp->n4g_ext_res) {
		if (garp->n4g_ext_res->n4g_pc4.pc4_cache_valid) {
			rp->r_pathconf = garp->n4g_ext_res->n4g_pc4;
		} else {
			/* Only the xattr bits are valid; copy just those. */
			if (garp->n4g_ext_res->n4g_pc4.pc4_xattr_valid) {
				rp->r_pathconf.pc4_xattr_valid = TRUE;
				rp->r_pathconf.pc4_xattr_exists =
				    garp->n4g_ext_res->n4g_pc4.pc4_xattr_exists;
			}
		}
	}
	/*
	 * Update the size of the file if there is no cached data or if
	 * the cached data is clean and there is no data being written
	 * out.
	 */
	if (rp->r_size != vap->va_size &&
	    (!vn_has_cached_data(vp) ||
	    (!(rp->r_flags & R4DIRTY) && rp->r_count == 0))) {
		rp->r_size = vap->va_size;
	}
	nfs_setswaplike(vp, vap);
	rp->r_flags &= ~R4WRITEMODIFIED;
}
806 
807 /*
808  * Get attributes over-the-wire and update attributes cache
809  * if no error occurred in the over-the-wire operation.
810  * Return 0 if successful, otherwise error.
811  */
812 int
813 nfs4_getattr_otw(vnode_t *vp, nfs4_ga_res_t *garp, cred_t *cr, int get_acl)
814 {
815 	mntinfo4_t *mi = VTOMI4(vp);
816 	hrtime_t t;
817 	nfs4_recov_state_t recov_state;
818 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
819 
820 	recov_state.rs_flags = 0;
821 	recov_state.rs_num_retry_despite_err = 0;
822 
823 	/* Save the original mount point security flavor */
824 	(void) save_mnt_secinfo(mi->mi_curr_serv);
825 
826 recov_retry:
827 
828 	if ((e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR,
829 	    &recov_state, NULL))) {
830 		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
831 		return (e.error);
832 	}
833 
834 	t = gethrtime();
835 
836 	nfs4_getattr_otw_norecovery(vp, garp, &e, cr, get_acl);
837 
838 	if (nfs4_needs_recovery(&e, FALSE, vp->v_vfsp)) {
839 		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
840 		    NULL, OP_GETATTR, NULL, NULL, NULL) == FALSE)  {
841 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR,
842 			    &recov_state, 1);
843 			goto recov_retry;
844 		}
845 	}
846 
847 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state, 0);
848 
849 	if (!e.error) {
850 		if (e.stat == NFS4_OK) {
851 			nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
852 		} else {
853 			e.error = geterrno4(e.stat);
854 
855 			nfs4_purge_stale_fh(e.error, vp, cr);
856 		}
857 	}
858 
859 	/*
860 	 * If getattr a node that is a stub for a crossed
861 	 * mount point, keep the original secinfo flavor for
862 	 * the current file system, not the crossed one.
863 	 */
864 	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
865 
866 	return (e.error);
867 }
868 
869 /*
870  * Generate a compound to get attributes over-the-wire.
871  */
872 void
873 nfs4_getattr_otw_norecovery(vnode_t *vp, nfs4_ga_res_t *garp,
874     nfs4_error_t *ep, cred_t *cr, int get_acl)
875 {
876 	COMPOUND4args_clnt args;
877 	COMPOUND4res_clnt res;
878 	int doqueue;
879 	rnode4_t *rp = VTOR4(vp);
880 	nfs_argop4 argop[2];
881 
882 	args.ctag = TAG_GETATTR;
883 
884 	args.array_len = 2;
885 	args.array = argop;
886 
887 	/* putfh */
888 	argop[0].argop = OP_CPUTFH;
889 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
890 
891 	/* getattr */
892 	/*
893 	 * Unlike nfs version 2 and 3, where getattr returns all the
894 	 * attributes, nfs version 4 returns only the ones explicitly
895 	 * asked for. This creates problems, as some system functions
896 	 * (e.g. cache check) require certain attributes and if the
897 	 * cached node lacks some attributes such as uid/gid, it can
898 	 * affect system utilities (e.g. "ls") that rely on the information
899 	 * to be there. This can lead to anything from system crashes to
900 	 * corrupted information processed by user apps.
901 	 * So to ensure that all bases are covered, request at least
902 	 * the AT_ALL attribute mask.
903 	 */
904 	argop[1].argop = OP_GETATTR;
905 	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
906 	if (get_acl)
907 		argop[1].nfs_argop4_u.opgetattr.attr_request |= FATTR4_ACL_MASK;
908 	argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
909 
910 	doqueue = 1;
911 
912 	rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, ep);
913 
914 	if (ep->error)
915 		return;
916 
917 	if (res.status != NFS4_OK) {
918 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
919 		return;
920 	}
921 
922 	*garp = res.array[1].nfs_resop4_u.opgetattr.ga_res;
923 
924 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
925 }
926 
927 /*
928  * Return either cached or remote attributes. If get remote attr
929  * use them to check and invalidate caches, then cache the new attributes.
930  */
931 int
932 nfs4getattr(vnode_t *vp, vattr_t *vap, cred_t *cr)
933 {
934 	int error;
935 	rnode4_t *rp;
936 	nfs4_ga_res_t gar;
937 
938 	ASSERT(nfs4_consistent_type(vp));
939 
940 	/*
941 	 * If we've got cached attributes, we're done, otherwise go
942 	 * to the server to get attributes, which will update the cache
943 	 * in the process. Either way, use the cached attributes for
944 	 * the caller's vattr_t.
945 	 *
946 	 * Note that we ignore the gar set by the OTW call: the attr caching
947 	 * code may make adjustments when storing to the rnode, and we want
948 	 * to see those changes here.
949 	 */
950 	rp = VTOR4(vp);
951 	error = 0;
952 	mutex_enter(&rp->r_statelock);
953 	if (!ATTRCACHE4_VALID(vp)) {
954 		mutex_exit(&rp->r_statelock);
955 		error = nfs4_getattr_otw(vp, &gar, cr, 0);
956 		mutex_enter(&rp->r_statelock);
957 	}
958 
959 	if (!error)
960 		*vap = rp->r_attr;
961 
962 	/* Return the client's view of file size */
963 	vap->va_size = rp->r_size;
964 
965 	mutex_exit(&rp->r_statelock);
966 
967 	ASSERT(nfs4_consistent_type(vp));
968 
969 	return (error);
970 }
971 
972 int
973 nfs4_attr_otw(vnode_t *vp, nfs4_tag_type_t tag_type,
974     nfs4_ga_res_t *garp, bitmap4 reqbitmap, cred_t *cr)
975 {
976 	COMPOUND4args_clnt args;
977 	COMPOUND4res_clnt res;
978 	int doqueue;
979 	nfs_argop4 argop[2];
980 	mntinfo4_t *mi = VTOMI4(vp);
981 	bool_t needrecov = FALSE;
982 	nfs4_recov_state_t recov_state;
983 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
984 	nfs4_ga_ext_res_t *gerp;
985 
986 	recov_state.rs_flags = 0;
987 	recov_state.rs_num_retry_despite_err = 0;
988 
989 recov_retry:
990 	args.ctag = tag_type;
991 
992 	args.array_len = 2;
993 	args.array = argop;
994 
995 	e.error = nfs4_start_fop(mi, vp, NULL, OH_GETATTR, &recov_state, NULL);
996 	if (e.error)
997 		return (e.error);
998 
999 	/* putfh */
1000 	argop[0].argop = OP_CPUTFH;
1001 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
1002 
1003 	/* getattr */
1004 	argop[1].argop = OP_GETATTR;
1005 	argop[1].nfs_argop4_u.opgetattr.attr_request = reqbitmap;
1006 	argop[1].nfs_argop4_u.opgetattr.mi = mi;
1007 
1008 	doqueue = 1;
1009 
1010 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1011 	    "nfs4_attr_otw: %s call, rp %s", needrecov ? "recov" : "first",
1012 	    rnode4info(VTOR4(vp))));
1013 
1014 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1015 
1016 	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
1017 	if (!needrecov && e.error) {
1018 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1019 		    needrecov);
1020 		return (e.error);
1021 	}
1022 
1023 	if (needrecov) {
1024 		bool_t abort;
1025 
1026 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1027 		    "nfs4_attr_otw: initiating recovery\n"));
1028 
1029 		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
1030 		    NULL, OP_GETATTR, NULL, NULL, NULL);
1031 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1032 		    needrecov);
1033 		if (!e.error) {
1034 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1035 			e.error = geterrno4(res.status);
1036 		}
1037 		if (abort == FALSE)
1038 			goto recov_retry;
1039 		return (e.error);
1040 	}
1041 
1042 	if (res.status) {
1043 		e.error = geterrno4(res.status);
1044 	} else {
1045 		gerp = garp->n4g_ext_res;
1046 		bcopy(&res.array[1].nfs_resop4_u.opgetattr.ga_res,
1047 		    garp, sizeof (nfs4_ga_res_t));
1048 		garp->n4g_ext_res = gerp;
1049 		if (garp->n4g_ext_res &&
1050 		    res.array[1].nfs_resop4_u.opgetattr.ga_res.n4g_ext_res)
1051 			bcopy(res.array[1].nfs_resop4_u.opgetattr.
1052 			    ga_res.n4g_ext_res,
1053 			    garp->n4g_ext_res, sizeof (nfs4_ga_ext_res_t));
1054 	}
1055 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1056 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_GETATTR, &recov_state,
1057 	    needrecov);
1058 	return (e.error);
1059 }
1060 
1061 /*
1062  * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
1063  * for the demand-based allocation of async threads per-mount.  The
1064  * nfs_async_timeout is the amount of time a thread will live after it
1065  * becomes idle, unless new I/O requests are received before the thread
 * dies.  See nfs4_async_putapage and nfs4_async_start.
1067  */
1068 
/*
 * Forward declaration: nfs4_async_manager() hands this worker entry point
 * to zthread_create() before the definition appears further down.
 */
static void	nfs4_async_start(struct vfs *);
1070 
1071 static void
1072 free_async_args4(struct nfs4_async_reqs *args)
1073 {
1074 	rnode4_t *rp;
1075 
1076 	if (args->a_io != NFS4_INACTIVE) {
1077 		rp = VTOR4(args->a_vp);
1078 		mutex_enter(&rp->r_statelock);
1079 		rp->r_count--;
1080 		if (args->a_io == NFS4_PUTAPAGE ||
1081 		    args->a_io == NFS4_PAGEIO)
1082 			rp->r_awcount--;
1083 		cv_broadcast(&rp->r_cv);
1084 		mutex_exit(&rp->r_statelock);
1085 		VN_RELE(args->a_vp);
1086 	}
1087 	crfree(args->a_cred);
1088 	kmem_free(args, sizeof (*args));
1089 }
1090 
/*
 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
 * pageout(), running in the global zone, have legitimate reasons to do
 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
 * use of a per-mount "asynchronous requests manager thread" which is
 * signaled by the various asynchronous work routines when there is
 * asynchronous work to be done.  It is responsible for creating new
 * worker threads if necessary, and notifying existing worker threads
 * that there is work to be done.
 *
 * In other words, it will "take the specifications from the customers and
 * give them to the engineers."
 *
 * Worker threads die off of their own accord if they are no longer
 * needed.
 *
 * This thread is killed when the zone is going away or the filesystem
 * is being unmounted.
 *
 * The thread runs with mi_async_lock held except while it is creating a
 * worker or sleeping on mi_async_reqs_cv.  It does not return; it ends
 * by calling zthread_exit().
 */
void
nfs4_async_manager(vfs_t *vfsp)
{
	callb_cpr_t cprinfo;
	mntinfo4_t *mi;
	uint_t max_threads;

	mi = VFTOMI4(vfsp);

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
	    "nfs4_async_manager");

	mutex_enter(&mi->mi_async_lock);
	/*
	 * We want to stash the max number of threads that this mount was
	 * allowed so we can use it later when the variable is set to zero as
	 * part of the zone/mount going away.
	 *
	 * We want to be able to create at least one thread to handle
	 * asynchronous inactive calls.
	 */
	max_threads = MAX(mi->mi_max_threads, 1);
	/*
	 * We don't want to wait for mi_max_threads to go to zero, since that
	 * happens as part of a failed unmount, but this thread should only
	 * exit when the mount is really going away.
	 *
	 * Once MI4_ASYNC_MGR_STOP is set, no more async operations will be
	 * attempted: the various _async_*() functions know to do things
	 * inline if mi_max_threads == 0.  Henceforth we just drain out the
	 * outstanding requests.
	 *
	 * Note that we still create zthreads even if we notice the zone is
	 * shutting down (MI4_ASYNC_MGR_STOP is set); this may cause the zone
	 * shutdown sequence to take slightly longer in some cases, but
	 * doesn't violate the protocol, as all threads will exit as soon as
	 * they're done processing the remaining requests.
	 */
	for (;;) {
		/*
		 * The queueing routines bump mi_async_req_count once per
		 * request; hand out one worker wakeup per counted request.
		 */
		while (mi->mi_async_req_count > 0) {
			/*
			 * Paranoia: If the mount started out having
			 * (mi->mi_max_threads == 0), and the value was
			 * later changed (via a debugger or somesuch),
			 * we could be confused since we will think we
			 * can't create any threads, and the calling
			 * code (which looks at the current value of
			 * mi->mi_max_threads, now non-zero) thinks we
			 * can.
			 *
			 * So, because we're paranoid, we create threads
			 * up to the maximum of the original and the
			 * current value. This means that future
			 * (debugger-induced) alterations of
			 * mi->mi_max_threads are ignored for our
			 * purposes, but who told them they could change
			 * random values on a live kernel anyhow?
			 */
			if (mi->mi_threads <
			    MAX(mi->mi_max_threads, max_threads)) {
				/*
				 * Count the worker before dropping the lock;
				 * mi_async_lock is not held across the
				 * thread creation itself.  The holds taken
				 * here are released by nfs4_async_start()
				 * when that worker exits.
				 */
				mi->mi_threads++;
				mutex_exit(&mi->mi_async_lock);
				MI4_HOLD(mi);
				VFS_HOLD(vfsp);	/* hold for new thread */
				(void) zthread_create(NULL, 0, nfs4_async_start,
				    vfsp, 0, minclsyspri);
				mutex_enter(&mi->mi_async_lock);
			}
			cv_signal(&mi->mi_async_work_cv);
			ASSERT(mi->mi_async_req_count != 0);
			mi->mi_async_req_count--;
		}

		/*
		 * The stop flag lives in mi_flags, which is examined under
		 * mi_lock.  It is only checked once the request count has
		 * been drained above.
		 */
		mutex_enter(&mi->mi_lock);
		if (mi->mi_flags & MI4_ASYNC_MGR_STOP) {
			mutex_exit(&mi->mi_lock);
			break;
		}
		mutex_exit(&mi->mi_lock);

		/* Sleep until more work is queued or a stop is requested. */
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
	}

	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
	    "nfs4_async_manager exiting for vfs %p\n", (void *)mi->mi_vfsp));
	/*
	 * Let everyone know we're done.
	 */
	mi->mi_manager_thread = NULL;
	/*
	 * Wake up the inactive thread.
	 */
	cv_broadcast(&mi->mi_inact_req_cv);
	/*
	 * Wake up anyone sitting in nfs4_async_manager_stop()
	 */
	cv_broadcast(&mi->mi_async_cv);
	/*
	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
	 * since CALLB_CPR_EXIT is actually responsible for releasing
	 * 'mi_async_lock'.
	 */
	CALLB_CPR_EXIT(&cprinfo);
	/*
	 * NOTE(review): the matching holds are not taken in this function;
	 * presumably whoever created this thread took them -- confirm.
	 */
	VFS_RELE(vfsp);	/* release thread's hold */
	MI4_RELE(mi);
	zthread_exit();
}
1219 
1220 /*
1221  * Signal (and wait for) the async manager thread to clean up and go away.
1222  */
1223 void
1224 nfs4_async_manager_stop(vfs_t *vfsp)
1225 {
1226 	mntinfo4_t *mi = VFTOMI4(vfsp);
1227 
1228 	mutex_enter(&mi->mi_async_lock);
1229 	mutex_enter(&mi->mi_lock);
1230 	mi->mi_flags |= MI4_ASYNC_MGR_STOP;
1231 	mutex_exit(&mi->mi_lock);
1232 	cv_broadcast(&mi->mi_async_reqs_cv);
1233 	/*
1234 	 * Wait for the async manager thread to die.
1235 	 */
1236 	while (mi->mi_manager_thread != NULL)
1237 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1238 	mutex_exit(&mi->mi_async_lock);
1239 }
1240 
1241 int
1242 nfs4_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1243     struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1244     u_offset_t, caddr_t, struct seg *, cred_t *))
1245 {
1246 	rnode4_t *rp;
1247 	mntinfo4_t *mi;
1248 	struct nfs4_async_reqs *args;
1249 
1250 	rp = VTOR4(vp);
1251 	ASSERT(rp->r_freef == NULL);
1252 
1253 	mi = VTOMI4(vp);
1254 
1255 	/*
1256 	 * If addr falls in a different segment, don't bother doing readahead.
1257 	 */
1258 	if (addr >= seg->s_base + seg->s_size)
1259 		return (-1);
1260 
1261 	/*
1262 	 * If we can't allocate a request structure, punt on the readahead.
1263 	 */
1264 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1265 		return (-1);
1266 
1267 	/*
1268 	 * If a lock operation is pending, don't initiate any new
1269 	 * readaheads.  Otherwise, bump r_count to indicate the new
1270 	 * asynchronous I/O.
1271 	 */
1272 	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1273 		kmem_free(args, sizeof (*args));
1274 		return (-1);
1275 	}
1276 	mutex_enter(&rp->r_statelock);
1277 	rp->r_count++;
1278 	mutex_exit(&rp->r_statelock);
1279 	nfs_rw_exit(&rp->r_lkserlock);
1280 
1281 	args->a_next = NULL;
1282 #ifdef DEBUG
1283 	args->a_queuer = curthread;
1284 #endif
1285 	VN_HOLD(vp);
1286 	args->a_vp = vp;
1287 	ASSERT(cr != NULL);
1288 	crhold(cr);
1289 	args->a_cred = cr;
1290 	args->a_io = NFS4_READ_AHEAD;
1291 	args->a_nfs4_readahead = readahead;
1292 	args->a_nfs4_blkoff = blkoff;
1293 	args->a_nfs4_seg = seg;
1294 	args->a_nfs4_addr = addr;
1295 
1296 	mutex_enter(&mi->mi_async_lock);
1297 
1298 	/*
1299 	 * If asyncio has been disabled, don't bother readahead.
1300 	 */
1301 	if (mi->mi_max_threads == 0) {
1302 		mutex_exit(&mi->mi_async_lock);
1303 		goto noasync;
1304 	}
1305 
1306 	/*
1307 	 * Link request structure into the async list and
1308 	 * wakeup async thread to do the i/o.
1309 	 */
1310 	if (mi->mi_async_reqs[NFS4_READ_AHEAD] == NULL) {
1311 		mi->mi_async_reqs[NFS4_READ_AHEAD] = args;
1312 		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1313 	} else {
1314 		mi->mi_async_tail[NFS4_READ_AHEAD]->a_next = args;
1315 		mi->mi_async_tail[NFS4_READ_AHEAD] = args;
1316 	}
1317 
1318 	if (mi->mi_io_kstats) {
1319 		mutex_enter(&mi->mi_lock);
1320 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1321 		mutex_exit(&mi->mi_lock);
1322 	}
1323 
1324 	mi->mi_async_req_count++;
1325 	ASSERT(mi->mi_async_req_count != 0);
1326 	cv_signal(&mi->mi_async_reqs_cv);
1327 	mutex_exit(&mi->mi_async_lock);
1328 	return (0);
1329 
1330 noasync:
1331 	mutex_enter(&rp->r_statelock);
1332 	rp->r_count--;
1333 	cv_broadcast(&rp->r_cv);
1334 	mutex_exit(&rp->r_statelock);
1335 	VN_RELE(vp);
1336 	crfree(cr);
1337 	kmem_free(args, sizeof (*args));
1338 	return (-1);
1339 }
1340 
1341 /*
1342  * The async queues for each mounted file system are arranged as a
1343  * set of queues, one for each async i/o type.  Requests are taken
1344  * from the queues in a round-robin fashion.  A number of consecutive
1345  * requests are taken from each queue before moving on to the next
1346  * queue.  This functionality may allow the NFS Version 2 server to do
1347  * write clustering, even if the client is mixing writes and reads
1348  * because it will take multiple write requests from the queue
1349  * before processing any of the other async i/o types.
1350  *
1351  * XXX The nfs4_async_start thread is unsafe in the light of the present
1352  * model defined by cpr to suspend the system. Specifically over the
1353  * wire calls are cpr-unsafe. The thread should be reevaluated in
1354  * case of future updates to the cpr model.
1355  */
/*
 * Body of an async worker thread for the mount behind vfsp.  The worker
 * loops pulling requests off the per-type queues and executing them, and
 * exits (via zthread_exit()) once it finds no work and either async i/o
 * has been disabled (mi_max_threads == 0) or its idle wait timed out.
 */
static void
nfs4_async_start(struct vfs *vfsp)
{
	struct nfs4_async_reqs *args;
	mntinfo4_t *mi = VFTOMI4(vfsp);
	/* Starts positive so the first idle pass is not taken for a timeout. */
	clock_t time_left = 1;
	callb_cpr_t cprinfo;
	int i;
	extern int nfs_async_timeout;

	/*
	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
	 * built in an implementation independent manner.
	 */
	if (nfs_async_timeout == -1)
		nfs_async_timeout = NFS_ASYNC_TIMEOUT;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");

	mutex_enter(&mi->mi_async_lock);
	for (;;) {
		/*
		 * Find the next queue containing an entry.  We start
		 * at the current queue pointer and then round robin
		 * through all of them until we either find a non-empty
		 * queue or have looked through all of them.
		 */
		for (i = 0; i < NFS4_ASYNC_TYPES; i++) {
			args = *mi->mi_async_curr;
			if (args != NULL)
				break;
			mi->mi_async_curr++;
			/* Wrap the cursor at the end of the queue array. */
			if (mi->mi_async_curr ==
			    &mi->mi_async_reqs[NFS4_ASYNC_TYPES])
				mi->mi_async_curr = &mi->mi_async_reqs[0];
		}
		/*
		 * If we didn't find an entry, then block until woken up
		 * again and then look through the queues again.
		 */
		if (args == NULL) {
			/*
			 * Exiting is considered to be safe for CPR as well
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);

			/*
			 * Wakeup thread waiting to unmount the file
			 * system only if all async threads are inactive.
			 *
			 * If we've timed-out and there's nothing to do,
			 * then get rid of this thread.
			 */
			if (mi->mi_max_threads == 0 || time_left <= 0) {
				if (--mi->mi_threads == 0)
					cv_signal(&mi->mi_async_cv);
				/*
				 * No explicit mutex_exit() here: as noted
				 * in nfs4_async_manager(), CALLB_CPR_EXIT
				 * releases mi_async_lock.
				 */
				CALLB_CPR_EXIT(&cprinfo);
				VFS_RELE(vfsp);	/* release thread's hold */
				MI4_RELE(mi);
				zthread_exit();
				/* NOTREACHED */
			}
			/*
			 * Sleep until the manager signals new work; a
			 * non-positive result is treated as a timeout by
			 * the check above on the next pass.
			 */
			time_left = cv_reltimedwait(&mi->mi_async_work_cv,
			    &mi->mi_async_lock, nfs_async_timeout,
			    TR_CLOCK_TICK);

			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);

			continue;
		} else {
			/* Found work: restart the idle timeout. */
			time_left = 1;
		}

		/*
		 * Remove the request from the async queue and then
		 * update the current async request queue pointer.  If
		 * the current queue is empty or we have removed enough
		 * consecutive entries from it, then reset the counter
		 * for this queue and then move the current pointer to
		 * the next queue.
		 */
		*mi->mi_async_curr = args->a_next;
		if (*mi->mi_async_curr == NULL ||
		    --mi->mi_async_clusters[args->a_io] == 0) {
			mi->mi_async_clusters[args->a_io] =
			    mi->mi_async_init_clusters;
			mi->mi_async_curr++;
			if (mi->mi_async_curr ==
			    &mi->mi_async_reqs[NFS4_ASYNC_TYPES])
				mi->mi_async_curr = &mi->mi_async_reqs[0];
		}

		/*
		 * Balance the kstat_waitq_enter() done when the request was
		 * queued; NFS4_INACTIVE requests are excluded here.
		 */
		if (args->a_io != NFS4_INACTIVE && mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		/* The request itself is processed without mi_async_lock. */
		mutex_exit(&mi->mi_async_lock);

		/*
		 * Obtain arguments from the async request structure.
		 *
		 * A readahead is only performed while async i/o is still
		 * enabled (mi_max_threads is read here without the lock);
		 * otherwise the request falls through the whole chain and
		 * is simply released below.
		 */
		if (args->a_io == NFS4_READ_AHEAD && mi->mi_max_threads > 0) {
			(*args->a_nfs4_readahead)(args->a_vp,
			    args->a_nfs4_blkoff, args->a_nfs4_addr,
			    args->a_nfs4_seg, args->a_cred);
		} else if (args->a_io == NFS4_PUTAPAGE) {
			(void) (*args->a_nfs4_putapage)(args->a_vp,
			    args->a_nfs4_pp, args->a_nfs4_off,
			    args->a_nfs4_len, args->a_nfs4_flags,
			    args->a_cred);
		} else if (args->a_io == NFS4_PAGEIO) {
			(void) (*args->a_nfs4_pageio)(args->a_vp,
			    args->a_nfs4_pp, args->a_nfs4_off,
			    args->a_nfs4_len, args->a_nfs4_flags,
			    args->a_cred);
		} else if (args->a_io == NFS4_READDIR) {
			(void) ((*args->a_nfs4_readdir)(args->a_vp,
			    args->a_nfs4_rdc, args->a_cred));
		} else if (args->a_io == NFS4_COMMIT) {
			(*args->a_nfs4_commit)(args->a_vp, args->a_nfs4_plist,
			    args->a_nfs4_offset, args->a_nfs4_count,
			    args->a_cred);
		} else if (args->a_io == NFS4_INACTIVE) {
			nfs4_inactive_otw(args->a_vp, args->a_cred);
		}

		/*
		 * Now, release the vnode and free the credentials
		 * structure.
		 */
		free_async_args4(args);
		/*
		 * Reacquire the mutex because it will be needed above.
		 */
		mutex_enter(&mi->mi_async_lock);
	}
}
1495 
1496 /*
1497  * nfs4_inactive_thread - look for vnodes that need over-the-wire calls as
1498  * part of VOP_INACTIVE.
1499  */
1500 
/*
 * Thread body: services the NFS4_INACTIVE request queue of mount mi by
 * calling nfs4_inactive_otw() for each queued request.  The thread exits
 * (via zthread_exit()) once the queue is empty and the async manager
 * thread has gone away (mi_manager_thread == NULL).
 */
void
nfs4_inactive_thread(mntinfo4_t *mi)
{
	struct nfs4_async_reqs *args;
	callb_cpr_t cprinfo;
	/* Saved up front; only used by the debug message at exit. */
	vfs_t *vfsp = mi->mi_vfsp;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
	    "nfs4_inactive_thread");

	for (;;) {
		mutex_enter(&mi->mi_async_lock);
		args = mi->mi_async_reqs[NFS4_INACTIVE];
		if (args == NULL) {
			mutex_enter(&mi->mi_lock);
			/*
			 * We don't want to exit until the async manager is done
			 * with its work; hence the check for mi_manager_thread
			 * being NULL.
			 *
			 * The async manager thread will cv_broadcast() on
			 * mi_inact_req_cv when it's done, at which point we'll
			 * wake up and exit.
			 */
			if (mi->mi_manager_thread == NULL)
				goto die;
			/*
			 * Nothing to do: advertise that we are idle and wake
			 * a waiter on mi_async_cv (nfs4_async_stop() and
			 * nfs4_async_stop_sig() sleep there waiting for this
			 * flag), then sleep until more work arrives.
			 */
			mi->mi_flags |= MI4_INACTIVE_IDLE;
			mutex_exit(&mi->mi_lock);
			cv_signal(&mi->mi_async_cv);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&mi->mi_inact_req_cv, &mi->mi_async_lock);
			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
			mutex_exit(&mi->mi_async_lock);
		} else {
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~MI4_INACTIVE_IDLE;
			mutex_exit(&mi->mi_lock);
			/* Unlink the head request; only the head is updated. */
			mi->mi_async_reqs[NFS4_INACTIVE] = args->a_next;
			mutex_exit(&mi->mi_async_lock);
			/*
			 * Do the over-the-wire work without any locks held.
			 * NOTE(review): no VN_RELE() follows; presumably the
			 * vnode reference handed over with the request is
			 * consumed by nfs4_inactive_otw() -- confirm.
			 */
			nfs4_inactive_otw(args->a_vp, args->a_cred);
			crfree(args->a_cred);
			kmem_free(args, sizeof (*args));
		}
	}
die:
	/* Reached with both mi_async_lock and mi_lock held. */
	mutex_exit(&mi->mi_lock);
	mi->mi_inactive_thread = NULL;
	cv_signal(&mi->mi_async_cv);

	/*
	 * There is no explicit call to mutex_exit(&mi->mi_async_lock) since
	 * CALLB_CPR_EXIT is actually responsible for releasing 'mi_async_lock'.
	 */
	CALLB_CPR_EXIT(&cprinfo);

	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
	    "nfs4_inactive_thread exiting for vfs %p\n", (void *)vfsp));

	/*
	 * NOTE(review): the matching MI4_HOLD() is not taken in this
	 * function; presumably the creator of this thread took it -- confirm.
	 */
	MI4_RELE(mi);
	zthread_exit();
	/* NOTREACHED */
}
1563 
1564 /*
1565  * nfs_async_stop:
1566  * Wait for all outstanding putpage operations and the inactive thread to
1567  * complete; nfs4_async_stop_sig() without interruptibility.
1568  */
1569 void
1570 nfs4_async_stop(struct vfs *vfsp)
1571 {
1572 	mntinfo4_t *mi = VFTOMI4(vfsp);
1573 
1574 	/*
1575 	 * Wait for all outstanding async operations to complete and for
1576 	 * worker threads to exit.
1577 	 */
1578 	mutex_enter(&mi->mi_async_lock);
1579 	mi->mi_max_threads = 0;
1580 	cv_broadcast(&mi->mi_async_work_cv);
1581 	while (mi->mi_threads != 0)
1582 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1583 
1584 	/*
1585 	 * Wait for the inactive thread to finish doing what it's doing.  It
1586 	 * won't exit until the last reference to the vfs_t goes away.
1587 	 */
1588 	if (mi->mi_inactive_thread != NULL) {
1589 		mutex_enter(&mi->mi_lock);
1590 		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1591 		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1592 			mutex_exit(&mi->mi_lock);
1593 			cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1594 			mutex_enter(&mi->mi_lock);
1595 		}
1596 		mutex_exit(&mi->mi_lock);
1597 	}
1598 	mutex_exit(&mi->mi_async_lock);
1599 }
1600 
1601 /*
1602  * nfs_async_stop_sig:
1603  * Wait for all outstanding putpage operations and the inactive thread to
1604  * complete. If a signal is delivered we will abort and return non-zero;
1605  * otherwise return 0. Since this routine is called from nfs4_unmount, we
1606  * need to make it interruptible.
1607  */
1608 int
1609 nfs4_async_stop_sig(struct vfs *vfsp)
1610 {
1611 	mntinfo4_t *mi = VFTOMI4(vfsp);
1612 	ushort_t omax;
1613 	bool_t intr = FALSE;
1614 
1615 	/*
1616 	 * Wait for all outstanding putpage operations to complete and for
1617 	 * worker threads to exit.
1618 	 */
1619 	mutex_enter(&mi->mi_async_lock);
1620 	omax = mi->mi_max_threads;
1621 	mi->mi_max_threads = 0;
1622 	cv_broadcast(&mi->mi_async_work_cv);
1623 	while (mi->mi_threads != 0) {
1624 		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock)) {
1625 			intr = TRUE;
1626 			goto interrupted;
1627 		}
1628 	}
1629 
1630 	/*
1631 	 * Wait for the inactive thread to finish doing what it's doing.  It
1632 	 * won't exit until the a last reference to the vfs_t goes away.
1633 	 */
1634 	if (mi->mi_inactive_thread != NULL) {
1635 		mutex_enter(&mi->mi_lock);
1636 		while (!(mi->mi_flags & MI4_INACTIVE_IDLE) ||
1637 		    (mi->mi_async_reqs[NFS4_INACTIVE] != NULL)) {
1638 			mutex_exit(&mi->mi_lock);
1639 			if (!cv_wait_sig(&mi->mi_async_cv,
1640 			    &mi->mi_async_lock)) {
1641 				intr = TRUE;
1642 				goto interrupted;
1643 			}
1644 			mutex_enter(&mi->mi_lock);
1645 		}
1646 		mutex_exit(&mi->mi_lock);
1647 	}
1648 interrupted:
1649 	if (intr)
1650 		mi->mi_max_threads = omax;
1651 	mutex_exit(&mi->mi_async_lock);
1652 
1653 	return (intr);
1654 }
1655 
1656 int
1657 nfs4_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1658     int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1659     u_offset_t, size_t, int, cred_t *))
1660 {
1661 	rnode4_t *rp;
1662 	mntinfo4_t *mi;
1663 	struct nfs4_async_reqs *args;
1664 
1665 	ASSERT(flags & B_ASYNC);
1666 	ASSERT(vp->v_vfsp != NULL);
1667 
1668 	rp = VTOR4(vp);
1669 	ASSERT(rp->r_count > 0);
1670 
1671 	mi = VTOMI4(vp);
1672 
1673 	/*
1674 	 * If we can't allocate a request structure, do the putpage
1675 	 * operation synchronously in this thread's context.
1676 	 */
1677 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1678 		goto noasync;
1679 
1680 	args->a_next = NULL;
1681 #ifdef DEBUG
1682 	args->a_queuer = curthread;
1683 #endif
1684 	VN_HOLD(vp);
1685 	args->a_vp = vp;
1686 	ASSERT(cr != NULL);
1687 	crhold(cr);
1688 	args->a_cred = cr;
1689 	args->a_io = NFS4_PUTAPAGE;
1690 	args->a_nfs4_putapage = putapage;
1691 	args->a_nfs4_pp = pp;
1692 	args->a_nfs4_off = off;
1693 	args->a_nfs4_len = (uint_t)len;
1694 	args->a_nfs4_flags = flags;
1695 
1696 	mutex_enter(&mi->mi_async_lock);
1697 
1698 	/*
1699 	 * If asyncio has been disabled, then make a synchronous request.
1700 	 * This check is done a second time in case async io was diabled
1701 	 * while this thread was blocked waiting for memory pressure to
1702 	 * reduce or for the queue to drain.
1703 	 */
1704 	if (mi->mi_max_threads == 0) {
1705 		mutex_exit(&mi->mi_async_lock);
1706 
1707 		VN_RELE(vp);
1708 		crfree(cr);
1709 		kmem_free(args, sizeof (*args));
1710 		goto noasync;
1711 	}
1712 
1713 	/*
1714 	 * Link request structure into the async list and
1715 	 * wakeup async thread to do the i/o.
1716 	 */
1717 	if (mi->mi_async_reqs[NFS4_PUTAPAGE] == NULL) {
1718 		mi->mi_async_reqs[NFS4_PUTAPAGE] = args;
1719 		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1720 	} else {
1721 		mi->mi_async_tail[NFS4_PUTAPAGE]->a_next = args;
1722 		mi->mi_async_tail[NFS4_PUTAPAGE] = args;
1723 	}
1724 
1725 	mutex_enter(&rp->r_statelock);
1726 	rp->r_count++;
1727 	rp->r_awcount++;
1728 	mutex_exit(&rp->r_statelock);
1729 
1730 	if (mi->mi_io_kstats) {
1731 		mutex_enter(&mi->mi_lock);
1732 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1733 		mutex_exit(&mi->mi_lock);
1734 	}
1735 
1736 	mi->mi_async_req_count++;
1737 	ASSERT(mi->mi_async_req_count != 0);
1738 	cv_signal(&mi->mi_async_reqs_cv);
1739 	mutex_exit(&mi->mi_async_lock);
1740 	return (0);
1741 
1742 noasync:
1743 
1744 	if (curproc == proc_pageout || curproc == proc_fsflush ||
1745 	    nfs_zone() == mi->mi_zone) {
1746 		/*
1747 		 * If we get here in the context of the pageout/fsflush,
1748 		 * or we have run out of memory or we're attempting to
1749 		 * unmount we refuse to do a sync write, because this may
1750 		 * hang pageout/fsflush and the machine. In this case,
1751 		 * we just re-mark the page as dirty and punt on the page.
1752 		 *
1753 		 * Make sure B_FORCE isn't set.  We can re-mark the
1754 		 * pages as dirty and unlock the pages in one swoop by
1755 		 * passing in B_ERROR to pvn_write_done().  However,
1756 		 * we should make sure B_FORCE isn't set - we don't
1757 		 * want the page tossed before it gets written out.
1758 		 */
1759 		if (flags & B_FORCE)
1760 			flags &= ~(B_INVAL | B_FORCE);
1761 		pvn_write_done(pp, flags | B_ERROR);
1762 		return (0);
1763 	}
1764 
1765 	/*
1766 	 * We'll get here only if (nfs_zone() != mi->mi_zone)
1767 	 * which means that this was a cross-zone sync putpage.
1768 	 *
1769 	 * We pass in B_ERROR to pvn_write_done() to re-mark the pages
1770 	 * as dirty and unlock them.
1771 	 *
1772 	 * We don't want to clear B_FORCE here as the caller presumably
1773 	 * knows what they're doing if they set it.
1774 	 */
1775 	pvn_write_done(pp, flags | B_ERROR);
1776 	return (EPERM);
1777 }
1778 
1779 int
1780 nfs4_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1781     int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1782     size_t, int, cred_t *))
1783 {
1784 	rnode4_t *rp;
1785 	mntinfo4_t *mi;
1786 	struct nfs4_async_reqs *args;
1787 
1788 	ASSERT(flags & B_ASYNC);
1789 	ASSERT(vp->v_vfsp != NULL);
1790 
1791 	rp = VTOR4(vp);
1792 	ASSERT(rp->r_count > 0);
1793 
1794 	mi = VTOMI4(vp);
1795 
1796 	/*
1797 	 * If we can't allocate a request structure, do the pageio
1798 	 * request synchronously in this thread's context.
1799 	 */
1800 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1801 		goto noasync;
1802 
1803 	args->a_next = NULL;
1804 #ifdef DEBUG
1805 	args->a_queuer = curthread;
1806 #endif
1807 	VN_HOLD(vp);
1808 	args->a_vp = vp;
1809 	ASSERT(cr != NULL);
1810 	crhold(cr);
1811 	args->a_cred = cr;
1812 	args->a_io = NFS4_PAGEIO;
1813 	args->a_nfs4_pageio = pageio;
1814 	args->a_nfs4_pp = pp;
1815 	args->a_nfs4_off = io_off;
1816 	args->a_nfs4_len = (uint_t)io_len;
1817 	args->a_nfs4_flags = flags;
1818 
1819 	mutex_enter(&mi->mi_async_lock);
1820 
1821 	/*
1822 	 * If asyncio has been disabled, then make a synchronous request.
1823 	 * This check is done a second time in case async io was diabled
1824 	 * while this thread was blocked waiting for memory pressure to
1825 	 * reduce or for the queue to drain.
1826 	 */
1827 	if (mi->mi_max_threads == 0) {
1828 		mutex_exit(&mi->mi_async_lock);
1829 
1830 		VN_RELE(vp);
1831 		crfree(cr);
1832 		kmem_free(args, sizeof (*args));
1833 		goto noasync;
1834 	}
1835 
1836 	/*
1837 	 * Link request structure into the async list and
1838 	 * wakeup async thread to do the i/o.
1839 	 */
1840 	if (mi->mi_async_reqs[NFS4_PAGEIO] == NULL) {
1841 		mi->mi_async_reqs[NFS4_PAGEIO] = args;
1842 		mi->mi_async_tail[NFS4_PAGEIO] = args;
1843 	} else {
1844 		mi->mi_async_tail[NFS4_PAGEIO]->a_next = args;
1845 		mi->mi_async_tail[NFS4_PAGEIO] = args;
1846 	}
1847 
1848 	mutex_enter(&rp->r_statelock);
1849 	rp->r_count++;
1850 	rp->r_awcount++;
1851 	mutex_exit(&rp->r_statelock);
1852 
1853 	if (mi->mi_io_kstats) {
1854 		mutex_enter(&mi->mi_lock);
1855 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1856 		mutex_exit(&mi->mi_lock);
1857 	}
1858 
1859 	mi->mi_async_req_count++;
1860 	ASSERT(mi->mi_async_req_count != 0);
1861 	cv_signal(&mi->mi_async_reqs_cv);
1862 	mutex_exit(&mi->mi_async_lock);
1863 	return (0);
1864 
1865 noasync:
1866 	/*
1867 	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
1868 	 * the page list), for writes we do it synchronously, except for
1869 	 * proc_pageout/proc_fsflush as described below.
1870 	 */
1871 	if (flags & B_READ) {
1872 		pvn_read_done(pp, flags | B_ERROR);
1873 		return (0);
1874 	}
1875 
1876 	if (curproc == proc_pageout || curproc == proc_fsflush) {
1877 		/*
1878 		 * If we get here in the context of the pageout/fsflush,
1879 		 * we refuse to do a sync write, because this may hang
1880 		 * pageout/fsflush (and the machine). In this case, we just
1881 		 * re-mark the page as dirty and punt on the page.
1882 		 *
1883 		 * Make sure B_FORCE isn't set.  We can re-mark the
1884 		 * pages as dirty and unlock the pages in one swoop by
1885 		 * passing in B_ERROR to pvn_write_done().  However,
1886 		 * we should make sure B_FORCE isn't set - we don't
1887 		 * want the page tossed before it gets written out.
1888 		 */
1889 		if (flags & B_FORCE)
1890 			flags &= ~(B_INVAL | B_FORCE);
1891 		pvn_write_done(pp, flags | B_ERROR);
1892 		return (0);
1893 	}
1894 
1895 	if (nfs_zone() != mi->mi_zone) {
1896 		/*
1897 		 * So this was a cross-zone sync pageio.  We pass in B_ERROR
1898 		 * to pvn_write_done() to re-mark the pages as dirty and unlock
1899 		 * them.
1900 		 *
1901 		 * We don't want to clear B_FORCE here as the caller presumably
1902 		 * knows what they're doing if they set it.
1903 		 */
1904 		pvn_write_done(pp, flags | B_ERROR);
1905 		return (EPERM);
1906 	}
1907 	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1908 }
1909 
1910 void
1911 nfs4_async_readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr,
1912     int (*readdir)(vnode_t *, rddir4_cache *, cred_t *))
1913 {
1914 	rnode4_t *rp;
1915 	mntinfo4_t *mi;
1916 	struct nfs4_async_reqs *args;
1917 
1918 	rp = VTOR4(vp);
1919 	ASSERT(rp->r_freef == NULL);
1920 
1921 	mi = VTOMI4(vp);
1922 
1923 	/*
1924 	 * If we can't allocate a request structure, skip the readdir.
1925 	 */
1926 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1927 		goto noasync;
1928 
1929 	args->a_next = NULL;
1930 #ifdef DEBUG
1931 	args->a_queuer = curthread;
1932 #endif
1933 	VN_HOLD(vp);
1934 	args->a_vp = vp;
1935 	ASSERT(cr != NULL);
1936 	crhold(cr);
1937 	args->a_cred = cr;
1938 	args->a_io = NFS4_READDIR;
1939 	args->a_nfs4_readdir = readdir;
1940 	args->a_nfs4_rdc = rdc;
1941 
1942 	mutex_enter(&mi->mi_async_lock);
1943 
1944 	/*
1945 	 * If asyncio has been disabled, then skip this request
1946 	 */
1947 	if (mi->mi_max_threads == 0) {
1948 		mutex_exit(&mi->mi_async_lock);
1949 
1950 		VN_RELE(vp);
1951 		crfree(cr);
1952 		kmem_free(args, sizeof (*args));
1953 		goto noasync;
1954 	}
1955 
1956 	/*
1957 	 * Link request structure into the async list and
1958 	 * wakeup async thread to do the i/o.
1959 	 */
1960 	if (mi->mi_async_reqs[NFS4_READDIR] == NULL) {
1961 		mi->mi_async_reqs[NFS4_READDIR] = args;
1962 		mi->mi_async_tail[NFS4_READDIR] = args;
1963 	} else {
1964 		mi->mi_async_tail[NFS4_READDIR]->a_next = args;
1965 		mi->mi_async_tail[NFS4_READDIR] = args;
1966 	}
1967 
1968 	mutex_enter(&rp->r_statelock);
1969 	rp->r_count++;
1970 	mutex_exit(&rp->r_statelock);
1971 
1972 	if (mi->mi_io_kstats) {
1973 		mutex_enter(&mi->mi_lock);
1974 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1975 		mutex_exit(&mi->mi_lock);
1976 	}
1977 
1978 	mi->mi_async_req_count++;
1979 	ASSERT(mi->mi_async_req_count != 0);
1980 	cv_signal(&mi->mi_async_reqs_cv);
1981 	mutex_exit(&mi->mi_async_lock);
1982 	return;
1983 
1984 noasync:
1985 	mutex_enter(&rp->r_statelock);
1986 	rdc->entries = NULL;
1987 	/*
1988 	 * Indicate that no one is trying to fill this entry and
1989 	 * it still needs to be filled.
1990 	 */
1991 	rdc->flags &= ~RDDIR;
1992 	rdc->flags |= RDDIRREQ;
1993 	rddir4_cache_rele(rp, rdc);
1994 	mutex_exit(&rp->r_statelock);
1995 }
1996 
1997 void
1998 nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
1999     cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
2000     cred_t *))
2001 {
2002 	rnode4_t *rp;
2003 	mntinfo4_t *mi;
2004 	struct nfs4_async_reqs *args;
2005 	page_t *pp;
2006 
2007 	rp = VTOR4(vp);
2008 	mi = VTOMI4(vp);
2009 
2010 	/*
2011 	 * If we can't allocate a request structure, do the commit
2012 	 * operation synchronously in this thread's context.
2013 	 */
2014 	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
2015 		goto noasync;
2016 
2017 	args->a_next = NULL;
2018 #ifdef DEBUG
2019 	args->a_queuer = curthread;
2020 #endif
2021 	VN_HOLD(vp);
2022 	args->a_vp = vp;
2023 	ASSERT(cr != NULL);
2024 	crhold(cr);
2025 	args->a_cred = cr;
2026 	args->a_io = NFS4_COMMIT;
2027 	args->a_nfs4_commit = commit;
2028 	args->a_nfs4_plist = plist;
2029 	args->a_nfs4_offset = offset;
2030 	args->a_nfs4_count = count;
2031 
2032 	mutex_enter(&mi->mi_async_lock);
2033 
2034 	/*
2035 	 * If asyncio has been disabled, then make a synchronous request.
2036 	 * This check is done a second time in case async io was diabled
2037 	 * while this thread was blocked waiting for memory pressure to
2038 	 * reduce or for the queue to drain.
2039 	 */
2040 	if (mi->mi_max_threads == 0) {
2041 		mutex_exit(&mi->mi_async_lock);
2042 
2043 		VN_RELE(vp);
2044 		crfree(cr);
2045 		kmem_free(args, sizeof (*args));
2046 		goto noasync;
2047 	}
2048 
2049 	/*
2050 	 * Link request structure into the async list and
2051 	 * wakeup async thread to do the i/o.
2052 	 */
2053 	if (mi->mi_async_reqs[NFS4_COMMIT] == NULL) {
2054 		mi->mi_async_reqs[NFS4_COMMIT] = args;
2055 		mi->mi_async_tail[NFS4_COMMIT] = args;
2056 	} else {
2057 		mi->mi_async_tail[NFS4_COMMIT]->a_next = args;
2058 		mi->mi_async_tail[NFS4_COMMIT] = args;
2059 	}
2060 
2061 	mutex_enter(&rp->r_statelock);
2062 	rp->r_count++;
2063 	mutex_exit(&rp->r_statelock);
2064 
2065 	if (mi->mi_io_kstats) {
2066 		mutex_enter(&mi->mi_lock);
2067 		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
2068 		mutex_exit(&mi->mi_lock);
2069 	}
2070 
2071 	mi->mi_async_req_count++;
2072 	ASSERT(mi->mi_async_req_count != 0);
2073 	cv_signal(&mi->mi_async_reqs_cv);
2074 	mutex_exit(&mi->mi_async_lock);
2075 	return;
2076 
2077 noasync:
2078 	if (curproc == proc_pageout || curproc == proc_fsflush ||
2079 	    nfs_zone() != mi->mi_zone) {
2080 		while (plist != NULL) {
2081 			pp = plist;
2082 			page_sub(&plist, pp);
2083 			pp->p_fsdata = C_COMMIT;
2084 			page_unlock(pp);
2085 		}
2086 		return;
2087 	}
2088 	(*commit)(vp, plist, offset, count, cr);
2089 }
2090 
2091 /*
2092  * nfs4_async_inactive - hand off a VOP_INACTIVE call to a thread.  The
2093  * reference to the vnode is handed over to the thread; the caller should
2094  * no longer refer to the vnode.
2095  *
2096  * Unlike most of the async routines, this handoff is needed for
2097  * correctness reasons, not just performance.  So doing operations in the
2098  * context of the current thread is not an option.
2099  */
2100 void
2101 nfs4_async_inactive(vnode_t *vp, cred_t *cr)
2102 {
2103 	mntinfo4_t *mi;
2104 	struct nfs4_async_reqs *args;
2105 	boolean_t signal_inactive_thread = B_FALSE;
2106 
2107 	mi = VTOMI4(vp);
2108 
2109 	args = kmem_alloc(sizeof (*args), KM_SLEEP);
2110 	args->a_next = NULL;
2111 #ifdef DEBUG
2112 	args->a_queuer = curthread;
2113 #endif
2114 	args->a_vp = vp;
2115 	ASSERT(cr != NULL);
2116 	crhold(cr);
2117 	args->a_cred = cr;
2118 	args->a_io = NFS4_INACTIVE;
2119 
2120 	/*
2121 	 * Note that we don't check mi->mi_max_threads here, since we
2122 	 * *need* to get rid of this vnode regardless of whether someone
2123 	 * set nfs4_max_threads to zero in /etc/system.
2124 	 *
2125 	 * The manager thread knows about this and is willing to create
2126 	 * at least one thread to accommodate us.
2127 	 */
2128 	mutex_enter(&mi->mi_async_lock);
2129 	if (mi->mi_inactive_thread == NULL) {
2130 		rnode4_t *rp;
2131 		vnode_t *unldvp = NULL;
2132 		char *unlname;
2133 		cred_t *unlcred;
2134 
2135 		mutex_exit(&mi->mi_async_lock);
2136 		/*
2137 		 * We just need to free up the memory associated with the
2138 		 * vnode, which can be safely done from within the current
2139 		 * context.
2140 		 */
2141 		crfree(cr);	/* drop our reference */
2142 		kmem_free(args, sizeof (*args));
2143 		rp = VTOR4(vp);
2144 		mutex_enter(&rp->r_statelock);
2145 		if (rp->r_unldvp != NULL) {
2146 			unldvp = rp->r_unldvp;
2147 			rp->r_unldvp = NULL;
2148 			unlname = rp->r_unlname;
2149 			rp->r_unlname = NULL;
2150 			unlcred = rp->r_unlcred;
2151 			rp->r_unlcred = NULL;
2152 		}
2153 		mutex_exit(&rp->r_statelock);
2154 		/*
2155 		 * No need to explicitly throw away any cached pages.  The
2156 		 * eventual r4inactive() will attempt a synchronous
2157 		 * VOP_PUTPAGE() which will immediately fail since the request
2158 		 * is coming from the wrong zone, and then will proceed to call
2159 		 * nfs4_invalidate_pages() which will clean things up for us.
2160 		 *
2161 		 * Throw away the delegation here so rp4_addfree()'s attempt to
2162 		 * return any existing delegations becomes a no-op.
2163 		 */
2164 		if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
2165 			(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
2166 			    FALSE);
2167 			(void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
2168 			nfs_rw_exit(&mi->mi_recovlock);
2169 		}
2170 		nfs4_clear_open_streams(rp);
2171 
2172 		rp4_addfree(rp, cr);
2173 		if (unldvp != NULL) {
2174 			kmem_free(unlname, MAXNAMELEN);
2175 			VN_RELE(unldvp);
2176 			crfree(unlcred);
2177 		}
2178 		return;
2179 	}
2180 
2181 	if (mi->mi_manager_thread == NULL) {
2182 		/*
2183 		 * We want to talk to the inactive thread.
2184 		 */
2185 		signal_inactive_thread = B_TRUE;
2186 	}
2187 
2188 	/*
2189 	 * Enqueue the vnode and wake up either the special thread (empty
2190 	 * list) or an async thread.
2191 	 */
2192 	if (mi->mi_async_reqs[NFS4_INACTIVE] == NULL) {
2193 		mi->mi_async_reqs[NFS4_INACTIVE] = args;
2194 		mi->mi_async_tail[NFS4_INACTIVE] = args;
2195 		signal_inactive_thread = B_TRUE;
2196 	} else {
2197 		mi->mi_async_tail[NFS4_INACTIVE]->a_next = args;
2198 		mi->mi_async_tail[NFS4_INACTIVE] = args;
2199 	}
2200 	if (signal_inactive_thread) {
2201 		cv_signal(&mi->mi_inact_req_cv);
2202 	} else  {
2203 		mi->mi_async_req_count++;
2204 		ASSERT(mi->mi_async_req_count != 0);
2205 		cv_signal(&mi->mi_async_reqs_cv);
2206 	}
2207 
2208 	mutex_exit(&mi->mi_async_lock);
2209 }
2210 
2211 int
2212 writerp4(rnode4_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2213 {
2214 	int pagecreate;
2215 	int n;
2216 	int saved_n;
2217 	caddr_t saved_base;
2218 	u_offset_t offset;
2219 	int error;
2220 	int sm_error;
2221 	vnode_t *vp = RTOV(rp);
2222 
2223 	ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2224 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2225 	if (!vpm_enable) {
2226 		ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2227 	}
2228 
2229 	/*
2230 	 * Move bytes in at most PAGESIZE chunks. We must avoid
2231 	 * spanning pages in uiomove() because page faults may cause
2232 	 * the cache to be invalidated out from under us. The r_size is not
2233 	 * updated until after the uiomove. If we push the last page of a
2234 	 * file before r_size is correct, we will lose the data written past
2235 	 * the current (and invalid) r_size.
2236 	 */
2237 	do {
2238 		offset = uio->uio_loffset;
2239 		pagecreate = 0;
2240 
2241 		/*
2242 		 * n is the number of bytes required to satisfy the request
2243 		 *   or the number of bytes to fill out the page.
2244 		 */
2245 		n = (int)MIN((PAGESIZE - (offset & PAGEOFFSET)), tcount);
2246 
2247 		/*
2248 		 * Check to see if we can skip reading in the page
2249 		 * and just allocate the memory.  We can do this
2250 		 * if we are going to rewrite the entire mapping
2251 		 * or if we are going to write to or beyond the current
2252 		 * end of file from the beginning of the mapping.
2253 		 *
2254 		 * The read of r_size is now protected by r_statelock.
2255 		 */
2256 		mutex_enter(&rp->r_statelock);
2257 		/*
2258 		 * When pgcreated is nonzero the caller has already done
2259 		 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2260 		 * segkpm this means we already have at least one page
2261 		 * created and mapped at base.
2262 		 */
2263 		pagecreate = pgcreated ||
2264 		    ((offset & PAGEOFFSET) == 0 &&
2265 		    (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2266 
2267 		mutex_exit(&rp->r_statelock);
2268 
2269 		if (!vpm_enable && pagecreate) {
2270 			/*
2271 			 * The last argument tells segmap_pagecreate() to
2272 			 * always lock the page, as opposed to sometimes
2273 			 * returning with the page locked. This way we avoid a
2274 			 * fault on the ensuing uiomove(), but also
2275 			 * more importantly (to fix bug 1094402) we can
2276 			 * call segmap_fault() to unlock the page in all
2277 			 * cases. An alternative would be to modify
2278 			 * segmap_pagecreate() to tell us when it is
2279 			 * locking a page, but that's a fairly major
2280 			 * interface change.
2281 			 */
2282 			if (pgcreated == 0)
2283 				(void) segmap_pagecreate(segkmap, base,
2284 				    (uint_t)n, 1);
2285 			saved_base = base;
2286 			saved_n = n;
2287 		}
2288 
2289 		/*
2290 		 * The number of bytes of data in the last page can not
2291 		 * be accurately be determined while page is being
2292 		 * uiomove'd to and the size of the file being updated.
2293 		 * Thus, inform threads which need to know accurately
2294 		 * how much data is in the last page of the file.  They
2295 		 * will not do the i/o immediately, but will arrange for
2296 		 * the i/o to happen later when this modify operation
2297 		 * will have finished.
2298 		 */
2299 		ASSERT(!(rp->r_flags & R4MODINPROGRESS));
2300 		mutex_enter(&rp->r_statelock);
2301 		rp->r_flags |= R4MODINPROGRESS;
2302 		rp->r_modaddr = (offset & MAXBMASK);
2303 		mutex_exit(&rp->r_statelock);
2304 
2305 		if (vpm_enable) {
2306 			/*
2307 			 * Copy data. If new pages are created, part of
2308 			 * the page that is not written will be initizliazed
2309 			 * with zeros.
2310 			 */
2311 			error = vpm_data_copy(vp, offset, n, uio,
2312 			    !pagecreate, NULL, 0, S_WRITE);
2313 		} else {
2314 			error = uiomove(base, n, UIO_WRITE, uio);
2315 		}
2316 
2317 		/*
2318 		 * r_size is the maximum number of
2319 		 * bytes known to be in the file.
2320 		 * Make sure it is at least as high as the
2321 		 * first unwritten byte pointed to by uio_loffset.
2322 		 */
2323 		mutex_enter(&rp->r_statelock);
2324 		if (rp->r_size < uio->uio_loffset)
2325 			rp->r_size = uio->uio_loffset;
2326 		rp->r_flags &= ~R4MODINPROGRESS;
2327 		rp->r_flags |= R4DIRTY;
2328 		mutex_exit(&rp->r_statelock);
2329 
2330 		/* n = # of bytes written */
2331 		n = (int)(uio->uio_loffset - offset);
2332 
2333 		if (!vpm_enable) {
2334 			base += n;
2335 		}
2336 
2337 		tcount -= n;
2338 		/*
2339 		 * If we created pages w/o initializing them completely,
2340 		 * we need to zero the part that wasn't set up.
2341 		 * This happens on a most EOF write cases and if
2342 		 * we had some sort of error during the uiomove.
2343 		 */
2344 		if (!vpm_enable && pagecreate) {
2345 			if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2346 				(void) kzero(base, PAGESIZE - n);
2347 
2348 			if (pgcreated) {
2349 				/*
2350 				 * Caller is responsible for this page,
2351 				 * it was not created in this loop.
2352 				 */
2353 				pgcreated = 0;
2354 			} else {
2355 				/*
2356 				 * For bug 1094402: segmap_pagecreate locks
2357 				 * page. Unlock it. This also unlocks the
2358 				 * pages allocated by page_create_va() in
2359 				 * segmap_pagecreate().
2360 				 */
2361 				sm_error = segmap_fault(kas.a_hat, segkmap,
2362 				    saved_base, saved_n,
2363 				    F_SOFTUNLOCK, S_WRITE);
2364 				if (error == 0)
2365 					error = sm_error;
2366 			}
2367 		}
2368 	} while (tcount > 0 && error == 0);
2369 
2370 	return (error);
2371 }
2372 
2373 int
2374 nfs4_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2375 {
2376 	rnode4_t *rp;
2377 	page_t *pp;
2378 	u_offset_t eoff;
2379 	u_offset_t io_off;
2380 	size_t io_len;
2381 	int error;
2382 	int rdirty;
2383 	int err;
2384 
2385 	rp = VTOR4(vp);
2386 	ASSERT(rp->r_count > 0);
2387 
2388 	if (!nfs4_has_pages(vp))
2389 		return (0);
2390 
2391 	ASSERT(vp->v_type != VCHR);
2392 
2393 	/*
2394 	 * If R4OUTOFSPACE is set, then all writes turn into B_INVAL
2395 	 * writes.  B_FORCE is set to force the VM system to actually
2396 	 * invalidate the pages, even if the i/o failed.  The pages
2397 	 * need to get invalidated because they can't be written out
2398 	 * because there isn't any space left on either the server's
2399 	 * file system or in the user's disk quota.  The B_FREE bit
2400 	 * is cleared to avoid confusion as to whether this is a
2401 	 * request to place the page on the freelist or to destroy
2402 	 * it.
2403 	 */
2404 	if ((rp->r_flags & R4OUTOFSPACE) ||
2405 	    (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2406 		flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2407 
2408 	if (len == 0) {
2409 		/*
2410 		 * If doing a full file synchronous operation, then clear
2411 		 * the R4DIRTY bit.  If a page gets dirtied while the flush
2412 		 * is happening, then R4DIRTY will get set again.  The
2413 		 * R4DIRTY bit must get cleared before the flush so that
2414 		 * we don't lose this information.
2415 		 *
2416 		 * If there are no full file async write operations
2417 		 * pending and RDIRTY bit is set, clear it.
2418 		 */
2419 		if (off == (u_offset_t)0 &&
2420 		    !(flags & B_ASYNC) &&
2421 		    (rp->r_flags & R4DIRTY)) {
2422 			mutex_enter(&rp->r_statelock);
2423 			rdirty = (rp->r_flags & R4DIRTY);
2424 			rp->r_flags &= ~R4DIRTY;
2425 			mutex_exit(&rp->r_statelock);
2426 		} else if (flags & B_ASYNC && off == (u_offset_t)0) {
2427 			mutex_enter(&rp->r_statelock);
2428 			if (rp->r_flags & R4DIRTY && rp->r_awcount == 0) {
2429 				rdirty = (rp->r_flags & R4DIRTY);
2430 				rp->r_flags &= ~R4DIRTY;
2431 			}
2432 			mutex_exit(&rp->r_statelock);
2433 		} else
2434 			rdirty = 0;
2435 
2436 		/*
2437 		 * Search the entire vp list for pages >= off, and flush
2438 		 * the dirty pages.
2439 		 */
2440 		error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2441 		    flags, cr);
2442 
2443 		/*
2444 		 * If an error occurred and the file was marked as dirty
2445 		 * before and we aren't forcibly invalidating pages, then
2446 		 * reset the R4DIRTY flag.
2447 		 */
2448 		if (error && rdirty &&
2449 		    (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2450 			mutex_enter(&rp->r_statelock);
2451 			rp->r_flags |= R4DIRTY;
2452 			mutex_exit(&rp->r_statelock);
2453 		}
2454 	} else {
2455 		/*
2456 		 * Do a range from [off...off + len) looking for pages
2457 		 * to deal with.
2458 		 */
2459 		error = 0;
2460 		io_len = 0;
2461 		eoff = off + len;
2462 		mutex_enter(&rp->r_statelock);
2463 		for (io_off = off; io_off < eoff && io_off < rp->r_size;
2464 		    io_off += io_len) {
2465 			mutex_exit(&rp->r_statelock);
2466 			/*
2467 			 * If we are not invalidating, synchronously
2468 			 * freeing or writing pages use the routine
2469 			 * page_lookup_nowait() to prevent reclaiming
2470 			 * them from the free list.
2471 			 */
2472 			if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2473 				pp = page_lookup(vp, io_off,
2474 				    (flags & (B_INVAL | B_FREE)) ?
2475 				    SE_EXCL : SE_SHARED);
2476 			} else {
2477 				pp = page_lookup_nowait(vp, io_off,
2478 				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2479 			}
2480 
2481 			if (pp == NULL || !pvn_getdirty(pp, flags))
2482 				io_len = PAGESIZE;
2483 			else {
2484 				err = (*rp->r_putapage)(vp, pp, &io_off,
2485 				    &io_len, flags, cr);
2486 				if (!error)
2487 					error = err;
2488 				/*
2489 				 * "io_off" and "io_len" are returned as
2490 				 * the range of pages we actually wrote.
2491 				 * This allows us to skip ahead more quickly
2492 				 * since several pages may've been dealt
2493 				 * with by this iteration of the loop.
2494 				 */
2495 			}
2496 			mutex_enter(&rp->r_statelock);
2497 		}
2498 		mutex_exit(&rp->r_statelock);
2499 	}
2500 
2501 	return (error);
2502 }
2503 
2504 void
2505 nfs4_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2506 {
2507 	rnode4_t *rp;
2508 
2509 	rp = VTOR4(vp);
2510 	if (IS_SHADOW(vp, rp))
2511 		vp = RTOV4(rp);
2512 	mutex_enter(&rp->r_statelock);
2513 	while (rp->r_flags & R4TRUNCATE)
2514 		cv_wait(&rp->r_cv, &rp->r_statelock);
2515 	rp->r_flags |= R4TRUNCATE;
2516 	if (off == (u_offset_t)0) {
2517 		rp->r_flags &= ~R4DIRTY;
2518 		if (!(rp->r_flags & R4STALE))
2519 			rp->r_error = 0;
2520 	}
2521 	rp->r_truncaddr = off;
2522 	mutex_exit(&rp->r_statelock);
2523 	(void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2524 	    B_INVAL | B_TRUNC, cr);
2525 	mutex_enter(&rp->r_statelock);
2526 	rp->r_flags &= ~R4TRUNCATE;
2527 	cv_broadcast(&rp->r_cv);
2528 	mutex_exit(&rp->r_statelock);
2529 }
2530 
2531 static int
2532 nfs4_mnt_kstat_update(kstat_t *ksp, int rw)
2533 {
2534 	mntinfo4_t *mi;
2535 	struct mntinfo_kstat *mik;
2536 	vfs_t *vfsp;
2537 
2538 	/* this is a read-only kstat. Bail out on a write */
2539 	if (rw == KSTAT_WRITE)
2540 		return (EACCES);
2541 
2542 
2543 	/*
2544 	 * We don't want to wait here as kstat_chain_lock could be held by
2545 	 * dounmount(). dounmount() takes vfs_reflock before the chain lock
2546 	 * and thus could lead to a deadlock.
2547 	 */
2548 	vfsp = (struct vfs *)ksp->ks_private;
2549 
2550 	mi = VFTOMI4(vfsp);
2551 	mik = (struct mntinfo_kstat *)ksp->ks_data;
2552 
2553 	(void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
2554 
2555 	mik->mik_vers = (uint32_t)mi->mi_vers;
2556 	mik->mik_flags = mi->mi_flags;
2557 	/*
2558 	 * The sv_secdata holds the flavor the client specifies.
2559 	 * If the client uses default and a security negotiation
2560 	 * occurs, sv_currsec will point to the current flavor
2561 	 * selected from the server flavor list.
2562 	 * sv_currsec is NULL if no security negotiation takes place.
2563 	 */
2564 	mik->mik_secmod = mi->mi_curr_serv->sv_currsec ?
2565 	    mi->mi_curr_serv->sv_currsec->secmod :
2566 	    mi->mi_curr_serv->sv_secdata->secmod;
2567 	mik->mik_curread = (uint32_t)mi->mi_curread;
2568 	mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
2569 	mik->mik_retrans = mi->mi_retrans;
2570 	mik->mik_timeo = mi->mi_timeo;
2571 	mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
2572 	mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
2573 	mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
2574 	mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
2575 	mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
2576 	mik->mik_failover = (uint32_t)mi->mi_failover;
2577 	mik->mik_remap = (uint32_t)mi->mi_remap;
2578 
2579 	(void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
2580 
2581 	return (0);
2582 }
2583 
2584 void
2585 nfs4_mnt_kstat_init(struct vfs *vfsp)
2586 {
2587 	mntinfo4_t *mi = VFTOMI4(vfsp);
2588 
2589 	/*
2590 	 * PSARC 2001/697 Contract Private Interface
2591 	 * All nfs kstats are under SunMC contract
2592 	 * Please refer to the PSARC listed above and contact
2593 	 * SunMC before making any changes!
2594 	 *
2595 	 * Changes must be reviewed by Solaris File Sharing
2596 	 * Changes must be communicated to contract-2001-697@sun.com
2597 	 *
2598 	 */
2599 
2600 	mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
2601 	    NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
2602 	if (mi->mi_io_kstats) {
2603 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2604 			kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
2605 		mi->mi_io_kstats->ks_lock = &mi->mi_lock;
2606 		kstat_install(mi->mi_io_kstats);
2607 	}
2608 
2609 	if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
2610 	    getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
2611 	    sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
2612 		if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
2613 			kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
2614 		mi->mi_ro_kstats->ks_update = nfs4_mnt_kstat_update;
2615 		mi->mi_ro_kstats->ks_private = (void *)vfsp;
2616 		kstat_install(mi->mi_ro_kstats);
2617 	}
2618 
2619 	nfs4_mnt_recov_kstat_init(vfsp);
2620 }
2621 
2622 void
2623 nfs4_write_error(vnode_t *vp, int error, cred_t *cr)
2624 {
2625 	mntinfo4_t *mi;
2626 	clock_t now = ddi_get_lbolt();
2627 
2628 	mi = VTOMI4(vp);
2629 	/*
2630 	 * In case of forced unmount, do not print any messages
2631 	 * since it can flood the console with error messages.
2632 	 */
2633 	if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED)
2634 		return;
2635 
2636 	/*
2637 	 * If the mount point is dead, not recoverable, do not
2638 	 * print error messages that can flood the console.
2639 	 */
2640 	if (mi->mi_flags & MI4_RECOV_FAIL)
2641 		return;
2642 
2643 	/*
2644 	 * No use in flooding the console with ENOSPC
2645 	 * messages from the same file system.
2646 	 */
2647 	if ((error != ENOSPC && error != EDQUOT) ||
2648 	    now - mi->mi_printftime > 0) {
2649 		zoneid_t zoneid = mi->mi_zone->zone_id;
2650 
2651 #ifdef DEBUG
2652 		nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2653 		    mi->mi_vers, VTOR4(vp)->r_server->sv_hostname, NULL);
2654 #else
2655 		nfs_perror(error, "NFS write error on host %s: %m.\n",
2656 		    VTOR4(vp)->r_server->sv_hostname, NULL);
2657 #endif
2658 		if (error == ENOSPC || error == EDQUOT) {
2659 			zcmn_err(zoneid, CE_CONT,
2660 			    "^File: userid=%d, groupid=%d\n",
2661 			    crgetuid(cr), crgetgid(cr));
2662 			if (crgetuid(curthread->t_cred) != crgetuid(cr) ||
2663 			    crgetgid(curthread->t_cred) != crgetgid(cr)) {
2664 				zcmn_err(zoneid, CE_CONT,
2665 				    "^User: userid=%d, groupid=%d\n",
2666 				    crgetuid(curthread->t_cred),
2667 				    crgetgid(curthread->t_cred));
2668 			}
2669 			mi->mi_printftime = now +
2670 			    nfs_write_error_interval * hz;
2671 		}
2672 		sfh4_printfhandle(VTOR4(vp)->r_fh);
2673 #ifdef DEBUG
2674 		if (error == EACCES) {
2675 			zcmn_err(zoneid, CE_CONT,
2676 			    "nfs_bio: cred is%s kcred\n",
2677 			    cr == kcred ? "" : " not");
2678 		}
2679 #endif
2680 	}
2681 }
2682 
2683 /*
2684  * Return non-zero if the given file can be safely memory mapped.  Locks
2685  * are safe if whole-file (length and offset are both zero).
2686  */
2687 
2688 #define	SAFE_LOCK(flk)	((flk).l_start == 0 && (flk).l_len == 0)
2689 
2690 static int
2691 nfs4_safemap(const vnode_t *vp)
2692 {
2693 	locklist_t	*llp, *next_llp;
2694 	int		safe = 1;
2695 	rnode4_t	*rp = VTOR4(vp);
2696 
2697 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2698 
2699 	NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: "
2700 	    "vp = %p", (void *)vp));
2701 
2702 	/*
2703 	 * Review all the locks for the vnode, both ones that have been
2704 	 * acquired and ones that are pending.  We assume that
2705 	 * flk_active_locks_for_vp() has merged any locks that can be
2706 	 * merged (so that if a process has the entire file locked, it is
2707 	 * represented as a single lock).
2708 	 *
2709 	 * Note that we can't bail out of the loop if we find a non-safe
2710 	 * lock, because we have to free all the elements in the llp list.
2711 	 * We might be able to speed up this code slightly by not looking
2712 	 * at each lock's l_start and l_len fields once we've found a
2713 	 * non-safe lock.
2714 	 */
2715 
2716 	llp = flk_active_locks_for_vp(vp);
2717 	while (llp) {
2718 		NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2719 		    "nfs4_safemap: active lock (%" PRId64 ", %" PRId64 ")",
2720 		    llp->ll_flock.l_start, llp->ll_flock.l_len));
2721 		if (!SAFE_LOCK(llp->ll_flock)) {
2722 			safe = 0;
2723 			NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE,
2724 			    "nfs4_safemap: unsafe active lock (%" PRId64
2725 			    ", %" PRId64 ")", llp->ll_flock.l_start,
2726 			    llp->ll_flock.l_len));
2727 		}
2728 		next_llp = llp->ll_next;
2729 		VN_RELE(llp->ll_vp);
2730 		kmem_free(llp, sizeof (*llp));
2731 		llp = next_llp;
2732 	}
2733 
2734 	NFS4_DEBUG(nfs4_client_map_debug, (CE_NOTE, "nfs4_safemap: %s",
2735 	    safe ? "safe" : "unsafe"));
2736 	return (safe);
2737 }
2738 
2739 /*
2740  * Return whether there is a lost LOCK or LOCKU queued up for the given
2741  * file that would make an mmap request unsafe.  cf. nfs4_safemap().
2742  */
2743 
2744 bool_t
2745 nfs4_map_lost_lock_conflict(vnode_t *vp)
2746 {
2747 	bool_t conflict = FALSE;
2748 	nfs4_lost_rqst_t *lrp;
2749 	mntinfo4_t *mi = VTOMI4(vp);
2750 
2751 	mutex_enter(&mi->mi_lock);
2752 	for (lrp = list_head(&mi->mi_lost_state); lrp != NULL;
2753 	    lrp = list_next(&mi->mi_lost_state, lrp)) {
2754 		if (lrp->lr_op != OP_LOCK && lrp->lr_op != OP_LOCKU)
2755 			continue;
2756 		ASSERT(lrp->lr_vp != NULL);
2757 		if (!VOP_CMP(lrp->lr_vp, vp, NULL))
2758 			continue;	/* different file */
2759 		if (!SAFE_LOCK(*lrp->lr_flk)) {
2760 			conflict = TRUE;
2761 			break;
2762 		}
2763 	}
2764 
2765 	mutex_exit(&mi->mi_lock);
2766 	return (conflict);
2767 }
2768 
2769 /*
2770  * nfs_lockcompletion:
2771  *
2772  * If the vnode has a lock that makes it unsafe to cache the file, mark it
2773  * as non cachable (set VNOCACHE bit).
2774  */
2775 
2776 void
2777 nfs4_lockcompletion(vnode_t *vp, int cmd)
2778 {
2779 	rnode4_t *rp = VTOR4(vp);
2780 
2781 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2782 	ASSERT(!IS_SHADOW(vp, rp));
2783 
2784 	if (cmd == F_SETLK || cmd == F_SETLKW) {
2785 
2786 		if (!nfs4_safemap(vp)) {
2787 			mutex_enter(&vp->v_lock);
2788 			vp->v_flag |= VNOCACHE;
2789 			mutex_exit(&vp->v_lock);
2790 		} else {
2791 			mutex_enter(&vp->v_lock);
2792 			vp->v_flag &= ~VNOCACHE;
2793 			mutex_exit(&vp->v_lock);
2794 		}
2795 	}
2796 	/*
2797 	 * The cached attributes of the file are stale after acquiring
2798 	 * the lock on the file. They were updated when the file was
2799 	 * opened, but not updated when the lock was acquired. Therefore the
2800 	 * cached attributes are invalidated after the lock is obtained.
2801 	 */
2802 	PURGE_ATTRCACHE4(vp);
2803 }
2804 
2805 /* ARGSUSED */
2806 static void *
2807 nfs4_mi_init(zoneid_t zoneid)
2808 {
2809 	struct mi4_globals *mig;
2810 
2811 	mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2812 	mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2813 	list_create(&mig->mig_list, sizeof (mntinfo4_t),
2814 	    offsetof(mntinfo4_t, mi_zone_node));
2815 	mig->mig_destructor_called = B_FALSE;
2816 	return (mig);
2817 }
2818 
2819 /*
2820  * Callback routine to tell all NFSv4 mounts in the zone to start tearing down
2821  * state and killing off threads.
2822  */
2823 /* ARGSUSED */
2824 static void
2825 nfs4_mi_shutdown(zoneid_t zoneid, void *data)
2826 {
2827 	struct mi4_globals *mig = data;
2828 	mntinfo4_t *mi;
2829 	nfs4_server_t *np;
2830 
2831 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2832 	    "nfs4_mi_shutdown zone %d\n", zoneid));
2833 	ASSERT(mig != NULL);
2834 	for (;;) {
2835 		mutex_enter(&mig->mig_lock);
2836 		mi = list_head(&mig->mig_list);
2837 		if (mi == NULL) {
2838 			mutex_exit(&mig->mig_lock);
2839 			break;
2840 		}
2841 
2842 		NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2843 		    "nfs4_mi_shutdown stopping vfs %p\n", (void *)mi->mi_vfsp));
2844 		/*
2845 		 * purge the DNLC for this filesystem
2846 		 */
2847 		(void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2848 		/*
2849 		 * Tell existing async worker threads to exit.
2850 		 */
2851 		mutex_enter(&mi->mi_async_lock);
2852 		mi->mi_max_threads = 0;
2853 		cv_broadcast(&mi->mi_async_work_cv);
2854 		/*
2855 		 * Set the appropriate flags, signal and wait for both the
2856 		 * async manager and the inactive thread to exit when they're
2857 		 * done with their current work.
2858 		 */
2859 		mutex_enter(&mi->mi_lock);
2860 		mi->mi_flags |= (MI4_ASYNC_MGR_STOP|MI4_DEAD);
2861 		mutex_exit(&mi->mi_lock);
2862 		mutex_exit(&mi->mi_async_lock);
2863 		if (mi->mi_manager_thread) {
2864 			nfs4_async_manager_stop(mi->mi_vfsp);
2865 		}
2866 		if (mi->mi_inactive_thread) {
2867 			mutex_enter(&mi->mi_async_lock);
2868 			cv_signal(&mi->mi_inact_req_cv);
2869 			/*
2870 			 * Wait for the inactive thread to exit.
2871 			 */
2872 			while (mi->mi_inactive_thread != NULL) {
2873 				cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2874 			}
2875 			mutex_exit(&mi->mi_async_lock);
2876 		}
2877 		/*
2878 		 * Wait for the recovery thread to complete, that is, it will
2879 		 * signal when it is done using the "mi" structure and about
2880 		 * to exit
2881 		 */
2882 		mutex_enter(&mi->mi_lock);
2883 		while (mi->mi_in_recovery > 0)
2884 			cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
2885 		mutex_exit(&mi->mi_lock);
2886 		/*
2887 		 * We're done when every mi has been done or the list is empty.
2888 		 * This one is done, remove it from the list.
2889 		 */
2890 		list_remove(&mig->mig_list, mi);
2891 		mutex_exit(&mig->mig_lock);
2892 		zone_rele(mi->mi_zone);
2893 		/*
2894 		 * Release hold on vfs and mi done to prevent race with zone
2895 		 * shutdown. This releases the hold in nfs4_mi_zonelist_add.
2896 		 */
2897 		VFS_RELE(mi->mi_vfsp);
2898 		MI4_RELE(mi);
2899 	}
2900 	/*
2901 	 * Tell each renew thread in the zone to exit
2902 	 */
2903 	mutex_enter(&nfs4_server_lst_lock);
2904 	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
2905 		mutex_enter(&np->s_lock);
2906 		if (np->zoneid == zoneid) {
2907 			/*
2908 			 * We add another hold onto the nfs4_server_t
2909 			 * because this will make sure tha the nfs4_server_t
2910 			 * stays around until nfs4_callback_fini_zone destroys
2911 			 * the zone. This way, the renew thread can
2912 			 * unconditionally release its holds on the
2913 			 * nfs4_server_t.
2914 			 */
2915 			np->s_refcnt++;
2916 			nfs4_mark_srv_dead(np);
2917 		}
2918 		mutex_exit(&np->s_lock);
2919 	}
2920 	mutex_exit(&nfs4_server_lst_lock);
2921 }
2922 
2923 static void
2924 nfs4_mi_free_globals(struct mi4_globals *mig)
2925 {
2926 	list_destroy(&mig->mig_list);	/* makes sure the list is empty */
2927 	mutex_destroy(&mig->mig_lock);
2928 	kmem_free(mig, sizeof (*mig));
2929 }
2930 
2931 /* ARGSUSED */
2932 static void
2933 nfs4_mi_destroy(zoneid_t zoneid, void *data)
2934 {
2935 	struct mi4_globals *mig = data;
2936 
2937 	NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2938 	    "nfs4_mi_destroy zone %d\n", zoneid));
2939 	ASSERT(mig != NULL);
2940 	mutex_enter(&mig->mig_lock);
2941 	if (list_head(&mig->mig_list) != NULL) {
2942 		/* Still waiting for VFS_FREEVFS() */
2943 		mig->mig_destructor_called = B_TRUE;
2944 		mutex_exit(&mig->mig_lock);
2945 		return;
2946 	}
2947 	nfs4_mi_free_globals(mig);
2948 }
2949 
2950 /*
2951  * Add an NFS mount to the per-zone list of NFS mounts.
2952  */
2953 void
2954 nfs4_mi_zonelist_add(mntinfo4_t *mi)
2955 {
2956 	struct mi4_globals *mig;
2957 
2958 	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
2959 	mutex_enter(&mig->mig_lock);
2960 	list_insert_head(&mig->mig_list, mi);
2961 	/*
2962 	 * hold added to eliminate race with zone shutdown -this will be
2963 	 * released in mi_shutdown
2964 	 */
2965 	MI4_HOLD(mi);
2966 	VFS_HOLD(mi->mi_vfsp);
2967 	mutex_exit(&mig->mig_lock);
2968 }
2969 
2970 /*
2971  * Remove an NFS mount from the per-zone list of NFS mounts.
2972  */
2973 int
2974 nfs4_mi_zonelist_remove(mntinfo4_t *mi)
2975 {
2976 	struct mi4_globals *mig;
2977 	int ret = 0;
2978 
2979 	mig = zone_getspecific(mi4_list_key, mi->mi_zone);
2980 	mutex_enter(&mig->mig_lock);
2981 	mutex_enter(&mi->mi_lock);
2982 	/* if this mi is marked dead, then the zone already released it */
2983 	if (!(mi->mi_flags & MI4_DEAD)) {
2984 		list_remove(&mig->mig_list, mi);
2985 		mutex_exit(&mi->mi_lock);
2986 
2987 		/* release the holds put on in zonelist_add(). */
2988 		VFS_RELE(mi->mi_vfsp);
2989 		MI4_RELE(mi);
2990 		ret = 1;
2991 	} else {
2992 		mutex_exit(&mi->mi_lock);
2993 	}
2994 
2995 	/*
2996 	 * We can be called asynchronously by VFS_FREEVFS() after the zone
2997 	 * shutdown/destroy callbacks have executed; if so, clean up the zone's
2998 	 * mi globals.
2999 	 */
3000 	if (list_head(&mig->mig_list) == NULL &&
3001 	    mig->mig_destructor_called == B_TRUE) {
3002 		nfs4_mi_free_globals(mig);
3003 		return (ret);
3004 	}
3005 	mutex_exit(&mig->mig_lock);
3006 	return (ret);
3007 }
3008 
3009 void
3010 nfs_free_mi4(mntinfo4_t *mi)
3011 {
3012 	nfs4_open_owner_t	*foop;
3013 	nfs4_oo_hash_bucket_t   *bucketp;
3014 	nfs4_debug_msg_t	*msgp;
3015 	int i;
3016 	servinfo4_t 		*svp;
3017 
3018 	/*
3019 	 * Code introduced here should be carefully evaluated to make
3020 	 * sure none of the freed resources are accessed either directly
3021 	 * or indirectly after freeing them. For eg: Introducing calls to
3022 	 * NFS4_DEBUG that use mntinfo4_t structure member after freeing
3023 	 * the structure members or other routines calling back into NFS
3024 	 * accessing freed mntinfo4_t structure member.
3025 	 */
3026 	mutex_enter(&mi->mi_lock);
3027 	ASSERT(mi->mi_recovthread == NULL);
3028 	ASSERT(mi->mi_flags & MI4_ASYNC_MGR_STOP);
3029 	mutex_exit(&mi->mi_lock);
3030 	mutex_enter(&mi->mi_async_lock);
3031 	ASSERT(mi->mi_threads == 0);
3032 	ASSERT(mi->mi_manager_thread == NULL);
3033 	mutex_exit(&mi->mi_async_lock);
3034 	if (mi->mi_io_kstats) {
3035 		kstat_delete(mi->mi_io_kstats);
3036 		mi->mi_io_kstats = NULL;
3037 	}
3038 	if (mi->mi_ro_kstats) {
3039 		kstat_delete(mi->mi_ro_kstats);
3040 		mi->mi_ro_kstats = NULL;
3041 	}
3042 	if (mi->mi_recov_ksp) {
3043 		kstat_delete(mi->mi_recov_ksp);
3044 		mi->mi_recov_ksp = NULL;
3045 	}
3046 	mutex_enter(&mi->mi_msg_list_lock);
3047 	while (msgp = list_head(&mi->mi_msg_list)) {
3048 		list_remove(&mi->mi_msg_list, msgp);
3049 		nfs4_free_msg(msgp);
3050 	}
3051 	mutex_exit(&mi->mi_msg_list_lock);
3052 	list_destroy(&mi->mi_msg_list);
3053 	if (mi->mi_fname != NULL)
3054 		fn_rele(&mi->mi_fname);
3055 	if (mi->mi_rootfh != NULL)
3056 		sfh4_rele(&mi->mi_rootfh);
3057 	if (mi->mi_srvparentfh != NULL)
3058 		sfh4_rele(&mi->mi_srvparentfh);
3059 	svp = mi->mi_servers;
3060 	sv4_free(svp);
3061 	mutex_destroy(&mi->mi_lock);
3062 	mutex_destroy(&mi->mi_async_lock);
3063 	mutex_destroy(&mi->mi_msg_list_lock);
3064 	nfs_rw_destroy(&mi->mi_recovlock);
3065 	nfs_rw_destroy(&mi->mi_rename_lock);
3066 	nfs_rw_destroy(&mi->mi_fh_lock);
3067 	cv_destroy(&mi->mi_failover_cv);
3068 	cv_destroy(&mi->mi_async_reqs_cv);
3069 	cv_destroy(&mi->mi_async_work_cv);
3070 	cv_destroy(&mi->mi_async_cv);
3071 	cv_destroy(&mi->mi_inact_req_cv);
3072 	/*
3073 	 * Destroy the oo hash lists and mutexes for the cred hash table.
3074 	 */
3075 	for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
3076 		bucketp = &(mi->mi_oo_list[i]);
3077 		/* Destroy any remaining open owners on the list */
3078 		foop = list_head(&bucketp->b_oo_hash_list);
3079 		while (foop != NULL) {
3080 			list_remove(&bucketp->b_oo_hash_list, foop);
3081 			nfs4_destroy_open_owner(foop);
3082 			foop = list_head(&bucketp->b_oo_hash_list);
3083 		}
3084 		list_destroy(&bucketp->b_oo_hash_list);
3085 		mutex_destroy(&bucketp->b_lock);
3086 	}
3087 	/*
3088 	 * Empty and destroy the freed open owner list.
3089 	 */
3090 	foop = list_head(&mi->mi_foo_list);
3091 	while (foop != NULL) {
3092 		list_remove(&mi->mi_foo_list, foop);
3093 		nfs4_destroy_open_owner(foop);
3094 		foop = list_head(&mi->mi_foo_list);
3095 	}
3096 	list_destroy(&mi->mi_foo_list);
3097 	list_destroy(&mi->mi_bseqid_list);
3098 	list_destroy(&mi->mi_lost_state);
3099 	avl_destroy(&mi->mi_filehandles);
3100 	kmem_free(mi, sizeof (*mi));
3101 }
/*
 * Take a reference on the given mntinfo4_t by atomically incrementing
 * mi_count.  The reference is dropped again with mi_rele().
 */
void
mi_hold(mntinfo4_t *mi)
{
	atomic_add_32(&mi->mi_count, 1);
	/* A zero count right after the increment would mean it wrapped. */
	ASSERT(mi->mi_count != 0);
}
3108 
3109 void
3110 mi_rele(mntinfo4_t *mi)
3111 {
3112 	ASSERT(mi->mi_count != 0);
3113 	if (atomic_add_32_nv(&mi->mi_count, -1) == 0) {
3114 		nfs_free_mi4(mi);
3115 	}
3116 }
3117 
/*
 * The "notsupp" xattr cache vnode.  nfs4_clnt_init() sets its v_count to 1
 * so that it never goes away.
 */
vnode_t    nfs4_xattr_notsupp_vnode;
3119 
/*
 * Initialise the NFSv4 client: set up the client subsystems, register the
 * CPR (suspend/resume) callback, create the per-zone mntinfo4 list key and
 * pin the notsupp xattr cache vnode.  nfs4_clnt_fini() is the teardown
 * counterpart.
 */
void
nfs4_clnt_init(void)
{
	nfs4_vnops_init();
	(void) nfs4_rnode_init();
	(void) nfs4_shadow_init();
	(void) nfs4_acache_init();
	(void) nfs4_subr_init();
	nfs4_acl_init();
	nfs_idmap_init();
	nfs4_callback_init();
	nfs4_secinfo_init();
#ifdef	DEBUG
	tsd_create(&nfs4_tsd_key, NULL);
#endif

	/*
	 * Add a CPR callback so that we can update client
	 * lease after a suspend and resume.
	 */
	cid = callb_add(nfs4_client_cpr_callb, 0, CB_CL_CPR_RPC, "nfs4");

	/* Per-zone state: init, shutdown and destroy callbacks. */
	zone_key_create(&mi4_list_key, nfs4_mi_init, nfs4_mi_shutdown,
	    nfs4_mi_destroy);

	/*
	 * Initialise the reference count of the notsupp xattr cache vnode to 1
	 * so that it never goes away (VOP_INACTIVE isn't called on it).
	 */
	nfs4_xattr_notsupp_vnode.v_count = 1;
}
3151 
/*
 * Tear down the NFSv4 client state set up by nfs4_clnt_init(): delete the
 * per-zone key, shut down the client subsystems and remove the CPR
 * callback.
 */
void
nfs4_clnt_fini(void)
{
	(void) zone_key_delete(mi4_list_key);
	nfs4_vnops_fini();
	(void) nfs4_rnode_fini();
	(void) nfs4_shadow_fini();
	(void) nfs4_acache_fini();
	(void) nfs4_subr_fini();
	nfs_idmap_fini();
	nfs4_callback_fini();
	nfs4_secinfo_fini();
#ifdef	DEBUG
	tsd_destroy(&nfs4_tsd_key);
#endif
	/* cid is the id returned by callb_add() in nfs4_clnt_init(). */
	if (cid)
		(void) callb_delete(cid);
}
3170 
3171 /*ARGSUSED*/
3172 static boolean_t
3173 nfs4_client_cpr_callb(void *arg, int code)
3174 {
3175 	/*
3176 	 * We get called for Suspend and Resume events.
3177 	 * For the suspend case we simply don't care!
3178 	 */
3179 	if (code == CB_CODE_CPR_CHKPT) {
3180 		return (B_TRUE);
3181 	}
3182 
3183 	/*
3184 	 * When we get to here we are in the process of
3185 	 * resuming the system from a previous suspend.
3186 	 */
3187 	nfs4_client_resumed = gethrestime_sec();
3188 	return (B_TRUE);
3189 }
3190 
3191 void
3192 nfs4_renew_lease_thread(nfs4_server_t *sp)
3193 {
3194 	int	error = 0;
3195 	time_t	tmp_last_renewal_time, tmp_time, tmp_now_time, kip_secs;
3196 	clock_t	tick_delay = 0;
3197 	clock_t time_left = 0;
3198 	callb_cpr_t cpr_info;
3199 	kmutex_t cpr_lock;
3200 
3201 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3202 	    "nfs4_renew_lease_thread: acting on sp 0x%p", (void*)sp));
3203 	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
3204 	CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4Lease");
3205 
3206 	mutex_enter(&sp->s_lock);
3207 	/* sp->s_lease_time is set via a GETATTR */
3208 	sp->last_renewal_time = gethrestime_sec();
3209 	sp->lease_valid = NFS4_LEASE_UNINITIALIZED;
3210 	ASSERT(sp->s_refcnt >= 1);
3211 
3212 	for (;;) {
3213 		if (!sp->state_ref_count ||
3214 		    sp->lease_valid != NFS4_LEASE_VALID) {
3215 
3216 			kip_secs = MAX((sp->s_lease_time >> 1) -
3217 			    (3 * sp->propagation_delay.tv_sec), 1);
3218 
3219 			tick_delay = SEC_TO_TICK(kip_secs);
3220 
3221 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3222 			    "nfs4_renew_lease_thread: no renew : thread "
3223 			    "wait %ld secs", kip_secs));
3224 
3225 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3226 			    "nfs4_renew_lease_thread: no renew : "
3227 			    "state_ref_count %d, lease_valid %d",
3228 			    sp->state_ref_count, sp->lease_valid));
3229 
3230 			mutex_enter(&cpr_lock);
3231 			CALLB_CPR_SAFE_BEGIN(&cpr_info);
3232 			mutex_exit(&cpr_lock);
3233 			time_left = cv_reltimedwait(&sp->cv_thread_exit,
3234 			    &sp->s_lock, tick_delay, TR_CLOCK_TICK);
3235 			mutex_enter(&cpr_lock);
3236 			CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3237 			mutex_exit(&cpr_lock);
3238 
3239 			NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3240 			    "nfs4_renew_lease_thread: no renew: "
3241 			    "time left %ld", time_left));
3242 
3243 			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3244 				goto die;
3245 			continue;
3246 		}
3247 
3248 		tmp_last_renewal_time = sp->last_renewal_time;
3249 
3250 		tmp_time = gethrestime_sec() - sp->last_renewal_time +
3251 		    (3 * sp->propagation_delay.tv_sec);
3252 
3253 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3254 		    "nfs4_renew_lease_thread: tmp_time %ld, "
3255 		    "sp->last_renewal_time %ld", tmp_time,
3256 		    sp->last_renewal_time));
3257 
3258 		kip_secs = MAX((sp->s_lease_time >> 1) - tmp_time, 1);
3259 
3260 		tick_delay = SEC_TO_TICK(kip_secs);
3261 
3262 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3263 		    "nfs4_renew_lease_thread: valid lease: sleep for %ld "
3264 		    "secs", kip_secs));
3265 
3266 		mutex_enter(&cpr_lock);
3267 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
3268 		mutex_exit(&cpr_lock);
3269 		time_left = cv_reltimedwait(&sp->cv_thread_exit, &sp->s_lock,
3270 		    tick_delay, TR_CLOCK_TICK);
3271 		mutex_enter(&cpr_lock);
3272 		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3273 		mutex_exit(&cpr_lock);
3274 
3275 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3276 		    "nfs4_renew_lease_thread: valid lease: time left %ld :"
3277 		    "sp last_renewal_time %ld, nfs4_client_resumed %ld, "
3278 		    "tmp_last_renewal_time %ld", time_left,
3279 		    sp->last_renewal_time, nfs4_client_resumed,
3280 		    tmp_last_renewal_time));
3281 
3282 		if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3283 			goto die;
3284 
3285 		if (tmp_last_renewal_time == sp->last_renewal_time ||
3286 		    (nfs4_client_resumed != 0 &&
3287 		    nfs4_client_resumed > sp->last_renewal_time)) {
3288 			/*
3289 			 * Issue RENEW op since we haven't renewed the lease
3290 			 * since we slept.
3291 			 */
3292 			tmp_now_time = gethrestime_sec();
3293 			error = nfs4renew(sp);
3294 			/*
3295 			 * Need to re-acquire sp's lock, nfs4renew()
3296 			 * relinqueshes it.
3297 			 */
3298 			mutex_enter(&sp->s_lock);
3299 
3300 			/*
3301 			 * See if someone changed s_thread_exit while we gave
3302 			 * up s_lock.
3303 			 */
3304 			if (sp->s_thread_exit == NFS4_THREAD_EXIT)
3305 				goto die;
3306 
3307 			if (!error) {
3308 				/*
3309 				 * check to see if we implicitly renewed while
3310 				 * we waited for a reply for our RENEW call.
3311 				 */
3312 				if (tmp_last_renewal_time ==
3313 				    sp->last_renewal_time) {
3314 					/* no implicit renew came */
3315 					sp->last_renewal_time = tmp_now_time;
3316 				} else {
3317 					NFS4_DEBUG(nfs4_client_lease_debug,
3318 					    (CE_NOTE, "renew_thread: did "
3319 					    "implicit renewal before reply "
3320 					    "from server for RENEW"));
3321 				}
3322 			} else {
3323 				/* figure out error */
3324 				NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3325 				    "renew_thread: nfs4renew returned error"
3326 				    " %d", error));
3327 			}
3328 
3329 		}
3330 	}
3331 
3332 die:
3333 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3334 	    "nfs4_renew_lease_thread: thread exiting"));
3335 
3336 	while (sp->s_otw_call_count != 0) {
3337 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3338 		    "nfs4_renew_lease_thread: waiting for outstanding "
3339 		    "otw calls to finish for sp 0x%p, current "
3340 		    "s_otw_call_count %d", (void *)sp,
3341 		    sp->s_otw_call_count));
3342 		mutex_enter(&cpr_lock);
3343 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
3344 		mutex_exit(&cpr_lock);
3345 		cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
3346 		mutex_enter(&cpr_lock);
3347 		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
3348 		mutex_exit(&cpr_lock);
3349 	}
3350 	mutex_exit(&sp->s_lock);
3351 
3352 	nfs4_server_rele(sp);		/* free the thread's reference */
3353 	nfs4_server_rele(sp);		/* free the list's reference */
3354 	sp = NULL;
3355 
3356 done:
3357 	mutex_enter(&cpr_lock);
3358 	CALLB_CPR_EXIT(&cpr_info);	/* drops cpr_lock */
3359 	mutex_destroy(&cpr_lock);
3360 
3361 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3362 	    "nfs4_renew_lease_thread: renew thread exit officially"));
3363 
3364 	zthread_exit();
3365 	/* NOT REACHED */
3366 }
3367 
3368 /*
3369  * Send out a RENEW op to the server.
3370  * Assumes sp is locked down.
3371  */
3372 static int
3373 nfs4renew(nfs4_server_t *sp)
3374 {
3375 	COMPOUND4args_clnt args;
3376 	COMPOUND4res_clnt res;
3377 	nfs_argop4 argop[1];
3378 	int doqueue = 1;
3379 	int rpc_error;
3380 	cred_t *cr;
3381 	mntinfo4_t *mi;
3382 	timespec_t prop_time, after_time;
3383 	int needrecov = FALSE;
3384 	nfs4_recov_state_t recov_state;
3385 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3386 
3387 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4renew"));
3388 
3389 	recov_state.rs_flags = 0;
3390 	recov_state.rs_num_retry_despite_err = 0;
3391 
3392 recov_retry:
3393 	mi = sp->mntinfo4_list;
3394 	VFS_HOLD(mi->mi_vfsp);
3395 	mutex_exit(&sp->s_lock);
3396 	ASSERT(mi != NULL);
3397 
3398 	e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3399 	if (e.error) {
3400 		VFS_RELE(mi->mi_vfsp);
3401 		return (e.error);
3402 	}
3403 
3404 	/* Check to see if we're dealing with a marked-dead sp */
3405 	mutex_enter(&sp->s_lock);
3406 	if (sp->s_thread_exit == NFS4_THREAD_EXIT) {
3407 		mutex_exit(&sp->s_lock);
3408 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3409 		VFS_RELE(mi->mi_vfsp);
3410 		return (0);
3411 	}
3412 
3413 	/* Make sure mi hasn't changed on us */
3414 	if (mi != sp->mntinfo4_list) {
3415 		/* Must drop sp's lock to avoid a recursive mutex enter */
3416 		mutex_exit(&sp->s_lock);
3417 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3418 		VFS_RELE(mi->mi_vfsp);
3419 		mutex_enter(&sp->s_lock);
3420 		goto recov_retry;
3421 	}
3422 	mutex_exit(&sp->s_lock);
3423 
3424 	args.ctag = TAG_RENEW;
3425 
3426 	args.array_len = 1;
3427 	args.array = argop;
3428 
3429 	argop[0].argop = OP_RENEW;
3430 
3431 	mutex_enter(&sp->s_lock);
3432 	argop[0].nfs_argop4_u.oprenew.clientid = sp->clientid;
3433 	cr = sp->s_cred;
3434 	crhold(cr);
3435 	mutex_exit(&sp->s_lock);
3436 
3437 	ASSERT(cr != NULL);
3438 
3439 	/* used to figure out RTT for sp */
3440 	gethrestime(&prop_time);
3441 
3442 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3443 	    "nfs4renew: %s call, sp 0x%p", needrecov ? "recov" : "first",
3444 	    (void*)sp));
3445 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "before: %ld s %ld ns ",
3446 	    prop_time.tv_sec, prop_time.tv_nsec));
3447 
3448 	DTRACE_PROBE2(nfs4__renew__start, nfs4_server_t *, sp,
3449 	    mntinfo4_t *, mi);
3450 
3451 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3452 	crfree(cr);
3453 
3454 	DTRACE_PROBE2(nfs4__renew__end, nfs4_server_t *, sp,
3455 	    mntinfo4_t *, mi);
3456 
3457 	gethrestime(&after_time);
3458 
3459 	mutex_enter(&sp->s_lock);
3460 	sp->propagation_delay.tv_sec =
3461 	    MAX(1, after_time.tv_sec - prop_time.tv_sec);
3462 	mutex_exit(&sp->s_lock);
3463 
3464 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "after : %ld s %ld ns ",
3465 	    after_time.tv_sec, after_time.tv_nsec));
3466 
3467 	if (e.error == 0 && res.status == NFS4ERR_CB_PATH_DOWN) {
3468 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3469 		nfs4_delegreturn_all(sp);
3470 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3471 		VFS_RELE(mi->mi_vfsp);
3472 		/*
3473 		 * If the server returns CB_PATH_DOWN, it has renewed
3474 		 * the lease and informed us that the callback path is
3475 		 * down.  Since the lease is renewed, just return 0 and
3476 		 * let the renew thread proceed as normal.
3477 		 */
3478 		return (0);
3479 	}
3480 
3481 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3482 	if (!needrecov && e.error) {
3483 		nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3484 		VFS_RELE(mi->mi_vfsp);
3485 		return (e.error);
3486 	}
3487 
3488 	rpc_error = e.error;
3489 
3490 	if (needrecov) {
3491 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3492 		    "nfs4renew: initiating recovery\n"));
3493 
3494 		if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
3495 		    OP_RENEW, NULL, NULL, NULL) == FALSE) {
3496 			nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3497 			VFS_RELE(mi->mi_vfsp);
3498 			if (!e.error)
3499 				(void) xdr_free(xdr_COMPOUND4res_clnt,
3500 				    (caddr_t)&res);
3501 			mutex_enter(&sp->s_lock);
3502 			goto recov_retry;
3503 		}
3504 		/* fall through for res.status case */
3505 	}
3506 
3507 	if (res.status) {
3508 		if (res.status == NFS4ERR_LEASE_MOVED) {
3509 			/*EMPTY*/
3510 			/*
3511 			 * XXX need to try every mntinfo4 in sp->mntinfo4_list
3512 			 * to renew the lease on that server
3513 			 */
3514 		}
3515 		e.error = geterrno4(res.status);
3516 	}
3517 
3518 	if (!rpc_error)
3519 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3520 
3521 	nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
3522 
3523 	VFS_RELE(mi->mi_vfsp);
3524 
3525 	return (e.error);
3526 }
3527 
3528 void
3529 nfs4_inc_state_ref_count(mntinfo4_t *mi)
3530 {
3531 	nfs4_server_t	*sp;
3532 
3533 	/* this locks down sp if it is found */
3534 	sp = find_nfs4_server(mi);
3535 
3536 	if (sp != NULL) {
3537 		nfs4_inc_state_ref_count_nolock(sp, mi);
3538 		mutex_exit(&sp->s_lock);
3539 		nfs4_server_rele(sp);
3540 	}
3541 }
3542 
3543 /*
3544  * Bump the number of OPEN files (ie: those with state) so we know if this
3545  * nfs4_server has any state to maintain a lease for or not.
3546  *
3547  * Also, marks the nfs4_server's lease valid if it hasn't been done so already.
3548  */
3549 void
3550 nfs4_inc_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3551 {
3552 	ASSERT(mutex_owned(&sp->s_lock));
3553 
3554 	sp->state_ref_count++;
3555 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3556 	    "nfs4_inc_state_ref_count: state_ref_count now %d",
3557 	    sp->state_ref_count));
3558 
3559 	if (sp->lease_valid == NFS4_LEASE_UNINITIALIZED)
3560 		sp->lease_valid = NFS4_LEASE_VALID;
3561 
3562 	/*
3563 	 * If this call caused the lease to be marked valid and/or
3564 	 * took the state_ref_count from 0 to 1, then start the time
3565 	 * on lease renewal.
3566 	 */
3567 	if (sp->lease_valid == NFS4_LEASE_VALID && sp->state_ref_count == 1)
3568 		sp->last_renewal_time = gethrestime_sec();
3569 
3570 	/* update the number of open files for mi */
3571 	mi->mi_open_files++;
3572 }
3573 
3574 void
3575 nfs4_dec_state_ref_count(mntinfo4_t *mi)
3576 {
3577 	nfs4_server_t	*sp;
3578 
3579 	/* this locks down sp if it is found */
3580 	sp = find_nfs4_server_all(mi, 1);
3581 
3582 	if (sp != NULL) {
3583 		nfs4_dec_state_ref_count_nolock(sp, mi);
3584 		mutex_exit(&sp->s_lock);
3585 		nfs4_server_rele(sp);
3586 	}
3587 }
3588 
3589 /*
3590  * Decrement the number of OPEN files (ie: those with state) so we know if
3591  * this nfs4_server has any state to maintain a lease for or not.
3592  */
3593 void
3594 nfs4_dec_state_ref_count_nolock(nfs4_server_t *sp, mntinfo4_t *mi)
3595 {
3596 	ASSERT(mutex_owned(&sp->s_lock));
3597 	ASSERT(sp->state_ref_count != 0);
3598 	sp->state_ref_count--;
3599 
3600 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3601 	    "nfs4_dec_state_ref_count: state ref count now %d",
3602 	    sp->state_ref_count));
3603 
3604 	mi->mi_open_files--;
3605 	NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3606 	    "nfs4_dec_state_ref_count: mi open files %d, v4 flags 0x%x",
3607 	    mi->mi_open_files, mi->mi_flags));
3608 
3609 	/* We don't have to hold the mi_lock to test mi_flags */
3610 	if (mi->mi_open_files == 0 &&
3611 	    (mi->mi_flags & MI4_REMOVE_ON_LAST_CLOSE)) {
3612 		NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3613 		    "nfs4_dec_state_ref_count: remove mntinfo4 %p since "
3614 		    "we have closed the last open file", (void*)mi));
3615 		nfs4_remove_mi_from_server(mi, sp);
3616 	}
3617 }
3618 
3619 bool_t
3620 inlease(nfs4_server_t *sp)
3621 {
3622 	bool_t result;
3623 
3624 	ASSERT(mutex_owned(&sp->s_lock));
3625 
3626 	if (sp->lease_valid == NFS4_LEASE_VALID &&
3627 	    gethrestime_sec() < sp->last_renewal_time + sp->s_lease_time)
3628 		result = TRUE;
3629 	else
3630 		result = FALSE;
3631 
3632 	return (result);
3633 }
3634 
3635 
3636 /*
3637  * Return non-zero if the given nfs4_server_t is going through recovery.
3638  */
3639 
3640 int
3641 nfs4_server_in_recovery(nfs4_server_t *sp)
3642 {
3643 	return (nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
3644 }
3645 
3646 /*
3647  * Compare two shared filehandle objects.  Returns -1, 0, or +1, if the
3648  * first is less than, equal to, or greater than the second.
3649  */
3650 
3651 int
3652 sfh4cmp(const void *p1, const void *p2)
3653 {
3654 	const nfs4_sharedfh_t *sfh1 = (const nfs4_sharedfh_t *)p1;
3655 	const nfs4_sharedfh_t *sfh2 = (const nfs4_sharedfh_t *)p2;
3656 
3657 	return (nfs4cmpfh(&sfh1->sfh_fh, &sfh2->sfh_fh));
3658 }
3659 
/*
 * Create a table for shared filehandle objects.  The table is an AVL tree
 * of nfs4_sharedfh_t nodes, linked through their sfh_tree member and
 * ordered by sfh4cmp().
 */

void
sfh4_createtab(avl_tree_t *tab)
{
	avl_create(tab, sfh4cmp, sizeof (nfs4_sharedfh_t),
	    offsetof(nfs4_sharedfh_t, sfh_tree));
}
3670 
3671 /*
3672  * Return a shared filehandle object for the given filehandle.  The caller
3673  * is responsible for eventually calling sfh4_rele().
3674  */
3675 
3676 nfs4_sharedfh_t *
3677 sfh4_put(const nfs_fh4 *fh, mntinfo4_t *mi, nfs4_sharedfh_t *key)
3678 {
3679 	nfs4_sharedfh_t *sfh, *nsfh;
3680 	avl_index_t where;
3681 	nfs4_sharedfh_t skey;
3682 
3683 	if (!key) {
3684 		skey.sfh_fh = *fh;
3685 		key = &skey;
3686 	}
3687 
3688 	nsfh = kmem_alloc(sizeof (nfs4_sharedfh_t), KM_SLEEP);
3689 	nsfh->sfh_fh.nfs_fh4_len = fh->nfs_fh4_len;
3690 	/*
3691 	 * We allocate the largest possible filehandle size because it's
3692 	 * not that big, and it saves us from possibly having to resize the
3693 	 * buffer later.
3694 	 */
3695 	nsfh->sfh_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP);
3696 	bcopy(fh->nfs_fh4_val, nsfh->sfh_fh.nfs_fh4_val, fh->nfs_fh4_len);
3697 	mutex_init(&nsfh->sfh_lock, NULL, MUTEX_DEFAULT, NULL);
3698 	nsfh->sfh_refcnt = 1;
3699 	nsfh->sfh_flags = SFH4_IN_TREE;
3700 	nsfh->sfh_mi = mi;
3701 	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE, "sfh4_get: new object (%p)",
3702 	    (void *)nsfh));
3703 
3704 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
3705 	sfh = avl_find(&mi->mi_filehandles, key, &where);
3706 	if (sfh != NULL) {
3707 		mutex_enter(&sfh->sfh_lock);
3708 		sfh->sfh_refcnt++;
3709 		mutex_exit(&sfh->sfh_lock);
3710 		nfs_rw_exit(&mi->mi_fh_lock);
3711 		/* free our speculative allocs */
3712 		kmem_free(nsfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
3713 		kmem_free(nsfh, sizeof (nfs4_sharedfh_t));
3714 		return (sfh);
3715 	}
3716 
3717 	avl_insert(&mi->mi_filehandles, nsfh, where);
3718 	nfs_rw_exit(&mi->mi_fh_lock);
3719 
3720 	return (nsfh);
3721 }
3722 
3723 /*
3724  * Return a shared filehandle object for the given filehandle.  The caller
3725  * is responsible for eventually calling sfh4_rele().
3726  */
3727 
3728 nfs4_sharedfh_t *
3729 sfh4_get(const nfs_fh4 *fh, mntinfo4_t *mi)
3730 {
3731 	nfs4_sharedfh_t *sfh;
3732 	nfs4_sharedfh_t key;
3733 
3734 	ASSERT(fh->nfs_fh4_len <= NFS4_FHSIZE);
3735 
3736 #ifdef DEBUG
3737 	if (nfs4_sharedfh_debug) {
3738 		nfs4_fhandle_t fhandle;
3739 
3740 		fhandle.fh_len = fh->nfs_fh4_len;
3741 		bcopy(fh->nfs_fh4_val, fhandle.fh_buf, fhandle.fh_len);
3742 		zcmn_err(mi->mi_zone->zone_id, CE_NOTE, "sfh4_get:");
3743 		nfs4_printfhandle(&fhandle);
3744 	}
3745 #endif
3746 
3747 	/*
3748 	 * If there's already an object for the given filehandle, bump the
3749 	 * reference count and return it.  Otherwise, create a new object
3750 	 * and add it to the AVL tree.
3751 	 */
3752 
3753 	key.sfh_fh = *fh;
3754 
3755 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
3756 	sfh = avl_find(&mi->mi_filehandles, &key, NULL);
3757 	if (sfh != NULL) {
3758 		mutex_enter(&sfh->sfh_lock);
3759 		sfh->sfh_refcnt++;
3760 		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
3761 		    "sfh4_get: found existing %p, new refcnt=%d",
3762 		    (void *)sfh, sfh->sfh_refcnt));
3763 		mutex_exit(&sfh->sfh_lock);
3764 		nfs_rw_exit(&mi->mi_fh_lock);
3765 		return (sfh);
3766 	}
3767 	nfs_rw_exit(&mi->mi_fh_lock);
3768 
3769 	return (sfh4_put(fh, mi, &key));
3770 }
3771 
/*
 * Get a reference to the given shared filehandle object.  The reference
 * count is incremented under sfh_lock; the reference is dropped again
 * with sfh4_rele().
 */

void
sfh4_hold(nfs4_sharedfh_t *sfh)
{
	/* Checked without sfh_lock held. */
	ASSERT(sfh->sfh_refcnt > 0);

	mutex_enter(&sfh->sfh_lock);
	sfh->sfh_refcnt++;
	NFS4_DEBUG(nfs4_sharedfh_debug,
	    (CE_NOTE, "sfh4_hold %p, new refcnt=%d",
	    (void *)sfh, sfh->sfh_refcnt));
	mutex_exit(&sfh->sfh_lock);
}
3788 
/*
 * Release a reference to the given shared filehandle object and null out
 * the given pointer.
 *
 * If this is the last reference, the object is removed from its mount's
 * filehandle table (if it is still in it) and freed.  *sfhpp is set to
 * NULL on every path.
 */

void
sfh4_rele(nfs4_sharedfh_t **sfhpp)
{
	mntinfo4_t *mi;
	nfs4_sharedfh_t *sfh = *sfhpp;

	ASSERT(sfh->sfh_refcnt > 0);

	/*
	 * Fast path: other references remain, so only sfh_lock is needed
	 * to drop ours.
	 */
	mutex_enter(&sfh->sfh_lock);
	if (sfh->sfh_refcnt > 1) {
		sfh->sfh_refcnt--;
		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
		    "sfh4_rele %p, new refcnt=%d",
		    (void *)sfh, sfh->sfh_refcnt));
		mutex_exit(&sfh->sfh_lock);
		goto finish;
	}
	mutex_exit(&sfh->sfh_lock);

	/*
	 * Possibly the last reference, so get the lock for the table in
	 * case it's time to remove the object from the table.
	 *
	 * sfh_lock was dropped above so that the table lock can be taken
	 * first.  The count is re-checked once both locks are held, since
	 * a new reference may have been taken in between.
	 */
	mi = sfh->sfh_mi;
	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
	mutex_enter(&sfh->sfh_lock);
	sfh->sfh_refcnt--;
	if (sfh->sfh_refcnt > 0) {
		NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
		    "sfh4_rele %p, new refcnt=%d",
		    (void *)sfh, sfh->sfh_refcnt));
		mutex_exit(&sfh->sfh_lock);
		nfs_rw_exit(&mi->mi_fh_lock);
		goto finish;
	}

	NFS4_DEBUG(nfs4_sharedfh_debug, (CE_NOTE,
	    "sfh4_rele %p, last ref", (void *)sfh));
	/* sfh4_update() may have left the object out of the tree. */
	if (sfh->sfh_flags & SFH4_IN_TREE) {
		avl_remove(&mi->mi_filehandles, sfh);
		sfh->sfh_flags &= ~SFH4_IN_TREE;
	}
	mutex_exit(&sfh->sfh_lock);
	nfs_rw_exit(&mi->mi_fh_lock);
	mutex_destroy(&sfh->sfh_lock);
	/* The filehandle buffer is always allocated at NFS4_FHSIZE. */
	kmem_free(sfh->sfh_fh.nfs_fh4_val, NFS4_FHSIZE);
	kmem_free(sfh, sizeof (nfs4_sharedfh_t));

finish:
	*sfhpp = NULL;
}
3845 
/*
 * Update the filehandle for the given shared filehandle object.
 *
 * The object is taken out of its mount's filehandle table, given the new
 * filehandle and put back, all under the table's writer lock.  If another
 * object with the new filehandle is already in the table, the given
 * object is left out of the table.
 */

int nfs4_warn_dupfh = 0;	/* if set, always warn about dup fhs below */

void
sfh4_update(nfs4_sharedfh_t *sfh, const nfs_fh4 *newfh)
{
	mntinfo4_t *mi = sfh->sfh_mi;
	nfs4_sharedfh_t *dupsfh;
	avl_index_t where;
	nfs4_sharedfh_t key;

#ifdef DEBUG
	mutex_enter(&sfh->sfh_lock);
	ASSERT(sfh->sfh_refcnt > 0);
	mutex_exit(&sfh->sfh_lock);
#endif
	/* The object's filehandle buffer is NFS4_FHSIZE bytes long. */
	ASSERT(newfh->nfs_fh4_len <= NFS4_FHSIZE);

	/*
	 * The basic plan is to remove the shared filehandle object from
	 * the table, update it to have the new filehandle, then reinsert
	 * it.
	 */

	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_WRITER, 0);
	mutex_enter(&sfh->sfh_lock);
	if (sfh->sfh_flags & SFH4_IN_TREE) {
		avl_remove(&mi->mi_filehandles, sfh);
		sfh->sfh_flags &= ~SFH4_IN_TREE;
	}
	mutex_exit(&sfh->sfh_lock);
	/* The filehandle is rewritten under the table's writer lock. */
	sfh->sfh_fh.nfs_fh4_len = newfh->nfs_fh4_len;
	bcopy(newfh->nfs_fh4_val, sfh->sfh_fh.nfs_fh4_val,
	    sfh->sfh_fh.nfs_fh4_len);

	/*
	 * XXX If there is already a shared filehandle object with the new
	 * filehandle, we're in trouble, because the rnode code assumes
	 * that there is only one shared filehandle object for a given
	 * filehandle.  So issue a warning (for read-write mounts only)
	 * and don't try to re-insert the given object into the table.
	 * Hopefully the given object will quickly go away and everyone
	 * will use the new object.
	 */
	key.sfh_fh = *newfh;
	dupsfh = avl_find(&mi->mi_filehandles, &key, &where);
	if (dupsfh != NULL) {
		/* nfs4_warn_dupfh forces the warning for read-only mounts. */
		if (!(mi->mi_vfsp->vfs_flag & VFS_RDONLY) || nfs4_warn_dupfh) {
			zcmn_err(mi->mi_zone->zone_id, CE_WARN, "sfh4_update: "
			    "duplicate filehandle detected");
			sfh4_printfhandle(dupsfh);
		}
	} else {
		avl_insert(&mi->mi_filehandles, sfh, where);
		mutex_enter(&sfh->sfh_lock);
		sfh->sfh_flags |= SFH4_IN_TREE;
		mutex_exit(&sfh->sfh_lock);
	}
	nfs_rw_exit(&mi->mi_fh_lock);
}
3909 
/*
 * Copy out the current filehandle for the given shared filehandle object
 * into *fhp (length into fh_len, bytes into fh_buf).
 */

void
sfh4_copyval(const nfs4_sharedfh_t *sfh, nfs4_fhandle_t *fhp)
{
	mntinfo4_t *mi = sfh->sfh_mi;

	ASSERT(sfh->sfh_refcnt > 0);

	/*
	 * sfh4_update() rewrites the filehandle under the writer lock, so
	 * holding the table lock as reader keeps it stable during the copy.
	 */
	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
	fhp->fh_len = sfh->sfh_fh.nfs_fh4_len;
	ASSERT(fhp->fh_len <= NFS4_FHSIZE);
	bcopy(sfh->sfh_fh.nfs_fh4_val, fhp->fh_buf, fhp->fh_len);
	nfs_rw_exit(&mi->mi_fh_lock);
}
3927 
/*
 * Print out the filehandle for the given shared filehandle object.  The
 * filehandle is first copied out with sfh4_copyval() and the copy is
 * handed to nfs4_printfhandle().
 */

void
sfh4_printfhandle(const nfs4_sharedfh_t *sfh)
{
	nfs4_fhandle_t fhandle;

	sfh4_copyval(sfh, &fhandle);
	nfs4_printfhandle(&fhandle);
}
3940 
3941 /*
3942  * Compare 2 fnames.  Returns -1 if the first is "less" than the second, 0
3943  * if they're the same, +1 if the first is "greater" than the second.  The
3944  * caller (or whoever's calling the AVL package) is responsible for
3945  * handling locking issues.
3946  */
3947 
3948 static int
3949 fncmp(const void *p1, const void *p2)
3950 {
3951 	const nfs4_fname_t *f1 = p1;
3952 	const nfs4_fname_t *f2 = p2;
3953 	int res;
3954 
3955 	res = strcmp(f1->fn_name, f2->fn_name);
3956 	/*
3957 	 * The AVL package wants +/-1, not arbitrary positive or negative
3958 	 * integers.
3959 	 */
3960 	if (res > 0)
3961 		res = 1;
3962 	else if (res < 0)
3963 		res = -1;
3964 	return (res);
3965 }
3966 
/*
 * Get or create an fname with the given name, as a child of the given
 * fname.  The caller is responsible for eventually releasing the reference
 * (fn_rele()).  parent may be NULL.
 *
 * An existing child is only reused if it also refers to the same shared
 * filehandle (sfh); a child with the same name but a different sfh is
 * unlinked from the parent and the lookup is retried.  A newly created
 * fname takes its own hold on both parent and sfh.
 */

nfs4_fname_t *
fn_get(nfs4_fname_t *parent, char *name, nfs4_sharedfh_t *sfh)
{
	nfs4_fname_t key;
	nfs4_fname_t *fnp;
	avl_index_t where;

	/* Only fn_name is set: fncmp() compares fn_name and nothing else. */
	key.fn_name = name;

	/*
	 * If there's already an fname registered with the given name, bump
	 * its reference count and return it.  Otherwise, create a new one
	 * and add it to the parent's AVL tree.
	 *
	 * fname entries we are looking for should match both name
	 * and sfh stored in the fname.
	 */
again:
	if (parent != NULL) {
		mutex_enter(&parent->fn_lock);
		fnp = avl_find(&parent->fn_children, &key, &where);
		if (fnp != NULL) {
			/*
			 * This hold on fnp is released below later,
			 * in case this is not the fnp we want.
			 */
			fn_hold(fnp);

			if (fnp->fn_sfh == sfh) {
				/*
				 * We have found our entry.  The hold taken
				 * above becomes the caller's reference.
				 */
				mutex_exit(&parent->fn_lock);
				return (fnp);
			}

			/*
			 * We have found an entry that has a mismatching
			 * fn_sfh. This could be a stale entry due to
			 * server side rename. We will remove this entry
			 * and make sure no such entries exist.
			 */
			mutex_exit(&parent->fn_lock);
			mutex_enter(&fnp->fn_lock);
			if (fnp->fn_parent == parent) {
				/*
				 * Remove ourselves from parent's
				 * fn_children tree.  fn_rele() also NULLs
				 * fnp->fn_parent, so the final fn_rele() of
				 * fnp will not touch the parent's tree again.
				 */
				mutex_enter(&parent->fn_lock);
				avl_remove(&parent->fn_children, fnp);
				mutex_exit(&parent->fn_lock);
				fn_rele(&fnp->fn_parent);
			}
			mutex_exit(&fnp->fn_lock);
			/* Drop the hold taken after the lookup and retry. */
			fn_rele(&fnp);
			goto again;
		}
		/*
		 * No such child: parent->fn_lock stays held until the new
		 * fname has been inserted below, so that "where" remains
		 * valid for avl_insert().
		 */
	}

	fnp = kmem_alloc(sizeof (nfs4_fname_t), KM_SLEEP);
	mutex_init(&fnp->fn_lock, NULL, MUTEX_DEFAULT, NULL);
	fnp->fn_parent = parent;
	if (parent != NULL)
		fn_hold(parent);
	fnp->fn_len = strlen(name);
	ASSERT(fnp->fn_len < MAXNAMELEN);
	fnp->fn_name = kmem_alloc(fnp->fn_len + 1, KM_SLEEP);
	(void) strcpy(fnp->fn_name, name);
	fnp->fn_refcnt = 1;

	/*
	 * This hold on sfh is later released
	 * when we do the final fn_rele() on this fname.
	 */
	sfh4_hold(sfh);
	fnp->fn_sfh = sfh;

	avl_create(&fnp->fn_children, fncmp, sizeof (nfs4_fname_t),
	    offsetof(nfs4_fname_t, fn_tree));
	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_get %p:%s, a new nfs4_fname_t!",
	    (void *)fnp, fnp->fn_name));
	if (parent != NULL) {
		avl_insert(&parent->fn_children, fnp, where);
		mutex_exit(&parent->fn_lock);
	}

	return (fnp);
}
4064 
/*
 * Take a reference on the given fname by atomically incrementing
 * fn_refcnt.  The reference is dropped again with fn_rele().
 */
void
fn_hold(nfs4_fname_t *fnp)
{
	atomic_add_32(&fnp->fn_refcnt, 1);
	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_hold %p:%s, new refcnt=%d",
	    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
}
4073 
/*
 * Decrement the reference count of the given fname, and destroy it if its
 * reference count goes to zero.  Nulls out the given pointer.
 *
 * Destroying an fname also drops the reference it holds on its parent,
 * which may in turn destroy the parent, and so on up the tree.
 */

void
fn_rele(nfs4_fname_t **fnpp)
{
	nfs4_fname_t *parent;
	uint32_t newref;
	nfs4_fname_t *fnp;

recur:
	fnp = *fnpp;
	*fnpp = NULL;

	/* Locks are taken child first, then parent. */
	mutex_enter(&fnp->fn_lock);
	parent = fnp->fn_parent;
	if (parent != NULL)
		mutex_enter(&parent->fn_lock);	/* prevent new references */
	newref = atomic_add_32_nv(&fnp->fn_refcnt, -1);
	if (newref > 0) {
		NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
		    "fn_rele %p:%s, new refcnt=%d",
		    (void *)fnp, fnp->fn_name, fnp->fn_refcnt));
		if (parent != NULL)
			mutex_exit(&parent->fn_lock);
		mutex_exit(&fnp->fn_lock);
		return;
	}

	NFS4_DEBUG(nfs4_fname_debug, (CE_NOTE,
	    "fn_rele %p:%s, last reference, deleting...",
	    (void *)fnp, fnp->fn_name));
	if (parent != NULL) {
		avl_remove(&parent->fn_children, fnp);
		mutex_exit(&parent->fn_lock);
	}
	kmem_free(fnp->fn_name, fnp->fn_len + 1);
	/* Drop the hold on the shared filehandle taken in fn_get(). */
	sfh4_rele(&fnp->fn_sfh);
	/* fn_lock is still held by this thread when it is destroyed. */
	mutex_destroy(&fnp->fn_lock);
	avl_destroy(&fnp->fn_children);
	kmem_free(fnp, sizeof (nfs4_fname_t));
	/*
	 * Recursively fn_rele the parent.
	 * Use goto instead of a recursive call to avoid stack overflow.
	 */
	if (parent != NULL) {
		/* "parent" is a local, so nulling it at recur is harmless. */
		fnpp = &parent;
		goto recur;
	}
}
4126 
4127 /*
4128  * Returns the single component name of the given fname, in a MAXNAMELEN
4129  * string buffer, which the caller is responsible for freeing.  Note that
4130  * the name may become invalid as a result of fn_move().
4131  */
4132 
4133 char *
4134 fn_name(nfs4_fname_t *fnp)
4135 {
4136 	char *name;
4137 
4138 	ASSERT(fnp->fn_len < MAXNAMELEN);
4139 	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
4140 	mutex_enter(&fnp->fn_lock);
4141 	(void) strcpy(name, fnp->fn_name);
4142 	mutex_exit(&fnp->fn_lock);
4143 
4144 	return (name);
4145 }
4146 
4147 
4148 /*
4149  * fn_path_realloc
4150  *
4151  * This function, used only by fn_path, constructs
4152  * a new string which looks like "prepend" + "/" + "current".
4153  * by allocating a new string and freeing the old one.
4154  */
4155 static void
4156 fn_path_realloc(char **curses, char *prepend)
4157 {
4158 	int len, curlen = 0;
4159 	char *news;
4160 
4161 	if (*curses == NULL) {
4162 		/*
4163 		 * Prime the pump, allocate just the
4164 		 * space for prepend and return that.
4165 		 */
4166 		len = strlen(prepend) + 1;
4167 		news = kmem_alloc(len, KM_SLEEP);
4168 		(void) strncpy(news, prepend, len);
4169 	} else {
4170 		/*
4171 		 * Allocate the space  for a new string
4172 		 * +1 +1 is for the "/" and the NULL
4173 		 * byte at the end of it all.
4174 		 */
4175 		curlen = strlen(*curses);
4176 		len = curlen + strlen(prepend) + 1 + 1;
4177 		news = kmem_alloc(len, KM_SLEEP);
4178 		(void) strncpy(news, prepend, len);
4179 		(void) strcat(news, "/");
4180 		(void) strcat(news, *curses);
4181 		kmem_free(*curses, curlen + 1);
4182 	}
4183 	*curses = news;
4184 }
4185 
4186 /*
4187  * Returns the path name (starting from the fs root) for the given fname.
4188  * The caller is responsible for freeing.  Note that the path may be or
4189  * become invalid as a result of fn_move().
4190  */
4191 
4192 char *
4193 fn_path(nfs4_fname_t *fnp)
4194 {
4195 	char *path;
4196 	nfs4_fname_t *nextfnp;
4197 
4198 	if (fnp == NULL)
4199 		return (NULL);
4200 
4201 	path = NULL;
4202 
4203 	/* walk up the tree constructing the pathname.  */
4204 
4205 	fn_hold(fnp);			/* adjust for later rele */
4206 	do {
4207 		mutex_enter(&fnp->fn_lock);
4208 		/*
4209 		 * Add fn_name in front of the current path
4210 		 */
4211 		fn_path_realloc(&path, fnp->fn_name);
4212 		nextfnp = fnp->fn_parent;
4213 		if (nextfnp != NULL)
4214 			fn_hold(nextfnp);
4215 		mutex_exit(&fnp->fn_lock);
4216 		fn_rele(&fnp);
4217 		fnp = nextfnp;
4218 	} while (fnp != NULL);
4219 
4220 	return (path);
4221 }
4222 
4223 /*
4224  * Return a reference to the parent of the given fname, which the caller is
4225  * responsible for eventually releasing.
4226  */
4227 
4228 nfs4_fname_t *
4229 fn_parent(nfs4_fname_t *fnp)
4230 {
4231 	nfs4_fname_t *parent;
4232 
4233 	mutex_enter(&fnp->fn_lock);
4234 	parent = fnp->fn_parent;
4235 	if (parent != NULL)
4236 		fn_hold(parent);
4237 	mutex_exit(&fnp->fn_lock);
4238 
4239 	return (parent);
4240 }
4241 
4242 /*
4243  * Update fnp so that its parent is newparent and its name is newname.
4244  */
4245 
4246 void
4247 fn_move(nfs4_fname_t *fnp, nfs4_fname_t *newparent, char *newname)
4248 {
4249 	nfs4_fname_t *parent, *tmpfnp;
4250 	ssize_t newlen;
4251 	nfs4_fname_t key;
4252 	avl_index_t where;
4253 
4254 	/*
4255 	 * This assert exists to catch the client trying to rename
4256 	 * a dir to be a child of itself.  This happened at a recent
4257 	 * bakeoff against a 3rd party (broken) server which allowed
4258 	 * the rename to succeed.  If it trips it means that:
4259 	 *	a) the code in nfs4rename that detects this case is broken
4260 	 *	b) the server is broken (since it allowed the bogus rename)
4261 	 *
4262 	 * For non-DEBUG kernels, prepare for a recursive mutex_enter
4263 	 * panic below from:  mutex_enter(&newparent->fn_lock);
4264 	 */
4265 	ASSERT(fnp != newparent);
4266 
4267 	/*
4268 	 * Remove fnp from its current parent, change its name, then add it
4269 	 * to newparent. It might happen that fnp was replaced by another
4270 	 * nfs4_fname_t with the same fn_name in parent->fn_children.
4271 	 * In such case, fnp->fn_parent is NULL and we skip the removal
4272 	 * of fnp from its current parent.
4273 	 */
4274 	mutex_enter(&fnp->fn_lock);
4275 	parent = fnp->fn_parent;
4276 	if (parent != NULL) {
4277 		mutex_enter(&parent->fn_lock);
4278 		avl_remove(&parent->fn_children, fnp);
4279 		mutex_exit(&parent->fn_lock);
4280 		fn_rele(&fnp->fn_parent);
4281 	}
4282 
4283 	newlen = strlen(newname);
4284 	if (newlen != fnp->fn_len) {
4285 		ASSERT(newlen < MAXNAMELEN);
4286 		kmem_free(fnp->fn_name, fnp->fn_len + 1);
4287 		fnp->fn_name = kmem_alloc(newlen + 1, KM_SLEEP);
4288 		fnp->fn_len = newlen;
4289 	}
4290 	(void) strcpy(fnp->fn_name, newname);
4291 
4292 again:
4293 	mutex_enter(&newparent->fn_lock);
4294 	key.fn_name = fnp->fn_name;
4295 	tmpfnp = avl_find(&newparent->fn_children, &key, &where);
4296 	if (tmpfnp != NULL) {
4297 		/*
4298 		 * This could be due to a file that was unlinked while
4299 		 * open, or perhaps the rnode is in the free list.  Remove
4300 		 * it from newparent and let it go away on its own.  The
4301 		 * contorted code is to deal with lock order issues and
4302 		 * race conditions.
4303 		 */
4304 		fn_hold(tmpfnp);
4305 		mutex_exit(&newparent->fn_lock);
4306 		mutex_enter(&tmpfnp->fn_lock);
4307 		if (tmpfnp->fn_parent == newparent) {
4308 			mutex_enter(&newparent->fn_lock);
4309 			avl_remove(&newparent->fn_children, tmpfnp);
4310 			mutex_exit(&newparent->fn_lock);
4311 			fn_rele(&tmpfnp->fn_parent);
4312 		}
4313 		mutex_exit(&tmpfnp->fn_lock);
4314 		fn_rele(&tmpfnp);
4315 		goto again;
4316 	}
4317 	fnp->fn_parent = newparent;
4318 	fn_hold(newparent);
4319 	avl_insert(&newparent->fn_children, fnp, where);
4320 	mutex_exit(&newparent->fn_lock);
4321 	mutex_exit(&fnp->fn_lock);
4322 }
4323 
#ifdef DEBUG
/*
 * Sanity-check the file type recorded for a vnode.  When nfs4_vtype_debug
 * is set and both the vnode's type and the type in the rnode's cached
 * attributes are known (not VNON), the two must agree; if they do not,
 * the system panics.  In every other case the function returns 1.
 */
int
nfs4_consistent_type(vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);

	/* Checking is switched off. */
	if (!nfs4_vtype_debug)
		return (1);

	/* Nothing to compare until both types are known. */
	if (vp->v_type == VNON || rp->r_attr.va_type == VNON)
		return (1);

	if (vp->v_type != rp->r_attr.va_type) {
		cmn_err(CE_PANIC, "vnode %p type mismatch; v_type=%d, "
		    "rnode attr type=%d", (void *)vp, vp->v_type,
		    rp->r_attr.va_type);
	}

	return (1);
}
#endif /* DEBUG */
4344