xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs_vnops.c (revision 581cede61ac9c14d8d4ea452562a567189eead78)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
26  *	All rights reserved.
27  */
28 
29 #include <sys/param.h>
30 #include <sys/types.h>
31 #include <sys/systm.h>
32 #include <sys/cred.h>
33 #include <sys/time.h>
34 #include <sys/vnode.h>
35 #include <sys/vfs.h>
36 #include <sys/vfs_opreg.h>
37 #include <sys/file.h>
38 #include <sys/filio.h>
39 #include <sys/uio.h>
40 #include <sys/buf.h>
41 #include <sys/mman.h>
42 #include <sys/pathname.h>
43 #include <sys/dirent.h>
44 #include <sys/debug.h>
45 #include <sys/vmsystm.h>
46 #include <sys/fcntl.h>
47 #include <sys/flock.h>
48 #include <sys/swap.h>
49 #include <sys/errno.h>
50 #include <sys/strsubr.h>
51 #include <sys/sysmacros.h>
52 #include <sys/kmem.h>
53 #include <sys/cmn_err.h>
54 #include <sys/pathconf.h>
55 #include <sys/utsname.h>
56 #include <sys/dnlc.h>
57 #include <sys/acl.h>
58 #include <sys/atomic.h>
59 #include <sys/policy.h>
60 #include <sys/sdt.h>
61 
62 #include <rpc/types.h>
63 #include <rpc/auth.h>
64 #include <rpc/clnt.h>
65 
66 #include <nfs/nfs.h>
67 #include <nfs/nfs_clnt.h>
68 #include <nfs/rnode.h>
69 #include <nfs/nfs_acl.h>
70 #include <nfs/lm.h>
71 
72 #include <vm/hat.h>
73 #include <vm/as.h>
74 #include <vm/page.h>
75 #include <vm/pvn.h>
76 #include <vm/seg.h>
77 #include <vm/seg_map.h>
78 #include <vm/seg_kpm.h>
79 #include <vm/seg_vn.h>
80 
81 #include <fs/fs_subr.h>
82 
83 #include <sys/ddi.h>
84 
85 static int	nfs_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
86 			cred_t *);
87 static int	nfswrite(vnode_t *, caddr_t, uint_t, int, cred_t *);
88 static int	nfsread(vnode_t *, caddr_t, uint_t, int, size_t *, cred_t *);
89 static int	nfssetattr(vnode_t *, struct vattr *, int, cred_t *);
90 static int	nfslookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
91 static int	nfslookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
92 static int	nfsrename(vnode_t *, char *, vnode_t *, char *, cred_t *,
93 			caller_context_t *);
94 static int	nfsreaddir(vnode_t *, rddir_cache *, cred_t *);
95 static int	nfs_bio(struct buf *, cred_t *);
96 static int	nfs_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
97 			page_t *[], size_t, struct seg *, caddr_t,
98 			enum seg_rw, cred_t *);
99 static void	nfs_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
100 			cred_t *);
101 static int	nfs_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
102 			int, cred_t *);
103 static int	nfs_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
104 			int, cred_t *);
105 static void	nfs_delmap_callback(struct as *, void *, uint_t);
106 
107 /*
108  * Error flags used to pass information about certain special errors
109  * which need to be handled specially.
110  */
111 #define	NFS_EOF			-98
112 
113 /*
114  * These are the vnode ops routines which implement the vnode interface to
115  * the networked file system.  These routines just take their parameters,
116  * make them look networkish by putting the right info into interface structs,
117  * and then calling the appropriate remote routine(s) to do the work.
118  *
119  * Note on directory name lookup cacheing:  If we detect a stale fhandle,
120  * we purge the directory cache relative to that vnode.  This way, the
121  * user won't get burned by the cache repeatedly.  See <nfs/rnode.h> for
122  * more details on rnode locking.
123  */
124 
125 static int	nfs_open(vnode_t **, int, cred_t *, caller_context_t *);
126 static int	nfs_close(vnode_t *, int, int, offset_t, cred_t *,
127 			caller_context_t *);
128 static int	nfs_read(vnode_t *, struct uio *, int, cred_t *,
129 			caller_context_t *);
130 static int	nfs_write(vnode_t *, struct uio *, int, cred_t *,
131 			caller_context_t *);
132 static int	nfs_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
133 			caller_context_t *);
134 static int	nfs_getattr(vnode_t *, struct vattr *, int, cred_t *,
135 			caller_context_t *);
136 static int	nfs_setattr(vnode_t *, struct vattr *, int, cred_t *,
137 			caller_context_t *);
138 static int	nfs_access(vnode_t *, int, int, cred_t *, caller_context_t *);
139 static int	nfs_accessx(void *, int, cred_t *);
140 static int	nfs_readlink(vnode_t *, struct uio *, cred_t *,
141 			caller_context_t *);
142 static int	nfs_fsync(vnode_t *, int, cred_t *, caller_context_t *);
143 static void	nfs_inactive(vnode_t *, cred_t *, caller_context_t *);
144 static int	nfs_lookup(vnode_t *, char *, vnode_t **, struct pathname *,
145 			int, vnode_t *, cred_t *, caller_context_t *,
146 			int *, pathname_t *);
147 static int	nfs_create(vnode_t *, char *, struct vattr *, enum vcexcl,
148 			int, vnode_t **, cred_t *, int, caller_context_t *,
149 			vsecattr_t *);
150 static int	nfs_remove(vnode_t *, char *, cred_t *, caller_context_t *,
151 			int);
152 static int	nfs_link(vnode_t *, vnode_t *, char *, cred_t *,
153 			caller_context_t *, int);
154 static int	nfs_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
155 			caller_context_t *, int);
156 static int	nfs_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
157 			cred_t *, caller_context_t *, int, vsecattr_t *);
158 static int	nfs_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
159 			caller_context_t *, int);
160 static int	nfs_symlink(vnode_t *, char *, struct vattr *, char *,
161 			cred_t *, caller_context_t *, int);
162 static int	nfs_readdir(vnode_t *, struct uio *, cred_t *, int *,
163 			caller_context_t *, int);
164 static int	nfs_fid(vnode_t *, fid_t *, caller_context_t *);
165 static int	nfs_rwlock(vnode_t *, int, caller_context_t *);
166 static void	nfs_rwunlock(vnode_t *, int, caller_context_t *);
167 static int	nfs_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
168 static int	nfs_getpage(vnode_t *, offset_t, size_t, uint_t *,
169 			page_t *[], size_t, struct seg *, caddr_t,
170 			enum seg_rw, cred_t *, caller_context_t *);
171 static int	nfs_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
172 			caller_context_t *);
173 static int	nfs_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
174 			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
175 static int	nfs_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
176 			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
177 static int	nfs_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
178 			struct flk_callback *, cred_t *, caller_context_t *);
179 static int	nfs_space(vnode_t *, int, struct flock64 *, int, offset_t,
180 			cred_t *, caller_context_t *);
181 static int	nfs_realvp(vnode_t *, vnode_t **, caller_context_t *);
182 static int	nfs_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
183 			uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
184 static int	nfs_pathconf(vnode_t *, int, ulong_t *, cred_t *,
185 			caller_context_t *);
186 static int	nfs_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
187 			cred_t *, caller_context_t *);
188 static int	nfs_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
189 			caller_context_t *);
190 static int	nfs_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
191 			caller_context_t *);
192 static int	nfs_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
193 			caller_context_t *);
194 
/* Published ops vector for NFS v2 vnodes; see nfs_getvnodeops() below. */
struct vnodeops *nfs_vnodeops;

/*
 * Template pairing each vnode operation name with its NFS v2 client
 * implementation.  The constructed ops vector is presumably built from
 * this template at module initialization and stored in nfs_vnodeops
 * above — initialization happens elsewhere in the module (verify
 * against the nfs module setup code).
 */
const fs_operation_def_t nfs_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = nfs_open },
	VOPNAME_CLOSE,		{ .vop_close = nfs_close },
	VOPNAME_READ,		{ .vop_read = nfs_read },
	VOPNAME_WRITE,		{ .vop_write = nfs_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = nfs_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs_remove },
	VOPNAME_LINK,		{ .vop_link = nfs_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = nfs_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = nfs_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = nfs_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = nfs_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = nfs_frlock },
	VOPNAME_SPACE,		{ .vop_space = nfs_space },
	VOPNAME_REALVP,		{ .vop_realvp = nfs_realvp },
	VOPNAME_GETPAGE,	{ .vop_getpage = nfs_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = nfs_putpage },
	VOPNAME_MAP,		{ .vop_map = nfs_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = nfs_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = nfs_delmap },
	VOPNAME_DUMP,		{ .vop_dump = nfs_dump },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs_pathconf },
	VOPNAME_PAGEIO,		{ .vop_pageio = nfs_pageio },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = nfs_setsecattr },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs_getsecattr },
	VOPNAME_SHRLOCK,	{ .vop_shrlock = nfs_shrlock },
	VOPNAME_VNEVENT, 	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};
239 
240 /*
241  * XXX:  This is referenced in modstubs.s
242  */
243 struct vnodeops *
244 nfs_getvnodeops(void)
245 {
246 	return (nfs_vnodeops);
247 }
248 
249 /* ARGSUSED */
250 static int
251 nfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
252 {
253 	int error;
254 	struct vattr va;
255 	rnode_t *rp;
256 	vnode_t *vp;
257 
258 	vp = *vpp;
259 	rp = VTOR(vp);
260 	if (nfs_zone() != VTOMI(vp)->mi_zone)
261 		return (EIO);
262 	mutex_enter(&rp->r_statelock);
263 	if (rp->r_cred == NULL) {
264 		crhold(cr);
265 		rp->r_cred = cr;
266 	}
267 	mutex_exit(&rp->r_statelock);
268 
269 	/*
270 	 * If there is no cached data or if close-to-open
271 	 * consistency checking is turned off, we can avoid
272 	 * the over the wire getattr.  Otherwise, if the
273 	 * file system is mounted readonly, then just verify
274 	 * the caches are up to date using the normal mechanism.
275 	 * Else, if the file is not mmap'd, then just mark
276 	 * the attributes as timed out.  They will be refreshed
277 	 * and the caches validated prior to being used.
278 	 * Else, the file system is mounted writeable so
279 	 * force an over the wire GETATTR in order to ensure
280 	 * that all cached data is valid.
281 	 */
282 	if (vp->v_count > 1 ||
283 	    ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
284 	    !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
285 		if (vn_is_readonly(vp))
286 			error = nfs_validate_caches(vp, cr);
287 		else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
288 			PURGE_ATTRCACHE(vp);
289 			error = 0;
290 		} else {
291 			va.va_mask = AT_ALL;
292 			error = nfs_getattr_otw(vp, &va, cr);
293 		}
294 	} else
295 		error = 0;
296 
297 	return (error);
298 }
299 
/*
 * VOP_CLOSE for NFS: release record locks and share reservations held
 * by this process, and on the last close of a written file flush
 * cached pages (asynchronously for "nocto" mounts, synchronously
 * otherwise) and report any deferred write error stashed in r_error.
 */
/* ARGSUSED */
static int
nfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
	caller_context_t *ct)
{
	rnode_t *rp;
	int error;
	struct vattr va;	/* throwaway target for the final GETATTR */

	/*
	 * zone_enter(2) prevents processes from changing zones with NFS files
	 * open; if we happen to get here from the wrong zone we can't do
	 * anything over the wire.
	 */
	if (VTOMI(vp)->mi_zone != nfs_zone()) {
		/*
		 * We could attempt to clean up locks, except we're sure
		 * that the current process didn't acquire any locks on
		 * the file: any attempt to lock a file belonging to another
		 * zone will fail, and one can't lock an NFS file and then
		 * change zones, as that fails too.
		 *
		 * Returning an error here is the sane thing to do.  A
		 * subsequent call to VN_RELE() which translates to a
		 * nfs_inactive() will clean up state: if the zone of the
		 * vnode's origin is still alive and kicking, an async worker
		 * thread will handle the request (from the correct zone), and
		 * everything (minus the final nfs_getattr_otw() call) should
		 * be OK. If the zone is going away nfs_async_inactive() will
		 * throw away cached pages inline.
		 */
		return (EIO);
	}

	/*
	 * If we are using local locking for this filesystem, then
	 * release all of the SYSV style record locks.  Otherwise,
	 * we are doing network locking and we need to release all
	 * of the network locks.  All of the locks held by this
	 * process on this file are released no matter what the
	 * incoming reference count is.
	 */
	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
		cleanshares(vp, ttoproc(curthread)->p_pid);
	} else
		nfs_lockrelease(vp, flag, offset, cr);

	/* Only the last close of this file descriptor does the flush work. */
	if (count > 1)
		return (0);

	/*
	 * If the file has been `unlinked', then purge the
	 * DNLC so that this vnode will get recycled quicker
	 * and the .nfs* file on the server will get removed.
	 */
	rp = VTOR(vp);
	if (rp->r_unldvp != NULL)
		dnlc_purge_vp(vp);

	/*
	 * If the file was open for write and there are pages,
	 * then if the file system was mounted using the "no-close-
	 *	to-open" semantics, then start an asynchronous flush
	 *	of the all of the pages in the file.
	 * else the file system was not mounted using the "no-close-
	 *	to-open" semantics, then do a synchronous flush and
	 *	commit of all of the dirty and uncommitted pages.
	 *
	 * The asynchronous flush of the pages in the "nocto" path
	 * mostly just associates a cred pointer with the rnode so
	 * writes which happen later will have a better chance of
	 * working.  It also starts the data being written to the
	 * server, but without unnecessarily delaying the application.
	 */
	if ((flag & FWRITE) && vn_has_cached_data(vp)) {
		if ((VTOMI(vp)->mi_flags & MI_NOCTO)) {
			error = nfs_putpage(vp, (offset_t)0, 0, B_ASYNC,
			    cr, ct);
			/* EAGAIN from an async flush is not an error here */
			if (error == EAGAIN)
				error = 0;
		} else
			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
		if (!error) {
			/* Pick up and clear any deferred async write error. */
			mutex_enter(&rp->r_statelock);
			error = rp->r_error;
			rp->r_error = 0;
			mutex_exit(&rp->r_statelock);
		}
	} else {
		/* No flush needed; still report any deferred error. */
		mutex_enter(&rp->r_statelock);
		error = rp->r_error;
		rp->r_error = 0;
		mutex_exit(&rp->r_statelock);
	}

	/*
	 * If RWRITEATTR is set, then issue an over the wire GETATTR to
	 * refresh the attribute cache with a set of attributes which
	 * weren't returned from a WRITE.  This will enable the close-
	 * to-open processing to work.
	 */
	if (rp->r_flags & RWRITEATTR)
		(void) nfs_getattr_otw(vp, &va, cr);

	return (error);
}
407 
/*
 * VOP_READ for NFS.
 *
 * When caching is disabled (e.g., due to file locking), or when
 * client-side direct I/O is in effect and the file has no mappings
 * or cached pages, data is read straight over the wire via nfsread()
 * through a temporary kernel buffer.  Otherwise the read is satisfied
 * through the VM page cache, one MAXBSIZE window at a time, using
 * either vpm (kpm-based copy) or a segmap mapping.
 *
 * NFS Version 2 offsets are 32 bits wide, hence the MAXOFF32_T
 * range checks below.
 */
/* ARGSUSED */
static int
nfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
	caller_context_t *ct)
{
	rnode_t *rp;
	u_offset_t off;
	offset_t diff;
	int on;
	size_t n;
	caddr_t base;
	uint_t flags;
	int error;
	mntinfo_t *mi;

	rp = VTOR(vp);
	mi = VTOMI(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	/* r_rwlock must already be held as a reader; see nfs_rwlock(). */
	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));

	if (vp->v_type != VREG)
		return (EISDIR);

	if (uiop->uio_resid == 0)
		return (0);

	if (uiop->uio_loffset > MAXOFF32_T)
		return (EFBIG);

	if (uiop->uio_loffset < 0 ||
	    uiop->uio_loffset + uiop->uio_resid > MAXOFF32_T)
		return (EINVAL);

	/*
	 * Bypass VM if caching has been disabled (e.g., locking) or if
	 * using client-side direct I/O and the file is not mmap'd and
	 * there are no cached pages.
	 */
	if ((vp->v_flag & VNOCACHE) ||
	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
	    rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
	    !vn_has_cached_data(vp))) {
		size_t bufsize;
		size_t resid = 0;

		/*
		 * Let's try to do read in as large a chunk as we can
		 * (Filesystem (NFS client) bsize if possible/needed).
		 * For V3, this is 32K and for V2, this is 8K.
		 */
		bufsize = MIN(uiop->uio_resid, VTOMI(vp)->mi_curread);
		base = kmem_alloc(bufsize, KM_SLEEP);
		do {
			n = MIN(uiop->uio_resid, bufsize);
			error = nfsread(vp, base, uiop->uio_offset, n,
			    &resid, cr);
			if (!error) {
				/* resid is what nfsread() did NOT return */
				n -= resid;
				error = uiomove(base, n, UIO_READ, uiop);
			}
		} while (!error && uiop->uio_resid > 0 && n > 0);
		kmem_free(base, bufsize);
		return (error);
	}

	error = 0;

	do {
		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
		n = MIN(MAXBSIZE - on, uiop->uio_resid);

		error = nfs_validate_caches(vp, cr);
		if (error)
			break;

		/*
		 * Wait for any cache invalidation in progress to finish
		 * before consulting r_size; bail out if interrupted by
		 * a signal.
		 */
		mutex_enter(&rp->r_statelock);
		while (rp->r_flags & RINCACHEPURGE) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				mutex_exit(&rp->r_statelock);
				return (EINTR);
			}
		}
		diff = rp->r_size - uiop->uio_loffset;
		mutex_exit(&rp->r_statelock);
		if (diff <= 0)
			break;		/* at or beyond end of file */
		if (diff < n)
			n = (size_t)diff;

		if (vpm_enable) {
			/*
			 * Copy data.
			 */
			error = vpm_data_copy(vp, off + on, n, uiop,
			    1, NULL, 0, S_READ);
		} else {
			base = segmap_getmapflt(segkmap, vp, off + on, n,
			    1, S_READ);
			error = uiomove(base + on, n, UIO_READ, uiop);
		}

		if (!error) {
			/*
			 * If read a whole block or read to eof,
			 * won't need this buffer again soon.
			 */
			mutex_enter(&rp->r_statelock);
			if (n + on == MAXBSIZE ||
			    uiop->uio_loffset == rp->r_size)
				flags = SM_DONTNEED;
			else
				flags = 0;
			mutex_exit(&rp->r_statelock);
			if (vpm_enable) {
				error = vpm_sync_pages(vp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			/* On error, release the window without hints. */
			if (vpm_enable) {
				(void) vpm_sync_pages(vp, off, n, 0);
			} else {
				(void) segmap_release(segkmap, base, 0);
			}
		}
	} while (!error && uiop->uio_resid > 0);

	return (error);
}
541 
/*
 * VOP_WRITE for NFS.
 *
 * Appending writes are serialized by upgrading r_rwlock from reader
 * to writer and then fetching the current file size over the wire.
 * The request is clipped to both the 32-bit NFS v2 offset limit and
 * the process file-size rlimit; `remainder' holds the clipped-off
 * byte count so uio_resid can be restored for the caller.  As with
 * nfs_read(), the VM page cache is bypassed (direct nfswrite()) when
 * caching is disabled or direct I/O applies to an unmapped file.
 * On any error, uio_resid/uio_loffset are rewound to the start of
 * the failing chunk (see the `bottom' label).
 */
/* ARGSUSED */
static int
nfs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
	caller_context_t *ct)
{
	rnode_t *rp;
	u_offset_t off;
	caddr_t base;
	uint_t flags;
	int remainder;
	size_t n;
	int on;
	int error;
	int resid;
	offset_t offset;
	rlim_t limit;
	mntinfo_t *mi;

	rp = VTOR(vp);

	mi = VTOMI(vp);
	if (nfs_zone() != mi->mi_zone)
		return (EIO);
	if (vp->v_type != VREG)
		return (EISDIR);

	if (uiop->uio_resid == 0)
		return (0);

	if (ioflag & FAPPEND) {
		struct vattr va;

		/*
		 * Must serialize if appending.
		 */
		if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
			nfs_rw_exit(&rp->r_rwlock);
			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
			    INTR(vp)))
				return (EINTR);
		}

		/* Position the write at the server's idea of EOF. */
		va.va_mask = AT_SIZE;
		error = nfsgetattr(vp, &va, cr);
		if (error)
			return (error);
		uiop->uio_loffset = va.va_size;
	}

	if (uiop->uio_loffset > MAXOFF32_T)
		return (EFBIG);

	offset = uiop->uio_loffset + uiop->uio_resid;

	if (uiop->uio_loffset < 0 || offset > MAXOFF32_T)
		return (EINVAL);

	if (uiop->uio_llimit > (rlim64_t)MAXOFF32_T) {
		limit = MAXOFF32_T;
	} else {
		limit = (rlim_t)uiop->uio_llimit;
	}

	/*
	 * Check to make sure that the process will not exceed
	 * its limit on file size.  It is okay to write up to
	 * the limit, but not beyond.  Thus, the write which
	 * reaches the limit will be short and the next write
	 * will return an error.
	 */
	remainder = 0;
	if (offset > limit) {
		remainder = offset - limit;
		uiop->uio_resid = limit - uiop->uio_offset;
		if (uiop->uio_resid <= 0) {
			proc_t *p = ttoproc(curthread);

			/* Entirely over the limit: deliver SIGXFSZ/EFBIG. */
			uiop->uio_resid += remainder;
			mutex_enter(&p->p_lock);
			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
			    p->p_rctls, p, RCA_UNSAFE_SIGINFO);
			mutex_exit(&p->p_lock);
			return (EFBIG);
		}
	}

	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
		return (EINTR);

	/*
	 * Bypass VM if caching has been disabled (e.g., locking) or if
	 * using client-side direct I/O and the file is not mmap'd and
	 * there are no cached pages.
	 */
	if ((vp->v_flag & VNOCACHE) ||
	    (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
	    rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
	    !vn_has_cached_data(vp))) {
		size_t bufsize;
		int count;
		uint_t org_offset;

nfs_fwrite:
		if (rp->r_flags & RSTALE) {
			resid = uiop->uio_resid;
			offset = uiop->uio_loffset;
			error = rp->r_error;
			/*
			 * A close may have cleared r_error, if so,
			 * propagate ESTALE error return properly
			 */
			if (error == 0)
				error = ESTALE;
			goto bottom;
		}
		bufsize = MIN(uiop->uio_resid, mi->mi_curwrite);
		base = kmem_alloc(bufsize, KM_SLEEP);
		do {
			/* Snapshot position so an error can rewind. */
			resid = uiop->uio_resid;
			offset = uiop->uio_loffset;
			count = MIN(uiop->uio_resid, bufsize);
			org_offset = uiop->uio_offset;
			error = uiomove(base, count, UIO_WRITE, uiop);
			if (!error) {
				error = nfswrite(vp, base, org_offset,
				    count, cr);
			}
		} while (!error && uiop->uio_resid > 0);
		kmem_free(base, bufsize);
		goto bottom;
	}

	do {
		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
		n = MIN(MAXBSIZE - on, uiop->uio_resid);

		/* Snapshot position so an error can rewind. */
		resid = uiop->uio_resid;
		offset = uiop->uio_loffset;

		if (rp->r_flags & RSTALE) {
			error = rp->r_error;
			/*
			 * A close may have cleared r_error, if so,
			 * propagate ESTALE error return properly
			 */
			if (error == 0)
				error = ESTALE;
			break;
		}

		/*
		 * Don't create dirty pages faster than they
		 * can be cleaned so that the system doesn't
		 * get imbalanced.  If the async queue is
		 * maxed out, then wait for it to drain before
		 * creating more dirty pages.  Also, wait for
		 * any threads doing pagewalks in the vop_getattr
		 * entry points so that they don't block for
		 * long periods.
		 */
		mutex_enter(&rp->r_statelock);
		while ((mi->mi_max_threads != 0 &&
		    rp->r_awcount > 2 * mi->mi_max_threads) ||
		    rp->r_gcount > 0)
			cv_wait(&rp->r_cv, &rp->r_statelock);
		mutex_exit(&rp->r_statelock);

		/*
		 * Touch the page and fault it in if it is not in core
		 * before segmap_getmapflt or vpm_data_copy can lock it.
		 * This is to avoid the deadlock if the buffer is mapped
		 * to the same file through mmap which we want to write.
		 */
		uio_prefaultpages((long)n, uiop);

		if (vpm_enable) {
			/*
			 * It will use kpm mappings, so no need to
			 * pass an address.
			 */
			error = writerp(rp, NULL, n, uiop, 0);
		} else  {
			if (segmap_kpm) {
				int pon = uiop->uio_loffset & PAGEOFFSET;
				size_t pn = MIN(PAGESIZE - pon,
				    uiop->uio_resid);
				int pagecreate;

				/*
				 * A page can be created without first
				 * faulting it in only when the write
				 * covers it entirely (page-aligned and
				 * either a full page or extending past
				 * the current EOF).
				 */
				mutex_enter(&rp->r_statelock);
				pagecreate = (pon == 0) && (pn == PAGESIZE ||
				    uiop->uio_loffset + pn >= rp->r_size);
				mutex_exit(&rp->r_statelock);

				base = segmap_getmapflt(segkmap, vp, off + on,
				    pn, !pagecreate, S_WRITE);

				error = writerp(rp, base + pon, n, uiop,
				    pagecreate);

			} else {
				base = segmap_getmapflt(segkmap, vp, off + on,
				    n, 0, S_READ);
				error = writerp(rp, base + on, n, uiop, 0);
			}
		}

		if (!error) {
			if (mi->mi_flags & MI_NOAC)
				flags = SM_WRITE;
			else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
				/*
				 * Have written a whole block.
				 * Start an asynchronous write
				 * and mark the buffer to
				 * indicate that it won't be
				 * needed again soon.
				 */
				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
			} else
				flags = 0;
			/* Synchronous semantics override async hints. */
			if ((ioflag & (FSYNC|FDSYNC)) ||
			    (rp->r_flags & ROUTOFSPACE)) {
				flags &= ~SM_ASYNC;
				flags |= SM_WRITE;
			}
			if (vpm_enable) {
				error = vpm_sync_pages(vp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			if (vpm_enable) {
				(void) vpm_sync_pages(vp, off, n, 0);
			} else {
				(void) segmap_release(segkmap, base, 0);
			}
			/*
			 * In the event that we got an access error while
			 * faulting in a page for a write-only file just
			 * force a write.
			 */
			if (error == EACCES)
				goto nfs_fwrite;
		}
	} while (!error && uiop->uio_resid > 0);

bottom:
	if (error) {
		/* Rewind to the start of the failing chunk. */
		uiop->uio_resid = resid + remainder;
		uiop->uio_loffset = offset;
	} else
		uiop->uio_resid += remainder;

	nfs_rw_exit(&rp->r_lkserlock);

	return (error);
}
800 
801 /*
802  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
803  */
804 static int
805 nfs_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
806 	int flags, cred_t *cr)
807 {
808 	struct buf *bp;
809 	int error;
810 
811 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
812 	bp = pageio_setup(pp, len, vp, flags);
813 	ASSERT(bp != NULL);
814 
815 	/*
816 	 * pageio_setup should have set b_addr to 0.  This
817 	 * is correct since we want to do I/O on a page
818 	 * boundary.  bp_mapin will use this addr to calculate
819 	 * an offset, and then set b_addr to the kernel virtual
820 	 * address it allocated for us.
821 	 */
822 	ASSERT(bp->b_un.b_addr == 0);
823 
824 	bp->b_edev = 0;
825 	bp->b_dev = 0;
826 	bp->b_lblkno = lbtodb(off);
827 	bp->b_file = vp;
828 	bp->b_offset = (offset_t)off;
829 	bp_mapin(bp);
830 
831 	error = nfs_bio(bp, cr);
832 
833 	bp_mapout(bp);
834 	pageio_done(bp);
835 
836 	return (error);
837 }
838 
/*
 * Write to file.  Writes to remote server in largest size
 * chunks that the server can handle.  Write is synchronous.
 *
 * Loops issuing RFS_WRITE calls of at most mi_curwrite bytes until
 * `count' is exhausted or an error occurs.  On each successful chunk
 * the attribute cache is purged and RWRITEATTR is set so that a later
 * over-the-wire GETATTR (see nfs_close()) refreshes the attributes.
 */
static int
nfswrite(vnode_t *vp, caddr_t base, uint_t offset, int count, cred_t *cr)
{
	rnode_t *rp;
	mntinfo_t *mi;
	struct nfswriteargs wa;
	struct nfsattrstat ns;
	int error;
	int tsize;
	int douprintf;

	douprintf = 1;

	rp = VTOR(vp);
	mi = VTOMI(vp);

	ASSERT(nfs_zone() == mi->mi_zone);

	wa.wa_args = &wa.wa_args_buf;
	wa.wa_fhandle = *VTOFH(vp);

	do {
		/* mi_curwrite caps the per-RPC transfer size */
		tsize = MIN(mi->mi_curwrite, count);
		wa.wa_data = base;
		wa.wa_begoff = offset;
		wa.wa_totcount = tsize;
		wa.wa_count = tsize;
		wa.wa_offset = offset;

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}
		wa.wa_mblk = NULL;
		/* Retry the RPC for as long as it asks to be retried. */
		do {
			error = rfs2call(mi, RFS_WRITE,
			    xdr_writeargs, (caddr_t)&wa,
			    xdr_attrstat, (caddr_t)&ns, cr,
			    &douprintf, &ns.ns_status, 0, NULL);
		} while (error == ENFS_TRYAGAIN);
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		if (!error) {
			error = geterrno(ns.ns_status);
			/*
			 * Can't check for stale fhandle and purge caches
			 * here because pages are held by nfs_getpage.
			 * Just mark the attribute cache as timed out
			 * and set RWRITEATTR to indicate that the file
			 * was modified with a WRITE operation.
			 */
			if (!error) {
				count -= tsize;
				base += tsize;
				offset += tsize;
				if (mi->mi_io_kstats) {
					mutex_enter(&mi->mi_lock);
					KSTAT_IO_PTR(mi->mi_io_kstats)->
					    writes++;
					KSTAT_IO_PTR(mi->mi_io_kstats)->
					    nwritten += tsize;
					mutex_exit(&mi->mi_lock);
				}
				lwp_stat_update(LWP_STAT_OUBLK, 1);
				mutex_enter(&rp->r_statelock);
				PURGE_ATTRCACHE_LOCKED(rp);
				rp->r_flags |= RWRITEATTR;
				mutex_exit(&rp->r_statelock);
			}
		}
	} while (!error && count);

	return (error);
}
922 
/*
 * Read from a file.  Reads data in largest chunks our interface can handle.
 *
 * Loops issuing RFS_READ calls of at most mi_curread bytes until
 * `count' is exhausted, a short read is returned, or an error occurs.
 * On return, *residp holds the number of requested bytes NOT read.
 * The attributes piggybacked on the reply are folded into the cache
 * carefully, without triggering a flush (see comments below).
 */
static int
nfsread(vnode_t *vp, caddr_t base, uint_t offset,
    int count, size_t *residp, cred_t *cr)
{
	mntinfo_t *mi;
	struct nfsreadargs ra;
	struct nfsrdresult rr;
	int tsize;
	int error;
	int douprintf;
	failinfo_t fi;
	rnode_t *rp;
	struct vattr va;
	hrtime_t t;

	rp = VTOR(vp);
	mi = VTOMI(vp);

	ASSERT(nfs_zone() == mi->mi_zone);

	douprintf = 1;

	ra.ra_fhandle = *VTOFH(vp);

	/* failinfo lets the RPC layer recover from a stale filehandle */
	fi.vp = vp;
	fi.fhp = (caddr_t)&ra.ra_fhandle;
	fi.copyproc = nfscopyfh;
	fi.lookupproc = nfslookup;
	fi.xattrdirproc = acl_getxattrdir2;

	do {
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		/* Retry the RPC for as long as it asks to be retried. */
		do {
			tsize = MIN(mi->mi_curread, count);
			rr.rr_data = base;
			ra.ra_offset = offset;
			ra.ra_totcount = tsize;
			ra.ra_count = tsize;
			ra.ra_data = base;
			t = gethrtime();	/* timestamp for r_mtime check */
			error = rfs2call(mi, RFS_READ,
			    xdr_readargs, (caddr_t)&ra,
			    xdr_rdresult, (caddr_t)&rr, cr,
			    &douprintf, &rr.rr_status, 0, &fi);
		} while (error == ENFS_TRYAGAIN);

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		if (!error) {
			error = geterrno(rr.rr_status);
			if (!error) {
				count -= rr.rr_count;
				base += rr.rr_count;
				offset += rr.rr_count;
				if (mi->mi_io_kstats) {
					mutex_enter(&mi->mi_lock);
					KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
					KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
					    rr.rr_count;
					mutex_exit(&mi->mi_lock);
				}
				lwp_stat_update(LWP_STAT_INBLK, 1);
			}
		}
	} while (!error && count && rr.rr_count == tsize);

	*residp = count;

	if (!error) {
		/*
		 * Since no error occurred, we have the current
		 * attributes and we need to do a cache check and then
		 * potentially update the cached attributes.  We can't
		 * use the normal attribute check and cache mechanisms
		 * because they might cause a cache flush which would
		 * deadlock.  Instead, we just check the cache to see
		 * if the attributes have changed.  If it is, then we
		 * just mark the attributes as out of date.  The next
		 * time that the attributes are checked, they will be
		 * out of date, new attributes will be fetched, and
		 * the page cache will be flushed.  If the attributes
		 * weren't changed, then we just update the cached
		 * attributes with these attributes.
		 */
		/*
		 * If NFS_ACL is supported on the server, then the
		 * attributes returned by server may have minimal
		 * permissions sometimes denying access to users having
		 * proper access.  To get the proper attributes, mark
		 * the attributes as expired so that they will be
		 * regotten via the NFS_ACL GETATTR2 procedure.
		 */
		error = nattr_to_vattr(vp, &rr.rr_attr, &va);
		mutex_enter(&rp->r_statelock);
		if (error || !CACHE_VALID(rp, va.va_mtime, va.va_size) ||
		    (mi->mi_flags & MI_ACL)) {
			mutex_exit(&rp->r_statelock);
			PURGE_ATTRCACHE(vp);
		} else {
			/*
			 * Only update if nothing newer arrived while the
			 * RPC was in flight (r_mtime predates our call).
			 */
			if (rp->r_mtime <= t) {
				nfs_attrcache_va(vp, &va);
			}
			mutex_exit(&rp->r_statelock);
		}
	}

	return (error);
}
1043 
1044 /* ARGSUSED */
1045 static int
1046 nfs_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
1047 	caller_context_t *ct)
1048 {
1049 
1050 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1051 		return (EIO);
1052 	switch (cmd) {
1053 		case _FIODIRECTIO:
1054 			return (nfs_directio(vp, (int)arg, cr));
1055 		default:
1056 			return (ENOTTY);
1057 	}
1058 }
1059 
1060 /* ARGSUSED */
1061 static int
1062 nfs_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1063 	caller_context_t *ct)
1064 {
1065 	int error;
1066 	rnode_t *rp;
1067 
1068 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1069 		return (EIO);
1070 	/*
1071 	 * If it has been specified that the return value will
1072 	 * just be used as a hint, and we are only being asked
1073 	 * for size, fsid or rdevid, then return the client's
1074 	 * notion of these values without checking to make sure
1075 	 * that the attribute cache is up to date.
1076 	 * The whole point is to avoid an over the wire GETATTR
1077 	 * call.
1078 	 */
1079 	rp = VTOR(vp);
1080 	if (flags & ATTR_HINT) {
1081 		if (vap->va_mask ==
1082 		    (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
1083 			mutex_enter(&rp->r_statelock);
1084 			if (vap->va_mask | AT_SIZE)
1085 				vap->va_size = rp->r_size;
1086 			if (vap->va_mask | AT_FSID)
1087 				vap->va_fsid = rp->r_attr.va_fsid;
1088 			if (vap->va_mask | AT_RDEV)
1089 				vap->va_rdev = rp->r_attr.va_rdev;
1090 			mutex_exit(&rp->r_statelock);
1091 			return (0);
1092 		}
1093 	}
1094 
1095 	/*
1096 	 * Only need to flush pages if asking for the mtime
1097 	 * and if there any dirty pages or any outstanding
1098 	 * asynchronous (write) requests for this file.
1099 	 */
1100 	if (vap->va_mask & AT_MTIME) {
1101 		if (vn_has_cached_data(vp) &&
1102 		    ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
1103 			mutex_enter(&rp->r_statelock);
1104 			rp->r_gcount++;
1105 			mutex_exit(&rp->r_statelock);
1106 			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
1107 			mutex_enter(&rp->r_statelock);
1108 			if (error && (error == ENOSPC || error == EDQUOT)) {
1109 				if (!rp->r_error)
1110 					rp->r_error = error;
1111 			}
1112 			if (--rp->r_gcount == 0)
1113 				cv_broadcast(&rp->r_cv);
1114 			mutex_exit(&rp->r_statelock);
1115 		}
1116 	}
1117 
1118 	return (nfsgetattr(vp, vap, cr));
1119 }
1120 
/*ARGSUSED4*/
/*
 * VOP_SETATTR entry point: validate the request, apply local security
 * policy, then hand off to nfssetattr() to do the over-the-wire work.
 */
static int
nfs_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
		caller_context_t *ct)
{
	int error;
	uint_t mask;
	struct vattr va;

	mask = vap->va_mask;

	/* Reject attributes that can never be set (AT_NOSET). */
	if (mask & AT_NOSET)
		return (EINVAL);

	/* NFS v2 offsets are 32 bits; refuse to grow a file past that. */
	if ((mask & AT_SIZE) &&
	    vap->va_type == VREG &&
	    vap->va_size > MAXOFF32_T)
		return (EFBIG);

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	/* Fetch current owner and mode for the security-policy check. */
	va.va_mask = AT_UID | AT_MODE;

	error = nfsgetattr(vp, &va, cr);
	if (error)
		return (error);

	error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs_accessx,
	    vp);

	if (error)
		return (error);

	return (nfssetattr(vp, vap, flags, cr));
}
1157 
/*
 * Common over-the-wire SETATTR worker, shared by nfs_setattr() and
 * internal callers (e.g. nfs_create() truncation).  Flushes dirty
 * pages first, issues RFS_SETATTR, then reconciles the local caches
 * (access, ACL, attribute and page caches) with the result.  May call
 * itself recursively to restore the mode if the server cleared
 * setuid/setgid bits as a side effect of an ownership change.
 */
static int
nfssetattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
{
	int error;
	uint_t mask;
	struct nfssaargs args;
	struct nfsattrstat ns;
	int douprintf;
	rnode_t *rp;
	struct vattr va;
	mode_t omode;
	mntinfo_t *mi;
	vsecattr_t *vsp;
	hrtime_t t;

	mask = vap->va_mask;

	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);

	rp = VTOR(vp);

	/*
	 * Only need to flush pages if there are any pages and
	 * if the file is marked as dirty in some fashion.  The
	 * file must be flushed so that we can accurately
	 * determine the size of the file and the cached data
	 * after the SETATTR returns.  A file is considered to
	 * be dirty if it is either marked with RDIRTY, has
	 * outstanding i/o's active, or is mmap'd.  In this
	 * last case, we can't tell whether there are dirty
	 * pages, so we flush just to be sure.
	 */
	if (vn_has_cached_data(vp) &&
	    ((rp->r_flags & RDIRTY) ||
	    rp->r_count > 0 ||
	    rp->r_mapcnt > 0)) {
		ASSERT(vp->v_type != VCHR);
		error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
		if (error && (error == ENOSPC || error == EDQUOT)) {
			/* Remember the first hard write error. */
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = error;
			mutex_exit(&rp->r_statelock);
		}
	}

	/*
	 * If the system call was utime(2) or utimes(2) and the
	 * application did not specify the times, then set the
	 * mtime nanosecond field to 1 billion.  This will get
	 * translated from 1 billion nanoseconds to 1 million
	 * microseconds in the over the wire request.  The
	 * server will use 1 million in the microsecond field
	 * to tell whether both the mtime and atime should be
	 * set to the server's current time.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if ((mask & AT_MTIME) && !(flags & ATTR_UTIME)) {
		vap->va_mtime.tv_nsec = 1000000000;
		if (NFS_TIME_T_OK(vap->va_mtime.tv_sec) &&
		    NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
			error = vattr_to_sattr(vap, &args.saa_sa);
		} else {
			/*
			 * Use server times. vap time values will not be used.
			 * To ensure no time overflow, make sure vap has
			 * valid values, but retain the original values.
			 */
			timestruc_t	mtime = vap->va_mtime;
			timestruc_t	atime = vap->va_atime;
			time_t		now;

			now = gethrestime_sec();
			if (NFS_TIME_T_OK(now)) {
				/* Just in case server does not know of this */
				vap->va_mtime.tv_sec = now;
				vap->va_atime.tv_sec = now;
			} else {
				vap->va_mtime.tv_sec = 0;
				vap->va_atime.tv_sec = 0;
			}
			error = vattr_to_sattr(vap, &args.saa_sa);
			/* set vap times back on */
			vap->va_mtime = mtime;
			vap->va_atime = atime;
		}
	} else {
		/* Either do not set times or use the client specified times */
		error = vattr_to_sattr(vap, &args.saa_sa);
	}
	if (error) {
		/* req time field(s) overflow - return immediately */
		return (error);
	}
	args.saa_fh = *VTOFH(vp);

	/* Remember the pre-SETATTR mode for the setuid/setgid check below. */
	va.va_mask = AT_MODE;
	error = nfsgetattr(vp, &va, cr);
	if (error)
		return (error);
	omode = va.va_mode;

	mi = VTOMI(vp);

	douprintf = 1;

	t = gethrtime();

	error = rfs2call(mi, RFS_SETATTR,
	    xdr_saargs, (caddr_t)&args,
	    xdr_attrstat, (caddr_t)&ns, cr,
	    &douprintf, &ns.ns_status, 0, NULL);

	/*
	 * Purge the access cache and ACL cache if changing either the
	 * owner of the file, the group owner, or the mode.  These may
	 * change the access permissions of the file, so purge old
	 * information and start over again.
	 */
	if ((mask & (AT_UID | AT_GID | AT_MODE)) && (mi->mi_flags & MI_ACL)) {
		(void) nfs_access_purge_rp(rp);
		if (rp->r_secattr != NULL) {
			/* Detach the cached ACL under the lock, free after. */
			mutex_enter(&rp->r_statelock);
			vsp = rp->r_secattr;
			rp->r_secattr = NULL;
			mutex_exit(&rp->r_statelock);
			if (vsp != NULL)
				nfs_acl_free(vsp);
		}
	}

	if (!error) {
		error = geterrno(ns.ns_status);
		if (!error) {
			/*
			 * If changing the size of the file, invalidate
			 * any local cached data which is no longer part
			 * of the file.  We also possibly invalidate the
			 * last page in the file.  We could use
			 * pvn_vpzero(), but this would mark the page as
			 * modified and require it to be written back to
			 * the server for no particularly good reason.
			 * This way, if we access it, then we bring it
			 * back in.  A read should be cheaper than a
			 * write.
			 */
			if (mask & AT_SIZE) {
				nfs_invalidate_pages(vp,
				    (vap->va_size & PAGEMASK), cr);
			}
			(void) nfs_cache_fattr(vp, &ns.ns_attr, &va, t, cr);
			/*
			 * If NFS_ACL is supported on the server, then the
			 * attributes returned by server may have minimal
			 * permissions sometimes denying access to users having
			 * proper access.  To get the proper attributes, mark
			 * the attributes as expired so that they will be
			 * regotten via the NFS_ACL GETATTR2 procedure.
			 */
			if (mi->mi_flags & MI_ACL) {
				PURGE_ATTRCACHE(vp);
			}
			/*
			 * This next check attempts to deal with NFS
			 * servers which can not handle increasing
			 * the size of the file via setattr.  Most
			 * of these servers do not return an error,
			 * but do not change the size of the file.
			 * Hence, this check and then attempt to set
			 * the file size by writing 1 byte at the
			 * offset of the end of the file that we need.
			 */
			if ((mask & AT_SIZE) &&
			    ns.ns_attr.na_size < (uint32_t)vap->va_size) {
				char zb = '\0';

				error = nfswrite(vp, &zb,
				    vap->va_size - sizeof (zb),
				    sizeof (zb), cr);
			}
			/*
			 * Some servers will change the mode to clear the setuid
			 * and setgid bits when changing the uid or gid.  The
			 * client needs to compensate appropriately.
			 */
			if (mask & (AT_UID | AT_GID)) {
				int terror;

				va.va_mask = AT_MODE;
				terror = nfsgetattr(vp, &va, cr);
				if (!terror &&
				    (((mask & AT_MODE) &&
				    va.va_mode != vap->va_mode) ||
				    (!(mask & AT_MODE) &&
				    va.va_mode != omode))) {
					va.va_mask = AT_MODE;
					if (mask & AT_MODE)
						va.va_mode = vap->va_mode;
					else
						va.va_mode = omode;
					/* Recursive call to restore mode. */
					(void) nfssetattr(vp, &va, 0, cr);
				}
			}
		} else {
			PURGE_ATTRCACHE(vp);
			PURGE_STALE_FH(error, vp, cr);
		}
	} else {
		PURGE_ATTRCACHE(vp);
	}

	return (error);
}
1373 
1374 static int
1375 nfs_accessx(void *vp, int mode, cred_t *cr)
1376 {
1377 	ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
1378 	return (nfs_access(vp, mode, 0, cr, NULL));
1379 }
1380 
1381 /* ARGSUSED */
1382 static int
1383 nfs_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
1384 {
1385 	struct vattr va;
1386 	int error;
1387 	mntinfo_t *mi;
1388 	int shift = 0;
1389 
1390 	mi = VTOMI(vp);
1391 
1392 	if (nfs_zone() != mi->mi_zone)
1393 		return (EIO);
1394 	if (mi->mi_flags & MI_ACL) {
1395 		error = acl_access2(vp, mode, flags, cr);
1396 		if (mi->mi_flags & MI_ACL)
1397 			return (error);
1398 	}
1399 
1400 	va.va_mask = AT_MODE | AT_UID | AT_GID;
1401 	error = nfsgetattr(vp, &va, cr);
1402 	if (error)
1403 		return (error);
1404 
1405 	/*
1406 	 * Disallow write attempts on read-only
1407 	 * file systems, unless the file is a
1408 	 * device node.
1409 	 */
1410 	if ((mode & VWRITE) && vn_is_readonly(vp) && !IS_DEVVP(vp))
1411 		return (EROFS);
1412 
1413 	/*
1414 	 * Disallow attempts to access mandatory lock files.
1415 	 */
1416 	if ((mode & (VWRITE | VREAD | VEXEC)) &&
1417 	    MANDLOCK(vp, va.va_mode))
1418 		return (EACCES);
1419 
1420 	/*
1421 	 * Access check is based on only
1422 	 * one of owner, group, public.
1423 	 * If not owner, then check group.
1424 	 * If not a member of the group,
1425 	 * then check public access.
1426 	 */
1427 	if (crgetuid(cr) != va.va_uid) {
1428 		shift += 3;
1429 		if (!groupmember(va.va_gid, cr))
1430 			shift += 3;
1431 	}
1432 found:
1433 	mode &= ~(va.va_mode << shift);
1434 	if (mode == 0)
1435 		return (0);
1436 
1437 	return (secpolicy_vnode_access(cr, vp, va.va_uid, mode));
1438 }
1439 
1440 static int nfs_do_symlink_cache = 1;
1441 
/* ARGSUSED */
/*
 * VOP_READLINK for NFS v2: satisfy the read from the rnode's cached
 * symlink contents when possible, otherwise issue RFS_READLINK and
 * optionally install the result into the cache.  The NFS_MAXPATHLEN
 * buffer is either handed off to the cache or freed on every path.
 */
static int
nfs_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
{
	int error;
	struct nfsrdlnres rl;
	rnode_t *rp;
	int douprintf;
	failinfo_t fi;

	/*
	 * We want to be consistent with UFS semantics so we will return
	 * EINVAL instead of ENXIO. This violates the XNFS spec and
	 * the RFC 1094, which are wrong any way. BUGID 1138002.
	 */
	if (vp->v_type != VLNK)
		return (EINVAL);

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	rp = VTOR(vp);
	if (nfs_do_symlink_cache && rp->r_symlink.contents != NULL) {
		error = nfs_validate_caches(vp, cr);
		if (error)
			return (error);
		/*
		 * Recheck under the lock: the validation above may have
		 * invalidated the cached contents.
		 */
		mutex_enter(&rp->r_statelock);
		if (rp->r_symlink.contents != NULL) {
			error = uiomove(rp->r_symlink.contents,
			    rp->r_symlink.len, UIO_READ, uiop);
			mutex_exit(&rp->r_statelock);
			return (error);
		}
		mutex_exit(&rp->r_statelock);
	}


	rl.rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	fi.vp = vp;
	fi.fhp = NULL;		/* no need to update, filehandle not copied */
	fi.copyproc = nfscopyfh;
	fi.lookupproc = nfslookup;
	fi.xattrdirproc = acl_getxattrdir2;

	douprintf = 1;

	error = rfs2call(VTOMI(vp), RFS_READLINK,
	    xdr_readlink, (caddr_t)VTOFH(vp),
	    xdr_rdlnres, (caddr_t)&rl, cr,
	    &douprintf, &rl.rl_status, 0, &fi);

	if (error) {

		kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
		return (error);
	}

	error = geterrno(rl.rl_status);
	if (!error) {
		error = uiomove(rl.rl_data, (int)rl.rl_count, UIO_READ, uiop);
		if (nfs_do_symlink_cache && rp->r_symlink.contents == NULL) {
			/*
			 * Install the buffer into the rnode cache if no
			 * other thread beat us to it; otherwise free it.
			 */
			mutex_enter(&rp->r_statelock);
			if (rp->r_symlink.contents == NULL) {
				rp->r_symlink.contents = rl.rl_data;
				rp->r_symlink.len = (int)rl.rl_count;
				rp->r_symlink.size = NFS_MAXPATHLEN;
				mutex_exit(&rp->r_statelock);
			} else {
				mutex_exit(&rp->r_statelock);

				kmem_free((void *)rl.rl_data,
				    NFS_MAXPATHLEN);
			}
		} else {

			kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
		}
	} else {
		PURGE_STALE_FH(error, vp, cr);

		kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
	}

	/*
	 * Conform to UFS semantics (see comment above)
	 */
	return (error == ENXIO ? EINVAL : error);
}
1531 
1532 /*
1533  * Flush local dirty pages to stable storage on the server.
1534  *
1535  * If FNODSYNC is specified, then there is nothing to do because
1536  * metadata changes are not cached on the client before being
1537  * sent to the server.
1538  */
1539 /* ARGSUSED */
1540 static int
1541 nfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
1542 {
1543 	int error;
1544 
1545 	if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
1546 		return (0);
1547 
1548 	if (nfs_zone() != VTOMI(vp)->mi_zone)
1549 		return (EIO);
1550 
1551 	error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
1552 	if (!error)
1553 		error = VTOR(vp)->r_error;
1554 	return (error);
1555 }
1556 

/*
 * Weirdness: if the file was removed or the target of a rename
 * operation while it was open, it got renamed instead.  Here we
 * remove the renamed file.
 */
/* ARGSUSED */
static void
nfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	rnode_t *rp;

	ASSERT(vp != DNLC_NO_VNODE);

	/*
	 * If this is coming from the wrong zone, we let someone in the right
	 * zone take care of it asynchronously.  We can get here due to
	 * VN_RELE() being called from pageout() or fsflush().  This call may
	 * potentially turn into an expensive no-op if, for instance, v_count
	 * gets incremented in the meantime, but it's still correct.
	 */
	if (nfs_zone() != VTOMI(vp)->mi_zone) {
		nfs_async_inactive(vp, cr, nfs_inactive);
		return;
	}

	rp = VTOR(vp);
redo:
	/* Unlocked peek first; the real check is redone under the lock. */
	if (rp->r_unldvp != NULL) {
		/*
		 * Save the vnode pointer for the directory where the
		 * unlinked-open file got renamed, then set it to NULL
		 * to prevent another thread from getting here before
		 * we're done with the remove.  While we have the
		 * statelock, make local copies of the pertinent rnode
		 * fields.  If we weren't to do this in an atomic way, the
		 * the unl* fields could become inconsistent with respect
		 * to each other due to a race condition between this
		 * code and nfs_remove().  See bug report 1034328.
		 */
		mutex_enter(&rp->r_statelock);
		if (rp->r_unldvp != NULL) {
			vnode_t *unldvp;
			char *unlname;
			cred_t *unlcred;
			struct nfsdiropargs da;
			enum nfsstat status;
			int douprintf;
			int error;

			unldvp = rp->r_unldvp;
			rp->r_unldvp = NULL;
			unlname = rp->r_unlname;
			rp->r_unlname = NULL;
			unlcred = rp->r_unlcred;
			rp->r_unlcred = NULL;
			mutex_exit(&rp->r_statelock);

			/*
			 * If there are any dirty pages left, then flush
			 * them.  This is unfortunate because they just
			 * may get thrown away during the remove operation,
			 * but we have to do this for correctness.
			 */
			if (vn_has_cached_data(vp) &&
			    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
				ASSERT(vp->v_type != VCHR);
				error = nfs_putpage(vp, (offset_t)0, 0, 0,
				    cr, ct);
				if (error) {
					mutex_enter(&rp->r_statelock);
					if (!rp->r_error)
						rp->r_error = error;
					mutex_exit(&rp->r_statelock);
				}
			}

			/*
			 * Do the remove operation on the renamed file
			 */
			setdiropargs(&da, unlname, unldvp);

			douprintf = 1;

			/* Best effort: the result of the REMOVE is ignored. */
			(void) rfs2call(VTOMI(unldvp), RFS_REMOVE,
			    xdr_diropargs, (caddr_t)&da,
			    xdr_enum, (caddr_t)&status, unlcred,
			    &douprintf, &status, 0, NULL);

			if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
				nfs_purge_rddir_cache(unldvp);
			PURGE_ATTRCACHE(unldvp);

			/*
			 * Release stuff held for the remove
			 */
			VN_RELE(unldvp);
			kmem_free(unlname, MAXNAMELEN);
			crfree(unlcred);
			/* Re-check in case another rename raced in. */
			goto redo;
		}
		mutex_exit(&rp->r_statelock);
	}

	rp_addfree(rp, cr);
}
1663 
1664 /*
1665  * Remote file system operations having to do with directory manipulation.
1666  */
1667 
/* ARGSUSED */
/*
 * VOP_LOOKUP for NFS v2.  For extended-attribute lookups the hidden
 * attribute directory is located (or created) first and then used as
 * the effective directory for the remainder of the lookup.  Device
 * nodes are wrapped in a specvp before being returned.
 */
static int
nfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
	int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
	int *direntflags, pathname_t *realpnp)
{
	int error;
	vnode_t *vp;
	vnode_t *avp = NULL;
	rnode_t *drp;

	if (nfs_zone() != VTOMI(dvp)->mi_zone)
		return (EPERM);

	drp = VTOR(dvp);

	/*
	 * Are we looking up extended attributes?  If so, "dvp" is
	 * the file or directory for which we want attributes, and
	 * we need a lookup of the hidden attribute directory
	 * before we lookup the rest of the path.
	 */
	if (flags & LOOKUP_XATTR) {
		bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
		mntinfo_t *mi;

		mi = VTOMI(dvp);
		if (!(mi->mi_flags & MI_EXTATTR))
			return (EINVAL);

		if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
			return (EINTR);

		/* Try the DNLC first; go over the wire only on a miss. */
		(void) nfslookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
		if (avp == NULL)
			error = acl_getxattrdir2(dvp, &avp, cflag, cr, 0);
		else
			error = 0;

		nfs_rw_exit(&drp->r_rwlock);

		if (error) {
			/*
			 * MI_EXTATTR may have been cleared by the call
			 * above; report EINVAL (no xattr support) in
			 * that case rather than the raw error.
			 */
			if (mi->mi_flags & MI_EXTATTR)
				return (error);
			return (EINVAL);
		}
		/* Continue the lookup relative to the attribute dir. */
		dvp = avp;
		drp = VTOR(dvp);
	}

	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
		error = EINTR;
		goto out;
	}

	error = nfslookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);

	nfs_rw_exit(&drp->r_rwlock);

	/*
	 * If vnode is a device, create special vnode.
	 */
	if (!error && IS_DEVVP(*vpp)) {
		vp = *vpp;
		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
		VN_RELE(vp);
	}

out:
	if (avp != NULL)
		VN_RELE(avp);

	return (error);
}
1742 
/* Tunable: non-zero enables negative (ENOENT) entries in the DNLC. */
static int nfs_lookup_neg_cache = 1;

#ifdef DEBUG
/* DNLC lookup statistics, maintained by nfslookup_dnlc(). */
static int nfs_lookup_dnlc_hits = 0;
static int nfs_lookup_dnlc_misses = 0;
static int nfs_lookup_dnlc_neg_hits = 0;
static int nfs_lookup_dnlc_disappears = 0;
static int nfs_lookup_dnlc_lookups = 0;
#endif
1752 
/* ARGSUSED */
/*
 * Common lookup worker: handles the trivial "" and "." cases locally,
 * consults the DNLC, and finally goes over the wire.  On success *vpp
 * holds a held vnode for the result.
 */
int
nfslookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
	int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
{
	int error;

	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);

	/*
	 * If lookup is for "", just return dvp.  Don't need
	 * to send it over the wire, look it up in the dnlc,
	 * or perform any access checks.
	 */
	if (*nm == '\0') {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/*
	 * Can't do lookups in non-directories.
	 */
	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	/*
	 * If we're called with RFSCALL_SOFT, it's important that
	 * the only rfscall is one we make directly; if we permit
	 * an access call because we're looking up "." or validating
	 * a dnlc hit, we'll deadlock because that rfscall will not
	 * have the RFSCALL_SOFT set.
	 */
	if (rfscall_flags & RFSCALL_SOFT)
		goto callit;

	/*
	 * If lookup is for ".", just return dvp.  Don't need
	 * to send it over the wire or look it up in the dnlc,
	 * just need to check access.
	 */
	if (strcmp(nm, ".") == 0) {
		error = nfs_access(dvp, VEXEC, 0, cr, NULL);
		if (error)
			return (error);
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/*
	 * Lookup this name in the DNLC.  If there was a valid entry,
	 * then return the results of the lookup.
	 */
	error = nfslookup_dnlc(dvp, nm, vpp, cr);
	if (error || *vpp != NULL)
		return (error);

callit:
	error = nfslookup_otw(dvp, nm, vpp, cr, rfscall_flags);

	return (error);
}
1816 
/*
 * Look up `nm' in the DNLC.  Returns 0 with *vpp set to a held vnode on
 * a positive hit, ENOENT on a negative-cache hit, and 0 with *vpp set
 * to NULL when the DNLC has no usable answer (caller must go over the
 * wire).
 */
static int
nfslookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
{
	int error;
	vnode_t *vp;

	ASSERT(*nm != '\0');
	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);

	/*
	 * Lookup this name in the DNLC.  If successful, then validate
	 * the caches and then recheck the DNLC.  The DNLC is rechecked
	 * just in case this entry got invalidated during the call
	 * to nfs_validate_caches.
	 *
	 * An assumption is being made that it is safe to say that a
	 * file exists which may not on the server.  Any operations to
	 * the server will fail with ESTALE.
	 */
#ifdef DEBUG
	nfs_lookup_dnlc_lookups++;
#endif
	vp = dnlc_lookup(dvp, nm);
	if (vp != NULL) {
		VN_RELE(vp);
		/*
		 * A negative entry on a writable fs may be stale;
		 * force an attribute refetch on the directory.
		 */
		if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) {
			PURGE_ATTRCACHE(dvp);
		}
		error = nfs_validate_caches(dvp, cr);
		if (error)
			return (error);
		vp = dnlc_lookup(dvp, nm);
		if (vp != NULL) {
			error = nfs_access(dvp, VEXEC, 0, cr, NULL);
			if (error) {
				VN_RELE(vp);
				return (error);
			}
			if (vp == DNLC_NO_VNODE) {
				VN_RELE(vp);
#ifdef DEBUG
				nfs_lookup_dnlc_neg_hits++;
#endif
				return (ENOENT);
			}
			*vpp = vp;
#ifdef DEBUG
			nfs_lookup_dnlc_hits++;
#endif
			return (0);
		}
#ifdef DEBUG
		nfs_lookup_dnlc_disappears++;
#endif
	}
#ifdef DEBUG
	else
		nfs_lookup_dnlc_misses++;
#endif

	/* No usable DNLC answer; tell the caller to go over the wire. */
	*vpp = NULL;

	return (0);
}
1881 
/*
 * Over-the-wire lookup of `nm' in directory `dvp' via RFS_LOOKUP.  On
 * success, *vpp is set to the (held) vnode built from the returned
 * filehandle and attributes, and the DNLC is updated (positive entry,
 * or a negative entry on ENOENT when negative caching is enabled).
 */
static int
nfslookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr,
	int rfscall_flags)
{
	int error;
	struct nfsdiropargs da;
	struct nfsdiropres dr;
	int douprintf;
	failinfo_t fi;
	hrtime_t t;

	ASSERT(*nm != '\0');
	ASSERT(dvp->v_type == VDIR);
	ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);

	setdiropargs(&da, nm, dvp);

	fi.vp = dvp;
	fi.fhp = NULL;		/* no need to update, filehandle not copied */
	fi.copyproc = nfscopyfh;
	fi.lookupproc = nfslookup;
	fi.xattrdirproc = acl_getxattrdir2;

	douprintf = 1;

	t = gethrtime();

	error = rfs2call(VTOMI(dvp), RFS_LOOKUP,
	    xdr_diropargs, (caddr_t)&da,
	    xdr_diropres, (caddr_t)&dr, cr,
	    &douprintf, &dr.dr_status, rfscall_flags, &fi);

	if (!error) {
		error = geterrno(dr.dr_status);
		if (!error) {
			*vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
			    dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
			/*
			 * If NFS_ACL is supported on the server, then the
			 * attributes returned by server may have minimal
			 * permissions sometimes denying access to users having
			 * proper access.  To get the proper attributes, mark
			 * the attributes as expired so that they will be
			 * regotten via the NFS_ACL GETATTR2 procedure.
			 */
			if (VTOMI(*vpp)->mi_flags & MI_ACL) {
				PURGE_ATTRCACHE(*vpp);
			}
			/* See the RFSCALL_SOFT deadlock note in nfslookup(). */
			if (!(rfscall_flags & RFSCALL_SOFT))
				dnlc_update(dvp, nm, *vpp);
		} else {
			PURGE_STALE_FH(error, dvp, cr);
			if (error == ENOENT && nfs_lookup_neg_cache)
				dnlc_enter(dvp, nm, DNLC_NO_VNODE);
		}
	}

	return (error);
}
1941 
1942 /* ARGSUSED */
1943 static int
1944 nfs_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
1945 	int mode, vnode_t **vpp, cred_t *cr, int lfaware, caller_context_t *ct,
1946 	vsecattr_t *vsecp)
1947 {
1948 	int error;
1949 	struct nfscreatargs args;
1950 	struct nfsdiropres dr;
1951 	int douprintf;
1952 	vnode_t *vp;
1953 	rnode_t *rp;
1954 	struct vattr vattr;
1955 	rnode_t *drp;
1956 	vnode_t *tempvp;
1957 	hrtime_t t;
1958 
1959 	drp = VTOR(dvp);
1960 
1961 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
1962 		return (EPERM);
1963 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
1964 		return (EINTR);
1965 
1966 	/*
1967 	 * We make a copy of the attributes because the caller does not
1968 	 * expect us to change what va points to.
1969 	 */
1970 	vattr = *va;
1971 
1972 	/*
1973 	 * If the pathname is "", just use dvp.  Don't need
1974 	 * to send it over the wire, look it up in the dnlc,
1975 	 * or perform any access checks.
1976 	 */
1977 	if (*nm == '\0') {
1978 		error = 0;
1979 		VN_HOLD(dvp);
1980 		vp = dvp;
1981 	/*
1982 	 * If the pathname is ".", just use dvp.  Don't need
1983 	 * to send it over the wire or look it up in the dnlc,
1984 	 * just need to check access.
1985 	 */
1986 	} else if (strcmp(nm, ".") == 0) {
1987 		error = nfs_access(dvp, VEXEC, 0, cr, ct);
1988 		if (error) {
1989 			nfs_rw_exit(&drp->r_rwlock);
1990 			return (error);
1991 		}
1992 		VN_HOLD(dvp);
1993 		vp = dvp;
1994 	/*
1995 	 * We need to go over the wire, just to be sure whether the
1996 	 * file exists or not.  Using the DNLC can be dangerous in
1997 	 * this case when making a decision regarding existence.
1998 	 */
1999 	} else {
2000 		error = nfslookup_otw(dvp, nm, &vp, cr, 0);
2001 	}
2002 	if (!error) {
2003 		if (exclusive == EXCL)
2004 			error = EEXIST;
2005 		else if (vp->v_type == VDIR && (mode & VWRITE))
2006 			error = EISDIR;
2007 		else {
2008 			/*
2009 			 * If vnode is a device, create special vnode.
2010 			 */
2011 			if (IS_DEVVP(vp)) {
2012 				tempvp = vp;
2013 				vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2014 				VN_RELE(tempvp);
2015 			}
2016 			if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
2017 				if ((vattr.va_mask & AT_SIZE) &&
2018 				    vp->v_type == VREG) {
2019 					vattr.va_mask = AT_SIZE;
2020 					error = nfssetattr(vp, &vattr, 0, cr);
2021 				}
2022 			}
2023 		}
2024 		nfs_rw_exit(&drp->r_rwlock);
2025 		if (error) {
2026 			VN_RELE(vp);
2027 		} else {
2028 			/*
2029 			 * existing file got truncated, notify.
2030 			 */
2031 			vnevent_create(vp, ct);
2032 			*vpp = vp;
2033 		}
2034 		return (error);
2035 	}
2036 
2037 	ASSERT(vattr.va_mask & AT_TYPE);
2038 	if (vattr.va_type == VREG) {
2039 		ASSERT(vattr.va_mask & AT_MODE);
2040 		if (MANDMODE(vattr.va_mode)) {
2041 			nfs_rw_exit(&drp->r_rwlock);
2042 			return (EACCES);
2043 		}
2044 	}
2045 
2046 	dnlc_remove(dvp, nm);
2047 
2048 	setdiropargs(&args.ca_da, nm, dvp);
2049 
2050 	/*
2051 	 * Decide what the group-id of the created file should be.
2052 	 * Set it in attribute list as advisory...then do a setattr
2053 	 * if the server didn't get it right the first time.
2054 	 */
2055 	error = setdirgid(dvp, &vattr.va_gid, cr);
2056 	if (error) {
2057 		nfs_rw_exit(&drp->r_rwlock);
2058 		return (error);
2059 	}
2060 	vattr.va_mask |= AT_GID;
2061 
2062 	/*
2063 	 * This is a completely gross hack to make mknod
2064 	 * work over the wire until we can wack the protocol
2065 	 */
2066 #define	IFCHR		0020000		/* character special */
2067 #define	IFBLK		0060000		/* block special */
2068 #define	IFSOCK		0140000		/* socket */
2069 
2070 	/*
2071 	 * dev_t is uint_t in 5.x and short in 4.x. Both 4.x
2072 	 * supports 8 bit majors. 5.x supports 14 bit majors. 5.x supports 18
2073 	 * bits in the minor number where 4.x supports 8 bits.  If the 5.x
2074 	 * minor/major numbers <= 8 bits long, compress the device
2075 	 * number before sending it. Otherwise, the 4.x server will not
2076 	 * create the device with the correct device number and nothing can be
2077 	 * done about this.
2078 	 */
2079 	if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
2080 		dev_t d = vattr.va_rdev;
2081 		dev32_t dev32;
2082 
2083 		if (vattr.va_type == VCHR)
2084 			vattr.va_mode |= IFCHR;
2085 		else
2086 			vattr.va_mode |= IFBLK;
2087 
2088 		(void) cmpldev(&dev32, d);
2089 		if (dev32 & ~((SO4_MAXMAJ << L_BITSMINOR32) | SO4_MAXMIN))
2090 			vattr.va_size = (u_offset_t)dev32;
2091 		else
2092 			vattr.va_size = (u_offset_t)nfsv2_cmpdev(d);
2093 
2094 		vattr.va_mask |= AT_MODE|AT_SIZE;
2095 	} else if (vattr.va_type == VFIFO) {
2096 		vattr.va_mode |= IFCHR;		/* xtra kludge for namedpipe */
2097 		vattr.va_size = (u_offset_t)NFS_FIFO_DEV;	/* blech */
2098 		vattr.va_mask |= AT_MODE|AT_SIZE;
2099 	} else if (vattr.va_type == VSOCK) {
2100 		vattr.va_mode |= IFSOCK;
2101 		/*
2102 		 * To avoid triggering bugs in the servers set AT_SIZE
2103 		 * (all other RFS_CREATE calls set this).
2104 		 */
2105 		vattr.va_size = 0;
2106 		vattr.va_mask |= AT_MODE|AT_SIZE;
2107 	}
2108 
2109 	args.ca_sa = &args.ca_sa_buf;
2110 	error = vattr_to_sattr(&vattr, args.ca_sa);
2111 	if (error) {
2112 		/* req time field(s) overflow - return immediately */
2113 		nfs_rw_exit(&drp->r_rwlock);
2114 		return (error);
2115 	}
2116 
2117 	douprintf = 1;
2118 
2119 	t = gethrtime();
2120 
2121 	error = rfs2call(VTOMI(dvp), RFS_CREATE,
2122 	    xdr_creatargs, (caddr_t)&args,
2123 	    xdr_diropres, (caddr_t)&dr, cr,
2124 	    &douprintf, &dr.dr_status, 0, NULL);
2125 
2126 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2127 
2128 	if (!error) {
2129 		error = geterrno(dr.dr_status);
2130 		if (!error) {
2131 			if (HAVE_RDDIR_CACHE(drp))
2132 				nfs_purge_rddir_cache(dvp);
2133 			vp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
2134 			    dvp->v_vfsp, t, cr, NULL, NULL);
2135 			/*
2136 			 * If NFS_ACL is supported on the server, then the
2137 			 * attributes returned by server may have minimal
2138 			 * permissions sometimes denying access to users having
2139 			 * proper access.  To get the proper attributes, mark
2140 			 * the attributes as expired so that they will be
2141 			 * regotten via the NFS_ACL GETATTR2 procedure.
2142 			 */
2143 			if (VTOMI(vp)->mi_flags & MI_ACL) {
2144 				PURGE_ATTRCACHE(vp);
2145 			}
2146 			dnlc_update(dvp, nm, vp);
2147 			rp = VTOR(vp);
2148 			if (vattr.va_size == 0) {
2149 				mutex_enter(&rp->r_statelock);
2150 				rp->r_size = 0;
2151 				mutex_exit(&rp->r_statelock);
2152 				if (vn_has_cached_data(vp)) {
2153 					ASSERT(vp->v_type != VCHR);
2154 					nfs_invalidate_pages(vp,
2155 					    (u_offset_t)0, cr);
2156 				}
2157 			}
2158 
2159 			/*
2160 			 * Make sure the gid was set correctly.
2161 			 * If not, try to set it (but don't lose
2162 			 * any sleep over it).
2163 			 */
2164 			if (vattr.va_gid != rp->r_attr.va_gid) {
2165 				vattr.va_mask = AT_GID;
2166 				(void) nfssetattr(vp, &vattr, 0, cr);
2167 			}
2168 
2169 			/*
2170 			 * If vnode is a device create special vnode
2171 			 */
2172 			if (IS_DEVVP(vp)) {
2173 				*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2174 				VN_RELE(vp);
2175 			} else
2176 				*vpp = vp;
2177 		} else {
2178 			PURGE_STALE_FH(error, dvp, cr);
2179 		}
2180 	}
2181 
2182 	nfs_rw_exit(&drp->r_rwlock);
2183 
2184 	return (error);
2185 }
2186 
/*
 * Remove the entry nm from directory dvp.
 *
 * Weirdness: if the vnode to be removed is open
 * we rename it instead of removing it and nfs_inactive
 * will remove the new name.
 */
/* ARGSUSED */
static int
nfs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
{
	int error;
	struct nfsdiropargs da;
	enum nfsstat status;
	vnode_t *vp;
	char *tmpname;
	int douprintf;
	rnode_t *rp;
	rnode_t *drp;

	/* All over-the-wire operations must come from the mount's zone. */
	if (nfs_zone() != VTOMI(dvp)->mi_zone)
		return (EPERM);
	drp = VTOR(dvp);
	/* Serialize modifications to the parent directory. */
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
		return (EINTR);

	error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
	if (error) {
		nfs_rw_exit(&drp->r_rwlock);
		return (error);
	}

	/* Unlinking a directory requires special privilege. */
	if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) {
		VN_RELE(vp);
		nfs_rw_exit(&drp->r_rwlock);
		return (EPERM);
	}

	/*
	 * First just remove the entry from the name cache, as it
	 * is most likely the only entry for this vp.
	 */
	dnlc_remove(dvp, nm);

	/*
	 * If the file has a v_count > 1 then there may be more than one
	 * entry in the name cache due to multiple links or an open file,
	 * but we don't have the real reference count so flush all
	 * possible entries.
	 */
	if (vp->v_count > 1)
		dnlc_purge_vp(vp);

	/*
	 * Now we have the real reference count on the vnode
	 */
	rp = VTOR(vp);
	mutex_enter(&rp->r_statelock);
	if (vp->v_count > 1 &&
	    (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
		mutex_exit(&rp->r_statelock);
		/*
		 * The file appears to be open (or is already the
		 * renamed-away unlinked file): rename it to a hidden
		 * temporary name so it stays accessible.  nfs_inactive
		 * removes the temporary name when the last reference
		 * is released.
		 */
		tmpname = newname();
		error = nfsrename(dvp, nm, dvp, tmpname, cr, ct);
		if (error)
			kmem_free(tmpname, MAXNAMELEN);
		else {
			mutex_enter(&rp->r_statelock);
			if (rp->r_unldvp == NULL) {
				VN_HOLD(dvp);
				rp->r_unldvp = dvp;
				if (rp->r_unlcred != NULL)
					crfree(rp->r_unlcred);
				crhold(cr);
				rp->r_unlcred = cr;
				rp->r_unlname = tmpname;
			} else {
				/* Already renamed once; just track the new name. */
				kmem_free(rp->r_unlname, MAXNAMELEN);
				rp->r_unlname = tmpname;
			}
			mutex_exit(&rp->r_statelock);
		}
	} else {
		mutex_exit(&rp->r_statelock);
		/*
		 * We need to flush any dirty pages which happen to
		 * be hanging around before removing the file.  This
		 * shouldn't happen very often and mostly on file
		 * systems mounted "nocto".
		 */
		if (vn_has_cached_data(vp) &&
		    ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
			error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
			if (error && (error == ENOSPC || error == EDQUOT)) {
				/* Record the error for later retrieval. */
				mutex_enter(&rp->r_statelock);
				if (!rp->r_error)
					rp->r_error = error;
				mutex_exit(&rp->r_statelock);
			}
		}

		setdiropargs(&da, nm, dvp);

		douprintf = 1;

		error = rfs2call(VTOMI(dvp), RFS_REMOVE,
		    xdr_diropargs, (caddr_t)&da,
		    xdr_enum, (caddr_t)&status, cr,
		    &douprintf, &status, 0, NULL);

		/*
		 * The xattr dir may be gone after last attr is removed,
		 * so flush it from dnlc.
		 */
		if (dvp->v_flag & V_XATTRDIR)
			dnlc_purge_vp(dvp);

		PURGE_ATTRCACHE(dvp);	/* mod time changed */
		PURGE_ATTRCACHE(vp);	/* link count changed */

		if (!error) {
			error = geterrno(status);
			if (!error) {
				if (HAVE_RDDIR_CACHE(drp))
					nfs_purge_rddir_cache(dvp);
			} else {
				PURGE_STALE_FH(error, dvp, cr);
			}
		}
	}

	if (error == 0) {
		vnevent_remove(vp, dvp, nm, ct);
	}
	VN_RELE(vp);

	nfs_rw_exit(&drp->r_rwlock);

	return (error);
}
2324 
2325 /* ARGSUSED */
2326 static int
2327 nfs_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
2328 	caller_context_t *ct, int flags)
2329 {
2330 	int error;
2331 	struct nfslinkargs args;
2332 	enum nfsstat status;
2333 	vnode_t *realvp;
2334 	int douprintf;
2335 	rnode_t *tdrp;
2336 
2337 	if (nfs_zone() != VTOMI(tdvp)->mi_zone)
2338 		return (EPERM);
2339 	if (VOP_REALVP(svp, &realvp, ct) == 0)
2340 		svp = realvp;
2341 
2342 	args.la_from = VTOFH(svp);
2343 	setdiropargs(&args.la_to, tnm, tdvp);
2344 
2345 	tdrp = VTOR(tdvp);
2346 	if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp)))
2347 		return (EINTR);
2348 
2349 	dnlc_remove(tdvp, tnm);
2350 
2351 	douprintf = 1;
2352 
2353 	error = rfs2call(VTOMI(svp), RFS_LINK,
2354 	    xdr_linkargs, (caddr_t)&args,
2355 	    xdr_enum, (caddr_t)&status, cr,
2356 	    &douprintf, &status, 0, NULL);
2357 
2358 	PURGE_ATTRCACHE(tdvp);	/* mod time changed */
2359 	PURGE_ATTRCACHE(svp);	/* link count changed */
2360 
2361 	if (!error) {
2362 		error = geterrno(status);
2363 		if (!error) {
2364 			if (HAVE_RDDIR_CACHE(tdrp))
2365 				nfs_purge_rddir_cache(tdvp);
2366 		}
2367 	}
2368 
2369 	nfs_rw_exit(&tdrp->r_rwlock);
2370 
2371 	if (!error) {
2372 		/*
2373 		 * Notify the source file of this link operation.
2374 		 */
2375 		vnevent_link(svp, ct);
2376 	}
2377 	return (error);
2378 }
2379 
2380 /* ARGSUSED */
2381 static int
2382 nfs_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
2383 	caller_context_t *ct, int flags)
2384 {
2385 	vnode_t *realvp;
2386 
2387 	if (nfs_zone() != VTOMI(odvp)->mi_zone)
2388 		return (EPERM);
2389 	if (VOP_REALVP(ndvp, &realvp, ct) == 0)
2390 		ndvp = realvp;
2391 
2392 	return (nfsrename(odvp, onm, ndvp, nnm, cr, ct));
2393 }
2394 
/*
 * nfsrename does the real work of renaming in NFS Version 2.
 *
 * Both parent directory rnodes are write-locked in ascending address
 * order so that two concurrent renames between the same pair of
 * directories cannot deadlock.  If the target name exists and is an
 * active (open) non-directory, it is first linked (or renamed) to a
 * temporary name so that the open file remains accessible; the rnode's
 * r_unldvp/r_unlname state records the temporary name for later
 * cleanup.
 */
static int
nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	enum nfsstat status;
	struct nfsrnmargs args;
	int douprintf;
	vnode_t *nvp = NULL;
	vnode_t *ovp = NULL;
	char *tmpname;
	rnode_t *rp;
	rnode_t *odrp;
	rnode_t *ndrp;

	ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone);
	/* Renaming "." or ".." is never permitted. */
	if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 ||
	    strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0)
		return (EINVAL);

	odrp = VTOR(odvp);
	ndrp = VTOR(ndvp);
	/* Acquire both directory locks in address order to avoid deadlock. */
	if ((intptr_t)odrp < (intptr_t)ndrp) {
		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp)))
			return (EINTR);
		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) {
			nfs_rw_exit(&odrp->r_rwlock);
			return (EINTR);
		}
	} else {
		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp)))
			return (EINTR);
		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) {
			nfs_rw_exit(&ndrp->r_rwlock);
			return (EINTR);
		}
	}

	/*
	 * Lookup the target file.  If it exists, it needs to be
	 * checked to see whether it is a mount point and whether
	 * it is active (open).
	 */
	error = nfslookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0);
	if (!error) {
		/*
		 * If this file has been mounted on, then just
		 * return busy because renaming to it would remove
		 * the mounted file system from the name space.
		 */
		if (vn_mountedvfs(nvp) != NULL) {
			VN_RELE(nvp);
			nfs_rw_exit(&odrp->r_rwlock);
			nfs_rw_exit(&ndrp->r_rwlock);
			return (EBUSY);
		}

		/*
		 * Purge the name cache of all references to this vnode
		 * so that we can check the reference count to infer
		 * whether it is active or not.
		 */
		/*
		 * First just remove the entry from the name cache, as it
		 * is most likely the only entry for this vp.
		 */
		dnlc_remove(ndvp, nnm);
		/*
		 * If the file has a v_count > 1 then there may be more
		 * than one entry in the name cache due to multiple links
		 * or an open file, but we don't have the real reference
		 * count so flush all possible entries.
		 */
		if (nvp->v_count > 1)
			dnlc_purge_vp(nvp);

		/*
		 * If the vnode is active and is not a directory,
		 * arrange to rename it to a
		 * temporary file so that it will continue to be
		 * accessible.  This implements the "unlink-open-file"
		 * semantics for the target of a rename operation.
		 * Before doing this though, make sure that the
		 * source and target files are not already the same.
		 */
		if (nvp->v_count > 1 && nvp->v_type != VDIR) {
			/*
			 * Lookup the source name.
			 */
			error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL,
			    cr, 0);

			/*
			 * The source name *should* already exist.
			 */
			if (error) {
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (error);
			}

			/*
			 * Compare the two vnodes.  If they are the same,
			 * just release all held vnodes and return success.
			 */
			if (ovp == nvp) {
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (0);
			}

			/*
			 * Can't mix and match directories and non-
			 * directories in rename operations.  We already
			 * know that the target is not a directory.  If
			 * the source is a directory, return an error.
			 */
			if (ovp->v_type == VDIR) {
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (ENOTDIR);
			}

			/*
			 * The target file exists, is not the same as
			 * the source file, and is active.  Link it
			 * to a temporary filename to avoid having
			 * the server removing the file completely.
			 * Fall back to renaming it if the server does
			 * not support hard links.
			 */
			tmpname = newname();
			error = nfs_link(ndvp, nvp, tmpname, cr, NULL, 0);
			if (error == EOPNOTSUPP) {
				error = nfs_rename(ndvp, nnm, ndvp, tmpname,
				    cr, NULL, 0);
			}
			if (error) {
				kmem_free(tmpname, MAXNAMELEN);
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (error);
			}
			/*
			 * Record the temporary name so nfs_inactive can
			 * remove it when the last reference goes away.
			 */
			rp = VTOR(nvp);
			mutex_enter(&rp->r_statelock);
			if (rp->r_unldvp == NULL) {
				VN_HOLD(ndvp);
				rp->r_unldvp = ndvp;
				if (rp->r_unlcred != NULL)
					crfree(rp->r_unlcred);
				crhold(cr);
				rp->r_unlcred = cr;
				rp->r_unlname = tmpname;
			} else {
				kmem_free(rp->r_unlname, MAXNAMELEN);
				rp->r_unlname = tmpname;
			}
			mutex_exit(&rp->r_statelock);
		}
	}

	if (ovp == NULL) {
		/*
		 * When renaming directories to be a subdirectory of a
		 * different parent, the dnlc entry for ".." will no
		 * longer be valid, so it must be removed.
		 *
		 * We do a lookup here to determine whether we are renaming
		 * a directory and we need to check if we are renaming
		 * an unlinked file.  This might have already been done
		 * in previous code, so we check ovp == NULL to avoid
		 * doing it twice.
		 */

		error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0);

		/*
		 * The source name *should* already exist.
		 */
		if (error) {
			nfs_rw_exit(&odrp->r_rwlock);
			nfs_rw_exit(&ndrp->r_rwlock);
			if (nvp) {
				VN_RELE(nvp);
			}
			return (error);
		}
		ASSERT(ovp != NULL);
	}

	dnlc_remove(odvp, onm);
	dnlc_remove(ndvp, nnm);

	setdiropargs(&args.rna_from, onm, odvp);
	setdiropargs(&args.rna_to, nnm, ndvp);

	douprintf = 1;

	error = rfs2call(VTOMI(odvp), RFS_RENAME,
	    xdr_rnmargs, (caddr_t)&args,
	    xdr_enum, (caddr_t)&status, cr,
	    &douprintf, &status, 0, NULL);

	PURGE_ATTRCACHE(odvp);	/* mod time changed */
	PURGE_ATTRCACHE(ndvp);	/* mod time changed */

	if (!error) {
		error = geterrno(status);
		if (!error) {
			if (HAVE_RDDIR_CACHE(odrp))
				nfs_purge_rddir_cache(odvp);
			if (HAVE_RDDIR_CACHE(ndrp))
				nfs_purge_rddir_cache(ndvp);
			/*
			 * when renaming directories to be a subdirectory of a
			 * different parent, the dnlc entry for ".." will no
			 * longer be valid, so it must be removed
			 */
			rp = VTOR(ovp);
			if (ndvp != odvp) {
				if (ovp->v_type == VDIR) {
					dnlc_remove(ovp, "..");
					if (HAVE_RDDIR_CACHE(rp))
						nfs_purge_rddir_cache(ovp);
				}
			}

			/*
			 * If we are renaming the unlinked file, update the
			 * r_unldvp and r_unlname as needed.
			 */
			mutex_enter(&rp->r_statelock);
			if (rp->r_unldvp != NULL) {
				if (strcmp(rp->r_unlname, onm) == 0) {
					(void) strncpy(rp->r_unlname,
					    nnm, MAXNAMELEN);
					rp->r_unlname[MAXNAMELEN - 1] = '\0';

					if (ndvp != rp->r_unldvp) {
						VN_RELE(rp->r_unldvp);
						rp->r_unldvp = ndvp;
						VN_HOLD(ndvp);
					}
				}
			}
			mutex_exit(&rp->r_statelock);
		} else {
			/*
			 * System V defines rename to return EEXIST, not
			 * ENOTEMPTY if the target directory is not empty.
			 * Over the wire, the error is NFSERR_ENOTEMPTY
			 * which geterrno maps to ENOTEMPTY.
			 */
			if (error == ENOTEMPTY)
				error = EEXIST;
		}
	}

	if (error == 0) {
		if (nvp)
			vnevent_rename_dest(nvp, ndvp, nnm, ct);

		if (odvp != ndvp)
			vnevent_rename_dest_dir(ndvp, ct);

		ASSERT(ovp != NULL);
		vnevent_rename_src(ovp, odvp, onm, ct);
	}

	if (nvp) {
		VN_RELE(nvp);
	}
	VN_RELE(ovp);

	nfs_rw_exit(&odrp->r_rwlock);
	nfs_rw_exit(&ndrp->r_rwlock);

	return (error);
}
2682 
/*
 * Create directory nm in dvp with attributes va via RFS_MKDIR.
 * On success, *vpp holds the vnode of the new directory.
 */
/* ARGSUSED */
static int
nfs_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
	caller_context_t *ct, int flags, vsecattr_t *vsecp)
{
	int error;
	struct nfscreatargs args;
	struct nfsdiropres dr;
	int douprintf;
	rnode_t *drp;
	hrtime_t t;

	if (nfs_zone() != VTOMI(dvp)->mi_zone)
		return (EPERM);

	setdiropargs(&args.ca_da, nm, dvp);

	/*
	 * Decide what the group-id and set-gid bit of the created directory
	 * should be.  May have to do a setattr to get the gid right.
	 */
	error = setdirgid(dvp, &va->va_gid, cr);
	if (error)
		return (error);
	error = setdirmode(dvp, &va->va_mode, cr);
	if (error)
		return (error);
	va->va_mask |= AT_MODE|AT_GID;

	args.ca_sa = &args.ca_sa_buf;
	error = vattr_to_sattr(va, args.ca_sa);
	if (error) {
		/* req time field(s) overflow - return immediately */
		return (error);
	}

	drp = VTOR(dvp);
	/* Serialize modifications to the parent directory. */
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
		return (EINTR);

	dnlc_remove(dvp, nm);

	douprintf = 1;

	/* Timestamp used by makenfsnode for attribute cache bookkeeping. */
	t = gethrtime();

	error = rfs2call(VTOMI(dvp), RFS_MKDIR,
	    xdr_creatargs, (caddr_t)&args,
	    xdr_diropres, (caddr_t)&dr, cr,
	    &douprintf, &dr.dr_status, 0, NULL);

	PURGE_ATTRCACHE(dvp);	/* mod time changed */

	if (!error) {
		error = geterrno(dr.dr_status);
		if (!error) {
			if (HAVE_RDDIR_CACHE(drp))
				nfs_purge_rddir_cache(dvp);
			/*
			 * The attributes returned by RFS_MKDIR can not
			 * be depended upon, so mark the attribute cache
			 * as purged.  A subsequent GETATTR will get the
			 * correct attributes from the server.
			 */
			*vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
			    dvp->v_vfsp, t, cr, NULL, NULL);
			PURGE_ATTRCACHE(*vpp);
			dnlc_update(dvp, nm, *vpp);

			/*
			 * Make sure the gid was set correctly.
			 * If not, try to set it (but don't lose
			 * any sleep over it).
			 */
			if (va->va_gid != VTOR(*vpp)->r_attr.va_gid) {
				va->va_mask = AT_GID;
				(void) nfssetattr(*vpp, va, 0, cr);
			}
		} else {
			PURGE_STALE_FH(error, dvp, cr);
		}
	}

	nfs_rw_exit(&drp->r_rwlock);

	return (error);
}
2770 
/*
 * Remove the directory named by nm from dvp via RFS_RMDIR.  cdir is
 * the caller's current directory, used to reject rmdir(".").
 */
/* ARGSUSED */
static int
nfs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
	caller_context_t *ct, int flags)
{
	int error;
	enum nfsstat status;
	struct nfsdiropargs da;
	vnode_t *vp;
	int douprintf;
	rnode_t *drp;

	if (nfs_zone() != VTOMI(dvp)->mi_zone)
		return (EPERM);
	drp = VTOR(dvp);
	/* Serialize modifications to the parent directory. */
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
		return (EINTR);

	/*
	 * Attempt to prevent a rmdir(".") from succeeding.
	 */
	error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
	if (error) {
		nfs_rw_exit(&drp->r_rwlock);
		return (error);
	}

	if (vp == cdir) {
		VN_RELE(vp);
		nfs_rw_exit(&drp->r_rwlock);
		return (EINVAL);
	}

	setdiropargs(&da, nm, dvp);

	/*
	 * First just remove the entry from the name cache, as it
	 * is most likely an entry for this vp.
	 */
	dnlc_remove(dvp, nm);

	/*
	 * If the vnode reference count is greater than one, then
	 * there may be additional references in the DNLC which will
	 * need to be purged.  First, try removing the entry for
	 * the parent directory and see if that removes the additional
	 * reference(s).  If that doesn't do it, then use dnlc_purge_vp
	 * to completely remove any references to the directory which
	 * might still exist in the DNLC.
	 */
	if (vp->v_count > 1) {
		dnlc_remove(vp, "..");
		if (vp->v_count > 1)
			dnlc_purge_vp(vp);
	}

	douprintf = 1;

	error = rfs2call(VTOMI(dvp), RFS_RMDIR,
	    xdr_diropargs, (caddr_t)&da,
	    xdr_enum, (caddr_t)&status, cr,
	    &douprintf, &status, 0, NULL);

	PURGE_ATTRCACHE(dvp);	/* mod time changed */

	if (error) {
		/* RPC-level failure: release the hold and bail out. */
		VN_RELE(vp);
		nfs_rw_exit(&drp->r_rwlock);
		return (error);
	}

	error = geterrno(status);
	if (!error) {
		if (HAVE_RDDIR_CACHE(drp))
			nfs_purge_rddir_cache(dvp);
		if (HAVE_RDDIR_CACHE(VTOR(vp)))
			nfs_purge_rddir_cache(vp);
	} else {
		PURGE_STALE_FH(error, dvp, cr);
		/*
		 * System V defines rmdir to return EEXIST, not
		 * ENOTEMPTY if the directory is not empty.  Over
		 * the wire, the error is NFSERR_ENOTEMPTY which
		 * geterrno maps to ENOTEMPTY.
		 */
		if (error == ENOTEMPTY)
			error = EEXIST;
	}

	if (error == 0) {
		vnevent_rmdir(vp, dvp, nm, ct);
	}
	VN_RELE(vp);

	nfs_rw_exit(&drp->r_rwlock);

	return (error);
}
2869 
2870 /* ARGSUSED */
2871 static int
2872 nfs_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
2873 	caller_context_t *ct, int flags)
2874 {
2875 	int error;
2876 	struct nfsslargs args;
2877 	enum nfsstat status;
2878 	int douprintf;
2879 	rnode_t *drp;
2880 
2881 	if (nfs_zone() != VTOMI(dvp)->mi_zone)
2882 		return (EPERM);
2883 	setdiropargs(&args.sla_from, lnm, dvp);
2884 	args.sla_sa = &args.sla_sa_buf;
2885 	error = vattr_to_sattr(tva, args.sla_sa);
2886 	if (error) {
2887 		/* req time field(s) overflow - return immediately */
2888 		return (error);
2889 	}
2890 	args.sla_tnm = tnm;
2891 
2892 	drp = VTOR(dvp);
2893 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2894 		return (EINTR);
2895 
2896 	dnlc_remove(dvp, lnm);
2897 
2898 	douprintf = 1;
2899 
2900 	error = rfs2call(VTOMI(dvp), RFS_SYMLINK,
2901 	    xdr_slargs, (caddr_t)&args,
2902 	    xdr_enum, (caddr_t)&status, cr,
2903 	    &douprintf, &status, 0, NULL);
2904 
2905 	PURGE_ATTRCACHE(dvp);	/* mod time changed */
2906 
2907 	if (!error) {
2908 		error = geterrno(status);
2909 		if (!error) {
2910 			if (HAVE_RDDIR_CACHE(drp))
2911 				nfs_purge_rddir_cache(dvp);
2912 		} else {
2913 			PURGE_STALE_FH(error, dvp, cr);
2914 		}
2915 	}
2916 
2917 	nfs_rw_exit(&drp->r_rwlock);
2918 
2919 	return (error);
2920 }
2921 
#ifdef DEBUG
/*
 * Readdir cache statistics, maintained on DEBUG kernels only and
 * inspectable with a kernel debugger.
 */
static int nfs_readdir_cache_hits = 0;	/* entry found, already filled */
static int nfs_readdir_cache_shorts = 0; /* EOF short-circuit taken */
static int nfs_readdir_cache_waits = 0;	/* waited for another filler */
static int nfs_readdir_cache_misses = 0; /* had to issue a readdir */
static int nfs_readdir_readahead = 0;	/* async readahead started */
#endif

/*
 * Tunable (settable via /etc/system): limit RFS_READDIR requests to
 * 0x400 bytes for very old servers; see the comment in nfs_readdir().
 */
static int nfs_shrinkreaddir = 0;
2931 
2932 /*
2933  * Read directory entries.
2934  * There are some weird things to look out for here.  The uio_offset
2935  * field is either 0 or it is the offset returned from a previous
2936  * readdir.  It is an opaque value used by the server to find the
2937  * correct directory block to read. The count field is the number
2938  * of blocks to read on the server.  This is advisory only, the server
2939  * may return only one block's worth of entries.  Entries may be compressed
2940  * on the server.
2941  */
2942 /* ARGSUSED */
2943 static int
2944 nfs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
2945 	caller_context_t *ct, int flags)
2946 {
2947 	int error;
2948 	size_t count;
2949 	rnode_t *rp;
2950 	rddir_cache *rdc;
2951 	rddir_cache *nrdc;
2952 	rddir_cache *rrdc;
2953 #ifdef DEBUG
2954 	int missed;
2955 #endif
2956 	rddir_cache srdc;
2957 	avl_index_t where;
2958 
2959 	rp = VTOR(vp);
2960 
2961 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2962 	if (nfs_zone() != VTOMI(vp)->mi_zone)
2963 		return (EIO);
2964 	/*
2965 	 * Make sure that the directory cache is valid.
2966 	 */
2967 	if (HAVE_RDDIR_CACHE(rp)) {
2968 		if (nfs_disable_rddir_cache) {
2969 			/*
2970 			 * Setting nfs_disable_rddir_cache in /etc/system
2971 			 * allows interoperability with servers that do not
2972 			 * properly update the attributes of directories.
2973 			 * Any cached information gets purged before an
2974 			 * access is made to it.
2975 			 */
2976 			nfs_purge_rddir_cache(vp);
2977 		} else {
2978 			error = nfs_validate_caches(vp, cr);
2979 			if (error)
2980 				return (error);
2981 		}
2982 	}
2983 
2984 	/*
2985 	 * UGLINESS: SunOS 3.2 servers apparently cannot always handle an
2986 	 * RFS_READDIR request with rda_count set to more than 0x400. So
2987 	 * we reduce the request size here purely for compatibility.
2988 	 *
2989 	 * In general, this is no longer required.  However, if a server
2990 	 * is discovered which can not handle requests larger than 1024,
2991 	 * nfs_shrinkreaddir can be set to 1 to enable this backwards
2992 	 * compatibility.
2993 	 *
2994 	 * In any case, the request size is limited to NFS_MAXDATA bytes.
2995 	 */
2996 	count = MIN(uiop->uio_iov->iov_len,
2997 	    nfs_shrinkreaddir ? 0x400 : NFS_MAXDATA);
2998 
2999 	nrdc = NULL;
3000 #ifdef DEBUG
3001 	missed = 0;
3002 #endif
3003 top:
3004 	/*
3005 	 * Short circuit last readdir which always returns 0 bytes.
3006 	 * This can be done after the directory has been read through
3007 	 * completely at least once.  This will set r_direof which
3008 	 * can be used to find the value of the last cookie.
3009 	 */
3010 	mutex_enter(&rp->r_statelock);
3011 	if (rp->r_direof != NULL &&
3012 	    uiop->uio_offset == rp->r_direof->nfs_ncookie) {
3013 		mutex_exit(&rp->r_statelock);
3014 #ifdef DEBUG
3015 		nfs_readdir_cache_shorts++;
3016 #endif
3017 		if (eofp)
3018 			*eofp = 1;
3019 		if (nrdc != NULL)
3020 			rddir_cache_rele(nrdc);
3021 		return (0);
3022 	}
3023 	/*
3024 	 * Look for a cache entry.  Cache entries are identified
3025 	 * by the NFS cookie value and the byte count requested.
3026 	 */
3027 	srdc.nfs_cookie = uiop->uio_offset;
3028 	srdc.buflen = count;
3029 	rdc = avl_find(&rp->r_dir, &srdc, &where);
3030 	if (rdc != NULL) {
3031 		rddir_cache_hold(rdc);
3032 		/*
3033 		 * If the cache entry is in the process of being
3034 		 * filled in, wait until this completes.  The
3035 		 * RDDIRWAIT bit is set to indicate that someone
3036 		 * is waiting and then the thread currently
3037 		 * filling the entry is done, it should do a
3038 		 * cv_broadcast to wakeup all of the threads
3039 		 * waiting for it to finish.
3040 		 */
3041 		if (rdc->flags & RDDIR) {
3042 			nfs_rw_exit(&rp->r_rwlock);
3043 			rdc->flags |= RDDIRWAIT;
3044 #ifdef DEBUG
3045 			nfs_readdir_cache_waits++;
3046 #endif
3047 			if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
3048 				/*
3049 				 * We got interrupted, probably
3050 				 * the user typed ^C or an alarm
3051 				 * fired.  We free the new entry
3052 				 * if we allocated one.
3053 				 */
3054 				mutex_exit(&rp->r_statelock);
3055 				(void) nfs_rw_enter_sig(&rp->r_rwlock,
3056 				    RW_READER, FALSE);
3057 				rddir_cache_rele(rdc);
3058 				if (nrdc != NULL)
3059 					rddir_cache_rele(nrdc);
3060 				return (EINTR);
3061 			}
3062 			mutex_exit(&rp->r_statelock);
3063 			(void) nfs_rw_enter_sig(&rp->r_rwlock,
3064 			    RW_READER, FALSE);
3065 			rddir_cache_rele(rdc);
3066 			goto top;
3067 		}
3068 		/*
3069 		 * Check to see if a readdir is required to
3070 		 * fill the entry.  If so, mark this entry
3071 		 * as being filled, remove our reference,
3072 		 * and branch to the code to fill the entry.
3073 		 */
3074 		if (rdc->flags & RDDIRREQ) {
3075 			rdc->flags &= ~RDDIRREQ;
3076 			rdc->flags |= RDDIR;
3077 			if (nrdc != NULL)
3078 				rddir_cache_rele(nrdc);
3079 			nrdc = rdc;
3080 			mutex_exit(&rp->r_statelock);
3081 			goto bottom;
3082 		}
3083 #ifdef DEBUG
3084 		if (!missed)
3085 			nfs_readdir_cache_hits++;
3086 #endif
3087 		/*
3088 		 * If an error occurred while attempting
3089 		 * to fill the cache entry, just return it.
3090 		 */
3091 		if (rdc->error) {
3092 			error = rdc->error;
3093 			mutex_exit(&rp->r_statelock);
3094 			rddir_cache_rele(rdc);
3095 			if (nrdc != NULL)
3096 				rddir_cache_rele(nrdc);
3097 			return (error);
3098 		}
3099 
3100 		/*
3101 		 * The cache entry is complete and good,
3102 		 * copyout the dirent structs to the calling
3103 		 * thread.
3104 		 */
3105 		error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop);
3106 
3107 		/*
3108 		 * If no error occurred during the copyout,
3109 		 * update the offset in the uio struct to
3110 		 * contain the value of the next cookie
3111 		 * and set the eof value appropriately.
3112 		 */
3113 		if (!error) {
3114 			uiop->uio_offset = rdc->nfs_ncookie;
3115 			if (eofp)
3116 				*eofp = rdc->eof;
3117 		}
3118 
3119 		/*
3120 		 * Decide whether to do readahead.  Don't if
3121 		 * have already read to the end of directory.
3122 		 */
3123 		if (rdc->eof) {
3124 			rp->r_direof = rdc;
3125 			mutex_exit(&rp->r_statelock);
3126 			rddir_cache_rele(rdc);
3127 			if (nrdc != NULL)
3128 				rddir_cache_rele(nrdc);
3129 			return (error);
3130 		}
3131 
3132 		/*
3133 		 * Check to see whether we found an entry
3134 		 * for the readahead.  If so, we don't need
3135 		 * to do anything further, so free the new
3136 		 * entry if one was allocated.  Otherwise,
3137 		 * allocate a new entry, add it to the cache,
3138 		 * and then initiate an asynchronous readdir
3139 		 * operation to fill it.
3140 		 */
3141 		srdc.nfs_cookie = rdc->nfs_ncookie;
3142 		srdc.buflen = count;
3143 		rrdc = avl_find(&rp->r_dir, &srdc, &where);
3144 		if (rrdc != NULL) {
3145 			if (nrdc != NULL)
3146 				rddir_cache_rele(nrdc);
3147 		} else {
3148 			if (nrdc != NULL)
3149 				rrdc = nrdc;
3150 			else {
3151 				rrdc = rddir_cache_alloc(KM_NOSLEEP);
3152 			}
3153 			if (rrdc != NULL) {
3154 				rrdc->nfs_cookie = rdc->nfs_ncookie;
3155 				rrdc->buflen = count;
3156 				avl_insert(&rp->r_dir, rrdc, where);
3157 				rddir_cache_hold(rrdc);
3158 				mutex_exit(&rp->r_statelock);
3159 				rddir_cache_rele(rdc);
3160 #ifdef DEBUG
3161 				nfs_readdir_readahead++;
3162 #endif
3163 				nfs_async_readdir(vp, rrdc, cr, nfsreaddir);
3164 				return (error);
3165 			}
3166 		}
3167 
3168 		mutex_exit(&rp->r_statelock);
3169 		rddir_cache_rele(rdc);
3170 		return (error);
3171 	}
3172 
3173 	/*
3174 	 * Didn't find an entry in the cache.  Construct a new empty
3175 	 * entry and link it into the cache.  Other processes attempting
3176 	 * to access this entry will need to wait until it is filled in.
3177 	 *
3178 	 * Since kmem_alloc may block, another pass through the cache
3179 	 * will need to be taken to make sure that another process
3180 	 * hasn't already added an entry to the cache for this request.
3181 	 */
3182 	if (nrdc == NULL) {
3183 		mutex_exit(&rp->r_statelock);
3184 		nrdc = rddir_cache_alloc(KM_SLEEP);
3185 		nrdc->nfs_cookie = uiop->uio_offset;
3186 		nrdc->buflen = count;
3187 		goto top;
3188 	}
3189 
3190 	/*
3191 	 * Add this entry to the cache.
3192 	 */
3193 	avl_insert(&rp->r_dir, nrdc, where);
3194 	rddir_cache_hold(nrdc);
3195 	mutex_exit(&rp->r_statelock);
3196 
3197 bottom:
3198 #ifdef DEBUG
3199 	missed = 1;
3200 	nfs_readdir_cache_misses++;
3201 #endif
3202 	/*
3203 	 * Do the readdir.
3204 	 */
3205 	error = nfsreaddir(vp, nrdc, cr);
3206 
3207 	/*
3208 	 * If this operation failed, just return the error which occurred.
3209 	 */
3210 	if (error != 0)
3211 		return (error);
3212 
3213 	/*
3214 	 * Since the RPC operation will have taken sometime and blocked
3215 	 * this process, another pass through the cache will need to be
3216 	 * taken to find the correct cache entry.  It is possible that
3217 	 * the correct cache entry will not be there (although one was
3218 	 * added) because the directory changed during the RPC operation
3219 	 * and the readdir cache was flushed.  In this case, just start
3220 	 * over.  It is hoped that this will not happen too often... :-)
3221 	 */
3222 	nrdc = NULL;
3223 	goto top;
3224 	/* NOTREACHED */
3225 }
3226 
/*
 * Issue an over-the-wire READDIR RPC to fill in the readdir cache
 * entry rdc.  The caller owns the in-progress (RDDIR) reference, so
 * every field of rdc except flags may be updated without locking;
 * the flags manipulation at the end is done under r_statelock.
 *
 * On success rdc->entries holds the returned dirents and the
 * nfs_ncookie/eof/entlen fields are filled in.  On failure rdc->error
 * is set, entries stays NULL, and RDDIRREQ is set so the entry will
 * be re-requested.  Waiters (RDDIRWAIT) are woken and the hold on
 * rdc is released in all cases.
 */
static int
nfsreaddir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
{
	int error;
	struct nfsrddirargs rda;
	struct nfsrddirres rd;
	rnode_t *rp;
	mntinfo_t *mi;
	uint_t count;
	int douprintf;
	failinfo_t fi, *fip;

	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
	count = rdc->buflen;

	rp = VTOR(vp);
	mi = VTOMI(vp);

	/* READDIR args: directory file handle plus resume cookie. */
	rda.rda_fh = *VTOFH(vp);
	rda.rda_offset = rdc->nfs_cookie;

	/*
	 * NFS client failover support
	 * suppress failover unless we have a zero cookie
	 */
	if (rdc->nfs_cookie == (off_t)0) {
		fi.vp = vp;
		fi.fhp = (caddr_t)&rda.rda_fh;
		fi.copyproc = nfscopyfh;
		fi.lookupproc = nfslookup;
		fi.xattrdirproc = acl_getxattrdir2;
		fip = &fi;
	} else {
		fip = NULL;
	}

	/* Scratch buffer for the reply; copied into rdc on success. */
	rd.rd_entries = kmem_alloc(rdc->buflen, KM_SLEEP);
	rd.rd_size = count;
	rd.rd_offset = rda.rda_offset;

	douprintf = 1;

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	/*
	 * Clamp each request to the current read transfer size and
	 * simply reissue it whenever the RPC layer asks us to retry
	 * (ENFS_TRYAGAIN, e.g. during failover).
	 */
	do {
		rda.rda_count = MIN(count, mi->mi_curread);
		error = rfs2call(mi, RFS_READDIR,
		    xdr_rddirargs, (caddr_t)&rda,
		    xdr_getrddirres, (caddr_t)&rd, cr,
		    &douprintf, &rd.rd_status, 0, fip);
	} while (error == ENFS_TRYAGAIN);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	/*
	 * Since we are actually doing a READDIR RPC, we must have
	 * exclusive access to the cache entry being filled.  Thus,
	 * it is safe to update all fields except for the flags
	 * field.  The r_statelock in the rnode must be held to
	 * prevent two different threads from simultaneously
	 * attempting to update the flags field.  This can happen
	 * if we are turning off RDDIR and the other thread is
	 * trying to set RDDIRWAIT.
	 */
	ASSERT(rdc->flags & RDDIR);
	if (!error) {
		error = geterrno(rd.rd_status);
		if (!error) {
			rdc->nfs_ncookie = rd.rd_offset;
			rdc->eof = rd.rd_eof ? 1 : 0;
			rdc->entlen = rd.rd_size;
			ASSERT(rdc->entlen <= rdc->buflen);
#ifdef DEBUG
			rdc->entries = rddir_cache_buf_alloc(rdc->buflen,
			    KM_SLEEP);
#else
			rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
#endif
			bcopy(rd.rd_entries, rdc->entries, rdc->entlen);
			rdc->error = 0;
			if (mi->mi_io_kstats) {
				mutex_enter(&mi->mi_lock);
				KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
				KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
				    rd.rd_size;
				mutex_exit(&mi->mi_lock);
			}
		} else {
			PURGE_STALE_FH(error, vp, cr);
		}
	}
	if (error) {
		rdc->entries = NULL;
		rdc->error = error;
	}
	/* Always free the scratch buffer; good data was copied above. */
	kmem_free(rd.rd_entries, rdc->buflen);

	/*
	 * Clear the in-progress flag, wake any waiters, and on error
	 * mark the entry so the readdir will be requested again.
	 */
	mutex_enter(&rp->r_statelock);
	rdc->flags &= ~RDDIR;
	if (rdc->flags & RDDIRWAIT) {
		rdc->flags &= ~RDDIRWAIT;
		cv_broadcast(&rdc->cv);
	}
	if (error)
		rdc->flags |= RDDIRREQ;
	mutex_exit(&rp->r_statelock);

	rddir_cache_rele(rdc);

	return (error);
}
3346 
#ifdef DEBUG
static int nfs_bio_do_stop = 0;	/* enter the debugger on zero-length writes */
#endif

/*
 * Perform the I/O described by bp via nfsread()/nfswrite().
 *
 * Credential handling: prefer the credentials cached on the rnode
 * (r_cred); if those are rejected with EACCES, retry once with the
 * caller's credentials and cache them instead.
 *
 * Read semantics: a short read has the remainder of the buffer
 * zero-filled; a read that transfers nothing because it starts at or
 * beyond r_size returns the private NFS_EOF indication (the pages
 * are not destroyed here).
 *
 * Write semantics: the transfer is clamped to r_size.  Most errors
 * are reported via nfs_write_error() (EDQUOT, EFBIG and async EACCES
 * are suppressed); ESTALE marks the rnode RSTALE, and errors from
 * asynchronous page invalidations are saved in r_error.  If the
 * rnode is already stale, fail immediately with the saved error
 * (or ESTALE).
 */
static int
nfs_bio(struct buf *bp, cred_t *cr)
{
	rnode_t *rp = VTOR(bp->b_vp);
	int count;
	int error;
	cred_t *cred;
	uint_t offset;

	DTRACE_IO1(start, struct buf *, bp);

	ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone);
	offset = dbtob(bp->b_blkno);

	if (bp->b_flags & B_READ) {
		mutex_enter(&rp->r_statelock);
		if (rp->r_cred != NULL) {
			cred = rp->r_cred;
			crhold(cred);
		} else {
			/*
			 * Nothing cached yet: cache the caller's
			 * credentials (one hold for the rnode, one
			 * for our local reference).
			 */
			rp->r_cred = cr;
			crhold(cr);
			cred = cr;
			crhold(cred);
		}
		mutex_exit(&rp->r_statelock);
	read_again:
		error = bp->b_error = nfsread(bp->b_vp, bp->b_un.b_addr,
		    offset, bp->b_bcount, &bp->b_resid, cred);

		crfree(cred);
		if (!error) {
			if (bp->b_resid) {
				/*
				 * Didn't get it all because we hit EOF,
				 * zero all the memory beyond the EOF.
				 */
				/* bzero(rdaddr + */
				bzero(bp->b_un.b_addr +
				    bp->b_bcount - bp->b_resid, bp->b_resid);
			}
			mutex_enter(&rp->r_statelock);
			if (bp->b_resid == bp->b_bcount &&
			    offset >= rp->r_size) {
				/*
				 * We didn't read anything at all as we are
				 * past EOF.  Return an error indicator back
				 * but don't destroy the pages (yet).
				 */
				error = NFS_EOF;
			}
			mutex_exit(&rp->r_statelock);
		} else if (error == EACCES) {
			/*
			 * The cached credentials were rejected; retry
			 * once with the caller's own credentials and
			 * cache those for next time.
			 */
			mutex_enter(&rp->r_statelock);
			if (cred != cr) {
				if (rp->r_cred != NULL)
					crfree(rp->r_cred);
				rp->r_cred = cr;
				crhold(cr);
				cred = cr;
				crhold(cred);
				mutex_exit(&rp->r_statelock);
				goto read_again;
			}
			mutex_exit(&rp->r_statelock);
		}
	} else {
		if (!(rp->r_flags & RSTALE)) {
			mutex_enter(&rp->r_statelock);
			if (rp->r_cred != NULL) {
				cred = rp->r_cred;
				crhold(cred);
			} else {
				rp->r_cred = cr;
				crhold(cr);
				cred = cr;
				crhold(cred);
			}
			mutex_exit(&rp->r_statelock);
		write_again:
			/* Never write past the current file size. */
			mutex_enter(&rp->r_statelock);
			count = MIN(bp->b_bcount, rp->r_size - offset);
			mutex_exit(&rp->r_statelock);
			if (count < 0)
				cmn_err(CE_PANIC, "nfs_bio: write count < 0");
#ifdef DEBUG
			if (count == 0) {
				zcmn_err(getzoneid(), CE_WARN,
				    "nfs_bio: zero length write at %d",
				    offset);
				nfs_printfhandle(&rp->r_fh);
				if (nfs_bio_do_stop)
					debug_enter("nfs_bio");
			}
#endif
			error = nfswrite(bp->b_vp, bp->b_un.b_addr, offset,
			    count, cred);
			if (error == EACCES) {
				/*
				 * As on the read side: retry once with
				 * the caller's credentials.
				 */
				mutex_enter(&rp->r_statelock);
				if (cred != cr) {
					if (rp->r_cred != NULL)
						crfree(rp->r_cred);
					rp->r_cred = cr;
					crhold(cr);
					crfree(cred);
					cred = cr;
					crhold(cred);
					mutex_exit(&rp->r_statelock);
					goto write_again;
				}
				mutex_exit(&rp->r_statelock);
			}
			bp->b_error = error;
			if (error && error != EINTR) {
				/*
				 * Don't print EDQUOT errors on the console.
				 * Don't print asynchronous EACCES errors.
				 * Don't print EFBIG errors.
				 * Print all other write errors.
				 */
				if (error != EDQUOT && error != EFBIG &&
				    (error != EACCES ||
				    !(bp->b_flags & B_ASYNC)))
					nfs_write_error(bp->b_vp, error, cred);
				/*
				 * Update r_error and r_flags as appropriate.
				 * If the error was ESTALE, then mark the
				 * rnode as not being writeable and save
				 * the error status.  Otherwise, save any
				 * errors which occur from asynchronous
				 * page invalidations.  Any errors occurring
				 * from other operations should be saved
				 * by the caller.
				 */
				mutex_enter(&rp->r_statelock);
				if (error == ESTALE) {
					rp->r_flags |= RSTALE;
					if (!rp->r_error)
						rp->r_error = error;
				} else if (!rp->r_error &&
				    (bp->b_flags &
				    (B_INVAL|B_FORCE|B_ASYNC)) ==
				    (B_INVAL|B_FORCE|B_ASYNC)) {
					rp->r_error = error;
				}
				mutex_exit(&rp->r_statelock);
			}
			crfree(cred);
		} else {
			error = rp->r_error;
			/*
			 * A close may have cleared r_error, if so,
			 * propagate ESTALE error return properly
			 */
			if (error == 0)
				error = ESTALE;
		}
	}

	/* NFS_EOF is a private indication, not a real I/O error. */
	if (error != 0 && error != NFS_EOF)
		bp->b_flags |= B_ERROR;

	DTRACE_IO1(done, struct buf *, bp);

	return (error);
}
3517 
3518 /* ARGSUSED */
3519 static int
3520 nfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
3521 {
3522 	struct nfs_fid *fp;
3523 	rnode_t *rp;
3524 
3525 	rp = VTOR(vp);
3526 
3527 	if (fidp->fid_len < (sizeof (struct nfs_fid) - sizeof (short))) {
3528 		fidp->fid_len = sizeof (struct nfs_fid) - sizeof (short);
3529 		return (ENOSPC);
3530 	}
3531 	fp = (struct nfs_fid *)fidp;
3532 	fp->nf_pad = 0;
3533 	fp->nf_len = sizeof (struct nfs_fid) - sizeof (short);
3534 	bcopy(rp->r_fh.fh_buf, fp->nf_data, NFS_FHSIZE);
3535 	return (0);
3536 }
3537 
3538 /* ARGSUSED2 */
3539 static int
3540 nfs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3541 {
3542 	rnode_t *rp = VTOR(vp);
3543 
3544 	if (!write_lock) {
3545 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3546 		return (V_WRITELOCK_FALSE);
3547 	}
3548 
3549 	if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) {
3550 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3551 		if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp))
3552 			return (V_WRITELOCK_FALSE);
3553 		nfs_rw_exit(&rp->r_rwlock);
3554 	}
3555 
3556 	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
3557 	return (V_WRITELOCK_TRUE);
3558 }
3559 
3560 /* ARGSUSED */
3561 static void
3562 nfs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3563 {
3564 	rnode_t *rp = VTOR(vp);
3565 
3566 	nfs_rw_exit(&rp->r_rwlock);
3567 }
3568 
3569 /* ARGSUSED */
3570 static int
3571 nfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
3572 {
3573 
3574 	/*
3575 	 * Because we stuff the readdir cookie into the offset field
3576 	 * someone may attempt to do an lseek with the cookie which
3577 	 * we want to succeed.
3578 	 */
3579 	if (vp->v_type == VDIR)
3580 		return (0);
3581 	if (*noffp < 0 || *noffp > MAXOFF32_T)
3582 		return (EINVAL);
3583 	return (0);
3584 }
3585 
3586 /*
3587  * number of NFS_MAXDATA blocks to read ahead
3588  * optimized for 100 base-T.
3589  */
3590 static int nfs_nra = 4;
3591 
3592 #ifdef DEBUG
3593 static int nfs_lostpage = 0;	/* number of times we lost original page */
3594 #endif
3595 
3596 /*
3597  * Return all the pages from [off..off+len) in file
3598  */
3599 /* ARGSUSED */
3600 static int
3601 nfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
3602 	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
3603 	enum seg_rw rw, cred_t *cr, caller_context_t *ct)
3604 {
3605 	rnode_t *rp;
3606 	int error;
3607 	mntinfo_t *mi;
3608 
3609 	if (vp->v_flag & VNOMAP)
3610 		return (ENOSYS);
3611 
3612 	ASSERT(off <= MAXOFF32_T);
3613 	if (nfs_zone() != VTOMI(vp)->mi_zone)
3614 		return (EIO);
3615 	if (protp != NULL)
3616 		*protp = PROT_ALL;
3617 
3618 	/*
3619 	 * Now valididate that the caches are up to date.
3620 	 */
3621 	error = nfs_validate_caches(vp, cr);
3622 	if (error)
3623 		return (error);
3624 
3625 	rp = VTOR(vp);
3626 	mi = VTOMI(vp);
3627 retry:
3628 	mutex_enter(&rp->r_statelock);
3629 
3630 	/*
3631 	 * Don't create dirty pages faster than they
3632 	 * can be cleaned so that the system doesn't
3633 	 * get imbalanced.  If the async queue is
3634 	 * maxed out, then wait for it to drain before
3635 	 * creating more dirty pages.  Also, wait for
3636 	 * any threads doing pagewalks in the vop_getattr
3637 	 * entry points so that they don't block for
3638 	 * long periods.
3639 	 */
3640 	if (rw == S_CREATE) {
3641 		while ((mi->mi_max_threads != 0 &&
3642 		    rp->r_awcount > 2 * mi->mi_max_threads) ||
3643 		    rp->r_gcount > 0)
3644 			cv_wait(&rp->r_cv, &rp->r_statelock);
3645 	}
3646 
3647 	/*
3648 	 * If we are getting called as a side effect of an nfs_write()
3649 	 * operation the local file size might not be extended yet.
3650 	 * In this case we want to be able to return pages of zeroes.
3651 	 */
3652 	if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
3653 		mutex_exit(&rp->r_statelock);
3654 		return (EFAULT);		/* beyond EOF */
3655 	}
3656 
3657 	mutex_exit(&rp->r_statelock);
3658 
3659 	if (len <= PAGESIZE) {
3660 		error = nfs_getapage(vp, off, len, protp, pl, plsz,
3661 		    seg, addr, rw, cr);
3662 	} else {
3663 		error = pvn_getpages(nfs_getapage, vp, off, len, protp,
3664 		    pl, plsz, seg, addr, rw, cr);
3665 	}
3666 
3667 	switch (error) {
3668 	case NFS_EOF:
3669 		nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
3670 		goto retry;
3671 	case ESTALE:
3672 		PURGE_STALE_FH(error, vp, cr);
3673 	}
3674 
3675 	return (error);
3676 }
3677 
3678 /*
3679  * Called from pvn_getpages or nfs_getpage to get a particular page.
3680  */
3681 /* ARGSUSED */
3682 static int
3683 nfs_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
3684 	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
3685 	enum seg_rw rw, cred_t *cr)
3686 {
3687 	rnode_t *rp;
3688 	uint_t bsize;
3689 	struct buf *bp;
3690 	page_t *pp;
3691 	u_offset_t lbn;
3692 	u_offset_t io_off;
3693 	u_offset_t blkoff;
3694 	u_offset_t rablkoff;
3695 	size_t io_len;
3696 	uint_t blksize;
3697 	int error;
3698 	int readahead;
3699 	int readahead_issued = 0;
3700 	int ra_window; /* readahead window */
3701 	page_t *pagefound;
3702 
3703 	if (nfs_zone() != VTOMI(vp)->mi_zone)
3704 		return (EIO);
3705 	rp = VTOR(vp);
3706 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
3707 
3708 reread:
3709 	bp = NULL;
3710 	pp = NULL;
3711 	pagefound = NULL;
3712 
3713 	if (pl != NULL)
3714 		pl[0] = NULL;
3715 
3716 	error = 0;
3717 	lbn = off / bsize;
3718 	blkoff = lbn * bsize;
3719 
3720 	/*
3721 	 * Queueing up the readahead before doing the synchronous read
3722 	 * results in a significant increase in read throughput because
3723 	 * of the increased parallelism between the async threads and
3724 	 * the process context.
3725 	 */
3726 	if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
3727 	    rw != S_CREATE &&
3728 	    !(vp->v_flag & VNOCACHE)) {
3729 		mutex_enter(&rp->r_statelock);
3730 
3731 		/*
3732 		 * Calculate the number of readaheads to do.
3733 		 * a) No readaheads at offset = 0.
3734 		 * b) Do maximum(nfs_nra) readaheads when the readahead
3735 		 *    window is closed.
3736 		 * c) Do readaheads between 1 to (nfs_nra - 1) depending
3737 		 *    upon how far the readahead window is open or close.
3738 		 * d) No readaheads if rp->r_nextr is not within the scope
3739 		 *    of the readahead window (random i/o).
3740 		 */
3741 
3742 		if (off == 0)
3743 			readahead = 0;
3744 		else if (blkoff == rp->r_nextr)
3745 			readahead = nfs_nra;
3746 		else if (rp->r_nextr > blkoff &&
3747 		    ((ra_window = (rp->r_nextr - blkoff) / bsize)
3748 		    <= (nfs_nra - 1)))
3749 			readahead = nfs_nra - ra_window;
3750 		else
3751 			readahead = 0;
3752 
3753 		rablkoff = rp->r_nextr;
3754 		while (readahead > 0 && rablkoff + bsize < rp->r_size) {
3755 			mutex_exit(&rp->r_statelock);
3756 			if (nfs_async_readahead(vp, rablkoff + bsize,
3757 			    addr + (rablkoff + bsize - off), seg, cr,
3758 			    nfs_readahead) < 0) {
3759 				mutex_enter(&rp->r_statelock);
3760 				break;
3761 			}
3762 			readahead--;
3763 			rablkoff += bsize;
3764 			/*
3765 			 * Indicate that we did a readahead so
3766 			 * readahead offset is not updated
3767 			 * by the synchronous read below.
3768 			 */
3769 			readahead_issued = 1;
3770 			mutex_enter(&rp->r_statelock);
3771 			/*
3772 			 * set readahead offset to
3773 			 * offset of last async readahead
3774 			 * request.
3775 			 */
3776 			rp->r_nextr = rablkoff;
3777 		}
3778 		mutex_exit(&rp->r_statelock);
3779 	}
3780 
3781 again:
3782 	if ((pagefound = page_exists(vp, off)) == NULL) {
3783 		if (pl == NULL) {
3784 			(void) nfs_async_readahead(vp, blkoff, addr, seg, cr,
3785 			    nfs_readahead);
3786 		} else if (rw == S_CREATE) {
3787 			/*
3788 			 * Block for this page is not allocated, or the offset
3789 			 * is beyond the current allocation size, or we're
3790 			 * allocating a swap slot and the page was not found,
3791 			 * so allocate it and return a zero page.
3792 			 */
3793 			if ((pp = page_create_va(vp, off,
3794 			    PAGESIZE, PG_WAIT, seg, addr)) == NULL)
3795 				cmn_err(CE_PANIC, "nfs_getapage: page_create");
3796 			io_len = PAGESIZE;
3797 			mutex_enter(&rp->r_statelock);
3798 			rp->r_nextr = off + PAGESIZE;
3799 			mutex_exit(&rp->r_statelock);
3800 		} else {
3801 			/*
3802 			 * Need to go to server to get a BLOCK, exception to
3803 			 * that being while reading at offset = 0 or doing
3804 			 * random i/o, in that case read only a PAGE.
3805 			 */
3806 			mutex_enter(&rp->r_statelock);
3807 			if (blkoff < rp->r_size &&
3808 			    blkoff + bsize >= rp->r_size) {
3809 				/*
3810 				 * If only a block or less is left in
3811 				 * the file, read all that is remaining.
3812 				 */
3813 				if (rp->r_size <= off) {
3814 					/*
3815 					 * Trying to access beyond EOF,
3816 					 * set up to get at least one page.
3817 					 */
3818 					blksize = off + PAGESIZE - blkoff;
3819 				} else
3820 					blksize = rp->r_size - blkoff;
3821 			} else if ((off == 0) ||
3822 			    (off != rp->r_nextr && !readahead_issued)) {
3823 				blksize = PAGESIZE;
3824 				blkoff = off; /* block = page here */
3825 			} else
3826 				blksize = bsize;
3827 			mutex_exit(&rp->r_statelock);
3828 
3829 			pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
3830 			    &io_len, blkoff, blksize, 0);
3831 
3832 			/*
3833 			 * Some other thread has entered the page,
3834 			 * so just use it.
3835 			 */
3836 			if (pp == NULL)
3837 				goto again;
3838 
3839 			/*
3840 			 * Now round the request size up to page boundaries.
3841 			 * This ensures that the entire page will be
3842 			 * initialized to zeroes if EOF is encountered.
3843 			 */
3844 			io_len = ptob(btopr(io_len));
3845 
3846 			bp = pageio_setup(pp, io_len, vp, B_READ);
3847 			ASSERT(bp != NULL);
3848 
3849 			/*
3850 			 * pageio_setup should have set b_addr to 0.  This
3851 			 * is correct since we want to do I/O on a page
3852 			 * boundary.  bp_mapin will use this addr to calculate
3853 			 * an offset, and then set b_addr to the kernel virtual
3854 			 * address it allocated for us.
3855 			 */
3856 			ASSERT(bp->b_un.b_addr == 0);
3857 
3858 			bp->b_edev = 0;
3859 			bp->b_dev = 0;
3860 			bp->b_lblkno = lbtodb(io_off);
3861 			bp->b_file = vp;
3862 			bp->b_offset = (offset_t)off;
3863 			bp_mapin(bp);
3864 
3865 			/*
3866 			 * If doing a write beyond what we believe is EOF,
3867 			 * don't bother trying to read the pages from the
3868 			 * server, we'll just zero the pages here.  We
3869 			 * don't check that the rw flag is S_WRITE here
3870 			 * because some implementations may attempt a
3871 			 * read access to the buffer before copying data.
3872 			 */
3873 			mutex_enter(&rp->r_statelock);
3874 			if (io_off >= rp->r_size && seg == segkmap) {
3875 				mutex_exit(&rp->r_statelock);
3876 				bzero(bp->b_un.b_addr, io_len);
3877 			} else {
3878 				mutex_exit(&rp->r_statelock);
3879 				error = nfs_bio(bp, cr);
3880 			}
3881 
3882 			/*
3883 			 * Unmap the buffer before freeing it.
3884 			 */
3885 			bp_mapout(bp);
3886 			pageio_done(bp);
3887 
3888 			if (error == NFS_EOF) {
3889 				/*
3890 				 * If doing a write system call just return
3891 				 * zeroed pages, else user tried to get pages
3892 				 * beyond EOF, return error.  We don't check
3893 				 * that the rw flag is S_WRITE here because
3894 				 * some implementations may attempt a read
3895 				 * access to the buffer before copying data.
3896 				 */
3897 				if (seg == segkmap)
3898 					error = 0;
3899 				else
3900 					error = EFAULT;
3901 			}
3902 
3903 			if (!readahead_issued && !error) {
3904 				mutex_enter(&rp->r_statelock);
3905 				rp->r_nextr = io_off + io_len;
3906 				mutex_exit(&rp->r_statelock);
3907 			}
3908 		}
3909 	}
3910 
3911 out:
3912 	if (pl == NULL)
3913 		return (error);
3914 
3915 	if (error) {
3916 		if (pp != NULL)
3917 			pvn_read_done(pp, B_ERROR);
3918 		return (error);
3919 	}
3920 
3921 	if (pagefound) {
3922 		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
3923 
3924 		/*
3925 		 * Page exists in the cache, acquire the appropriate lock.
3926 		 * If this fails, start all over again.
3927 		 */
3928 		if ((pp = page_lookup(vp, off, se)) == NULL) {
3929 #ifdef DEBUG
3930 			nfs_lostpage++;
3931 #endif
3932 			goto reread;
3933 		}
3934 		pl[0] = pp;
3935 		pl[1] = NULL;
3936 		return (0);
3937 	}
3938 
3939 	if (pp != NULL)
3940 		pvn_plist_init(pp, pl, plsz, off, io_len, rw);
3941 
3942 	return (error);
3943 }
3944 
/*
 * Asynchronous readahead of (up to) one block starting at blkoff,
 * invoked via nfs_async_readahead() from nfs_getapage().  Readahead
 * is best effort: if the kluster can't be built, the request is
 * silently dropped.  On error the pages are destroyed and r_nextr
 * is pulled back so the data will be read again on demand.
 */
static void
nfs_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
	cred_t *cr)
{
	int error;
	page_t *pp;
	u_offset_t io_off;
	size_t io_len;
	struct buf *bp;
	uint_t bsize, blksize;
	rnode_t *rp = VTOR(vp);

	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);

	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);

	mutex_enter(&rp->r_statelock);
	if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
		/*
		 * If less than a block left in file read less
		 * than a block.
		 */
		blksize = rp->r_size - blkoff;
	} else
		blksize = bsize;
	mutex_exit(&rp->r_statelock);

	pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
	    &io_off, &io_len, blkoff, blksize, 1);
	/*
	 * The isra flag passed to the kluster function is 1, we may have
	 * gotten a return value of NULL for a variety of reasons (# of free
	 * pages < minfree, someone entered the page on the vnode etc). In all
	 * cases, we want to punt on the readahead.
	 */
	if (pp == NULL)
		return;

	/*
	 * Now round the request size up to page boundaries.
	 * This ensures that the entire page will be
	 * initialized to zeroes if EOF is encountered.
	 */
	io_len = ptob(btopr(io_len));

	bp = pageio_setup(pp, io_len, vp, B_READ);
	ASSERT(bp != NULL);

	/*
	 * pageio_setup should have set b_addr to 0.  This is correct since
	 * we want to do I/O on a page boundary. bp_mapin() will use this addr
	 * to calculate an offset, and then set b_addr to the kernel virtual
	 * address it allocated for us.
	 */
	ASSERT(bp->b_un.b_addr == 0);

	bp->b_edev = 0;
	bp->b_dev = 0;
	bp->b_lblkno = lbtodb(io_off);
	bp->b_file = vp;
	bp->b_offset = (offset_t)blkoff;
	bp_mapin(bp);

	/*
	 * If doing a write beyond what we believe is EOF, don't bother trying
	 * to read the pages from the server, we'll just zero the pages here.
	 * We don't check that the rw flag is S_WRITE here because some
	 * implementations may attempt a read access to the buffer before
	 * copying data.
	 */
	mutex_enter(&rp->r_statelock);
	if (io_off >= rp->r_size && seg == segkmap) {
		mutex_exit(&rp->r_statelock);
		bzero(bp->b_un.b_addr, io_len);
		error = 0;
	} else {
		mutex_exit(&rp->r_statelock);
		error = nfs_bio(bp, cr);
		/* NFS_EOF just means a short/empty read; not an error here. */
		if (error == NFS_EOF)
			error = 0;
	}

	/*
	 * Unmap the buffer before freeing it.
	 */
	bp_mapout(bp);
	pageio_done(bp);

	pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);

	/*
	 * In case of error set readahead offset
	 * to the lowest offset.
	 * pvn_read_done() calls VN_DISPOSE to destroy the pages.
	 * (The unlocked peek at r_nextr is re-checked under the lock.)
	 */
	if (error && rp->r_nextr > io_off) {
		mutex_enter(&rp->r_statelock);
		if (rp->r_nextr > io_off)
			rp->r_nextr = io_off;
		mutex_exit(&rp->r_statelock);
	}
}
4047 
4048 /*
4049  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
4050  * If len == 0, do from off to EOF.
4051  *
4052  * The normal cases should be len == 0 && off == 0 (entire vp list),
4053  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
4054  * (from pageout).
4055  */
4056 /* ARGSUSED */
4057 static int
4058 nfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4059 	caller_context_t *ct)
4060 {
4061 	int error;
4062 	rnode_t *rp;
4063 
4064 	ASSERT(cr != NULL);
4065 
4066 	/*
4067 	 * XXX - Why should this check be made here?
4068 	 */
4069 	if (vp->v_flag & VNOMAP)
4070 		return (ENOSYS);
4071 
4072 	if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp))
4073 		return (0);
4074 
4075 	if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
4076 		return (EIO);
4077 	ASSERT(off <= MAXOFF32_T);
4078 
4079 	rp = VTOR(vp);
4080 	mutex_enter(&rp->r_statelock);
4081 	rp->r_count++;
4082 	mutex_exit(&rp->r_statelock);
4083 	error = nfs_putpages(vp, off, len, flags, cr);
4084 	mutex_enter(&rp->r_statelock);
4085 	rp->r_count--;
4086 	cv_broadcast(&rp->r_cv);
4087 	mutex_exit(&rp->r_statelock);
4088 
4089 	return (error);
4090 }
4091 
4092 /*
4093  * Write out a single page, possibly klustering adjacent dirty pages.
4094  */
4095 int
4096 nfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
4097 	int flags, cred_t *cr)
4098 {
4099 	u_offset_t io_off;
4100 	u_offset_t lbn_off;
4101 	u_offset_t lbn;
4102 	size_t io_len;
4103 	uint_t bsize;
4104 	int error;
4105 	rnode_t *rp;
4106 
4107 	ASSERT(!vn_is_readonly(vp));
4108 	ASSERT(pp != NULL);
4109 	ASSERT(cr != NULL);
4110 	ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone);
4111 
4112 	rp = VTOR(vp);
4113 	ASSERT(rp->r_count > 0);
4114 
4115 	ASSERT(pp->p_offset <= MAXOFF32_T);
4116 
4117 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
4118 	lbn = pp->p_offset / bsize;
4119 	lbn_off = lbn * bsize;
4120 
4121 	/*
4122 	 * Find a kluster that fits in one block, or in
4123 	 * one page if pages are bigger than blocks.  If
4124 	 * there is less file space allocated than a whole
4125 	 * page, we'll shorten the i/o request below.
4126 	 */
4127 	pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
4128 	    roundup(bsize, PAGESIZE), flags);
4129 
4130 	/*
4131 	 * pvn_write_kluster shouldn't have returned a page with offset
4132 	 * behind the original page we were given.  Verify that.
4133 	 */
4134 	ASSERT((pp->p_offset / bsize) >= lbn);
4135 
4136 	/*
4137 	 * Now pp will have the list of kept dirty pages marked for
4138 	 * write back.  It will also handle invalidation and freeing
4139 	 * of pages that are not dirty.  Check for page length rounding
4140 	 * problems.
4141 	 */
4142 	if (io_off + io_len > lbn_off + bsize) {
4143 		ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
4144 		io_len = lbn_off + bsize - io_off;
4145 	}
4146 	/*
4147 	 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
4148 	 * consistent value of r_size. RMODINPROGRESS is set in writerp().
4149 	 * When RMODINPROGRESS is set it indicates that a uiomove() is in
4150 	 * progress and the r_size has not been made consistent with the
4151 	 * new size of the file. When the uiomove() completes the r_size is
4152 	 * updated and the RMODINPROGRESS flag is cleared.
4153 	 *
4154 	 * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
4155 	 * consistent value of r_size. Without this handshaking, it is
4156 	 * possible that nfs(3)_bio() picks  up the old value of r_size
4157 	 * before the uiomove() in writerp() completes. This will result
4158 	 * in the write through nfs(3)_bio() being dropped.
4159 	 *
4160 	 * More precisely, there is a window between the time the uiomove()
4161 	 * completes and the time the r_size is updated. If a VOP_PUTPAGE()
4162 	 * operation intervenes in this window, the page will be picked up,
4163 	 * because it is dirty (it will be unlocked, unless it was
4164 	 * pagecreate'd). When the page is picked up as dirty, the dirty
4165 	 * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is
4166 	 * checked. This will still be the old size. Therefore the page will
4167 	 * not be written out. When segmap_release() calls VOP_PUTPAGE(),
4168 	 * the page will be found to be clean and the write will be dropped.
4169 	 */
4170 	if (rp->r_flags & RMODINPROGRESS) {
4171 		mutex_enter(&rp->r_statelock);
4172 		if ((rp->r_flags & RMODINPROGRESS) &&
4173 		    rp->r_modaddr + MAXBSIZE > io_off &&
4174 		    rp->r_modaddr < io_off + io_len) {
4175 			page_t *plist;
4176 			/*
4177 			 * A write is in progress for this region of the file.
4178 			 * If we did not detect RMODINPROGRESS here then this
4179 			 * path through nfs_putapage() would eventually go to
4180 			 * nfs(3)_bio() and may not write out all of the data
4181 			 * in the pages. We end up losing data. So we decide
4182 			 * to set the modified bit on each page in the page
4183 			 * list and mark the rnode with RDIRTY. This write
4184 			 * will be restarted at some later time.
4185 			 */
4186 			plist = pp;
4187 			while (plist != NULL) {
4188 				pp = plist;
4189 				page_sub(&plist, pp);
4190 				hat_setmod(pp);
4191 				page_io_unlock(pp);
4192 				page_unlock(pp);
4193 			}
4194 			rp->r_flags |= RDIRTY;
4195 			mutex_exit(&rp->r_statelock);
4196 			if (offp)
4197 				*offp = io_off;
4198 			if (lenp)
4199 				*lenp = io_len;
4200 			return (0);
4201 		}
4202 		mutex_exit(&rp->r_statelock);
4203 	}
4204 
4205 	if (flags & B_ASYNC) {
4206 		error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr,
4207 		    nfs_sync_putapage);
4208 	} else
4209 		error = nfs_sync_putapage(vp, pp, io_off, io_len, flags, cr);
4210 
4211 	if (offp)
4212 		*offp = io_off;
4213 	if (lenp)
4214 		*lenp = io_len;
4215 	return (error);
4216 }
4217 
/*
 * Write out and release (via pvn_write_done()) the given list of locked,
 * dirty pages.  Must be called from the zone that mounted the filesystem.
 * On space-related failures the pages are marked in error and, in the
 * synchronous case, re-pushed with B_INVAL|B_FORCE so they are destroyed
 * rather than left to fill up memory - see the comment below.
 */
static int
nfs_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
	int flags, cred_t *cr)
{
	int error;
	rnode_t *rp;

	flags |= B_WRITE;

	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
	error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);

	rp = VTOR(vp);

	if ((error == ENOSPC || error == EDQUOT || error == EACCES) &&
	    (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
		/*
		 * Record that the filesystem is out of space (or the
		 * write was otherwise refused) so that other code paths
		 * can see the condition on this rnode.
		 */
		if (!(rp->r_flags & ROUTOFSPACE)) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags |= ROUTOFSPACE;
			mutex_exit(&rp->r_statelock);
		}
		flags |= B_ERROR;
		pvn_write_done(pp, flags);
		/*
		 * If this was not an async thread, then try again to
		 * write out the pages, but this time, also destroy
		 * them whether or not the write is successful.  This
		 * will prevent memory from filling up with these
		 * pages and destroying them is the only alternative
		 * if they can't be written out.
		 *
		 * Don't do this if this is an async thread because
		 * when the pages are unlocked in pvn_write_done,
		 * some other thread could have come along, locked
		 * them, and queued for an async thread.  It would be
		 * possible for all of the async threads to be tied
		 * up waiting to lock the pages again and they would
		 * all already be locked and waiting for an async
		 * thread to handle them.  Deadlock.
		 */
		if (!(flags & B_ASYNC)) {
			error = nfs_putpage(vp, io_off, io_len,
			    B_INVAL | B_FORCE, cr, NULL);
		}
	} else {
		if (error)
			flags |= B_ERROR;
		else if (rp->r_flags & ROUTOFSPACE) {
			/* A write succeeded, so space is available again. */
			mutex_enter(&rp->r_statelock);
			rp->r_flags &= ~ROUTOFSPACE;
			mutex_exit(&rp->r_statelock);
		}
		pvn_write_done(pp, flags);
	}

	return (error);
}
4275 
/*
 * Map part of the file into the caller's address space.  Fresh attributes
 * are forced over the wire when close-to-open consistency applies and the
 * file has cached data; the mapping is refused (EAGAIN) if caching is
 * disabled on the vnode (VNOCACHE) or if mandatory locking is in effect
 * with outstanding or sleeping locks.  See the lock-ordering comment
 * below for the r_rwlock / r_inmap / r_lkserlock protocol.
 */
/* ARGSUSED */
static int
nfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
	caller_context_t *ct)
{
	struct segvn_crargs vn_a;
	int error;
	rnode_t *rp;
	struct vattr va;

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	/* NFS version 2 offsets are limited to 32 bits. */
	if (off > MAXOFF32_T)
		return (EFBIG);

	if (off < 0 || off + len < 0)
		return (ENXIO);

	/* Only regular files may be mapped. */
	if (vp->v_type != VREG)
		return (ENODEV);

	/*
	 * If there is cached data and if close-to-open consistency
	 * checking is not turned off and if the file system is not
	 * mounted readonly, then force an over the wire getattr.
	 * Otherwise, just invoke nfsgetattr to get a copy of the
	 * attributes.  The attribute cache will be used unless it
	 * is timed out and if it is, then an over the wire getattr
	 * will be issued.
	 */
	va.va_mask = AT_ALL;
	if (vn_has_cached_data(vp) &&
	    !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp))
		error = nfs_getattr_otw(vp, &va, cr);
	else
		error = nfsgetattr(vp, &va, cr);
	if (error)
		return (error);

	/*
	 * Check to see if the vnode is currently marked as not cachable.
	 * This means portions of the file are locked (through VOP_FRLOCK).
	 * In this case the map request must be refused.  We use
	 * rp->r_lkserlock to avoid a race with concurrent lock requests.
	 */
	rp = VTOR(vp);

	/*
	 * Atomically increment r_inmap after acquiring r_rwlock. The
	 * idea here is to acquire r_rwlock to block read/write and
	 * not to protect r_inmap. r_inmap will inform nfs_read/write()
	 * that we are in nfs_map(). Now, r_rwlock is acquired in order
	 * and we can prevent the deadlock that would have occurred
	 * when nfs_addmap() would have acquired it out of order.
	 *
	 * Since we are not protecting r_inmap by any lock, we do not
	 * hold any lock when we decrement it. We atomically decrement
	 * r_inmap after we release r_lkserlock.
	 */

	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
		return (EINTR);
	atomic_add_int(&rp->r_inmap, 1);
	nfs_rw_exit(&rp->r_rwlock);

	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) {
		atomic_add_int(&rp->r_inmap, -1);
		return (EINTR);
	}
	if (vp->v_flag & VNOCACHE) {
		error = EAGAIN;
		goto done;
	}

	/*
	 * Don't allow concurrent locks and mapping if mandatory locking is
	 * enabled.
	 */
	if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) &&
	    MANDLOCK(vp, va.va_mode)) {
		error = EAGAIN;
		goto done;
	}

	/* Pick (or validate) an address range in the address space. */
	as_rangelock(as);
	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
	if (error != 0) {
		as_rangeunlock(as);
		goto done;
	}

	/* Set up the segvn creation arguments and create the segment. */
	vn_a.vp = vp;
	vn_a.offset = off;
	vn_a.type = (flags & MAP_TYPE);
	vn_a.prot = (uchar_t)prot;
	vn_a.maxprot = (uchar_t)maxprot;
	vn_a.flags = (flags & ~MAP_TYPE);
	vn_a.cred = cr;
	vn_a.amp = NULL;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	error = as_map(as, *addrp, len, segvn_create, &vn_a);
	as_rangeunlock(as);

done:
	nfs_rw_exit(&rp->r_lkserlock);
	atomic_add_int(&rp->r_inmap, -1);
	return (error);
}
4391 
4392 /* ARGSUSED */
4393 static int
4394 nfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4395 	size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4396 	caller_context_t *ct)
4397 {
4398 	rnode_t *rp;
4399 
4400 	if (vp->v_flag & VNOMAP)
4401 		return (ENOSYS);
4402 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4403 		return (EIO);
4404 
4405 	rp = VTOR(vp);
4406 	atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
4407 
4408 	return (0);
4409 }
4410 
/*
 * Advisory record locking (F_GETLK/F_SETLK/F_SETLKW).  Local-locking
 * mounts are handled entirely by fs_frlock(); otherwise the request is
 * passed to the network lock manager (lm_frlock()) after flushing cached
 * pages so that lock holders always see current data.
 */
/* ARGSUSED */
static int
nfs_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, offset_t offset,
	struct flk_callback *flk_cbp, cred_t *cr, caller_context_t *ct)
{
	netobj lm_fh;
	int rc;
	u_offset_t start, end;
	rnode_t *rp;
	int error = 0, intr = INTR(vp);

	/* check for valid cmd parameter */
	if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
		return (EINVAL);
	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	/* Verify l_type. */
	switch (bfp->l_type) {
	case F_RDLCK:
		if (cmd != F_GETLK && !(flag & FREAD))
			return (EBADF);
		break;
	case F_WRLCK:
		if (cmd != F_GETLK && !(flag & FWRITE))
			return (EBADF);
		break;
	case F_UNLCK:
		/*
		 * Never interrupt an unlock; an abandoned unlock could
		 * leave an orphan lock held on the server.
		 */
		intr = 0;
		break;

	default:
		return (EINVAL);
	}

	/* check the validity of the lock range */
	if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
		return (rc);
	if (rc = flk_check_lock_data(start, end, MAXOFF32_T))
		return (rc);

	/*
	 * If the filesystem is mounted using local locking, pass the
	 * request off to the local locking code.
	 */
	if (VTOMI(vp)->mi_flags & MI_LLOCK) {
		if (offset > MAXOFF32_T)
			return (EFBIG);
		if (cmd == F_SETLK || cmd == F_SETLKW) {
			/*
			 * For complete safety, we should be holding
			 * r_lkserlock.  However, we can't call
			 * lm_safelock and then fs_frlock while
			 * holding r_lkserlock, so just invoke
			 * lm_safelock and expect that this will
			 * catch enough of the cases.
			 */
			if (!lm_safelock(vp, bfp, cr))
				return (EAGAIN);
		}
		return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
	}

	rp = VTOR(vp);

	/*
	 * Check whether the given lock request can proceed, given the
	 * current file mappings.
	 */
	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
		return (EINTR);
	if (cmd == F_SETLK || cmd == F_SETLKW) {
		if (!lm_safelock(vp, bfp, cr)) {
			rc = EAGAIN;
			goto done;
		}
	}

	/*
	 * Flush the cache after waiting for async I/O to finish.  For new
	 * locks, this is so that the process gets the latest bits from the
	 * server.  For unlocks, this is so that other clients see the
	 * latest bits once the file has been unlocked.  If currently dirty
	 * pages can't be flushed, then don't allow a lock to be set.  But
	 * allow unlocks to succeed, to avoid having orphan locks on the
	 * server.
	 */
	if (cmd != F_GETLK) {
		mutex_enter(&rp->r_statelock);
		/*
		 * Wait for any i/o counted in r_count to drain.  The
		 * lwp_nostop bookkeeping keeps the thread from being
		 * stopped while waiting interruptibly.
		 */
		while (rp->r_count > 0) {
			if (intr) {
				klwp_t *lwp = ttolwp(curthread);

				if (lwp != NULL)
					lwp->lwp_nostop++;
				if (cv_wait_sig(&rp->r_cv, &rp->r_statelock)
				    == 0) {
					if (lwp != NULL)
						lwp->lwp_nostop--;
					rc = EINTR;
					break;
				}
				if (lwp != NULL)
					lwp->lwp_nostop--;
			} else
			cv_wait(&rp->r_cv, &rp->r_statelock);
		}
		mutex_exit(&rp->r_statelock);
		if (rc != 0)
			goto done;
		error = nfs_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
		if (error) {
			if (error == ENOSPC || error == EDQUOT) {
				/* Latch the first hard error on the rnode. */
				mutex_enter(&rp->r_statelock);
				if (!rp->r_error)
					rp->r_error = error;
				mutex_exit(&rp->r_statelock);
			}
			/* Only new locks are refused; unlocks proceed. */
			if (bfp->l_type != F_UNLCK) {
				rc = ENOLCK;
				goto done;
			}
		}
	}

	lm_fh.n_len = sizeof (fhandle_t);
	lm_fh.n_bytes = (char *)VTOFH(vp);

	/*
	 * Call the lock manager to do the real work of contacting
	 * the server and obtaining the lock.
	 */
	rc = lm_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh, flk_cbp);

	if (rc == 0)
		nfs_lockcompletion(vp, cmd);

done:
	nfs_rw_exit(&rp->r_lkserlock);
	return (rc);
}
4552 
4553 /*
4554  * Free storage space associated with the specified vnode.  The portion
4555  * to be freed is specified by bfp->l_start and bfp->l_len (already
4556  * normalized to a "whence" of 0).
4557  *
4558  * This is an experimental facility whose continued existence is not
4559  * guaranteed.  Currently, we only support the special case
4560  * of l_len == 0, meaning free to end of file.
4561  */
4562 /* ARGSUSED */
4563 static int
4564 nfs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
4565 	offset_t offset, cred_t *cr, caller_context_t *ct)
4566 {
4567 	int error;
4568 
4569 	ASSERT(vp->v_type == VREG);
4570 	if (cmd != F_FREESP)
4571 		return (EINVAL);
4572 
4573 	if (offset > MAXOFF32_T)
4574 		return (EFBIG);
4575 
4576 	if ((bfp->l_start > MAXOFF32_T) || (bfp->l_end > MAXOFF32_T) ||
4577 	    (bfp->l_len > MAXOFF32_T))
4578 		return (EFBIG);
4579 
4580 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4581 		return (EIO);
4582 
4583 	error = convoff(vp, bfp, 0, offset);
4584 	if (!error) {
4585 		ASSERT(bfp->l_start >= 0);
4586 		if (bfp->l_len == 0) {
4587 			struct vattr va;
4588 
4589 			/*
4590 			 * ftruncate should not change the ctime and
4591 			 * mtime if we truncate the file to its
4592 			 * previous size.
4593 			 */
4594 			va.va_mask = AT_SIZE;
4595 			error = nfsgetattr(vp, &va, cr);
4596 			if (error || va.va_size == bfp->l_start)
4597 				return (error);
4598 			va.va_mask = AT_SIZE;
4599 			va.va_size = bfp->l_start;
4600 			error = nfssetattr(vp, &va, 0, cr);
4601 		} else
4602 			error = EINVAL;
4603 	}
4604 
4605 	return (error);
4606 }
4607 
4608 /* ARGSUSED */
4609 static int
4610 nfs_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
4611 {
4612 
4613 	return (EINVAL);
4614 }
4615 
4616 /*
4617  * Setup and add an address space callback to do the work of the delmap call.
4618  * The callback will (and must be) deleted in the actual callback function.
4619  *
4620  * This is done in order to take care of the problem that we have with holding
4621  * the address space's a_lock for a long period of time (e.g. if the NFS server
4622  * is down).  Callbacks will be executed in the address space code while the
4623  * a_lock is not held.	Holding the address space's a_lock causes things such
4624  * as ps and fork to hang because they are trying to acquire this lock as well.
4625  */
4626 /* ARGSUSED */
4627 static int
4628 nfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4629 	size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
4630 	caller_context_t *ct)
4631 {
4632 	int			caller_found;
4633 	int			error;
4634 	rnode_t			*rp;
4635 	nfs_delmap_args_t	*dmapp;
4636 	nfs_delmapcall_t	*delmap_call;
4637 
4638 	if (vp->v_flag & VNOMAP)
4639 		return (ENOSYS);
4640 	/*
4641 	 * A process may not change zones if it has NFS pages mmap'ed
4642 	 * in, so we can't legitimately get here from the wrong zone.
4643 	 */
4644 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4645 
4646 	rp = VTOR(vp);
4647 
4648 	/*
4649 	 * The way that the address space of this process deletes its mapping
4650 	 * of this file is via the following call chains:
4651 	 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
4652 	 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
4653 	 *
4654 	 * With the use of address space callbacks we are allowed to drop the
4655 	 * address space lock, a_lock, while executing the NFS operations that
4656 	 * need to go over the wire.  Returning EAGAIN to the caller of this
4657 	 * function is what drives the execution of the callback that we add
4658 	 * below.  The callback will be executed by the address space code
4659 	 * after dropping the a_lock.  When the callback is finished, since
4660 	 * we dropped the a_lock, it must be re-acquired and segvn_unmap()
4661 	 * is called again on the same segment to finish the rest of the work
4662 	 * that needs to happen during unmapping.
4663 	 *
4664 	 * This action of calling back into the segment driver causes
4665 	 * nfs_delmap() to get called again, but since the callback was
4666 	 * already executed at this point, it already did the work and there
4667 	 * is nothing left for us to do.
4668 	 *
4669 	 * To Summarize:
4670 	 * - The first time nfs_delmap is called by the current thread is when
4671 	 * we add the caller associated with this delmap to the delmap caller
4672 	 * list, add the callback, and return EAGAIN.
4673 	 * - The second time in this call chain when nfs_delmap is called we
4674 	 * will find this caller in the delmap caller list and realize there
4675 	 * is no more work to do thus removing this caller from the list and
4676 	 * returning the error that was set in the callback execution.
4677 	 */
4678 	caller_found = nfs_find_and_delete_delmapcall(rp, &error);
4679 	if (caller_found) {
4680 		/*
4681 		 * 'error' is from the actual delmap operations.  To avoid
4682 		 * hangs, we need to handle the return of EAGAIN differently
4683 		 * since this is what drives the callback execution.
4684 		 * In this case, we don't want to return EAGAIN and do the
4685 		 * callback execution because there are none to execute.
4686 		 */
4687 		if (error == EAGAIN)
4688 			return (0);
4689 		else
4690 			return (error);
4691 	}
4692 
4693 	/* current caller was not in the list */
4694 	delmap_call = nfs_init_delmapcall();
4695 
4696 	mutex_enter(&rp->r_statelock);
4697 	list_insert_tail(&rp->r_indelmap, delmap_call);
4698 	mutex_exit(&rp->r_statelock);
4699 
4700 	dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP);
4701 
4702 	dmapp->vp = vp;
4703 	dmapp->off = off;
4704 	dmapp->addr = addr;
4705 	dmapp->len = len;
4706 	dmapp->prot = prot;
4707 	dmapp->maxprot = maxprot;
4708 	dmapp->flags = flags;
4709 	dmapp->cr = cr;
4710 	dmapp->caller = delmap_call;
4711 
4712 	error = as_add_callback(as, nfs_delmap_callback, dmapp,
4713 	    AS_UNMAP_EVENT, addr, len, KM_SLEEP);
4714 
4715 	return (error ? error : EAGAIN);
4716 }
4717 
4718 /*
4719  * Remove some pages from an mmap'd vnode.  Just update the
4720  * count of pages.  If doing close-to-open, then flush all
4721  * of the pages associated with this file.  Otherwise, start
4722  * an asynchronous page flush to write out any dirty pages.
4723  * This will also associate a credential with the rnode which
4724  * can be used to write the pages.
4725  */
4726 /* ARGSUSED */
4727 static void
4728 nfs_delmap_callback(struct as *as, void *arg, uint_t event)
4729 {
4730 	int			error;
4731 	rnode_t			*rp;
4732 	mntinfo_t		*mi;
4733 	nfs_delmap_args_t	*dmapp = (nfs_delmap_args_t *)arg;
4734 
4735 	rp = VTOR(dmapp->vp);
4736 	mi = VTOMI(dmapp->vp);
4737 
4738 	atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
4739 	ASSERT(rp->r_mapcnt >= 0);
4740 
4741 	/*
4742 	 * Initiate a page flush if there are pages, the file system
4743 	 * was not mounted readonly, the segment was mapped shared, and
4744 	 * the pages themselves were writeable.
4745 	 */
4746 	if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) &&
4747 	    dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
4748 		mutex_enter(&rp->r_statelock);
4749 		rp->r_flags |= RDIRTY;
4750 		mutex_exit(&rp->r_statelock);
4751 		/*
4752 		 * If this is a cross-zone access a sync putpage won't work, so
4753 		 * the best we can do is try an async putpage.  That seems
4754 		 * better than something more draconian such as discarding the
4755 		 * dirty pages.
4756 		 */
4757 		if ((mi->mi_flags & MI_NOCTO) ||
4758 		    nfs_zone() != mi->mi_zone)
4759 			error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4760 			    B_ASYNC, dmapp->cr, NULL);
4761 		else
4762 			error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4763 			    0, dmapp->cr, NULL);
4764 		if (!error) {
4765 			mutex_enter(&rp->r_statelock);
4766 			error = rp->r_error;
4767 			rp->r_error = 0;
4768 			mutex_exit(&rp->r_statelock);
4769 		}
4770 	} else
4771 		error = 0;
4772 
4773 	if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO))
4774 		(void) nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4775 		    B_INVAL, dmapp->cr, NULL);
4776 
4777 	dmapp->caller->error = error;
4778 	(void) as_delete_callback(as, arg);
4779 	kmem_free(dmapp, sizeof (nfs_delmap_args_t));
4780 }
4781 
/*
 * Return pathconf information.  Most limits come from the cached
 * pathconf data hung off the mntinfo (mi_pathconf); _PC_XATTR_EXISTS
 * requires looking up the extended attribute directory.
 */
/* ARGSUSED */
static int
nfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
	caller_context_t *ct)
{
	int error = 0;

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);
	/*
	 * This looks a little weird because it's written in a general
	 * manner but we make little use of cases.  If cntl() ever gets
	 * widely used, the outer switch will make more sense.
	 */

	switch (cmd) {

	/*
	 * Large file spec - need to base answer new query with
	 * hardcoded constant based on the protocol.
	 */
	case _PC_FILESIZEBITS:
		*valp = 32;
		return (0);

	case _PC_LINK_MAX:
	case _PC_NAME_MAX:
	case _PC_PATH_MAX:
	case _PC_SYMLINK_MAX:
	case _PC_CHOWN_RESTRICTED:
	case _PC_NO_TRUNC: {
		mntinfo_t *mi;
		struct pathcnf *pc;

		if ((mi = VTOMI(vp)) == NULL || (pc = mi->mi_pathconf) == NULL)
			return (EINVAL);
		error = _PC_ISSET(cmd, pc->pc_mask);    /* error or bool */
		switch (cmd) {
		case _PC_LINK_MAX:
			*valp = pc->pc_link_max;
			break;
		case _PC_NAME_MAX:
			*valp = pc->pc_name_max;
			break;
		case _PC_PATH_MAX:
		case _PC_SYMLINK_MAX:
			*valp = pc->pc_path_max;
			break;
		case _PC_CHOWN_RESTRICTED:
			/*
			 * if we got here, error is really a boolean which
			 * indicates whether cmd is set or not.
			 */
			*valp = error ? 1 : 0;	/* see above */
			error = 0;
			break;
		case _PC_NO_TRUNC:
			/*
			 * if we got here, error is really a boolean which
			 * indicates whether cmd is set or not.
			 */
			*valp = error ? 1 : 0;	/* see above */
			error = 0;
			break;
		}
		return (error ? EINVAL : 0);
		}

	case _PC_XATTR_EXISTS:
		*valp = 0;
		if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
			vnode_t *avp;
			rnode_t *rp;
			mntinfo_t *mi = VTOMI(vp);

			if (!(mi->mi_flags & MI_EXTATTR))
				return (0);

			rp = VTOR(vp);
			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER,
			    INTR(vp)))
				return (EINTR);

			/*
			 * Try the DNLC first; fall back to asking the
			 * server for the attribute directory.
			 */
			error = nfslookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr);
			if (error || avp == NULL)
				error = acl_getxattrdir2(vp, &avp, 0, cr, 0);

			nfs_rw_exit(&rp->r_rwlock);

			if (error == 0 && avp != NULL) {
				error = do_xattr_exists_check(avp, valp, cr);
				VN_RELE(avp);
			}
		}
		return (error ? EINVAL : 0);

	case _PC_ACL_ENABLED:
		*valp = _ACL_ACLENT_ENABLED;
		return (0);

	default:
		return (EINVAL);
	}
}
4886 
4887 /*
4888  * Called by async thread to do synchronous pageio. Do the i/o, wait
4889  * for it to complete, and cleanup the page list when done.
4890  */
4891 static int
4892 nfs_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4893 	int flags, cred_t *cr)
4894 {
4895 	int error;
4896 
4897 	ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4898 	error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4899 	if (flags & B_READ)
4900 		pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
4901 	else
4902 		pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
4903 	return (error);
4904 }
4905 
4906 /* ARGSUSED */
4907 static int
4908 nfs_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4909 	int flags, cred_t *cr, caller_context_t *ct)
4910 {
4911 	int error;
4912 	rnode_t *rp;
4913 
4914 	if (pp == NULL)
4915 		return (EINVAL);
4916 
4917 	if (io_off > MAXOFF32_T)
4918 		return (EFBIG);
4919 	if (nfs_zone() != VTOMI(vp)->mi_zone)
4920 		return (EIO);
4921 	rp = VTOR(vp);
4922 	mutex_enter(&rp->r_statelock);
4923 	rp->r_count++;
4924 	mutex_exit(&rp->r_statelock);
4925 
4926 	if (flags & B_ASYNC) {
4927 		error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr,
4928 		    nfs_sync_pageio);
4929 	} else
4930 		error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4931 	mutex_enter(&rp->r_statelock);
4932 	rp->r_count--;
4933 	cv_broadcast(&rp->r_cv);
4934 	mutex_exit(&rp->r_statelock);
4935 	return (error);
4936 }
4937 
/*
 * Set the ACL on a file.  Only possible when the mount negotiated the
 * ACL protocol (MI_ACL); otherwise ENOSYS is returned.
 */
/* ARGSUSED */
static int
nfs_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
	caller_context_t *ct)
{
	int error;
	mntinfo_t *mi;

	mi = VTOMI(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);
	if (mi->mi_flags & MI_ACL) {
		error = acl_setacl2(vp, vsecattr, flag, cr);
		/*
		 * NOTE(review): MI_ACL is deliberately rechecked after the
		 * over-the-wire call; presumably acl_setacl2() can clear
		 * the flag when the server turns out not to support the
		 * ACL protocol, in which case we fall through and return
		 * ENOSYS instead - confirm.
		 */
		if (mi->mi_flags & MI_ACL)
			return (error);
	}

	return (ENOSYS);
}
4958 
/*
 * Get the ACL for a file.  If the mount negotiated the ACL protocol
 * (MI_ACL) the ACL is fetched over the wire; otherwise an ACL is
 * fabricated from the standard permission bits via fs_fab_acl().
 */
/* ARGSUSED */
static int
nfs_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
	caller_context_t *ct)
{
	int error;
	mntinfo_t *mi;

	mi = VTOMI(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);
	if (mi->mi_flags & MI_ACL) {
		error = acl_getacl2(vp, vsecattr, flag, cr);
		/*
		 * NOTE(review): MI_ACL is deliberately rechecked after the
		 * over-the-wire call; presumably acl_getacl2() can clear
		 * the flag when the server turns out not to support the
		 * ACL protocol, in which case we fall through and
		 * fabricate an ACL below - confirm.
		 */
		if (mi->mi_flags & MI_ACL)
			return (error);
	}

	return (fs_fab_acl(vp, vsecattr, flag, cr, ct));
}
4979 
/*
 * Handle share reservations (F_SHARE/F_UNSHARE/F_HASREMOTELOCKS).
 * Local-locking mounts are handled by fs_shrlock(); otherwise the
 * request is marshalled into an nfs_owner and passed to the network
 * lock manager (lm_shrlock()).
 */
/* ARGSUSED */
static int
nfs_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
	caller_context_t *ct)
{
	int error;
	struct shrlock nshr;
	struct nfs_owner nfs_owner;
	netobj lm_fh;

	if (nfs_zone() != VTOMI(vp)->mi_zone)
		return (EIO);

	/*
	 * check for valid cmd parameter
	 */
	if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
		return (EINVAL);

	/*
	 * Check access permissions
	 */
	if (cmd == F_SHARE &&
	    (((shr->s_access & F_RDACC) && !(flag & FREAD)) ||
	    ((shr->s_access & F_WRACC) && !(flag & FWRITE))))
		return (EBADF);

	/*
	 * If the filesystem is mounted using local locking, pass the
	 * request off to the local share code.
	 */
	if (VTOMI(vp)->mi_flags & MI_LLOCK)
		return (fs_shrlock(vp, cmd, shr, flag, cr, ct));

	switch (cmd) {
	case F_SHARE:
	case F_UNSHARE:
		lm_fh.n_len = sizeof (fhandle_t);
		lm_fh.n_bytes = (char *)VTOFH(vp);

		/*
		 * If passed an owner that is too large to fit in an
		 * nfs_owner it is likely a recursive call from the
		 * lock manager client and pass it straight through.  If
		 * it is not a nfs_owner then simply return an error.
		 */
		if (shr->s_own_len > sizeof (nfs_owner.lowner)) {
			if (((struct nfs_owner *)shr->s_owner)->magic !=
			    NFS_OWNER_MAGIC)
				return (EINVAL);

			if (error = lm_shrlock(vp, cmd, shr, flag, &lm_fh)) {
				error = set_errno(error);
			}
			return (error);
		}
		/*
		 * Remote share reservations owner is a combination of
		 * a magic number, hostname, and the local owner
		 */
		bzero(&nfs_owner, sizeof (nfs_owner));
		nfs_owner.magic = NFS_OWNER_MAGIC;
		(void) strncpy(nfs_owner.hname, uts_nodename(),
		    sizeof (nfs_owner.hname));
		/* s_own_len <= sizeof (lowner) was verified above. */
		bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len);
		nshr.s_access = shr->s_access;
		nshr.s_deny = shr->s_deny;
		nshr.s_sysid = 0;
		nshr.s_pid = ttoproc(curthread)->p_pid;
		nshr.s_own_len = sizeof (nfs_owner);
		nshr.s_owner = (caddr_t)&nfs_owner;

		if (error = lm_shrlock(vp, cmd, &nshr, flag, &lm_fh)) {
			error = set_errno(error);
		}

		break;

	case F_HASREMOTELOCKS:
		/*
		 * NFS client can't store remote locks itself
		 */
		shr->s_access = 0;
		error = 0;
		break;

	default:
		error = EINVAL;
		break;
	}

	return (error);
}
5073