xref: /illumos-gate/usr/src/uts/common/fs/dev/sdev_vnops.c (revision 5f82aa32fbc5dc2c59bca6ff315f44a4c4c9ea86)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 /*
25  * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
26  */
27 
28 /*
29  * vnode ops for the /dev filesystem
30  *
31  * - VDIR, VCHR, VBLK, and VLNK are considered must-support file types
32  * - VREG and VDOOR are used for some internal implementations in
33  *    the global zone, e.g. devname and devfsadm communication
34  * - other file types are unusual in this namespace and
35  *    not supported for now
36  */
37 
38 /*
39  * sdev has a few basic goals:
40  *   o Provide /dev for the global zone as well as various non-global zones.
41  *   o Provide the basic functionality that devfsadm might need (mknod,
42  *     symlinks, etc.)
43  *   o Allow persistent permissions on files in /dev.
44  *   o Allow for dynamic directories and nodes for use by various services (pts,
45  *     zvol, net, etc.)
46  *
47  * The sdev file system is primarily made up of sdev_node_t's which is sdev's
48  * counterpart to the vnode_t. There are two different classes of sdev_node_t's
49  * that we generally care about, dynamic and otherwise.
50  *
51  * Persisting Information
52  * ----------------------
53  *
54  * When sdev is mounted, it keeps track of the underlying file system it is
55  * mounted over. In certain situations, sdev will go and create entries in that
56  * underlying file system. These underlying 'back end' nodes are used as proxies
57  * for various changes in permissions. While specific sets of nodes, such as
58  * dynamic ones, are exempt, this process stores permission changes against
59  * these back end nodes. The point of all of this is to allow for these settings
60  * to persist across host and zone reboots. As an example, consider the entry
61  * /dev/dsk/c0t0d0 which is a character device and that / is in UFS. Upon
62  * changing the permissions on c0t0d0 you'd have the following logical
63  * relationships:
64  *
65  *    +------------------+   sdev_vnode     +--------------+
66  *    | sdev_node_t      |<---------------->| vnode_t      |
67  *    | /dev/dsk/c0t0d0  |<---------------->| for sdev     |
68  *    +------------------+                  +--------------+
69  *           |
70  *           | sdev_attrvp
71  *           |
72  *           |    +---------------------+
73  *           +--->| vnode_t for UFS|ZFS |
74  *                | /dev/dsk/c0t0d0     |
75  *                +---------------------+
76  *
77  * sdev is generally in memory. Therefore when a lookup happens and there is no
78  * entry already inside of a directory cache, it will next check the backing
79  * store. If the backing store exists, we will reconstitute the sdev_node based
80  * on the information that we persisted. When we create the backing store node,
81  * we use the struct vattr information that we already have in sdev_node_t.
82  * Because of this, we already know if the entry was previously a symlink,
83  * directory, or some other kind of type. Note that not all types of nodes are
84  * supported. Currently only VDIR, VCHR, VBLK, VREG, VDOOR, and VLNK are
85  * eligible to be persisted.
86  *
87  * When the sdev_node is created and the lookup is done, we grab a hold on the
88  * underlying vnode as part of the call to VOP_LOOKUP. That reference is held
89  * until the sdev_node becomes inactive. Once its reference count reaches one
90  * and the VOP_INACTIVE callback fires leading to the destruction of the node,
91  * the reference on the underlying vnode will be released.
92  *
93  * The backing store node will be deleted only when the node itself is deleted
94  * through the means of a VOP_REMOVE, VOP_RMDIR, or similar call.
95  *
96  * Not everything can be persisted, see The Rules section for more details.
97  *
98  * Dynamic Nodes
99  * -------------
100  *
101  * Dynamic nodes allow for specific interactions with various kernel subsystems
102  * when looking up directory entries. This allows the lookup and readdir
103  * functions to check against the kernel subsystems for validity, e.g. does a
104  * zvol or NIC still exist.
105  *
106  * More specifically, when we create various directories we check if the
107  * directory name matches that of one of the names in the vtab[] (sdev_subr.c).
108  * If it does, we swap out the vnode operations into a new set which combine the
109  * normal sdev vnode operations with the dynamic set here.
110  *
111  * In addition, various dynamic nodes implement a verification entry point. This
112  * verification entry is used as a part of lookup and readdir. The goal for
113  * these dynamic nodes is to allow them to check with the underlying subsystems
114  * to ensure that these devices are still present, or if they have gone away, to
115  * remove them from the results. This is indicated by using the SDEV_VTOR flag
116  * in vtab[].
117  *
118  * Dynamic nodes have additional restrictions placed upon them. They may only
119  * appear at the top level directory of the file system. In addition, users
120  * cannot create dirents below any level of a dynamic node aside from its
121  * special vnops.
122  *
123  * Profiles
124  * --------
125  *
126  * Profiles exist for the purpose of non-global zones. They work with the zone
127  * brands and zoneadmd to set up a filter of allowed devices that can appear in
128  * a non-global zone's /dev. These are sent to sdev by means of libdevinfo and a
129  * modctl system call. Specifically it allows one to add patterns of device
130  * paths to include and exclude. It allows for a collection of symlinks to be
131  * added and it allows for remapping names.
132  *
133  * When operating in a non-global zone, several of the sdev vnops are redirected
134  * to the profile versions. These impose additional restrictions such as
135  * enforcing that a non-global zone's /dev is read only.
136  *
137  * sdev_node_t States
138  * ------------------
139  *
140  * A given sdev_node_t has a field called the sdev_state which describes where
141  * in the sdev life cycle it is. There are three primary states: SDEV_INIT,
142  * SDEV_READY, and SDEV_ZOMBIE.
143  *
144  *	SDEV_INIT: When a new /dev file is first looked up, a sdev_node
145  *		   is allocated, initialized and added to the directory's
146  *		   sdev_node cache. A node at this state will also
147  *		   have the SDEV_LOOKUP flag set.
148  *
149  *		   Other threads that are trying to look up a node at
150  *		   this state will be blocked until the SDEV_LOOKUP flag
151  *		   is cleared.
152  *
153  *		   When the SDEV_LOOKUP flag is cleared, the node may
154  *		   transition into the SDEV_READY state for a successful
155  *		   lookup or the node is removed from the directory cache
156  *		   and destroyed if the named node can not be found.
157  *		   An ENOENT error is returned for the second case.
158  *
159  *	SDEV_READY: A /dev file has been successfully looked up and
160  *		    associated with a vnode. The /dev file is available
161  *		    for the supported /dev file system operations.
162  *
163  *	SDEV_ZOMBIE: Deletion of a /dev file has been explicitly issued
164  *		    to an SDEV_READY node. The node is transitioned into
165  *		    the SDEV_ZOMBIE state if the vnode reference count
166  *		    is still held. A SDEV_ZOMBIE node does not support
167  *		    any of the /dev file system operations. A SDEV_ZOMBIE
168  *		    node is immediately removed from the directory cache
169  *		    and destroyed once the reference count reaches zero.
170  *
171  * Historically nodes that were marked SDEV_ZOMBIE were not removed from the
172  * underlying directory caches. This has been the source of numerous bugs and
173  * thus to better mimic what happens on a real file system, it is no longer the
174  * case.
175  *
176  * The following state machine describes the life cycle of a given node and its
177  * associated states:
178  *
179  * node is . . . . .
180  * allocated via   .     +-------------+         . . . . . . . vnode_t refcount
181  * sdev_nodeinit() .     | Unallocated |         .             reaches zero and
182  *        +--------*-----|   Memory    |<--------*---+         sdev_inactive is
183  *        |              +-------------+             |         called.
184  *        |       +------------^                     |
185  *        v       |                                  |
186  *  +-----------+ * . . sdev_nodeready()      +-------------+
187  *  | SDEV_INIT | |     or related setup      | SDEV_ZOMBIE |
188  *  +-----------+ |     failure               +-------------+
189  *        |       |                                  ^
190  *        |       |      +------------+              |
191  *        +-*----------->| SDEV_READY |--------*-----+
192  *          .            +------------+        .          The node is no longer
193  *          . . node successfully              . . . . .  valid or we've been
194  *              inserted into the                         asked to remove it.
195  *              directory cache                           This happens via
196  *              and sdev_nodeready()                      sdev_dirdelete().
197  *              call successful.
198  *
199  * Adding and Removing Dirents, Zombie Nodes
200  * -----------------------------------------
201  *
202  * As part of doing a lookup, readdir, or an explicit creation operation like
203  * mkdir or create, nodes may be created. Every directory has an avl tree which
204  * contains its children, the sdev_entries tree. This is only used if the type
205  * is VDIR. Access to this is controlled by the sdev_node_t's contents_lock and
206  * it is managed through sdev_cache_update().
207  *
208  * Every sdev_node_t has a field sdev_state, which describes the current state
209  * of the node. A node is generally speaking in the SDEV_READY state. When it is
210  * there, it can be looked up, accessed, and operations performed on it. When a
211  * node is going to be removed from the directory cache it is marked as a
212  * zombie. Once a node becomes a zombie, no other file system operations will
213  * succeed and it will continue to exist as a node until the vnode count on the
214  * node reaches zero. At that point, the node will be freed.  However, once a
215  * node has been marked as a zombie, it will be removed immediately from the
216  * directory cache such that no one else may find it again.  This means that
217  * someone else can insert a new entry into that directory with the same name
218  * and without a problem.
219  *
220  * To remove a node, see the section on that in The Rules.
221  *
222  * The Rules
223  * ---------
224  * These are the rules to live by when working in sdev. These are not
225  * exhaustive.
226  *
227  * - Set 1: Working with Backing Nodes
228  *   o If there is a SDEV_READY sdev_node_t, it knows about its backing node.
229  *   o If we find a backing node when looking up an sdev_node_t for the first
230  *     time, we use its attributes to build our sdev_node_t.
231  *   o If there is a found backing node, or we create a backing node, that's
232  *     when we grab the hold on its vnode.
233  *   o If we mark an sdev_node_t a ZOMBIE, we must remove its backing node from
234  *     the underlying file system. It must not be searchable or findable.
235  *   o We release our hold on the backing node vnode when we destroy the
236  *     sdev_node_t.
237  *
238  * - Set 2: Locking rules for sdev (not exhaustive)
239  *   o The majority of nodes contain an sdev_contents rw lock. You must hold it
240  *     for read or write, as appropriate, when manipulating its contents.
241  *   o You must lock your parent before yourself.
242  *   o If you need your vnode's v_lock and the sdev_contents rw lock, you must
243  *     grab the v_lock before the sdev_contents rw_lock.
244  *   o If you release a lock on the node as a part of upgrading it, you must
245  *     verify that the node has not become a zombie as a part of this process.
246  *
247  * - Set 3: Zombie Status and What it Means
248  *   o If you encounter a node that is a ZOMBIE, that means that it has been
249  *     unlinked from the backing store.
250  *   o If you release your contents lock and acquire it again (say as part of
251  *     trying to grab a write lock) you must check that the node has not become
252  *     a zombie.
253  *   o You should VERIFY that a looked up node is not a zombie. This follows
254  *     from the following logic. To mark something as a zombie means that it is
255  *     removed from the parent's directory cache. To do that, you must have a
256  *     write lock on the parent's sdev_contents. To lookup through that
257  *     directory you must have a read lock. This then becomes a simple ordering
258  *     problem. If you've been granted the lock then the other operation cannot
259  *     be in progress or must have already succeeded.
260  *
261  * - Set 4: Removing Directory Entries (aka making nodes Zombies)
262  *   o Write lock must be held on the directory
263  *   o Write lock must be held on the node
264  *   o Remove the sdev_node_t from its parent cache
265  *   o Remove the corresponding backing store node, if it exists, eg. use
266  *     VOP_REMOVE or VOP_RMDIR.
267  *   o You must NOT make any change in the vnode reference count! Nodes should
268  *     only be cleaned up through VOP_INACTIVE callbacks.
269  *   o VOP_INACTIVE is the only one responsible for doing the final vn_rele of
270  *     the backing store vnode that was grabbed during lookup.
271  *
272  * - Set 5: What Nodes may be Persisted
273  *   o The root, /dev is always persisted
274  *   o Any node in vtab which is marked SDEV_DYNAMIC, may not be persisted
275  *     unless it is also marked SDEV_PERSIST
276  *   o Anything whose parent directory is marked SDEV_PERSIST will pass that
277  *     along to the child as long as it does not contradict the above rules
278  */
279 
280 #include <sys/types.h>
281 #include <sys/param.h>
282 #include <sys/t_lock.h>
283 #include <sys/systm.h>
284 #include <sys/sysmacros.h>
285 #include <sys/user.h>
286 #include <sys/time.h>
287 #include <sys/vfs.h>
288 #include <sys/vnode.h>
289 #include <sys/vfs_opreg.h>
290 #include <sys/file.h>
291 #include <sys/fcntl.h>
292 #include <sys/flock.h>
293 #include <sys/kmem.h>
294 #include <sys/uio.h>
295 #include <sys/errno.h>
296 #include <sys/stat.h>
297 #include <sys/cred.h>
298 #include <sys/dirent.h>
299 #include <sys/pathname.h>
300 #include <sys/cmn_err.h>
301 #include <sys/debug.h>
302 #include <sys/policy.h>
303 #include <vm/hat.h>
304 #include <vm/seg_vn.h>
305 #include <vm/seg_map.h>
306 #include <vm/seg.h>
307 #include <vm/as.h>
308 #include <vm/page.h>
309 #include <sys/proc.h>
310 #include <sys/mode.h>
311 #include <sys/sunndi.h>
312 #include <sys/ptms.h>
313 #include <fs/fs_subr.h>
314 #include <sys/fs/dv_node.h>
315 #include <sys/fs/sdev_impl.h>
316 
317 /*ARGSUSED*/
318 static int
319 sdev_open(struct vnode **vpp, int flag, struct cred *cred, caller_context_t *ct)
320 {
321 	struct sdev_node *dv = VTOSDEV(*vpp);
322 	struct sdev_node *ddv = dv->sdev_dotdot;
323 	int error = 0;
324 
325 	if ((*vpp)->v_type == VDIR)
326 		return (0);
327 
328 	if (!SDEV_IS_GLOBAL(dv))
329 		return (ENOTSUP);
330 
331 	if ((*vpp)->v_type == VLNK)
332 		return (ENOENT);
333 	ASSERT((*vpp)->v_type == VREG);
334 	if ((*vpp)->v_type != VREG)
335 		return (ENOTSUP);
336 
337 	ASSERT(ddv);
338 	rw_enter(&ddv->sdev_contents, RW_READER);
339 	if (dv->sdev_attrvp == NULL) {
340 		rw_exit(&ddv->sdev_contents);
341 		return (ENOENT);
342 	}
343 	error = VOP_OPEN(&(dv->sdev_attrvp), flag, cred, ct);
344 	rw_exit(&ddv->sdev_contents);
345 	return (error);
346 }
347 
348 /*ARGSUSED1*/
349 static int
350 sdev_close(struct vnode *vp, int flag, int count,
351     offset_t offset, struct cred *cred, caller_context_t *ct)
352 {
353 	struct sdev_node *dv = VTOSDEV(vp);
354 
355 	if (vp->v_type == VDIR) {
356 		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
357 		cleanshares(vp, ttoproc(curthread)->p_pid);
358 		return (0);
359 	}
360 
361 	if (!SDEV_IS_GLOBAL(dv))
362 		return (ENOTSUP);
363 
364 	ASSERT(vp->v_type == VREG);
365 	if (vp->v_type != VREG)
366 		return (ENOTSUP);
367 
368 	ASSERT(dv->sdev_attrvp);
369 	return (VOP_CLOSE(dv->sdev_attrvp, flag, count, offset, cred, ct));
370 }
371 
372 /*ARGSUSED*/
373 static int
374 sdev_read(struct vnode *vp, struct uio *uio, int ioflag, struct cred *cred,
375 	struct caller_context *ct)
376 {
377 	struct sdev_node *dv = (struct sdev_node *)VTOSDEV(vp);
378 	int	error;
379 
380 	if (!SDEV_IS_GLOBAL(dv))
381 		return (EINVAL);
382 
383 	if (vp->v_type == VDIR)
384 		return (EISDIR);
385 
386 	/* only supporting regular files in /dev */
387 	ASSERT(vp->v_type == VREG);
388 	if (vp->v_type != VREG)
389 		return (EINVAL);
390 
391 	ASSERT(RW_READ_HELD(&VTOSDEV(vp)->sdev_contents));
392 	ASSERT(dv->sdev_attrvp);
393 	(void) VOP_RWLOCK(dv->sdev_attrvp, 0, ct);
394 	error = VOP_READ(dv->sdev_attrvp, uio, ioflag, cred, ct);
395 	VOP_RWUNLOCK(dv->sdev_attrvp, 0, ct);
396 	return (error);
397 }
398 
399 /*ARGSUSED*/
400 static int
401 sdev_write(struct vnode *vp, struct uio *uio, int ioflag, struct cred *cred,
402 	struct caller_context *ct)
403 {
404 	struct sdev_node *dv = VTOSDEV(vp);
405 	int	error = 0;
406 
407 	if (!SDEV_IS_GLOBAL(dv))
408 		return (EINVAL);
409 
410 	if (vp->v_type == VDIR)
411 		return (EISDIR);
412 
413 	/* only supporting regular files in /dev */
414 	ASSERT(vp->v_type == VREG);
415 	if (vp->v_type != VREG)
416 		return (EINVAL);
417 
418 	ASSERT(dv->sdev_attrvp);
419 
420 	(void) VOP_RWLOCK(dv->sdev_attrvp, 1, ct);
421 	error = VOP_WRITE(dv->sdev_attrvp, uio, ioflag, cred, ct);
422 	VOP_RWUNLOCK(dv->sdev_attrvp, 1, ct);
423 	if (error == 0) {
424 		sdev_update_timestamps(dv->sdev_attrvp, kcred,
425 		    AT_MTIME);
426 	}
427 	return (error);
428 }
429 
430 /*ARGSUSED*/
431 static int
432 sdev_ioctl(struct vnode *vp, int cmd, intptr_t arg, int flag,
433     struct cred *cred, int *rvalp,  caller_context_t *ct)
434 {
435 	struct sdev_node *dv = VTOSDEV(vp);
436 
437 	if (!SDEV_IS_GLOBAL(dv) || (vp->v_type == VDIR))
438 		return (ENOTTY);
439 
440 	ASSERT(vp->v_type == VREG);
441 	if (vp->v_type != VREG)
442 		return (EINVAL);
443 
444 	ASSERT(dv->sdev_attrvp);
445 	return (VOP_IOCTL(dv->sdev_attrvp, cmd, arg, flag, cred, rvalp, ct));
446 }
447 
448 static int
449 sdev_getattr(struct vnode *vp, struct vattr *vap, int flags,
450     struct cred *cr, caller_context_t *ct)
451 {
452 	int			error = 0;
453 	struct sdev_node	*dv = VTOSDEV(vp);
454 	struct sdev_node	*parent = dv->sdev_dotdot;
455 
456 	ASSERT(parent);
457 
458 	rw_enter(&parent->sdev_contents, RW_READER);
459 	ASSERT(dv->sdev_attr || dv->sdev_attrvp);
460 
461 	/*
462 	 * search order:
463 	 * 	- for persistent nodes (SDEV_PERSIST): backstore
464 	 *	- for non-persistent nodes: module ops if global, then memory
465 	 */
466 	if (dv->sdev_attrvp) {
467 		rw_exit(&parent->sdev_contents);
468 		error = VOP_GETATTR(dv->sdev_attrvp, vap, flags, cr, ct);
469 		sdev_vattr_merge(dv, vap);
470 	} else {
471 		ASSERT(dv->sdev_attr);
472 		*vap = *dv->sdev_attr;
473 		sdev_vattr_merge(dv, vap);
474 		rw_exit(&parent->sdev_contents);
475 	}
476 
477 	return (error);
478 }
479 
480 /*ARGSUSED4*/
481 static int
482 sdev_setattr(struct vnode *vp, struct vattr *vap, int flags,
483     struct cred *cred, caller_context_t *ctp)
484 {
485 	return (devname_setattr_func(vp, vap, flags, cred, NULL, 0));
486 }
487 
488 static int
489 sdev_getsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
490     struct cred *cr, caller_context_t *ct)
491 {
492 	int	error;
493 	struct sdev_node *dv = VTOSDEV(vp);
494 	struct vnode *avp = dv->sdev_attrvp;
495 
496 	if (avp == NULL) {
497 		/* return fs_fab_acl() if flavor matches, else do nothing */
498 		if ((SDEV_ACL_FLAVOR(vp) == _ACL_ACLENT_ENABLED &&
499 		    (vsap->vsa_mask & (VSA_ACLCNT | VSA_DFACLCNT))) ||
500 		    (SDEV_ACL_FLAVOR(vp) == _ACL_ACE_ENABLED &&
501 		    (vsap->vsa_mask & (VSA_ACECNT | VSA_ACE))))
502 			return (fs_fab_acl(vp, vsap, flags, cr, ct));
503 
504 		return (ENOSYS);
505 	}
506 
507 	(void) VOP_RWLOCK(avp, 1, ct);
508 	error = VOP_GETSECATTR(avp, vsap, flags, cr, ct);
509 	VOP_RWUNLOCK(avp, 1, ct);
510 	return (error);
511 }
512 
513 static int
514 sdev_setsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
515     struct cred *cr, caller_context_t *ct)
516 {
517 	int	error;
518 	struct sdev_node *dv = VTOSDEV(vp);
519 	struct vnode *avp = dv->sdev_attrvp;
520 
521 	if (dv->sdev_state == SDEV_ZOMBIE)
522 		return (0);
523 
524 	if (avp == NULL) {
525 		if (SDEV_IS_GLOBAL(dv) && !SDEV_IS_PERSIST(dv))
526 			return (fs_nosys());
527 		ASSERT(dv->sdev_attr);
528 		/*
529 		 * if coming in directly, the acl system call will
530 		 * have held the read-write lock via VOP_RWLOCK()
531 		 * If coming in via specfs, specfs will have
532 		 * held the rw lock on the realvp i.e. us.
533 		 */
534 		ASSERT(RW_WRITE_HELD(&dv->sdev_contents));
535 		sdev_vattr_merge(dv, dv->sdev_attr);
536 		error = sdev_shadow_node(dv, cr);
537 		if (error) {
538 			return (fs_nosys());
539 		}
540 
541 		ASSERT(dv->sdev_attrvp);
542 		/* clean out the memory copy if any */
543 		if (dv->sdev_attr) {
544 			kmem_free(dv->sdev_attr, sizeof (struct vattr));
545 			dv->sdev_attr = NULL;
546 		}
547 		avp = dv->sdev_attrvp;
548 	}
549 	ASSERT(avp);
550 
551 	(void) VOP_RWLOCK(avp, V_WRITELOCK_TRUE, ct);
552 	error = VOP_SETSECATTR(avp, vsap, flags, cr, ct);
553 	VOP_RWUNLOCK(avp, V_WRITELOCK_TRUE, ct);
554 	return (error);
555 }
556 
557 /*
558  * There are two different unlocked routines. This one is not static as it is
559  * used as part of the secpolicy_vnode_setattr calls in sdev_subr.c. Because it
560  * is used in that function it has to have a specific signature.
561  */
562 int
563 sdev_unlocked_access(void *vdv, int mode, struct cred *cr)
564 {
565 	struct sdev_node	*dv = vdv;
566 	int			shift = 0;
567 	uid_t			owner = dv->sdev_attr->va_uid;
568 
569 	if (crgetuid(cr) != owner) {
570 		shift += 3;
571 		if (groupmember(dv->sdev_attr->va_gid, cr) == 0)
572 			shift += 3;
573 	}
574 
575 	return (secpolicy_vnode_access2(cr, SDEVTOV(dv), owner,
576 	    dv->sdev_attr->va_mode << shift, mode));
577 }
578 
579 static int
580 sdev_self_access(sdev_node_t *dv, int mode, int flags, struct cred *cr,
581     caller_context_t *ct)
582 {
583 	int ret;
584 
585 	ASSERT(dv->sdev_attr || dv->sdev_attrvp);
586 	if (dv->sdev_attrvp) {
587 		ret = VOP_ACCESS(dv->sdev_attrvp, mode, flags, cr, ct);
588 	} else if (dv->sdev_attr) {
589 		ret = sdev_unlocked_access(dv, mode, cr);
590 		if (ret)
591 			ret = EACCES;
592 	}
593 
594 	return (ret);
595 }
596 
597 static int
598 sdev_access(struct vnode *vp, int mode, int flags, struct cred *cr,
599     caller_context_t *ct)
600 {
601 	struct sdev_node *dv = VTOSDEV(vp);
602 	int ret;
603 
604 	rw_enter(&dv->sdev_contents, RW_READER);
605 	ret = sdev_self_access(dv, mode, flags, cr, ct);
606 	rw_exit(&dv->sdev_contents);
607 
608 	return (ret);
609 }
610 
611 /*
612  * Lookup
613  */
614 /*ARGSUSED3*/
615 static int
616 sdev_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
617     struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
618     caller_context_t *ct, int *direntflags, pathname_t *realpnp)
619 {
620 	struct sdev_node *parent;
621 	int error;
622 
623 	parent = VTOSDEV(dvp);
624 	ASSERT(parent);
625 
626 	/* execute access is required to search the directory */
627 	if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
628 		return (error);
629 
630 	if (!SDEV_IS_GLOBAL(parent))
631 		return (prof_lookup(dvp, nm, vpp, cred));
632 	return (devname_lookup_func(parent, nm, vpp, cred, NULL, 0));
633 }
634 
635 /*ARGSUSED2*/
636 static int
637 sdev_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
638     int mode, struct vnode **vpp, struct cred *cred, int flag,
639     caller_context_t *ct, vsecattr_t *vsecp)
640 {
641 	struct vnode		*vp = NULL;
642 	struct vnode		*avp;
643 	struct sdev_node	*parent;
644 	struct sdev_node	*self = NULL;
645 	int			error = 0;
646 	vtype_t			type = vap->va_type;
647 
648 	ASSERT(type != VNON && type != VBAD);
649 
650 	if ((type == VFIFO) || (type == VSOCK) ||
651 	    (type == VPROC) || (type == VPORT))
652 		return (ENOTSUP);
653 
654 	parent = VTOSDEV(dvp);
655 	ASSERT(parent);
656 
657 	rw_enter(&parent->sdev_dotdot->sdev_contents, RW_READER);
658 	if (parent->sdev_state == SDEV_ZOMBIE) {
659 		rw_exit(&parent->sdev_dotdot->sdev_contents);
660 		return (ENOENT);
661 	}
662 
663 	/*
664 	 * Nodes cannot be created in NGZ context.
665 	 */
666 	if (!SDEV_IS_GLOBAL(parent)) {
667 		rw_exit(&parent->sdev_dotdot->sdev_contents);
668 		error = prof_lookup(dvp, nm, vpp, cred);
669 
670 		/*
671 		 * In this case, we can't create a vnode but we can
672 		 * open an existing one. However, we still want to
673 		 * enforce the open(2) error semantics as if this was
674 		 * a regular sdev_create() in GZ context. Since we
675 		 * know the vnode already exists (error == 0) we a)
676 		 * return EEXIST if exclusive access was requested, or
677 		 * b) return EISDIR if write access was requested on a
678 		 * directory. Otherwise, we return the value from
679 		 * prof_lookup() as is.
680 		 */
681 		if (error == 0) {
682 			if (excl == EXCL) {
683 				error = EEXIST;
684 			} else if (((*vpp)->v_type == VDIR) &&
685 			    (mode & VWRITE)) {
686 				error = EISDIR;
687 			}
688 
689 			if (error != 0)
690 				VN_RELE(*vpp);
691 		}
692 
693 
694 		return (error);
695 	}
696 	rw_exit(&parent->sdev_dotdot->sdev_contents);
697 
698 	/* execute access is required to search the directory */
699 	if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
700 		return (error);
701 
702 	/* check existing name */
703 /* XXXci - We may need to translate the C-I flags on VOP_LOOKUP */
704 	error = VOP_LOOKUP(dvp, nm, &vp, NULL, 0, NULL, cred, ct, NULL, NULL);
705 
706 	/* name found */
707 	if (error == 0) {
708 		ASSERT(vp);
709 		if (excl == EXCL) {
710 			error = EEXIST;
711 		} else if ((vp->v_type == VDIR) && (mode & VWRITE)) {
712 			/* allowing create/read-only an existing directory */
713 			error = EISDIR;
714 		} else {
715 			error = VOP_ACCESS(vp, mode, 0, cred, ct);
716 		}
717 
718 		if (error) {
719 			VN_RELE(vp);
720 			return (error);
721 		}
722 
723 		/* truncation first */
724 		if ((vp->v_type == VREG) && (vap->va_mask & AT_SIZE) &&
725 		    (vap->va_size == 0)) {
726 			ASSERT(parent->sdev_attrvp);
727 			error = VOP_CREATE(parent->sdev_attrvp,
728 			    nm, vap, excl, mode, &avp, cred, flag, ct, vsecp);
729 
730 			if (error) {
731 				VN_RELE(vp);
732 				return (error);
733 			}
734 		}
735 
736 		sdev_update_timestamps(vp, kcred,
737 		    AT_CTIME|AT_MTIME|AT_ATIME);
738 		*vpp = vp;
739 		return (0);
740 	}
741 
742 	/* bail out early */
743 	if (error != ENOENT)
744 		return (error);
745 
746 	/* verify write access - compliance specifies ENXIO */
747 	if ((error = VOP_ACCESS(dvp, VEXEC|VWRITE, 0, cred, ct)) != 0) {
748 		if (error == EACCES)
749 			error = ENXIO;
750 		return (error);
751 	}
752 
753 	/*
754 	 * For memory-based (ROFS) directory:
755 	 * 	- either disallow node creation;
756 	 *	- or implement VOP_CREATE of its own
757 	 */
758 	rw_enter(&parent->sdev_contents, RW_WRITER);
759 	if (!SDEV_IS_PERSIST(parent)) {
760 		rw_exit(&parent->sdev_contents);
761 		return (ENOTSUP);
762 	}
763 	ASSERT(parent->sdev_attrvp);
764 	error = sdev_mknode(parent, nm, &self, vap, NULL, NULL,
765 	    cred, SDEV_READY);
766 	if (error) {
767 		rw_exit(&parent->sdev_contents);
768 		if (self)
769 			SDEV_RELE(self);
770 		return (error);
771 	}
772 	rw_exit(&parent->sdev_contents);
773 
774 	ASSERT(self);
775 	/* take care the timestamps for the node and its parent */
776 	sdev_update_timestamps(SDEVTOV(self), kcred,
777 	    AT_CTIME|AT_MTIME|AT_ATIME);
778 	sdev_update_timestamps(dvp, kcred, AT_MTIME|AT_ATIME);
779 	if (SDEV_IS_GLOBAL(parent))
780 		atomic_inc_ulong(&parent->sdev_gdir_gen);
781 
782 	/* wake up other threads blocked on looking up this node */
783 	mutex_enter(&self->sdev_lookup_lock);
784 	SDEV_UNBLOCK_OTHERS(self, SDEV_LOOKUP);
785 	mutex_exit(&self->sdev_lookup_lock);
786 	error = sdev_to_vp(self, vpp);
787 	return (error);
788 }
789 
790 static int
791 sdev_remove(struct vnode *dvp, char *nm, struct cred *cred,
792     caller_context_t *ct, int flags)
793 {
794 	int	error;
795 	struct sdev_node *parent = (struct sdev_node *)VTOSDEV(dvp);
796 	struct vnode *vp = NULL;
797 	struct sdev_node *dv = NULL;
798 	int len;
799 	int bkstore;
800 
801 	/* bail out early */
802 	len = strlen(nm);
803 	if (nm[0] == '.') {
804 		if (len == 1) {
805 			return (EINVAL);
806 		} else if (len == 2 && nm[1] == '.') {
807 			return (EEXIST);
808 		}
809 	}
810 
811 	ASSERT(parent);
812 	rw_enter(&parent->sdev_contents, RW_READER);
813 	if (!SDEV_IS_GLOBAL(parent)) {
814 		rw_exit(&parent->sdev_contents);
815 		return (ENOTSUP);
816 	}
817 
818 	/* execute access is required to search the directory */
819 	if ((error = sdev_self_access(parent, VEXEC, 0, cred, ct)) != 0) {
820 		rw_exit(&parent->sdev_contents);
821 		return (error);
822 	}
823 
824 	/* check existence first */
825 	dv = sdev_cache_lookup(parent, nm);
826 	if (dv == NULL) {
827 		rw_exit(&parent->sdev_contents);
828 		return (ENOENT);
829 	}
830 
831 	vp = SDEVTOV(dv);
832 	if ((dv->sdev_state == SDEV_INIT) ||
833 	    (dv->sdev_state == SDEV_ZOMBIE)) {
834 		rw_exit(&parent->sdev_contents);
835 		VN_RELE(vp);
836 		return (ENOENT);
837 	}
838 
839 	/* write access is required to remove an entry */
840 	if ((error = sdev_self_access(parent, VWRITE, 0, cred, ct)) != 0) {
841 		rw_exit(&parent->sdev_contents);
842 		VN_RELE(vp);
843 		return (error);
844 	}
845 
846 	bkstore = SDEV_IS_PERSIST(dv) ? 1 : 0;
847 	if (!rw_tryupgrade(&parent->sdev_contents)) {
848 		rw_exit(&parent->sdev_contents);
849 		rw_enter(&parent->sdev_contents, RW_WRITER);
850 		/* Make sure we didn't become a zombie */
851 		if (parent->sdev_state == SDEV_ZOMBIE) {
852 			rw_exit(&parent->sdev_contents);
853 			VN_RELE(vp);
854 			return (ENOENT);
855 		}
856 	}
857 
858 	/* we do not support unlinking a non-empty directory */
859 	if (vp->v_type == VDIR && dv->sdev_nlink > 2) {
860 		rw_exit(&parent->sdev_contents);
861 		VN_RELE(vp);
862 		return (EBUSY);
863 	}
864 
865 	/*
866 	 * sdev_dirdelete does the real job of:
867 	 *  - make sure no open ref count
868 	 *  - destroying the sdev_node
869 	 *  - releasing the hold on attrvp
870 	 */
871 	sdev_cache_update(parent, &dv, nm, SDEV_CACHE_DELETE);
872 	VN_RELE(vp);
873 	rw_exit(&parent->sdev_contents);
874 
875 	/*
876 	 * best efforts clean up the backing store
877 	 */
878 	if (bkstore) {
879 		ASSERT(parent->sdev_attrvp);
880 		error = VOP_REMOVE(parent->sdev_attrvp, nm, cred,
881 		    ct, flags);
882 		/*
883 		 * do not report BUSY error
884 		 * because the backing store ref count is released
885 		 * when the last ref count on the sdev_node is
886 		 * released.
887 		 */
888 		if (error == EBUSY) {
889 			sdcmn_err2(("sdev_remove: device %s is still on"
890 			    "disk %s\n", nm, parent->sdev_path));
891 			error = 0;
892 		}
893 	}
894 
895 	return (error);
896 }
897 
898 /*
899  * Some restrictions for this file system:
900  *  - both oldnm and newnm are in the scope of /dev file system,
901  *    to simplify the namespace management model.
 *
 * sdev_rename() renames the entry onm in directory odvp to nnm in directory
 * ndvp.  The sequence is: validate both parents, look up the source and the
 * (optional) destination while holding sdev_lock, check that both
 * directories report the same fsid and that the caller has the needed
 * access, link the node under its new name with sdev_rnmnode(), and finally
 * remove the old name from the source directory's cache and, for a
 * persistent node, from the backing store.
 *
 *	odvp, onm - source directory vnode and entry name
 *	ndvp, nnm - destination directory vnode and entry name
 *	cred, ct  - caller credentials and context, handed to the lookup,
 *		    getattr and access calls (the backing store cleanup at
 *		    the end uses kcred instead)
 *	flags     - not referenced in this function
 *
 * Returns 0 on success, otherwise an errno value:
 *	EINVAL  - onm or nnm is "." or ".."
 *	ENOENT  - either parent directory is in the SDEV_ZOMBIE state
 *	ENOTSUP - the source parent is not a global node
 *	EXDEV   - the two directories report different AT_FSID values
 *	EACCES  - the source or an existing destination is SDEV_DYNAMIC
 * or whatever the lookup, getattr, access or sdev_rnmnode() calls return.
 * Once sdev_rnmnode() has succeeded the function returns 0 on every
 * remaining path, even if removing the old name fails.
902  */
903 /*ARGSUSED6*/
904 static int
905 sdev_rename(struct vnode *odvp, char *onm, struct vnode *ndvp, char *nnm,
906     struct cred *cred, caller_context_t *ct, int flags)
907 {
908 	struct sdev_node	*fromparent = NULL;
909 	struct vattr		vattr;
910 	struct sdev_node	*toparent;
911 	struct sdev_node	*fromdv = NULL;	/* source node */
912 	struct vnode 		*ovp = NULL;	/* source vnode */
913 	struct sdev_node	*todv = NULL;	/* destination node */
914 	struct vnode 		*nvp = NULL;	/* destination vnode */
915 	int			samedir = 0;	/* set if odvp == ndvp */
916 	struct vnode		*realvp;
917 	int error = 0;
918 	dev_t fsid;
919 	int bkstore = 0;
920 	vtype_t type;
921 
922 	/* prevent modifying "." and ".." */
923 	if ((onm[0] == '.' &&
924 	    (onm[1] == '\0' || (onm[1] == '.' && onm[2] == '\0'))) ||
925 	    (nnm[0] == '.' &&
926 	    (nnm[1] == '\0' || (nnm[1] == '.' && nnm[2] == '\0')))) {
927 		return (EINVAL);
928 	}
929 
930 	fromparent = VTOSDEV(odvp);
931 	toparent = VTOSDEV(ndvp);
932 
933 	/* ZOMBIE parent doesn't allow new node creation */
	/* the state is read under the sdev_contents lock of the parent's ".." */
934 	rw_enter(&fromparent->sdev_dotdot->sdev_contents, RW_READER);
935 	if (fromparent->sdev_state == SDEV_ZOMBIE) {
936 		rw_exit(&fromparent->sdev_dotdot->sdev_contents);
937 		return (ENOENT);
938 	}
939 
940 	/* renaming only supported for global device nodes */
941 	if (!SDEV_IS_GLOBAL(fromparent)) {
942 		rw_exit(&fromparent->sdev_dotdot->sdev_contents);
943 		return (ENOTSUP);
944 	}
945 	rw_exit(&fromparent->sdev_dotdot->sdev_contents);
946 
	/* same ZOMBIE check for the destination directory */
947 	rw_enter(&toparent->sdev_dotdot->sdev_contents, RW_READER);
948 	if (toparent->sdev_state == SDEV_ZOMBIE) {
949 		rw_exit(&toparent->sdev_dotdot->sdev_contents);
950 		return (ENOENT);
951 	}
952 	rw_exit(&toparent->sdev_dotdot->sdev_contents);
953 
954 	/*
955 	 * acquire the global lock to prevent
956 	 * mount/unmount/other rename activities.
	 * From here on every return path must drop sdev_lock.
957 	 */
958 	mutex_enter(&sdev_lock);
959 
960 	/* check existence of the source node */
961 /* XXXci - We may need to translate the C-I flags on VOP_LOOKUP */
962 	error = VOP_LOOKUP(odvp, onm, &ovp, NULL, 0, NULL, cred, ct,
963 	    NULL, NULL);
964 	if (error) {
		/*
		 * NOTE(review): this branch runs when the lookup FAILED, yet
		 * the debug text below says the source "exists" -- the
		 * message looks inverted; confirm and reword.
		 */
965 		sdcmn_err2(("sdev_rename: the source node %s exists\n",
966 		    onm));
967 		mutex_exit(&sdev_lock);
968 		return (error);
969 	}
970 
	/*
	 * If ovp reports a real vnode, trade our hold on ovp for a hold on
	 * that real vnode and continue with it.
	 */
971 	if (VOP_REALVP(ovp, &realvp, ct) == 0) {
972 		VN_HOLD(realvp);
973 		VN_RELE(ovp);
974 		ovp = realvp;
975 	}
976 
977 	/* check existence of destination */
978 /* XXXci - We may need to translate the C-I flags on VOP_LOOKUP */
979 	error = VOP_LOOKUP(ndvp, nnm, &nvp, NULL, 0, NULL, cred, ct,
980 	    NULL, NULL);
	/* a missing destination (ENOENT) is not an error for rename */
981 	if (error && (error != ENOENT)) {
982 		mutex_exit(&sdev_lock);
983 		VN_RELE(ovp);
984 		return (error);
985 	}
986 
	/* same real-vnode substitution for an existing destination */
987 	if (nvp && (VOP_REALVP(nvp, &realvp, ct) == 0)) {
988 		VN_HOLD(realvp);
989 		VN_RELE(nvp);
990 		nvp = realvp;
991 	}
992 
993 	/*
994 	 * make sure the source and the destination are
995 	 * in the same dev filesystem
	 * (compared by the AT_FSID attribute of the two directories)
996 	 */
997 	if (odvp != ndvp) {
998 		vattr.va_mask = AT_FSID;
		/* assignment inside the condition is intentional */
999 		if (error = VOP_GETATTR(odvp, &vattr, 0, cred, ct)) {
1000 			mutex_exit(&sdev_lock);
1001 			VN_RELE(ovp);
1002 			if (nvp != NULL)
1003 				VN_RELE(nvp);
1004 			return (error);
1005 		}
1006 		fsid = vattr.va_fsid;
1007 		vattr.va_mask = AT_FSID;
		/* assignment inside the condition is intentional */
1008 		if (error = VOP_GETATTR(ndvp, &vattr, 0, cred, ct)) {
1009 			mutex_exit(&sdev_lock);
1010 			VN_RELE(ovp);
1011 			if (nvp != NULL)
1012 				VN_RELE(nvp);
1013 			return (error);
1014 		}
1015 		if (fsid != vattr.va_fsid) {
1016 			mutex_exit(&sdev_lock);
1017 			VN_RELE(ovp);
1018 			if (nvp != NULL)
1019 				VN_RELE(nvp);
1020 			return (EXDEV);
1021 		}
1022 	}
1023 
1024 	/* make sure the old entry can be deleted */
1025 	error = VOP_ACCESS(odvp, VWRITE, 0, cred, ct);
1026 	if (error) {
1027 		mutex_exit(&sdev_lock);
1028 		VN_RELE(ovp);
1029 		if (nvp != NULL)
1030 			VN_RELE(nvp);
1031 		return (error);
1032 	}
1033 
1034 	/* make sure the destination allows creation */
	/* (skipped when both names live in the same directory) */
1035 	samedir = (fromparent == toparent);
1036 	if (!samedir) {
1037 		error = VOP_ACCESS(ndvp, VEXEC|VWRITE, 0, cred, ct);
1038 		if (error) {
1039 			mutex_exit(&sdev_lock);
1040 			VN_RELE(ovp);
1041 			if (nvp != NULL)
1042 				VN_RELE(nvp);
1043 			return (error);
1044 		}
1045 	}
1046 
1047 	fromdv = VTOSDEV(ovp);
1048 	ASSERT(fromdv);
1049 
1050 	/* destination file exists */
1051 	if (nvp != NULL) {
1052 		todv = VTOSDEV(nvp);
1053 		ASSERT(todv);
1054 	}
1055 
	/* nodes flagged SDEV_DYNAMIC may be neither renamed nor replaced */
1056 	if ((fromdv->sdev_flags & SDEV_DYNAMIC) != 0 ||
1057 	    (todv != NULL && (todv->sdev_flags & SDEV_DYNAMIC) != 0)) {
1058 		mutex_exit(&sdev_lock);
1059 		if (nvp != NULL)
1060 			VN_RELE(nvp);
1061 		VN_RELE(ovp);
1062 		return (EACCES);
1063 	}
1064 
1065 	/*
1066 	 * link source to new target in the memory. Regardless of failure, we
1067 	 * must rele our hold on nvp.
1068 	 */
1069 	error = sdev_rnmnode(fromparent, fromdv, toparent, &todv, nnm, cred);
1070 	if (nvp != NULL)
1071 		VN_RELE(nvp);
1072 	if (error) {
1073 		sdcmn_err2(("sdev_rename: renaming %s to %s failed "
1074 		    " with error %d\n", onm, nnm, error));
1075 		mutex_exit(&sdev_lock);
1076 		VN_RELE(ovp);
1077 		return (error);
1078 	}
1079 
1080 	/*
1081 	 * unlink from source
	 * The old name is looked up again in the parent's cache; if it is
	 * already gone or being deleted the rename is still reported as
	 * successful.
1082 	 */
1083 	rw_enter(&fromparent->sdev_contents, RW_READER);
1084 	fromdv = sdev_cache_lookup(fromparent, onm);
1085 	if (fromdv == NULL) {
1086 		rw_exit(&fromparent->sdev_contents);
1087 		mutex_exit(&sdev_lock);
1088 		VN_RELE(ovp);
1089 		sdcmn_err2(("sdev_rename: the source is deleted already\n"));
1090 		return (0);
1091 	}
1092 
1093 	if (fromdv->sdev_state == SDEV_ZOMBIE) {
1094 		rw_exit(&fromparent->sdev_contents);
1095 		mutex_exit(&sdev_lock);
		/* presumably drops the hold returned by sdev_cache_lookup() */
1096 		VN_RELE(SDEVTOV(fromdv));
1097 		VN_RELE(ovp);
1098 		sdcmn_err2(("sdev_rename: the source is being deleted\n"));
1099 		return (0);
1100 	}
1101 	rw_exit(&fromparent->sdev_contents);
1102 	ASSERT(SDEVTOV(fromdv) == ovp);
	/* release the hold from the initial lookup of the source */
1103 	VN_RELE(ovp);
1104 
1105 	/* clean out the directory contents before it can be removed */
1106 	type = SDEVTOV(fromdv)->v_type;
1107 	if (type == VDIR) {
1108 		error = sdev_cleandir(fromdv, NULL, 0);
1109 		sdcmn_err2(("sdev_rename: cleandir finished with %d\n",
1110 		    error));
		/*
		 * EBUSY is cleared explicitly; any other value is not
		 * reported either, since the only return below is return (0).
		 */
1111 		if (error == EBUSY)
1112 			error = 0;
1113 	}
1114 
1115 	rw_enter(&fromparent->sdev_contents, RW_WRITER);
	/* sample the persist flag before the node is deleted from the cache */
1116 	bkstore = SDEV_IS_PERSIST(fromdv) ? 1 : 0;
1117 	sdev_cache_update(fromparent, &fromdv, onm,
1118 	    SDEV_CACHE_DELETE);
1119 	VN_RELE(SDEVTOV(fromdv));
1120 
1121 	/* best-effort cleanup of the backing store (done with kcred) */
1122 	if (bkstore) {
1123 		ASSERT(fromparent->sdev_attrvp);
1124 		if (type != VDIR) {
1125 /* XXXci - We may need to translate the C-I flags on VOP_REMOVE */
1126 			error = VOP_REMOVE(fromparent->sdev_attrvp,
1127 			    onm, kcred, ct, 0);
1128 		} else {
1129 /* XXXci - We may need to translate the C-I flags on VOP_RMDIR */
1130 			error = VOP_RMDIR(fromparent->sdev_attrvp,
1131 			    onm, fromparent->sdev_attrvp, kcred, ct, 0);
1132 		}
1133 
		/* a failure here is only logged, never returned */
1134 		if (error) {
1135 			sdcmn_err2(("sdev_rename: device %s is "
1136 			    "still on disk %s\n", onm,
1137 			    fromparent->sdev_path));
1138 			error = 0;
1139 		}
1140 	}
1141 	rw_exit(&fromparent->sdev_contents);
1142 	mutex_exit(&sdev_lock);
1143 
1144 	/* once this point is reached, the rename is regarded as successful */
1145 	return (0);
1146 }
1147 
1148 /*
1149  * dev-fs version of "ln -s path dev-name"
1150  *	tnm - path, e.g. /devices/... or /dev/...
1151  *	lnm - dev_name
 *	dvp - directory in which the link lnm is created
 *	tva - attributes handed to sdev_mknode() for the new node
 *	cred, ct - caller credentials and context for the access/lookup calls
 *	flags - not referenced in this function
 *
 * Returns 0 on success, otherwise an errno value:
 *	ENOENT  - the parent directory is in the SDEV_ZOMBIE state
 *	ENOTSUP - the parent is not a global node
 *	EEXIST  - lnm already exists in dvp
 * or the error from the VOP_ACCESS(), VOP_LOOKUP() or sdev_mknode() calls.
 * The new node is released before returning; no vnode is handed back.
1152  */
1153 /*ARGSUSED6*/
1154 static int
1155 sdev_symlink(struct vnode *dvp, char *lnm, struct vattr *tva,
1156     char *tnm, struct cred *cred, caller_context_t *ct, int flags)
1157 {
1158 	int error;
1159 	struct vnode *vp = NULL;
1160 	struct sdev_node *parent = (struct sdev_node *)VTOSDEV(dvp);
1161 	struct sdev_node *self = (struct sdev_node *)NULL;
1162 
1163 	ASSERT(parent);
	/* parent state is checked under the contents lock of its ".." node */
1164 	rw_enter(&parent->sdev_dotdot->sdev_contents, RW_READER);
1165 	if (parent->sdev_state == SDEV_ZOMBIE) {
1166 		rw_exit(&parent->sdev_dotdot->sdev_contents);
1167 		sdcmn_err2(("sdev_symlink: parent %s is ZOMBIED \n",
1168 		    parent->sdev_name));
1169 		return (ENOENT);
1170 	}
1171 
	/* symlink creation is refused for non-global nodes */
1172 	if (!SDEV_IS_GLOBAL(parent)) {
1173 		rw_exit(&parent->sdev_dotdot->sdev_contents);
1174 		return (ENOTSUP);
1175 	}
1176 	rw_exit(&parent->sdev_dotdot->sdev_contents);
1177 
1178 	/* execute access is required to search a directory */
1179 	if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
1180 		return (error);
1181 
1182 	/* find existing name */
1183 /* XXXci - We may need to translate the C-I flags here */
1184 	error = VOP_LOOKUP(dvp, lnm, &vp, NULL, 0, NULL, cred, ct, NULL, NULL);
1185 	if (error == 0) {
1186 		ASSERT(vp);
1187 		VN_RELE(vp);
1188 		sdcmn_err2(("sdev_symlink: node %s already exists\n", lnm));
1189 		return (EEXIST);
1190 	}
	/* only "not found" lets the creation proceed */
1191 	if (error != ENOENT)
1192 		return (error);
1193 
1194 	/* write access is required to create a symlink */
1195 	if ((error = VOP_ACCESS(dvp, VWRITE, 0, cred, ct)) != 0)
1196 		return (error);
1197 
1198 	/* put it into memory cache */
	/* tnm (the link target) is passed through to sdev_mknode() */
1199 	rw_enter(&parent->sdev_contents, RW_WRITER);
1200 	error = sdev_mknode(parent, lnm, &self, tva, NULL, (void *)tnm,
1201 	    cred, SDEV_READY);
1202 	if (error) {
1203 		rw_exit(&parent->sdev_contents);
1204 		sdcmn_err2(("sdev_symlink: node %s creation failed\n", lnm));
		/* sdev_mknode() may have set self even though it failed */
1205 		if (self)
1206 			SDEV_RELE(self);
1207 
1208 		return (error);
1209 	}
1210 	ASSERT(self && (self->sdev_state == SDEV_READY));
1211 	rw_exit(&parent->sdev_contents);
1212 
1213 	/* take care of the timestamps for the node and its parent */
1214 	sdev_update_timestamps(SDEVTOV(self), kcred,
1215 	    AT_CTIME|AT_MTIME|AT_ATIME);
1216 	sdev_update_timestamps(dvp, kcred, AT_MTIME|AT_ATIME);
	/* bump the directory's sdev_gdir_gen counter for global parents */
1217 	if (SDEV_IS_GLOBAL(parent))
1218 		atomic_inc_ulong(&parent->sdev_gdir_gen);
1219 
1220 	/* wake up other threads blocked on looking up this node */
1221 	mutex_enter(&self->sdev_lookup_lock);
1222 	SDEV_UNBLOCK_OTHERS(self, SDEV_LOOKUP);
1223 	mutex_exit(&self->sdev_lookup_lock);
1224 	SDEV_RELE(self);	/* don't return with vnode held */
1225 	return (0);
1226 }
1227 
/*
 * Create the directory nm under dvp and return its vnode through vpp.
 *
 *	dvp   - parent directory vnode
 *	nm    - name of the directory to create
 *	va    - attributes handed to sdev_mknode() for the new node
 *	vpp   - out: vnode of the new directory
 *	cred, ct - caller credentials and context for the access/lookup calls
 *	flags, vsecp - not referenced in this function
 *
 * For a non-global parent no directory is created here; the result of
 * prof_lookup(dvp, nm, vpp, cred) is returned instead.
 *
 * Returns 0 on success, otherwise an errno value:
 *	ENOENT - the parent directory is in the SDEV_ZOMBIE state
 *	EEXIST - nm already exists in dvp
 * or the error from the VOP_ACCESS(), VOP_LOOKUP() or sdev_mknode() calls.
 * On success the new node is not released before returning, so the caller
 * presumably receives *vpp held.
 */
1228 /*ARGSUSED6*/
1229 static int
1230 sdev_mkdir(struct vnode *dvp, char *nm, struct vattr *va, struct vnode **vpp,
1231     struct cred *cred, caller_context_t *ct, int flags, vsecattr_t *vsecp)
1232 {
1233 	int error;
1234 	struct sdev_node *parent = (struct sdev_node *)VTOSDEV(dvp);
1235 	struct sdev_node *self = NULL;
1236 	struct vnode	*vp = NULL;
1237 
1238 	ASSERT(parent && parent->sdev_dotdot);
	/* parent state is checked under the contents lock of its ".." node */
1239 	rw_enter(&parent->sdev_dotdot->sdev_contents, RW_READER);
1240 	if (parent->sdev_state == SDEV_ZOMBIE) {
1241 		rw_exit(&parent->sdev_dotdot->sdev_contents);
1242 		return (ENOENT);
1243 	}
1244 
1245 	/* non-global do not allow pure directory creation */
1246 	if (!SDEV_IS_GLOBAL(parent)) {
1247 		rw_exit(&parent->sdev_dotdot->sdev_contents);
1248 		return (prof_lookup(dvp, nm, vpp, cred));
1249 	}
1250 	rw_exit(&parent->sdev_dotdot->sdev_contents);
1251 
1252 	/* execute access is required to search the directory */
1253 	if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0) {
1254 		return (error);
1255 	}
1256 
1257 	/* find existing name */
1258 /* XXXci - We may need to translate the C-I flags on VOP_LOOKUP */
1259 	error = VOP_LOOKUP(dvp, nm, &vp, NULL, 0, NULL, cred, ct, NULL, NULL);
1260 	if (error == 0) {
1261 		VN_RELE(vp);
1262 		return (EEXIST);
1263 	}
	/* only "not found" lets the creation proceed */
1264 	if (error != ENOENT)
1265 		return (error);
1266 
1267 	/* require write access to create a directory */
1268 	if ((error = VOP_ACCESS(dvp, VWRITE, 0, cred, ct)) != 0) {
1269 		return (error);
1270 	}
1271 
1272 	/* put it into memory */
1273 	rw_enter(&parent->sdev_contents, RW_WRITER);
1274 	error = sdev_mknode(parent, nm, &self,
1275 	    va, NULL, NULL, cred, SDEV_READY);
1276 	if (error) {
1277 		rw_exit(&parent->sdev_contents);
		/* sdev_mknode() may have set self even though it failed */
1278 		if (self)
1279 			SDEV_RELE(self);
1280 		return (error);
1281 	}
1282 	ASSERT(self && (self->sdev_state == SDEV_READY));
1283 	rw_exit(&parent->sdev_contents);
1284 
1285 	/* take care of the timestamps for the node and its parent */
1286 	sdev_update_timestamps(SDEVTOV(self), kcred,
1287 	    AT_CTIME|AT_MTIME|AT_ATIME);
1288 	sdev_update_timestamps(dvp, kcred, AT_MTIME|AT_ATIME);
	/* bump the directory's sdev_gdir_gen counter for global parents */
1289 	if (SDEV_IS_GLOBAL(parent))
1290 		atomic_inc_ulong(&parent->sdev_gdir_gen);
1291 
1292 	/* wake up other threads blocked on looking up this node */
1293 	mutex_enter(&self->sdev_lookup_lock);
1294 	SDEV_UNBLOCK_OTHERS(self, SDEV_LOOKUP);
1295 	mutex_exit(&self->sdev_lookup_lock);
	/* hand the new directory's vnode back; no SDEV_RELE() on this path */
1296 	*vpp = SDEVTOV(self);
1297 	return (0);
1298 }
1299 
1300 /*
1301  * Remove the empty directory nm from the /dev directory dvp.
 *
 *	dvp   - parent directory vnode
 *	nm    - name of the directory to remove
 *	cdir  - vnode that must not be the removal target (EINVAL if it is)
 *	cred, ct - caller credentials and context
 *	flags - passed to VOP_RMDIR() on the backing store
 *
 * Returns 0 on success, otherwise an errno value:
 *	EINVAL    - nm is ".", or the target is dvp or cdir
 *	EEXIST    - nm is ".."
 *	ENOTSUP   - the parent is not a global node
 *	ENOENT    - nm is not cached, or is in SDEV_INIT/SDEV_ZOMBIE state
 *	ENOTDIR   - nm is not a directory
 *	EBUSY     - vn_vfswlock() failed or a file system is mounted on nm
 *	ENOTEMPTY - the directory's link count is above 2
 * or the error from VOP_ACCESS().  An error from the backing store
 * VOP_RMDIR() is returned as well, except EBUSY, which is turned into 0.
1302  */
1303 /*ARGSUSED*/
1304 static int
1305 sdev_rmdir(struct vnode *dvp, char *nm, struct vnode *cdir, struct cred *cred,
1306     caller_context_t *ct, int flags)
1307 {
1308 	int error = 0;
1309 	struct sdev_node *parent = (struct sdev_node *)VTOSDEV(dvp);
1310 	struct sdev_node *self = NULL;
1311 	struct vnode *vp = NULL;
1312 
1313 	/* bail out early */
1314 	if (strcmp(nm, ".") == 0)
1315 		return (EINVAL);
1316 	if (strcmp(nm, "..") == 0)
1317 		return (EEXIST); /* should be ENOTEMPTY */
1318 
1319 	/* no destruction of non-global node */
1320 	ASSERT(parent && parent->sdev_dotdot);
1321 	rw_enter(&parent->sdev_dotdot->sdev_contents, RW_READER);
1322 	if (!SDEV_IS_GLOBAL(parent)) {
1323 		rw_exit(&parent->sdev_dotdot->sdev_contents);
1324 		return (ENOTSUP);
1325 	}
1326 	rw_exit(&parent->sdev_dotdot->sdev_contents);
1327 
1328 	/* search (execute) and write access to the parent are both required */
1329 	if ((error = VOP_ACCESS(dvp, VEXEC|VWRITE, 0, cred, ct)) != 0)
1330 		return (error);
1331 
1332 	/* check existing name */
	/* the parent stays write-locked until the cache entry is deleted */
1333 	rw_enter(&parent->sdev_contents, RW_WRITER);
1334 	self = sdev_cache_lookup(parent, nm);
1335 	if (self == NULL) {
1336 		rw_exit(&parent->sdev_contents);
1337 		return (ENOENT);
1338 	}
1339 
	/* every exit below releases vp; presumably the lookup returned it held */
1340 	vp = SDEVTOV(self);
1341 	if ((self->sdev_state == SDEV_INIT) ||
1342 	    (self->sdev_state == SDEV_ZOMBIE)) {
1343 		rw_exit(&parent->sdev_contents);
1344 		VN_RELE(vp);
1345 		return (ENOENT);
1346 	}
1347 
1348 	/* some sanity checks */
1349 	if (vp == dvp || vp == cdir) {
1350 		rw_exit(&parent->sdev_contents);
1351 		VN_RELE(vp);
1352 		return (EINVAL);
1353 	}
1354 
1355 	if (vp->v_type != VDIR) {
1356 		rw_exit(&parent->sdev_contents);
1357 		VN_RELE(vp);
1358 		return (ENOTDIR);
1359 	}
1360 
	/* a non-zero return from vn_vfswlock() is reported as EBUSY */
1361 	if (vn_vfswlock(vp)) {
1362 		rw_exit(&parent->sdev_contents);
1363 		VN_RELE(vp);
1364 		return (EBUSY);
1365 	}
1366 
	/* refuse to remove a directory that has a file system mounted on it */
1367 	if (vn_mountedvfs(vp) != NULL) {
1368 		rw_exit(&parent->sdev_contents);
1369 		vn_vfsunlock(vp);
1370 		VN_RELE(vp);
1371 		return (EBUSY);
1372 	}
1373 
	/* NOTE(review): vp was derived from self above, so this looks redundant */
1374 	self = VTOSDEV(vp);
1375 	/* bail out on a non-empty directory */
1376 	rw_enter(&self->sdev_contents, RW_READER);
1377 	if (self->sdev_nlink > 2) {
1378 		rw_exit(&self->sdev_contents);
1379 		rw_exit(&parent->sdev_contents);
1380 		vn_vfsunlock(vp);
1381 		VN_RELE(vp);
1382 		return (ENOTEMPTY);
1383 	}
1384 	rw_exit(&self->sdev_contents);
1385 
1386 	/* unlink it from the directory cache */
1387 	sdev_cache_update(parent, &self, nm, SDEV_CACHE_DELETE);
1388 	rw_exit(&parent->sdev_contents);
1389 	vn_vfsunlock(vp);
1390 	VN_RELE(vp);
1391 
1392 	/* best effort to clean up the backing store */
	/* (keyed on the parent's persist flag; performed with kcred) */
1393 	if (SDEV_IS_PERSIST(parent)) {
1394 		ASSERT(parent->sdev_attrvp);
1395 		error = VOP_RMDIR(parent->sdev_attrvp, nm,
1396 		    parent->sdev_attrvp, kcred, ct, flags);
1397 
1398 		if (error)
1399 			sdcmn_err2(("sdev_rmdir: cleaning device %s is on"
1400 			    " disk error %d\n", parent->sdev_path, error));
		/* only EBUSY is suppressed; other errors reach the caller */
1401 		if (error == EBUSY)
1402 			error = 0;
1403 
1404 	}
1405 
1406 	return (error);
1407 }
1408 
1409 /*
1410  * read the contents of a symbolic link
1411  */
1412 static int
1413 sdev_readlink(struct vnode *vp, struct uio *uiop, struct cred *cred,
1414     caller_context_t *ct)
1415 {
1416 	struct sdev_node *dv;
1417 	int	error = 0;
1418 
1419 	ASSERT(vp->v_type == VLNK);
1420 
1421 	dv = VTOSDEV(vp);
1422 
1423 	if (dv->sdev_attrvp) {
1424 		/* non-NULL attrvp implys a persisted node at READY state */
1425 		return (VOP_READLINK(dv->sdev_attrvp, uiop, cred, ct));
1426 	} else if (dv->sdev_symlink != NULL) {
1427 		/* memory nodes, e.g. local nodes */
1428 		rw_enter(&dv->sdev_contents, RW_READER);
1429 		sdcmn_err2(("sdev_readlink link is %s\n", dv->sdev_symlink));
1430 		error = uiomove(dv->sdev_symlink, strlen(dv->sdev_symlink),
1431 		    UIO_READ, uiop);
1432 		rw_exit(&dv->sdev_contents);
1433 		return (error);
1434 	}
1435 
1436 	return (ENOENT);
1437 }
1438 
1439 /*ARGSUSED4*/
1440 static int
1441 sdev_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, int *eofp,
1442     caller_context_t *ct, int flags)
1443 {
1444 	struct sdev_node *parent = VTOSDEV(dvp);
1445 	int error;
1446 
1447 	/*
1448 	 * We must check that we have execute access to search the directory --
1449 	 * but because our sdev_contents lock is already held as a reader (the
1450 	 * caller must have done a VOP_RWLOCK()), we call directly into the
1451 	 * underlying access routine if sdev_attr is non-NULL.
1452 	 */
1453 	if (parent->sdev_attr != NULL) {
1454 		VERIFY(RW_READ_HELD(&parent->sdev_contents));
1455 
1456 		if (sdev_unlocked_access(parent, VEXEC, cred) != 0)
1457 			return (EACCES);
1458 	} else {
1459 		if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
1460 			return (error);
1461 	}
1462 
1463 	ASSERT(parent);
1464 	if (!SDEV_IS_GLOBAL(parent))
1465 		prof_filldir(parent);
1466 	return (devname_readdir_func(dvp, uiop, cred, eofp, SDEV_BROWSE));
1467 }
1468 
1469 /*ARGSUSED1*/
1470 static void
1471 sdev_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)
1472 {
1473 	devname_inactive_func(vp, cred, NULL);
1474 }
1475 
1476 /*ARGSUSED2*/
1477 static int
1478 sdev_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
1479 {
1480 	struct sdev_node	*dv = VTOSDEV(vp);
1481 	struct sdev_fid	*sdev_fid;
1482 
1483 	if (fidp->fid_len < (sizeof (struct sdev_fid) - sizeof (ushort_t))) {
1484 		fidp->fid_len = sizeof (struct sdev_fid) - sizeof (ushort_t);
1485 		return (ENOSPC);
1486 	}
1487 
1488 	sdev_fid = (struct sdev_fid *)fidp;
1489 	bzero(sdev_fid, sizeof (struct sdev_fid));
1490 	sdev_fid->sdevfid_len =
1491 	    (int)sizeof (struct sdev_fid) - sizeof (ushort_t);
1492 	sdev_fid->sdevfid_ino = dv->sdev_ino;
1493 
1494 	return (0);
1495 }
1496 
1497 /*
1498  * This pair of routines bracket all VOP_READ, VOP_WRITE
1499  * and VOP_READDIR requests.  The contents lock stops things
1500  * moving around while we're looking at them.
1501  */
1502 /*ARGSUSED2*/
1503 static int
1504 sdev_rwlock(struct vnode *vp, int write_flag, caller_context_t *ctp)
1505 {
1506 	rw_enter(&VTOSDEV(vp)->sdev_contents,
1507 	    write_flag ? RW_WRITER : RW_READER);
1508 	return (write_flag ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE);
1509 }
1510 
1511 /*ARGSUSED1*/
1512 static void
1513 sdev_rwunlock(struct vnode *vp, int write_flag, caller_context_t *ctp)
1514 {
1515 	rw_exit(&VTOSDEV(vp)->sdev_contents);
1516 }
1517 
1518 /*ARGSUSED1*/
1519 static int
1520 sdev_seek(struct vnode *vp, offset_t ooff, offset_t *noffp,
1521     caller_context_t *ct)
1522 {
1523 	struct vnode *attrvp = VTOSDEV(vp)->sdev_attrvp;
1524 
1525 	ASSERT(vp->v_type != VCHR &&
1526 	    vp->v_type != VBLK && vp->v_type != VLNK);
1527 
1528 	if (vp->v_type == VDIR)
1529 		return (fs_seek(vp, ooff, noffp, ct));
1530 
1531 	ASSERT(attrvp);
1532 	return (VOP_SEEK(attrvp, ooff, noffp, ct));
1533 }
1534 
1535 /*ARGSUSED1*/
1536 static int
1537 sdev_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
1538     offset_t offset, struct flk_callback *flk_cbp, struct cred *cr,
1539     caller_context_t *ct)
1540 {
1541 	int error;
1542 	struct sdev_node *dv = VTOSDEV(vp);
1543 
1544 	ASSERT(dv);
1545 	ASSERT(dv->sdev_attrvp);
1546 	error = VOP_FRLOCK(dv->sdev_attrvp, cmd, bfp, flag, offset,
1547 	    flk_cbp, cr, ct);
1548 
1549 	return (error);
1550 }
1551 
1552 static int
1553 sdev_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
1554     caller_context_t *ct)
1555 {
1556 	switch (cmd) {
1557 	case _PC_ACL_ENABLED:
1558 		*valp = SDEV_ACL_FLAVOR(vp);
1559 		return (0);
1560 	}
1561 
1562 	return (fs_pathconf(vp, cmd, valp, cr, ct));
1563 }
1564 
/*
 * Vnode operations vector for sdev.  It is only declared here.
 * NOTE(review): presumably built from sdev_vnodeops_tbl when the file
 * system is initialized -- that code is not in this part of the file.
 */
1565 vnodeops_t *sdev_vnodeops;
1566 
/*
 * Operation table for sdev: each entry pairs a VOPNAME_* operation name
 * with the sdev_* routine that implements it.  The table ends with a
 * NULL, NULL entry.
 */
1567 const fs_operation_def_t sdev_vnodeops_tbl[] = {
1568 	VOPNAME_OPEN,		{ .vop_open = sdev_open },
1569 	VOPNAME_CLOSE,		{ .vop_close = sdev_close },
1570 	VOPNAME_READ,		{ .vop_read = sdev_read },
1571 	VOPNAME_WRITE,		{ .vop_write = sdev_write },
1572 	VOPNAME_IOCTL,		{ .vop_ioctl = sdev_ioctl },
1573 	VOPNAME_GETATTR,	{ .vop_getattr = sdev_getattr },
1574 	VOPNAME_SETATTR,	{ .vop_setattr = sdev_setattr },
1575 	VOPNAME_ACCESS,		{ .vop_access = sdev_access },
1576 	VOPNAME_LOOKUP,		{ .vop_lookup = sdev_lookup },
1577 	VOPNAME_CREATE,		{ .vop_create = sdev_create },
1578 	VOPNAME_RENAME,		{ .vop_rename = sdev_rename },
1579 	VOPNAME_REMOVE,		{ .vop_remove = sdev_remove },
1580 	VOPNAME_MKDIR,		{ .vop_mkdir = sdev_mkdir },
1581 	VOPNAME_RMDIR,		{ .vop_rmdir = sdev_rmdir },
1582 	VOPNAME_READDIR,	{ .vop_readdir = sdev_readdir },
1583 	VOPNAME_SYMLINK,	{ .vop_symlink = sdev_symlink },
1584 	VOPNAME_READLINK,	{ .vop_readlink = sdev_readlink },
1585 	VOPNAME_INACTIVE,	{ .vop_inactive = sdev_inactive },
1586 	VOPNAME_FID,		{ .vop_fid = sdev_fid },
1587 	VOPNAME_RWLOCK,		{ .vop_rwlock = sdev_rwlock },
1588 	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = sdev_rwunlock },
1589 	VOPNAME_SEEK,		{ .vop_seek = sdev_seek },
1590 	VOPNAME_FRLOCK,		{ .vop_frlock = sdev_frlock },
1591 	VOPNAME_PATHCONF,	{ .vop_pathconf = sdev_pathconf },
1592 	VOPNAME_SETSECATTR,	{ .vop_setsecattr = sdev_setsecattr },
1593 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = sdev_getsecattr },
1594 	NULL,			NULL
1595 };
1596 
/* size of sdev_vnodeops_tbl in bytes (sizeof), terminating entry included */
1597 int sdev_vnodeops_tbl_size = sizeof (sdev_vnodeops_tbl);
1598