xref: /illumos-gate/usr/src/uts/common/fs/devfs/devfs_vnops.c (revision 581cede61ac9c14d8d4ea452562a567189eead78)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * vnode ops for the devfs
30  *
31  * For leaf vnode special files (VCHR|VBLK) specfs will always see the VOP
32  * first because dv_find always performs leaf vnode substitution, returning
33  * a specfs vnode with an s_realvp pointing to the devfs leaf vnode. This
34  * means that the only leaf special file VOP operations that devfs will see
35  * after VOP_LOOKUP are the ones that specfs forwards.
36  */
37 
38 #include <sys/types.h>
39 #include <sys/param.h>
40 #include <sys/t_lock.h>
41 #include <sys/systm.h>
42 #include <sys/sysmacros.h>
43 #include <sys/user.h>
44 #include <sys/time.h>
45 #include <sys/vfs.h>
46 #include <sys/vnode.h>
47 #include <sys/vfs_opreg.h>
48 #include <sys/file.h>
49 #include <sys/fcntl.h>
50 #include <sys/flock.h>
51 #include <sys/kmem.h>
52 #include <sys/uio.h>
53 #include <sys/errno.h>
54 #include <sys/stat.h>
55 #include <sys/cred.h>
56 #include <sys/dirent.h>
57 #include <sys/pathname.h>
58 #include <sys/cmn_err.h>
59 #include <sys/debug.h>
60 #include <sys/policy.h>
61 #include <sys/modctl.h>
62 
63 #include <fs/fs_subr.h>
64 #include <sys/fs/dv_node.h>
65 
66 extern struct vattr	dv_vattr_dir, dv_vattr_file;
67 extern dev_t rconsdev;
68 
69 /*
70  * Open of devices (leaf nodes) is handled by specfs.
71  * There is nothing to do to open a directory
72  */
73 /*ARGSUSED*/
74 static int
75 devfs_open(struct vnode **vpp, int flag, struct cred *cred,
76     caller_context_t *ct)
77 {
78 	struct dv_node	*dv = VTODV(*vpp);
79 
80 	dcmn_err2(("devfs_open %s\n", dv->dv_name));
81 	ASSERT((*vpp)->v_type == VDIR);
82 	return (0);
83 }
84 
85 /*
86  * Close of devices (leaf nodes) is handled by specfs.
87  * There is nothing much to do inorder to close a directory.
88  */
89 /*ARGSUSED1*/
90 static int
91 devfs_close(struct vnode *vp, int flag, int count,
92     offset_t offset, struct cred *cred, caller_context_t *ct)
93 {
94 	struct dv_node	*dv = VTODV(vp);
95 
96 	dcmn_err2(("devfs_close %s\n", dv->dv_name));
97 	ASSERT(vp->v_type == VDIR);
98 
99 	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
100 	cleanshares(vp, ttoproc(curthread)->p_pid);
101 	return (0);
102 }
103 
104 /*
105  * Read of devices (leaf nodes) is handled by specfs.
106  * Read of directories is not supported.
107  */
108 /*ARGSUSED*/
109 static int
110 devfs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
111 	struct caller_context *ct)
112 {
113 	dcmn_err2(("devfs_read %s\n", VTODV(vp)->dv_name));
114 	ASSERT(vp->v_type == VDIR);
115 	ASSERT(RW_READ_HELD(&VTODV(vp)->dv_contents));
116 	return (EISDIR);
117 }
118 
119 /*
120  * Write of devices (leaf nodes) is handled by specfs.
121  * Write of directories is not supported.
122  */
123 /*ARGSUSED*/
124 static int
125 devfs_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred,
126 	struct caller_context *ct)
127 {
128 	dcmn_err2(("devfs_write %s\n", VTODV(vp)->dv_name));
129 	ASSERT(vp->v_type == VDIR);
130 	ASSERT(RW_WRITE_HELD(&VTODV(vp)->dv_contents));
131 	return (EISDIR);
132 }
133 
134 /*
135  * Ioctls to device (leaf nodes) is handled by specfs.
136  * Ioctl to directories is not supported.
137  */
138 /*ARGSUSED*/
139 static int
140 devfs_ioctl(struct vnode *vp, int cmd, intptr_t arg, int flag,
141     struct cred *cred, int *rvalp, caller_context_t *ct)
142 {
143 	dcmn_err2(("devfs_ioctl %s\n", VTODV(vp)->dv_name));
144 	ASSERT(vp->v_type == VDIR);
145 
146 	return (ENOTTY);	/* no ioctls supported */
147 }
148 
149 /*
150  * We can be asked directly about the attributes of directories, or
151  * (via sp->s_realvp) about the filesystem attributes of special files.
152  *
153  * For directories, we just believe the attribute store
154  * though we mangle the nodeid, fsid, and rdev to convince userland we
155  * really are a different filesystem.
156  *
157  * For special files, a little more fakery is required.
158  *
159  * If the attribute store is not there (read only root), we believe our
160  * memory based attributes.
161  */
162 static int
163 devfs_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cr,
164     caller_context_t *ct)
165 {
166 	struct dv_node	*dv = VTODV(vp);
167 	int		error = 0;
168 	uint_t		mask;
169 
170 	/*
171 	 * Message goes to console only. Otherwise, the message
172 	 * causes devfs_getattr to be invoked again... infinite loop
173 	 */
174 	dcmn_err2(("?devfs_getattr %s\n", dv->dv_name));
175 	ASSERT(dv->dv_attr || dv->dv_attrvp);
176 
177 	if (!(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK)) {
178 		cmn_err(CE_WARN,	/* panic ? */
179 		    "?%s: getattr on vnode type %d", dvnm, vp->v_type);
180 		return (ENOENT);
181 	}
182 
183 	rw_enter(&dv->dv_contents, RW_READER);
184 	if (dv->dv_attr) {
185 		/*
186 		 * obtain from the memory version of attribute.
187 		 * preserve mask for those that optimize.
188 		 * devfs specific fields are already merged on creation.
189 		 */
190 		mask = vap->va_mask;
191 		*vap = *dv->dv_attr;
192 		vap->va_mask = mask;
193 	} else {
194 		/* obtain from attribute store and merge */
195 		error = VOP_GETATTR(dv->dv_attrvp, vap, flags, cr, ct);
196 		dsysdebug(error, ("vop_getattr %s %d\n", dv->dv_name, error));
197 		dv_vattr_merge(dv, vap);
198 	}
199 	rw_exit(&dv->dv_contents);
200 
201 	/*
202 	 * Restrict the permissions of the node fronting the console
203 	 * to 0600 with root as the owner.  This prevents a non-root
204 	 * user from gaining access to a serial terminal (like /dev/term/a)
205 	 * which is in reality serving as the console device (/dev/console).
206 	 */
207 	if (vp->v_rdev == rconsdev) {
208 		mode_t	rconsmask = S_IXUSR|S_IRWXG|S_IRWXO;
209 		vap->va_mode &= (~rconsmask);
210 		vap->va_uid = 0;
211 	}
212 
213 	return (error);
214 }
215 
/*
 * Forward declaration: devfs_unlocked_access() is passed as the access
 * callback to secpolicy_vnode_setattr() below, ahead of its definition
 * later in this file.
 */
static int devfs_unlocked_access(void *, int, struct cred *);

/*
 * Apply a setattr request to a devfs directory node.
 *
 *	dv	- dv_node of the directory (must not have DV_NO_FSPERM set)
 *	vp	- the directory's vnode (must be VDIR)
 *	vap	- requested attributes; va_mask selects which ones
 *	flags	- passed through to the policy check and to the backing store
 *	cr	- credentials of the caller
 *
 * If the node has memory based attributes (dv_attr), the request is
 * policy checked and applied to that copy.  Otherwise it is forwarded to
 * the backing attribute vnode (dv_attrvp).  Should the backing store
 * answer EROFS, the current attributes are read from it into a newly
 * allocated dv_attr and the request is retried against the memory copy.
 *
 * Returns 0 on success, EINVAL if any AT_NOSET attribute is requested,
 * or the error from the policy check / backing store.  dv_contents is
 * held as writer for the duration of the update.
 */
/*ARGSUSED4*/
static int
devfs_setattr_dir(
	struct dv_node *dv,
	struct vnode *vp,
	struct vattr *vap,
	int flags,
	struct cred *cr)
{
	struct vattr	*map;
	uint_t		mask;
	int		error = 0;
	struct vattr	vattr;

	ASSERT(dv->dv_attr || dv->dv_attrvp);

	ASSERT(vp->v_type == VDIR);
	ASSERT((dv->dv_flags & DV_NO_FSPERM) == 0);

	if (vap->va_mask & AT_NOSET)
		return (EINVAL);

	/* to ensure consistency, single thread setting of attributes */
	rw_enter(&dv->dv_contents, RW_WRITER);

	/*
	 * "again" is re-entered at most once per call: only from the EROFS
	 * fallback below, which installs dv_attr first so that the memory
	 * based branch is taken on the second pass.
	 */
again:	if (dv->dv_attr) {

		/* the lock is already held, hence the unlocked access check */
		error = secpolicy_vnode_setattr(cr, vp, vap,
		    dv->dv_attr, flags, devfs_unlocked_access, dv);

		if (error)
			goto out;

		/*
		 * Apply changes to the memory based attribute. This code
		 * is modeled after the tmpfs implementation of memory
		 * based vnodes
		 */
		map = dv->dv_attr;
		mask = vap->va_mask;

		/* Change file access modes. */
		if (mask & AT_MODE) {
			/* keep the file type bits, replace everything else */
			map->va_mode &= S_IFMT;
			map->va_mode |= vap->va_mode & ~S_IFMT;
		}
		if (mask & AT_UID)
			map->va_uid = vap->va_uid;
		if (mask & AT_GID)
			map->va_gid = vap->va_gid;
		if (mask & AT_ATIME)
			map->va_atime = vap->va_atime;
		if (mask & AT_MTIME)
			map->va_mtime = vap->va_mtime;

		/* any change other than atime alone also stamps ctime */
		if (mask & (AT_MODE | AT_UID | AT_GID | AT_MTIME))
			gethrestime(&map->va_ctime);
	} else {
		/* use the backing attribute store */
		ASSERT(dv->dv_attrvp);

		/*
		 * See if we are changing something we care about
		 * the persistence of - return success if we don't care.
		 */
		if (vap->va_mask & (AT_MODE|AT_UID|AT_GID|AT_ATIME|AT_MTIME)) {
			/* Set the attributes */
			error = VOP_SETATTR(dv->dv_attrvp,
			    vap, flags, cr, NULL);
			dsysdebug(error,
			    ("vop_setattr %s %d\n", dv->dv_name, error));

			/*
			 * Some file systems may return EROFS for a setattr
			 * on a readonly file system.  In this case we create
			 * our own memory based attribute.
			 */
			if (error == EROFS) {
				/*
				 * obtain attributes from existing file
				 * that we will modify and switch to memory
				 * based attribute until attribute store is
				 * read/write.
				 *
				 * If this VOP_GETATTR fails, no memory copy
				 * is created and the EROFS from VOP_SETATTR
				 * is what the caller sees.
				 */
				vattr = dv_vattr_dir;
				if (VOP_GETATTR(dv->dv_attrvp,
				    &vattr, flags, cr, NULL) == 0) {
					dv->dv_attr = kmem_alloc(
					    sizeof (struct vattr), KM_SLEEP);
					*dv->dv_attr = vattr;
					dv_vattr_merge(dv, dv->dv_attr);
					goto again;
				}
			}
		}
	}
out:
	rw_exit(&dv->dv_contents);
	return (error);
}
318 
319 
320 /*
321  * Compare the uid/gid/mode changes requested for a setattr
322  * operation with the same details of a node's default minor
323  * perm information.  Return 0 if identical.
324  */
325 static int
326 dv_setattr_cmp(struct vattr *map, mperm_t *mp)
327 {
328 	if ((map->va_mode & S_IAMB) != (mp->mp_mode & S_IAMB))
329 		return (1);
330 	if (map->va_uid != mp->mp_uid)
331 		return (1);
332 	if (map->va_gid != mp->mp_gid)
333 		return (1);
334 	return (0);
335 }
336 
337 
338 /*ARGSUSED4*/
339 static int
340 devfs_setattr(
341 	struct vnode *vp,
342 	struct vattr *vap,
343 	int flags,
344 	struct cred *cr,
345 	caller_context_t *ct)
346 {
347 	struct dv_node	*dv = VTODV(vp);
348 	struct dv_node	*ddv;
349 	struct vnode	*dvp;
350 	struct vattr	*map;
351 	uint_t		mask;
352 	int		error = 0;
353 	struct vattr	*free_vattr = NULL;
354 	struct vattr	*vattrp = NULL;
355 	mperm_t		mp;
356 	int		persist;
357 
358 	/*
359 	 * Message goes to console only. Otherwise, the message
360 	 * causes devfs_getattr to be invoked again... infinite loop
361 	 */
362 	dcmn_err2(("?devfs_setattr %s\n", dv->dv_name));
363 	ASSERT(dv->dv_attr || dv->dv_attrvp);
364 
365 	if (!(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK)) {
366 		cmn_err(CE_WARN,	/* panic ? */
367 		    "?%s: getattr on vnode type %d", dvnm, vp->v_type);
368 		return (ENOENT);
369 	}
370 
371 	if (vap->va_mask & AT_NOSET)
372 		return (EINVAL);
373 
374 	/*
375 	 * If we are changing something we don't care about
376 	 * the persistence of, return success.
377 	 */
378 	if ((vap->va_mask &
379 	    (AT_MODE|AT_UID|AT_GID|AT_ATIME|AT_MTIME)) == 0)
380 		return (0);
381 
382 	/*
383 	 * If driver overrides fs perm, disallow chmod
384 	 * and do not create attribute nodes.
385 	 */
386 	if (dv->dv_flags & DV_NO_FSPERM) {
387 		ASSERT(dv->dv_attr);
388 		if (vap->va_mask & (AT_MODE | AT_UID | AT_GID))
389 			return (EPERM);
390 		if ((vap->va_mask & (AT_ATIME|AT_MTIME)) == 0)
391 			return (0);
392 		rw_enter(&dv->dv_contents, RW_WRITER);
393 		if (vap->va_mask & AT_ATIME)
394 			dv->dv_attr->va_atime = vap->va_atime;
395 		if (vap->va_mask & AT_MTIME)
396 			dv->dv_attr->va_mtime = vap->va_mtime;
397 		rw_exit(&dv->dv_contents);
398 		return (0);
399 	}
400 
401 	/*
402 	 * Directories are always created but device nodes are
403 	 * only used to persist non-default permissions.
404 	 */
405 	if (vp->v_type == VDIR) {
406 		ASSERT(dv->dv_attr || dv->dv_attrvp);
407 		return (devfs_setattr_dir(dv, vp, vap, flags, cr));
408 	}
409 
410 	/*
411 	 * Allocate now before we take any locks
412 	 */
413 	vattrp = kmem_zalloc(sizeof (*vattrp), KM_SLEEP);
414 
415 	/* to ensure consistency, single thread setting of attributes */
416 	rw_enter(&dv->dv_contents, RW_WRITER);
417 
418 	/*
419 	 * We don't need to create an attribute node
420 	 * to persist access or modification times.
421 	 */
422 	persist = (vap->va_mask & (AT_MODE | AT_UID | AT_GID));
423 
424 	/*
425 	 * If persisting something, get the default permissions
426 	 * for this minor to compare against what the attributes
427 	 * are now being set to.  Default ordering is:
428 	 *	- minor_perm match for this minor
429 	 *	- mode supplied by ddi_create_priv_minor_node
430 	 *	- devfs defaults
431 	 */
432 	if (persist) {
433 		if (dev_minorperm(dv->dv_devi, dv->dv_name, &mp) != 0) {
434 			mp.mp_uid = dv_vattr_file.va_uid;
435 			mp.mp_gid = dv_vattr_file.va_gid;
436 			mp.mp_mode = dv_vattr_file.va_mode;
437 			if (dv->dv_flags & DV_DFLT_MODE) {
438 				ASSERT((dv->dv_dflt_mode & ~S_IAMB) == 0);
439 				mp.mp_mode &= ~S_IAMB;
440 				mp.mp_mode |= dv->dv_dflt_mode;
441 				dcmn_err5(("%s: setattr priv default 0%o\n",
442 				    dv->dv_name, mp.mp_mode));
443 			} else {
444 				dcmn_err5(("%s: setattr devfs default 0%o\n",
445 				    dv->dv_name, mp.mp_mode));
446 			}
447 		} else {
448 			dcmn_err5(("%s: setattr minor perm default 0%o\n",
449 			    dv->dv_name, mp.mp_mode));
450 		}
451 	}
452 
453 	/*
454 	 * If we don't have a vattr for this node, construct one.
455 	 */
456 	if (dv->dv_attr) {
457 		free_vattr = vattrp;
458 		vattrp = NULL;
459 	} else {
460 		ASSERT(dv->dv_attrvp);
461 		ASSERT(vp->v_type != VDIR);
462 		*vattrp = dv_vattr_file;
463 		error = VOP_GETATTR(dv->dv_attrvp, vattrp, 0, cr, ct);
464 		dsysdebug(error, ("vop_getattr %s %d\n", dv->dv_name, error));
465 		if (error)
466 			goto out;
467 		dv->dv_attr = vattrp;
468 		dv_vattr_merge(dv, dv->dv_attr);
469 		vattrp = NULL;
470 	}
471 
472 	error = secpolicy_vnode_setattr(cr, vp, vap, dv->dv_attr,
473 	    flags, devfs_unlocked_access, dv);
474 	if (error) {
475 		dsysdebug(error, ("devfs_setattr %s secpolicy error %d\n",
476 		    dv->dv_name, error));
477 		goto out;
478 	}
479 
480 	/*
481 	 * Apply changes to the memory based attribute. This code
482 	 * is modeled after the tmpfs implementation of memory
483 	 * based vnodes
484 	 */
485 	map = dv->dv_attr;
486 	mask = vap->va_mask;
487 
488 	/* Change file access modes. */
489 	if (mask & AT_MODE) {
490 		map->va_mode &= S_IFMT;
491 		map->va_mode |= vap->va_mode & ~S_IFMT;
492 	}
493 	if (mask & AT_UID)
494 		map->va_uid = vap->va_uid;
495 	if (mask & AT_GID)
496 		map->va_gid = vap->va_gid;
497 	if (mask & AT_ATIME)
498 		map->va_atime = vap->va_atime;
499 	if (mask & AT_MTIME)
500 		map->va_mtime = vap->va_mtime;
501 
502 	if (mask & (AT_MODE | AT_UID | AT_GID | AT_MTIME)) {
503 		gethrestime(&map->va_ctime);
504 	}
505 
506 	/*
507 	 * A setattr to defaults means we no longer need the
508 	 * shadow node as a persistent store, unless there
509 	 * are ACLs.  Otherwise create a shadow node if one
510 	 * doesn't exist yet.
511 	 */
512 	if (persist) {
513 		if ((dv_setattr_cmp(map, &mp) == 0) &&
514 		    ((dv->dv_flags & DV_ACL) == 0)) {
515 
516 			if (dv->dv_attrvp) {
517 				ddv = dv->dv_dotdot;
518 				ASSERT(ddv->dv_attrvp);
519 				error = VOP_REMOVE(ddv->dv_attrvp,
520 				    dv->dv_name, cr, ct, 0);
521 				dsysdebug(error,
522 				    ("vop_remove %s %s %d\n",
523 				    ddv->dv_name, dv->dv_name, error));
524 
525 				if (error == EROFS)
526 					error = 0;
527 				VN_RELE(dv->dv_attrvp);
528 				dv->dv_attrvp = NULL;
529 			}
530 			ASSERT(dv->dv_attr);
531 		} else {
532 			if (mask & AT_MODE)
533 				dcmn_err5(("%s persisting mode 0%o\n",
534 				    dv->dv_name, vap->va_mode));
535 			if (mask & AT_UID)
536 				dcmn_err5(("%s persisting uid %d\n",
537 				    dv->dv_name, vap->va_uid));
538 			if (mask & AT_GID)
539 				dcmn_err5(("%s persisting gid %d\n",
540 				    dv->dv_name, vap->va_gid));
541 
542 			if (dv->dv_attrvp == NULL) {
543 				dvp = DVTOV(dv->dv_dotdot);
544 				dv_shadow_node(dvp, dv->dv_name, vp,
545 				    NULL, NULLVP, cr,
546 				    DV_SHADOW_CREATE | DV_SHADOW_WRITE_HELD);
547 			}
548 			if (dv->dv_attrvp) {
549 				/* If map still valid do TIME for free. */
550 				if (dv->dv_attr == map) {
551 					mask = map->va_mask;
552 					map->va_mask =
553 					    vap->va_mask | AT_ATIME | AT_MTIME;
554 					error = VOP_SETATTR(dv->dv_attrvp, map,
555 					    flags, cr, NULL);
556 					map->va_mask = mask;
557 				} else {
558 					error = VOP_SETATTR(dv->dv_attrvp,
559 					    vap, flags, cr, NULL);
560 				}
561 				dsysdebug(error, ("vop_setattr %s %d\n",
562 				    dv->dv_name, error));
563 			}
564 			/*
565 			 * Some file systems may return EROFS for a setattr
566 			 * on a readonly file system.  In this case save
567 			 * as our own memory based attribute.
568 			 * NOTE: ufs is NOT one of these (see ufs_iupdat).
569 			 */
570 			if (dv->dv_attr && dv->dv_attrvp && error == 0) {
571 				vattrp = dv->dv_attr;
572 				dv->dv_attr = NULL;
573 			} else if (error == EROFS)
574 				error = 0;
575 		}
576 	}
577 
578 out:
579 	rw_exit(&dv->dv_contents);
580 
581 	if (vattrp)
582 		kmem_free(vattrp, sizeof (*vattrp));
583 	if (free_vattr)
584 		kmem_free(free_vattr, sizeof (*free_vattr));
585 	return (error);
586 }
587 
588 static int
589 devfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
590     caller_context_t *ct)
591 {
592 	switch (cmd) {
593 	case _PC_ACL_ENABLED:
594 		/*
595 		 * We rely on the underlying filesystem for ACLs,
596 		 * so direct the query for ACL support there.
597 		 * ACL support isn't relative to the file
598 		 * and we can't guarantee that the dv node
599 		 * has an attribute node, so any valid
600 		 * attribute node will suffice.
601 		 */
602 		ASSERT(dvroot);
603 		ASSERT(dvroot->dv_attrvp);
604 		return (VOP_PATHCONF(dvroot->dv_attrvp, cmd, valp, cr, ct));
605 		/*NOTREACHED*/
606 	}
607 
608 	return (fs_pathconf(vp, cmd, valp, cr, ct));
609 }
610 
611 /*
612  * Let avp handle security attributes (acl's).
613  */
614 static int
615 devfs_getsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
616     struct cred *cr, caller_context_t *ct)
617 {
618 	dvnode_t *dv = VTODV(vp);
619 	struct vnode *avp;
620 	int	error;
621 
622 	dcmn_err2(("devfs_getsecattr %s\n", dv->dv_name));
623 	ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
624 
625 	rw_enter(&dv->dv_contents, RW_READER);
626 
627 	avp = dv->dv_attrvp;
628 
629 	/* fabricate the acl */
630 	if (avp == NULL) {
631 		error = fs_fab_acl(vp, vsap, flags, cr, ct);
632 		rw_exit(&dv->dv_contents);
633 		return (error);
634 	}
635 
636 	error = VOP_GETSECATTR(avp, vsap, flags, cr, ct);
637 	dsysdebug(error, ("vop_getsecattr %s %d\n", VTODV(vp)->dv_name, error));
638 	rw_exit(&dv->dv_contents);
639 	return (error);
640 }
641 
642 /*
643  * Set security attributes (acl's)
644  *
645  * Note that the dv_contents lock has already been acquired
646  * by the caller's VOP_RWLOCK.
647  */
648 static int
649 devfs_setsecattr(struct vnode *vp, struct vsecattr *vsap, int flags,
650     struct cred *cr, caller_context_t *ct)
651 {
652 	dvnode_t *dv = VTODV(vp);
653 	struct vnode *avp;
654 	int	error;
655 
656 	dcmn_err2(("devfs_setsecattr %s\n", dv->dv_name));
657 	ASSERT(vp->v_type == VDIR || vp->v_type == VCHR || vp->v_type == VBLK);
658 	ASSERT(RW_LOCK_HELD(&dv->dv_contents));
659 
660 	/*
661 	 * Not a supported operation on drivers not providing
662 	 * file system based permissions.
663 	 */
664 	if (dv->dv_flags & DV_NO_FSPERM)
665 		return (ENOTSUP);
666 
667 	/*
668 	 * To complete, the setsecattr requires an underlying attribute node.
669 	 */
670 	if (dv->dv_attrvp == NULL) {
671 		ASSERT(vp->v_type == VCHR || vp->v_type == VBLK);
672 		dv_shadow_node(DVTOV(dv->dv_dotdot), dv->dv_name, vp,
673 		    NULL, NULLVP, cr, DV_SHADOW_CREATE | DV_SHADOW_WRITE_HELD);
674 	}
675 
676 	if ((avp = dv->dv_attrvp) == NULL) {
677 		dcmn_err2(("devfs_setsecattr %s: "
678 		    "cannot construct attribute node\n", dv->dv_name));
679 		return (fs_nosys());
680 	}
681 
682 	/*
683 	 * The acl(2) system call issues a VOP_RWLOCK before setting an ACL.
684 	 * Since backing file systems expect the lock to be held before seeing
685 	 * a VOP_SETSECATTR ACL, we need to issue the VOP_RWLOCK to the backing
686 	 * store before forwarding the ACL.
687 	 */
688 	(void) VOP_RWLOCK(avp, V_WRITELOCK_TRUE, NULL);
689 	error = VOP_SETSECATTR(avp, vsap, flags, cr, ct);
690 	dsysdebug(error, ("vop_setsecattr %s %d\n", VTODV(vp)->dv_name, error));
691 	VOP_RWUNLOCK(avp, V_WRITELOCK_TRUE, NULL);
692 
693 	/*
694 	 * Set DV_ACL if we have a non-trivial set of ACLs.  It is not
695 	 * necessary to hold VOP_RWLOCK since fs_acl_nontrivial only does
696 	 * VOP_GETSECATTR calls.
697 	 */
698 	if (fs_acl_nontrivial(avp, cr))
699 		dv->dv_flags |= DV_ACL;
700 	return (error);
701 }
702 
703 /*
704  * This function is used for secpolicy_setattr().  It must call an
705  * access() like function while it is already holding the
706  * dv_contents lock.  We only care about this when dv_attr != NULL;
707  * so the unlocked access call only concerns itself with that
708  * particular branch of devfs_access().
709  */
710 static int
711 devfs_unlocked_access(void *vdv, int mode, struct cred *cr)
712 {
713 	struct dv_node *dv = vdv;
714 	int shift = 0;
715 	uid_t owner = dv->dv_attr->va_uid;
716 
717 	/* Check access based on owner, group and public permissions. */
718 	if (crgetuid(cr) != owner) {
719 		shift += 3;
720 		if (groupmember(dv->dv_attr->va_gid, cr) == 0)
721 			shift += 3;
722 	}
723 
724 	/* compute missing mode bits */
725 	mode &= ~(dv->dv_attr->va_mode << shift);
726 
727 	if (mode == 0)
728 		return (0);
729 
730 	return (secpolicy_vnode_access(cr, DVTOV(dv), owner, mode));
731 }
732 
733 static int
734 devfs_access(struct vnode *vp, int mode, int flags, struct cred *cr,
735     caller_context_t *ct)
736 {
737 	struct dv_node	*dv = VTODV(vp);
738 	int		res;
739 
740 	dcmn_err2(("devfs_access %s\n", dv->dv_name));
741 	ASSERT(dv->dv_attr || dv->dv_attrvp);
742 
743 	/* restrict console access to privileged processes */
744 	if ((vp->v_rdev == rconsdev) && secpolicy_console(cr) != 0) {
745 		return (EACCES);
746 	}
747 
748 	rw_enter(&dv->dv_contents, RW_READER);
749 	if (dv->dv_attr && ((dv->dv_flags & DV_ACL) == 0)) {
750 		res = devfs_unlocked_access(dv, mode, cr);
751 	} else {
752 		res = VOP_ACCESS(dv->dv_attrvp, mode, flags, cr, ct);
753 	}
754 	rw_exit(&dv->dv_contents);
755 	return (res);
756 }
757 
758 /*
759  * Lookup
760  *
761  * Given the directory vnode and the name of the component, return
762  * the corresponding held vnode for that component.
763  *
764  * Of course in these fictional filesystems, nothing's ever quite
765  * -that- simple.
766  *
767  * devfs name	type		shadow (fs attributes)	type	comments
768  * -------------------------------------------------------------------------
769  * drv[@addr]	VDIR		drv[@addr]		VDIR	nexus driver
770  * drv[@addr]:m	VCHR/VBLK	drv[@addr]:m		VREG	leaf driver
771  * drv[@addr]	VCHR/VBLK	drv[@addr]:.default	VREG	leaf driver
772  * -------------------------------------------------------------------------
773  *
774  * The following names are reserved for the attribute filesystem (which
775  * could easily be another layer on top of this one - we simply need to
776  * hold the vnode of the thing we're looking at)
777  *
778  * attr name	type		shadow (fs attributes)	type	comments
779  * -------------------------------------------------------------------------
780  * drv[@addr]	VDIR		-			-	attribute dir
781  * minorname	VDIR		-			-	minorname
782  * attribute	VREG		-			-	attribute
783  * -------------------------------------------------------------------------
784  *
785  * Examples:
786  *
787  *	devfs:/devices/.../mm@0:zero		VCHR
788  *	shadow:/.devices/.../mm@0:zero		VREG, fs attrs
789  *	devfs:/devices/.../mm@0:/zero/attr	VREG, driver attribute
790  *
791  *	devfs:/devices/.../sd@0,0:a		VBLK
792  *	shadow:/.devices/.../sd@0,0:a		VREG, fs attrs
793  *	devfs:/devices/.../sd@0,0:/a/.type	VREG, "ddi_block:chan"
794  *
795  *	devfs:/devices/.../mm@0			VCHR
796  *	shadow:/.devices/.../mm@0:.default	VREG, fs attrs
797  *	devfs:/devices/.../mm@0:/.default/attr	VREG, driver attribute
798  *	devfs:/devices/.../mm@0:/.default/.type	VREG, "ddi_pseudo"
799  *
800  *	devfs:/devices/.../obio			VDIR
801  *	shadow:/devices/.../obio		VDIR, needed for fs attrs.
802  *	devfs:/devices/.../obio:/.default/attr	VDIR, driver attribute
803  *
 * We also need to be able to deal with "old" devices that have gone away,
805  * though I think that provided we return them with readdir, they can
806  * be removed (i.e. they don't have to respond to lookup, though it might
807  * be weird if they didn't ;-)
808  *
809  * Lookup has side-effects.
810  *
811  * - It will create directories and fs attribute files in the shadow hierarchy.
812  * - It should cause non-SID devices to be probed (ask the parent nexi).
813  */
814 /*ARGSUSED3*/
815 static int
816 devfs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
817     struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
818     caller_context_t *ct, int *direntflags, pathname_t *realpnp)
819 {
820 	ASSERT(dvp->v_type == VDIR);
821 	dcmn_err2(("devfs_lookup: %s\n", nm));
822 	return (dv_find(VTODV(dvp), nm, vpp, pnp, rdir, cred, 0));
823 }
824 
825 /*
826  * devfs nodes can't really be created directly by userland - however,
827  * we do allow creates to find existing nodes:
828  *
829  * - any create fails if the node doesn't exist - EROFS.
830  * - creating an existing directory read-only succeeds, otherwise EISDIR.
831  * - exclusive creates fail if the node already exists - EEXIST.
832  * - failure to create the snode for an existing device - ENOSYS.
833  */
834 /*ARGSUSED2*/
835 static int
836 devfs_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
837     int mode, struct vnode **vpp, struct cred *cred, int flag,
838     caller_context_t *ct, vsecattr_t *vsecp)
839 {
840 	int error;
841 	struct vnode *vp;
842 
843 	dcmn_err2(("devfs_create %s\n", nm));
844 	error = dv_find(VTODV(dvp), nm, &vp, NULL, NULLVP, cred, 0);
845 	if (error == 0) {
846 		if (excl == EXCL)
847 			error = EEXIST;
848 		else if (vp->v_type == VDIR && (mode & VWRITE))
849 			error = EISDIR;
850 		else
851 			error = VOP_ACCESS(vp, mode, 0, cred, ct);
852 
853 		if (error) {
854 			VN_RELE(vp);
855 		} else
856 			*vpp = vp;
857 	} else if (error == ENOENT)
858 		error = EROFS;
859 
860 	return (error);
861 }
862 
/*
 * VOP_READDIR entry point.
 *
 * If DV_BUILD is set, we call into nexus driver to do a BUS_CONFIG_ALL.
 * Otherwise, simply return cached dv_node's. Hotplug code always calls
 * devfs_clean() to invalidate the dv_node cache.
 *
 *	dvp	- directory vnode; dv_contents must be held as reader
 *	uiop	- destination; uio_loffset is an entry index, not a byte
 *		  offset: 0 is ".", 1 is "..", 2 and up are the children
 *		  in DV_FIRST_ENTRY/DV_NEXT_ENTRY order.  Exactly one
 *		  iovec is supported.
 *	cred	- caller credentials; DV_INTERNAL entries are only
 *		  returned when cred is kcred
 *	eofp	- if non-NULL, set to 1 when the listing is exhausted
 *
 * Returns 0 on success, EINVAL if more than one iovec is supplied or if
 * not even one pending entry fits in the buffer, ENOTDIR if dvp is not
 * a directory, or the error from uiomove().
 *
 * dv_contents is held as reader on return, but may be dropped and
 * re-taken along the way (see below).
 */
/*ARGSUSED5*/
static int
devfs_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, int *eofp,
    caller_context_t *ct, int flags)
{
	struct dv_node *ddv, *dv;
	struct dirent64 *de, *bufp;
	offset_t diroff;
	offset_t	soff;
	size_t reclen, movesz;
	int error;
	struct vattr va;
	size_t bufsz;

	ddv = VTODV(dvp);
	dcmn_err2(("devfs_readdir %s: offset %lld len %ld\n",
	    ddv->dv_name, uiop->uio_loffset, uiop->uio_iov->iov_len));
	ASSERT(ddv->dv_attr || ddv->dv_attrvp);
	ASSERT(RW_READ_HELD(&ddv->dv_contents));

	/* an offset at or past MAXOFF_T is treated as end of directory */
	if (uiop->uio_loffset >= MAXOFF_T) {
		if (eofp)
			*eofp = 1;
		return (0);
	}

	if (uiop->uio_iovcnt != 1)
		return (EINVAL);

	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	/* Load the initial contents */
	if (ddv->dv_flags & DV_BUILD) {
		/*
		 * Filling needs the writer lock.  If the in-place upgrade
		 * fails the lock is dropped and re-taken as writer, so
		 * DV_BUILD has to be rechecked afterwards.
		 */
		if (!rw_tryupgrade(&ddv->dv_contents)) {
			rw_exit(&ddv->dv_contents);
			rw_enter(&ddv->dv_contents, RW_WRITER);
		}

		/* recheck and fill */
		if (ddv->dv_flags & DV_BUILD)
			dv_filldir(ddv);

		rw_downgrade(&ddv->dv_contents);
	}

	soff = uiop->uio_loffset;
	bufsz = uiop->uio_iov->iov_len;
	de = bufp = kmem_alloc(bufsz, KM_SLEEP);
	movesz = 0;
	/*
	 * Non-NULL sentinel: if we jump to "full" while emitting "." or
	 * "..", dv must still read as "entries remain".
	 */
	dv = (struct dv_node *)-1;

	/*
	 * Move as many entries into the uio structure as it will take.
	 * Special case "." and "..".
	 */
	diroff = 0;
	if (soff == 0) {				/* . */
		reclen = DIRENT64_RECLEN(strlen("."));
		if ((movesz + reclen) > bufsz)
			goto full;
		de->d_ino = (ino64_t)ddv->dv_ino;
		de->d_off = (off64_t)diroff + 1;
		de->d_reclen = (ushort_t)reclen;

		/* use strncpy(9f) to zero out uninitialized bytes */

		(void) strncpy(de->d_name, ".", DIRENT64_NAMELEN(reclen));
		movesz += reclen;
		de = (dirent64_t *)(intptr_t)((char *)de + reclen);
		dcmn_err3(("devfs_readdir: A: diroff %lld, soff %lld: '%s' "
		    "reclen %lu\n", diroff, soff, ".", reclen));
	}

	diroff++;
	if (soff <= 1) {				/* .. */
		reclen = DIRENT64_RECLEN(strlen(".."));
		if ((movesz + reclen) > bufsz)
			goto full;
		de->d_ino = (ino64_t)ddv->dv_dotdot->dv_ino;
		de->d_off = (off64_t)diroff + 1;
		de->d_reclen = (ushort_t)reclen;

		/* use strncpy(9f) to zero out uninitialized bytes */

		(void) strncpy(de->d_name, "..", DIRENT64_NAMELEN(reclen));
		movesz += reclen;
		de = (dirent64_t *)(intptr_t)((char *)de + reclen);
		dcmn_err3(("devfs_readdir: B: diroff %lld, soff %lld: '%s' "
		    "reclen %lu\n", diroff, soff, "..", reclen));
	}

	diroff++;
	for (dv = DV_FIRST_ENTRY(ddv); dv;
	    dv = DV_NEXT_ENTRY(ddv, dv), diroff++) {
		/*
		 * although DDM_INTERNAL_PATH minor nodes are skipped for
		 * readdirs outside the kernel, they still occupy directory
		 * offsets
		 */
		if (diroff < soff ||
		    ((dv->dv_flags & DV_INTERNAL) && (cred != kcred)))
			continue;

		reclen = DIRENT64_RECLEN(strlen(dv->dv_name));
		if ((movesz + reclen) > bufsz) {
			dcmn_err3(("devfs_readdir: C: diroff "
			    "%lld, soff %lld: '%s' reclen %lu\n",
			    diroff, soff, dv->dv_name, reclen));
			goto full;
		}
		de->d_ino = (ino64_t)dv->dv_ino;
		de->d_off = (off64_t)diroff + 1;
		de->d_reclen = (ushort_t)reclen;

		/* use strncpy(9f) to zero out uninitialized bytes */

		ASSERT(strlen(dv->dv_name) + 1 <=
		    DIRENT64_NAMELEN(reclen));
		(void) strncpy(de->d_name, dv->dv_name,
		    DIRENT64_NAMELEN(reclen));

		movesz += reclen;
		de = (dirent64_t *)(intptr_t)((char *)de + reclen);
		dcmn_err4(("devfs_readdir: D: diroff "
		    "%lld, soff %lld: '%s' reclen %lu\n", diroff, soff,
		    dv->dv_name, reclen));
	}

	/*
	 * the buffer is full, or we exhausted everything
	 *
	 * At this point diroff is the index of the first entry that was
	 * not emitted, and dv is NULL only if the child list was walked
	 * to its end.
	 */
full:	dcmn_err3(("devfs_readdir: moving %lu bytes: "
	    "diroff %lld, soff %lld, dv %p\n",
	    movesz, diroff, soff, (void *)dv));

	if ((movesz == 0) && dv)
		error = EINVAL;		/* cannot be represented */
	else {
		error = uiomove(bufp, movesz, UIO_READ, uiop);
		if (error == 0) {
			if (eofp)
				*eofp = dv ? 0 : 1;
			/* the next readdir resumes at this entry index */
			uiop->uio_loffset = diroff;
		}

		/*
		 * Update the directory's access time.  devfs_setattr()
		 * takes dv_contents as writer itself, so the reader lock
		 * is dropped around the call and re-taken afterwards.
		 */
		va.va_mask = AT_ATIME;
		gethrestime(&va.va_atime);
		rw_exit(&ddv->dv_contents);
		(void) devfs_setattr(dvp, &va, 0, cred, ct);
		rw_enter(&ddv->dv_contents, RW_READER);
	}

	kmem_free(bufp, bufsz);
	return (error);
}
1022 
1023 /*ARGSUSED*/
1024 static int
1025 devfs_fsync(struct vnode *vp, int syncflag, struct cred *cred,
1026     caller_context_t *ct)
1027 {
1028 	/*
1029 	 * Message goes to console only. Otherwise, the message
1030 	 * causes devfs_fsync to be invoked again... infinite loop
1031 	 */
1032 	dcmn_err2(("devfs_fsync %s\n", VTODV(vp)->dv_name));
1033 	return (0);
1034 }
1035 
1036 /*
1037  * Normally, we leave the dv_node here at count of 0.
1038  * The node will be destroyed when dv_cleandir() is called.
1039  *
1040  * Stale dv_node's are already unlinked from the fs tree,
1041  * so dv_cleandir() won't find them. We destroy such nodes
1042  * immediately.
1043  */
1044 /*ARGSUSED1*/
1045 static void
1046 devfs_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct)
1047 {
1048 	int destroy;
1049 	struct dv_node *dv = VTODV(vp);
1050 
1051 	dcmn_err2(("devfs_inactive: %s\n", dv->dv_name));
1052 	mutex_enter(&vp->v_lock);
1053 	ASSERT(vp->v_count >= 1);
1054 	--vp->v_count;
1055 	destroy = (DV_STALE(dv) && vp->v_count == 0);
1056 	mutex_exit(&vp->v_lock);
1057 
1058 	/* stale nodes cannot be rediscovered, destroy it here */
1059 	if (destroy)
1060 		dv_destroy(dv, 0);
1061 }
1062 
1063 /*
1064  * XXX Why do we need this?  NFS mounted /dev directories?
1065  * XXX Talk to peter staubach about this.
1066  */
1067 /*ARGSUSED2*/
1068 static int
1069 devfs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
1070 {
1071 	struct dv_node	*dv = VTODV(vp);
1072 	struct dv_fid	*dv_fid;
1073 
1074 	if (fidp->fid_len < (sizeof (struct dv_fid) - sizeof (ushort_t))) {
1075 		fidp->fid_len = sizeof (struct dv_fid) - sizeof (ushort_t);
1076 		return (ENOSPC);
1077 	}
1078 
1079 	dv_fid = (struct dv_fid *)fidp;
1080 	bzero(dv_fid, sizeof (struct dv_fid));
1081 	dv_fid->dvfid_len = (int)sizeof (struct dv_fid) - sizeof (ushort_t);
1082 	dv_fid->dvfid_ino = dv->dv_ino;
1083 	/* dv_fid->dvfid_gen = dv->tn_gen; XXX ? */
1084 
1085 	return (0);
1086 }
1087 
1088 /*
1089  * This pair of routines bracket all VOP_READ, VOP_WRITE
1090  * and VOP_READDIR requests.  The contents lock stops things
1091  * moving around while we're looking at them.
1092  *
1093  * Also used by file and record locking.
1094  */
1095 /*ARGSUSED2*/
1096 static int
1097 devfs_rwlock(struct vnode *vp, int write_flag, caller_context_t *ct)
1098 {
1099 	dcmn_err2(("devfs_rwlock %s\n", VTODV(vp)->dv_name));
1100 	rw_enter(&VTODV(vp)->dv_contents, write_flag ? RW_WRITER : RW_READER);
1101 	return (write_flag);
1102 }
1103 
1104 /*ARGSUSED1*/
1105 static void
1106 devfs_rwunlock(struct vnode *vp, int write_flag, caller_context_t *ct)
1107 {
1108 	dcmn_err2(("devfs_rwunlock %s\n", VTODV(vp)->dv_name));
1109 	rw_exit(&VTODV(vp)->dv_contents);
1110 }
1111 
1112 /*
1113  * XXX	Should probably do a better job of computing the maximum
1114  *	offset available in the directory.
1115  */
1116 /*ARGSUSED1*/
1117 static int
1118 devfs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp,
1119     caller_context_t *ct)
1120 {
1121 	ASSERT(vp->v_type == VDIR);
1122 	dcmn_err2(("devfs_seek %s\n", VTODV(vp)->dv_name));
1123 	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
1124 }
1125 
1126 vnodeops_t *dv_vnodeops;
1127 
1128 const fs_operation_def_t dv_vnodeops_template[] = {
1129 	VOPNAME_OPEN,		{ .vop_open = devfs_open },
1130 	VOPNAME_CLOSE,		{ .vop_close = devfs_close },
1131 	VOPNAME_READ,		{ .vop_read = devfs_read },
1132 	VOPNAME_WRITE,		{ .vop_write = devfs_write },
1133 	VOPNAME_IOCTL,		{ .vop_ioctl = devfs_ioctl },
1134 	VOPNAME_GETATTR,	{ .vop_getattr = devfs_getattr },
1135 	VOPNAME_SETATTR,	{ .vop_setattr = devfs_setattr },
1136 	VOPNAME_ACCESS,		{ .vop_access = devfs_access },
1137 	VOPNAME_LOOKUP,		{ .vop_lookup = devfs_lookup },
1138 	VOPNAME_CREATE,		{ .vop_create = devfs_create },
1139 	VOPNAME_READDIR,	{ .vop_readdir = devfs_readdir },
1140 	VOPNAME_FSYNC,		{ .vop_fsync = devfs_fsync },
1141 	VOPNAME_INACTIVE,	{ .vop_inactive = devfs_inactive },
1142 	VOPNAME_FID,		{ .vop_fid = devfs_fid },
1143 	VOPNAME_RWLOCK,		{ .vop_rwlock = devfs_rwlock },
1144 	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = devfs_rwunlock },
1145 	VOPNAME_SEEK,		{ .vop_seek = devfs_seek },
1146 	VOPNAME_PATHCONF,	{ .vop_pathconf = devfs_pathconf },
1147 	VOPNAME_DISPOSE,	{ .error = fs_error },
1148 	VOPNAME_SETSECATTR,	{ .vop_setsecattr = devfs_setsecattr },
1149 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = devfs_getsecattr },
1150 	NULL,			NULL
1151 };
1152