xref: /illumos-gate/usr/src/uts/common/fs/dev/sdev_subr.c (revision e0731422366620894c16c1ee6515551c5f00733d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * utility routines for the /dev fs
27  */
28 
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/t_lock.h>
32 #include <sys/systm.h>
33 #include <sys/sysmacros.h>
34 #include <sys/user.h>
35 #include <sys/time.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/file.h>
39 #include <sys/fcntl.h>
40 #include <sys/flock.h>
41 #include <sys/kmem.h>
42 #include <sys/uio.h>
43 #include <sys/errno.h>
44 #include <sys/stat.h>
45 #include <sys/cred.h>
46 #include <sys/dirent.h>
47 #include <sys/pathname.h>
48 #include <sys/cmn_err.h>
49 #include <sys/debug.h>
50 #include <sys/mode.h>
51 #include <sys/policy.h>
52 #include <fs/fs_subr.h>
53 #include <sys/mount.h>
54 #include <sys/fs/snode.h>
55 #include <sys/fs/dv_node.h>
56 #include <sys/fs/sdev_impl.h>
57 #include <sys/sunndi.h>
58 #include <sys/sunmdi.h>
59 #include <sys/conf.h>
60 #include <sys/proc.h>
61 #include <sys/user.h>
62 #include <sys/modctl.h>
63 
#if defined(DEBUG)
/* debug tunables, only present in DEBUG builds */
int sdev_debug = 0x00000001;
/* handed to kmem_cache_create() as cache flags by sdev_node_cache_init() */
int sdev_debug_cache_flags = 0;
#endif	/* DEBUG */
68 
69 /*
70  * globals
71  */
72 /* prototype memory vattrs */
73 vattr_t sdev_vattr_dir = {
74 	AT_TYPE|AT_MODE|AT_UID|AT_GID,		/* va_mask */
75 	VDIR,					/* va_type */
76 	SDEV_DIRMODE_DEFAULT,			/* va_mode */
77 	SDEV_UID_DEFAULT,			/* va_uid */
78 	SDEV_GID_DEFAULT,			/* va_gid */
79 	0,					/* va_fsid */
80 	0,					/* va_nodeid */
81 	0,					/* va_nlink */
82 	0,					/* va_size */
83 	0,					/* va_atime */
84 	0,					/* va_mtime */
85 	0,					/* va_ctime */
86 	0,					/* va_rdev */
87 	0,					/* va_blksize */
88 	0,					/* va_nblocks */
89 	0					/* va_vcode */
90 };
91 
92 vattr_t sdev_vattr_lnk = {
93 	AT_TYPE|AT_MODE,			/* va_mask */
94 	VLNK,					/* va_type */
95 	SDEV_LNKMODE_DEFAULT,			/* va_mode */
96 	SDEV_UID_DEFAULT,			/* va_uid */
97 	SDEV_GID_DEFAULT,			/* va_gid */
98 	0,					/* va_fsid */
99 	0,					/* va_nodeid */
100 	0,					/* va_nlink */
101 	0,					/* va_size */
102 	0,					/* va_atime */
103 	0,					/* va_mtime */
104 	0,					/* va_ctime */
105 	0,					/* va_rdev */
106 	0,					/* va_blksize */
107 	0,					/* va_nblocks */
108 	0					/* va_vcode */
109 };
110 
111 vattr_t sdev_vattr_blk = {
112 	AT_TYPE|AT_MODE|AT_UID|AT_GID,		/* va_mask */
113 	VBLK,					/* va_type */
114 	S_IFBLK | SDEV_DEVMODE_DEFAULT,		/* va_mode */
115 	SDEV_UID_DEFAULT,			/* va_uid */
116 	SDEV_GID_DEFAULT,			/* va_gid */
117 	0,					/* va_fsid */
118 	0,					/* va_nodeid */
119 	0,					/* va_nlink */
120 	0,					/* va_size */
121 	0,					/* va_atime */
122 	0,					/* va_mtime */
123 	0,					/* va_ctime */
124 	0,					/* va_rdev */
125 	0,					/* va_blksize */
126 	0,					/* va_nblocks */
127 	0					/* va_vcode */
128 };
129 
130 vattr_t sdev_vattr_chr = {
131 	AT_TYPE|AT_MODE|AT_UID|AT_GID,		/* va_mask */
132 	VCHR,					/* va_type */
133 	S_IFCHR | SDEV_DEVMODE_DEFAULT,		/* va_mode */
134 	SDEV_UID_DEFAULT,			/* va_uid */
135 	SDEV_GID_DEFAULT,			/* va_gid */
136 	0,					/* va_fsid */
137 	0,					/* va_nodeid */
138 	0,					/* va_nlink */
139 	0,					/* va_size */
140 	0,					/* va_atime */
141 	0,					/* va_mtime */
142 	0,					/* va_ctime */
143 	0,					/* va_rdev */
144 	0,					/* va_blksize */
145 	0,					/* va_nblocks */
146 	0					/* va_vcode */
147 };
148 
149 kmem_cache_t	*sdev_node_cache;	/* sdev_node cache */
150 int		devtype;		/* fstype */
151 
152 /* static */
153 static struct vnodeops *sdev_get_vop(struct sdev_node *);
154 static void sdev_set_no_negcache(struct sdev_node *);
155 static fs_operation_def_t *sdev_merge_vtab(const fs_operation_def_t []);
156 static void sdev_free_vtab(fs_operation_def_t *);
157 
158 static void
159 sdev_prof_free(struct sdev_node *dv)
160 {
161 	ASSERT(!SDEV_IS_GLOBAL(dv));
162 	if (dv->sdev_prof.dev_name)
163 		nvlist_free(dv->sdev_prof.dev_name);
164 	if (dv->sdev_prof.dev_map)
165 		nvlist_free(dv->sdev_prof.dev_map);
166 	if (dv->sdev_prof.dev_symlink)
167 		nvlist_free(dv->sdev_prof.dev_symlink);
168 	if (dv->sdev_prof.dev_glob_incdir)
169 		nvlist_free(dv->sdev_prof.dev_glob_incdir);
170 	if (dv->sdev_prof.dev_glob_excdir)
171 		nvlist_free(dv->sdev_prof.dev_glob_excdir);
172 	bzero(&dv->sdev_prof, sizeof (dv->sdev_prof));
173 }
174 
175 /* sdev_node cache constructor */
176 /*ARGSUSED1*/
177 static int
178 i_sdev_node_ctor(void *buf, void *cfarg, int flag)
179 {
180 	struct sdev_node *dv = (struct sdev_node *)buf;
181 	struct vnode *vp;
182 
183 	bzero(buf, sizeof (struct sdev_node));
184 	vp = dv->sdev_vnode = vn_alloc(flag);
185 	if (vp == NULL) {
186 		return (-1);
187 	}
188 	vp->v_data = dv;
189 	rw_init(&dv->sdev_contents, NULL, RW_DEFAULT, NULL);
190 	return (0);
191 }
192 
193 /* sdev_node cache destructor */
194 /*ARGSUSED1*/
195 static void
196 i_sdev_node_dtor(void *buf, void *arg)
197 {
198 	struct sdev_node *dv = (struct sdev_node *)buf;
199 	struct vnode *vp = SDEVTOV(dv);
200 
201 	rw_destroy(&dv->sdev_contents);
202 	vn_free(vp);
203 }
204 
205 /* initialize sdev_node cache */
206 void
207 sdev_node_cache_init()
208 {
209 	int flags = 0;
210 
211 #ifdef	DEBUG
212 	flags = sdev_debug_cache_flags;
213 	if (flags)
214 		sdcmn_err(("cache debug flags 0x%x\n", flags));
215 #endif	/* DEBUG */
216 
217 	ASSERT(sdev_node_cache == NULL);
218 	sdev_node_cache = kmem_cache_create("sdev_node_cache",
219 	    sizeof (struct sdev_node), 0, i_sdev_node_ctor, i_sdev_node_dtor,
220 	    NULL, NULL, NULL, flags);
221 }
222 
223 /* destroy sdev_node cache */
224 void
225 sdev_node_cache_fini()
226 {
227 	ASSERT(sdev_node_cache != NULL);
228 	kmem_cache_destroy(sdev_node_cache);
229 	sdev_node_cache = NULL;
230 }
231 
232 /*
233  * Compare two nodes lexographically to balance avl tree
234  */
235 static int
236 sdev_compare_nodes(const struct sdev_node *dv1, const struct sdev_node *dv2)
237 {
238 	int rv;
239 	if ((rv = strcmp(dv1->sdev_name, dv2->sdev_name)) == 0)
240 		return (0);
241 	return ((rv < 0) ? -1 : 1);
242 }
243 
244 void
245 sdev_set_nodestate(struct sdev_node *dv, sdev_node_state_t state)
246 {
247 	ASSERT(dv);
248 	ASSERT(RW_WRITE_HELD(&dv->sdev_contents));
249 	dv->sdev_state = state;
250 }
251 
252 static void
253 sdev_attr_update(struct sdev_node *dv, vattr_t *vap)
254 {
255 	timestruc_t	now;
256 	struct vattr	*attrp;
257 	uint_t		mask;
258 
259 	ASSERT(dv->sdev_attr);
260 	ASSERT(vap);
261 
262 	attrp = dv->sdev_attr;
263 	mask = vap->va_mask;
264 	if (mask & AT_TYPE)
265 		attrp->va_type = vap->va_type;
266 	if (mask & AT_MODE)
267 		attrp->va_mode = vap->va_mode;
268 	if (mask & AT_UID)
269 		attrp->va_uid = vap->va_uid;
270 	if (mask & AT_GID)
271 		attrp->va_gid = vap->va_gid;
272 	if (mask & AT_RDEV)
273 		attrp->va_rdev = vap->va_rdev;
274 
275 	gethrestime(&now);
276 	attrp->va_atime = (mask & AT_ATIME) ? vap->va_atime : now;
277 	attrp->va_mtime = (mask & AT_MTIME) ? vap->va_mtime : now;
278 	attrp->va_ctime = (mask & AT_CTIME) ? vap->va_ctime : now;
279 }
280 
281 static void
282 sdev_attr_alloc(struct sdev_node *dv, vattr_t *vap)
283 {
284 	ASSERT(dv->sdev_attr == NULL);
285 	ASSERT(vap->va_mask & AT_TYPE);
286 	ASSERT(vap->va_mask & AT_MODE);
287 
288 	dv->sdev_attr = kmem_zalloc(sizeof (struct vattr), KM_SLEEP);
289 	sdev_attr_update(dv, vap);
290 }
291 
292 /* alloc and initialize a sdev_node */
293 int
294 sdev_nodeinit(struct sdev_node *ddv, char *nm, struct sdev_node **newdv,
295     vattr_t *vap)
296 {
297 	struct sdev_node *dv = NULL;
298 	struct vnode *vp;
299 	size_t nmlen, len;
300 	devname_handle_t  *dhl;
301 
302 	nmlen = strlen(nm) + 1;
303 	if (nmlen > MAXNAMELEN) {
304 		sdcmn_err9(("sdev_nodeinit: node name %s"
305 		    " too long\n", nm));
306 		*newdv = NULL;
307 		return (ENAMETOOLONG);
308 	}
309 
310 	dv = kmem_cache_alloc(sdev_node_cache, KM_SLEEP);
311 
312 	dv->sdev_name = kmem_alloc(nmlen, KM_SLEEP);
313 	bcopy(nm, dv->sdev_name, nmlen);
314 	dv->sdev_namelen = nmlen - 1;	/* '\0' not included */
315 	len = strlen(ddv->sdev_path) + strlen(nm) + 2;
316 	dv->sdev_path = kmem_alloc(len, KM_SLEEP);
317 	(void) snprintf(dv->sdev_path, len, "%s/%s", ddv->sdev_path, nm);
318 	/* overwritten for VLNK nodes */
319 	dv->sdev_symlink = NULL;
320 
321 	vp = SDEVTOV(dv);
322 	vn_reinit(vp);
323 	vp->v_vfsp = SDEVTOV(ddv)->v_vfsp;
324 	if (vap)
325 		vp->v_type = vap->va_type;
326 
327 	/*
328 	 * initialized to the parent's vnodeops.
329 	 * maybe overwriten for a VDIR
330 	 */
331 	vn_setops(vp, vn_getops(SDEVTOV(ddv)));
332 	vn_exists(vp);
333 
334 	dv->sdev_dotdot = NULL;
335 	dv->sdev_attrvp = NULL;
336 	if (vap) {
337 		sdev_attr_alloc(dv, vap);
338 	} else {
339 		dv->sdev_attr = NULL;
340 	}
341 
342 	dv->sdev_ino = sdev_mkino(dv);
343 	dv->sdev_nlink = 0;		/* updated on insert */
344 	dv->sdev_flags = ddv->sdev_flags; /* inherit from the parent first */
345 	dv->sdev_flags |= SDEV_BUILD;
346 	mutex_init(&dv->sdev_lookup_lock, NULL, MUTEX_DEFAULT, NULL);
347 	cv_init(&dv->sdev_lookup_cv, NULL, CV_DEFAULT, NULL);
348 	if (SDEV_IS_GLOBAL(ddv)) {
349 		dv->sdev_flags |= SDEV_GLOBAL;
350 		dhl = &(dv->sdev_handle);
351 		dhl->dh_data = dv;
352 		dhl->dh_args = NULL;
353 		sdev_set_no_negcache(dv);
354 		dv->sdev_gdir_gen = 0;
355 	} else {
356 		dv->sdev_flags &= ~SDEV_GLOBAL;
357 		dv->sdev_origin = NULL; /* set later */
358 		bzero(&dv->sdev_prof, sizeof (dv->sdev_prof));
359 		dv->sdev_ldir_gen = 0;
360 		dv->sdev_devtree_gen = 0;
361 	}
362 
363 	rw_enter(&dv->sdev_contents, RW_WRITER);
364 	sdev_set_nodestate(dv, SDEV_INIT);
365 	rw_exit(&dv->sdev_contents);
366 	*newdv = dv;
367 
368 	return (0);
369 }
370 
371 /*
372  * transition a sdev_node into SDEV_READY state
373  */
374 int
375 sdev_nodeready(struct sdev_node *dv, struct vattr *vap, struct vnode *avp,
376     void *args, struct cred *cred)
377 {
378 	int error = 0;
379 	struct vnode *vp = SDEVTOV(dv);
380 	vtype_t type;
381 
382 	ASSERT(dv && (dv->sdev_state != SDEV_READY) && vap);
383 
384 	type = vap->va_type;
385 	vp->v_type = type;
386 	vp->v_rdev = vap->va_rdev;
387 	rw_enter(&dv->sdev_contents, RW_WRITER);
388 	if (type == VDIR) {
389 		dv->sdev_nlink = 2;
390 		dv->sdev_flags &= ~SDEV_PERSIST;
391 		dv->sdev_flags &= ~SDEV_DYNAMIC;
392 		vn_setops(vp, sdev_get_vop(dv)); /* from internal vtab */
393 		ASSERT(dv->sdev_dotdot);
394 		ASSERT(SDEVTOV(dv->sdev_dotdot)->v_type == VDIR);
395 		vp->v_rdev = SDEVTOV(dv->sdev_dotdot)->v_rdev;
396 		avl_create(&dv->sdev_entries,
397 		    (int (*)(const void *, const void *))sdev_compare_nodes,
398 		    sizeof (struct sdev_node),
399 		    offsetof(struct sdev_node, sdev_avllink));
400 	} else if (type == VLNK) {
401 		ASSERT(args);
402 		dv->sdev_nlink = 1;
403 		dv->sdev_symlink = i_ddi_strdup((char *)args, KM_SLEEP);
404 	} else {
405 		dv->sdev_nlink = 1;
406 	}
407 
408 	if (!(SDEV_IS_GLOBAL(dv))) {
409 		dv->sdev_origin = (struct sdev_node *)args;
410 		dv->sdev_flags &= ~SDEV_PERSIST;
411 	}
412 
413 	/*
414 	 * shadow node is created here OR
415 	 * if failed (indicated by dv->sdev_attrvp == NULL),
416 	 * created later in sdev_setattr
417 	 */
418 	if (avp) {
419 		dv->sdev_attrvp = avp;
420 	} else {
421 		if (dv->sdev_attr == NULL) {
422 			sdev_attr_alloc(dv, vap);
423 		} else {
424 			sdev_attr_update(dv, vap);
425 		}
426 
427 		if ((dv->sdev_attrvp == NULL) && SDEV_IS_PERSIST(dv))
428 			error = sdev_shadow_node(dv, cred);
429 	}
430 
431 	if (error == 0) {
432 		/* transition to READY state */
433 		sdev_set_nodestate(dv, SDEV_READY);
434 		sdev_nc_node_exists(dv);
435 	} else {
436 		sdev_set_nodestate(dv, SDEV_ZOMBIE);
437 	}
438 	rw_exit(&dv->sdev_contents);
439 	return (error);
440 }
441 
442 /*
443  * setting ZOMBIE state
444  */
445 static int
446 sdev_nodezombied(struct sdev_node *dv)
447 {
448 	rw_enter(&dv->sdev_contents, RW_WRITER);
449 	sdev_set_nodestate(dv, SDEV_ZOMBIE);
450 	rw_exit(&dv->sdev_contents);
451 	return (0);
452 }
453 
454 /*
455  * Build the VROOT sdev_node.
456  */
457 /*ARGSUSED*/
458 struct sdev_node *
459 sdev_mkroot(struct vfs *vfsp, dev_t devdev, struct vnode *mvp,
460     struct vnode *avp, struct cred *cred)
461 {
462 	struct sdev_node *dv;
463 	struct vnode *vp;
464 	char devdir[] = "/dev";
465 
466 	ASSERT(sdev_node_cache != NULL);
467 	ASSERT(avp);
468 	dv = kmem_cache_alloc(sdev_node_cache, KM_SLEEP);
469 	vp = SDEVTOV(dv);
470 	vn_reinit(vp);
471 	vp->v_flag |= VROOT;
472 	vp->v_vfsp = vfsp;
473 	vp->v_type = VDIR;
474 	vp->v_rdev = devdev;
475 	vn_setops(vp, sdev_vnodeops); /* apply the default vnodeops at /dev */
476 	vn_exists(vp);
477 
478 	if (vfsp->vfs_mntpt)
479 		dv->sdev_name = i_ddi_strdup(
480 		    (char *)refstr_value(vfsp->vfs_mntpt), KM_SLEEP);
481 	else
482 		/* vfs_mountdev1 set mount point later */
483 		dv->sdev_name = i_ddi_strdup("/dev", KM_SLEEP);
484 	dv->sdev_namelen = strlen(dv->sdev_name); /* '\0' not included */
485 	dv->sdev_path = i_ddi_strdup(devdir, KM_SLEEP);
486 	dv->sdev_ino = SDEV_ROOTINO;
487 	dv->sdev_nlink = 2;		/* name + . (no sdev_insert) */
488 	dv->sdev_dotdot = dv;		/* .. == self */
489 	dv->sdev_attrvp = avp;
490 	dv->sdev_attr = NULL;
491 	mutex_init(&dv->sdev_lookup_lock, NULL, MUTEX_DEFAULT, NULL);
492 	cv_init(&dv->sdev_lookup_cv, NULL, CV_DEFAULT, NULL);
493 	if (strcmp(dv->sdev_name, "/dev") == 0) {
494 		dv->sdev_flags = SDEV_BUILD|SDEV_GLOBAL|SDEV_PERSIST;
495 		bzero(&dv->sdev_handle, sizeof (dv->sdev_handle));
496 		dv->sdev_gdir_gen = 0;
497 	} else {
498 		dv->sdev_flags = SDEV_BUILD;
499 		dv->sdev_flags &= ~SDEV_PERSIST;
500 		bzero(&dv->sdev_prof, sizeof (dv->sdev_prof));
501 		dv->sdev_ldir_gen = 0;
502 		dv->sdev_devtree_gen = 0;
503 	}
504 
505 	avl_create(&dv->sdev_entries,
506 	    (int (*)(const void *, const void *))sdev_compare_nodes,
507 	    sizeof (struct sdev_node),
508 	    offsetof(struct sdev_node, sdev_avllink));
509 
510 	rw_enter(&dv->sdev_contents, RW_WRITER);
511 	sdev_set_nodestate(dv, SDEV_READY);
512 	rw_exit(&dv->sdev_contents);
513 	sdev_nc_node_exists(dv);
514 	return (dv);
515 }
516 
517 /* directory dependent vop table */
518 struct sdev_vop_table {
519 	char *vt_name;				/* subdirectory name */
520 	const fs_operation_def_t *vt_service;	/* vnodeops table */
521 	struct vnodeops *vt_vops;		/* constructed vop */
522 	struct vnodeops **vt_global_vops;	/* global container for vop */
523 	int (*vt_vtor)(struct sdev_node *);	/* validate sdev_node */
524 	int vt_flags;
525 };
526 
527 /*
528  * A nice improvement would be to provide a plug-in mechanism
529  * for this table instead of a const table.
530  */
531 static struct sdev_vop_table vtab[] =
532 {
533 	{ "pts", devpts_vnodeops_tbl, NULL, &devpts_vnodeops, devpts_validate,
534 	SDEV_DYNAMIC | SDEV_VTOR },
535 
536 	{ "vt", devvt_vnodeops_tbl, NULL, &devvt_vnodeops, devvt_validate,
537 	SDEV_DYNAMIC | SDEV_VTOR },
538 
539 	{ "zvol", devzvol_vnodeops_tbl, NULL, &devzvol_vnodeops,
540 	devzvol_validate, SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR },
541 
542 	{ "zcons", NULL, NULL, NULL, NULL, SDEV_NO_NCACHE },
543 
544 	{ "net", devnet_vnodeops_tbl, NULL, &devnet_vnodeops, devnet_validate,
545 	SDEV_DYNAMIC | SDEV_VTOR },
546 
547 	{ "ipnet", devipnet_vnodeops_tbl, NULL, &devipnet_vnodeops,
548 	devipnet_validate, SDEV_DYNAMIC | SDEV_VTOR | SDEV_NO_NCACHE },
549 
550 	/*
551 	 * SDEV_DYNAMIC: prevent calling out to devfsadm, since only the
552 	 * lofi driver controls child nodes.
553 	 *
554 	 * SDEV_PERSIST: ensure devfsadm knows to clean up any persisted
555 	 * stale nodes (e.g. from devfsadm -R).
556 	 *
557 	 * In addition, devfsadm knows not to attempt a rmdir: a zone
558 	 * may hold a reference, which would zombify the node,
559 	 * preventing a mkdir.
560 	 */
561 
562 	{ "lofi", NULL, NULL, NULL, NULL,
563 	    SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST },
564 	{ "rlofi", NULL, NULL, NULL, NULL,
565 	    SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST },
566 
567 	{ NULL, NULL, NULL, NULL, NULL, 0}
568 };
569 
570 struct sdev_vop_table *
571 sdev_match(struct sdev_node *dv)
572 {
573 	int vlen;
574 	int i;
575 
576 	for (i = 0; vtab[i].vt_name; i++) {
577 		if (strcmp(vtab[i].vt_name, dv->sdev_name) == 0)
578 			return (&vtab[i]);
579 		if (vtab[i].vt_flags & SDEV_SUBDIR) {
580 			char *ptr;
581 
582 			ASSERT(strlen(dv->sdev_path) > 5);
583 			ptr = dv->sdev_path + 5;
584 			vlen = strlen(vtab[i].vt_name);
585 			if ((strncmp(vtab[i].vt_name, ptr,
586 			    vlen - 1) == 0) && ptr[vlen] == '/')
587 				return (&vtab[i]);
588 		}
589 
590 	}
591 	return (NULL);
592 }
593 
594 /*
595  *  sets a directory's vnodeops if the directory is in the vtab;
596  */
597 static struct vnodeops *
598 sdev_get_vop(struct sdev_node *dv)
599 {
600 	struct sdev_vop_table *vtp;
601 	char *path;
602 
603 	path = dv->sdev_path;
604 	ASSERT(path);
605 
606 	/* gets the relative path to /dev/ */
607 	path += 5;
608 
609 	/* gets the vtab entry it matches */
610 	if ((vtp = sdev_match(dv)) != NULL) {
611 		dv->sdev_flags |= vtp->vt_flags;
612 
613 		if (vtp->vt_vops) {
614 			if (vtp->vt_global_vops)
615 				*(vtp->vt_global_vops) = vtp->vt_vops;
616 			return (vtp->vt_vops);
617 		}
618 
619 		if (vtp->vt_service) {
620 			fs_operation_def_t *templ;
621 			templ = sdev_merge_vtab(vtp->vt_service);
622 			if (vn_make_ops(vtp->vt_name,
623 			    (const fs_operation_def_t *)templ,
624 			    &vtp->vt_vops) != 0) {
625 				cmn_err(CE_PANIC, "%s: malformed vnode ops\n",
626 				    vtp->vt_name);
627 				/*NOTREACHED*/
628 			}
629 			if (vtp->vt_global_vops) {
630 				*(vtp->vt_global_vops) = vtp->vt_vops;
631 			}
632 			sdev_free_vtab(templ);
633 			return (vtp->vt_vops);
634 		}
635 		return (sdev_vnodeops);
636 	}
637 
638 	/* child inherits the persistence of the parent */
639 	if (SDEV_IS_PERSIST(dv->sdev_dotdot))
640 		dv->sdev_flags |= SDEV_PERSIST;
641 
642 	return (sdev_vnodeops);
643 }
644 
645 static void
646 sdev_set_no_negcache(struct sdev_node *dv)
647 {
648 	int i;
649 	char *path;
650 
651 	ASSERT(dv->sdev_path);
652 	path = dv->sdev_path + strlen("/dev/");
653 
654 	for (i = 0; vtab[i].vt_name; i++) {
655 		if (strcmp(vtab[i].vt_name, path) == 0) {
656 			if (vtab[i].vt_flags & SDEV_NO_NCACHE)
657 				dv->sdev_flags |= SDEV_NO_NCACHE;
658 			break;
659 		}
660 	}
661 }
662 
663 void *
664 sdev_get_vtor(struct sdev_node *dv)
665 {
666 	struct sdev_vop_table *vtp;
667 
668 	vtp = sdev_match(dv);
669 	if (vtp)
670 		return ((void *)vtp->vt_vtor);
671 	else
672 		return (NULL);
673 }
674 
675 /*
676  * Build the base root inode
677  */
678 ino_t
679 sdev_mkino(struct sdev_node *dv)
680 {
681 	ino_t	ino;
682 
683 	/*
684 	 * for now, follow the lead of tmpfs here
685 	 * need to someday understand the requirements here
686 	 */
687 	ino = (ino_t)(uint32_t)((uintptr_t)dv >> 3);
688 	ino += SDEV_ROOTINO + 1;
689 
690 	return (ino);
691 }
692 
693 int
694 sdev_getlink(struct vnode *linkvp, char **link)
695 {
696 	int err;
697 	char *buf;
698 	struct uio uio = {0};
699 	struct iovec iov = {0};
700 
701 	if (linkvp == NULL)
702 		return (ENOENT);
703 	ASSERT(linkvp->v_type == VLNK);
704 
705 	buf = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
706 	iov.iov_base = buf;
707 	iov.iov_len = MAXPATHLEN;
708 	uio.uio_iov = &iov;
709 	uio.uio_iovcnt = 1;
710 	uio.uio_resid = MAXPATHLEN;
711 	uio.uio_segflg = UIO_SYSSPACE;
712 	uio.uio_llimit = MAXOFFSET_T;
713 
714 	err = VOP_READLINK(linkvp, &uio, kcred, NULL);
715 	if (err) {
716 		cmn_err(CE_WARN, "readlink %s failed in dev\n", buf);
717 		kmem_free(buf, MAXPATHLEN);
718 		return (ENOENT);
719 	}
720 
721 	/* mission complete */
722 	*link = i_ddi_strdup(buf, KM_SLEEP);
723 	kmem_free(buf, MAXPATHLEN);
724 	return (0);
725 }
726 
727 /*
728  * A convenient wrapper to get the devfs node vnode for a device
729  * minor functionality: readlink() of a /dev symlink
730  * Place the link into dv->sdev_symlink
731  */
732 static int
733 sdev_follow_link(struct sdev_node *dv)
734 {
735 	int err;
736 	struct vnode *linkvp;
737 	char *link = NULL;
738 
739 	linkvp = SDEVTOV(dv);
740 	if (linkvp == NULL)
741 		return (ENOENT);
742 	ASSERT(linkvp->v_type == VLNK);
743 	err = sdev_getlink(linkvp, &link);
744 	if (err) {
745 		(void) sdev_nodezombied(dv);
746 		dv->sdev_symlink = NULL;
747 		return (ENOENT);
748 	}
749 
750 	ASSERT(link != NULL);
751 	dv->sdev_symlink = link;
752 	return (0);
753 }
754 
755 static int
756 sdev_node_check(struct sdev_node *dv, struct vattr *nvap, void *nargs)
757 {
758 	vtype_t otype = SDEVTOV(dv)->v_type;
759 
760 	/*
761 	 * existing sdev_node has a different type.
762 	 */
763 	if (otype != nvap->va_type) {
764 		sdcmn_err9(("sdev_node_check: existing node "
765 		    "  %s type %d does not match new node type %d\n",
766 		    dv->sdev_name, otype, nvap->va_type));
767 		return (EEXIST);
768 	}
769 
770 	/*
771 	 * For a symlink, the target should be the same.
772 	 */
773 	if (otype == VLNK) {
774 		ASSERT(nargs != NULL);
775 		ASSERT(dv->sdev_symlink != NULL);
776 		if (strcmp(dv->sdev_symlink, (char *)nargs) != 0) {
777 			sdcmn_err9(("sdev_node_check: existing node "
778 			    " %s has different symlink %s as new node "
779 			    " %s\n", dv->sdev_name, dv->sdev_symlink,
780 			    (char *)nargs));
781 			return (EEXIST);
782 		}
783 	}
784 
785 	return (0);
786 }
787 
788 /*
789  * sdev_mknode - a wrapper for sdev_nodeinit(), sdev_nodeready()
790  *
791  * arguments:
792  *	- ddv (parent)
793  *	- nm (child name)
794  *	- newdv (sdev_node for nm is returned here)
795  *	- vap (vattr for the node to be created, va_type should be set.
796  *	- avp (attribute vnode)
797  *	  the defaults should be used if unknown)
798  *	- cred
799  *	- args
800  *	    . tnm (for VLNK)
801  *	    . global sdev_node (for !SDEV_GLOBAL)
802  * 	- state: SDEV_INIT, SDEV_READY
803  *
804  * only ddv, nm, newddv, vap, cred are required for sdev_mknode(SDEV_INIT)
805  *
806  * NOTE:  directory contents writers lock needs to be held before
807  *	  calling this routine.
808  */
809 int
810 sdev_mknode(struct sdev_node *ddv, char *nm, struct sdev_node **newdv,
811     struct vattr *vap, struct vnode *avp, void *args, struct cred *cred,
812     sdev_node_state_t state)
813 {
814 	int error = 0;
815 	sdev_node_state_t node_state;
816 	struct sdev_node *dv = NULL;
817 
818 	ASSERT(state != SDEV_ZOMBIE);
819 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
820 
821 	if (*newdv) {
822 		dv = *newdv;
823 	} else {
824 		/* allocate and initialize a sdev_node */
825 		if (ddv->sdev_state == SDEV_ZOMBIE) {
826 			sdcmn_err9(("sdev_mknode: parent %s ZOMBIEd\n",
827 			    ddv->sdev_path));
828 			return (ENOENT);
829 		}
830 
831 		error = sdev_nodeinit(ddv, nm, &dv, vap);
832 		if (error != 0) {
833 			sdcmn_err9(("sdev_mknode: error %d,"
834 			    " name %s can not be initialized\n",
835 			    error, nm));
836 			return (error);
837 		}
838 		ASSERT(dv);
839 
840 		/* insert into the directory cache */
841 		error = sdev_cache_update(ddv, &dv, nm, SDEV_CACHE_ADD);
842 		if (error) {
843 			sdcmn_err9(("sdev_mknode: node %s can not"
844 			    " be added into directory cache\n", nm));
845 			return (ENOENT);
846 		}
847 	}
848 
849 	ASSERT(dv);
850 	node_state = dv->sdev_state;
851 	ASSERT(node_state != SDEV_ZOMBIE);
852 
853 	if (state == SDEV_READY) {
854 		switch (node_state) {
855 		case SDEV_INIT:
856 			error = sdev_nodeready(dv, vap, avp, args, cred);
857 			if (error) {
858 				sdcmn_err9(("sdev_mknode: node %s can NOT"
859 				    " be transitioned into READY state, "
860 				    "error %d\n", nm, error));
861 			}
862 			break;
863 		case SDEV_READY:
864 			/*
865 			 * Do some sanity checking to make sure
866 			 * the existing sdev_node is what has been
867 			 * asked for.
868 			 */
869 			error = sdev_node_check(dv, vap, args);
870 			break;
871 		default:
872 			break;
873 		}
874 	}
875 
876 	if (!error) {
877 		*newdv = dv;
878 		ASSERT((*newdv)->sdev_state != SDEV_ZOMBIE);
879 	} else {
880 		SDEV_SIMPLE_RELE(dv);
881 		*newdv = NULL;
882 	}
883 
884 	return (error);
885 }
886 
887 /*
888  * convenient wrapper to change vp's ATIME, CTIME and MTIME
889  */
890 void
891 sdev_update_timestamps(struct vnode *vp, cred_t *cred, uint_t mask)
892 {
893 	struct vattr attr;
894 	timestruc_t now;
895 	int err;
896 
897 	ASSERT(vp);
898 	gethrestime(&now);
899 	if (mask & AT_CTIME)
900 		attr.va_ctime = now;
901 	if (mask & AT_MTIME)
902 		attr.va_mtime = now;
903 	if (mask & AT_ATIME)
904 		attr.va_atime = now;
905 
906 	attr.va_mask = (mask & AT_TIMES);
907 	err = VOP_SETATTR(vp, &attr, 0, cred, NULL);
908 	if (err && (err != EROFS)) {
909 		sdcmn_err(("update timestamps error %d\n", err));
910 	}
911 }
912 
913 /*
914  * the backing store vnode is released here
915  */
916 /*ARGSUSED1*/
917 void
918 sdev_nodedestroy(struct sdev_node *dv, uint_t flags)
919 {
920 	/* no references */
921 	ASSERT(dv->sdev_nlink == 0);
922 
923 	if (dv->sdev_attrvp != NULLVP) {
924 		VN_RELE(dv->sdev_attrvp);
925 		/*
926 		 * reset the attrvp so that no more
927 		 * references can be made on this already
928 		 * vn_rele() vnode
929 		 */
930 		dv->sdev_attrvp = NULLVP;
931 	}
932 
933 	if (dv->sdev_attr != NULL) {
934 		kmem_free(dv->sdev_attr, sizeof (struct vattr));
935 		dv->sdev_attr = NULL;
936 	}
937 
938 	if (dv->sdev_name != NULL) {
939 		kmem_free(dv->sdev_name, dv->sdev_namelen + 1);
940 		dv->sdev_name = NULL;
941 	}
942 
943 	if (dv->sdev_symlink != NULL) {
944 		kmem_free(dv->sdev_symlink, strlen(dv->sdev_symlink) + 1);
945 		dv->sdev_symlink = NULL;
946 	}
947 
948 	if (dv->sdev_path) {
949 		kmem_free(dv->sdev_path, strlen(dv->sdev_path) + 1);
950 		dv->sdev_path = NULL;
951 	}
952 
953 	if (!SDEV_IS_GLOBAL(dv))
954 		sdev_prof_free(dv);
955 
956 	if (SDEVTOV(dv)->v_type == VDIR) {
957 		ASSERT(SDEV_FIRST_ENTRY(dv) == NULL);
958 		avl_destroy(&dv->sdev_entries);
959 	}
960 
961 	mutex_destroy(&dv->sdev_lookup_lock);
962 	cv_destroy(&dv->sdev_lookup_cv);
963 
964 	/* return node to initial state as per constructor */
965 	(void) memset((void *)&dv->sdev_instance_data, 0,
966 	    sizeof (dv->sdev_instance_data));
967 	vn_invalid(SDEVTOV(dv));
968 	kmem_cache_free(sdev_node_cache, dv);
969 }
970 
971 /*
972  * DIRECTORY CACHE lookup
973  */
974 struct sdev_node *
975 sdev_findbyname(struct sdev_node *ddv, char *nm)
976 {
977 	struct sdev_node *dv;
978 	struct sdev_node dvtmp;
979 	avl_index_t	where;
980 
981 	ASSERT(RW_LOCK_HELD(&ddv->sdev_contents));
982 
983 	dvtmp.sdev_name = nm;
984 	dv = avl_find(&ddv->sdev_entries, &dvtmp, &where);
985 	if (dv) {
986 		ASSERT(dv->sdev_dotdot == ddv);
987 		ASSERT(strcmp(dv->sdev_name, nm) == 0);
988 		SDEV_HOLD(dv);
989 		return (dv);
990 	}
991 	return (NULL);
992 }
993 
994 /*
995  * Inserts a new sdev_node in a parent directory
996  */
997 void
998 sdev_direnter(struct sdev_node *ddv, struct sdev_node *dv)
999 {
1000 	avl_index_t where;
1001 
1002 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1003 	ASSERT(SDEVTOV(ddv)->v_type == VDIR);
1004 	ASSERT(ddv->sdev_nlink >= 2);
1005 	ASSERT(dv->sdev_nlink == 0);
1006 
1007 	dv->sdev_dotdot = ddv;
1008 	VERIFY(avl_find(&ddv->sdev_entries, dv, &where) == NULL);
1009 	avl_insert(&ddv->sdev_entries, dv, where);
1010 	ddv->sdev_nlink++;
1011 }
1012 
1013 /*
1014  * The following check is needed because while sdev_nodes are linked
1015  * in SDEV_INIT state, they have their link counts incremented only
1016  * in SDEV_READY state.
1017  */
1018 static void
1019 decr_link(struct sdev_node *dv)
1020 {
1021 	if (dv->sdev_state != SDEV_INIT)
1022 		dv->sdev_nlink--;
1023 	else
1024 		ASSERT(dv->sdev_nlink == 0);
1025 }
1026 
1027 /*
1028  * Delete an existing dv from directory cache
1029  *
1030  * In the case of a node is still held by non-zero reference count,
1031  *     the node is put into ZOMBIE state. Once the reference count
1032  *     reaches "0", the node is unlinked and destroyed,
1033  *     in sdev_inactive().
1034  */
1035 static int
1036 sdev_dirdelete(struct sdev_node *ddv, struct sdev_node *dv)
1037 {
1038 	struct vnode *vp;
1039 
1040 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1041 
1042 	vp = SDEVTOV(dv);
1043 	mutex_enter(&vp->v_lock);
1044 
1045 	/* dv is held still */
1046 	if (vp->v_count > 1) {
1047 		rw_enter(&dv->sdev_contents, RW_WRITER);
1048 		if (dv->sdev_state == SDEV_READY) {
1049 			sdcmn_err9((
1050 			    "sdev_dirdelete: node %s busy with count %d\n",
1051 			    dv->sdev_name, vp->v_count));
1052 			dv->sdev_state = SDEV_ZOMBIE;
1053 		}
1054 		rw_exit(&dv->sdev_contents);
1055 		--vp->v_count;
1056 		mutex_exit(&vp->v_lock);
1057 		return (EBUSY);
1058 	}
1059 	ASSERT(vp->v_count == 1);
1060 
1061 	/* unlink from the memory cache */
1062 	ddv->sdev_nlink--;	/* .. to above */
1063 	if (vp->v_type == VDIR) {
1064 		decr_link(dv);		/* . to self */
1065 	}
1066 
1067 	avl_remove(&ddv->sdev_entries, dv);
1068 	decr_link(dv);	/* name, back to zero */
1069 	vp->v_count--;
1070 	mutex_exit(&vp->v_lock);
1071 
1072 	/* destroy the node */
1073 	sdev_nodedestroy(dv, 0);
1074 	return (0);
1075 }
1076 
1077 /*
1078  * check if the source is in the path of the target
1079  *
1080  * source and target are different
1081  */
1082 /*ARGSUSED2*/
1083 static int
1084 sdev_checkpath(struct sdev_node *sdv, struct sdev_node *tdv, struct cred *cred)
1085 {
1086 	int error = 0;
1087 	struct sdev_node *dotdot, *dir;
1088 
1089 	dotdot = tdv->sdev_dotdot;
1090 	ASSERT(dotdot);
1091 
1092 	/* fs root */
1093 	if (dotdot == tdv) {
1094 		return (0);
1095 	}
1096 
1097 	for (;;) {
1098 		/*
1099 		 * avoid error cases like
1100 		 *	mv a a/b
1101 		 *	mv a a/b/c
1102 		 *	etc.
1103 		 */
1104 		if (dotdot == sdv) {
1105 			error = EINVAL;
1106 			break;
1107 		}
1108 
1109 		dir = dotdot;
1110 		dotdot = dir->sdev_dotdot;
1111 
1112 		/* done checking because root is reached */
1113 		if (dir == dotdot) {
1114 			break;
1115 		}
1116 	}
1117 	return (error);
1118 }
1119 
1120 int
1121 sdev_rnmnode(struct sdev_node *oddv, struct sdev_node *odv,
1122     struct sdev_node *nddv, struct sdev_node **ndvp, char *nnm,
1123     struct cred *cred)
1124 {
1125 	int error = 0;
1126 	struct vnode *ovp = SDEVTOV(odv);
1127 	struct vnode *nvp;
1128 	struct vattr vattr;
1129 	int doingdir = (ovp->v_type == VDIR);
1130 	char *link = NULL;
1131 	int samedir = (oddv == nddv) ? 1 : 0;
1132 	int bkstore = 0;
1133 	struct sdev_node *idv = NULL;
1134 	struct sdev_node *ndv = NULL;
1135 	timestruc_t now;
1136 
1137 	vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
1138 	error = VOP_GETATTR(ovp, &vattr, 0, cred, NULL);
1139 	if (error)
1140 		return (error);
1141 
1142 	if (!samedir)
1143 		rw_enter(&oddv->sdev_contents, RW_WRITER);
1144 	rw_enter(&nddv->sdev_contents, RW_WRITER);
1145 
1146 	/*
1147 	 * the source may have been deleted by another thread before
1148 	 * we gets here.
1149 	 */
1150 	if (odv->sdev_state != SDEV_READY) {
1151 		error = ENOENT;
1152 		goto err_out;
1153 	}
1154 
1155 	if (doingdir && (odv == nddv)) {
1156 		error = EINVAL;
1157 		goto err_out;
1158 	}
1159 
1160 	/*
1161 	 * If renaming a directory, and the parents are different (".." must be
1162 	 * changed) then the source dir must not be in the dir hierarchy above
1163 	 * the target since it would orphan everything below the source dir.
1164 	 */
1165 	if (doingdir && (oddv != nddv)) {
1166 		error = sdev_checkpath(odv, nddv, cred);
1167 		if (error)
1168 			goto err_out;
1169 	}
1170 
1171 	/* destination existing */
1172 	if (*ndvp) {
1173 		nvp = SDEVTOV(*ndvp);
1174 		ASSERT(nvp);
1175 
1176 		/* handling renaming to itself */
1177 		if (odv == *ndvp) {
1178 			error = 0;
1179 			goto err_out;
1180 		}
1181 
1182 		if (nvp->v_type == VDIR) {
1183 			if (!doingdir) {
1184 				error = EISDIR;
1185 				goto err_out;
1186 			}
1187 
1188 			if (vn_vfswlock(nvp)) {
1189 				error = EBUSY;
1190 				goto err_out;
1191 			}
1192 
1193 			if (vn_mountedvfs(nvp) != NULL) {
1194 				vn_vfsunlock(nvp);
1195 				error = EBUSY;
1196 				goto err_out;
1197 			}
1198 
1199 			/* in case dir1 exists in dir2 and "mv dir1 dir2" */
1200 			if ((*ndvp)->sdev_nlink > 2) {
1201 				vn_vfsunlock(nvp);
1202 				error = EEXIST;
1203 				goto err_out;
1204 			}
1205 			vn_vfsunlock(nvp);
1206 
1207 			(void) sdev_dirdelete(nddv, *ndvp);
1208 			*ndvp = NULL;
1209 			ASSERT(nddv->sdev_attrvp);
1210 			error = VOP_RMDIR(nddv->sdev_attrvp, nnm,
1211 			    nddv->sdev_attrvp, cred, NULL, 0);
1212 			if (error)
1213 				goto err_out;
1214 		} else {
1215 			if (doingdir) {
1216 				error = ENOTDIR;
1217 				goto err_out;
1218 			}
1219 
1220 			if (SDEV_IS_PERSIST((*ndvp))) {
1221 				bkstore = 1;
1222 			}
1223 
1224 			/*
1225 			 * get rid of the node from the directory cache
1226 			 * note, in case EBUSY is returned, the ZOMBIE
1227 			 * node is taken care in sdev_mknode.
1228 			 */
1229 			(void) sdev_dirdelete(nddv, *ndvp);
1230 			*ndvp = NULL;
1231 			if (bkstore) {
1232 				ASSERT(nddv->sdev_attrvp);
1233 				error = VOP_REMOVE(nddv->sdev_attrvp,
1234 				    nnm, cred, NULL, 0);
1235 				if (error)
1236 					goto err_out;
1237 			}
1238 		}
1239 	}
1240 
1241 	/* fix the source for a symlink */
1242 	if (vattr.va_type == VLNK) {
1243 		if (odv->sdev_symlink == NULL) {
1244 			error = sdev_follow_link(odv);
1245 			if (error) {
1246 				error = ENOENT;
1247 				goto err_out;
1248 			}
1249 		}
1250 		ASSERT(odv->sdev_symlink);
1251 		link = i_ddi_strdup(odv->sdev_symlink, KM_SLEEP);
1252 	}
1253 
1254 	/*
1255 	 * make a fresh node from the source attrs
1256 	 */
1257 	ASSERT(RW_WRITE_HELD(&nddv->sdev_contents));
1258 	error = sdev_mknode(nddv, nnm, ndvp, &vattr,
1259 	    NULL, (void *)link, cred, SDEV_READY);
1260 
1261 	if (link)
1262 		kmem_free(link, strlen(link) + 1);
1263 
1264 	if (error)
1265 		goto err_out;
1266 	ASSERT(*ndvp);
1267 	ASSERT((*ndvp)->sdev_state == SDEV_READY);
1268 
1269 	/* move dir contents */
1270 	if (doingdir) {
1271 		for (idv = SDEV_FIRST_ENTRY(odv); idv;
1272 		    idv = SDEV_NEXT_ENTRY(odv, idv)) {
1273 			error = sdev_rnmnode(odv, idv,
1274 			    (struct sdev_node *)(*ndvp), &ndv,
1275 			    idv->sdev_name, cred);
1276 			if (error)
1277 				goto err_out;
1278 			ndv = NULL;
1279 		}
1280 	}
1281 
1282 	if ((*ndvp)->sdev_attrvp) {
1283 		sdev_update_timestamps((*ndvp)->sdev_attrvp, kcred,
1284 		    AT_CTIME|AT_ATIME);
1285 	} else {
1286 		ASSERT((*ndvp)->sdev_attr);
1287 		gethrestime(&now);
1288 		(*ndvp)->sdev_attr->va_ctime = now;
1289 		(*ndvp)->sdev_attr->va_atime = now;
1290 	}
1291 
1292 	if (nddv->sdev_attrvp) {
1293 		sdev_update_timestamps(nddv->sdev_attrvp, kcred,
1294 		    AT_MTIME|AT_ATIME);
1295 	} else {
1296 		ASSERT(nddv->sdev_attr);
1297 		gethrestime(&now);
1298 		nddv->sdev_attr->va_mtime = now;
1299 		nddv->sdev_attr->va_atime = now;
1300 	}
1301 	rw_exit(&nddv->sdev_contents);
1302 	if (!samedir)
1303 		rw_exit(&oddv->sdev_contents);
1304 
1305 	SDEV_RELE(*ndvp);
1306 	return (error);
1307 
1308 err_out:
1309 	rw_exit(&nddv->sdev_contents);
1310 	if (!samedir)
1311 		rw_exit(&oddv->sdev_contents);
1312 	return (error);
1313 }
1314 
1315 /*
1316  * Merge sdev_node specific information into an attribute structure.
1317  *
1318  * note: sdev_node is not locked here
1319  */
1320 void
1321 sdev_vattr_merge(struct sdev_node *dv, struct vattr *vap)
1322 {
1323 	struct vnode *vp = SDEVTOV(dv);
1324 
1325 	vap->va_nlink = dv->sdev_nlink;
1326 	vap->va_nodeid = dv->sdev_ino;
1327 	vap->va_fsid = SDEVTOV(dv->sdev_dotdot)->v_rdev;
1328 	vap->va_type = vp->v_type;
1329 
1330 	if (vp->v_type == VDIR) {
1331 		vap->va_rdev = 0;
1332 		vap->va_fsid = vp->v_rdev;
1333 	} else if (vp->v_type == VLNK) {
1334 		vap->va_rdev = 0;
1335 		vap->va_mode  &= ~S_IFMT;
1336 		vap->va_mode |= S_IFLNK;
1337 	} else if ((vp->v_type == VCHR) || (vp->v_type == VBLK)) {
1338 		vap->va_rdev = vp->v_rdev;
1339 		vap->va_mode &= ~S_IFMT;
1340 		if (vap->va_type == VCHR)
1341 			vap->va_mode |= S_IFCHR;
1342 		else
1343 			vap->va_mode |= S_IFBLK;
1344 	} else {
1345 		vap->va_rdev = 0;
1346 	}
1347 }
1348 
1349 struct vattr *
1350 sdev_getdefault_attr(enum vtype type)
1351 {
1352 	if (type == VDIR)
1353 		return (&sdev_vattr_dir);
1354 	else if (type == VCHR)
1355 		return (&sdev_vattr_chr);
1356 	else if (type == VBLK)
1357 		return (&sdev_vattr_blk);
1358 	else if (type == VLNK)
1359 		return (&sdev_vattr_lnk);
1360 	else
1361 		return (NULL);
1362 }
1363 int
1364 sdev_to_vp(struct sdev_node *dv, struct vnode **vpp)
1365 {
1366 	int rv = 0;
1367 	struct vnode *vp = SDEVTOV(dv);
1368 
1369 	switch (vp->v_type) {
1370 	case VCHR:
1371 	case VBLK:
1372 		/*
1373 		 * If vnode is a device, return special vnode instead
1374 		 * (though it knows all about -us- via sp->s_realvp)
1375 		 */
1376 		*vpp = specvp(vp, vp->v_rdev, vp->v_type, kcred);
1377 		VN_RELE(vp);
1378 		if (*vpp == NULLVP)
1379 			rv = ENOSYS;
1380 		break;
1381 	default:	/* most types are returned as is */
1382 		*vpp = vp;
1383 		break;
1384 	}
1385 	return (rv);
1386 }
1387 
1388 /*
1389  * junction between devname and root file system, e.g. ufs
1390  */
1391 int
1392 devname_backstore_lookup(struct sdev_node *ddv, char *nm, struct vnode **rvp)
1393 {
1394 	struct vnode *rdvp = ddv->sdev_attrvp;
1395 	int rval = 0;
1396 
1397 	ASSERT(rdvp);
1398 
1399 	rval = VOP_LOOKUP(rdvp, nm, rvp, NULL, 0, NULL, kcred, NULL, NULL,
1400 	    NULL);
1401 	return (rval);
1402 }
1403 
1404 static int
1405 sdev_filldir_from_store(struct sdev_node *ddv, int dlen, struct cred *cred)
1406 {
1407 	struct sdev_node *dv = NULL;
1408 	char	*nm;
1409 	struct vnode *dirvp;
1410 	int	error;
1411 	vnode_t	*vp;
1412 	int eof;
1413 	struct iovec iov;
1414 	struct uio uio;
1415 	struct dirent64 *dp;
1416 	dirent64_t *dbuf;
1417 	size_t dbuflen;
1418 	struct vattr vattr;
1419 	char *link = NULL;
1420 
1421 	if (ddv->sdev_attrvp == NULL)
1422 		return (0);
1423 	if (!(ddv->sdev_flags & SDEV_BUILD))
1424 		return (0);
1425 
1426 	dirvp = ddv->sdev_attrvp;
1427 	VN_HOLD(dirvp);
1428 	dbuf = kmem_zalloc(dlen, KM_SLEEP);
1429 
1430 	uio.uio_iov = &iov;
1431 	uio.uio_iovcnt = 1;
1432 	uio.uio_segflg = UIO_SYSSPACE;
1433 	uio.uio_fmode = 0;
1434 	uio.uio_extflg = UIO_COPY_CACHED;
1435 	uio.uio_loffset = 0;
1436 	uio.uio_llimit = MAXOFFSET_T;
1437 
1438 	eof = 0;
1439 	error = 0;
1440 	while (!error && !eof) {
1441 		uio.uio_resid = dlen;
1442 		iov.iov_base = (char *)dbuf;
1443 		iov.iov_len = dlen;
1444 		(void) VOP_RWLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
1445 		error = VOP_READDIR(dirvp, &uio, kcred, &eof, NULL, 0);
1446 		VOP_RWUNLOCK(dirvp, V_WRITELOCK_FALSE, NULL);
1447 
1448 		dbuflen = dlen - uio.uio_resid;
1449 		if (error || dbuflen == 0)
1450 			break;
1451 
1452 		if (!(ddv->sdev_flags & SDEV_BUILD))
1453 			break;
1454 
1455 		for (dp = dbuf; ((intptr_t)dp <
1456 		    (intptr_t)dbuf + dbuflen);
1457 		    dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
1458 			nm = dp->d_name;
1459 
1460 			if (strcmp(nm, ".") == 0 ||
1461 			    strcmp(nm, "..") == 0)
1462 				continue;
1463 
1464 			vp = NULLVP;
1465 			dv = sdev_cache_lookup(ddv, nm);
1466 			if (dv) {
1467 				if (dv->sdev_state != SDEV_ZOMBIE) {
1468 					SDEV_SIMPLE_RELE(dv);
1469 				} else {
1470 					/*
1471 					 * A ZOMBIE node may not have been
1472 					 * cleaned up from the backing store,
1473 					 * bypass this entry in this case,
1474 					 * and clean it up from the directory
1475 					 * cache if this is the last call.
1476 					 */
1477 					(void) sdev_dirdelete(ddv, dv);
1478 				}
1479 				continue;
1480 			}
1481 
1482 			/* refill the cache if not already */
1483 			error = devname_backstore_lookup(ddv, nm, &vp);
1484 			if (error)
1485 				continue;
1486 
1487 			vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
1488 			error = VOP_GETATTR(vp, &vattr, 0, cred, NULL);
1489 			if (error)
1490 				continue;
1491 
1492 			if (vattr.va_type == VLNK) {
1493 				error = sdev_getlink(vp, &link);
1494 				if (error) {
1495 					continue;
1496 				}
1497 				ASSERT(link != NULL);
1498 			}
1499 
1500 			if (!rw_tryupgrade(&ddv->sdev_contents)) {
1501 				rw_exit(&ddv->sdev_contents);
1502 				rw_enter(&ddv->sdev_contents, RW_WRITER);
1503 			}
1504 			error = sdev_mknode(ddv, nm, &dv, &vattr, vp, link,
1505 			    cred, SDEV_READY);
1506 			rw_downgrade(&ddv->sdev_contents);
1507 
1508 			if (link != NULL) {
1509 				kmem_free(link, strlen(link) + 1);
1510 				link = NULL;
1511 			}
1512 
1513 			if (!error) {
1514 				ASSERT(dv);
1515 				ASSERT(dv->sdev_state != SDEV_ZOMBIE);
1516 				SDEV_SIMPLE_RELE(dv);
1517 			}
1518 			vp = NULL;
1519 			dv = NULL;
1520 		}
1521 	}
1522 
1523 done:
1524 	VN_RELE(dirvp);
1525 	kmem_free(dbuf, dlen);
1526 
1527 	return (error);
1528 }
1529 
1530 void
1531 sdev_filldir_dynamic(struct sdev_node *ddv)
1532 {
1533 	int error;
1534 	int i;
1535 	struct vattr vattr;
1536 	struct vattr *vap = &vattr;
1537 	char *nm = NULL;
1538 	struct sdev_node *dv = NULL;
1539 
1540 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1541 	ASSERT((ddv->sdev_flags & SDEV_BUILD));
1542 
1543 	*vap = *sdev_getdefault_attr(VDIR);	/* note structure copy here */
1544 	gethrestime(&vap->va_atime);
1545 	vap->va_mtime = vap->va_atime;
1546 	vap->va_ctime = vap->va_atime;
1547 	for (i = 0; vtab[i].vt_name != NULL; i++) {
1548 		/*
1549 		 * This early, we may be in a read-only /dev
1550 		 * environment: leave the creation of any nodes we'd
1551 		 * attempt to persist to devfsadm.
1552 		 */
1553 		if (vtab[i].vt_flags & SDEV_PERSIST)
1554 			continue;
1555 		nm = vtab[i].vt_name;
1556 		ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1557 		dv = NULL;
1558 		error = sdev_mknode(ddv, nm, &dv, vap, NULL,
1559 		    NULL, kcred, SDEV_READY);
1560 		if (error) {
1561 			cmn_err(CE_WARN, "%s/%s: error %d\n",
1562 			    ddv->sdev_name, nm, error);
1563 		} else {
1564 			ASSERT(dv);
1565 			ASSERT(dv->sdev_state != SDEV_ZOMBIE);
1566 			SDEV_SIMPLE_RELE(dv);
1567 		}
1568 	}
1569 }
1570 
1571 /*
1572  * Creating a backing store entry based on sdev_attr.
1573  * This is called either as part of node creation in a persistent directory
1574  * or from setattr/setsecattr to persist access attributes across reboot.
1575  */
1576 int
1577 sdev_shadow_node(struct sdev_node *dv, struct cred *cred)
1578 {
1579 	int error = 0;
1580 	struct vnode *dvp = SDEVTOV(dv->sdev_dotdot);
1581 	struct vnode *rdvp = VTOSDEV(dvp)->sdev_attrvp;
1582 	struct vattr *vap = dv->sdev_attr;
1583 	char *nm = dv->sdev_name;
1584 	struct vnode *tmpvp, **rvp = &tmpvp, *rrvp = NULL;
1585 
1586 	ASSERT(dv && dv->sdev_name && rdvp);
1587 	ASSERT(RW_WRITE_HELD(&dv->sdev_contents) && dv->sdev_attrvp == NULL);
1588 
1589 lookup:
1590 	/* try to find it in the backing store */
1591 	error = VOP_LOOKUP(rdvp, nm, rvp, NULL, 0, NULL, cred, NULL, NULL,
1592 	    NULL);
1593 	if (error == 0) {
1594 		if (VOP_REALVP(*rvp, &rrvp, NULL) == 0) {
1595 			VN_HOLD(rrvp);
1596 			VN_RELE(*rvp);
1597 			*rvp = rrvp;
1598 		}
1599 
1600 		kmem_free(dv->sdev_attr, sizeof (vattr_t));
1601 		dv->sdev_attr = NULL;
1602 		dv->sdev_attrvp = *rvp;
1603 		return (0);
1604 	}
1605 
1606 	/* let's try to persist the node */
1607 	gethrestime(&vap->va_atime);
1608 	vap->va_mtime = vap->va_atime;
1609 	vap->va_ctime = vap->va_atime;
1610 	vap->va_mask |= AT_TYPE|AT_MODE;
1611 	switch (vap->va_type) {
1612 	case VDIR:
1613 		error = VOP_MKDIR(rdvp, nm, vap, rvp, cred, NULL, 0, NULL);
1614 		sdcmn_err9(("sdev_shadow_node: mkdir vp %p error %d\n",
1615 		    (void *)(*rvp), error));
1616 		break;
1617 	case VCHR:
1618 	case VBLK:
1619 	case VREG:
1620 	case VDOOR:
1621 		error = VOP_CREATE(rdvp, nm, vap, NONEXCL, VREAD|VWRITE,
1622 		    rvp, cred, 0, NULL, NULL);
1623 		sdcmn_err9(("sdev_shadow_node: create vp %p, error %d\n",
1624 		    (void *)(*rvp), error));
1625 		if (!error)
1626 			VN_RELE(*rvp);
1627 		break;
1628 	case VLNK:
1629 		ASSERT(dv->sdev_symlink);
1630 		error = VOP_SYMLINK(rdvp, nm, vap, dv->sdev_symlink, cred,
1631 		    NULL, 0);
1632 		sdcmn_err9(("sdev_shadow_node: create symlink error %d\n",
1633 		    error));
1634 		break;
1635 	default:
1636 		cmn_err(CE_PANIC, "dev: %s: sdev_shadow_node "
1637 		    "create\n", nm);
1638 		/*NOTREACHED*/
1639 	}
1640 
1641 	/* go back to lookup to factor out spec node and set attrvp */
1642 	if (error == 0)
1643 		goto lookup;
1644 
1645 	sdcmn_err(("cannot persist %s - error %d\n", dv->sdev_path, error));
1646 	return (error);
1647 }
1648 
1649 static int
1650 sdev_cache_add(struct sdev_node *ddv, struct sdev_node **dv, char *nm)
1651 {
1652 	int error = 0;
1653 	struct sdev_node *dup = NULL;
1654 
1655 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1656 	if ((dup = sdev_findbyname(ddv, nm)) == NULL) {
1657 		sdev_direnter(ddv, *dv);
1658 	} else {
1659 		if (dup->sdev_state == SDEV_ZOMBIE) {
1660 			error = sdev_dirdelete(ddv, dup);
1661 			/*
1662 			 * The ZOMBIE node is still hanging
1663 			 * around with more than one reference counts.
1664 			 * Fail the new node creation so that
1665 			 * the directory cache won't have
1666 			 * duplicate entries for the same named node
1667 			 */
1668 			if (error == EBUSY) {
1669 				SDEV_SIMPLE_RELE(*dv);
1670 				sdev_nodedestroy(*dv, 0);
1671 				*dv = NULL;
1672 				return (error);
1673 			}
1674 			sdev_direnter(ddv, *dv);
1675 		} else {
1676 			ASSERT((*dv)->sdev_state != SDEV_ZOMBIE);
1677 			SDEV_SIMPLE_RELE(*dv);
1678 			sdev_nodedestroy(*dv, 0);
1679 			*dv = dup;
1680 		}
1681 	}
1682 
1683 	return (0);
1684 }
1685 
1686 static int
1687 sdev_cache_delete(struct sdev_node *ddv, struct sdev_node **dv)
1688 {
1689 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1690 	return (sdev_dirdelete(ddv, *dv));
1691 }
1692 
1693 /*
1694  * update the in-core directory cache
1695  */
1696 int
1697 sdev_cache_update(struct sdev_node *ddv, struct sdev_node **dv, char *nm,
1698     sdev_cache_ops_t ops)
1699 {
1700 	int error = 0;
1701 
1702 	ASSERT((SDEV_HELD(*dv)));
1703 
1704 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1705 	switch (ops) {
1706 	case SDEV_CACHE_ADD:
1707 		error = sdev_cache_add(ddv, dv, nm);
1708 		break;
1709 	case SDEV_CACHE_DELETE:
1710 		error = sdev_cache_delete(ddv, dv);
1711 		break;
1712 	default:
1713 		break;
1714 	}
1715 
1716 	return (error);
1717 }
1718 
1719 /*
1720  * retrieve the named entry from the directory cache
1721  */
1722 struct sdev_node *
1723 sdev_cache_lookup(struct sdev_node *ddv, char *nm)
1724 {
1725 	struct sdev_node *dv = NULL;
1726 
1727 	ASSERT(RW_LOCK_HELD(&ddv->sdev_contents));
1728 	dv = sdev_findbyname(ddv, nm);
1729 
1730 	return (dv);
1731 }
1732 
1733 /*
1734  * Implicit reconfig for nodes constructed by a link generator
1735  * Start devfsadm if needed, or if devfsadm is in progress,
1736  * prepare to block on devfsadm either completing or
1737  * constructing the desired node.  As devfsadmd is global
1738  * in scope, constructing all necessary nodes, we only
1739  * need to initiate it once.
1740  */
1741 static int
1742 sdev_call_devfsadmd(struct sdev_node *ddv, struct sdev_node *dv, char *nm)
1743 {
1744 	int error = 0;
1745 
1746 	if (DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state)) {
1747 		sdcmn_err6(("lookup: waiting for %s/%s, 0x%x\n",
1748 		    ddv->sdev_name, nm, devfsadm_state));
1749 		mutex_enter(&dv->sdev_lookup_lock);
1750 		SDEV_BLOCK_OTHERS(dv, (SDEV_LOOKUP | SDEV_LGWAITING));
1751 		mutex_exit(&dv->sdev_lookup_lock);
1752 		error = 0;
1753 	} else if (!DEVNAME_DEVFSADM_HAS_RUN(devfsadm_state)) {
1754 		sdcmn_err6(("lookup %s/%s starting devfsadm, 0x%x\n",
1755 		    ddv->sdev_name, nm, devfsadm_state));
1756 
1757 		sdev_devfsadmd_thread(ddv, dv, kcred);
1758 		mutex_enter(&dv->sdev_lookup_lock);
1759 		SDEV_BLOCK_OTHERS(dv,
1760 		    (SDEV_LOOKUP | SDEV_LGWAITING));
1761 		mutex_exit(&dv->sdev_lookup_lock);
1762 		error = 0;
1763 	} else {
1764 		error = -1;
1765 	}
1766 
1767 	return (error);
1768 }
1769 
1770 /*
1771  *  Support for specialized device naming construction mechanisms
1772  */
1773 static int
1774 sdev_call_dircallback(struct sdev_node *ddv, struct sdev_node **dvp, char *nm,
1775     int (*callback)(struct sdev_node *, char *, void **, struct cred *,
1776     void *, char *), int flags, struct cred *cred)
1777 {
1778 	int rv = 0;
1779 	char *physpath = NULL;
1780 	struct vattr vattr;
1781 	struct vattr *vap = &vattr;
1782 	struct sdev_node *dv = NULL;
1783 
1784 	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
1785 	if (flags & SDEV_VLINK) {
1786 		physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1787 		rv = callback(ddv, nm, (void *)&physpath, kcred, NULL,
1788 		    NULL);
1789 		if (rv) {
1790 			kmem_free(physpath, MAXPATHLEN);
1791 			return (-1);
1792 		}
1793 
1794 		*vap = *sdev_getdefault_attr(VLNK);	/* structure copy */
1795 		vap->va_size = strlen(physpath);
1796 		gethrestime(&vap->va_atime);
1797 		vap->va_mtime = vap->va_atime;
1798 		vap->va_ctime = vap->va_atime;
1799 
1800 		rv = sdev_mknode(ddv, nm, &dv, vap, NULL,
1801 		    (void *)physpath, cred, SDEV_READY);
1802 		kmem_free(physpath, MAXPATHLEN);
1803 		if (rv)
1804 			return (rv);
1805 	} else if (flags & SDEV_VATTR) {
1806 		/*
1807 		 * /dev/pts
1808 		 *
1809 		 * callback is responsible to set the basic attributes,
1810 		 * e.g. va_type/va_uid/va_gid/
1811 		 *    dev_t if VCHR or VBLK/
1812 		 */
1813 		ASSERT(callback);
1814 		rv = callback(ddv, nm, (void *)&vattr, kcred, NULL, NULL);
1815 		if (rv) {
1816 			sdcmn_err3(("devname_lookup_func: SDEV_NONE "
1817 			    "callback failed \n"));
1818 			return (-1);
1819 		}
1820 
1821 		rv = sdev_mknode(ddv, nm, &dv, &vattr, NULL, NULL,
1822 		    cred, SDEV_READY);
1823 
1824 		if (rv)
1825 			return (rv);
1826 
1827 	} else {
1828 		impossible(("lookup: %s/%s by %s not supported (%d)\n",
1829 		    SDEVTOV(ddv)->v_path, nm, curproc->p_user.u_comm,
1830 		    __LINE__));
1831 		rv = -1;
1832 	}
1833 
1834 	*dvp = dv;
1835 	return (rv);
1836 }
1837 
/*
 * Return 1 when exec_name is exactly "devfsadm", 0 otherwise.
 */
static int
is_devfsadm_thread(char *exec_name)
{
	/*
	 * note: because devfsadmd -> /usr/sbin/devfsadm
	 * it is safe to use "devfsadm" to capture the lookups
	 * from devfsadm and its daemon version.
	 */
	return (strcmp(exec_name, "devfsadm") == 0);
}
1850 
1851 /*
1852  * Lookup Order:
1853  *	sdev_node cache;
1854  *	backing store (SDEV_PERSIST);
1855  *	DBNR: a. dir_ops implemented in the loadable modules;
1856  *	      b. vnode ops in vtab.
1857  */
1858 int
1859 devname_lookup_func(struct sdev_node *ddv, char *nm, struct vnode **vpp,
1860     struct cred *cred, int (*callback)(struct sdev_node *, char *, void **,
1861     struct cred *, void *, char *), int flags)
1862 {
1863 	int rv = 0, nmlen;
1864 	struct vnode *rvp = NULL;
1865 	struct sdev_node *dv = NULL;
1866 	int	retried = 0;
1867 	int	error = 0;
1868 	struct vattr vattr;
1869 	char *lookup_thread = curproc->p_user.u_comm;
1870 	int failed_flags = 0;
1871 	int (*vtor)(struct sdev_node *) = NULL;
1872 	int state;
1873 	int parent_state;
1874 	char *link = NULL;
1875 
1876 	if (SDEVTOV(ddv)->v_type != VDIR)
1877 		return (ENOTDIR);
1878 
1879 	/*
1880 	 * Empty name or ., return node itself.
1881 	 */
1882 	nmlen = strlen(nm);
1883 	if ((nmlen == 0) || ((nmlen == 1) && (nm[0] == '.'))) {
1884 		*vpp = SDEVTOV(ddv);
1885 		VN_HOLD(*vpp);
1886 		return (0);
1887 	}
1888 
1889 	/*
1890 	 * .., return the parent directory
1891 	 */
1892 	if ((nmlen == 2) && (strcmp(nm, "..") == 0)) {
1893 		*vpp = SDEVTOV(ddv->sdev_dotdot);
1894 		VN_HOLD(*vpp);
1895 		return (0);
1896 	}
1897 
1898 	rw_enter(&ddv->sdev_contents, RW_READER);
1899 	if (ddv->sdev_flags & SDEV_VTOR) {
1900 		vtor = (int (*)(struct sdev_node *))sdev_get_vtor(ddv);
1901 		ASSERT(vtor);
1902 	}
1903 
1904 tryagain:
1905 	/*
1906 	 * (a) directory cache lookup:
1907 	 */
1908 	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
1909 	parent_state = ddv->sdev_state;
1910 	dv = sdev_cache_lookup(ddv, nm);
1911 	if (dv) {
1912 		state = dv->sdev_state;
1913 		switch (state) {
1914 		case SDEV_INIT:
1915 			if (is_devfsadm_thread(lookup_thread))
1916 				break;
1917 
1918 			/* ZOMBIED parent won't allow node creation */
1919 			if (parent_state == SDEV_ZOMBIE) {
1920 				SD_TRACE_FAILED_LOOKUP(ddv, nm,
1921 				    retried);
1922 				goto nolock_notfound;
1923 			}
1924 
1925 			mutex_enter(&dv->sdev_lookup_lock);
1926 			/* compensate the threads started after devfsadm */
1927 			if (DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state) &&
1928 			    !(SDEV_IS_LOOKUP(dv)))
1929 				SDEV_BLOCK_OTHERS(dv,
1930 				    (SDEV_LOOKUP | SDEV_LGWAITING));
1931 
1932 			if (SDEV_IS_LOOKUP(dv)) {
1933 				failed_flags |= SLF_REBUILT;
1934 				rw_exit(&ddv->sdev_contents);
1935 				error = sdev_wait4lookup(dv, SDEV_LOOKUP);
1936 				mutex_exit(&dv->sdev_lookup_lock);
1937 				rw_enter(&ddv->sdev_contents, RW_READER);
1938 
1939 				if (error != 0) {
1940 					SD_TRACE_FAILED_LOOKUP(ddv, nm,
1941 					    retried);
1942 					goto nolock_notfound;
1943 				}
1944 
1945 				state = dv->sdev_state;
1946 				if (state == SDEV_INIT) {
1947 					SD_TRACE_FAILED_LOOKUP(ddv, nm,
1948 					    retried);
1949 					goto nolock_notfound;
1950 				} else if (state == SDEV_READY) {
1951 					goto found;
1952 				} else if (state == SDEV_ZOMBIE) {
1953 					rw_exit(&ddv->sdev_contents);
1954 					SD_TRACE_FAILED_LOOKUP(ddv, nm,
1955 					    retried);
1956 					SDEV_RELE(dv);
1957 					goto lookup_failed;
1958 				}
1959 			} else {
1960 				mutex_exit(&dv->sdev_lookup_lock);
1961 			}
1962 			break;
1963 		case SDEV_READY:
1964 			goto found;
1965 		case SDEV_ZOMBIE:
1966 			rw_exit(&ddv->sdev_contents);
1967 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
1968 			SDEV_RELE(dv);
1969 			goto lookup_failed;
1970 		default:
1971 			rw_exit(&ddv->sdev_contents);
1972 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
1973 			sdev_lookup_failed(ddv, nm, failed_flags);
1974 			*vpp = NULLVP;
1975 			return (ENOENT);
1976 		}
1977 	}
1978 	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
1979 
1980 	/*
1981 	 * ZOMBIED parent does not allow new node creation.
1982 	 * bail out early
1983 	 */
1984 	if (parent_state == SDEV_ZOMBIE) {
1985 		rw_exit(&ddv->sdev_contents);
1986 		*vpp = NULLVP;
1987 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
1988 		return (ENOENT);
1989 	}
1990 
1991 	/*
1992 	 * (b0): backing store lookup
1993 	 *	SDEV_PERSIST is default except:
1994 	 *		1) pts nodes
1995 	 *		2) non-chmod'ed local nodes
1996 	 *		3) zvol nodes
1997 	 */
1998 	if (SDEV_IS_PERSIST(ddv)) {
1999 		error = devname_backstore_lookup(ddv, nm, &rvp);
2000 
2001 		if (!error) {
2002 
2003 			vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
2004 			error = VOP_GETATTR(rvp, &vattr, 0, cred, NULL);
2005 			if (error) {
2006 				rw_exit(&ddv->sdev_contents);
2007 				if (dv)
2008 					SDEV_RELE(dv);
2009 				SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2010 				sdev_lookup_failed(ddv, nm, failed_flags);
2011 				*vpp = NULLVP;
2012 				return (ENOENT);
2013 			}
2014 
2015 			if (vattr.va_type == VLNK) {
2016 				error = sdev_getlink(rvp, &link);
2017 				if (error) {
2018 					rw_exit(&ddv->sdev_contents);
2019 					if (dv)
2020 						SDEV_RELE(dv);
2021 					SD_TRACE_FAILED_LOOKUP(ddv, nm,
2022 					    retried);
2023 					sdev_lookup_failed(ddv, nm,
2024 					    failed_flags);
2025 					*vpp = NULLVP;
2026 					return (ENOENT);
2027 				}
2028 				ASSERT(link != NULL);
2029 			}
2030 
2031 			if (!rw_tryupgrade(&ddv->sdev_contents)) {
2032 				rw_exit(&ddv->sdev_contents);
2033 				rw_enter(&ddv->sdev_contents, RW_WRITER);
2034 			}
2035 			error = sdev_mknode(ddv, nm, &dv, &vattr,
2036 			    rvp, link, cred, SDEV_READY);
2037 			rw_downgrade(&ddv->sdev_contents);
2038 
2039 			if (link != NULL) {
2040 				kmem_free(link, strlen(link) + 1);
2041 				link = NULL;
2042 			}
2043 
2044 			if (error) {
2045 				SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2046 				rw_exit(&ddv->sdev_contents);
2047 				if (dv)
2048 					SDEV_RELE(dv);
2049 				goto lookup_failed;
2050 			} else {
2051 				goto found;
2052 			}
2053 		} else if (retried) {
2054 			rw_exit(&ddv->sdev_contents);
2055 			sdcmn_err3(("retry of lookup of %s/%s: failed\n",
2056 			    ddv->sdev_name, nm));
2057 			if (dv)
2058 				SDEV_RELE(dv);
2059 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2060 			sdev_lookup_failed(ddv, nm, failed_flags);
2061 			*vpp = NULLVP;
2062 			return (ENOENT);
2063 		}
2064 	}
2065 
2066 lookup_create_node:
2067 	/* first thread that is doing the lookup on this node */
2068 	if (callback) {
2069 		ASSERT(dv == NULL);
2070 		if (!rw_tryupgrade(&ddv->sdev_contents)) {
2071 			rw_exit(&ddv->sdev_contents);
2072 			rw_enter(&ddv->sdev_contents, RW_WRITER);
2073 		}
2074 		error = sdev_call_dircallback(ddv, &dv, nm, callback,
2075 		    flags, cred);
2076 		rw_downgrade(&ddv->sdev_contents);
2077 		if (error == 0) {
2078 			goto found;
2079 		} else {
2080 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2081 			rw_exit(&ddv->sdev_contents);
2082 			goto lookup_failed;
2083 		}
2084 	}
2085 	if (!dv) {
2086 		if (!rw_tryupgrade(&ddv->sdev_contents)) {
2087 			rw_exit(&ddv->sdev_contents);
2088 			rw_enter(&ddv->sdev_contents, RW_WRITER);
2089 		}
2090 		error = sdev_mknode(ddv, nm, &dv, NULL, NULL, NULL,
2091 		    cred, SDEV_INIT);
2092 		if (!dv) {
2093 			rw_exit(&ddv->sdev_contents);
2094 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2095 			sdev_lookup_failed(ddv, nm, failed_flags);
2096 			*vpp = NULLVP;
2097 			return (ENOENT);
2098 		}
2099 		rw_downgrade(&ddv->sdev_contents);
2100 	}
2101 
2102 	/*
2103 	 * (b1) invoking devfsadm once per life time for devfsadm nodes
2104 	 */
2105 	ASSERT(SDEV_HELD(dv));
2106 
2107 	if (SDEV_IS_NO_NCACHE(dv))
2108 		failed_flags |= SLF_NO_NCACHE;
2109 	if (sdev_reconfig_boot || !i_ddi_io_initialized() ||
2110 	    SDEV_IS_DYNAMIC(ddv) || SDEV_IS_NO_NCACHE(dv) ||
2111 	    ((moddebug & MODDEBUG_FINI_EBUSY) != 0)) {
2112 		ASSERT(SDEV_HELD(dv));
2113 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2114 		goto nolock_notfound;
2115 	}
2116 
2117 	/*
2118 	 * filter out known non-existent devices recorded
2119 	 * during initial reconfiguration boot for which
2120 	 * reconfig should not be done and lookup may
2121 	 * be short-circuited now.
2122 	 */
2123 	if (sdev_lookup_filter(ddv, nm)) {
2124 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2125 		goto nolock_notfound;
2126 	}
2127 
2128 	/* bypassing devfsadm internal nodes */
2129 	if (is_devfsadm_thread(lookup_thread)) {
2130 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2131 		goto nolock_notfound;
2132 	}
2133 
2134 	if (sdev_reconfig_disable) {
2135 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2136 		goto nolock_notfound;
2137 	}
2138 
2139 	error = sdev_call_devfsadmd(ddv, dv, nm);
2140 	if (error == 0) {
2141 		sdcmn_err8(("lookup of %s/%s by %s: reconfig\n",
2142 		    ddv->sdev_name, nm, curproc->p_user.u_comm));
2143 		if (sdev_reconfig_verbose) {
2144 			cmn_err(CE_CONT,
2145 			    "?lookup of %s/%s by %s: reconfig\n",
2146 			    ddv->sdev_name, nm, curproc->p_user.u_comm);
2147 		}
2148 		retried = 1;
2149 		failed_flags |= SLF_REBUILT;
2150 		ASSERT(dv->sdev_state != SDEV_ZOMBIE);
2151 		SDEV_SIMPLE_RELE(dv);
2152 		goto tryagain;
2153 	} else {
2154 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2155 		goto nolock_notfound;
2156 	}
2157 
2158 found:
2159 	ASSERT(!(dv->sdev_flags & SDEV_STALE));
2160 	ASSERT(dv->sdev_state == SDEV_READY);
2161 	if (vtor) {
2162 		/*
2163 		 * Check validity of returned node
2164 		 */
2165 		switch (vtor(dv)) {
2166 		case SDEV_VTOR_VALID:
2167 			break;
2168 		case SDEV_VTOR_STALE:
2169 			/*
2170 			 * The name exists, but the cache entry is
2171 			 * stale and needs to be re-created.
2172 			 */
2173 			ASSERT(RW_READ_HELD(&ddv->sdev_contents));
2174 			if (rw_tryupgrade(&ddv->sdev_contents) == 0) {
2175 				rw_exit(&ddv->sdev_contents);
2176 				rw_enter(&ddv->sdev_contents, RW_WRITER);
2177 			}
2178 			error = sdev_cache_update(ddv, &dv, nm,
2179 			    SDEV_CACHE_DELETE);
2180 			rw_downgrade(&ddv->sdev_contents);
2181 			if (error == 0) {
2182 				dv = NULL;
2183 				goto lookup_create_node;
2184 			}
2185 			/* FALLTHRU */
2186 		case SDEV_VTOR_INVALID:
2187 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2188 			sdcmn_err7(("lookup: destroy invalid "
2189 			    "node: %s(%p)\n", dv->sdev_name, (void *)dv));
2190 			goto nolock_notfound;
2191 		case SDEV_VTOR_SKIP:
2192 			sdcmn_err7(("lookup: node not applicable - "
2193 			    "skipping: %s(%p)\n", dv->sdev_name, (void *)dv));
2194 			rw_exit(&ddv->sdev_contents);
2195 			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
2196 			SDEV_RELE(dv);
2197 			goto lookup_failed;
2198 		default:
2199 			cmn_err(CE_PANIC,
2200 			    "dev fs: validator failed: %s(%p)\n",
2201 			    dv->sdev_name, (void *)dv);
2202 			break;
2203 		}
2204 	}
2205 
2206 	rw_exit(&ddv->sdev_contents);
2207 	rv = sdev_to_vp(dv, vpp);
2208 	sdcmn_err3(("devname_lookup_func: returning vp %p v_count %d state %d "
2209 	    "for nm %s, error %d\n", (void *)*vpp, (*vpp)->v_count,
2210 	    dv->sdev_state, nm, rv));
2211 	return (rv);
2212 
2213 nolock_notfound:
2214 	/*
2215 	 * Destroy the node that is created for synchronization purposes.
2216 	 */
2217 	sdcmn_err3(("devname_lookup_func: %s with state %d\n",
2218 	    nm, dv->sdev_state));
2219 	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
2220 	if (dv->sdev_state == SDEV_INIT) {
2221 		if (!rw_tryupgrade(&ddv->sdev_contents)) {
2222 			rw_exit(&ddv->sdev_contents);
2223 			rw_enter(&ddv->sdev_contents, RW_WRITER);
2224 		}
2225 
2226 		/*
2227 		 * Node state may have changed during the lock
2228 		 * changes. Re-check.
2229 		 */
2230 		if (dv->sdev_state == SDEV_INIT) {
2231 			(void) sdev_dirdelete(ddv, dv);
2232 			rw_exit(&ddv->sdev_contents);
2233 			sdev_lookup_failed(ddv, nm, failed_flags);
2234 			*vpp = NULL;
2235 			return (ENOENT);
2236 		}
2237 	}
2238 
2239 	rw_exit(&ddv->sdev_contents);
2240 	SDEV_RELE(dv);
2241 
2242 lookup_failed:
2243 	sdev_lookup_failed(ddv, nm, failed_flags);
2244 	*vpp = NULL;
2245 	return (ENOENT);
2246 }
2247 
2248 /*
2249  * Given a directory node, mark all nodes beneath as
2250  * STALE, i.e. nodes that don't exist as far as new
2251  * consumers are concerned.  Remove them from the
2252  * list of directory entries so that no lookup or
2253  * directory traversal will find them.  The node
2254  * not deallocated so existing holds are not affected.
2255  */
2256 void
2257 sdev_stale(struct sdev_node *ddv)
2258 {
2259 	struct sdev_node *dv;
2260 	struct vnode *vp;
2261 
2262 	ASSERT(SDEVTOV(ddv)->v_type == VDIR);
2263 
2264 	rw_enter(&ddv->sdev_contents, RW_WRITER);
2265 	for (dv = SDEV_FIRST_ENTRY(ddv); dv; dv = SDEV_NEXT_ENTRY(ddv, dv)) {
2266 		vp = SDEVTOV(dv);
2267 		if (vp->v_type == VDIR)
2268 			sdev_stale(dv);
2269 
2270 		sdcmn_err9(("sdev_stale: setting stale %s\n",
2271 		    dv->sdev_path));
2272 		dv->sdev_flags |= SDEV_STALE;
2273 		avl_remove(&ddv->sdev_entries, dv);
2274 	}
2275 	ddv->sdev_flags |= SDEV_BUILD;
2276 	rw_exit(&ddv->sdev_contents);
2277 }
2278 
2279 /*
2280  * Given a directory node, clean out all the nodes beneath.
2281  * If expr is specified, clean node with names matching expr.
2282  * If SDEV_ENFORCE is specified in flags, busy nodes are made stale,
2283  *	so they are excluded from future lookups.
2284  */
2285 int
2286 sdev_cleandir(struct sdev_node *ddv, char *expr, uint_t flags)
2287 {
2288 	int error = 0;
2289 	int busy = 0;
2290 	struct vnode *vp;
2291 	struct sdev_node *dv, *next = NULL;
2292 	int bkstore = 0;
2293 	int len = 0;
2294 	char *bks_name = NULL;
2295 
2296 	ASSERT(SDEVTOV(ddv)->v_type == VDIR);
2297 
2298 	/*
2299 	 * We try our best to destroy all unused sdev_node's
2300 	 */
2301 	rw_enter(&ddv->sdev_contents, RW_WRITER);
2302 	for (dv = SDEV_FIRST_ENTRY(ddv); dv; dv = next) {
2303 		next = SDEV_NEXT_ENTRY(ddv, dv);
2304 		vp = SDEVTOV(dv);
2305 
2306 		if (expr && gmatch(dv->sdev_name, expr) == 0)
2307 			continue;
2308 
2309 		if (vp->v_type == VDIR &&
2310 		    sdev_cleandir(dv, NULL, flags) != 0) {
2311 			sdcmn_err9(("sdev_cleandir: dir %s busy\n",
2312 			    dv->sdev_name));
2313 			busy++;
2314 			continue;
2315 		}
2316 
2317 		if (vp->v_count > 0 && (flags & SDEV_ENFORCE) == 0) {
2318 			sdcmn_err9(("sdev_cleandir: dir %s busy\n",
2319 			    dv->sdev_name));
2320 			busy++;
2321 			continue;
2322 		}
2323 
2324 		/*
2325 		 * at this point, either dv is not held or SDEV_ENFORCE
2326 		 * is specified. In either case, dv needs to be deleted
2327 		 */
2328 		SDEV_HOLD(dv);
2329 
2330 		bkstore = SDEV_IS_PERSIST(dv) ? 1 : 0;
2331 		if (bkstore && (vp->v_type == VDIR))
2332 			bkstore += 1;
2333 
2334 		if (bkstore) {
2335 			len = strlen(dv->sdev_name) + 1;
2336 			bks_name = kmem_alloc(len, KM_SLEEP);
2337 			bcopy(dv->sdev_name, bks_name, len);
2338 		}
2339 
2340 		error = sdev_dirdelete(ddv, dv);
2341 
2342 		if (error == EBUSY) {
2343 			sdcmn_err9(("sdev_cleandir: dir busy\n"));
2344 			busy++;
2345 		}
2346 
2347 		/* take care the backing store clean up */
2348 		if (bkstore && (error == 0)) {
2349 			ASSERT(bks_name);
2350 			ASSERT(ddv->sdev_attrvp);
2351 
2352 			if (bkstore == 1) {
2353 				error = VOP_REMOVE(ddv->sdev_attrvp,
2354 				    bks_name, kcred, NULL, 0);
2355 			} else if (bkstore == 2) {
2356 				error = VOP_RMDIR(ddv->sdev_attrvp,
2357 				    bks_name, ddv->sdev_attrvp, kcred, NULL, 0);
2358 			}
2359 
2360 			/* do not propagate the backing store errors */
2361 			if (error) {
2362 				sdcmn_err9(("sdev_cleandir: backing store"
2363 				    "not cleaned\n"));
2364 				error = 0;
2365 			}
2366 
2367 			bkstore = 0;
2368 			kmem_free(bks_name, len);
2369 			bks_name = NULL;
2370 			len = 0;
2371 		}
2372 	}
2373 
2374 	ddv->sdev_flags |= SDEV_BUILD;
2375 	rw_exit(&ddv->sdev_contents);
2376 
2377 	if (busy) {
2378 		error = EBUSY;
2379 	}
2380 
2381 	return (error);
2382 }
2383 
2384 /*
2385  * a convenient wrapper for readdir() funcs
2386  */
2387 size_t
2388 add_dir_entry(dirent64_t *de, char *nm, size_t size, ino_t ino, offset_t off)
2389 {
2390 	size_t reclen = DIRENT64_RECLEN(strlen(nm));
2391 	if (reclen > size)
2392 		return (0);
2393 
2394 	de->d_ino = (ino64_t)ino;
2395 	de->d_off = (off64_t)off + 1;
2396 	de->d_reclen = (ushort_t)reclen;
2397 	(void) strncpy(de->d_name, nm, DIRENT64_NAMELEN(reclen));
2398 	return (reclen);
2399 }
2400 
2401 /*
2402  * sdev_mount service routines
2403  */
2404 int
2405 sdev_copyin_mountargs(struct mounta *uap, struct sdev_mountargs *args)
2406 {
2407 	int	error;
2408 
2409 	if (uap->datalen != sizeof (*args))
2410 		return (EINVAL);
2411 
2412 	if (error = copyin(uap->dataptr, args, sizeof (*args))) {
2413 		cmn_err(CE_WARN, "sdev_copyin_mountargs: can not"
2414 		    "get user data. error %d\n", error);
2415 		return (EFAULT);
2416 	}
2417 
2418 	return (0);
2419 }
2420 
/*
 * Step from one directory record to the record that follows it in a
 * packed buffer, using the current record's d_reclen as the stride.
 * Any earlier definition of nextdp is discarded first.
 */
#undef nextdp
#define	nextdp(dp)	\
	((struct dirent64 *)(intptr_t)((char *)(dp) + (dp)->d_reclen))
2427 /*
2428  * readdir helper func
2429  */
2430 int
2431 devname_readdir_func(vnode_t *vp, uio_t *uiop, cred_t *cred, int *eofp,
2432     int flags)
2433 {
2434 	struct sdev_node *ddv = VTOSDEV(vp);
2435 	struct sdev_node *dv;
2436 	dirent64_t	*dp;
2437 	ulong_t		outcount = 0;
2438 	size_t		namelen;
2439 	ulong_t		alloc_count;
2440 	void		*outbuf;
2441 	struct iovec	*iovp;
2442 	int		error = 0;
2443 	size_t		reclen;
2444 	offset_t	diroff;
2445 	offset_t	soff;
2446 	int		this_reclen;
2447 	int (*vtor)(struct sdev_node *) = NULL;
2448 	struct vattr attr;
2449 	timestruc_t now;
2450 
2451 	ASSERT(ddv->sdev_attr || ddv->sdev_attrvp);
2452 	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
2453 
2454 	if (uiop->uio_loffset >= MAXOFF_T) {
2455 		if (eofp)
2456 			*eofp = 1;
2457 		return (0);
2458 	}
2459 
2460 	if (uiop->uio_iovcnt != 1)
2461 		return (EINVAL);
2462 
2463 	if (vp->v_type != VDIR)
2464 		return (ENOTDIR);
2465 
2466 	if (ddv->sdev_flags & SDEV_VTOR) {
2467 		vtor = (int (*)(struct sdev_node *))sdev_get_vtor(ddv);
2468 		ASSERT(vtor);
2469 	}
2470 
2471 	if (eofp != NULL)
2472 		*eofp = 0;
2473 
2474 	soff = uiop->uio_loffset;
2475 	iovp = uiop->uio_iov;
2476 	alloc_count = iovp->iov_len;
2477 	dp = outbuf = kmem_alloc(alloc_count, KM_SLEEP);
2478 	outcount = 0;
2479 
2480 	if (ddv->sdev_state == SDEV_ZOMBIE)
2481 		goto get_cache;
2482 
2483 	if (SDEV_IS_GLOBAL(ddv)) {
2484 
2485 		if ((sdev_boot_state == SDEV_BOOT_STATE_COMPLETE) &&
2486 		    !sdev_reconfig_boot && (flags & SDEV_BROWSE) &&
2487 		    !SDEV_IS_DYNAMIC(ddv) && !SDEV_IS_NO_NCACHE(ddv) &&
2488 		    ((moddebug & MODDEBUG_FINI_EBUSY) == 0) &&
2489 		    !DEVNAME_DEVFSADM_HAS_RUN(devfsadm_state) &&
2490 		    !DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state) &&
2491 		    !sdev_reconfig_disable) {
2492 			/*
2493 			 * invoking "devfsadm" to do system device reconfig
2494 			 */
2495 			mutex_enter(&ddv->sdev_lookup_lock);
2496 			SDEV_BLOCK_OTHERS(ddv,
2497 			    (SDEV_READDIR|SDEV_LGWAITING));
2498 			mutex_exit(&ddv->sdev_lookup_lock);
2499 
2500 			sdcmn_err8(("readdir of %s by %s: reconfig\n",
2501 			    ddv->sdev_path, curproc->p_user.u_comm));
2502 			if (sdev_reconfig_verbose) {
2503 				cmn_err(CE_CONT,
2504 				    "?readdir of %s by %s: reconfig\n",
2505 				    ddv->sdev_path, curproc->p_user.u_comm);
2506 			}
2507 
2508 			sdev_devfsadmd_thread(ddv, NULL, kcred);
2509 		} else if (DEVNAME_DEVFSADM_IS_RUNNING(devfsadm_state)) {
2510 			/*
2511 			 * compensate the "ls" started later than "devfsadm"
2512 			 */
2513 			mutex_enter(&ddv->sdev_lookup_lock);
2514 			SDEV_BLOCK_OTHERS(ddv, (SDEV_READDIR|SDEV_LGWAITING));
2515 			mutex_exit(&ddv->sdev_lookup_lock);
2516 		}
2517 
2518 		/*
2519 		 * release the contents lock so that
2520 		 * the cache may be updated by devfsadmd
2521 		 */
2522 		rw_exit(&ddv->sdev_contents);
2523 		mutex_enter(&ddv->sdev_lookup_lock);
2524 		if (SDEV_IS_READDIR(ddv))
2525 			(void) sdev_wait4lookup(ddv, SDEV_READDIR);
2526 		mutex_exit(&ddv->sdev_lookup_lock);
2527 		rw_enter(&ddv->sdev_contents, RW_READER);
2528 
2529 		sdcmn_err4(("readdir of directory %s by %s\n",
2530 		    ddv->sdev_name, curproc->p_user.u_comm));
2531 		if (ddv->sdev_flags & SDEV_BUILD) {
2532 			if (SDEV_IS_PERSIST(ddv)) {
2533 				error = sdev_filldir_from_store(ddv,
2534 				    alloc_count, cred);
2535 			}
2536 			ddv->sdev_flags &= ~SDEV_BUILD;
2537 		}
2538 	}
2539 
2540 get_cache:
2541 	/* handle "." and ".." */
2542 	diroff = 0;
2543 	if (soff == 0) {
2544 		/* first time */
2545 		this_reclen = DIRENT64_RECLEN(1);
2546 		if (alloc_count < this_reclen) {
2547 			error = EINVAL;
2548 			goto done;
2549 		}
2550 
2551 		dp->d_ino = (ino64_t)ddv->sdev_ino;
2552 		dp->d_off = (off64_t)1;
2553 		dp->d_reclen = (ushort_t)this_reclen;
2554 
2555 		(void) strncpy(dp->d_name, ".",
2556 		    DIRENT64_NAMELEN(this_reclen));
2557 		outcount += dp->d_reclen;
2558 		dp = nextdp(dp);
2559 	}
2560 
2561 	diroff++;
2562 	if (soff <= 1) {
2563 		this_reclen = DIRENT64_RECLEN(2);
2564 		if (alloc_count < outcount + this_reclen) {
2565 			error = EINVAL;
2566 			goto done;
2567 		}
2568 
2569 		dp->d_reclen = (ushort_t)this_reclen;
2570 		dp->d_ino = (ino64_t)ddv->sdev_dotdot->sdev_ino;
2571 		dp->d_off = (off64_t)2;
2572 
2573 		(void) strncpy(dp->d_name, "..",
2574 		    DIRENT64_NAMELEN(this_reclen));
2575 		outcount += dp->d_reclen;
2576 
2577 		dp = nextdp(dp);
2578 	}
2579 
2580 
2581 	/* gets the cache */
2582 	diroff++;
2583 	for (dv = SDEV_FIRST_ENTRY(ddv); dv;
2584 	    dv = SDEV_NEXT_ENTRY(ddv, dv), diroff++) {
2585 		sdcmn_err3(("sdev_readdir: diroff %lld soff %lld for '%s' \n",
2586 		    diroff, soff, dv->sdev_name));
2587 
2588 		/* bypassing pre-matured nodes */
2589 		if (diroff < soff || (dv->sdev_state != SDEV_READY)) {
2590 			sdcmn_err3(("sdev_readdir: pre-mature node  "
2591 			    "%s %d\n", dv->sdev_name, dv->sdev_state));
2592 			continue;
2593 		}
2594 
2595 		/*
2596 		 * Check validity of node
2597 		 * Drop invalid and nodes to be skipped.
2598 		 * A node the validator indicates as stale needs
2599 		 * to be returned as presumably the node name itself
2600 		 * is valid and the node data itself will be refreshed
2601 		 * on lookup.  An application performing a readdir then
2602 		 * stat on each entry should thus always see consistent
2603 		 * data.  In any case, it is not possible to synchronize
2604 		 * with dynamic kernel state, and any view we return can
2605 		 * never be anything more than a snapshot at a point in time.
2606 		 */
2607 		if (vtor) {
2608 			switch (vtor(dv)) {
2609 			case SDEV_VTOR_VALID:
2610 				break;
2611 			case SDEV_VTOR_INVALID:
2612 			case SDEV_VTOR_SKIP:
2613 				continue;
2614 			case SDEV_VTOR_STALE:
2615 				sdcmn_err3(("sdev_readir: %s stale\n",
2616 				    dv->sdev_name));
2617 				break;
2618 			default:
2619 				cmn_err(CE_PANIC,
2620 				    "dev fs: validator failed: %s(%p)\n",
2621 				    dv->sdev_name, (void *)dv);
2622 				break;
2623 			/*NOTREACHED*/
2624 			}
2625 		}
2626 
2627 		namelen = strlen(dv->sdev_name);
2628 		reclen = DIRENT64_RECLEN(namelen);
2629 		if (outcount + reclen > alloc_count) {
2630 			goto full;
2631 		}
2632 		dp->d_reclen = (ushort_t)reclen;
2633 		dp->d_ino = (ino64_t)dv->sdev_ino;
2634 		dp->d_off = (off64_t)diroff + 1;
2635 		(void) strncpy(dp->d_name, dv->sdev_name,
2636 		    DIRENT64_NAMELEN(reclen));
2637 		outcount += reclen;
2638 		dp = nextdp(dp);
2639 	}
2640 
2641 full:
2642 	sdcmn_err4(("sdev_readdir: moving %lu bytes: "
2643 	    "diroff %lld, soff %lld, dv %p\n", outcount, diroff, soff,
2644 	    (void *)dv));
2645 
2646 	if (outcount)
2647 		error = uiomove(outbuf, outcount, UIO_READ, uiop);
2648 
2649 	if (!error) {
2650 		uiop->uio_loffset = diroff;
2651 		if (eofp)
2652 			*eofp = dv ? 0 : 1;
2653 	}
2654 
2655 
2656 	if (ddv->sdev_attrvp) {
2657 		gethrestime(&now);
2658 		attr.va_ctime = now;
2659 		attr.va_atime = now;
2660 		attr.va_mask = AT_CTIME|AT_ATIME;
2661 
2662 		(void) VOP_SETATTR(ddv->sdev_attrvp, &attr, 0, kcred, NULL);
2663 	}
2664 done:
2665 	kmem_free(outbuf, alloc_count);
2666 	return (error);
2667 }
2668 
2669 static int
2670 sdev_modctl_lookup(const char *path, vnode_t **r_vp)
2671 {
2672 	vnode_t *vp;
2673 	vnode_t *cvp;
2674 	struct sdev_node *svp;
2675 	char *nm;
2676 	struct pathname pn;
2677 	int error;
2678 	int persisted = 0;
2679 
2680 	ASSERT(INGLOBALZONE(curproc));
2681 
2682 	if (error = pn_get((char *)path, UIO_SYSSPACE, &pn))
2683 		return (error);
2684 	nm = kmem_alloc(MAXNAMELEN, KM_SLEEP);
2685 
2686 	vp = rootdir;
2687 	VN_HOLD(vp);
2688 
2689 	while (pn_pathleft(&pn)) {
2690 		ASSERT(vp->v_type == VDIR || vp->v_type == VLNK);
2691 		(void) pn_getcomponent(&pn, nm);
2692 
2693 		/*
2694 		 * Deal with the .. special case where we may be
2695 		 * traversing up across a mount point, to the
2696 		 * root of this filesystem or global root.
2697 		 */
2698 		if (nm[0] == '.' && nm[1] == '.' && nm[2] == 0) {
2699 checkforroot:
2700 			if (VN_CMP(vp, rootdir)) {
2701 				nm[1] = 0;
2702 			} else if (vp->v_flag & VROOT) {
2703 				vfs_t *vfsp;
2704 				cvp = vp;
2705 				vfsp = cvp->v_vfsp;
2706 				vfs_rlock_wait(vfsp);
2707 				vp = cvp->v_vfsp->vfs_vnodecovered;
2708 				if (vp == NULL ||
2709 				    (cvp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
2710 					vfs_unlock(vfsp);
2711 					VN_RELE(cvp);
2712 					error = EIO;
2713 					break;
2714 				}
2715 				VN_HOLD(vp);
2716 				vfs_unlock(vfsp);
2717 				VN_RELE(cvp);
2718 				cvp = NULL;
2719 				goto checkforroot;
2720 			}
2721 		}
2722 
2723 		error = VOP_LOOKUP(vp, nm, &cvp, NULL, 0, NULL, kcred, NULL,
2724 		    NULL, NULL);
2725 		if (error) {
2726 			VN_RELE(vp);
2727 			break;
2728 		}
2729 
2730 		/* traverse mount points encountered on our journey */
2731 		if (vn_ismntpt(cvp) && (error = traverse(&cvp)) != 0) {
2732 			VN_RELE(vp);
2733 			VN_RELE(cvp);
2734 			break;
2735 		}
2736 
2737 		/*
2738 		 * symbolic link, can be either relative and absolute
2739 		 */
2740 		if ((cvp->v_type == VLNK) && pn_pathleft(&pn)) {
2741 			struct pathname linkpath;
2742 			pn_alloc(&linkpath);
2743 			if (error = pn_getsymlink(cvp, &linkpath, kcred)) {
2744 				pn_free(&linkpath);
2745 				break;
2746 			}
2747 			if (pn_pathleft(&linkpath) == 0)
2748 				(void) pn_set(&linkpath, ".");
2749 			error = pn_insert(&pn, &linkpath, strlen(nm));
2750 			pn_free(&linkpath);
2751 			if (pn.pn_pathlen == 0) {
2752 				VN_RELE(vp);
2753 				return (ENOENT);
2754 			}
2755 			if (pn.pn_path[0] == '/') {
2756 				pn_skipslash(&pn);
2757 				VN_RELE(vp);
2758 				VN_RELE(cvp);
2759 				vp = rootdir;
2760 				VN_HOLD(vp);
2761 			} else {
2762 				VN_RELE(cvp);
2763 			}
2764 			continue;
2765 		}
2766 
2767 		VN_RELE(vp);
2768 
2769 		/*
2770 		 * Direct the operation to the persisting filesystem
2771 		 * underlying /dev.  Bail if we encounter a
2772 		 * non-persistent dev entity here.
2773 		 */
2774 		if (cvp->v_vfsp->vfs_fstype == devtype) {
2775 
2776 			if ((VTOSDEV(cvp)->sdev_flags & SDEV_PERSIST) == 0) {
2777 				error = ENOENT;
2778 				VN_RELE(cvp);
2779 				break;
2780 			}
2781 
2782 			if (VTOSDEV(cvp) == NULL) {
2783 				error = ENOENT;
2784 				VN_RELE(cvp);
2785 				break;
2786 			}
2787 			svp = VTOSDEV(cvp);
2788 			if ((vp = svp->sdev_attrvp) == NULL) {
2789 				error = ENOENT;
2790 				VN_RELE(cvp);
2791 				break;
2792 			}
2793 			persisted = 1;
2794 			VN_HOLD(vp);
2795 			VN_RELE(cvp);
2796 			cvp = vp;
2797 		}
2798 
2799 		vp = cvp;
2800 		pn_skipslash(&pn);
2801 	}
2802 
2803 	kmem_free(nm, MAXNAMELEN);
2804 	pn_free(&pn);
2805 
2806 	if (error)
2807 		return (error);
2808 
2809 	/*
2810 	 * Only return persisted nodes in the filesystem underlying /dev.
2811 	 */
2812 	if (!persisted) {
2813 		VN_RELE(vp);
2814 		return (ENOENT);
2815 	}
2816 
2817 	*r_vp = vp;
2818 	return (0);
2819 }
2820 
2821 int
2822 sdev_modctl_readdir(const char *dir, char ***dirlistp,
2823 	int *npathsp, int *npathsp_alloc, int checking_empty)
2824 {
2825 	char	**pathlist = NULL;
2826 	char	**newlist = NULL;
2827 	int	npaths = 0;
2828 	int	npaths_alloc = 0;
2829 	dirent64_t *dbuf = NULL;
2830 	int	n;
2831 	char	*s;
2832 	int error;
2833 	vnode_t *vp;
2834 	int eof;
2835 	struct iovec iov;
2836 	struct uio uio;
2837 	struct dirent64 *dp;
2838 	size_t dlen;
2839 	size_t dbuflen;
2840 	int ndirents = 64;
2841 	char *nm;
2842 
2843 	error = sdev_modctl_lookup(dir, &vp);
2844 	sdcmn_err11(("modctl readdir: %s by %s: %s\n",
2845 	    dir, curproc->p_user.u_comm,
2846 	    (error == 0) ? "ok" : "failed"));
2847 	if (error)
2848 		return (error);
2849 
2850 	dlen = ndirents * (sizeof (*dbuf));
2851 	dbuf = kmem_alloc(dlen, KM_SLEEP);
2852 
2853 	uio.uio_iov = &iov;
2854 	uio.uio_iovcnt = 1;
2855 	uio.uio_segflg = UIO_SYSSPACE;
2856 	uio.uio_fmode = 0;
2857 	uio.uio_extflg = UIO_COPY_CACHED;
2858 	uio.uio_loffset = 0;
2859 	uio.uio_llimit = MAXOFFSET_T;
2860 
2861 	eof = 0;
2862 	error = 0;
2863 	while (!error && !eof) {
2864 		uio.uio_resid = dlen;
2865 		iov.iov_base = (char *)dbuf;
2866 		iov.iov_len = dlen;
2867 
2868 		(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2869 		error = VOP_READDIR(vp, &uio, kcred, &eof, NULL, 0);
2870 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2871 
2872 		dbuflen = dlen - uio.uio_resid;
2873 
2874 		if (error || dbuflen == 0)
2875 			break;
2876 
2877 		for (dp = dbuf; ((intptr_t)dp < (intptr_t)dbuf + dbuflen);
2878 		    dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
2879 
2880 			nm = dp->d_name;
2881 
2882 			if (strcmp(nm, ".") == 0 || strcmp(nm, "..") == 0)
2883 				continue;
2884 			if (npaths == npaths_alloc) {
2885 				npaths_alloc += 64;
2886 				newlist = (char **)
2887 				    kmem_zalloc((npaths_alloc + 1) *
2888 				    sizeof (char *), KM_SLEEP);
2889 				if (pathlist) {
2890 					bcopy(pathlist, newlist,
2891 					    npaths * sizeof (char *));
2892 					kmem_free(pathlist,
2893 					    (npaths + 1) * sizeof (char *));
2894 				}
2895 				pathlist = newlist;
2896 			}
2897 			n = strlen(nm) + 1;
2898 			s = kmem_alloc(n, KM_SLEEP);
2899 			bcopy(nm, s, n);
2900 			pathlist[npaths++] = s;
2901 			sdcmn_err11(("  %s/%s\n", dir, s));
2902 
2903 			/* if checking empty, one entry is as good as many */
2904 			if (checking_empty) {
2905 				eof = 1;
2906 				break;
2907 			}
2908 		}
2909 	}
2910 
2911 exit:
2912 	VN_RELE(vp);
2913 
2914 	if (dbuf)
2915 		kmem_free(dbuf, dlen);
2916 
2917 	if (error)
2918 		return (error);
2919 
2920 	*dirlistp = pathlist;
2921 	*npathsp = npaths;
2922 	*npathsp_alloc = npaths_alloc;
2923 
2924 	return (0);
2925 }
2926 
/*
 * Free a name list produced by sdev_modctl_readdir(): each of the
 * npaths strings, then the list itself, which holds
 * npaths_alloc + 1 slots.
 *
 * sdev_modctl_readdir() hands back a NULL list when it found no
 * names; that case is accepted here and is a no-op.
 */
void
sdev_modctl_readdir_free(char **pathlist, int npaths, int npaths_alloc)
{
	int	i;

	if (pathlist == NULL)
		return;

	for (i = 0; i < npaths; i++)
		kmem_free(pathlist[i], strlen(pathlist[i]) + 1);

	kmem_free(pathlist, (npaths_alloc + 1) * sizeof (char *));
}
2939 
2940 int
2941 sdev_modctl_devexists(const char *path)
2942 {
2943 	vnode_t *vp;
2944 	int error;
2945 
2946 	error = sdev_modctl_lookup(path, &vp);
2947 	sdcmn_err11(("modctl dev exists: %s by %s: %s\n",
2948 	    path, curproc->p_user.u_comm,
2949 	    (error == 0) ? "ok" : "failed"));
2950 	if (error == 0)
2951 		VN_RELE(vp);
2952 
2953 	return (error);
2954 }
2955 
2956 extern int sdev_vnodeops_tbl_size;
2957 
2958 /*
2959  * construct a new template with overrides from vtab
2960  */
2961 static fs_operation_def_t *
2962 sdev_merge_vtab(const fs_operation_def_t tab[])
2963 {
2964 	fs_operation_def_t *new;
2965 	const fs_operation_def_t *tab_entry;
2966 
2967 	/* make a copy of standard vnode ops table */
2968 	new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP);
2969 	bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size);
2970 
2971 	/* replace the overrides from tab */
2972 	for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) {
2973 		fs_operation_def_t *std_entry = new;
2974 		while (std_entry->name) {
2975 			if (strcmp(tab_entry->name, std_entry->name) == 0) {
2976 				std_entry->func = tab_entry->func;
2977 				break;
2978 			}
2979 			std_entry++;
2980 		}
2981 		if (std_entry->name == NULL)
2982 			cmn_err(CE_NOTE, "sdev_merge_vtab: entry %s unused.",
2983 			    tab_entry->name);
2984 	}
2985 
2986 	return (new);
2987 }
2988 
2989 /* free memory allocated by sdev_merge_vtab */
2990 static void
2991 sdev_free_vtab(fs_operation_def_t *new)
2992 {
2993 	kmem_free(new, sdev_vnodeops_tbl_size);
2994 }
2995 
2996 /*
2997  * a generic setattr() function
2998  *
2999  * note: flags only supports AT_UID and AT_GID.
3000  *	 Future enhancements can be done for other types, e.g. AT_MODE
3001  */
3002 int
3003 devname_setattr_func(struct vnode *vp, struct vattr *vap, int flags,
3004     struct cred *cred, int (*callback)(struct sdev_node *, struct vattr *,
3005     int), int protocol)
3006 {
3007 	struct sdev_node	*dv = VTOSDEV(vp);
3008 	struct sdev_node	*parent = dv->sdev_dotdot;
3009 	struct vattr		*get;
3010 	uint_t			mask = vap->va_mask;
3011 	int 			error;
3012 
3013 	/* some sanity checks */
3014 	if (vap->va_mask & AT_NOSET)
3015 		return (EINVAL);
3016 
3017 	if (vap->va_mask & AT_SIZE) {
3018 		if (vp->v_type == VDIR) {
3019 			return (EISDIR);
3020 		}
3021 	}
3022 
3023 	/* no need to set attribute, but do not fail either */
3024 	ASSERT(parent);
3025 	rw_enter(&parent->sdev_contents, RW_READER);
3026 	if (dv->sdev_state == SDEV_ZOMBIE) {
3027 		rw_exit(&parent->sdev_contents);
3028 		return (0);
3029 	}
3030 
3031 	/* If backing store exists, just set it. */
3032 	if (dv->sdev_attrvp) {
3033 		rw_exit(&parent->sdev_contents);
3034 		return (VOP_SETATTR(dv->sdev_attrvp, vap, flags, cred, NULL));
3035 	}
3036 
3037 	/*
3038 	 * Otherwise, for nodes with the persistence attribute, create it.
3039 	 */
3040 	ASSERT(dv->sdev_attr);
3041 	if (SDEV_IS_PERSIST(dv) ||
3042 	    ((vap->va_mask & ~AT_TIMES) != 0 && !SDEV_IS_DYNAMIC(dv))) {
3043 		sdev_vattr_merge(dv, vap);
3044 		rw_enter(&dv->sdev_contents, RW_WRITER);
3045 		error = sdev_shadow_node(dv, cred);
3046 		rw_exit(&dv->sdev_contents);
3047 		rw_exit(&parent->sdev_contents);
3048 
3049 		if (error)
3050 			return (error);
3051 		return (VOP_SETATTR(dv->sdev_attrvp, vap, flags, cred, NULL));
3052 	}
3053 
3054 
3055 	/*
3056 	 * sdev_attr was allocated in sdev_mknode
3057 	 */
3058 	rw_enter(&dv->sdev_contents, RW_WRITER);
3059 	error = secpolicy_vnode_setattr(cred, vp, vap,
3060 	    dv->sdev_attr, flags, sdev_unlocked_access, dv);
3061 	if (error) {
3062 		rw_exit(&dv->sdev_contents);
3063 		rw_exit(&parent->sdev_contents);
3064 		return (error);
3065 	}
3066 
3067 	get = dv->sdev_attr;
3068 	if (mask & AT_MODE) {
3069 		get->va_mode &= S_IFMT;
3070 		get->va_mode |= vap->va_mode & ~S_IFMT;
3071 	}
3072 
3073 	if ((mask & AT_UID) || (mask & AT_GID)) {
3074 		if (mask & AT_UID)
3075 			get->va_uid = vap->va_uid;
3076 		if (mask & AT_GID)
3077 			get->va_gid = vap->va_gid;
3078 		/*
3079 		 * a callback must be provided if the protocol is set
3080 		 */
3081 		if ((protocol & AT_UID) || (protocol & AT_GID)) {
3082 			ASSERT(callback);
3083 			error = callback(dv, get, protocol);
3084 			if (error) {
3085 				rw_exit(&dv->sdev_contents);
3086 				rw_exit(&parent->sdev_contents);
3087 				return (error);
3088 			}
3089 		}
3090 	}
3091 
3092 	if (mask & AT_ATIME)
3093 		get->va_atime = vap->va_atime;
3094 	if (mask & AT_MTIME)
3095 		get->va_mtime = vap->va_mtime;
3096 	if (mask & (AT_MODE | AT_UID | AT_GID | AT_CTIME)) {
3097 		gethrestime(&get->va_ctime);
3098 	}
3099 
3100 	sdev_vattr_merge(dv, get);
3101 	rw_exit(&dv->sdev_contents);
3102 	rw_exit(&parent->sdev_contents);
3103 	return (0);
3104 }
3105 
3106 /*
3107  * a generic inactive() function
3108  */
3109 /*ARGSUSED*/
3110 void
3111 devname_inactive_func(struct vnode *vp, struct cred *cred,
3112     void (*callback)(struct vnode *))
3113 {
3114 	int clean;
3115 	struct sdev_node *dv = VTOSDEV(vp);
3116 	struct sdev_node *ddv = dv->sdev_dotdot;
3117 	int state;
3118 
3119 	rw_enter(&ddv->sdev_contents, RW_WRITER);
3120 	state = dv->sdev_state;
3121 
3122 	mutex_enter(&vp->v_lock);
3123 	ASSERT(vp->v_count >= 1);
3124 
3125 	if (vp->v_count == 1 && callback != NULL)
3126 		callback(vp);
3127 
3128 	clean = (vp->v_count == 1) && (state == SDEV_ZOMBIE);
3129 
3130 	/*
3131 	 * last ref count on the ZOMBIE node is released.
3132 	 * clean up the sdev_node, and
3133 	 * release the hold on the backing store node so that
3134 	 * the ZOMBIE backing stores also cleaned out.
3135 	 */
3136 	if (clean) {
3137 		ASSERT(ddv);
3138 
3139 		ddv->sdev_nlink--;
3140 		if (vp->v_type == VDIR) {
3141 			dv->sdev_nlink--;
3142 		}
3143 		if ((dv->sdev_flags & SDEV_STALE) == 0)
3144 			avl_remove(&ddv->sdev_entries, dv);
3145 		dv->sdev_nlink--;
3146 		--vp->v_count;
3147 		mutex_exit(&vp->v_lock);
3148 		sdev_nodedestroy(dv, 0);
3149 	} else {
3150 		--vp->v_count;
3151 		mutex_exit(&vp->v_lock);
3152 	}
3153 	rw_exit(&ddv->sdev_contents);
3154 }
3155