xref: /illumos-gate/usr/src/uts/common/fs/dev/sdev_zvolops.c (revision dcbf3bd6a1f1360fc1afcee9e22c6dcff7844bf2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright 2013, 2016 Joyent, Inc.  All rights reserved.
25  */
26 
27 /* vnode ops for the /dev/zvol directory */
28 
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/sysmacros.h>
32 #include <sys/ddi.h>
33 #include <sys/sunndi.h>
34 #include <sys/sunldi.h>
35 #include <fs/fs_subr.h>
36 #include <sys/fs/dv_node.h>
37 #include <sys/fs/sdev_impl.h>
38 #include <sys/zfs_ioctl.h>
39 #include <sys/policy.h>
40 #include <sys/stat.h>
41 #include <sys/vfs_opreg.h>
42 
/*
 * Vnode operations vector for /dev/zvol.  It is not assigned anywhere in
 * this file; presumably it is built from devzvol_vnodeops_tbl elsewhere in
 * the sdev code -- confirm against the sdev initialisation path.
 */
struct vnodeops	*devzvol_vnodeops;
/* Major number of /dev/zfs, captured by devzvol_open_zfs(). */
static major_t devzvol_major;
/* Preallocated taskq entry used to dispatch devzvol_update_zclist_cb(). */
static taskq_ent_t devzvol_zclist_task;

/* Serialises access to the /dev/zfs handle and the cached pool list. */
static kmutex_t devzvol_mtx;
/* Everything below is protected by devzvol_mtx. */
/* B_TRUE once devzvol_open_zfs() has succeeded and until it is closed. */
static boolean_t devzvol_isopen;
/* B_TRUE while a devzvol_update_zclist_cb() dispatch is outstanding. */
static boolean_t devzvol_zclist_task_running = B_FALSE;
/* Cookie returned by the last successful ZFS_IOC_POOL_CONFIGS ioctl. */
static uint64_t devzvol_gen = 0;
/* Address of the cached packed pool-config nvlist, or 0 if there is none. */
static uint64_t devzvol_zclist;
/* Allocated size of the devzvol_zclist buffer (not the nvlist length). */
static size_t devzvol_zclist_size;
/* LDI identity and handle for the open /dev/zfs device. */
static ldi_ident_t devzvol_li;
static ldi_handle_t devzvol_lh;
56 
/*
 * We need to use the ddi_mod* interfaces because fs/dev gets loaded early
 * on in startup(), and linking fs/dev to fs/zfs would drag in a lot of
 * other stuff (like drv/random) before the rest of the system is
 * ready to go.
 *
 * zfs_mod is the handle returned by ddi_modopen("fs/zfs").  szcm and
 * szn2m are the addresses of zvol_create_minor and zvol_name2minor as
 * resolved by ddi_modsym(); both are NULL whenever ZFS is not open.
 */
ddi_modhandle_t zfs_mod;
int (*szcm)(char *);
int (*szn2m)(char *, minor_t *);
66 
67 int
68 sdev_zvol_create_minor(char *dsname)
69 {
70 	if (szcm == NULL)
71 		return (-1);
72 	return ((*szcm)(dsname));
73 }
74 
75 int
76 sdev_zvol_name2minor(char *dsname, minor_t *minor)
77 {
78 	if (szn2m == NULL)
79 		return (-1);
80 	return ((*szn2m)(dsname, minor));
81 }
82 
83 int
84 devzvol_open_zfs()
85 {
86 	int rc;
87 	dev_t dv;
88 
89 	devzvol_li = ldi_ident_from_anon();
90 	if (ldi_open_by_name("/dev/zfs", FREAD | FWRITE, kcred,
91 	    &devzvol_lh, devzvol_li))
92 		return (-1);
93 	if (zfs_mod == NULL && ((zfs_mod = ddi_modopen("fs/zfs",
94 	    KRTLD_MODE_FIRST, &rc)) == NULL)) {
95 		return (rc);
96 	}
97 	ASSERT(szcm == NULL && szn2m == NULL);
98 	if ((szcm = (int (*)(char *))
99 	    ddi_modsym(zfs_mod, "zvol_create_minor", &rc)) == NULL) {
100 		cmn_err(CE_WARN, "couldn't resolve zvol_create_minor");
101 		return (rc);
102 	}
103 	if ((szn2m = (int(*)(char *, minor_t *))
104 	    ddi_modsym(zfs_mod, "zvol_name2minor", &rc)) == NULL) {
105 		cmn_err(CE_WARN, "couldn't resolve zvol_name2minor");
106 		return (rc);
107 	}
108 	if (ldi_get_dev(devzvol_lh, &dv))
109 		return (-1);
110 	devzvol_major = getmajor(dv);
111 	return (0);
112 }
113 
114 void
115 devzvol_close_zfs()
116 {
117 	szcm = NULL;
118 	szn2m = NULL;
119 	(void) ldi_close(devzvol_lh, FREAD|FWRITE, kcred);
120 	ldi_ident_release(devzvol_li);
121 	if (zfs_mod != NULL) {
122 		(void) ddi_modclose(zfs_mod);
123 		zfs_mod = NULL;
124 	}
125 }
126 
127 int
128 devzvol_handle_ioctl(int cmd, zfs_cmd_t *zc, size_t *alloc_size)
129 {
130 	uint64_t cookie;
131 	int size = 8000;
132 	int unused;
133 	int rc;
134 
135 	if (cmd != ZFS_IOC_POOL_CONFIGS)
136 		mutex_enter(&devzvol_mtx);
137 	if (!devzvol_isopen) {
138 		if ((rc = devzvol_open_zfs()) == 0) {
139 			devzvol_isopen = B_TRUE;
140 		} else {
141 			if (cmd != ZFS_IOC_POOL_CONFIGS)
142 				mutex_exit(&devzvol_mtx);
143 			return (ENXIO);
144 		}
145 	}
146 	cookie = zc->zc_cookie;
147 again:
148 	zc->zc_nvlist_dst = (uint64_t)(intptr_t)kmem_alloc(size,
149 	    KM_SLEEP);
150 	zc->zc_nvlist_dst_size = size;
151 	rc = ldi_ioctl(devzvol_lh, cmd, (intptr_t)zc, FKIOCTL, kcred,
152 	    &unused);
153 	if (rc == ENOMEM) {
154 		int newsize;
155 		newsize = zc->zc_nvlist_dst_size;
156 		ASSERT(newsize > size);
157 		kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
158 		size = newsize;
159 		zc->zc_cookie = cookie;
160 		goto again;
161 	}
162 	if (alloc_size == NULL)
163 		kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
164 	else
165 		*alloc_size = size;
166 	if (cmd != ZFS_IOC_POOL_CONFIGS)
167 		mutex_exit(&devzvol_mtx);
168 	return (rc);
169 }
170 
171 /* figures out if the objset exists and returns its type */
172 int
173 devzvol_objset_check(char *dsname, dmu_objset_type_t *type)
174 {
175 	boolean_t	ispool;
176 	zfs_cmd_t	*zc;
177 	int rc;
178 	nvlist_t 	*nvl;
179 	size_t nvsz;
180 
181 	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
182 	(void) strlcpy(zc->zc_name, dsname, MAXPATHLEN);
183 
184 	nvl = fnvlist_alloc();
185 	fnvlist_add_boolean_value(nvl, "cachedpropsonly", B_TRUE);
186 	zc->zc_nvlist_src = (uintptr_t)fnvlist_pack(nvl, &nvsz);
187 	zc->zc_nvlist_src_size = nvsz;
188 	fnvlist_free(nvl);
189 
190 	ispool = (strchr(dsname, '/') == NULL) ? B_TRUE : B_FALSE;
191 	rc = devzvol_handle_ioctl(ispool ? ZFS_IOC_POOL_STATS :
192 	    ZFS_IOC_OBJSET_STATS, zc, NULL);
193 	if (type && rc == 0)
194 		*type = (ispool) ? DMU_OST_ZFS :
195 		    zc->zc_objset_stats.dds_type;
196 	fnvlist_pack_free((char *)(uintptr_t)zc->zc_nvlist_src, nvsz);
197 	kmem_free(zc, sizeof (zfs_cmd_t));
198 	return (rc);
199 }
200 
201 /*
202  * Returns what the zfs dataset name should be, given the /dev/zvol
203  * path and an optional name (can be NULL).
204  *
205  * Note that if the name param is NULL, then path must be an
206  * actual dataset's directory and not one of the top-level
207  * /dev/zvol/{dsk,rdsk} dirs, as these do not correspond to a
208  * specific dataset.
209  */
210 char *
211 devzvol_make_dsname(const char *path, const char *name)
212 {
213 	char *dsname;
214 	const char *ptr;
215 	int dslen;
216 
217 	if (strcmp(path, ZVOL_DIR) == 0)
218 		return (NULL);
219 	if (name && (strcmp(name, ".") == 0 || strcmp(name, "..") == 0))
220 		return (NULL);
221 	ptr = path + strlen(ZVOL_DIR);
222 	if (strncmp(ptr, "/dsk", 4) == 0)
223 		ptr += strlen("/dsk");
224 	else if (strncmp(ptr, "/rdsk", 5) == 0)
225 		ptr += strlen("/rdsk");
226 	else
227 		return (NULL);
228 
229 	if (*ptr == '/')
230 		ptr++;
231 	else if (name == NULL)
232 		return (NULL);
233 
234 	dslen = strlen(ptr);
235 	if (dslen)
236 		dslen++;			/* plus null */
237 	if (name)
238 		dslen += strlen(name) + 1;	/* plus slash */
239 	dsname = kmem_zalloc(dslen, KM_SLEEP);
240 	if (*ptr) {
241 		(void) strlcpy(dsname, ptr, dslen);
242 		if (name)
243 			(void) strlcat(dsname, "/", dslen);
244 	}
245 	if (name)
246 		(void) strlcat(dsname, name, dslen);
247 	return (dsname);
248 }
249 
250 /*
251  * check if the zvol's sdev_node is still valid, which means make
252  * sure the zvol is still valid.  zvol minors aren't proactively
253  * destroyed when the zvol is destroyed, so we use a validator to clean
254  * these up (in other words, when such nodes are encountered during
255  * subsequent lookup() and readdir() operations) so that only valid
256  * nodes are returned.  The ordering between devname_lookup_func and
257  * devzvol_validate is a little inefficient in the case of invalid
258  * or stale nodes because devname_lookup_func calls
259  * devzvol_create_{dir, link}, then the validator says it's invalid,
260  * and then the node gets cleaned up.
261  */
262 int
263 devzvol_validate(struct sdev_node *dv)
264 {
265 	vnode_t *vn = SDEVTOV(dv);
266 	dmu_objset_type_t do_type;
267 	char *dsname;
268 	char *nm = dv->sdev_name;
269 	int rc;
270 
271 	sdcmn_err13(("validating ('%s' '%s')", dv->sdev_path, nm));
272 	/*
273 	 * validate only READY nodes; if someone is sitting on the
274 	 * directory of a dataset that just got destroyed we could
275 	 * get a zombie node which we just skip.
276 	 */
277 	if (dv->sdev_state != SDEV_READY) {
278 		sdcmn_err13(("skipping '%s'", nm));
279 		return (SDEV_VTOR_SKIP);
280 	}
281 
282 	if ((strcmp(dv->sdev_path, ZVOL_DIR "/dsk") == 0) ||
283 	    (strcmp(dv->sdev_path, ZVOL_DIR "/rdsk") == 0))
284 		return (SDEV_VTOR_VALID);
285 	dsname = devzvol_make_dsname(dv->sdev_path, NULL);
286 	if (dsname == NULL)
287 		return (SDEV_VTOR_INVALID);
288 
289 	/*
290 	 * Leave any nodes alone that have been explicitly created by
291 	 * sdev profiles.
292 	 */
293 	if (!(dv->sdev_flags & SDEV_GLOBAL) && dv->sdev_origin != NULL) {
294 		kmem_free(dsname, strlen(dsname) + 1);
295 		return (SDEV_VTOR_VALID);
296 	}
297 
298 	rc = devzvol_objset_check(dsname, &do_type);
299 	sdcmn_err13(("  '%s' rc %d", dsname, rc));
300 	if (rc != 0) {
301 		sdev_node_t *parent = dv->sdev_dotdot;
302 		/*
303 		 * Explicitly passed-through zvols in our sdev profile can't
304 		 * be created as prof_* shadow nodes, because in the GZ they
305 		 * are symlinks, but in the NGZ they are actual device files.
306 		 *
307 		 * The objset_check will fail on these as they are outside
308 		 * any delegated dataset (zfs will not allow ioctl access to
309 		 * them from this zone). We still want them to work, though.
310 		 */
311 		if (!(parent->sdev_flags & SDEV_GLOBAL) &&
312 		    parent->sdev_origin != NULL &&
313 		    !(dv->sdev_flags & SDEV_GLOBAL) &&
314 		    (vn->v_type == VBLK || vn->v_type == VCHR) &&
315 		    prof_name_matched(nm, parent)) {
316 			do_type = DMU_OST_ZVOL;
317 		} else {
318 			kmem_free(dsname, strlen(dsname) + 1);
319 			return (SDEV_VTOR_INVALID);
320 		}
321 	}
322 
323 	sdcmn_err13(("  v_type %d do_type %d",
324 	    vn->v_type, do_type));
325 	if ((vn->v_type == VLNK && do_type != DMU_OST_ZVOL) ||
326 	    ((vn->v_type == VBLK || vn->v_type == VCHR) &&
327 	    do_type != DMU_OST_ZVOL) ||
328 	    (vn->v_type == VDIR && do_type == DMU_OST_ZVOL)) {
329 		kmem_free(dsname, strlen(dsname) + 1);
330 		return (SDEV_VTOR_STALE);
331 	}
332 	if (vn->v_type == VLNK) {
333 		char *ptr, *link;
334 		long val = 0;
335 		minor_t lminor, ominor;
336 
337 		rc = sdev_getlink(vn, &link);
338 		ASSERT(rc == 0);
339 
340 		ptr = strrchr(link, ':') + 1;
341 		rc = ddi_strtol(ptr, NULL, 10, &val);
342 		kmem_free(link, strlen(link) + 1);
343 		ASSERT(rc == 0 && val != 0);
344 		lminor = (minor_t)val;
345 		if (sdev_zvol_name2minor(dsname, &ominor) < 0 ||
346 		    ominor != lminor) {
347 			kmem_free(dsname, strlen(dsname) + 1);
348 			return (SDEV_VTOR_STALE);
349 		}
350 	}
351 	kmem_free(dsname, strlen(dsname) + 1);
352 	return (SDEV_VTOR_VALID);
353 }
354 
355 /*
356  * Taskq callback to update the devzvol_zclist.
357  *
358  * We need to defer this to the taskq to avoid it running with a user
359  * context that might be associated with some non-global zone, and thus
360  * not being able to list all of the pools on the entire system.
361  */
362 /*ARGSUSED*/
363 static void
364 devzvol_update_zclist_cb(void *arg)
365 {
366 	zfs_cmd_t	*zc;
367 	int		rc;
368 	size_t		size;
369 
370 	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
371 	mutex_enter(&devzvol_mtx);
372 	zc->zc_cookie = devzvol_gen;
373 
374 	rc = devzvol_handle_ioctl(ZFS_IOC_POOL_CONFIGS, zc, &size);
375 	switch (rc) {
376 		case 0:
377 			/* new generation */
378 			ASSERT(devzvol_gen != zc->zc_cookie);
379 			devzvol_gen = zc->zc_cookie;
380 			if (devzvol_zclist)
381 				kmem_free((void *)(uintptr_t)devzvol_zclist,
382 				    devzvol_zclist_size);
383 			devzvol_zclist = zc->zc_nvlist_dst;
384 			/* Keep the alloc'd size, not the nvlist size. */
385 			devzvol_zclist_size = size;
386 			break;
387 		default:
388 			/*
389 			 * Either there was no change in pool configuration
390 			 * since we last asked (rc == EEXIST) or we got a
391 			 * catastrophic error.
392 			 *
393 			 * Give up memory and exit.
394 			 */
395 			kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst,
396 			    size);
397 			break;
398 	}
399 
400 	VERIFY(devzvol_zclist_task_running == B_TRUE);
401 	devzvol_zclist_task_running = B_FALSE;
402 	mutex_exit(&devzvol_mtx);
403 
404 	kmem_free(zc, sizeof (zfs_cmd_t));
405 }
406 
407 static void
408 devzvol_update_zclist(void)
409 {
410 	mutex_enter(&devzvol_mtx);
411 	if (devzvol_zclist_task_running == B_TRUE) {
412 		mutex_exit(&devzvol_mtx);
413 		goto wait;
414 	}
415 
416 	devzvol_zclist_task_running = B_TRUE;
417 
418 	taskq_dispatch_ent(sdev_taskq, devzvol_update_zclist_cb, NULL, 0,
419 	    &devzvol_zclist_task);
420 
421 	mutex_exit(&devzvol_mtx);
422 
423 wait:
424 	taskq_wait(sdev_taskq);
425 }
426 
427 /*
428  * Creates sub-directories for each zpool as needed in response to a
429  * readdir on one of the /dev/zvol/{dsk,rdsk} directories.
430  */
431 void
432 devzvol_create_pool_dirs(struct vnode *dvp)
433 {
434 	nvlist_t *nv = NULL;
435 	nvpair_t *elem = NULL;
436 	int pools = 0;
437 	int rc;
438 
439 	sdcmn_err13(("devzvol_create_pool_dirs"));
440 
441 	devzvol_update_zclist();
442 
443 	mutex_enter(&devzvol_mtx);
444 
445 	rc = nvlist_unpack((char *)(uintptr_t)devzvol_zclist,
446 	    devzvol_zclist_size, &nv, 0);
447 	if (rc) {
448 		ASSERT(rc == 0);
449 		kmem_free((void *)(uintptr_t)devzvol_zclist,
450 		    devzvol_zclist_size);
451 		devzvol_gen = 0;
452 		devzvol_zclist = NULL;
453 		devzvol_zclist_size = 0;
454 		goto out;
455 	}
456 	mutex_exit(&devzvol_mtx);
457 	while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) {
458 		struct vnode *vp;
459 		ASSERT(dvp->v_count > 0);
460 		rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0,
461 		    NULL, kcred, NULL, 0, NULL);
462 		/* should either work, or not be visible from a zone */
463 		ASSERT(rc == 0 || rc == ENOENT);
464 		if (rc == 0)
465 			VN_RELE(vp);
466 		pools++;
467 	}
468 	nvlist_free(nv);
469 	mutex_enter(&devzvol_mtx);
470 	if (devzvol_isopen && pools == 0) {
471 		/* clean up so zfs can be unloaded */
472 		devzvol_close_zfs();
473 		devzvol_isopen = B_FALSE;
474 	}
475 out:
476 	mutex_exit(&devzvol_mtx);
477 }
478 
479 /*ARGSUSED3*/
480 static int
481 devzvol_create_dir(struct sdev_node *ddv, char *nm, void **arg,
482     cred_t *cred, void *whatever, char *whichever)
483 {
484 	timestruc_t now;
485 	struct vattr *vap = (struct vattr *)arg;
486 
487 	sdcmn_err13(("create_dir (%s) (%s) '%s'", ddv->sdev_name,
488 	    ddv->sdev_path, nm));
489 	ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR,
490 	    strlen(ZVOL_DIR)) == 0);
491 	*vap = *sdev_getdefault_attr(VDIR);
492 	gethrestime(&now);
493 	vap->va_atime = now;
494 	vap->va_mtime = now;
495 	vap->va_ctime = now;
496 	return (0);
497 }
498 
499 /*ARGSUSED3*/
500 static int
501 devzvol_create_link(struct sdev_node *ddv, char *nm,
502     void **arg, cred_t *cred, void *whatever, char *whichever)
503 {
504 	minor_t minor;
505 	char *pathname = (char *)*arg;
506 	int rc;
507 	char *dsname;
508 	char *x;
509 	char str[MAXNAMELEN];
510 	sdcmn_err13(("create_link (%s) (%s) '%s'", ddv->sdev_name,
511 	    ddv->sdev_path, nm));
512 	dsname = devzvol_make_dsname(ddv->sdev_path, nm);
513 	rc = sdev_zvol_create_minor(dsname);
514 	if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
515 	    sdev_zvol_name2minor(dsname, &minor)) {
516 		sdcmn_err13(("devzvol_create_link %d", rc));
517 		kmem_free(dsname, strlen(dsname) + 1);
518 		return (-1);
519 	}
520 	kmem_free(dsname, strlen(dsname) + 1);
521 
522 	/*
523 	 * This is a valid zvol; create a symlink that points to the
524 	 * minor which was created under /devices/pseudo/zfs@0
525 	 */
526 	*pathname = '\0';
527 	for (x = ddv->sdev_path; x = strchr(x, '/'); x++)
528 		(void) strcat(pathname, "../");
529 	(void) snprintf(str, sizeof (str), ZVOL_PSEUDO_DEV "%u", minor);
530 	(void) strncat(pathname, str, MAXPATHLEN);
531 	if (strncmp(ddv->sdev_path, ZVOL_FULL_RDEV_DIR,
532 	    strlen(ZVOL_FULL_RDEV_DIR)) == 0)
533 		(void) strcat(pathname, ",raw");
534 	return (0);
535 }
536 
537 /* Clean zvol sdev_nodes that are no longer valid.  */
538 static void
539 devzvol_prunedir(struct sdev_node *ddv)
540 {
541 	struct sdev_node *dv;
542 
543 	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
544 
545 	sdcmn_err13(("prunedir '%s'", ddv->sdev_name));
546 	ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
547 	if (rw_tryupgrade(&ddv->sdev_contents) == 0) {
548 		rw_exit(&ddv->sdev_contents);
549 		rw_enter(&ddv->sdev_contents, RW_WRITER);
550 	}
551 
552 	dv = SDEV_FIRST_ENTRY(ddv);
553 	while (dv) {
554 		sdcmn_err13(("sdev_name '%s'", dv->sdev_name));
555 
556 		switch (devzvol_validate(dv)) {
557 		case SDEV_VTOR_VALID:
558 		case SDEV_VTOR_SKIP:
559 			dv = SDEV_NEXT_ENTRY(ddv, dv);
560 			continue;
561 		case SDEV_VTOR_INVALID:
562 			sdcmn_err7(("prunedir: destroy invalid "
563 			    "node: %s\n", dv->sdev_name));
564 			break;
565 		}
566 
567 		if ((SDEVTOV(dv)->v_type == VDIR) &&
568 		    (sdev_cleandir(dv, NULL, 0) != 0)) {
569 			dv = SDEV_NEXT_ENTRY(ddv, dv);
570 			continue;
571 		}
572 		SDEV_HOLD(dv);
573 		/* remove the cache node */
574 		sdev_cache_update(ddv, &dv, dv->sdev_name,
575 		    SDEV_CACHE_DELETE);
576 		SDEV_RELE(dv);
577 		dv = SDEV_FIRST_ENTRY(ddv);
578 	}
579 	rw_downgrade(&ddv->sdev_contents);
580 }
581 
582 /*
583  * This function is used to create a dir or dev inside a zone's /dev when the
584  * zone has a zvol that is dynamically created within the zone (i.e. inside
585  * of a delegated dataset.  Since there is no /devices tree within a zone,
586  * we create the chr/blk devices directly inside the zone's /dev instead of
587  * making symlinks.
588  */
589 static int
590 devzvol_mk_ngz_node(struct sdev_node *parent, char *nm)
591 {
592 	struct vattr vattr;
593 	timestruc_t now;
594 	enum vtype expected_type = VDIR;
595 	dmu_objset_type_t do_type;
596 	struct sdev_node *dv = NULL;
597 	int res;
598 	char *dsname;
599 
600 	bzero(&vattr, sizeof (vattr));
601 	gethrestime(&now);
602 	vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
603 	vattr.va_uid = SDEV_UID_DEFAULT;
604 	vattr.va_gid = SDEV_GID_DEFAULT;
605 	vattr.va_type = VNON;
606 	vattr.va_atime = now;
607 	vattr.va_mtime = now;
608 	vattr.va_ctime = now;
609 
610 	if ((dsname = devzvol_make_dsname(parent->sdev_path, nm)) == NULL)
611 		return (ENOENT);
612 
613 	if (devzvol_objset_check(dsname, &do_type) != 0) {
614 		/*
615 		 * objset_check will succeed on any valid objset in the global
616 		 * zone, and any valid delegated dataset. It will fail, however,
617 		 * in non-global zones on explicitly whitelisted zvol devices
618 		 * that are outside any delegated dataset.
619 		 *
620 		 * The directories leading up to the zvol device itself will be
621 		 * created by prof for us in advance (and will always validate
622 		 * because of the matching check in devzvol_validate). The zvol
623 		 * device itself can't be created by prof though because in the
624 		 * GZ it's a symlink, and in the NGZ it is not. So, we create
625 		 * such zvol device files here.
626 		 */
627 		if (!(parent->sdev_flags & SDEV_GLOBAL) &&
628 		    parent->sdev_origin != NULL &&
629 		    prof_name_matched(nm, parent)) {
630 			do_type = DMU_OST_ZVOL;
631 		} else {
632 			kmem_free(dsname, strlen(dsname) + 1);
633 			return (ENOENT);
634 		}
635 	}
636 
637 	if (do_type == DMU_OST_ZVOL)
638 		expected_type = VBLK;
639 
640 	if (expected_type == VDIR) {
641 		vattr.va_type = VDIR;
642 		vattr.va_mode = SDEV_DIRMODE_DEFAULT;
643 	} else {
644 		minor_t minor;
645 		dev_t devnum;
646 		int rc;
647 
648 		rc = sdev_zvol_create_minor(dsname);
649 		if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
650 		    sdev_zvol_name2minor(dsname, &minor)) {
651 			kmem_free(dsname, strlen(dsname) + 1);
652 			return (ENOENT);
653 		}
654 
655 		devnum = makedevice(devzvol_major, minor);
656 		vattr.va_rdev = devnum;
657 
658 		if (strstr(parent->sdev_path, "/rdsk/") != NULL)
659 			vattr.va_type = VCHR;
660 		else
661 			vattr.va_type = VBLK;
662 		vattr.va_mode = SDEV_DEVMODE_DEFAULT;
663 	}
664 	kmem_free(dsname, strlen(dsname) + 1);
665 
666 	rw_enter(&parent->sdev_contents, RW_WRITER);
667 
668 	res = sdev_mknode(parent, nm, &dv, &vattr,
669 	    NULL, NULL, kcred, SDEV_READY);
670 	rw_exit(&parent->sdev_contents);
671 	if (res != 0)
672 		return (ENOENT);
673 
674 	SDEV_RELE(dv);
675 	return (0);
676 }
677 
678 /*ARGSUSED*/
679 static int
680 devzvol_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
681     struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
682     caller_context_t *ct, int *direntflags, pathname_t *realpnp)
683 {
684 	enum vtype expected_type = VDIR;
685 	struct sdev_node *parent = VTOSDEV(dvp);
686 	char *dsname;
687 	dmu_objset_type_t do_type;
688 	int error;
689 
690 	sdcmn_err13(("devzvol_lookup '%s' '%s'", parent->sdev_path, nm));
691 	*vpp = NULL;
692 	/* execute access is required to search the directory */
693 	if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
694 		return (error);
695 
696 	rw_enter(&parent->sdev_contents, RW_READER);
697 	if (!SDEV_IS_GLOBAL(parent)) {
698 		int res;
699 
700 		rw_exit(&parent->sdev_contents);
701 
702 		/*
703 		 * If we're in the global zone and reach down into a non-global
704 		 * zone's /dev/zvol then this action could trigger the creation
705 		 * of all of the zvol devices for every zone into the non-global
706 		 * zone's /dev tree. This could be a big security hole. To
707 		 * prevent this, disallow the global zone from looking inside
708 		 * a non-global zones /dev/zvol. This behavior is similar to
709 		 * delegated datasets, which cannot be used by the global zone.
710 		 */
711 		if (getzoneid() == GLOBAL_ZONEID)
712 			return (EPERM);
713 
714 		res = prof_lookup(dvp, nm, vpp, cred);
715 
716 		/*
717 		 * We won't find a zvol that was dynamically created inside
718 		 * a NGZ, within a delegated dataset, in the zone's dev profile
719 		 * but prof_lookup will also find it via sdev_cache_lookup.
720 		 */
721 		if (res == ENOENT) {
722 			/*
723 			 * We have to create the sdev node for the dymamically
724 			 * created zvol.
725 			 */
726 			if (devzvol_mk_ngz_node(parent, nm) != 0)
727 				return (ENOENT);
728 			res = prof_lookup(dvp, nm, vpp, cred);
729 		}
730 
731 		return (res);
732 	}
733 
734 	/*
735 	 * Don't let the global-zone style lookup succeed here when we're not
736 	 * running in the global zone. This can happen because prof calls into
737 	 * us (in prof_filldir) trying to create an explicitly passed-through
738 	 * zvol device outside any delegated dataset.
739 	 *
740 	 * We have to stop this here or else we will create prof shadows of
741 	 * the global zone symlink, which will make no sense at all in the
742 	 * non-global zone (it has no /devices for the symlink to point at).
743 	 *
744 	 * These zvols will be created later (at access time) by mk_ngz_node
745 	 * instead. The dirs leading up to them will be created by prof
746 	 * internally.
747 	 *
748 	 * We have to return EPERM here, because ENOENT is given special
749 	 * meaning by prof in this context.
750 	 */
751 	if (getzoneid() != GLOBAL_ZONEID) {
752 		rw_exit(&parent->sdev_contents);
753 		return (EPERM);
754 	}
755 
756 	dsname = devzvol_make_dsname(parent->sdev_path, nm);
757 	rw_exit(&parent->sdev_contents);
758 	sdcmn_err13(("rvp dsname %s", dsname ? dsname : "(null)"));
759 	if (dsname) {
760 		error = devzvol_objset_check(dsname, &do_type);
761 		if (error != 0) {
762 			error = ENOENT;
763 			goto out;
764 		}
765 		if (do_type == DMU_OST_ZVOL)
766 			expected_type = VLNK;
767 	}
768 	/*
769 	 * the callbacks expect:
770 	 *
771 	 * parent->sdev_path		   nm
772 	 * /dev/zvol			   {r}dsk
773 	 * /dev/zvol/{r}dsk		   <pool name>
774 	 * /dev/zvol/{r}dsk/<dataset name> <last ds component>
775 	 *
776 	 * sdev_name is always last path component of sdev_path
777 	 */
778 	if (expected_type == VDIR) {
779 		error = devname_lookup_func(parent, nm, vpp, cred,
780 		    devzvol_create_dir, SDEV_VATTR);
781 	} else {
782 		error = devname_lookup_func(parent, nm, vpp, cred,
783 		    devzvol_create_link, SDEV_VLINK);
784 	}
785 	sdcmn_err13(("devzvol_lookup %d %d", expected_type, error));
786 	ASSERT(error || ((*vpp)->v_type == expected_type));
787 out:
788 	if (dsname)
789 		kmem_free(dsname, strlen(dsname) + 1);
790 	sdcmn_err13(("devzvol_lookup %d", error));
791 	return (error);
792 }
793 
794 /*
795  * We allow create to find existing nodes
796  *	- if the node doesn't exist - EROFS
797  *	- creating an existing dir read-only succeeds, otherwise EISDIR
798  *	- exclusive creates fail - EEXIST
799  */
800 /*ARGSUSED2*/
801 static int
802 devzvol_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
803     int mode, struct vnode **vpp, struct cred *cred, int flag,
804     caller_context_t *ct, vsecattr_t *vsecp)
805 {
806 	int error;
807 	struct vnode *vp;
808 
809 	*vpp = NULL;
810 
811 	error = devzvol_lookup(dvp, nm, &vp, NULL, 0, NULL, cred, ct, NULL,
812 	    NULL);
813 	if (error == 0) {
814 		if (excl == EXCL)
815 			error = EEXIST;
816 		else if (vp->v_type == VDIR && (mode & VWRITE))
817 			error = EISDIR;
818 		else
819 			error = VOP_ACCESS(vp, mode, 0, cred, ct);
820 
821 		if (error) {
822 			VN_RELE(vp);
823 		} else
824 			*vpp = vp;
825 	} else if (error == ENOENT) {
826 		error = EROFS;
827 	}
828 
829 	return (error);
830 }
831 
832 void sdev_iter_snapshots(struct vnode *dvp, char *name);
833 
834 void
835 sdev_iter_datasets(struct vnode *dvp, int arg, char *name)
836 {
837 	zfs_cmd_t	*zc;
838 	int rc;
839 
840 	sdcmn_err13(("iter name is '%s' (arg %x)", name, arg));
841 	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
842 	(void) strcpy(zc->zc_name, name);
843 
844 	while ((rc = devzvol_handle_ioctl(arg, zc, B_FALSE)) == 0) {
845 		struct vnode *vpp;
846 		char *ptr;
847 
848 		sdcmn_err13(("  name %s", zc->zc_name));
849 		if (strchr(zc->zc_name, '$') || strchr(zc->zc_name, '%'))
850 			goto skip;
851 		ptr = strrchr(zc->zc_name, '/') + 1;
852 		rc = devzvol_lookup(dvp, ptr, &vpp, NULL, 0, NULL,
853 		    kcred, NULL, NULL, NULL);
854 		if (rc == 0) {
855 			VN_RELE(vpp);
856 		} else if (rc == ENOENT) {
857 			goto skip;
858 		} else {
859 			/*
860 			 * EBUSY == problem with zvols's dmu holds?
861 			 * EPERM when in a NGZ and traversing up and out.
862 			 */
863 			goto skip;
864 		}
865 		if (arg == ZFS_IOC_DATASET_LIST_NEXT &&
866 		    zc->zc_objset_stats.dds_type != DMU_OST_ZFS)
867 			sdev_iter_snapshots(dvp, zc->zc_name);
868 skip:
869 		(void) strcpy(zc->zc_name, name);
870 	}
871 	kmem_free(zc, sizeof (zfs_cmd_t));
872 }
873 
874 void
875 sdev_iter_snapshots(struct vnode *dvp, char *name)
876 {
877 	sdev_iter_datasets(dvp, ZFS_IOC_SNAPSHOT_LIST_NEXT, name);
878 }
879 
880 /*ARGSUSED4*/
881 static int
882 devzvol_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
883     int *eofp, caller_context_t *ct_unused, int flags_unused)
884 {
885 	struct sdev_node *sdvp = VTOSDEV(dvp);
886 	char *ptr;
887 
888 	sdcmn_err13(("zv readdir of '%s' %s'", sdvp->sdev_path,
889 	    sdvp->sdev_name));
890 
891 	if (strcmp(sdvp->sdev_path, ZVOL_DIR) == 0) {
892 		struct vnode *vp;
893 
894 		rw_exit(&sdvp->sdev_contents);
895 		(void) devname_lookup_func(sdvp, "dsk", &vp, cred,
896 		    devzvol_create_dir, SDEV_VATTR);
897 		VN_RELE(vp);
898 		(void) devname_lookup_func(sdvp, "rdsk", &vp, cred,
899 		    devzvol_create_dir, SDEV_VATTR);
900 		VN_RELE(vp);
901 		rw_enter(&sdvp->sdev_contents, RW_READER);
902 		return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
903 	}
904 	if (uiop->uio_offset == 0)
905 		devzvol_prunedir(sdvp);
906 	ptr = sdvp->sdev_path + strlen(ZVOL_DIR);
907 	if ((strcmp(ptr, "/dsk") == 0) || (strcmp(ptr, "/rdsk") == 0)) {
908 		rw_exit(&sdvp->sdev_contents);
909 		devzvol_create_pool_dirs(dvp);
910 		rw_enter(&sdvp->sdev_contents, RW_READER);
911 		return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
912 	}
913 
914 	ptr = strchr(ptr + 1, '/');
915 	if (ptr == NULL)
916 		return (ENOENT);
917 	ptr++;
918 	rw_exit(&sdvp->sdev_contents);
919 	sdev_iter_datasets(dvp, ZFS_IOC_DATASET_LIST_NEXT, ptr);
920 	rw_enter(&sdvp->sdev_contents, RW_READER);
921 	return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
922 }
923 
/*
 * Vnode operation table for /dev/zvol.  readdir, lookup and create are
 * implemented in this file; rename, mkdir, rmdir, remove and symlink
 * are mapped to fs_nosys.  The table ends with a NULL/NULL entry.
 */
const fs_operation_def_t devzvol_vnodeops_tbl[] = {
	VOPNAME_READDIR,	{ .vop_readdir = devzvol_readdir },
	VOPNAME_LOOKUP,		{ .vop_lookup = devzvol_lookup },
	VOPNAME_CREATE,		{ .vop_create = devzvol_create },
	VOPNAME_RENAME,		{ .error = fs_nosys },
	VOPNAME_MKDIR,		{ .error = fs_nosys },
	VOPNAME_RMDIR,		{ .error = fs_nosys },
	VOPNAME_REMOVE,		{ .error = fs_nosys },
	VOPNAME_SYMLINK,	{ .error = fs_nosys },
	NULL,			NULL	/* terminator */
};
935