xref: /illumos-gate/usr/src/cmd/zinject/zinject.c (revision 66582b606a8194f7f3ba5b3a3a6dca5b0d346361)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24  */
25 
26 /*
27  * ZFS Fault Injector
28  *
29  * This userland component takes a set of options and uses libzpool to translate
30  * from a user-visible object type and name to an internal representation.
31  * There are two basic types of faults: device faults and data faults.
32  *
33  *
34  * DEVICE FAULTS
35  *
36  * Errors can be injected into a particular vdev using the '-d' option.  This
37  * option takes a path or vdev GUID to uniquely identify the device within a
38  * pool.  There are two types of errors that can be injected, EIO and ENXIO,
39  * that can be controlled through the '-e' option.  The default is ENXIO.  For
40  * EIO failures, any attempt to read data from the device will return EIO, but
41  * subsequent attempts to reopen the device will succeed.  For ENXIO failures,
42  * any attempt to read from the device will return EIO, and any attempt to
43  * reopen the device will return ENXIO.
44  * For label faults, the -L option must be specified. This allows faults
45  * to be injected into either the nvlist, uberblock, pad1, or pad2 region
46  * of all the labels for the specified device.
47  *
48  * This form of the command looks like:
49  *
50  *	zinject -d device [-e errno] [-L <uber | nvlist | pad1 | pad2>] pool
51  *
52  *
53  * DATA FAULTS
54  *
55  * We begin with a tuple of the form:
56  *
57  *	<type,level,range,object>
58  *
59  *	type	A string describing the type of data to target.  Each type
60  *		implicitly describes how to interpret 'object'. Currently,
61  *		the following values are supported:
62  *
63  *		data		User data for a file
64  *		dnode		Dnode for a file or directory
65  *
66  *		The following MOS objects are special.  Instead of injecting
67  *		errors on a particular object or blkid, we inject errors across
68  *		all objects of the given type.
69  *
70  *		mos		Any data in the MOS
71  *		mosdir		object directory
72  *		config		pool configuration
73  *		bpobj		blkptr list
74  *		spacemap	spacemap
75  *		metaslab	metaslab
76  *		errlog		persistent error log
77  *
78  *	level	Object level.  Defaults to '0', not applicable to all types.  If
79  *		a range is given, this corresponds to the indirect block
80  *		corresponding to the specific range.
81  *
82  *	range	A numerical range [start,end) within the object.  Defaults to
83  *		the full size of the file.
84  *
85  *	object	A string describing the logical location of the object.  For
86  *		files and directories (currently the only supported types),
87  *		this is the path of the object on disk.
88  *
89  * This is translated, via libzpool, into the following internal representation:
90  *
91  *	<type,objset,object,level,range>
92  *
93  * These types should be self-explanatory.  This tuple is then passed to the
94  * kernel via a special ioctl() to initiate fault injection for the given
95  * object.  Note that 'type' is not strictly necessary for fault injection, but
96  * is used when translating existing faults into a human-readable string.
97  *
98  *
99  * The command itself takes one of the forms:
100  *
101  *	zinject
102  *	zinject <-a | -u pool>
103  *	zinject -c <id|all>
104  *	zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level]
105  *	    [-r range] <object>
106  *	zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:start:end pool
107  *
108  * With no arguments, the command prints all currently registered injection
109  * handlers, with their numeric identifiers.
110  *
111  * The '-c' option will clear the given handler, or all handlers if 'all' is
112  * specified.
113  *
114  * The '-e' option takes a string describing the errno to simulate.  This must
115  * be one of 'io', 'checksum', or 'decrypt'.  In most cases this will result
116  * in the same behavior, but RAID-Z will produce a different set of ereports
117  * for this situation.
118  *
119  * The '-a', '-u', and '-m' flags toggle internal flush behavior.  If '-a' is
120  * specified, then the ARC cache is flushed appropriately.  If '-u' is
121  * specified, then the underlying SPA is unloaded.  Either of these flags can be
122  * specified independently of any other handlers.  The '-m' flag automatically
123  * does an unmount and remount of the underlying dataset to aid in flushing the
124  * cache.
125  *
126  * The '-f' flag controls the frequency of errors injected, expressed as an
127  * integer percentage between 1 and 100.  The default is 100.
128  *
128  * The fourth form is responsible for actually injecting the handler into the
130  * framework.  It takes the arguments described above, translates them to the
131  * internal tuple using libzpool, and then issues an ioctl() to register the
132  * handler.
133  *
134  * The final form can target a specific bookmark, regardless of whether a
135  * human-readable interface has been designed.  It allows developers to specify
136  * a particular block by number.
137  */
138 
139 #include <errno.h>
140 #include <fcntl.h>
141 #include <stdio.h>
142 #include <stdlib.h>
143 #include <strings.h>
144 #include <unistd.h>
145 
146 #include <sys/fs/zfs.h>
147 #include <sys/mount.h>
148 
149 #include <libzfs.h>
150 
151 #undef verify	/* both libzfs.h and zfs_context.h want to define this */
152 
153 #include "zinject.h"
154 
/* libzfs library handle; main() assigns it from libzfs_init(). */
libzfs_handle_t *g_zfs;
/* Descriptor for ZFS_DEV, opened by main() and used by the ioctl() calls. */
int zfs_fd;

/* Errno value injected when the user asks for '-e checksum'. */
#define	ECKSUM	EBADE

/*
 * User-visible names of the injectable types.  name_to_type() returns the
 * index of the matching entry as the err_type_t value, so the position of
 * each string in this table is significant.
 */
static const char *errtable[TYPE_INVAL] = {
	"data",
	"dnode",
	"mos",
	"mosdir",
	"metaslab",
	"config",
	"bpobj",
	"spacemap",
	"errlog",
	"uber",
	"nvlist",
	"pad1",
	"pad2"
};
175 
176 static err_type_t
177 name_to_type(const char *arg)
178 {
179 	int i;
180 	for (i = 0; i < TYPE_INVAL; i++)
181 		if (strcmp(errtable[i], arg) == 0)
182 			return (i);
183 
184 	return (TYPE_INVAL);
185 }
186 
187 static const char *
188 type_to_name(uint64_t type)
189 {
190 	switch (type) {
191 	case DMU_OT_OBJECT_DIRECTORY:
192 		return ("mosdir");
193 	case DMU_OT_OBJECT_ARRAY:
194 		return ("metaslab");
195 	case DMU_OT_PACKED_NVLIST:
196 		return ("config");
197 	case DMU_OT_BPOBJ:
198 		return ("bpobj");
199 	case DMU_OT_SPACE_MAP:
200 		return ("spacemap");
201 	case DMU_OT_ERROR_LOG:
202 		return ("errlog");
203 	default:
204 		return ("-");
205 	}
206 }
207 
208 
/*
 * Print the usage message to stdout.
 *
 * The text is one long concatenated string literal; each "\t" / "\n"
 * sequence is part of the on-screen layout, so keep continuation lines
 * free of stray leading or trailing blanks.
 */
void
usage(void)
{
	(void) printf(
	    "usage:\n"
	    "\n"
	    "\tzinject\n"
	    "\n"
	    "\t\tList all active injection records.\n"
	    "\n"
	    "\tzinject -c <id|all>\n"
	    "\n"
	    "\t\tClear the particular record (if given a numeric ID), or\n"
	    "\t\tall records if 'all' is specified.\n"
	    "\n"
	    "\tzinject -p <function name> pool\n"
	    "\n"
	    "\t\tInject a panic fault at the specified function. Only\n"
	    "\t\tfunctions which call spa_vdev_config_exit(), or\n"
	    "\t\tspa_vdev_exit() will trigger a panic.\n"
	    "\n"
	    "\tzinject -d device [-e errno] [-L <nvlist|uber|pad1|pad2>] [-F]\n"
	    "\t    [-T <read|write|free|claim|all>] pool\n"
	    "\n"
	    "\t\tInject a fault into a particular device or the device's\n"
	    "\t\tlabel.  Label injection can either be 'nvlist', 'uber',\n"
	    "\t\t'pad1', or 'pad2'.\n"
	    "\t\t'errno' can be 'nxio' (the default), 'io', or 'dtl'.\n"
	    "\n"
	    "\tzinject -d device -A <degrade|fault> pool\n"
	    "\n"
	    "\t\tPerform a specific action on a particular device\n"
	    "\n"
	    "\tzinject -d device -D latency:lanes pool\n"
	    "\n"
	    "\t\tAdd an artificial delay to IO requests on a particular\n"
	    "\t\tdevice, such that the requests take a minimum of 'latency'\n"
	    "\t\tmilliseconds to complete. Each delay has an associated\n"
	    "\t\tnumber of 'lanes' which defines the number of concurrent\n"
	    "\t\tIO requests that can be processed.\n"
	    "\n"
	    "\t\tFor example, with a single lane delay of 10 ms (-D 10:1),\n"
	    "\t\tthe device will only be able to service a single IO request\n"
	    "\t\tat a time with each request taking 10 ms to complete. So,\n"
	    "\t\tif only a single request is submitted every 10 ms, the\n"
	    "\t\taverage latency will be 10 ms; but if more than one request\n"
	    "\t\tis submitted every 10 ms, the average latency will be more\n"
	    "\t\tthan 10 ms.\n"
	    "\n"
	    "\t\tSimilarly, if a delay of 10 ms is specified to have two\n"
	    "\t\tlanes (-D 10:2), then the device will be able to service\n"
	    "\t\ttwo requests at a time, each with a minimum latency of\n"
	    "\t\t10 ms. So, if two requests are submitted every 10 ms, then\n"
	    "\t\tthe average latency will be 10 ms; but if more than two\n"
	    "\t\trequests are submitted every 10 ms, the average latency\n"
	    "\t\twill be more than 10 ms.\n"
	    "\n"
	    "\t\tAlso note, these delays are additive. So two invocations\n"
	    "\t\tof '-D 10:1', is roughly equivalent to a single invocation\n"
	    "\t\tof '-D 10:2'. This also means, one can specify multiple\n"
	    "\t\tlanes with differing target latencies. For example, an\n"
	    "\t\tinvocation of '-D 10:1' followed by '-D 25:2' will\n"
	    "\t\tcreate 3 lanes on the device; one lane with a latency\n"
	    "\t\tof 10 ms and two lanes with a 25 ms latency.\n"
	    "\n"
	    "\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
	    "\n"
	    "\t\tCause the pool to stop writing blocks yet not\n"
	    "\t\treport errors for a duration.  Simulates buggy hardware\n"
	    "\t\tthat fails to honor cache flush requests.\n"
	    "\t\tDefault duration is 30 seconds.  The machine is panicked\n"
	    "\t\tat the end of the duration.\n"
	    "\n"
	    "\tzinject -b objset:object:level:blkid pool\n"
	    "\n"
	    "\t\tInject an error into pool 'pool' with the numeric bookmark\n"
	    "\t\tspecified by the remaining tuple.  Each number is in\n"
	    "\t\thexadecimal, and only one block can be specified.\n"
	    "\n"
	    "\tzinject [-q] <-t type> [-C dvas] [-e errno] [-l level]\n"
	    "\t\t[-r range] [-a] [-m] [-u] [-f freq] <object>\n"
	    "\n"
	    "\t\tInject an error into the object specified by the '-t' option\n"
	    "\t\tand the object descriptor.  The 'object' parameter is\n"
	    "\t\tinterpreted depending on the '-t' option.\n"
	    "\n"
	    "\t\t-q\tQuiet mode.  Only print out the handler number added.\n"
	    /* Only list the error names that the '-e' parser accepts. */
	    "\t\t-e\tInject a specific error.  Must be one of 'io',\n"
	    "\t\t\t'checksum', or 'decrypt'.  Default is 'io'.\n"
	    "\t\t-C\tInject the given error only into specific DVAs. The\n"
	    "\t\t\tDVAs should be specified as a list of 0-indexed DVAs\n"
	    "\t\t\tseparated by commas (ex. '0,2').\n"
	    "\t\t-l\tInject error at a particular block level. Default is "
	    "0.\n"
	    "\t\t-m\tAutomatically remount underlying filesystem.\n"
	    "\t\t-r\tInject error over a particular logical range of an\n"
	    "\t\t\tobject.  Will be translated to the appropriate blkid\n"
	    "\t\t\trange according to the object's properties.\n"
	    "\t\t-a\tFlush the ARC cache.  Can be specified without any\n"
	    "\t\t\tassociated object.\n"
	    "\t\t-u\tUnload the associated pool.  Can be specified with only\n"
	    "\t\t\ta pool object.\n"
	    "\t\t-f\tOnly inject errors a fraction of the time.  Expressed as\n"
	    "\t\t\ta percentage between 1 and 100.\n"
	    "\n"
	    "\t-t data\t\tInject an error into the plain file contents of a\n"
	    "\t\t\tfile.  The object must be specified as a complete path\n"
	    "\t\t\tto a file on a ZFS filesystem.\n"
	    "\n"
	    "\t-t dnode\tInject an error into the metadnode in the block\n"
	    "\t\t\tcorresponding to the dnode for a file or directory.  The\n"
	    "\t\t\t'-r' option is incompatible with this mode.  The object\n"
	    "\t\t\tis specified as a complete path to a file or directory\n"
	    "\t\t\ton a ZFS filesystem.\n"
	    "\n"
	    "\t-t <mos>\tInject errors into the MOS for objects of the given\n"
	    "\t\t\ttype.  Valid types are: mos, mosdir, config, bpobj,\n"
	    "\t\t\tspacemap, metaslab, errlog.  The only valid <object> is\n"
	    "\t\t\tthe poolname.\n");
}
333 
334 static int
335 iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *),
336     void *data)
337 {
338 	zfs_cmd_t zc = { 0 };
339 	int ret;
340 
341 	while (ioctl(zfs_fd, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0)
342 		if ((ret = func((int)zc.zc_guid, zc.zc_name,
343 		    &zc.zc_inject_record, data)) != 0)
344 			return (ret);
345 
346 	if (errno != ENOENT) {
347 		(void) fprintf(stderr, "Unable to list handlers: %s\n",
348 		    strerror(errno));
349 		return (-1);
350 	}
351 
352 	return (0);
353 }
354 
355 static int
356 print_data_handler(int id, const char *pool, zinject_record_t *record,
357     void *data)
358 {
359 	int *count = data;
360 
361 	if (record->zi_guid != 0 || record->zi_func[0] != '\0')
362 		return (0);
363 
364 	if (*count == 0) {
365 		(void) printf("%3s  %-15s  %-6s  %-6s  %-8s  %3s  %-4s  ",
366 		    "%-15s\n", "ID", "POOL", "OBJSET", "OBJECT", "TYPE",
367 		    "LVL", "DVAs", "RANGE");
368 		(void) printf("---  ---------------  ------  "
369 		    "------  --------  ---  ---- ----------------\n");
370 	}
371 
372 	*count += 1;
373 
374 	(void) printf("%3d  %-15s  %-6llu  %-6llu  %-8s  %-3d  0x%02x  ",
375 	    id, pool, (u_longlong_t)record->zi_objset,
376 	    (u_longlong_t)record->zi_object, type_to_name(record->zi_type),
377 	    record->zi_level, record->zi_dvas);
378 
379 	if (record->zi_start == 0 &&
380 	    record->zi_end == -1ULL)
381 		(void) printf("all\n");
382 	else
383 		(void) printf("[%llu, %llu]\n", (u_longlong_t)record->zi_start,
384 		    (u_longlong_t)record->zi_end);
385 
386 	return (0);
387 }
388 
389 static int
390 print_device_handler(int id, const char *pool, zinject_record_t *record,
391     void *data)
392 {
393 	int *count = data;
394 
395 	if (record->zi_guid == 0 || record->zi_func[0] != '\0')
396 		return (0);
397 
398 	if (record->zi_cmd == ZINJECT_DELAY_IO)
399 		return (0);
400 
401 	if (*count == 0) {
402 		(void) printf("%3s  %-15s  %s\n", "ID", "POOL", "GUID");
403 		(void) printf("---  ---------------  ----------------\n");
404 	}
405 
406 	*count += 1;
407 
408 	(void) printf("%3d  %-15s  %llx\n", id, pool,
409 	    (u_longlong_t)record->zi_guid);
410 
411 	return (0);
412 }
413 
414 static int
415 print_delay_handler(int id, const char *pool, zinject_record_t *record,
416     void *data)
417 {
418 	int *count = data;
419 
420 	if (record->zi_guid == 0 || record->zi_func[0] != '\0')
421 		return (0);
422 
423 	if (record->zi_cmd != ZINJECT_DELAY_IO)
424 		return (0);
425 
426 	if (*count == 0) {
427 		(void) printf("%3s  %-15s  %-15s  %-15s  %s\n",
428 		    "ID", "POOL", "DELAY (ms)", "LANES", "GUID");
429 		(void) printf("---  ---------------  ---------------  "
430 		    "---------------  ----------------\n");
431 	}
432 
433 	*count += 1;
434 
435 	(void) printf("%3d  %-15s  %-15llu  %-15llu  %llx\n", id, pool,
436 	    (u_longlong_t)NSEC2MSEC(record->zi_timer),
437 	    (u_longlong_t)record->zi_nlanes,
438 	    (u_longlong_t)record->zi_guid);
439 
440 	return (0);
441 }
442 
443 static int
444 print_panic_handler(int id, const char *pool, zinject_record_t *record,
445     void *data)
446 {
447 	int *count = data;
448 
449 	if (record->zi_func[0] == '\0')
450 		return (0);
451 
452 	if (*count == 0) {
453 		(void) printf("%3s  %-15s  %s\n", "ID", "POOL", "FUNCTION");
454 		(void) printf("---  ---------------  ----------------\n");
455 	}
456 
457 	*count += 1;
458 
459 	(void) printf("%3d  %-15s  %s\n", id, pool, record->zi_func);
460 
461 	return (0);
462 }
463 
464 /*
465  * Print all registered error handlers.  Returns the number of handlers
466  * registered.
467  */
468 static int
469 print_all_handlers(void)
470 {
471 	int count = 0, total = 0;
472 
473 	(void) iter_handlers(print_device_handler, &count);
474 	if (count > 0) {
475 		total += count;
476 		(void) printf("\n");
477 		count = 0;
478 	}
479 
480 	(void) iter_handlers(print_delay_handler, &count);
481 	if (count > 0) {
482 		total += count;
483 		(void) printf("\n");
484 		count = 0;
485 	}
486 
487 	(void) iter_handlers(print_data_handler, &count);
488 	if (count > 0) {
489 		total += count;
490 		(void) printf("\n");
491 		count = 0;
492 	}
493 
494 	(void) iter_handlers(print_panic_handler, &count);
495 
496 	return (count + total);
497 }
498 
499 /* ARGSUSED */
500 static int
501 cancel_one_handler(int id, const char *pool, zinject_record_t *record,
502     void *data)
503 {
504 	zfs_cmd_t zc = { 0 };
505 
506 	zc.zc_guid = (uint64_t)id;
507 
508 	if (ioctl(zfs_fd, ZFS_IOC_CLEAR_FAULT, &zc) != 0) {
509 		(void) fprintf(stderr, "failed to remove handler %d: %s\n",
510 		    id, strerror(errno));
511 		return (1);
512 	}
513 
514 	return (0);
515 }
516 
517 /*
518  * Remove all fault injection handlers.
519  */
520 static int
521 cancel_all_handlers(void)
522 {
523 	int ret = iter_handlers(cancel_one_handler, NULL);
524 
525 	if (ret == 0)
526 		(void) printf("removed all registered handlers\n");
527 
528 	return (ret);
529 }
530 
/*
 * Remove the fault injection handler with the given id and print a
 * confirmation.  Returns 0 on success and 1 on failure; the failure
 * message is produced by cancel_one_handler().
 */
static int
cancel_handler(int id)
{
	/* The pool, record and data arguments are unused by the helper. */
	if (cancel_one_handler(id, NULL, NULL, NULL) != 0)
		return (1);

	(void) printf("removed handler %d\n", id);

	return (0);
}
551 
552 /*
553  * Register a new fault injection handler.
554  */
555 static int
556 register_handler(const char *pool, int flags, zinject_record_t *record,
557     int quiet)
558 {
559 	zfs_cmd_t zc = { 0 };
560 
561 	(void) strcpy(zc.zc_name, pool);
562 	zc.zc_inject_record = *record;
563 	zc.zc_guid = flags;
564 
565 	if (ioctl(zfs_fd, ZFS_IOC_INJECT_FAULT, &zc) != 0) {
566 		(void) fprintf(stderr, "failed to add handler: %s\n",
567 		    errno == EDOM ? "block level exceeds max level of object" :
568 		    strerror(errno));
569 		return (1);
570 	}
571 
572 	if (flags & ZINJECT_NULL)
573 		return (0);
574 
575 	if (quiet) {
576 		(void) printf("%llu\n", (u_longlong_t)zc.zc_guid);
577 	} else {
578 		(void) printf("Added handler %llu with the following "
579 		    "properties:\n", (u_longlong_t)zc.zc_guid);
580 		(void) printf("  pool: %s\n", pool);
581 		if (record->zi_guid) {
582 			(void) printf("  vdev: %llx\n",
583 			    (u_longlong_t)record->zi_guid);
584 		} else if (record->zi_func[0] != '\0') {
585 			(void) printf("  panic function: %s\n",
586 			    record->zi_func);
587 		} else if (record->zi_duration > 0) {
588 			(void) printf(" time: %lld seconds\n",
589 			    (u_longlong_t)record->zi_duration);
590 		} else if (record->zi_duration < 0) {
591 			(void) printf(" txgs: %lld \n",
592 			    (u_longlong_t)-record->zi_duration);
593 		} else {
594 			(void) printf("objset: %llu\n",
595 			    (u_longlong_t)record->zi_objset);
596 			(void) printf("object: %llu\n",
597 			    (u_longlong_t)record->zi_object);
598 			(void) printf("  type: %llu\n",
599 			    (u_longlong_t)record->zi_type);
600 			(void) printf(" level: %d\n", record->zi_level);
601 			if (record->zi_start == 0 &&
602 			    record->zi_end == -1ULL)
603 				(void) printf(" range: all\n");
604 			else
605 				(void) printf(" range: [%llu, %llu)\n",
606 				    (u_longlong_t)record->zi_start,
607 				    (u_longlong_t)record->zi_end);
608 			(void) printf("  dvas: 0x%x\n", record->zi_dvas);
609 		}
610 	}
611 
612 	return (0);
613 }
614 
615 int
616 perform_action(const char *pool, zinject_record_t *record, int cmd)
617 {
618 	zfs_cmd_t zc = { 0 };
619 
620 	ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED);
621 	(void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name));
622 	zc.zc_guid = record->zi_guid;
623 	zc.zc_cookie = cmd;
624 
625 	if (ioctl(zfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0)
626 		return (0);
627 
628 	return (1);
629 }
630 
/*
 * Parse a "latency:lanes" operand.
 *
 * On success the latency, given in milliseconds on the command line, is
 * stored in *delay converted with MSEC2NSEC(), the lane count is stored
 * in *nlanes, and 0 is returned.  Returns 1 when the string does not
 * provide both numbers or when the latency is zero.
 */
static int
parse_delay(char *str, uint64_t *delay, uint64_t *nlanes)
{
	unsigned long msecs;
	unsigned long lanes;
	int nfields = sscanf(str, "%lu:%lu", &msecs, &lanes);

	/*
	 * A zero delay is refused: translate_device() relies on a
	 * non-zero value to recognise a ZINJECT_DELAY_IO fault.
	 */
	if (nfields != 2 || msecs == 0)
		return (1);

	/* The kernel interprets the delay as nanoseconds. */
	*delay = MSEC2NSEC(msecs);
	*nlanes = lanes;

	return (0);
}
659 
/*
 * This function converts a string specifier for DVAs into a bit mask.
 * The DVAs provided by the user should be 0 indexed and separated by
 * a comma. For example:
 *     "1"     -> 0b0010  (0x2)
 *     "0,1"   -> 0b0011  (0x3)
 *     "0,1,2" -> 0b0111  (0x7)
 *
 * Only the indices 0, 1 and 2 are accepted and each may appear once.
 * Empty strings, leading, trailing or doubled commas, repeated indices
 * and any other character are rejected.
 *
 * Returns 0 and stores the mask in *dvas_out on success; returns EINVAL
 * and leaves *dvas_out untouched otherwise.
 */
static int
parse_dvas(const char *str, uint32_t *dvas_out)
{
	const char *c;
	uint32_t mask = 0;
	int need_delim = 0;	/* set right after an index was consumed */

	if (str == NULL || dvas_out == NULL)
		return (EINVAL);

	/*
	 * No explicit length check is needed: indices are unique and must
	 * alternate with commas, so nothing longer than "0,1,2" can pass.
	 */
	for (c = str; *c != '\0'; c++) {
		switch (*c) {
		case '0':
		case '1':
		case '2': {
			uint32_t bit = 1U << (*c - '0');

			/* two indices must be separated by a comma */
			if (need_delim)
				return (EINVAL);

			/* check if this DVA has been set already */
			if (mask & bit)
				return (EINVAL);

			mask |= bit;
			need_delim = 1;
			break;
		}
		case ',':
			/* a comma is only valid directly after an index */
			if (!need_delim)
				return (EINVAL);
			need_delim = 0;
			break;
		default:
			/* check for invalid character */
			return (EINVAL);
		}
	}

	/* rejects the empty string and a dangling delimiter */
	if (!need_delim)
		return (EINVAL);

	*dvas_out = mask;
	return (0);
}
712 
713 int
714 main(int argc, char **argv)
715 {
716 	int c;
717 	char *range = NULL;
718 	char *cancel = NULL;
719 	char *end;
720 	char *raw = NULL;
721 	char *device = NULL;
722 	int level = 0;
723 	int quiet = 0;
724 	int error = 0;
725 	int domount = 0;
726 	int io_type = ZIO_TYPES;
727 	int action = VDEV_STATE_UNKNOWN;
728 	err_type_t type = TYPE_INVAL;
729 	err_type_t label = TYPE_INVAL;
730 	zinject_record_t record = { 0 };
731 	char pool[MAXNAMELEN];
732 	char dataset[MAXNAMELEN];
733 	zfs_handle_t *zhp;
734 	int nowrites = 0;
735 	int dur_txg = 0;
736 	int dur_secs = 0;
737 	int ret;
738 	int flags = 0;
739 	uint32_t dvas = 0;
740 
741 	if ((g_zfs = libzfs_init()) == NULL) {
742 		(void) fprintf(stderr, "internal error: failed to "
743 		    "initialize ZFS library\n");
744 		return (1);
745 	}
746 
747 	libzfs_print_on_error(g_zfs, B_TRUE);
748 
749 	if ((zfs_fd = open(ZFS_DEV, O_RDWR)) < 0) {
750 		(void) fprintf(stderr, "failed to open ZFS device\n");
751 		return (1);
752 	}
753 
754 	if (argc == 1) {
755 		/*
756 		 * No arguments.  Print the available handlers.  If there are no
757 		 * available handlers, direct the user to '-h' for help
758 		 * information.
759 		 */
760 		if (print_all_handlers() == 0) {
761 			(void) printf("No handlers registered.\n");
762 			(void) printf("Run 'zinject -h' for usage "
763 			    "information.\n");
764 		}
765 
766 		return (0);
767 	}
768 
769 	while ((c = getopt(argc, argv,
770 	    ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
771 		switch (c) {
772 		case 'a':
773 			flags |= ZINJECT_FLUSH_ARC;
774 			break;
775 		case 'A':
776 			if (strcasecmp(optarg, "degrade") == 0) {
777 				action = VDEV_STATE_DEGRADED;
778 			} else if (strcasecmp(optarg, "fault") == 0) {
779 				action = VDEV_STATE_FAULTED;
780 			} else {
781 				(void) fprintf(stderr, "invalid action '%s': "
782 				    "must be 'degrade' or 'fault'\n", optarg);
783 				usage();
784 				return (1);
785 			}
786 			break;
787 		case 'b':
788 			raw = optarg;
789 			break;
790 		case 'c':
791 			cancel = optarg;
792 			break;
793 		case 'C':
794 			ret = parse_dvas(optarg, &dvas);
795 			if (ret != 0) {
796 				(void) fprintf(stderr, "invalid DVA list '%s': "
797 				    "DVAs should be 0 indexed and separated by "
798 				    "commas.\n", optarg);
799 				usage();
800 				libzfs_fini(g_zfs);
801 				return (1);
802 			}
803 			break;
804 		case 'd':
805 			device = optarg;
806 			break;
807 		case 'D':
808 			ret = parse_delay(optarg, &record.zi_timer,
809 			    &record.zi_nlanes);
810 			if (ret != 0) {
811 				(void) fprintf(stderr, "invalid i/o delay "
812 				    "value: '%s'\n", optarg);
813 				usage();
814 				return (1);
815 			}
816 			break;
817 		case 'e':
818 			if (strcasecmp(optarg, "io") == 0) {
819 				error = EIO;
820 			} else if (strcasecmp(optarg, "checksum") == 0) {
821 				error = ECKSUM;
822 			} else if (strcasecmp(optarg, "decrypt") == 0) {
823 				error = EACCES;
824 			} else if (strcasecmp(optarg, "nxio") == 0) {
825 				error = ENXIO;
826 			} else if (strcasecmp(optarg, "dtl") == 0) {
827 				error = ECHILD;
828 			} else {
829 				(void) fprintf(stderr, "invalid error type "
830 				    "'%s': must be 'io', 'checksum' or "
831 				    "'nxio'\n", optarg);
832 				usage();
833 				return (1);
834 			}
835 			break;
836 		case 'f':
837 			record.zi_freq = atoi(optarg);
838 			if (record.zi_freq < 1 || record.zi_freq > 100) {
839 				(void) fprintf(stderr, "frequency range must "
840 				    "be in the range (0, 100]\n");
841 				return (1);
842 			}
843 			break;
844 		case 'F':
845 			record.zi_failfast = B_TRUE;
846 			break;
847 		case 'g':
848 			dur_txg = 1;
849 			record.zi_duration = (int)strtol(optarg, &end, 10);
850 			if (record.zi_duration <= 0 || *end != '\0') {
851 				(void) fprintf(stderr, "invalid duration '%s': "
852 				    "must be a positive integer\n", optarg);
853 				usage();
854 				return (1);
855 			}
856 			/* store duration of txgs as its negative */
857 			record.zi_duration *= -1;
858 			break;
859 		case 'h':
860 			usage();
861 			return (0);
862 		case 'I':
863 			/* default duration, if one hasn't yet been defined */
864 			nowrites = 1;
865 			if (dur_secs == 0 && dur_txg == 0)
866 				record.zi_duration = 30;
867 			break;
868 		case 'l':
869 			level = (int)strtol(optarg, &end, 10);
870 			if (*end != '\0') {
871 				(void) fprintf(stderr, "invalid level '%s': "
872 				    "must be an integer\n", optarg);
873 				usage();
874 				return (1);
875 			}
876 			break;
877 		case 'm':
878 			domount = 1;
879 			break;
880 		case 'p':
881 			(void) strlcpy(record.zi_func, optarg,
882 			    sizeof (record.zi_func));
883 			record.zi_cmd = ZINJECT_PANIC;
884 			break;
885 		case 'q':
886 			quiet = 1;
887 			break;
888 		case 'r':
889 			range = optarg;
890 			flags |= ZINJECT_CALC_RANGE;
891 			break;
892 		case 's':
893 			dur_secs = 1;
894 			record.zi_duration = (int)strtol(optarg, &end, 10);
895 			if (record.zi_duration <= 0 || *end != '\0') {
896 				(void) fprintf(stderr, "invalid duration '%s': "
897 				    "must be a positive integer\n", optarg);
898 				usage();
899 				return (1);
900 			}
901 			break;
902 		case 'T':
903 			if (strcasecmp(optarg, "read") == 0) {
904 				io_type = ZIO_TYPE_READ;
905 			} else if (strcasecmp(optarg, "write") == 0) {
906 				io_type = ZIO_TYPE_WRITE;
907 			} else if (strcasecmp(optarg, "free") == 0) {
908 				io_type = ZIO_TYPE_FREE;
909 			} else if (strcasecmp(optarg, "claim") == 0) {
910 				io_type = ZIO_TYPE_CLAIM;
911 			} else if (strcasecmp(optarg, "all") == 0) {
912 				io_type = ZIO_TYPES;
913 			} else {
914 				(void) fprintf(stderr, "invalid I/O type "
915 				    "'%s': must be 'read', 'write', 'free', "
916 				    "'claim' or 'all'\n", optarg);
917 				usage();
918 				return (1);
919 			}
920 			break;
921 		case 't':
922 			if ((type = name_to_type(optarg)) == TYPE_INVAL &&
923 			    !MOS_TYPE(type)) {
924 				(void) fprintf(stderr, "invalid type '%s'\n",
925 				    optarg);
926 				usage();
927 				return (1);
928 			}
929 			break;
930 		case 'u':
931 			flags |= ZINJECT_UNLOAD_SPA;
932 			break;
933 		case 'L':
934 			if ((label = name_to_type(optarg)) == TYPE_INVAL &&
935 			    !LABEL_TYPE(type)) {
936 				(void) fprintf(stderr, "invalid label type "
937 				    "'%s'\n", optarg);
938 				usage();
939 				return (1);
940 			}
941 			break;
942 		case ':':
943 			(void) fprintf(stderr, "option -%c requires an "
944 			    "operand\n", optopt);
945 			usage();
946 			return (1);
947 		case '?':
948 			(void) fprintf(stderr, "invalid option '%c'\n",
949 			    optopt);
950 			usage();
951 			return (2);
952 		}
953 	}
954 
955 	argc -= optind;
956 	argv += optind;
957 
958 	if (record.zi_duration != 0)
959 		record.zi_cmd = ZINJECT_IGNORED_WRITES;
960 
961 	if (cancel != NULL) {
962 		/*
963 		 * '-c' is invalid with any other options.
964 		 */
965 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
966 		    level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
967 		    record.zi_freq > 0 || dvas != 0) {
968 			(void) fprintf(stderr, "cancel (-c) incompatible with "
969 			    "any other options\n");
970 			usage();
971 			return (2);
972 		}
973 		if (argc != 0) {
974 			(void) fprintf(stderr, "extraneous argument to '-c'\n");
975 			usage();
976 			return (2);
977 		}
978 
979 		if (strcmp(cancel, "all") == 0) {
980 			return (cancel_all_handlers());
981 		} else {
982 			int id = (int)strtol(cancel, &end, 10);
983 			if (*end != '\0') {
984 				(void) fprintf(stderr, "invalid handle id '%s':"
985 				    " must be an integer or 'all'\n", cancel);
986 				usage();
987 				return (1);
988 			}
989 			return (cancel_handler(id));
990 		}
991 	}
992 
993 	if (device != NULL) {
994 		/*
995 		 * Device (-d) injection uses a completely different mechanism
996 		 * for doing injection, so handle it separately here.
997 		 */
998 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
999 		    level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
1000 		    dvas != 0) {
1001 			(void) fprintf(stderr, "device (-d) incompatible with "
1002 			    "data error injection\n");
1003 			usage();
1004 			return (2);
1005 		}
1006 
1007 		if (argc != 1) {
1008 			(void) fprintf(stderr, "device (-d) injection requires "
1009 			    "a single pool name\n");
1010 			usage();
1011 			return (2);
1012 		}
1013 
1014 		(void) strcpy(pool, argv[0]);
1015 		dataset[0] = '\0';
1016 
1017 		if (error == ECKSUM) {
1018 			(void) fprintf(stderr, "device error type must be "
1019 			    "'io' or 'nxio'\n");
1020 			return (1);
1021 		}
1022 
1023 		record.zi_iotype = io_type;
1024 		if (translate_device(pool, device, label, &record) != 0)
1025 			return (1);
1026 		if (!error)
1027 			error = ENXIO;
1028 
1029 		if (action != VDEV_STATE_UNKNOWN)
1030 			return (perform_action(pool, &record, action));
1031 
1032 	} else if (raw != NULL) {
1033 		if (range != NULL || type != TYPE_INVAL || level != 0 ||
1034 		    record.zi_cmd != ZINJECT_UNINITIALIZED ||
1035 		    record.zi_freq > 0 || dvas != 0) {
1036 			(void) fprintf(stderr, "raw (-b) format with "
1037 			    "any other options\n");
1038 			usage();
1039 			return (2);
1040 		}
1041 
1042 		if (argc != 1) {
1043 			(void) fprintf(stderr, "raw (-b) format expects a "
1044 			    "single pool name\n");
1045 			usage();
1046 			return (2);
1047 		}
1048 
1049 		(void) strcpy(pool, argv[0]);
1050 		dataset[0] = '\0';
1051 
1052 		if (error == ENXIO) {
1053 			(void) fprintf(stderr, "data error type must be "
1054 			    "'checksum' or 'io'\n");
1055 			return (1);
1056 		}
1057 
1058 		record.zi_cmd = ZINJECT_DATA_FAULT;
1059 		if (translate_raw(raw, &record) != 0)
1060 			return (1);
1061 		if (!error)
1062 			error = EIO;
1063 	} else if (record.zi_cmd == ZINJECT_PANIC) {
1064 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
1065 		    level != 0 || device != NULL || record.zi_freq > 0 ||
1066 		    dvas != 0) {
1067 			(void) fprintf(stderr, "panic (-p) incompatible with "
1068 			    "other options\n");
1069 			usage();
1070 			return (2);
1071 		}
1072 
1073 		if (argc < 1 || argc > 2) {
1074 			(void) fprintf(stderr, "panic (-p) injection requires "
1075 			    "a single pool name and an optional id\n");
1076 			usage();
1077 			return (2);
1078 		}
1079 
1080 		(void) strcpy(pool, argv[0]);
1081 		if (argv[1] != NULL)
1082 			record.zi_type = atoi(argv[1]);
1083 		dataset[0] = '\0';
1084 	} else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
1085 		if (raw != NULL || range != NULL || type != TYPE_INVAL ||
1086 		    level != 0 || record.zi_freq > 0 || dvas != 0) {
1087 			(void) fprintf(stderr, "hardware failure (-I) "
1088 			    "incompatible with other options\n");
1089 			usage();
1090 			libzfs_fini(g_zfs);
1091 			return (2);
1092 		}
1093 
1094 		if (nowrites == 0) {
1095 			(void) fprintf(stderr, "-s or -g meaningless "
1096 			    "without -I (ignore writes)\n");
1097 			usage();
1098 			return (2);
1099 		} else if (dur_secs && dur_txg) {
1100 			(void) fprintf(stderr, "choose a duration either "
1101 			    "in seconds (-s) or a number of txgs (-g) "
1102 			    "but not both\n");
1103 			usage();
1104 			return (2);
1105 		} else if (argc != 1) {
1106 			(void) fprintf(stderr, "ignore writes (-I) "
1107 			    "injection requires a single pool name\n");
1108 			usage();
1109 			return (2);
1110 		}
1111 
1112 		(void) strcpy(pool, argv[0]);
1113 		dataset[0] = '\0';
1114 	} else if (type == TYPE_INVAL) {
1115 		if (flags == 0) {
1116 			(void) fprintf(stderr, "at least one of '-b', '-d', "
1117 			    "'-t', '-a', '-p', '-I' or '-u' "
1118 			    "must be specified\n");
1119 			usage();
1120 			return (2);
1121 		}
1122 
1123 		if (argc == 1 && (flags & ZINJECT_UNLOAD_SPA)) {
1124 			(void) strcpy(pool, argv[0]);
1125 			dataset[0] = '\0';
1126 		} else if (argc != 0) {
1127 			(void) fprintf(stderr, "extraneous argument for "
1128 			    "'-f'\n");
1129 			usage();
1130 			return (2);
1131 		}
1132 
1133 		flags |= ZINJECT_NULL;
1134 	} else {
1135 		if (argc != 1) {
1136 			(void) fprintf(stderr, "missing object\n");
1137 			usage();
1138 			return (2);
1139 		}
1140 
1141 		if (error == ENXIO) {
1142 			(void) fprintf(stderr, "data error type must be "
1143 			    "'checksum' or 'io'\n");
1144 			return (1);
1145 		}
1146 
1147 		if (dvas != 0) {
1148 			if (error == EACCES || error == EINVAL) {
1149 				(void) fprintf(stderr, "the '-C' option may "
1150 				    "not be used with logical data errors "
1151 				    "'decrypt' and 'decompress'\n");
1152 				record.zi_dvas = dvas;
1153 			}
1154 		}
1155 
1156 		record.zi_cmd = ZINJECT_DATA_FAULT;
1157 
1158 		if (error == EACCES) {
1159 			if (type != TYPE_DATA) {
1160 				(void) fprintf(stderr, "decryption errors "
1161 				    "may only be injected for 'data' types\n");
1162 				libzfs_fini(g_zfs);
1163 				return (1);
1164 			}
1165 
1166 			record.zi_cmd = ZINJECT_DECRYPT_FAULT;
1167 			/*
1168 			 * Internally, ZFS actually uses ECKSUM for decryption
1169 			 * errors since EACCES is used to indicate the key was
1170 			 * not found.
1171 			 */
1172 			error = ECKSUM;
1173 		}
1174 
1175 		if (translate_record(type, argv[0], range, level, &record, pool,
1176 		    dataset) != 0)
1177 			return (1);
1178 		if (!error)
1179 			error = EIO;
1180 	}
1181 
1182 	/*
1183 	 * If this is pool-wide metadata, unmount everything.  The ioctl() will
1184 	 * unload the pool, so that we trigger spa-wide reopen of metadata next
1185 	 * time we access the pool.
1186 	 */
1187 	if (dataset[0] != '\0' && domount) {
1188 		if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_DATASET)) == NULL)
1189 			return (1);
1190 
1191 		if (zfs_unmount(zhp, NULL, 0) != 0)
1192 			return (1);
1193 	}
1194 
1195 	record.zi_error = error;
1196 
1197 	ret = register_handler(pool, flags, &record, quiet);
1198 
1199 	if (dataset[0] != '\0' && domount)
1200 		ret = (zfs_mount(zhp, NULL, 0) != 0);
1201 
1202 	libzfs_fini(g_zfs);
1203 
1204 	return (ret);
1205 }
1206