xref: /illumos-gate/usr/src/boot/sys/cddl/boot/zfs/zfssubr.c (revision 5f82aa32fbc5dc2c59bca6ff315f44a4c4c9ea86)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cdefs.h>
27 
/*
 * CRC-64 lookup table for the ZAP hash; filled in lazily by
 * zfs_init_crc() below.
 */
static uint64_t zfs_crc64_table[256];

/* Local checksum-failure error code used by this file. */
#define	ECKSUM	666

/* Assertions are compiled away in the boot environment. */
#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

/* Print the message and spin forever; the loader has no way to unwind. */
#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)

/* Map kernel kmem allocator calls onto the loader's zfs_alloc()/zfs_free(). */
#define	kmem_alloc(size, flag)	zfs_alloc((size))
#define	kmem_free(ptr, size)	zfs_free((ptr), (size))
45 
46 static void
47 zfs_init_crc(void)
48 {
49 	int i, j;
50 	uint64_t *ct;
51 
52 	/*
53 	 * Calculate the crc64 table (used for the zap hash
54 	 * function).
55 	 */
56 	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
57 		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
58 		for (i = 0; i < 256; i++)
59 			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
60 				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
61 	}
62 }
63 
/*
 * The "off" checksum algorithm: always produce an all-zero checksum.
 * buf, size and ctx_template are ignored.
 */
static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
70 
/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
/* Build a checksum context template from the pool's checksum salt. */
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
/* Release a context template created by a zio_checksum_tmpl_init_t. */
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

/* Per-algorithm property flags, recorded in zio_checksum_table below. */
typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	/* checksum function for each byteorder (native, byteswapped) */
	zio_checksum_t			*ci_func[2];
	zio_checksum_tmpl_init_t	*ci_tmpl_init;
	zio_checksum_tmpl_free_t	*ci_tmpl_free;
	zio_checksum_flags_t		ci_flags;
	const char			*ci_name;	/* descriptive name */
} zio_checksum_info_t;
103 
104 #include "blkptr.c"
105 
106 #include "fletcher.c"
107 #include "sha256.c"
108 
/*
 * Checksum algorithm table, indexed by the ZIO_CHECKSUM_* value stored
 * in the block pointer (see zio_checksum_verify()).  Entries whose
 * ci_func slots are NULL are not supported by the boot loader.
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	/* no skein and edonr for now */
	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
	    ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_SALTED |
	    ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
};
140 
/*
 * Common signature for all zio compress/decompress functions.
 * The final int is the algorithm's level parameter (ci_level below).
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/* gzip decompression is provided elsewhere in the loader. */
extern int gzip_decompress(void *src, void *dst,
    size_t s_len, size_t d_len, int);
/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;
160 
161 #include "lzjb.c"
162 #include "zle.c"
163 #include "lz4.c"
164 
/*
 * Compression vectors, indexed by ZIO_COMPRESS_* value.  The boot
 * loader only reads, so only decompression functions are provided
 * (all ci_compress slots are NULL).
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL,			NULL,			0,	"inherit"},
	{NULL,			NULL,			0,	"on"},
	{NULL,			NULL,			0,	"uncompressed"},
	{NULL,			lzjb_decompress,	0,	"lzjb"},
	{NULL,			NULL,			0,	"empty"},
	{NULL,			gzip_decompress,	1,	"gzip-1"},
	{NULL,			gzip_decompress,	2,	"gzip-2"},
	{NULL,			gzip_decompress,	3,	"gzip-3"},
	{NULL,			gzip_decompress,	4,	"gzip-4"},
	{NULL,			gzip_decompress,	5,	"gzip-5"},
	{NULL,			gzip_decompress,	6,	"gzip-6"},
	{NULL,			gzip_decompress,	7,	"gzip-7"},
	{NULL,			gzip_decompress,	8,	"gzip-8"},
	{NULL,			gzip_decompress,	9,	"gzip-9"},
	{NULL,			zle_decompress,		64,	"zle"},
	{NULL,			lz4_decompress,		0,	"lz4"},
};
186 
/*
 * Byte-swap an array of uint64_t in place.  size is in bytes and must
 * be a multiple of 8.
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	size_t i;	/* size_t: matches count; int would truncate/warn */

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
199 
/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	/* The fourth checksum word is unused and left zero. */
	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}
214 
/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 * Only the first checksum word carries information here.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}
225 
/*
 * Calls the template init function of a checksum which supports context
 * templates and installs the template into the spa_t.
 *
 * In the boot loader this is currently a no-op: no entry in
 * zio_checksum_table has a ci_tmpl_init function, and the real
 * installation logic below is compiled out.
 */
static void
zio_checksum_template_init(enum zio_checksum checksum, const spa_t *spa)
{
	zio_checksum_info_t *ci = &zio_checksum_table[checksum];

	if (ci->ci_tmpl_init == NULL)
		return;
#if 0	/* for now we don't have anything here */
	if (spa->spa_cksum_tmpls[checksum] != NULL)
		return;

	VERIFY(ci->ci_tmpl_free != NULL);
	mutex_enter(&spa->spa_cksum_tmpls_lock);
	if (spa->spa_cksum_tmpls[checksum] == NULL) {
		spa->spa_cksum_tmpls[checksum] =
		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
		VERIFY(spa->spa_cksum_tmpls[checksum] != NULL);
	}
	mutex_exit(&spa->spa_cksum_tmpls_lock);
#endif
}
251 
/*
 * Verify the checksum recorded for bp against the checksum of data
 * (whose length is BP_GET_PSIZE(bp)).  Returns 0 on match, EINVAL for
 * an unknown or unsupported checksum algorithm, and EIO on mismatch.
 */
static int
zio_checksum_verify(const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	/* Range-check the on-disk value before indexing the table. */
	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	zio_checksum_template_init(checksum, NULL);
	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		/*
		 * The expected checksum is embedded in the data itself,
		 * in a zio_eck_t at the end of the buffer.  While the
		 * checksum is computed, that slot must hold the
		 * "verifier" value instead, so swap it in temporarily
		 * and restore the on-disk checksum afterwards.
		 */
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			/*
			 * Other EMBEDDED checksums (zilog/zillog2 per
			 * zio_checksum_table) use bp's checksum as the
			 * verifier.  (ASSERT above compiles away.)
			 */
			verifier = bp->blk_cksum;

		/* The embedded magic reveals the writer's byte order. */
		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, NULL, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		/* Non-embedded: the block pointer carries the checksum. */
		expected_cksum = bp->blk_cksum;
		ci->ci_func[0](data, size, NULL, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum failed\n");*/
		return (EIO);
	}

	return (0);
}
312 
313 static int
314 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
315 	void *dest, uint64_t destsize)
316 {
317 	zio_compress_info_t *ci;
318 
319 	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
320 		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
321 		return (EIO);
322 	}
323 
324 	ci = &zio_compress_table[cpfunc];
325 	if (!ci->ci_decompress) {
326 		printf("ZFS: unsupported compression algorithm %s\n",
327 		    ci->ci_name);
328 		return (EIO);
329 	}
330 
331 	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
332 }
333 
334 static uint64_t
335 zap_hash(uint64_t salt, const char *name)
336 {
337 	const uint8_t *cp;
338 	uint8_t c;
339 	uint64_t crc = salt;
340 
341 	ASSERT(crc != 0);
342 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
343 	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
344 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
345 
346 	/*
347 	 * Only use 28 bits, since we need 4 bits in the cookie for the
348 	 * collision differentiator.  We MUST use the high bits, since
349 	 * those are the onces that we first pay attention to when
350 	 * chosing the bucket.
351 	 */
352 	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
353 
354 	return (crc);
355 }
356 
/* Forward declarations; the loader supplies these allocators. */
static void *zfs_alloc(size_t size);
static void zfs_free(void *ptr, size_t size);

/* One column (one child device's portion) of a RAID-Z I/O. */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

/*
 * Geometry of a single RAID-Z I/O: rm_firstdatacol parity columns
 * followed by data columns, rm_cols columns in all.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
385 
/* Parity column indices within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/* Multiply one GF(2^8) byte by 2; MUL_4 is two doublings. */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

/* Multiply each byte of a 64-bit word by 4: two doublings. */
#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
412 
/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
/* vdev_raidz_pow2[i] == 2^i in GF(2^8); the last entry wraps back to 0x01. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/* vdev_raidz_log2[i] == log2(i) in GF(2^8); entry 0 is unused (no log of 0). */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
485 
486 /*
487  * Multiply a given number by 2 raised to the given power.
488  */
489 static uint8_t
490 vdev_raidz_exp2(uint8_t a, int exp)
491 {
492 	if (a == 0)
493 		return (0);
494 
495 	ASSERT(exp >= 0);
496 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
497 
498 	exp += vdev_raidz_log2[a];
499 	if (exp > 255)
500 		exp -= 255;
501 
502 	return (vdev_raidz_pow2[exp]);
503 }
504 
505 static void
506 vdev_raidz_generate_parity_p(raidz_map_t *rm)
507 {
508 	uint64_t *p, *src, pcount __attribute__((unused)), ccount, i;
509 	int c;
510 
511 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
512 
513 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
514 		src = rm->rm_col[c].rc_data;
515 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
516 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
517 
518 		if (c == rm->rm_firstdatacol) {
519 			ASSERT(ccount == pcount);
520 			for (i = 0; i < ccount; i++, src++, p++) {
521 				*p = *src;
522 			}
523 		} else {
524 			ASSERT(ccount <= pcount);
525 			for (i = 0; i < ccount; i++, src++, p++) {
526 				*p ^= *src;
527 			}
528 		}
529 	}
530 }
531 
/*
 * Generate P (plain XOR) and Q parity.  Q is accumulated Horner-style:
 * for every data column after the first, the running Q words are
 * multiplied by 2 in GF(2^8) (VDEV_RAIDZ_64MUL_2) before the new
 * column is XORed in.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First data column seeds both parity columns. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			/* Zero-fill parity beyond a short first column. */
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
583 
/*
 * Generate P, Q and R parity.  As in the PQ case, Q and R are built
 * Horner-style: for every data column after the first, Q is multiplied
 * by 2 and R by 4 in GF(2^8) before the new column is XORed in.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First data column seeds all three parity columns. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			/* Zero-fill parity beyond a short first column. */
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
644 
645 /*
646  * Generate RAID parity in the first virtual columns according to the number of
647  * parity columns available.
648  */
649 static void
650 vdev_raidz_generate_parity(raidz_map_t *rm)
651 {
652 	switch (rm->rm_firstdatacol) {
653 	case 1:
654 		vdev_raidz_generate_parity_p(rm);
655 		break;
656 	case 2:
657 		vdev_raidz_generate_parity_pq(rm);
658 		break;
659 	case 3:
660 		vdev_raidz_generate_parity_pqr(rm);
661 		break;
662 	default:
663 		panic("invalid RAID-Z configuration");
664 	}
665 }
666 
667 /* BEGIN CSTYLED */
668 /*
669  * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
671  * the contents of the data and parity disks. This can be expressed with
672  * vectors for the original data (D) and the actual data (d) and parity (p)
673  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
674  *
675  *            __   __                     __     __
676  *            |     |         __     __   |  p_0  |
677  *            |  V  |         |  D_0  |   | p_m-1 |
678  *            |     |    x    |   :   | = |  d_0  |
679  *            |  I  |         | D_n-1 |   |   :   |
680  *            |     |         ~~     ~~   | d_n-1 |
681  *            ~~   ~~                     ~~     ~~
682  *
683  * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
685  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
686  * computation as well as linear separability.
687  *
688  *      __               __               __     __
689  *      |   1   ..  1 1 1 |               |  p_0  |
690  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
691  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
692  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
693  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
694  *      |   :       : : : |   |   :   |   |  d_2  |
695  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
696  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
697  *      |   0   ..  0 0 1 |               | d_n-1 |
698  *      ~~               ~~               ~~     ~~
699  *
700  * Note that I, V, d, and p are known. To compute D, we must invert the
701  * matrix and use the known data and parity values to reconstruct the unknown
702  * data values. We begin by removing the rows in V|I and d|p that correspond
703  * to failed or missing columns; we then make V|I square (n x n) and d|p
704  * sized n by removing rows corresponding to unused parity from the bottom up
705  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
706  * using Gauss-Jordan elimination. In the example below we use m=3 parity
707  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
708  *           __                               __
709  *           |  1   1   1   1   1   1   1   1  |
710  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
711  *           |  19 205 116  29  64  16  4   1  |      / /
712  *           |  1   0   0   0   0   0   0   0  |     / /
713  *           |  0   1   0   0   0   0   0   0  | <--' /
714  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
715  *           |  0   0   0   1   0   0   0   0  |
716  *           |  0   0   0   0   1   0   0   0  |
717  *           |  0   0   0   0   0   1   0   0  |
718  *           |  0   0   0   0   0   0   1   0  |
719  *           |  0   0   0   0   0   0   0   1  |
720  *           ~~                               ~~
721  *           __                               __
722  *           |  1   1   1   1   1   1   1   1  |
723  *           | 128  64  32  16  8   4   2   1  |
724  *           |  19 205 116  29  64  16  4   1  |
725  *           |  1   0   0   0   0   0   0   0  |
726  *           |  0   1   0   0   0   0   0   0  |
727  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
728  *           |  0   0   0   1   0   0   0   0  |
729  *           |  0   0   0   0   1   0   0   0  |
730  *           |  0   0   0   0   0   1   0   0  |
731  *           |  0   0   0   0   0   0   1   0  |
732  *           |  0   0   0   0   0   0   0   1  |
733  *           ~~                               ~~
734  *
735  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
736  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
737  * matrix is not singular.
738  * __                                                                 __
739  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
740  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
741  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
742  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
743  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
744  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
745  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
746  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
747  * ~~                                                                 ~~
748  * __                                                                 __
749  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
750  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
751  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
752  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
753  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
754  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
755  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
756  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
757  * ~~                                                                 ~~
758  * __                                                                 __
759  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
760  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
761  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
762  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
763  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
764  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
765  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
766  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
767  * ~~                                                                 ~~
768  * __                                                                 __
769  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
770  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
771  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
772  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
773  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
774  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
775  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
776  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
777  * ~~                                                                 ~~
778  * __                                                                 __
779  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
780  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
781  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
782  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
783  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
784  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
785  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
786  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
787  * ~~                                                                 ~~
788  * __                                                                 __
789  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
790  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
791  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
792  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
793  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
794  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
795  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
796  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
797  * ~~                                                                 ~~
798  *                   __                               __
799  *                   |  0   0   1   0   0   0   0   0  |
800  *                   | 167 100  5   41 159 169 217 208 |
801  *                   | 166 100  4   40 158 168 216 209 |
802  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
803  *                   |  0   0   0   0   1   0   0   0  |
804  *                   |  0   0   0   0   0   1   0   0  |
805  *                   |  0   0   0   0   0   0   1   0  |
806  *                   |  0   0   0   0   0   0   0   1  |
807  *                   ~~                               ~~
808  *
809  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
810  * of the missing data.
811  *
812  * As is apparent from the example above, the only non-trivial rows in the
813  * inverse matrix correspond to the data disks that we're trying to
814  * reconstruct. Indeed, those are the only rows we need as the others would
815  * only be useful for reconstructing data known or assumed to be valid. For
816  * that reason, we only build the coefficients in the rows that correspond to
817  * targeted columns.
818  */
819 /* END CSTYLED */
820 
821 static void
822 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
823     uint8_t **rows)
824 {
825 	int i, j;
826 	int pow;
827 
828 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
829 
830 	/*
831 	 * Fill in the missing rows of interest.
832 	 */
833 	for (i = 0; i < nmap; i++) {
834 		ASSERT3S(0, <=, map[i]);
835 		ASSERT3S(map[i], <=, 2);
836 
837 		pow = map[i] * n;
838 		if (pow > 255)
839 			pow -= 255;
840 		ASSERT(pow <= 255);
841 
842 		for (j = 0; j < n; j++) {
843 			pow -= map[i];
844 			if (pow < 0)
845 				pow += 255;
846 			rows[i][j] = vdev_raidz_pow2[pow];
847 		}
848 	}
849 }
850 
/*
 * Reduce the rows of interest to the identity using Gauss-Jordan
 * elimination over GF(2^8) (where addition and subtraction are both
 * XOR), applying the same row operations to "invrows" so that on
 * return invrows holds the rows of the inverse matrix corresponding to
 * the missing data columns.  "used" lists the columns backing each
 * matrix column: parity columns first, then surviving data columns.
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			/* Start from the identity matrix. */
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			/* Move the known coefficient across; zero the row. */
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		/* The pivot must be non-zero for the matrix to be invertible. */
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/*
		 * Eliminate this row's pivot column from every other row
		 * of interest (XOR is subtraction in GF(2^8)).
		 */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}
944 
/*
 * Regenerate the missing data columns in place by multiplying the
 * inverse-matrix rows ("invrows") against the surviving columns listed
 * in "used".  The GF(2^8) multiplications are done in log space:
 * "invlog" caches the log of every inverse coefficient so each byte
 * costs one table lookup plus a modular add.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	/* Scratch space holding the logs of all inverse coefficients. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	/* Carve the single allocation into one row pointer per target. */
	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/* Accumulate each used column's contribution into the targets. */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		/* The first used column must cover the longest target. */
		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			/* log of 0 is undefined; only update for non-zero. */
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Skip targets shorter than this offset. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					/* Zero contributes nothing. */
					val = 0;
				} else {
					/* Multiply in log space, mod 255. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/* First column assigns; later ones XOR in. */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}
1020 
/*
 * Reconstruct an arbitrary combination of missing columns (up to
 * VDEV_RAIDZ_MAXPARITY of them, listed in "tgts") using the matrix
 * method described in the block comment above: build the relevant
 * rows, invert them, and multiply through the surviving columns.
 * Returns a bitmask ("code") of the parity columns that were used.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	/* n is the number of data columns in this map. */
	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		/* Record this parity column in the returned code mask. */
		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/* One allocation backs rows, invrows and the used[] array. */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* The first nmissing_rows used entries are the chosen parity. */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/*
	 * The remaining used entries are the surviving data columns;
	 * note that i continues from the loop above.
	 */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
1126 
/*
 * Merge the caller's sorted target list "t" with any columns already
 * marked in error, then hand the combined, still-sorted list to the
 * general reconstruction routine.  Returns the parity code used.
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	/* Start pessimistic and subtract columns proven good below. */
	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
			/* Explicitly requested target. */
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			/* Column already known to be in error. */
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
1168 
/*
 * Divide the logical I/O [offset, offset + size) into the per-child
 * columns of a RAID-Z map.  "unit_shift" is the device sector shift,
 * "dcols" the number of children and "nparity" the parity count.
 * Parity column buffers are allocated here; the data columns alias
 * successive portions of the caller's "data" buffer.
 */
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/* q: full-stripe rows per column; r: leftover data units. */
	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	/* bc: number of "big" columns that get one extra unit. */
	bc = (r == 0 ? 0 : r + nparity);
	/* tot: total units including parity. */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		/* Wrap around to the first child, one row further down. */
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Allocate buffers for the parity columns only. */
	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

	/* The first data column points at the caller's buffer ... */
	rm->rm_col[c].rc_data = data;

	/* ... and subsequent data columns follow it contiguously. */
	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
1286 
1287 static void
1288 vdev_raidz_map_free(raidz_map_t *rm)
1289 {
1290 	int c;
1291 
1292 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1293 		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
1294 
1295 	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
1296 }
1297 
1298 static vdev_t *
1299 vdev_child(vdev_t *pvd, uint64_t devidx)
1300 {
1301 	vdev_t *cvd;
1302 
1303 	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1304 		if (cvd->v_id == devidx)
1305 			break;
1306 	}
1307 
1308 	return (cvd);
1309 }
1310 
1311 /*
1312  * We keep track of whether or not there were any injected errors, so that
1313  * any ereports we generate can note it.
1314  */
1315 static int
1316 raidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size)
1317 {
1318 
1319 	return (zio_checksum_verify(bp, data));
1320 }
1321 
1322 /*
1323  * Generate the parity from the data columns. If we tried and were able to
1324  * read the parity without error, verify that the generated parity matches the
1325  * data we read. If it doesn't, we fire off a checksum error. Return the
1326  * number such failures.
1327  */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	/*
	 * Snapshot the parity we actually read; columns that were not
	 * tried or that errored get no snapshot and are skipped below.
	 */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zfs_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	/* Regenerate parity from the data columns, in place. */
	vdev_raidz_generate_parity(rm);

	/*
	 * Compare each snapshotted column against the regenerated
	 * parity; the skip condition must mirror the one above so only
	 * allocated snapshots are compared and freed.
	 */
	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		zfs_free(orig[c], rc->rc_size);
	}

	return (ret);
}
1358 
1359 /*
1360  * Iterate over all combinations of bad data and attempt a reconstruction.
1361  * Note that the algorithm below is non-optimal because it doesn't take into
1362  * account how reconstruction is actually performed. For example, with
1363  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1364  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1365  * cases we'd only use parity information in column 0.
1366  */
static int
vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
    off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	/* tstore has a slot on either side of tgts for the sentinels. */
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	/* Try 1 target, then 2, ... up to the parity still available. */
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		/* One new save buffer per additional target. */
		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		/* Enumerate every combination of n error-free columns. */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(bp, data, bytes) == 0) {
				/* Mark the reconstructed columns as bad. */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			/* Advance to the next combination (odometer-style). */
			do {
				/*
				 * Find the next valid column after the current
				 * position..
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/* The loop exits with n one past the last allocation; adjust. */
	n--;
done:
	/* Free all save buffers allocated by the iterations above. */
	for (i = n - 1; i >= 0; i--) {
		zfs_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
1509 
/*
 * Read a RAID-Z block: map the request onto the children, read the
 * data columns (plus parity when data is already known missing), and
 * attempt increasingly expensive reconstruction until the checksum in
 * "bp" verifies.  Returns 0 on success, or EIO/ECKSUM on failure.
 */
static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			/* Child is absent or unhealthy; don't issue I/O. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0		/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		/* Read data columns always; parity only if data is missing. */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	/* Re-derive the error tallies each time more columns are read. */
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(bp, data, bytes) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(bp, data, bytes) == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	/* n counts columns newly read without error. */
	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(rm, bp, data, offset, bytes,
	     total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}
1768