xref: /illumos-gate/usr/src/uts/common/io/vioblk/vioblk.c (revision a963a5aa3ca3777616e2475ae05969f48439c694)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
24  * Copyright (c) 2012, Alexey Zaytsev <alexey.zaytsev@gmail.com>
25  * Copyright 2020 Joyent Inc.
26  * Copyright 2019 Western Digital Corporation.
27  * Copyright 2020 Oxide Computer Company
28  */
29 
30 /*
31  * VIRTIO BLOCK DRIVER
32  *
33  * This driver provides support for Virtio Block devices.  Each driver instance
34  * attaches to a single underlying block device.
35  *
36  * REQUEST CHAIN LAYOUT
37  *
38  * Every request chain sent to the I/O queue has the following structure.  Each
39  * box in the diagram represents a descriptor entry (i.e., a DMA cookie) within
40  * the chain:
41  *
42  *    +-0-----------------------------------------+
43  *    | struct virtio_blk_hdr                     |-----------------------\
44  *    |   (written by driver, read by device)     |                       |
45  *    +-1-----------------------------------------+                       |
46  *    | optional data payload                     |--\                    |
47  *    |   (written by driver for write requests,  |  |                    |
48  *    |    or by device for read requests)        |  |                    |
49  *    +-2-----------------------------------------+  |                    |
50  *    | ,~`           :                              |-cookies loaned     |
51  *    |/              :                        ,~`|  | from blkdev        |
52  *                    :                       /   |  |                    |
53  *    +-(N - 1)-----------------------------------+  |                    |
54  *    | ... end of data payload.                  |  |                    |
55  *    |                                           |  |                    |
56  *    |                                           |--/                    |
57  *    +-N-----------------------------------------+                       |
58  *    | status byte                               |                       |
59  *    |   (written by device, read by driver)     |--------------------\  |
60  *    +-------------------------------------------+                    |  |
61  *                                                                     |  |
62  * The memory for the header and status bytes (i.e., 0 and N above)    |  |
63  * is allocated as a single chunk by vioblk_alloc_reqs():              |  |
64  *                                                                     |  |
65  *    +-------------------------------------------+                    |  |
66  *    | struct virtio_blk_hdr                     |<----------------------/
67  *    +-------------------------------------------+                    |
68  *    | status byte                               |<-------------------/
69  *    +-------------------------------------------+
70  */
71 
72 #include <sys/modctl.h>
73 #include <sys/blkdev.h>
74 #include <sys/types.h>
75 #include <sys/errno.h>
76 #include <sys/param.h>
77 #include <sys/stropts.h>
78 #include <sys/stream.h>
79 #include <sys/strsubr.h>
80 #include <sys/kmem.h>
81 #include <sys/conf.h>
82 #include <sys/devops.h>
83 #include <sys/ksynch.h>
84 #include <sys/stat.h>
85 #include <sys/modctl.h>
86 #include <sys/debug.h>
87 #include <sys/pci.h>
88 #include <sys/containerof.h>
89 #include <sys/ctype.h>
90 #include <sys/sysmacros.h>
91 #include <sys/dkioc_free_util.h>
92 
93 #include "virtio.h"
94 #include "vioblk.h"
95 
96 static void vioblk_get_id(vioblk_t *);
97 uint_t vioblk_int_handler(caddr_t, caddr_t);
98 static uint_t vioblk_poll(vioblk_t *);
99 static int vioblk_quiesce(dev_info_t *);
100 static int vioblk_attach(dev_info_t *, ddi_attach_cmd_t);
101 static int vioblk_detach(dev_info_t *, ddi_detach_cmd_t);
102 
103 
104 static struct dev_ops vioblk_dev_ops = {
105 	.devo_rev =			DEVO_REV,
106 	.devo_refcnt =			0,
107 
108 	.devo_attach =			vioblk_attach,
109 	.devo_detach =			vioblk_detach,
110 	.devo_quiesce =			vioblk_quiesce,
111 
112 	.devo_getinfo =			ddi_no_info,
113 	.devo_identify =		nulldev,
114 	.devo_probe =			nulldev,
115 	.devo_reset =			nodev,
116 	.devo_cb_ops =			NULL,
117 	.devo_bus_ops =			NULL,
118 	.devo_power =			NULL,
119 };
120 
121 static struct modldrv vioblk_modldrv = {
122 	.drv_modops =			&mod_driverops,
123 	.drv_linkinfo =			"VIRTIO block driver",
124 	.drv_dev_ops =			&vioblk_dev_ops
125 };
126 
127 static struct modlinkage vioblk_modlinkage = {
128 	.ml_rev =			MODREV_1,
129 	.ml_linkage =			{ &vioblk_modldrv, NULL }
130 };
131 
132 /*
133  * DMA attribute template for header and status blocks.  We also make a
134  * per-instance copy of this template with negotiated sizes from the device for
135  * blkdev.
136  */
137 static const ddi_dma_attr_t vioblk_dma_attr = {
138 	.dma_attr_version =		DMA_ATTR_V0,
139 	.dma_attr_addr_lo =		0x0000000000000000,
140 	.dma_attr_addr_hi =		0xFFFFFFFFFFFFFFFF,
141 	.dma_attr_count_max =		0x00000000FFFFFFFF,
142 	.dma_attr_align =		1,
143 	.dma_attr_burstsizes =		1,
144 	.dma_attr_minxfer =		1,
145 	.dma_attr_maxxfer =		0x00000000FFFFFFFF,
146 	.dma_attr_seg =			0x00000000FFFFFFFF,
147 	.dma_attr_sgllen =		1,
148 	.dma_attr_granular =		1,
149 	.dma_attr_flags =		0
150 };
151 
152 static vioblk_req_t *
153 vioblk_req_alloc(vioblk_t *vib)
154 {
155 	vioblk_req_t *vbr;
156 
157 	VERIFY(MUTEX_HELD(&vib->vib_mutex));
158 
159 	if ((vbr = list_remove_head(&vib->vib_reqs)) == NULL) {
160 		return (NULL);
161 	}
162 	vib->vib_nreqs_alloc++;
163 
164 	VERIFY0(vbr->vbr_status);
165 	vbr->vbr_status |= VIOBLK_REQSTAT_ALLOCATED;
166 
167 	VERIFY3P(vbr->vbr_chain, !=, NULL);
168 	VERIFY3P(vbr->vbr_xfer, ==, NULL);
169 	VERIFY3S(vbr->vbr_error, ==, 0);
170 
171 	return (vbr);
172 }
173 
174 static void
175 vioblk_req_free(vioblk_t *vib, vioblk_req_t *vbr)
176 {
177 	VERIFY(MUTEX_HELD(&vib->vib_mutex));
178 
179 	/*
180 	 * Check that this request was allocated, then zero the status field to
181 	 * clear all status bits.
182 	 */
183 	VERIFY(vbr->vbr_status & VIOBLK_REQSTAT_ALLOCATED);
184 	vbr->vbr_status = 0;
185 
186 	vbr->vbr_xfer = NULL;
187 	vbr->vbr_error = 0;
188 	vbr->vbr_type = 0;
189 	virtio_chain_clear(vbr->vbr_chain);
190 
191 	list_insert_head(&vib->vib_reqs, vbr);
192 
193 	VERIFY3U(vib->vib_nreqs_alloc, >, 0);
194 	vib->vib_nreqs_alloc--;
195 }
196 
197 static void
198 vioblk_complete(vioblk_t *vib, vioblk_req_t *vbr)
199 {
200 	VERIFY(MUTEX_HELD(&vib->vib_mutex));
201 
202 	VERIFY(!(vbr->vbr_status & VIOBLK_REQSTAT_COMPLETE));
203 	vbr->vbr_status |= VIOBLK_REQSTAT_COMPLETE;
204 
205 	if (vbr->vbr_type == VIRTIO_BLK_T_FLUSH) {
206 		vib->vib_stats->vbs_rw_cacheflush.value.ui64++;
207 	}
208 
209 	if (vbr->vbr_xfer != NULL) {
210 		/*
211 		 * This is a blkdev framework request.
212 		 */
213 		mutex_exit(&vib->vib_mutex);
214 		bd_xfer_done(vbr->vbr_xfer, vbr->vbr_error);
215 		mutex_enter(&vib->vib_mutex);
216 		vbr->vbr_xfer = NULL;
217 	}
218 }
219 
220 static vioblk_req_t *
221 vioblk_common_start(vioblk_t *vib, int type, uint64_t sector,
222     boolean_t polled)
223 {
224 	vioblk_req_t *vbr = NULL;
225 
226 	if ((vbr = vioblk_req_alloc(vib)) == NULL) {
227 		vib->vib_stats->vbs_rw_outofmemory.value.ui64++;
228 		return (NULL);
229 	}
230 	vbr->vbr_type = type;
231 
232 	if (polled) {
233 		/*
234 		 * Mark this command as polled so that we can wait on it
235 		 * ourselves.
236 		 */
237 		vbr->vbr_status |= VIOBLK_REQSTAT_POLLED;
238 	}
239 
240 	struct vioblk_req_hdr vbh;
241 	vbh.vbh_type = type;
242 	vbh.vbh_ioprio = 0;
243 	vbh.vbh_sector = (sector * vib->vib_blk_size) / DEV_BSIZE;
244 	bcopy(&vbh, virtio_dma_va(vbr->vbr_dma, 0), sizeof (vbh));
245 
246 	/*
247 	 * Put the header in the first descriptor.  See the block comment at
248 	 * the top of the file for more details on the chain layout.
249 	 */
250 	if (virtio_chain_append(vbr->vbr_chain,
251 	    virtio_dma_cookie_pa(vbr->vbr_dma, 0),
252 	    sizeof (struct vioblk_req_hdr), VIRTIO_DIR_DEVICE_READS) !=
253 	    DDI_SUCCESS) {
254 		vioblk_req_free(vib, vbr);
255 		return (NULL);
256 	}
257 
258 	return (vbr);
259 }
260 
261 static int
262 vioblk_common_submit(vioblk_t *vib, vioblk_req_t *vbr)
263 {
264 	virtio_chain_t *vic = vbr->vbr_chain;
265 	int r;
266 
267 	VERIFY(MUTEX_HELD(&vib->vib_mutex));
268 
269 	/*
270 	 * The device will write the status byte into this last descriptor.
271 	 * See the block comment at the top of the file for more details on the
272 	 * chain layout.
273 	 */
274 	if (virtio_chain_append(vic, virtio_dma_cookie_pa(vbr->vbr_dma, 0) +
275 	    sizeof (struct vioblk_req_hdr), sizeof (uint8_t),
276 	    VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
277 		vioblk_req_free(vib, vbr);
278 		return (ENOMEM);
279 	}
280 
281 	virtio_dma_sync(vbr->vbr_dma, DDI_DMA_SYNC_FORDEV);
282 	virtio_chain_submit(vic, B_TRUE);
283 
284 	if (!(vbr->vbr_status & VIOBLK_REQSTAT_POLLED)) {
285 		/*
286 		 * This is not a polled request.  Our request will be freed and
287 		 * the caller notified later in vioblk_poll().
288 		 */
289 		return (0);
290 	}
291 
292 	/*
293 	 * This is a polled request.  We need to block here and wait for the
294 	 * device to complete request processing.
295 	 */
296 	while (!(vbr->vbr_status & VIOBLK_REQSTAT_POLL_COMPLETE)) {
297 		if (ddi_in_panic()) {
298 			/*
299 			 * When panicking, interrupts are disabled.  We must
300 			 * poll the queue manually.
301 			 */
302 			drv_usecwait(10);
303 			(void) vioblk_poll(vib);
304 			continue;
305 		}
306 
307 		/*
308 		 * When not panicking, the device will interrupt on command
309 		 * completion and vioblk_poll() will be called to wake us up.
310 		 */
311 		cv_wait(&vib->vib_cv, &vib->vib_mutex);
312 	}
313 
314 	vioblk_complete(vib, vbr);
315 	r = vbr->vbr_error;
316 	vioblk_req_free(vib, vbr);
317 	return (r);
318 }
319 
320 static int
321 vioblk_internal(vioblk_t *vib, int type, virtio_dma_t *dma,
322     uint64_t sector, virtio_direction_t dir)
323 {
324 	vioblk_req_t *vbr;
325 
326 	VERIFY(MUTEX_HELD(&vib->vib_mutex));
327 
328 	/*
329 	 * Allocate a polled request.
330 	 */
331 	if ((vbr = vioblk_common_start(vib, type, sector, B_TRUE)) == NULL) {
332 		return (ENOMEM);
333 	}
334 
335 	/*
336 	 * If there is a request payload, it goes between the header and the
337 	 * status byte.  See the block comment at the top of the file for more
338 	 * detail on the chain layout.
339 	 */
340 	if (dma != NULL) {
341 		virtio_chain_t *vic = vbr->vbr_chain;
342 		for (uint_t n = 0; n < virtio_dma_ncookies(dma); n++) {
343 			if (virtio_chain_append(vic,
344 			    virtio_dma_cookie_pa(dma, n),
345 			    virtio_dma_cookie_size(dma, n), dir) !=
346 			    DDI_SUCCESS) {
347 				vioblk_req_free(vib, vbr);
348 				return (ENOMEM);
349 			}
350 		}
351 	}
352 
353 	return (vioblk_common_submit(vib, vbr));
354 }
355 
356 static int
357 vioblk_map_discard(vioblk_t *vib, virtio_chain_t *vic, const bd_xfer_t *xfer)
358 {
359 	const dkioc_free_list_t *dfl = xfer->x_dfl;
360 	const dkioc_free_list_ext_t *exts = dfl->dfl_exts;
361 	virtio_dma_t *dma = NULL;
362 	struct vioblk_discard_write_zeroes *wzp = NULL;
363 
364 	dma = virtio_dma_alloc(vib->vib_virtio,
365 	    dfl->dfl_num_exts * sizeof (*wzp), &vioblk_dma_attr,
366 	    DDI_DMA_CONSISTENT | DDI_DMA_WRITE, KM_SLEEP);
367 	if (dma == NULL)
368 		return (ENOMEM);
369 
370 	wzp = virtio_dma_va(dma, 0);
371 
372 	for (uint64_t i = 0; i < dfl->dfl_num_exts; i++, exts++, wzp++) {
373 		uint64_t start = dfl->dfl_offset + exts->dfle_start;
374 
375 		const struct vioblk_discard_write_zeroes vdwz = {
376 			.vdwz_sector = start >> DEV_BSHIFT,
377 			.vdwz_num_sectors = exts->dfle_length >> DEV_BSHIFT,
378 			.vdwz_flags = 0
379 		};
380 
381 		bcopy(&vdwz, wzp, sizeof (*wzp));
382 	}
383 
384 	if (virtio_chain_append(vic,
385 	    virtio_dma_cookie_pa(dma, 0),
386 	    virtio_dma_cookie_size(dma, 0),
387 	    VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
388 		virtio_dma_free(dma);
389 		return (ENOMEM);
390 	}
391 
392 	return (0);
393 }
394 
395 static int
396 vioblk_request(vioblk_t *vib, bd_xfer_t *xfer, int type)
397 {
398 	vioblk_req_t *vbr = NULL;
399 	uint_t total_cookies = 2;
400 	boolean_t polled = (xfer->x_flags & BD_XFER_POLL) != 0;
401 
402 	VERIFY(MUTEX_HELD(&vib->vib_mutex));
403 
404 	/*
405 	 * Ensure that this request falls within the advertised size of the
406 	 * block device.  Be careful to avoid overflow.
407 	 */
408 	if (xfer->x_nblks > SIZE_MAX - xfer->x_blkno ||
409 	    (xfer->x_blkno + xfer->x_nblks) > vib->vib_nblks) {
410 		vib->vib_stats->vbs_rw_badoffset.value.ui64++;
411 		return (EINVAL);
412 	}
413 
414 	if ((vbr = vioblk_common_start(vib, type, xfer->x_blkno, polled)) ==
415 	    NULL) {
416 		return (ENOMEM);
417 	}
418 	vbr->vbr_xfer = xfer;
419 
420 	/*
421 	 * If there is a request payload, it goes between the header and the
422 	 * status byte.  See the block comment at the top of the file for more
423 	 * detail on the chain layout.
424 	 */
425 	if ((type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_OUT) &&
426 	    xfer->x_nblks > 0) {
427 		virtio_direction_t dir = (type == VIRTIO_BLK_T_OUT) ?
428 		    VIRTIO_DIR_DEVICE_READS : VIRTIO_DIR_DEVICE_WRITES;
429 		virtio_chain_t *vic = vbr->vbr_chain;
430 
431 		for (uint_t n = 0; n < xfer->x_ndmac; n++) {
432 			ddi_dma_cookie_t dmac;
433 
434 			if (n == 0) {
435 				/*
436 				 * The first cookie is in the blkdev request.
437 				 */
438 				dmac = xfer->x_dmac;
439 			} else {
440 				ddi_dma_nextcookie(xfer->x_dmah, &dmac);
441 			}
442 
443 			if (virtio_chain_append(vic, dmac.dmac_laddress,
444 			    dmac.dmac_size, dir) != DDI_SUCCESS) {
445 				vioblk_req_free(vib, vbr);
446 				return (ENOMEM);
447 			}
448 		}
449 
450 		total_cookies += xfer->x_ndmac;
451 
452 	} else if (xfer->x_nblks > 0) {
453 		dev_err(vib->vib_dip, CE_PANIC,
454 		    "request of type %d had payload length of %lu blocks", type,
455 		    xfer->x_nblks);
456 	} else if (type == VIRTIO_BLK_T_DISCARD) {
457 		int r = vioblk_map_discard(vib, vbr->vbr_chain, xfer);
458 		if (r != 0) {
459 			vioblk_req_free(vib, vbr);
460 			return (r);
461 		}
462 	}
463 
464 	if (vib->vib_stats->vbs_rw_cookiesmax.value.ui32 < total_cookies) {
465 		vib->vib_stats->vbs_rw_cookiesmax.value.ui32 = total_cookies;
466 	}
467 
468 	return (vioblk_common_submit(vib, vbr));
469 }
470 
471 static int
472 vioblk_bd_read(void *arg, bd_xfer_t *xfer)
473 {
474 	vioblk_t *vib = arg;
475 	int r;
476 
477 	mutex_enter(&vib->vib_mutex);
478 	r = vioblk_request(vib, xfer, VIRTIO_BLK_T_IN);
479 	mutex_exit(&vib->vib_mutex);
480 
481 	return (r);
482 }
483 
484 static int
485 vioblk_bd_write(void *arg, bd_xfer_t *xfer)
486 {
487 	vioblk_t *vib = arg;
488 	int r;
489 
490 	mutex_enter(&vib->vib_mutex);
491 	r = vioblk_request(vib, xfer, VIRTIO_BLK_T_OUT);
492 	mutex_exit(&vib->vib_mutex);
493 
494 	return (r);
495 }
496 
497 static int
498 vioblk_bd_flush(void *arg, bd_xfer_t *xfer)
499 {
500 	vioblk_t *vib = arg;
501 	int r;
502 
503 	mutex_enter(&vib->vib_mutex);
504 	if (!virtio_feature_present(vib->vib_virtio, VIRTIO_BLK_F_FLUSH)) {
505 		/*
506 		 * We don't really expect to get here, because if we did not
507 		 * negotiate the flush feature we would not have installed this
508 		 * function in the blkdev ops vector.
509 		 */
510 		mutex_exit(&vib->vib_mutex);
511 		return (ENOTSUP);
512 	}
513 
514 	r = vioblk_request(vib, xfer, VIRTIO_BLK_T_FLUSH);
515 	mutex_exit(&vib->vib_mutex);
516 
517 	return (r);
518 }
519 
520 static void
521 vioblk_bd_driveinfo(void *arg, bd_drive_t *drive)
522 {
523 	vioblk_t *vib = arg;
524 
525 	drive->d_qsize = vib->vib_reqs_capacity;
526 	drive->d_removable = B_FALSE;
527 	drive->d_hotpluggable = B_TRUE;
528 	drive->d_target = 0;
529 	drive->d_lun = 0;
530 
531 	drive->d_vendor = "Virtio";
532 	drive->d_vendor_len = strlen(drive->d_vendor);
533 
534 	drive->d_product = "Block Device";
535 	drive->d_product_len = strlen(drive->d_product);
536 
537 	drive->d_serial = vib->vib_devid;
538 	drive->d_serial_len = strlen(drive->d_serial);
539 
540 	drive->d_revision = "0000";
541 	drive->d_revision_len = strlen(drive->d_revision);
542 
543 	if (vib->vib_can_discard) {
544 		drive->d_free_align = vib->vib_discard_sector_align;
545 		drive->d_max_free_seg = vib->vib_max_discard_seg;
546 		drive->d_max_free_blks = vib->vib_max_discard_sectors;
547 		/*
548 		 * The virtio 1.1 spec doesn't specify a per segment sector
549 		 * limit for discards -- only a limit on the total sectors in
550 		 * a discard request. Therefore, we assume a vioblk device must
551 		 * be able to accept a single segment of vib_max_discard_sectors
552 		 * (when it supports discard requests) and use
553 		 * vib_max_discard_sectors both for the overall limit for
554 		 * a discard request, but also as the limit for a single
555 		 * segment. blkdev will ensure we are never called with
556 		 * a dkioc_free_list_t that violates either limit.
557 		 */
558 		drive->d_max_free_seg_blks = vib->vib_max_discard_sectors;
559 	}
560 }
561 
562 static int
563 vioblk_bd_mediainfo(void *arg, bd_media_t *media)
564 {
565 	vioblk_t *vib = (void *)arg;
566 
567 	/*
568 	 * The device protocol is specified in terms of 512 byte logical
569 	 * blocks, regardless of the recommended I/O size which might be
570 	 * larger.
571 	 */
572 	media->m_nblks = vib->vib_nblks;
573 	media->m_blksize = vib->vib_blk_size;
574 
575 	media->m_readonly = vib->vib_readonly;
576 	media->m_pblksize = vib->vib_pblk_size;
577 	return (0);
578 }
579 
580 static void
581 vioblk_get_id(vioblk_t *vib)
582 {
583 	virtio_dma_t *dma;
584 	int r;
585 
586 	if ((dma = virtio_dma_alloc(vib->vib_virtio, VIRTIO_BLK_ID_BYTES,
587 	    &vioblk_dma_attr, DDI_DMA_CONSISTENT | DDI_DMA_READ,
588 	    KM_SLEEP)) == NULL) {
589 		return;
590 	}
591 
592 	mutex_enter(&vib->vib_mutex);
593 	if ((r = vioblk_internal(vib, VIRTIO_BLK_T_GET_ID, dma, 0,
594 	    VIRTIO_DIR_DEVICE_WRITES)) == 0) {
595 		const char *b = virtio_dma_va(dma, 0);
596 		uint_t pos = 0;
597 
598 		/*
599 		 * Save the entire response for debugging purposes.
600 		 */
601 		bcopy(virtio_dma_va(dma, 0), vib->vib_rawid,
602 		    VIRTIO_BLK_ID_BYTES);
603 
604 		/*
605 		 * Process the returned ID.
606 		 */
607 		bzero(vib->vib_devid, sizeof (vib->vib_devid));
608 		for (uint_t n = 0; n < VIRTIO_BLK_ID_BYTES; n++) {
609 			if (isalnum(b[n]) || b[n] == '-' || b[n] == '_') {
610 				/*
611 				 * Accept a subset of printable ASCII
612 				 * characters.
613 				 */
614 				vib->vib_devid[pos++] = b[n];
615 			} else {
616 				/*
617 				 * Stop processing at the first sign of
618 				 * trouble.
619 				 */
620 				break;
621 			}
622 		}
623 
624 		vib->vib_devid_fetched = B_TRUE;
625 	}
626 	mutex_exit(&vib->vib_mutex);
627 
628 	virtio_dma_free(dma);
629 }
630 
631 static int
632 vioblk_bd_devid(void *arg, dev_info_t *dip, ddi_devid_t *devid)
633 {
634 	vioblk_t *vib = arg;
635 	size_t len;
636 
637 	if ((len = strlen(vib->vib_devid)) == 0) {
638 		/*
639 		 * The device has no ID.
640 		 */
641 		return (DDI_FAILURE);
642 	}
643 
644 	return (ddi_devid_init(dip, DEVID_ATA_SERIAL, len, vib->vib_devid,
645 	    devid));
646 }
647 
648 static int
649 vioblk_bd_free_space(void *arg, bd_xfer_t *xfer)
650 {
651 	vioblk_t *vib = arg;
652 	int r = 0;
653 
654 	/*
655 	 * Since vib_can_discard is write once (and set during attach),
656 	 * we can check if it's enabled without taking the mutex.
657 	 */
658 	if (!vib->vib_can_discard) {
659 		return (ENOTSUP);
660 	}
661 
662 	mutex_enter(&vib->vib_mutex);
663 	r = vioblk_request(vib, xfer, VIRTIO_BLK_T_DISCARD);
664 	mutex_exit(&vib->vib_mutex);
665 
666 	return (r);
667 }
668 
669 /*
670  * As the device completes processing of a request, it returns the chain for
671  * that request to our I/O queue.  This routine is called in two contexts:
672  *   - from the interrupt handler, in response to notification from the device
673  *   - synchronously in line with request processing when panicking
674  */
675 static uint_t
676 vioblk_poll(vioblk_t *vib)
677 {
678 	virtio_chain_t *vic;
679 	uint_t count = 0;
680 	boolean_t wakeup = B_FALSE;
681 
682 	VERIFY(MUTEX_HELD(&vib->vib_mutex));
683 
684 	while ((vic = virtio_queue_poll(vib->vib_vq)) != NULL) {
685 		vioblk_req_t *vbr = virtio_chain_data(vic);
686 		uint8_t status;
687 
688 		virtio_dma_sync(vbr->vbr_dma, DDI_DMA_SYNC_FORCPU);
689 
690 		bcopy(virtio_dma_va(vbr->vbr_dma,
691 		    sizeof (struct vioblk_req_hdr)), &status, sizeof (status));
692 
693 		switch (status) {
694 		case VIRTIO_BLK_S_OK:
695 			vbr->vbr_error = 0;
696 			break;
697 		case VIRTIO_BLK_S_IOERR:
698 			vbr->vbr_error = EIO;
699 			vib->vib_stats->vbs_io_errors.value.ui64++;
700 			break;
701 		case VIRTIO_BLK_S_UNSUPP:
702 			vbr->vbr_error = ENOTTY;
703 			vib->vib_stats->vbs_unsupp_errors.value.ui64++;
704 			break;
705 		default:
706 			vbr->vbr_error = ENXIO;
707 			vib->vib_stats->vbs_nxio_errors.value.ui64++;
708 			break;
709 		}
710 
711 		count++;
712 
713 		if (vbr->vbr_status & VIOBLK_REQSTAT_POLLED) {
714 			/*
715 			 * This request must not be freed as it is being held
716 			 * by a call to vioblk_common_submit().
717 			 */
718 			VERIFY(!(vbr->vbr_status &
719 			    VIOBLK_REQSTAT_POLL_COMPLETE));
720 			vbr->vbr_status |= VIOBLK_REQSTAT_POLL_COMPLETE;
721 			wakeup = B_TRUE;
722 			continue;
723 		}
724 
725 		vioblk_complete(vib, vbr);
726 
727 		vioblk_req_free(vib, vbr);
728 	}
729 
730 	if (wakeup) {
731 		/*
732 		 * Signal anybody waiting for polled command completion.
733 		 */
734 		cv_broadcast(&vib->vib_cv);
735 	}
736 
737 	return (count);
738 }
739 
740 uint_t
741 vioblk_int_handler(caddr_t arg0, caddr_t arg1)
742 {
743 	vioblk_t *vib = (vioblk_t *)arg0;
744 	uint_t count;
745 
746 	mutex_enter(&vib->vib_mutex);
747 	if ((count = vioblk_poll(vib)) >
748 	    vib->vib_stats->vbs_intr_queuemax.value.ui32) {
749 		vib->vib_stats->vbs_intr_queuemax.value.ui32 = count;
750 	}
751 
752 	vib->vib_stats->vbs_intr_total.value.ui64++;
753 	mutex_exit(&vib->vib_mutex);
754 
755 	return (DDI_INTR_CLAIMED);
756 }
757 
758 static void
759 vioblk_free_reqs(vioblk_t *vib)
760 {
761 	VERIFY3U(vib->vib_nreqs_alloc, ==, 0);
762 
763 	for (uint_t i = 0; i < vib->vib_reqs_capacity; i++) {
764 		struct vioblk_req *vbr = &vib->vib_reqs_mem[i];
765 
766 		VERIFY(list_link_active(&vbr->vbr_link));
767 		list_remove(&vib->vib_reqs, vbr);
768 
769 		VERIFY0(vbr->vbr_status);
770 
771 		if (vbr->vbr_chain != NULL) {
772 			virtio_chain_free(vbr->vbr_chain);
773 			vbr->vbr_chain = NULL;
774 		}
775 		if (vbr->vbr_dma != NULL) {
776 			virtio_dma_free(vbr->vbr_dma);
777 			vbr->vbr_dma = NULL;
778 		}
779 	}
780 	VERIFY(list_is_empty(&vib->vib_reqs));
781 
782 	if (vib->vib_reqs_mem != NULL) {
783 		kmem_free(vib->vib_reqs_mem,
784 		    sizeof (struct vioblk_req) * vib->vib_reqs_capacity);
785 		vib->vib_reqs_mem = NULL;
786 		vib->vib_reqs_capacity = 0;
787 	}
788 }
789 
790 static int
791 vioblk_alloc_reqs(vioblk_t *vib)
792 {
793 	vib->vib_reqs_capacity = MIN(virtio_queue_size(vib->vib_vq),
794 	    VIRTIO_BLK_REQ_BUFS);
795 	vib->vib_reqs_mem = kmem_zalloc(
796 	    sizeof (struct vioblk_req) * vib->vib_reqs_capacity, KM_SLEEP);
797 	vib->vib_nreqs_alloc = 0;
798 
799 	for (uint_t i = 0; i < vib->vib_reqs_capacity; i++) {
800 		list_insert_tail(&vib->vib_reqs, &vib->vib_reqs_mem[i]);
801 	}
802 
803 	for (vioblk_req_t *vbr = list_head(&vib->vib_reqs); vbr != NULL;
804 	    vbr = list_next(&vib->vib_reqs, vbr)) {
805 		if ((vbr->vbr_dma = virtio_dma_alloc(vib->vib_virtio,
806 		    sizeof (struct vioblk_req_hdr) + sizeof (uint8_t),
807 		    &vioblk_dma_attr, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
808 		    KM_SLEEP)) == NULL) {
809 			goto fail;
810 		}
811 		vbr->vbr_chain = virtio_chain_alloc(vib->vib_vq, KM_SLEEP);
812 		if (vbr->vbr_chain == NULL) {
813 			goto fail;
814 		}
815 		virtio_chain_data_set(vbr->vbr_chain, vbr);
816 	}
817 
818 	return (0);
819 
820 fail:
821 	vioblk_free_reqs(vib);
822 	return (ENOMEM);
823 }
824 
825 static int
826 vioblk_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
827 {
828 	int instance = ddi_get_instance(dip);
829 	vioblk_t *vib;
830 	virtio_t *vio;
831 	boolean_t did_mutex = B_FALSE;
832 
833 	if (cmd != DDI_ATTACH) {
834 		return (DDI_FAILURE);
835 	}
836 
837 	if ((vio = virtio_init(dip, VIRTIO_BLK_WANTED_FEATURES, B_TRUE)) ==
838 	    NULL) {
839 		dev_err(dip, CE_WARN, "failed to start Virtio init");
840 		return (DDI_FAILURE);
841 	}
842 
843 	vib = kmem_zalloc(sizeof (*vib), KM_SLEEP);
844 	vib->vib_dip = dip;
845 	vib->vib_virtio = vio;
846 	ddi_set_driver_private(dip, vib);
847 	list_create(&vib->vib_reqs, sizeof (vioblk_req_t),
848 	    offsetof(vioblk_req_t, vbr_link));
849 
850 	/*
851 	 * Determine how many scatter-gather entries we can use in a single
852 	 * request.
853 	 */
854 	vib->vib_seg_max = VIRTIO_BLK_DEFAULT_MAX_SEG;
855 	if (virtio_feature_present(vio, VIRTIO_BLK_F_SEG_MAX)) {
856 		vib->vib_seg_max = virtio_dev_get32(vio,
857 		    VIRTIO_BLK_CONFIG_SEG_MAX);
858 
859 		if (vib->vib_seg_max == 0 || vib->vib_seg_max == PCI_EINVAL32) {
860 			/*
861 			 * We need to be able to use at least one data segment,
862 			 * so we'll assume that this device is just poorly
863 			 * implemented and try for one.
864 			 */
865 			vib->vib_seg_max = 1;
866 		}
867 	}
868 
869 	if (virtio_feature_present(vio, VIRTIO_BLK_F_DISCARD)) {
870 		vib->vib_max_discard_sectors = virtio_dev_get32(vio,
871 		    VIRTIO_BLK_CONFIG_MAX_DISCARD_SECT);
872 		vib->vib_max_discard_seg = virtio_dev_get32(vio,
873 		    VIRTIO_BLK_CONFIG_MAX_DISCARD_SEG);
874 		vib->vib_discard_sector_align = virtio_dev_get32(vio,
875 		    VIRTIO_BLK_CONFIG_DISCARD_ALIGN);
876 
877 		if (vib->vib_max_discard_sectors == 0 ||
878 		    vib->vib_max_discard_seg == 0 ||
879 		    vib->vib_discard_sector_align == 0) {
880 			vib->vib_can_discard = B_FALSE;
881 
882 			/*
883 			 * The hypervisor shouldn't be giving us bad values.
884 			 * If it is, it's probably worth notifying the
885 			 * operator.
886 			 */
887 			dev_err(dip, CE_NOTE,
888 			    "Host is advertising DISCARD support but with bad"
889 			    "parameters: max_discard_sectors=%u, "
890 			    "max_discard_segments=%u, discard_sector_align=%u",
891 			    vib->vib_max_discard_sectors,
892 			    vib->vib_max_discard_seg,
893 			    vib->vib_discard_sector_align);
894 		} else {
895 			vib->vib_can_discard = B_TRUE;
896 		}
897 	}
898 
899 	/*
900 	 * When allocating the request queue, we include two additional
901 	 * descriptors (beyond those required for request data) to account for
902 	 * the header and the status byte.
903 	 */
904 	if ((vib->vib_vq = virtio_queue_alloc(vio, VIRTIO_BLK_VIRTQ_IO, "io",
905 	    vioblk_int_handler, vib, B_FALSE, vib->vib_seg_max + 2)) == NULL) {
906 		goto fail;
907 	}
908 
909 	if (virtio_init_complete(vio, 0) != DDI_SUCCESS) {
910 		dev_err(dip, CE_WARN, "failed to complete Virtio init");
911 		goto fail;
912 	}
913 
914 	cv_init(&vib->vib_cv, NULL, CV_DRIVER, NULL);
915 	mutex_init(&vib->vib_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));
916 	did_mutex = B_TRUE;
917 
918 	if ((vib->vib_kstat = kstat_create("vioblk", instance,
919 	    "statistics", "controller", KSTAT_TYPE_NAMED,
920 	    sizeof (struct vioblk_stats) / sizeof (kstat_named_t),
921 	    KSTAT_FLAG_PERSISTENT)) == NULL) {
922 		dev_err(dip, CE_WARN, "kstat_create failed");
923 		goto fail;
924 	}
925 	vib->vib_stats = (vioblk_stats_t *)vib->vib_kstat->ks_data;
926 	kstat_named_init(&vib->vib_stats->vbs_rw_outofmemory,
927 	    "total_rw_outofmemory", KSTAT_DATA_UINT64);
928 	kstat_named_init(&vib->vib_stats->vbs_rw_badoffset,
929 	    "total_rw_badoffset", KSTAT_DATA_UINT64);
930 	kstat_named_init(&vib->vib_stats->vbs_intr_total,
931 	    "total_intr", KSTAT_DATA_UINT64);
932 	kstat_named_init(&vib->vib_stats->vbs_io_errors,
933 	    "total_io_errors", KSTAT_DATA_UINT64);
934 	kstat_named_init(&vib->vib_stats->vbs_unsupp_errors,
935 	    "total_unsupp_errors", KSTAT_DATA_UINT64);
936 	kstat_named_init(&vib->vib_stats->vbs_nxio_errors,
937 	    "total_nxio_errors", KSTAT_DATA_UINT64);
938 	kstat_named_init(&vib->vib_stats->vbs_rw_cacheflush,
939 	    "total_rw_cacheflush", KSTAT_DATA_UINT64);
940 	kstat_named_init(&vib->vib_stats->vbs_rw_cookiesmax,
941 	    "max_rw_cookies", KSTAT_DATA_UINT32);
942 	kstat_named_init(&vib->vib_stats->vbs_intr_queuemax,
943 	    "max_intr_queue", KSTAT_DATA_UINT32);
944 	kstat_install(vib->vib_kstat);
945 
946 	vib->vib_readonly = virtio_feature_present(vio, VIRTIO_BLK_F_RO);
947 	if ((vib->vib_nblks = virtio_dev_get64(vio,
948 	    VIRTIO_BLK_CONFIG_CAPACITY)) == UINT64_MAX) {
949 		dev_err(dip, CE_WARN, "invalid capacity");
950 		goto fail;
951 	}
952 
953 	/*
954 	 * Determine the optimal logical block size recommended by the device.
955 	 * This size is advisory; the protocol always deals in 512 byte blocks.
956 	 */
957 	vib->vib_blk_size = DEV_BSIZE;
958 	if (virtio_feature_present(vio, VIRTIO_BLK_F_BLK_SIZE)) {
959 		uint32_t v = virtio_dev_get32(vio, VIRTIO_BLK_CONFIG_BLK_SIZE);
960 
961 		if (v != 0 && v != PCI_EINVAL32) {
962 			vib->vib_blk_size = v;
963 		}
964 	}
965 
966 	/*
967 	 * Device capacity is always in 512-byte units, convert to
968 	 * native blocks.
969 	 */
970 	vib->vib_nblks = (vib->vib_nblks * DEV_BSIZE) / vib->vib_blk_size;
971 
972 	/*
973 	 * The device may also provide an advisory physical block size.
974 	 */
975 	vib->vib_pblk_size = vib->vib_blk_size;
976 	if (virtio_feature_present(vio, VIRTIO_BLK_F_TOPOLOGY)) {
977 		uint8_t v = virtio_dev_get8(vio, VIRTIO_BLK_CONFIG_TOPO_PBEXP);
978 
979 		if (v != PCI_EINVAL8) {
980 			vib->vib_pblk_size <<= v;
981 		}
982 	}
983 
984 	/*
985 	 * The maximum size for a cookie in a request.
986 	 */
987 	vib->vib_seg_size_max = VIRTIO_BLK_DEFAULT_MAX_SIZE;
988 	if (virtio_feature_present(vio, VIRTIO_BLK_F_SIZE_MAX)) {
989 		uint32_t v = virtio_dev_get32(vio, VIRTIO_BLK_CONFIG_SIZE_MAX);
990 
991 		if (v != 0 && v != PCI_EINVAL32) {
992 			vib->vib_seg_size_max = v;
993 		}
994 	}
995 
996 	/*
997 	 * Set up the DMA attributes for blkdev to use for request data.  The
998 	 * specification is not extremely clear about whether DMA-related
999 	 * parameters include or exclude the header and status descriptors.
1000 	 * For now, we assume they cover only the request data and not the
1001 	 * headers.
1002 	 */
1003 	vib->vib_bd_dma_attr = vioblk_dma_attr;
1004 	vib->vib_bd_dma_attr.dma_attr_sgllen = vib->vib_seg_max;
1005 	vib->vib_bd_dma_attr.dma_attr_count_max = vib->vib_seg_size_max;
1006 	vib->vib_bd_dma_attr.dma_attr_maxxfer = vib->vib_seg_max *
1007 	    vib->vib_seg_size_max;
1008 
1009 	if (vioblk_alloc_reqs(vib) != 0) {
1010 		goto fail;
1011 	}
1012 
1013 	/*
1014 	 * The blkdev framework does not provide a way to specify that the
1015 	 * device does not support write cache flushing, except by omitting the
1016 	 * "o_sync_cache" member from the ops vector.  As "bd_alloc_handle()"
1017 	 * makes a copy of the ops vector, we can safely assemble one on the
1018 	 * stack based on negotiated features.
1019 	 *
1020 	 * Similarly, the blkdev framework does not provide a way to indicate
1021 	 * if a device supports an TRIM/UNMAP/DISCARD type operation except
1022 	 * by omitting the "o_free_space" member from the ops vector.
1023 	 */
1024 	bd_ops_t vioblk_bd_ops = {
1025 		.o_version =		BD_OPS_CURRENT_VERSION,
1026 		.o_drive_info =		vioblk_bd_driveinfo,
1027 		.o_media_info =		vioblk_bd_mediainfo,
1028 		.o_devid_init =		vioblk_bd_devid,
1029 		.o_sync_cache =		vioblk_bd_flush,
1030 		.o_read =		vioblk_bd_read,
1031 		.o_write =		vioblk_bd_write,
1032 		.o_free_space =		vioblk_bd_free_space,
1033 	};
1034 	if (!virtio_feature_present(vio, VIRTIO_BLK_F_FLUSH)) {
1035 		vioblk_bd_ops.o_sync_cache = NULL;
1036 	}
1037 	if (!vib->vib_can_discard) {
1038 		vioblk_bd_ops.o_free_space = NULL;
1039 	}
1040 
1041 	vib->vib_bd_h = bd_alloc_handle(vib, &vioblk_bd_ops,
1042 	    &vib->vib_bd_dma_attr, KM_SLEEP);
1043 
1044 	/*
1045 	 * Enable interrupts now so that we can request the device identity.
1046 	 */
1047 	if (virtio_interrupts_enable(vio) != DDI_SUCCESS) {
1048 		goto fail;
1049 	}
1050 
1051 	vioblk_get_id(vib);
1052 
1053 	if (bd_attach_handle(dip, vib->vib_bd_h) != DDI_SUCCESS) {
1054 		dev_err(dip, CE_WARN, "Failed to attach blkdev");
1055 		goto fail;
1056 	}
1057 
1058 	return (DDI_SUCCESS);
1059 
1060 fail:
1061 	if (vib->vib_bd_h != NULL) {
1062 		(void) bd_detach_handle(vib->vib_bd_h);
1063 		bd_free_handle(vib->vib_bd_h);
1064 	}
1065 	if (vio != NULL) {
1066 		(void) virtio_fini(vio, B_TRUE);
1067 	}
1068 	if (did_mutex) {
1069 		mutex_destroy(&vib->vib_mutex);
1070 		cv_destroy(&vib->vib_cv);
1071 	}
1072 	if (vib->vib_kstat != NULL) {
1073 		kstat_delete(vib->vib_kstat);
1074 	}
1075 	vioblk_free_reqs(vib);
1076 	kmem_free(vib, sizeof (*vib));
1077 	return (DDI_FAILURE);
1078 }
1079 
1080 static int
1081 vioblk_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1082 {
1083 	vioblk_t *vib = ddi_get_driver_private(dip);
1084 
1085 	if (cmd != DDI_DETACH) {
1086 		return (DDI_FAILURE);
1087 	}
1088 
1089 	mutex_enter(&vib->vib_mutex);
1090 	if (vib->vib_nreqs_alloc > 0) {
1091 		/*
1092 		 * Cannot detach while there are still outstanding requests.
1093 		 */
1094 		mutex_exit(&vib->vib_mutex);
1095 		return (DDI_FAILURE);
1096 	}
1097 
1098 	if (bd_detach_handle(vib->vib_bd_h) != DDI_SUCCESS) {
1099 		mutex_exit(&vib->vib_mutex);
1100 		return (DDI_FAILURE);
1101 	}
1102 
1103 	/*
1104 	 * Tear down the Virtio framework before freeing the rest of the
1105 	 * resources.  This will ensure the interrupt handlers are no longer
1106 	 * running.
1107 	 */
1108 	virtio_fini(vib->vib_virtio, B_FALSE);
1109 
1110 	vioblk_free_reqs(vib);
1111 	kstat_delete(vib->vib_kstat);
1112 
1113 	mutex_exit(&vib->vib_mutex);
1114 	mutex_destroy(&vib->vib_mutex);
1115 
1116 	kmem_free(vib, sizeof (*vib));
1117 
1118 	return (DDI_SUCCESS);
1119 }
1120 
1121 static int
1122 vioblk_quiesce(dev_info_t *dip)
1123 {
1124 	vioblk_t *vib;
1125 
1126 	if ((vib = ddi_get_driver_private(dip)) == NULL) {
1127 		return (DDI_FAILURE);
1128 	}
1129 
1130 	return (virtio_quiesce(vib->vib_virtio));
1131 }
1132 
1133 int
1134 _init(void)
1135 {
1136 	int rv;
1137 
1138 	bd_mod_init(&vioblk_dev_ops);
1139 
1140 	if ((rv = mod_install(&vioblk_modlinkage)) != 0) {
1141 		bd_mod_fini(&vioblk_dev_ops);
1142 	}
1143 
1144 	return (rv);
1145 }
1146 
1147 int
1148 _fini(void)
1149 {
1150 	int rv;
1151 
1152 	if ((rv = mod_remove(&vioblk_modlinkage)) == 0) {
1153 		bd_mod_fini(&vioblk_dev_ops);
1154 	}
1155 
1156 	return (rv);
1157 }
1158 
1159 int
1160 _info(struct modinfo *modinfop)
1161 {
1162 	return (mod_info(&vioblk_modlinkage, modinfop));
1163 }
1164