1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2018 Nexenta Systems, Inc.
14  * Copyright 2016 Tegile Systems, Inc. All rights reserved.
15  * Copyright (c) 2016 The MathWorks, Inc.  All rights reserved.
16  * Copyright 2020 Joyent, Inc.
17  * Copyright 2019 Western Digital Corporation.
18  * Copyright 2020 Racktop Systems.
19  * Copyright 2021 Oxide Computer Company.
20  */
21 
22 /*
23  * blkdev driver for NVMe compliant storage devices
24  *
25  * This driver was written to conform to version 1.2.1 of the NVMe
26  * specification.  It may work with newer versions, but that is completely
27  * untested and disabled by default.
28  *
29  * The driver has only been tested on x86 systems and will not work on big-
30  * endian systems without changes to the code accessing registers and data
31  * structures used by the hardware.
32  *
33  *
34  * Interrupt Usage:
35  *
36  * The driver will use a single interrupt while configuring the device as the
37  * specification requires, but contrary to the specification it will try to use
38  * a single-message MSI(-X) or FIXED interrupt. Later in the attach process it
39  * will switch to multiple-message MSI(-X) if supported. The driver wants to
40  * have one interrupt vector per CPU, but it will work correctly if less are
41  * available. Interrupts can be shared by queues, the interrupt handler will
42  * iterate through the I/O queue array by steps of n_intr_cnt. Usually only
43  * the admin queue will share an interrupt with one I/O queue. The interrupt
44  * handler will retrieve completed commands from all queues sharing an interrupt
45  * vector and will post them to a taskq for completion processing.
46  *
47  *
48  * Command Processing:
49  *
50  * NVMe devices can have up to 65535 I/O queue pairs, with each queue holding up
51  * to 65536 I/O commands. The driver will configure one I/O queue pair per
52  * available interrupt vector, with the queue length usually much smaller than
53  * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
54  * interrupt vectors will be used.
55  *
56  * Additionally the hardware provides a single special admin queue pair that can
57  * hold up to 4096 admin commands.
58  *
59  * From the hardware perspective both queues of a queue pair are independent,
60  * but they share some driver state: the command array (holding pointers to
61  * commands currently being processed by the hardware) and the active command
62  * counter. Access to a submission queue and the shared state is protected by
63  * nq_mutex; the completion queue is protected by ncq_mutex.
64  *
65  * When a command is submitted to a queue pair the active command counter is
66  * incremented and a pointer to the command is stored in the command array. The
67  * array index is used as command identifier (CID) in the submission queue
68  * entry. Some commands may take a very long time to complete, and if the queue
69  * wraps around in that time a submission may find the next array slot to still
70  * be used by a long-running command. In this case the array is sequentially
71  * searched for the next free slot. The length of the command array is the same
72  * as the configured queue length. Queue overrun is prevented by the semaphore,
73  * so a command submission may block if the queue is full.
74  *
75  *
76  * Polled I/O Support:
77  *
78  * For kernel core dump support the driver can do polled I/O. As interrupts are
79  * turned off while dumping the driver will just submit a command in the regular
80  * way, and then repeatedly attempt a command retrieval until it gets the
81  * command back.
82  *
83  *
84  * Namespace Support:
85  *
86  * NVMe devices can have multiple namespaces, each being an independent data
87  * store. The driver supports multiple namespaces and creates a blkdev interface
88  * for each namespace found. Namespaces can have various attributes to support
89  * protection information. This driver does not support any of this and ignores
90  * namespaces that have these attributes.
91  *
92  * As of NVMe 1.1 namespaces can have a 64bit Extended Unique Identifier
93  * (EUI64). This driver uses the EUI64, if present, to generate the devid and
94  * passes it to blkdev for use in the device node names. As this is currently
95  * untested, namespaces with an EUI64 are ignored by default.
96  *
97  * We currently support only (2 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a
98  * single controller. This is an artificial limit imposed by the driver to be
99  * able to address a reasonable number of controllers and namespaces using a
100  * 32bit minor node number.
101  *
102  *
103  * Minor nodes:
104  *
105  * For each NVMe device the driver exposes one minor node for the controller and
106  * one minor node for each namespace. The only operations supported by those
107  * minor nodes are open(9E), close(9E), and ioctl(9E). This serves as the
108  * interface for the nvmeadm(1M) utility.
109  *
110  *
111  * Blkdev Interface:
112  *
113  * This driver uses blkdev to do all the heavy lifting involved with presenting
114  * a disk device to the system. As a result, the processing of I/O requests is
115  * relatively simple as blkdev takes care of partitioning, boundary checks, DMA
116  * setup, and splitting of transfers into manageable chunks.
117  *
118  * I/O requests coming in from blkdev are turned into NVM commands and posted to
119  * an I/O queue. The queue is selected by taking the CPU id modulo the number of
120  * queues. There is currently no timeout handling of I/O commands.
121  *
122  * Blkdev also supports querying device/media information and generating a
123  * devid. The driver reports the best block size as determined by the namespace
124  * format back to blkdev as physical block size to support partition and block
125  * alignment. The devid is either based on the namespace EUI64, if present, or
126  * composed using the device vendor ID, model number, serial number, and the
127  * namespace ID.
128  *
129  *
130  * Error Handling:
131  *
132  * Error handling is currently limited to detecting fatal hardware errors,
133  * either by asynchronous events, or synchronously through command status or
134  * admin command timeouts. In case of severe errors the device is fenced off;
135  * all further requests will return EIO. FMA is then called to fault the device.
136  *
137  * The hardware has a limit for outstanding asynchronous event requests. Before
138  * this limit is known the driver assumes it is at least 1 and posts a single
139  * asynchronous request. Later when the limit is known more asynchronous event
140  * requests are posted to allow quicker reception of error information. When an
141  * asynchronous event is posted by the hardware the driver will parse the error
142  * status fields and log information or fault the device, depending on the
143  * severity of the asynchronous event. The asynchronous event request is then
144  * reused and posted to the admin queue again.
145  *
146  * On command completion the command status is checked for errors. In case of
147  * errors indicating a driver bug the driver panics. Almost all other error
148  * status values just cause EIO to be returned.
149  *
150  * Command timeouts are currently detected for all admin commands except
151  * asynchronous event requests. If a command times out and the hardware appears
152  * to be healthy the driver attempts to abort the command. The original command
153  * timeout is also applied to the abort command. If the abort times out too the
154  * driver assumes the device to be dead, fences it off, and calls FMA to retire
155  * it. In all other cases the aborted command should return immediately with a
156  * status indicating it was aborted, and the driver will wait indefinitely for
157  * that to happen. No timeout handling of normal I/O commands is presently done.
158  *
159  * Any command that times out due to the controller dropping dead will be put on
160  * nvme_lost_cmds list if it references DMA memory. This prevents the DMA
161  * memory from being reused by the system and later written to by a "dead" NVMe
162  * controller.
163  *
164  *
165  * Locking:
166  *
167  * Each queue pair has a nq_mutex and ncq_mutex. The nq_mutex must be held
168  * when accessing shared state and submission queue registers, ncq_mutex
169  * is held when accessing completion queue state and registers.
170  * Callers of nvme_unqueue_cmd() must make sure that nq_mutex is held, while
171  * nvme_submit_{admin,io}_cmd() and nvme_retrieve_cmd() take care of both
172  * mutexes themselves.
173  *
174  * Each command also has its own nc_mutex, which is associated with the
175  * condition variable nc_cv. It is only used on admin commands which are run
176  * synchronously. In that case it must be held across calls to
177  * nvme_submit_{admin,io}_cmd() and nvme_wait_cmd(), which is taken care of by
178  * nvme_admin_cmd(). It must also be held whenever the completion state of the
179  * command is changed or while a admin command timeout is handled.
180  *
181  * If both nc_mutex and nq_mutex must be held, nc_mutex must be acquired first.
182  * More than one nc_mutex may only be held when aborting commands. In this case,
183  * the nc_mutex of the command to be aborted must be held across the call to
184  * nvme_abort_cmd() to prevent the command from completing while the abort is in
185  * progress.
186  *
187  * If both nq_mutex and ncq_mutex need to be held, ncq_mutex must be
188  * acquired first. More than one nq_mutex is never held by a single thread.
189  * The ncq_mutex is only held by nvme_retrieve_cmd() and
190  * nvme_process_iocq(). nvme_process_iocq() is only called from the
191  * interrupt thread and nvme_retrieve_cmd() during polled I/O, so the
192  * mutex is non-contentious but is required for implementation completeness
193  * and safety.
194  *
195  * Each minor node has its own nm_mutex, which protects the open count nm_ocnt
196  * and exclusive-open flag nm_oexcl.
197  *
198  *
199  * Quiesce / Fast Reboot:
200  *
201  * The driver currently does not support fast reboot. A quiesce(9E) entry point
202  * is still provided which is used to send a shutdown notification to the
203  * device.
204  *
205  *
206  * NVMe Hotplug:
207  *
208  * The driver supports hot removal. The driver uses the NDI event framework
209  * to register a callback, nvme_remove_callback, to clean up when a disk is
210  * removed. In particular, the driver will unqueue outstanding I/O commands and
211  * set n_dead on the softstate to true so that other operations, such as ioctls
212  * and command submissions, fail as well.
213  *
214  * While the callback registration relies on the NDI event framework, the
215  * removal event itself is kicked off in the PCIe hotplug framework, when the
216  * PCIe bridge driver ("pcieb") gets a hotplug interrupt indicating that a
217  * device was removed from the slot.
218  *
219  * The NVMe driver instance itself will remain until the final close of the
220  * device.
221  *
222  *
223  * DDI UFM Support
224  *
225  * The driver supports the DDI UFM framework for reporting information about
226  * the device's firmware image and slot configuration. This data can be
227  * queried by userland software via ioctls to the ufm driver. For more
228  * information, see ddi_ufm(9E).
229  *
230  *
231  * Driver Configuration:
232  *
233  * The following driver properties can be changed to control some aspects of the
234  * driver's operation:
235  * - strict-version: can be set to 0 to allow devices conforming to newer
236  *   major versions to be used
237  * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor
238  *   specific command status as a fatal error leading to device faulting
239  * - admin-queue-len: the maximum length of the admin queue (16-4096)
240  * - io-squeue-len: the maximum length of the I/O submission queues (16-65536)
241  * - io-cqueue-len: the maximum length of the I/O completion queues (16-65536)
242  * - async-event-limit: the maximum number of asynchronous event requests to be
243  *   posted by the driver
244  * - volatile-write-cache-enable: can be set to 0 to disable the volatile write
245  *   cache
246  * - min-phys-block-size: the minimum physical block size to report to blkdev,
247  *   which is among other things the basis for ZFS vdev ashift
248  * - max-submission-queues: the maximum number of I/O submission queues.
249  * - max-completion-queues: the maximum number of I/O completion queues,
250  *   can be less than max-submission-queues, in which case the completion
251  *   queues are shared.
252  *
253  *
254  * TODO:
255  * - figure out sane default for I/O queue depth reported to blkdev
256  * - FMA handling of media errors
257  * - support for devices supporting very large I/O requests using chained PRPs
258  * - support for configuring hardware parameters like interrupt coalescing
259  * - support for media formatting and hard partitioning into namespaces
260  * - support for big-endian systems
261  * - support for fast reboot
262  * - support for NVMe Subsystem Reset (1.1)
263  * - support for Scatter/Gather lists (1.1)
264  * - support for Reservations (1.1)
265  * - support for power management
266  */
267 
268 #include <sys/byteorder.h>
269 #ifdef _BIG_ENDIAN
270 #error nvme driver needs porting for big-endian platforms
271 #endif
272 
273 #include <sys/modctl.h>
274 #include <sys/conf.h>
275 #include <sys/devops.h>
276 #include <sys/ddi.h>
277 #include <sys/ddi_ufm.h>
278 #include <sys/sunddi.h>
279 #include <sys/sunndi.h>
280 #include <sys/bitmap.h>
281 #include <sys/sysmacros.h>
282 #include <sys/param.h>
283 #include <sys/varargs.h>
284 #include <sys/cpuvar.h>
285 #include <sys/disp.h>
286 #include <sys/blkdev.h>
287 #include <sys/atomic.h>
288 #include <sys/archsystm.h>
289 #include <sys/sata/sata_hba.h>
290 #include <sys/stat.h>
291 #include <sys/policy.h>
292 #include <sys/list.h>
293 #include <sys/dkio.h>
294 
295 #include <sys/nvme.h>
296 
297 #ifdef __x86
298 #include <sys/x86_archext.h>
299 #endif
300 
301 #include "nvme_reg.h"
302 #include "nvme_var.h"
303 
304 /*
305  * Assertions to make sure that we've properly captured various aspects of the
306  * packed structures and haven't broken them during updates.
307  */
308 CTASSERT(sizeof (nvme_identify_ctrl_t) == 0x1000);
309 CTASSERT(offsetof(nvme_identify_ctrl_t, id_oacs) == 256);
310 CTASSERT(offsetof(nvme_identify_ctrl_t, id_sqes) == 512);
311 CTASSERT(offsetof(nvme_identify_ctrl_t, id_oncs) == 520);
312 CTASSERT(offsetof(nvme_identify_ctrl_t, id_subnqn) == 768);
313 CTASSERT(offsetof(nvme_identify_ctrl_t, id_nvmof) == 1792);
314 CTASSERT(offsetof(nvme_identify_ctrl_t, id_psd) == 2048);
315 CTASSERT(offsetof(nvme_identify_ctrl_t, id_vs) == 3072);
316 
317 CTASSERT(sizeof (nvme_identify_nsid_t) == 0x1000);
318 CTASSERT(offsetof(nvme_identify_nsid_t, id_fpi) == 32);
319 CTASSERT(offsetof(nvme_identify_nsid_t, id_anagrpid) == 92);
320 CTASSERT(offsetof(nvme_identify_nsid_t, id_nguid) == 104);
321 CTASSERT(offsetof(nvme_identify_nsid_t, id_lbaf) == 128);
322 CTASSERT(offsetof(nvme_identify_nsid_t, id_vs) == 384);
323 
324 CTASSERT(sizeof (nvme_identify_primary_caps_t) == 0x1000);
325 CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vqfrt) == 32);
326 CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vifrt) == 64);
327 
328 
329 /* NVMe spec version supported */
330 static const int nvme_version_major = 1;
331 
332 /* tunable for admin command timeout in seconds, default is 1s */
333 int nvme_admin_cmd_timeout = 1;
334 
335 /* tunable for FORMAT NVM command timeout in seconds, default is 600s */
336 int nvme_format_cmd_timeout = 600;
337 
338 /* tunable for firmware commit with NVME_FWC_SAVE, default is 15s */
339 int nvme_commit_save_cmd_timeout = 15;
340 
341 /*
342  * tunable for the size of arbitrary vendor specific admin commands,
343  * default is 16MiB
344  */
345 uint32_t nvme_vendor_specific_admin_cmd_size = 1 << 24;
346 
347 /*
348  * tunable for the max timeout of arbitrary vendor specific admin commands,
349  * default is 60s.
350  */
351 uint_t nvme_vendor_specific_admin_cmd_max_timeout = 60;
352 
353 static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
354 static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
355 static int nvme_quiesce(dev_info_t *);
356 static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
357 static int nvme_setup_interrupts(nvme_t *, int, int);
358 static void nvme_release_interrupts(nvme_t *);
359 static uint_t nvme_intr(caddr_t, caddr_t);
360 
361 static void nvme_shutdown(nvme_t *, int, boolean_t);
362 static boolean_t nvme_reset(nvme_t *, boolean_t);
363 static int nvme_init(nvme_t *);
364 static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
365 static void nvme_free_cmd(nvme_cmd_t *);
366 static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
367     bd_xfer_t *);
368 static void nvme_admin_cmd(nvme_cmd_t *, int);
369 static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *);
370 static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *);
371 static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *);
372 static nvme_cmd_t *nvme_unqueue_cmd(nvme_t *, nvme_qpair_t *, int);
373 static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
374 static void nvme_wait_cmd(nvme_cmd_t *, uint_t);
375 static void nvme_wakeup_cmd(void *);
376 static void nvme_async_event_task(void *);
377 
378 static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
379 static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
380 static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
381 static int nvme_check_specific_cmd_status(nvme_cmd_t *);
382 static int nvme_check_generic_cmd_status(nvme_cmd_t *);
383 static inline int nvme_check_cmd_status(nvme_cmd_t *);
384 
385 static int nvme_abort_cmd(nvme_cmd_t *, uint_t);
386 static void nvme_async_event(nvme_t *);
387 static int nvme_format_nvm(nvme_t *, boolean_t, uint32_t, uint8_t, boolean_t,
388     uint8_t, boolean_t, uint8_t);
389 static int nvme_get_logpage(nvme_t *, boolean_t, void **, size_t *, uint8_t,
390     ...);
391 static int nvme_identify(nvme_t *, boolean_t, uint32_t, void **);
392 static int nvme_set_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t,
393     uint32_t *);
394 static int nvme_get_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t *,
395     void **, size_t *);
396 static int nvme_write_cache_set(nvme_t *, boolean_t);
397 static int nvme_set_nqueues(nvme_t *);
398 
399 static void nvme_free_dma(nvme_dma_t *);
400 static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
401     nvme_dma_t **);
402 static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t,
403     nvme_dma_t **);
404 static void nvme_free_qpair(nvme_qpair_t *);
405 static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, uint_t);
406 static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t);
407 
408 static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t);
409 static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t);
410 static inline uint64_t nvme_get64(nvme_t *, uintptr_t);
411 static inline uint32_t nvme_get32(nvme_t *, uintptr_t);
412 
413 static boolean_t nvme_check_regs_hdl(nvme_t *);
414 static boolean_t nvme_check_dma_hdl(nvme_dma_t *);
415 
416 static int nvme_fill_prp(nvme_cmd_t *, ddi_dma_handle_t);
417 
418 static void nvme_bd_xfer_done(void *);
419 static void nvme_bd_driveinfo(void *, bd_drive_t *);
420 static int nvme_bd_mediainfo(void *, bd_media_t *);
421 static int nvme_bd_cmd(nvme_namespace_t *, bd_xfer_t *, uint8_t);
422 static int nvme_bd_read(void *, bd_xfer_t *);
423 static int nvme_bd_write(void *, bd_xfer_t *);
424 static int nvme_bd_sync(void *, bd_xfer_t *);
425 static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *);
426 static int nvme_bd_free_space(void *, bd_xfer_t *);
427 
428 static int nvme_prp_dma_constructor(void *, void *, int);
429 static void nvme_prp_dma_destructor(void *, void *);
430 
431 static void nvme_prepare_devid(nvme_t *, uint32_t);
432 
433 /* DDI UFM callbacks */
434 static int nvme_ufm_fill_image(ddi_ufm_handle_t *, void *, uint_t,
435     ddi_ufm_image_t *);
436 static int nvme_ufm_fill_slot(ddi_ufm_handle_t *, void *, uint_t, uint_t,
437     ddi_ufm_slot_t *);
438 static int nvme_ufm_getcaps(ddi_ufm_handle_t *, void *, ddi_ufm_cap_t *);
439 
440 static int nvme_open(dev_t *, int, int, cred_t *);
441 static int nvme_close(dev_t, int, int, cred_t *);
442 static int nvme_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
443 
444 static ddi_ufm_ops_t nvme_ufm_ops = {
445 	NULL,
446 	nvme_ufm_fill_image,
447 	nvme_ufm_fill_slot,
448 	nvme_ufm_getcaps
449 };
450 
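/*
 * Minor number encoding: the upper bits hold the driver instance, the lower
 * NVME_MINOR_INST_SHIFT bits hold the namespace ID. A namespace ID of 0 is
 * used for the controller's own minor node.
 */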
451 #define	NVME_MINOR_INST_SHIFT	9
452 #define	NVME_MINOR(inst, nsid)	(((inst) << NVME_MINOR_INST_SHIFT) | (nsid))
453 #define	NVME_MINOR_INST(minor)	((minor) >> NVME_MINOR_INST_SHIFT)
454 #define	NVME_MINOR_NSID(minor)	((minor) & ((1 << NVME_MINOR_INST_SHIFT) - 1))
455 #define	NVME_MINOR_MAX		(NVME_MINOR(1, 0) - 2)
456 #define	NVME_IS_VENDOR_UNIQUE_CMD(x)	(((x) >= 0xC0) && ((x) <= 0xFF))
457 
458 static void *nvme_state;
459 static kmem_cache_t *nvme_cmd_cache;
460 
461 /*
462  * DMA attributes for queue DMA memory
463  *
464  * Queue DMA memory must be page aligned. The maximum length of a queue is
465  * 65536 entries, and an entry can be 64 bytes long.
466  */
467 static ddi_dma_attr_t nvme_queue_dma_attr = {
468 	.dma_attr_version	= DMA_ATTR_V0,
469 	.dma_attr_addr_lo	= 0,
470 	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
471 	.dma_attr_count_max	= (UINT16_MAX + 1) * sizeof (nvme_sqe_t) - 1,
472 	.dma_attr_align		= 0x1000,
473 	.dma_attr_burstsizes	= 0x7ff,
474 	.dma_attr_minxfer	= 0x1000,
475 	.dma_attr_maxxfer	= (UINT16_MAX + 1) * sizeof (nvme_sqe_t),
476 	.dma_attr_seg		= 0xffffffffffffffffULL,
477 	.dma_attr_sgllen	= 1,
478 	.dma_attr_granular	= 1,
479 	.dma_attr_flags		= 0,
480 };
481 
482 /*
483  * DMA attributes for transfers using Physical Region Page (PRP) entries
484  *
485  * A PRP entry describes one page of DMA memory using the page size specified
486  * in the controller configuration's memory page size register (CC.MPS). It uses
487  * a 64bit base address aligned to this page size. There is no limitation on
488  * chaining PRPs together for arbitrarily large DMA transfers.
489  */
490 static ddi_dma_attr_t nvme_prp_dma_attr = {
491 	.dma_attr_version	= DMA_ATTR_V0,
492 	.dma_attr_addr_lo	= 0,
493 	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
494 	.dma_attr_count_max	= 0xfff,
495 	.dma_attr_align		= 0x1000,
496 	.dma_attr_burstsizes	= 0x7ff,
497 	.dma_attr_minxfer	= 0x1000,
498 	.dma_attr_maxxfer	= 0x1000,
499 	.dma_attr_seg		= 0xfff,
500 	.dma_attr_sgllen	= -1,
501 	.dma_attr_granular	= 1,
502 	.dma_attr_flags		= 0,
503 };
504 
505 /*
506  * DMA attributes for transfers using scatter/gather lists
507  *
508  * A SGL entry describes a chunk of DMA memory using a 64bit base address and a
509  * 32bit length field. SGL Segment and SGL Last Segment entries require the
510  * length to be a multiple of 16 bytes.
511  */
512 static ddi_dma_attr_t nvme_sgl_dma_attr = {
513 	.dma_attr_version	= DMA_ATTR_V0,
514 	.dma_attr_addr_lo	= 0,
515 	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
516 	.dma_attr_count_max	= 0xffffffffUL,
517 	.dma_attr_align		= 1,
518 	.dma_attr_burstsizes	= 0x7ff,
519 	.dma_attr_minxfer	= 0x10,
520 	.dma_attr_maxxfer	= 0xfffffffffULL,
521 	.dma_attr_seg		= 0xffffffffffffffffULL,
522 	.dma_attr_sgllen	= -1,
523 	.dma_attr_granular	= 0x10,
524 	.dma_attr_flags		= 0
525 };
526 
527 static ddi_device_acc_attr_t nvme_reg_acc_attr = {
528 	.devacc_attr_version	= DDI_DEVICE_ATTR_V0,
529 	.devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC,
530 	.devacc_attr_dataorder	= DDI_STRICTORDER_ACC
531 };
532 
533 static struct cb_ops nvme_cb_ops = {
534 	.cb_open	= nvme_open,
535 	.cb_close	= nvme_close,
536 	.cb_strategy	= nodev,
537 	.cb_print	= nodev,
538 	.cb_dump	= nodev,
539 	.cb_read	= nodev,
540 	.cb_write	= nodev,
541 	.cb_ioctl	= nvme_ioctl,
542 	.cb_devmap	= nodev,
543 	.cb_mmap	= nodev,
544 	.cb_segmap	= nodev,
545 	.cb_chpoll	= nochpoll,
546 	.cb_prop_op	= ddi_prop_op,
547 	.cb_str		= 0,
548 	.cb_flag	= D_NEW | D_MP,
549 	.cb_rev		= CB_REV,
550 	.cb_aread	= nodev,
551 	.cb_awrite	= nodev
552 };
553 
554 static struct dev_ops nvme_dev_ops = {
555 	.devo_rev	= DEVO_REV,
556 	.devo_refcnt	= 0,
557 	.devo_getinfo	= ddi_no_info,
558 	.devo_identify	= nulldev,
559 	.devo_probe	= nulldev,
560 	.devo_attach	= nvme_attach,
561 	.devo_detach	= nvme_detach,
562 	.devo_reset	= nodev,
563 	.devo_cb_ops	= &nvme_cb_ops,
564 	.devo_bus_ops	= NULL,
565 	.devo_power	= NULL,
566 	.devo_quiesce	= nvme_quiesce,
567 };
568 
569 static struct modldrv nvme_modldrv = {
570 	.drv_modops	= &mod_driverops,
571 	.drv_linkinfo	= "NVMe v1.1b",
572 	.drv_dev_ops	= &nvme_dev_ops
573 };
574 
575 static struct modlinkage nvme_modlinkage = {
576 	.ml_rev		= MODREV_1,
577 	.ml_linkage	= { &nvme_modldrv, NULL }
578 };
579 
580 static bd_ops_t nvme_bd_ops = {
581 	.o_version	= BD_OPS_CURRENT_VERSION,
582 	.o_drive_info	= nvme_bd_driveinfo,
583 	.o_media_info	= nvme_bd_mediainfo,
584 	.o_devid_init	= nvme_bd_devid,
585 	.o_sync_cache	= nvme_bd_sync,
586 	.o_read		= nvme_bd_read,
587 	.o_write	= nvme_bd_write,
588 	.o_free_space	= nvme_bd_free_space,
589 };
590 
591 /*
592  * This list will hold commands that have timed out and couldn't be aborted.
593  * As we don't know what the hardware may still do with the DMA memory, we can't
594  * free them, so we'll keep them forever on this list where we can easily look
595  * at them with mdb.
596  */
597 static struct list nvme_lost_cmds;
598 static kmutex_t nvme_lc_mutex;
599 
600 int
601 _init(void)
602 {
603 	int error;
604 
605 	error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1);
606 	if (error != DDI_SUCCESS)
607 		return (error);
608 
609 	nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
610 	    sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
611 
612 	mutex_init(&nvme_lc_mutex, NULL, MUTEX_DRIVER, NULL);
613 	list_create(&nvme_lost_cmds, sizeof (nvme_cmd_t),
614 	    offsetof(nvme_cmd_t, nc_list));
615 
616 	bd_mod_init(&nvme_dev_ops);
617 
618 	error = mod_install(&nvme_modlinkage);
619 	if (error != DDI_SUCCESS) {
620 		ddi_soft_state_fini(&nvme_state);
621 		mutex_destroy(&nvme_lc_mutex);
622 		list_destroy(&nvme_lost_cmds);
623 		bd_mod_fini(&nvme_dev_ops);
624 	}
625 
626 	return (error);
627 }
628 
629 int
630 _fini(void)
631 {
632 	int error;
633 
634 	if (!list_is_empty(&nvme_lost_cmds))
635 		return (DDI_FAILURE);
636 
637 	error = mod_remove(&nvme_modlinkage);
638 	if (error == DDI_SUCCESS) {
639 		ddi_soft_state_fini(&nvme_state);
640 		kmem_cache_destroy(nvme_cmd_cache);
641 		mutex_destroy(&nvme_lc_mutex);
642 		list_destroy(&nvme_lost_cmds);
643 		bd_mod_fini(&nvme_dev_ops);
644 	}
645 
646 	return (error);
647 }
648 
649 int
650 _info(struct modinfo *modinfop)
651 {
652 	return (mod_info(&nvme_modlinkage, modinfop));
653 }
654 
655 static inline void
656 nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val)
657 {
658 	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);
659 
660 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
661 	ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val);
662 }
663 
664 static inline void
665 nvme_put32(nvme_t *nvme, uintptr_t reg, uint32_t val)
666 {
667 	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);
668 
669 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
670 	ddi_put32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg), val);
671 }
672 
673 static inline uint64_t
674 nvme_get64(nvme_t *nvme, uintptr_t reg)
675 {
676 	uint64_t val;
677 
678 	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);
679 
680 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
681 	val = ddi_get64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg));
682 
683 	return (val);
684 }
685 
686 static inline uint32_t
687 nvme_get32(nvme_t *nvme, uintptr_t reg)
688 {
689 	uint32_t val;
690 
691 	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);
692 
693 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
694 	val = ddi_get32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg));
695 
696 	return (val);
697 }
698 
699 static boolean_t
700 nvme_check_regs_hdl(nvme_t *nvme)
701 {
702 	ddi_fm_error_t error;
703 
704 	ddi_fm_acc_err_get(nvme->n_regh, &error, DDI_FME_VERSION);
705 
706 	if (error.fme_status != DDI_FM_OK)
707 		return (B_TRUE);
708 
709 	return (B_FALSE);
710 }
711 
712 static boolean_t
713 nvme_check_dma_hdl(nvme_dma_t *dma)
714 {
715 	ddi_fm_error_t error;
716 
717 	if (dma == NULL)
718 		return (B_FALSE);
719 
720 	ddi_fm_dma_err_get(dma->nd_dmah, &error, DDI_FME_VERSION);
721 
722 	if (error.fme_status != DDI_FM_OK)
723 		return (B_TRUE);
724 
725 	return (B_FALSE);
726 }
727 
728 static void
729 nvme_free_dma_common(nvme_dma_t *dma)
730 {
731 	if (dma->nd_dmah != NULL)
732 		(void) ddi_dma_unbind_handle(dma->nd_dmah);
733 	if (dma->nd_acch != NULL)
734 		ddi_dma_mem_free(&dma->nd_acch);
735 	if (dma->nd_dmah != NULL)
736 		ddi_dma_free_handle(&dma->nd_dmah);
737 }
738 
739 static void
740 nvme_free_dma(nvme_dma_t *dma)
741 {
742 	nvme_free_dma_common(dma);
743 	kmem_free(dma, sizeof (*dma));
744 }
745 
746 /* ARGSUSED */
747 static void
748 nvme_prp_dma_destructor(void *buf, void *private)
749 {
750 	nvme_dma_t *dma = (nvme_dma_t *)buf;
751 
752 	nvme_free_dma_common(dma);
753 }
754 
755 static int
756 nvme_alloc_dma_common(nvme_t *nvme, nvme_dma_t *dma,
757     size_t len, uint_t flags, ddi_dma_attr_t *dma_attr)
758 {
759 	if (ddi_dma_alloc_handle(nvme->n_dip, dma_attr, DDI_DMA_SLEEP, NULL,
760 	    &dma->nd_dmah) != DDI_SUCCESS) {
761 		/*
762 		 * Due to DDI_DMA_SLEEP this can't be DDI_DMA_NORESOURCES, and
763 		 * the only other possible error is DDI_DMA_BADATTR, which
764 		 * indicates a driver bug and should cause a panic.
765 		 */
766 		dev_err(nvme->n_dip, CE_PANIC,
767 		    "!failed to get DMA handle, check DMA attributes");
768 		return (DDI_FAILURE);
769 	}
770 
771 	/*
772 	 * ddi_dma_mem_alloc() can only fail when DDI_DMA_NOSLEEP is specified
773 	 * or the flags are conflicting, which isn't the case here.
774 	 */
775 	(void) ddi_dma_mem_alloc(dma->nd_dmah, len, &nvme->n_reg_acc_attr,
776 	    DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &dma->nd_memp,
777 	    &dma->nd_len, &dma->nd_acch);
778 
779 	if (ddi_dma_addr_bind_handle(dma->nd_dmah, NULL, dma->nd_memp,
780 	    dma->nd_len, flags | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL,
781 	    &dma->nd_cookie, &dma->nd_ncookie) != DDI_DMA_MAPPED) {
782 		dev_err(nvme->n_dip, CE_WARN,
783 		    "!failed to bind DMA memory");
784 		atomic_inc_32(&nvme->n_dma_bind_err);
785 		nvme_free_dma_common(dma);
786 		return (DDI_FAILURE);
787 	}
788 
789 	return (DDI_SUCCESS);
790 }
791 
792 static int
793 nvme_zalloc_dma(nvme_t *nvme, size_t len, uint_t flags,
794     ddi_dma_attr_t *dma_attr, nvme_dma_t **ret)
795 {
796 	nvme_dma_t *dma = kmem_zalloc(sizeof (nvme_dma_t), KM_SLEEP);
797 
798 	if (nvme_alloc_dma_common(nvme, dma, len, flags, dma_attr) !=
799 	    DDI_SUCCESS) {
800 		*ret = NULL;
801 		kmem_free(dma, sizeof (nvme_dma_t));
802 		return (DDI_FAILURE);
803 	}
804 
805 	bzero(dma->nd_memp, dma->nd_len);
806 
807 	*ret = dma;
808 	return (DDI_SUCCESS);
809 }
810 
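/*
 * kmem cache constructor for pre-allocated PRP list buffers: allocate one
 * page of DMA memory per command using the PRP DMA attributes. The matching
 * destructor above releases it via nvme_free_dma_common().
 */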
811 /* ARGSUSED */
812 static int
813 nvme_prp_dma_constructor(void *buf, void *private, int flags)
814 {
815 	nvme_dma_t *dma = (nvme_dma_t *)buf;
816 	nvme_t *nvme = (nvme_t *)private;
817 
818 	dma->nd_dmah = NULL;
819 	dma->nd_acch = NULL;
820 
821 	if (nvme_alloc_dma_common(nvme, dma, nvme->n_pagesize,
822 	    DDI_DMA_READ, &nvme->n_prp_dma_attr) != DDI_SUCCESS) {
823 		return (-1);
824 	}
825 
826 	ASSERT(dma->nd_ncookie == 1);
827 
828 	dma->nd_cached = B_TRUE;
829 
830 	return (0);
831 }
832 
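/*
 * Allocate zeroed DMA memory for a queue of nentry entries of qe_len bytes
 * each, rounded up to the controller page size. The memory must fit into a
 * single DMA cookie.
 */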
833 static int
834 nvme_zalloc_queue_dma(nvme_t *nvme, uint32_t nentry, uint16_t qe_len,
835     uint_t flags, nvme_dma_t **dma)
836 {
837 	uint32_t len = nentry * qe_len;
838 	ddi_dma_attr_t q_dma_attr = nvme->n_queue_dma_attr;
839 
840 	len = roundup(len, nvme->n_pagesize);
841 
842 	if (nvme_zalloc_dma(nvme, len, flags, &q_dma_attr, dma)
843 	    != DDI_SUCCESS) {
844 		dev_err(nvme->n_dip, CE_WARN,
845 		    "!failed to get DMA memory for queue");
846 		goto fail;
847 	}
848 
849 	if ((*dma)->nd_ncookie != 1) {
850 		dev_err(nvme->n_dip, CE_WARN,
851 		    "!got too many cookies for queue DMA");
852 		goto fail;
853 	}
854 
855 	return (DDI_SUCCESS);
856 
857 fail:
858 	if (*dma) {
859 		nvme_free_dma(*dma);
860 		*dma = NULL;
861 	}
862 
863 	return (DDI_FAILURE);
864 }
865 
866 static void
867 nvme_free_cq(nvme_cq_t *cq)
868 {
869 	mutex_destroy(&cq->ncq_mutex);
870 
871 	if (cq->ncq_cmd_taskq != NULL)
872 		taskq_destroy(cq->ncq_cmd_taskq);
873 
874 	if (cq->ncq_dma != NULL)
875 		nvme_free_dma(cq->ncq_dma);
876 
877 	kmem_free(cq, sizeof (*cq));
878 }
879 
880 static void
881 nvme_free_qpair(nvme_qpair_t *qp)
882 {
883 	int i;
884 
885 	mutex_destroy(&qp->nq_mutex);
886 	sema_destroy(&qp->nq_sema);
887 
888 	if (qp->nq_sqdma != NULL)
889 		nvme_free_dma(qp->nq_sqdma);
890 
891 	if (qp->nq_active_cmds > 0)
892 		for (i = 0; i != qp->nq_nentry; i++)
893 			if (qp->nq_cmd[i] != NULL)
894 				nvme_free_cmd(qp->nq_cmd[i]);
895 
896 	if (qp->nq_cmd != NULL)
897 		kmem_free(qp->nq_cmd, sizeof (nvme_cmd_t *) * qp->nq_nentry);
898 
899 	kmem_free(qp, sizeof (nvme_qpair_t));
900 }
901 
902 /*
903  * Destroy the pre-allocated cq array, but only free individual completion
904  * queues from the given starting index.
905  */
906 static void
907 nvme_destroy_cq_array(nvme_t *nvme, uint_t start)
908 {
909 	uint_t i;
910 
911 	for (i = start; i < nvme->n_cq_count; i++)
912 		if (nvme->n_cq[i] != NULL)
913 			nvme_free_cq(nvme->n_cq[i]);
914 
915 	kmem_free(nvme->n_cq, sizeof (*nvme->n_cq) * nvme->n_cq_count);
916 }
917 
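/*
 * Allocate a completion queue of nentry entries, together with its command
 * completion taskq of nthr threads.
 */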
918 static int
919 nvme_alloc_cq(nvme_t *nvme, uint32_t nentry, nvme_cq_t **cqp, uint16_t idx,
920     uint_t nthr)
921 {
922 	nvme_cq_t *cq = kmem_zalloc(sizeof (*cq), KM_SLEEP);
923 	char name[64];		/* large enough for the taskq name */
924 
925 	mutex_init(&cq->ncq_mutex, NULL, MUTEX_DRIVER,
926 	    DDI_INTR_PRI(nvme->n_intr_pri));
927 
928 	if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t),
929 	    DDI_DMA_READ, &cq->ncq_dma) != DDI_SUCCESS)
930 		goto fail;
931 
932 	cq->ncq_cq = (nvme_cqe_t *)cq->ncq_dma->nd_memp;
933 	cq->ncq_nentry = nentry;
934 	cq->ncq_id = idx;
935 	cq->ncq_hdbl = NVME_REG_CQHDBL(nvme, idx);
936 
937 	/*
938 	 * Each completion queue has its own command taskq.
939 	 */
940 	(void) snprintf(name, sizeof (name), "%s%d_cmd_taskq%u",
941 	    ddi_driver_name(nvme->n_dip), ddi_get_instance(nvme->n_dip), idx);
942 
943 	cq->ncq_cmd_taskq = taskq_create(name, nthr, minclsyspri, 64, INT_MAX,
944 	    TASKQ_PREPOPULATE);
945 
946 	if (cq->ncq_cmd_taskq == NULL) {
947 		dev_err(nvme->n_dip, CE_WARN, "!failed to create cmd "
948 		    "taskq for cq %u", idx);
949 		goto fail;
950 	}
951 
952 	*cqp = cq;
953 	return (DDI_SUCCESS);
954 
955 fail:
956 	nvme_free_cq(cq);
957 	*cqp = NULL;
958 
959 	return (DDI_FAILURE);
960 }
961 
962 /*
963  * Create the n_cq array big enough to hold "ncq" completion queues.
964  * If the array already exists it will be re-sized (but only larger).
965  * The admin queue is included in this array, which boosts the
966  * max number of entries to UINT16_MAX + 1.
967  */
968 static int
969 nvme_create_cq_array(nvme_t *nvme, uint_t ncq, uint32_t nentry, uint_t nthr)
970 {
971 	nvme_cq_t **cq;
972 	uint_t i, cq_count;
973 
974 	ASSERT3U(ncq, >, nvme->n_cq_count);
975 
976 	cq = nvme->n_cq;
977 	cq_count = nvme->n_cq_count;
978 
979 	nvme->n_cq = kmem_zalloc(sizeof (*nvme->n_cq) * ncq, KM_SLEEP);
980 	nvme->n_cq_count = ncq;
981 
982 	for (i = 0; i < cq_count; i++)
983 		nvme->n_cq[i] = cq[i];
984 
985 	for (; i < nvme->n_cq_count; i++)
986 		if (nvme_alloc_cq(nvme, nentry, &nvme->n_cq[i], i, nthr) !=
987 		    DDI_SUCCESS)
988 			goto fail;
989 
990 	if (cq != NULL)
991 		kmem_free(cq, sizeof (*cq) * cq_count);
992 
993 	return (DDI_SUCCESS);
994 
995 fail:
996 	nvme_destroy_cq_array(nvme, cq_count);
997 	/*
998 	 * Restore the original array
999 	 */
1000 	nvme->n_cq_count = cq_count;
1001 	nvme->n_cq = cq;
1002 
1003 	return (DDI_FAILURE);
1004 }
1005 
1006 static int
1007 nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp,
1008     uint_t idx)
1009 {
1010 	nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP);
1011 	uint_t cq_idx;
1012 
1013 	mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER,
1014 	    DDI_INTR_PRI(nvme->n_intr_pri));
1015 
1016 	/*
1017 	 * The NVMe spec defines that a full queue has one empty (unused) slot;
1018 	 * initialize the semaphore accordingly.
1019 	 */
1020 	sema_init(&qp->nq_sema, nentry - 1, NULL, SEMA_DRIVER, NULL);
1021 
1022 	if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t),
1023 	    DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS)
1024 		goto fail;
1025 
1026 	/*
1027 	 * idx == 0 is adminq, those above 0 are shared io completion queues.
1028 	 */
1029 	cq_idx = idx == 0 ? 0 : 1 + (idx - 1) % (nvme->n_cq_count - 1);
1030 	qp->nq_cq = nvme->n_cq[cq_idx];
1031 	qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp;
1032 	qp->nq_nentry = nentry;
1033 
1034 	qp->nq_sqtdbl = NVME_REG_SQTDBL(nvme, idx);
1035 
1036 	qp->nq_cmd = kmem_zalloc(sizeof (nvme_cmd_t *) * nentry, KM_SLEEP);
1037 	qp->nq_next_cmd = 0;
1038 
1039 	*nqp = qp;
1040 	return (DDI_SUCCESS);
1041 
1042 fail:
1043 	nvme_free_qpair(qp);
1044 	*nqp = NULL;
1045 
1046 	return (DDI_FAILURE);
1047 }
1048 
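/*
 * Allocate a command from the cache and initialize nc_mutex and nc_cv, which
 * are used for synchronously executed admin commands.
 */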
1049 static nvme_cmd_t *
1050 nvme_alloc_cmd(nvme_t *nvme, int kmflag)
1051 {
1052 	nvme_cmd_t *cmd = kmem_cache_alloc(nvme_cmd_cache, kmflag);
1053 
1054 	if (cmd == NULL)
1055 		return (cmd);
1056 
1057 	bzero(cmd, sizeof (nvme_cmd_t));
1058 
1059 	cmd->nc_nvme = nvme;
1060 
1061 	mutex_init(&cmd->nc_mutex, NULL, MUTEX_DRIVER,
1062 	    DDI_INTR_PRI(nvme->n_intr_pri));
1063 	cv_init(&cmd->nc_cv, NULL, CV_DRIVER, NULL);
1064 
1065 	return (cmd);
1066 }
1067 
1068 static void
1069 nvme_free_cmd(nvme_cmd_t *cmd)
1070 {
1071 	/* Don't free commands on the lost commands list. */
1072 	if (list_link_active(&cmd->nc_list))
1073 		return;
1074 
1075 	if (cmd->nc_dma) {
1076 		nvme_free_dma(cmd->nc_dma);
1077 		cmd->nc_dma = NULL;
1078 	}
1079 
1080 	if (cmd->nc_prp) {
1081 		kmem_cache_free(cmd->nc_nvme->n_prp_cache, cmd->nc_prp);
1082 		cmd->nc_prp = NULL;
1083 	}
1084 
1085 	cv_destroy(&cmd->nc_cv);
1086 	mutex_destroy(&cmd->nc_mutex);
1087 
1088 	kmem_cache_free(nvme_cmd_cache, cmd);
1089 }
1090 
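/*
 * Submit an admin command, blocking on the queue semaphore if the queue is
 * currently full.
 */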
1091 static void
1092 nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
1093 {
1094 	sema_p(&qp->nq_sema);
1095 	nvme_submit_cmd_common(qp, cmd);
1096 }
1097 
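/*
 * Submit an I/O command without blocking. Returns EIO if the controller is
 * dead and EAGAIN if the queue is full.
 */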
1098 static int
1099 nvme_submit_io_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
1100 {
1101 	if (cmd->nc_nvme->n_dead) {
1102 		return (EIO);
1103 	}
1104 
1105 	if (sema_tryp(&qp->nq_sema) == 0)
1106 		return (EAGAIN);
1107 
1108 	nvme_submit_cmd_common(qp, cmd);
1109 	return (0);
1110 }
1111 
1112 static void
1113 nvme_submit_cmd_common(nvme_qpair_t *qp, nvme_cmd_t *cmd)
1114 {
1115 	nvme_reg_sqtdbl_t tail = { 0 };
1116 
1117 	mutex_enter(&qp->nq_mutex);
1118 	cmd->nc_completed = B_FALSE;
1119 
1120 	/*
1121 	 * Now that we hold the queue pair lock, we must check whether or not
1122 	 * the controller has been listed as dead (e.g. was removed due to
1123 	 * hotplug). This is necessary as otherwise we could race with
1124 	 * nvme_remove_callback(). Because this has not been enqueued, we don't
1125 	 * call nvme_unqueue_cmd(), which is why we must manually decrement the
1126 	 * semaphore.
1127 	 */
1128 	if (cmd->nc_nvme->n_dead) {
1129 		taskq_dispatch_ent(qp->nq_cq->ncq_cmd_taskq, cmd->nc_callback,
1130 		    cmd, TQ_NOSLEEP, &cmd->nc_tqent);
1131 		sema_v(&qp->nq_sema);
1132 		mutex_exit(&qp->nq_mutex);
1133 		return;
1134 	}
1135 
1136 	/*
1137 	 * Try to insert the cmd into the active cmd array at the nq_next_cmd
1138 	 * slot. If the slot is already occupied advance to the next slot and
1139 	 * try again. This can happen for long running commands like async event
1140 	 * requests.
1141 	 */
1142 	while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
1143 		qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
1144 	qp->nq_cmd[qp->nq_next_cmd] = cmd;
1145 
1146 	qp->nq_active_cmds++;
1147 
1148 	cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;
1149 	bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t));
1150 	(void) ddi_dma_sync(qp->nq_sqdma->nd_dmah,
1151 	    sizeof (nvme_sqe_t) * qp->nq_sqtail,
1152 	    sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV);
1153 	qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
1154 
1155 	tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
1156 	nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);
1157 
1158 	mutex_exit(&qp->nq_mutex);
1159 }
1160 
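/*
 * Remove the command with the given CID from the queue pair's active command
 * array and release its queue slot. The caller must hold nq_mutex.
 */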
1161 static nvme_cmd_t *
1162 nvme_unqueue_cmd(nvme_t *nvme, nvme_qpair_t *qp, int cid)
1163 {
1164 	nvme_cmd_t *cmd;
1165 
1166 	ASSERT(mutex_owned(&qp->nq_mutex));
1167 	ASSERT3S(cid, <, qp->nq_nentry);
1168 
1169 	cmd = qp->nq_cmd[cid];
1170 	qp->nq_cmd[cid] = NULL;
1171 	ASSERT3U(qp->nq_active_cmds, >, 0);
1172 	qp->nq_active_cmds--;
1173 	sema_v(&qp->nq_sema);
1174 
1175 	ASSERT3P(cmd, !=, NULL);
1176 	ASSERT3P(cmd->nc_nvme, ==, nvme);
1177 	ASSERT3S(cmd->nc_sqe.sqe_cid, ==, cid);
1178 
1179 	return (cmd);
1180 }
1181 
1182 /*
1183  * Get the command tied to the next completed cqe and bump along completion
1184  * queue head counter.
1185  */
1186 static nvme_cmd_t *
1187 nvme_get_completed(nvme_t *nvme, nvme_cq_t *cq)
1188 {
1189 	nvme_qpair_t *qp;
1190 	nvme_cqe_t *cqe;
1191 	nvme_cmd_t *cmd;
1192 
1193 	ASSERT(mutex_owned(&cq->ncq_mutex));
1194 
1195 	cqe = &cq->ncq_cq[cq->ncq_head];
1196 
1197 	/* Check phase tag of CQE. Hardware inverts it for new entries. */
1198 	if (cqe->cqe_sf.sf_p == cq->ncq_phase)
1199 		return (NULL);
1200 
1201 	qp = nvme->n_ioq[cqe->cqe_sqid];
1202 
1203 	mutex_enter(&qp->nq_mutex);
1204 	cmd = nvme_unqueue_cmd(nvme, qp, cqe->cqe_cid);
1205 	mutex_exit(&qp->nq_mutex);
1206 
1207 	ASSERT(cmd->nc_sqid == cqe->cqe_sqid);
1208 	bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));
1209 
1210 	qp->nq_sqhead = cqe->cqe_sqhd;
1211 
1212 	cq->ncq_head = (cq->ncq_head + 1) % cq->ncq_nentry;
1213 
1214 	/* Toggle phase on wrap-around. */
1215 	if (cq->ncq_head == 0)
1216 		cq->ncq_phase = cq->ncq_phase ? 0 : 1;
1217 
1218 	return (cmd);
1219 }
1220 
1221 /*
1222  * Process all completed commands on the io completion queue.
1223  */
1224 static uint_t
1225 nvme_process_iocq(nvme_t *nvme, nvme_cq_t *cq)
1226 {
1227 	nvme_reg_cqhdbl_t head = { 0 };
1228 	nvme_cmd_t *cmd;
1229 	uint_t completed = 0;
1230 
1231 	if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) !=
1232 	    DDI_SUCCESS)
1233 		dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s",
1234 		    __func__);
1235 
1236 	mutex_enter(&cq->ncq_mutex);
1237 
1238 	while ((cmd = nvme_get_completed(nvme, cq)) != NULL) {
1239 		taskq_dispatch_ent(cq->ncq_cmd_taskq, cmd->nc_callback, cmd,
1240 		    TQ_NOSLEEP, &cmd->nc_tqent);
1241 
1242 		completed++;
1243 	}
1244 
1245 	if (completed > 0) {
1246 		/*
1247 		 * Update the completion queue head doorbell.
1248 		 */
1249 		head.b.cqhdbl_cqh = cq->ncq_head;
1250 		nvme_put32(nvme, cq->ncq_hdbl, head.r);
1251 	}
1252 
1253 	mutex_exit(&cq->ncq_mutex);
1254 
1255 	return (completed);
1256 }
1257 
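/*
 * Retrieve a single completed command from the queue pair's completion queue
 * and update the completion queue head doorbell. This is used during polled
 * I/O, when interrupts are not available.
 */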
1258 static nvme_cmd_t *
1259 nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
1260 {
1261 	nvme_cq_t *cq = qp->nq_cq;
1262 	nvme_reg_cqhdbl_t head = { 0 };
1263 	nvme_cmd_t *cmd;
1264 
1265 	if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) !=
1266 	    DDI_SUCCESS)
1267 		dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s",
1268 		    __func__);
1269 
1270 	mutex_enter(&cq->ncq_mutex);
1271 
1272 	if ((cmd = nvme_get_completed(nvme, cq)) != NULL) {
1273 		head.b.cqhdbl_cqh = cq->ncq_head;
1274 		nvme_put32(nvme, cq->ncq_hdbl, head.r);
1275 	}
1276 
1277 	mutex_exit(&cq->ncq_mutex);
1278 
1279 	return (cmd);
1280 }
1281 
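/*
 * Command status check routines: map NVMe completion status codes to errnos.
 * Status codes indicating a driver bug cause a panic; fatal errors mark the
 * controller dead and notify FMA.
 */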
1282 static int
1283 nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
1284 {
1285 	nvme_cqe_t *cqe = &cmd->nc_cqe;
1286 
1287 	dev_err(cmd->nc_nvme->n_dip, CE_WARN,
1288 	    "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
1289 	    "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
1290 	    cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
1291 	    cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
1292 
1293 	if (cmd->nc_xfer != NULL)
1294 		bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
1295 
1296 	if (cmd->nc_nvme->n_strict_version) {
1297 		cmd->nc_nvme->n_dead = B_TRUE;
1298 		ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
1299 	}
1300 
1301 	return (EIO);
1302 }
1303 
1304 static int
1305 nvme_check_vendor_cmd_status(nvme_cmd_t *cmd)
1306 {
1307 	nvme_cqe_t *cqe = &cmd->nc_cqe;
1308 
1309 	dev_err(cmd->nc_nvme->n_dip, CE_WARN,
1310 	    "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
1311 	    "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
1312 	    cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
1313 	    cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
1314 	if (!cmd->nc_nvme->n_ignore_unknown_vendor_status) {
1315 		cmd->nc_nvme->n_dead = B_TRUE;
1316 		ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
1317 	}
1318 
1319 	return (EIO);
1320 }
1321 
1322 static int
1323 nvme_check_integrity_cmd_status(nvme_cmd_t *cmd)
1324 {
1325 	nvme_cqe_t *cqe = &cmd->nc_cqe;
1326 
1327 	switch (cqe->cqe_sf.sf_sc) {
1328 	case NVME_CQE_SC_INT_NVM_WRITE:
1329 		/* write fail */
1330 		/* TODO: post ereport */
1331 		if (cmd->nc_xfer != NULL)
1332 			bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
1333 		return (EIO);
1334 
1335 	case NVME_CQE_SC_INT_NVM_READ:
1336 		/* read fail */
1337 		/* TODO: post ereport */
1338 		if (cmd->nc_xfer != NULL)
1339 			bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
1340 		return (EIO);
1341 
1342 	default:
1343 		return (nvme_check_unknown_cmd_status(cmd));
1344 	}
1345 }
1346 
1347 static int
1348 nvme_check_generic_cmd_status(nvme_cmd_t *cmd)
1349 {
1350 	nvme_cqe_t *cqe = &cmd->nc_cqe;
1351 
1352 	switch (cqe->cqe_sf.sf_sc) {
1353 	case NVME_CQE_SC_GEN_SUCCESS:
1354 		return (0);
1355 
1356 	/*
1357 	 * Errors indicating a bug in the driver should cause a panic.
1358 	 */
1359 	case NVME_CQE_SC_GEN_INV_OPC:
1360 		/* Invalid Command Opcode */
1361 		if (!cmd->nc_dontpanic)
1362 			dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
1363 			    "programming error: invalid opcode in cmd %p",
1364 			    (void *)cmd);
1365 		return (EINVAL);
1366 
1367 	case NVME_CQE_SC_GEN_INV_FLD:
1368 		/* Invalid Field in Command */
1369 		if (!cmd->nc_dontpanic)
1370 			dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
1371 			    "programming error: invalid field in cmd %p",
1372 			    (void *)cmd);
1373 		return (EIO);
1374 
1375 	case NVME_CQE_SC_GEN_ID_CNFL:
1376 		/* Command ID Conflict */
1377 		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1378 		    "cmd ID conflict in cmd %p", (void *)cmd);
1379 		return (0);
1380 
1381 	case NVME_CQE_SC_GEN_INV_NS:
1382 		/* Invalid Namespace or Format */
1383 		if (!cmd->nc_dontpanic)
1384 			dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
1385 			    "programming error: invalid NS/format in cmd %p",
1386 			    (void *)cmd);
1387 		return (EINVAL);
1388 
1389 	case NVME_CQE_SC_GEN_NVM_LBA_RANGE:
1390 		/* LBA Out Of Range */
1391 		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1392 		    "LBA out of range in cmd %p", (void *)cmd);
1393 		return (0);
1394 
1395 	/*
1396 	 * Non-fatal errors, handle gracefully.
1397 	 */
1398 	case NVME_CQE_SC_GEN_DATA_XFR_ERR:
1399 		/* Data Transfer Error (DMA) */
1400 		/* TODO: post ereport */
1401 		atomic_inc_32(&cmd->nc_nvme->n_data_xfr_err);
1402 		if (cmd->nc_xfer != NULL)
1403 			bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
1404 		return (EIO);
1405 
1406 	case NVME_CQE_SC_GEN_INTERNAL_ERR:
1407 		/*
1408 		 * Internal Error. The spec (v1.0, section 4.5.1.2) says
1409 		 * detailed error information is returned as async event,
1410 		 * so we pretty much ignore the error here and handle it
1411 		 * in the async event handler.
1412 		 */
1413 		atomic_inc_32(&cmd->nc_nvme->n_internal_err);
1414 		if (cmd->nc_xfer != NULL)
1415 			bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
1416 		return (EIO);
1417 
1418 	case NVME_CQE_SC_GEN_ABORT_REQUEST:
1419 		/*
1420 		 * Command Abort Requested. This normally happens only when a
1421 		 * command times out.
1422 		 */
1423 		/* TODO: post ereport or change blkdev to handle this? */
1424 		atomic_inc_32(&cmd->nc_nvme->n_abort_rq_err);
1425 		return (ECANCELED);
1426 
1427 	case NVME_CQE_SC_GEN_ABORT_PWRLOSS:
1428 		/* Command Aborted due to Power Loss Notification */
1429 		ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
1430 		cmd->nc_nvme->n_dead = B_TRUE;
1431 		return (EIO);
1432 
1433 	case NVME_CQE_SC_GEN_ABORT_SQ_DEL:
1434 		/* Command Aborted due to SQ Deletion */
1435 		atomic_inc_32(&cmd->nc_nvme->n_abort_sq_del);
1436 		return (EIO);
1437 
1438 	case NVME_CQE_SC_GEN_NVM_CAP_EXC:
1439 		/* Capacity Exceeded */
1440 		atomic_inc_32(&cmd->nc_nvme->n_nvm_cap_exc);
1441 		if (cmd->nc_xfer != NULL)
1442 			bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
1443 		return (EIO);
1444 
1445 	case NVME_CQE_SC_GEN_NVM_NS_NOTRDY:
1446 		/* Namespace Not Ready */
1447 		atomic_inc_32(&cmd->nc_nvme->n_nvm_ns_notrdy);
1448 		if (cmd->nc_xfer != NULL)
1449 			bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
1450 		return (EIO);
1451 
1452 	default:
1453 		return (nvme_check_unknown_cmd_status(cmd));
1454 	}
1455 }
1456 
1457 static int
1458 nvme_check_specific_cmd_status(nvme_cmd_t *cmd)
1459 {
1460 	nvme_cqe_t *cqe = &cmd->nc_cqe;
1461 
1462 	switch (cqe->cqe_sf.sf_sc) {
1463 	case NVME_CQE_SC_SPC_INV_CQ:
1464 		/* Completion Queue Invalid */
1465 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE);
1466 		atomic_inc_32(&cmd->nc_nvme->n_inv_cq_err);
1467 		return (EINVAL);
1468 
1469 	case NVME_CQE_SC_SPC_INV_QID:
1470 		/* Invalid Queue Identifier */
1471 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE ||
1472 		    cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_SQUEUE ||
1473 		    cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE ||
1474 		    cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE);
1475 		atomic_inc_32(&cmd->nc_nvme->n_inv_qid_err);
1476 		return (EINVAL);
1477 
1478 	case NVME_CQE_SC_SPC_MAX_QSZ_EXC:
1479 		/* Max Queue Size Exceeded */
1480 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE ||
1481 		    cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE);
1482 		atomic_inc_32(&cmd->nc_nvme->n_max_qsz_exc);
1483 		return (EINVAL);
1484 
1485 	case NVME_CQE_SC_SPC_ABRT_CMD_EXC:
1486 		/* Abort Command Limit Exceeded */
1487 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT);
1488 		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1489 		    "abort command limit exceeded in cmd %p", (void *)cmd);
1490 		return (0);
1491 
1492 	case NVME_CQE_SC_SPC_ASYNC_EVREQ_EXC:
1493 		/* Async Event Request Limit Exceeded */
1494 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ASYNC_EVENT);
1495 		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
1496 		    "async event request limit exceeded in cmd %p",
1497 		    (void *)cmd);
1498 		return (0);
1499 
1500 	case NVME_CQE_SC_SPC_INV_INT_VECT:
1501 		/* Invalid Interrupt Vector */
1502 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE);
1503 		atomic_inc_32(&cmd->nc_nvme->n_inv_int_vect);
1504 		return (EINVAL);
1505 
1506 	case NVME_CQE_SC_SPC_INV_LOG_PAGE:
1507 		/* Invalid Log Page */
1508 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_GET_LOG_PAGE);
1509 		atomic_inc_32(&cmd->nc_nvme->n_inv_log_page);
1510 		return (EINVAL);
1511 
1512 	case NVME_CQE_SC_SPC_INV_FORMAT:
1513 		/* Invalid Format */
1514 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_FORMAT);
1515 		atomic_inc_32(&cmd->nc_nvme->n_inv_format);
1516 		if (cmd->nc_xfer != NULL)
1517 			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
1518 		return (EINVAL);
1519 
1520 	case NVME_CQE_SC_SPC_INV_Q_DEL:
1521 		/* Invalid Queue Deletion */
1522 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE);
1523 		atomic_inc_32(&cmd->nc_nvme->n_inv_q_del);
1524 		return (EINVAL);
1525 
1526 	case NVME_CQE_SC_SPC_NVM_CNFL_ATTR:
1527 		/* Conflicting Attributes */
1528 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_DSET_MGMT ||
1529 		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ ||
1530 		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
1531 		atomic_inc_32(&cmd->nc_nvme->n_cnfl_attr);
1532 		if (cmd->nc_xfer != NULL)
1533 			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
1534 		return (EINVAL);
1535 
1536 	case NVME_CQE_SC_SPC_NVM_INV_PROT:
1537 		/* Invalid Protection Information */
1538 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_COMPARE ||
1539 		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ ||
1540 		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
1541 		atomic_inc_32(&cmd->nc_nvme->n_inv_prot);
1542 		if (cmd->nc_xfer != NULL)
1543 			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
1544 		return (EINVAL);
1545 
1546 	case NVME_CQE_SC_SPC_NVM_READONLY:
1547 		/* Write to Read Only Range */
1548 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
1549 		atomic_inc_32(&cmd->nc_nvme->n_readonly);
1550 		if (cmd->nc_xfer != NULL)
1551 			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
1552 		return (EROFS);
1553 
1554 	case NVME_CQE_SC_SPC_INV_FW_SLOT:
1555 		/* Invalid Firmware Slot */
1556 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
1557 		return (EINVAL);
1558 
1559 	case NVME_CQE_SC_SPC_INV_FW_IMG:
1560 		/* Invalid Firmware Image */
1561 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
1562 		return (EINVAL);
1563 
1564 	case NVME_CQE_SC_SPC_FW_RESET:
1565 		/* Conventional Reset Required */
1566 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
1567 		return (0);
1568 
1569 	case NVME_CQE_SC_SPC_FW_NSSR:
1570 		/* NVMe Subsystem Reset Required */
1571 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
1572 		return (0);
1573 
1574 	case NVME_CQE_SC_SPC_FW_NEXT_RESET:
1575 		/* Activation Requires Reset */
1576 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
1577 		return (0);
1578 
1579 	case NVME_CQE_SC_SPC_FW_MTFA:
1580 		/* Activation Requires Maximum Time Violation */
1581 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
1582 		return (EAGAIN);
1583 
1584 	case NVME_CQE_SC_SPC_FW_PROHIBITED:
1585 		/* Activation Prohibited */
1586 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_ACTIVATE);
1587 		return (EINVAL);
1588 
1589 	case NVME_CQE_SC_SPC_FW_OVERLAP:
1590 		/* Overlapping Firmware Ranges */
1591 		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_FW_IMAGE_LOAD);
1592 		return (EINVAL);
1593 
1594 	default:
1595 		return (nvme_check_unknown_cmd_status(cmd));
1596 	}
1597 }
1598 
1599 static inline int
1600 nvme_check_cmd_status(nvme_cmd_t *cmd)
1601 {
1602 	nvme_cqe_t *cqe = &cmd->nc_cqe;
1603 
1604 	/*
1605 	 * Take a shortcut if the controller is dead, or if
1606 	 * command status indicates no error.
1607 	 */
1608 	if (cmd->nc_nvme->n_dead)
1609 		return (EIO);
1610 
1611 	if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
1612 	    cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS)
1613 		return (0);
1614 
1615 	if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC)
1616 		return (nvme_check_generic_cmd_status(cmd));
1617 	else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC)
1618 		return (nvme_check_specific_cmd_status(cmd));
1619 	else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY)
1620 		return (nvme_check_integrity_cmd_status(cmd));
1621 	else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR)
1622 		return (nvme_check_vendor_cmd_status(cmd));
1623 
1624 	return (nvme_check_unknown_cmd_status(cmd));
1625 }
1626 
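/*
 * Issue an ABORT admin command for the given command. The number of
 * concurrently outstanding ABORTs is limited by n_abort_sema.
 */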
1627 static int
1628 nvme_abort_cmd(nvme_cmd_t *abort_cmd, uint_t sec)
1629 {
1630 	nvme_t *nvme = abort_cmd->nc_nvme;
1631 	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1632 	nvme_abort_cmd_t ac = { 0 };
1633 	int ret = 0;
1634 
1635 	sema_p(&nvme->n_abort_sema);
1636 
1637 	ac.b.ac_cid = abort_cmd->nc_sqe.sqe_cid;
1638 	ac.b.ac_sqid = abort_cmd->nc_sqid;
1639 
1640 	cmd->nc_sqid = 0;
1641 	cmd->nc_sqe.sqe_opc = NVME_OPC_ABORT;
1642 	cmd->nc_callback = nvme_wakeup_cmd;
1643 	cmd->nc_sqe.sqe_cdw10 = ac.r;
1644 
1645 	/*
1646 	 * Send the ABORT to the hardware. The ABORT command will return _after_
1647 	 * the aborted command has completed (aborted or otherwise), but since
1648 	 * we still hold the aborted command's mutex its callback hasn't been
1649 	 * processed yet.
1650 	 */
1651 	nvme_admin_cmd(cmd, sec);
1652 	sema_v(&nvme->n_abort_sema);
1653 
1654 	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
1655 		dev_err(nvme->n_dip, CE_WARN,
1656 		    "!ABORT failed with sct = %x, sc = %x",
1657 		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
1658 		atomic_inc_32(&nvme->n_abort_failed);
1659 	} else {
1660 		dev_err(nvme->n_dip, CE_WARN,
1661 		    "!ABORT of command %d/%d %ssuccessful",
1662 		    abort_cmd->nc_sqe.sqe_cid, abort_cmd->nc_sqid,
1663 		    cmd->nc_cqe.cqe_dw0 & 1 ? "un" : "");
1664 		if ((cmd->nc_cqe.cqe_dw0 & 1) == 0)
1665 			atomic_inc_32(&nvme->n_cmd_aborted);
1666 	}
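	/*
	 * Note: per the NVMe spec, bit 0 of the ABORT completion's dword 0 is
	 * set when the target command could not be aborted, hence the "un"
	 * prefix above when that bit is 1.
	 */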
1667 
1668 	nvme_free_cmd(cmd);
1669 	return (ret);
1670 }
1671 
1672 /*
1673  * nvme_wait_cmd -- wait for command completion or timeout
1674  *
1675  * In case of a serious error or a timeout of the abort command the hardware
1676  * will be declared dead and FMA will be notified.
1677  */
1678 static void
1679 nvme_wait_cmd(nvme_cmd_t *cmd, uint_t sec)
1680 {
1681 	clock_t timeout = ddi_get_lbolt() + drv_usectohz(sec * MICROSEC);
1682 	nvme_t *nvme = cmd->nc_nvme;
1683 	nvme_reg_csts_t csts;
1684 	nvme_qpair_t *qp;
1685 
1686 	ASSERT(mutex_owned(&cmd->nc_mutex));
1687 
1688 	while (!cmd->nc_completed) {
1689 		if (cv_timedwait(&cmd->nc_cv, &cmd->nc_mutex, timeout) == -1)
1690 			break;
1691 	}
1692 
1693 	if (cmd->nc_completed)
1694 		return;
1695 
1696 	/*
1697 	 * The command timed out.
1698 	 *
1699 	 * Check controller for fatal status, any errors associated with the
1700 	 * register or DMA handle, or for a double timeout (abort command timed
1701 	 * out). If necessary log a warning and call FMA.
1702 	 */
1703 	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
1704 	dev_err(nvme->n_dip, CE_WARN, "!command %d/%d timeout, "
1705 	    "OPC = %x, CFS = %d", cmd->nc_sqe.sqe_cid, cmd->nc_sqid,
1706 	    cmd->nc_sqe.sqe_opc, csts.b.csts_cfs);
1707 	atomic_inc_32(&nvme->n_cmd_timeout);
1708 
1709 	if (csts.b.csts_cfs ||
1710 	    nvme_check_regs_hdl(nvme) ||
1711 	    nvme_check_dma_hdl(cmd->nc_dma) ||
1712 	    cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT) {
1713 		ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
1714 		nvme->n_dead = B_TRUE;
1715 	} else if (nvme_abort_cmd(cmd, sec) == 0) {
1716 		/*
1717 		 * If the abort succeeded the command should complete
1718 		 * immediately with an appropriate status.
1719 		 */
1720 		while (!cmd->nc_completed)
1721 			cv_wait(&cmd->nc_cv, &cmd->nc_mutex);
1722 
1723 		return;
1724 	}
1725 
1726 	qp = nvme->n_ioq[cmd->nc_sqid];
1727 
1728 	mutex_enter(&qp->nq_mutex);
1729 	(void) nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid);
1730 	mutex_exit(&qp->nq_mutex);
1731 
1732 	/*
1733 	 * As we don't know what the presumed dead hardware might still do with
1734 	 * the DMA memory, we'll put the command on the lost commands list if it
1735 	 * has any DMA memory.
1736 	 */
1737 	if (cmd->nc_dma != NULL) {
1738 		mutex_enter(&nvme_lc_mutex);
1739 		list_insert_head(&nvme_lost_cmds, cmd);
1740 		mutex_exit(&nvme_lc_mutex);
1741 	}
1742 }
1743 
1744 static void
1745 nvme_wakeup_cmd(void *arg)
1746 {
1747 	nvme_cmd_t *cmd = arg;
1748 
1749 	mutex_enter(&cmd->nc_mutex);
1750 	cmd->nc_completed = B_TRUE;
1751 	cv_signal(&cmd->nc_cv);
1752 	mutex_exit(&cmd->nc_mutex);
1753 }
1754 
1755 static void
1756 nvme_async_event_task(void *arg)
1757 {
1758 	nvme_cmd_t *cmd = arg;
1759 	nvme_t *nvme = cmd->nc_nvme;
1760 	nvme_error_log_entry_t *error_log = NULL;
1761 	nvme_health_log_t *health_log = NULL;
1762 	size_t logsize = 0;
1763 	nvme_async_event_t event;
1764 
1765 	/*
1766 	 * Check for errors associated with the async request itself. The only
1767 	 * command-specific error is "async event limit exceeded", which
1768 	 * indicates a programming error in the driver and causes a panic in
1769 	 * nvme_check_cmd_status().
1770 	 *
1771 	 * Other possible errors are various scenarios where the async request
1772 	 * was aborted, or internal errors in the device. Internal errors are
1773 	 * reported to FMA; the command aborts need no special handling here.
1774 	 *
1775 	 * Finally, at least qemu's nvme emulation does not support async
1776 	 * events and will return NVME_CQE_SC_GEN_INV_OPC | DNR. In that case
1777 	 * we avoid posting any further async event requests.
1778 	 */
1779 
1780 	if (nvme_check_cmd_status(cmd) != 0) {
1781 		dev_err(cmd->nc_nvme->n_dip, CE_WARN,
1782 		    "!async event request returned failure, sct = %x, "
1783 		    "sc = %x, dnr = %d, m = %d", cmd->nc_cqe.cqe_sf.sf_sct,
1784 		    cmd->nc_cqe.cqe_sf.sf_sc, cmd->nc_cqe.cqe_sf.sf_dnr,
1785 		    cmd->nc_cqe.cqe_sf.sf_m);
1786 
1787 		if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
1788 		    cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INTERNAL_ERR) {
1789 			cmd->nc_nvme->n_dead = B_TRUE;
1790 			ddi_fm_service_impact(cmd->nc_nvme->n_dip,
1791 			    DDI_SERVICE_LOST);
1792 		}
1793 
1794 		if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
1795 		    cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_OPC &&
1796 		    cmd->nc_cqe.cqe_sf.sf_dnr == 1) {
1797 			nvme->n_async_event_supported = B_FALSE;
1798 		}
1799 
1800 		nvme_free_cmd(cmd);
1801 		return;
1802 	}
1803 
1804 
1805 	event.r = cmd->nc_cqe.cqe_dw0;
1806 
1807 	/* Clear CQE and re-submit the async request. */
1808 	bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t));
1809 	nvme_submit_admin_cmd(nvme->n_adminq, cmd);
1810 
1811 	switch (event.b.ae_type) {
1812 	case NVME_ASYNC_TYPE_ERROR:
1813 		if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) {
1814 			(void) nvme_get_logpage(nvme, B_FALSE,
1815 			    (void **)&error_log, &logsize, event.b.ae_logpage);
1816 		} else {
1817 			dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
1818 			    "async event reply: %d", event.b.ae_logpage);
1819 			atomic_inc_32(&nvme->n_wrong_logpage);
1820 		}
1821 
1822 		switch (event.b.ae_info) {
1823 		case NVME_ASYNC_ERROR_INV_SQ:
1824 			dev_err(nvme->n_dip, CE_PANIC, "programming error: "
1825 			    "invalid submission queue");
1826 			return;
1827 
1828 		case NVME_ASYNC_ERROR_INV_DBL:
1829 			dev_err(nvme->n_dip, CE_PANIC, "programming error: "
1830 			    "invalid doorbell write value");
1831 			return;
1832 
1833 		case NVME_ASYNC_ERROR_DIAGFAIL:
1834 			dev_err(nvme->n_dip, CE_WARN, "!diagnostic failure");
1835 			ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
1836 			nvme->n_dead = B_TRUE;
1837 			atomic_inc_32(&nvme->n_diagfail_event);
1838 			break;
1839 
1840 		case NVME_ASYNC_ERROR_PERSISTENT:
1841 			dev_err(nvme->n_dip, CE_WARN, "!persistent internal "
1842 			    "device error");
1843 			ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
1844 			nvme->n_dead = B_TRUE;
1845 			atomic_inc_32(&nvme->n_persistent_event);
1846 			break;
1847 
1848 		case NVME_ASYNC_ERROR_TRANSIENT:
1849 			dev_err(nvme->n_dip, CE_WARN, "!transient internal "
1850 			    "device error");
1851 			/* TODO: send ereport */
1852 			atomic_inc_32(&nvme->n_transient_event);
1853 			break;
1854 
1855 		case NVME_ASYNC_ERROR_FW_LOAD:
1856 			dev_err(nvme->n_dip, CE_WARN,
1857 			    "!firmware image load error");
1858 			atomic_inc_32(&nvme->n_fw_load_event);
1859 			break;
1860 		}
1861 		break;
1862 
1863 	case NVME_ASYNC_TYPE_HEALTH:
1864 		if (event.b.ae_logpage == NVME_LOGPAGE_HEALTH) {
1865 			(void) nvme_get_logpage(nvme, B_FALSE,
1866 			    (void **)&health_log, &logsize, event.b.ae_logpage,
1867 			    -1);
1868 		} else {
1869 			dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
1870 			    "async event reply: %d", event.b.ae_logpage);
1871 			atomic_inc_32(&nvme->n_wrong_logpage);
1872 		}
1873 
1874 		switch (event.b.ae_info) {
1875 		case NVME_ASYNC_HEALTH_RELIABILITY:
1876 			dev_err(nvme->n_dip, CE_WARN,
1877 			    "!device reliability compromised");
1878 			/* TODO: send ereport */
1879 			atomic_inc_32(&nvme->n_reliability_event);
1880 			break;
1881 
1882 		case NVME_ASYNC_HEALTH_TEMPERATURE:
1883 			dev_err(nvme->n_dip, CE_WARN,
1884 			    "!temperature above threshold");
1885 			/* TODO: send ereport */
1886 			atomic_inc_32(&nvme->n_temperature_event);
1887 			break;
1888 
1889 		case NVME_ASYNC_HEALTH_SPARE:
1890 			dev_err(nvme->n_dip, CE_WARN,
1891 			    "!spare space below threshold");
1892 			/* TODO: send ereport */
1893 			atomic_inc_32(&nvme->n_spare_event);
1894 			break;
1895 		}
1896 		break;
1897 
1898 	case NVME_ASYNC_TYPE_VENDOR:
1899 		dev_err(nvme->n_dip, CE_WARN, "!vendor specific async event "
1900 		    "received, info = %x, logpage = %x", event.b.ae_info,
1901 		    event.b.ae_logpage);
1902 		atomic_inc_32(&nvme->n_vendor_event);
1903 		break;
1904 
1905 	default:
1906 		dev_err(nvme->n_dip, CE_WARN, "!unknown async event received, "
1907 		    "type = %x, info = %x, logpage = %x", event.b.ae_type,
1908 		    event.b.ae_info, event.b.ae_logpage);
1909 		atomic_inc_32(&nvme->n_unknown_event);
1910 		break;
1911 	}
1912 
1913 	if (error_log)
1914 		kmem_free(error_log, logsize);
1915 
1916 	if (health_log)
1917 		kmem_free(health_log, logsize);
1918 }
1919 
1920 static void
1921 nvme_admin_cmd(nvme_cmd_t *cmd, int sec)
1922 {
1923 	mutex_enter(&cmd->nc_mutex);
1924 	nvme_submit_admin_cmd(cmd->nc_nvme->n_adminq, cmd);
1925 	nvme_wait_cmd(cmd, sec);
1926 	mutex_exit(&cmd->nc_mutex);
1927 }
1928 
1929 static void
1930 nvme_async_event(nvme_t *nvme)
1931 {
1932 	nvme_cmd_t *cmd;
1933 
1934 	cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1935 	cmd->nc_sqid = 0;
1936 	cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT;
1937 	cmd->nc_callback = nvme_async_event_task;
1938 	cmd->nc_dontpanic = B_TRUE;
1939 
1940 	nvme_submit_admin_cmd(nvme->n_adminq, cmd);
1941 }
1942 
1943 static int
1944 nvme_format_nvm(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t lbaf,
1945     boolean_t ms, uint8_t pi, boolean_t pil, uint8_t ses)
1946 {
1947 	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1948 	nvme_format_nvm_t format_nvm = { 0 };
1949 	int ret;
1950 
1951 	format_nvm.b.fm_lbaf = lbaf & 0xf;
1952 	format_nvm.b.fm_ms = ms ? 1 : 0;
1953 	format_nvm.b.fm_pi = pi & 0x7;
1954 	format_nvm.b.fm_pil = pil ? 1 : 0;
1955 	format_nvm.b.fm_ses = ses & 0x7;
1956 
1957 	cmd->nc_sqid = 0;
1958 	cmd->nc_callback = nvme_wakeup_cmd;
1959 	cmd->nc_sqe.sqe_nsid = nsid;
1960 	cmd->nc_sqe.sqe_opc = NVME_OPC_NVM_FORMAT;
1961 	cmd->nc_sqe.sqe_cdw10 = format_nvm.r;
1962 
1963 	/*
1964 	 * Some devices like Samsung SM951 don't allow formatting of all
1965 	 * namespaces in one command. Handle that gracefully.
1966 	 */
1967 	if (nsid == (uint32_t)-1)
1968 		cmd->nc_dontpanic = B_TRUE;
1969 	/*
1970 	 * If this format request was initiated by the user, then don't allow a
1971 	 * programmer error to panic the system.
1972 	 */
1973 	if (user)
1974 		cmd->nc_dontpanic = B_TRUE;
1975 
1976 	nvme_admin_cmd(cmd, nvme_format_cmd_timeout);
1977 
1978 	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
1979 		dev_err(nvme->n_dip, CE_WARN,
1980 		    "!FORMAT failed with sct = %x, sc = %x",
1981 		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
1982 	}
1983 
1984 	nvme_free_cmd(cmd);
1985 	return (ret);
1986 }
1987 
1988 static int
1989 nvme_get_logpage(nvme_t *nvme, boolean_t user, void **buf, size_t *bufsize,
1990     uint8_t logpage, ...)
1991 {
1992 	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1993 	nvme_getlogpage_t getlogpage = { 0 };
1994 	va_list ap;
1995 	int ret;
1996 
1997 	va_start(ap, logpage);
1998 
1999 	cmd->nc_sqid = 0;
2000 	cmd->nc_callback = nvme_wakeup_cmd;
2001 	cmd->nc_sqe.sqe_opc = NVME_OPC_GET_LOG_PAGE;
2002 
2003 	if (user)
2004 		cmd->nc_dontpanic = B_TRUE;
2005 
2006 	getlogpage.b.lp_lid = logpage;
2007 
2008 	switch (logpage) {
2009 	case NVME_LOGPAGE_ERROR:
2010 		cmd->nc_sqe.sqe_nsid = (uint32_t)-1;
2011 		/*
2012 		 * The GET LOG PAGE command can use at most 2 pages to return
2013 		 * data; PRP lists are not supported.
2014 		 */
2015 		*bufsize = MIN(2 * nvme->n_pagesize,
2016 		    nvme->n_error_log_len * sizeof (nvme_error_log_entry_t));
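		/*
		 * A rough sketch of the resulting size, assuming the common
		 * 4k page and the 64 byte error log entry defined by the
		 * spec: two pages hold at most 128 entries, so larger error
		 * logs are simply truncated to what fits into PRP1 and PRP2.
		 */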
2017 		break;
2018 
2019 	case NVME_LOGPAGE_HEALTH:
2020 		cmd->nc_sqe.sqe_nsid = va_arg(ap, uint32_t);
2021 		*bufsize = sizeof (nvme_health_log_t);
2022 		break;
2023 
2024 	case NVME_LOGPAGE_FWSLOT:
2025 		cmd->nc_sqe.sqe_nsid = (uint32_t)-1;
2026 		*bufsize = sizeof (nvme_fwslot_log_t);
2027 		break;
2028 
2029 	default:
2030 		dev_err(nvme->n_dip, CE_WARN, "!unknown log page requested: %d",
2031 		    logpage);
2032 		atomic_inc_32(&nvme->n_unknown_logpage);
2033 		ret = EINVAL;
2034 		goto fail;
2035 	}
2036 
2037 	va_end(ap);
2038 
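	/*
	 * NUMD is a zero-based dword count, hence the "- 1" below; e.g. a
	 * 512 byte log page is requested as 127 dwords.
	 */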
2039 	getlogpage.b.lp_numd = *bufsize / sizeof (uint32_t) - 1;
2040 
2041 	cmd->nc_sqe.sqe_cdw10 = getlogpage.r;
2042 
2043 	if (nvme_zalloc_dma(nvme, *bufsize,
2044 	    DDI_DMA_READ, &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
2045 		dev_err(nvme->n_dip, CE_WARN,
2046 		    "!nvme_zalloc_dma failed for GET LOG PAGE");
2047 		ret = ENOMEM;
2048 		goto fail;
2049 	}
2050 
2051 	if (cmd->nc_dma->nd_ncookie > 2) {
2052 		dev_err(nvme->n_dip, CE_WARN,
2053 		    "!too many DMA cookies for GET LOG PAGE");
2054 		atomic_inc_32(&nvme->n_too_many_cookies);
2055 		ret = ENOMEM;
2056 		goto fail;
2057 	}
2058 
2059 	cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
2060 	if (cmd->nc_dma->nd_ncookie > 1) {
2061 		ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
2062 		    &cmd->nc_dma->nd_cookie);
2063 		cmd->nc_sqe.sqe_dptr.d_prp[1] =
2064 		    cmd->nc_dma->nd_cookie.dmac_laddress;
2065 	}
2066 
2067 	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2068 
2069 	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
2070 		dev_err(nvme->n_dip, CE_WARN,
2071 		    "!GET LOG PAGE failed with sct = %x, sc = %x",
2072 		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
2073 		goto fail;
2074 	}
2075 
2076 	*buf = kmem_alloc(*bufsize, KM_SLEEP);
2077 	bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize);
2078 
2079 fail:
2080 	nvme_free_cmd(cmd);
2081 
2082 	return (ret);
2083 }
2084 
2085 static int
2086 nvme_identify(nvme_t *nvme, boolean_t user, uint32_t nsid, void **buf)
2087 {
2088 	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2089 	int ret;
2090 
2091 	if (buf == NULL)
2092 		return (EINVAL);
2093 
2094 	cmd->nc_sqid = 0;
2095 	cmd->nc_callback = nvme_wakeup_cmd;
2096 	cmd->nc_sqe.sqe_opc = NVME_OPC_IDENTIFY;
2097 	cmd->nc_sqe.sqe_nsid = nsid;
2098 	cmd->nc_sqe.sqe_cdw10 = nsid ? NVME_IDENTIFY_NSID : NVME_IDENTIFY_CTRL;
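	/*
	 * cdw10 carries the CNS value; per NVMe 1.2 a CNS of 0 returns the
	 * namespace data structure for the namespace in NSID, while 1 returns
	 * the controller data structure.
	 */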
2099 
2100 	if (nvme_zalloc_dma(nvme, NVME_IDENTIFY_BUFSIZE, DDI_DMA_READ,
2101 	    &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
2102 		dev_err(nvme->n_dip, CE_WARN,
2103 		    "!nvme_zalloc_dma failed for IDENTIFY");
2104 		ret = ENOMEM;
2105 		goto fail;
2106 	}
2107 
2108 	if (cmd->nc_dma->nd_ncookie > 2) {
2109 		dev_err(nvme->n_dip, CE_WARN,
2110 		    "!too many DMA cookies for IDENTIFY");
2111 		atomic_inc_32(&nvme->n_too_many_cookies);
2112 		ret = ENOMEM;
2113 		goto fail;
2114 	}
2115 
2116 	cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
2117 	if (cmd->nc_dma->nd_ncookie > 1) {
2118 		ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
2119 		    &cmd->nc_dma->nd_cookie);
2120 		cmd->nc_sqe.sqe_dptr.d_prp[1] =
2121 		    cmd->nc_dma->nd_cookie.dmac_laddress;
2122 	}
2123 
2124 	if (user)
2125 		cmd->nc_dontpanic = B_TRUE;
2126 
2127 	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2128 
2129 	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
2130 		dev_err(nvme->n_dip, CE_WARN,
2131 		    "!IDENTIFY failed with sct = %x, sc = %x",
2132 		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
2133 		goto fail;
2134 	}
2135 
2136 	*buf = kmem_alloc(NVME_IDENTIFY_BUFSIZE, KM_SLEEP);
2137 	bcopy(cmd->nc_dma->nd_memp, *buf, NVME_IDENTIFY_BUFSIZE);
2138 
2139 fail:
2140 	nvme_free_cmd(cmd);
2141 
2142 	return (ret);
2143 }
2144 
2145 static int
2146 nvme_set_features(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t feature,
2147     uint32_t val, uint32_t *res)
2148 {
2149 	_NOTE(ARGUNUSED(nsid));
2150 	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2151 	int ret = EINVAL;
2152 
2153 	ASSERT(res != NULL);
2154 
2155 	cmd->nc_sqid = 0;
2156 	cmd->nc_callback = nvme_wakeup_cmd;
2157 	cmd->nc_sqe.sqe_opc = NVME_OPC_SET_FEATURES;
2158 	cmd->nc_sqe.sqe_cdw10 = feature;
2159 	cmd->nc_sqe.sqe_cdw11 = val;
2160 
2161 	if (user)
2162 		cmd->nc_dontpanic = B_TRUE;
2163 
2164 	switch (feature) {
2165 	case NVME_FEAT_WRITE_CACHE:
2166 		if (!nvme->n_write_cache_present)
2167 			goto fail;
2168 		break;
2169 
2170 	case NVME_FEAT_NQUEUES:
2171 		break;
2172 
2173 	default:
2174 		goto fail;
2175 	}
2176 
2177 	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2178 
2179 	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
2180 		dev_err(nvme->n_dip, CE_WARN,
2181 		    "!SET FEATURES %d failed with sct = %x, sc = %x",
2182 		    feature, cmd->nc_cqe.cqe_sf.sf_sct,
2183 		    cmd->nc_cqe.cqe_sf.sf_sc);
2184 		goto fail;
2185 	}
2186 
2187 	*res = cmd->nc_cqe.cqe_dw0;
2188 
2189 fail:
2190 	nvme_free_cmd(cmd);
2191 	return (ret);
2192 }
2193 
2194 static int
2195 nvme_get_features(nvme_t *nvme, boolean_t user, uint32_t nsid, uint8_t feature,
2196     uint32_t *res, void **buf, size_t *bufsize)
2197 {
2198 	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2199 	int ret = EINVAL;
2200 
2201 	ASSERT(res != NULL);
2202 
2203 	if (bufsize != NULL)
2204 		*bufsize = 0;
2205 
2206 	cmd->nc_sqid = 0;
2207 	cmd->nc_callback = nvme_wakeup_cmd;
2208 	cmd->nc_sqe.sqe_opc = NVME_OPC_GET_FEATURES;
2209 	cmd->nc_sqe.sqe_cdw10 = feature;
2210 	cmd->nc_sqe.sqe_cdw11 = *res;
2211 
2212 	/*
2213 	 * For some of the optional features there doesn't seem to be a way of
2214 	 * detecting whether they are supported other than using them.  This
2215 	 * will cause an "Invalid Field in Command" error, which is normally
2216 	 * considered a programming error.  Set the nc_dontpanic flag to
2217 	 * override the panic in nvme_check_generic_cmd_status().
2218 	 */
2219 	switch (feature) {
2220 	case NVME_FEAT_ARBITRATION:
2221 	case NVME_FEAT_POWER_MGMT:
2222 	case NVME_FEAT_TEMPERATURE:
2223 	case NVME_FEAT_ERROR:
2224 	case NVME_FEAT_NQUEUES:
2225 	case NVME_FEAT_INTR_COAL:
2226 	case NVME_FEAT_INTR_VECT:
2227 	case NVME_FEAT_WRITE_ATOM:
2228 	case NVME_FEAT_ASYNC_EVENT:
2229 		break;
2230 
2231 	case NVME_FEAT_WRITE_CACHE:
2232 		if (!nvme->n_write_cache_present)
2233 			goto fail;
2234 		break;
2235 
2236 	case NVME_FEAT_LBA_RANGE:
2237 		if (!nvme->n_lba_range_supported)
2238 			goto fail;
2239 
2240 		cmd->nc_dontpanic = B_TRUE;
2241 		cmd->nc_sqe.sqe_nsid = nsid;
2242 		ASSERT(bufsize != NULL);
2243 		*bufsize = NVME_LBA_RANGE_BUFSIZE;
2244 		break;
2245 
2246 	case NVME_FEAT_AUTO_PST:
2247 		if (!nvme->n_auto_pst_supported)
2248 			goto fail;
2249 
2250 		ASSERT(bufsize != NULL);
2251 		*bufsize = NVME_AUTO_PST_BUFSIZE;
2252 		break;
2253 
2254 	case NVME_FEAT_PROGRESS:
2255 		if (!nvme->n_progress_supported)
2256 			goto fail;
2257 
2258 		cmd->nc_dontpanic = B_TRUE;
2259 		break;
2260 
2261 	default:
2262 		goto fail;
2263 	}
2264 
2265 	if (user)
2266 		cmd->nc_dontpanic = B_TRUE;
2267 
2268 	if (bufsize != NULL && *bufsize != 0) {
2269 		if (nvme_zalloc_dma(nvme, *bufsize, DDI_DMA_READ,
2270 		    &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
2271 			dev_err(nvme->n_dip, CE_WARN,
2272 			    "!nvme_zalloc_dma failed for GET FEATURES");
2273 			ret = ENOMEM;
2274 			goto fail;
2275 		}
2276 
2277 		if (cmd->nc_dma->nd_ncookie > 2) {
2278 			dev_err(nvme->n_dip, CE_WARN,
2279 			    "!too many DMA cookies for GET FEATURES");
2280 			atomic_inc_32(&nvme->n_too_many_cookies);
2281 			ret = ENOMEM;
2282 			goto fail;
2283 		}
2284 
2285 		cmd->nc_sqe.sqe_dptr.d_prp[0] =
2286 		    cmd->nc_dma->nd_cookie.dmac_laddress;
2287 		if (cmd->nc_dma->nd_ncookie > 1) {
2288 			ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
2289 			    &cmd->nc_dma->nd_cookie);
2290 			cmd->nc_sqe.sqe_dptr.d_prp[1] =
2291 			    cmd->nc_dma->nd_cookie.dmac_laddress;
2292 		}
2293 	}
2294 
2295 	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2296 
2297 	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
2298 		boolean_t known = B_TRUE;
2299 
2300 		/* Check if this is an unsupported optional feature */
2301 		if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
2302 		    cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_FLD) {
2303 			switch (feature) {
2304 			case NVME_FEAT_LBA_RANGE:
2305 				nvme->n_lba_range_supported = B_FALSE;
2306 				break;
2307 			case NVME_FEAT_PROGRESS:
2308 				nvme->n_progress_supported = B_FALSE;
2309 				break;
2310 			default:
2311 				known = B_FALSE;
2312 				break;
2313 			}
2314 		} else {
2315 			known = B_FALSE;
2316 		}
2317 
2318 		/* Report the error otherwise */
2319 		if (!known) {
2320 			dev_err(nvme->n_dip, CE_WARN,
2321 			    "!GET FEATURES %d failed with sct = %x, sc = %x",
2322 			    feature, cmd->nc_cqe.cqe_sf.sf_sct,
2323 			    cmd->nc_cqe.cqe_sf.sf_sc);
2324 		}
2325 
2326 		goto fail;
2327 	}
2328 
2329 	if (bufsize != NULL && *bufsize != 0) {
2330 		ASSERT(buf != NULL);
2331 		*buf = kmem_alloc(*bufsize, KM_SLEEP);
2332 		bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize);
2333 	}
2334 
2335 	*res = cmd->nc_cqe.cqe_dw0;
2336 
2337 fail:
2338 	nvme_free_cmd(cmd);
2339 	return (ret);
2340 }
2341 
2342 static int
2343 nvme_write_cache_set(nvme_t *nvme, boolean_t enable)
2344 {
2345 	nvme_write_cache_t nwc = { 0 };
2346 
2347 	if (enable)
2348 		nwc.b.wc_wce = 1;
2349 
2350 	return (nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_WRITE_CACHE,
2351 	    nwc.r, &nwc.r));
2352 }
2353 
2354 static int
2355 nvme_set_nqueues(nvme_t *nvme)
2356 {
2357 	nvme_nqueues_t nq = { 0 };
2358 	int ret;
2359 
2360 	/*
2361 	 * The default is to allocate one completion queue per vector.
2362 	 */
2363 	if (nvme->n_completion_queues == -1)
2364 		nvme->n_completion_queues = nvme->n_intr_cnt;
2365 
2366 	/*
2367 	 * There is no point in having more completion queues than
2368 	 * interrupt vectors.
2369 	 */
2370 	nvme->n_completion_queues = MIN(nvme->n_completion_queues,
2371 	    nvme->n_intr_cnt);
2372 
2373 	/*
2374 	 * The default is to use one submission queue per completion queue.
2375 	 */
2376 	if (nvme->n_submission_queues == -1)
2377 		nvme->n_submission_queues = nvme->n_completion_queues;
2378 
2379 	/*
2380 	 * There is no point in having more completion queues than
2381 	 * submission queues.
2382 	 */
2383 	nvme->n_completion_queues = MIN(nvme->n_completion_queues,
2384 	    nvme->n_submission_queues);
2385 
2386 	ASSERT(nvme->n_submission_queues > 0);
2387 	ASSERT(nvme->n_completion_queues > 0);
2388 
2389 	nq.b.nq_nsq = nvme->n_submission_queues - 1;
2390 	nq.b.nq_ncq = nvme->n_completion_queues - 1;
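	/*
	 * The NSQ/NCQ fields of the Number of Queues feature are zero-based,
	 * as are the allocated counts the controller returns in dw0; e.g.
	 * requesting 8 submission queues is encoded as 7, and the "+ 1" below
	 * converts the returned values back to actual counts.
	 */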
2391 
2392 	ret = nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_NQUEUES, nq.r,
2393 	    &nq.r);
2394 
2395 	if (ret == 0) {
2396 		/*
2397 		 * Never use more than the requested number of queues.
2398 		 */
2399 		nvme->n_submission_queues = MIN(nvme->n_submission_queues,
2400 		    nq.b.nq_nsq + 1);
2401 		nvme->n_completion_queues = MIN(nvme->n_completion_queues,
2402 		    nq.b.nq_ncq + 1);
2403 	}
2404 
2405 	return (ret);
2406 }
2407 
2408 static int
2409 nvme_create_completion_queue(nvme_t *nvme, nvme_cq_t *cq)
2410 {
2411 	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2412 	nvme_create_queue_dw10_t dw10 = { 0 };
2413 	nvme_create_cq_dw11_t c_dw11 = { 0 };
2414 	int ret;
2415 
2416 	dw10.b.q_qid = cq->ncq_id;
2417 	dw10.b.q_qsize = cq->ncq_nentry - 1;
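	/*
	 * The QSIZE field is zero-based per the spec; e.g. a 1024-entry
	 * completion queue is encoded as 1023.
	 */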
2418 
2419 	c_dw11.b.cq_pc = 1;
2420 	c_dw11.b.cq_ien = 1;
2421 	c_dw11.b.cq_iv = cq->ncq_id % nvme->n_intr_cnt;
2422 
2423 	cmd->nc_sqid = 0;
2424 	cmd->nc_callback = nvme_wakeup_cmd;
2425 	cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_CQUEUE;
2426 	cmd->nc_sqe.sqe_cdw10 = dw10.r;
2427 	cmd->nc_sqe.sqe_cdw11 = c_dw11.r;
2428 	cmd->nc_sqe.sqe_dptr.d_prp[0] = cq->ncq_dma->nd_cookie.dmac_laddress;
2429 
2430 	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2431 
2432 	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
2433 		dev_err(nvme->n_dip, CE_WARN,
2434 		    "!CREATE CQUEUE failed with sct = %x, sc = %x",
2435 		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
2436 	}
2437 
2438 	nvme_free_cmd(cmd);
2439 
2440 	return (ret);
2441 }
2442 
2443 static int
2444 nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx)
2445 {
2446 	nvme_cq_t *cq = qp->nq_cq;
2447 	nvme_cmd_t *cmd;
2448 	nvme_create_queue_dw10_t dw10 = { 0 };
2449 	nvme_create_sq_dw11_t s_dw11 = { 0 };
2450 	int ret;
2451 
2452 	/*
2453 	 * It is possible to have more qpairs than completion queues,
2454 	 * and when the idx > ncq_id, that completion queue is shared
2455 	 * and has already been created.
2456 	 */
2457 	if (idx <= cq->ncq_id &&
2458 	    nvme_create_completion_queue(nvme, cq) != DDI_SUCCESS)
2459 		return (DDI_FAILURE);
2460 
2461 	dw10.b.q_qid = idx;
2462 	dw10.b.q_qsize = qp->nq_nentry - 1;
2463 
2464 	s_dw11.b.sq_pc = 1;
2465 	s_dw11.b.sq_cqid = cq->ncq_id;
2466 
2467 	cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2468 	cmd->nc_sqid = 0;
2469 	cmd->nc_callback = nvme_wakeup_cmd;
2470 	cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_SQUEUE;
2471 	cmd->nc_sqe.sqe_cdw10 = dw10.r;
2472 	cmd->nc_sqe.sqe_cdw11 = s_dw11.r;
2473 	cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_sqdma->nd_cookie.dmac_laddress;
2474 
2475 	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2476 
2477 	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
2478 		dev_err(nvme->n_dip, CE_WARN,
2479 		    "!CREATE SQUEUE failed with sct = %x, sc = %x",
2480 		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
2481 	}
2482 
2483 	nvme_free_cmd(cmd);
2484 
2485 	return (ret);
2486 }
2487 
2488 static boolean_t
2489 nvme_reset(nvme_t *nvme, boolean_t quiesce)
2490 {
2491 	nvme_reg_csts_t csts;
2492 	int i;
2493 
2494 	nvme_put32(nvme, NVME_REG_CC, 0);
2495 
2496 	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
2497 	if (csts.b.csts_rdy == 1) {
2498 		nvme_put32(nvme, NVME_REG_CC, 0);
2499 		for (i = 0; i != nvme->n_timeout * 10; i++) {
2500 			csts.r = nvme_get32(nvme, NVME_REG_CSTS);
2501 			if (csts.b.csts_rdy == 0)
2502 				break;
2503 
2504 			if (quiesce)
2505 				drv_usecwait(50000);
2506 			else
2507 				delay(drv_usectohz(50000));
2508 		}
2509 	}
2510 
2511 	nvme_put32(nvme, NVME_REG_AQA, 0);
2512 	nvme_put32(nvme, NVME_REG_ASQ, 0);
2513 	nvme_put32(nvme, NVME_REG_ACQ, 0);
2514 
2515 	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
2516 	return (csts.b.csts_rdy == 0 ? B_TRUE : B_FALSE);
2517 }
2518 
2519 static void
2520 nvme_shutdown(nvme_t *nvme, int mode, boolean_t quiesce)
2521 {
2522 	nvme_reg_cc_t cc;
2523 	nvme_reg_csts_t csts;
2524 	int i;
2525 
2526 	ASSERT(mode == NVME_CC_SHN_NORMAL || mode == NVME_CC_SHN_ABRUPT);
2527 
2528 	cc.r = nvme_get32(nvme, NVME_REG_CC);
2529 	cc.b.cc_shn = mode & 0x3;
2530 	nvme_put32(nvme, NVME_REG_CC, cc.r);
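	/*
	 * CC.SHN is a two-bit field: per the spec 01b requests a normal
	 * shutdown and 10b an abrupt one, which is why the mode is masked to
	 * its low two bits above.
	 */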
2531 
2532 	for (i = 0; i != 10; i++) {
2533 		csts.r = nvme_get32(nvme, NVME_REG_CSTS);
2534 		if (csts.b.csts_shst == NVME_CSTS_SHN_COMPLETE)
2535 			break;
2536 
2537 		if (quiesce)
2538 			drv_usecwait(100000);
2539 		else
2540 			delay(drv_usectohz(100000));
2541 	}
2542 }
2543 
2544 
2545 static void
2546 nvme_prepare_devid(nvme_t *nvme, uint32_t nsid)
2547 {
2548 	/*
2549 	 * Section 7.7 of the spec describes how to get a unique ID for
2550 	 * the controller: the vendor ID, the model name and the serial
2551 	 * number shall be unique when combined.
2552 	 *
2553 	 * If a namespace has no EUI64 we use the above and add the hex
2554 	 * namespace ID to get a unique ID for the namespace.
2555 	 */
2556 	char model[sizeof (nvme->n_idctl->id_model) + 1];
2557 	char serial[sizeof (nvme->n_idctl->id_serial) + 1];
2558 
2559 	bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model));
2560 	bcopy(nvme->n_idctl->id_serial, serial,
2561 	    sizeof (nvme->n_idctl->id_serial));
2562 
2563 	model[sizeof (nvme->n_idctl->id_model)] = '\0';
2564 	serial[sizeof (nvme->n_idctl->id_serial)] = '\0';
2565 
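	/*
	 * A hypothetical example of the resulting devid (values made up,
	 * padding in the id fields ignored): vid 0x8086, model "FOO" and
	 * serial "BAR" for nsid 1 would yield "8086-FOO-BAR-1".
	 */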
2566 	nvme->n_ns[nsid - 1].ns_devid = kmem_asprintf("%4X-%s-%s-%X",
2567 	    nvme->n_idctl->id_vid, model, serial, nsid);
2568 }
2569 
2570 static int
2571 nvme_init_ns(nvme_t *nvme, int nsid)
2572 {
2573 	nvme_namespace_t *ns = &nvme->n_ns[nsid - 1];
2574 	nvme_identify_nsid_t *idns;
2575 	boolean_t was_ignored;
2576 	int last_rp;
2577 
2578 	ns->ns_nvme = nvme;
2579 
2580 	if (nvme_identify(nvme, B_FALSE, nsid, (void **)&idns) != 0) {
2581 		dev_err(nvme->n_dip, CE_WARN,
2582 		    "!failed to identify namespace %d", nsid);
2583 		return (DDI_FAILURE);
2584 	}
2585 
2586 	ns->ns_idns = idns;
2587 	ns->ns_id = nsid;
2588 	ns->ns_block_count = idns->id_nsize;
2589 	ns->ns_block_size =
2590 	    1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads;
2591 	ns->ns_best_block_size = ns->ns_block_size;
2592 
2593 	/*
2594 	 * Get the EUI64 if present. Use it for devid and device node names.
2595 	 */
2596 	if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1))
2597 		bcopy(idns->id_eui64, ns->ns_eui64, sizeof (ns->ns_eui64));
2598 
2599 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
2600 	if (*(uint64_t *)ns->ns_eui64 != 0) {
2601 		uint8_t *eui64 = ns->ns_eui64;
2602 
2603 		(void) snprintf(ns->ns_name, sizeof (ns->ns_name),
2604 		    "%02x%02x%02x%02x%02x%02x%02x%02x",
2605 		    eui64[0], eui64[1], eui64[2], eui64[3],
2606 		    eui64[4], eui64[5], eui64[6], eui64[7]);
2607 	} else {
2608 		(void) snprintf(ns->ns_name, sizeof (ns->ns_name), "%d",
2609 		    ns->ns_id);
2610 
2611 		nvme_prepare_devid(nvme, ns->ns_id);
2612 	}
2613 
2614 	/*
2615 	 * Find the LBA format with no metadata and the best relative
2616 	 * performance. A value of 3 means "degraded", 0 is best.
2617 	 */
2618 	last_rp = 3;
2619 	for (int j = 0; j <= idns->id_nlbaf; j++) {
2620 		if (idns->id_lbaf[j].lbaf_lbads == 0)
2621 			break;
2622 		if (idns->id_lbaf[j].lbaf_ms != 0)
2623 			continue;
2624 		if (idns->id_lbaf[j].lbaf_rp >= last_rp)
2625 			continue;
2626 		last_rp = idns->id_lbaf[j].lbaf_rp;
2627 		ns->ns_best_block_size =
2628 		    1 << idns->id_lbaf[j].lbaf_lbads;
2629 	}
2630 
2631 	if (ns->ns_best_block_size < nvme->n_min_block_size)
2632 		ns->ns_best_block_size = nvme->n_min_block_size;
2633 
2634 	was_ignored = ns->ns_ignore;
2635 
2636 	/*
2637 	 * We currently don't support namespaces that use either:
2638 	 * - protection information
2639 	 * - illegal block size (< 512)
2640 	 */
2641 	if (idns->id_dps.dp_pinfo) {
2642 		dev_err(nvme->n_dip, CE_WARN,
2643 		    "!ignoring namespace %d, unsupported feature: "
2644 		    "pinfo = %d", nsid, idns->id_dps.dp_pinfo);
2645 		ns->ns_ignore = B_TRUE;
2646 	} else if (ns->ns_block_size < 512) {
2647 		dev_err(nvme->n_dip, CE_WARN,
2648 		    "!ignoring namespace %d, unsupported block size %"PRIu64,
2649 		    nsid, (uint64_t)ns->ns_block_size);
2650 		ns->ns_ignore = B_TRUE;
2651 	} else {
2652 		ns->ns_ignore = B_FALSE;
2653 	}
2654 
2655 	/*
2656 	 * Keep a count of namespaces which are attachable.
2657 	 * See comments in nvme_bd_driveinfo() to understand its effect.
2658 	 */
2659 	if (was_ignored) {
2660 		/*
2661 		 * Previously ignored, but now not. Count it.
2662 		 */
2663 		if (!ns->ns_ignore)
2664 			nvme->n_namespaces_attachable++;
2665 	} else {
2666 		/*
2667 		 * Wasn't ignored previously, but now needs to be.
2668 		 * Discount it.
2669 		 */
2670 		if (ns->ns_ignore)
2671 			nvme->n_namespaces_attachable--;
2672 	}
2673 
2674 	return (DDI_SUCCESS);
2675 }
2676 
2677 static int
2678 nvme_init(nvme_t *nvme)
2679 {
2680 	nvme_reg_cc_t cc = { 0 };
2681 	nvme_reg_aqa_t aqa = { 0 };
2682 	nvme_reg_asq_t asq = { 0 };
2683 	nvme_reg_acq_t acq = { 0 };
2684 	nvme_reg_cap_t cap;
2685 	nvme_reg_vs_t vs;
2686 	nvme_reg_csts_t csts;
2687 	int i = 0;
2688 	uint16_t nqueues;
2689 	uint_t tq_threads;
2690 	char model[sizeof (nvme->n_idctl->id_model) + 1];
2691 	char *vendor, *product;
2692 
2693 	/* Check controller version */
2694 	vs.r = nvme_get32(nvme, NVME_REG_VS);
2695 	nvme->n_version.v_major = vs.b.vs_mjr;
2696 	nvme->n_version.v_minor = vs.b.vs_mnr;
2697 	dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version %d.%d",
2698 	    nvme->n_version.v_major, nvme->n_version.v_minor);
2699 
2700 	if (nvme->n_version.v_major > nvme_version_major) {
2701 		dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.x",
2702 		    nvme_version_major);
2703 		if (nvme->n_strict_version)
2704 			goto fail;
2705 	}
2706 
2707 	/* retrieve controller configuration */
2708 	cap.r = nvme_get64(nvme, NVME_REG_CAP);
2709 
2710 	if ((cap.b.cap_css & NVME_CAP_CSS_NVM) == 0) {
2711 		dev_err(nvme->n_dip, CE_WARN,
2712 		    "!NVM command set not supported by hardware");
2713 		goto fail;
2714 	}
2715 
2716 	nvme->n_nssr_supported = cap.b.cap_nssrs;
2717 	nvme->n_doorbell_stride = 4 << cap.b.cap_dstrd;
2718 	nvme->n_timeout = cap.b.cap_to;
2719 	nvme->n_arbitration_mechanisms = cap.b.cap_ams;
2720 	nvme->n_cont_queues_reqd = cap.b.cap_cqr;
2721 	nvme->n_max_queue_entries = cap.b.cap_mqes + 1;
2722 
2723 	/*
2724 	 * The MPSMIN and MPSMAX fields in the CAP register use 0 to specify
2725 	 * the base page size of 4k (1<<12), so add 12 here to get the real
2726 	 * page size value.
2727 	 */
2728 	nvme->n_pageshift = MIN(MAX(cap.b.cap_mpsmin + 12, PAGESHIFT),
2729 	    cap.b.cap_mpsmax + 12);
2730 	nvme->n_pagesize = 1UL << (nvme->n_pageshift);
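	/*
	 * As a sketch of the common case: with CAP.MPSMIN = 0 and an x86
	 * PAGESHIFT of 12, n_pageshift is 12 and n_pagesize is 4k, the
	 * smallest page size the specification allows.
	 */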
2731 
2732 	/*
2733 	 * Set up Queue DMA to transfer at least 1 page-aligned page at a time.
2734 	 */
2735 	nvme->n_queue_dma_attr.dma_attr_align = nvme->n_pagesize;
2736 	nvme->n_queue_dma_attr.dma_attr_minxfer = nvme->n_pagesize;
2737 
2738 	/*
2739 	 * Set up PRP DMA to transfer 1 page-aligned page at a time.
2740 	 * Maxxfer may be increased after we have identified the controller limits.
2741 	 */
2742 	nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_pagesize;
2743 	nvme->n_prp_dma_attr.dma_attr_minxfer = nvme->n_pagesize;
2744 	nvme->n_prp_dma_attr.dma_attr_align = nvme->n_pagesize;
2745 	nvme->n_prp_dma_attr.dma_attr_seg = nvme->n_pagesize - 1;
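	/*
	 * Keeping dma_attr_seg at one page ensures no DMA cookie crosses a
	 * page boundary, matching what the PRP entries handed to the
	 * controller require.
	 */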
2746 
2747 	/*
2748 	 * Reset controller if it's still in ready state.
2749 	 */
2750 	if (nvme_reset(nvme, B_FALSE) == B_FALSE) {
2751 		dev_err(nvme->n_dip, CE_WARN, "!unable to reset controller");
2752 		ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
2753 		nvme->n_dead = B_TRUE;
2754 		goto fail;
2755 	}
2756 
2757 	/*
2758 	 * Create the cq array with one completion queue to be assigned
2759 	 * to the admin queue pair and a limited number of taskq threads (4).
2760 	 */
2761 	if (nvme_create_cq_array(nvme, 1, nvme->n_admin_queue_len, 4) !=
2762 	    DDI_SUCCESS) {
2763 		dev_err(nvme->n_dip, CE_WARN,
2764 		    "!failed to pre-allocate admin completion queue");
2765 		goto fail;
2766 	}
2767 	/*
2768 	 * Create the admin queue pair.
2769 	 */
2770 	if (nvme_alloc_qpair(nvme, nvme->n_admin_queue_len, &nvme->n_adminq, 0)
2771 	    != DDI_SUCCESS) {
2772 		dev_err(nvme->n_dip, CE_WARN,
2773 		    "!unable to allocate admin qpair");
2774 		goto fail;
2775 	}
2776 	nvme->n_ioq = kmem_alloc(sizeof (nvme_qpair_t *), KM_SLEEP);
2777 	nvme->n_ioq[0] = nvme->n_adminq;
2778 
2779 	nvme->n_progress |= NVME_ADMIN_QUEUE;
2780 
2781 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
2782 	    "admin-queue-len", nvme->n_admin_queue_len);
2783 
2784 	aqa.b.aqa_asqs = aqa.b.aqa_acqs = nvme->n_admin_queue_len - 1;
2785 	asq = nvme->n_adminq->nq_sqdma->nd_cookie.dmac_laddress;
2786 	acq = nvme->n_adminq->nq_cq->ncq_dma->nd_cookie.dmac_laddress;
2787 
2788 	ASSERT((asq & (nvme->n_pagesize - 1)) == 0);
2789 	ASSERT((acq & (nvme->n_pagesize - 1)) == 0);
2790 
2791 	nvme_put32(nvme, NVME_REG_AQA, aqa.r);
2792 	nvme_put64(nvme, NVME_REG_ASQ, asq);
2793 	nvme_put64(nvme, NVME_REG_ACQ, acq);
2794 
2795 	cc.b.cc_ams = 0;	/* use Round-Robin arbitration */
2796 	cc.b.cc_css = 0;	/* use NVM command set */
2797 	cc.b.cc_mps = nvme->n_pageshift - 12;
2798 	cc.b.cc_shn = 0;	/* no shutdown in progress */
2799 	cc.b.cc_en = 1;		/* enable controller */
2800 	cc.b.cc_iosqes = 6;	/* submission queue entry is 2^6 bytes long */
2801 	cc.b.cc_iocqes = 4;	/* completion queue entry is 2^4 bytes long */
2802 
2803 	nvme_put32(nvme, NVME_REG_CC, cc.r);
2804 
2805 	/*
2806 	 * Wait for the controller to become ready.
2807 	 */
2808 	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
2809 	if (csts.b.csts_rdy == 0) {
2810 		for (i = 0; i != nvme->n_timeout * 10; i++) {
2811 			delay(drv_usectohz(50000));
2812 			csts.r = nvme_get32(nvme, NVME_REG_CSTS);
2813 
2814 			if (csts.b.csts_cfs == 1) {
2815 				dev_err(nvme->n_dip, CE_WARN,
2816 				    "!controller fatal status at init");
2817 				ddi_fm_service_impact(nvme->n_dip,
2818 				    DDI_SERVICE_LOST);
2819 				nvme->n_dead = B_TRUE;
2820 				goto fail;
2821 			}
2822 
2823 			if (csts.b.csts_rdy == 1)
2824 				break;
2825 		}
2826 	}
2827 
2828 	if (csts.b.csts_rdy == 0) {
2829 		dev_err(nvme->n_dip, CE_WARN, "!controller not ready");
2830 		ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
2831 		nvme->n_dead = B_TRUE;
2832 		goto fail;
2833 	}
2834 
2835 	/*
2836 	 * Assume an abort command limit of 1. The semaphore is destroyed and
2837 	 * re-initialized below once the true abort command limit is known.
2838 	 */
2839 	sema_init(&nvme->n_abort_sema, 1, NULL, SEMA_DRIVER, NULL);
2840 
2841 	/*
2842 	 * Setup initial interrupt for admin queue.
2843 	 */
2844 	if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 1)
2845 	    != DDI_SUCCESS) &&
2846 	    (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 1)
2847 	    != DDI_SUCCESS) &&
2848 	    (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_FIXED, 1)
2849 	    != DDI_SUCCESS)) {
2850 		dev_err(nvme->n_dip, CE_WARN,
2851 		    "!failed to setup initial interrupt");
2852 		goto fail;
2853 	}
2854 
2855 	/*
2856 	 * Post an asynchronous event command to catch errors.
2857 	 * We assume that asynchronous events are supported as required by the
2858 	 * specification (Figure 40 in section 5 of NVMe 1.2).
2859 	 * However, since at least qemu does not follow the specification,
2860 	 * we need a mechanism to protect ourselves.
2861 	 */
2862 	nvme->n_async_event_supported = B_TRUE;
2863 	nvme_async_event(nvme);
2864 
2865 	/*
2866 	 * Identify Controller
2867 	 */
2868 	if (nvme_identify(nvme, B_FALSE, 0, (void **)&nvme->n_idctl) != 0) {
2869 		dev_err(nvme->n_dip, CE_WARN,
2870 		    "!failed to identify controller");
2871 		goto fail;
2872 	}
2873 
2874 	/*
2875 	 * Get Vendor & Product ID
2876 	 */
2877 	bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model));
2878 	model[sizeof (nvme->n_idctl->id_model)] = '\0';
2879 	sata_split_model(model, &vendor, &product);
2880 
2881 	if (vendor == NULL)
2882 		nvme->n_vendor = strdup("NVMe");
2883 	else
2884 		nvme->n_vendor = strdup(vendor);
2885 
2886 	nvme->n_product = strdup(product);
2887 
2888 	/*
2889 	 * Get controller limits.
2890 	 */
2891 	nvme->n_async_event_limit = MAX(NVME_MIN_ASYNC_EVENT_LIMIT,
2892 	    MIN(nvme->n_admin_queue_len / 10,
2893 	    MIN(nvme->n_idctl->id_aerl + 1, nvme->n_async_event_limit)));
2894 
2895 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
2896 	    "async-event-limit", nvme->n_async_event_limit);
2897 
2898 	nvme->n_abort_command_limit = nvme->n_idctl->id_acl + 1;
2899 
2900 	/*
2901 	 * Reinitialize the semaphore with the true abort command limit
2902 	 * supported by the hardware. It's not necessary to disable interrupts
2903 	 * as only command aborts use the semaphore, and no commands are
2904 	 * executed or aborted while we're here.
2905 	 */
2906 	sema_destroy(&nvme->n_abort_sema);
2907 	sema_init(&nvme->n_abort_sema, nvme->n_abort_command_limit - 1, NULL,
2908 	    SEMA_DRIVER, NULL);
2909 
2910 	nvme->n_progress |= NVME_CTRL_LIMITS;
2911 
2912 	if (nvme->n_idctl->id_mdts == 0)
2913 		nvme->n_max_data_transfer_size = nvme->n_pagesize * 65536;
2914 	else
2915 		nvme->n_max_data_transfer_size =
2916 		    1ull << (nvme->n_pageshift + nvme->n_idctl->id_mdts);
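	/*
	 * MDTS is a power-of-two multiplier of the page size. As a worked
	 * example, assuming 4k pages: MDTS = 5 yields a 128k limit, while
	 * MDTS = 0 means no limit per the spec and is capped above at
	 * n_pagesize * 65536 (256MB with 4k pages).
	 */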
2917 
2918 	nvme->n_error_log_len = nvme->n_idctl->id_elpe + 1;
2919 
2920 	/*
2921 	 * Limit n_max_data_transfer_size to what we can handle in one PRP list.
2922 	 * Chained PRPs are currently unsupported.
2923 	 *
2924 	 * This is a no-op on hardware which doesn't support a transfer size
2925 	 * big enough to require chained PRPs.
2926 	 */
2927 	nvme->n_max_data_transfer_size = MIN(nvme->n_max_data_transfer_size,
2928 	    (nvme->n_pagesize / sizeof (uint64_t) * nvme->n_pagesize));
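	/*
	 * Assuming the common 4k page size this works out to 512 PRP entries
	 * of 4k each, i.e. a 2MB ceiling per command; larger MPS values raise
	 * it accordingly.
	 */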
2929 
2930 	nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_max_data_transfer_size;
2931 
2932 	/*
2933 	 * Make sure the controller's minimum queue entry sizes are not larger,
2934 	 * and its maximum sizes not smaller, than the defaults we use.
2935 	 */
2936 
2937 	if (((1 << nvme->n_idctl->id_sqes.qes_min) > sizeof (nvme_sqe_t)) ||
2938 	    ((1 << nvme->n_idctl->id_sqes.qes_max) < sizeof (nvme_sqe_t)) ||
2939 	    ((1 << nvme->n_idctl->id_cqes.qes_min) > sizeof (nvme_cqe_t)) ||
2940 	    ((1 << nvme->n_idctl->id_cqes.qes_max) < sizeof (nvme_cqe_t)))
2941 		goto fail;
2942 
2943 	/*
2944 	 * Check for the presence of a Volatile Write Cache. If present,
2945 	 * enable or disable based on the value of the property
2946 	 * volatile-write-cache-enable (default is enabled).
2947 	 */
2948 	nvme->n_write_cache_present =
2949 	    nvme->n_idctl->id_vwc.vwc_present == 0 ? B_FALSE : B_TRUE;
2950 
2951 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
2952 	    "volatile-write-cache-present",
2953 	    nvme->n_write_cache_present ? 1 : 0);
2954 
2955 	if (!nvme->n_write_cache_present) {
2956 		nvme->n_write_cache_enabled = B_FALSE;
2957 	} else if (nvme_write_cache_set(nvme, nvme->n_write_cache_enabled)
2958 	    != 0) {
2959 		dev_err(nvme->n_dip, CE_WARN,
2960 		    "!failed to %sable volatile write cache",
2961 		    nvme->n_write_cache_enabled ? "en" : "dis");
2962 		/*
2963 		 * Assume the cache is (still) enabled.
2964 		 */
2965 		nvme->n_write_cache_enabled = B_TRUE;
2966 	}
2967 
2968 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
2969 	    "volatile-write-cache-enable",
2970 	    nvme->n_write_cache_enabled ? 1 : 0);
2971 
2972 	/*
2973 	 * Assume LBA Range Type feature is supported. If it isn't this
2974 	 * will be set to B_FALSE by nvme_get_features().
2975 	 */
2976 	nvme->n_lba_range_supported = B_TRUE;
2977 
2978 	/*
2979 	 * Check support for Autonomous Power State Transition.
2980 	 */
2981 	if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1))
2982 		nvme->n_auto_pst_supported =
2983 		    nvme->n_idctl->id_apsta.ap_sup == 0 ? B_FALSE : B_TRUE;
2984 
2985 	/*
2986 	 * Assume Software Progress Marker feature is supported.  If it isn't
2987 	 * this will be set to B_FALSE by nvme_get_features().
2988 	 */
2989 	nvme->n_progress_supported = B_TRUE;
2990 
2991 	/*
2992 	 * Identify Namespaces
2993 	 */
2994 	nvme->n_namespace_count = nvme->n_idctl->id_nn;
2995 
2996 	if (nvme->n_namespace_count == 0) {
2997 		dev_err(nvme->n_dip, CE_WARN,
2998 		    "!controllers without namespaces are not supported");
2999 		goto fail;
3000 	}
3001 
3002 	if (nvme->n_namespace_count > NVME_MINOR_MAX) {
3003 		dev_err(nvme->n_dip, CE_WARN,
3004 		    "!too many namespaces: %d, limiting to %d\n",
3005 		    nvme->n_namespace_count, NVME_MINOR_MAX);
3006 		nvme->n_namespace_count = NVME_MINOR_MAX;
3007 	}
3008 
3009 	nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) *
3010 	    nvme->n_namespace_count, KM_SLEEP);
3011 
3012 	for (i = 0; i != nvme->n_namespace_count; i++) {
3013 		mutex_init(&nvme->n_ns[i].ns_minor.nm_mutex, NULL, MUTEX_DRIVER,
3014 		    NULL);
3015 		nvme->n_ns[i].ns_ignore = B_TRUE;
3016 		if (nvme_init_ns(nvme, i + 1) != DDI_SUCCESS)
3017 			goto fail;
3018 	}
3019 
3020 	/*
3021 	 * Try to set up MSI/MSI-X interrupts.
3022 	 */
3023 	if ((nvme->n_intr_types & (DDI_INTR_TYPE_MSI | DDI_INTR_TYPE_MSIX))
3024 	    != 0) {
3025 		nvme_release_interrupts(nvme);
3026 
3027 		nqueues = MIN(UINT16_MAX, ncpus);
3028 
3029 		if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX,
3030 		    nqueues) != DDI_SUCCESS) &&
3031 		    (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI,
3032 		    nqueues) != DDI_SUCCESS)) {
3033 			dev_err(nvme->n_dip, CE_WARN,
3034 			    "!failed to setup MSI/MSI-X interrupts");
3035 			goto fail;
3036 		}
3037 	}
3038 
3039 	/*
3040 	 * Create I/O queue pairs.
3041 	 */
3042 
3043 	if (nvme_set_nqueues(nvme) != 0) {
3044 		dev_err(nvme->n_dip, CE_WARN,
3045 		    "!failed to set number of I/O queues to %d",
3046 		    nvme->n_intr_cnt);
3047 		goto fail;
3048 	}
3049 
3050 	/*
3051 	 * Reallocate I/O queue array
3052 	 */
3053 	kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *));
3054 	nvme->n_ioq = kmem_zalloc(sizeof (nvme_qpair_t *) *
3055 	    (nvme->n_submission_queues + 1), KM_SLEEP);
3056 	nvme->n_ioq[0] = nvme->n_adminq;
3057 
3058 	/*
3059 	 * There should always be at least as many submission queues
3060 	 * as completion queues.
3061 	 */
3062 	ASSERT(nvme->n_submission_queues >= nvme->n_completion_queues);
3063 
3064 	nvme->n_ioq_count = nvme->n_submission_queues;
3065 
3066 	nvme->n_io_squeue_len =
3067 	    MIN(nvme->n_io_squeue_len, nvme->n_max_queue_entries);
3068 
3069 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-squeue-len",
3070 	    nvme->n_io_squeue_len);
3071 
3072 	/*
3073 	 * Pre-allocate completion queues.
3074 	 * When there are the same number of submission and completion
3075 	 * queues there is no value in having a larger completion
3076 	 * queue length.
3077 	 */
3078 	if (nvme->n_submission_queues == nvme->n_completion_queues)
3079 		nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len,
3080 		    nvme->n_io_squeue_len);
3081 
3082 	nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len,
3083 	    nvme->n_max_queue_entries);
3084 
3085 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-cqueue-len",
3086 	    nvme->n_io_cqueue_len);
3087 
3088 	/*
3089 	 * Assign an equal number of taskq threads to each completion
3090 	 * queue, capping the total number of threads to the number
3091 	 * of CPUs.
3092 	 */
3093 	tq_threads = MIN(UINT16_MAX, ncpus) / nvme->n_completion_queues;
3094 
3095 	/*
3096 	 * In case the calculation above is zero, we need at least one
3097 	 * thread per completion queue.
3098 	 */
3099 	tq_threads = MAX(1, tq_threads);
3100 
3101 	if (nvme_create_cq_array(nvme, nvme->n_completion_queues + 1,
3102 	    nvme->n_io_cqueue_len, tq_threads) != DDI_SUCCESS) {
3103 		dev_err(nvme->n_dip, CE_WARN,
3104 		    "!failed to pre-allocate completion queues");
3105 		goto fail;
3106 	}
3107 
3108 	/*
3109 	 * If we use fewer completion queues than interrupt vectors, return
3110 	 * some of the interrupt vectors back to the system.
3111 	 */
3112 	if (nvme->n_completion_queues + 1 < nvme->n_intr_cnt) {
3113 		nvme_release_interrupts(nvme);
3114 
3115 		if (nvme_setup_interrupts(nvme, nvme->n_intr_type,
3116 		    nvme->n_completion_queues + 1) != DDI_SUCCESS) {
3117 			dev_err(nvme->n_dip, CE_WARN,
3118 			    "!failed to reduce number of interrupts");
3119 			goto fail;
3120 		}
3121 	}
3122 
3123 	/*
3124 	 * Alloc & register I/O queue pairs
3125 	 */
3126 
3127 	for (i = 1; i != nvme->n_ioq_count + 1; i++) {
3128 		if (nvme_alloc_qpair(nvme, nvme->n_io_squeue_len,
3129 		    &nvme->n_ioq[i], i) != DDI_SUCCESS) {
3130 			dev_err(nvme->n_dip, CE_WARN,
3131 			    "!unable to allocate I/O qpair %d", i);
3132 			goto fail;
3133 		}
3134 
3135 		if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i) != 0) {
3136 			dev_err(nvme->n_dip, CE_WARN,
3137 			    "!unable to create I/O qpair %d", i);
3138 			goto fail;
3139 		}
3140 	}
3141 
3142 	/*
3143 	 * Post more asynchronous event commands to reduce event reporting
3144 	 * latency as suggested by the spec.
3145 	 */
3146 	if (nvme->n_async_event_supported) {
3147 		for (i = 1; i != nvme->n_async_event_limit; i++)
3148 			nvme_async_event(nvme);
3149 	}
3150 
3151 	return (DDI_SUCCESS);
3152 
3153 fail:
3154 	(void) nvme_reset(nvme, B_FALSE);
3155 	return (DDI_FAILURE);
3156 }
3157 
3158 static uint_t
3159 nvme_intr(caddr_t arg1, caddr_t arg2)
3160 {
3161 	/*LINTED: E_PTR_BAD_CAST_ALIGN*/
3162 	nvme_t *nvme = (nvme_t *)arg1;
3163 	int inum = (int)(uintptr_t)arg2;
3164 	int ccnt = 0;
3165 	int qnum;
3166 
3167 	if (inum >= nvme->n_intr_cnt)
3168 		return (DDI_INTR_UNCLAIMED);
3169 
3170 	if (nvme->n_dead)
3171 		return (nvme->n_intr_type == DDI_INTR_TYPE_FIXED ?
3172 		    DDI_INTR_UNCLAIMED : DDI_INTR_CLAIMED);
3173 
3174 	/*
3175 	 * The interrupt vector a queue uses is calculated as queue_idx %
3176 	 * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array
3177 	 * in steps of n_intr_cnt to process all queues using this vector.
3178 	 */
3179 	for (qnum = inum;
3180 	    qnum < nvme->n_cq_count && nvme->n_cq[qnum] != NULL;
3181 	    qnum += nvme->n_intr_cnt) {
3182 		ccnt += nvme_process_iocq(nvme, nvme->n_cq[qnum]);
3183 	}
3184 
3185 	return (ccnt > 0 ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
3186 }
3187 
3188 static void
3189 nvme_release_interrupts(nvme_t *nvme)
3190 {
3191 	int i;
3192 
3193 	for (i = 0; i < nvme->n_intr_cnt; i++) {
3194 		if (nvme->n_inth[i] == NULL)
3195 			break;
3196 
3197 		if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK)
3198 			(void) ddi_intr_block_disable(&nvme->n_inth[i], 1);
3199 		else
3200 			(void) ddi_intr_disable(nvme->n_inth[i]);
3201 
3202 		(void) ddi_intr_remove_handler(nvme->n_inth[i]);
3203 		(void) ddi_intr_free(nvme->n_inth[i]);
3204 	}
3205 
3206 	kmem_free(nvme->n_inth, nvme->n_inth_sz);
3207 	nvme->n_inth = NULL;
3208 	nvme->n_inth_sz = 0;
3209 
3210 	nvme->n_progress &= ~NVME_INTERRUPTS;
3211 }
3212 
3213 static int
3214 nvme_setup_interrupts(nvme_t *nvme, int intr_type, int nqpairs)
3215 {
3216 	int nintrs, navail, count;
3217 	int ret;
3218 	int i;
3219 
3220 	if (nvme->n_intr_types == 0) {
3221 		ret = ddi_intr_get_supported_types(nvme->n_dip,
3222 		    &nvme->n_intr_types);
3223 		if (ret != DDI_SUCCESS) {
3224 			dev_err(nvme->n_dip, CE_WARN,
3225 			    "!%s: ddi_intr_get_supported types failed",
3226 			    __func__);
3227 			return (ret);
3228 		}
3229 #ifdef __x86
3230 		if (get_hwenv() == HW_VMWARE)
3231 			nvme->n_intr_types &= ~DDI_INTR_TYPE_MSIX;
3232 #endif
3233 	}
3234 
3235 	if ((nvme->n_intr_types & intr_type) == 0)
3236 		return (DDI_FAILURE);
3237 
3238 	ret = ddi_intr_get_nintrs(nvme->n_dip, intr_type, &nintrs);
3239 	if (ret != DDI_SUCCESS) {
3240 		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_nintrs failed",
3241 		    __func__);
3242 		return (ret);
3243 	}
3244 
3245 	ret = ddi_intr_get_navail(nvme->n_dip, intr_type, &navail);
3246 	if (ret != DDI_SUCCESS) {
3247 		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_navail failed",
3248 		    __func__);
3249 		return (ret);
3250 	}
3251 
3252 	/* We want at most one interrupt per queue pair. */
3253 	if (navail > nqpairs)
3254 		navail = nqpairs;
3255 
3256 	nvme->n_inth_sz = sizeof (ddi_intr_handle_t) * navail;
3257 	nvme->n_inth = kmem_zalloc(nvme->n_inth_sz, KM_SLEEP);
3258 
3259 	ret = ddi_intr_alloc(nvme->n_dip, nvme->n_inth, intr_type, 0, navail,
3260 	    &count, 0);
3261 	if (ret != DDI_SUCCESS) {
3262 		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_alloc failed",
3263 		    __func__);
3264 		goto fail;
3265 	}
3266 
3267 	nvme->n_intr_cnt = count;
3268 
3269 	ret = ddi_intr_get_pri(nvme->n_inth[0], &nvme->n_intr_pri);
3270 	if (ret != DDI_SUCCESS) {
3271 		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_pri failed",
3272 		    __func__);
3273 		goto fail;
3274 	}
3275 
3276 	for (i = 0; i < count; i++) {
3277 		ret = ddi_intr_add_handler(nvme->n_inth[i], nvme_intr,
3278 		    (void *)nvme, (void *)(uintptr_t)i);
3279 		if (ret != DDI_SUCCESS) {
3280 			dev_err(nvme->n_dip, CE_WARN,
3281 			    "!%s: ddi_intr_add_handler failed", __func__);
3282 			goto fail;
3283 		}
3284 	}
3285 
3286 	(void) ddi_intr_get_cap(nvme->n_inth[0], &nvme->n_intr_cap);
3287 
3288 	for (i = 0; i < count; i++) {
3289 		if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK)
3290 			ret = ddi_intr_block_enable(&nvme->n_inth[i], 1);
3291 		else
3292 			ret = ddi_intr_enable(nvme->n_inth[i]);
3293 
3294 		if (ret != DDI_SUCCESS) {
3295 			dev_err(nvme->n_dip, CE_WARN,
3296 			    "!%s: enabling interrupt %d failed", __func__, i);
3297 			goto fail;
3298 		}
3299 	}
3300 
3301 	nvme->n_intr_type = intr_type;
3302 
3303 	nvme->n_progress |= NVME_INTERRUPTS;
3304 
3305 	return (DDI_SUCCESS);
3306 
3307 fail:
3308 	nvme_release_interrupts(nvme);
3309 
3310 	return (ret);
3311 }
3312 
3313 static int
3314 nvme_fm_errcb(dev_info_t *dip, ddi_fm_error_t *fm_error, const void *arg)
3315 {
3316 	_NOTE(ARGUNUSED(arg));
3317 
3318 	pci_ereport_post(dip, fm_error, NULL);
3319 	return (fm_error->fme_status);
3320 }
3321 
3322 static void
3323 nvme_remove_callback(dev_info_t *dip, ddi_eventcookie_t cookie, void *a,
3324     void *b)
3325 {
3326 	nvme_t *nvme = a;
3327 
3328 	nvme->n_dead = B_TRUE;
3329 
3330 	/*
3331 	 * Fail all outstanding commands, including those in the admin queue
3332 	 * (queue 0).
3333 	 */
3334 	for (uint_t i = 0; i < nvme->n_ioq_count + 1; i++) {
3335 		nvme_qpair_t *qp = nvme->n_ioq[i];
3336 
3337 		mutex_enter(&qp->nq_mutex);
3338 		for (size_t j = 0; j < qp->nq_nentry; j++) {
3339 			nvme_cmd_t *cmd = qp->nq_cmd[j];
3340 			nvme_cmd_t *u_cmd;
3341 
3342 			if (cmd == NULL) {
3343 				continue;
3344 			}
3345 
3346 			/*
3347 			 * Since we have the queue lock held the entire time we
3348 			 * iterate over it, it's not possible for the queue to
3349 			 * change underneath us. Thus, we don't need to check
3350 			 * that the return value of nvme_unqueue_cmd matches the
3351 			 * requested cmd to unqueue.
3352 			 */
3353 			u_cmd = nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid);
3354 			taskq_dispatch_ent(qp->nq_cq->ncq_cmd_taskq,
3355 			    cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
3356 
3357 			ASSERT3P(u_cmd, ==, cmd);
3358 		}
3359 		mutex_exit(&qp->nq_mutex);
3360 	}
3361 }
3362 
3363 static int
3364 nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3365 {
3366 	nvme_t *nvme;
3367 	int instance;
3368 	int nregs;
3369 	off_t regsize;
3370 	int i;
3371 	char name[32];
3372 	bd_ops_t ops = nvme_bd_ops;
3373 
3374 	if (cmd != DDI_ATTACH)
3375 		return (DDI_FAILURE);
3376 
3377 	instance = ddi_get_instance(dip);
3378 
3379 	if (ddi_soft_state_zalloc(nvme_state, instance) != DDI_SUCCESS)
3380 		return (DDI_FAILURE);
3381 
3382 	nvme = ddi_get_soft_state(nvme_state, instance);
3383 	ddi_set_driver_private(dip, nvme);
3384 	nvme->n_dip = dip;
3385 
3386 	/* Set up event handlers for hot removal. */
3387 	if (ddi_get_eventcookie(nvme->n_dip, DDI_DEVI_REMOVE_EVENT,
3388 	    &nvme->n_rm_cookie) != DDI_SUCCESS) {
3389 		goto fail;
3390 	}
3391 	if (ddi_add_event_handler(nvme->n_dip, nvme->n_rm_cookie,
3392 	    nvme_remove_callback, nvme, &nvme->n_ev_rm_cb_id) !=
3393 	    DDI_SUCCESS) {
3394 		goto fail;
3395 	}
3396 
3397 	mutex_init(&nvme->n_minor.nm_mutex, NULL, MUTEX_DRIVER, NULL);
3398 
3399 	nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
3400 	    DDI_PROP_DONTPASS, "strict-version", 1) == 1 ? B_TRUE : B_FALSE;
3401 	nvme->n_ignore_unknown_vendor_status = ddi_prop_get_int(DDI_DEV_T_ANY,
3402 	    dip, DDI_PROP_DONTPASS, "ignore-unknown-vendor-status", 0) == 1 ?
3403 	    B_TRUE : B_FALSE;
3404 	nvme->n_admin_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
3405 	    DDI_PROP_DONTPASS, "admin-queue-len", NVME_DEFAULT_ADMIN_QUEUE_LEN);
3406 	nvme->n_io_squeue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
3407 	    DDI_PROP_DONTPASS, "io-squeue-len", NVME_DEFAULT_IO_QUEUE_LEN);
3408 	/*
3409 	 * Double up the default for completion queues in case of
3410 	 * queue sharing.
3411 	 */
3412 	nvme->n_io_cqueue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
3413 	    DDI_PROP_DONTPASS, "io-cqueue-len", 2 * NVME_DEFAULT_IO_QUEUE_LEN);
3414 	nvme->n_async_event_limit = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
3415 	    DDI_PROP_DONTPASS, "async-event-limit",
3416 	    NVME_DEFAULT_ASYNC_EVENT_LIMIT);
3417 	nvme->n_write_cache_enabled = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
3418 	    DDI_PROP_DONTPASS, "volatile-write-cache-enable", 1) != 0 ?
3419 	    B_TRUE : B_FALSE;
3420 	nvme->n_min_block_size = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
3421 	    DDI_PROP_DONTPASS, "min-phys-block-size",
3422 	    NVME_DEFAULT_MIN_BLOCK_SIZE);
3423 	nvme->n_submission_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
3424 	    DDI_PROP_DONTPASS, "max-submission-queues", -1);
3425 	nvme->n_completion_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
3426 	    DDI_PROP_DONTPASS, "max-completion-queues", -1);
3427 
3428 	if (!ISP2(nvme->n_min_block_size) ||
3429 	    (nvme->n_min_block_size < NVME_DEFAULT_MIN_BLOCK_SIZE)) {
3430 		dev_err(dip, CE_WARN, "!min-phys-block-size %s, "
3431 		    "using default %d", ISP2(nvme->n_min_block_size) ?
3432 		    "too low" : "not a power of 2",
3433 		    NVME_DEFAULT_MIN_BLOCK_SIZE);
3434 		nvme->n_min_block_size = NVME_DEFAULT_MIN_BLOCK_SIZE;
3435 	}
3436 
3437 	if (nvme->n_submission_queues != -1 &&
3438 	    (nvme->n_submission_queues < 1 ||
3439 	    nvme->n_submission_queues > UINT16_MAX)) {
3440 		dev_err(dip, CE_WARN, "!\"submission-queues\"=%d is not "
3441 		    "valid. Must be [1..%d]", nvme->n_submission_queues,
3442 		    UINT16_MAX);
3443 		nvme->n_submission_queues = -1;
3444 	}
3445 
3446 	if (nvme->n_completion_queues != -1 &&
3447 	    (nvme->n_completion_queues < 1 ||
3448 	    nvme->n_completion_queues > UINT16_MAX)) {
3449 		dev_err(dip, CE_WARN, "!\"completion-queues\"=%d is not "
3450 		    "valid. Must be [1..%d]", nvme->n_completion_queues,
3451 		    UINT16_MAX);
3452 		nvme->n_completion_queues = -1;
3453 	}
3454 
3455 	if (nvme->n_admin_queue_len < NVME_MIN_ADMIN_QUEUE_LEN)
3456 		nvme->n_admin_queue_len = NVME_MIN_ADMIN_QUEUE_LEN;
3457 	else if (nvme->n_admin_queue_len > NVME_MAX_ADMIN_QUEUE_LEN)
3458 		nvme->n_admin_queue_len = NVME_MAX_ADMIN_QUEUE_LEN;
3459 
3460 	if (nvme->n_io_squeue_len < NVME_MIN_IO_QUEUE_LEN)
3461 		nvme->n_io_squeue_len = NVME_MIN_IO_QUEUE_LEN;
3462 	if (nvme->n_io_cqueue_len < NVME_MIN_IO_QUEUE_LEN)
3463 		nvme->n_io_cqueue_len = NVME_MIN_IO_QUEUE_LEN;
3464 
3465 	if (nvme->n_async_event_limit < 1)
3466 		nvme->n_async_event_limit = NVME_DEFAULT_ASYNC_EVENT_LIMIT;
3467 
3468 	nvme->n_reg_acc_attr = nvme_reg_acc_attr;
3469 	nvme->n_queue_dma_attr = nvme_queue_dma_attr;
3470 	nvme->n_prp_dma_attr = nvme_prp_dma_attr;
3471 	nvme->n_sgl_dma_attr = nvme_sgl_dma_attr;
3472 
3473 	/*
3474 	 * Setup FMA support.
3475 	 */
3476 	nvme->n_fm_cap = ddi_getprop(DDI_DEV_T_ANY, dip,
3477 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "fm-capable",
3478 	    DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE |
3479 	    DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE);
3480 
3481 	ddi_fm_init(dip, &nvme->n_fm_cap, &nvme->n_fm_ibc);
3482 
3483 	if (nvme->n_fm_cap) {
3484 		if (nvme->n_fm_cap & DDI_FM_ACCCHK_CAPABLE)
3485 			nvme->n_reg_acc_attr.devacc_attr_access =
3486 			    DDI_FLAGERR_ACC;
3487 
3488 		if (nvme->n_fm_cap & DDI_FM_DMACHK_CAPABLE) {
3489 			nvme->n_prp_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
3490 			nvme->n_sgl_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
3491 		}
3492 
3493 		if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) ||
3494 		    DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
3495 			pci_ereport_setup(dip);
3496 
3497 		if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
3498 			ddi_fm_handler_register(dip, nvme_fm_errcb,
3499 			    (void *)nvme);
3500 	}
3501 
3502 	nvme->n_progress |= NVME_FMA_INIT;
3503 
3504 	/*
3505 	 * The spec defines several register sets. Only the controller
3506 	 * registers (set 1) are currently used.
3507 	 */
3508 	if (ddi_dev_nregs(dip, &nregs) == DDI_FAILURE ||
3509 	    nregs < 2 ||
3510 	    ddi_dev_regsize(dip, 1, &regsize) == DDI_FAILURE)
3511 		goto fail;
3512 
3513 	if (ddi_regs_map_setup(dip, 1, &nvme->n_regs, 0, regsize,
3514 	    &nvme->n_reg_acc_attr, &nvme->n_regh) != DDI_SUCCESS) {
3515 		dev_err(dip, CE_WARN, "!failed to map regset 1");
3516 		goto fail;
3517 	}
3518 
3519 	nvme->n_progress |= NVME_REGS_MAPPED;
3520 
3521 	/*
3522 	 * Create PRP DMA cache
3523 	 */
3524 	(void) snprintf(name, sizeof (name), "%s%d_prp_cache",
3525 	    ddi_driver_name(dip), ddi_get_instance(dip));
3526 	nvme->n_prp_cache = kmem_cache_create(name, sizeof (nvme_dma_t),
3527 	    0, nvme_prp_dma_constructor, nvme_prp_dma_destructor,
3528 	    NULL, (void *)nvme, NULL, 0);
3529 
3530 	if (nvme_init(nvme) != DDI_SUCCESS)
3531 		goto fail;
3532 
3533 	if (!nvme->n_idctl->id_oncs.on_dset_mgmt)
3534 		ops.o_free_space = NULL;
3535 
3536 	/*
3537 	 * Initialize the driver with the UFM subsystem
3538 	 */
3539 	if (ddi_ufm_init(dip, DDI_UFM_CURRENT_VERSION, &nvme_ufm_ops,
3540 	    &nvme->n_ufmh, nvme) != 0) {
3541 		dev_err(dip, CE_WARN, "!failed to initialize UFM subsystem");
3542 		goto fail;
3543 	}
3544 	mutex_init(&nvme->n_fwslot_mutex, NULL, MUTEX_DRIVER, NULL);
3545 	ddi_ufm_update(nvme->n_ufmh);
3546 	nvme->n_progress |= NVME_UFM_INIT;
3547 
3548 	/*
3549 	 * Attach the blkdev driver for each namespace.
3550 	 */
3551 	for (i = 0; i != nvme->n_namespace_count; i++) {
3552 		if (ddi_create_minor_node(nvme->n_dip, nvme->n_ns[i].ns_name,
3553 		    S_IFCHR, NVME_MINOR(ddi_get_instance(nvme->n_dip), i + 1),
3554 		    DDI_NT_NVME_ATTACHMENT_POINT, 0) != DDI_SUCCESS) {
3555 			dev_err(dip, CE_WARN,
3556 			    "!failed to create minor node for namespace %d", i);
3557 			goto fail;
3558 		}
3559 
3560 		if (nvme->n_ns[i].ns_ignore)
3561 			continue;
3562 
3563 		nvme->n_ns[i].ns_bd_hdl = bd_alloc_handle(&nvme->n_ns[i],
3564 		    &ops, &nvme->n_prp_dma_attr, KM_SLEEP);
3565 
3566 		if (nvme->n_ns[i].ns_bd_hdl == NULL) {
3567 			dev_err(dip, CE_WARN,
3568 			    "!failed to get blkdev handle for namespace %d", i);
3569 			goto fail;
3570 		}
3571 
3572 		if (bd_attach_handle(dip, nvme->n_ns[i].ns_bd_hdl)
3573 		    != DDI_SUCCESS) {
3574 			dev_err(dip, CE_WARN,
3575 			    "!failed to attach blkdev handle for namespace %d",
3576 			    i);
3577 			goto fail;
3578 		}
3579 	}
3580 
3581 	if (ddi_create_minor_node(dip, "devctl", S_IFCHR,
3582 	    NVME_MINOR(ddi_get_instance(dip), 0), DDI_NT_NVME_NEXUS, 0)
3583 	    != DDI_SUCCESS) {
3584 		dev_err(dip, CE_WARN, "nvme_attach: "
3585 		    "cannot create devctl minor node");
3586 		goto fail;
3587 	}
3588 
3589 	return (DDI_SUCCESS);
3590 
3591 fail:
3592 	/* attach successful anyway so that FMA can retire the device */
3593 	if (nvme->n_dead)
3594 		return (DDI_SUCCESS);
3595 
3596 	(void) nvme_detach(dip, DDI_DETACH);
3597 
3598 	return (DDI_FAILURE);
3599 }
3600 
3601 static int
3602 nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
3603 {
3604 	int instance, i;
3605 	nvme_t *nvme;
3606 
3607 	if (cmd != DDI_DETACH)
3608 		return (DDI_FAILURE);
3609 
3610 	instance = ddi_get_instance(dip);
3611 
3612 	nvme = ddi_get_soft_state(nvme_state, instance);
3613 
3614 	if (nvme == NULL)
3615 		return (DDI_FAILURE);
3616 
3617 	ddi_remove_minor_node(dip, "devctl");
3618 	mutex_destroy(&nvme->n_minor.nm_mutex);
3619 
3620 	if (nvme->n_ns) {
3621 		for (i = 0; i != nvme->n_namespace_count; i++) {
3622 			ddi_remove_minor_node(dip, nvme->n_ns[i].ns_name);
3623 			mutex_destroy(&nvme->n_ns[i].ns_minor.nm_mutex);
3624 
3625 			if (nvme->n_ns[i].ns_bd_hdl) {
3626 				(void) bd_detach_handle(
3627 				    nvme->n_ns[i].ns_bd_hdl);
3628 				bd_free_handle(nvme->n_ns[i].ns_bd_hdl);
3629 			}
3630 
3631 			if (nvme->n_ns[i].ns_idns)
3632 				kmem_free(nvme->n_ns[i].ns_idns,
3633 				    sizeof (nvme_identify_nsid_t));
3634 			if (nvme->n_ns[i].ns_devid)
3635 				strfree(nvme->n_ns[i].ns_devid);
3636 		}
3637 
3638 		kmem_free(nvme->n_ns, sizeof (nvme_namespace_t) *
3639 		    nvme->n_namespace_count);
3640 	}
3641 	if (nvme->n_progress & NVME_UFM_INIT) {
3642 		ddi_ufm_fini(nvme->n_ufmh);
3643 		mutex_destroy(&nvme->n_fwslot_mutex);
3644 	}
3645 
3646 	if (nvme->n_progress & NVME_INTERRUPTS)
3647 		nvme_release_interrupts(nvme);
3648 
3649 	for (i = 0; i < nvme->n_cq_count; i++) {
3650 		if (nvme->n_cq[i]->ncq_cmd_taskq != NULL)
3651 			taskq_wait(nvme->n_cq[i]->ncq_cmd_taskq);
3652 	}
3653 
3654 	if (nvme->n_ioq_count > 0) {
3655 		for (i = 1; i != nvme->n_ioq_count + 1; i++) {
3656 			if (nvme->n_ioq[i] != NULL) {
3657 				/* TODO: send destroy queue commands */
3658 				nvme_free_qpair(nvme->n_ioq[i]);
3659 			}
3660 		}
3661 
3662 		kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) *
3663 		    (nvme->n_ioq_count + 1));
3664 	}
3665 
3666 	if (nvme->n_prp_cache != NULL) {
3667 		kmem_cache_destroy(nvme->n_prp_cache);
3668 	}
3669 
3670 	if (nvme->n_progress & NVME_REGS_MAPPED) {
3671 		nvme_shutdown(nvme, NVME_CC_SHN_NORMAL, B_FALSE);
3672 		(void) nvme_reset(nvme, B_FALSE);
3673 	}
3674 
3675 	if (nvme->n_progress & NVME_CTRL_LIMITS)
3676 		sema_destroy(&nvme->n_abort_sema);
3677 
3678 	if (nvme->n_progress & NVME_ADMIN_QUEUE)
3679 		nvme_free_qpair(nvme->n_adminq);
3680 
3681 	if (nvme->n_cq_count > 0) {
3682 		nvme_destroy_cq_array(nvme, 0);
3683 		nvme->n_cq = NULL;
3684 		nvme->n_cq_count = 0;
3685 	}
3686 
3687 	if (nvme->n_idctl)
3688 		kmem_free(nvme->n_idctl, NVME_IDENTIFY_BUFSIZE);
3689 
3690 	if (nvme->n_progress & NVME_REGS_MAPPED)
3691 		ddi_regs_map_free(&nvme->n_regh);
3692 
3693 	if (nvme->n_progress & NVME_FMA_INIT) {
3694 		if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
3695 			ddi_fm_handler_unregister(nvme->n_dip);
3696 
3697 		if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) ||
3698 		    DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
3699 			pci_ereport_teardown(nvme->n_dip);
3700 
3701 		ddi_fm_fini(nvme->n_dip);
3702 	}
3703 
3704 	if (nvme->n_vendor != NULL)
3705 		strfree(nvme->n_vendor);
3706 
3707 	if (nvme->n_product != NULL)
3708 		strfree(nvme->n_product);
3709 
3710 	/* Clean up hot removal event handler. */
3711 	if (nvme->n_ev_rm_cb_id != NULL) {
3712 		(void) ddi_remove_event_handler(nvme->n_ev_rm_cb_id);
3713 	}
3714 	nvme->n_ev_rm_cb_id = NULL;
3715 
3716 	ddi_soft_state_free(nvme_state, instance);
3717 
3718 	return (DDI_SUCCESS);
3719 }
3720 
3721 static int
3722 nvme_quiesce(dev_info_t *dip)
3723 {
3724 	int instance;
3725 	nvme_t *nvme;
3726 
3727 	instance = ddi_get_instance(dip);
3728 
3729 	nvme = ddi_get_soft_state(nvme_state, instance);
3730 
3731 	if (nvme == NULL)
3732 		return (DDI_FAILURE);
3733 
3734 	nvme_shutdown(nvme, NVME_CC_SHN_ABRUPT, B_TRUE);
3735 
3736 	(void) nvme_reset(nvme, B_TRUE);
3737 
3738 	return (DDI_SUCCESS);
3739 }
3740 
3741 static int
3742 nvme_fill_prp(nvme_cmd_t *cmd, ddi_dma_handle_t dma)
3743 {
3744 	nvme_t *nvme = cmd->nc_nvme;
3745 	uint_t nprp_per_page, nprp;
3746 	uint64_t *prp;
3747 	const ddi_dma_cookie_t *cookie;
3748 	uint_t idx;
3749 	uint_t ncookies = ddi_dma_ncookies(dma);
3750 
3751 	if (ncookies == 0)
3752 		return (DDI_FAILURE);
3753 
3754 	if ((cookie = ddi_dma_cookie_get(dma, 0)) == NULL)
3755 		return (DDI_FAILURE);
3756 	cmd->nc_sqe.sqe_dptr.d_prp[0] = cookie->dmac_laddress;
3757 
3758 	if (ncookies == 1) {
3759 		cmd->nc_sqe.sqe_dptr.d_prp[1] = 0;
3760 		return (DDI_SUCCESS);
3761 	} else if (ncookies == 2) {
3762 		if ((cookie = ddi_dma_cookie_get(dma, 1)) == NULL)
3763 			return (DDI_FAILURE);
3764 		cmd->nc_sqe.sqe_dptr.d_prp[1] = cookie->dmac_laddress;
3765 		return (DDI_SUCCESS);
3766 	}
3767 
3768 	/*
3769 	 * At this point, we're always operating on cookies at
3770 	 * index >= 1 and writing the addresses of those cookies
3771 	 * into a new page. The address of that page is stored
3772 	 * as the second PRP entry.
3773 	 */
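	/*
	 * As a rough illustration (assuming a 4 KiB controller page size):
	 * a 16 KiB transfer split into four page-sized cookies keeps cookie
	 * 0's address in d_prp[0], while the addresses of cookies 1-3 go
	 * into the separate PRP list page whose address ends up in d_prp[1].
	 */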
3774 	nprp_per_page = nvme->n_pagesize / sizeof (uint64_t);
3775 	ASSERT(nprp_per_page > 0);
3776 
3777 	/*
3778 	 * We currently don't support chained PRPs and set up our DMA
3779 	 * attributes to reflect that. If we still get an I/O request
3780 	 * that needs a chained PRP something is very wrong. Account
3781 	 * for the first cookie here, which we've placed in d_prp[0].
3782 	 */
3783 	nprp = howmany(ncookies - 1, nprp_per_page);
3784 	VERIFY(nprp == 1);
3785 
3786 	/*
3787 	 * Allocate a page of pointers, in which we'll write the
3788 	 * addresses of cookies 1 through (ncookies - 1).
3789 	 */
3790 	cmd->nc_prp = kmem_cache_alloc(nvme->n_prp_cache, KM_SLEEP);
3791 	bzero(cmd->nc_prp->nd_memp, cmd->nc_prp->nd_len);
3792 	cmd->nc_sqe.sqe_dptr.d_prp[1] = cmd->nc_prp->nd_cookie.dmac_laddress;
3793 
3794 	prp = (uint64_t *)cmd->nc_prp->nd_memp;
3795 	for (idx = 1; idx < ncookies; idx++) {
3796 		if ((cookie = ddi_dma_cookie_get(dma, idx)) == NULL)
3797 			return (DDI_FAILURE);
3798 		*prp++ = cookie->dmac_laddress;
3799 	}
3800 
3801 	(void) ddi_dma_sync(cmd->nc_prp->nd_dmah, 0, cmd->nc_prp->nd_len,
3802 	    DDI_DMA_SYNC_FORDEV);
3803 	return (DDI_SUCCESS);
3804 }
3805 
3806 /*
3807  * The maximum number of ranges supported for a deallocate request is
3808  * NVME_DSET_MGMT_MAX_RANGES (256) -- this is from the NVMe 1.1 spec (and
3809  * unchanged through at least 1.4a). The definition of nvme_range_t is also
3810  * from the NVMe 1.1 spec. Together, the result is that all of the ranges for
3811  * a deallocate request will fit into the smallest supported namespace page
3812  * (4k).
3813  */
3814 CTASSERT(sizeof (nvme_range_t) * NVME_DSET_MGMT_MAX_RANGES == 4096);
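/*
 * With the spec-defined 16-byte range descriptor, 256 ranges occupy exactly
 * 16 * 256 = 4096 bytes, which is what the assertion above verifies at
 * compile time.
 */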
3815 
3816 static int
3817 nvme_fill_ranges(nvme_cmd_t *cmd, bd_xfer_t *xfer, uint64_t blocksize,
3818     int allocflag)
3819 {
3820 	const dkioc_free_list_t *dfl = xfer->x_dfl;
3821 	const dkioc_free_list_ext_t *exts = dfl->dfl_exts;
3822 	nvme_t *nvme = cmd->nc_nvme;
3823 	nvme_range_t *ranges = NULL;
3824 	uint_t i;
3825 
3826 	/*
3827 	 * The number of ranges in the request is zero-based (that is
3828 	 * word10 == 0 -> 1 range, word10 == 1 -> 2 ranges, ...,
3829 	 * word10 == 255 -> 256 ranges). Therefore the allowed values are
3830 	 * [1..NVME_DSET_MGMT_MAX_RANGES]. If blkdev gives us a bad request,
3831 	 * we either provided bad info in nvme_bd_driveinfo() or there is a bug
3832 	 * in blkdev.
3833 	 */
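	/*
	 * For example, a request with four extents stores 3 in sqe_cdw10
	 * below, which the controller interprets as four ranges.
	 */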
3834 	VERIFY3U(dfl->dfl_num_exts, >, 0);
3835 	VERIFY3U(dfl->dfl_num_exts, <=, NVME_DSET_MGMT_MAX_RANGES);
3836 	cmd->nc_sqe.sqe_cdw10 = (dfl->dfl_num_exts - 1) & 0xff;
3837 
3838 	cmd->nc_sqe.sqe_cdw11 = NVME_DSET_MGMT_ATTR_DEALLOCATE;
3839 
3840 	cmd->nc_prp = kmem_cache_alloc(nvme->n_prp_cache, allocflag);
3841 	if (cmd->nc_prp == NULL)
3842 		return (DDI_FAILURE);
3843 
3844 	bzero(cmd->nc_prp->nd_memp, cmd->nc_prp->nd_len);
3845 	ranges = (nvme_range_t *)cmd->nc_prp->nd_memp;
3846 
3847 	cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_prp->nd_cookie.dmac_laddress;
3848 	cmd->nc_sqe.sqe_dptr.d_prp[1] = 0;
3849 
3850 	for (i = 0; i < dfl->dfl_num_exts; i++) {
3851 		uint64_t lba, len;
3852 
3853 		lba = (dfl->dfl_offset + exts[i].dfle_start) / blocksize;
3854 		len = exts[i].dfle_length / blocksize;
3855 
3856 		VERIFY3U(len, <=, UINT32_MAX);
3857 
3858 		/* No context attributes for a deallocate request */
3859 		ranges[i].nr_ctxattr = 0;
3860 		ranges[i].nr_len = len;
3861 		ranges[i].nr_lba = lba;
3862 	}
3863 
3864 	(void) ddi_dma_sync(cmd->nc_prp->nd_dmah, 0, cmd->nc_prp->nd_len,
3865 	    DDI_DMA_SYNC_FORDEV);
3866 
3867 	return (DDI_SUCCESS);
3868 }
3869 
3870 static nvme_cmd_t *
3871 nvme_create_nvm_cmd(nvme_namespace_t *ns, uint8_t opc, bd_xfer_t *xfer)
3872 {
3873 	nvme_t *nvme = ns->ns_nvme;
3874 	nvme_cmd_t *cmd;
3875 	int allocflag;
3876 
3877 	/*
3878 	 * Blkdev only sets BD_XFER_POLL when dumping, so don't sleep.
3879 	 */
3880 	allocflag = (xfer->x_flags & BD_XFER_POLL) ? KM_NOSLEEP : KM_SLEEP;
3881 	cmd = nvme_alloc_cmd(nvme, allocflag);
3882 
3883 	if (cmd == NULL)
3884 		return (NULL);
3885 
3886 	cmd->nc_sqe.sqe_opc = opc;
3887 	cmd->nc_callback = nvme_bd_xfer_done;
3888 	cmd->nc_xfer = xfer;
3889 
3890 	switch (opc) {
3891 	case NVME_OPC_NVM_WRITE:
3892 	case NVME_OPC_NVM_READ:
3893 		VERIFY(xfer->x_nblks <= 0x10000);
3894 
3895 		cmd->nc_sqe.sqe_nsid = ns->ns_id;
3896 
3897 		cmd->nc_sqe.sqe_cdw10 = xfer->x_blkno & 0xffffffffu;
3898 		cmd->nc_sqe.sqe_cdw11 = (xfer->x_blkno >> 32);
3899 		cmd->nc_sqe.sqe_cdw12 = (uint16_t)(xfer->x_nblks - 1);
3900 
3901 		if (nvme_fill_prp(cmd, xfer->x_dmah) != DDI_SUCCESS)
3902 			goto fail;
3903 		break;
3904 
3905 	case NVME_OPC_NVM_FLUSH:
3906 		cmd->nc_sqe.sqe_nsid = ns->ns_id;
3907 		break;
3908 
3909 	case NVME_OPC_NVM_DSET_MGMT:
3910 		cmd->nc_sqe.sqe_nsid = ns->ns_id;
3911 
3912 		if (nvme_fill_ranges(cmd, xfer,
3913 		    (uint64_t)ns->ns_block_size, allocflag) != DDI_SUCCESS)
3914 			goto fail;
3915 		break;
3916 
3917 	default:
3918 		goto fail;
3919 	}
3920 
3921 	return (cmd);
3922 
3923 fail:
3924 	nvme_free_cmd(cmd);
3925 	return (NULL);
3926 }
3927 
3928 static void
3929 nvme_bd_xfer_done(void *arg)
3930 {
3931 	nvme_cmd_t *cmd = arg;
3932 	bd_xfer_t *xfer = cmd->nc_xfer;
3933 	int error = 0;
3934 
3935 	error = nvme_check_cmd_status(cmd);
3936 	nvme_free_cmd(cmd);
3937 
3938 	bd_xfer_done(xfer, error);
3939 }
3940 
3941 static void
3942 nvme_bd_driveinfo(void *arg, bd_drive_t *drive)
3943 {
3944 	nvme_namespace_t *ns = arg;
3945 	nvme_t *nvme = ns->ns_nvme;
3946 	uint_t ns_count = MAX(1, nvme->n_namespaces_attachable);
3947 
3948 	/*
3949 	 * Set the blkdev qcount to the number of submission queues.
3950 	 * It will then create one waitq/runq pair for each submission
3951 	 * queue and spread I/O requests across the queues.
3952 	 */
3953 	drive->d_qcount = nvme->n_ioq_count;
3954 
3955 	/*
3956 	 * I/O activity to individual namespaces is distributed across
3957 	 * each of the d_qcount blkdev queues (which has been set to
3958 	 * the number of nvme submission queues). d_qsize is the number
3959 	 * of submitted and not completed I/Os within each queue that blkdev
3960 	 * will allow before it starts holding them in the waitq.
3961 	 *
3962 	 * Each namespace will create a child blkdev instance; for each one
3963 	 * we try to set d_qsize so that every namespace gets an equal
3964 	 * portion of the submission queue.
3965 	 *
3966 	 * If n_namespaces_attachable changes after the nvme device has been
3967 	 * instantiated and another namespace is attached, that namespace may
3968 	 * calculate a different d_qsize, and the sum of the d_qsizes may
3969 	 * then exceed the submission queue size. Should that be the case
3970 	 * and the I/O rate is such that blkdev attempts to submit more
3971 	 * I/Os than the submission queue can hold, the excess I/Os will
3972 	 * be held behind the semaphore nq_sema.
3973 	 */
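	/*
	 * For example, assuming an io-squeue-len of 1024 and four attachable
	 * namespaces, each namespace would be given a d_qsize of
	 * 1024 / 4 = 256.
	 */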
3974 	drive->d_qsize = nvme->n_io_squeue_len / ns_count;
3975 
3976 	/*
3977 	 * Don't let the queue size drop below the minimum, though.
3978 	 */
3979 	drive->d_qsize = MAX(drive->d_qsize, NVME_MIN_IO_QUEUE_LEN);
3980 
3981 	/*
3982 	 * d_maxxfer is not set, which means the value is taken from the DMA
3983 	 * attributes specified to bd_alloc_handle.
3984 	 */
3985 
3986 	drive->d_removable = B_FALSE;
3987 	drive->d_hotpluggable = B_FALSE;
3988 
3989 	bcopy(ns->ns_eui64, drive->d_eui64, sizeof (drive->d_eui64));
3990 	drive->d_target = ns->ns_id;
3991 	drive->d_lun = 0;
3992 
3993 	drive->d_model = nvme->n_idctl->id_model;
3994 	drive->d_model_len = sizeof (nvme->n_idctl->id_model);
3995 	drive->d_vendor = nvme->n_vendor;
3996 	drive->d_vendor_len = strlen(nvme->n_vendor);
3997 	drive->d_product = nvme->n_product;
3998 	drive->d_product_len = strlen(nvme->n_product);
3999 	drive->d_serial = nvme->n_idctl->id_serial;
4000 	drive->d_serial_len = sizeof (nvme->n_idctl->id_serial);
4001 	drive->d_revision = nvme->n_idctl->id_fwrev;
4002 	drive->d_revision_len = sizeof (nvme->n_idctl->id_fwrev);
4003 
4004 	/*
4005 	 * If we support the dataset management command, the only restriction
4006 	 * on a discard request is the maximum number of ranges (segments)
4007 	 * per request.
4008 	 */
4009 	if (nvme->n_idctl->id_oncs.on_dset_mgmt)
4010 		drive->d_max_free_seg = NVME_DSET_MGMT_MAX_RANGES;
4011 }
4012 
4013 static int
4014 nvme_bd_mediainfo(void *arg, bd_media_t *media)
4015 {
4016 	nvme_namespace_t *ns = arg;
4017 	nvme_t *nvme = ns->ns_nvme;
4018 
4019 	if (nvme->n_dead) {
4020 		return (EIO);
4021 	}
4022 
4023 	media->m_nblks = ns->ns_block_count;
4024 	media->m_blksize = ns->ns_block_size;
4025 	media->m_readonly = B_FALSE;
4026 	media->m_solidstate = B_TRUE;
4027 
4028 	media->m_pblksize = ns->ns_best_block_size;
4029 
4030 	return (0);
4031 }
4032 
4033 static int
4034 nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc)
4035 {
4036 	nvme_t *nvme = ns->ns_nvme;
4037 	nvme_cmd_t *cmd;
4038 	nvme_qpair_t *ioq;
4039 	boolean_t poll;
4040 	int ret;
4041 
4042 	if (nvme->n_dead) {
4043 		return (EIO);
4044 	}
4045 
4046 	cmd = nvme_create_nvm_cmd(ns, opc, xfer);
4047 	if (cmd == NULL)
4048 		return (ENOMEM);
4049 
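	/*
	 * Map the zero-based blkdev queue number to an I/O queue pair id;
	 * queue pair 0 is the admin queue, so the I/O queues start at 1.
	 */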
4050 	cmd->nc_sqid = xfer->x_qnum + 1;
4051 	ASSERT(cmd->nc_sqid <= nvme->n_ioq_count);
4052 	ioq = nvme->n_ioq[cmd->nc_sqid];
4053 
4054 	/*
4055 	 * Get the polling flag before submitting the command. The command may
4056 	 * complete immediately after it was submitted, which means we must
4057 	 * treat both cmd and xfer as if they have been freed already.
4058 	 */
4059 	poll = (xfer->x_flags & BD_XFER_POLL) != 0;
4060 
4061 	ret = nvme_submit_io_cmd(ioq, cmd);
4062 
4063 	if (ret != 0)
4064 		return (ret);
4065 
4066 	if (!poll)
4067 		return (0);
4068 
4069 	do {
4070 		cmd = nvme_retrieve_cmd(nvme, ioq);
4071 		if (cmd != NULL)
4072 			cmd->nc_callback(cmd);
4073 		else
4074 			drv_usecwait(10);
4075 	} while (ioq->nq_active_cmds != 0);
4076 
4077 	return (0);
4078 }
4079 
4080 static int
4081 nvme_bd_read(void *arg, bd_xfer_t *xfer)
4082 {
4083 	nvme_namespace_t *ns = arg;
4084 
4085 	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_READ));
4086 }
4087 
4088 static int
4089 nvme_bd_write(void *arg, bd_xfer_t *xfer)
4090 {
4091 	nvme_namespace_t *ns = arg;
4092 
4093 	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_WRITE));
4094 }
4095 
4096 static int
4097 nvme_bd_sync(void *arg, bd_xfer_t *xfer)
4098 {
4099 	nvme_namespace_t *ns = arg;
4100 
4101 	if (ns->ns_nvme->n_dead)
4102 		return (EIO);
4103 
4104 	/*
4105 	 * If the volatile write cache is not present or not enabled, the
4106 	 * FLUSH command is a no-op, so we can take a shortcut here.
4107 	 */
4108 	if (!ns->ns_nvme->n_write_cache_present) {
4109 		bd_xfer_done(xfer, ENOTSUP);
4110 		return (0);
4111 	}
4112 
4113 	if (!ns->ns_nvme->n_write_cache_enabled) {
4114 		bd_xfer_done(xfer, 0);
4115 		return (0);
4116 	}
4117 
4118 	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_FLUSH));
4119 }
4120 
4121 static int
4122 nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid)
4123 {
4124 	nvme_namespace_t *ns = arg;
4125 	nvme_t *nvme = ns->ns_nvme;
4126 
4127 	if (nvme->n_dead) {
4128 		return (EIO);
4129 	}
4130 
4131 	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
4132 	if (*(uint64_t *)ns->ns_eui64 != 0) {
4133 		return (ddi_devid_init(devinfo, DEVID_SCSI3_WWN,
4134 		    sizeof (ns->ns_eui64), ns->ns_eui64, devid));
4135 	} else {
4136 		return (ddi_devid_init(devinfo, DEVID_ENCAP,
4137 		    strlen(ns->ns_devid), ns->ns_devid, devid));
4138 	}
4139 }
4140 
4141 static int
4142 nvme_bd_free_space(void *arg, bd_xfer_t *xfer)
4143 {
4144 	nvme_namespace_t *ns = arg;
4145 
4146 	if (xfer->x_dfl == NULL)
4147 		return (EINVAL);
4148 
4149 	if (!ns->ns_nvme->n_idctl->id_oncs.on_dset_mgmt)
4150 		return (ENOTSUP);
4151 
4152 	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_DSET_MGMT));
4153 }
4154 
4155 static int
4156 nvme_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
4157 {
4158 #ifndef __lock_lint
4159 	_NOTE(ARGUNUSED(cred_p));
4160 #endif
4161 	minor_t minor = getminor(*devp);
4162 	nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor));
4163 	int nsid = NVME_MINOR_NSID(minor);
4164 	nvme_minor_state_t *nm;
4165 	int rv = 0;
4166 
4167 	if (otyp != OTYP_CHR)
4168 		return (EINVAL);
4169 
4170 	if (nvme == NULL)
4171 		return (ENXIO);
4172 
4173 	if (nsid > nvme->n_namespace_count)
4174 		return (ENXIO);
4175 
4176 	if (nvme->n_dead)
4177 		return (EIO);
4178 
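	/*
	 * A namespace id of 0 refers to the controller's devctl minor node;
	 * anything else selects the per-namespace minor state.
	 */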
4179 	nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor;
4180 
4181 	mutex_enter(&nm->nm_mutex);
4182 	if (nm->nm_oexcl) {
4183 		rv = EBUSY;
4184 		goto out;
4185 	}
4186 
4187 	if (flag & FEXCL) {
4188 		if (nm->nm_ocnt != 0) {
4189 			rv = EBUSY;
4190 			goto out;
4191 		}
4192 		nm->nm_oexcl = B_TRUE;
4193 	}
4194 
4195 	nm->nm_ocnt++;
4196 
4197 out:
4198 	mutex_exit(&nm->nm_mutex);
4199 	return (rv);
4200 
4201 }
4202 
4203 static int
4204 nvme_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
4205 {
4206 #ifndef __lock_lint
4207 	_NOTE(ARGUNUSED(cred_p));
4208 	_NOTE(ARGUNUSED(flag));
4209 #endif
4210 	minor_t minor = getminor(dev);
4211 	nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor));
4212 	int nsid = NVME_MINOR_NSID(minor);
4213 	nvme_minor_state_t *nm;
4214 
4215 	if (otyp != OTYP_CHR)
4216 		return (ENXIO);
4217 
4218 	if (nvme == NULL)
4219 		return (ENXIO);
4220 
4221 	if (nsid > nvme->n_namespace_count)
4222 		return (ENXIO);
4223 
4224 	nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor;
4225 
4226 	mutex_enter(&nm->nm_mutex);
4227 	if (nm->nm_oexcl)
4228 		nm->nm_oexcl = B_FALSE;
4229 
4230 	ASSERT(nm->nm_ocnt > 0);
4231 	nm->nm_ocnt--;
4232 	mutex_exit(&nm->nm_mutex);
4233 
4234 	return (0);
4235 }
4236 
4237 static int
4238 nvme_ioctl_identify(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
4239     cred_t *cred_p)
4240 {
4241 	_NOTE(ARGUNUSED(cred_p));
4242 	int rv = 0;
4243 	void *idctl;
4244 
4245 	if ((mode & FREAD) == 0)
4246 		return (EPERM);
4247 
4248 	if (nioc->n_len < NVME_IDENTIFY_BUFSIZE)
4249 		return (EINVAL);
4250 
4251 	if ((rv = nvme_identify(nvme, B_TRUE, nsid, (void **)&idctl)) != 0)
4252 		return (rv);
4253 
4254 	if (ddi_copyout(idctl, (void *)nioc->n_buf, NVME_IDENTIFY_BUFSIZE, mode)
4255 	    != 0)
4256 		rv = EFAULT;
4257 
4258 	kmem_free(idctl, NVME_IDENTIFY_BUFSIZE);
4259 
4260 	return (rv);
4261 }
4262 
4263 /*
4264  * Execute commands on behalf of the various ioctls.
4265  */
4266 static int
4267 nvme_ioc_cmd(nvme_t *nvme, nvme_sqe_t *sqe, boolean_t is_admin, void *data_addr,
4268     uint32_t data_len, int rwk, nvme_cqe_t *cqe, uint_t timeout)
4269 {
4270 	nvme_cmd_t *cmd;
4271 	nvme_qpair_t *ioq;
4272 	int rv = 0;
4273 
4274 	cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
4275 	if (is_admin) {
4276 		cmd->nc_sqid = 0;
4277 		ioq = nvme->n_adminq;
4278 	} else {
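		/*
		 * Spread user-initiated I/O commands across the I/O queues
		 * based on the submitting CPU; queue pair 0 is the admin
		 * queue, hence the + 1.
		 */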
4279 		cmd->nc_sqid = (CPU->cpu_id % nvme->n_ioq_count) + 1;
4280 		ASSERT(cmd->nc_sqid <= nvme->n_ioq_count);
4281 		ioq = nvme->n_ioq[cmd->nc_sqid];
4282 	}
4283 
4284 	/*
4285 	 * This function is used to facilitate requests from
4286 	 * userspace, so don't panic if the command fails. This
4287 	 * is especially true for admin passthru commands, where
4288 	 * the actual command data structure is entirely defined
4289 	 * by userspace.
4290 	 */
4291 	cmd->nc_dontpanic = B_TRUE;
4292 
4293 	cmd->nc_callback = nvme_wakeup_cmd;
4294 	cmd->nc_sqe = *sqe;
4295 
4296 	if ((rwk & (FREAD | FWRITE)) != 0) {
4297 		if (data_addr == NULL) {
4298 			rv = EINVAL;
4299 			goto free_cmd;
4300 		}
4301 
4302 		if (nvme_zalloc_dma(nvme, data_len, DDI_DMA_READ,
4303 		    &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
4304 			dev_err(nvme->n_dip, CE_WARN,
4305 			    "!nvme_zalloc_dma failed for nvme_ioc_cmd()");
4306 
4307 			rv = ENOMEM;
4308 			goto free_cmd;
4309 		}
4310 
4311 		if ((rv = nvme_fill_prp(cmd, cmd->nc_dma->nd_dmah)) != 0)
4312 			goto free_cmd;
4313 
4314 		if ((rwk & FWRITE) != 0) {
4315 			if (ddi_copyin(data_addr, cmd->nc_dma->nd_memp,
4316 			    data_len, rwk & FKIOCTL) != 0) {
4317 				rv = EFAULT;
4318 				goto free_cmd;
4319 			}
4320 		}
4321 	}
4322 
4323 	if (is_admin) {
4324 		nvme_admin_cmd(cmd, timeout);
4325 	} else {
4326 		mutex_enter(&cmd->nc_mutex);
4327 
4328 		rv = nvme_submit_io_cmd(ioq, cmd);
4329 
4330 		if (rv == EAGAIN) {
4331 			mutex_exit(&cmd->nc_mutex);
4332 			dev_err(cmd->nc_nvme->n_dip, CE_WARN,
4333 			    "!nvme_ioc_cmd() failed, I/O Q full");
4334 			goto free_cmd;
4335 		}
4336 
4337 		nvme_wait_cmd(cmd, timeout);
4338 
4339 		mutex_exit(&cmd->nc_mutex);
4340 	}
4341 
4342 	if (cqe != NULL)
4343 		*cqe = cmd->nc_cqe;
4344 
4345 	if ((rv = nvme_check_cmd_status(cmd)) != 0) {
4346 		dev_err(nvme->n_dip, CE_WARN,
4347 		    "!nvme_ioc_cmd() failed with sct = %x, sc = %x",
4348 		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
4349 
4350 		goto free_cmd;
4351 	}
4352 
4353 	if ((rwk & FREAD) != 0) {
4354 		if (ddi_copyout(cmd->nc_dma->nd_memp,
4355 		    data_addr, data_len, rwk & FKIOCTL) != 0)
4356 			rv = EFAULT;
4357 	}
4358 
4359 free_cmd:
4360 	nvme_free_cmd(cmd);
4361 
4362 	return (rv);
4363 }
4364 
4365 static int
4366 nvme_ioctl_capabilities(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc,
4367     int mode, cred_t *cred_p)
4368 {
4369 	_NOTE(ARGUNUSED(nsid, cred_p));
4370 	int rv = 0;
4371 	nvme_reg_cap_t cap = { 0 };
4372 	nvme_capabilities_t nc;
4373 
4374 	if ((mode & FREAD) == 0)
4375 		return (EPERM);
4376 
4377 	if (nioc->n_len < sizeof (nc))
4378 		return (EINVAL);
4379 
4380 	cap.r = nvme_get64(nvme, NVME_REG_CAP);
4381 
4382 	/*
4383 	 * The MPSMIN and MPSMAX fields in the CAP register use 0 to
4384 	 * specify the base page size of 4k (1<<12), so add 12 here to
4385 	 * get the real page size value.
4386 	 */
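	/*
	 * For example, a cap_mpsmax of 0 yields a 4096-byte maximum page
	 * size, while a value of 4 yields 1 << 16 = 65536 bytes.
	 */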
4387 	nc.mpsmax = 1 << (12 + cap.b.cap_mpsmax);
4388 	nc.mpsmin = 1 << (12 + cap.b.cap_mpsmin);
4389 
4390 	if (ddi_copyout(&nc, (void *)nioc->n_buf, sizeof (nc), mode) != 0)
4391 		rv = EFAULT;
4392 
4393 	return (rv);
4394 }
4395 
4396 static int
4397 nvme_ioctl_get_logpage(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc,
4398     int mode, cred_t *cred_p)
4399 {
4400 	_NOTE(ARGUNUSED(cred_p));
4401 	void *log = NULL;
4402 	size_t bufsize = 0;
4403 	int rv = 0;
4404 
4405 	if ((mode & FREAD) == 0)
4406 		return (EPERM);
4407 
4408 	switch (nioc->n_arg) {
4409 	case NVME_LOGPAGE_ERROR:
4410 		if (nsid != 0)
4411 			return (EINVAL);
4412 		break;
4413 	case NVME_LOGPAGE_HEALTH:
4414 		if (nsid != 0 && nvme->n_idctl->id_lpa.lp_smart == 0)
4415 			return (EINVAL);
4416 
4417 		if (nsid == 0)
4418 			nsid = (uint32_t)-1;
4419 
4420 		break;
4421 	case NVME_LOGPAGE_FWSLOT:
4422 		if (nsid != 0)
4423 			return (EINVAL);
4424 		break;
4425 	default:
4426 		return (EINVAL);
4427 	}
4428 
4429 	if (nvme_get_logpage(nvme, B_TRUE, &log, &bufsize, nioc->n_arg, nsid)
4430 	    != DDI_SUCCESS)
4431 		return (EIO);
4432 
4433 	if (nioc->n_len < bufsize) {
4434 		kmem_free(log, bufsize);
4435 		return (EINVAL);
4436 	}
4437 
4438 	if (ddi_copyout(log, (void *)nioc->n_buf, bufsize, mode) != 0)
4439 		rv = EFAULT;
4440 
4441 	nioc->n_len = bufsize;
4442 	kmem_free(log, bufsize);
4443 
4444 	return (rv);
4445 }
4446 
4447 static int
4448 nvme_ioctl_get_features(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc,
4449     int mode, cred_t *cred_p)
4450 {
4451 	_NOTE(ARGUNUSED(cred_p));
4452 	void *buf = NULL;
4453 	size_t bufsize = 0;
4454 	uint32_t res = 0;
4455 	uint8_t feature;
4456 	int rv = 0;
4457 
4458 	if ((mode & FREAD) == 0)
4459 		return (EPERM);
4460 
4461 	if ((nioc->n_arg >> 32) > 0xff)
4462 		return (EINVAL);
4463 
4464 	feature = (uint8_t)(nioc->n_arg >> 32);
4465 
4466 	switch (feature) {
4467 	case NVME_FEAT_ARBITRATION:
4468 	case NVME_FEAT_POWER_MGMT:
4469 	case NVME_FEAT_ERROR:
4470 	case NVME_FEAT_NQUEUES:
4471 	case NVME_FEAT_INTR_COAL:
4472 	case NVME_FEAT_WRITE_ATOM:
4473 	case NVME_FEAT_ASYNC_EVENT:
4474 	case NVME_FEAT_PROGRESS:
4475 		if (nsid != 0)
4476 			return (EINVAL);
4477 		break;
4478 
4479 	case NVME_FEAT_TEMPERATURE:
4480 		if (nsid != 0)
4481 			return (EINVAL);
4482 		res = nioc->n_arg & 0xffffffffUL;
4483 		if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 2)) {
4484 			nvme_temp_threshold_t tt;
4485 
4486 			tt.r = res;
4487 			if (tt.b.tt_thsel != NVME_TEMP_THRESH_OVER &&
4488 			    tt.b.tt_thsel != NVME_TEMP_THRESH_UNDER) {
4489 				return (EINVAL);
4490 			}
4491 
4492 			if (tt.b.tt_tmpsel > NVME_TEMP_THRESH_MAX_SENSOR) {
4493 				return (EINVAL);
4494 			}
4495 		} else if (res != 0) {
4496 			return (EINVAL);
4497 		}
4498 		break;
4499 
4500 	case NVME_FEAT_INTR_VECT:
4501 		if (nsid != 0)
4502 			return (EINVAL);
4503 
4504 		res = nioc->n_arg & 0xffffffffUL;
4505 		if (res >= nvme->n_intr_cnt)
4506 			return (EINVAL);
4507 		break;
4508 
4509 	case NVME_FEAT_LBA_RANGE:
4510 		if (nvme->n_lba_range_supported == B_FALSE)
4511 			return (EINVAL);
4512 
4513 		if (nsid == 0 ||
4514 		    nsid > nvme->n_namespace_count)
4515 			return (EINVAL);
4516 
4517 		break;
4518 
4519 	case NVME_FEAT_WRITE_CACHE:
4520 		if (nsid != 0)
4521 			return (EINVAL);
4522 
4523 		if (!nvme->n_write_cache_present)
4524 			return (EINVAL);
4525 
4526 		break;
4527 
4528 	case NVME_FEAT_AUTO_PST:
4529 		if (nsid != 0)
4530 			return (EINVAL);
4531 
4532 		if (!nvme->n_auto_pst_supported)
4533 			return (EINVAL);
4534 
4535 		break;
4536 
4537 	default:
4538 		return (EINVAL);
4539 	}
4540 
4541 	rv = nvme_get_features(nvme, B_TRUE, nsid, feature, &res, &buf,
4542 	    &bufsize);
4543 	if (rv != 0)
4544 		return (rv);
4545 
4546 	if (nioc->n_len < bufsize) {
4547 		kmem_free(buf, bufsize);
4548 		return (EINVAL);
4549 	}
4550 
4551 	if (buf && ddi_copyout(buf, (void*)nioc->n_buf, bufsize, mode) != 0)
4552 		rv = EFAULT;
4553 
4554 	kmem_free(buf, bufsize);
4555 	nioc->n_arg = res;
4556 	nioc->n_len = bufsize;
4557 
4558 	return (rv);
4559 }
4560 
4561 static int
4562 nvme_ioctl_intr_cnt(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
4563     cred_t *cred_p)
4564 {
4565 	_NOTE(ARGUNUSED(nsid, mode, cred_p));
4566 
4567 	if ((mode & FREAD) == 0)
4568 		return (EPERM);
4569 
4570 	nioc->n_arg = nvme->n_intr_cnt;
4571 	return (0);
4572 }
4573 
4574 static int
4575 nvme_ioctl_version(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
4576     cred_t *cred_p)
4577 {
4578 	_NOTE(ARGUNUSED(nsid, cred_p));
4579 	int rv = 0;
4580 
4581 	if ((mode & FREAD) == 0)
4582 		return (EPERM);
4583 
4584 	if (nioc->n_len < sizeof (nvme->n_version))
4585 		return (ENOMEM);
4586 
4587 	if (ddi_copyout(&nvme->n_version, (void *)nioc->n_buf,
4588 	    sizeof (nvme->n_version), mode) != 0)
4589 		rv = EFAULT;
4590 
4591 	return (rv);
4592 }
4593 
4594 static int
4595 nvme_ioctl_format(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
4596     cred_t *cred_p)
4597 {
4598 	_NOTE(ARGUNUSED(mode));
4599 	nvme_format_nvm_t frmt = { 0 };
4600 	int c_nsid = nsid != 0 ? nsid - 1 : 0;
4601 
4602 	if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0)
4603 		return (EPERM);
4604 
4605 	frmt.r = nioc->n_arg & 0xffffffff;
4606 
4607 	/*
4608 	 * Check whether the FORMAT NVM command is supported.
4609 	 */
4610 	if (nvme->n_idctl->id_oacs.oa_format == 0)
4611 		return (EINVAL);
4612 
4613 	/*
4614 	 * Don't allow format or secure erase of individual namespace if that
4615 	 * would cause a format or secure erase of all namespaces.
4616 	 */
4617 	if (nsid != 0 && nvme->n_idctl->id_fna.fn_format != 0)
4618 		return (EINVAL);
4619 
4620 	if (nsid != 0 && frmt.b.fm_ses != NVME_FRMT_SES_NONE &&
4621 	    nvme->n_idctl->id_fna.fn_sec_erase != 0)
4622 		return (EINVAL);
4623 
4624 	/*
4625 	 * Don't allow formatting with Protection Information.
4626 	 */
4627 	if (frmt.b.fm_pi != 0 || frmt.b.fm_pil != 0 || frmt.b.fm_ms != 0)
4628 		return (EINVAL);
4629 
4630 	/*
4631 	 * Don't allow formatting using an illegal LBA format, or any LBA format
4632 	 * that uses metadata.
4633 	 */
4634 	if (frmt.b.fm_lbaf > nvme->n_ns[c_nsid].ns_idns->id_nlbaf ||
4635 	    nvme->n_ns[c_nsid].ns_idns->id_lbaf[frmt.b.fm_lbaf].lbaf_ms != 0)
4636 		return (EINVAL);
4637 
4638 	/*
4639 	 * Don't allow formatting using an illegal Secure Erase setting.
4640 	 */
4641 	if (frmt.b.fm_ses > NVME_FRMT_MAX_SES ||
4642 	    (frmt.b.fm_ses == NVME_FRMT_SES_CRYPTO &&
4643 	    nvme->n_idctl->id_fna.fn_crypt_erase == 0))
4644 		return (EINVAL);
4645 
4646 	if (nsid == 0)
4647 		nsid = (uint32_t)-1;
4648 
4649 	return (nvme_format_nvm(nvme, B_TRUE, nsid, frmt.b.fm_lbaf, B_FALSE, 0,
4650 	    B_FALSE, frmt.b.fm_ses));
4651 }
4652 
4653 static int
4654 nvme_ioctl_detach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
4655     cred_t *cred_p)
4656 {
4657 	_NOTE(ARGUNUSED(nioc, mode));
4658 	int rv = 0;
4659 
4660 	if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0)
4661 		return (EPERM);
4662 
4663 	if (nsid == 0)
4664 		return (EINVAL);
4665 
4666 	if (nvme->n_ns[nsid - 1].ns_ignore)
4667 		return (0);
4668 
4669 	rv = bd_detach_handle(nvme->n_ns[nsid - 1].ns_bd_hdl);
4670 	if (rv != DDI_SUCCESS)
4671 		rv = EBUSY;
4672 
4673 	return (rv);
4674 }
4675 
4676 static int
4677 nvme_ioctl_attach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
4678     cred_t *cred_p)
4679 {
4680 	_NOTE(ARGUNUSED(nioc, mode));
4681 	nvme_identify_nsid_t *idns;
4682 	int rv = 0;
4683 
4684 	if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0)
4685 		return (EPERM);
4686 
4687 	if (nsid == 0)
4688 		return (EINVAL);
4689 
4690 	/*
4691 	 * Identify namespace again, free old identify data.
4692 	 */
4693 	idns = nvme->n_ns[nsid - 1].ns_idns;
4694 	if (nvme_init_ns(nvme, nsid) != DDI_SUCCESS)
4695 		return (EIO);
4696 
4697 	kmem_free(idns, sizeof (nvme_identify_nsid_t));
4698 
4699 	if (nvme->n_ns[nsid - 1].ns_ignore)
4700 		return (ENOTSUP);
4701 
4702 	if (nvme->n_ns[nsid - 1].ns_bd_hdl == NULL)
4703 		nvme->n_ns[nsid - 1].ns_bd_hdl = bd_alloc_handle(
4704 		    &nvme->n_ns[nsid - 1], &nvme_bd_ops, &nvme->n_prp_dma_attr,
4705 		    KM_SLEEP);
4706 
4707 	rv = bd_attach_handle(nvme->n_dip, nvme->n_ns[nsid - 1].ns_bd_hdl);
4708 	if (rv != DDI_SUCCESS)
4709 		rv = EBUSY;
4710 
4711 	return (rv);
4712 }
4713 
4714 static void
4715 nvme_ufm_update(nvme_t *nvme)
4716 {
4717 	mutex_enter(&nvme->n_fwslot_mutex);
4718 	ddi_ufm_update(nvme->n_ufmh);
4719 	if (nvme->n_fwslot != NULL) {
4720 		kmem_free(nvme->n_fwslot, sizeof (nvme_fwslot_log_t));
4721 		nvme->n_fwslot = NULL;
4722 	}
4723 	mutex_exit(&nvme->n_fwslot_mutex);
4724 }
4725 
4726 static int
4727 nvme_ioctl_firmware_download(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc,
4728     int mode, cred_t *cred_p)
4729 {
4730 	int rv = 0;
4731 	size_t len, copylen;
4732 	offset_t offset;
4733 	uintptr_t buf;
4734 	nvme_sqe_t sqe = {
4735 	    .sqe_opc	= NVME_OPC_FW_IMAGE_LOAD
4736 	};
4737 
4738 	if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0)
4739 		return (EPERM);
4740 
4741 	if (nsid != 0)
4742 		return (EINVAL);
4743 
4744 	/*
4745 	 * The offset (in n_len) is restricted to the number of DWORDs in
4746 	 * 32 bits.
4747 	 */
4748 	if (nioc->n_len > NVME_FW_OFFSETB_MAX)
4749 		return (EINVAL);
4750 
4751 	/* Confirm that both offset and length are a multiple of DWORD bytes */
4752 	if ((nioc->n_len & NVME_DWORD_MASK) != 0 ||
4753 	    (nioc->n_arg & NVME_DWORD_MASK) != 0)
4754 		return (EINVAL);
4755 
4756 	len = nioc->n_len;
4757 	offset = nioc->n_arg;
4758 	buf = (uintptr_t)nioc->n_buf;
4759 	while (len > 0 && rv == 0) {
4760 		/*
4761 		 * nvme_ioc_cmd() does not use SGLs or PRP lists.
4762 		 * It is limited to 2 PRPs per NVM command, so limit
4763 		 * the size of the data to 2 pages.
4764 		 */
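		/*
		 * As a sketch of the resulting chunking, assuming a 4 KiB
		 * controller page size: each full chunk moves 8 KiB, so
		 * sqe_cdw10 below becomes 2047 (2048 DWORDs, zero-based) and
		 * sqe_cdw11, the DWORD offset, advances by 2048 per chunk.
		 */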
4765 		copylen = MIN(2 * nvme->n_pagesize, len);
4766 
4767 		sqe.sqe_cdw10 = (uint32_t)(copylen >> NVME_DWORD_SHIFT) - 1;
4768 		sqe.sqe_cdw11 = (uint32_t)(offset >> NVME_DWORD_SHIFT);
4769 
4770 		rv = nvme_ioc_cmd(nvme, &sqe, B_TRUE, (void *)buf, copylen,
4771 		    FWRITE, NULL, nvme_admin_cmd_timeout);
4772 
4773 		buf += copylen;
4774 		offset += copylen;
4775 		len -= copylen;
4776 	}
4777 
4778 	/*
4779 	 * Let the DDI UFM subsystem know that the firmware information for
4780 	 * this device has changed.
4781 	 */
4782 	nvme_ufm_update(nvme);
4783 
4784 	return (rv);
4785 }
4786 
4787 static int
4788 nvme_ioctl_firmware_commit(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc,
4789     int mode, cred_t *cred_p)
4790 {
4791 	nvme_firmware_commit_dw10_t fc_dw10 = { 0 };
4792 	uint32_t slot = nioc->n_arg & 0xffffffff;
4793 	uint32_t action = nioc->n_arg >> 32;
4794 	nvme_cqe_t cqe = { 0 };
4795 	nvme_sqe_t sqe = {
4796 	    .sqe_opc	= NVME_OPC_FW_ACTIVATE
4797 	};
4798 	int timeout;
4799 	int rv;
4800 
4801 	if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0)
4802 		return (EPERM);
4803 
4804 	if (nsid != 0)
4805 		return (EINVAL);
4806 
4807 	/* Validate slot is in range. */
4808 	if (slot < NVME_FW_SLOT_MIN || slot > NVME_FW_SLOT_MAX)
4809 		return (EINVAL);
4810 
4811 	switch (action) {
4812 	case NVME_FWC_SAVE:
4813 	case NVME_FWC_SAVE_ACTIVATE:
4814 		timeout = nvme_commit_save_cmd_timeout;
4815 		break;
4816 	case NVME_FWC_ACTIVATE:
4817 	case NVME_FWC_ACTIVATE_IMMED:
4818 		timeout = nvme_admin_cmd_timeout;
4819 		break;
4820 	default:
4821 		return (EINVAL);
4822 	}
4823 
4824 	fc_dw10.b.fc_slot = slot;
4825 	fc_dw10.b.fc_action = action;
4826 	sqe.sqe_cdw10 = fc_dw10.r;
4827 
4828 	rv = nvme_ioc_cmd(nvme, &sqe, B_TRUE, NULL, 0, 0, &cqe, timeout);
4829 
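	/*
	 * Return the completion status to userspace in n_arg: the status
	 * code type is placed in bits 16 and up, the status code itself in
	 * the low bits, so a failed commit can be decoded by the caller.
	 */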
4830 	nioc->n_arg = ((uint64_t)cqe.cqe_sf.sf_sct << 16) | cqe.cqe_sf.sf_sc;
4831 
4832 	/*
4833 	 * Let the DDI UFM subsystem know that the firmware information for
4834 	 * this device has changed.
4835 	 */
4836 	nvme_ufm_update(nvme);
4837 
4838 	return (rv);
4839 }
4840 
4841 /*
4842  * Helper to copy in a passthru command from userspace, handling
4843  * different data models.
4844  */
4845 static int
4846 nvme_passthru_copy_cmd_in(const void *buf, nvme_passthru_cmd_t *cmd, int mode)
4847 {
4848 #ifdef _MULTI_DATAMODEL
4849 	switch (ddi_model_convert_from(mode & FMODELS)) {
4850 	case DDI_MODEL_ILP32: {
4851 		nvme_passthru_cmd32_t cmd32;
4852 		if (ddi_copyin(buf, (void*)&cmd32, sizeof (cmd32), mode) != 0)
4853 			return (-1);
4854 		cmd->npc_opcode = cmd32.npc_opcode;
4855 		cmd->npc_timeout = cmd32.npc_timeout;
4856 		cmd->npc_flags = cmd32.npc_flags;
4857 		cmd->npc_cdw12 = cmd32.npc_cdw12;
4858 		cmd->npc_cdw13 = cmd32.npc_cdw13;
4859 		cmd->npc_cdw14 = cmd32.npc_cdw14;
4860 		cmd->npc_cdw15 = cmd32.npc_cdw15;
4861 		cmd->npc_buflen = cmd32.npc_buflen;
4862 		cmd->npc_buf = cmd32.npc_buf;
4863 		break;
4864 	}
4865 	case DDI_MODEL_NONE:
4866 #endif
4867 	if (ddi_copyin(buf, (void*)cmd, sizeof (nvme_passthru_cmd_t),
4868 	    mode) != 0)
4869 		return (-1);
4870 #ifdef _MULTI_DATAMODEL
4871 		break;
4872 	}
4873 #endif
4874 	return (0);
4875 }
4876 
4877 /*
4878  * Helper to copy out a passthru command result to userspace, handling
4879  * different data models.
4880  */
4881 static int
4882 nvme_passthru_copy_cmd_out(const nvme_passthru_cmd_t *cmd, void *buf, int mode)
4883 {
4884 #ifdef _MULTI_DATAMODEL
4885 	switch (ddi_model_convert_from(mode & FMODELS)) {
4886 	case DDI_MODEL_ILP32: {
4887 		nvme_passthru_cmd32_t cmd32;
4888 		bzero(&cmd32, sizeof (cmd32));
4889 		cmd32.npc_opcode = cmd->npc_opcode;
4890 		cmd32.npc_status = cmd->npc_status;
4891 		cmd32.npc_err = cmd->npc_err;
4892 		cmd32.npc_timeout = cmd->npc_timeout;
4893 		cmd32.npc_flags = cmd->npc_flags;
4894 		cmd32.npc_cdw0 = cmd->npc_cdw0;
4895 		cmd32.npc_cdw12 = cmd->npc_cdw12;
4896 		cmd32.npc_cdw13 = cmd->npc_cdw13;
4897 		cmd32.npc_cdw14 = cmd->npc_cdw14;
4898 		cmd32.npc_cdw15 = cmd->npc_cdw15;
4899 		cmd32.npc_buflen = (size32_t)cmd->npc_buflen;
4900 		cmd32.npc_buf = (uintptr32_t)cmd->npc_buf;
4901 		if (ddi_copyout(&cmd32, buf, sizeof (cmd32), mode) != 0)
4902 			return (-1);
4903 		break;
4904 	}
4905 	case DDI_MODEL_NONE:
4906 #endif
4907 		if (ddi_copyout(cmd, buf, sizeof (nvme_passthru_cmd_t),
4908 		    mode) != 0)
4909 			return (-1);
4910 #ifdef _MULTI_DATAMODEL
4911 		break;
4912 	}
4913 #endif
4914 	return (0);
4915 }
4916 
4917 /*
4918  * Run an arbitrary vendor-specific admin command on the device.
4919  */
4920 static int
4921 nvme_ioctl_passthru(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
4922     cred_t *cred_p)
4923 {
4924 	int rv = 0;
4925 	uint_t timeout = 0;
4926 	int rwk = 0;
4927 	nvme_passthru_cmd_t cmd;
4928 	size_t expected_passthru_size = 0;
4929 	nvme_sqe_t sqe;
4930 	nvme_cqe_t cqe;
4931 
4932 	bzero(&cmd, sizeof (cmd));
4933 	bzero(&sqe, sizeof (sqe));
4934 	bzero(&cqe, sizeof (cqe));
4935 
4936 	/*
4937 	 * Basic checks: permissions, data model, argument size.
4938 	 */
4939 	if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0)
4940 		return (EPERM);
4941 
4942 	/*
4943 	 * Compute the expected size of the argument buffer
4944 	 */
4945 #ifdef _MULTI_DATAMODEL
4946 	switch (ddi_model_convert_from(mode & FMODELS)) {
4947 	case DDI_MODEL_ILP32:
4948 		expected_passthru_size = sizeof (nvme_passthru_cmd32_t);
4949 		break;
4950 	case DDI_MODEL_NONE:
4951 #endif
4952 		expected_passthru_size = sizeof (nvme_passthru_cmd_t);
4953 #ifdef _MULTI_DATAMODEL
4954 		break;
4955 	}
4956 #endif
4957 
4958 	if (nioc->n_len != expected_passthru_size) {
4959 		cmd.npc_err = NVME_PASSTHRU_ERR_CMD_SIZE;
4960 		rv = EINVAL;
4961 		goto out;
4962 	}
4963 
4964 	/*
4965 	 * Ensure the device supports the standard vendor specific
4966 	 * admin command format.
4967 	 */
4968 	if (!nvme->n_idctl->id_nvscc.nv_spec) {
4969 		cmd.npc_err = NVME_PASSTHRU_ERR_NOT_SUPPORTED;
4970 		rv = ENOTSUP;
4971 		goto out;
4972 	}
4973 
4974 	if (nvme_passthru_copy_cmd_in((const void*)nioc->n_buf, &cmd, mode))
4975 		return (EFAULT);
4976 
4977 	if (!NVME_IS_VENDOR_UNIQUE_CMD(cmd.npc_opcode)) {
4978 		cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_OPCODE;
4979 		rv = EINVAL;
4980 		goto out;
4981 	}
4982 
4983 	/*
4984 	 * This restriction is not mandated by the spec, so future work
4985 	 * could relax this if it's necessary to support commands that both
4986 	 * read and write.
4987 	 */
4988 	if ((cmd.npc_flags & NVME_PASSTHRU_READ) != 0 &&
4989 	    (cmd.npc_flags & NVME_PASSTHRU_WRITE) != 0) {
4990 		cmd.npc_err = NVME_PASSTHRU_ERR_READ_AND_WRITE;
4991 		rv = EINVAL;
4992 		goto out;
4993 	}
4994 	if (cmd.npc_timeout > nvme_vendor_specific_admin_cmd_max_timeout) {
4995 		cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_TIMEOUT;
4996 		rv = EINVAL;
4997 		goto out;
4998 	}
4999 	timeout = cmd.npc_timeout;
5000 
5001 	/*
5002 	 * Passed-thru command buffer verification:
5003 	 *  - Size is a multiple of DWORDs
5004 	 *  - Non-NULL iff the length is non-zero
5005 	 *  - NULL if neither reading nor writing data
5006 	 *  - Non-NULL if reading or writing data
5007 	 *  - Does not exceed the maximum buffer size
5008 	 */
5009 	if ((cmd.npc_buflen % sizeof (uint32_t)) != 0) {
5010 		cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER;
5011 		rv = EINVAL;
5012 		goto out;
5013 	}
5014 	if (((void*)cmd.npc_buf != NULL && cmd.npc_buflen == 0) ||
5015 	    ((void*)cmd.npc_buf == NULL && cmd.npc_buflen != 0)) {
5016 		cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER;
5017 		rv = EINVAL;
5018 		goto out;
5019 	}
5020 	if (cmd.npc_flags == 0 && (void*)cmd.npc_buf != NULL) {
5021 		cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER;
5022 		rv = EINVAL;
5023 		goto out;
5024 	}
5025 	if ((cmd.npc_flags != 0) && ((void*)cmd.npc_buf == NULL)) {
5026 		cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER;
5027 		rv = EINVAL;
5028 		goto out;
5029 	}
5030 	if (cmd.npc_buflen > nvme_vendor_specific_admin_cmd_size) {
5031 		cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER;
5032 		rv = EINVAL;
5033 		goto out;
5034 	}
5035 	if ((cmd.npc_buflen >> NVME_DWORD_SHIFT) > UINT32_MAX) {
5036 		cmd.npc_err = NVME_PASSTHRU_ERR_INVALID_BUFFER;
5037 		rv = EINVAL;
5038 		goto out;
5039 	}
5040 
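	/*
	 * Build the vendor-unique SQE. cdw10 carries the data transfer
	 * length in DWORDs (presumably per the standard vendor specific
	 * command format checked above); cdw12-15 are passed through from
	 * the caller unchanged.
	 */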
5041 	sqe.sqe_opc = cmd.npc_opcode;
5042 	sqe.sqe_nsid = nsid;
5043 	sqe.sqe_cdw10 = (uint32_t)(cmd.npc_buflen >> NVME_DWORD_SHIFT);
5044 	sqe.sqe_cdw12 = cmd.npc_cdw12;
5045 	sqe.sqe_cdw13 = cmd.npc_cdw13;
5046 	sqe.sqe_cdw14 = cmd.npc_cdw14;
5047 	sqe.sqe_cdw15 = cmd.npc_cdw15;
5048 	if ((cmd.npc_flags & NVME_PASSTHRU_READ) != 0)
5049 		rwk = FREAD;
5050 	else if ((cmd.npc_flags & NVME_PASSTHRU_WRITE) != 0)
5051 		rwk = FWRITE;
5052 
5053 	rv = nvme_ioc_cmd(nvme, &sqe, B_TRUE, (void*)cmd.npc_buf,
5054 	    cmd.npc_buflen, rwk, &cqe, timeout);
5055 	cmd.npc_status = cqe.cqe_sf.sf_sc;
5056 	cmd.npc_cdw0 = cqe.cqe_dw0;
5057 
5058 out:
5059 	if (nvme_passthru_copy_cmd_out(&cmd, (void*)nioc->n_buf, mode))
5060 		rv = EFAULT;
5061 	return (rv);
5062 }
5063 
5064 static int
5065 nvme_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p,
5066     int *rval_p)
5067 {
5068 #ifndef __lock_lint
5069 	_NOTE(ARGUNUSED(rval_p));
5070 #endif
5071 	minor_t minor = getminor(dev);
5072 	nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor));
5073 	int nsid = NVME_MINOR_NSID(minor);
5074 	int rv = 0;
5075 	nvme_ioctl_t nioc;
5076 
5077 	int (*nvme_ioctl[])(nvme_t *, int, nvme_ioctl_t *, int, cred_t *) = {
5078 		NULL,
5079 		nvme_ioctl_identify,
5080 		nvme_ioctl_identify,
5081 		nvme_ioctl_capabilities,
5082 		nvme_ioctl_get_logpage,
5083 		nvme_ioctl_get_features,
5084 		nvme_ioctl_intr_cnt,
5085 		nvme_ioctl_version,
5086 		nvme_ioctl_format,
5087 		nvme_ioctl_detach,
5088 		nvme_ioctl_attach,
5089 		nvme_ioctl_firmware_download,
5090 		nvme_ioctl_firmware_commit,
5091 		nvme_ioctl_passthru
5092 	};
5093 
5094 	if (nvme == NULL)
5095 		return (ENXIO);
5096 
5097 	if (nsid > nvme->n_namespace_count)
5098 		return (ENXIO);
5099 
5100 	if (IS_DEVCTL(cmd))
5101 		return (ndi_devctl_ioctl(nvme->n_dip, cmd, arg, mode, 0));
5102 
5103 #ifdef _MULTI_DATAMODEL
5104 	switch (ddi_model_convert_from(mode & FMODELS)) {
5105 	case DDI_MODEL_ILP32: {
5106 		nvme_ioctl32_t nioc32;
5107 		if (ddi_copyin((void*)arg, &nioc32, sizeof (nvme_ioctl32_t),
5108 		    mode) != 0)
5109 			return (EFAULT);
5110 		nioc.n_len = nioc32.n_len;
5111 		nioc.n_buf = nioc32.n_buf;
5112 		nioc.n_arg = nioc32.n_arg;
5113 		break;
5114 	}
5115 	case DDI_MODEL_NONE:
5116 #endif
5117 		if (ddi_copyin((void*)arg, &nioc, sizeof (nvme_ioctl_t), mode)
5118 		    != 0)
5119 			return (EFAULT);
5120 #ifdef _MULTI_DATAMODEL
5121 		break;
5122 	}
5123 #endif
5124 
5125 	if (nvme->n_dead && cmd != NVME_IOC_DETACH)
5126 		return (EIO);
5127 
5128 
5129 	if (cmd == NVME_IOC_IDENTIFY_CTRL) {
5130 		/*
5131 		 * This makes NVME_IOC_IDENTIFY_CTRL work the same on devctl and
5132 		 * attachment point nodes.
5133 		 */
5134 		nsid = 0;
5135 	} else if (cmd == NVME_IOC_IDENTIFY_NSID && nsid == 0) {
5136 		/*
5137 		 * This makes NVME_IOC_IDENTIFY_NSID work on a devctl node, it
5138 		 * will always return identify data for namespace 1.
5139 		 */
5140 		nsid = 1;
5141 	}
5142 
5143 	if (IS_NVME_IOC(cmd) && nvme_ioctl[NVME_IOC_CMD(cmd)] != NULL)
5144 		rv = nvme_ioctl[NVME_IOC_CMD(cmd)](nvme, nsid, &nioc, mode,
5145 		    cred_p);
5146 	else
5147 		rv = EINVAL;
5148 
5149 #ifdef _MULTI_DATAMODEL
5150 	switch (ddi_model_convert_from(mode & FMODELS)) {
5151 	case DDI_MODEL_ILP32: {
5152 		nvme_ioctl32_t nioc32;
5153 
5154 		nioc32.n_len = (size32_t)nioc.n_len;
5155 		nioc32.n_buf = (uintptr32_t)nioc.n_buf;
5156 		nioc32.n_arg = nioc.n_arg;
5157 
5158 		if (ddi_copyout(&nioc32, (void *)arg, sizeof (nvme_ioctl32_t),
5159 		    mode) != 0)
5160 			return (EFAULT);
5161 		break;
5162 	}
5163 	case DDI_MODEL_NONE:
5164 #endif
5165 		if (ddi_copyout(&nioc, (void *)arg, sizeof (nvme_ioctl_t), mode)
5166 		    != 0)
5167 			return (EFAULT);
5168 #ifdef _MULTI_DATAMODEL
5169 		break;
5170 	}
5171 #endif
5172 
5173 	return (rv);
5174 }
5175 
5176 /*
5177  * DDI UFM Callbacks
5178  */
5179 static int
5180 nvme_ufm_fill_image(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno,
5181     ddi_ufm_image_t *img)
5182 {
5183 	nvme_t *nvme = arg;
5184 
5185 	if (imgno != 0)
5186 		return (EINVAL);
5187 
5188 	ddi_ufm_image_set_desc(img, "Firmware");
5189 	ddi_ufm_image_set_nslots(img, nvme->n_idctl->id_frmw.fw_nslot);
5190 
5191 	return (0);
5192 }
5193 
5194 /*
5195  * Fill out firmware slot information for the requested slot.  The firmware
5196  * slot information is gathered by requesting the Firmware Slot Information log
5197  * page.  The format of the page is described in section 5.10.1.3.
5198  *
5199  * We lazily cache the log page on the first call and then invalidate the cache
5200  * data after a successful firmware download or firmware commit command.
5201  * The cached data is protected by a mutex as the state can change
5202  * asynchronously to this callback.
5203  */
5204 static int
5205 nvme_ufm_fill_slot(ddi_ufm_handle_t *ufmh, void *arg, uint_t imgno,
5206     uint_t slotno, ddi_ufm_slot_t *slot)
5207 {
5208 	nvme_t *nvme = arg;
5209 	void *log = NULL;
5210 	size_t bufsize;
5211 	ddi_ufm_attr_t attr = 0;
5212 	char fw_ver[NVME_FWVER_SZ + 1];
5213 	int ret;
5214 
5215 	if (imgno > 0 || slotno > (nvme->n_idctl->id_frmw.fw_nslot - 1))
5216 		return (EINVAL);
5217 
5218 	mutex_enter(&nvme->n_fwslot_mutex);
5219 	if (nvme->n_fwslot == NULL) {
5220 		ret = nvme_get_logpage(nvme, B_TRUE, &log, &bufsize,
5221 		    NVME_LOGPAGE_FWSLOT, 0);
5222 		if (ret != DDI_SUCCESS ||
5223 		    bufsize != sizeof (nvme_fwslot_log_t)) {
5224 			if (log != NULL)
5225 				kmem_free(log, bufsize);
5226 			mutex_exit(&nvme->n_fwslot_mutex);
5227 			return (EIO);
5228 		}
5229 		nvme->n_fwslot = (nvme_fwslot_log_t *)log;
5230 	}
5231 
5232 	/*
5233 	 * NVMe numbers firmware slots starting at 1
5234 	 */
5235 	if (slotno == (nvme->n_fwslot->fw_afi - 1))
5236 		attr |= DDI_UFM_ATTR_ACTIVE;
5237 
5238 	if (slotno != 0 || nvme->n_idctl->id_frmw.fw_readonly == 0)
5239 		attr |= DDI_UFM_ATTR_WRITEABLE;
5240 
5241 	if (nvme->n_fwslot->fw_frs[slotno][0] == '\0') {
5242 		attr |= DDI_UFM_ATTR_EMPTY;
5243 	} else {
5244 		(void) strncpy(fw_ver, nvme->n_fwslot->fw_frs[slotno],
5245 		    NVME_FWVER_SZ);
5246 		fw_ver[NVME_FWVER_SZ] = '\0';
5247 		ddi_ufm_slot_set_version(slot, fw_ver);
5248 	}
5249 	mutex_exit(&nvme->n_fwslot_mutex);
5250 
5251 	ddi_ufm_slot_set_attrs(slot, attr);
5252 
5253 	return (0);
5254 }
5255 
5256 static int
5257 nvme_ufm_getcaps(ddi_ufm_handle_t *ufmh, void *arg, ddi_ufm_cap_t *caps)
5258 {
5259 	*caps = DDI_UFM_CAP_REPORT;
5260 	return (0);
5261 }
5262