xref: /illumos-gate/usr/src/cmd/bhyve/pci_nvme.c (revision a4955f4fa65e38d70c07d38e657a9aff43fa155f)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = Dataset Management support. Option is one of auto, enable, or disable
51  *
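 *  Example (illustrative slot and parameter values only):
 *    -s 4,nvme,ram=1024,maxq=4,qsz=256,ioslots=16,ser=BHYVE0001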
52  */
53 
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58 
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61 
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65 #ifndef __FreeBSD__
66 #include <endian.h>
67 #endif
68 
69 #include <assert.h>
70 #include <pthread.h>
71 #include <pthread_np.h>
72 #include <semaphore.h>
73 #include <stdbool.h>
74 #include <stddef.h>
75 #include <stdint.h>
76 #include <stdio.h>
77 #include <stdlib.h>
78 #include <string.h>
79 
80 #include <machine/atomic.h>
81 #include <machine/vmm.h>
82 #include <vmmapi.h>
83 
84 #include <dev/nvme/nvme.h>
85 
86 #include "bhyverun.h"
87 #include "block_if.h"
88 #include "config.h"
89 #include "debug.h"
90 #include "pci_emul.h"
91 
92 
93 static int nvme_debug = 0;
94 #define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
95 #define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
96 
97 /* defaults; can be overridden */
98 #define	NVME_MSIX_BAR		4
99 
100 #define	NVME_IOSLOTS		8
101 
102 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
103 #define NVME_MMIO_SPACE_MIN	(1 << 14)
104 
105 #define	NVME_QUEUES		16
106 #define	NVME_MAX_QENTRIES	2048
107 /* Memory Page size Minimum reported in CAP register */
108 #define	NVME_MPSMIN		0
109 /* MPSMIN converted to bytes */
110 #define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))
111 
112 #define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
113 #define	NVME_MDTS		9
114 /* Note the + 1 allows for the initial descriptor to not be page aligned */
115 #define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
116 #define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
117 
118 /* This is a synthetic status code to indicate there is no status */
119 #define NVME_NO_STATUS		0xffff
120 #define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
121 
122 /* Reported temperature in Kelvin (i.e. room temperature) */
123 #define NVME_TEMPERATURE 296
124 
125 /* helpers */
126 
127 /* Convert a zero-based value into a one-based value */
128 #define ONE_BASED(zero)		((zero) + 1)
129 /* Convert a one-based value into a zero-based value */
130 #define ZERO_BASED(one)		((one)  - 1)
131 
132 /* Encode number of SQ's and CQ's for Set/Get Features */
133 #define NVME_FEATURE_NUM_QUEUES(sc) \
134 	(ZERO_BASED((sc)->num_squeues) & 0xffff) | \
135 	(ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16;
136 
137 #define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
138 
139 enum nvme_controller_register_offsets {
140 	NVME_CR_CAP_LOW = 0x00,
141 	NVME_CR_CAP_HI  = 0x04,
142 	NVME_CR_VS      = 0x08,
143 	NVME_CR_INTMS   = 0x0c,
144 	NVME_CR_INTMC   = 0x10,
145 	NVME_CR_CC      = 0x14,
146 	NVME_CR_CSTS    = 0x1c,
147 	NVME_CR_NSSR    = 0x20,
148 	NVME_CR_AQA     = 0x24,
149 	NVME_CR_ASQ_LOW = 0x28,
150 	NVME_CR_ASQ_HI  = 0x2c,
151 	NVME_CR_ACQ_LOW = 0x30,
152 	NVME_CR_ACQ_HI  = 0x34,
153 };
154 
155 enum nvme_cmd_cdw11 {
156 	NVME_CMD_CDW11_PC  = 0x0001,
157 	NVME_CMD_CDW11_IEN = 0x0002,
158 	NVME_CMD_CDW11_IV  = 0xFFFF0000,
159 };
160 
161 enum nvme_copy_dir {
162 	NVME_COPY_TO_PRP,
163 	NVME_COPY_FROM_PRP,
164 };
165 
166 #define	NVME_CQ_INTEN	0x01
167 #define	NVME_CQ_INTCOAL	0x02
168 
169 struct nvme_completion_queue {
170 	struct nvme_completion *qbase;
171 	pthread_mutex_t	mtx;
172 	uint32_t	size;
173 	uint16_t	tail; /* nvme progress */
174 	uint16_t	head; /* guest progress */
175 	uint16_t	intr_vec;
176 	uint32_t	intr_en;
177 };
178 
179 struct nvme_submission_queue {
180 	struct nvme_command *qbase;
181 	pthread_mutex_t	mtx;
182 	uint32_t	size;
183 	uint16_t	head; /* nvme progress */
184 	uint16_t	tail; /* guest progress */
185 	uint16_t	cqid; /* completion queue id */
186 	int		qpriority;
187 };
188 
189 enum nvme_storage_type {
190 	NVME_STOR_BLOCKIF = 0,
191 	NVME_STOR_RAM = 1,
192 };
193 
194 struct pci_nvme_blockstore {
195 	enum nvme_storage_type type;
196 	void		*ctx;
197 	uint64_t	size;
198 	uint32_t	sectsz;
199 	uint32_t	sectsz_bits;
200 	uint64_t	eui64;
201 	uint32_t	deallocate:1;
202 };
203 
204 /*
205  * Calculate the number of additional page descriptors for guest IO requests
206  * based on the advertised Max Data Transfer (MDTS) and given the number of
207  * default iovec's in a struct blockif_req.
208  */
209 #define MDTS_PAD_SIZE \
210 	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
211 	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
212 	  0 )
213 
214 struct pci_nvme_ioreq {
215 	struct pci_nvme_softc *sc;
216 	STAILQ_ENTRY(pci_nvme_ioreq) link;
217 	struct nvme_submission_queue *nvme_sq;
218 	uint16_t	sqid;
219 
220 	/* command information */
221 	uint16_t	opc;
222 	uint16_t	cid;
223 	uint32_t	nsid;
224 
225 	uint64_t	prev_gpaddr;
226 	size_t		prev_size;
227 	size_t		bytes;
228 
229 	struct blockif_req io_req;
230 
231 	struct iovec	iovpadding[MDTS_PAD_SIZE];
232 };
233 
234 enum nvme_dsm_type {
235 	/* Dataset Management bit in ONCS reflects backing storage capability */
236 	NVME_DATASET_MANAGEMENT_AUTO,
237 	/* Unconditionally set Dataset Management bit in ONCS */
238 	NVME_DATASET_MANAGEMENT_ENABLE,
239 	/* Unconditionally clear Dataset Management bit in ONCS */
240 	NVME_DATASET_MANAGEMENT_DISABLE,
241 };
242 
243 struct pci_nvme_softc;
244 struct nvme_feature_obj;
245 
246 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
247     struct nvme_feature_obj *,
248     struct nvme_command *,
249     struct nvme_completion *);
250 
251 struct nvme_feature_obj {
252 	uint32_t	cdw11;
253 	nvme_feature_cb	set;
254 	nvme_feature_cb	get;
255 	bool namespace_specific;
256 };
257 
258 #define NVME_FID_MAX		(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
259 
260 typedef enum {
261 	PCI_NVME_AE_TYPE_ERROR = 0,
262 	PCI_NVME_AE_TYPE_SMART,
263 	PCI_NVME_AE_TYPE_NOTICE,
264 	PCI_NVME_AE_TYPE_IO_CMD = 6,
265 	PCI_NVME_AE_TYPE_VENDOR = 7,
266 	PCI_NVME_AE_TYPE_MAX		/* Must be last */
267 } pci_nvme_async_type;
268 
269 /* Asynchronous Event Requests */
270 struct pci_nvme_aer {
271 	STAILQ_ENTRY(pci_nvme_aer) link;
272 	uint16_t	cid;	/* Command ID of the submitted AER */
273 };
274 
275 /** Asynchronous Event Information - Notice */
276 typedef enum {
277 	PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0,
278 	PCI_NVME_AEI_NOTICE_FW_ACTIVATION,
279 	PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE,
280 	PCI_NVME_AEI_NOTICE_ANA_CHANGE,
281 	PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE,
282 	PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT,
283 	PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE,
284 	PCI_NVME_AEI_NOTICE_MAX,
285 } pci_nvme_async_event_info_notice;
286 
287 #define PCI_NVME_AEI_NOTICE_SHIFT		8
288 #define PCI_NVME_AEI_NOTICE_MASK(event)	(1 << (event + PCI_NVME_AEI_NOTICE_SHIFT))
289 
290 /* Asynchronous Event Notifications */
291 struct pci_nvme_aen {
292 	pci_nvme_async_type atype;
293 	uint32_t	event_data;
294 	bool		posted;
295 };
296 
297 /*
298  * By default, enable all Asynchronous Event Notifications:
299  *     SMART / Health Critical Warnings
300  *     Namespace Attribute Notices
301  */
302 #define PCI_NVME_AEN_DEFAULT_MASK	0x11f
303 
304 typedef enum {
305 	NVME_CNTRLTYPE_IO = 1,
306 	NVME_CNTRLTYPE_DISCOVERY = 2,
307 	NVME_CNTRLTYPE_ADMIN = 3,
308 } pci_nvme_cntrl_type;
309 
310 struct pci_nvme_softc {
311 	struct pci_devinst *nsc_pi;
312 
313 	pthread_mutex_t	mtx;
314 
315 	struct nvme_registers regs;
316 
317 	struct nvme_namespace_data  nsdata;
318 	struct nvme_controller_data ctrldata;
319 	struct nvme_error_information_entry err_log;
320 	struct nvme_health_information_page health_log;
321 	struct nvme_firmware_page fw_log;
322 	struct nvme_ns_list ns_log;
323 
324 	struct pci_nvme_blockstore nvstore;
325 
326 	uint16_t	max_qentries;	/* max entries per queue */
327 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
328 	uint32_t	num_cqueues;
329 	uint32_t	num_squeues;
330 	bool		num_q_is_set; /* Has host set Number of Queues */
331 
332 	struct pci_nvme_ioreq *ioreqs;
333 	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
334 	uint32_t	pending_ios;
335 	uint32_t	ioslots;
336 	sem_t		iosemlock;
337 
338 	/*
339 	 * Memory mapped Submission and Completion queues
340 	 * Each array includes both Admin and IO queues
341 	 */
342 	struct nvme_completion_queue *compl_queues;
343 	struct nvme_submission_queue *submit_queues;
344 
345 	struct nvme_feature_obj feat[NVME_FID_MAX];
346 
347 	enum nvme_dsm_type dataset_management;
348 
349 	/* Accounting for SMART data */
350 	__uint128_t	read_data_units;
351 	__uint128_t	write_data_units;
352 	__uint128_t	read_commands;
353 	__uint128_t	write_commands;
354 	uint32_t	read_dunits_remainder;
355 	uint32_t	write_dunits_remainder;
356 
357 	STAILQ_HEAD(, pci_nvme_aer) aer_list;
358 	pthread_mutex_t	aer_mtx;
359 	uint32_t	aer_count;
360 	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
361 	pthread_t	aen_tid;
362 	pthread_mutex_t	aen_mtx;
363 	pthread_cond_t	aen_cond;
364 };
365 
366 
367 static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
368     struct nvme_completion_queue *cq,
369     uint32_t cdw0,
370     uint16_t cid,
371     uint16_t sqid,
372     uint16_t status);
373 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
374 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
375 static void pci_nvme_io_done(struct blockif_req *, int);
376 
377 /* Controller Configuration utils */
378 #define	NVME_CC_GET_EN(cc) \
379 	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
380 #define	NVME_CC_GET_CSS(cc) \
381 	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
382 #define	NVME_CC_GET_SHN(cc) \
383 	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
384 #define	NVME_CC_GET_IOSQES(cc) \
385 	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
386 #define	NVME_CC_GET_IOCQES(cc) \
387 	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
388 
389 #define	NVME_CC_WRITE_MASK \
390 	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
391 	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
392 	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
393 
394 #define	NVME_CC_NEN_WRITE_MASK \
395 	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
396 	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
397 	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
398 
399 /* Controller Status utils */
400 #define	NVME_CSTS_GET_RDY(sts) \
401 	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
402 
403 #define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
404 #define	NVME_CSTS_CFS	(1 << NVME_CSTS_REG_CFS_SHIFT)
405 
406 /* Completion Queue status word utils */
407 #define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
408 #define	NVME_STATUS_MASK \
409 	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
410 	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
411 
412 #define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
413 	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
414 
415 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
416     struct nvme_feature_obj *,
417     struct nvme_command *,
418     struct nvme_completion *);
419 static void nvme_feature_temperature(struct pci_nvme_softc *,
420     struct nvme_feature_obj *,
421     struct nvme_command *,
422     struct nvme_completion *);
423 static void nvme_feature_num_queues(struct pci_nvme_softc *,
424     struct nvme_feature_obj *,
425     struct nvme_command *,
426     struct nvme_completion *);
427 static void nvme_feature_iv_config(struct pci_nvme_softc *,
428     struct nvme_feature_obj *,
429     struct nvme_command *,
430     struct nvme_completion *);
431 static void nvme_feature_async_event(struct pci_nvme_softc *,
432     struct nvme_feature_obj *,
433     struct nvme_command *,
434     struct nvme_completion *);
435 
436 static void *aen_thr(void *arg);
437 
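/* Copy 'src' into 'dst', padding the remainder of 'dst' with 'pad' */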
438 static __inline void
439 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
440 {
441 	size_t len;
442 
443 	len = strnlen(src, dst_size);
444 	memset(dst, pad, dst_size);
445 	memcpy(dst, src, len);
446 }
447 
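/* Set the Status Code Type (SCT) and Status Code (SC) fields of a completion status word */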
448 static __inline void
449 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
450 {
451 
452 	*status &= ~NVME_STATUS_MASK;
453 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
454 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
455 }
456 
457 static __inline void
458 pci_nvme_status_genc(uint16_t *status, uint16_t code)
459 {
460 
461 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
462 }
463 
464 /*
465  * Initialize the requested number of IO Submission and Completion Queues.
466  * Admin queues are allocated implicitly.
467  */
468 static void
469 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
470 {
471 	uint32_t i;
472 
473 	/*
474 	 * Allocate and initialize the Submission Queues
475 	 */
476 	if (nsq > NVME_QUEUES) {
477 		WPRINTF("%s: clamping number of SQ from %u to %u",
478 					__func__, nsq, NVME_QUEUES);
479 		nsq = NVME_QUEUES;
480 	}
481 
482 	sc->num_squeues = nsq;
483 
484 	sc->submit_queues = calloc(sc->num_squeues + 1,
485 				sizeof(struct nvme_submission_queue));
486 	if (sc->submit_queues == NULL) {
487 		WPRINTF("%s: SQ allocation failed", __func__);
488 		sc->num_squeues = 0;
489 	} else {
490 		struct nvme_submission_queue *sq = sc->submit_queues;
491 
492 		for (i = 0; i < sc->num_squeues + 1; i++)
493 			pthread_mutex_init(&sq[i].mtx, NULL);
494 	}
495 
496 	/*
497 	 * Allocate and initialize the Completion Queues
498 	 */
499 	if (ncq > NVME_QUEUES) {
500 		WPRINTF("%s: clamping number of CQ from %u to %u",
501 					__func__, ncq, NVME_QUEUES);
502 		ncq = NVME_QUEUES;
503 	}
504 
505 	sc->num_cqueues = ncq;
506 
507 	sc->compl_queues = calloc(sc->num_cqueues + 1,
508 				sizeof(struct nvme_completion_queue));
509 	if (sc->compl_queues == NULL) {
510 		WPRINTF("%s: CQ allocation failed", __func__);
511 		sc->num_cqueues = 0;
512 	} else {
513 		struct nvme_completion_queue *cq = sc->compl_queues;
514 
515 		for (i = 0; i < sc->num_cqueues + 1; i++)
516 			pthread_mutex_init(&cq[i].mtx, NULL);
517 	}
518 }
519 
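/* Populate the Identify Controller data structure with this emulation's static values */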
520 static void
521 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
522 {
523 	struct nvme_controller_data *cd = &sc->ctrldata;
524 
525 	cd->vid = 0xFB5D;
526 	cd->ssvid = 0x0000;
527 
528 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
529 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
530 
531 	/* Num of submission commands that we can handle at a time (2^rab) */
532 	cd->rab   = 4;
533 
534 	/* FreeBSD OUI */
535 	cd->ieee[0] = 0x58;
536 	cd->ieee[1] = 0x9c;
537 	cd->ieee[2] = 0xfc;
538 
539 	cd->mic = 0;
540 
541 	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
542 
543 	cd->ver = NVME_REV(1,4);
544 
545 	cd->cntrltype = NVME_CNTRLTYPE_IO;
546 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
547 	cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR);
548 	cd->acl = 2;
549 	cd->aerl = 4;
550 
551 	/* Advertise 1, Read-only firmware slot */
552 	cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) |
553 	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
554 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
555 	cd->elpe = 0;	/* max error log page entries */
556 	/*
557 	 * Report a single power state (zero-based value)
558 	 * power_state[] values are left as zero to indicate "Not reported"
559 	 */
560 	cd->npss = 0;
561 
562 	/* Warning Composite Temperature Threshold */
563 	cd->wctemp = 0x0157;
564 	cd->cctemp = 0x0157;
565 
566 	/* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */
567 	cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO <<
568 			NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT);
569 
570 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
571 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
572 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
573 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
574 	cd->nn = 1;	/* number of namespaces */
575 
576 	cd->oncs = 0;
577 	switch (sc->dataset_management) {
578 	case NVME_DATASET_MANAGEMENT_AUTO:
579 		if (sc->nvstore.deallocate)
580 			cd->oncs |= NVME_ONCS_DSM;
581 		break;
582 	case NVME_DATASET_MANAGEMENT_ENABLE:
583 		cd->oncs |= NVME_ONCS_DSM;
584 		break;
585 	default:
586 		break;
587 	}
588 
589 	cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK <<
590 	    NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT;
591 
592 	cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT;
593 }
594 
595 /*
596  * Calculate the CRC-16 of the given buffer
597  * See copyright attribution at top of file
598  */
599 static uint16_t
600 crc16(uint16_t crc, const void *buffer, unsigned int len)
601 {
602 	const unsigned char *cp = buffer;
603 	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
604 	static uint16_t const crc16_table[256] = {
605 		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
606 		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
607 		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
608 		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
609 		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
610 		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
611 		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
612 		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
613 		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
614 		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
615 		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
616 		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
617 		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
618 		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
619 		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
620 		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
621 		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
622 		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
623 		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
624 		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
625 		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
626 		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
627 		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
628 		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
629 		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
630 		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
631 		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
632 		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
633 		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
634 		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
635 		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
636 		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
637 	};
638 
639 	while (len--)
640 		crc = (((crc >> 8) & 0xffU) ^
641 		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
642 	return crc;
643 }
644 
645 static void
646 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
647     struct nvme_namespace_data *nd)
648 {
649 
650 	/* Get capacity and block size information from backing store */
651 	nd->nsze = nvstore->size / nvstore->sectsz;
652 	nd->ncap = nd->nsze;
653 	nd->nuse = nd->nsze;
654 }
655 
656 static void
657 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
658     struct nvme_namespace_data *nd, uint32_t nsid,
659     struct pci_nvme_blockstore *nvstore)
660 {
661 
662 	pci_nvme_init_nsdata_size(nvstore, nd);
663 
664 	if (nvstore->type == NVME_STOR_BLOCKIF)
665 		nvstore->deallocate = blockif_candelete(nvstore->ctx);
666 
667 	nd->nlbaf = 0; /* NLBAF is a zero-based value (i.e. 1 LBA Format) */
668 	nd->flbas = 0;
669 
670 	/* Create an EUI-64 if user did not provide one */
671 	if (nvstore->eui64 == 0) {
672 		char *data = NULL;
673 		uint64_t eui64 = nvstore->eui64;
674 
675 		asprintf(&data, "%s%u%u%u", get_config_value("name"),
676 		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
677 		    sc->nsc_pi->pi_func);
678 
679 		if (data != NULL) {
680 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
681 			free(data);
682 		}
683 		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
684 	}
685 	be64enc(nd->eui64, nvstore->eui64);
686 
687 	/* LBA data-sz = 2^lbads */
688 	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
689 }
690 
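/* Initialize the Error, SMART/Health, Firmware Slot, and Changed Namespace log pages */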
691 static void
692 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
693 {
694 
695 	memset(&sc->err_log, 0, sizeof(sc->err_log));
696 	memset(&sc->health_log, 0, sizeof(sc->health_log));
697 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
698 	memset(&sc->ns_log, 0, sizeof(sc->ns_log));
699 
700 	/* Set read/write remainder to round up according to spec */
701 	sc->read_dunits_remainder = 999;
702 	sc->write_dunits_remainder = 999;
703 
704 	/* Set nominal Health values checked by implementations */
705 	sc->health_log.temperature = NVME_TEMPERATURE;
706 	sc->health_log.available_spare = 100;
707 	sc->health_log.available_spare_threshold = 10;
708 
709 	/* Set Active Firmware Info to slot 1 */
710 	sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT);
711 	memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr,
712 	    sizeof(sc->fw_log.revision[0]));
713 }
714 
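/* Register the Set/Get Features handlers for each supported Feature Identifier */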
715 static void
716 pci_nvme_init_features(struct pci_nvme_softc *sc)
717 {
718 	enum nvme_feature	fid;
719 
720 	for (fid = 0; fid < NVME_FID_MAX; fid++) {
721 		switch (fid) {
722 		case NVME_FEAT_ARBITRATION:
723 		case NVME_FEAT_POWER_MANAGEMENT:
724 		case NVME_FEAT_INTERRUPT_COALESCING: //XXX
725 		case NVME_FEAT_WRITE_ATOMICITY:
726 			/* Mandatory but no special handling required */
727 		//XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
728 		//XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
729 		//		  this returns a data buffer
730 			break;
731 		case NVME_FEAT_TEMPERATURE_THRESHOLD:
732 			sc->feat[fid].set = nvme_feature_temperature;
733 			break;
734 		case NVME_FEAT_ERROR_RECOVERY:
735 			sc->feat[fid].namespace_specific = true;
736 			break;
737 		case NVME_FEAT_NUMBER_OF_QUEUES:
738 			sc->feat[fid].set = nvme_feature_num_queues;
739 			break;
740 		case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
741 			sc->feat[fid].set = nvme_feature_iv_config;
742 			break;
743 		case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
744 			sc->feat[fid].set = nvme_feature_async_event;
745 			/* Enable all AENs by default */
746 			sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK;
747 			break;
748 		default:
749 			sc->feat[fid].set = nvme_feature_invalid_cb;
750 			sc->feat[fid].get = nvme_feature_invalid_cb;
751 		}
752 	}
753 }
754 
755 static void
756 pci_nvme_aer_reset(struct pci_nvme_softc *sc)
757 {
758 
759 	STAILQ_INIT(&sc->aer_list);
760 	sc->aer_count = 0;
761 }
762 
763 static void
764 pci_nvme_aer_init(struct pci_nvme_softc *sc)
765 {
766 
767 	pthread_mutex_init(&sc->aer_mtx, NULL);
768 	pci_nvme_aer_reset(sc);
769 }
770 
771 static void
772 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
773 {
774 	struct pci_nvme_aer *aer = NULL;
775 
776 	pthread_mutex_lock(&sc->aer_mtx);
777 	while (!STAILQ_EMPTY(&sc->aer_list)) {
778 		aer = STAILQ_FIRST(&sc->aer_list);
779 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
780 		free(aer);
781 	}
782 	pthread_mutex_unlock(&sc->aer_mtx);
783 
784 	pci_nvme_aer_reset(sc);
785 }
786 
787 static bool
788 pci_nvme_aer_available(struct pci_nvme_softc *sc)
789 {
790 
791 	return (sc->aer_count != 0);
792 }
793 
794 static bool
795 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
796 {
797 	struct nvme_controller_data *cd = &sc->ctrldata;
798 
799 	/* AERL is a zero-based value while aer_count is one-based */
800 	return (sc->aer_count == (cd->aerl + 1));
801 }
802 
803 /*
804  * Add an Async Event Request
805  *
806  * Stores an AER to be returned later if the Controller needs to notify the
807  * host of an event.
808  * Note that while the NVMe spec doesn't require Controllers to return AER's
809  * in order, this implementation does preserve the order.
810  */
811 static int
812 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
813 {
814 	struct pci_nvme_aer *aer = NULL;
815 
816 	aer = calloc(1, sizeof(struct pci_nvme_aer));
817 	if (aer == NULL)
818 		return (-1);
819 
820 	/* Save the Command ID for use in the completion message */
821 	aer->cid = cid;
822 
823 	pthread_mutex_lock(&sc->aer_mtx);
824 	sc->aer_count++;
825 	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
826 	pthread_mutex_unlock(&sc->aer_mtx);
827 
828 	return (0);
829 }
830 
831 /*
832  * Get an Async Event Request structure
833  *
834  * Returns a pointer to an AER previously submitted by the host or NULL if
835  * no AER's exist. Caller is responsible for freeing the returned struct.
836  */
837 static struct pci_nvme_aer *
838 pci_nvme_aer_get(struct pci_nvme_softc *sc)
839 {
840 	struct pci_nvme_aer *aer = NULL;
841 
842 	pthread_mutex_lock(&sc->aer_mtx);
843 	aer = STAILQ_FIRST(&sc->aer_list);
844 	if (aer != NULL) {
845 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
846 		sc->aer_count--;
847 	}
848 	pthread_mutex_unlock(&sc->aer_mtx);
849 
850 	return (aer);
851 }
852 
853 static void
854 pci_nvme_aen_reset(struct pci_nvme_softc *sc)
855 {
856 	uint32_t	atype;
857 
858 	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));
859 
860 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
861 		sc->aen[atype].atype = atype;
862 	}
863 }
864 
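/* Initialize AEN state and start the AEN processing thread */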
865 static void
866 pci_nvme_aen_init(struct pci_nvme_softc *sc)
867 {
868 	char nstr[80];
869 
870 	pci_nvme_aen_reset(sc);
871 
872 	pthread_mutex_init(&sc->aen_mtx, NULL);
873 	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
874 	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
875 	    sc->nsc_pi->pi_func);
876 	pthread_set_name_np(sc->aen_tid, nstr);
877 }
878 
879 static void
880 pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
881 {
882 
883 	pci_nvme_aen_reset(sc);
884 }
885 
886 /* Notify the AEN thread of pending work */
887 static void
888 pci_nvme_aen_notify(struct pci_nvme_softc *sc)
889 {
890 
891 	pthread_cond_signal(&sc->aen_cond);
892 }
893 
894 /*
895  * Post an Asynchronous Event Notification
896  */
897 static int32_t
898 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
899 		uint32_t event_data)
900 {
901 	struct pci_nvme_aen *aen;
902 
903 	if (atype >= PCI_NVME_AE_TYPE_MAX) {
904 		return(EINVAL);
905 	}
906 
907 	pthread_mutex_lock(&sc->aen_mtx);
908 	aen = &sc->aen[atype];
909 
910 	/* Has the controller already posted an event of this type? */
911 	if (aen->posted) {
912 		pthread_mutex_unlock(&sc->aen_mtx);
913 		return(EALREADY);
914 	}
915 
916 	aen->event_data = event_data;
917 	aen->posted = true;
918 	pthread_mutex_unlock(&sc->aen_mtx);
919 
920 	pci_nvme_aen_notify(sc);
921 
922 	return(0);
923 }
924 
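/*
 * Process any posted AENs
 *
 * For each unmasked event, consume an available AER and post a completion
 * to the Admin Completion Queue. Called with aen_mtx held.
 */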
925 static void
926 pci_nvme_aen_process(struct pci_nvme_softc *sc)
927 {
928 	struct pci_nvme_aer *aer;
929 	struct pci_nvme_aen *aen;
930 	pci_nvme_async_type atype;
931 	uint32_t mask;
932 	uint16_t status;
933 	uint8_t lid;
934 
935 #ifndef __FreeBSD__
936 	lid = 0;
937 #endif
938 
939 	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
940 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
941 		aen = &sc->aen[atype];
942 		/* Previous iterations may have depleted the available AER's */
943 		if (!pci_nvme_aer_available(sc)) {
944 			DPRINTF("%s: no AER", __func__);
945 			break;
946 		}
947 
948 		if (!aen->posted) {
949 			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
950 			continue;
951 		}
952 
953 		status = NVME_SC_SUCCESS;
954 
955 		/* Is the event masked? */
956 		mask =
957 		    sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;
958 
959 		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
960 		switch (atype) {
961 		case PCI_NVME_AE_TYPE_ERROR:
962 			lid = NVME_LOG_ERROR;
963 			break;
964 		case PCI_NVME_AE_TYPE_SMART:
965 			mask &= 0xff;
966 			if ((mask & aen->event_data) == 0)
967 				continue;
968 			lid = NVME_LOG_HEALTH_INFORMATION;
969 			break;
970 		case PCI_NVME_AE_TYPE_NOTICE:
971 			if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) {
972 				EPRINTLN("%s unknown AEN notice type %u",
973 				    __func__, aen->event_data);
974 				status = NVME_SC_INTERNAL_DEVICE_ERROR;
975 				break;
976 			}
977 			if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0)
978 				continue;
979 			switch (aen->event_data) {
980 			case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED:
981 				lid = NVME_LOG_CHANGED_NAMESPACE;
982 				break;
983 			case PCI_NVME_AEI_NOTICE_FW_ACTIVATION:
984 				lid = NVME_LOG_FIRMWARE_SLOT;
985 				break;
986 			case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE:
987 				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
988 				break;
989 			case PCI_NVME_AEI_NOTICE_ANA_CHANGE:
990 				lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
991 				break;
992 			case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE:
993 				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
994 				break;
995 			case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT:
996 				lid = NVME_LOG_LBA_STATUS_INFORMATION;
997 				break;
998 			case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE:
999 				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
1000 				break;
1001 			default:
1002 				lid = 0;
1003 			}
1004 			break;
1005 		default:
1006 			/* bad type?!? */
1007 			EPRINTLN("%s unknown AEN type %u", __func__, atype);
1008 			status = NVME_SC_INTERNAL_DEVICE_ERROR;
1009 			break;
1010 		}
1011 
1012 		aer = pci_nvme_aer_get(sc);
1013 		assert(aer != NULL);
1014 
1015 		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
1016 		pci_nvme_cq_update(sc, &sc->compl_queues[0],
1017 		    (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
1018 		    aer->cid,
1019 		    0,		/* SQID */
1020 		    status);
1021 
1022 		aen->event_data = 0;
1023 		aen->posted = false;
1024 
1025 		pci_generate_msix(sc->nsc_pi, 0);
1026 	}
1027 }
1028 
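/* AEN worker thread: process pending events, then sleep until notified */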
1029 static void *
1030 aen_thr(void *arg)
1031 {
1032 	struct pci_nvme_softc *sc;
1033 
1034 	sc = arg;
1035 
1036 	pthread_mutex_lock(&sc->aen_mtx);
1037 	for (;;) {
1038 		pci_nvme_aen_process(sc);
1039 		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
1040 	}
1041 #ifdef __FreeBSD__	/* Smatch spots unreachable code */
1042 	pthread_mutex_unlock(&sc->aen_mtx);
1043 
1044 	pthread_exit(NULL);
1045 #endif
1046 	return (NULL);
1047 }
1048 
1049 static void
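/* Reset controller register and queue state; caller must hold sc->mtx */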
1050 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
1051 {
1052 	uint32_t i;
1053 
1054 	DPRINTF("%s", __func__);
1055 
1056 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
1057 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
1058 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
1059 
1060 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
1061 
1062 	sc->regs.vs = NVME_REV(1,4);	/* NVMe v1.4 */
1063 
1064 	sc->regs.cc = 0;
1065 
1066 	assert(sc->submit_queues != NULL);
1067 
1068 	for (i = 0; i < sc->num_squeues + 1; i++) {
1069 		sc->submit_queues[i].qbase = NULL;
1070 		sc->submit_queues[i].size = 0;
1071 		sc->submit_queues[i].cqid = 0;
1072 		sc->submit_queues[i].tail = 0;
1073 		sc->submit_queues[i].head = 0;
1074 	}
1075 
1076 	assert(sc->compl_queues != NULL);
1077 
1078 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1079 		sc->compl_queues[i].qbase = NULL;
1080 		sc->compl_queues[i].size = 0;
1081 		sc->compl_queues[i].tail = 0;
1082 		sc->compl_queues[i].head = 0;
1083 	}
1084 
1085 	sc->num_q_is_set = false;
1086 
1087 	pci_nvme_aer_destroy(sc);
1088 	pci_nvme_aen_destroy(sc);
1089 
1090 	/*
1091 	 * Clear CSTS.RDY last to prevent the host from enabling Controller
1092 	 * before cleanup completes
1093 	 */
1094 	sc->regs.csts = 0;
1095 }
1096 
1097 static void
1098 pci_nvme_reset(struct pci_nvme_softc *sc)
1099 {
1100 	pthread_mutex_lock(&sc->mtx);
1101 	pci_nvme_reset_locked(sc);
1102 	pthread_mutex_unlock(&sc->mtx);
1103 }
1104 
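/* Map the guest's Admin Submission and Completion Queues; sets CSTS.CFS on failure */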
1105 static int
1106 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
1107 {
1108 	uint16_t acqs, asqs;
1109 
1110 	DPRINTF("%s", __func__);
1111 
1112 	/*
1113 	 * NVMe 2.0 states that "enabling a controller while this field is
1114 	 * cleared to 0h produces undefined results" for both ACQS and
1115 	 * ASQS. If zero, set CFS and do not become ready.
1116 	 */
1117 	asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK);
1118 	if (asqs < 2) {
1119 		EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__,
1120 		    asqs - 1, sc->regs.aqa);
1121 		sc->regs.csts |= NVME_CSTS_CFS;
1122 		return (-1);
1123 	}
1124 	sc->submit_queues[0].size = asqs;
1125 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
1126 	            sizeof(struct nvme_command) * asqs);
1127 	if (sc->submit_queues[0].qbase == NULL) {
1128 		EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__,
1129 		    sc->regs.asq);
1130 		sc->regs.csts |= NVME_CSTS_CFS;
1131 		return (-1);
1132 	}
1133 
1134 	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
1135 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase);
1136 
1137 	acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
1138 	    NVME_AQA_REG_ACQS_MASK);
1139 	if (acqs < 2) {
1140 		EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__,
1141 		    acqs - 1, sc->regs.aqa);
1142 		sc->regs.csts |= NVME_CSTS_CFS;
1143 		return (-1);
1144 	}
1145 	sc->compl_queues[0].size = acqs;
1146 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
1147 	         sizeof(struct nvme_completion) * acqs);
1148 	if (sc->compl_queues[0].qbase == NULL) {
1149 		EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__,
1150 		    sc->regs.acq);
1151 		sc->regs.csts |= NVME_CSTS_CFS;
1152 		return (-1);
1153 	}
1154 	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
1155 
1156 	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
1157 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase);
1158 
1159 	return (0);
1160 }
1161 
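/*
 * Copy 'len' bytes (spanning at most two guest pages) between host buffer 'b'
 * and the guest memory described by PRP1/PRP2, in the direction given by 'dir'
 */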
1162 static int
1163 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
1164 	size_t len, enum nvme_copy_dir dir)
1165 {
1166 	uint8_t *p;
1167 	size_t bytes;
1168 
1169 	if (len > (8 * 1024)) {
1170 		return (-1);
1171 	}
1172 
1173 	/* Copy from the start of prp1 to the end of the physical page */
1174 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
1175 	bytes = MIN(bytes, len);
1176 
1177 	p = vm_map_gpa(ctx, prp1, bytes);
1178 	if (p == NULL) {
1179 		return (-1);
1180 	}
1181 
1182 	if (dir == NVME_COPY_TO_PRP)
1183 		memcpy(p, b, bytes);
1184 	else
1185 		memcpy(b, p, bytes);
1186 
1187 	b += bytes;
1188 
1189 	len -= bytes;
1190 	if (len == 0) {
1191 		return (0);
1192 	}
1193 
1194 	len = MIN(len, PAGE_SIZE);
1195 
1196 	p = vm_map_gpa(ctx, prp2, len);
1197 	if (p == NULL) {
1198 		return (-1);
1199 	}
1200 
1201 	if (dir == NVME_COPY_TO_PRP)
1202 		memcpy(p, b, len);
1203 	else
1204 		memcpy(b, p, len);
1205 
1206 	return (0);
1207 }
1208 
1209 /*
1210  * Write a Completion Queue Entry update
1211  *
1212  * Write the completion and update the doorbell value
1213  */
1214 static void
1215 pci_nvme_cq_update(struct pci_nvme_softc *sc,
1216 		struct nvme_completion_queue *cq,
1217 		uint32_t cdw0,
1218 		uint16_t cid,
1219 		uint16_t sqid,
1220 		uint16_t status)
1221 {
1222 	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
1223 	struct nvme_completion *cqe;
1224 
1225 	assert(cq->qbase != NULL);
1226 
1227 	pthread_mutex_lock(&cq->mtx);
1228 
1229 	cqe = &cq->qbase[cq->tail];
1230 
1231 	/* Flip the phase bit */
1232 	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
1233 
1234 	cqe->cdw0 = cdw0;
1235 	cqe->sqhd = sq->head;
1236 	cqe->sqid = sqid;
1237 	cqe->cid = cid;
1238 	cqe->status = status;
1239 
1240 	cq->tail++;
1241 	if (cq->tail >= cq->size) {
1242 		cq->tail = 0;
1243 	}
1244 
1245 	pthread_mutex_unlock(&cq->mtx);
1246 }
1247 
1248 static int
1249 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1250 	struct nvme_completion* compl)
1251 {
1252 	uint16_t qid = command->cdw10 & 0xffff;
1253 
1254 	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
1255 	if (qid == 0 || qid > sc->num_squeues ||
1256 	    (sc->submit_queues[qid].qbase == NULL)) {
1257 		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
1258 		        __func__, qid, sc->num_squeues);
1259 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1260 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1261 		return (1);
1262 	}
1263 
1264 	sc->submit_queues[qid].qbase = NULL;
1265 	sc->submit_queues[qid].cqid = 0;
1266 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1267 	return (1);
1268 }
1269 
1270 static int
1271 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1272 	struct nvme_completion* compl)
1273 {
1274 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
1275 		uint16_t qid = command->cdw10 & 0xffff;
1276 		struct nvme_submission_queue *nsq;
1277 
1278 		if ((qid == 0) || (qid > sc->num_squeues) ||
1279 		    (sc->submit_queues[qid].qbase != NULL)) {
1280 			WPRINTF("%s queue index %u > num_squeues %u",
1281 			        __func__, qid, sc->num_squeues);
1282 			pci_nvme_status_tc(&compl->status,
1283 			    NVME_SCT_COMMAND_SPECIFIC,
1284 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1285 			return (1);
1286 		}
1287 
1288 		nsq = &sc->submit_queues[qid];
1289 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1290 		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
1291 		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
1292 			/*
1293 			 * Queues must specify at least two entries
1294 			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1295 			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1296 			 */
1297 			pci_nvme_status_tc(&compl->status,
1298 			    NVME_SCT_COMMAND_SPECIFIC,
1299 			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1300 			return (1);
1301 		}
1302 		nsq->head = nsq->tail = 0;
1303 
1304 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
1305 		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
1306 			pci_nvme_status_tc(&compl->status,
1307 			    NVME_SCT_COMMAND_SPECIFIC,
1308 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1309 			return (1);
1310 		}
1311 
1312 		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
1313 			pci_nvme_status_tc(&compl->status,
1314 			    NVME_SCT_COMMAND_SPECIFIC,
1315 			    NVME_SC_COMPLETION_QUEUE_INVALID);
1316 			return (1);
1317 		}
1318 
1319 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
1320 
1321 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1322 		              sizeof(struct nvme_command) * (size_t)nsq->size);
1323 
1324 		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
1325 		        qid, nsq->size, nsq->qbase, nsq->cqid);
1326 
1327 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1328 
1329 		DPRINTF("%s completed creating IOSQ qid %u",
1330 		         __func__, qid);
1331 	} else {
1332 		/*
1333 		 * Guest sent non-cont submission queue request.
1334 		 * This setting is unsupported by this emulation.
1335 		 */
1336 		WPRINTF("%s unsupported non-contig (list-based) "
1337 		         "create i/o submission queue", __func__);
1338 
1339 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1340 	}
1341 	return (1);
1342 }
1343 
1344 static int
1345 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1346 	struct nvme_completion* compl)
1347 {
1348 	uint16_t qid = command->cdw10 & 0xffff;
1349 	uint16_t sqid;
1350 
1351 	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
1352 	if (qid == 0 || qid > sc->num_cqueues ||
1353 	    (sc->compl_queues[qid].qbase == NULL)) {
1354 		WPRINTF("%s queue index %u / num_cqueues %u",
1355 		        __func__, qid, sc->num_cqueues);
1356 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1357 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1358 		return (1);
1359 	}
1360 
1361 	/* Deleting an Active CQ is an error */
1362 	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
1363 		if (sc->submit_queues[sqid].cqid == qid) {
1364 			pci_nvme_status_tc(&compl->status,
1365 			    NVME_SCT_COMMAND_SPECIFIC,
1366 			    NVME_SC_INVALID_QUEUE_DELETION);
1367 			return (1);
1368 		}
1369 
1370 	sc->compl_queues[qid].qbase = NULL;
1371 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1372 	return (1);
1373 }
1374 
1375 static int
1376 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1377 	struct nvme_completion* compl)
1378 {
1379 	struct nvme_completion_queue *ncq;
1380 	uint16_t qid = command->cdw10 & 0xffff;
1381 
1382 	/* Only support Physically Contiguous queues */
1383 	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1384 		WPRINTF("%s unsupported non-contig (list-based) "
1385 		         "create i/o completion queue",
1386 		         __func__);
1387 
1388 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1389 		return (1);
1390 	}
1391 
1392 	if ((qid == 0) || (qid > sc->num_cqueues) ||
1393 	    (sc->compl_queues[qid].qbase != NULL)) {
1394 		WPRINTF("%s queue index %u > num_cqueues %u",
1395 			__func__, qid, sc->num_cqueues);
1396 		pci_nvme_status_tc(&compl->status,
1397 		    NVME_SCT_COMMAND_SPECIFIC,
1398 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1399 		return (1);
1400  	}
1401 
1402 	ncq = &sc->compl_queues[qid];
1403 	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1404 	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1405 	if (ncq->intr_vec > (sc->max_queues + 1)) {
1406 		pci_nvme_status_tc(&compl->status,
1407 		    NVME_SCT_COMMAND_SPECIFIC,
1408 		    NVME_SC_INVALID_INTERRUPT_VECTOR);
1409 		return (1);
1410 	}
1411 
1412 	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1413 	if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1414 		/*
1415 		 * Queues must specify at least two entries
1416 		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1417 		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1418 		 */
1419 		pci_nvme_status_tc(&compl->status,
1420 		    NVME_SCT_COMMAND_SPECIFIC,
1421 		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1422 		return (1);
1423 	}
1424 	ncq->head = ncq->tail = 0;
1425 	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1426 		     command->prp1,
1427 		     sizeof(struct nvme_command) * (size_t)ncq->size);
1428 
1429 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1430 
1431 
1432 	return (1);
1433 }
1434 
1435 static int
1436 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1437 	struct nvme_completion* compl)
1438 {
1439 	uint64_t logoff;
1440 	uint32_t logsize;
1441 	uint8_t logpage;
1442 
1443 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1444 
1445 	/*
1446 	 * Command specifies the number of dwords to return in fields NUMDU
1447 	 * and NUMDL. This is a zero-based value.
1448 	 */
1449 	logpage = command->cdw10 & 0xFF;
1450 	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1451 	logsize *= sizeof(uint32_t);
1452 	logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;
1453 
1454 	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1455 
1456 	switch (logpage) {
1457 	case NVME_LOG_ERROR:
1458 		if (logoff >= sizeof(sc->err_log)) {
1459 			pci_nvme_status_genc(&compl->status,
1460 			    NVME_SC_INVALID_FIELD);
1461 			break;
1462 		}
1463 
1464 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1465 		    command->prp2, (uint8_t *)&sc->err_log + logoff,
1466 		    MIN(logsize - logoff, sizeof(sc->err_log)),
1467 		    NVME_COPY_TO_PRP);
1468 		break;
1469 	case NVME_LOG_HEALTH_INFORMATION:
1470 		if (logoff >= sizeof(sc->health_log)) {
1471 			pci_nvme_status_genc(&compl->status,
1472 			    NVME_SC_INVALID_FIELD);
1473 			break;
1474 		}
1475 
1476 		pthread_mutex_lock(&sc->mtx);
1477 		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1478 		    sizeof(sc->health_log.data_units_read));
1479 		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1480 		    sizeof(sc->health_log.data_units_written));
1481 		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1482 		    sizeof(sc->health_log.host_read_commands));
1483 		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1484 		    sizeof(sc->health_log.host_write_commands));
1485 		pthread_mutex_unlock(&sc->mtx);
1486 
1487 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1488 		    command->prp2, (uint8_t *)&sc->health_log + logoff,
1489 		    MIN(logsize - logoff, sizeof(sc->health_log)),
1490 		    NVME_COPY_TO_PRP);
1491 		break;
1492 	case NVME_LOG_FIRMWARE_SLOT:
1493 		if (logoff >= sizeof(sc->fw_log)) {
1494 			pci_nvme_status_genc(&compl->status,
1495 			    NVME_SC_INVALID_FIELD);
1496 			break;
1497 		}
1498 
1499 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1500 		    command->prp2, (uint8_t *)&sc->fw_log + logoff,
1501 		    MIN(logsize - logoff, sizeof(sc->fw_log)),
1502 		    NVME_COPY_TO_PRP);
1503 		break;
1504 	case NVME_LOG_CHANGED_NAMESPACE:
1505 		if (logoff >= sizeof(sc->ns_log)) {
1506 			pci_nvme_status_genc(&compl->status,
1507 			    NVME_SC_INVALID_FIELD);
1508 			break;
1509 		}
1510 
1511 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1512 		    command->prp2, (uint8_t *)&sc->ns_log + logoff,
1513 		    MIN(logsize - logoff, sizeof(sc->ns_log)),
1514 		    NVME_COPY_TO_PRP);
1515 		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
1516 		break;
1517 	default:
1518 		DPRINTF("%s get log page %x command not supported",
1519 		        __func__, logpage);
1520 
1521 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1522 		    NVME_SC_INVALID_LOG_PAGE);
1523 	}
1524 
1525 	return (1);
1526 }
1527 
1528 static int
1529 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1530 	struct nvme_completion* compl)
1531 {
1532 	void *dest;
1533 	uint16_t status;
1534 
1535 #ifndef __FreeBSD__
1536 	status = 0;
1537 #endif
1538 
1539 	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1540 	        command->cdw10 & 0xFF, command->nsid);
1541 
1542 	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1543 
1544 	switch (command->cdw10 & 0xFF) {
1545 	case 0x00: /* return Identify Namespace data structure */
1546 		/* Global NS only valid with NS Management */
1547 		if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) {
1548 			pci_nvme_status_genc(&status,
1549 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1550 			break;
1551 		}
1552 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1553 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1554 		    NVME_COPY_TO_PRP);
1555 		break;
1556 	case 0x01: /* return Identify Controller data structure */
1557 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1558 		    command->prp2, (uint8_t *)&sc->ctrldata,
1559 		    sizeof(sc->ctrldata),
1560 		    NVME_COPY_TO_PRP);
1561 		break;
1562 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1563 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1564 		                  sizeof(uint32_t) * 1024);
1565 		/* All unused entries shall be zero */
1566 		memset(dest, 0, sizeof(uint32_t) * 1024);
1567 		((uint32_t *)dest)[0] = 1;
1568 		break;
1569 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1570 		if (command->nsid != 1) {
1571 			pci_nvme_status_genc(&status,
1572 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1573 			break;
1574 		}
1575 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1576 		                  sizeof(uint32_t) * 1024);
1577 		/* All bytes after the descriptor shall be zero */
1578 		memset(dest, 0, sizeof(uint32_t) * 1024);
1579 
1580 		/* Return NIDT=1 (i.e. EUI64) descriptor */
1581 		((uint8_t *)dest)[0] = 1;
1582 		((uint8_t *)dest)[1] = sizeof(uint64_t);
1583 		memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t));
1584 		break;
1585 	case 0x13:
1586 		/*
1587 		 * Controller list is optional but used by UNH tests. Return
1588 		 * a valid but empty list.
1589 		 */
1590 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1591 		                  sizeof(uint16_t) * 2048);
1592 		memset(dest, 0, sizeof(uint16_t) * 2048);
1593 		break;
1594 	default:
1595 		DPRINTF("%s unsupported identify command requested 0x%x",
1596 		         __func__, command->cdw10 & 0xFF);
1597 		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1598 		break;
1599 	}
1600 
1601 	compl->status = status;
1602 	return (1);
1603 }
1604 
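/* Return a human-readable name for a Feature Identifier (used in debug output) */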
1605 static const char *
1606 nvme_fid_to_name(uint8_t fid)
1607 {
1608 	const char *name;
1609 
1610 	switch (fid) {
1611 	case NVME_FEAT_ARBITRATION:
1612 		name = "Arbitration";
1613 		break;
1614 	case NVME_FEAT_POWER_MANAGEMENT:
1615 		name = "Power Management";
1616 		break;
1617 	case NVME_FEAT_LBA_RANGE_TYPE:
1618 		name = "LBA Range Type";
1619 		break;
1620 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
1621 		name = "Temperature Threshold";
1622 		break;
1623 	case NVME_FEAT_ERROR_RECOVERY:
1624 		name = "Error Recovery";
1625 		break;
1626 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
1627 		name = "Volatile Write Cache";
1628 		break;
1629 	case NVME_FEAT_NUMBER_OF_QUEUES:
1630 		name = "Number of Queues";
1631 		break;
1632 	case NVME_FEAT_INTERRUPT_COALESCING:
1633 		name = "Interrupt Coalescing";
1634 		break;
1635 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1636 		name = "Interrupt Vector Configuration";
1637 		break;
1638 	case NVME_FEAT_WRITE_ATOMICITY:
1639 		name = "Write Atomicity Normal";
1640 		break;
1641 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1642 		name = "Asynchronous Event Configuration";
1643 		break;
1644 	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1645 		name = "Autonomous Power State Transition";
1646 		break;
1647 	case NVME_FEAT_HOST_MEMORY_BUFFER:
1648 		name = "Host Memory Buffer";
1649 		break;
1650 	case NVME_FEAT_TIMESTAMP:
1651 		name = "Timestamp";
1652 		break;
1653 	case NVME_FEAT_KEEP_ALIVE_TIMER:
1654 		name = "Keep Alive Timer";
1655 		break;
1656 	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1657 		name = "Host Controlled Thermal Management";
1658 		break;
1659 	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1660 		name = "Non-Operation Power State Config";
1661 		break;
1662 	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1663 		name = "Read Recovery Level Config";
1664 		break;
1665 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1666 		name = "Predictable Latency Mode Config";
1667 		break;
1668 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1669 		name = "Predictable Latency Mode Window";
1670 		break;
1671 	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1672 		name = "LBA Status Information Report Interval";
1673 		break;
1674 	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1675 		name = "Host Behavior Support";
1676 		break;
1677 	case NVME_FEAT_SANITIZE_CONFIG:
1678 		name = "Sanitize Config";
1679 		break;
1680 	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1681 		name = "Endurance Group Event Configuration";
1682 		break;
1683 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1684 		name = "Software Progress Marker";
1685 		break;
1686 	case NVME_FEAT_HOST_IDENTIFIER:
1687 		name = "Host Identifier";
1688 		break;
1689 	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1690 		name = "Reservation Notification Mask";
1691 		break;
1692 	case NVME_FEAT_RESERVATION_PERSISTENCE:
1693 		name = "Reservation Persistence";
1694 		break;
1695 	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1696 		name = "Namespace Write Protection Config";
1697 		break;
1698 	default:
1699 		name = "Unknown";
1700 		break;
1701 	}
1702 
1703 	return (name);
1704 }
1705 
1706 static void
1707 nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused,
1708     struct nvme_feature_obj *feat __unused,
1709     struct nvme_command *command __unused,
1710     struct nvme_completion *compl)
1711 {
1712 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1713 }
1714 
1715 static void
1716 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1717     struct nvme_feature_obj *feat __unused,
1718     struct nvme_command *command,
1719     struct nvme_completion *compl)
1720 {
1721 	uint32_t i;
1722 	uint32_t cdw11 = command->cdw11;
1723 	uint16_t iv;
1724 	bool cd;
1725 
1726 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1727 
1728 	iv = cdw11 & 0xffff;
1729 	cd = cdw11 & (1 << 16);
1730 
1731 	if (iv > (sc->max_queues + 1)) {
1732 		return;
1733 	}
1734 
1735 	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
1736 	if ((iv == 0) && !cd)
1737 		return;
1738 
1739 	/* Requested Interrupt Vector must be used by a CQ */
1740 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1741 		if (sc->compl_queues[i].intr_vec == iv) {
1742 			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1743 		}
1744 	}
1745 }
1746 
1747 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP		(0x4000)
1748 static void
1749 nvme_feature_async_event(struct pci_nvme_softc *sc __unused,
1750     struct nvme_feature_obj *feat __unused,
1751     struct nvme_command *command,
1752     struct nvme_completion *compl)
1753 {
1754 	if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP)
1755 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1756 }
1757 
1758 #define NVME_TEMP_THRESH_OVER	0
1759 #define NVME_TEMP_THRESH_UNDER	1
1760 static void
1761 nvme_feature_temperature(struct pci_nvme_softc *sc,
1762     struct nvme_feature_obj *feat __unused,
1763     struct nvme_command *command,
1764     struct nvme_completion *compl)
1765 {
1766 	uint16_t	tmpth;	/* Temperature Threshold */
1767 	uint8_t		tmpsel; /* Threshold Temperature Select */
1768 	uint8_t		thsel;  /* Threshold Type Select */
1769 	bool		set_crit = false;
1770 
1771 	tmpth  = command->cdw11 & 0xffff;
1772 	tmpsel = (command->cdw11 >> 16) & 0xf;
1773 	thsel  = (command->cdw11 >> 20) & 0x3;
1774 
1775 	DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel);
1776 
1777 	/* Check for unsupported values */
1778 	if (((tmpsel != 0) && (tmpsel != 0xf)) ||
1779 	    (thsel > NVME_TEMP_THRESH_UNDER)) {
1780 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1781 		return;
1782 	}
1783 
1784 	if (((thsel == NVME_TEMP_THRESH_OVER)  && (NVME_TEMPERATURE >= tmpth)) ||
1785 	    ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))
1786 		set_crit = true;
1787 
1788 	pthread_mutex_lock(&sc->mtx);
1789 	if (set_crit)
1790 		sc->health_log.critical_warning |=
1791 		    NVME_CRIT_WARN_ST_TEMPERATURE;
1792 	else
1793 		sc->health_log.critical_warning &=
1794 		    ~NVME_CRIT_WARN_ST_TEMPERATURE;
1795 	pthread_mutex_unlock(&sc->mtx);
1796 
1797 	if (set_crit)
1798 		pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
1799 		    sc->health_log.critical_warning);
1800 
1802 	DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__,
	    set_crit ? 'T' : 'F', sc->health_log.critical_warning,
	    compl->status);
1803 }
1804 
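/*
 * Number of Queues feature: NSQR/NCQR in CDW11 are zero-based counts
 * (0xffff is illegal) and are clamped to the configured maximum. This
 * emulation allows the value to be set only once after a reset; later
 * attempts fail with Command Sequence Error.
 */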
1805 static void
1806 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1807     struct nvme_feature_obj *feat __unused,
1808     struct nvme_command *command,
1809     struct nvme_completion *compl)
1810 {
1811 	uint16_t nqr;	/* Number of Queues Requested */
1812 
1813 	if (sc->num_q_is_set) {
1814 		WPRINTF("%s: Number of Queues already set", __func__);
1815 		pci_nvme_status_genc(&compl->status,
1816 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
1817 		return;
1818 	}
1819 
1820 	nqr = command->cdw11 & 0xFFFF;
1821 	if (nqr == 0xffff) {
1822 		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1823 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1824 		return;
1825 	}
1826 
1827 	sc->num_squeues = ONE_BASED(nqr);
1828 	if (sc->num_squeues > sc->max_queues) {
1829 		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1830 					sc->max_queues);
1831 		sc->num_squeues = sc->max_queues;
1832 	}
1833 
1834 	nqr = (command->cdw11 >> 16) & 0xFFFF;
1835 	if (nqr == 0xffff) {
1836 		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1837 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1838 		return;
1839 	}
1840 
1841 	sc->num_cqueues = ONE_BASED(nqr);
1842 	if (sc->num_cqueues > sc->max_queues) {
1843 		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1844 					sc->max_queues);
1845 		sc->num_cqueues = sc->max_queues;
1846 	}
1847 
1848 	/* Patch the command value which will be saved on callback's return */
1849 	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1850 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1851 
1852 	sc->num_q_is_set = true;
1853 }
1854 
1855 static int
1856 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1857 	struct nvme_completion *compl)
1858 {
1859 	struct nvme_feature_obj *feat;
1860 	uint32_t nsid = command->nsid;
1861 	uint8_t fid = command->cdw10 & 0xFF;
1862 
1863 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1864 
1865 	if (fid >= NVME_FID_MAX) {
1866 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1867 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1868 		return (1);
1869 	}
1870 	feat = &sc->feat[fid];
1871 
1872 	if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
1873 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1874 		return (1);
1875 	}
1876 
1877 	if (!feat->namespace_specific &&
1878 	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1879 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1880 		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1881 		return (1);
1882 	}
1883 
1884 	compl->cdw0 = 0;
1885 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1886 
1887 	if (feat->set)
1888 		feat->set(sc, feat, command, compl);
1889 
1890 	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
1891 	if (compl->status == NVME_SC_SUCCESS) {
1892 		feat->cdw11 = command->cdw11;
1893 		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
1894 		    (command->cdw11 != 0))
1895 			pci_nvme_aen_notify(sc);
1896 	}
1897 
1898 	return (0);
1899 }
1900 
1901 #define NVME_FEATURES_SEL_SUPPORTED	0x3
1902 #define NVME_FEATURES_NS_SPECIFIC	(1 << 1)
1903 
1904 static int
1905 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1906 	struct nvme_completion* compl)
1907 {
1908 	struct nvme_feature_obj *feat;
1909 	uint8_t fid = command->cdw10 & 0xFF;
1910 	uint8_t sel = (command->cdw10 >> 8) & 0x7;
1911 
1912 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1913 
1914 	if (fid >= NVME_FID_MAX) {
1915 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1916 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1917 		return (1);
1918 	}
1919 
1920 	compl->cdw0 = 0;
1921 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1922 
1923 	feat = &sc->feat[fid];
1924 	if (feat->get) {
1925 		feat->get(sc, feat, command, compl);
1926 	}
1927 
1928 	if (compl->status == NVME_SC_SUCCESS) {
1929 		if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific)
1930 			compl->cdw0 = NVME_FEATURES_NS_SPECIFIC;
1931 		else
1932 			compl->cdw0 = feat->cdw11;
1933 	}
1934 
1935 	return (0);
1936 }
1937 
1938 static int
1939 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1940 	struct nvme_completion* compl)
1941 {
1942 	uint8_t	ses, lbaf, pi;
1943 
1944 	/* Only supports Secure Erase Setting - User Data Erase */
1945 	ses = (command->cdw10 >> 9) & 0x7;
1946 	if (ses > 0x1) {
1947 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1948 		return (1);
1949 	}
1950 
1951 	/* Only supports a single LBA Format */
1952 	lbaf = command->cdw10 & 0xf;
1953 	if (lbaf != 0) {
1954 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1955 		    NVME_SC_INVALID_FORMAT);
1956 		return (1);
1957 	}
1958 
1959 	/* Doesn't support Protection Information */
1960 	pi = (command->cdw10 >> 5) & 0x7;
1961 	if (pi != 0) {
1962 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1963 		return (1);
1964 	}
1965 
1966 	if (sc->nvstore.type == NVME_STOR_RAM) {
1967 		if (sc->nvstore.ctx)
1968 			free(sc->nvstore.ctx);
1969 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1970 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1971 	} else {
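		/*
		 * For a backing file or device, emulate User Data Erase by
		 * deallocating (trimming) the entire backing store.
		 */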
1972 		struct pci_nvme_ioreq *req;
1973 		int err;
1974 
1975 		req = pci_nvme_get_ioreq(sc);
1976 		if (req == NULL) {
1977 			pci_nvme_status_genc(&compl->status,
1978 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1979 			WPRINTF("%s: unable to allocate IO req", __func__);
1980 			return (1);
1981 		}
1982 		req->nvme_sq = &sc->submit_queues[0];
1983 		req->sqid = 0;
1984 		req->opc = command->opc;
1985 		req->cid = command->cid;
1986 		req->nsid = command->nsid;
1987 
1988 		req->io_req.br_offset = 0;
1989 		req->io_req.br_resid = sc->nvstore.size;
1990 		req->io_req.br_callback = pci_nvme_io_done;
1991 
1992 		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1993 		if (err) {
1994 			pci_nvme_status_genc(&compl->status,
1995 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1996 			pci_nvme_release_ioreq(sc, req);
1997 		} else
1998 			compl->status = NVME_NO_STATUS;
1999 	}
2000 
2001 	return (1);
2002 }
2003 
2004 static int
2005 nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command,
2006     struct nvme_completion *compl)
2007 {
2008 	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
2009 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
2010 
2011 	/* TODO: search for the command ID and abort it */
2012 
2013 	compl->cdw0 = 1;
2014 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
2015 	return (1);
2016 }
2017 
2018 static int
2019 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
2020 	struct nvme_command* command, struct nvme_completion* compl)
2021 {
2022 	DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
2023 	    sc->aer_count, sc->ctrldata.aerl, command->cid);
2024 
2025 	/* Don't exceed the Async Event Request Limit (AERL). */
2026 	if (pci_nvme_aer_limit_reached(sc)) {
2027 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
2028 				NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
2029 		return (1);
2030 	}
2031 
2032 	if (pci_nvme_aer_add(sc, command->cid)) {
2033 		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
2034 				NVME_SC_INTERNAL_DEVICE_ERROR);
2035 		return (1);
2036 	}
2037 
2038 	/*
2039 	 * Raise events when they happen based on the Set Features cmd.
2040 	 * These events happen asynchronously, so do not complete this request
2041 	 * now; it is completed later when a matching event occurs.
2042 	 */
2043 	compl->status = NVME_NO_STATUS;
2044 	pci_nvme_aen_notify(sc);
2045 
2046 	return (0);
2047 }
2048 
2049 static void
2050 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
2051 {
2052 	struct nvme_completion compl;
2053 	struct nvme_command *cmd;
2054 	struct nvme_submission_queue *sq;
2055 	struct nvme_completion_queue *cq;
2056 	uint16_t sqhead;
2057 
2058 	DPRINTF("%s index %u", __func__, (uint32_t)value);
2059 
2060 	sq = &sc->submit_queues[0];
2061 	cq = &sc->compl_queues[0];
2062 
2063 	pthread_mutex_lock(&sq->mtx);
2064 
2065 	sqhead = sq->head;
2066 	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
2067 
2068 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2069 		cmd = &(sq->qbase)[sqhead];
2070 		compl.cdw0 = 0;
2071 		compl.status = 0;
2072 
2073 		switch (cmd->opc) {
2074 		case NVME_OPC_DELETE_IO_SQ:
2075 			DPRINTF("%s command DELETE_IO_SQ", __func__);
2076 			nvme_opc_delete_io_sq(sc, cmd, &compl);
2077 			break;
2078 		case NVME_OPC_CREATE_IO_SQ:
2079 			DPRINTF("%s command CREATE_IO_SQ", __func__);
2080 			nvme_opc_create_io_sq(sc, cmd, &compl);
2081 			break;
2082 		case NVME_OPC_DELETE_IO_CQ:
2083 			DPRINTF("%s command DELETE_IO_CQ", __func__);
2084 			nvme_opc_delete_io_cq(sc, cmd, &compl);
2085 			break;
2086 		case NVME_OPC_CREATE_IO_CQ:
2087 			DPRINTF("%s command CREATE_IO_CQ", __func__);
2088 			nvme_opc_create_io_cq(sc, cmd, &compl);
2089 			break;
2090 		case NVME_OPC_GET_LOG_PAGE:
2091 			DPRINTF("%s command GET_LOG_PAGE", __func__);
2092 			nvme_opc_get_log_page(sc, cmd, &compl);
2093 			break;
2094 		case NVME_OPC_IDENTIFY:
2095 			DPRINTF("%s command IDENTIFY", __func__);
2096 			nvme_opc_identify(sc, cmd, &compl);
2097 			break;
2098 		case NVME_OPC_ABORT:
2099 			DPRINTF("%s command ABORT", __func__);
2100 			nvme_opc_abort(sc, cmd, &compl);
2101 			break;
2102 		case NVME_OPC_SET_FEATURES:
2103 			DPRINTF("%s command SET_FEATURES", __func__);
2104 			nvme_opc_set_features(sc, cmd, &compl);
2105 			break;
2106 		case NVME_OPC_GET_FEATURES:
2107 			DPRINTF("%s command GET_FEATURES", __func__);
2108 			nvme_opc_get_features(sc, cmd, &compl);
2109 			break;
2110 		case NVME_OPC_FIRMWARE_ACTIVATE:
2111 			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
2112 			pci_nvme_status_tc(&compl.status,
2113 			    NVME_SCT_COMMAND_SPECIFIC,
2114 			    NVME_SC_INVALID_FIRMWARE_SLOT);
2115 			break;
2116 		case NVME_OPC_ASYNC_EVENT_REQUEST:
2117 			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
2118 			nvme_opc_async_event_req(sc, cmd, &compl);
2119 			break;
2120 		case NVME_OPC_FORMAT_NVM:
2121 			DPRINTF("%s command FORMAT_NVM", __func__);
2122 			if ((sc->ctrldata.oacs &
2123 			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
2124 				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2125 				break;
2126 			}
2127 			nvme_opc_format_nvm(sc, cmd, &compl);
2128 			break;
2129 		case NVME_OPC_SECURITY_SEND:
2130 		case NVME_OPC_SECURITY_RECEIVE:
2131 		case NVME_OPC_SANITIZE:
2132 		case NVME_OPC_GET_LBA_STATUS:
2133 			DPRINTF("%s command OPC=%#x (unsupported)", __func__,
2134 			    cmd->opc);
2135 			/* Valid but unsupported opcodes */
2136 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD);
2137 			break;
2138 		default:
2139 			DPRINTF("%s command OPC=%#X (not implemented)",
2140 			    __func__,
2141 			    cmd->opc);
2142 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2143 		}
2144 		sqhead = (sqhead + 1) % sq->size;
2145 
2146 		if (NVME_COMPLETION_VALID(compl)) {
2147 			pci_nvme_cq_update(sc, &sc->compl_queues[0],
2148 			    compl.cdw0,
2149 			    cmd->cid,
2150 			    0,		/* SQID */
2151 			    compl.status);
2152 		}
2153 	}
2154 
2155 	DPRINTF("setting sqhead %u", sqhead);
2156 	sq->head = sqhead;
2157 
2158 	if (cq->head != cq->tail)
2159 		pci_generate_msix(sc->nsc_pi, 0);
2160 
2161 	pthread_mutex_unlock(&sq->mtx);
2162 }
2163 
2164 /*
2165  * Update the Write and Read statistics reported in SMART data
2166  *
2167  * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
2168  * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
2169  * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
2170  */
2171 static void
2172 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
2173     size_t bytes, uint16_t status)
2174 {
2175 
2176 	pthread_mutex_lock(&sc->mtx);
2177 	switch (opc) {
2178 	case NVME_OPC_WRITE:
2179 		sc->write_commands++;
2180 		if (status != NVME_SC_SUCCESS)
2181 			break;
2182 		sc->write_dunits_remainder += (bytes / 512);
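		/* Carry whole data units (1,000 blocks) out of the remainder */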
2183 		while (sc->write_dunits_remainder >= 1000) {
2184 			sc->write_data_units++;
2185 			sc->write_dunits_remainder -= 1000;
2186 		}
2187 		break;
2188 	case NVME_OPC_READ:
2189 		sc->read_commands++;
2190 		if (status != NVME_SC_SUCCESS)
2191 			break;
2192 		sc->read_dunits_remainder += (bytes / 512);
2193 		while (sc->read_dunits_remainder >= 1000) {
2194 			sc->read_data_units++;
2195 			sc->read_dunits_remainder -= 1000;
2196 		}
2197 		break;
2198 	default:
2199 		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
2200 		break;
2201 	}
2202 	pthread_mutex_unlock(&sc->mtx);
2203 }
2204 
2205 /*
2206  * Check if the combination of Starting LBA (slba) and number of blocks
2207  * exceeds the range of the underlying storage.
2208  *
2209  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
2210  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
2211  * overflow.
2212  */
2213 static bool
2214 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
2215     uint32_t nblocks)
2216 {
2217 	size_t	offset, bytes;
2218 
2219 	/* Overflow check of multiplying Starting LBA by the sector size */
2220 	if (slba >> (64 - nvstore->sectsz_bits))
2221 		return (true);
2222 
2223 	offset = slba << nvstore->sectsz_bits;
2224 	bytes = nblocks << nvstore->sectsz_bits;
2225 
2226 	/* Overflow check of Number of Logical Blocks */
2227 	if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes))
2228 		return (true);
2229 
2230 	return (false);
2231 }
2232 
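/*
 * Add a guest physical address range to the request's I/O vector, merging
 * ranges which are contiguous in guest physical memory into the previous
 * entry. Returns -1 if the range cannot be mapped or if the IOV limit
 * (NVME_MAX_IOVEC) has been reached.
 */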
2233 static int
2234 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
2235 	uint64_t gpaddr, size_t size, int do_write, uint64_t offset)
2236 {
2237 	int iovidx;
2238 	bool range_is_contiguous;
2239 
2240 	if (req == NULL)
2241 		return (-1);
2242 
2243 	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
2244 		return (-1);
2245 	}
2246 
2247 	/*
2248 	 * Minimize the number of IOVs by concatenating contiguous address
2249 	 * ranges. If the IOV count is zero, there is no previous range to
2250 	 * concatenate.
2251 	 */
2252 	if (req->io_req.br_iovcnt == 0)
2253 		range_is_contiguous = false;
2254 	else
2255 		range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr;
2256 
2257 	if (range_is_contiguous) {
2258 		iovidx = req->io_req.br_iovcnt - 1;
2259 
2260 		req->io_req.br_iov[iovidx].iov_base =
2261 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2262 				     req->prev_gpaddr, size);
2263 		if (req->io_req.br_iov[iovidx].iov_base == NULL)
2264 			return (-1);
2265 
2266 		req->prev_size += size;
2267 		req->io_req.br_resid += size;
2268 
2269 		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
2270 	} else {
2271 		iovidx = req->io_req.br_iovcnt;
2272 		if (iovidx == 0) {
2273 			req->io_req.br_offset = offset;
2274 			req->io_req.br_resid = 0;
2275 			req->io_req.br_param = req;
2276 		}
2277 
2278 		req->io_req.br_iov[iovidx].iov_base =
2279 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2280 				     gpaddr, size);
2281 		if (req->io_req.br_iov[iovidx].iov_base == NULL)
2282 			return (-1);
2283 
2284 		req->io_req.br_iov[iovidx].iov_len = size;
2285 
2286 		req->prev_gpaddr = gpaddr;
2287 		req->prev_size = size;
2288 		req->io_req.br_resid += size;
2289 
2290 		req->io_req.br_iovcnt++;
2291 	}
2292 
2293 	return (0);
2294 }
2295 
2296 static void
2297 pci_nvme_set_completion(struct pci_nvme_softc *sc,
2298     struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status)
2299 {
2300 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
2301 
2302 	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
2303 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
2304 		 NVME_STATUS_GET_SC(status));
2305 
2306 	pci_nvme_cq_update(sc, cq, 0, cid, sqid, status);
2307 
2308 	if (cq->head != cq->tail) {
2309 		if (cq->intr_en & NVME_CQ_INTEN) {
2310 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
2311 		} else {
2312 			DPRINTF("%s: CQ%u interrupt disabled",
2313 						__func__, sq->cqid);
2314 		}
2315 	}
2316 }
2317 
2318 static void
2319 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
2320 {
2321 	req->sc = NULL;
2322 	req->nvme_sq = NULL;
2323 	req->sqid = 0;
2324 
2325 	pthread_mutex_lock(&sc->mtx);
2326 
2327 	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
2328 	sc->pending_ios--;
2329 
2330 	/* With no more I/O pending, set RDY if the device was reset/enabled */
2331 	if (sc->pending_ios == 0 &&
2332 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
2333 		sc->regs.csts |= NVME_CSTS_RDY;
2334 
2335 	pthread_mutex_unlock(&sc->mtx);
2336 
2337 	sem_post(&sc->iosemlock);
2338 }
2339 
2340 static struct pci_nvme_ioreq *
2341 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
2342 {
2343 	struct pci_nvme_ioreq *req = NULL;
2344 
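	/* Block until an I/O slot is free; iosemlock counts available slots */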
2345 	sem_wait(&sc->iosemlock);
2346 	pthread_mutex_lock(&sc->mtx);
2347 
2348 	req = STAILQ_FIRST(&sc->ioreqs_free);
2349 	assert(req != NULL);
2350 	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
2351 
2352 	req->sc = sc;
2353 
2354 	sc->pending_ios++;
2355 
2356 	pthread_mutex_unlock(&sc->mtx);
2357 
2358 	req->io_req.br_iovcnt = 0;
2359 	req->io_req.br_offset = 0;
2360 	req->io_req.br_resid = 0;
2361 	req->io_req.br_param = req;
2362 	req->prev_gpaddr = 0;
2363 	req->prev_size = 0;
2364 
2365 	return req;
2366 }
2367 
2368 static void
2369 pci_nvme_io_done(struct blockif_req *br, int err)
2370 {
2371 	struct pci_nvme_ioreq *req = br->br_param;
2372 	struct nvme_submission_queue *sq = req->nvme_sq;
2373 	uint16_t code, status;
2374 
2375 #ifndef __FreeBSD__
2376 	status = 0;
2377 #endif
2378 
2379 	DPRINTF("%s error %d %s", __func__, err, strerror(err));
2380 
2381 	/* TODO return correct error */
2382 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
2383 	pci_nvme_status_genc(&status, code);
2384 
2385 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status);
2386 	pci_nvme_stats_write_read_update(req->sc, req->opc,
2387 	    req->bytes, status);
2388 	pci_nvme_release_ioreq(req->sc, req);
2389 }
2390 
2391 /*
2392  * Implements the Flush command. The specification states:
2393  *    If a volatile write cache is not present, Flush commands complete
2394  *    successfully and have no effect
2395  * in the description of the Volatile Write Cache (VWC) field of the Identify
2396  * Controller data. Therefore, set status to Success if the command is
2397  * not supported (i.e. RAM or as indicated by the blockif).
2398  */
2399 static bool
2400 nvme_opc_flush(struct pci_nvme_softc *sc __unused,
2401     struct nvme_command *cmd __unused,
2402     struct pci_nvme_blockstore *nvstore,
2403     struct pci_nvme_ioreq *req,
2404     uint16_t *status)
2405 {
2406 	bool pending = false;
2407 
2408 	if (nvstore->type == NVME_STOR_RAM) {
2409 		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2410 	} else {
2411 		int err;
2412 
2413 		req->io_req.br_callback = pci_nvme_io_done;
2414 
2415 		err = blockif_flush(nvstore->ctx, &req->io_req);
2416 		switch (err) {
2417 		case 0:
2418 			pending = true;
2419 			break;
2420 		case EOPNOTSUPP:
2421 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2422 			break;
2423 		default:
2424 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2425 		}
2426 	}
2427 
2428 	return (pending);
2429 }
2430 
2431 static uint16_t
2432 nvme_write_read_ram(struct pci_nvme_softc *sc,
2433     struct pci_nvme_blockstore *nvstore,
2434     uint64_t prp1, uint64_t prp2,
2435     size_t offset, uint64_t bytes,
2436     bool is_write)
2437 {
2438 	uint8_t *buf = nvstore->ctx;
2439 	enum nvme_copy_dir dir;
2440 	uint16_t status;
2441 
2442 #ifndef __FreeBSD__
2443 	status = 0;
2444 #endif
2445 
2446 	if (is_write)
2447 		dir = NVME_COPY_TO_PRP;
2448 	else
2449 		dir = NVME_COPY_FROM_PRP;
2450 
2451 	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
2452 	    buf + offset, bytes, dir))
2453 		pci_nvme_status_genc(&status,
2454 		    NVME_SC_DATA_TRANSFER_ERROR);
2455 	else
2456 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2457 
2458 	return (status);
2459 }
2460 
2461 static uint16_t
2462 nvme_write_read_blockif(struct pci_nvme_softc *sc,
2463     struct pci_nvme_blockstore *nvstore,
2464     struct pci_nvme_ioreq *req,
2465     uint64_t prp1, uint64_t prp2,
2466     size_t offset, uint64_t bytes,
2467     bool is_write)
2468 {
2469 	uint64_t size;
2470 	int err;
2471 	uint16_t status = NVME_NO_STATUS;
2472 
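	/*
	 * PRP1 may point into the middle of a page; the first segment covers
	 * only the remainder of that page (or fewer bytes).
	 */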
2473 	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
2474 	if (pci_nvme_append_iov_req(sc, req, prp1,
2475 	    size, is_write, offset)) {
2476 		err = -1;
2477 		goto out;
2478 	}
2479 
2480 	offset += size;
2481 	bytes  -= size;
2482 
2483 	if (bytes == 0) {
2484 		;
2485 	} else if (bytes <= PAGE_SIZE) {
2486 		size = bytes;
2487 		if (pci_nvme_append_iov_req(sc, req, prp2,
2488 		    size, is_write, offset)) {
2489 			err = -1;
2490 			goto out;
2491 		}
2492 	} else {
2493 		void *vmctx = sc->nsc_pi->pi_vmctx;
2494 		uint64_t *prp_list = &prp2;
2495 		uint64_t *last = prp_list;
2496 
2497 		/* PRP2 is pointer to a physical region page list */
2498 		while (bytes) {
2499 			/* Last entry in list points to the next list */
2500 			if ((prp_list == last) && (bytes > PAGE_SIZE)) {
2501 				uint64_t prp = *prp_list;
2502 
2503 				prp_list = paddr_guest2host(vmctx, prp,
2504 				    PAGE_SIZE - (prp % PAGE_SIZE));
2505 				if (prp_list == NULL) {
2506 					err = -1;
2507 					goto out;
2508 				}
2509 				last = prp_list + (NVME_PRP2_ITEMS - 1);
2510 			}
2511 
2512 			size = MIN(bytes, PAGE_SIZE);
2513 
2514 			if (pci_nvme_append_iov_req(sc, req, *prp_list,
2515 			    size, is_write, offset)) {
2516 				err = -1;
2517 				goto out;
2518 			}
2519 
2520 			offset += size;
2521 			bytes  -= size;
2522 
2523 			prp_list++;
2524 		}
2525 	}
2526 	req->io_req.br_callback = pci_nvme_io_done;
2527 	if (is_write)
2528 		err = blockif_write(nvstore->ctx, &req->io_req);
2529 	else
2530 		err = blockif_read(nvstore->ctx, &req->io_req);
2531 out:
2532 	if (err)
2533 		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2534 
2535 	return (status);
2536 }
2537 
2538 static bool
2539 nvme_opc_write_read(struct pci_nvme_softc *sc,
2540     struct nvme_command *cmd,
2541     struct pci_nvme_blockstore *nvstore,
2542     struct pci_nvme_ioreq *req,
2543     uint16_t *status)
2544 {
2545 	uint64_t lba, nblocks, bytes;
2546 	size_t offset;
2547 	bool is_write = cmd->opc == NVME_OPC_WRITE;
2548 	bool pending = false;
2549 
2550 	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2551 	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2552 	bytes = nblocks << nvstore->sectsz_bits;
2553 	if (bytes > NVME_MAX_DATA_SIZE) {
2554 		WPRINTF("%s command would exceed MDTS", __func__);
2555 		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2556 		goto out;
2557 	}
2558 
2559 	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2560 		WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)",
2561 		    __func__, lba, nblocks);
2562 		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2563 		goto out;
2564 	}
2565 
2566 	offset = lba << nvstore->sectsz_bits;
2567 
2568 	req->bytes = bytes;
2569 	req->io_req.br_offset = lba;
2570 
2571 	/* PRP bits 1:0 must be zero */
2572 	cmd->prp1 &= ~0x3UL;
2573 	cmd->prp2 &= ~0x3UL;
2574 
2575 	if (nvstore->type == NVME_STOR_RAM) {
2576 		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2577 		    cmd->prp2, offset, bytes, is_write);
2578 	} else {
2579 		*status = nvme_write_read_blockif(sc, nvstore, req,
2580 		    cmd->prp1, cmd->prp2, offset, bytes, is_write);
2581 
2582 		if (*status == NVME_NO_STATUS)
2583 			pending = true;
2584 	}
2585 out:
2586 	if (!pending)
2587 		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2588 
2589 	return (pending);
2590 }
2591 
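/*
 * Completion callback for a multi-range Dataset Management deallocate. Each
 * completed blockif_delete() either finishes the command (on error or after
 * the last range) or issues the delete for the next range stored in br_iov,
 * using prev_gpaddr as the current index and prev_size as the entry count.
 */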
2592 static void
2593 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2594 {
2595 	struct pci_nvme_ioreq *req = br->br_param;
2596 	struct pci_nvme_softc *sc = req->sc;
2597 	bool done = true;
2598 	uint16_t status;
2599 
2600 #ifndef __FreeBSD__
2601 	status = 0;
2602 #endif
2603 
2604 	if (err) {
2605 		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2606 	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2607 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2608 	} else {
2609 		struct iovec *iov = req->io_req.br_iov;
2610 
2611 		req->prev_gpaddr++;
2612 		iov += req->prev_gpaddr;
2613 
2614 		/* The iov_* values already include the sector size */
2615 		req->io_req.br_offset = (off_t)iov->iov_base;
2616 		req->io_req.br_resid = iov->iov_len;
2617 		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2618 			pci_nvme_status_genc(&status,
2619 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2620 		} else
2621 			done = false;
2622 	}
2623 
2624 	if (done) {
2625 		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid,
2626 		    status);
2627 		pci_nvme_release_ioreq(sc, req);
2628 	}
2629 }
2630 
2631 static bool
2632 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2633     struct nvme_command *cmd,
2634     struct pci_nvme_blockstore *nvstore,
2635     struct pci_nvme_ioreq *req,
2636     uint16_t *status)
2637 {
2638 	struct nvme_dsm_range *range = NULL;
2639 	uint32_t nr, r, non_zero, dr;
2640 	int err;
2641 	bool pending = false;
2642 
2643 	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2644 		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2645 		goto out;
2646 	}
2647 
2648 	nr = cmd->cdw10 & 0xff;
2649 
2650 	/* copy locally because a range entry could straddle PRPs */
2651 	range = calloc(1, NVME_MAX_DSM_TRIM);
2652 	if (range == NULL) {
2653 		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2654 		goto out;
2655 	}
2656 	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2657 	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2658 
2659 	/* Check for invalid ranges and the number of non-zero lengths */
2660 	non_zero = 0;
2661 	for (r = 0; r <= nr; r++) {
2662 		if (pci_nvme_out_of_range(nvstore,
2663 		    range[r].starting_lba, range[r].length)) {
2664 			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2665 			goto out;
2666 		}
2667 		if (range[r].length != 0)
2668 			non_zero++;
2669 	}
2670 
2671 	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2672 		size_t offset, bytes;
2673 		int sectsz_bits = sc->nvstore.sectsz_bits;
2674 
2675 		/*
2676 		 * DSM calls are advisory only, and compliant controllers
2677 		 * may choose to take no actions (i.e. return Success).
2678 		 */
2679 		if (!nvstore->deallocate) {
2680 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2681 			goto out;
2682 		}
2683 
2684 		/* If all ranges have a zero length, return Success */
2685 		if (non_zero == 0) {
2686 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2687 			goto out;
2688 		}
2689 
2690 		if (req == NULL) {
2691 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2692 			goto out;
2693 		}
2694 
2695 		offset = range[0].starting_lba << sectsz_bits;
2696 		bytes = range[0].length << sectsz_bits;
2697 
2698 		/*
2699 		 * If the request is for more than a single range, store
2700 		 * the ranges in the br_iov. Optimize for the common case
2701 		 * of a single range.
2702 		 *
2703 		 * Note that NVMe Number of Ranges is a zero based value
2704 		 */
2705 		req->io_req.br_iovcnt = 0;
2706 		req->io_req.br_offset = offset;
2707 		req->io_req.br_resid = bytes;
2708 
2709 		if (nr == 0) {
2710 			req->io_req.br_callback = pci_nvme_io_done;
2711 		} else {
2712 			struct iovec *iov = req->io_req.br_iov;
2713 
2714 			for (r = 0, dr = 0; r <= nr; r++) {
2715 				offset = range[r].starting_lba << sectsz_bits;
2716 				bytes = range[r].length << sectsz_bits;
2717 				if (bytes == 0)
2718 					continue;
2719 
2720 				if ((nvstore->size - offset) < bytes) {
2721 					pci_nvme_status_genc(status,
2722 					    NVME_SC_LBA_OUT_OF_RANGE);
2723 					goto out;
2724 				}
2725 				iov[dr].iov_base = (void *)offset;
2726 				iov[dr].iov_len = bytes;
2727 				dr++;
2728 			}
2729 			req->io_req.br_callback = pci_nvme_dealloc_sm;
2730 
2731 			/*
2732 			 * Use prev_gpaddr to track the current entry and
2733 			 * prev_size to track the number of entries
2734 			 */
2735 			req->prev_gpaddr = 0;
2736 			req->prev_size = dr;
2737 		}
2738 
2739 		err = blockif_delete(nvstore->ctx, &req->io_req);
2740 		if (err)
2741 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2742 		else
2743 			pending = true;
2744 	}
2745 out:
2746 	free(range);
2747 	return (pending);
2748 }
2749 
2750 static void
2751 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2752 {
2753 	struct nvme_submission_queue *sq;
2754 	uint16_t status;
2755 	uint16_t sqhead;
2756 
2757 #ifndef __FreeBSD__
2758 	status = 0;
2759 #endif
2760 
2761 	/* handle all submissions up to sq->tail index */
2762 	sq = &sc->submit_queues[idx];
2763 
2764 	pthread_mutex_lock(&sq->mtx);
2765 
2766 	sqhead = sq->head;
2767 	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2768 	         idx, sqhead, sq->tail, sq->qbase);
2769 
2770 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2771 		struct nvme_command *cmd;
2772 		struct pci_nvme_ioreq *req;
2773 		uint32_t nsid;
2774 		bool pending;
2775 
2776 		pending = false;
2777 		req = NULL;
2778 		status = 0;
2779 
2780 		cmd = &sq->qbase[sqhead];
2781 		sqhead = (sqhead + 1) % sq->size;
2782 
2783 		nsid = le32toh(cmd->nsid);
2784 		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2785 			pci_nvme_status_genc(&status,
2786 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2787 			status |=
2788 			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2789 			goto complete;
2790 		}
2791 
2792 		req = pci_nvme_get_ioreq(sc);
2793 		if (req == NULL) {
2794 			pci_nvme_status_genc(&status,
2795 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2796 			WPRINTF("%s: unable to allocate IO req", __func__);
2797 			goto complete;
2798 		}
2799 		req->nvme_sq = sq;
2800 		req->sqid = idx;
2801 		req->opc = cmd->opc;
2802 		req->cid = cmd->cid;
2803 		req->nsid = cmd->nsid;
2804 
2805 		switch (cmd->opc) {
2806 		case NVME_OPC_FLUSH:
2807 			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2808 			    req, &status);
2809 			break;
2810 		case NVME_OPC_WRITE:
2811 		case NVME_OPC_READ:
2812 			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2813 			    req, &status);
2814 			break;
2815 		case NVME_OPC_WRITE_ZEROES:
2816 			/* TODO: write zeroes
2817 			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2818 			        __func__, lba, cmd->cdw12 & 0xFFFF); */
2819 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2820 			break;
2821 		case NVME_OPC_DATASET_MANAGEMENT:
2822 			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2823 			    req, &status);
2824 			break;
2825 		default:
2826 			WPRINTF("%s unhandled io command 0x%x",
2827 			    __func__, cmd->opc);
2828 			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2829 		}
2830 complete:
2831 		if (!pending) {
2832 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, status);
2833 			if (req != NULL)
2834 				pci_nvme_release_ioreq(sc, req);
2835 		}
2836 	}
2837 
2838 	sq->head = sqhead;
2839 
2840 	pthread_mutex_unlock(&sq->mtx);
2841 }
2842 
2843 static void
2844 pci_nvme_handle_doorbell(struct vmctx *ctx __unused, struct pci_nvme_softc* sc,
2845 	uint64_t idx, int is_sq, uint64_t value)
2846 {
2847 	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2848 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2849 
2850 	if (is_sq) {
2851 		if (idx > sc->num_squeues) {
2852 			WPRINTF("%s queue index %lu overflow from "
2853 			         "guest (max %u)",
2854 			         __func__, idx, sc->num_squeues);
2855 			return;
2856 		}
2857 
2858 		atomic_store_short(&sc->submit_queues[idx].tail,
2859 		                   (uint16_t)value);
2860 
2861 		if (idx == 0) {
2862 			pci_nvme_handle_admin_cmd(sc, value);
2863 		} else {
2864 			/* submission queue; handle new entries in SQ */
2865 			if (idx > sc->num_squeues) {
2866 				WPRINTF("%s SQ index %lu overflow from "
2867 				         "guest (max %u)",
2868 				         __func__, idx, sc->num_squeues);
2869 				return;
2870 			}
2871 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2872 		}
2873 	} else {
2874 		if (idx > sc->num_cqueues) {
2875 			WPRINTF("%s queue index %lu overflow from "
2876 			         "guest (max %u)",
2877 			         __func__, idx, sc->num_cqueues);
2878 			return;
2879 		}
2880 
2881 		atomic_store_short(&sc->compl_queues[idx].head,
2882 				(uint16_t)value);
2883 	}
2884 }
2885 
2886 static void
2887 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2888 {
2889 	const char *s = iswrite ? "WRITE" : "READ";
2890 
2891 	switch (offset) {
2892 	case NVME_CR_CAP_LOW:
2893 		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2894 		break;
2895 	case NVME_CR_CAP_HI:
2896 		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2897 		break;
2898 	case NVME_CR_VS:
2899 		DPRINTF("%s %s NVME_CR_VS", func, s);
2900 		break;
2901 	case NVME_CR_INTMS:
2902 		DPRINTF("%s %s NVME_CR_INTMS", func, s);
2903 		break;
2904 	case NVME_CR_INTMC:
2905 		DPRINTF("%s %s NVME_CR_INTMC", func, s);
2906 		break;
2907 	case NVME_CR_CC:
2908 		DPRINTF("%s %s NVME_CR_CC", func, s);
2909 		break;
2910 	case NVME_CR_CSTS:
2911 		DPRINTF("%s %s NVME_CR_CSTS", func, s);
2912 		break;
2913 	case NVME_CR_NSSR:
2914 		DPRINTF("%s %s NVME_CR_NSSR", func, s);
2915 		break;
2916 	case NVME_CR_AQA:
2917 		DPRINTF("%s %s NVME_CR_AQA", func, s);
2918 		break;
2919 	case NVME_CR_ASQ_LOW:
2920 		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2921 		break;
2922 	case NVME_CR_ASQ_HI:
2923 		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2924 		break;
2925 	case NVME_CR_ACQ_LOW:
2926 		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2927 		break;
2928 	case NVME_CR_ACQ_HI:
2929 		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2930 		break;
2931 	default:
2932 		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2933 	}
2934 
2935 }
2936 
2937 static void
2938 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2939 	uint64_t offset, int size, uint64_t value)
2940 {
2941 	uint32_t ccreg;
2942 
2943 	if (offset >= NVME_DOORBELL_OFFSET) {
2944 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2945 		uint64_t idx = belloffset / 8; /* 4-byte SQ tail + CQ head */
2946 		int is_sq = (belloffset % 8) < 4;
2947 
2948 		if ((sc->regs.csts & NVME_CSTS_RDY) == 0) {
2949 			WPRINTF("doorbell write prior to RDY (offset=%#lx)\n",
2950 			    offset);
2951 			return;
2952 		}
2953 
2954 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2955 			WPRINTF("guest attempted an overflow write offset "
2956 			         "0x%lx, val 0x%lx in %s",
2957 			         offset, value, __func__);
2958 			return;
2959 		}
2960 
2961 		if (is_sq) {
2962 			if (sc->submit_queues[idx].qbase == NULL)
2963 				return;
2964 		} else if (sc->compl_queues[idx].qbase == NULL)
2965 			return;
2966 
2967 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2968 		return;
2969 	}
2970 
2971 	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2972 	        offset, size, value);
2973 
2974 	if (size != 4) {
2975 		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2976 		         "val 0x%lx) to bar0 in %s",
2977 		         size, offset, value, __func__);
2978 		/* TODO: shutdown device */
2979 		return;
2980 	}
2981 
2982 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2983 
2984 	pthread_mutex_lock(&sc->mtx);
2985 
2986 	switch (offset) {
2987 	case NVME_CR_CAP_LOW:
2988 	case NVME_CR_CAP_HI:
2989 		/* readonly */
2990 		break;
2991 	case NVME_CR_VS:
2992 		/* readonly */
2993 		break;
2994 	case NVME_CR_INTMS:
2995 		/* MSI-X, so ignore */
2996 		break;
2997 	case NVME_CR_INTMC:
2998 		/* MSI-X, so ignore */
2999 		break;
3000 	case NVME_CR_CC:
3001 		ccreg = (uint32_t)value;
3002 
3003 		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
3004 		         "iocqes %u",
3005 		        __func__,
3006 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
3007 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
3008 			 NVME_CC_GET_IOCQES(ccreg));
3009 
3010 		if (NVME_CC_GET_SHN(ccreg)) {
3011 			/* perform shutdown - flush out data to backend */
3012 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
3013 			    NVME_CSTS_REG_SHST_SHIFT);
3014 			sc->regs.csts |= NVME_SHST_COMPLETE <<
3015 			    NVME_CSTS_REG_SHST_SHIFT;
3016 		}
3017 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
3018 			if (NVME_CC_GET_EN(ccreg) == 0)
3019 				/* transition 1->0 causes controller reset */
3020 				pci_nvme_reset_locked(sc);
3021 			else
3022 				pci_nvme_init_controller(ctx, sc);
3023 		}
3024 
3025 		/* Insert the iocqes, iosqes and en bits from the write */
3026 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
3027 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
3028 		if (NVME_CC_GET_EN(ccreg) == 0) {
3029 			/* Insert the ams, mps and css bit fields */
3030 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
3031 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
3032 			sc->regs.csts &= ~NVME_CSTS_RDY;
3033 		} else if ((sc->pending_ios == 0) &&
3034 		    !(sc->regs.csts & NVME_CSTS_CFS)) {
3035 			sc->regs.csts |= NVME_CSTS_RDY;
3036 		}
3037 		break;
3038 	case NVME_CR_CSTS:
3039 		break;
3040 	case NVME_CR_NSSR:
3041 		/* ignore writes; don't support subsystem reset */
3042 		break;
3043 	case NVME_CR_AQA:
3044 		sc->regs.aqa = (uint32_t)value;
3045 		break;
3046 	case NVME_CR_ASQ_LOW:
3047 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
3048 		               (0xFFFFF000 & value);
3049 		break;
3050 	case NVME_CR_ASQ_HI:
3051 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
3052 		               (value << 32);
3053 		break;
3054 	case NVME_CR_ACQ_LOW:
3055 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
3056 		               (0xFFFFF000 & value);
3057 		break;
3058 	case NVME_CR_ACQ_HI:
3059 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
3060 		               (value << 32);
3061 		break;
3062 	default:
3063 		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
3064 		         __func__, offset, value, size);
3065 	}
3066 	pthread_mutex_unlock(&sc->mtx);
3067 }
3068 
3069 static void
3070 pci_nvme_write(struct vmctx *ctx, int vcpu __unused, struct pci_devinst *pi,
3071     int baridx, uint64_t offset, int size, uint64_t value)
3072 {
3073 	struct pci_nvme_softc* sc = pi->pi_arg;
3074 
3075 	if (baridx == pci_msix_table_bar(pi) ||
3076 	    baridx == pci_msix_pba_bar(pi)) {
3077 		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
3078 		         " value 0x%lx", baridx, offset, size, value);
3079 
3080 		pci_emul_msix_twrite(pi, offset, size, value);
3081 		return;
3082 	}
3083 
3084 	switch (baridx) {
3085 	case 0:
3086 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
3087 		break;
3088 
3089 	default:
3090 		DPRINTF("%s unknown baridx %d, val 0x%lx",
3091 		         __func__, baridx, value);
3092 	}
3093 }
3094 
3095 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
3096 	uint64_t offset, int size)
3097 {
3098 	uint64_t value;
3099 
3100 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
3101 
3102 	if (offset < NVME_DOORBELL_OFFSET) {
3103 		void *p = &(sc->regs);
3104 		pthread_mutex_lock(&sc->mtx);
3105 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
3106 		pthread_mutex_unlock(&sc->mtx);
3107 	} else {
3108 		value = 0;
3109 		WPRINTF("pci_nvme: read invalid offset %ld", offset);
3110 	}
3111 
3112 	switch (size) {
3113 	case 1:
3114 		value &= 0xFF;
3115 		break;
3116 	case 2:
3117 		value &= 0xFFFF;
3118 		break;
3119 	case 4:
3120 		value &= 0xFFFFFFFF;
3121 		break;
3122 	}
3123 
3124 	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
3125 	         offset, size, (uint32_t)value);
3126 
3127 	return (value);
3128 }
3129 
3132 static uint64_t
3133 pci_nvme_read(struct vmctx *ctx __unused, int vcpu __unused,
3134     struct pci_devinst *pi, int baridx, uint64_t offset, int size)
3135 {
3136 	struct pci_nvme_softc* sc = pi->pi_arg;
3137 
3138 	if (baridx == pci_msix_table_bar(pi) ||
3139 	    baridx == pci_msix_pba_bar(pi)) {
3140 		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
3141 		        baridx, offset, size);
3142 
3143 		return pci_emul_msix_tread(pi, offset, size);
3144 	}
3145 
3146 	switch (baridx) {
3147 	case 0:
3148 		return pci_nvme_read_bar_0(sc, offset, size);
3149 
3150 	default:
3151 		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
3152 	}
3153 
3154 	return (0);
3155 }
3156 
3157 static int
3158 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
3159 {
3160 	char bident[sizeof("XX:X:X")];
3161 	const char *value;
3162 	uint32_t sectsz;
3163 
3164 	sc->max_queues = NVME_QUEUES;
3165 	sc->max_qentries = NVME_MAX_QENTRIES;
3166 	sc->ioslots = NVME_IOSLOTS;
3167 	sc->num_squeues = sc->max_queues;
3168 	sc->num_cqueues = sc->max_queues;
3169 	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3170 	sectsz = 0;
3171 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
3172 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3173 
3174 	value = get_config_value_node(nvl, "maxq");
3175 	if (value != NULL)
3176 		sc->max_queues = atoi(value);
3177 	value = get_config_value_node(nvl, "qsz");
3178 	if (value != NULL) {
3179 		sc->max_qentries = atoi(value);
3180 		if (sc->max_qentries <= 0) {
3181 			EPRINTLN("nvme: Invalid qsz option %d",
3182 			    sc->max_qentries);
3183 			return (-1);
3184 		}
3185 	}
3186 	value = get_config_value_node(nvl, "ioslots");
3187 	if (value != NULL) {
3188 		sc->ioslots = atoi(value);
3189 		if (sc->ioslots <= 0) {
3190 			EPRINTLN("Invalid ioslots option %d", sc->ioslots);
3191 			return (-1);
3192 		}
3193 	}
3194 	value = get_config_value_node(nvl, "sectsz");
3195 	if (value != NULL)
3196 		sectsz = atoi(value);
3197 	value = get_config_value_node(nvl, "ser");
3198 	if (value != NULL) {
3199 		/*
3200 		 * This field indicates the Product Serial Number in
3201 		 * 7-bit ASCII, unused bytes should be space characters.
3202 		 * Ref: NVMe v1.3c.
3203 		 */
3204 		cpywithpad((char *)sc->ctrldata.sn,
3205 		    sizeof(sc->ctrldata.sn), value, ' ');
3206 	}
3207 	value = get_config_value_node(nvl, "eui64");
3208 	if (value != NULL)
3209 		sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
3210 	value = get_config_value_node(nvl, "dsm");
3211 	if (value != NULL) {
3212 		if (strcmp(value, "auto") == 0)
3213 			sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3214 		else if (strcmp(value, "enable") == 0)
3215 			sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
3216 		else if (strcmp(value, "disable") == 0)
3217 			sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
3218 	}
3219 
3220 	value = get_config_value_node(nvl, "ram");
3221 	if (value != NULL) {
3222 		uint64_t sz = strtoull(value, NULL, 10);
3223 
3224 		sc->nvstore.type = NVME_STOR_RAM;
3225 		sc->nvstore.size = sz * 1024 * 1024;
3226 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
3227 		sc->nvstore.sectsz = 4096;
3228 		sc->nvstore.sectsz_bits = 12;
3229 		if (sc->nvstore.ctx == NULL) {
3230 			EPRINTLN("nvme: Unable to allocate RAM");
3231 			return (-1);
3232 		}
3233 	} else {
3234 		snprintf(bident, sizeof(bident), "%d:%d",
3235 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3236 		sc->nvstore.ctx = blockif_open(nvl, bident);
3237 		if (sc->nvstore.ctx == NULL) {
3238 			EPRINTLN("nvme: Could not open backing file: %s",
3239 			    strerror(errno));
3240 			return (-1);
3241 		}
3242 		sc->nvstore.type = NVME_STOR_BLOCKIF;
3243 		sc->nvstore.size = blockif_size(sc->nvstore.ctx);
3244 	}
3245 
3246 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
3247 		sc->nvstore.sectsz = sectsz;
3248 	else if (sc->nvstore.type != NVME_STOR_RAM)
3249 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
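	/* Derive sectsz_bits: the smallest shift with (1 << bits) >= sectsz */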
3250 	for (sc->nvstore.sectsz_bits = 9;
3251 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
3252 	     sc->nvstore.sectsz_bits++);
3253 
3254 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
3255 		sc->max_queues = NVME_QUEUES;
3256 
3257 	return (0);
3258 }
3259 
3260 static void
3261 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg,
3262     size_t new_size)
3263 {
3264 	struct pci_nvme_softc *sc;
3265 	struct pci_nvme_blockstore *nvstore;
3266 	struct nvme_namespace_data *nd;
3267 
3268 	sc = arg;
3269 	nvstore = &sc->nvstore;
3270 	nd = &sc->nsdata;
3271 
3272 	nvstore->size = new_size;
3273 	pci_nvme_init_nsdata_size(nvstore, nd);
3274 
3275 	/* Add changed NSID to list */
3276 	sc->ns_log.ns[0] = 1;
3277 	sc->ns_log.ns[1] = 0;
3278 
3279 	pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
3280 	    PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED);
3281 }
3282 
3283 static int
3284 pci_nvme_init(struct vmctx *ctx __unused, struct pci_devinst *pi, nvlist_t *nvl)
3285 {
3286 	struct pci_nvme_softc *sc;
3287 	uint32_t pci_membar_sz;
3288 	int	error;
3289 
3290 	error = 0;
3291 
3292 	sc = calloc(1, sizeof(struct pci_nvme_softc));
3293 	pi->pi_arg = sc;
3294 	sc->nsc_pi = pi;
3295 
3296 	error = pci_nvme_parse_config(sc, nvl);
3297 	if (error < 0)
3298 		goto done;
3299 	else
3300 		error = 0;
3301 
3302 	STAILQ_INIT(&sc->ioreqs_free);
3303 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
3304 	for (int i = 0; i < sc->ioslots; i++) {
3305 		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
3306 	}
3307 
3308 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
3309 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
3310 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
3311 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
3312 	pci_set_cfgdata8(pi, PCIR_PROGIF,
3313 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
3314 
3315 	/*
3316 	 * Allocate size of NVMe registers + doorbell space for all queues.
3317 	 *
3318 	 * The specification requires a minimum memory I/O window size of 16K.
3319 	 * The Windows driver will refuse to start a device with a smaller
3320 	 * window.
3321 	 */
3322 	pci_membar_sz = sizeof(struct nvme_registers) +
3323 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
3324 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
3325 
3326 	DPRINTF("nvme membar size: %u", pci_membar_sz);
3327 
3328 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
3329 	if (error) {
3330 		WPRINTF("%s pci alloc mem bar failed", __func__);
3331 		goto done;
3332 	}
3333 
3334 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
3335 	if (error) {
3336 		WPRINTF("%s pci add msixcap failed", __func__);
3337 		goto done;
3338 	}
3339 
3340 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
3341 	if (error) {
3342 		WPRINTF("%s pci add Express capability failed", __func__);
3343 		goto done;
3344 	}
3345 
3346 	pthread_mutex_init(&sc->mtx, NULL);
3347 	sem_init(&sc->iosemlock, 0, sc->ioslots);
3348 	blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc);
3349 
3350 	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
3351 	/*
3352 	 * Controller data depends on Namespace data so initialize Namespace
3353 	 * data first.
3354 	 */
3355 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
3356 	pci_nvme_init_ctrldata(sc);
3357 	pci_nvme_init_logpages(sc);
3358 	pci_nvme_init_features(sc);
3359 
3360 	pci_nvme_aer_init(sc);
3361 	pci_nvme_aen_init(sc);
3362 
3363 	pci_nvme_reset(sc);
3364 
3365 	pci_lintr_request(pi);
3366 
3367 done:
3368 	return (error);
3369 }
3370 
3371 static int
3372 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
3373 {
3374 	char *cp, *ram;
3375 
3376 	if (opts == NULL)
3377 		return (0);
3378 
3379 	if (strncmp(opts, "ram=", 4) == 0) {
3380 		cp = strchr(opts, ',');
3381 		if (cp == NULL) {
3382 			set_config_value_node(nvl, "ram", opts + 4);
3383 			return (0);
3384 		}
3385 		ram = strndup(opts + 4, cp - opts - 4);
3386 		set_config_value_node(nvl, "ram", ram);
3387 		free(ram);
3388 		return (pci_parse_legacy_config(nvl, cp + 1));
3389 	} else
3390 		return (blockif_legacy_config(nvl, opts));
3391 }
3392 
3393 static const struct pci_devemu pci_de_nvme = {
3394 	.pe_emu =	"nvme",
3395 	.pe_init =	pci_nvme_init,
3396 	.pe_legacy_config = pci_nvme_legacy_config,
3397 	.pe_barwrite =	pci_nvme_write,
3398 	.pe_barread =	pci_nvme_read
3399 };
3400 PCI_EMUL_SET(pci_de_nvme);
3401