xref: /illumos-gate/usr/src/cmd/bhyve/pci_nvme.c (revision f96a0cef040313f6281fbc014a0b63d5c5cc760f)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = Dataset Management support. Option is one of auto, enable, disable
51  *
52  */
53 
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58 
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61 
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65 #ifndef __FreeBSD__
66 #include <endian.h>
67 #endif
68 
69 #include <assert.h>
70 #include <pthread.h>
71 #include <semaphore.h>
72 #include <stdbool.h>
73 #include <stddef.h>
74 #include <stdint.h>
75 #include <stdio.h>
76 #include <stdlib.h>
77 #include <string.h>
78 
79 #include <machine/atomic.h>
80 #include <machine/vmm.h>
81 #include <vmmapi.h>
82 
83 #include <dev/nvme/nvme.h>
84 
85 #include "bhyverun.h"
86 #include "block_if.h"
87 #include "config.h"
88 #include "debug.h"
89 #include "pci_emul.h"
90 
91 
92 static int nvme_debug = 0;
93 #define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
94 #define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
95 
96 /* defaults; can be overridden */
97 #define	NVME_MSIX_BAR		4
98 
99 #define	NVME_IOSLOTS		8
100 
101 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
102 #define NVME_MMIO_SPACE_MIN	(1 << 14)
103 
104 #define	NVME_QUEUES		16
105 #define	NVME_MAX_QENTRIES	2048
106 /* Memory Page size Minimum reported in CAP register */
107 #define	NVME_MPSMIN		0
108 /* MPSMIN converted to bytes */
109 #define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))
110 
111 #define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
112 #define	NVME_MDTS		9
113 /* Note the + 1 allows for the initial descriptor to not be page aligned */
114 #define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
115 #define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
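/*
 * With the defaults above (MDTS = 9 and MPSMIN = 0, i.e. 4 KiB pages), the
 * largest single transfer is 512 * 4 KiB = 2 MiB, described by at most
 * NVME_MAX_IOVEC (513) page descriptors.
 */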
116 
117 /* This is a synthetic status code to indicate there is no status */
118 #define NVME_NO_STATUS		0xffff
119 #define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
120 
121 /* helpers */
122 
123 /* Convert a zero-based value into a one-based value */
124 #define ONE_BASED(zero)		((zero) + 1)
125 /* Convert a one-based value into a zero-based value */
126 #define ZERO_BASED(one)		((one)  - 1)
127 
128 /* Encode number of SQ's and CQ's for Set/Get Features */
129 #define NVME_FEATURE_NUM_QUEUES(sc) \
130 	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
131 	(ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
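/*
 * For example, 4 submission and 2 completion queues encode as 0x00010003:
 * the completion queue count is in the upper word and the submission queue
 * count in the lower word, both zero-based.
 */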
132 
133 #define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
134 
135 enum nvme_controller_register_offsets {
136 	NVME_CR_CAP_LOW = 0x00,
137 	NVME_CR_CAP_HI  = 0x04,
138 	NVME_CR_VS      = 0x08,
139 	NVME_CR_INTMS   = 0x0c,
140 	NVME_CR_INTMC   = 0x10,
141 	NVME_CR_CC      = 0x14,
142 	NVME_CR_CSTS    = 0x1c,
143 	NVME_CR_NSSR    = 0x20,
144 	NVME_CR_AQA     = 0x24,
145 	NVME_CR_ASQ_LOW = 0x28,
146 	NVME_CR_ASQ_HI  = 0x2c,
147 	NVME_CR_ACQ_LOW = 0x30,
148 	NVME_CR_ACQ_HI  = 0x34,
149 };
150 
151 enum nvme_cmd_cdw11 {
152 	NVME_CMD_CDW11_PC  = 0x0001,
153 	NVME_CMD_CDW11_IEN = 0x0002,
154 	NVME_CMD_CDW11_IV  = 0xFFFF0000,
155 };
156 
157 enum nvme_copy_dir {
158 	NVME_COPY_TO_PRP,
159 	NVME_COPY_FROM_PRP,
160 };
161 
162 #define	NVME_CQ_INTEN	0x01
163 #define	NVME_CQ_INTCOAL	0x02
164 
165 struct nvme_completion_queue {
166 	struct nvme_completion *qbase;
167 	pthread_mutex_t	mtx;
168 	uint32_t	size;
169 	uint16_t	tail; /* nvme progress */
170 	uint16_t	head; /* guest progress */
171 	uint16_t	intr_vec;
172 	uint32_t	intr_en;
173 };
174 
175 struct nvme_submission_queue {
176 	struct nvme_command *qbase;
177 	pthread_mutex_t	mtx;
178 	uint32_t	size;
179 	uint16_t	head; /* nvme progress */
180 	uint16_t	tail; /* guest progress */
181 	uint16_t	cqid; /* completion queue id */
182 	int		qpriority;
183 };
184 
185 enum nvme_storage_type {
186 	NVME_STOR_BLOCKIF = 0,
187 	NVME_STOR_RAM = 1,
188 };
189 
190 struct pci_nvme_blockstore {
191 	enum nvme_storage_type type;
192 	void		*ctx;
193 	uint64_t	size;
194 	uint32_t	sectsz;
195 	uint32_t	sectsz_bits;
196 	uint64_t	eui64;
197 	uint32_t	deallocate:1;
198 };
199 
200 /*
201  * Calculate the number of additional page descriptors for guest IO requests
202  * based on the advertised Max Data Transfer (MDTS) and given the number of
203  * default iovec's in a struct blockif_req.
204  *
205  * Note the + 1 allows for the initial descriptor to not be page aligned.
206  */
207 #define MDTS_PAD_SIZE \
208 	(NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
209 	NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
210 	0)
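/*
 * For example, if BLOCKIF_IOV_MAX were 128, the pad would add
 * 513 - 128 = 385 iovec entries to each pci_nvme_ioreq.
 */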
211 
212 struct pci_nvme_ioreq {
213 	struct pci_nvme_softc *sc;
214 	STAILQ_ENTRY(pci_nvme_ioreq) link;
215 	struct nvme_submission_queue *nvme_sq;
216 	uint16_t	sqid;
217 
218 	/* command information */
219 	uint16_t	opc;
220 	uint16_t	cid;
221 	uint32_t	nsid;
222 
223 	uint64_t	prev_gpaddr;
224 	size_t		prev_size;
225 	size_t		bytes;
226 
227 	struct blockif_req io_req;
228 
229 	struct iovec	iovpadding[MDTS_PAD_SIZE];
230 };
231 
232 enum nvme_dsm_type {
233 	/* Dataset Management bit in ONCS reflects backing storage capability */
234 	NVME_DATASET_MANAGEMENT_AUTO,
235 	/* Unconditionally set Dataset Management bit in ONCS */
236 	NVME_DATASET_MANAGEMENT_ENABLE,
237 	/* Unconditionally clear Dataset Management bit in ONCS */
238 	NVME_DATASET_MANAGEMENT_DISABLE,
239 };
240 
241 struct pci_nvme_softc;
242 struct nvme_feature_obj;
243 
244 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
245     struct nvme_feature_obj *,
246     struct nvme_command *,
247     struct nvme_completion *);
248 
249 struct nvme_feature_obj {
250 	uint32_t	cdw11;
251 	nvme_feature_cb	set;
252 	nvme_feature_cb	get;
253 	bool namespace_specific;
254 };
255 
256 #define NVME_FID_MAX		(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
257 
258 struct pci_nvme_aer {
259 	STAILQ_ENTRY(pci_nvme_aer) link;
260 	uint16_t	cid;	/* Command ID of the submitted AER */
261 };
262 
263 struct pci_nvme_softc {
264 	struct pci_devinst *nsc_pi;
265 
266 	pthread_mutex_t	mtx;
267 
268 	struct nvme_registers regs;
269 
270 	struct nvme_namespace_data  nsdata;
271 	struct nvme_controller_data ctrldata;
272 	struct nvme_error_information_entry err_log;
273 	struct nvme_health_information_page health_log;
274 	struct nvme_firmware_page fw_log;
275 
276 	struct pci_nvme_blockstore nvstore;
277 
278 	uint16_t	max_qentries;	/* max entries per queue */
279 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
280 	uint32_t	num_cqueues;
281 	uint32_t	num_squeues;
282 	bool		num_q_is_set; /* Has host set Number of Queues */
283 
284 	struct pci_nvme_ioreq *ioreqs;
285 	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
286 	uint32_t	pending_ios;
287 	uint32_t	ioslots;
288 	sem_t		iosemlock;
289 
290 	/*
291 	 * Memory mapped Submission and Completion queues
292 	 * Each array includes both Admin and IO queues
293 	 */
294 	struct nvme_completion_queue *compl_queues;
295 	struct nvme_submission_queue *submit_queues;
296 
297 	struct nvme_feature_obj feat[NVME_FID_MAX];
298 
299 	enum nvme_dsm_type dataset_management;
300 
301 	/* Accounting for SMART data */
302 	__uint128_t	read_data_units;
303 	__uint128_t	write_data_units;
304 	__uint128_t	read_commands;
305 	__uint128_t	write_commands;
306 	uint32_t	read_dunits_remainder;
307 	uint32_t	write_dunits_remainder;
308 
309 	STAILQ_HEAD(, pci_nvme_aer) aer_list;
310 	uint32_t	aer_count;
311 };
312 
313 
314 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
315 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
316 static void pci_nvme_io_done(struct blockif_req *, int);
317 
318 /* Controller Configuration utils */
319 #define	NVME_CC_GET_EN(cc) \
320 	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
321 #define	NVME_CC_GET_CSS(cc) \
322 	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
323 #define	NVME_CC_GET_SHN(cc) \
324 	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
325 #define	NVME_CC_GET_IOSQES(cc) \
326 	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
327 #define	NVME_CC_GET_IOCQES(cc) \
328 	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
329 
330 #define	NVME_CC_WRITE_MASK \
331 	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
332 	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
333 	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
334 
335 #define	NVME_CC_NEN_WRITE_MASK \
336 	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
337 	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
338 	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
339 
340 /* Controller Status utils */
341 #define	NVME_CSTS_GET_RDY(sts) \
342 	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
343 
344 #define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
345 
346 /* Completion Queue status word utils */
347 #define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
348 #define	NVME_STATUS_MASK \
349 	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
350 	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
351 
352 #define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
353 	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
354 
355 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
356     struct nvme_feature_obj *,
357     struct nvme_command *,
358     struct nvme_completion *);
359 static void nvme_feature_num_queues(struct pci_nvme_softc *,
360     struct nvme_feature_obj *,
361     struct nvme_command *,
362     struct nvme_completion *);
363 static void nvme_feature_iv_config(struct pci_nvme_softc *,
364     struct nvme_feature_obj *,
365     struct nvme_command *,
366     struct nvme_completion *);
367 
368 static __inline void
369 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
370 {
371 	size_t len;
372 
373 	len = strnlen(src, dst_size);
374 	memset(dst, pad, dst_size);
375 	memcpy(dst, src, len);
376 }
377 
378 static __inline void
379 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
380 {
381 
382 	*status &= ~NVME_STATUS_MASK;
383 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
384 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
385 }
386 
387 static __inline void
388 pci_nvme_status_genc(uint16_t *status, uint16_t code)
389 {
390 
391 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
392 }
393 
394 /*
395  * Initialize the requested number of IO Submission and Completion Queues.
396  * Admin queues are allocated implicitly.
397  */
398 static void
399 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
400 {
401 	uint32_t i;
402 
403 	/*
404 	 * Allocate and initialize the Submission Queues
405 	 */
406 	if (nsq > NVME_QUEUES) {
407 		WPRINTF("%s: clamping number of SQ from %u to %u",
408 					__func__, nsq, NVME_QUEUES);
409 		nsq = NVME_QUEUES;
410 	}
411 
412 	sc->num_squeues = nsq;
413 
414 	sc->submit_queues = calloc(sc->num_squeues + 1,
415 				sizeof(struct nvme_submission_queue));
416 	if (sc->submit_queues == NULL) {
417 		WPRINTF("%s: SQ allocation failed", __func__);
418 		sc->num_squeues = 0;
419 	} else {
420 		struct nvme_submission_queue *sq = sc->submit_queues;
421 
422 		for (i = 0; i < sc->num_squeues; i++)
423 			pthread_mutex_init(&sq[i].mtx, NULL);
424 	}
425 
426 	/*
427 	 * Allocate and initialize the Completion Queues
428 	 */
429 	if (ncq > NVME_QUEUES) {
430 		WPRINTF("%s: clamping number of CQ from %u to %u",
431 					__func__, ncq, NVME_QUEUES);
432 		ncq = NVME_QUEUES;
433 	}
434 
435 	sc->num_cqueues = ncq;
436 
437 	sc->compl_queues = calloc(sc->num_cqueues + 1,
438 				sizeof(struct nvme_completion_queue));
439 	if (sc->compl_queues == NULL) {
440 		WPRINTF("%s: CQ allocation failed", __func__);
441 		sc->num_cqueues = 0;
442 	} else {
443 		struct nvme_completion_queue *cq = sc->compl_queues;
444 
445 		for (i = 0; i < sc->num_cqueues; i++)
446 			pthread_mutex_init(&cq[i].mtx, NULL);
447 	}
448 }
449 
450 static void
451 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
452 {
453 	struct nvme_controller_data *cd = &sc->ctrldata;
454 
455 	cd->vid = 0xFB5D;
456 	cd->ssvid = 0x0000;
457 
458 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
459 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
460 
461 	/* Num of submission commands that we can handle at a time (2^rab) */
462 	cd->rab   = 4;
463 
464 	/* FreeBSD OUI */
465 	cd->ieee[0] = 0x58;
466 	cd->ieee[1] = 0x9c;
467 	cd->ieee[2] = 0xfc;
468 
469 	cd->mic = 0;
470 
471 	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
472 
473 	cd->ver = 0x00010300;
474 
475 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
476 	cd->acl = 2;
477 	cd->aerl = 4;
478 
479 	/* Advertise 1, Read-only firmware slot */
480 	cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
481 	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
482 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
483 	cd->elpe = 0;	/* max error log page entries */
484 	cd->npss = 1;	/* number of power states supported */
485 
486 	/* Warning Composite Temperature Threshold */
487 	cd->wctemp = 0x0157;
488 
489 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
490 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
491 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
492 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
493 	cd->nn = 1;	/* number of namespaces */
494 
495 	cd->oncs = 0;
496 	switch (sc->dataset_management) {
497 	case NVME_DATASET_MANAGEMENT_AUTO:
498 		if (sc->nvstore.deallocate)
499 			cd->oncs |= NVME_ONCS_DSM;
500 		break;
501 	case NVME_DATASET_MANAGEMENT_ENABLE:
502 		cd->oncs |= NVME_ONCS_DSM;
503 		break;
504 	default:
505 		break;
506 	}
507 
508 	cd->fna = 0x03;
509 
510 	cd->power_state[0].mp = 10;
511 }
512 
513 /*
514  * Calculate the CRC-16 of the given buffer
515  * See copyright attribution at top of file
516  */
517 static uint16_t
518 crc16(uint16_t crc, const void *buffer, unsigned int len)
519 {
520 	const unsigned char *cp = buffer;
521 	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
522 	static uint16_t const crc16_table[256] = {
523 		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
524 		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
525 		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
526 		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
527 		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
528 		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
529 		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
530 		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
531 		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
532 		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
533 		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
534 		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
535 		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
536 		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
537 		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
538 		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
539 		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
540 		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
541 		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
542 		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
543 		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
544 		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
545 		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
546 		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
547 		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
548 		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
549 		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
550 		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
551 		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
552 		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
553 		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
554 		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
555 	};
556 
557 	while (len--)
558 		crc = (((crc >> 8) & 0xffU) ^
559 		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
560 	return crc;
561 }
562 
563 static void
564 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
565     struct nvme_namespace_data *nd, uint32_t nsid,
566     struct pci_nvme_blockstore *nvstore)
567 {
568 
569 	/* Get capacity and block size information from backing store */
570 	nd->nsze = nvstore->size / nvstore->sectsz;
571 	nd->ncap = nd->nsze;
572 	nd->nuse = nd->nsze;
573 
574 	if (nvstore->type == NVME_STOR_BLOCKIF)
575 		nvstore->deallocate = blockif_candelete(nvstore->ctx);
576 
577 	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
578 	nd->flbas = 0;
579 
580 	/* Create an EUI-64 if user did not provide one */
581 	if (nvstore->eui64 == 0) {
582 		char *data = NULL;
583 		uint64_t eui64 = nvstore->eui64;
584 
585 		asprintf(&data, "%s%u%u%u", get_config_value("name"),
586 		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
587 		    sc->nsc_pi->pi_func);
588 
589 		if (data != NULL) {
590 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
591 			free(data);
592 		}
593 		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
594 	}
595 	be64enc(nd->eui64, nvstore->eui64);
596 
597 	/* LBA data-sz = 2^lbads */
598 	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
599 }
600 
601 static void
602 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
603 {
604 
605 	memset(&sc->err_log, 0, sizeof(sc->err_log));
606 	memset(&sc->health_log, 0, sizeof(sc->health_log));
607 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
608 
609 	/* Set read/write remainder to round up according to spec */
610 	sc->read_dunits_remainder = 999;
611 	sc->write_dunits_remainder = 999;
612 
613 	/* Set nominal Health values checked by implementations */
614 	sc->health_log.temperature = 310;
615 	sc->health_log.available_spare = 100;
616 	sc->health_log.available_spare_threshold = 10;
617 }
618 
619 static void
620 pci_nvme_init_features(struct pci_nvme_softc *sc)
621 {
622 
623 	sc->feat[0].set = nvme_feature_invalid_cb;
624 	sc->feat[0].get = nvme_feature_invalid_cb;
625 
626 	sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
627 	sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
628 	sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
629 	sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
630 	    nvme_feature_iv_config;
631 	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
632 	    nvme_feature_invalid_cb;
633 	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
634 	    nvme_feature_invalid_cb;
635 }
636 
637 static void
638 pci_nvme_aer_init(struct pci_nvme_softc *sc)
639 {
640 
641 	STAILQ_INIT(&sc->aer_list);
642 	sc->aer_count = 0;
643 }
644 
645 static void
646 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
647 {
648 	struct pci_nvme_aer *aer = NULL;
649 
650 	while (!STAILQ_EMPTY(&sc->aer_list)) {
651 		aer = STAILQ_FIRST(&sc->aer_list);
652 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
653 		free(aer);
654 	}
655 
656 	pci_nvme_aer_init(sc);
657 }
658 
659 #ifdef __FreeBSD__
660 static bool
661 pci_nvme_aer_available(struct pci_nvme_softc *sc)
662 {
663 
664 	return (!STAILQ_EMPTY(&sc->aer_list));
665 }
666 #else
667 /* This is kept behind an ifdef while it's unused to appease the compiler. */
668 #endif
669 
670 static bool
671 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
672 {
673 	struct nvme_controller_data *cd = &sc->ctrldata;
674 
675 	/* AERL is a zero-based value while aer_count is one-based */
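	/* With aerl set to 4 in pci_nvme_init_ctrldata(), the limit is five. */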
676 	return (sc->aer_count == (cd->aerl + 1));
677 }
678 
679 /*
680  * Add an Async Event Request
681  *
682  * Stores an AER to be returned later if the Controller needs to notify the
683  * host of an event.
684  * Note that while the NVMe spec doesn't require Controllers to return AER's
685  * in order, this implementation does preserve the order.
686  */
687 static int
688 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
689 {
690 	struct pci_nvme_aer *aer = NULL;
691 
692 	if (pci_nvme_aer_limit_reached(sc))
693 		return (-1);
694 
695 	aer = calloc(1, sizeof(struct pci_nvme_aer));
696 	if (aer == NULL)
697 		return (-1);
698 
699 	sc->aer_count++;
700 
701 	/* Save the Command ID for use in the completion message */
702 	aer->cid = cid;
703 	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
704 
705 	return (0);
706 }
707 
708 /*
709  * Get an Async Event Request structure
710  *
711  * Returns a pointer to an AER previously submitted by the host or NULL if
712  * no AER's exist. Caller is responsible for freeing the returned struct.
713  */
714 #ifdef __FreeBSD__
715 static struct pci_nvme_aer *
716 pci_nvme_aer_get(struct pci_nvme_softc *sc)
717 {
718 	struct pci_nvme_aer *aer = NULL;
719 
720 	aer = STAILQ_FIRST(&sc->aer_list);
721 	if (aer != NULL) {
722 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
723 		sc->aer_count--;
724 	}
725 
726 	return (aer);
727 }
728 #else
729 /* This is kept behind an ifdef while it's unused to appease the compiler. */
730 #endif
731 
732 static void
733 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
734 {
735 	uint32_t i;
736 
737 	DPRINTF("%s", __func__);
738 
739 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
740 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
741 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
742 
743 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
744 
745 	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */
746 
747 	sc->regs.cc = 0;
748 	sc->regs.csts = 0;
749 
750 	assert(sc->submit_queues != NULL);
751 
752 	for (i = 0; i < sc->num_squeues + 1; i++) {
753 		sc->submit_queues[i].qbase = NULL;
754 		sc->submit_queues[i].size = 0;
755 		sc->submit_queues[i].cqid = 0;
756 		sc->submit_queues[i].tail = 0;
757 		sc->submit_queues[i].head = 0;
758 	}
759 
760 	assert(sc->compl_queues != NULL);
761 
762 	for (i = 0; i < sc->num_cqueues + 1; i++) {
763 		sc->compl_queues[i].qbase = NULL;
764 		sc->compl_queues[i].size = 0;
765 		sc->compl_queues[i].tail = 0;
766 		sc->compl_queues[i].head = 0;
767 	}
768 
769 	sc->num_q_is_set = false;
770 
771 	pci_nvme_aer_destroy(sc);
772 }
773 
774 static void
775 pci_nvme_reset(struct pci_nvme_softc *sc)
776 {
777 	pthread_mutex_lock(&sc->mtx);
778 	pci_nvme_reset_locked(sc);
779 	pthread_mutex_unlock(&sc->mtx);
780 }
781 
782 static void
783 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
784 {
785 	uint16_t acqs, asqs;
786 
787 	DPRINTF("%s", __func__);
788 
789 	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
790 	sc->submit_queues[0].size = asqs;
791 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
792 	            sizeof(struct nvme_command) * asqs);
793 
794 	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
795 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase);
796 
797 	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
798 	    NVME_AQA_REG_ACQS_MASK) + 1;
799 	sc->compl_queues[0].size = acqs;
800 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
801 	         sizeof(struct nvme_completion) * acqs);
802 	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
803 
804 	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
805 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase);
806 }
807 
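/*
 * Copy len bytes between a flat buffer and guest memory described by a PRP
 * pair. Only two pages are supported (no PRP lists): the copy starts at prp1
 * and runs to the end of that page, with any remainder taken from the page at
 * prp2. Requests larger than 8 KiB are rejected.
 */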
808 static int
809 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
810 	size_t len, enum nvme_copy_dir dir)
811 {
812 	uint8_t *p;
813 	size_t bytes;
814 
815 	if (len > (8 * 1024)) {
816 		return (-1);
817 	}
818 
819 	/* Copy from the start of prp1 to the end of the physical page */
820 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
821 	bytes = MIN(bytes, len);
822 
823 	p = vm_map_gpa(ctx, prp1, bytes);
824 	if (p == NULL) {
825 		return (-1);
826 	}
827 
828 	if (dir == NVME_COPY_TO_PRP)
829 		memcpy(p, b, bytes);
830 	else
831 		memcpy(b, p, bytes);
832 
833 	b += bytes;
834 
835 	len -= bytes;
836 	if (len == 0) {
837 		return (0);
838 	}
839 
840 	len = MIN(len, PAGE_SIZE);
841 
842 	p = vm_map_gpa(ctx, prp2, len);
843 	if (p == NULL) {
844 		return (-1);
845 	}
846 
847 	if (dir == NVME_COPY_TO_PRP)
848 		memcpy(p, b, len);
849 	else
850 		memcpy(b, p, len);
851 
852 	return (0);
853 }
854 
855 /*
856  * Write a Completion Queue Entry update
857  *
858  * Write the completion and update the doorbell value
859  */
860 static void
861 pci_nvme_cq_update(struct pci_nvme_softc *sc,
862 		struct nvme_completion_queue *cq,
863 		uint32_t cdw0,
864 		uint16_t cid,
865 		uint16_t sqid,
866 		uint16_t status)
867 {
868 	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
869 	struct nvme_completion *cqe;
870 
871 	assert(cq->qbase != NULL);
872 
873 	pthread_mutex_lock(&cq->mtx);
874 
875 	cqe = &cq->qbase[cq->tail];
876 
877 	/* Flip the phase bit */
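	/* (the host watches the Phase Tag invert to detect new entries) */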
878 	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
879 
880 	cqe->cdw0 = cdw0;
881 	cqe->sqhd = sq->head;
882 	cqe->sqid = sqid;
883 	cqe->cid = cid;
884 	cqe->status = status;
885 
886 	cq->tail++;
887 	if (cq->tail >= cq->size) {
888 		cq->tail = 0;
889 	}
890 
891 	pthread_mutex_unlock(&cq->mtx);
892 }
893 
894 static int
895 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
896 	struct nvme_completion* compl)
897 {
898 	uint16_t qid = command->cdw10 & 0xffff;
899 
900 	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
901 	if (qid == 0 || qid > sc->num_squeues ||
902 	    (sc->submit_queues[qid].qbase == NULL)) {
903 		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
904 		        __func__, qid, sc->num_squeues);
905 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
906 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
907 		return (1);
908 	}
909 
910 	sc->submit_queues[qid].qbase = NULL;
911 	sc->submit_queues[qid].cqid = 0;
912 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
913 	return (1);
914 }
915 
916 static int
917 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
918 	struct nvme_completion* compl)
919 {
920 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
921 		uint16_t qid = command->cdw10 & 0xffff;
922 		struct nvme_submission_queue *nsq;
923 
924 		if ((qid == 0) || (qid > sc->num_squeues) ||
925 		    (sc->submit_queues[qid].qbase != NULL)) {
926 			WPRINTF("%s queue index %u > num_squeues %u",
927 			        __func__, qid, sc->num_squeues);
928 			pci_nvme_status_tc(&compl->status,
929 			    NVME_SCT_COMMAND_SPECIFIC,
930 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
931 			return (1);
932 		}
933 
934 		nsq = &sc->submit_queues[qid];
935 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
936 		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
937 		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
938 			/*
939 			 * Queues must specify at least two entries
940 			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
941 			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
942 			 */
943 			pci_nvme_status_tc(&compl->status,
944 			    NVME_SCT_COMMAND_SPECIFIC,
945 			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
946 			return (1);
947 		}
948 		nsq->head = nsq->tail = 0;
949 
950 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
951 		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
952 			pci_nvme_status_tc(&compl->status,
953 			    NVME_SCT_COMMAND_SPECIFIC,
954 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
955 			return (1);
956 		}
957 
958 		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
959 			pci_nvme_status_tc(&compl->status,
960 			    NVME_SCT_COMMAND_SPECIFIC,
961 			    NVME_SC_COMPLETION_QUEUE_INVALID);
962 			return (1);
963 		}
964 
965 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
966 
967 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
968 		              sizeof(struct nvme_command) * (size_t)nsq->size);
969 
970 		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
971 		        qid, nsq->size, nsq->qbase, nsq->cqid);
972 
973 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
974 
975 		DPRINTF("%s completed creating IOSQ qid %u",
976 		         __func__, qid);
977 	} else {
978 		/*
979 		 * Guest sent a non-contiguous submission queue request.
980 		 * This setting is unsupported by this emulation.
981 		 */
982 		WPRINTF("%s unsupported non-contig (list-based) "
983 		         "create i/o submission queue", __func__);
984 
985 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
986 	}
987 	return (1);
988 }
989 
990 static int
991 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
992 	struct nvme_completion* compl)
993 {
994 	uint16_t qid = command->cdw10 & 0xffff;
995 	uint16_t sqid;
996 
997 	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
998 	if (qid == 0 || qid > sc->num_cqueues ||
999 	    (sc->compl_queues[qid].qbase == NULL)) {
1000 		WPRINTF("%s queue index %u / num_cqueues %u",
1001 		        __func__, qid, sc->num_cqueues);
1002 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1003 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1004 		return (1);
1005 	}
1006 
1007 	/* Deleting an Active CQ is an error */
1008 	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
1009 		if (sc->submit_queues[sqid].cqid == qid) {
1010 			pci_nvme_status_tc(&compl->status,
1011 			    NVME_SCT_COMMAND_SPECIFIC,
1012 			    NVME_SC_INVALID_QUEUE_DELETION);
1013 			return (1);
1014 		}
1015 
1016 	sc->compl_queues[qid].qbase = NULL;
1017 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1018 	return (1);
1019 }
1020 
1021 static int
1022 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1023 	struct nvme_completion* compl)
1024 {
1025 	struct nvme_completion_queue *ncq;
1026 	uint16_t qid = command->cdw10 & 0xffff;
1027 
1028 	/* Only support Physically Contiguous queues */
1029 	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1030 		WPRINTF("%s unsupported non-contig (list-based) "
1031 		         "create i/o completion queue",
1032 		         __func__);
1033 
1034 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1035 		return (1);
1036 	}
1037 
1038 	if ((qid == 0) || (qid > sc->num_cqueues) ||
1039 	    (sc->compl_queues[qid].qbase != NULL)) {
1040 		WPRINTF("%s queue index %u > num_cqueues %u",
1041 			__func__, qid, sc->num_cqueues);
1042 		pci_nvme_status_tc(&compl->status,
1043 		    NVME_SCT_COMMAND_SPECIFIC,
1044 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1045 		return (1);
1046  	}
1047 
1048 	ncq = &sc->compl_queues[qid];
1049 	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1050 	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1051 	if (ncq->intr_vec > (sc->max_queues + 1)) {
1052 		pci_nvme_status_tc(&compl->status,
1053 		    NVME_SCT_COMMAND_SPECIFIC,
1054 		    NVME_SC_INVALID_INTERRUPT_VECTOR);
1055 		return (1);
1056 	}
1057 
1058 	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1059 	if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1060 		/*
1061 		 * Queues must specify at least two entries
1062 		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1063 		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1064 		 */
1065 		pci_nvme_status_tc(&compl->status,
1066 		    NVME_SCT_COMMAND_SPECIFIC,
1067 		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1068 		return (1);
1069 	}
1070 	ncq->head = ncq->tail = 0;
1071 	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1072 		     command->prp1,
1073 		     sizeof(struct nvme_command) * (size_t)ncq->size);
1074 
1075 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1076 
1078 	return (1);
1079 }
1080 
1081 static int
1082 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1083 	struct nvme_completion* compl)
1084 {
1085 	uint32_t logsize;
1086 	uint8_t logpage = command->cdw10 & 0xFF;
1087 
1088 #ifndef __FreeBSD__
1089 	logsize = 0;
1090 #endif
1091 
1092 	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1093 
1094 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1095 
1096 	/*
1097 	 * Command specifies the number of dwords to return in fields NUMDU
1098 	 * and NUMDL. This is a zero-based value.
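	 * For example, a NUMD value of zero requests a single dword (4 bytes).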
1099 	 */
1100 	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1101 	logsize *= sizeof(uint32_t);
1102 
1103 	switch (logpage) {
1104 	case NVME_LOG_ERROR:
1105 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1106 		    command->prp2, (uint8_t *)&sc->err_log,
1107 		    MIN(logsize, sizeof(sc->err_log)),
1108 		    NVME_COPY_TO_PRP);
1109 		break;
1110 	case NVME_LOG_HEALTH_INFORMATION:
1111 		pthread_mutex_lock(&sc->mtx);
1112 		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1113 		    sizeof(sc->health_log.data_units_read));
1114 		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1115 		    sizeof(sc->health_log.data_units_written));
1116 		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1117 		    sizeof(sc->health_log.host_read_commands));
1118 		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1119 		    sizeof(sc->health_log.host_write_commands));
1120 		pthread_mutex_unlock(&sc->mtx);
1121 
1122 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1123 		    command->prp2, (uint8_t *)&sc->health_log,
1124 		    MIN(logsize, sizeof(sc->health_log)),
1125 		    NVME_COPY_TO_PRP);
1126 		break;
1127 	case NVME_LOG_FIRMWARE_SLOT:
1128 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1129 		    command->prp2, (uint8_t *)&sc->fw_log,
1130 		    MIN(logsize, sizeof(sc->fw_log)),
1131 		    NVME_COPY_TO_PRP);
1132 		break;
1133 	default:
1134 		DPRINTF("%s get log page %x command not supported",
1135 		        __func__, logpage);
1136 
1137 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1138 		    NVME_SC_INVALID_LOG_PAGE);
1139 	}
1140 
1141 	return (1);
1142 }
1143 
1144 static int
1145 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1146 	struct nvme_completion* compl)
1147 {
1148 	void *dest;
1149 	uint16_t status;
1150 
1151 #ifndef __FreeBSD__
1152 	status = 0;
1153 #endif
1154 
1155 	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1156 	        command->cdw10 & 0xFF, command->nsid);
1157 
1158 	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1159 
1160 	switch (command->cdw10 & 0xFF) {
1161 	case 0x00: /* return Identify Namespace data structure */
1162 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1163 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1164 		    NVME_COPY_TO_PRP);
1165 		break;
1166 	case 0x01: /* return Identify Controller data structure */
1167 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1168 		    command->prp2, (uint8_t *)&sc->ctrldata,
1169 		    sizeof(sc->ctrldata),
1170 		    NVME_COPY_TO_PRP);
1171 		break;
1172 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1173 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1174 		                  sizeof(uint32_t) * 1024);
1175 		/* All unused entries shall be zero */
1176 		bzero(dest, sizeof(uint32_t) * 1024);
1177 		((uint32_t *)dest)[0] = 1;
1178 		break;
1179 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1180 		if (command->nsid != 1) {
1181 			pci_nvme_status_genc(&status,
1182 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1183 			break;
1184 		}
1185 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1186 		                  sizeof(uint32_t) * 1024);
1187 		/* All bytes after the descriptor shall be zero */
1188 		bzero(dest, sizeof(uint32_t) * 1024);
1189 
1190 		/* Return NIDT=1 (i.e. EUI64) descriptor */
1191 		((uint8_t *)dest)[0] = 1;
1192 		((uint8_t *)dest)[1] = sizeof(uint64_t);
1193 		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1194 		break;
1195 	default:
1196 		DPRINTF("%s unsupported identify command requested 0x%x",
1197 		         __func__, command->cdw10 & 0xFF);
1198 		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1199 		break;
1200 	}
1201 
1202 	compl->status = status;
1203 	return (1);
1204 }
1205 
1206 static const char *
1207 nvme_fid_to_name(uint8_t fid)
1208 {
1209 	const char *name;
1210 
1211 	switch (fid) {
1212 	case NVME_FEAT_ARBITRATION:
1213 		name = "Arbitration";
1214 		break;
1215 	case NVME_FEAT_POWER_MANAGEMENT:
1216 		name = "Power Management";
1217 		break;
1218 	case NVME_FEAT_LBA_RANGE_TYPE:
1219 		name = "LBA Range Type";
1220 		break;
1221 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
1222 		name = "Temperature Threshold";
1223 		break;
1224 	case NVME_FEAT_ERROR_RECOVERY:
1225 		name = "Error Recovery";
1226 		break;
1227 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
1228 		name = "Volatile Write Cache";
1229 		break;
1230 	case NVME_FEAT_NUMBER_OF_QUEUES:
1231 		name = "Number of Queues";
1232 		break;
1233 	case NVME_FEAT_INTERRUPT_COALESCING:
1234 		name = "Interrupt Coalescing";
1235 		break;
1236 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1237 		name = "Interrupt Vector Configuration";
1238 		break;
1239 	case NVME_FEAT_WRITE_ATOMICITY:
1240 		name = "Write Atomicity Normal";
1241 		break;
1242 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1243 		name = "Asynchronous Event Configuration";
1244 		break;
1245 	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1246 		name = "Autonomous Power State Transition";
1247 		break;
1248 	case NVME_FEAT_HOST_MEMORY_BUFFER:
1249 		name = "Host Memory Buffer";
1250 		break;
1251 	case NVME_FEAT_TIMESTAMP:
1252 		name = "Timestamp";
1253 		break;
1254 	case NVME_FEAT_KEEP_ALIVE_TIMER:
1255 		name = "Keep Alive Timer";
1256 		break;
1257 	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1258 		name = "Host Controlled Thermal Management";
1259 		break;
1260 	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1261 		name = "Non-Operational Power State Config";
1262 		break;
1263 	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1264 		name = "Read Recovery Level Config";
1265 		break;
1266 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1267 		name = "Predictable Latency Mode Config";
1268 		break;
1269 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1270 		name = "Predictable Latency Mode Window";
1271 		break;
1272 	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1273 		name = "LBA Status Information Attributes";
1274 		break;
1275 	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1276 		name = "Host Behavior Support";
1277 		break;
1278 	case NVME_FEAT_SANITIZE_CONFIG:
1279 		name = "Sanitize Config";
1280 		break;
1281 	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1282 		name = "Endurance Group Event Configuration";
1283 		break;
1284 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1285 		name = "Software Progress Marker";
1286 		break;
1287 	case NVME_FEAT_HOST_IDENTIFIER:
1288 		name = "Host Identifier";
1289 		break;
1290 	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1291 		name = "Reservation Notification Mask";
1292 		break;
1293 	case NVME_FEAT_RESERVATION_PERSISTENCE:
1294 		name = "Reservation Persistence";
1295 		break;
1296 	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1297 		name = "Namespace Write Protection Config";
1298 		break;
1299 	default:
1300 		name = "Unknown";
1301 		break;
1302 	}
1303 
1304 	return (name);
1305 }
1306 
1307 static void
1308 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1309     struct nvme_feature_obj *feat,
1310     struct nvme_command *command,
1311     struct nvme_completion *compl)
1312 {
1313 
1314 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1315 }
1316 
1317 static void
1318 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1319     struct nvme_feature_obj *feat,
1320     struct nvme_command *command,
1321     struct nvme_completion *compl)
1322 {
1323 	uint32_t i;
1324 	uint32_t cdw11 = command->cdw11;
1325 	uint16_t iv;
1326 	bool cd;
1327 
1328 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1329 
1330 	iv = cdw11 & 0xffff;
1331 	cd = cdw11 & (1 << 16);
1332 
1333 	if (iv > (sc->max_queues + 1)) {
1334 		return;
1335 	}
1336 
1337 	/* Interrupt Coalescing never applies to the Admin queue, so IV 0 requires CD set */
1338 	if ((iv == 0) && !cd)
1339 		return;
1340 
1341 	/* Requested Interrupt Vector must be used by a CQ */
1342 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1343 		if (sc->compl_queues[i].intr_vec == iv) {
1344 			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1345 		}
1346 	}
1347 
1348 }
1349 
1350 static void
1351 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1352     struct nvme_feature_obj *feat,
1353     struct nvme_command *command,
1354     struct nvme_completion *compl)
1355 {
1356 	uint16_t nqr;	/* Number of Queues Requested */
1357 
1358 	if (sc->num_q_is_set) {
1359 		WPRINTF("%s: Number of Queues already set", __func__);
1360 		pci_nvme_status_genc(&compl->status,
1361 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
1362 		return;
1363 	}
1364 
1365 	nqr = command->cdw11 & 0xFFFF;
1366 	if (nqr == 0xffff) {
1367 		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1368 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1369 		return;
1370 	}
1371 
1372 	sc->num_squeues = ONE_BASED(nqr);
1373 	if (sc->num_squeues > sc->max_queues) {
1374 		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1375 					sc->max_queues);
1376 		sc->num_squeues = sc->max_queues;
1377 	}
1378 
1379 	nqr = (command->cdw11 >> 16) & 0xFFFF;
1380 	if (nqr == 0xffff) {
1381 		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1382 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1383 		return;
1384 	}
1385 
1386 	sc->num_cqueues = ONE_BASED(nqr);
1387 	if (sc->num_cqueues > sc->max_queues) {
1388 		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1389 					sc->max_queues);
1390 		sc->num_cqueues = sc->max_queues;
1391 	}
1392 
1393 	/* Patch the command value which will be saved on callback's return */
1394 	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1395 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1396 
1397 	sc->num_q_is_set = true;
1398 }
1399 
1400 static int
1401 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1402 	struct nvme_completion *compl)
1403 {
1404 	struct nvme_feature_obj *feat;
1405 	uint32_t nsid = command->nsid;
1406 	uint8_t fid = command->cdw10 & 0xFF;
1407 
1408 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1409 
1410 	if (fid >= NVME_FID_MAX) {
1411 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1412 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1413 		return (1);
1414 	}
1415 	feat = &sc->feat[fid];
1416 
1417 	if (!feat->namespace_specific &&
1418 	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1419 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1420 		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1421 		return (1);
1422 	}
1423 
1424 	compl->cdw0 = 0;
1425 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1426 
1427 	if (feat->set)
1428 		feat->set(sc, feat, command, compl);
1429 
1430 	if (compl->status == NVME_SC_SUCCESS)
1431 		feat->cdw11 = command->cdw11;
1432 
1433 	return (0);
1434 }
1435 
1436 static int
1437 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1438 	struct nvme_completion* compl)
1439 {
1440 	struct nvme_feature_obj *feat;
1441 	uint8_t fid = command->cdw10 & 0xFF;
1442 
1443 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1444 
1445 	if (fid >= NVME_FID_MAX) {
1446 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1447 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1448 		return (1);
1449 	}
1450 
1451 	compl->cdw0 = 0;
1452 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1453 
1454 	feat = &sc->feat[fid];
1455 	if (feat->get) {
1456 		feat->get(sc, feat, command, compl);
1457 	}
1458 
1459 	if (compl->status == NVME_SC_SUCCESS) {
1460 		compl->cdw0 = feat->cdw11;
1461 	}
1462 
1463 	return (0);
1464 }
1465 
1466 static int
1467 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1468 	struct nvme_completion* compl)
1469 {
1470 	uint8_t	ses, lbaf, pi;
1471 
1472 	/* Only supports Secure Erase Setting - User Data Erase */
1473 	ses = (command->cdw10 >> 9) & 0x7;
1474 	if (ses > 0x1) {
1475 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1476 		return (1);
1477 	}
1478 
1479 	/* Only supports a single LBA Format */
1480 	lbaf = command->cdw10 & 0xf;
1481 	if (lbaf != 0) {
1482 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1483 		    NVME_SC_INVALID_FORMAT);
1484 		return (1);
1485 	}
1486 
1487 	/* Doesn't support Protection Information */
1488 	pi = (command->cdw10 >> 5) & 0x7;
1489 	if (pi != 0) {
1490 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1491 		return (1);
1492 	}
1493 
1494 	if (sc->nvstore.type == NVME_STOR_RAM) {
1495 		if (sc->nvstore.ctx)
1496 			free(sc->nvstore.ctx);
1497 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1498 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1499 	} else {
1500 		struct pci_nvme_ioreq *req;
1501 		int err;
1502 
1503 		req = pci_nvme_get_ioreq(sc);
1504 		if (req == NULL) {
1505 			pci_nvme_status_genc(&compl->status,
1506 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1507 			WPRINTF("%s: unable to allocate IO req", __func__);
1508 			return (1);
1509 		}
1510 		req->nvme_sq = &sc->submit_queues[0];
1511 		req->sqid = 0;
1512 		req->opc = command->opc;
1513 		req->cid = command->cid;
1514 		req->nsid = command->nsid;
1515 
1516 		req->io_req.br_offset = 0;
1517 		req->io_req.br_resid = sc->nvstore.size;
1518 		req->io_req.br_callback = pci_nvme_io_done;
1519 
1520 		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1521 		if (err) {
1522 			pci_nvme_status_genc(&compl->status,
1523 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1524 			pci_nvme_release_ioreq(sc, req);
1525 		}
1526 	}
1527 
1528 	return (1);
1529 }
1530 
1531 static int
1532 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1533 	struct nvme_completion* compl)
1534 {
1535 	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1536 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1537 
1538 	/* TODO: search for the command ID and abort it */
1539 
1540 	compl->cdw0 = 1;
1541 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1542 	return (1);
1543 }
1544 
1545 static int
1546 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1547 	struct nvme_command* command, struct nvme_completion* compl)
1548 {
1549 	DPRINTF("%s async event request 0x%x", __func__, command->cdw11);
1550 
1551 	/* Don't exceed the Async Event Request Limit (AERL). */
1552 	if (pci_nvme_aer_limit_reached(sc)) {
1553 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1554 				NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1555 		return (1);
1556 	}
1557 
1558 	if (pci_nvme_aer_add(sc, command->cid)) {
1559 		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1560 				NVME_SC_INTERNAL_DEVICE_ERROR);
1561 		return (1);
1562 	}
1563 
1564 	/*
1565 	 * Events are raised asynchronously as they occur, subject to the
1566 	 * Set Features configuration. No completion is posted here;
1567 	 * NVME_NO_STATUS defers the completion until an event fires.
1568 	 */
1569 	compl->status = NVME_NO_STATUS;
1570 
1571 	return (0);
1572 }
1573 
1574 static void
1575 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1576 {
1577 	struct nvme_completion compl;
1578 	struct nvme_command *cmd;
1579 	struct nvme_submission_queue *sq;
1580 	struct nvme_completion_queue *cq;
1581 	uint16_t sqhead;
1582 
1583 	DPRINTF("%s index %u", __func__, (uint32_t)value);
1584 
1585 	sq = &sc->submit_queues[0];
1586 	cq = &sc->compl_queues[0];
1587 
1588 	pthread_mutex_lock(&sq->mtx);
1589 
1590 	sqhead = sq->head;
1591 	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1592 
1593 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1594 		cmd = &(sq->qbase)[sqhead];
1595 		compl.cdw0 = 0;
1596 		compl.status = 0;
1597 
1598 		switch (cmd->opc) {
1599 		case NVME_OPC_DELETE_IO_SQ:
1600 			DPRINTF("%s command DELETE_IO_SQ", __func__);
1601 			nvme_opc_delete_io_sq(sc, cmd, &compl);
1602 			break;
1603 		case NVME_OPC_CREATE_IO_SQ:
1604 			DPRINTF("%s command CREATE_IO_SQ", __func__);
1605 			nvme_opc_create_io_sq(sc, cmd, &compl);
1606 			break;
1607 		case NVME_OPC_DELETE_IO_CQ:
1608 			DPRINTF("%s command DELETE_IO_CQ", __func__);
1609 			nvme_opc_delete_io_cq(sc, cmd, &compl);
1610 			break;
1611 		case NVME_OPC_CREATE_IO_CQ:
1612 			DPRINTF("%s command CREATE_IO_CQ", __func__);
1613 			nvme_opc_create_io_cq(sc, cmd, &compl);
1614 			break;
1615 		case NVME_OPC_GET_LOG_PAGE:
1616 			DPRINTF("%s command GET_LOG_PAGE", __func__);
1617 			nvme_opc_get_log_page(sc, cmd, &compl);
1618 			break;
1619 		case NVME_OPC_IDENTIFY:
1620 			DPRINTF("%s command IDENTIFY", __func__);
1621 			nvme_opc_identify(sc, cmd, &compl);
1622 			break;
1623 		case NVME_OPC_ABORT:
1624 			DPRINTF("%s command ABORT", __func__);
1625 			nvme_opc_abort(sc, cmd, &compl);
1626 			break;
1627 		case NVME_OPC_SET_FEATURES:
1628 			DPRINTF("%s command SET_FEATURES", __func__);
1629 			nvme_opc_set_features(sc, cmd, &compl);
1630 			break;
1631 		case NVME_OPC_GET_FEATURES:
1632 			DPRINTF("%s command GET_FEATURES", __func__);
1633 			nvme_opc_get_features(sc, cmd, &compl);
1634 			break;
1635 		case NVME_OPC_FIRMWARE_ACTIVATE:
1636 			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
1637 			pci_nvme_status_tc(&compl.status,
1638 			    NVME_SCT_COMMAND_SPECIFIC,
1639 			    NVME_SC_INVALID_FIRMWARE_SLOT);
1640 			break;
1641 		case NVME_OPC_ASYNC_EVENT_REQUEST:
1642 			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1643 			nvme_opc_async_event_req(sc, cmd, &compl);
1644 			break;
1645 		case NVME_OPC_FORMAT_NVM:
1646 			DPRINTF("%s command FORMAT_NVM", __func__);
1647 			if ((sc->ctrldata.oacs &
1648 			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
1649 				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
				break;
1650 			}
1651 			compl.status = NVME_NO_STATUS;
1652 			nvme_opc_format_nvm(sc, cmd, &compl);
1653 			break;
1654 		default:
1655 			DPRINTF("0x%x command is not implemented",
1656 			    cmd->opc);
1657 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1658 		}
1659 		sqhead = (sqhead + 1) % sq->size;
1660 
1661 		if (NVME_COMPLETION_VALID(compl)) {
1662 			pci_nvme_cq_update(sc, &sc->compl_queues[0],
1663 			    compl.cdw0,
1664 			    cmd->cid,
1665 			    0,		/* SQID */
1666 			    compl.status);
1667 		}
1668 	}
1669 
1670 	DPRINTF("setting sqhead %u", sqhead);
1671 	sq->head = sqhead;
1672 
1673 	if (cq->head != cq->tail)
1674 		pci_generate_msix(sc->nsc_pi, 0);
1675 
1676 	pthread_mutex_unlock(&sq->mtx);
1677 }
1678 
1679 /*
1680  * Update the Write and Read statistics reported in SMART data
1681  *
1682  * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up.
1683  * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
1684  * 512 byte blocks. Rounding up is acheived by initializing the remainder to 999.
1685  */
1686 static void
1687 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
1688     size_t bytes, uint16_t status)
1689 {
1690 
1691 	pthread_mutex_lock(&sc->mtx);
1692 	switch (opc) {
1693 	case NVME_OPC_WRITE:
1694 		sc->write_commands++;
1695 		if (status != NVME_SC_SUCCESS)
1696 			break;
1697 		sc->write_dunits_remainder += (bytes / 512);
1698 		while (sc->write_dunits_remainder >= 1000) {
1699 			sc->write_data_units++;
1700 			sc->write_dunits_remainder -= 1000;
1701 		}
1702 		break;
1703 	case NVME_OPC_READ:
1704 		sc->read_commands++;
1705 		if (status != NVME_SC_SUCCESS)
1706 			break;
1707 		sc->read_dunits_remainder += (bytes / 512);
1708 		while (sc->read_dunits_remainder >= 1000) {
1709 			sc->read_data_units++;
1710 			sc->read_dunits_remainder -= 1000;
1711 		}
1712 		break;
1713 	default:
1714 		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
1715 		break;
1716 	}
1717 	pthread_mutex_unlock(&sc->mtx);
1718 }
1719 
1720 /*
1721  * Check if the combination of Starting LBA (slba) and Number of Logical
1722  * Blocks (nlb) exceeds the range of the underlying storage.
1723  *
1724  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
1725  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
1726  * overflow.
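 * For example, with 512 byte sectors (sectsz_bits == 9), any SLBA at or above
 * 2^55 would overflow the 64-bit byte offset and is rejected by the first
 * check below.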
1727  */
1728 static bool
1729 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
1730     uint32_t nlb)
1731 {
1732 	size_t	offset, bytes;
1733 
1734 	/* Overflow check of multiplying Starting LBA by the sector size */
1735 	if (slba >> (64 - nvstore->sectsz_bits))
1736 		return (true);
1737 
1738 	offset = slba << nvstore->sectsz_bits;
1739 	bytes = nlb << nvstore->sectsz_bits;
1740 
1741 	/* Overflow check of Number of Logical Blocks */
1742 	if ((nvstore->size - offset) < bytes)
1743 		return (true);
1744 
1745 	return (false);
1746 }
1747 
1748 static int
1749 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1750 	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1751 {
1752 	int iovidx;
1753 
1754 	if (req == NULL)
1755 		return (-1);
1756 
1757 	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
1758 		return (-1);
1759 	}
1760 
1761 	/* concatenate contig block-iovs to minimize number of iovs */
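	/* (e.g. two guest-physically adjacent 4 KiB pages become one 8 KiB iov) */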
1762 	if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1763 		iovidx = req->io_req.br_iovcnt - 1;
1764 
1765 		req->io_req.br_iov[iovidx].iov_base =
1766 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1767 				     req->prev_gpaddr, size);
1768 
1769 		req->prev_size += size;
1770 		req->io_req.br_resid += size;
1771 
1772 		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1773 	} else {
1774 		iovidx = req->io_req.br_iovcnt;
1775 		if (iovidx == 0) {
1776 			req->io_req.br_offset = lba;
1777 			req->io_req.br_resid = 0;
1778 			req->io_req.br_param = req;
1779 		}
1780 
1781 		req->io_req.br_iov[iovidx].iov_base =
1782 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1783 				     gpaddr, size);
1784 
1785 		req->io_req.br_iov[iovidx].iov_len = size;
1786 
1787 		req->prev_gpaddr = gpaddr;
1788 		req->prev_size = size;
1789 		req->io_req.br_resid += size;
1790 
1791 		req->io_req.br_iovcnt++;
1792 	}
1793 
1794 	return (0);
1795 }
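
/*
 * Sketch of the coalescing rule above (hypothetical guest addresses):
 * appending gpaddr 0x10000 for 4 KiB and then gpaddr 0x11000 for 4 KiB
 * produces a single 8 KiB iov, because the second call satisfies
 * (prev_gpaddr + prev_size) == gpaddr.  A third, non-adjacent page would
 * instead start a new iov entry and increment br_iovcnt.
 */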
1796 
1797 static void
1798 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1799 	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1800 	uint32_t cdw0, uint16_t status)
1801 {
1802 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1803 
1804 	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
1805 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1806 		 NVME_STATUS_GET_SC(status));
1807 
1808 	pci_nvme_cq_update(sc, cq,
1809 	    cdw0,
1810 	    cid,
1811 	    sqid,
1812 	    status);
1813 
1814 	if (cq->head != cq->tail) {
1815 		if (cq->intr_en & NVME_CQ_INTEN) {
1816 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1817 		} else {
1818 			DPRINTF("%s: CQ%u interrupt disabled",
1819 						__func__, sq->cqid);
1820 		}
1821 	}
1822 }
1823 
1824 static void
1825 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1826 {
1827 	req->sc = NULL;
1828 	req->nvme_sq = NULL;
1829 	req->sqid = 0;
1830 
1831 	pthread_mutex_lock(&sc->mtx);
1832 
1833 	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
1834 	sc->pending_ios--;
1835 
1836 	/* when no more IO pending, can set to ready if device reset/enabled */
1837 	/* once no IO is pending, set Ready if the controller has been enabled */
1838 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1839 		sc->regs.csts |= NVME_CSTS_RDY;
1840 
1841 	pthread_mutex_unlock(&sc->mtx);
1842 
1843 	sem_post(&sc->iosemlock);
1844 }
1845 
1846 static struct pci_nvme_ioreq *
1847 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1848 {
1849 	struct pci_nvme_ioreq *req = NULL;
1850 
1851 	sem_wait(&sc->iosemlock);
1852 	pthread_mutex_lock(&sc->mtx);
1853 
1854 	req = STAILQ_FIRST(&sc->ioreqs_free);
1855 	assert(req != NULL);
1856 	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
1857 
1858 	req->sc = sc;
1859 
1860 	sc->pending_ios++;
1861 
1862 	pthread_mutex_unlock(&sc->mtx);
1863 
1864 	req->io_req.br_iovcnt = 0;
1865 	req->io_req.br_offset = 0;
1866 	req->io_req.br_resid = 0;
1867 	req->io_req.br_param = req;
1868 	req->prev_gpaddr = 0;
1869 	req->prev_size = 0;
1870 
1871 	return req;
1872 }
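
/*
 * Note on the slot accounting above: iosemlock is initialized to ioslots
 * (see pci_nvme_init()), so with the default of 8 ioslots the ninth
 * concurrent request blocks in sem_wait() here until an earlier request
 * completes and pci_nvme_release_ioreq() posts the semaphore as the slot
 * returns to ioreqs_free.
 */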
1873 
1874 static void
1875 pci_nvme_io_done(struct blockif_req *br, int err)
1876 {
1877 	struct pci_nvme_ioreq *req = br->br_param;
1878 	struct nvme_submission_queue *sq = req->nvme_sq;
1879 	uint16_t code, status;
1880 
1881 #ifndef __FreeBSD__
1882 	status = 0;
1883 #endif
1884 
1885 	DPRINTF("%s error %d %s", __func__, err, strerror(err));
1886 
1887 	/* TODO return correct error */
1888 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1889 	pci_nvme_status_genc(&status, code);
1890 
1891 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
1892 	pci_nvme_stats_write_read_update(req->sc, req->opc,
1893 	    req->bytes, status);
1894 	pci_nvme_release_ioreq(req->sc, req);
1895 }
1896 
1897 /*
1898  * Implements the Flush command. The specification states:
1899  *    If a volatile write cache is not present, Flush commands complete
1900  *    successfully and have no effect
1901  * in the description of the Volatile Write Cache (VWC) field of the Identify
1902  * Controller data. Therefore, set status to Success if the backing store
1903  * cannot flush (i.e. RAM backed storage, or blockif reports EOPNOTSUPP).
1904  */
1905 static bool
1906 nvme_opc_flush(struct pci_nvme_softc *sc,
1907     struct nvme_command *cmd,
1908     struct pci_nvme_blockstore *nvstore,
1909     struct pci_nvme_ioreq *req,
1910     uint16_t *status)
1911 {
1912 	bool pending = false;
1913 
1914 	if (nvstore->type == NVME_STOR_RAM) {
1915 		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1916 	} else {
1917 		int err;
1918 
1919 		req->io_req.br_callback = pci_nvme_io_done;
1920 
1921 		err = blockif_flush(nvstore->ctx, &req->io_req);
1922 		switch (err) {
1923 		case 0:
1924 			pending = true;
1925 			break;
1926 		case EOPNOTSUPP:
1927 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1928 			break;
1929 		default:
1930 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1931 		}
1932 	}
1933 
1934 	return (pending);
1935 }
1936 
1937 static uint16_t
1938 nvme_write_read_ram(struct pci_nvme_softc *sc,
1939     struct pci_nvme_blockstore *nvstore,
1940     uint64_t prp1, uint64_t prp2,
1941     size_t offset, uint64_t bytes,
1942     bool is_write)
1943 {
1944 	uint8_t *buf = nvstore->ctx;
1945 	enum nvme_copy_dir dir;
1946 	uint16_t status;
1947 
1948 #ifndef __FreeBSD__
1949 	status = 0;
1950 #endif
1951 
1952 	if (is_write)
1953 		dir = NVME_COPY_TO_PRP;
1954 	else
1955 		dir = NVME_COPY_FROM_PRP;
1956 
1957 	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
1958 	    buf + offset, bytes, dir))
1959 		pci_nvme_status_genc(&status,
1960 		    NVME_SC_DATA_TRANSFER_ERROR);
1961 	else
1962 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1963 
1964 	return (status);
1965 }
1966 
1967 static uint16_t
1968 nvme_write_read_blockif(struct pci_nvme_softc *sc,
1969     struct pci_nvme_blockstore *nvstore,
1970     struct pci_nvme_ioreq *req,
1971     uint64_t prp1, uint64_t prp2,
1972     size_t offset, uint64_t bytes,
1973     bool is_write)
1974 {
1975 	uint64_t size;
1976 	int err;
1977 	uint16_t status = NVME_NO_STATUS;
1978 
1979 	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
1980 	if (pci_nvme_append_iov_req(sc, req, prp1,
1981 	    size, is_write, offset)) {
1982 		pci_nvme_status_genc(&status,
1983 		    NVME_SC_DATA_TRANSFER_ERROR);
1984 		goto out;
1985 	}
1986 
1987 	offset += size;
1988 	bytes  -= size;
1989 
1990 	if (bytes == 0) {
1991 		;
1992 	} else if (bytes <= PAGE_SIZE) {
1993 		size = bytes;
1994 		if (pci_nvme_append_iov_req(sc, req, prp2,
1995 		    size, is_write, offset)) {
1996 			pci_nvme_status_genc(&status,
1997 			    NVME_SC_DATA_TRANSFER_ERROR);
1998 			goto out;
1999 		}
2000 	} else {
2001 		void *vmctx = sc->nsc_pi->pi_vmctx;
2002 		uint64_t *prp_list = &prp2;
2003 		uint64_t *last = prp_list;
2004 
2005 		/* PRP2 is pointer to a physical region page list */
2006 		while (bytes) {
2007 			/* Last entry in list points to the next list */
2008 			if (prp_list == last) {
2009 				uint64_t prp = *prp_list;
2010 
2011 				prp_list = paddr_guest2host(vmctx, prp,
2012 				    PAGE_SIZE - (prp % PAGE_SIZE));
2013 				last = prp_list + (NVME_PRP2_ITEMS - 1);
2014 			}
2015 
2016 			size = MIN(bytes, PAGE_SIZE);
2017 
2018 			if (pci_nvme_append_iov_req(sc, req, *prp_list,
2019 			    size, is_write, offset)) {
2020 				pci_nvme_status_genc(&status,
2021 				    NVME_SC_DATA_TRANSFER_ERROR);
2022 				goto out;
2023 			}
2024 
2025 			offset += size;
2026 			bytes  -= size;
2027 
2028 			prp_list++;
2029 		}
2030 	}
2031 	req->io_req.br_callback = pci_nvme_io_done;
2032 	if (is_write)
2033 		err = blockif_write(nvstore->ctx, &req->io_req);
2034 	else
2035 		err = blockif_read(nvstore->ctx, &req->io_req);
2036 
2037 	if (err)
2038 		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2039 out:
2040 	return (status);
2041 }
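
/*
 * PRP handling above in brief (illustrative example): for a page aligned
 * 12 KiB transfer, PRP1 maps the first 4 KiB.  The remaining 8 KiB exceeds
 * PAGE_SIZE, so PRP2 is treated as the guest physical address of a PRP list
 * and the first two list entries map the rest.  Had only 4 KiB remained,
 * PRP2 itself would have been used directly as the second data pointer.
 */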
2042 
2043 static bool
2044 nvme_opc_write_read(struct pci_nvme_softc *sc,
2045     struct nvme_command *cmd,
2046     struct pci_nvme_blockstore *nvstore,
2047     struct pci_nvme_ioreq *req,
2048     uint16_t *status)
2049 {
2050 	uint64_t lba, nblocks, bytes;
2051 	size_t offset;
2052 	bool is_write = cmd->opc == NVME_OPC_WRITE;
2053 	bool pending = false;
2054 
2055 #ifndef __FreeBSD__
2056 	bytes = 0;
2057 #endif
2058 
2059 	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2060 	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2061 	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2062 		WPRINTF("%s command would exceed LBA range", __func__);
2063 		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2064 		goto out;
2065 	}
2066 
2067 	bytes  = nblocks << nvstore->sectsz_bits;
2068 	if (bytes > NVME_MAX_DATA_SIZE) {
2069 		WPRINTF("%s command would exceed MDTS", __func__);
2070 		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2071 		goto out;
2072 	}
2073 
2074 	offset = lba << nvstore->sectsz_bits;
2075 
2076 	req->bytes = bytes;
2077 	req->io_req.br_offset = lba;
2078 
2079 	/* PRP bits 1:0 must be zero */
2080 	cmd->prp1 &= ~0x3UL;
2081 	cmd->prp2 &= ~0x3UL;
2082 
2083 	if (nvstore->type == NVME_STOR_RAM) {
2084 		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2085 		    cmd->prp2, offset, bytes, is_write);
2086 	} else {
2087 		*status = nvme_write_read_blockif(sc, nvstore, req,
2088 		    cmd->prp1, cmd->prp2, offset, bytes, is_write);
2089 
2090 		if (*status == NVME_NO_STATUS)
2091 			pending = true;
2092 	}
2093 out:
2094 	if (!pending)
2095 		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2096 
2097 	return (pending);
2098 }
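
/*
 * Example of the command DWORD decoding above (illustrative values):
 * cdw10 = 0x1000, cdw11 = 0 and cdw12 = 0x0007 describe a transfer starting
 * at LBA 0x1000 of 8 logical blocks (NLB is zero based).  With 512 byte
 * sectors this becomes byte offset 0x200000 and a 4096 byte transfer.
 */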
2099 
2100 static void
2101 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2102 {
2103 	struct pci_nvme_ioreq *req = br->br_param;
2104 	struct pci_nvme_softc *sc = req->sc;
2105 	bool done = true;
2106 	uint16_t status;
2107 
2108 #ifndef __FreeBSD__
2109 	status = 0;
2110 #endif
2111 
2112 	if (err) {
2113 		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2114 	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2115 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2116 	} else {
2117 		struct iovec *iov = req->io_req.br_iov;
2118 
2119 		req->prev_gpaddr++;
2120 		iov += req->prev_gpaddr;
2121 
2122 		/* The iov_* values already include the sector size */
2123 		req->io_req.br_offset = (off_t)iov->iov_base;
2124 		req->io_req.br_resid = iov->iov_len;
2125 		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2126 			pci_nvme_status_genc(&status,
2127 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2128 		} else
2129 			done = false;
2130 	}
2131 
2132 	if (done) {
2133 		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2134 		    req->cid, 0, status);
2135 		pci_nvme_release_ioreq(sc, req);
2136 	}
2137 }
2138 
2139 static bool
2140 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2141     struct nvme_command *cmd,
2142     struct pci_nvme_blockstore *nvstore,
2143     struct pci_nvme_ioreq *req,
2144     uint16_t *status)
2145 {
2146 	struct nvme_dsm_range *range;
2147 	uint32_t nr, r, non_zero, dr;
2148 	int err;
2149 	bool pending = false;
2150 
2151 #ifndef __FreeBSD__
2152 	range = NULL;
2153 #endif
2154 
2155 	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2156 		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2157 		goto out;
2158 	}
2159 
2160 	nr = cmd->cdw10 & 0xff;
2161 
2162 	/* copy locally because a range entry could straddle PRPs */
2163 	range = calloc(1, NVME_MAX_DSM_TRIM);
2164 	if (range == NULL) {
2165 		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2166 		goto out;
2167 	}
2168 	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2169 	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2170 
2171 	/* Check for invalid ranges and the number of non-zero lengths */
2172 	non_zero = 0;
2173 	for (r = 0; r <= nr; r++) {
2174 		if (pci_nvme_out_of_range(nvstore,
2175 		    range[r].starting_lba, range[r].length)) {
2176 			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2177 			goto out;
2178 		}
2179 		if (range[r].length != 0)
2180 			non_zero++;
2181 	}
2182 
2183 	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2184 		size_t offset, bytes;
2185 		int sectsz_bits = sc->nvstore.sectsz_bits;
2186 
2187 		/*
2188 		 * DSM calls are advisory only, and compliant controllers
2189 		 * may choose to take no actions (i.e. return Success).
2190 		 */
2191 		if (!nvstore->deallocate) {
2192 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2193 			goto out;
2194 		}
2195 
2196 		/* If all ranges have a zero length, return Success */
2197 		if (non_zero == 0) {
2198 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2199 			goto out;
2200 		}
2201 
2202 		if (req == NULL) {
2203 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2204 			goto out;
2205 		}
2206 
2207 		offset = range[0].starting_lba << sectsz_bits;
2208 		bytes = (size_t)range[0].length << sectsz_bits;
2209 
2210 		/*
2211 		 * If the request is for more than a single range, store
2212 		 * the ranges in the br_iov. Optimize for the common case
2213 		 * of a single range.
2214 		 *
2215 		 * Note that NVMe Number of Ranges is a zero based value
2216 		 */
2217 		req->io_req.br_iovcnt = 0;
2218 		req->io_req.br_offset = offset;
2219 		req->io_req.br_resid = bytes;
2220 
2221 		if (nr == 0) {
2222 			req->io_req.br_callback = pci_nvme_io_done;
2223 		} else {
2224 			struct iovec *iov = req->io_req.br_iov;
2225 
2226 			for (r = 0, dr = 0; r <= nr; r++) {
2227 				offset = range[r].starting_lba << sectsz_bits;
2228 				bytes = (size_t)range[r].length << sectsz_bits;
2229 				if (bytes == 0)
2230 					continue;
2231 
2232 				if ((nvstore->size - offset) < bytes) {
2233 					pci_nvme_status_genc(status,
2234 					    NVME_SC_LBA_OUT_OF_RANGE);
2235 					goto out;
2236 				}
2237 				iov[dr].iov_base = (void *)offset;
2238 				iov[dr].iov_len = bytes;
2239 				dr++;
2240 			}
2241 			req->io_req.br_callback = pci_nvme_dealloc_sm;
2242 
2243 			/*
2244 			 * Use prev_gpaddr to track the current entry and
2245 			 * prev_size to track the number of entries
2246 			 */
2247 			req->prev_gpaddr = 0;
2248 			req->prev_size = dr;
2249 		}
2250 
2251 		err = blockif_delete(nvstore->ctx, &req->io_req);
2252 		if (err)
2253 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2254 		else
2255 			pending = true;
2256 	}
2257 out:
2258 	free(range);
2259 	return (pending);
2260 }
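
/*
 * Multi-range deallocate walk-through (illustrative): with three non-zero
 * ranges, dr ends up as 3 and the initial blockif_delete() covers iov[0].
 * Each pci_nvme_dealloc_sm() callback then advances prev_gpaddr and issues
 * a delete for the next iov entry, posting the completion only once
 * prev_gpaddr + 1 == prev_size, i.e. after the third delete finishes.
 */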
2261 
2262 static void
2263 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2264 {
2265 	struct nvme_submission_queue *sq;
2266 	uint16_t status;
2267 	uint16_t sqhead;
2268 
2269 #ifndef __FreeBSD__
2270 	status = 0;
2271 #endif
2272 
2273 	/* handle all submissions up to sq->tail index */
2274 	sq = &sc->submit_queues[idx];
2275 
2276 	pthread_mutex_lock(&sq->mtx);
2277 
2278 	sqhead = sq->head;
2279 	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2280 	         idx, sqhead, sq->tail, sq->qbase);
2281 
2282 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2283 		struct nvme_command *cmd;
2284 		struct pci_nvme_ioreq *req;
2285 		uint32_t nsid;
2286 		bool pending;
2287 
2288 		pending = false;
2289 		req = NULL;
2290 		status = 0;
2291 
2292 		cmd = &sq->qbase[sqhead];
2293 		sqhead = (sqhead + 1) % sq->size;
2294 
2295 		nsid = le32toh(cmd->nsid);
2296 		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2297 			pci_nvme_status_genc(&status,
2298 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2299 			status |=
2300 			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2301 			goto complete;
2302 		}
2303 
2304 		req = pci_nvme_get_ioreq(sc);
2305 		if (req == NULL) {
2306 			pci_nvme_status_genc(&status,
2307 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2308 			WPRINTF("%s: unable to allocate IO req", __func__);
2309 			goto complete;
2310 		}
2311 		req->nvme_sq = sq;
2312 		req->sqid = idx;
2313 		req->opc = cmd->opc;
2314 		req->cid = cmd->cid;
2315 		req->nsid = cmd->nsid;
2316 
2317 		switch (cmd->opc) {
2318 		case NVME_OPC_FLUSH:
2319 			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2320 			    req, &status);
2321 			break;
2322 		case NVME_OPC_WRITE:
2323 		case NVME_OPC_READ:
2324 			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2325 			    req, &status);
2326 			break;
2327 		case NVME_OPC_WRITE_ZEROES:
2328 			/* TODO: write zeroes
2329 			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2330 			        __func__, lba, cmd->cdw12 & 0xFFFF); */
2331 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2332 			break;
2333 		case NVME_OPC_DATASET_MANAGEMENT:
2334 			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2335 			    req, &status);
2336 			break;
2337 		default:
2338 			WPRINTF("%s unhandled io command 0x%x",
2339 			    __func__, cmd->opc);
2340 			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2341 		}
2342 complete:
2343 		if (!pending) {
2344 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2345 			    status);
2346 			if (req != NULL)
2347 				pci_nvme_release_ioreq(sc, req);
2348 		}
2349 	}
2350 
2351 	sq->head = sqhead;
2352 
2353 	pthread_mutex_unlock(&sq->mtx);
2354 }
2355 
2356 static void
2357 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2358 	uint64_t idx, int is_sq, uint64_t value)
2359 {
2360 	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2361 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2362 
2363 	if (is_sq) {
2364 		if (idx > sc->num_squeues) {
2365 			WPRINTF("%s queue index %lu overflow from "
2366 			         "guest (max %u)",
2367 			         __func__, idx, sc->num_squeues);
2368 			return;
2369 		}
2370 
2371 		atomic_store_short(&sc->submit_queues[idx].tail,
2372 		                   (uint16_t)value);
2373 
2374 		if (idx == 0) {
2375 			pci_nvme_handle_admin_cmd(sc, value);
2376 		} else {
2377 			/* submission queue; handle new entries in SQ */
2378 			if (idx > sc->num_squeues) {
2379 				WPRINTF("%s SQ index %lu overflow from "
2380 				         "guest (max %u)",
2381 				         __func__, idx, sc->num_squeues);
2382 				return;
2383 			}
2384 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2385 		}
2386 	} else {
2387 		if (idx > sc->num_cqueues) {
2388 			WPRINTF("%s queue index %lu overflow from "
2389 			         "guest (max %u)",
2390 			         __func__, idx, sc->num_cqueues);
2391 			return;
2392 		}
2393 
2394 		atomic_store_short(&sc->compl_queues[idx].head,
2395 				(uint16_t)value);
2396 	}
2397 }
2398 
2399 static void
2400 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2401 {
2402 	const char *s = iswrite ? "WRITE" : "READ";
2403 
2404 	switch (offset) {
2405 	case NVME_CR_CAP_LOW:
2406 		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2407 		break;
2408 	case NVME_CR_CAP_HI:
2409 		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2410 		break;
2411 	case NVME_CR_VS:
2412 		DPRINTF("%s %s NVME_CR_VS", func, s);
2413 		break;
2414 	case NVME_CR_INTMS:
2415 		DPRINTF("%s %s NVME_CR_INTMS", func, s);
2416 		break;
2417 	case NVME_CR_INTMC:
2418 		DPRINTF("%s %s NVME_CR_INTMC", func, s);
2419 		break;
2420 	case NVME_CR_CC:
2421 		DPRINTF("%s %s NVME_CR_CC", func, s);
2422 		break;
2423 	case NVME_CR_CSTS:
2424 		DPRINTF("%s %s NVME_CR_CSTS", func, s);
2425 		break;
2426 	case NVME_CR_NSSR:
2427 		DPRINTF("%s %s NVME_CR_NSSR", func, s);
2428 		break;
2429 	case NVME_CR_AQA:
2430 		DPRINTF("%s %s NVME_CR_AQA", func, s);
2431 		break;
2432 	case NVME_CR_ASQ_LOW:
2433 		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2434 		break;
2435 	case NVME_CR_ASQ_HI:
2436 		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2437 		break;
2438 	case NVME_CR_ACQ_LOW:
2439 		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2440 		break;
2441 	case NVME_CR_ACQ_HI:
2442 		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2443 		break;
2444 	default:
2445 		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2446 	}
2447 
2448 }
2449 
2450 static void
2451 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2452 	uint64_t offset, int size, uint64_t value)
2453 {
2454 	uint32_t ccreg;
2455 
2456 	if (offset >= NVME_DOORBELL_OFFSET) {
2457 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2458 		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
2459 		int is_sq = (belloffset % 8) < 4;
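		/*
		 * Illustrative decode (hypothetical offsets): a write at
		 * doorbell offset 0x18 yields idx = 3 with is_sq true (SQ 3
		 * tail doorbell), while offset 0x1C yields idx = 3 with
		 * is_sq false (CQ 3 head doorbell).
		 */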
2460 
2461 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2462 			WPRINTF("guest attempted an overflow write offset "
2463 			         "0x%lx, val 0x%lx in %s",
2464 			         offset, value, __func__);
2465 			return;
2466 		}
2467 
2468 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2469 		return;
2470 	}
2471 
2472 	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2473 	        offset, size, value);
2474 
2475 	if (size != 4) {
2476 		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2477 		         "val 0x%lx) to bar0 in %s",
2478 		         size, offset, value, __func__);
2479 		/* TODO: shutdown device */
2480 		return;
2481 	}
2482 
2483 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2484 
2485 	pthread_mutex_lock(&sc->mtx);
2486 
2487 	switch (offset) {
2488 	case NVME_CR_CAP_LOW:
2489 	case NVME_CR_CAP_HI:
2490 		/* readonly */
2491 		break;
2492 	case NVME_CR_VS:
2493 		/* readonly */
2494 		break;
2495 	case NVME_CR_INTMS:
2496 		/* MSI-X, so ignore */
2497 		break;
2498 	case NVME_CR_INTMC:
2499 		/* MSI-X, so ignore */
2500 		break;
2501 	case NVME_CR_CC:
2502 		ccreg = (uint32_t)value;
2503 
2504 		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2505 		         "iocqes %u",
2506 		        __func__,
2507 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2508 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2509 			 NVME_CC_GET_IOCQES(ccreg));
2510 
2511 		if (NVME_CC_GET_SHN(ccreg)) {
2512 			/* perform shutdown - flush out data to backend */
2513 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2514 			    NVME_CSTS_REG_SHST_SHIFT);
2515 			sc->regs.csts |= NVME_SHST_COMPLETE <<
2516 			    NVME_CSTS_REG_SHST_SHIFT;
2517 		}
2518 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2519 			if (NVME_CC_GET_EN(ccreg) == 0)
2520 				/* transition 1->0 causes controller reset */
2521 				pci_nvme_reset_locked(sc);
2522 			else
2523 				pci_nvme_init_controller(ctx, sc);
2524 		}
2525 
2526 		/* Insert the iocqes, iosqes and en bits from the write */
2527 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2528 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2529 		if (NVME_CC_GET_EN(ccreg) == 0) {
2530 			/* Insert the ams, mps and css bit fields */
2531 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2532 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2533 			sc->regs.csts &= ~NVME_CSTS_RDY;
2534 		} else if (sc->pending_ios == 0) {
2535 			sc->regs.csts |= NVME_CSTS_RDY;
2536 		}
2537 		break;
2538 	case NVME_CR_CSTS:
2539 		break;
2540 	case NVME_CR_NSSR:
2541 		/* ignore writes; don't support subsystem reset */
2542 		break;
2543 	case NVME_CR_AQA:
2544 		sc->regs.aqa = (uint32_t)value;
2545 		break;
2546 	case NVME_CR_ASQ_LOW:
2547 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2548 		               (0xFFFFF000 & value);
2549 		break;
2550 	case NVME_CR_ASQ_HI:
2551 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2552 		               (value << 32);
2553 		break;
2554 	case NVME_CR_ACQ_LOW:
2555 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2556 		               (0xFFFFF000 & value);
2557 		break;
2558 	case NVME_CR_ACQ_HI:
2559 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2560 		               (value << 32);
2561 		break;
2562 	default:
2563 		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2564 		         __func__, offset, value, size);
2565 	}
2566 	pthread_mutex_unlock(&sc->mtx);
2567 }
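
/*
 * Typical guest bring-up as handled above (illustrative sequence, not a
 * requirement imposed by this code): the driver programs NVME_CR_AQA,
 * NVME_CR_ASQ_LOW/HI and NVME_CR_ACQ_LOW/HI, then writes NVME_CR_CC with
 * EN = 1.  The EN 0 -> 1 transition calls pci_nvme_init_controller(), and
 * CSTS.RDY is set once no I/O is pending.
 */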
2568 
2569 static void
2570 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2571                 int baridx, uint64_t offset, int size, uint64_t value)
2572 {
2573 	struct pci_nvme_softc* sc = pi->pi_arg;
2574 
2575 	if (baridx == pci_msix_table_bar(pi) ||
2576 	    baridx == pci_msix_pba_bar(pi)) {
2577 		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2578 		         " value 0x%lx", baridx, offset, size, value);
2579 
2580 		pci_emul_msix_twrite(pi, offset, size, value);
2581 		return;
2582 	}
2583 
2584 	switch (baridx) {
2585 	case 0:
2586 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2587 		break;
2588 
2589 	default:
2590 		DPRINTF("%s unknown baridx %d, val 0x%lx",
2591 		         __func__, baridx, value);
2592 	}
2593 }
2594 
2595 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
2596 	uint64_t offset, int size)
2597 {
2598 	uint64_t value;
2599 
2600 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2601 
2602 	if (offset < NVME_DOORBELL_OFFSET) {
2603 		void *p = &(sc->regs);
2604 		pthread_mutex_lock(&sc->mtx);
2605 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
2606 		pthread_mutex_unlock(&sc->mtx);
2607 	} else {
2608 		value = 0;
2609 		WPRINTF("pci_nvme: read invalid offset 0x%lx", offset);
2610 	}
2611 
2612 	switch (size) {
2613 	case 1:
2614 		value &= 0xFF;
2615 		break;
2616 	case 2:
2617 		value &= 0xFFFF;
2618 		break;
2619 	case 4:
2620 		value &= 0xFFFFFFFF;
2621 		break;
2622 	}
2623 
2624 	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2625 	         offset, size, (uint32_t)value);
2626 
2627 	return (value);
2628 }
2629
2632 static uint64_t
2633 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2634     uint64_t offset, int size)
2635 {
2636 	struct pci_nvme_softc* sc = pi->pi_arg;
2637 
2638 	if (baridx == pci_msix_table_bar(pi) ||
2639 	    baridx == pci_msix_pba_bar(pi)) {
2640 		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2641 		        baridx, offset, size);
2642 
2643 		return pci_emul_msix_tread(pi, offset, size);
2644 	}
2645 
2646 	switch (baridx) {
2647 	case 0:
2648 		return pci_nvme_read_bar_0(sc, offset, size);
2649 
2650 	default:
2651 		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2652 	}
2653 
2654 	return (0);
2655 }
2656 
2657 static int
2658 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
2659 {
2660 	char bident[sizeof("XX:X:X")];
2661 	const char *value;
2662 	uint32_t sectsz;
2663 
2664 	sc->max_queues = NVME_QUEUES;
2665 	sc->max_qentries = NVME_MAX_QENTRIES;
2666 	sc->ioslots = NVME_IOSLOTS;
2667 	sc->num_squeues = sc->max_queues;
2668 	sc->num_cqueues = sc->max_queues;
2669 	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2670 	sectsz = 0;
2671 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2672 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2673 
2674 	value = get_config_value_node(nvl, "maxq");
2675 	if (value != NULL)
2676 		sc->max_queues = atoi(value);
2677 	value = get_config_value_node(nvl, "qsz");
2678 	if (value != NULL) {
2679 		sc->max_qentries = atoi(value);
2680 		if (sc->max_qentries <= 0) {
2681 			EPRINTLN("nvme: Invalid qsz option %d",
2682 			    sc->max_qentries);
2683 			return (-1);
2684 		}
2685 	}
2686 	value = get_config_value_node(nvl, "ioslots");
2687 	if (value != NULL) {
2688 		sc->ioslots = atoi(value);
2689 		if (sc->ioslots <= 0) {
2690 			EPRINTLN("Invalid ioslots option %d", sc->ioslots);
2691 			return (-1);
2692 		}
2693 	}
2694 	value = get_config_value_node(nvl, "sectsz");
2695 	if (value != NULL)
2696 		sectsz = atoi(value);
2697 	value = get_config_value_node(nvl, "ser");
2698 	if (value != NULL) {
2699 		/*
2700 		 * This field indicates the Product Serial Number in
2701 		 * 7-bit ASCII, unused bytes should be space characters.
2702 		 * Ref: NVMe v1.3c.
2703 		 */
2704 		cpywithpad((char *)sc->ctrldata.sn,
2705 		    sizeof(sc->ctrldata.sn), value, ' ');
2706 	}
2707 	value = get_config_value_node(nvl, "eui64");
2708 	if (value != NULL)
2709 		sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
2710 	value = get_config_value_node(nvl, "dsm");
2711 	if (value != NULL) {
2712 		if (strcmp(value, "auto") == 0)
2713 			sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2714 		else if (strcmp(value, "enable") == 0)
2715 			sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2716 		else if (strcmp(value, "disable") == 0)
2717 			sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2718 	}
2719 
2720 	value = get_config_value_node(nvl, "ram");
2721 	if (value != NULL) {
2722 		uint64_t sz = strtoull(value, NULL, 10);
2723 
2724 		sc->nvstore.type = NVME_STOR_RAM;
2725 		sc->nvstore.size = sz * 1024 * 1024;
2726 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2727 		sc->nvstore.sectsz = 4096;
2728 		sc->nvstore.sectsz_bits = 12;
2729 		if (sc->nvstore.ctx == NULL) {
2730 			EPRINTLN("nvme: Unable to allocate RAM");
2731 			return (-1);
2732 		}
2733 	} else {
2734 		snprintf(bident, sizeof(bident), "%d:%d",
2735 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2736 		sc->nvstore.ctx = blockif_open(nvl, bident);
2737 		if (sc->nvstore.ctx == NULL) {
2738 			EPRINTLN("nvme: Could not open backing file: %s",
2739 			    strerror(errno));
2740 			return (-1);
2741 		}
2742 		sc->nvstore.type = NVME_STOR_BLOCKIF;
2743 		sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2744 	}
2745 
2746 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2747 		sc->nvstore.sectsz = sectsz;
2748 	else if (sc->nvstore.type != NVME_STOR_RAM)
2749 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
2750 	for (sc->nvstore.sectsz_bits = 9;
2751 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2752 	     sc->nvstore.sectsz_bits++);
2753 
2754 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2755 		sc->max_queues = NVME_QUEUES;
2756 
2757 	return (0);
2758 }
2759 
2760 static int
2761 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
2762 {
2763 	struct pci_nvme_softc *sc;
2764 	uint32_t pci_membar_sz;
2765 	int	error;
2766 
2767 	error = 0;
2768 
2769 	sc = calloc(1, sizeof(struct pci_nvme_softc));
2770 	pi->pi_arg = sc;
2771 	sc->nsc_pi = pi;
2772 
2773 	error = pci_nvme_parse_config(sc, nvl);
2774 	if (error < 0)
2775 		goto done;
2776 	else
2777 		error = 0;
2778 
2779 	STAILQ_INIT(&sc->ioreqs_free);
2780 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2781 	for (int i = 0; i < sc->ioslots; i++) {
2782 		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
2783 	}
2784 
2785 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2786 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
2787 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
2788 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
2789 	pci_set_cfgdata8(pi, PCIR_PROGIF,
2790 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
2791 
2792 	/*
2793 	 * Allocate size of NVMe registers + doorbell space for all queues.
2794 	 *
2795 	 * The specification requires a minimum memory I/O window size of 16K.
2796 	 * The Windows driver will refuse to start a device with a smaller
2797 	 * window.
2798 	 */
2799 	pci_membar_sz = sizeof(struct nvme_registers) +
2800 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
2801 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
2802 
2803 	DPRINTF("nvme membar size: %u", pci_membar_sz);
2804 
2805 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2806 	if (error) {
2807 		WPRINTF("%s pci alloc mem bar failed", __func__);
2808 		goto done;
2809 	}
2810 
2811 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2812 	if (error) {
2813 		WPRINTF("%s pci add msixcap failed", __func__);
2814 		goto done;
2815 	}
2816 
2817 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2818 	if (error) {
2819 		WPRINTF("%s pci add Express capability failed", __func__);
2820 		goto done;
2821 	}
2822 
2823 	pthread_mutex_init(&sc->mtx, NULL);
2824 	sem_init(&sc->iosemlock, 0, sc->ioslots);
2825 
2826 	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
2827 	/*
2828 	 * Controller data depends on Namespace data so initialize Namespace
2829 	 * data first.
2830 	 */
2831 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2832 	pci_nvme_init_ctrldata(sc);
2833 	pci_nvme_init_logpages(sc);
2834 	pci_nvme_init_features(sc);
2835 
2836 	pci_nvme_aer_init(sc);
2837 
2838 	pci_nvme_reset(sc);
2839 
2840 	pci_lintr_request(pi);
2841 
2842 done:
2843 	return (error);
2844 }
2845 
2846 
2847 struct pci_devemu pci_de_nvme = {
2848 	.pe_emu =	"nvme",
2849 	.pe_init =	pci_nvme_init,
2850 	.pe_legacy_config = blockif_legacy_config,
2851 	.pe_barwrite =	pci_nvme_write,
2852 	.pe_barread =	pci_nvme_read
2853 };
2854 PCI_EMUL_SET(pci_de_nvme);
2855