xref: /illumos-gate/usr/src/cmd/bhyve/pci_nvme.c (revision 02b17e23cf5bf66a5ea787e066ae3d1aa49bd856)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
50  *  dsm     = Dataset Management support. Option is one of auto, enable, or disable
51  *
52  */
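/*
 * Example (illustrative only; the sizes, serial number, and slot are
 * arbitrary):
 *   -s 4,nvme,ram=1024,maxq=4,qsz=1024,ioslots=16,sectsz=512,ser=NVME0001
 * attaches a 1 GiB RAM-backed controller with up to four IO submission and
 * completion queues of 1024 entries each, 16 concurrent IO slots, and
 * 512-byte sectors.
 */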
53 
54 /* TODO:
55     - create async event for smart and log
56     - intr coalesce
57  */
58 
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61 
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65 #ifndef __FreeBSD__
66 #include <endian.h>
67 #endif
68 
69 #include <assert.h>
70 #include <pthread.h>
71 #include <pthread_np.h>
72 #include <semaphore.h>
73 #include <stdbool.h>
74 #include <stddef.h>
75 #include <stdint.h>
76 #include <stdio.h>
77 #include <stdlib.h>
78 #include <string.h>
79 
80 #include <machine/atomic.h>
81 #include <machine/vmm.h>
82 #include <vmmapi.h>
83 
84 #include <dev/nvme/nvme.h>
85 
86 #include "bhyverun.h"
87 #include "block_if.h"
88 #include "config.h"
89 #include "debug.h"
90 #include "pci_emul.h"
91 
92 
93 static int nvme_debug = 0;
94 #define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
95 #define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
96 
97 /* defaults; can be overridden */
98 #define	NVME_MSIX_BAR		4
99 
100 #define	NVME_IOSLOTS		8
101 
102 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
103 #define NVME_MMIO_SPACE_MIN	(1 << 14)
104 
105 #define	NVME_QUEUES		16
106 #define	NVME_MAX_QENTRIES	2048
107 /* Memory Page size Minimum reported in CAP register */
108 #define	NVME_MPSMIN		0
109 /* MPSMIN converted to bytes */
110 #define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))
111 
112 #define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
113 #define	NVME_MDTS		9
114 /* Note the + 1 allows for the initial descriptor to not be page aligned */
115 #define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
116 #define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
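/*
 * With MDTS = 9 and MPSMIN = 0 (4 KiB pages), a single command may transfer
 * at most 2^9 * 4 KiB = 2 MiB of data, described by up to 513 page-sized
 * segments (512 full pages plus one for an unaligned starting offset).
 */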
117 
118 /* This is a synthetic status code to indicate there is no status */
119 #define NVME_NO_STATUS		0xffff
120 #define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
121 
122 /* helpers */
123 
124 /* Convert a zero-based value into a one-based value */
125 #define ONE_BASED(zero)		((zero) + 1)
126 /* Convert a one-based value into a zero-based value */
127 #define ZERO_BASED(one)		((one)  - 1)
128 
129 /* Encode number of SQ's and CQ's for Set/Get Features */
130 #define NVME_FEATURE_NUM_QUEUES(sc) \
131 	(ZERO_BASED((sc)->num_squeues) & 0xffff) | \
132 	(ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16;
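/*
 * Both counts are encoded as zero-based values: e.g. with 8 IO submission
 * queues and 8 IO completion queues the encoded result is 0x00070007.
 */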
133 
134 #define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
135 
136 enum nvme_controller_register_offsets {
137 	NVME_CR_CAP_LOW = 0x00,
138 	NVME_CR_CAP_HI  = 0x04,
139 	NVME_CR_VS      = 0x08,
140 	NVME_CR_INTMS   = 0x0c,
141 	NVME_CR_INTMC   = 0x10,
142 	NVME_CR_CC      = 0x14,
143 	NVME_CR_CSTS    = 0x1c,
144 	NVME_CR_NSSR    = 0x20,
145 	NVME_CR_AQA     = 0x24,
146 	NVME_CR_ASQ_LOW = 0x28,
147 	NVME_CR_ASQ_HI  = 0x2c,
148 	NVME_CR_ACQ_LOW = 0x30,
149 	NVME_CR_ACQ_HI  = 0x34,
150 };
151 
152 enum nvme_cmd_cdw11 {
153 	NVME_CMD_CDW11_PC  = 0x0001,
154 	NVME_CMD_CDW11_IEN = 0x0002,
155 	NVME_CMD_CDW11_IV  = 0xFFFF0000,
156 };
157 
158 enum nvme_copy_dir {
159 	NVME_COPY_TO_PRP,
160 	NVME_COPY_FROM_PRP,
161 };
162 
163 #define	NVME_CQ_INTEN	0x01
164 #define	NVME_CQ_INTCOAL	0x02
165 
166 struct nvme_completion_queue {
167 	struct nvme_completion *qbase;
168 	pthread_mutex_t	mtx;
169 	uint32_t	size;
170 	uint16_t	tail; /* nvme progress */
171 	uint16_t	head; /* guest progress */
172 	uint16_t	intr_vec;
173 	uint32_t	intr_en;
174 };
175 
176 struct nvme_submission_queue {
177 	struct nvme_command *qbase;
178 	pthread_mutex_t	mtx;
179 	uint32_t	size;
180 	uint16_t	head; /* nvme progress */
181 	uint16_t	tail; /* guest progress */
182 	uint16_t	cqid; /* completion queue id */
183 	int		qpriority;
184 };
185 
186 enum nvme_storage_type {
187 	NVME_STOR_BLOCKIF = 0,
188 	NVME_STOR_RAM = 1,
189 };
190 
191 struct pci_nvme_blockstore {
192 	enum nvme_storage_type type;
193 	void		*ctx;
194 	uint64_t	size;
195 	uint32_t	sectsz;
196 	uint32_t	sectsz_bits;
197 	uint64_t	eui64;
198 	uint32_t	deallocate:1;
199 };
200 
201 /*
202  * Calculate the number of additional page descriptors for guest IO requests
203  * based on the advertised Max Data Transfer (MDTS) and given the number of
204  * default iovec's in a struct blockif_req.
205  */
206 #define MDTS_PAD_SIZE \
207 	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
208 	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
209 	  0 )
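/*
 * For example, if BLOCKIF_IOV_MAX were 128 (its actual value is defined in
 * block_if.h), MDTS_PAD_SIZE would be 513 - 128 = 385 extra iovec entries.
 */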
210 
211 struct pci_nvme_ioreq {
212 	struct pci_nvme_softc *sc;
213 	STAILQ_ENTRY(pci_nvme_ioreq) link;
214 	struct nvme_submission_queue *nvme_sq;
215 	uint16_t	sqid;
216 
217 	/* command information */
218 	uint16_t	opc;
219 	uint16_t	cid;
220 	uint32_t	nsid;
221 
222 	uint64_t	prev_gpaddr;
223 	size_t		prev_size;
224 	size_t		bytes;
225 
226 	struct blockif_req io_req;
227 
228 	struct iovec	iovpadding[MDTS_PAD_SIZE];
229 };
230 
231 enum nvme_dsm_type {
232 	/* Dataset Management bit in ONCS reflects backing storage capability */
233 	NVME_DATASET_MANAGEMENT_AUTO,
234 	/* Unconditionally set Dataset Management bit in ONCS */
235 	NVME_DATASET_MANAGEMENT_ENABLE,
236 	/* Unconditionally clear Dataset Management bit in ONCS */
237 	NVME_DATASET_MANAGEMENT_DISABLE,
238 };
239 
240 struct pci_nvme_softc;
241 struct nvme_feature_obj;
242 
243 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
244     struct nvme_feature_obj *,
245     struct nvme_command *,
246     struct nvme_completion *);
247 
248 struct nvme_feature_obj {
249 	uint32_t	cdw11;
250 	nvme_feature_cb	set;
251 	nvme_feature_cb	get;
252 	bool namespace_specific;
253 };
254 
255 #define NVME_FID_MAX		(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
256 
257 typedef enum {
258 	PCI_NVME_AE_TYPE_ERROR = 0,
259 	PCI_NVME_AE_TYPE_SMART,
260 	PCI_NVME_AE_TYPE_NOTICE,
261 	PCI_NVME_AE_TYPE_IO_CMD = 6,
262 	PCI_NVME_AE_TYPE_VENDOR = 7,
263 	PCI_NVME_AE_TYPE_MAX		/* Must be last */
264 } pci_nvme_async_type;
265 
266 /* Asynchronous Event Requests */
267 struct pci_nvme_aer {
268 	STAILQ_ENTRY(pci_nvme_aer) link;
269 	uint16_t	cid;	/* Command ID of the submitted AER */
270 };
271 
272 typedef enum {
273 	PCI_NVME_AE_INFO_NS_ATTR_CHANGED = 0,
274 	PCI_NVME_AE_INFO_FW_ACTIVATION,
275 	PCI_NVME_AE_INFO_TELEMETRY_CHANGE,
276 	PCI_NVME_AE_INFO_ANA_CHANGE,
277 	PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE,
278 	PCI_NVME_AE_INFO_LBA_STATUS_ALERT,
279 	PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE,
280 	PCI_NVME_AE_INFO_MAX,
281 } pci_nvme_async_info;
282 
283 /* Asynchronous Event Notifications */
284 struct pci_nvme_aen {
285 	pci_nvme_async_type atype;
286 	uint32_t	event_data;
287 	bool		posted;
288 };
289 
290 struct pci_nvme_softc {
291 	struct pci_devinst *nsc_pi;
292 
293 	pthread_mutex_t	mtx;
294 
295 	struct nvme_registers regs;
296 
297 	struct nvme_namespace_data  nsdata;
298 	struct nvme_controller_data ctrldata;
299 	struct nvme_error_information_entry err_log;
300 	struct nvme_health_information_page health_log;
301 	struct nvme_firmware_page fw_log;
302 	struct nvme_ns_list ns_log;
303 
304 	struct pci_nvme_blockstore nvstore;
305 
306 	uint16_t	max_qentries;	/* max entries per queue */
307 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
308 	uint32_t	num_cqueues;
309 	uint32_t	num_squeues;
310 	bool		num_q_is_set; /* Has host set Number of Queues */
311 
312 	struct pci_nvme_ioreq *ioreqs;
313 	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
314 	uint32_t	pending_ios;
315 	uint32_t	ioslots;
316 	sem_t		iosemlock;
317 
318 	/*
319 	 * Memory mapped Submission and Completion queues
320 	 * Each array includes both Admin and IO queues
321 	 */
322 	struct nvme_completion_queue *compl_queues;
323 	struct nvme_submission_queue *submit_queues;
324 
325 	struct nvme_feature_obj feat[NVME_FID_MAX];
326 
327 	enum nvme_dsm_type dataset_management;
328 
329 	/* Accounting for SMART data */
330 	__uint128_t	read_data_units;
331 	__uint128_t	write_data_units;
332 	__uint128_t	read_commands;
333 	__uint128_t	write_commands;
334 	uint32_t	read_dunits_remainder;
335 	uint32_t	write_dunits_remainder;
336 
337 	STAILQ_HEAD(, pci_nvme_aer) aer_list;
338 	pthread_mutex_t	aer_mtx;
339 	uint32_t	aer_count;
340 	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
341 	pthread_t	aen_tid;
342 	pthread_mutex_t	aen_mtx;
343 	pthread_cond_t	aen_cond;
344 };
345 
346 
347 static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
348     struct nvme_completion_queue *cq,
349     uint32_t cdw0,
350     uint16_t cid,
351     uint16_t sqid,
352     uint16_t status);
353 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
354 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
355 static void pci_nvme_io_done(struct blockif_req *, int);
356 
357 /* Controller Configuration utils */
358 #define	NVME_CC_GET_EN(cc) \
359 	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
360 #define	NVME_CC_GET_CSS(cc) \
361 	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
362 #define	NVME_CC_GET_SHN(cc) \
363 	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
364 #define	NVME_CC_GET_IOSQES(cc) \
365 	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
366 #define	NVME_CC_GET_IOCQES(cc) \
367 	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
368 
369 #define	NVME_CC_WRITE_MASK \
370 	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
371 	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
372 	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
373 
374 #define	NVME_CC_NEN_WRITE_MASK \
375 	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
376 	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
377 	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
378 
379 /* Controller Status utils */
380 #define	NVME_CSTS_GET_RDY(sts) \
381 	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
382 
383 #define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
384 
385 /* Completion Queue status word utils */
386 #define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
387 #define	NVME_STATUS_MASK \
388 	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
389 	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
390 
391 #define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
392 	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
393 
394 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
395     struct nvme_feature_obj *,
396     struct nvme_command *,
397     struct nvme_completion *);
398 static void nvme_feature_num_queues(struct pci_nvme_softc *,
399     struct nvme_feature_obj *,
400     struct nvme_command *,
401     struct nvme_completion *);
402 static void nvme_feature_iv_config(struct pci_nvme_softc *,
403     struct nvme_feature_obj *,
404     struct nvme_command *,
405     struct nvme_completion *);
406 
407 static void *aen_thr(void *arg);
408 
409 static __inline void
410 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
411 {
412 	size_t len;
413 
414 	len = strnlen(src, dst_size);
415 	memset(dst, pad, dst_size);
416 	memcpy(dst, src, len);
417 }
418 
419 static __inline void
420 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
421 {
422 
423 	*status &= ~NVME_STATUS_MASK;
424 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
425 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
426 }
427 
428 static __inline void
429 pci_nvme_status_genc(uint16_t *status, uint16_t code)
430 {
431 
432 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
433 }
434 
435 /*
436  * Initialize the requested number of IO Submission and Completion Queues.
437  * Admin queues are allocated implicitly.
438  */
439 static void
440 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
441 {
442 	uint32_t i;
443 
444 	/*
445 	 * Allocate and initialize the Submission Queues
446 	 */
447 	if (nsq > NVME_QUEUES) {
448 		WPRINTF("%s: clamping number of SQ from %u to %u",
449 					__func__, nsq, NVME_QUEUES);
450 		nsq = NVME_QUEUES;
451 	}
452 
453 	sc->num_squeues = nsq;
454 
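	/* The + 1 in the allocation reserves index 0 for the Admin queue */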
455 	sc->submit_queues = calloc(sc->num_squeues + 1,
456 				sizeof(struct nvme_submission_queue));
457 	if (sc->submit_queues == NULL) {
458 		WPRINTF("%s: SQ allocation failed", __func__);
459 		sc->num_squeues = 0;
460 	} else {
461 		struct nvme_submission_queue *sq = sc->submit_queues;
462 
463 #ifndef __FreeBSD__
464 		for (i = 0; i < sc->num_squeues + 1; i++)
465 			pthread_mutex_init(&sq[i].mtx, NULL);
466 #else
467 		for (i = 0; i < sc->num_squeues; i++)
468 			pthread_mutex_init(&sq[i].mtx, NULL);
469 #endif
470 	}
471 
472 	/*
473 	 * Allocate and initialize the Completion Queues
474 	 */
475 	if (ncq > NVME_QUEUES) {
476 		WPRINTF("%s: clamping number of CQ from %u to %u",
477 					__func__, ncq, NVME_QUEUES);
478 		ncq = NVME_QUEUES;
479 	}
480 
481 	sc->num_cqueues = ncq;
482 
483 	sc->compl_queues = calloc(sc->num_cqueues + 1,
484 				sizeof(struct nvme_completion_queue));
485 	if (sc->compl_queues == NULL) {
486 		WPRINTF("%s: CQ allocation failed", __func__);
487 		sc->num_cqueues = 0;
488 	} else {
489 		struct nvme_completion_queue *cq = sc->compl_queues;
490 
491 #ifndef __FreeBSD__
492 		for (i = 0; i < sc->num_cqueues + 1; i++)
493 			pthread_mutex_init(&cq[i].mtx, NULL);
494 #else
495 		for (i = 0; i < sc->num_cqueues; i++)
496 			pthread_mutex_init(&cq[i].mtx, NULL);
497 #endif
498 	}
499 }
500 
501 static void
502 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
503 {
504 	struct nvme_controller_data *cd = &sc->ctrldata;
505 
506 	cd->vid = 0xFB5D;
507 	cd->ssvid = 0x0000;
508 
509 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
510 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
511 
512 	/* Num of submission commands that we can handle at a time (2^rab) */
513 	cd->rab   = 4;
514 
515 	/* FreeBSD OUI */
516 	cd->ieee[0] = 0x58;
517 	cd->ieee[1] = 0x9c;
518 	cd->ieee[2] = 0xfc;
519 
520 	cd->mic = 0;
521 
522 	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
523 
524 	cd->ver = 0x00010300;
525 
526 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
527 #ifndef __FreeBSD__
528 	/*
529 	 * Reported upstream against https://reviews.freebsd.org/D32953
530 	 * which introduced support for the namespace attribute changed AEN
531 	 * and the corresponding changed namespace log page, without setting
532 	 * the bit in oaes. A future sync will likely include this
533 	 * definition in usr/src/contrib/bhyve/dev/nvme/nvme.h once it's
534 	 * fixed there.
535 	 */
536 #define	NVME_CTRLR_DATA_OAES_NSCHANGE_SHIFT	(8)
537 	cd->oaes = 1 << NVME_CTRLR_DATA_OAES_NSCHANGE_SHIFT;
538 #endif
539 	cd->acl = 2;
540 	cd->aerl = 4;
541 
542 	/* Advertise 1, Read-only firmware slot */
543 	cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
544 	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
545 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
546 	cd->elpe = 0;	/* max error log page entries */
547 	cd->npss = 1;	/* number of power states supported */
548 
549 	/* Warning Composite Temperature Threshold */
550 	cd->wctemp = 0x0157;
551 
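	/*
	 * Entry sizes are powers of two: 2^6 = 64-byte submission entries and
	 * 2^4 = 16-byte completion entries. Min equals max because only the
	 * standard entry sizes are supported.
	 */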
552 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
553 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
554 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
555 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
556 	cd->nn = 1;	/* number of namespaces */
557 
558 	cd->oncs = 0;
559 	switch (sc->dataset_management) {
560 	case NVME_DATASET_MANAGEMENT_AUTO:
561 		if (sc->nvstore.deallocate)
562 			cd->oncs |= NVME_ONCS_DSM;
563 		break;
564 	case NVME_DATASET_MANAGEMENT_ENABLE:
565 		cd->oncs |= NVME_ONCS_DSM;
566 		break;
567 	default:
568 		break;
569 	}
570 
571 	cd->fna = 0x03;
572 
573 	cd->power_state[0].mp = 10;
574 }
575 
576 /*
577  * Calculate the CRC-16 of the given buffer
578  * See copyright attribution at top of file
579  */
580 static uint16_t
581 crc16(uint16_t crc, const void *buffer, unsigned int len)
582 {
583 	const unsigned char *cp = buffer;
584 	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
585 	static uint16_t const crc16_table[256] = {
586 		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
587 		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
588 		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
589 		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
590 		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
591 		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
592 		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
593 		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
594 		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
595 		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
596 		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
597 		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
598 		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
599 		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
600 		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
601 		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
602 		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
603 		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
604 		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
605 		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
606 		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
607 		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
608 		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
609 		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
610 		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
611 		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
612 		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
613 		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
614 		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
615 		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
616 		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
617 		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
618 	};
619 
620 	while (len--)
621 		crc = (((crc >> 8) & 0xffU) ^
622 		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
623 	return crc;
624 }
625 
626 static void
627 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
628     struct nvme_namespace_data *nd)
629 {
630 
631 	/* Get capacity and block size information from backing store */
632 	nd->nsze = nvstore->size / nvstore->sectsz;
633 	nd->ncap = nd->nsze;
634 	nd->nuse = nd->nsze;
635 }
636 
637 static void
638 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
639     struct nvme_namespace_data *nd, uint32_t nsid,
640     struct pci_nvme_blockstore *nvstore)
641 {
642 
643 	pci_nvme_init_nsdata_size(nvstore, nd);
644 
645 	if (nvstore->type == NVME_STOR_BLOCKIF)
646 		nvstore->deallocate = blockif_candelete(nvstore->ctx);
647 
648 	nd->nlbaf = 0; /* NLBAF is a zero-based value (i.e. 1 LBA Format) */
649 	nd->flbas = 0;
650 
651 	/* Create an EUI-64 if user did not provide one */
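	/*
	 * The generated value combines OUI_FREEBSD_NVME_LOW with a CRC-16 of
	 * the VM name and PCI bus/slot/function in the upper 48 bits, and
	 * places the namespace ID in the low 16 bits.
	 */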
652 	if (nvstore->eui64 == 0) {
653 		char *data = NULL;
654 		uint64_t eui64 = nvstore->eui64;
655 
656 		asprintf(&data, "%s%u%u%u", get_config_value("name"),
657 		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
658 		    sc->nsc_pi->pi_func);
659 
660 		if (data != NULL) {
661 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
662 			free(data);
663 		}
664 		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
665 	}
666 	be64enc(nd->eui64, nvstore->eui64);
667 
668 	/* LBA data-sz = 2^lbads */
669 	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
670 }
671 
672 static void
673 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
674 {
675 
676 	memset(&sc->err_log, 0, sizeof(sc->err_log));
677 	memset(&sc->health_log, 0, sizeof(sc->health_log));
678 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
679 	memset(&sc->ns_log, 0, sizeof(sc->ns_log));
680 
681 	/* Set read/write remainder to round up according to spec */
682 	sc->read_dunits_remainder = 999;
683 	sc->write_dunits_remainder = 999;
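	/*
	 * SMART data units are reported in increments of 1000 * 512 bytes, so
	 * starting the remainder at 999 causes any partial unit to count as a
	 * full unit as soon as data is accumulated.
	 */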
684 
685 	/* Set nominal Health values checked by implementations */
686 	sc->health_log.temperature = 310;
687 	sc->health_log.available_spare = 100;
688 	sc->health_log.available_spare_threshold = 10;
689 }
690 
691 static void
692 pci_nvme_init_features(struct pci_nvme_softc *sc)
693 {
694 
695 	sc->feat[0].set = nvme_feature_invalid_cb;
696 	sc->feat[0].get = nvme_feature_invalid_cb;
697 
698 	sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
699 	sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
700 	sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
701 	sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
702 	    nvme_feature_iv_config;
703 	/* Enable all AENs by default */
704 	sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 = 0x31f;
705 	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
706 	    nvme_feature_invalid_cb;
707 	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
708 	    nvme_feature_invalid_cb;
709 }
710 
711 static void
712 pci_nvme_aer_reset(struct pci_nvme_softc *sc)
713 {
714 
715 	STAILQ_INIT(&sc->aer_list);
716 	sc->aer_count = 0;
717 }
718 
719 static void
720 pci_nvme_aer_init(struct pci_nvme_softc *sc)
721 {
722 
723 	pthread_mutex_init(&sc->aer_mtx, NULL);
724 	pci_nvme_aer_reset(sc);
725 }
726 
727 static void
728 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
729 {
730 	struct pci_nvme_aer *aer = NULL;
731 
732 	pthread_mutex_lock(&sc->aer_mtx);
733 	while (!STAILQ_EMPTY(&sc->aer_list)) {
734 		aer = STAILQ_FIRST(&sc->aer_list);
735 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
736 		free(aer);
737 	}
738 	pthread_mutex_unlock(&sc->aer_mtx);
739 
740 	pci_nvme_aer_reset(sc);
741 }
742 
743 static bool
744 pci_nvme_aer_available(struct pci_nvme_softc *sc)
745 {
746 
747 	return (sc->aer_count != 0);
748 }
749 
750 static bool
751 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
752 {
753 	struct nvme_controller_data *cd = &sc->ctrldata;
754 
755 	/* AERL is a zero-based value while aer_count is one-based */
756 	return (sc->aer_count == (cd->aerl + 1));
757 }
758 
759 /*
760  * Add an Async Event Request
761  *
762  * Stores an AER to be returned later if the Controller needs to notify the
763  * host of an event.
764  * Note that while the NVMe spec doesn't require Controllers to return AER's
765  * in order, this implementation does preserve the order.
766  */
767 static int
768 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
769 {
770 	struct pci_nvme_aer *aer = NULL;
771 
772 	if (pci_nvme_aer_limit_reached(sc))
773 		return (-1);
774 
775 	aer = calloc(1, sizeof(struct pci_nvme_aer));
776 	if (aer == NULL)
777 		return (-1);
778 
779 	/* Save the Command ID for use in the completion message */
780 	aer->cid = cid;
781 
782 	pthread_mutex_lock(&sc->aer_mtx);
783 	sc->aer_count++;
784 	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
785 	pthread_mutex_unlock(&sc->aer_mtx);
786 
787 	return (0);
788 }
789 
790 /*
791  * Get an Async Event Request structure
792  *
793  * Returns a pointer to an AER previously submitted by the host or NULL if
794  * no AER's exist. Caller is responsible for freeing the returned struct.
795  */
796 static struct pci_nvme_aer *
797 pci_nvme_aer_get(struct pci_nvme_softc *sc)
798 {
799 	struct pci_nvme_aer *aer = NULL;
800 
801 	pthread_mutex_lock(&sc->aer_mtx);
802 	aer = STAILQ_FIRST(&sc->aer_list);
803 	if (aer != NULL) {
804 		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
805 		sc->aer_count--;
806 	}
807 	pthread_mutex_unlock(&sc->aer_mtx);
808 
809 	return (aer);
810 }
811 
812 static void
813 pci_nvme_aen_reset(struct pci_nvme_softc *sc)
814 {
815 	uint32_t	atype;
816 
817 	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));
818 
819 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
820 		sc->aen[atype].atype = atype;
821 	}
822 }
823 
824 static void
825 pci_nvme_aen_init(struct pci_nvme_softc *sc)
826 {
827 	char nstr[80];
828 
829 	pci_nvme_aen_reset(sc);
830 
831 	pthread_mutex_init(&sc->aen_mtx, NULL);
832 	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
833 	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
834 	    sc->nsc_pi->pi_func);
835 	pthread_set_name_np(sc->aen_tid, nstr);
836 }
837 
838 static void
839 pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
840 {
841 
842 	pci_nvme_aen_reset(sc);
843 }
844 
845 /* Notify the AEN thread of pending work */
846 static void
847 pci_nvme_aen_notify(struct pci_nvme_softc *sc)
848 {
849 
850 	pthread_cond_signal(&sc->aen_cond);
851 }
852 
853 /*
854  * Post an Asynchronous Event Notification
855  */
856 static int32_t
857 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
858 		uint32_t event_data)
859 {
860 	struct pci_nvme_aen *aen;
861 
862 	if (atype >= PCI_NVME_AE_TYPE_MAX) {
863 		return(EINVAL);
864 	}
865 
866 	pthread_mutex_lock(&sc->aen_mtx);
867 	aen = &sc->aen[atype];
868 
869 	/* Has the controller already posted an event of this type? */
870 	if (aen->posted) {
871 		pthread_mutex_unlock(&sc->aen_mtx);
872 		return(EALREADY);
873 	}
874 
875 	aen->event_data = event_data;
876 	aen->posted = true;
877 	pthread_mutex_unlock(&sc->aen_mtx);
878 
879 	pci_nvme_aen_notify(sc);
880 
881 	return(0);
882 }
883 
884 static void
885 pci_nvme_aen_process(struct pci_nvme_softc *sc)
886 {
887 	struct pci_nvme_aer *aer;
888 	struct pci_nvme_aen *aen;
889 	pci_nvme_async_type atype;
890 	uint32_t mask;
891 	uint16_t status;
892 	uint8_t lid;
893 
894 #ifndef __FreeBSD__
895 	lid = 0;
896 #endif
897 
898 	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
899 	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
900 		aen = &sc->aen[atype];
901 		/* Previous iterations may have depleted the available AER's */
902 		if (!pci_nvme_aer_available(sc)) {
903 			DPRINTF("%s: no AER", __func__);
904 			break;
905 		}
906 
907 		if (!aen->posted) {
908 			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
909 			continue;
910 		}
911 
912 		status = NVME_SC_SUCCESS;
913 
914 		/* Is the event masked? */
915 		mask =
916 		    sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;
917 
918 		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
919 		switch (atype) {
920 		case PCI_NVME_AE_TYPE_ERROR:
921 			lid = NVME_LOG_ERROR;
922 			break;
923 		case PCI_NVME_AE_TYPE_SMART:
924 			mask &= 0xff;
925 			if ((mask & aen->event_data) == 0)
926 				continue;
927 			lid = NVME_LOG_HEALTH_INFORMATION;
928 			break;
929 		case PCI_NVME_AE_TYPE_NOTICE:
930 			if (aen->event_data >= PCI_NVME_AE_INFO_MAX) {
931 				EPRINTLN("%s unknown AEN notice type %u",
932 				    __func__, aen->event_data);
933 				status = NVME_SC_INTERNAL_DEVICE_ERROR;
934 				break;
935 			}
936 			mask >>= 8;
937 			if (((1 << aen->event_data) & mask) == 0)
938 				continue;
939 			switch (aen->event_data) {
940 			case PCI_NVME_AE_INFO_NS_ATTR_CHANGED:
941 				lid = NVME_LOG_CHANGED_NAMESPACE;
942 				break;
943 			case PCI_NVME_AE_INFO_FW_ACTIVATION:
944 				lid = NVME_LOG_FIRMWARE_SLOT;
945 				break;
946 			case PCI_NVME_AE_INFO_TELEMETRY_CHANGE:
947 				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
948 				break;
949 			case PCI_NVME_AE_INFO_ANA_CHANGE:
950 				lid = NVME_LOG_ASYMMETRIC_NAMESPAVE_ACCESS; //TODO spelling
951 				break;
952 			case PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE:
953 				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
954 				break;
955 			case PCI_NVME_AE_INFO_LBA_STATUS_ALERT:
956 				lid = NVME_LOG_LBA_STATUS_INFORMATION;
957 				break;
958 			case PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE:
959 				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
960 				break;
961 			default:
962 				lid = 0;
963 			}
964 			break;
965 		default:
966 			/* bad type?!? */
967 			EPRINTLN("%s unknown AEN type %u", __func__, atype);
968 			status = NVME_SC_INTERNAL_DEVICE_ERROR;
969 			break;
970 		}
971 
972 		aer = pci_nvme_aer_get(sc);
973 		assert(aer != NULL);
974 
975 		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
976 		pci_nvme_cq_update(sc, &sc->compl_queues[0],
977 		    (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
978 		    aer->cid,
979 		    0,		/* SQID */
980 		    status);
981 
982 		aen->event_data = 0;
983 		aen->posted = false;
984 
985 		pci_generate_msix(sc->nsc_pi, 0);
986 	}
987 }
988 
989 static void *
990 aen_thr(void *arg)
991 {
992 	struct pci_nvme_softc *sc;
993 
994 	sc = arg;
995 
996 	pthread_mutex_lock(&sc->aen_mtx);
997 	for (;;) {
998 		pci_nvme_aen_process(sc);
999 		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
1000 	}
1001 #ifdef __FreeBSD__
1002 	pthread_mutex_unlock(&sc->aen_mtx);
1003 
1004 	pthread_exit(NULL);
1005 #endif
1006 	return (NULL);
1007 }
1008 
1009 static void
1010 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
1011 {
1012 	uint32_t i;
1013 
1014 	DPRINTF("%s", __func__);
1015 
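	/*
	 * CAP advertises the zero-based maximum queue size (MQES), requires
	 * physically contiguous queues (CQR), and reports a worst-case ready
	 * timeout (TO) of 60 * 500 ms = 30 seconds.
	 */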
1016 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
1017 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
1018 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
1019 
1020 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
1021 
1022 	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */
1023 
1024 	sc->regs.cc = 0;
1025 	sc->regs.csts = 0;
1026 
1027 	assert(sc->submit_queues != NULL);
1028 
1029 	for (i = 0; i < sc->num_squeues + 1; i++) {
1030 		sc->submit_queues[i].qbase = NULL;
1031 		sc->submit_queues[i].size = 0;
1032 		sc->submit_queues[i].cqid = 0;
1033 		sc->submit_queues[i].tail = 0;
1034 		sc->submit_queues[i].head = 0;
1035 	}
1036 
1037 	assert(sc->compl_queues != NULL);
1038 
1039 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1040 		sc->compl_queues[i].qbase = NULL;
1041 		sc->compl_queues[i].size = 0;
1042 		sc->compl_queues[i].tail = 0;
1043 		sc->compl_queues[i].head = 0;
1044 	}
1045 
1046 	sc->num_q_is_set = false;
1047 
1048 	pci_nvme_aer_destroy(sc);
1049 	pci_nvme_aen_destroy(sc);
1050 }
1051 
1052 static void
1053 pci_nvme_reset(struct pci_nvme_softc *sc)
1054 {
1055 	pthread_mutex_lock(&sc->mtx);
1056 	pci_nvme_reset_locked(sc);
1057 	pthread_mutex_unlock(&sc->mtx);
1058 }
1059 
1060 static void
1061 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
1062 {
1063 	uint16_t acqs, asqs;
1064 
1065 	DPRINTF("%s", __func__);
1066 
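	/* ASQS and ACQS in AQA are zero-based queue sizes, hence the + 1 */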
1067 	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
1068 	sc->submit_queues[0].size = asqs;
1069 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
1070 	            sizeof(struct nvme_command) * asqs);
1071 
1072 	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
1073 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase);
1074 
1075 	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
1076 	    NVME_AQA_REG_ACQS_MASK) + 1;
1077 	sc->compl_queues[0].size = acqs;
1078 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
1079 	         sizeof(struct nvme_completion) * acqs);
1080 	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
1081 
1082 	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
1083 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase);
1084 }
1085 
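/*
 * Copy data between a guest buffer described by a pair of PRP entries and a
 * host buffer. PRP1 may begin at any offset within a page; whatever does not
 * fit in that first page is copied via PRP2, so at most two pages (and no
 * more than 8 KiB total) can be transferred.
 */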
1086 static int
1087 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
1088 	size_t len, enum nvme_copy_dir dir)
1089 {
1090 	uint8_t *p;
1091 	size_t bytes;
1092 
1093 	if (len > (8 * 1024)) {
1094 		return (-1);
1095 	}
1096 
1097 	/* Copy from the start of prp1 to the end of the physical page */
1098 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
1099 	bytes = MIN(bytes, len);
1100 
1101 	p = vm_map_gpa(ctx, prp1, bytes);
1102 	if (p == NULL) {
1103 		return (-1);
1104 	}
1105 
1106 	if (dir == NVME_COPY_TO_PRP)
1107 		memcpy(p, b, bytes);
1108 	else
1109 		memcpy(b, p, bytes);
1110 
1111 	b += bytes;
1112 
1113 	len -= bytes;
1114 	if (len == 0) {
1115 		return (0);
1116 	}
1117 
1118 	len = MIN(len, PAGE_SIZE);
1119 
1120 	p = vm_map_gpa(ctx, prp2, len);
1121 	if (p == NULL) {
1122 		return (-1);
1123 	}
1124 
1125 	if (dir == NVME_COPY_TO_PRP)
1126 		memcpy(p, b, len);
1127 	else
1128 		memcpy(b, p, len);
1129 
1130 	return (0);
1131 }
1132 
1133 /*
1134  * Write a Completion Queue Entry
1135  *
1136  * Write the completion entry and advance the controller's tail pointer
1137  */
1138 static void
1139 pci_nvme_cq_update(struct pci_nvme_softc *sc,
1140 		struct nvme_completion_queue *cq,
1141 		uint32_t cdw0,
1142 		uint16_t cid,
1143 		uint16_t sqid,
1144 		uint16_t status)
1145 {
1146 	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
1147 	struct nvme_completion *cqe;
1148 
1149 	assert(cq->qbase != NULL);
1150 
1151 	pthread_mutex_lock(&cq->mtx);
1152 
1153 	cqe = &cq->qbase[cq->tail];
1154 
1155 	/* Flip the phase bit */
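	/*
	 * The host detects a new entry when the Phase Tag differs from the
	 * value it last observed in this slot, so each pass through the queue
	 * inverts the bit left behind by the previous entry.
	 */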
1156 	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
1157 
1158 	cqe->cdw0 = cdw0;
1159 	cqe->sqhd = sq->head;
1160 	cqe->sqid = sqid;
1161 	cqe->cid = cid;
1162 	cqe->status = status;
1163 
1164 	cq->tail++;
1165 	if (cq->tail >= cq->size) {
1166 		cq->tail = 0;
1167 	}
1168 
1169 	pthread_mutex_unlock(&cq->mtx);
1170 }
1171 
1172 static int
1173 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1174 	struct nvme_completion* compl)
1175 {
1176 	uint16_t qid = command->cdw10 & 0xffff;
1177 
1178 	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
1179 	if (qid == 0 || qid > sc->num_squeues ||
1180 	    (sc->submit_queues[qid].qbase == NULL)) {
1181 		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
1182 		        __func__, qid, sc->num_squeues);
1183 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1184 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1185 		return (1);
1186 	}
1187 
1188 	sc->submit_queues[qid].qbase = NULL;
1189 	sc->submit_queues[qid].cqid = 0;
1190 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1191 	return (1);
1192 }
1193 
1194 static int
1195 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1196 	struct nvme_completion* compl)
1197 {
1198 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
1199 		uint16_t qid = command->cdw10 & 0xffff;
1200 		struct nvme_submission_queue *nsq;
1201 
1202 		if ((qid == 0) || (qid > sc->num_squeues) ||
1203 		    (sc->submit_queues[qid].qbase != NULL)) {
1204 			WPRINTF("%s queue index %u > num_squeues %u",
1205 			        __func__, qid, sc->num_squeues);
1206 			pci_nvme_status_tc(&compl->status,
1207 			    NVME_SCT_COMMAND_SPECIFIC,
1208 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1209 			return (1);
1210 		}
1211 
1212 		nsq = &sc->submit_queues[qid];
1213 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1214 		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
1215 		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
1216 			/*
1217 			 * Queues must specify at least two entries
1218 			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1219 			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1220 			 */
1221 			pci_nvme_status_tc(&compl->status,
1222 			    NVME_SCT_COMMAND_SPECIFIC,
1223 			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1224 			return (1);
1225 		}
1226 		nsq->head = nsq->tail = 0;
1227 
1228 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
1229 		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
1230 			pci_nvme_status_tc(&compl->status,
1231 			    NVME_SCT_COMMAND_SPECIFIC,
1232 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1233 			return (1);
1234 		}
1235 
1236 		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
1237 			pci_nvme_status_tc(&compl->status,
1238 			    NVME_SCT_COMMAND_SPECIFIC,
1239 			    NVME_SC_COMPLETION_QUEUE_INVALID);
1240 			return (1);
1241 		}
1242 
1243 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
1244 
1245 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1246 		              sizeof(struct nvme_command) * (size_t)nsq->size);
1247 
1248 		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
1249 		        qid, nsq->size, nsq->qbase, nsq->cqid);
1250 
1251 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1252 
1253 		DPRINTF("%s completed creating IOSQ qid %u",
1254 		         __func__, qid);
1255 	} else {
1256 		/*
1257 		 * Guest sent a non-contiguous submission queue request.
1258 		 * This setting is unsupported by this emulation.
1259 		 */
1260 		WPRINTF("%s unsupported non-contig (list-based) "
1261 		         "create i/o submission queue", __func__);
1262 
1263 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1264 	}
1265 	return (1);
1266 }
1267 
1268 static int
1269 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1270 	struct nvme_completion* compl)
1271 {
1272 	uint16_t qid = command->cdw10 & 0xffff;
1273 	uint16_t sqid;
1274 
1275 	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
1276 	if (qid == 0 || qid > sc->num_cqueues ||
1277 	    (sc->compl_queues[qid].qbase == NULL)) {
1278 		WPRINTF("%s queue index %u / num_cqueues %u",
1279 		        __func__, qid, sc->num_cqueues);
1280 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1281 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1282 		return (1);
1283 	}
1284 
1285 	/* Deleting an Active CQ is an error */
1286 	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
1287 		if (sc->submit_queues[sqid].cqid == qid) {
1288 			pci_nvme_status_tc(&compl->status,
1289 			    NVME_SCT_COMMAND_SPECIFIC,
1290 			    NVME_SC_INVALID_QUEUE_DELETION);
1291 			return (1);
1292 		}
1293 
1294 	sc->compl_queues[qid].qbase = NULL;
1295 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1296 	return (1);
1297 }
1298 
1299 static int
1300 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1301 	struct nvme_completion* compl)
1302 {
1303 	struct nvme_completion_queue *ncq;
1304 	uint16_t qid = command->cdw10 & 0xffff;
1305 
1306 	/* Only support Physically Contiguous queues */
1307 	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1308 		WPRINTF("%s unsupported non-contig (list-based) "
1309 		         "create i/o completion queue",
1310 		         __func__);
1311 
1312 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1313 		return (1);
1314 	}
1315 
1316 	if ((qid == 0) || (qid > sc->num_cqueues) ||
1317 	    (sc->compl_queues[qid].qbase != NULL)) {
1318 		WPRINTF("%s queue index %u > num_cqueues %u",
1319 			__func__, qid, sc->num_cqueues);
1320 		pci_nvme_status_tc(&compl->status,
1321 		    NVME_SCT_COMMAND_SPECIFIC,
1322 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
1323 		return (1);
1324  	}
1325 
1326 	ncq = &sc->compl_queues[qid];
1327 	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1328 	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1329 	if (ncq->intr_vec > (sc->max_queues + 1)) {
1330 		pci_nvme_status_tc(&compl->status,
1331 		    NVME_SCT_COMMAND_SPECIFIC,
1332 		    NVME_SC_INVALID_INTERRUPT_VECTOR);
1333 		return (1);
1334 	}
1335 
1336 	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1337 	if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1338 		/*
1339 		 * Queues must specify at least two entries
1340 		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1341 		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1342 		 */
1343 		pci_nvme_status_tc(&compl->status,
1344 		    NVME_SCT_COMMAND_SPECIFIC,
1345 		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1346 		return (1);
1347 	}
1348 	ncq->head = ncq->tail = 0;
1349 	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1350 		     command->prp1,
1351 		     sizeof(struct nvme_completion) * (size_t)ncq->size);
1352 
1353 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1354 
1355 
1356 	return (1);
1357 }
1358 
1359 static int
1360 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1361 	struct nvme_completion* compl)
1362 {
1363 	uint32_t logsize;
1364 	uint8_t logpage = command->cdw10 & 0xFF;
1365 
1366 #ifndef __FreeBSD__
1367 	logsize = 0;
1368 #endif
1369 
1370 	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1371 
1372 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1373 
1374 	/*
1375 	 * Command specifies the number of dwords to return in fields NUMDU
1376 	 * and NUMDL. This is a zero-based value.
1377 	 */
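	/* e.g. a NUMDU:NUMDL value of 0 requests a single dword, i.e. 4 bytes */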
1378 	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1379 	logsize *= sizeof(uint32_t);
1380 
1381 	switch (logpage) {
1382 	case NVME_LOG_ERROR:
1383 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1384 		    command->prp2, (uint8_t *)&sc->err_log,
1385 		    MIN(logsize, sizeof(sc->err_log)),
1386 		    NVME_COPY_TO_PRP);
1387 		break;
1388 	case NVME_LOG_HEALTH_INFORMATION:
1389 		pthread_mutex_lock(&sc->mtx);
1390 		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1391 		    sizeof(sc->health_log.data_units_read));
1392 		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1393 		    sizeof(sc->health_log.data_units_written));
1394 		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1395 		    sizeof(sc->health_log.host_read_commands));
1396 		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1397 		    sizeof(sc->health_log.host_write_commands));
1398 		pthread_mutex_unlock(&sc->mtx);
1399 
1400 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1401 		    command->prp2, (uint8_t *)&sc->health_log,
1402 		    MIN(logsize, sizeof(sc->health_log)),
1403 		    NVME_COPY_TO_PRP);
1404 		break;
1405 	case NVME_LOG_FIRMWARE_SLOT:
1406 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1407 		    command->prp2, (uint8_t *)&sc->fw_log,
1408 		    MIN(logsize, sizeof(sc->fw_log)),
1409 		    NVME_COPY_TO_PRP);
1410 		break;
1411 	case NVME_LOG_CHANGED_NAMESPACE:
1412 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1413 		    command->prp2, (uint8_t *)&sc->ns_log,
1414 		    MIN(logsize, sizeof(sc->ns_log)),
1415 		    NVME_COPY_TO_PRP);
1416 		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
1417 		break;
1418 	default:
1419 		DPRINTF("%s get log page %x command not supported",
1420 		        __func__, logpage);
1421 
1422 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1423 		    NVME_SC_INVALID_LOG_PAGE);
1424 	}
1425 
1426 	return (1);
1427 }
1428 
1429 static int
1430 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1431 	struct nvme_completion* compl)
1432 {
1433 	void *dest;
1434 	uint16_t status;
1435 
1436 #ifndef __FreeBSD__
1437 	status = 0;
1438 #endif
1439 
1440 	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1441 	        command->cdw10 & 0xFF, command->nsid);
1442 
1443 	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1444 
1445 	switch (command->cdw10 & 0xFF) {
1446 	case 0x00: /* return Identify Namespace data structure */
1447 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1448 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1449 		    NVME_COPY_TO_PRP);
1450 		break;
1451 	case 0x01: /* return Identify Controller data structure */
1452 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1453 		    command->prp2, (uint8_t *)&sc->ctrldata,
1454 		    sizeof(sc->ctrldata),
1455 		    NVME_COPY_TO_PRP);
1456 		break;
1457 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1458 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1459 		                  sizeof(uint32_t) * 1024);
1460 		/* All unused entries shall be zero */
1461 		bzero(dest, sizeof(uint32_t) * 1024);
1462 		((uint32_t *)dest)[0] = 1;
1463 		break;
1464 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1465 		if (command->nsid != 1) {
1466 			pci_nvme_status_genc(&status,
1467 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1468 			break;
1469 		}
1470 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1471 		                  sizeof(uint32_t) * 1024);
1472 		/* All bytes after the descriptor shall be zero */
1473 		bzero(dest, sizeof(uint32_t) * 1024);
1474 
1475 		/* Return NIDT=1 (i.e. EUI64) descriptor */
1476 		((uint8_t *)dest)[0] = 1;
1477 		((uint8_t *)dest)[1] = sizeof(uint64_t);
1478 		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1479 		break;
1480 	default:
1481 		DPRINTF("%s unsupported identify command requested 0x%x",
1482 		         __func__, command->cdw10 & 0xFF);
1483 		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1484 		break;
1485 	}
1486 
1487 	compl->status = status;
1488 	return (1);
1489 }
1490 
1491 static const char *
1492 nvme_fid_to_name(uint8_t fid)
1493 {
1494 	const char *name;
1495 
1496 	switch (fid) {
1497 	case NVME_FEAT_ARBITRATION:
1498 		name = "Arbitration";
1499 		break;
1500 	case NVME_FEAT_POWER_MANAGEMENT:
1501 		name = "Power Management";
1502 		break;
1503 	case NVME_FEAT_LBA_RANGE_TYPE:
1504 		name = "LBA Range Type";
1505 		break;
1506 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
1507 		name = "Temperature Threshold";
1508 		break;
1509 	case NVME_FEAT_ERROR_RECOVERY:
1510 		name = "Error Recovery";
1511 		break;
1512 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
1513 		name = "Volatile Write Cache";
1514 		break;
1515 	case NVME_FEAT_NUMBER_OF_QUEUES:
1516 		name = "Number of Queues";
1517 		break;
1518 	case NVME_FEAT_INTERRUPT_COALESCING:
1519 		name = "Interrupt Coalescing";
1520 		break;
1521 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1522 		name = "Interrupt Vector Configuration";
1523 		break;
1524 	case NVME_FEAT_WRITE_ATOMICITY:
1525 		name = "Write Atomicity Normal";
1526 		break;
1527 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1528 		name = "Asynchronous Event Configuration";
1529 		break;
1530 	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1531 		name = "Autonomous Power State Transition";
1532 		break;
1533 	case NVME_FEAT_HOST_MEMORY_BUFFER:
1534 		name = "Host Memory Buffer";
1535 		break;
1536 	case NVME_FEAT_TIMESTAMP:
1537 		name = "Timestamp";
1538 		break;
1539 	case NVME_FEAT_KEEP_ALIVE_TIMER:
1540 		name = "Keep Alive Timer";
1541 		break;
1542 	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1543 		name = "Host Controlled Thermal Management";
1544 		break;
1545 	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1546 		name = "Non-Operational Power State Config";
1547 		break;
1548 	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1549 		name = "Read Recovery Level Config";
1550 		break;
1551 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1552 		name = "Predictable Latency Mode Config";
1553 		break;
1554 	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1555 		name = "Predictable Latency Mode Window";
1556 		break;
1557 	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1558 		name = "LBA Status Information Report Interval";
1559 		break;
1560 	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1561 		name = "Host Behavior Support";
1562 		break;
1563 	case NVME_FEAT_SANITIZE_CONFIG:
1564 		name = "Sanitize Config";
1565 		break;
1566 	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1567 		name = "Endurance Group Event Configuration";
1568 		break;
1569 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1570 		name = "Software Progress Marker";
1571 		break;
1572 	case NVME_FEAT_HOST_IDENTIFIER:
1573 		name = "Host Identifier";
1574 		break;
1575 	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1576 		name = "Reservation Notification Mask";
1577 		break;
1578 	case NVME_FEAT_RESERVATION_PERSISTENCE:
1579 		name = "Reservation Persistence";
1580 		break;
1581 	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1582 		name = "Namespace Write Protection Config";
1583 		break;
1584 	default:
1585 		name = "Unknown";
1586 		break;
1587 	}
1588 
1589 	return (name);
1590 }
1591 
1592 static void
1593 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1594     struct nvme_feature_obj *feat,
1595     struct nvme_command *command,
1596     struct nvme_completion *compl)
1597 {
1598 
1599 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1600 }
1601 
1602 static void
1603 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1604     struct nvme_feature_obj *feat,
1605     struct nvme_command *command,
1606     struct nvme_completion *compl)
1607 {
1608 	uint32_t i;
1609 	uint32_t cdw11 = command->cdw11;
1610 	uint16_t iv;
1611 	bool cd;
1612 
1613 	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1614 
1615 	iv = cdw11 & 0xffff;
1616 	cd = cdw11 & (1 << 16);
1617 
1618 	if (iv > (sc->max_queues + 1)) {
1619 		return;
1620 	}
1621 
1622 	/* The Admin queue (IV 0) must have Coalescing Disable (CD) set */
1623 	if ((iv == 0) && !cd)
1624 		return;
1625 
1626 	/* Requested Interrupt Vector must be used by a CQ */
1627 	for (i = 0; i < sc->num_cqueues + 1; i++) {
1628 		if (sc->compl_queues[i].intr_vec == iv) {
1629 			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1630 		}
1631 	}
1632 
1633 }
1634 
1635 static void
1636 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1637     struct nvme_feature_obj *feat,
1638     struct nvme_command *command,
1639     struct nvme_completion *compl)
1640 {
1641 	uint16_t nqr;	/* Number of Queues Requested */
1642 
1643 	if (sc->num_q_is_set) {
1644 		WPRINTF("%s: Number of Queues already set", __func__);
1645 		pci_nvme_status_genc(&compl->status,
1646 		    NVME_SC_COMMAND_SEQUENCE_ERROR);
1647 		return;
1648 	}
1649 
1650 	nqr = command->cdw11 & 0xFFFF;
1651 	if (nqr == 0xffff) {
1652 		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1653 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1654 		return;
1655 	}
1656 
1657 	sc->num_squeues = ONE_BASED(nqr);
1658 	if (sc->num_squeues > sc->max_queues) {
1659 		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1660 					sc->max_queues);
1661 		sc->num_squeues = sc->max_queues;
1662 	}
1663 
1664 	nqr = (command->cdw11 >> 16) & 0xFFFF;
1665 	if (nqr == 0xffff) {
1666 		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1667 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1668 		return;
1669 	}
1670 
1671 	sc->num_cqueues = ONE_BASED(nqr);
1672 	if (sc->num_cqueues > sc->max_queues) {
1673 		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1674 					sc->max_queues);
1675 		sc->num_cqueues = sc->max_queues;
1676 	}
1677 
1678 	/* Patch the command value which will be saved on callback's return */
1679 	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1680 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1681 
1682 	sc->num_q_is_set = true;
1683 }
1684 
1685 static int
1686 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1687 	struct nvme_completion *compl)
1688 {
1689 	struct nvme_feature_obj *feat;
1690 	uint32_t nsid = command->nsid;
1691 	uint8_t fid = command->cdw10 & 0xFF;
1692 
1693 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1694 
1695 	if (fid >= NVME_FID_MAX) {
1696 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1697 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1698 		return (1);
1699 	}
1700 	feat = &sc->feat[fid];
1701 
1702 	if (!feat->namespace_specific &&
1703 	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1704 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1705 		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1706 		return (1);
1707 	}
1708 
1709 	compl->cdw0 = 0;
1710 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1711 
1712 	if (feat->set)
1713 		feat->set(sc, feat, command, compl);
1714 
1715 	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
1716 	if (compl->status == NVME_SC_SUCCESS) {
1717 		feat->cdw11 = command->cdw11;
1718 		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
1719 		    (command->cdw11 != 0))
1720 			pci_nvme_aen_notify(sc);
1721 	}
1722 
1723 	return (0);
1724 }
1725 
1726 static int
1727 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1728 	struct nvme_completion* compl)
1729 {
1730 	struct nvme_feature_obj *feat;
1731 	uint8_t fid = command->cdw10 & 0xFF;
1732 
1733 	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1734 
1735 	if (fid >= NVME_FID_MAX) {
1736 		DPRINTF("%s invalid feature 0x%x", __func__, fid);
1737 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1738 		return (1);
1739 	}
1740 
1741 	compl->cdw0 = 0;
1742 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1743 
1744 	feat = &sc->feat[fid];
1745 	if (feat->get) {
1746 		feat->get(sc, feat, command, compl);
1747 	}
1748 
1749 	if (compl->status == NVME_SC_SUCCESS) {
1750 		compl->cdw0 = feat->cdw11;
1751 	}
1752 
1753 	return (0);
1754 }
1755 
1756 static int
1757 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1758 	struct nvme_completion* compl)
1759 {
1760 	uint8_t	ses, lbaf, pi;
1761 
1762 	/* Only supports Secure Erase Setting - User Data Erase */
1763 	ses = (command->cdw10 >> 9) & 0x7;
1764 	if (ses > 0x1) {
1765 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1766 		return (1);
1767 	}
1768 
1769 	/* Only supports a single LBA Format */
1770 	lbaf = command->cdw10 & 0xf;
1771 	if (lbaf != 0) {
1772 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1773 		    NVME_SC_INVALID_FORMAT);
1774 		return (1);
1775 	}
1776 
1777 	/* Doesn't support Protection Information */
1778 	pi = (command->cdw10 >> 5) & 0x7;
1779 	if (pi != 0) {
1780 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1781 		return (1);
1782 	}
1783 
1784 	if (sc->nvstore.type == NVME_STOR_RAM) {
1785 		if (sc->nvstore.ctx)
1786 			free(sc->nvstore.ctx);
1787 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1788 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1789 	} else {
1790 		struct pci_nvme_ioreq *req;
1791 		int err;
1792 
1793 		req = pci_nvme_get_ioreq(sc);
1794 		if (req == NULL) {
1795 			pci_nvme_status_genc(&compl->status,
1796 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1797 			WPRINTF("%s: unable to allocate IO req", __func__);
1798 			return (1);
1799 		}
1800 		req->nvme_sq = &sc->submit_queues[0];
1801 		req->sqid = 0;
1802 		req->opc = command->opc;
1803 		req->cid = command->cid;
1804 		req->nsid = command->nsid;
1805 
1806 		req->io_req.br_offset = 0;
1807 		req->io_req.br_resid = sc->nvstore.size;
1808 		req->io_req.br_callback = pci_nvme_io_done;
1809 
1810 		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1811 		if (err) {
1812 			pci_nvme_status_genc(&compl->status,
1813 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1814 			pci_nvme_release_ioreq(sc, req);
1815 		}
1816 	}
1817 
1818 	return (1);
1819 }
1820 
1821 static int
1822 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1823 	struct nvme_completion* compl)
1824 {
1825 	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1826 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1827 
1828 	/* TODO: search for the command ID and abort it */
1829 
1830 	compl->cdw0 = 1;
1831 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1832 	return (1);
1833 }
1834 
1835 static int
1836 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1837 	struct nvme_command* command, struct nvme_completion* compl)
1838 {
1839 	DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
1840 	    sc->aer_count, sc->ctrldata.aerl, command->cid);
1841 
1842 	/* Don't exceed the Async Event Request Limit (AERL). */
1843 	if (pci_nvme_aer_limit_reached(sc)) {
1844 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1845 				NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1846 		return (1);
1847 	}
1848 
1849 	if (pci_nvme_aer_add(sc, command->cid)) {
1850 		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1851 				NVME_SC_INTERNAL_DEVICE_ERROR);
1852 		return (1);
1853 	}
1854 
1855 	/*
1856 	 * Raise events when they happen based on the Set Features cmd.
1857 	 * These events happen asynchronously, so leave this completion
1858 	 * un-posted until an event matching the request actually occurs.
1859 	 */
1860 	compl->status = NVME_NO_STATUS;
1861 	pci_nvme_aen_notify(sc);
1862 
1863 	return (0);
1864 }
1865 
1866 static void
1867 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1868 {
1869 	struct nvme_completion compl;
1870 	struct nvme_command *cmd;
1871 	struct nvme_submission_queue *sq;
1872 	struct nvme_completion_queue *cq;
1873 	uint16_t sqhead;
1874 
1875 	DPRINTF("%s index %u", __func__, (uint32_t)value);
1876 
1877 	sq = &sc->submit_queues[0];
1878 	cq = &sc->compl_queues[0];
1879 
1880 	pthread_mutex_lock(&sq->mtx);
1881 
1882 	sqhead = sq->head;
1883 	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1884 
1885 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1886 		cmd = &(sq->qbase)[sqhead];
1887 		compl.cdw0 = 0;
1888 		compl.status = 0;
1889 
1890 		switch (cmd->opc) {
1891 		case NVME_OPC_DELETE_IO_SQ:
1892 			DPRINTF("%s command DELETE_IO_SQ", __func__);
1893 			nvme_opc_delete_io_sq(sc, cmd, &compl);
1894 			break;
1895 		case NVME_OPC_CREATE_IO_SQ:
1896 			DPRINTF("%s command CREATE_IO_SQ", __func__);
1897 			nvme_opc_create_io_sq(sc, cmd, &compl);
1898 			break;
1899 		case NVME_OPC_DELETE_IO_CQ:
1900 			DPRINTF("%s command DELETE_IO_CQ", __func__);
1901 			nvme_opc_delete_io_cq(sc, cmd, &compl);
1902 			break;
1903 		case NVME_OPC_CREATE_IO_CQ:
1904 			DPRINTF("%s command CREATE_IO_CQ", __func__);
1905 			nvme_opc_create_io_cq(sc, cmd, &compl);
1906 			break;
1907 		case NVME_OPC_GET_LOG_PAGE:
1908 			DPRINTF("%s command GET_LOG_PAGE", __func__);
1909 			nvme_opc_get_log_page(sc, cmd, &compl);
1910 			break;
1911 		case NVME_OPC_IDENTIFY:
1912 			DPRINTF("%s command IDENTIFY", __func__);
1913 			nvme_opc_identify(sc, cmd, &compl);
1914 			break;
1915 		case NVME_OPC_ABORT:
1916 			DPRINTF("%s command ABORT", __func__);
1917 			nvme_opc_abort(sc, cmd, &compl);
1918 			break;
1919 		case NVME_OPC_SET_FEATURES:
1920 			DPRINTF("%s command SET_FEATURES", __func__);
1921 			nvme_opc_set_features(sc, cmd, &compl);
1922 			break;
1923 		case NVME_OPC_GET_FEATURES:
1924 			DPRINTF("%s command GET_FEATURES", __func__);
1925 			nvme_opc_get_features(sc, cmd, &compl);
1926 			break;
1927 		case NVME_OPC_FIRMWARE_ACTIVATE:
1928 			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
1929 			pci_nvme_status_tc(&compl.status,
1930 			    NVME_SCT_COMMAND_SPECIFIC,
1931 			    NVME_SC_INVALID_FIRMWARE_SLOT);
1932 			break;
1933 		case NVME_OPC_ASYNC_EVENT_REQUEST:
1934 			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
1935 			nvme_opc_async_event_req(sc, cmd, &compl);
1936 			break;
1937 		case NVME_OPC_FORMAT_NVM:
1938 			DPRINTF("%s command FORMAT_NVM", __func__);
1939 			if ((sc->ctrldata.oacs &
1940 			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
1941 				pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
				break;
1942 			}
1943 			compl.status = NVME_NO_STATUS;
1944 			nvme_opc_format_nvm(sc, cmd, &compl);
1945 			break;
1946 		default:
1947 			DPRINTF("0x%x command is not implemented",
1948 			    cmd->opc);
1949 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1950 		}
1951 		sqhead = (sqhead + 1) % sq->size;
1952 
1953 		if (NVME_COMPLETION_VALID(compl)) {
1954 			pci_nvme_cq_update(sc, &sc->compl_queues[0],
1955 			    compl.cdw0,
1956 			    cmd->cid,
1957 			    0,		/* SQID */
1958 			    compl.status);
1959 		}
1960 	}
1961 
1962 	DPRINTF("setting sqhead %u", sqhead);
1963 	sq->head = sqhead;
1964 
1965 	if (cq->head != cq->tail)
1966 		pci_generate_msix(sc->nsc_pi, 0);
1967 
1968 	pthread_mutex_unlock(&sq->mtx);
1969 }
1970 
1971 /*
1972  * Update the Write and Read statistics reported in SMART data
1973  *
1974  * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
1975  * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
1976  * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
1977  */
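/*
 * Illustrative example: on a fresh device, a single 2 MiB write with 512
 * byte sectors adds 4,096 blocks to the initial remainder of 999, so the
 * loop below credits 5 data units and leaves a remainder of 95.
 */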
1978 static void
1979 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
1980     size_t bytes, uint16_t status)
1981 {
1982 
1983 	pthread_mutex_lock(&sc->mtx);
1984 	switch (opc) {
1985 	case NVME_OPC_WRITE:
1986 		sc->write_commands++;
1987 		if (status != NVME_SC_SUCCESS)
1988 			break;
1989 		sc->write_dunits_remainder += (bytes / 512);
1990 		while (sc->write_dunits_remainder >= 1000) {
1991 			sc->write_data_units++;
1992 			sc->write_dunits_remainder -= 1000;
1993 		}
1994 		break;
1995 	case NVME_OPC_READ:
1996 		sc->read_commands++;
1997 		if (status != NVME_SC_SUCCESS)
1998 			break;
1999 		sc->read_dunits_remainder += (bytes / 512);
2000 		while (sc->read_dunits_remainder >= 1000) {
2001 			sc->read_data_units++;
2002 			sc->read_dunits_remainder -= 1000;
2003 		}
2004 		break;
2005 	default:
2006 		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
2007 		break;
2008 	}
2009 	pthread_mutex_unlock(&sc->mtx);
2010 }
2011 
2012 /*
2013  * Check if the combination of Starting LBA (slba) and Number of Logical
2014  * Blocks (nlb) exceeds the range of the underlying storage.
2015  *
2016  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
2017  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
2018  * overflow.
2019  */
2020 static bool
2021 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
2022     uint32_t nlb)
2023 {
2024 	size_t	offset, bytes;
2025 
2026 	/* Overflow check of multiplying Starting LBA by the sector size */
2027 	if (slba >> (64 - nvstore->sectsz_bits))
2028 		return (true);
2029 
2030 	offset = slba << nvstore->sectsz_bits;
2031 	bytes = (uint64_t)nlb << nvstore->sectsz_bits;
2032 
2033 	/* Range check of offset plus Number of Logical Blocks */
2034 	if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes))
2035 		return (true);
2036 
2037 	return (false);
2038 }
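/*
 * Illustrative case: with 512 byte sectors (sectsz_bits == 9), an SLBA with
 * any of its top 9 bits set would wrap when shifted into a byte offset, so
 * the "slba >> (64 - sectsz_bits)" test above rejects it before the shift
 * can overflow.
 */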
2039 
2040 static int
2041 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
2042 	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
2043 {
2044 	int iovidx;
2045 
2046 	if (req == NULL)
2047 		return (-1);
2048 
2049 	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
2050 		return (-1);
2051 	}
2052 
2053 	/* concatenate contig block-iovs to minimize number of iovs */
2054 	if ((req->io_req.br_iovcnt != 0) &&
	    ((req->prev_gpaddr + req->prev_size) == gpaddr)) {
2055 		iovidx = req->io_req.br_iovcnt - 1;
2056 
2057 		req->io_req.br_iov[iovidx].iov_base =
2058 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2059 				     req->prev_gpaddr, size);
2060 
2061 		req->prev_size += size;
2062 		req->io_req.br_resid += size;
2063 
2064 		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
2065 	} else {
2066 		iovidx = req->io_req.br_iovcnt;
2067 		if (iovidx == 0) {
2068 			req->io_req.br_offset = lba;
2069 			req->io_req.br_resid = 0;
2070 			req->io_req.br_param = req;
2071 		}
2072 
2073 		req->io_req.br_iov[iovidx].iov_base =
2074 		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2075 				     gpaddr, size);
2076 
2077 		req->io_req.br_iov[iovidx].iov_len = size;
2078 
2079 		req->prev_gpaddr = gpaddr;
2080 		req->prev_size = size;
2081 		req->io_req.br_resid += size;
2082 
2083 		req->io_req.br_iovcnt++;
2084 	}
2085 
2086 	return (0);
2087 }
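/*
 * Example of the coalescing above (hypothetical guest addresses): if two
 * consecutive PRP entries describe 4 KiB pages at 0x10000 and 0x11000, the
 * second call finds prev_gpaddr + prev_size == 0x11000 and grows the
 * existing iov to 8 KiB instead of starting a new entry.
 */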
2088 
2089 static void
2090 pci_nvme_set_completion(struct pci_nvme_softc *sc,
2091 	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
2092 	uint32_t cdw0, uint16_t status)
2093 {
2094 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
2095 
2096 	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
2097 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
2098 		 NVME_STATUS_GET_SC(status));
2099 
2100 	pci_nvme_cq_update(sc, cq,
2101 	    cdw0,
2102 	    cid,
2103 	    sqid,
2104 	    status);
2105 
2106 	if (cq->head != cq->tail) {
2107 		if (cq->intr_en & NVME_CQ_INTEN) {
2108 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
2109 		} else {
2110 			DPRINTF("%s: CQ%u interrupt disabled",
2111 						__func__, sq->cqid);
2112 		}
2113 	}
2114 }
2115 
2116 static void
2117 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
2118 {
2119 	req->sc = NULL;
2120 	req->nvme_sq = NULL;
2121 	req->sqid = 0;
2122 
2123 	pthread_mutex_lock(&sc->mtx);
2124 
2125 	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
2126 	sc->pending_ios--;
2127 
2128 	/* With no IO pending, set ready if the device was reset/enabled */
2129 	if (sc->pending_ios == 0 &&
2130 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
2131 		sc->regs.csts |= NVME_CSTS_RDY;
2132 
2133 	pthread_mutex_unlock(&sc->mtx);
2134 
2135 	sem_post(&sc->iosemlock);
2136 }
2137 
2138 static struct pci_nvme_ioreq *
2139 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
2140 {
2141 	struct pci_nvme_ioreq *req = NULL;
2142 
2143 	sem_wait(&sc->iosemlock);
2144 	pthread_mutex_lock(&sc->mtx);
2145 
2146 	req = STAILQ_FIRST(&sc->ioreqs_free);
2147 	assert(req != NULL);
2148 	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
2149 
2150 	req->sc = sc;
2151 
2152 	sc->pending_ios++;
2153 
2154 	pthread_mutex_unlock(&sc->mtx);
2155 
2156 	req->io_req.br_iovcnt = 0;
2157 	req->io_req.br_offset = 0;
2158 	req->io_req.br_resid = 0;
2159 	req->io_req.br_param = req;
2160 	req->prev_gpaddr = 0;
2161 	req->prev_size = 0;
2162 
2163 	return req;
2164 }
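/*
 * Note: iosemlock is initialized to the number of ioslots, so
 * pci_nvme_get_ioreq() blocks once every slot is in flight and
 * pci_nvme_release_ioreq() wakes a waiter via sem_post().
 */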
2165 
2166 static void
2167 pci_nvme_io_done(struct blockif_req *br, int err)
2168 {
2169 	struct pci_nvme_ioreq *req = br->br_param;
2170 	struct nvme_submission_queue *sq = req->nvme_sq;
2171 	uint16_t code, status;
2172 
2173 #ifndef __FreeBSD__
2174 	status = 0;
2175 #endif
2176 
2177 	DPRINTF("%s error %d %s", __func__, err, strerror(err));
2178 
2179 	/* TODO return correct error */
2180 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
2181 	pci_nvme_status_genc(&status, code);
2182 
2183 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
2184 	pci_nvme_stats_write_read_update(req->sc, req->opc,
2185 	    req->bytes, status);
2186 	pci_nvme_release_ioreq(req->sc, req);
2187 }
2188 
2189 /*
2190  * Implements the Flush command. The specification states:
2191  *    If a volatile write cache is not present, Flush commands complete
2192  *    successfully and have no effect
2193  * in the description of the Volatile Write Cache (VWC) field of the Identify
2194  * Controller data. Therefore, set status to Success if the command is
2195  * not supported (i.e. RAM or as indicated by the blockif).
2196  */
2197 static bool
2198 nvme_opc_flush(struct pci_nvme_softc *sc,
2199     struct nvme_command *cmd,
2200     struct pci_nvme_blockstore *nvstore,
2201     struct pci_nvme_ioreq *req,
2202     uint16_t *status)
2203 {
2204 	bool pending = false;
2205 
2206 	if (nvstore->type == NVME_STOR_RAM) {
2207 		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2208 	} else {
2209 		int err;
2210 
2211 		req->io_req.br_callback = pci_nvme_io_done;
2212 
2213 		err = blockif_flush(nvstore->ctx, &req->io_req);
2214 		switch (err) {
2215 		case 0:
2216 			pending = true;
2217 			break;
2218 		case EOPNOTSUPP:
2219 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2220 			break;
2221 		default:
2222 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2223 		}
2224 	}
2225 
2226 	return (pending);
2227 }
2228 
2229 static uint16_t
2230 nvme_write_read_ram(struct pci_nvme_softc *sc,
2231     struct pci_nvme_blockstore *nvstore,
2232     uint64_t prp1, uint64_t prp2,
2233     size_t offset, uint64_t bytes,
2234     bool is_write)
2235 {
2236 	uint8_t *buf = nvstore->ctx;
2237 	enum nvme_copy_dir dir;
2238 	uint16_t status;
2239 
2240 #ifndef __FreeBSD__
2241 	status = 0;
2242 #endif
2243 
2244 	if (is_write)
2245 		dir = NVME_COPY_TO_PRP;
2246 	else
2247 		dir = NVME_COPY_FROM_PRP;
2248 
2249 	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
2250 	    buf + offset, bytes, dir))
2251 		pci_nvme_status_genc(&status,
2252 		    NVME_SC_DATA_TRANSFER_ERROR);
2253 	else
2254 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2255 
2256 	return (status);
2257 }
2258 
2259 static uint16_t
2260 nvme_write_read_blockif(struct pci_nvme_softc *sc,
2261     struct pci_nvme_blockstore *nvstore,
2262     struct pci_nvme_ioreq *req,
2263     uint64_t prp1, uint64_t prp2,
2264     size_t offset, uint64_t bytes,
2265     bool is_write)
2266 {
2267 	uint64_t size;
2268 	int err;
2269 	uint16_t status = NVME_NO_STATUS;
2270 
2271 	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
2272 	if (pci_nvme_append_iov_req(sc, req, prp1,
2273 	    size, is_write, offset)) {
2274 		pci_nvme_status_genc(&status,
2275 		    NVME_SC_DATA_TRANSFER_ERROR);
2276 		goto out;
2277 	}
2278 
2279 	offset += size;
2280 	bytes  -= size;
2281 
2282 	if (bytes == 0) {
2283 		;
2284 	} else if (bytes <= PAGE_SIZE) {
2285 		size = bytes;
2286 		if (pci_nvme_append_iov_req(sc, req, prp2,
2287 		    size, is_write, offset)) {
2288 			pci_nvme_status_genc(&status,
2289 			    NVME_SC_DATA_TRANSFER_ERROR);
2290 			goto out;
2291 		}
2292 	} else {
2293 		void *vmctx = sc->nsc_pi->pi_vmctx;
2294 		uint64_t *prp_list = &prp2;
2295 		uint64_t *last = prp_list;
2296 
2297 		/* PRP2 is pointer to a physical region page list */
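		/*
		 * Each list entry holds a guest physical page address. The
		 * walk below resolves the list lazily via paddr_guest2host()
		 * and, when more than a page of data remains, chases the
		 * final entry (tracked via 'last') to the next list page.
		 */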
2298 		while (bytes) {
2299 			/* Last entry in list points to the next list */
2300 			if ((prp_list == last) && (bytes > PAGE_SIZE)) {
2301 				uint64_t prp = *prp_list;
2302 
2303 				prp_list = paddr_guest2host(vmctx, prp,
2304 				    PAGE_SIZE - (prp % PAGE_SIZE));
2305 				last = prp_list + (NVME_PRP2_ITEMS - 1);
2306 			}
2307 
2308 			size = MIN(bytes, PAGE_SIZE);
2309 
2310 			if (pci_nvme_append_iov_req(sc, req, *prp_list,
2311 			    size, is_write, offset)) {
2312 				pci_nvme_status_genc(&status,
2313 				    NVME_SC_DATA_TRANSFER_ERROR);
2314 				goto out;
2315 			}
2316 
2317 			offset += size;
2318 			bytes  -= size;
2319 
2320 			prp_list++;
2321 		}
2322 	}
2323 	req->io_req.br_callback = pci_nvme_io_done;
2324 	if (is_write)
2325 		err = blockif_write(nvstore->ctx, &req->io_req);
2326 	else
2327 		err = blockif_read(nvstore->ctx, &req->io_req);
2328 
2329 	if (err)
2330 		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2331 out:
2332 	return (status);
2333 }
2334 
2335 static bool
2336 nvme_opc_write_read(struct pci_nvme_softc *sc,
2337     struct nvme_command *cmd,
2338     struct pci_nvme_blockstore *nvstore,
2339     struct pci_nvme_ioreq *req,
2340     uint16_t *status)
2341 {
2342 	uint64_t lba, nblocks, bytes;
2343 	size_t offset;
2344 	bool is_write = cmd->opc == NVME_OPC_WRITE;
2345 	bool pending = false;
2346 
2347 #ifndef __FreeBSD__
2348 	bytes = 0;
2349 #endif
2350 
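	/*
	 * The Starting LBA spans CDW10 (low 32 bits) and CDW11 (high 32
	 * bits); the Number of Logical Blocks in CDW12 is a 0's based
	 * value, hence the +1 below.
	 */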
2351 	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2352 	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2353 
2354 	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2355 		WPRINTF("%s command would exceed LBA range", __func__);
2356 		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2357 		goto out;
2358 	}
2359 
2360 	bytes  = nblocks << nvstore->sectsz_bits;
2361 	if (bytes > NVME_MAX_DATA_SIZE) {
2362 		WPRINTF("%s command would exceed MDTS", __func__);
2363 		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2364 		goto out;
2365 	}
2366 
2367 	offset = lba << nvstore->sectsz_bits;
2368 
2369 	req->bytes = bytes;
2370 	req->io_req.br_offset = lba;
2371 
2372 	/* PRP bits 1:0 must be zero */
2373 	cmd->prp1 &= ~0x3UL;
2374 	cmd->prp2 &= ~0x3UL;
2375 
2376 	if (nvstore->type == NVME_STOR_RAM) {
2377 		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2378 		    cmd->prp2, offset, bytes, is_write);
2379 	} else {
2380 		*status = nvme_write_read_blockif(sc, nvstore, req,
2381 		    cmd->prp1, cmd->prp2, offset, bytes, is_write);
2382 
2383 		if (*status == NVME_NO_STATUS)
2384 			pending = true;
2385 	}
2386 out:
2387 	if (!pending)
2388 		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2389 
2390 	return (pending);
2391 }
2392 
2393 static void
2394 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2395 {
2396 	struct pci_nvme_ioreq *req = br->br_param;
2397 	struct pci_nvme_softc *sc = req->sc;
2398 	bool done = true;
2399 	uint16_t status;
2400 
2401 #ifndef __FreeBSD__
2402 	status = 0;
2403 #endif
2404 
2405 	if (err) {
2406 		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2407 	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2408 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2409 	} else {
2410 		struct iovec *iov = req->io_req.br_iov;
2411 
2412 		req->prev_gpaddr++;
2413 		iov += req->prev_gpaddr;
2414 
2415 		/* The iov_* values already include the sector size */
2416 		req->io_req.br_offset = (off_t)iov->iov_base;
2417 		req->io_req.br_resid = iov->iov_len;
2418 		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2419 			pci_nvme_status_genc(&status,
2420 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2421 		} else
2422 			done = false;
2423 	}
2424 
2425 	if (done) {
2426 		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2427 		    req->cid, 0, status);
2428 		pci_nvme_release_ioreq(sc, req);
2429 	}
2430 }
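/*
 * Note: for multi-range deallocations the br_iov entries are repurposed to
 * hold byte offset/length pairs rather than guest addresses, while
 * prev_gpaddr and prev_size serve as the current index and total entry
 * count (see nvme_opc_dataset_mgmt() below).
 */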
2431 
2432 static bool
2433 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2434     struct nvme_command *cmd,
2435     struct pci_nvme_blockstore *nvstore,
2436     struct pci_nvme_ioreq *req,
2437     uint16_t *status)
2438 {
2439 	struct nvme_dsm_range *range;
2440 	uint32_t nr, r, non_zero, dr;
2441 	int err;
2442 	bool pending = false;
2443 
2444 #ifndef __FreeBSD__
2445 	range = NULL;
2446 #endif
2447 
2448 	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2449 		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2450 		goto out;
2451 	}
2452 
2453 	nr = cmd->cdw10 & 0xff;
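	/* Number of Ranges is a 0's based value: nr == 0 means one range */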
2454 
2455 	/* copy locally because a range entry could straddle PRPs */
2456 	range = calloc(1, NVME_MAX_DSM_TRIM);
2457 	if (range == NULL) {
2458 		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2459 		goto out;
2460 	}
2461 	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2462 	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2463 
2464 	/* Check for invalid ranges and the number of non-zero lengths */
2465 	non_zero = 0;
2466 	for (r = 0; r <= nr; r++) {
2467 		if (pci_nvme_out_of_range(nvstore,
2468 		    range[r].starting_lba, range[r].length)) {
2469 			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2470 			goto out;
2471 		}
2472 		if (range[r].length != 0)
2473 			non_zero++;
2474 	}
2475 
2476 	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2477 		size_t offset, bytes;
2478 		int sectsz_bits = sc->nvstore.sectsz_bits;
2479 
2480 		/*
2481 		 * DSM calls are advisory only, and compliant controllers
2482 		 * may choose to take no actions (i.e. return Success).
2483 		 */
2484 		if (!nvstore->deallocate) {
2485 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2486 			goto out;
2487 		}
2488 
2489 		/* If all ranges have a zero length, return Success */
2490 		if (non_zero == 0) {
2491 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2492 			goto out;
2493 		}
2494 
2495 		if (req == NULL) {
2496 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2497 			goto out;
2498 		}
2499 
2500 		offset = range[0].starting_lba << sectsz_bits;
2501 		bytes = range[0].length << sectsz_bits;
2502 
2503 		/*
2504 		 * If the request is for more than a single range, store
2505 		 * the ranges in the br_iov. Optimize for the common case
2506 		 * of a single range.
2507 		 *
2508 		 * Note that NVMe Number of Ranges is a zero based value
2509 		 */
2510 		req->io_req.br_iovcnt = 0;
2511 		req->io_req.br_offset = offset;
2512 		req->io_req.br_resid = bytes;
2513 
2514 		if (nr == 0) {
2515 			req->io_req.br_callback = pci_nvme_io_done;
2516 		} else {
2517 			struct iovec *iov = req->io_req.br_iov;
2518 
2519 			for (r = 0, dr = 0; r <= nr; r++) {
2520 				offset = range[r].starting_lba << sectsz_bits;
2521 				bytes = range[r].length << sectsz_bits;
2522 				if (bytes == 0)
2523 					continue;
2524 
2525 				if ((nvstore->size - offset) < bytes) {
2526 					pci_nvme_status_genc(status,
2527 					    NVME_SC_LBA_OUT_OF_RANGE);
2528 					goto out;
2529 				}
2530 				iov[dr].iov_base = (void *)offset;
2531 				iov[dr].iov_len = bytes;
2532 				dr++;
2533 			}
2534 			req->io_req.br_callback = pci_nvme_dealloc_sm;
2535 
2536 			/*
2537 			 * Use prev_gpaddr to track the current entry and
2538 			 * prev_size to track the number of entries
2539 			 */
2540 			req->prev_gpaddr = 0;
2541 			req->prev_size = dr;
2542 		}
2543 
2544 		err = blockif_delete(nvstore->ctx, &req->io_req);
2545 		if (err)
2546 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2547 		else
2548 			pending = true;
2549 	}
2550 out:
2551 	free(range);
2552 	return (pending);
2553 }
2554 
2555 static void
2556 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2557 {
2558 	struct nvme_submission_queue *sq;
2559 	uint16_t status;
2560 	uint16_t sqhead;
2561 
2562 #ifndef __FreeBSD__
2563 	status = 0;
2564 #endif
2565 
2566 	/* handle all submissions up to sq->tail index */
2567 	sq = &sc->submit_queues[idx];
2568 
2569 	pthread_mutex_lock(&sq->mtx);
2570 
2571 	sqhead = sq->head;
2572 	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2573 	         idx, sqhead, sq->tail, sq->qbase);
2574 
2575 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
2576 		struct nvme_command *cmd;
2577 		struct pci_nvme_ioreq *req;
2578 		uint32_t nsid;
2579 		bool pending;
2580 
2581 		pending = false;
2582 		req = NULL;
2583 		status = 0;
2584 
2585 		cmd = &sq->qbase[sqhead];
2586 		sqhead = (sqhead + 1) % sq->size;
2587 
2588 		nsid = le32toh(cmd->nsid);
2589 		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2590 			pci_nvme_status_genc(&status,
2591 			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2592 			status |=
2593 			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2594 			goto complete;
2595 		}
2596 
2597 		req = pci_nvme_get_ioreq(sc);
2598 		if (req == NULL) {
2599 			pci_nvme_status_genc(&status,
2600 			    NVME_SC_INTERNAL_DEVICE_ERROR);
2601 			WPRINTF("%s: unable to allocate IO req", __func__);
2602 			goto complete;
2603 		}
2604 		req->nvme_sq = sq;
2605 		req->sqid = idx;
2606 		req->opc = cmd->opc;
2607 		req->cid = cmd->cid;
2608 		req->nsid = cmd->nsid;
2609 
2610 		switch (cmd->opc) {
2611 		case NVME_OPC_FLUSH:
2612 			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2613 			    req, &status);
2614 			break;
2615 		case NVME_OPC_WRITE:
2616 		case NVME_OPC_READ:
2617 			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2618 			    req, &status);
2619 			break;
2620 		case NVME_OPC_WRITE_ZEROES:
2621 			/* TODO: write zeroes
2622 			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2623 			        __func__, lba, cmd->cdw12 & 0xFFFF); */
2624 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2625 			break;
2626 		case NVME_OPC_DATASET_MANAGEMENT:
2627 			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2628 			    req, &status);
2629 			break;
2630 		default:
2631 			WPRINTF("%s unhandled io command 0x%x",
2632 			    __func__, cmd->opc);
2633 			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2634 		}
2635 complete:
2636 		if (!pending) {
2637 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2638 			    status);
2639 			if (req != NULL)
2640 				pci_nvme_release_ioreq(sc, req);
2641 		}
2642 	}
2643 
2644 	sq->head = sqhead;
2645 
2646 	pthread_mutex_unlock(&sq->mtx);
2647 }
2648 
2649 static void
2650 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2651 	uint64_t idx, int is_sq, uint64_t value)
2652 {
2653 	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2654 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2655 
2656 	if (is_sq) {
2657 		if (idx > sc->num_squeues) {
2658 			WPRINTF("%s queue index %lu overflow from "
2659 			         "guest (max %u)",
2660 			         __func__, idx, sc->num_squeues);
2661 			return;
2662 		}
2663 
2664 		atomic_store_short(&sc->submit_queues[idx].tail,
2665 		                   (uint16_t)value);
2666 
2667 		if (idx == 0) {
2668 			pci_nvme_handle_admin_cmd(sc, value);
2669 		} else {
2670 			/* submission queue; handle new entries in SQ */
2671 			if (idx > sc->num_squeues) {
2672 				WPRINTF("%s SQ index %lu overflow from "
2673 				         "guest (max %u)",
2674 				         __func__, idx, sc->num_squeues);
2675 				return;
2676 			}
2677 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2678 		}
2679 	} else {
2680 		if (idx > sc->num_cqueues) {
2681 			WPRINTF("%s queue index %lu overflow from "
2682 			         "guest (max %u)",
2683 			         __func__, idx, sc->num_cqueues);
2684 			return;
2685 		}
2686 
2687 		atomic_store_short(&sc->compl_queues[idx].head,
2688 				(uint16_t)value);
2689 	}
2690 }
2691 
2692 static void
2693 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2694 {
2695 	const char *s = iswrite ? "WRITE" : "READ";
2696 
2697 	switch (offset) {
2698 	case NVME_CR_CAP_LOW:
2699 		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2700 		break;
2701 	case NVME_CR_CAP_HI:
2702 		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2703 		break;
2704 	case NVME_CR_VS:
2705 		DPRINTF("%s %s NVME_CR_VS", func, s);
2706 		break;
2707 	case NVME_CR_INTMS:
2708 		DPRINTF("%s %s NVME_CR_INTMS", func, s);
2709 		break;
2710 	case NVME_CR_INTMC:
2711 		DPRINTF("%s %s NVME_CR_INTMC", func, s);
2712 		break;
2713 	case NVME_CR_CC:
2714 		DPRINTF("%s %s NVME_CR_CC", func, s);
2715 		break;
2716 	case NVME_CR_CSTS:
2717 		DPRINTF("%s %s NVME_CR_CSTS", func, s);
2718 		break;
2719 	case NVME_CR_NSSR:
2720 		DPRINTF("%s %s NVME_CR_NSSR", func, s);
2721 		break;
2722 	case NVME_CR_AQA:
2723 		DPRINTF("%s %s NVME_CR_AQA", func, s);
2724 		break;
2725 	case NVME_CR_ASQ_LOW:
2726 		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2727 		break;
2728 	case NVME_CR_ASQ_HI:
2729 		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2730 		break;
2731 	case NVME_CR_ACQ_LOW:
2732 		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2733 		break;
2734 	case NVME_CR_ACQ_HI:
2735 		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2736 		break;
2737 	default:
2738 		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2739 	}
2740 
2741 }
2742 
2743 static void
2744 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2745 	uint64_t offset, int size, uint64_t value)
2746 {
2747 	uint32_t ccreg;
2748 
2749 	if (offset >= NVME_DOORBELL_OFFSET) {
2750 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2751 		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
2752 		int is_sq = (belloffset % 8) < 4;
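		/*
		 * This layout assumes a doorbell stride of 0: each queue
		 * pair owns 8 bytes, with the SQ tail doorbell in the low
		 * 4 bytes and the CQ head doorbell in the high 4 bytes,
		 * hence the divide/modulo by 8 above.
		 */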
2753 
2754 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2755 			WPRINTF("guest attempted an overflow write offset "
2756 			         "0x%lx, val 0x%lx in %s",
2757 			         offset, value, __func__);
2758 			return;
2759 		}
2760 
2761 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2762 		return;
2763 	}
2764 
2765 	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2766 	        offset, size, value);
2767 
2768 	if (size != 4) {
2769 		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2770 		         "val 0x%lx) to bar0 in %s",
2771 		         size, offset, value, __func__);
2772 		/* TODO: shutdown device */
2773 		return;
2774 	}
2775 
2776 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2777 
2778 	pthread_mutex_lock(&sc->mtx);
2779 
2780 	switch (offset) {
2781 	case NVME_CR_CAP_LOW:
2782 	case NVME_CR_CAP_HI:
2783 		/* readonly */
2784 		break;
2785 	case NVME_CR_VS:
2786 		/* readonly */
2787 		break;
2788 	case NVME_CR_INTMS:
2789 		/* MSI-X, so ignore */
2790 		break;
2791 	case NVME_CR_INTMC:
2792 		/* MSI-X, so ignore */
2793 		break;
2794 	case NVME_CR_CC:
2795 		ccreg = (uint32_t)value;
2796 
2797 		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2798 		         "iocqes %u",
2799 		        __func__,
2800 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2801 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2802 			 NVME_CC_GET_IOCQES(ccreg));
2803 
2804 		if (NVME_CC_GET_SHN(ccreg)) {
2805 			/* perform shutdown - flush out data to backend */
2806 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2807 			    NVME_CSTS_REG_SHST_SHIFT);
2808 			sc->regs.csts |= NVME_SHST_COMPLETE <<
2809 			    NVME_CSTS_REG_SHST_SHIFT;
2810 		}
2811 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2812 			if (NVME_CC_GET_EN(ccreg) == 0)
2813 				/* transition 1->0 causes controller reset */
2814 				pci_nvme_reset_locked(sc);
2815 			else
2816 				pci_nvme_init_controller(ctx, sc);
2817 		}
2818 
2819 		/* Insert the iocqes, iosqes and en bits from the write */
2820 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2821 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2822 		if (NVME_CC_GET_EN(ccreg) == 0) {
2823 			/* Insert the ams, mps and css bit fields */
2824 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2825 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2826 			sc->regs.csts &= ~NVME_CSTS_RDY;
2827 		} else if (sc->pending_ios == 0) {
2828 			sc->regs.csts |= NVME_CSTS_RDY;
2829 		}
2830 		break;
2831 	case NVME_CR_CSTS:
2832 		break;
2833 	case NVME_CR_NSSR:
2834 		/* ignore writes; don't support subsystem reset */
2835 		break;
2836 	case NVME_CR_AQA:
2837 		sc->regs.aqa = (uint32_t)value;
2838 		break;
2839 	case NVME_CR_ASQ_LOW:
2840 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
2841 		               (0xFFFFF000 & value);
2842 		break;
2843 	case NVME_CR_ASQ_HI:
2844 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
2845 		               (value << 32);
2846 		break;
2847 	case NVME_CR_ACQ_LOW:
2848 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
2849 		               (0xFFFFF000 & value);
2850 		break;
2851 	case NVME_CR_ACQ_HI:
2852 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
2853 		               (value << 32);
2854 		break;
2855 	default:
2856 		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
2857 		         __func__, offset, value, size);
2858 	}
2859 	pthread_mutex_unlock(&sc->mtx);
2860 }
2861 
2862 static void
2863 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
2864                 int baridx, uint64_t offset, int size, uint64_t value)
2865 {
2866 	struct pci_nvme_softc* sc = pi->pi_arg;
2867 
2868 	if (baridx == pci_msix_table_bar(pi) ||
2869 	    baridx == pci_msix_pba_bar(pi)) {
2870 		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
2871 		         " value 0x%lx", baridx, offset, size, value);
2872 
2873 		pci_emul_msix_twrite(pi, offset, size, value);
2874 		return;
2875 	}
2876 
2877 	switch (baridx) {
2878 	case 0:
2879 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
2880 		break;
2881 
2882 	default:
2883 		DPRINTF("%s unknown baridx %d, val 0x%lx",
2884 		         __func__, baridx, value);
2885 	}
2886 }
2887 
2888 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
2889 	uint64_t offset, int size)
2890 {
2891 	uint64_t value;
2892 
2893 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
2894 
2895 	if (offset < NVME_DOORBELL_OFFSET) {
2896 		void *p = &(sc->regs);
2897 		pthread_mutex_lock(&sc->mtx);
2898 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
2899 		pthread_mutex_unlock(&sc->mtx);
2900 	} else {
2901 		value = 0;
2902 		WPRINTF("pci_nvme: read invalid offset %ld", offset);
2903 	}
2904 
2905 	switch (size) {
2906 	case 1:
2907 		value &= 0xFF;
2908 		break;
2909 	case 2:
2910 		value &= 0xFFFF;
2911 		break;
2912 	case 4:
2913 		value &= 0xFFFFFFFF;
2914 		break;
2915 	}
2916 
2917 	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
2918 	         offset, size, (uint32_t)value);
2919 
2920 	return (value);
2921 }
2922 
2923 
2924 
2925 static uint64_t
2926 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2927     uint64_t offset, int size)
2928 {
2929 	struct pci_nvme_softc* sc = pi->pi_arg;
2930 
2931 	if (baridx == pci_msix_table_bar(pi) ||
2932 	    baridx == pci_msix_pba_bar(pi)) {
2933 		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2934 		        baridx, offset, size);
2935 
2936 		return pci_emul_msix_tread(pi, offset, size);
2937 	}
2938 
2939 	switch (baridx) {
2940 	case 0:
2941 		return pci_nvme_read_bar_0(sc, offset, size);
2942 
2943 	default:
2944 		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
2945 	}
2946 
2947 	return (0);
2948 }
2949 
2950 static int
2951 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
2952 {
2953 	char bident[sizeof("XX:X:X")];
2954 	const char *value;
2955 	uint32_t sectsz;
2956 
2957 	sc->max_queues = NVME_QUEUES;
2958 	sc->max_qentries = NVME_MAX_QENTRIES;
2959 	sc->ioslots = NVME_IOSLOTS;
2960 	sc->num_squeues = sc->max_queues;
2961 	sc->num_cqueues = sc->max_queues;
2962 	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2963 	sectsz = 0;
2964 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2965 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2966 
2967 	value = get_config_value_node(nvl, "maxq");
2968 	if (value != NULL)
2969 		sc->max_queues = atoi(value);
2970 	value = get_config_value_node(nvl, "qsz");
2971 	if (value != NULL) {
2972 		sc->max_qentries = atoi(value);
2973 		if (sc->max_qentries <= 0) {
2974 			EPRINTLN("nvme: Invalid qsz option %d",
2975 			    sc->max_qentries);
2976 			return (-1);
2977 		}
2978 	}
2979 	value = get_config_value_node(nvl, "ioslots");
2980 	if (value != NULL) {
2981 		sc->ioslots = atoi(value);
2982 		if (sc->ioslots <= 0) {
2983 			EPRINTLN("Invalid ioslots option %d", sc->ioslots);
2984 			return (-1);
2985 		}
2986 	}
2987 	value = get_config_value_node(nvl, "sectsz");
2988 	if (value != NULL)
2989 		sectsz = atoi(value);
2990 	value = get_config_value_node(nvl, "ser");
2991 	if (value != NULL) {
2992 		/*
2993 		 * This field indicates the Product Serial Number in
2994 		 * 7-bit ASCII, unused bytes should be space characters.
2995 		 * Ref: NVMe v1.3c.
2996 		 */
2997 		cpywithpad((char *)sc->ctrldata.sn,
2998 		    sizeof(sc->ctrldata.sn), value, ' ');
2999 	}
3000 	value = get_config_value_node(nvl, "eui64");
3001 	if (value != NULL)
3002 		sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
3003 	value = get_config_value_node(nvl, "dsm");
3004 	if (value != NULL) {
3005 		if (strcmp(value, "auto") == 0)
3006 			sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3007 		else if (strcmp(value, "enable") == 0)
3008 			sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
3009 		else if (strcmp(value, "disable") == 0)
3010 			sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
3011 	}
3012 
3013 	value = get_config_value_node(nvl, "ram");
3014 	if (value != NULL) {
3015 		uint64_t sz = strtoull(value, NULL, 10);
3016 
3017 		sc->nvstore.type = NVME_STOR_RAM;
3018 		sc->nvstore.size = sz * 1024 * 1024;
3019 		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
3020 		sc->nvstore.sectsz = 4096;
3021 		sc->nvstore.sectsz_bits = 12;
3022 		if (sc->nvstore.ctx == NULL) {
3023 			EPRINTLN("nvme: Unable to allocate RAM");
3024 			return (-1);
3025 		}
3026 	} else {
3027 		snprintf(bident, sizeof(bident), "%d:%d",
3028 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3029 		sc->nvstore.ctx = blockif_open(nvl, bident);
3030 		if (sc->nvstore.ctx == NULL) {
3031 			EPRINTLN("nvme: Could not open backing file: %s",
3032 			    strerror(errno));
3033 			return (-1);
3034 		}
3035 		sc->nvstore.type = NVME_STOR_BLOCKIF;
3036 		sc->nvstore.size = blockif_size(sc->nvstore.ctx);
3037 	}
3038 
3039 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
3040 		sc->nvstore.sectsz = sectsz;
3041 	else if (sc->nvstore.type != NVME_STOR_RAM)
3042 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
3043 	for (sc->nvstore.sectsz_bits = 9;
3044 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
3045 	     sc->nvstore.sectsz_bits++);
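	/* e.g. sectsz 512 -> sectsz_bits 9, 4096 -> 12, 8192 -> 13 */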
3046 
3047 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
3048 		sc->max_queues = NVME_QUEUES;
3049 
3050 	return (0);
3051 }
3052 
3053 static void
3054 pci_nvme_resized(struct blockif_ctxt *bctxt, void *arg, size_t new_size)
3055 {
3056 	struct pci_nvme_softc *sc;
3057 	struct pci_nvme_blockstore *nvstore;
3058 	struct nvme_namespace_data *nd;
3059 
3060 	sc = arg;
3061 	nvstore = &sc->nvstore;
3062 	nd = &sc->nsdata;
3063 
3064 	nvstore->size = new_size;
3065 	pci_nvme_init_nsdata_size(nvstore, nd);
3066 
3067 	/* Add changed NSID to list */
3068 	sc->ns_log.ns[0] = 1;
3069 	sc->ns_log.ns[1] = 0;
3070 
3071 	pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
3072 	    PCI_NVME_AE_INFO_NS_ATTR_CHANGED);
3073 }
3074 
3075 static int
3076 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
3077 {
3078 	struct pci_nvme_softc *sc;
3079 	uint32_t pci_membar_sz;
3080 	int	error;
3081 
3082 	error = 0;
3083 
3084 	sc = calloc(1, sizeof(struct pci_nvme_softc));
3085 	pi->pi_arg = sc;
3086 	sc->nsc_pi = pi;
3087 
3088 	error = pci_nvme_parse_config(sc, nvl);
3089 	if (error < 0)
3090 		goto done;
3091 	else
3092 		error = 0;
3093 
3094 	STAILQ_INIT(&sc->ioreqs_free);
3095 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
3096 	for (int i = 0; i < sc->ioslots; i++) {
3097 		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
3098 	}
3099 
3100 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
3101 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
3102 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
3103 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
3104 	pci_set_cfgdata8(pi, PCIR_PROGIF,
3105 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
3106 
3107 	/*
3108 	 * Allocate size of NVMe registers + doorbell space for all queues.
3109 	 *
3110 	 * The specification requires a minimum memory I/O window size of 16K.
3111 	 * The Windows driver will refuse to start a device with a smaller
3112 	 * window.
3113 	 */
3114 	pci_membar_sz = sizeof(struct nvme_registers) +
3115 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
3116 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
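	/*
	 * For example, with maxq=16 the doorbell area adds only
	 * 2 * 4 * 17 = 136 bytes to the register block, so the 16 KiB
	 * NVME_MMIO_SPACE_MIN floor normally determines the BAR size.
	 */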
3117 
3118 	DPRINTF("nvme membar size: %u", pci_membar_sz);
3119 
3120 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
3121 	if (error) {
3122 		WPRINTF("%s pci alloc mem bar failed", __func__);
3123 		goto done;
3124 	}
3125 
3126 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
3127 	if (error) {
3128 		WPRINTF("%s pci add msixcap failed", __func__);
3129 		goto done;
3130 	}
3131 
3132 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
3133 	if (error) {
3134 		WPRINTF("%s pci add Express capability failed", __func__);
3135 		goto done;
3136 	}
3137 
3138 	pthread_mutex_init(&sc->mtx, NULL);
3139 	sem_init(&sc->iosemlock, 0, sc->ioslots);
3140 	if (sc->nvstore.type == NVME_STOR_BLOCKIF)
		blockif_register_resize_callback(sc->nvstore.ctx,
		    pci_nvme_resized, sc);
3141 
3142 	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
3143 	/*
3144 	 * Controller data depends on Namespace data so initialize Namespace
3145 	 * data first.
3146 	 */
3147 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
3148 	pci_nvme_init_ctrldata(sc);
3149 	pci_nvme_init_logpages(sc);
3150 	pci_nvme_init_features(sc);
3151 
3152 	pci_nvme_aer_init(sc);
3153 	pci_nvme_aen_init(sc);
3154 
3155 	pci_nvme_reset(sc);
3156 
3157 	pci_lintr_request(pi);
3158 
3159 done:
3160 	return (error);
3161 }
3162 
3163 static int
3164 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
3165 {
3166 	char *cp, *ram;
3167 
3168 	if (opts == NULL)
3169 		return (0);
3170 
3171 	if (strncmp(opts, "ram=", 4) == 0) {
3172 		cp = strchr(opts, ',');
3173 		if (cp == NULL) {
3174 			set_config_value_node(nvl, "ram", opts + 4);
3175 			return (0);
3176 		}
3177 		ram = strndup(opts + 4, cp - opts - 4);
3178 		set_config_value_node(nvl, "ram", ram);
3179 		free(ram);
3180 		return (pci_parse_legacy_config(nvl, cp + 1));
3181 	} else
3182 		return (blockif_legacy_config(nvl, opts));
3183 }
3184 
3185 struct pci_devemu pci_de_nvme = {
3186 	.pe_emu =	"nvme",
3187 	.pe_init =	pci_nvme_init,
3188 	.pe_legacy_config = pci_nvme_legacy_config,
3189 	.pe_barwrite =	pci_nvme_write,
3190 	.pe_barread =	pci_nvme_read
3191 };
3192 PCI_EMUL_SET(pci_de_nvme);
3193