xref: /illumos-gate/usr/src/cmd/bhyve/pci_nvme.c (revision c94be9439c4f0773ef60e2cec21d548359cfea20)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  *
7  * Function crc16 Copyright (c) 2017, Fedor Uporov
8  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 /*
33  * bhyve PCIe-NVMe device emulation.
34  *
35  * options:
36  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#
37  *
38  *  accepted devpath:
39  *    /dev/blockdev
40  *    /path/to/image
41  *    ram=size_in_MiB
42  *
43  *  maxq    = max number of queues
44  *  qsz     = max elements in each queue
45  *  ioslots = max number of concurrent io requests
46  *  sectsz  = sector size (defaults to blockif sector size)
47  *  ser     = serial number (20-chars max)
48  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
49  *
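 *  example (the slot number, device path and values below are
 *  illustrative placeholders only):
 *    -s 4,nvme,/dev/zvol/rdsk/tank/nvme0,maxq=4,qsz=512,ioslots=16,ser=NVMEEMU
 *    -s 4,nvme,ram=1024,sectsz=512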
50  */
51 
52 /* TODO:
53     - create async event for smart and log
54     - intr coalesce
55  */
56 
57 #include <sys/cdefs.h>
58 __FBSDID("$FreeBSD$");
59 
60 #include <sys/types.h>
61 #include <net/ieee_oui.h>
62 #ifndef __FreeBSD__
63 #include <endian.h>
64 #endif
65 
66 #include <assert.h>
67 #include <pthread.h>
68 #include <semaphore.h>
69 #include <stdbool.h>
70 #include <stddef.h>
71 #include <stdint.h>
72 #include <stdio.h>
73 #include <stdlib.h>
74 #include <string.h>
75 
76 #include <machine/atomic.h>
77 #include <machine/vmm.h>
78 #include <vmmapi.h>
79 
80 #include <dev/nvme/nvme.h>
81 
82 #include "bhyverun.h"
83 #include "block_if.h"
84 #include "debug.h"
85 #include "pci_emul.h"
86 
87 
88 static int nvme_debug = 0;
89 #define	DPRINTF(params) if (nvme_debug) PRINTLN params
90 #define	WPRINTF(params) PRINTLN params
91 
92 /* defaults; can be overridden */
93 #define	NVME_MSIX_BAR		4
94 
95 #define	NVME_IOSLOTS		8
96 
97 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
98 #define NVME_MMIO_SPACE_MIN	(1 << 14)
99 
100 #define	NVME_QUEUES		16
101 #define	NVME_MAX_QENTRIES	2048
102 
103 #define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
104 #define	NVME_MAX_BLOCKIOVS	512
105 
106 /* This is a synthetic status code to indicate there is no status */
107 #define NVME_NO_STATUS		0xffff
108 #define NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)
109 
110 /* helpers */
111 
112 /* Convert a zero-based value into a one-based value */
113 #define ONE_BASED(zero)		((zero) + 1)
114 /* Convert a one-based value into a zero-based value */
115 #define ZERO_BASED(one)		((one)  - 1)
116 
117 /* Encode number of SQ's and CQ's for Set/Get Features */
118 #define NVME_FEATURE_NUM_QUEUES(sc) \
119 	(ZERO_BASED((sc)->num_squeues) & 0xffff) | \
120 	(ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16;
121 
122 #define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)
123 
124 enum nvme_controller_register_offsets {
125 	NVME_CR_CAP_LOW = 0x00,
126 	NVME_CR_CAP_HI  = 0x04,
127 	NVME_CR_VS      = 0x08,
128 	NVME_CR_INTMS   = 0x0c,
129 	NVME_CR_INTMC   = 0x10,
130 	NVME_CR_CC      = 0x14,
131 	NVME_CR_CSTS    = 0x1c,
132 	NVME_CR_NSSR    = 0x20,
133 	NVME_CR_AQA     = 0x24,
134 	NVME_CR_ASQ_LOW = 0x28,
135 	NVME_CR_ASQ_HI  = 0x2c,
136 	NVME_CR_ACQ_LOW = 0x30,
137 	NVME_CR_ACQ_HI  = 0x34,
138 };
139 
140 enum nvme_cmd_cdw11 {
141 	NVME_CMD_CDW11_PC  = 0x0001,
142 	NVME_CMD_CDW11_IEN = 0x0002,
143 	NVME_CMD_CDW11_IV  = 0xFFFF0000,
144 };
145 
146 enum nvme_copy_dir {
147 	NVME_COPY_TO_PRP,
148 	NVME_COPY_FROM_PRP,
149 };
150 
151 #define	NVME_CQ_INTEN	0x01
152 #define	NVME_CQ_INTCOAL	0x02
153 
154 struct nvme_completion_queue {
155 	struct nvme_completion *qbase;
156 	uint32_t	size;
157 	uint16_t	tail; /* nvme progress */
158 	uint16_t	head; /* guest progress */
159 	uint16_t	intr_vec;
160 	uint32_t	intr_en;
161 	pthread_mutex_t	mtx;
162 };
163 
164 struct nvme_submission_queue {
165 	struct nvme_command *qbase;
166 	uint32_t	size;
167 	uint16_t	head; /* nvme progress */
168 	uint16_t	tail; /* guest progress */
169 	uint16_t	cqid; /* completion queue id */
170 	int		busy; /* queue is being processed */
171 	int		qpriority;
172 };
173 
174 enum nvme_storage_type {
175 	NVME_STOR_BLOCKIF = 0,
176 	NVME_STOR_RAM = 1,
177 };
178 
179 struct pci_nvme_blockstore {
180 	enum nvme_storage_type type;
181 	void		*ctx;
182 	uint64_t	size;
183 	uint32_t	sectsz;
184 	uint32_t	sectsz_bits;
185 	uint64_t	eui64;
186 	uint32_t	deallocate:1;
187 };
188 
189 struct pci_nvme_ioreq {
190 	struct pci_nvme_softc *sc;
191 	STAILQ_ENTRY(pci_nvme_ioreq) link;
192 	struct nvme_submission_queue *nvme_sq;
193 	uint16_t	sqid;
194 
195 	/* command information */
196 	uint16_t	opc;
197 	uint16_t	cid;
198 	uint32_t	nsid;
199 
200 	uint64_t	prev_gpaddr;
201 	size_t		prev_size;
202 
203 	/*
204 	 * lock if all iovs consumed (big IO);
205 	 * complete transaction before continuing
206 	 */
207 	pthread_mutex_t	mtx;
208 	pthread_cond_t	cv;
209 
210 	struct blockif_req io_req;
211 
212 	/* pad to fit up to 512 page descriptors from guest IO request */
213 	struct iovec	iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX];
214 };
215 
216 enum nvme_dsm_type {
217 	/* Dataset Management bit in ONCS reflects backing storage capability */
218 	NVME_DATASET_MANAGEMENT_AUTO,
219 	/* Unconditionally set Dataset Management bit in ONCS */
220 	NVME_DATASET_MANAGEMENT_ENABLE,
221 	/* Unconditionally clear Dataset Management bit in ONCS */
222 	NVME_DATASET_MANAGEMENT_DISABLE,
223 };
224 
225 struct pci_nvme_softc {
226 	struct pci_devinst *nsc_pi;
227 
228 	pthread_mutex_t	mtx;
229 
230 	struct nvme_registers regs;
231 
232 	struct nvme_namespace_data  nsdata;
233 	struct nvme_controller_data ctrldata;
234 	struct nvme_error_information_entry err_log;
235 	struct nvme_health_information_page health_log;
236 	struct nvme_firmware_page fw_log;
237 
238 	struct pci_nvme_blockstore nvstore;
239 
240 	uint16_t	max_qentries;	/* max entries per queue */
241 	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
242 	uint32_t	num_cqueues;
243 	uint32_t	num_squeues;
244 
245 	struct pci_nvme_ioreq *ioreqs;
246 	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
247 	uint32_t	pending_ios;
248 	uint32_t	ioslots;
249 	sem_t		iosemlock;
250 
251 	/*
252 	 * Memory mapped Submission and Completion queues
253 	 * Each array includes both Admin and IO queues
254 	 */
255 	struct nvme_completion_queue *compl_queues;
256 	struct nvme_submission_queue *submit_queues;
257 
258 	/* controller features */
259 	uint32_t	intr_coales_aggr_time;   /* 0x08: uS to delay intr */
260 	uint32_t	intr_coales_aggr_thresh; /* 0x08: compl-Q entries */
261 	uint32_t	async_ev_config;         /* 0x0B: async event config */
262 
263 	enum nvme_dsm_type dataset_management;
264 };
265 
266 
267 static void pci_nvme_io_partial(struct blockif_req *br, int err);
268 
269 /* Controller Configuration utils */
270 #define	NVME_CC_GET_EN(cc) \
271 	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
272 #define	NVME_CC_GET_CSS(cc) \
273 	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
274 #define	NVME_CC_GET_SHN(cc) \
275 	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
276 #define	NVME_CC_GET_IOSQES(cc) \
277 	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
278 #define	NVME_CC_GET_IOCQES(cc) \
279 	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
280 
281 #define	NVME_CC_WRITE_MASK \
282 	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
283 	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
284 	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
285 
286 #define	NVME_CC_NEN_WRITE_MASK \
287 	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
288 	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
289 	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
290 
291 /* Controller Status utils */
292 #define	NVME_CSTS_GET_RDY(sts) \
293 	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
294 
295 #define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
296 
297 /* Completion Queue status word utils */
298 #define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
299 #define	NVME_STATUS_MASK \
300 	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
301 	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
302 
303 #define NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
304 	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
305 
306 static __inline void
307 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
308 {
309 	size_t len;
310 
311 	len = strnlen(src, dst_size);
312 	memset(dst, pad, dst_size);
313 	memcpy(dst, src, len);
314 }
315 
316 static __inline void
317 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
318 {
319 
320 	*status &= ~NVME_STATUS_MASK;
321 	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
322 		(code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
323 }
324 
325 static __inline void
326 pci_nvme_status_genc(uint16_t *status, uint16_t code)
327 {
328 
329 	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
330 }
331 
332 static __inline void
333 pci_nvme_toggle_phase(uint16_t *status, int prev)
334 {
335 
336 	if (prev)
337 		*status &= ~NVME_STATUS_P;
338 	else
339 		*status |= NVME_STATUS_P;
340 }
341 
342 static void
343 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
344 {
345 	struct nvme_controller_data *cd = &sc->ctrldata;
346 
347 	cd->vid = 0xFB5D;
348 	cd->ssvid = 0x0000;
349 
350 	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
351 	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
352 
353 	/* Num of submission commands that we can handle at a time (2^rab) */
354 	cd->rab   = 4;
355 
356 	/* FreeBSD OUI */
357 	cd->ieee[0] = 0x58;
358 	cd->ieee[1] = 0x9c;
359 	cd->ieee[2] = 0xfc;
360 
361 	cd->mic = 0;
362 
363 	cd->mdts = 9;	/* max data transfer size (2^mdts * CAP.MPSMIN) */
364 
365 	cd->ver = 0x00010300;
366 
367 	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
368 	cd->acl = 2;
369 	cd->aerl = 4;
370 
371 	cd->lpa = 0;	/* TODO: support some simple things like SMART */
372 	cd->elpe = 0;	/* max error log page entries */
373 	cd->npss = 1;	/* number of power states supported */
374 
375 	/* Warning Composite Temperature Threshold */
376 	cd->wctemp = 0x0157;
377 
378 	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
379 	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
380 	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
381 	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
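	/* SQES/CQES are log2 entry sizes: 2^6 = 64-byte SQEs, 2^4 = 16-byte CQEs */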
382 	cd->nn = 1;	/* number of namespaces */
383 
384 	cd->oncs = 0;
385 	switch (sc->dataset_management) {
386 	case NVME_DATASET_MANAGEMENT_AUTO:
387 		if (sc->nvstore.deallocate)
388 			cd->oncs |= NVME_ONCS_DSM;
389 		break;
390 	case NVME_DATASET_MANAGEMENT_ENABLE:
391 		cd->oncs |= NVME_ONCS_DSM;
392 		break;
393 	default:
394 		break;
395 	}
396 
397 	cd->fna = 0x03;
398 
399 	cd->power_state[0].mp = 10;
400 }
401 
402 /*
403  * Calculate the CRC-16 of the given buffer
404  * See copyright attribution at top of file
405  */
406 static uint16_t
407 crc16(uint16_t crc, const void *buffer, unsigned int len)
408 {
409 	const unsigned char *cp = buffer;
410 	/* CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1). */
411 	static uint16_t const crc16_table[256] = {
412 		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
413 		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
414 		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
415 		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
416 		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
417 		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
418 		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
419 		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
420 		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
421 		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
422 		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
423 		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
424 		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
425 		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
426 		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
427 		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
428 		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
429 		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
430 		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
431 		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
432 		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
433 		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
434 		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
435 		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
436 		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
437 		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
438 		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
439 		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
440 		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
441 		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
442 		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
443 		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
444 	};
445 
446 	while (len--)
447 		crc = (((crc >> 8) & 0xffU) ^
448 		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
449 	return crc;
450 }
451 
452 static void
453 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
454     struct nvme_namespace_data *nd, uint32_t nsid,
455     struct pci_nvme_blockstore *nvstore)
456 {
457 
458 	/* Get capacity and block size information from backing store */
459 	nd->nsze = nvstore->size / nvstore->sectsz;
460 	nd->ncap = nd->nsze;
461 	nd->nuse = nd->nsze;
462 
463 	if (nvstore->type == NVME_STOR_BLOCKIF)
464 		nvstore->deallocate = blockif_candelete(nvstore->ctx);
465 
466 	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
467 	nd->flbas = 0;
468 
469 	/* Create an EUI-64 if user did not provide one */
470 	if (nvstore->eui64 == 0) {
471 		char *data = NULL;
472 		uint64_t eui64 = nvstore->eui64;
473 
474 		asprintf(&data, "%s%u%u%u", vmname, sc->nsc_pi->pi_bus,
475 		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
476 
477 		if (data != NULL) {
478 			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
479 			free(data);
480 		}
481 		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
482 	}
483 	be64enc(nd->eui64, nvstore->eui64);
484 
485 	/* LBA data-sz = 2^lbads */
486 	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
487 }
488 
489 static void
490 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
491 {
492 
493 	memset(&sc->err_log, 0, sizeof(sc->err_log));
494 	memset(&sc->health_log, 0, sizeof(sc->health_log));
495 	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
496 }
497 
498 static void
499 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
500 {
501 	DPRINTF(("%s", __func__));
502 
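	/*
	 * CAP advertises MQES from max_qentries (zero-based), requires
	 * physically contiguous queues (CQR), and reports a ready timeout
	 * (TO) of 60 units of 500 ms, i.e. 30 seconds.
	 */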
503 	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
504 	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
505 	    (60 << NVME_CAP_LO_REG_TO_SHIFT);
506 
507 	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
508 
509 	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */
510 
511 	sc->regs.cc = 0;
512 	sc->regs.csts = 0;
513 
514 	sc->num_cqueues = sc->num_squeues = sc->max_queues;
515 	if (sc->submit_queues != NULL) {
516 		for (int i = 0; i < sc->num_squeues + 1; i++) {
517 			/*
518 			 * The Admin Submission Queue is at index 0.
519 			 * It must not be changed at reset otherwise the
520 			 * emulation will be out of sync with the guest.
521 			 */
522 			if (i != 0) {
523 				sc->submit_queues[i].qbase = NULL;
524 				sc->submit_queues[i].size = 0;
525 				sc->submit_queues[i].cqid = 0;
526 			}
527 			sc->submit_queues[i].tail = 0;
528 			sc->submit_queues[i].head = 0;
529 			sc->submit_queues[i].busy = 0;
530 		}
531 	} else
532 		sc->submit_queues = calloc(sc->num_squeues + 1,
533 		                        sizeof(struct nvme_submission_queue));
534 
535 	if (sc->compl_queues != NULL) {
536 		for (int i = 0; i < sc->num_cqueues + 1; i++) {
537 			/* See Admin Submission Queue note above */
538 			if (i != 0) {
539 				sc->compl_queues[i].qbase = NULL;
540 				sc->compl_queues[i].size = 0;
541 			}
542 
543 			sc->compl_queues[i].tail = 0;
544 			sc->compl_queues[i].head = 0;
545 		}
546 	} else {
547 		sc->compl_queues = calloc(sc->num_cqueues + 1,
548 		                        sizeof(struct nvme_completion_queue));
549 
550 		for (int i = 0; i < sc->num_cqueues + 1; i++)
551 			pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
552 	}
553 }
554 
555 static void
556 pci_nvme_reset(struct pci_nvme_softc *sc)
557 {
558 	pthread_mutex_lock(&sc->mtx);
559 	pci_nvme_reset_locked(sc);
560 	pthread_mutex_unlock(&sc->mtx);
561 }
562 
563 static void
564 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
565 {
566 	uint16_t acqs, asqs;
567 
568 	DPRINTF(("%s", __func__));
569 
570 	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
571 	sc->submit_queues[0].size = asqs;
572 	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
573 	            sizeof(struct nvme_command) * asqs);
574 
575 	DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p",
576 	        __func__, sc->regs.asq, sc->submit_queues[0].qbase));
577 
578 	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
579 	    NVME_AQA_REG_ACQS_MASK) + 1;
580 	sc->compl_queues[0].size = acqs;
581 	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
582 	         sizeof(struct nvme_completion) * acqs);
583 	DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p",
584 	        __func__, sc->regs.acq, sc->compl_queues[0].qbase));
585 }
586 
587 static int
588 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
589 	size_t len, enum nvme_copy_dir dir)
590 {
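	/*
	 * Note: prp2 is used as a direct data pointer here, not as a PRP
	 * list, so this helper can copy at most two pages of data (hence
	 * the 8 KiB cap below).
	 */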
591 	uint8_t *p;
592 	size_t bytes;
593 
594 	if (len > (8 * 1024)) {
595 		return (-1);
596 	}
597 
598 	/* Copy from the start of prp1 to the end of the physical page */
599 	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
600 	bytes = MIN(bytes, len);
601 
602 	p = vm_map_gpa(ctx, prp1, bytes);
603 	if (p == NULL) {
604 		return (-1);
605 	}
606 
607 	if (dir == NVME_COPY_TO_PRP)
608 		memcpy(p, b, bytes);
609 	else
610 		memcpy(b, p, bytes);
611 
612 	b += bytes;
613 
614 	len -= bytes;
615 	if (len == 0) {
616 		return (0);
617 	}
618 
619 	len = MIN(len, PAGE_SIZE);
620 
621 	p = vm_map_gpa(ctx, prp2, len);
622 	if (p == NULL) {
623 		return (-1);
624 	}
625 
626 	if (dir == NVME_COPY_TO_PRP)
627 		memcpy(p, b, len);
628 	else
629 		memcpy(b, p, len);
630 
631 	return (0);
632 }
633 
634 static int
635 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
636 	struct nvme_completion* compl)
637 {
638 	uint16_t qid = command->cdw10 & 0xffff;
639 
640 	DPRINTF(("%s DELETE_IO_SQ %u", __func__, qid));
641 	if (qid == 0 || qid > sc->num_squeues) {
642 		WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u",
643 		        __func__, qid, sc->num_squeues));
644 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
645 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
646 		return (1);
647 	}
648 
649 	sc->submit_queues[qid].qbase = NULL;
650 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
651 	return (1);
652 }
653 
654 static int
655 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
656 	struct nvme_completion* compl)
657 {
658 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
659 		uint16_t qid = command->cdw10 & 0xffff;
660 		struct nvme_submission_queue *nsq;
661 
662 		if ((qid == 0) || (qid > sc->num_squeues)) {
663 			WPRINTF(("%s queue index %u > num_squeues %u",
664 			        __func__, qid, sc->num_squeues));
665 			pci_nvme_status_tc(&compl->status,
666 			    NVME_SCT_COMMAND_SPECIFIC,
667 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
668 			return (1);
669 		}
670 
671 		nsq = &sc->submit_queues[qid];
672 		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
673 
674 		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
675 		              sizeof(struct nvme_command) * (size_t)nsq->size);
676 		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
677 		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
678 
679 		DPRINTF(("%s sq %u size %u gaddr %p cqid %u", __func__,
680 		        qid, nsq->size, nsq->qbase, nsq->cqid));
681 
682 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
683 
684 		DPRINTF(("%s completed creating IOSQ qid %u",
685 		         __func__, qid));
686 	} else {
687 		/*
688 		 * Guest sent a non-contiguous submission queue request.
689 		 * This setting is unsupported by this emulation.
690 		 */
691 		WPRINTF(("%s unsupported non-contig (list-based) "
692 		         "create i/o submission queue", __func__));
693 
694 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
695 	}
696 	return (1);
697 }
698 
699 static int
700 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
701 	struct nvme_completion* compl)
702 {
703 	uint16_t qid = command->cdw10 & 0xffff;
704 
705 	DPRINTF(("%s DELETE_IO_CQ %u", __func__, qid));
706 	if (qid == 0 || qid > sc->num_cqueues) {
707 		WPRINTF(("%s queue index %u / num_cqueues %u",
708 		        __func__, qid, sc->num_cqueues));
709 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
710 		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
711 		return (1);
712 	}
713 
714 	sc->compl_queues[qid].qbase = NULL;
715 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
716 	return (1);
717 }
718 
719 static int
720 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
721 	struct nvme_completion* compl)
722 {
723 	if (command->cdw11 & NVME_CMD_CDW11_PC) {
724 		uint16_t qid = command->cdw10 & 0xffff;
725 		struct nvme_completion_queue *ncq;
726 
727 		if ((qid == 0) || (qid > sc->num_cqueues)) {
728 			WPRINTF(("%s queue index %u > num_cqueues %u",
729 			        __func__, qid, sc->num_cqueues));
730 			pci_nvme_status_tc(&compl->status,
731 			    NVME_SCT_COMMAND_SPECIFIC,
732 			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
733 			return (1);
734 		}
735 
736 		ncq = &sc->compl_queues[qid];
737 		ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
738 		ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
739 		ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
740 
741 		ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
742 		             command->prp1,
743 		             sizeof(struct nvme_completion) * (size_t)ncq->size);
744 
745 		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
746 	} else {
747 		/*
748 		 * Non-contig completion queue unsupported.
749 		 */
750 		WPRINTF(("%s unsupported non-contig (list-based) "
751 		         "create i/o completion queue",
752 		         __func__));
753 
754 		/* 0x12 = Invalid Use of Controller Memory Buffer */
755 		pci_nvme_status_genc(&compl->status, 0x12);
756 	}
757 
758 	return (1);
759 }
760 
761 static int
762 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
763 	struct nvme_completion* compl)
764 {
765 	uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 4;
766 	uint8_t logpage = command->cdw10 & 0xFF;
767 
768 	DPRINTF(("%s log page %u len %u", __func__, logpage, logsize));
769 
770 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
771 
772 	switch (logpage) {
773 	case NVME_LOG_ERROR:
774 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
775 		    command->prp2, (uint8_t *)&sc->err_log, logsize,
776 		    NVME_COPY_TO_PRP);
777 		break;
778 	case NVME_LOG_HEALTH_INFORMATION:
779 		/* TODO: present some smart info */
780 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
781 		    command->prp2, (uint8_t *)&sc->health_log, logsize,
782 		    NVME_COPY_TO_PRP);
783 		break;
784 	case NVME_LOG_FIRMWARE_SLOT:
785 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
786 		    command->prp2, (uint8_t *)&sc->fw_log, logsize,
787 		    NVME_COPY_TO_PRP);
788 		break;
789 	default:
790 		WPRINTF(("%s get log page %x command not supported",
791 		        __func__, logpage));
792 
793 		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
794 		    NVME_SC_INVALID_LOG_PAGE);
795 	}
796 
797 	return (1);
798 }
799 
800 static int
801 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
802 	struct nvme_completion* compl)
803 {
804 	void *dest;
805 
806 	DPRINTF(("%s identify 0x%x nsid 0x%x", __func__,
807 	        command->cdw10 & 0xFF, command->nsid));
808 
809 	switch (command->cdw10 & 0xFF) {
810 	case 0x00: /* return Identify Namespace data structure */
811 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
812 		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
813 		    NVME_COPY_TO_PRP);
814 		break;
815 	case 0x01: /* return Identify Controller data structure */
816 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
817 		    command->prp2, (uint8_t *)&sc->ctrldata,
818 		    sizeof(sc->ctrldata),
819 		    NVME_COPY_TO_PRP);
820 		break;
821 	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
822 		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
823 		                  sizeof(uint32_t) * 1024);
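		/* only NSID 1 is active (ctrldata.nn == 1); a zero entry ends the list */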
824 		((uint32_t *)dest)[0] = 1;
825 		((uint32_t *)dest)[1] = 0;
826 		break;
827 	case 0x11:
828 		pci_nvme_status_genc(&compl->status,
829 		    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
830 		return (1);
831 	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
832 	case 0x10:
833 	case 0x12:
834 	case 0x13:
835 	case 0x14:
836 	case 0x15:
837 	default:
838 		DPRINTF(("%s unsupported identify command requested 0x%x",
839 		         __func__, command->cdw10 & 0xFF));
840 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
841 		return (1);
842 	}
843 
844 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
845 	return (1);
846 }
847 
848 static int
849 nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command,
850 	struct nvme_completion* compl)
851 {
852 	uint16_t nqr;	/* Number of Queues Requested */
853 
854 	nqr = command->cdw11 & 0xFFFF;
855 	if (nqr == 0xffff) {
856 		WPRINTF(("%s: Illegal NSQR value %#x", __func__, nqr));
857 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
858 		return (-1);
859 	}
860 
861 	sc->num_squeues = ONE_BASED(nqr);
862 	if (sc->num_squeues > sc->max_queues) {
863 		DPRINTF(("NSQR=%u is greater than max %u", sc->num_squeues,
864 					sc->max_queues));
865 		sc->num_squeues = sc->max_queues;
866 	}
867 
868 	nqr = (command->cdw11 >> 16) & 0xFFFF;
869 	if (nqr == 0xffff) {
870 		WPRINTF(("%s: Illegal NCQR value %#x", __func__, nqr));
871 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
872 		return (-1);
873 	}
874 
875 	sc->num_cqueues = ONE_BASED(nqr);
876 	if (sc->num_cqueues > sc->max_queues) {
877 		DPRINTF(("NCQR=%u is greater than max %u", sc->num_cqueues,
878 					sc->max_queues));
879 		sc->num_cqueues = sc->max_queues;
880 	}
881 
882 	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
883 
884 	return (0);
885 }
886 
887 static int
888 nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command,
889 	struct nvme_completion* compl)
890 {
891 	int feature = command->cdw10 & 0xFF;
892 	uint32_t iv;
893 
894 	DPRINTF(("%s feature 0x%x", __func__, feature));
895 	compl->cdw0 = 0;
896 
897 	switch (feature) {
898 	case NVME_FEAT_ARBITRATION:
899 		DPRINTF(("  arbitration 0x%x", command->cdw11));
900 		break;
901 	case NVME_FEAT_POWER_MANAGEMENT:
902 		DPRINTF(("  power management 0x%x", command->cdw11));
903 		break;
904 	case NVME_FEAT_LBA_RANGE_TYPE:
905 		DPRINTF(("  lba range 0x%x", command->cdw11));
906 		break;
907 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
908 		DPRINTF(("  temperature threshold 0x%x", command->cdw11));
909 		break;
910 	case NVME_FEAT_ERROR_RECOVERY:
911 		DPRINTF(("  error recovery 0x%x", command->cdw11));
912 		break;
913 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
914 		DPRINTF(("  volatile write cache 0x%x", command->cdw11));
915 		break;
916 	case NVME_FEAT_NUMBER_OF_QUEUES:
917 		nvme_set_feature_queues(sc, command, compl);
918 		break;
919 	case NVME_FEAT_INTERRUPT_COALESCING:
920 		DPRINTF(("  interrupt coalescing 0x%x", command->cdw11));
921 
922 		/* in uS */
923 		sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100;
924 
925 		sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF;
926 		break;
927 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
928 		iv = command->cdw11 & 0xFFFF;
929 
930 		DPRINTF(("  interrupt vector configuration 0x%x",
931 		        command->cdw11));
932 
933 		for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) {
934 			if (sc->compl_queues[i].intr_vec == iv) {
935 				if (command->cdw11 & (1 << 16))
936 					sc->compl_queues[i].intr_en |=
937 					                      NVME_CQ_INTCOAL;
938 				else
939 					sc->compl_queues[i].intr_en &=
940 					                     ~NVME_CQ_INTCOAL;
941 			}
942 		}
943 		break;
944 	case NVME_FEAT_WRITE_ATOMICITY:
945 		DPRINTF(("  write atomicity 0x%x", command->cdw11));
946 		break;
947 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
948 		DPRINTF(("  async event configuration 0x%x",
949 		        command->cdw11));
950 		sc->async_ev_config = command->cdw11;
951 		break;
952 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
953 		DPRINTF(("  software progress marker 0x%x",
954 		        command->cdw11));
955 		break;
956 	case 0x0C:
957 		DPRINTF(("  autonomous power state transition 0x%x",
958 		        command->cdw11));
959 		break;
960 	default:
961 		WPRINTF(("%s invalid feature", __func__));
962 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
963 		return (1);
964 	}
965 
966 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
967 	return (1);
968 }
969 
970 static int
971 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
972 	struct nvme_completion* compl)
973 {
974 	int feature = command->cdw10 & 0xFF;
975 
976 	DPRINTF(("%s feature 0x%x", __func__, feature));
977 
978 	compl->cdw0 = 0;
979 
980 	switch (feature) {
981 	case NVME_FEAT_ARBITRATION:
982 		DPRINTF(("  arbitration"));
983 		break;
984 	case NVME_FEAT_POWER_MANAGEMENT:
985 		DPRINTF(("  power management"));
986 		break;
987 	case NVME_FEAT_LBA_RANGE_TYPE:
988 		DPRINTF(("  lba range"));
989 		break;
990 	case NVME_FEAT_TEMPERATURE_THRESHOLD:
991 		DPRINTF(("  temperature threshold"));
992 		switch ((command->cdw11 >> 20) & 0x3) {
993 		case 0:
994 			/* Over temp threshold */
995 			compl->cdw0 = 0xFFFF;
996 			break;
997 		case 1:
998 			/* Under temp threshold */
999 			compl->cdw0 = 0;
1000 			break;
1001 		default:
1002 			WPRINTF(("  invalid threshold type select"));
1003 			pci_nvme_status_genc(&compl->status,
1004 			    NVME_SC_INVALID_FIELD);
1005 			return (1);
1006 		}
1007 		break;
1008 	case NVME_FEAT_ERROR_RECOVERY:
1009 		DPRINTF(("  error recovery"));
1010 		break;
1011 	case NVME_FEAT_VOLATILE_WRITE_CACHE:
1012 		DPRINTF(("  volatile write cache"));
1013 		break;
1014 	case NVME_FEAT_NUMBER_OF_QUEUES:
1015 		compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1016 
1017 		DPRINTF(("  number of queues (submit %u, completion %u)",
1018 		        compl->cdw0 & 0xFFFF,
1019 		        (compl->cdw0 >> 16) & 0xFFFF));
1020 
1021 		break;
1022 	case NVME_FEAT_INTERRUPT_COALESCING:
1023 		DPRINTF(("  interrupt coalescing"));
1024 		break;
1025 	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1026 		DPRINTF(("  interrupt vector configuration"));
1027 		break;
1028 	case NVME_FEAT_WRITE_ATOMICITY:
1029 		DPRINTF(("  write atomicity"));
1030 		break;
1031 	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1032 		DPRINTF(("  async event configuration"));
1033 		sc->async_ev_config = command->cdw11;
1034 		break;
1035 	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1036 		DPRINTF(("  software progress marker"));
1037 		break;
1038 	case 0x0C:
1039 		DPRINTF(("  autonomous power state transition"));
1040 		break;
1041 	default:
1042 		WPRINTF(("%s invalid feature 0x%x", __func__, feature));
1043 		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1044 		return (1);
1045 	}
1046 
1047 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1048 	return (1);
1049 }
1050 
1051 static int
1052 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1053 	struct nvme_completion* compl)
1054 {
1055 	DPRINTF(("%s submission queue %u, command ID 0x%x", __func__,
1056 	        command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF));
1057 
1058 	/* TODO: search for the command ID and abort it */
1059 
1060 	compl->cdw0 = 1;
1061 	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1062 	return (1);
1063 }
1064 
1065 #ifdef __FreeBSD__
1066 static int
1067 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1068 	struct nvme_command* command, struct nvme_completion* compl)
1069 {
1070 	DPRINTF(("%s async event request 0x%x", __func__, command->cdw11));
1071 
1072 	/*
1073 	 * TODO: raise events when they happen based on the Set Features cmd.
1074 	 * These events happen async, so only set completion successful if
1075 	 * there is an event reflective of the request to get event.
1076 	 */
1077 	pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1078 	    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1079 	return (0);
1080 }
1081 #else
1082 /* This is kept behind an ifdef while it's unused to appease the compiler. */
1083 #endif /* __FreeBSD__ */
1084 
1085 static void
1086 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1087 {
1088 	struct nvme_completion compl;
1089 	struct nvme_command *cmd;
1090 	struct nvme_submission_queue *sq;
1091 	struct nvme_completion_queue *cq;
1092 	uint16_t sqhead;
1093 
1094 	DPRINTF(("%s index %u", __func__, (uint32_t)value));
1095 
1096 	sq = &sc->submit_queues[0];
1097 	cq = &sc->compl_queues[0];
1098 
1099 	sqhead = atomic_load_acq_short(&sq->head);
1100 
1101 	if (atomic_testandset_int(&sq->busy, 1)) {
1102 		DPRINTF(("%s SQ busy, head %u, tail %u",
1103 		        __func__, sqhead, sq->tail));
1104 		return;
1105 	}
1106 
1107 	DPRINTF(("sqhead %u, tail %u", sqhead, sq->tail));
1108 
1109 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1110 		cmd = &(sq->qbase)[sqhead];
1111 		compl.cdw0 = 0;
1112 		compl.status = 0;
1113 
1114 		switch (cmd->opc) {
1115 		case NVME_OPC_DELETE_IO_SQ:
1116 			DPRINTF(("%s command DELETE_IO_SQ", __func__));
1117 			nvme_opc_delete_io_sq(sc, cmd, &compl);
1118 			break;
1119 		case NVME_OPC_CREATE_IO_SQ:
1120 			DPRINTF(("%s command CREATE_IO_SQ", __func__));
1121 			nvme_opc_create_io_sq(sc, cmd, &compl);
1122 			break;
1123 		case NVME_OPC_DELETE_IO_CQ:
1124 			DPRINTF(("%s command DELETE_IO_CQ", __func__));
1125 			nvme_opc_delete_io_cq(sc, cmd, &compl);
1126 			break;
1127 		case NVME_OPC_CREATE_IO_CQ:
1128 			DPRINTF(("%s command CREATE_IO_CQ", __func__));
1129 			nvme_opc_create_io_cq(sc, cmd, &compl);
1130 			break;
1131 		case NVME_OPC_GET_LOG_PAGE:
1132 			DPRINTF(("%s command GET_LOG_PAGE", __func__));
1133 			nvme_opc_get_log_page(sc, cmd, &compl);
1134 			break;
1135 		case NVME_OPC_IDENTIFY:
1136 			DPRINTF(("%s command IDENTIFY", __func__));
1137 			nvme_opc_identify(sc, cmd, &compl);
1138 			break;
1139 		case NVME_OPC_ABORT:
1140 			DPRINTF(("%s command ABORT", __func__));
1141 			nvme_opc_abort(sc, cmd, &compl);
1142 			break;
1143 		case NVME_OPC_SET_FEATURES:
1144 			DPRINTF(("%s command SET_FEATURES", __func__));
1145 			nvme_opc_set_features(sc, cmd, &compl);
1146 			break;
1147 		case NVME_OPC_GET_FEATURES:
1148 			DPRINTF(("%s command GET_FEATURES", __func__));
1149 			nvme_opc_get_features(sc, cmd, &compl);
1150 			break;
1151 		case NVME_OPC_ASYNC_EVENT_REQUEST:
1152 			DPRINTF(("%s command ASYNC_EVENT_REQ", __func__));
1153 			/* XXX don't care, unhandled for now
1154 			nvme_opc_async_event_req(sc, cmd, &compl);
1155 			*/
1156 			compl.status = NVME_NO_STATUS;
1157 			break;
1158 		default:
1159 			WPRINTF(("0x%x command is not implemented",
1160 			    cmd->opc));
1161 			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
1162 		}
1163 		sqhead = (sqhead + 1) % sq->size;
1164 
1165 		if (NVME_COMPLETION_VALID(compl)) {
1166 			struct nvme_completion *cp;
1167 			int phase;
1168 
1169 			cp = &(cq->qbase)[cq->tail];
1170 			cp->cdw0 = compl.cdw0;
1171 			cp->sqid = 0;
1172 			cp->sqhd = sqhead;
1173 			cp->cid = cmd->cid;
1174 
1175 			phase = NVME_STATUS_GET_P(cp->status);
1176 			cp->status = compl.status;
1177 			pci_nvme_toggle_phase(&cp->status, phase);
1178 
1179 			cq->tail = (cq->tail + 1) % cq->size;
1180 		}
1181 	}
1182 
1183 	DPRINTF(("setting sqhead %u", sqhead));
1184 	atomic_store_short(&sq->head, sqhead);
1185 	atomic_store_int(&sq->busy, 0);
1186 
1187 	if (cq->head != cq->tail)
1188 		pci_generate_msix(sc->nsc_pi, 0);
1189 
1190 }
1191 
1192 static int
1193 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
1194 	uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
1195 {
1196 	int iovidx;
1197 
1198 	if (req != NULL) {
1199 		/* concatenate contig block-iovs to minimize number of iovs */
1200 		if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
1201 			iovidx = req->io_req.br_iovcnt - 1;
1202 
1203 			req->io_req.br_iov[iovidx].iov_base =
1204 			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1205 			                     req->prev_gpaddr, size);
1206 
1207 			req->prev_size += size;
1208 			req->io_req.br_resid += size;
1209 
1210 			req->io_req.br_iov[iovidx].iov_len = req->prev_size;
1211 		} else {
1212 			pthread_mutex_lock(&req->mtx);
1213 
1214 			iovidx = req->io_req.br_iovcnt;
1215 			if (iovidx == NVME_MAX_BLOCKIOVS) {
1216 				int err = 0;
1217 
1218 				DPRINTF(("large I/O, doing partial req"));
1219 
1220 				iovidx = 0;
1221 				req->io_req.br_iovcnt = 0;
1222 
1223 				req->io_req.br_callback = pci_nvme_io_partial;
1224 
1225 				if (!do_write)
1226 					err = blockif_read(sc->nvstore.ctx,
1227 					                   &req->io_req);
1228 				else
1229 					err = blockif_write(sc->nvstore.ctx,
1230 					                    &req->io_req);
1231 
1232 				/* wait until req completes before cont */
1233 				if (err == 0)
1234 					pthread_cond_wait(&req->cv, &req->mtx);
1235 			}
1236 			if (iovidx == 0) {
1237 				req->io_req.br_offset = lba;
1238 				req->io_req.br_resid = 0;
1239 				req->io_req.br_param = req;
1240 			}
1241 
1242 			req->io_req.br_iov[iovidx].iov_base =
1243 			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
1244 			                     gpaddr, size);
1245 
1246 			req->io_req.br_iov[iovidx].iov_len = size;
1247 
1248 			req->prev_gpaddr = gpaddr;
1249 			req->prev_size = size;
1250 			req->io_req.br_resid += size;
1251 
1252 			req->io_req.br_iovcnt++;
1253 
1254 			pthread_mutex_unlock(&req->mtx);
1255 		}
1256 	} else {
1257 		/* RAM buffer: read/write directly */
1258 		void *p = sc->nvstore.ctx;
1259 		void *gptr;
1260 
1261 		if ((lba + size) > sc->nvstore.size) {
1262 			WPRINTF(("%s request would overflow RAM backing store", __func__));
1263 			return (-1);
1264 		}
1265 
1266 		p = (void *)((uintptr_t)p + (uintptr_t)lba);
1267 		gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
1268 		if (do_write)
1269 			memcpy(p, gptr, size);
1270 		else
1271 			memcpy(gptr, p, size);
1272 	}
1273 	return (0);
1274 }
1275 
1276 static void
1277 pci_nvme_set_completion(struct pci_nvme_softc *sc,
1278 	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
1279 	uint32_t cdw0, uint16_t status, int ignore_busy)
1280 {
1281 	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
1282 	struct nvme_completion *compl;
1283 	int phase;
1284 
1285 	DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
1286 		 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
1287 		 NVME_STATUS_GET_SC(status)));
1288 
1289 	pthread_mutex_lock(&cq->mtx);
1290 
1291 	assert(cq->qbase != NULL);
1292 
1293 	compl = &cq->qbase[cq->tail];
1294 
1295 	compl->cdw0 = cdw0;
1296 	compl->sqid = sqid;
1297 	compl->sqhd = atomic_load_acq_short(&sq->head);
1298 	compl->cid = cid;
1299 
1300 	/* toggle the Phase Tag so the guest recognizes the new entry */
1301 	phase = NVME_STATUS_GET_P(compl->status);
1302 	compl->status = status;
1303 	pci_nvme_toggle_phase(&compl->status, phase);
1304 
1305 	cq->tail = (cq->tail + 1) % cq->size;
1306 
1307 	pthread_mutex_unlock(&cq->mtx);
1308 
1309 	if (cq->head != cq->tail) {
1310 		if (cq->intr_en & NVME_CQ_INTEN) {
1311 			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
1312 		} else {
1313 			DPRINTF(("%s: CQ%u interrupt disabled\n",
1314 						__func__, sq->cqid));
1315 		}
1316 	}
1317 }
1318 
1319 static void
1320 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
1321 {
1322 	req->sc = NULL;
1323 	req->nvme_sq = NULL;
1324 	req->sqid = 0;
1325 
1326 	pthread_mutex_lock(&sc->mtx);
1327 
1328 	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
1329 	sc->pending_ios--;
1330 
1331 	/* once no more I/O is pending, set RDY if the device has been (re)enabled */
1332 	if (sc->pending_ios == 0 &&
1333 	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
1334 		sc->regs.csts |= NVME_CSTS_RDY;
1335 
1336 	pthread_mutex_unlock(&sc->mtx);
1337 
1338 	sem_post(&sc->iosemlock);
1339 }
1340 
1341 static struct pci_nvme_ioreq *
1342 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
1343 {
1344 	struct pci_nvme_ioreq *req = NULL;
1345 
1346 	sem_wait(&sc->iosemlock);
1347 	pthread_mutex_lock(&sc->mtx);
1348 
1349 	req = STAILQ_FIRST(&sc->ioreqs_free);
1350 	assert(req != NULL);
1351 	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
1352 
1353 	req->sc = sc;
1354 
1355 	sc->pending_ios++;
1356 
1357 	pthread_mutex_unlock(&sc->mtx);
1358 
1359 	req->io_req.br_iovcnt = 0;
1360 	req->io_req.br_offset = 0;
1361 	req->io_req.br_resid = 0;
1362 	req->io_req.br_param = req;
1363 	req->prev_gpaddr = 0;
1364 	req->prev_size = 0;
1365 
1366 	return req;
1367 }
1368 
1369 static void
1370 pci_nvme_io_done(struct blockif_req *br, int err)
1371 {
1372 	struct pci_nvme_ioreq *req = br->br_param;
1373 	struct nvme_submission_queue *sq = req->nvme_sq;
1374 	uint16_t code, status = 0;
1375 
1376 	DPRINTF(("%s error %d %s", __func__, err, strerror(err)));
1377 
1378 	/* TODO return correct error */
1379 	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
1380 	pci_nvme_status_genc(&status, code);
1381 
1382 	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
1383 	pci_nvme_release_ioreq(req->sc, req);
1384 }
1385 
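/*
 * blockif callback for the partial requests issued from
 * pci_nvme_append_iov_req() when an I/O needs more than NVME_MAX_BLOCKIOVS
 * iovs; it simply wakes the submitting thread waiting on req->cv.
 */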
1386 static void
1387 pci_nvme_io_partial(struct blockif_req *br, int err)
1388 {
1389 	struct pci_nvme_ioreq *req = br->br_param;
1390 
1391 	DPRINTF(("%s error %d %s", __func__, err, strerror(err)));
1392 
1393 	pthread_cond_signal(&req->cv);
1394 }
1395 
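/*
 * blockif callback that steps through a multi-range Dataset Management
 * (deallocate) request: prev_gpaddr indexes the current range and prev_size
 * holds the number of ranges stashed in br_iov, so each completion either
 * issues the blockif_delete() for the next range or finishes the command.
 */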
1396 static void
1397 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
1398 {
1399 	struct pci_nvme_ioreq *req = br->br_param;
1400 	struct pci_nvme_softc *sc = req->sc;
1401 	bool done = true;
1402 #ifdef __FreeBSD__
1403 	uint16_t status;
1404 #else
1405 	uint16_t status = 0;
1406 #endif
1407 
1408 	if (err) {
1409 		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
1410 	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
1411 		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1412 	} else {
1413 		struct iovec *iov = req->io_req.br_iov;
1414 
1415 		req->prev_gpaddr++;
1416 		iov += req->prev_gpaddr;
1417 
1418 		/* The iov_* values already include the sector size */
1419 		req->io_req.br_offset = (off_t)iov->iov_base;
1420 		req->io_req.br_resid = iov->iov_len;
1421 		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
1422 			pci_nvme_status_genc(&status,
1423 			    NVME_SC_INTERNAL_DEVICE_ERROR);
1424 		} else
1425 			done = false;
1426 	}
1427 
1428 	if (done) {
1429 		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
1430 		    req->cid, 0, status, 0);
1431 		pci_nvme_release_ioreq(sc, req);
1432 	}
1433 }
1434 
1435 static int
1436 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
1437     struct nvme_command *cmd,
1438     struct pci_nvme_blockstore *nvstore,
1439     struct pci_nvme_ioreq *req,
1440     uint16_t *status)
1441 {
1442 	int err = -1;
1443 
1444 	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
1445 		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
1446 		goto out;
1447 	}
1448 
1449 	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
1450 		struct nvme_dsm_range *range;
1451 		uint32_t nr, r;
1452 		int sectsz = sc->nvstore.sectsz;
1453 
1454 		/*
1455 		 * DSM calls are advisory only, and compliant controllers
1456 		 * may choose to take no actions (i.e. return Success).
1457 		 */
1458 		if (!nvstore->deallocate) {
1459 			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
1460 			goto out;
1461 		}
1462 
1463 		if (req == NULL) {
1464 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1465 			goto out;
1466 		}
1467 
1468 		/* copy locally because a range entry could straddle PRPs */
1469 		range = calloc(1, NVME_MAX_DSM_TRIM);
1470 		if (range == NULL) {
1471 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1472 			goto out;
1473 		}
1474 		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
1475 		    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
1476 
1477 		req->opc = cmd->opc;
1478 		req->cid = cmd->cid;
1479 		req->nsid = cmd->nsid;
1480 		/*
1481 		 * If the request is for more than a single range, store
1482 		 * the ranges in the br_iov. Optimize for the common case
1483 		 * of a single range.
1484 		 *
1485 		 * Note that NVMe Number of Ranges is a zero based value
1486 		 */
1487 		nr = cmd->cdw10 & 0xff;
1488 
1489 		req->io_req.br_iovcnt = 0;
1490 		req->io_req.br_offset = range[0].starting_lba * sectsz;
1491 		req->io_req.br_resid = range[0].length * sectsz;
1492 
1493 		if (nr == 0) {
1494 			req->io_req.br_callback = pci_nvme_io_done;
1495 		} else {
1496 			struct iovec *iov = req->io_req.br_iov;
1497 
1498 			for (r = 0; r <= nr; r++) {
1499 				iov[r].iov_base = (void *)(range[r].starting_lba * sectsz);
1500 				iov[r].iov_len = range[r].length * sectsz;
1501 			}
1502 			req->io_req.br_callback = pci_nvme_dealloc_sm;
1503 
1504 			/*
1505 			 * Use prev_gpaddr to track the current entry and
1506 			 * prev_size to track the number of entries
1507 			 */
1508 			req->prev_gpaddr = 0;
1509 			req->prev_size = r;
1510 		}
1511 
1512 		err = blockif_delete(nvstore->ctx, &req->io_req);
1513 		if (err)
1514 			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
1515 
1516 		free(range);
1517 	}
1518 out:
1519 	return (err);
1520 }
1521 
1522 static void
1523 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
1524 {
1525 	struct nvme_submission_queue *sq;
1526 	uint16_t status = 0;
1527 	uint16_t sqhead;
1528 	int err;
1529 
1530 	/* handle all submissions up to sq->tail index */
1531 	sq = &sc->submit_queues[idx];
1532 
1533 	if (atomic_testandset_int(&sq->busy, 1)) {
1534 		DPRINTF(("%s sqid %u busy", __func__, idx));
1535 		return;
1536 	}
1537 
1538 	sqhead = atomic_load_acq_short(&sq->head);
1539 
1540 	DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p",
1541 	         idx, sqhead, sq->tail, sq->qbase));
1542 
1543 	while (sqhead != atomic_load_acq_short(&sq->tail)) {
1544 		struct nvme_command *cmd;
1545 		struct pci_nvme_ioreq *req = NULL;
1546 		uint64_t lba;
1547 		uint64_t nblocks, bytes, size, cpsz;
1548 
1549 		/* TODO: support scatter gather list handling */
1550 
1551 		cmd = &sq->qbase[sqhead];
1552 		sqhead = (sqhead + 1) % sq->size;
1553 
1554 		lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
1555 
1556 		if (cmd->opc == NVME_OPC_FLUSH) {
1557 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1558 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1559 			                        status, 1);
1560 
1561 			continue;
1562 		} else if (cmd->opc == 0x08) {
1563 			/* TODO: write zeroes */
1564 			WPRINTF(("%s write zeroes lba 0x%lx blocks %u",
1565 			        __func__, lba, cmd->cdw12 & 0xFFFF));
1566 			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1567 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1568 			                        status, 1);
1569 
1570 			continue;
1571 		}
1572 
1573 		if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
1574 			req = pci_nvme_get_ioreq(sc);
1575 			req->nvme_sq = sq;
1576 			req->sqid = idx;
1577 		}
1578 
1579 		if (cmd->opc == NVME_OPC_DATASET_MANAGEMENT) {
1580 			if (nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, req,
1581 			    &status)) {
1582 				pci_nvme_set_completion(sc, sq, idx, cmd->cid,
1583 				    0, status, 1);
1584 				if (req)
1585 					pci_nvme_release_ioreq(sc, req);
1586 			}
1587 			continue;
1588 		}
1589 
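		/* NLB in cdw12[15:0] is a zero-based value, hence the +1 */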
1590 		nblocks = (cmd->cdw12 & 0xFFFF) + 1;
1591 
1592 		bytes = nblocks * sc->nvstore.sectsz;
1593 
1594 		/*
1595 		 * If data starts mid-page and flows into the next page, then
1596 		 * increase page count
1597 		 */
1598 
1599 		DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
1600 		         "(%lu-bytes)",
1601 		         sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
1602 		         cmd->opc == NVME_OPC_WRITE ?
1603 			     "WRITE" : "READ",
1604 		         lba, nblocks, bytes));
1605 
1606 		cmd->prp1 &= ~(0x03UL);
1607 		cmd->prp2 &= ~(0x03UL);
1608 
1609 		DPRINTF((" prp1 0x%lx prp2 0x%lx", cmd->prp1, cmd->prp2));
1610 
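		/*
		 * PRP1 covers the data from its offset to the end of that
		 * page; any remainder is described either directly by PRP2
		 * (at most one more page) or by the PRP list that PRP2
		 * points to, both handled below.
		 */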
1611 		size = bytes;
1612 		lba *= sc->nvstore.sectsz;
1613 
1614 		cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE);
1615 
1616 		if (cpsz > bytes)
1617 			cpsz = bytes;
1618 
1619 		if (req != NULL) {
1620 			req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) |
1621 			                        cmd->cdw10;
1622 			req->opc = cmd->opc;
1623 			req->cid = cmd->cid;
1624 			req->nsid = cmd->nsid;
1625 		}
1626 
1627 		err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz,
1628 		    cmd->opc == NVME_OPC_WRITE, lba);
1629 		lba += cpsz;
1630 		size -= cpsz;
1631 
1632 		if (size == 0)
1633 			goto iodone;
1634 
1635 		if (size <= PAGE_SIZE) {
1636 			/* prp2 is second (and final) page in transfer */
1637 
1638 			err = pci_nvme_append_iov_req(sc, req, cmd->prp2,
1639 			    size,
1640 			    cmd->opc == NVME_OPC_WRITE,
1641 			    lba);
1642 		} else {
1643 			uint64_t *prp_list;
1644 			int i;
1645 
1646 			/* prp2 is pointer to a physical region page list */
1647 			prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx,
1648 			                            cmd->prp2, PAGE_SIZE);
1649 
1650 			i = 0;
1651 			while (size != 0) {
1652 				cpsz = MIN(size, PAGE_SIZE);
1653 
1654 				/*
1655 				 * Move to linked physical region page list
1656 				 * in last item.
1657 				 */
1658 				if (i == (NVME_PRP2_ITEMS-1) &&
1659 				    size > PAGE_SIZE) {
1660 					assert((prp_list[i] & (PAGE_SIZE-1)) == 0);
1661 					prp_list = paddr_guest2host(
1662 					              sc->nsc_pi->pi_vmctx,
1663 					              prp_list[i], PAGE_SIZE);
1664 					i = 0;
1665 				}
1666 				if (prp_list[i] == 0) {
1667 					WPRINTF(("PRP2[%d] = 0 !!!", i));
1668 					err = 1;
1669 					break;
1670 				}
1671 
1672 				err = pci_nvme_append_iov_req(sc, req,
1673 				    prp_list[i], cpsz,
1674 				    cmd->opc == NVME_OPC_WRITE, lba);
1675 				if (err)
1676 					break;
1677 
1678 				lba += cpsz;
1679 				size -= cpsz;
1680 				i++;
1681 			}
1682 		}
1683 
1684 iodone:
1685 		if (sc->nvstore.type == NVME_STOR_RAM) {
1686 			uint16_t code, status = 0;
1687 
1688 			code = err ? NVME_SC_LBA_OUT_OF_RANGE :
1689 			    NVME_SC_SUCCESS;
1690 			pci_nvme_status_genc(&status, code);
1691 
1692 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1693 			                        status, 1);
1694 
1695 			continue;
1696 		}
1697 
1698 
1699 		if (err)
1700 			goto do_error;
1701 
1702 		req->io_req.br_callback = pci_nvme_io_done;
1703 
1704 		err = 0;
1705 		switch (cmd->opc) {
1706 		case NVME_OPC_READ:
1707 			err = blockif_read(sc->nvstore.ctx, &req->io_req);
1708 			break;
1709 		case NVME_OPC_WRITE:
1710 			err = blockif_write(sc->nvstore.ctx, &req->io_req);
1711 			break;
1712 		default:
1713 			WPRINTF(("%s unhandled io command 0x%x",
1714 				 __func__, cmd->opc));
1715 			err = 1;
1716 		}
1717 
1718 do_error:
1719 		if (err) {
1720 			uint16_t status = 0;
1721 
1722 			pci_nvme_status_genc(&status,
1723 			    NVME_SC_DATA_TRANSFER_ERROR);
1724 
1725 			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
1726 			                        status, 1);
1727 			pci_nvme_release_ioreq(sc, req);
1728 		}
1729 	}
1730 
1731 	atomic_store_short(&sq->head, sqhead);
1732 	atomic_store_int(&sq->busy, 0);
1733 }
1734 
1735 static void
1736 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
1737 	uint64_t idx, int is_sq, uint64_t value)
1738 {
1739 	DPRINTF(("nvme doorbell %lu, %s, val 0x%lx",
1740 	        idx, is_sq ? "SQ" : "CQ", value & 0xFFFF));
1741 
1742 	if (is_sq) {
1743 		atomic_store_short(&sc->submit_queues[idx].tail,
1744 		                   (uint16_t)value);
1745 
1746 		if (idx == 0) {
1747 			pci_nvme_handle_admin_cmd(sc, value);
1748 		} else {
1749 			/* submission queue; handle new entries in SQ */
1750 			if (idx > sc->num_squeues) {
1751 				WPRINTF(("%s SQ index %lu overflow from "
1752 				         "guest (max %u)",
1753 				         __func__, idx, sc->num_squeues));
1754 				return;
1755 			}
1756 			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
1757 		}
1758 	} else {
1759 		if (idx > sc->num_cqueues) {
1760 			WPRINTF(("%s queue index %lu overflow from "
1761 			         "guest (max %u)",
1762 			         __func__, idx, sc->num_cqueues));
1763 			return;
1764 		}
1765 
1766 		sc->compl_queues[idx].head = (uint16_t)value;
1767 	}
1768 }
1769 
1770 static void
1771 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
1772 {
1773 	const char *s = iswrite ? "WRITE" : "READ";
1774 
1775 	switch (offset) {
1776 	case NVME_CR_CAP_LOW:
1777 		DPRINTF(("%s %s NVME_CR_CAP_LOW", func, s));
1778 		break;
1779 	case NVME_CR_CAP_HI:
1780 		DPRINTF(("%s %s NVME_CR_CAP_HI", func, s));
1781 		break;
1782 	case NVME_CR_VS:
1783 		DPRINTF(("%s %s NVME_CR_VS", func, s));
1784 		break;
1785 	case NVME_CR_INTMS:
1786 		DPRINTF(("%s %s NVME_CR_INTMS", func, s));
1787 		break;
1788 	case NVME_CR_INTMC:
1789 		DPRINTF(("%s %s NVME_CR_INTMC", func, s));
1790 		break;
1791 	case NVME_CR_CC:
1792 		DPRINTF(("%s %s NVME_CR_CC", func, s));
1793 		break;
1794 	case NVME_CR_CSTS:
1795 		DPRINTF(("%s %s NVME_CR_CSTS", func, s));
1796 		break;
1797 	case NVME_CR_NSSR:
1798 		DPRINTF(("%s %s NVME_CR_NSSR", func, s));
1799 		break;
1800 	case NVME_CR_AQA:
1801 		DPRINTF(("%s %s NVME_CR_AQA", func, s));
1802 		break;
1803 	case NVME_CR_ASQ_LOW:
1804 		DPRINTF(("%s %s NVME_CR_ASQ_LOW", func, s));
1805 		break;
1806 	case NVME_CR_ASQ_HI:
1807 		DPRINTF(("%s %s NVME_CR_ASQ_HI", func, s));
1808 		break;
1809 	case NVME_CR_ACQ_LOW:
1810 		DPRINTF(("%s %s NVME_CR_ACQ_LOW", func, s));
1811 		break;
1812 	case NVME_CR_ACQ_HI:
1813 		DPRINTF(("%s %s NVME_CR_ACQ_HI", func, s));
1814 		break;
1815 	default:
1816 		DPRINTF(("unknown nvme bar-0 offset 0x%lx", offset));
1817 	}
1818 
1819 }
1820 
1821 static void
1822 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
1823 	uint64_t offset, int size, uint64_t value)
1824 {
1825 	uint32_t ccreg;
1826 
1827 	if (offset >= NVME_DOORBELL_OFFSET) {
1828 		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
1829 		uint64_t idx = belloffset / 8; /* SQ tail + CQ head = 8 bytes */
1830 		int is_sq = (belloffset % 8) < 4;
1831 
1832 		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
1833 			WPRINTF(("guest attempted an overflow write offset "
1834 			         "0x%lx, val 0x%lx in %s",
1835 			         offset, value, __func__));
1836 			return;
1837 		}
1838 
1839 		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
1840 		return;
1841 	}
1842 
1843 	DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx",
1844 	        offset, size, value));
1845 
1846 	if (size != 4) {
1847 		WPRINTF(("guest wrote invalid size %d (offset 0x%lx, "
1848 		         "val 0x%lx) to bar0 in %s",
1849 		         size, offset, value, __func__));
1850 		/* TODO: shutdown device */
1851 		return;
1852 	}
1853 
1854 	pci_nvme_bar0_reg_dumps(__func__, offset, 1);
1855 
1856 	pthread_mutex_lock(&sc->mtx);
1857 
1858 	switch (offset) {
1859 	case NVME_CR_CAP_LOW:
1860 	case NVME_CR_CAP_HI:
1861 		/* readonly */
1862 		break;
1863 	case NVME_CR_VS:
1864 		/* readonly */
1865 		break;
1866 	case NVME_CR_INTMS:
1867 		/* MSI-X, so ignore */
1868 		break;
1869 	case NVME_CR_INTMC:
1870 		/* MSI-X, so ignore */
1871 		break;
1872 	case NVME_CR_CC:
1873 		ccreg = (uint32_t)value;
1874 
1875 		DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
1876 		         "iocqes %u",
1877 		        __func__,
1878 			 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
1879 			 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
1880 			 NVME_CC_GET_IOCQES(ccreg)));
1881 
1882 		if (NVME_CC_GET_SHN(ccreg)) {
1883 			/* perform shutdown - flush out data to backend */
1884 			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
1885 			    NVME_CSTS_REG_SHST_SHIFT);
1886 			sc->regs.csts |= NVME_SHST_COMPLETE <<
1887 			    NVME_CSTS_REG_SHST_SHIFT;
1888 		}
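		/*
		 * EN transitions: 1->0 resets the controller,
		 * 0->1 brings it online.
		 */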
1889 		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
1890 			if (NVME_CC_GET_EN(ccreg) == 0)
1891 				/* transition 1->0 causes controller reset */
1892 				pci_nvme_reset_locked(sc);
1893 			else
1894 				pci_nvme_init_controller(ctx, sc);
1895 		}
1896 
1897 		/* Insert the iocqes, iosqes and en bits from the write */
1898 		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
1899 		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
1900 		if (NVME_CC_GET_EN(ccreg) == 0) {
1901 			/* Insert the ams, mps and css bit fields */
1902 			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
1903 			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
1904 			sc->regs.csts &= ~NVME_CSTS_RDY;
1905 		} else if (sc->pending_ios == 0) {
1906 			sc->regs.csts |= NVME_CSTS_RDY;
1907 		}
1908 		break;
1909 	case NVME_CR_CSTS:
1910 		break;
1911 	case NVME_CR_NSSR:
1912 		/* ignore writes; don't support subsystem reset */
1913 		break;
1914 	case NVME_CR_AQA:
1915 		sc->regs.aqa = (uint32_t)value;
1916 		break;
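	/*
	 * ASQ/ACQ are 64-bit registers written as two 32-bit halves.  The
	 * admin queues must be 4 KiB aligned, so bits 11:0 of the low dword
	 * are masked off.
	 */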
1917 	case NVME_CR_ASQ_LOW:
1918 		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
1919 		               (0xFFFFF000 & value);
1920 		break;
1921 	case NVME_CR_ASQ_HI:
1922 		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
1923 		               (value << 32);
1924 		break;
1925 	case NVME_CR_ACQ_LOW:
1926 		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
1927 		               (0xFFFFF000 & value);
1928 		break;
1929 	case NVME_CR_ACQ_HI:
1930 		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
1931 		               (value << 32);
1932 		break;
1933 	default:
1934 		DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d",
1935 		         __func__, offset, value, size));
1936 	}
1937 	pthread_mutex_unlock(&sc->mtx);
1938 }
1939 
1940 static void
1941 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
1942                 int baridx, uint64_t offset, int size, uint64_t value)
1943 {
1944 	struct pci_nvme_softc* sc = pi->pi_arg;
1945 
1946 	if (baridx == pci_msix_table_bar(pi) ||
1947 	    baridx == pci_msix_pba_bar(pi)) {
1948 		DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, "
1949 		         "value 0x%lx", baridx, offset, size, value));
1950 
1951 		pci_emul_msix_twrite(pi, offset, size, value);
1952 		return;
1953 	}
1954 
1955 	switch (baridx) {
1956 	case 0:
1957 		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
1958 		break;
1959 
1960 	default:
1961 		DPRINTF(("%s unknown baridx %d, val 0x%lx",
1962 		         __func__, baridx, value));
1963 	}
1964 }
1965 
1966 static uint64_t
1967 pci_nvme_read_bar_0(struct pci_nvme_softc* sc, uint64_t offset, int size)
1968 {
1969 	uint64_t value;
1970 
1971 	pci_nvme_bar0_reg_dumps(__func__, offset, 0);
1972 
1973 	if (offset < NVME_DOORBELL_OFFSET) {
1974 		void *p = &(sc->regs);
1975 		pthread_mutex_lock(&sc->mtx);
1976 		memcpy(&value, (void *)((uintptr_t)p + offset), size);
1977 		pthread_mutex_unlock(&sc->mtx);
1978 	} else {
1979 		value = 0;
1980 		WPRINTF(("pci_nvme: read invalid offset 0x%lx", offset));
1981 	}
1982 
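	/* Mask the raw register value down to the width of the access. */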
1983 	switch (size) {
1984 	case 1:
1985 		value &= 0xFF;
1986 		break;
1987 	case 2:
1988 		value &= 0xFFFF;
1989 		break;
1990 	case 4:
1991 		value &= 0xFFFFFFFF;
1992 		break;
1993 	}
1994 
1995 	DPRINTF(("   nvme-read offset 0x%lx, size %d -> value 0x%x",
1996 	         offset, size, (uint32_t)value));
1997 
1998 	return (value);
1999 }
2000 
2001 
2002 
2003 static uint64_t
2004 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
2005     uint64_t offset, int size)
2006 {
2007 	struct pci_nvme_softc* sc = pi->pi_arg;
2008 
2009 	if (baridx == pci_msix_table_bar(pi) ||
2010 	    baridx == pci_msix_pba_bar(pi)) {
2011 		DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
2012 		        baridx, offset, size));
2013 
2014 		return pci_emul_msix_tread(pi, offset, size);
2015 	}
2016 
2017 	switch (baridx) {
2018 	case 0:
2019 		return pci_nvme_read_bar_0(sc, offset, size);
2020 
2021 	default:
2022 		DPRINTF(("unknown bar %d, 0x%lx", baridx, offset));
2023 	}
2024 
2025 	return (0);
2026 }
2027 
2028 
2029 static int
2030 pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts)
2031 {
2032 	char bident[sizeof("XX:X:X")];
2033 	char	*uopt, *xopts, *config;
2034 	uint32_t sectsz;
2035 	int optidx;
2036 
2037 	sc->max_queues = NVME_QUEUES;
2038 	sc->max_qentries = NVME_MAX_QENTRIES;
2039 	sc->ioslots = NVME_IOSLOTS;
2040 	sc->num_squeues = sc->max_queues;
2041 	sc->num_cqueues = sc->max_queues;
2042 	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2043 	sectsz = 0;
2044 
2045 	uopt = strdup(opts);
2046 	optidx = 0;
2047 	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
2048 	         "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2049 	for (xopts = strtok(uopt, ",");
2050 	     xopts != NULL;
2051 	     xopts = strtok(NULL, ",")) {
2052 
2053 		if ((config = strchr(xopts, '=')) != NULL)
2054 			*config++ = '\0';
2055 
2056 		if (!strcmp("maxq", xopts)) {
2057 			sc->max_queues = atoi(config);
2058 		} else if (!strcmp("qsz", xopts)) {
2059 			sc->max_qentries = atoi(config);
2060 		} else if (!strcmp("ioslots", xopts)) {
2061 			sc->ioslots = atoi(config);
2062 		} else if (!strcmp("sectsz", xopts)) {
2063 			sectsz = atoi(config);
2064 		} else if (!strcmp("ser", xopts)) {
2065 			/*
2066 			 * This field indicates the Product Serial Number in
2067 			 * 7-bit ASCII, unused bytes should be space characters.
2068 			 * Ref: NVMe v1.3c.
2069 			 */
2070 			cpywithpad((char *)sc->ctrldata.sn,
2071 			           sizeof(sc->ctrldata.sn), config, ' ');
2072 		} else if (!strcmp("ram", xopts)) {
2073 			uint64_t sz = strtoull(config, NULL, 10);
2074 
2075 			sc->nvstore.type = NVME_STOR_RAM;
2076 			sc->nvstore.size = sz * 1024 * 1024;
2077 			sc->nvstore.ctx = calloc(1, sc->nvstore.size);
2078 			sc->nvstore.sectsz = 4096;
2079 			sc->nvstore.sectsz_bits = 12;
2080 			if (sc->nvstore.ctx == NULL) {
2081 				perror("Unable to allocate RAM");
2082 				free(uopt);
2083 				return (-1);
2084 			}
2085 		} else if (!strcmp("eui64", xopts)) {
2086 			sc->nvstore.eui64 = htobe64(strtoull(config, NULL, 0));
2087 		} else if (!strcmp("dsm", xopts)) {
2088 			if (!strcmp("auto", config))
2089 				sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
2090 			else if (!strcmp("enable", config))
2091 				sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
2092 			else if (!strcmp("disable", config))
2093 				sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
2094 		} else if (optidx == 0) {
2095 			snprintf(bident, sizeof(bident), "%d:%d",
2096 			         sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
2097 			sc->nvstore.ctx = blockif_open(xopts, bident);
2098 			if (sc->nvstore.ctx == NULL) {
2099 				perror("Could not open backing file");
2100 				free(uopt);
2101 				return (-1);
2102 			}
2103 			sc->nvstore.type = NVME_STOR_BLOCKIF;
2104 			sc->nvstore.size = blockif_size(sc->nvstore.ctx);
2105 		} else {
2106 			EPRINTLN("Invalid option %s", xopts);
2107 			free(uopt);
2108 			return (-1);
2109 		}
2110 
2111 		optidx++;
2112 	}
2113 	free(uopt);
2114 
2115 	if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) {
2116 		EPRINTLN("backing store not specified");
2117 		return (-1);
2118 	}
2119 	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
2120 		sc->nvstore.sectsz = sectsz;
2121 	else if (sc->nvstore.type != NVME_STOR_RAM)
2122 		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
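	/* Compute sectsz_bits such that (1 << sectsz_bits) >= sectsz. */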
2123 	for (sc->nvstore.sectsz_bits = 9;
2124 	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
2125 	     sc->nvstore.sectsz_bits++);
2126 
2127 	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
2128 		sc->max_queues = NVME_QUEUES;
2129 
2130 	if (sc->max_qentries <= 0) {
2131 		EPRINTLN("Invalid qsz option");
2132 		return (-1);
2133 	}
2134 	if (sc->ioslots <= 0) {
2135 		EPRINTLN("Invalid ioslots option");
2136 		return (-1);
2137 	}
2138 
2139 	return (0);
2140 }
2141 
2142 static int
2143 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
2144 {
2145 	struct pci_nvme_softc *sc;
2146 	uint32_t pci_membar_sz;
2147 	int	error;
2148 
2149 	error = 0;
2150 
2151 	sc = calloc(1, sizeof(struct pci_nvme_softc));
2152 	pi->pi_arg = sc;
2153 	sc->nsc_pi = pi;
2154 
2155 	error = pci_nvme_parse_opts(sc, opts);
2156 	if (error < 0)
2157 		goto done;
2158 	else
2159 		error = 0;
2160 
2161 	STAILQ_INIT(&sc->ioreqs_free);
2162 	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
2163 	for (int i = 0; i < sc->ioslots; i++) {
2164 		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
2165 		pthread_mutex_init(&sc->ioreqs[i].mtx, NULL);
2166 		pthread_cond_init(&sc->ioreqs[i].cv, NULL);
2167 	}
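	/*
	 * The ioreqs array is a fixed pool of 'ioslots' request structures;
	 * iosemlock (initialized below) throttles submissions to that limit.
	 */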
2168 	sc->intr_coales_aggr_thresh = 1;
2169 
2170 	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
2171 	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
2172 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
2173 	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
2174 	pci_set_cfgdata8(pi, PCIR_PROGIF,
2175 	                 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
2176 
2177 	/*
2178 	 * Allocate size of NVMe registers + doorbell space for all queues.
2179 	 *
2180 	 * The specification requires a minimum memory I/O window size of 16K.
2181 	 * The Windows driver will refuse to start a device with a smaller
2182 	 * window.
2183 	 */
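	/* 2 doorbell dwords (SQ tail, CQ head) per queue, admin included. */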
2184 	pci_membar_sz = sizeof(struct nvme_registers) +
2185 	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
2186 	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
2187 
2188 	DPRINTF(("nvme membar size: %u", pci_membar_sz));
2189 
2190 	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
2191 	if (error) {
2192 		WPRINTF(("%s pci alloc mem bar failed", __func__));
2193 		goto done;
2194 	}
2195 
2196 	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
2197 	if (error) {
2198 		WPRINTF(("%s pci add msixcap failed", __func__));
2199 		goto done;
2200 	}
2201 
2202 	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
2203 	if (error) {
2204 		WPRINTF(("%s pci add Express capability failed", __func__));
2205 		goto done;
2206 	}
2207 
2208 	pthread_mutex_init(&sc->mtx, NULL);
2209 	sem_init(&sc->iosemlock, 0, sc->ioslots);
2210 
2211 	pci_nvme_reset(sc);
2212 	/*
2213 	 * Controller data depends on Namespace data so initialize Namespace
2214 	 * data first.
2215 	 */
2216 	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
2217 	pci_nvme_init_ctrldata(sc);
2218 	pci_nvme_init_logpages(sc);
2219 
2220 	pci_lintr_request(pi);
2221 
2222 done:
2223 	return (error);
2224 }
2225 
2226 
2227 struct pci_devemu pci_de_nvme = {
2228 	.pe_emu =	"nvme",
2229 	.pe_init =	pci_nvme_init,
2230 	.pe_barwrite =	pci_nvme_write,
2231 	.pe_barread =	pci_nvme_read
2232 };
2233 PCI_EMUL_SET(pci_de_nvme);
2234