1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2017 Shunsuke Mie 5 * Copyright (c) 2018 Leon Dang 6 * Copyright (c) 2020 Chuck Tuffli 7 * 8 * Function crc16 Copyright (c) 2017, Fedor Uporov 9 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 33 /* 34 * bhyve PCIe-NVMe device emulation. 35 * 36 * options: 37 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt> 38 * 39 * accepted devpath: 40 * /dev/blockdev 41 * /path/to/image 42 * ram=size_in_MiB 43 * 44 * maxq = max number of queues 45 * qsz = max elements in each queue 46 * ioslots = max number of concurrent io requests 47 * sectsz = sector size (defaults to blockif sector size) 48 * ser = serial number (20-chars max) 49 * eui64 = IEEE Extended Unique Identifier (8 byte value) 50 * dsm = DataSet Management support. Option is one of auto, enable,disable 51 * 52 */ 53 54 /* TODO: 55 - create async event for smart and log 56 - intr coalesce 57 */ 58 59 #include <sys/cdefs.h> 60 __FBSDID("$FreeBSD$"); 61 62 #include <sys/errno.h> 63 #include <sys/types.h> 64 #include <net/ieee_oui.h> 65 #ifndef __FreeBSD__ 66 #include <endian.h> 67 #endif 68 69 #include <assert.h> 70 #include <pthread.h> 71 #include <pthread_np.h> 72 #include <semaphore.h> 73 #include <stdbool.h> 74 #include <stddef.h> 75 #include <stdint.h> 76 #include <stdio.h> 77 #include <stdlib.h> 78 #include <string.h> 79 80 #include <machine/atomic.h> 81 #include <machine/vmm.h> 82 #include <vmmapi.h> 83 84 #include <dev/nvme/nvme.h> 85 86 #include "bhyverun.h" 87 #include "block_if.h" 88 #include "config.h" 89 #include "debug.h" 90 #include "pci_emul.h" 91 92 93 static int nvme_debug = 0; 94 #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args) 95 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args) 96 97 /* defaults; can be overridden */ 98 #define NVME_MSIX_BAR 4 99 100 #define NVME_IOSLOTS 8 101 102 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */ 103 #define NVME_MMIO_SPACE_MIN (1 << 14) 104 105 #define NVME_QUEUES 16 106 #define NVME_MAX_QENTRIES 2048 107 /* Memory Page size Minimum reported in CAP register */ 108 #define NVME_MPSMIN 0 109 /* MPSMIN converted to bytes */ 110 #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN)) 111 112 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) 113 #define NVME_MDTS 9 114 /* Note the + 1 allows for the initial descriptor to not be page aligned */ 115 #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) 116 #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) 117 118 /* This is a synthetic status code to indicate there is no status */ 119 #define NVME_NO_STATUS 0xffff 120 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS) 121 122 /* helpers */ 123 124 /* Convert a zero-based value into a one-based value */ 125 #define ONE_BASED(zero) ((zero) + 1) 126 /* Convert a one-based value into a zero-based value */ 127 #define ZERO_BASED(one) ((one) - 1) 128 129 /* Encode number of SQ's and CQ's for Set/Get Features */ 130 #define NVME_FEATURE_NUM_QUEUES(sc) \ 131 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ 132 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; 133 134 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) 135 136 enum nvme_controller_register_offsets { 137 NVME_CR_CAP_LOW = 0x00, 138 NVME_CR_CAP_HI = 0x04, 139 NVME_CR_VS = 0x08, 140 NVME_CR_INTMS = 0x0c, 141 NVME_CR_INTMC = 0x10, 142 NVME_CR_CC = 0x14, 143 NVME_CR_CSTS = 0x1c, 144 NVME_CR_NSSR = 0x20, 145 NVME_CR_AQA = 0x24, 146 NVME_CR_ASQ_LOW = 0x28, 147 NVME_CR_ASQ_HI = 0x2c, 148 NVME_CR_ACQ_LOW = 0x30, 149 NVME_CR_ACQ_HI = 0x34, 150 }; 151 152 enum nvme_cmd_cdw11 { 153 NVME_CMD_CDW11_PC = 0x0001, 154 NVME_CMD_CDW11_IEN = 0x0002, 155 NVME_CMD_CDW11_IV = 0xFFFF0000, 156 }; 157 158 enum nvme_copy_dir { 159 NVME_COPY_TO_PRP, 160 NVME_COPY_FROM_PRP, 161 }; 162 163 #define NVME_CQ_INTEN 0x01 164 #define NVME_CQ_INTCOAL 0x02 165 166 struct nvme_completion_queue { 167 struct nvme_completion *qbase; 168 pthread_mutex_t mtx; 169 uint32_t size; 170 uint16_t tail; /* nvme progress */ 171 uint16_t head; /* guest progress */ 172 uint16_t intr_vec; 173 uint32_t intr_en; 174 }; 175 176 struct nvme_submission_queue { 177 struct nvme_command *qbase; 178 pthread_mutex_t mtx; 179 uint32_t size; 180 uint16_t head; /* nvme progress */ 181 uint16_t tail; /* guest progress */ 182 uint16_t cqid; /* completion queue id */ 183 int qpriority; 184 }; 185 186 enum nvme_storage_type { 187 NVME_STOR_BLOCKIF = 0, 188 NVME_STOR_RAM = 1, 189 }; 190 191 struct pci_nvme_blockstore { 192 enum nvme_storage_type type; 193 void *ctx; 194 uint64_t size; 195 uint32_t sectsz; 196 uint32_t sectsz_bits; 197 uint64_t eui64; 198 uint32_t deallocate:1; 199 }; 200 201 /* 202 * Calculate the number of additional page descriptors for guest IO requests 203 * based on the advertised Max Data Transfer (MDTS) and given the number of 204 * default iovec's in a struct blockif_req. 205 */ 206 #define MDTS_PAD_SIZE \ 207 ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \ 208 NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \ 209 0 ) 210 211 struct pci_nvme_ioreq { 212 struct pci_nvme_softc *sc; 213 STAILQ_ENTRY(pci_nvme_ioreq) link; 214 struct nvme_submission_queue *nvme_sq; 215 uint16_t sqid; 216 217 /* command information */ 218 uint16_t opc; 219 uint16_t cid; 220 uint32_t nsid; 221 222 uint64_t prev_gpaddr; 223 size_t prev_size; 224 size_t bytes; 225 226 struct blockif_req io_req; 227 228 struct iovec iovpadding[MDTS_PAD_SIZE]; 229 }; 230 231 enum nvme_dsm_type { 232 /* Dataset Management bit in ONCS reflects backing storage capability */ 233 NVME_DATASET_MANAGEMENT_AUTO, 234 /* Unconditionally set Dataset Management bit in ONCS */ 235 NVME_DATASET_MANAGEMENT_ENABLE, 236 /* Unconditionally clear Dataset Management bit in ONCS */ 237 NVME_DATASET_MANAGEMENT_DISABLE, 238 }; 239 240 struct pci_nvme_softc; 241 struct nvme_feature_obj; 242 243 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *, 244 struct nvme_feature_obj *, 245 struct nvme_command *, 246 struct nvme_completion *); 247 248 struct nvme_feature_obj { 249 uint32_t cdw11; 250 nvme_feature_cb set; 251 nvme_feature_cb get; 252 bool namespace_specific; 253 }; 254 255 #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1) 256 257 typedef enum { 258 PCI_NVME_AE_TYPE_ERROR = 0, 259 PCI_NVME_AE_TYPE_SMART, 260 PCI_NVME_AE_TYPE_NOTICE, 261 PCI_NVME_AE_TYPE_IO_CMD = 6, 262 PCI_NVME_AE_TYPE_VENDOR = 7, 263 PCI_NVME_AE_TYPE_MAX /* Must be last */ 264 } pci_nvme_async_type; 265 266 /* Asynchronous Event Requests */ 267 struct pci_nvme_aer { 268 STAILQ_ENTRY(pci_nvme_aer) link; 269 uint16_t cid; /* Command ID of the submitted AER */ 270 }; 271 272 typedef enum { 273 PCI_NVME_AE_INFO_NS_ATTR_CHANGED = 0, 274 PCI_NVME_AE_INFO_FW_ACTIVATION, 275 PCI_NVME_AE_INFO_TELEMETRY_CHANGE, 276 PCI_NVME_AE_INFO_ANA_CHANGE, 277 PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE, 278 PCI_NVME_AE_INFO_LBA_STATUS_ALERT, 279 PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE, 280 PCI_NVME_AE_INFO_MAX, 281 } pci_nvme_async_info; 282 283 /* Asynchronous Event Notifications */ 284 struct pci_nvme_aen { 285 pci_nvme_async_type atype; 286 uint32_t event_data; 287 bool posted; 288 }; 289 290 struct pci_nvme_softc { 291 struct pci_devinst *nsc_pi; 292 293 pthread_mutex_t mtx; 294 295 struct nvme_registers regs; 296 297 struct nvme_namespace_data nsdata; 298 struct nvme_controller_data ctrldata; 299 struct nvme_error_information_entry err_log; 300 struct nvme_health_information_page health_log; 301 struct nvme_firmware_page fw_log; 302 struct nvme_ns_list ns_log; 303 304 struct pci_nvme_blockstore nvstore; 305 306 uint16_t max_qentries; /* max entries per queue */ 307 uint32_t max_queues; /* max number of IO SQ's or CQ's */ 308 uint32_t num_cqueues; 309 uint32_t num_squeues; 310 bool num_q_is_set; /* Has host set Number of Queues */ 311 312 struct pci_nvme_ioreq *ioreqs; 313 STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */ 314 uint32_t pending_ios; 315 uint32_t ioslots; 316 sem_t iosemlock; 317 318 /* 319 * Memory mapped Submission and Completion queues 320 * Each array includes both Admin and IO queues 321 */ 322 struct nvme_completion_queue *compl_queues; 323 struct nvme_submission_queue *submit_queues; 324 325 struct nvme_feature_obj feat[NVME_FID_MAX]; 326 327 enum nvme_dsm_type dataset_management; 328 329 /* Accounting for SMART data */ 330 __uint128_t read_data_units; 331 __uint128_t write_data_units; 332 __uint128_t read_commands; 333 __uint128_t write_commands; 334 uint32_t read_dunits_remainder; 335 uint32_t write_dunits_remainder; 336 337 STAILQ_HEAD(, pci_nvme_aer) aer_list; 338 pthread_mutex_t aer_mtx; 339 uint32_t aer_count; 340 struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX]; 341 pthread_t aen_tid; 342 pthread_mutex_t aen_mtx; 343 pthread_cond_t aen_cond; 344 }; 345 346 347 static void pci_nvme_cq_update(struct pci_nvme_softc *sc, 348 struct nvme_completion_queue *cq, 349 uint32_t cdw0, 350 uint16_t cid, 351 uint16_t sqid, 352 uint16_t status); 353 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); 354 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); 355 static void pci_nvme_io_done(struct blockif_req *, int); 356 357 /* Controller Configuration utils */ 358 #define NVME_CC_GET_EN(cc) \ 359 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) 360 #define NVME_CC_GET_CSS(cc) \ 361 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) 362 #define NVME_CC_GET_SHN(cc) \ 363 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) 364 #define NVME_CC_GET_IOSQES(cc) \ 365 ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) 366 #define NVME_CC_GET_IOCQES(cc) \ 367 ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) 368 369 #define NVME_CC_WRITE_MASK \ 370 ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ 371 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ 372 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) 373 374 #define NVME_CC_NEN_WRITE_MASK \ 375 ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ 376 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ 377 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) 378 379 /* Controller Status utils */ 380 #define NVME_CSTS_GET_RDY(sts) \ 381 ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) 382 383 #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) 384 385 /* Completion Queue status word utils */ 386 #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) 387 #define NVME_STATUS_MASK \ 388 ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ 389 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) 390 391 #define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \ 392 NVME_CTRLR_DATA_ONCS_DSM_SHIFT) 393 394 static void nvme_feature_invalid_cb(struct pci_nvme_softc *, 395 struct nvme_feature_obj *, 396 struct nvme_command *, 397 struct nvme_completion *); 398 static void nvme_feature_num_queues(struct pci_nvme_softc *, 399 struct nvme_feature_obj *, 400 struct nvme_command *, 401 struct nvme_completion *); 402 static void nvme_feature_iv_config(struct pci_nvme_softc *, 403 struct nvme_feature_obj *, 404 struct nvme_command *, 405 struct nvme_completion *); 406 407 static void *aen_thr(void *arg); 408 409 static __inline void 410 cpywithpad(char *dst, size_t dst_size, const char *src, char pad) 411 { 412 size_t len; 413 414 len = strnlen(src, dst_size); 415 memset(dst, pad, dst_size); 416 memcpy(dst, src, len); 417 } 418 419 static __inline void 420 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) 421 { 422 423 *status &= ~NVME_STATUS_MASK; 424 *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | 425 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; 426 } 427 428 static __inline void 429 pci_nvme_status_genc(uint16_t *status, uint16_t code) 430 { 431 432 pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); 433 } 434 435 /* 436 * Initialize the requested number or IO Submission and Completion Queues. 437 * Admin queues are allocated implicitly. 438 */ 439 static void 440 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq) 441 { 442 uint32_t i; 443 444 /* 445 * Allocate and initialize the Submission Queues 446 */ 447 if (nsq > NVME_QUEUES) { 448 WPRINTF("%s: clamping number of SQ from %u to %u", 449 __func__, nsq, NVME_QUEUES); 450 nsq = NVME_QUEUES; 451 } 452 453 sc->num_squeues = nsq; 454 455 sc->submit_queues = calloc(sc->num_squeues + 1, 456 sizeof(struct nvme_submission_queue)); 457 if (sc->submit_queues == NULL) { 458 WPRINTF("%s: SQ allocation failed", __func__); 459 sc->num_squeues = 0; 460 } else { 461 struct nvme_submission_queue *sq = sc->submit_queues; 462 463 #ifndef __FreeBSD__ 464 for (i = 0; i < sc->num_squeues + 1; i++) 465 pthread_mutex_init(&sq[i].mtx, NULL); 466 #else 467 for (i = 0; i < sc->num_squeues; i++) 468 pthread_mutex_init(&sq[i].mtx, NULL); 469 #endif 470 } 471 472 /* 473 * Allocate and initialize the Completion Queues 474 */ 475 if (ncq > NVME_QUEUES) { 476 WPRINTF("%s: clamping number of CQ from %u to %u", 477 __func__, ncq, NVME_QUEUES); 478 ncq = NVME_QUEUES; 479 } 480 481 sc->num_cqueues = ncq; 482 483 sc->compl_queues = calloc(sc->num_cqueues + 1, 484 sizeof(struct nvme_completion_queue)); 485 if (sc->compl_queues == NULL) { 486 WPRINTF("%s: CQ allocation failed", __func__); 487 sc->num_cqueues = 0; 488 } else { 489 struct nvme_completion_queue *cq = sc->compl_queues; 490 491 #ifndef __FreeBSD__ 492 for (i = 0; i < sc->num_cqueues + 1; i++) 493 pthread_mutex_init(&cq[i].mtx, NULL); 494 #else 495 for (i = 0; i < sc->num_cqueues; i++) 496 pthread_mutex_init(&cq[i].mtx, NULL); 497 #endif 498 } 499 } 500 501 static void 502 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) 503 { 504 struct nvme_controller_data *cd = &sc->ctrldata; 505 506 cd->vid = 0xFB5D; 507 cd->ssvid = 0x0000; 508 509 cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); 510 cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); 511 512 /* Num of submission commands that we can handle at a time (2^rab) */ 513 cd->rab = 4; 514 515 /* FreeBSD OUI */ 516 cd->ieee[0] = 0x58; 517 cd->ieee[1] = 0x9c; 518 cd->ieee[2] = 0xfc; 519 520 cd->mic = 0; 521 522 cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */ 523 524 cd->ver = 0x00010300; 525 526 cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; 527 #ifndef __FreeBSD__ 528 /* 529 * Reported upstream against https://reviews.freebsd.org/D32953 530 * which introduced support for the namespace attribute changed AEN 531 * and the corresponding changed namespace log page, without setting 532 * the bit in oaes. A future sync will likely include this 533 * definition in usr/src/contrib/bhyve/dev/nvme/nvme.h once it's 534 * fixed there. 535 */ 536 #define NVME_CTRLR_DATA_OAES_NSCHANGE_SHIFT (8) 537 cd->oaes = 1 << NVME_CTRLR_DATA_OAES_NSCHANGE_SHIFT; 538 #endif 539 cd->acl = 2; 540 cd->aerl = 4; 541 542 /* Advertise 1, Read-only firmware slot */ 543 cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK | 544 (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT); 545 cd->lpa = 0; /* TODO: support some simple things like SMART */ 546 cd->elpe = 0; /* max error log page entries */ 547 cd->npss = 1; /* number of power states support */ 548 549 /* Warning Composite Temperature Threshold */ 550 cd->wctemp = 0x0157; 551 552 cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | 553 (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); 554 cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | 555 (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); 556 cd->nn = 1; /* number of namespaces */ 557 558 cd->oncs = 0; 559 switch (sc->dataset_management) { 560 case NVME_DATASET_MANAGEMENT_AUTO: 561 if (sc->nvstore.deallocate) 562 cd->oncs |= NVME_ONCS_DSM; 563 break; 564 case NVME_DATASET_MANAGEMENT_ENABLE: 565 cd->oncs |= NVME_ONCS_DSM; 566 break; 567 default: 568 break; 569 } 570 571 cd->fna = 0x03; 572 573 cd->power_state[0].mp = 10; 574 } 575 576 /* 577 * Calculate the CRC-16 of the given buffer 578 * See copyright attribution at top of file 579 */ 580 static uint16_t 581 crc16(uint16_t crc, const void *buffer, unsigned int len) 582 { 583 const unsigned char *cp = buffer; 584 /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */ 585 static uint16_t const crc16_table[256] = { 586 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241, 587 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440, 588 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40, 589 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841, 590 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40, 591 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41, 592 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641, 593 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040, 594 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240, 595 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441, 596 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41, 597 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840, 598 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41, 599 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40, 600 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640, 601 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041, 602 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240, 603 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441, 604 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41, 605 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840, 606 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41, 607 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40, 608 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640, 609 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041, 610 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241, 611 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440, 612 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40, 613 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841, 614 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40, 615 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41, 616 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641, 617 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040 618 }; 619 620 while (len--) 621 crc = (((crc >> 8) & 0xffU) ^ 622 crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU; 623 return crc; 624 } 625 626 static void 627 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore, 628 struct nvme_namespace_data *nd) 629 { 630 631 /* Get capacity and block size information from backing store */ 632 nd->nsze = nvstore->size / nvstore->sectsz; 633 nd->ncap = nd->nsze; 634 nd->nuse = nd->nsze; 635 } 636 637 static void 638 pci_nvme_init_nsdata(struct pci_nvme_softc *sc, 639 struct nvme_namespace_data *nd, uint32_t nsid, 640 struct pci_nvme_blockstore *nvstore) 641 { 642 643 pci_nvme_init_nsdata_size(nvstore, nd); 644 645 if (nvstore->type == NVME_STOR_BLOCKIF) 646 nvstore->deallocate = blockif_candelete(nvstore->ctx); 647 648 nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */ 649 nd->flbas = 0; 650 651 /* Create an EUI-64 if user did not provide one */ 652 if (nvstore->eui64 == 0) { 653 char *data = NULL; 654 uint64_t eui64 = nvstore->eui64; 655 656 asprintf(&data, "%s%u%u%u", get_config_value("name"), 657 sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot, 658 sc->nsc_pi->pi_func); 659 660 if (data != NULL) { 661 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data)); 662 free(data); 663 } 664 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff); 665 } 666 be64enc(nd->eui64, nvstore->eui64); 667 668 /* LBA data-sz = 2^lbads */ 669 nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; 670 } 671 672 static void 673 pci_nvme_init_logpages(struct pci_nvme_softc *sc) 674 { 675 676 memset(&sc->err_log, 0, sizeof(sc->err_log)); 677 memset(&sc->health_log, 0, sizeof(sc->health_log)); 678 memset(&sc->fw_log, 0, sizeof(sc->fw_log)); 679 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 680 681 /* Set read/write remainder to round up according to spec */ 682 sc->read_dunits_remainder = 999; 683 sc->write_dunits_remainder = 999; 684 685 /* Set nominal Health values checked by implementations */ 686 sc->health_log.temperature = 310; 687 sc->health_log.available_spare = 100; 688 sc->health_log.available_spare_threshold = 10; 689 } 690 691 static void 692 pci_nvme_init_features(struct pci_nvme_softc *sc) 693 { 694 695 sc->feat[0].set = nvme_feature_invalid_cb; 696 sc->feat[0].get = nvme_feature_invalid_cb; 697 698 sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true; 699 sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true; 700 sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues; 701 sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set = 702 nvme_feature_iv_config; 703 /* Enable all AENs by default */ 704 sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 = 0x31f; 705 sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get = 706 nvme_feature_invalid_cb; 707 sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get = 708 nvme_feature_invalid_cb; 709 } 710 711 static void 712 pci_nvme_aer_reset(struct pci_nvme_softc *sc) 713 { 714 715 STAILQ_INIT(&sc->aer_list); 716 sc->aer_count = 0; 717 } 718 719 static void 720 pci_nvme_aer_init(struct pci_nvme_softc *sc) 721 { 722 723 pthread_mutex_init(&sc->aer_mtx, NULL); 724 pci_nvme_aer_reset(sc); 725 } 726 727 static void 728 pci_nvme_aer_destroy(struct pci_nvme_softc *sc) 729 { 730 struct pci_nvme_aer *aer = NULL; 731 732 pthread_mutex_lock(&sc->aer_mtx); 733 while (!STAILQ_EMPTY(&sc->aer_list)) { 734 aer = STAILQ_FIRST(&sc->aer_list); 735 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 736 free(aer); 737 } 738 pthread_mutex_unlock(&sc->aer_mtx); 739 740 pci_nvme_aer_reset(sc); 741 } 742 743 static bool 744 pci_nvme_aer_available(struct pci_nvme_softc *sc) 745 { 746 747 return (sc->aer_count != 0); 748 } 749 750 static bool 751 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc) 752 { 753 struct nvme_controller_data *cd = &sc->ctrldata; 754 755 /* AERL is a zero based value while aer_count is one's based */ 756 return (sc->aer_count == (cd->aerl + 1)); 757 } 758 759 /* 760 * Add an Async Event Request 761 * 762 * Stores an AER to be returned later if the Controller needs to notify the 763 * host of an event. 764 * Note that while the NVMe spec doesn't require Controllers to return AER's 765 * in order, this implementation does preserve the order. 766 */ 767 static int 768 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid) 769 { 770 struct pci_nvme_aer *aer = NULL; 771 772 if (pci_nvme_aer_limit_reached(sc)) 773 return (-1); 774 775 aer = calloc(1, sizeof(struct pci_nvme_aer)); 776 if (aer == NULL) 777 return (-1); 778 779 /* Save the Command ID for use in the completion message */ 780 aer->cid = cid; 781 782 pthread_mutex_lock(&sc->aer_mtx); 783 sc->aer_count++; 784 STAILQ_INSERT_TAIL(&sc->aer_list, aer, link); 785 pthread_mutex_unlock(&sc->aer_mtx); 786 787 return (0); 788 } 789 790 /* 791 * Get an Async Event Request structure 792 * 793 * Returns a pointer to an AER previously submitted by the host or NULL if 794 * no AER's exist. Caller is responsible for freeing the returned struct. 795 */ 796 static struct pci_nvme_aer * 797 pci_nvme_aer_get(struct pci_nvme_softc *sc) 798 { 799 struct pci_nvme_aer *aer = NULL; 800 801 pthread_mutex_lock(&sc->aer_mtx); 802 aer = STAILQ_FIRST(&sc->aer_list); 803 if (aer != NULL) { 804 STAILQ_REMOVE_HEAD(&sc->aer_list, link); 805 sc->aer_count--; 806 } 807 pthread_mutex_unlock(&sc->aer_mtx); 808 809 return (aer); 810 } 811 812 static void 813 pci_nvme_aen_reset(struct pci_nvme_softc *sc) 814 { 815 uint32_t atype; 816 817 memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen)); 818 819 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 820 sc->aen[atype].atype = atype; 821 } 822 } 823 824 static void 825 pci_nvme_aen_init(struct pci_nvme_softc *sc) 826 { 827 char nstr[80]; 828 829 pci_nvme_aen_reset(sc); 830 831 pthread_mutex_init(&sc->aen_mtx, NULL); 832 pthread_create(&sc->aen_tid, NULL, aen_thr, sc); 833 snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot, 834 sc->nsc_pi->pi_func); 835 pthread_set_name_np(sc->aen_tid, nstr); 836 } 837 838 static void 839 pci_nvme_aen_destroy(struct pci_nvme_softc *sc) 840 { 841 842 pci_nvme_aen_reset(sc); 843 } 844 845 /* Notify the AEN thread of pending work */ 846 static void 847 pci_nvme_aen_notify(struct pci_nvme_softc *sc) 848 { 849 850 pthread_cond_signal(&sc->aen_cond); 851 } 852 853 /* 854 * Post an Asynchronous Event Notification 855 */ 856 static int32_t 857 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype, 858 uint32_t event_data) 859 { 860 struct pci_nvme_aen *aen; 861 862 if (atype >= PCI_NVME_AE_TYPE_MAX) { 863 return(EINVAL); 864 } 865 866 pthread_mutex_lock(&sc->aen_mtx); 867 aen = &sc->aen[atype]; 868 869 /* Has the controller already posted an event of this type? */ 870 if (aen->posted) { 871 pthread_mutex_unlock(&sc->aen_mtx); 872 return(EALREADY); 873 } 874 875 aen->event_data = event_data; 876 aen->posted = true; 877 pthread_mutex_unlock(&sc->aen_mtx); 878 879 pci_nvme_aen_notify(sc); 880 881 return(0); 882 } 883 884 static void 885 pci_nvme_aen_process(struct pci_nvme_softc *sc) 886 { 887 struct pci_nvme_aer *aer; 888 struct pci_nvme_aen *aen; 889 pci_nvme_async_type atype; 890 uint32_t mask; 891 uint16_t status; 892 uint8_t lid; 893 894 #ifndef __FreeBSD__ 895 lid = 0; 896 #endif 897 898 assert(pthread_mutex_isowned_np(&sc->aen_mtx)); 899 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) { 900 aen = &sc->aen[atype]; 901 /* Previous iterations may have depleted the available AER's */ 902 if (!pci_nvme_aer_available(sc)) { 903 DPRINTF("%s: no AER", __func__); 904 break; 905 } 906 907 if (!aen->posted) { 908 DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype); 909 continue; 910 } 911 912 status = NVME_SC_SUCCESS; 913 914 /* Is the event masked? */ 915 mask = 916 sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11; 917 918 DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data); 919 switch (atype) { 920 case PCI_NVME_AE_TYPE_ERROR: 921 lid = NVME_LOG_ERROR; 922 break; 923 case PCI_NVME_AE_TYPE_SMART: 924 mask &= 0xff; 925 if ((mask & aen->event_data) == 0) 926 continue; 927 lid = NVME_LOG_HEALTH_INFORMATION; 928 break; 929 case PCI_NVME_AE_TYPE_NOTICE: 930 if (aen->event_data >= PCI_NVME_AE_INFO_MAX) { 931 EPRINTLN("%s unknown AEN notice type %u", 932 __func__, aen->event_data); 933 status = NVME_SC_INTERNAL_DEVICE_ERROR; 934 break; 935 } 936 mask >>= 8; 937 if (((1 << aen->event_data) & mask) == 0) 938 continue; 939 switch (aen->event_data) { 940 case PCI_NVME_AE_INFO_NS_ATTR_CHANGED: 941 lid = NVME_LOG_CHANGED_NAMESPACE; 942 break; 943 case PCI_NVME_AE_INFO_FW_ACTIVATION: 944 lid = NVME_LOG_FIRMWARE_SLOT; 945 break; 946 case PCI_NVME_AE_INFO_TELEMETRY_CHANGE: 947 lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED; 948 break; 949 case PCI_NVME_AE_INFO_ANA_CHANGE: 950 lid = NVME_LOG_ASYMMETRIC_NAMESPAVE_ACCESS; //TODO spelling 951 break; 952 case PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE: 953 lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE; 954 break; 955 case PCI_NVME_AE_INFO_LBA_STATUS_ALERT: 956 lid = NVME_LOG_LBA_STATUS_INFORMATION; 957 break; 958 case PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE: 959 lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE; 960 break; 961 default: 962 lid = 0; 963 } 964 break; 965 default: 966 /* bad type?!? */ 967 EPRINTLN("%s unknown AEN type %u", __func__, atype); 968 status = NVME_SC_INTERNAL_DEVICE_ERROR; 969 break; 970 } 971 972 aer = pci_nvme_aer_get(sc); 973 assert(aer != NULL); 974 975 DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype); 976 pci_nvme_cq_update(sc, &sc->compl_queues[0], 977 (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */ 978 aer->cid, 979 0, /* SQID */ 980 status); 981 982 aen->event_data = 0; 983 aen->posted = false; 984 985 pci_generate_msix(sc->nsc_pi, 0); 986 } 987 } 988 989 static void * 990 aen_thr(void *arg) 991 { 992 struct pci_nvme_softc *sc; 993 994 sc = arg; 995 996 pthread_mutex_lock(&sc->aen_mtx); 997 for (;;) { 998 pci_nvme_aen_process(sc); 999 pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx); 1000 } 1001 #ifdef __FreeBSD__ 1002 pthread_mutex_unlock(&sc->aen_mtx); 1003 1004 pthread_exit(NULL); 1005 #endif 1006 return (NULL); 1007 } 1008 1009 static void 1010 pci_nvme_reset_locked(struct pci_nvme_softc *sc) 1011 { 1012 uint32_t i; 1013 1014 DPRINTF("%s", __func__); 1015 1016 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | 1017 (1 << NVME_CAP_LO_REG_CQR_SHIFT) | 1018 (60 << NVME_CAP_LO_REG_TO_SHIFT); 1019 1020 sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; 1021 1022 sc->regs.vs = 0x00010300; /* NVMe v1.3 */ 1023 1024 sc->regs.cc = 0; 1025 sc->regs.csts = 0; 1026 1027 assert(sc->submit_queues != NULL); 1028 1029 for (i = 0; i < sc->num_squeues + 1; i++) { 1030 sc->submit_queues[i].qbase = NULL; 1031 sc->submit_queues[i].size = 0; 1032 sc->submit_queues[i].cqid = 0; 1033 sc->submit_queues[i].tail = 0; 1034 sc->submit_queues[i].head = 0; 1035 } 1036 1037 assert(sc->compl_queues != NULL); 1038 1039 for (i = 0; i < sc->num_cqueues + 1; i++) { 1040 sc->compl_queues[i].qbase = NULL; 1041 sc->compl_queues[i].size = 0; 1042 sc->compl_queues[i].tail = 0; 1043 sc->compl_queues[i].head = 0; 1044 } 1045 1046 sc->num_q_is_set = false; 1047 1048 pci_nvme_aer_destroy(sc); 1049 pci_nvme_aen_destroy(sc); 1050 } 1051 1052 static void 1053 pci_nvme_reset(struct pci_nvme_softc *sc) 1054 { 1055 pthread_mutex_lock(&sc->mtx); 1056 pci_nvme_reset_locked(sc); 1057 pthread_mutex_unlock(&sc->mtx); 1058 } 1059 1060 static void 1061 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) 1062 { 1063 uint16_t acqs, asqs; 1064 1065 DPRINTF("%s", __func__); 1066 1067 asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1; 1068 sc->submit_queues[0].size = asqs; 1069 sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, 1070 sizeof(struct nvme_command) * asqs); 1071 1072 DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p", 1073 __func__, sc->regs.asq, sc->submit_queues[0].qbase); 1074 1075 acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & 1076 NVME_AQA_REG_ACQS_MASK) + 1; 1077 sc->compl_queues[0].size = acqs; 1078 sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, 1079 sizeof(struct nvme_completion) * acqs); 1080 sc->compl_queues[0].intr_en = NVME_CQ_INTEN; 1081 1082 DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p", 1083 __func__, sc->regs.acq, sc->compl_queues[0].qbase); 1084 } 1085 1086 static int 1087 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b, 1088 size_t len, enum nvme_copy_dir dir) 1089 { 1090 uint8_t *p; 1091 size_t bytes; 1092 1093 if (len > (8 * 1024)) { 1094 return (-1); 1095 } 1096 1097 /* Copy from the start of prp1 to the end of the physical page */ 1098 bytes = PAGE_SIZE - (prp1 & PAGE_MASK); 1099 bytes = MIN(bytes, len); 1100 1101 p = vm_map_gpa(ctx, prp1, bytes); 1102 if (p == NULL) { 1103 return (-1); 1104 } 1105 1106 if (dir == NVME_COPY_TO_PRP) 1107 memcpy(p, b, bytes); 1108 else 1109 memcpy(b, p, bytes); 1110 1111 b += bytes; 1112 1113 len -= bytes; 1114 if (len == 0) { 1115 return (0); 1116 } 1117 1118 len = MIN(len, PAGE_SIZE); 1119 1120 p = vm_map_gpa(ctx, prp2, len); 1121 if (p == NULL) { 1122 return (-1); 1123 } 1124 1125 if (dir == NVME_COPY_TO_PRP) 1126 memcpy(p, b, len); 1127 else 1128 memcpy(b, p, len); 1129 1130 return (0); 1131 } 1132 1133 /* 1134 * Write a Completion Queue Entry update 1135 * 1136 * Write the completion and update the doorbell value 1137 */ 1138 static void 1139 pci_nvme_cq_update(struct pci_nvme_softc *sc, 1140 struct nvme_completion_queue *cq, 1141 uint32_t cdw0, 1142 uint16_t cid, 1143 uint16_t sqid, 1144 uint16_t status) 1145 { 1146 struct nvme_submission_queue *sq = &sc->submit_queues[sqid]; 1147 struct nvme_completion *cqe; 1148 1149 assert(cq->qbase != NULL); 1150 1151 pthread_mutex_lock(&cq->mtx); 1152 1153 cqe = &cq->qbase[cq->tail]; 1154 1155 /* Flip the phase bit */ 1156 status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK; 1157 1158 cqe->cdw0 = cdw0; 1159 cqe->sqhd = sq->head; 1160 cqe->sqid = sqid; 1161 cqe->cid = cid; 1162 cqe->status = status; 1163 1164 cq->tail++; 1165 if (cq->tail >= cq->size) { 1166 cq->tail = 0; 1167 } 1168 1169 pthread_mutex_unlock(&cq->mtx); 1170 } 1171 1172 static int 1173 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1174 struct nvme_completion* compl) 1175 { 1176 uint16_t qid = command->cdw10 & 0xffff; 1177 1178 DPRINTF("%s DELETE_IO_SQ %u", __func__, qid); 1179 if (qid == 0 || qid > sc->num_squeues || 1180 (sc->submit_queues[qid].qbase == NULL)) { 1181 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u", 1182 __func__, qid, sc->num_squeues); 1183 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1184 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1185 return (1); 1186 } 1187 1188 sc->submit_queues[qid].qbase = NULL; 1189 sc->submit_queues[qid].cqid = 0; 1190 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1191 return (1); 1192 } 1193 1194 static int 1195 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, 1196 struct nvme_completion* compl) 1197 { 1198 if (command->cdw11 & NVME_CMD_CDW11_PC) { 1199 uint16_t qid = command->cdw10 & 0xffff; 1200 struct nvme_submission_queue *nsq; 1201 1202 if ((qid == 0) || (qid > sc->num_squeues) || 1203 (sc->submit_queues[qid].qbase != NULL)) { 1204 WPRINTF("%s queue index %u > num_squeues %u", 1205 __func__, qid, sc->num_squeues); 1206 pci_nvme_status_tc(&compl->status, 1207 NVME_SCT_COMMAND_SPECIFIC, 1208 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1209 return (1); 1210 } 1211 1212 nsq = &sc->submit_queues[qid]; 1213 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1214 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries); 1215 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) { 1216 /* 1217 * Queues must specify at least two entries 1218 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1219 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1220 */ 1221 pci_nvme_status_tc(&compl->status, 1222 NVME_SCT_COMMAND_SPECIFIC, 1223 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1224 return (1); 1225 } 1226 nsq->head = nsq->tail = 0; 1227 1228 nsq->cqid = (command->cdw11 >> 16) & 0xffff; 1229 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) { 1230 pci_nvme_status_tc(&compl->status, 1231 NVME_SCT_COMMAND_SPECIFIC, 1232 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1233 return (1); 1234 } 1235 1236 if (sc->compl_queues[nsq->cqid].qbase == NULL) { 1237 pci_nvme_status_tc(&compl->status, 1238 NVME_SCT_COMMAND_SPECIFIC, 1239 NVME_SC_COMPLETION_QUEUE_INVALID); 1240 return (1); 1241 } 1242 1243 nsq->qpriority = (command->cdw11 >> 1) & 0x03; 1244 1245 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1246 sizeof(struct nvme_command) * (size_t)nsq->size); 1247 1248 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__, 1249 qid, nsq->size, nsq->qbase, nsq->cqid); 1250 1251 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1252 1253 DPRINTF("%s completed creating IOSQ qid %u", 1254 __func__, qid); 1255 } else { 1256 /* 1257 * Guest sent non-cont submission queue request. 1258 * This setting is unsupported by this emulation. 1259 */ 1260 WPRINTF("%s unsupported non-contig (list-based) " 1261 "create i/o submission queue", __func__); 1262 1263 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1264 } 1265 return (1); 1266 } 1267 1268 static int 1269 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1270 struct nvme_completion* compl) 1271 { 1272 uint16_t qid = command->cdw10 & 0xffff; 1273 uint16_t sqid; 1274 1275 DPRINTF("%s DELETE_IO_CQ %u", __func__, qid); 1276 if (qid == 0 || qid > sc->num_cqueues || 1277 (sc->compl_queues[qid].qbase == NULL)) { 1278 WPRINTF("%s queue index %u / num_cqueues %u", 1279 __func__, qid, sc->num_cqueues); 1280 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1281 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1282 return (1); 1283 } 1284 1285 /* Deleting an Active CQ is an error */ 1286 for (sqid = 1; sqid < sc->num_squeues + 1; sqid++) 1287 if (sc->submit_queues[sqid].cqid == qid) { 1288 pci_nvme_status_tc(&compl->status, 1289 NVME_SCT_COMMAND_SPECIFIC, 1290 NVME_SC_INVALID_QUEUE_DELETION); 1291 return (1); 1292 } 1293 1294 sc->compl_queues[qid].qbase = NULL; 1295 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1296 return (1); 1297 } 1298 1299 static int 1300 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, 1301 struct nvme_completion* compl) 1302 { 1303 struct nvme_completion_queue *ncq; 1304 uint16_t qid = command->cdw10 & 0xffff; 1305 1306 /* Only support Physically Contiguous queues */ 1307 if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) { 1308 WPRINTF("%s unsupported non-contig (list-based) " 1309 "create i/o completion queue", 1310 __func__); 1311 1312 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1313 return (1); 1314 } 1315 1316 if ((qid == 0) || (qid > sc->num_cqueues) || 1317 (sc->compl_queues[qid].qbase != NULL)) { 1318 WPRINTF("%s queue index %u > num_cqueues %u", 1319 __func__, qid, sc->num_cqueues); 1320 pci_nvme_status_tc(&compl->status, 1321 NVME_SCT_COMMAND_SPECIFIC, 1322 NVME_SC_INVALID_QUEUE_IDENTIFIER); 1323 return (1); 1324 } 1325 1326 ncq = &sc->compl_queues[qid]; 1327 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; 1328 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; 1329 if (ncq->intr_vec > (sc->max_queues + 1)) { 1330 pci_nvme_status_tc(&compl->status, 1331 NVME_SCT_COMMAND_SPECIFIC, 1332 NVME_SC_INVALID_INTERRUPT_VECTOR); 1333 return (1); 1334 } 1335 1336 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); 1337 if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) { 1338 /* 1339 * Queues must specify at least two entries 1340 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to 1341 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec 1342 */ 1343 pci_nvme_status_tc(&compl->status, 1344 NVME_SCT_COMMAND_SPECIFIC, 1345 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED); 1346 return (1); 1347 } 1348 ncq->head = ncq->tail = 0; 1349 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, 1350 command->prp1, 1351 sizeof(struct nvme_command) * (size_t)ncq->size); 1352 1353 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1354 1355 1356 return (1); 1357 } 1358 1359 static int 1360 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, 1361 struct nvme_completion* compl) 1362 { 1363 uint32_t logsize; 1364 uint8_t logpage = command->cdw10 & 0xFF; 1365 1366 #ifndef __FreeBSD__ 1367 logsize = 0; 1368 #endif 1369 1370 DPRINTF("%s log page %u len %u", __func__, logpage, logsize); 1371 1372 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1373 1374 /* 1375 * Command specifies the number of dwords to return in fields NUMDU 1376 * and NUMDL. This is a zero-based value. 1377 */ 1378 logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1; 1379 logsize *= sizeof(uint32_t); 1380 1381 switch (logpage) { 1382 case NVME_LOG_ERROR: 1383 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1384 command->prp2, (uint8_t *)&sc->err_log, 1385 MIN(logsize, sizeof(sc->err_log)), 1386 NVME_COPY_TO_PRP); 1387 break; 1388 case NVME_LOG_HEALTH_INFORMATION: 1389 pthread_mutex_lock(&sc->mtx); 1390 memcpy(&sc->health_log.data_units_read, &sc->read_data_units, 1391 sizeof(sc->health_log.data_units_read)); 1392 memcpy(&sc->health_log.data_units_written, &sc->write_data_units, 1393 sizeof(sc->health_log.data_units_written)); 1394 memcpy(&sc->health_log.host_read_commands, &sc->read_commands, 1395 sizeof(sc->health_log.host_read_commands)); 1396 memcpy(&sc->health_log.host_write_commands, &sc->write_commands, 1397 sizeof(sc->health_log.host_write_commands)); 1398 pthread_mutex_unlock(&sc->mtx); 1399 1400 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1401 command->prp2, (uint8_t *)&sc->health_log, 1402 MIN(logsize, sizeof(sc->health_log)), 1403 NVME_COPY_TO_PRP); 1404 break; 1405 case NVME_LOG_FIRMWARE_SLOT: 1406 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1407 command->prp2, (uint8_t *)&sc->fw_log, 1408 MIN(logsize, sizeof(sc->fw_log)), 1409 NVME_COPY_TO_PRP); 1410 break; 1411 case NVME_LOG_CHANGED_NAMESPACE: 1412 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1413 command->prp2, (uint8_t *)&sc->ns_log, 1414 MIN(logsize, sizeof(sc->ns_log)), 1415 NVME_COPY_TO_PRP); 1416 memset(&sc->ns_log, 0, sizeof(sc->ns_log)); 1417 break; 1418 default: 1419 DPRINTF("%s get log page %x command not supported", 1420 __func__, logpage); 1421 1422 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1423 NVME_SC_INVALID_LOG_PAGE); 1424 } 1425 1426 return (1); 1427 } 1428 1429 static int 1430 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, 1431 struct nvme_completion* compl) 1432 { 1433 void *dest; 1434 uint16_t status; 1435 1436 #ifndef __FreeBSD__ 1437 status = 0; 1438 #endif 1439 1440 DPRINTF("%s identify 0x%x nsid 0x%x", __func__, 1441 command->cdw10 & 0xFF, command->nsid); 1442 1443 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 1444 1445 switch (command->cdw10 & 0xFF) { 1446 case 0x00: /* return Identify Namespace data structure */ 1447 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1448 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata), 1449 NVME_COPY_TO_PRP); 1450 break; 1451 case 0x01: /* return Identify Controller data structure */ 1452 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, 1453 command->prp2, (uint8_t *)&sc->ctrldata, 1454 sizeof(sc->ctrldata), 1455 NVME_COPY_TO_PRP); 1456 break; 1457 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ 1458 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1459 sizeof(uint32_t) * 1024); 1460 /* All unused entries shall be zero */ 1461 bzero(dest, sizeof(uint32_t) * 1024); 1462 ((uint32_t *)dest)[0] = 1; 1463 break; 1464 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ 1465 if (command->nsid != 1) { 1466 pci_nvme_status_genc(&status, 1467 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 1468 break; 1469 } 1470 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, 1471 sizeof(uint32_t) * 1024); 1472 /* All bytes after the descriptor shall be zero */ 1473 bzero(dest, sizeof(uint32_t) * 1024); 1474 1475 /* Return NIDT=1 (i.e. EUI64) descriptor */ 1476 ((uint8_t *)dest)[0] = 1; 1477 ((uint8_t *)dest)[1] = sizeof(uint64_t); 1478 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t)); 1479 break; 1480 default: 1481 DPRINTF("%s unsupported identify command requested 0x%x", 1482 __func__, command->cdw10 & 0xFF); 1483 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD); 1484 break; 1485 } 1486 1487 compl->status = status; 1488 return (1); 1489 } 1490 1491 static const char * 1492 nvme_fid_to_name(uint8_t fid) 1493 { 1494 const char *name; 1495 1496 switch (fid) { 1497 case NVME_FEAT_ARBITRATION: 1498 name = "Arbitration"; 1499 break; 1500 case NVME_FEAT_POWER_MANAGEMENT: 1501 name = "Power Management"; 1502 break; 1503 case NVME_FEAT_LBA_RANGE_TYPE: 1504 name = "LBA Range Type"; 1505 break; 1506 case NVME_FEAT_TEMPERATURE_THRESHOLD: 1507 name = "Temperature Threshold"; 1508 break; 1509 case NVME_FEAT_ERROR_RECOVERY: 1510 name = "Error Recovery"; 1511 break; 1512 case NVME_FEAT_VOLATILE_WRITE_CACHE: 1513 name = "Volatile Write Cache"; 1514 break; 1515 case NVME_FEAT_NUMBER_OF_QUEUES: 1516 name = "Number of Queues"; 1517 break; 1518 case NVME_FEAT_INTERRUPT_COALESCING: 1519 name = "Interrupt Coalescing"; 1520 break; 1521 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: 1522 name = "Interrupt Vector Configuration"; 1523 break; 1524 case NVME_FEAT_WRITE_ATOMICITY: 1525 name = "Write Atomicity Normal"; 1526 break; 1527 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: 1528 name = "Asynchronous Event Configuration"; 1529 break; 1530 case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: 1531 name = "Autonomous Power State Transition"; 1532 break; 1533 case NVME_FEAT_HOST_MEMORY_BUFFER: 1534 name = "Host Memory Buffer"; 1535 break; 1536 case NVME_FEAT_TIMESTAMP: 1537 name = "Timestamp"; 1538 break; 1539 case NVME_FEAT_KEEP_ALIVE_TIMER: 1540 name = "Keep Alive Timer"; 1541 break; 1542 case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT: 1543 name = "Host Controlled Thermal Management"; 1544 break; 1545 case NVME_FEAT_NON_OP_POWER_STATE_CONFIG: 1546 name = "Non-Operation Power State Config"; 1547 break; 1548 case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG: 1549 name = "Read Recovery Level Config"; 1550 break; 1551 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG: 1552 name = "Predictable Latency Mode Config"; 1553 break; 1554 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW: 1555 name = "Predictable Latency Mode Window"; 1556 break; 1557 case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES: 1558 name = "LBA Status Information Report Interval"; 1559 break; 1560 case NVME_FEAT_HOST_BEHAVIOR_SUPPORT: 1561 name = "Host Behavior Support"; 1562 break; 1563 case NVME_FEAT_SANITIZE_CONFIG: 1564 name = "Sanitize Config"; 1565 break; 1566 case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION: 1567 name = "Endurance Group Event Configuration"; 1568 break; 1569 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: 1570 name = "Software Progress Marker"; 1571 break; 1572 case NVME_FEAT_HOST_IDENTIFIER: 1573 name = "Host Identifier"; 1574 break; 1575 case NVME_FEAT_RESERVATION_NOTIFICATION_MASK: 1576 name = "Reservation Notification Mask"; 1577 break; 1578 case NVME_FEAT_RESERVATION_PERSISTENCE: 1579 name = "Reservation Persistence"; 1580 break; 1581 case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG: 1582 name = "Namespace Write Protection Config"; 1583 break; 1584 default: 1585 name = "Unknown"; 1586 break; 1587 } 1588 1589 return (name); 1590 } 1591 1592 static void 1593 nvme_feature_invalid_cb(struct pci_nvme_softc *sc, 1594 struct nvme_feature_obj *feat, 1595 struct nvme_command *command, 1596 struct nvme_completion *compl) 1597 { 1598 1599 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1600 } 1601 1602 static void 1603 nvme_feature_iv_config(struct pci_nvme_softc *sc, 1604 struct nvme_feature_obj *feat, 1605 struct nvme_command *command, 1606 struct nvme_completion *compl) 1607 { 1608 uint32_t i; 1609 uint32_t cdw11 = command->cdw11; 1610 uint16_t iv; 1611 bool cd; 1612 1613 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1614 1615 iv = cdw11 & 0xffff; 1616 cd = cdw11 & (1 << 16); 1617 1618 if (iv > (sc->max_queues + 1)) { 1619 return; 1620 } 1621 1622 /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */ 1623 if ((iv == 0) && !cd) 1624 return; 1625 1626 /* Requested Interrupt Vector must be used by a CQ */ 1627 for (i = 0; i < sc->num_cqueues + 1; i++) { 1628 if (sc->compl_queues[i].intr_vec == iv) { 1629 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1630 } 1631 } 1632 1633 } 1634 1635 static void 1636 nvme_feature_num_queues(struct pci_nvme_softc *sc, 1637 struct nvme_feature_obj *feat, 1638 struct nvme_command *command, 1639 struct nvme_completion *compl) 1640 { 1641 uint16_t nqr; /* Number of Queues Requested */ 1642 1643 if (sc->num_q_is_set) { 1644 WPRINTF("%s: Number of Queues already set", __func__); 1645 pci_nvme_status_genc(&compl->status, 1646 NVME_SC_COMMAND_SEQUENCE_ERROR); 1647 return; 1648 } 1649 1650 nqr = command->cdw11 & 0xFFFF; 1651 if (nqr == 0xffff) { 1652 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr); 1653 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1654 return; 1655 } 1656 1657 sc->num_squeues = ONE_BASED(nqr); 1658 if (sc->num_squeues > sc->max_queues) { 1659 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues, 1660 sc->max_queues); 1661 sc->num_squeues = sc->max_queues; 1662 } 1663 1664 nqr = (command->cdw11 >> 16) & 0xFFFF; 1665 if (nqr == 0xffff) { 1666 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr); 1667 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1668 return; 1669 } 1670 1671 sc->num_cqueues = ONE_BASED(nqr); 1672 if (sc->num_cqueues > sc->max_queues) { 1673 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues, 1674 sc->max_queues); 1675 sc->num_cqueues = sc->max_queues; 1676 } 1677 1678 /* Patch the command value which will be saved on callback's return */ 1679 command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc); 1680 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); 1681 1682 sc->num_q_is_set = true; 1683 } 1684 1685 static int 1686 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command, 1687 struct nvme_completion *compl) 1688 { 1689 struct nvme_feature_obj *feat; 1690 uint32_t nsid = command->nsid; 1691 uint8_t fid = command->cdw10 & 0xFF; 1692 1693 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1694 1695 if (fid >= NVME_FID_MAX) { 1696 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1697 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1698 return (1); 1699 } 1700 feat = &sc->feat[fid]; 1701 1702 if (!feat->namespace_specific && 1703 !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) { 1704 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1705 NVME_SC_FEATURE_NOT_NS_SPECIFIC); 1706 return (1); 1707 } 1708 1709 compl->cdw0 = 0; 1710 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1711 1712 if (feat->set) 1713 feat->set(sc, feat, command, compl); 1714 1715 DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11); 1716 if (compl->status == NVME_SC_SUCCESS) { 1717 feat->cdw11 = command->cdw11; 1718 if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) && 1719 (command->cdw11 != 0)) 1720 pci_nvme_aen_notify(sc); 1721 } 1722 1723 return (0); 1724 } 1725 1726 static int 1727 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, 1728 struct nvme_completion* compl) 1729 { 1730 struct nvme_feature_obj *feat; 1731 uint8_t fid = command->cdw10 & 0xFF; 1732 1733 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid)); 1734 1735 if (fid >= NVME_FID_MAX) { 1736 DPRINTF("%s invalid feature 0x%x", __func__, fid); 1737 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1738 return (1); 1739 } 1740 1741 compl->cdw0 = 0; 1742 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1743 1744 feat = &sc->feat[fid]; 1745 if (feat->get) { 1746 feat->get(sc, feat, command, compl); 1747 } 1748 1749 if (compl->status == NVME_SC_SUCCESS) { 1750 compl->cdw0 = feat->cdw11; 1751 } 1752 1753 return (0); 1754 } 1755 1756 static int 1757 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command, 1758 struct nvme_completion* compl) 1759 { 1760 uint8_t ses, lbaf, pi; 1761 1762 /* Only supports Secure Erase Setting - User Data Erase */ 1763 ses = (command->cdw10 >> 9) & 0x7; 1764 if (ses > 0x1) { 1765 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1766 return (1); 1767 } 1768 1769 /* Only supports a single LBA Format */ 1770 lbaf = command->cdw10 & 0xf; 1771 if (lbaf != 0) { 1772 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1773 NVME_SC_INVALID_FORMAT); 1774 return (1); 1775 } 1776 1777 /* Doesn't support Protection Infomation */ 1778 pi = (command->cdw10 >> 5) & 0x7; 1779 if (pi != 0) { 1780 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); 1781 return (1); 1782 } 1783 1784 if (sc->nvstore.type == NVME_STOR_RAM) { 1785 if (sc->nvstore.ctx) 1786 free(sc->nvstore.ctx); 1787 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 1788 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1789 } else { 1790 struct pci_nvme_ioreq *req; 1791 int err; 1792 1793 req = pci_nvme_get_ioreq(sc); 1794 if (req == NULL) { 1795 pci_nvme_status_genc(&compl->status, 1796 NVME_SC_INTERNAL_DEVICE_ERROR); 1797 WPRINTF("%s: unable to allocate IO req", __func__); 1798 return (1); 1799 } 1800 req->nvme_sq = &sc->submit_queues[0]; 1801 req->sqid = 0; 1802 req->opc = command->opc; 1803 req->cid = command->cid; 1804 req->nsid = command->nsid; 1805 1806 req->io_req.br_offset = 0; 1807 req->io_req.br_resid = sc->nvstore.size; 1808 req->io_req.br_callback = pci_nvme_io_done; 1809 1810 err = blockif_delete(sc->nvstore.ctx, &req->io_req); 1811 if (err) { 1812 pci_nvme_status_genc(&compl->status, 1813 NVME_SC_INTERNAL_DEVICE_ERROR); 1814 pci_nvme_release_ioreq(sc, req); 1815 } 1816 } 1817 1818 return (1); 1819 } 1820 1821 static int 1822 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command, 1823 struct nvme_completion* compl) 1824 { 1825 DPRINTF("%s submission queue %u, command ID 0x%x", __func__, 1826 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF); 1827 1828 /* TODO: search for the command ID and abort it */ 1829 1830 compl->cdw0 = 1; 1831 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); 1832 return (1); 1833 } 1834 1835 static int 1836 nvme_opc_async_event_req(struct pci_nvme_softc* sc, 1837 struct nvme_command* command, struct nvme_completion* compl) 1838 { 1839 DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__, 1840 sc->aer_count, sc->ctrldata.aerl, command->cid); 1841 1842 /* Don't exceed the Async Event Request Limit (AERL). */ 1843 if (pci_nvme_aer_limit_reached(sc)) { 1844 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 1845 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 1846 return (1); 1847 } 1848 1849 if (pci_nvme_aer_add(sc, command->cid)) { 1850 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 1851 NVME_SC_INTERNAL_DEVICE_ERROR); 1852 return (1); 1853 } 1854 1855 /* 1856 * Raise events when they happen based on the Set Features cmd. 1857 * These events happen async, so only set completion successful if 1858 * there is an event reflective of the request to get event. 1859 */ 1860 compl->status = NVME_NO_STATUS; 1861 pci_nvme_aen_notify(sc); 1862 1863 return (0); 1864 } 1865 1866 static void 1867 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 1868 { 1869 struct nvme_completion compl; 1870 struct nvme_command *cmd; 1871 struct nvme_submission_queue *sq; 1872 struct nvme_completion_queue *cq; 1873 uint16_t sqhead; 1874 1875 DPRINTF("%s index %u", __func__, (uint32_t)value); 1876 1877 sq = &sc->submit_queues[0]; 1878 cq = &sc->compl_queues[0]; 1879 1880 pthread_mutex_lock(&sq->mtx); 1881 1882 sqhead = sq->head; 1883 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 1884 1885 while (sqhead != atomic_load_acq_short(&sq->tail)) { 1886 cmd = &(sq->qbase)[sqhead]; 1887 compl.cdw0 = 0; 1888 compl.status = 0; 1889 1890 switch (cmd->opc) { 1891 case NVME_OPC_DELETE_IO_SQ: 1892 DPRINTF("%s command DELETE_IO_SQ", __func__); 1893 nvme_opc_delete_io_sq(sc, cmd, &compl); 1894 break; 1895 case NVME_OPC_CREATE_IO_SQ: 1896 DPRINTF("%s command CREATE_IO_SQ", __func__); 1897 nvme_opc_create_io_sq(sc, cmd, &compl); 1898 break; 1899 case NVME_OPC_DELETE_IO_CQ: 1900 DPRINTF("%s command DELETE_IO_CQ", __func__); 1901 nvme_opc_delete_io_cq(sc, cmd, &compl); 1902 break; 1903 case NVME_OPC_CREATE_IO_CQ: 1904 DPRINTF("%s command CREATE_IO_CQ", __func__); 1905 nvme_opc_create_io_cq(sc, cmd, &compl); 1906 break; 1907 case NVME_OPC_GET_LOG_PAGE: 1908 DPRINTF("%s command GET_LOG_PAGE", __func__); 1909 nvme_opc_get_log_page(sc, cmd, &compl); 1910 break; 1911 case NVME_OPC_IDENTIFY: 1912 DPRINTF("%s command IDENTIFY", __func__); 1913 nvme_opc_identify(sc, cmd, &compl); 1914 break; 1915 case NVME_OPC_ABORT: 1916 DPRINTF("%s command ABORT", __func__); 1917 nvme_opc_abort(sc, cmd, &compl); 1918 break; 1919 case NVME_OPC_SET_FEATURES: 1920 DPRINTF("%s command SET_FEATURES", __func__); 1921 nvme_opc_set_features(sc, cmd, &compl); 1922 break; 1923 case NVME_OPC_GET_FEATURES: 1924 DPRINTF("%s command GET_FEATURES", __func__); 1925 nvme_opc_get_features(sc, cmd, &compl); 1926 break; 1927 case NVME_OPC_FIRMWARE_ACTIVATE: 1928 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 1929 pci_nvme_status_tc(&compl.status, 1930 NVME_SCT_COMMAND_SPECIFIC, 1931 NVME_SC_INVALID_FIRMWARE_SLOT); 1932 break; 1933 case NVME_OPC_ASYNC_EVENT_REQUEST: 1934 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 1935 nvme_opc_async_event_req(sc, cmd, &compl); 1936 break; 1937 case NVME_OPC_FORMAT_NVM: 1938 DPRINTF("%s command FORMAT_NVM", __func__); 1939 if ((sc->ctrldata.oacs & 1940 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { 1941 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 1942 } 1943 compl.status = NVME_NO_STATUS; 1944 nvme_opc_format_nvm(sc, cmd, &compl); 1945 break; 1946 default: 1947 DPRINTF("0x%x command is not implemented", 1948 cmd->opc); 1949 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 1950 } 1951 sqhead = (sqhead + 1) % sq->size; 1952 1953 if (NVME_COMPLETION_VALID(compl)) { 1954 pci_nvme_cq_update(sc, &sc->compl_queues[0], 1955 compl.cdw0, 1956 cmd->cid, 1957 0, /* SQID */ 1958 compl.status); 1959 } 1960 } 1961 1962 DPRINTF("setting sqhead %u", sqhead); 1963 sq->head = sqhead; 1964 1965 if (cq->head != cq->tail) 1966 pci_generate_msix(sc->nsc_pi, 0); 1967 1968 pthread_mutex_unlock(&sq->mtx); 1969 } 1970 1971 /* 1972 * Update the Write and Read statistics reported in SMART data 1973 * 1974 * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up. 1975 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 1976 * 512 byte blocks. Rounding up is acheived by initializing the remainder to 999. 1977 */ 1978 static void 1979 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, 1980 size_t bytes, uint16_t status) 1981 { 1982 1983 pthread_mutex_lock(&sc->mtx); 1984 switch (opc) { 1985 case NVME_OPC_WRITE: 1986 sc->write_commands++; 1987 if (status != NVME_SC_SUCCESS) 1988 break; 1989 sc->write_dunits_remainder += (bytes / 512); 1990 while (sc->write_dunits_remainder >= 1000) { 1991 sc->write_data_units++; 1992 sc->write_dunits_remainder -= 1000; 1993 } 1994 break; 1995 case NVME_OPC_READ: 1996 sc->read_commands++; 1997 if (status != NVME_SC_SUCCESS) 1998 break; 1999 sc->read_dunits_remainder += (bytes / 512); 2000 while (sc->read_dunits_remainder >= 1000) { 2001 sc->read_data_units++; 2002 sc->read_dunits_remainder -= 1000; 2003 } 2004 break; 2005 default: 2006 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); 2007 break; 2008 } 2009 pthread_mutex_unlock(&sc->mtx); 2010 } 2011 2012 /* 2013 * Check if the combination of Starting LBA (slba) and Number of Logical 2014 * Blocks (nlb) exceeds the range of the underlying storage. 2015 * 2016 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores 2017 * the capacity in bytes as a uint64_t, care must be taken to avoid integer 2018 * overflow. 2019 */ 2020 static bool 2021 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, 2022 uint32_t nlb) 2023 { 2024 size_t offset, bytes; 2025 2026 /* Overflow check of multiplying Starting LBA by the sector size */ 2027 if (slba >> (64 - nvstore->sectsz_bits)) 2028 return (true); 2029 2030 offset = slba << nvstore->sectsz_bits; 2031 bytes = nlb << nvstore->sectsz_bits; 2032 2033 /* Overflow check of Number of Logical Blocks */ 2034 if ((nvstore->size - offset) < bytes) 2035 return (true); 2036 2037 return (false); 2038 } 2039 2040 static int 2041 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, 2042 uint64_t gpaddr, size_t size, int do_write, uint64_t lba) 2043 { 2044 int iovidx; 2045 2046 if (req == NULL) 2047 return (-1); 2048 2049 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { 2050 return (-1); 2051 } 2052 2053 /* concatenate contig block-iovs to minimize number of iovs */ 2054 if ((req->prev_gpaddr + req->prev_size) == gpaddr) { 2055 iovidx = req->io_req.br_iovcnt - 1; 2056 2057 req->io_req.br_iov[iovidx].iov_base = 2058 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2059 req->prev_gpaddr, size); 2060 2061 req->prev_size += size; 2062 req->io_req.br_resid += size; 2063 2064 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 2065 } else { 2066 iovidx = req->io_req.br_iovcnt; 2067 if (iovidx == 0) { 2068 req->io_req.br_offset = lba; 2069 req->io_req.br_resid = 0; 2070 req->io_req.br_param = req; 2071 } 2072 2073 req->io_req.br_iov[iovidx].iov_base = 2074 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2075 gpaddr, size); 2076 2077 req->io_req.br_iov[iovidx].iov_len = size; 2078 2079 req->prev_gpaddr = gpaddr; 2080 req->prev_size = size; 2081 req->io_req.br_resid += size; 2082 2083 req->io_req.br_iovcnt++; 2084 } 2085 2086 return (0); 2087 } 2088 2089 static void 2090 pci_nvme_set_completion(struct pci_nvme_softc *sc, 2091 struct nvme_submission_queue *sq, int sqid, uint16_t cid, 2092 uint32_t cdw0, uint16_t status) 2093 { 2094 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 2095 2096 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 2097 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 2098 NVME_STATUS_GET_SC(status)); 2099 2100 pci_nvme_cq_update(sc, cq, 2101 0, /* CDW0 */ 2102 cid, 2103 sqid, 2104 status); 2105 2106 if (cq->head != cq->tail) { 2107 if (cq->intr_en & NVME_CQ_INTEN) { 2108 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 2109 } else { 2110 DPRINTF("%s: CQ%u interrupt disabled", 2111 __func__, sq->cqid); 2112 } 2113 } 2114 } 2115 2116 static void 2117 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 2118 { 2119 req->sc = NULL; 2120 req->nvme_sq = NULL; 2121 req->sqid = 0; 2122 2123 pthread_mutex_lock(&sc->mtx); 2124 2125 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 2126 sc->pending_ios--; 2127 2128 /* when no more IO pending, can set to ready if device reset/enabled */ 2129 if (sc->pending_ios == 0 && 2130 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 2131 sc->regs.csts |= NVME_CSTS_RDY; 2132 2133 pthread_mutex_unlock(&sc->mtx); 2134 2135 sem_post(&sc->iosemlock); 2136 } 2137 2138 static struct pci_nvme_ioreq * 2139 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 2140 { 2141 struct pci_nvme_ioreq *req = NULL; 2142 2143 sem_wait(&sc->iosemlock); 2144 pthread_mutex_lock(&sc->mtx); 2145 2146 req = STAILQ_FIRST(&sc->ioreqs_free); 2147 assert(req != NULL); 2148 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 2149 2150 req->sc = sc; 2151 2152 sc->pending_ios++; 2153 2154 pthread_mutex_unlock(&sc->mtx); 2155 2156 req->io_req.br_iovcnt = 0; 2157 req->io_req.br_offset = 0; 2158 req->io_req.br_resid = 0; 2159 req->io_req.br_param = req; 2160 req->prev_gpaddr = 0; 2161 req->prev_size = 0; 2162 2163 return req; 2164 } 2165 2166 static void 2167 pci_nvme_io_done(struct blockif_req *br, int err) 2168 { 2169 struct pci_nvme_ioreq *req = br->br_param; 2170 struct nvme_submission_queue *sq = req->nvme_sq; 2171 uint16_t code, status; 2172 2173 #ifndef __FreeBSD__ 2174 status = 0; 2175 #endif 2176 2177 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 2178 2179 /* TODO return correct error */ 2180 code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 2181 pci_nvme_status_genc(&status, code); 2182 2183 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status); 2184 pci_nvme_stats_write_read_update(req->sc, req->opc, 2185 req->bytes, status); 2186 pci_nvme_release_ioreq(req->sc, req); 2187 } 2188 2189 /* 2190 * Implements the Flush command. The specification states: 2191 * If a volatile write cache is not present, Flush commands complete 2192 * successfully and have no effect 2193 * in the description of the Volatile Write Cache (VWC) field of the Identify 2194 * Controller data. Therefore, set status to Success if the command is 2195 * not supported (i.e. RAM or as indicated by the blockif). 2196 */ 2197 static bool 2198 nvme_opc_flush(struct pci_nvme_softc *sc, 2199 struct nvme_command *cmd, 2200 struct pci_nvme_blockstore *nvstore, 2201 struct pci_nvme_ioreq *req, 2202 uint16_t *status) 2203 { 2204 bool pending = false; 2205 2206 if (nvstore->type == NVME_STOR_RAM) { 2207 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2208 } else { 2209 int err; 2210 2211 req->io_req.br_callback = pci_nvme_io_done; 2212 2213 err = blockif_flush(nvstore->ctx, &req->io_req); 2214 switch (err) { 2215 case 0: 2216 pending = true; 2217 break; 2218 case EOPNOTSUPP: 2219 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2220 break; 2221 default: 2222 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2223 } 2224 } 2225 2226 return (pending); 2227 } 2228 2229 static uint16_t 2230 nvme_write_read_ram(struct pci_nvme_softc *sc, 2231 struct pci_nvme_blockstore *nvstore, 2232 uint64_t prp1, uint64_t prp2, 2233 size_t offset, uint64_t bytes, 2234 bool is_write) 2235 { 2236 uint8_t *buf = nvstore->ctx; 2237 enum nvme_copy_dir dir; 2238 uint16_t status; 2239 2240 #ifndef __FreeBSD__ 2241 status = 0; 2242 #endif 2243 2244 if (is_write) 2245 dir = NVME_COPY_TO_PRP; 2246 else 2247 dir = NVME_COPY_FROM_PRP; 2248 2249 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, 2250 buf + offset, bytes, dir)) 2251 pci_nvme_status_genc(&status, 2252 NVME_SC_DATA_TRANSFER_ERROR); 2253 else 2254 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2255 2256 return (status); 2257 } 2258 2259 static uint16_t 2260 nvme_write_read_blockif(struct pci_nvme_softc *sc, 2261 struct pci_nvme_blockstore *nvstore, 2262 struct pci_nvme_ioreq *req, 2263 uint64_t prp1, uint64_t prp2, 2264 size_t offset, uint64_t bytes, 2265 bool is_write) 2266 { 2267 uint64_t size; 2268 int err; 2269 uint16_t status = NVME_NO_STATUS; 2270 2271 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); 2272 if (pci_nvme_append_iov_req(sc, req, prp1, 2273 size, is_write, offset)) { 2274 pci_nvme_status_genc(&status, 2275 NVME_SC_DATA_TRANSFER_ERROR); 2276 goto out; 2277 } 2278 2279 offset += size; 2280 bytes -= size; 2281 2282 if (bytes == 0) { 2283 ; 2284 } else if (bytes <= PAGE_SIZE) { 2285 size = bytes; 2286 if (pci_nvme_append_iov_req(sc, req, prp2, 2287 size, is_write, offset)) { 2288 pci_nvme_status_genc(&status, 2289 NVME_SC_DATA_TRANSFER_ERROR); 2290 goto out; 2291 } 2292 } else { 2293 void *vmctx = sc->nsc_pi->pi_vmctx; 2294 uint64_t *prp_list = &prp2; 2295 uint64_t *last = prp_list; 2296 2297 /* PRP2 is pointer to a physical region page list */ 2298 while (bytes) { 2299 /* Last entry in list points to the next list */ 2300 if ((prp_list == last) && (bytes > PAGE_SIZE)) { 2301 uint64_t prp = *prp_list; 2302 2303 prp_list = paddr_guest2host(vmctx, prp, 2304 PAGE_SIZE - (prp % PAGE_SIZE)); 2305 last = prp_list + (NVME_PRP2_ITEMS - 1); 2306 } 2307 2308 size = MIN(bytes, PAGE_SIZE); 2309 2310 if (pci_nvme_append_iov_req(sc, req, *prp_list, 2311 size, is_write, offset)) { 2312 pci_nvme_status_genc(&status, 2313 NVME_SC_DATA_TRANSFER_ERROR); 2314 goto out; 2315 } 2316 2317 offset += size; 2318 bytes -= size; 2319 2320 prp_list++; 2321 } 2322 } 2323 req->io_req.br_callback = pci_nvme_io_done; 2324 if (is_write) 2325 err = blockif_write(nvstore->ctx, &req->io_req); 2326 else 2327 err = blockif_read(nvstore->ctx, &req->io_req); 2328 2329 if (err) 2330 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2331 out: 2332 return (status); 2333 } 2334 2335 static bool 2336 nvme_opc_write_read(struct pci_nvme_softc *sc, 2337 struct nvme_command *cmd, 2338 struct pci_nvme_blockstore *nvstore, 2339 struct pci_nvme_ioreq *req, 2340 uint16_t *status) 2341 { 2342 uint64_t lba, nblocks, bytes; 2343 size_t offset; 2344 bool is_write = cmd->opc == NVME_OPC_WRITE; 2345 bool pending = false; 2346 2347 #ifndef __FreeBSD__ 2348 bytes = 0; 2349 #endif 2350 2351 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2352 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2353 2354 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2355 WPRINTF("%s command would exceed LBA range", __func__); 2356 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2357 goto out; 2358 } 2359 2360 bytes = nblocks << nvstore->sectsz_bits; 2361 if (bytes > NVME_MAX_DATA_SIZE) { 2362 WPRINTF("%s command would exceed MDTS", __func__); 2363 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2364 goto out; 2365 } 2366 2367 offset = lba << nvstore->sectsz_bits; 2368 2369 req->bytes = bytes; 2370 req->io_req.br_offset = lba; 2371 2372 /* PRP bits 1:0 must be zero */ 2373 cmd->prp1 &= ~0x3UL; 2374 cmd->prp2 &= ~0x3UL; 2375 2376 if (nvstore->type == NVME_STOR_RAM) { 2377 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2378 cmd->prp2, offset, bytes, is_write); 2379 } else { 2380 *status = nvme_write_read_blockif(sc, nvstore, req, 2381 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2382 2383 if (*status == NVME_NO_STATUS) 2384 pending = true; 2385 } 2386 out: 2387 if (!pending) 2388 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2389 2390 return (pending); 2391 } 2392 2393 static void 2394 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2395 { 2396 struct pci_nvme_ioreq *req = br->br_param; 2397 struct pci_nvme_softc *sc = req->sc; 2398 bool done = true; 2399 uint16_t status; 2400 2401 #ifndef __FreeBSD__ 2402 status = 0; 2403 #endif 2404 2405 if (err) { 2406 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2407 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2408 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2409 } else { 2410 struct iovec *iov = req->io_req.br_iov; 2411 2412 req->prev_gpaddr++; 2413 iov += req->prev_gpaddr; 2414 2415 /* The iov_* values already include the sector size */ 2416 req->io_req.br_offset = (off_t)iov->iov_base; 2417 req->io_req.br_resid = iov->iov_len; 2418 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2419 pci_nvme_status_genc(&status, 2420 NVME_SC_INTERNAL_DEVICE_ERROR); 2421 } else 2422 done = false; 2423 } 2424 2425 if (done) { 2426 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, 2427 req->cid, 0, status); 2428 pci_nvme_release_ioreq(sc, req); 2429 } 2430 } 2431 2432 static bool 2433 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 2434 struct nvme_command *cmd, 2435 struct pci_nvme_blockstore *nvstore, 2436 struct pci_nvme_ioreq *req, 2437 uint16_t *status) 2438 { 2439 struct nvme_dsm_range *range; 2440 uint32_t nr, r, non_zero, dr; 2441 int err; 2442 bool pending = false; 2443 2444 #ifndef __FreeBSD__ 2445 range = NULL; 2446 #endif 2447 2448 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { 2449 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 2450 goto out; 2451 } 2452 2453 nr = cmd->cdw10 & 0xff; 2454 2455 /* copy locally because a range entry could straddle PRPs */ 2456 range = calloc(1, NVME_MAX_DSM_TRIM); 2457 if (range == NULL) { 2458 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2459 goto out; 2460 } 2461 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 2462 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 2463 2464 /* Check for invalid ranges and the number of non-zero lengths */ 2465 non_zero = 0; 2466 for (r = 0; r <= nr; r++) { 2467 if (pci_nvme_out_of_range(nvstore, 2468 range[r].starting_lba, range[r].length)) { 2469 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2470 goto out; 2471 } 2472 if (range[r].length != 0) 2473 non_zero++; 2474 } 2475 2476 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 2477 size_t offset, bytes; 2478 int sectsz_bits = sc->nvstore.sectsz_bits; 2479 2480 /* 2481 * DSM calls are advisory only, and compliant controllers 2482 * may choose to take no actions (i.e. return Success). 2483 */ 2484 if (!nvstore->deallocate) { 2485 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2486 goto out; 2487 } 2488 2489 /* If all ranges have a zero length, return Success */ 2490 if (non_zero == 0) { 2491 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2492 goto out; 2493 } 2494 2495 if (req == NULL) { 2496 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2497 goto out; 2498 } 2499 2500 offset = range[0].starting_lba << sectsz_bits; 2501 bytes = range[0].length << sectsz_bits; 2502 2503 /* 2504 * If the request is for more than a single range, store 2505 * the ranges in the br_iov. Optimize for the common case 2506 * of a single range. 2507 * 2508 * Note that NVMe Number of Ranges is a zero based value 2509 */ 2510 req->io_req.br_iovcnt = 0; 2511 req->io_req.br_offset = offset; 2512 req->io_req.br_resid = bytes; 2513 2514 if (nr == 0) { 2515 req->io_req.br_callback = pci_nvme_io_done; 2516 } else { 2517 struct iovec *iov = req->io_req.br_iov; 2518 2519 for (r = 0, dr = 0; r <= nr; r++) { 2520 offset = range[r].starting_lba << sectsz_bits; 2521 bytes = range[r].length << sectsz_bits; 2522 if (bytes == 0) 2523 continue; 2524 2525 if ((nvstore->size - offset) < bytes) { 2526 pci_nvme_status_genc(status, 2527 NVME_SC_LBA_OUT_OF_RANGE); 2528 goto out; 2529 } 2530 iov[dr].iov_base = (void *)offset; 2531 iov[dr].iov_len = bytes; 2532 dr++; 2533 } 2534 req->io_req.br_callback = pci_nvme_dealloc_sm; 2535 2536 /* 2537 * Use prev_gpaddr to track the current entry and 2538 * prev_size to track the number of entries 2539 */ 2540 req->prev_gpaddr = 0; 2541 req->prev_size = dr; 2542 } 2543 2544 err = blockif_delete(nvstore->ctx, &req->io_req); 2545 if (err) 2546 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2547 else 2548 pending = true; 2549 } 2550 out: 2551 free(range); 2552 return (pending); 2553 } 2554 2555 static void 2556 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2557 { 2558 struct nvme_submission_queue *sq; 2559 uint16_t status; 2560 uint16_t sqhead; 2561 2562 #ifndef __FreeBSD__ 2563 status = 0; 2564 #endif 2565 2566 /* handle all submissions up to sq->tail index */ 2567 sq = &sc->submit_queues[idx]; 2568 2569 pthread_mutex_lock(&sq->mtx); 2570 2571 sqhead = sq->head; 2572 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2573 idx, sqhead, sq->tail, sq->qbase); 2574 2575 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2576 struct nvme_command *cmd; 2577 struct pci_nvme_ioreq *req; 2578 uint32_t nsid; 2579 bool pending; 2580 2581 pending = false; 2582 req = NULL; 2583 status = 0; 2584 2585 cmd = &sq->qbase[sqhead]; 2586 sqhead = (sqhead + 1) % sq->size; 2587 2588 nsid = le32toh(cmd->nsid); 2589 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2590 pci_nvme_status_genc(&status, 2591 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2592 status |= 2593 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; 2594 goto complete; 2595 } 2596 2597 req = pci_nvme_get_ioreq(sc); 2598 if (req == NULL) { 2599 pci_nvme_status_genc(&status, 2600 NVME_SC_INTERNAL_DEVICE_ERROR); 2601 WPRINTF("%s: unable to allocate IO req", __func__); 2602 goto complete; 2603 } 2604 req->nvme_sq = sq; 2605 req->sqid = idx; 2606 req->opc = cmd->opc; 2607 req->cid = cmd->cid; 2608 req->nsid = cmd->nsid; 2609 2610 switch (cmd->opc) { 2611 case NVME_OPC_FLUSH: 2612 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2613 req, &status); 2614 break; 2615 case NVME_OPC_WRITE: 2616 case NVME_OPC_READ: 2617 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2618 req, &status); 2619 break; 2620 case NVME_OPC_WRITE_ZEROES: 2621 /* TODO: write zeroes 2622 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2623 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2624 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2625 break; 2626 case NVME_OPC_DATASET_MANAGEMENT: 2627 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2628 req, &status); 2629 break; 2630 default: 2631 WPRINTF("%s unhandled io command 0x%x", 2632 __func__, cmd->opc); 2633 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2634 } 2635 complete: 2636 if (!pending) { 2637 pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, 2638 status); 2639 if (req != NULL) 2640 pci_nvme_release_ioreq(sc, req); 2641 } 2642 } 2643 2644 sq->head = sqhead; 2645 2646 pthread_mutex_unlock(&sq->mtx); 2647 } 2648 2649 static void 2650 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc, 2651 uint64_t idx, int is_sq, uint64_t value) 2652 { 2653 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2654 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); 2655 2656 if (is_sq) { 2657 if (idx > sc->num_squeues) { 2658 WPRINTF("%s queue index %lu overflow from " 2659 "guest (max %u)", 2660 __func__, idx, sc->num_squeues); 2661 return; 2662 } 2663 2664 atomic_store_short(&sc->submit_queues[idx].tail, 2665 (uint16_t)value); 2666 2667 if (idx == 0) { 2668 pci_nvme_handle_admin_cmd(sc, value); 2669 } else { 2670 /* submission queue; handle new entries in SQ */ 2671 if (idx > sc->num_squeues) { 2672 WPRINTF("%s SQ index %lu overflow from " 2673 "guest (max %u)", 2674 __func__, idx, sc->num_squeues); 2675 return; 2676 } 2677 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2678 } 2679 } else { 2680 if (idx > sc->num_cqueues) { 2681 WPRINTF("%s queue index %lu overflow from " 2682 "guest (max %u)", 2683 __func__, idx, sc->num_cqueues); 2684 return; 2685 } 2686 2687 atomic_store_short(&sc->compl_queues[idx].head, 2688 (uint16_t)value); 2689 } 2690 } 2691 2692 static void 2693 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2694 { 2695 const char *s = iswrite ? "WRITE" : "READ"; 2696 2697 switch (offset) { 2698 case NVME_CR_CAP_LOW: 2699 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2700 break; 2701 case NVME_CR_CAP_HI: 2702 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2703 break; 2704 case NVME_CR_VS: 2705 DPRINTF("%s %s NVME_CR_VS", func, s); 2706 break; 2707 case NVME_CR_INTMS: 2708 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2709 break; 2710 case NVME_CR_INTMC: 2711 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2712 break; 2713 case NVME_CR_CC: 2714 DPRINTF("%s %s NVME_CR_CC", func, s); 2715 break; 2716 case NVME_CR_CSTS: 2717 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2718 break; 2719 case NVME_CR_NSSR: 2720 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2721 break; 2722 case NVME_CR_AQA: 2723 DPRINTF("%s %s NVME_CR_AQA", func, s); 2724 break; 2725 case NVME_CR_ASQ_LOW: 2726 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2727 break; 2728 case NVME_CR_ASQ_HI: 2729 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2730 break; 2731 case NVME_CR_ACQ_LOW: 2732 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2733 break; 2734 case NVME_CR_ACQ_HI: 2735 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2736 break; 2737 default: 2738 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2739 } 2740 2741 } 2742 2743 static void 2744 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, 2745 uint64_t offset, int size, uint64_t value) 2746 { 2747 uint32_t ccreg; 2748 2749 if (offset >= NVME_DOORBELL_OFFSET) { 2750 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2751 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2752 int is_sq = (belloffset % 8) < 4; 2753 2754 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2755 WPRINTF("guest attempted an overflow write offset " 2756 "0x%lx, val 0x%lx in %s", 2757 offset, value, __func__); 2758 return; 2759 } 2760 2761 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); 2762 return; 2763 } 2764 2765 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 2766 offset, size, value); 2767 2768 if (size != 4) { 2769 WPRINTF("guest wrote invalid size %d (offset 0x%lx, " 2770 "val 0x%lx) to bar0 in %s", 2771 size, offset, value, __func__); 2772 /* TODO: shutdown device */ 2773 return; 2774 } 2775 2776 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 2777 2778 pthread_mutex_lock(&sc->mtx); 2779 2780 switch (offset) { 2781 case NVME_CR_CAP_LOW: 2782 case NVME_CR_CAP_HI: 2783 /* readonly */ 2784 break; 2785 case NVME_CR_VS: 2786 /* readonly */ 2787 break; 2788 case NVME_CR_INTMS: 2789 /* MSI-X, so ignore */ 2790 break; 2791 case NVME_CR_INTMC: 2792 /* MSI-X, so ignore */ 2793 break; 2794 case NVME_CR_CC: 2795 ccreg = (uint32_t)value; 2796 2797 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 2798 "iocqes %u", 2799 __func__, 2800 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 2801 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 2802 NVME_CC_GET_IOCQES(ccreg)); 2803 2804 if (NVME_CC_GET_SHN(ccreg)) { 2805 /* perform shutdown - flush out data to backend */ 2806 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 2807 NVME_CSTS_REG_SHST_SHIFT); 2808 sc->regs.csts |= NVME_SHST_COMPLETE << 2809 NVME_CSTS_REG_SHST_SHIFT; 2810 } 2811 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 2812 if (NVME_CC_GET_EN(ccreg) == 0) 2813 /* transition 1-> causes controller reset */ 2814 pci_nvme_reset_locked(sc); 2815 else 2816 pci_nvme_init_controller(ctx, sc); 2817 } 2818 2819 /* Insert the iocqes, iosqes and en bits from the write */ 2820 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 2821 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 2822 if (NVME_CC_GET_EN(ccreg) == 0) { 2823 /* Insert the ams, mps and css bit fields */ 2824 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 2825 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 2826 sc->regs.csts &= ~NVME_CSTS_RDY; 2827 } else if (sc->pending_ios == 0) { 2828 sc->regs.csts |= NVME_CSTS_RDY; 2829 } 2830 break; 2831 case NVME_CR_CSTS: 2832 break; 2833 case NVME_CR_NSSR: 2834 /* ignore writes; don't support subsystem reset */ 2835 break; 2836 case NVME_CR_AQA: 2837 sc->regs.aqa = (uint32_t)value; 2838 break; 2839 case NVME_CR_ASQ_LOW: 2840 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 2841 (0xFFFFF000 & value); 2842 break; 2843 case NVME_CR_ASQ_HI: 2844 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 2845 (value << 32); 2846 break; 2847 case NVME_CR_ACQ_LOW: 2848 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 2849 (0xFFFFF000 & value); 2850 break; 2851 case NVME_CR_ACQ_HI: 2852 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 2853 (value << 32); 2854 break; 2855 default: 2856 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 2857 __func__, offset, value, size); 2858 } 2859 pthread_mutex_unlock(&sc->mtx); 2860 } 2861 2862 static void 2863 pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, 2864 int baridx, uint64_t offset, int size, uint64_t value) 2865 { 2866 struct pci_nvme_softc* sc = pi->pi_arg; 2867 2868 if (baridx == pci_msix_table_bar(pi) || 2869 baridx == pci_msix_pba_bar(pi)) { 2870 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 2871 " value 0x%lx", baridx, offset, size, value); 2872 2873 pci_emul_msix_twrite(pi, offset, size, value); 2874 return; 2875 } 2876 2877 switch (baridx) { 2878 case 0: 2879 pci_nvme_write_bar_0(ctx, sc, offset, size, value); 2880 break; 2881 2882 default: 2883 DPRINTF("%s unknown baridx %d, val 0x%lx", 2884 __func__, baridx, value); 2885 } 2886 } 2887 2888 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 2889 uint64_t offset, int size) 2890 { 2891 uint64_t value; 2892 2893 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 2894 2895 if (offset < NVME_DOORBELL_OFFSET) { 2896 void *p = &(sc->regs); 2897 pthread_mutex_lock(&sc->mtx); 2898 memcpy(&value, (void *)((uintptr_t)p + offset), size); 2899 pthread_mutex_unlock(&sc->mtx); 2900 } else { 2901 value = 0; 2902 WPRINTF("pci_nvme: read invalid offset %ld", offset); 2903 } 2904 2905 switch (size) { 2906 case 1: 2907 value &= 0xFF; 2908 break; 2909 case 2: 2910 value &= 0xFFFF; 2911 break; 2912 case 4: 2913 value &= 0xFFFFFFFF; 2914 break; 2915 } 2916 2917 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 2918 offset, size, (uint32_t)value); 2919 2920 return (value); 2921 } 2922 2923 2924 2925 static uint64_t 2926 pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, 2927 uint64_t offset, int size) 2928 { 2929 struct pci_nvme_softc* sc = pi->pi_arg; 2930 2931 if (baridx == pci_msix_table_bar(pi) || 2932 baridx == pci_msix_pba_bar(pi)) { 2933 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 2934 baridx, offset, size); 2935 2936 return pci_emul_msix_tread(pi, offset, size); 2937 } 2938 2939 switch (baridx) { 2940 case 0: 2941 return pci_nvme_read_bar_0(sc, offset, size); 2942 2943 default: 2944 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 2945 } 2946 2947 return (0); 2948 } 2949 2950 static int 2951 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) 2952 { 2953 char bident[sizeof("XX:X:X")]; 2954 const char *value; 2955 uint32_t sectsz; 2956 2957 sc->max_queues = NVME_QUEUES; 2958 sc->max_qentries = NVME_MAX_QENTRIES; 2959 sc->ioslots = NVME_IOSLOTS; 2960 sc->num_squeues = sc->max_queues; 2961 sc->num_cqueues = sc->max_queues; 2962 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 2963 sectsz = 0; 2964 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 2965 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 2966 2967 value = get_config_value_node(nvl, "maxq"); 2968 if (value != NULL) 2969 sc->max_queues = atoi(value); 2970 value = get_config_value_node(nvl, "qsz"); 2971 if (value != NULL) { 2972 sc->max_qentries = atoi(value); 2973 if (sc->max_qentries <= 0) { 2974 EPRINTLN("nvme: Invalid qsz option %d", 2975 sc->max_qentries); 2976 return (-1); 2977 } 2978 } 2979 value = get_config_value_node(nvl, "ioslots"); 2980 if (value != NULL) { 2981 sc->ioslots = atoi(value); 2982 if (sc->ioslots <= 0) { 2983 EPRINTLN("Invalid ioslots option %d", sc->ioslots); 2984 return (-1); 2985 } 2986 } 2987 value = get_config_value_node(nvl, "sectsz"); 2988 if (value != NULL) 2989 sectsz = atoi(value); 2990 value = get_config_value_node(nvl, "ser"); 2991 if (value != NULL) { 2992 /* 2993 * This field indicates the Product Serial Number in 2994 * 7-bit ASCII, unused bytes should be space characters. 2995 * Ref: NVMe v1.3c. 2996 */ 2997 cpywithpad((char *)sc->ctrldata.sn, 2998 sizeof(sc->ctrldata.sn), value, ' '); 2999 } 3000 value = get_config_value_node(nvl, "eui64"); 3001 if (value != NULL) 3002 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); 3003 value = get_config_value_node(nvl, "dsm"); 3004 if (value != NULL) { 3005 if (strcmp(value, "auto") == 0) 3006 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3007 else if (strcmp(value, "enable") == 0) 3008 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 3009 else if (strcmp(value, "disable") == 0) 3010 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 3011 } 3012 3013 value = get_config_value_node(nvl, "ram"); 3014 if (value != NULL) { 3015 uint64_t sz = strtoull(value, NULL, 10); 3016 3017 sc->nvstore.type = NVME_STOR_RAM; 3018 sc->nvstore.size = sz * 1024 * 1024; 3019 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 3020 sc->nvstore.sectsz = 4096; 3021 sc->nvstore.sectsz_bits = 12; 3022 if (sc->nvstore.ctx == NULL) { 3023 EPRINTLN("nvme: Unable to allocate RAM"); 3024 return (-1); 3025 } 3026 } else { 3027 snprintf(bident, sizeof(bident), "%d:%d", 3028 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3029 sc->nvstore.ctx = blockif_open(nvl, bident); 3030 if (sc->nvstore.ctx == NULL) { 3031 EPRINTLN("nvme: Could not open backing file: %s", 3032 strerror(errno)); 3033 return (-1); 3034 } 3035 sc->nvstore.type = NVME_STOR_BLOCKIF; 3036 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 3037 } 3038 3039 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 3040 sc->nvstore.sectsz = sectsz; 3041 else if (sc->nvstore.type != NVME_STOR_RAM) 3042 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 3043 for (sc->nvstore.sectsz_bits = 9; 3044 (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 3045 sc->nvstore.sectsz_bits++); 3046 3047 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 3048 sc->max_queues = NVME_QUEUES; 3049 3050 return (0); 3051 } 3052 3053 static void 3054 pci_nvme_resized(struct blockif_ctxt *bctxt, void *arg, size_t new_size) 3055 { 3056 struct pci_nvme_softc *sc; 3057 struct pci_nvme_blockstore *nvstore; 3058 struct nvme_namespace_data *nd; 3059 3060 sc = arg; 3061 nvstore = &sc->nvstore; 3062 nd = &sc->nsdata; 3063 3064 nvstore->size = new_size; 3065 pci_nvme_init_nsdata_size(nvstore, nd); 3066 3067 /* Add changed NSID to list */ 3068 sc->ns_log.ns[0] = 1; 3069 sc->ns_log.ns[1] = 0; 3070 3071 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, 3072 PCI_NVME_AE_INFO_NS_ATTR_CHANGED); 3073 } 3074 3075 static int 3076 pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl) 3077 { 3078 struct pci_nvme_softc *sc; 3079 uint32_t pci_membar_sz; 3080 int error; 3081 3082 error = 0; 3083 3084 sc = calloc(1, sizeof(struct pci_nvme_softc)); 3085 pi->pi_arg = sc; 3086 sc->nsc_pi = pi; 3087 3088 error = pci_nvme_parse_config(sc, nvl); 3089 if (error < 0) 3090 goto done; 3091 else 3092 error = 0; 3093 3094 STAILQ_INIT(&sc->ioreqs_free); 3095 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 3096 for (int i = 0; i < sc->ioslots; i++) { 3097 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 3098 } 3099 3100 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 3101 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 3102 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 3103 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 3104 pci_set_cfgdata8(pi, PCIR_PROGIF, 3105 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 3106 3107 /* 3108 * Allocate size of NVMe registers + doorbell space for all queues. 3109 * 3110 * The specification requires a minimum memory I/O window size of 16K. 3111 * The Windows driver will refuse to start a device with a smaller 3112 * window. 3113 */ 3114 pci_membar_sz = sizeof(struct nvme_registers) + 3115 2 * sizeof(uint32_t) * (sc->max_queues + 1); 3116 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 3117 3118 DPRINTF("nvme membar size: %u", pci_membar_sz); 3119 3120 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 3121 if (error) { 3122 WPRINTF("%s pci alloc mem bar failed", __func__); 3123 goto done; 3124 } 3125 3126 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 3127 if (error) { 3128 WPRINTF("%s pci add msixcap failed", __func__); 3129 goto done; 3130 } 3131 3132 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 3133 if (error) { 3134 WPRINTF("%s pci add Express capability failed", __func__); 3135 goto done; 3136 } 3137 3138 pthread_mutex_init(&sc->mtx, NULL); 3139 sem_init(&sc->iosemlock, 0, sc->ioslots); 3140 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); 3141 3142 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 3143 /* 3144 * Controller data depends on Namespace data so initialize Namespace 3145 * data first. 3146 */ 3147 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 3148 pci_nvme_init_ctrldata(sc); 3149 pci_nvme_init_logpages(sc); 3150 pci_nvme_init_features(sc); 3151 3152 pci_nvme_aer_init(sc); 3153 pci_nvme_aen_init(sc); 3154 3155 pci_nvme_reset(sc); 3156 3157 pci_lintr_request(pi); 3158 3159 done: 3160 return (error); 3161 } 3162 3163 static int 3164 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) 3165 { 3166 char *cp, *ram; 3167 3168 if (opts == NULL) 3169 return (0); 3170 3171 if (strncmp(opts, "ram=", 4) == 0) { 3172 cp = strchr(opts, ','); 3173 if (cp == NULL) { 3174 set_config_value_node(nvl, "ram", opts + 4); 3175 return (0); 3176 } 3177 ram = strndup(opts + 4, cp - opts - 4); 3178 set_config_value_node(nvl, "ram", ram); 3179 free(ram); 3180 return (pci_parse_legacy_config(nvl, cp + 1)); 3181 } else 3182 return (blockif_legacy_config(nvl, opts)); 3183 } 3184 3185 struct pci_devemu pci_de_nvme = { 3186 .pe_emu = "nvme", 3187 .pe_init = pci_nvme_init, 3188 .pe_legacy_config = pci_nvme_legacy_config, 3189 .pe_barwrite = pci_nvme_write, 3190 .pe_barread = pci_nvme_read 3191 }; 3192 PCI_EMUL_SET(pci_de_nvme); 3193