/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = DataSet Management support. Option is one of auto, enable, disable
 *
 */

/* TODO:
    - create async event for smart and log
    - intr coalesce
 */
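/*
 * For example, a RAM-backed controller with eight IO queue pairs might be
 * configured as follows (an illustrative invocation only, not emitted by
 * this code):
 *
 *  -s 4,nvme,ram=256,maxq=8,qsz=1024,ioslots=16,sectsz=512,ser=NVME0001,dsm=auto
 *
 * Options other than the device path may be omitted, in which case the
 * defaults defined below apply.
 */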

#include <sys/cdefs.h>

#include <sys/errno.h>
#include <sys/types.h>
#ifdef __FreeBSD__
#include <sys/crc16.h>
#else
#include "crc16.h"
#endif
#include <net/ieee_oui.h>
#ifndef __FreeBSD__
#include <endian.h>
#endif

#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "config.h"
#include "debug.h"
#include "pci_emul.h"


static int nvme_debug = 0;
#define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
#define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)

/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define	NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048
/* Memory Page size Minimum reported in CAP register */
#define	NVME_MPSMIN		0
/* MPSMIN converted to bytes */
#define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))

#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
#define	NVME_MDTS		9
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
#define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)

/* This is a synthetic status code to indicate there is no status */
#define	NVME_NO_STATUS		0xffff
#define	NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)

/* Reported temperature in Kelvin (i.e. room temperature) */
#define	NVME_TEMPERATURE	296

/* helpers */

/* Convert a zero-based value into a one-based value */
#define	ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define	ZERO_BASED(one)		((one) - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define	NVME_FEATURE_NUM_QUEUES(sc) \
	(ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	(ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16

#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

enum nvme_copy_dir {
	NVME_COPY_TO_PRP,
	NVME_COPY_FROM_PRP,
};

#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02

struct nvme_completion_queue {
	struct nvme_completion *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	tail;	/* nvme progress */
	uint16_t	head;	/* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	head;	/* nvme progress */
	uint16_t	tail;	/* guest progress */
	uint16_t	cqid;	/* completion queue id */
	int		qpriority;
};

enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;
	uint64_t	size;
	uint32_t	sectsz;
	uint32_t	sectsz_bits;
	uint64_t	eui64;
	uint32_t	deallocate:1;
};

/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer (MDTS) and given the number of
 * default iovec's in a struct blockif_req.
 */
#define	MDTS_PAD_SIZE \
	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
	  0 )
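/*
 * A worked example of the sizing above (values per the defines in this
 * file; BLOCKIF_IOV_MAX comes from block_if.h): with NVME_MDTS of 9 and
 * NVME_MPSMIN of 0, the largest transfer is 2^9 pages * 4 KiB/page = 2 MiB
 * (NVME_MAX_DATA_SIZE) and needs up to 513 descriptors (NVME_MAX_IOVEC).
 * Whatever a struct blockif_req cannot hold in its own iovec array spills
 * into the iovpadding array sized by MDTS_PAD_SIZE in struct pci_nvme_ioreq
 * below.
 */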

struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	STAILQ_ENTRY(pci_nvme_ioreq) link;
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;
	size_t		bytes;

	struct blockif_req io_req;

	struct iovec	iovpadding[MDTS_PAD_SIZE];
};

enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};

struct pci_nvme_softc;
struct nvme_feature_obj;

typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

struct nvme_feature_obj {
	uint32_t	cdw11;
	nvme_feature_cb	set;
	nvme_feature_cb	get;
	bool		namespace_specific;
};

#define	NVME_FID_MAX	(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)

typedef enum {
	PCI_NVME_AE_TYPE_ERROR = 0,
	PCI_NVME_AE_TYPE_SMART,
	PCI_NVME_AE_TYPE_NOTICE,
	PCI_NVME_AE_TYPE_IO_CMD = 6,
	PCI_NVME_AE_TYPE_VENDOR = 7,
	PCI_NVME_AE_TYPE_MAX		/* Must be last */
} pci_nvme_async_type;

/* Asynchronous Event Requests */
struct pci_nvme_aer {
	STAILQ_ENTRY(pci_nvme_aer) link;
	uint16_t	cid;	/* Command ID of the submitted AER */
};

/** Asynchronous Event Information - Notice */
typedef enum {
	PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0,
	PCI_NVME_AEI_NOTICE_FW_ACTIVATION,
	PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE,
	PCI_NVME_AEI_NOTICE_ANA_CHANGE,
	PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE,
	PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT,
	PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE,
	PCI_NVME_AEI_NOTICE_MAX,
} pci_nvme_async_event_info_notice;

#define	PCI_NVME_AEI_NOTICE_SHIFT	8
#define	PCI_NVME_AEI_NOTICE_MASK(event)	(1 << (event + PCI_NVME_AEI_NOTICE_SHIFT))

/* Asynchronous Event Notifications */
struct pci_nvme_aen {
	pci_nvme_async_type atype;
	uint32_t	event_data;
	bool		posted;
};

/*
 * By default, enable all Asynchronous Event Notifications:
 *	SMART / Health Critical Warnings
 *	Namespace Attribute Notices
 */
#define	PCI_NVME_AEN_DEFAULT_MASK	0x11f

typedef enum {
	NVME_CNTRLTYPE_IO = 1,
	NVME_CNTRLTYPE_DISCOVERY = 2,
	NVME_CNTRLTYPE_ADMIN = 3,
} pci_nvme_cntrl_type;

struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;
	struct nvme_ns_list ns_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;
	bool		num_q_is_set;	/* Has host set Number of Queues */

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	uint32_t	ioslots;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	struct nvme_feature_obj feat[NVME_FID_MAX];

	enum nvme_dsm_type dataset_management;

	/* Accounting for SMART data */
	__uint128_t	read_data_units;
	__uint128_t	write_data_units;
	__uint128_t	read_commands;
	__uint128_t	write_commands;
	uint32_t	read_dunits_remainder;
	uint32_t	write_dunits_remainder;

	STAILQ_HEAD(, pci_nvme_aer) aer_list;
	pthread_mutex_t	aer_mtx;
	uint32_t	aer_count;
	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
	pthread_t	aen_tid;
	pthread_mutex_t	aen_mtx;
	pthread_cond_t	aen_cond;
};


static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status);
static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);

/* Controller Configuration utils */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)
#define	NVME_CSTS_CFS	(1 << NVME_CSTS_REG_CFS_SHIFT)

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define	NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
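/*
 * Example of the CC accessors above: a guest enabling the controller
 * typically writes CC = 0x00460001, which decodes (per the NVMe CC layout:
 * EN in bit 0, IOSQES in bits 19:16, IOCQES in bits 23:20) as
 * NVME_CC_GET_EN() == 1, NVME_CC_GET_IOSQES() == 6 (64-byte commands), and
 * NVME_CC_GET_IOCQES() == 4 (16-byte completions).
 */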

static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_temperature(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_async_event(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

static void *aen_thr(void *arg);

static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}

static __inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}

/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
static void
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
	uint32_t i;

	/*
	 * Allocate and initialize the Submission Queues
	 */
	if (nsq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of SQ from %u to %u",
		    __func__, nsq, NVME_QUEUES);
		nsq = NVME_QUEUES;
	}

	sc->num_squeues = nsq;

	sc->submit_queues = calloc(sc->num_squeues + 1,
	    sizeof(struct nvme_submission_queue));
	if (sc->submit_queues == NULL) {
		WPRINTF("%s: SQ allocation failed", __func__);
		sc->num_squeues = 0;
	} else {
		struct nvme_submission_queue *sq = sc->submit_queues;

		for (i = 0; i < sc->num_squeues + 1; i++)
			pthread_mutex_init(&sq[i].mtx, NULL);
	}

	/*
	 * Allocate and initialize the Completion Queues
	 */
	if (ncq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of CQ from %u to %u",
		    __func__, ncq, NVME_QUEUES);
		ncq = NVME_QUEUES;
	}

	sc->num_cqueues = ncq;

	sc->compl_queues = calloc(sc->num_cqueues + 1,
	    sizeof(struct nvme_completion_queue));
	if (sc->compl_queues == NULL) {
		WPRINTF("%s: CQ allocation failed", __func__);
		sc->num_cqueues = 0;
	} else {
		struct nvme_completion_queue *cq = sc->compl_queues;

		for (i = 0; i < sc->num_cqueues + 1; i++)
			pthread_mutex_init(&cq[i].mtx, NULL);
	}
}

static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;
	int ret;

	cd->vid = 0xFB5D;
	cd->ssvid = 0x0000;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->rab = 4;

	/* FreeBSD OUI */
	cd->ieee[0] = 0xfc;
	cd->ieee[1] = 0x9c;
	cd->ieee[2] = 0x58;

	cd->mic = 0;

	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = NVME_REV(1,4);

	cd->cntrltype = NVME_CNTRLTYPE_IO;
	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
	cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR);
	cd->acl = 2;
	cd->aerl = 4;

	/* Advertise 1, Read-only firmware slot */
	cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) |
	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	/*
	 * Report a single power state (zero-based value)
	 * power_state[] values are left as zero to indicate "Not reported"
	 */
	cd->npss = 0;

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;
	cd->cctemp = 0x0157;
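	/*
	 * 0x0157 is 343 Kelvin, i.e. 70 degrees Celsius. The fixed composite
	 * temperature reported by this emulation (NVME_TEMPERATURE, 296 K or
	 * about 23 C) stays well below both thresholds, so neither warning
	 * triggers unless the host lowers the threshold via Set Features.
	 */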

	/* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */
	cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO <<
	    NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT);

	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	cd->oncs = 0;
	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
		break;
	case NVME_DATASET_MANAGEMENT_ENABLE:
		cd->oncs |= NVME_ONCS_DSM;
		break;
	default:
		break;
	}

	cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK <<
	    NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT;

	cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT;

#ifdef __FreeBSD__
	ret = snprintf(cd->subnqn, sizeof(cd->subnqn),
	    "nqn.2013-12.org.freebsd:bhyve-%s-%u-%u-%u",
	    get_config_value("name"), sc->nsc_pi->pi_bus,
	    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
#else
	ret = snprintf((char *)cd->subnqn, sizeof (cd->subnqn),
	    "nqn.2013-12.org.illumos:bhyve-%s-%u-%u-%u",
	    get_config_value("name"), sc->nsc_pi->pi_bus,
	    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
#endif
	if ((ret < 0) || ((unsigned)ret > sizeof(cd->subnqn)))
		EPRINTLN("%s: error setting subnqn (%d)", __func__, ret);
}

static void
pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
    struct nvme_namespace_data *nd)
{

	/* Get capacity and block size information from backing store */
	nd->nsze = nvstore->size / nvstore->sectsz;
	nd->ncap = nd->nsze;
	nd->nuse = nd->nsze;
}

static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{

	pci_nvme_init_nsdata_size(nvstore, nd);

	if (nvstore->type == NVME_STOR_BLOCKIF)
		nvstore->deallocate = blockif_candelete(nvstore->ctx);

	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
	nd->flbas = 0;

	/* Create an EUI-64 if user did not provide one */
	if (nvstore->eui64 == 0) {
		char *data = NULL;
		uint64_t eui64 = nvstore->eui64;

		asprintf(&data, "%s%u%u%u", get_config_value("name"),
		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
		    sc->nsc_pi->pi_func);

		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}
		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, nvstore->eui64);
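	/*
	 * The generated identifier above packs the FreeBSD OUI plus a CRC-16
	 * of the VM name and PCI address into the upper 48 bits and the NSID
	 * into the low 16 bits, giving each namespace of each emulated
	 * controller a distinct EUI-64 that is stable across guest reboots.
	 */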

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}

static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{
	__uint128_t power_cycles = 1;

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
	memset(&sc->ns_log, 0, sizeof(sc->ns_log));

	/* Set read/write remainder to round up according to spec */
	sc->read_dunits_remainder = 999;
	sc->write_dunits_remainder = 999;

	/* Set nominal Health values checked by implementations */
	sc->health_log.temperature = NVME_TEMPERATURE;
	sc->health_log.available_spare = 100;
	sc->health_log.available_spare_threshold = 10;

	/* Set Active Firmware Info to slot 1 */
	sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT);
	memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr,
	    sizeof(sc->fw_log.revision[0]));

	memcpy(&sc->health_log.power_cycles, &power_cycles,
	    sizeof(sc->health_log.power_cycles));
}

static void
pci_nvme_init_features(struct pci_nvme_softc *sc)
{
	enum nvme_feature fid;

	for (fid = 0; fid < NVME_FID_MAX; fid++) {
		switch (fid) {
		case NVME_FEAT_ARBITRATION:
		case NVME_FEAT_POWER_MANAGEMENT:
		case NVME_FEAT_INTERRUPT_COALESCING: //XXX
		case NVME_FEAT_WRITE_ATOMICITY:
			/* Mandatory but no special handling required */
		//XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		//XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		//		  this returns a data buffer
			break;
		case NVME_FEAT_TEMPERATURE_THRESHOLD:
			sc->feat[fid].set = nvme_feature_temperature;
			break;
		case NVME_FEAT_ERROR_RECOVERY:
			sc->feat[fid].namespace_specific = true;
			break;
		case NVME_FEAT_NUMBER_OF_QUEUES:
			sc->feat[fid].set = nvme_feature_num_queues;
			break;
		case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
			sc->feat[fid].set = nvme_feature_iv_config;
			break;
		case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
			sc->feat[fid].set = nvme_feature_async_event;
			/* Enable all AENs by default */
			sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK;
			break;
		default:
			sc->feat[fid].set = nvme_feature_invalid_cb;
			sc->feat[fid].get = nvme_feature_invalid_cb;
		}
	}
}

static void
pci_nvme_aer_reset(struct pci_nvme_softc *sc)
{

	STAILQ_INIT(&sc->aer_list);
	sc->aer_count = 0;
}

static void
pci_nvme_aer_init(struct pci_nvme_softc *sc)
{

	pthread_mutex_init(&sc->aer_mtx, NULL);
	pci_nvme_aer_reset(sc);
}

static void
pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	pthread_mutex_lock(&sc->aer_mtx);
	while (!STAILQ_EMPTY(&sc->aer_list)) {
		aer = STAILQ_FIRST(&sc->aer_list);
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		free(aer);
	}
	pthread_mutex_unlock(&sc->aer_mtx);

	pci_nvme_aer_reset(sc);
}

static bool
pci_nvme_aer_available(struct pci_nvme_softc *sc)
{

	return (sc->aer_count != 0);
}

static bool
pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	/* AERL is a zero-based value while aer_count is one-based */
	return (sc->aer_count == (cd->aerl + 1U));
}

/*
 * Add an Async Event Request
 *
 * Stores an AER to be returned later if the Controller needs to notify the
 * host of an event.
 * Note that while the NVMe spec doesn't require Controllers to return AER's
 * in order, this implementation does preserve the order.
 */
static int
pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
{
	struct pci_nvme_aer *aer = NULL;

	aer = calloc(1, sizeof(struct pci_nvme_aer));
	if (aer == NULL)
		return (-1);

	/* Save the Command ID for use in the completion message */
	aer->cid = cid;

	pthread_mutex_lock(&sc->aer_mtx);
	sc->aer_count++;
	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
	pthread_mutex_unlock(&sc->aer_mtx);

	return (0);
}

/*
 * Get an Async Event Request structure
 *
 * Returns a pointer to an AER previously submitted by the host or NULL if
 * no AER's exist. Caller is responsible for freeing the returned struct.
 */
static struct pci_nvme_aer *
pci_nvme_aer_get(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	pthread_mutex_lock(&sc->aer_mtx);
	aer = STAILQ_FIRST(&sc->aer_list);
	if (aer != NULL) {
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		sc->aer_count--;
	}
	pthread_mutex_unlock(&sc->aer_mtx);

	return (aer);
}

static void
pci_nvme_aen_reset(struct pci_nvme_softc *sc)
{
	uint32_t	atype;

	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));

	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
		sc->aen[atype].atype = atype;
	}
}

static void
pci_nvme_aen_init(struct pci_nvme_softc *sc)
{
	char nstr[80];

	pci_nvme_aen_reset(sc);

	pthread_mutex_init(&sc->aen_mtx, NULL);
	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
	    sc->nsc_pi->pi_func);
	pthread_set_name_np(sc->aen_tid, nstr);
}

static void
pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
{

	pci_nvme_aen_reset(sc);
}

/* Notify the AEN thread of pending work */
static void
pci_nvme_aen_notify(struct pci_nvme_softc *sc)
{

	pthread_cond_signal(&sc->aen_cond);
}

/*
 * Post an Asynchronous Event Notification
 */
static int32_t
pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
    uint32_t event_data)
{
	struct pci_nvme_aen *aen;

	if (atype >= PCI_NVME_AE_TYPE_MAX) {
		return(EINVAL);
	}

	pthread_mutex_lock(&sc->aen_mtx);
	aen = &sc->aen[atype];

	/* Has the controller already posted an event of this type? */
	if (aen->posted) {
		pthread_mutex_unlock(&sc->aen_mtx);
		return(EALREADY);
	}

	aen->event_data = event_data;
	aen->posted = true;
	pthread_mutex_unlock(&sc->aen_mtx);

	pci_nvme_aen_notify(sc);

	return(0);
}
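/*
 * To sketch how the pieces above fit together: an event source (e.g. the
 * temperature threshold feature below) calls pci_nvme_aen_post(), which
 * records the event and wakes aen_thr() via pci_nvme_aen_notify(). The AEN
 * thread then runs pci_nvme_aen_process(), which pairs each posted event
 * with an outstanding AER from pci_nvme_aer_get() and completes that AER's
 * command on the Admin Completion Queue. At most one event per type is
 * pending at a time (EALREADY above), matching one completion per AER.
 */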

static void
pci_nvme_aen_process(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer;
	struct pci_nvme_aen *aen;
	pci_nvme_async_type atype;
	uint32_t mask;
	uint16_t status;
	uint8_t lid;

#ifndef __FreeBSD__
	lid = 0;
#endif

	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
		aen = &sc->aen[atype];
		/* Previous iterations may have depleted the available AER's */
		if (!pci_nvme_aer_available(sc)) {
			DPRINTF("%s: no AER", __func__);
			break;
		}

		if (!aen->posted) {
			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
			continue;
		}

		status = NVME_SC_SUCCESS;

		/* Is the event masked? */
		mask =
		    sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;

		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
		switch (atype) {
		case PCI_NVME_AE_TYPE_ERROR:
			lid = NVME_LOG_ERROR;
			break;
		case PCI_NVME_AE_TYPE_SMART:
			mask &= 0xff;
			if ((mask & aen->event_data) == 0)
				continue;
			lid = NVME_LOG_HEALTH_INFORMATION;
			break;
		case PCI_NVME_AE_TYPE_NOTICE:
			if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) {
				EPRINTLN("%s unknown AEN notice type %u",
				    __func__, aen->event_data);
				status = NVME_SC_INTERNAL_DEVICE_ERROR;
				lid = 0;
				break;
			}
			if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0)
				continue;
			switch (aen->event_data) {
			case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED:
				lid = NVME_LOG_CHANGED_NAMESPACE;
				break;
			case PCI_NVME_AEI_NOTICE_FW_ACTIVATION:
				lid = NVME_LOG_FIRMWARE_SLOT;
				break;
			case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE:
				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
				break;
			case PCI_NVME_AEI_NOTICE_ANA_CHANGE:
				lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
				break;
			case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE:
				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
				break;
			case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT:
				lid = NVME_LOG_LBA_STATUS_INFORMATION;
				break;
			case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE:
				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
				break;
			default:
				lid = 0;
			}
			break;
		default:
			/* bad type?!? */
			EPRINTLN("%s unknown AEN type %u", __func__, atype);
			status = NVME_SC_INTERNAL_DEVICE_ERROR;
			lid = 0;
			break;
		}

		aer = pci_nvme_aer_get(sc);
		assert(aer != NULL);

		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
		pci_nvme_cq_update(sc, &sc->compl_queues[0],
		    (lid << 16) | (aen->event_data << 8) | atype,	/* cdw0 */
		    aer->cid,
		    0,		/* SQID */
		    status);

		aen->event_data = 0;
		aen->posted = false;

		pci_generate_msix(sc->nsc_pi, 0);
	}
}

static void *
aen_thr(void *arg)
{
	struct pci_nvme_softc *sc;

	sc = arg;

	pthread_mutex_lock(&sc->aen_mtx);
	for (;;) {
		pci_nvme_aen_process(sc);
		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
	}
#ifdef __FreeBSD__	/* Smatch spots unreachable code */
	pthread_mutex_unlock(&sc->aen_mtx);

	pthread_exit(NULL);
#endif
	return (NULL);
}

static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	uint32_t i;

	DPRINTF("%s", __func__);

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

	sc->regs.vs = NVME_REV(1,4);	/* NVMe v1.4 */

	sc->regs.cc = 0;

	assert(sc->submit_queues != NULL);

	for (i = 0; i < sc->num_squeues + 1; i++) {
		sc->submit_queues[i].qbase = NULL;
		sc->submit_queues[i].size = 0;
		sc->submit_queues[i].cqid = 0;
		sc->submit_queues[i].tail = 0;
		sc->submit_queues[i].head = 0;
	}

	assert(sc->compl_queues != NULL);

	for (i = 0; i < sc->num_cqueues + 1; i++) {
		sc->compl_queues[i].qbase = NULL;
		sc->compl_queues[i].size = 0;
		sc->compl_queues[i].tail = 0;
		sc->compl_queues[i].head = 0;
	}

	sc->num_q_is_set = false;

	pci_nvme_aer_destroy(sc);
	pci_nvme_aen_destroy(sc);

	/*
	 * Clear CSTS.RDY last to prevent the host from enabling Controller
	 * before cleanup completes
	 */
	sc->regs.csts = 0;
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}

static int
pci_nvme_init_controller(struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF("%s", __func__);

	/*
	 * NVMe 2.0 states that "enabling a controller while this field is
	 * cleared to 0h produces undefined results" for both ACQS and
	 * ASQS. If zero, set CFS and do not become ready.
	 */
	asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK);
	if (asqs < 2) {
		EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__,
		    asqs - 1, sc->regs.aqa);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    sc->regs.asq, sizeof(struct nvme_command) * asqs);
	if (sc->submit_queues[0].qbase == NULL) {
		EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__,
		    sc->regs.asq);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}

	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase);

	acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK);
	if (acqs < 2) {
		EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__,
		    acqs - 1, sc->regs.aqa);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    sc->regs.acq, sizeof(struct nvme_completion) * acqs);
	if (sc->compl_queues[0].qbase == NULL) {
		EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__,
		    sc->regs.acq);
		sc->regs.csts |= NVME_CSTS_CFS;
		return (-1);
	}
	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;

	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase);

	return (0);
}

static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
    size_t len, enum nvme_copy_dir dir)
{
	uint8_t *p;
	size_t bytes;

	if (len > (8 * 1024)) {
		return (-1);
	}

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	p = vm_map_gpa(ctx, prp1, bytes);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, bytes);
	else
		memcpy(b, p, bytes);

	b += bytes;

	len -= bytes;
	if (len == 0) {
		return (0);
	}

	len = MIN(len, PAGE_SIZE);

	p = vm_map_gpa(ctx, prp2, len);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, len);
	else
		memcpy(b, p, len);

	return (0);
}
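/*
 * Note on the helper above: it implements only the two-entry PRP case,
 * where PRP1 may point into the middle of a page and PRP2, if needed, is a
 * direct pointer to a second page rather than a PRP list. That is why
 * transfers are capped at 8 KiB; the callers below use it for small admin
 * data structures (log pages, identify data) rather than guest I/O.
 */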

/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
static void
pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status)
{
	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
	struct nvme_completion *cqe;

	assert(cq->qbase != NULL);

	pthread_mutex_lock(&cq->mtx);

	cqe = &cq->qbase[cq->tail];

	/* Flip the phase bit */
	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;

	cqe->cdw0 = cdw0;
	cqe->sqhd = sq->head;
	cqe->sqid = sqid;
	cqe->cid = cid;
	cqe->status = status;

	cq->tail++;
	if (cq->tail >= cq->size) {
		cq->tail = 0;
	}

	pthread_mutex_unlock(&cq->mtx);
}
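/*
 * The phase-bit XOR above is what lets the guest detect new completions:
 * the queue memory starts zeroed, so on the first pass through the CQ each
 * new entry is written with P=1; once the tail wraps, entries are written
 * with P=0 again. An entry whose phase bit differs from what the slot held
 * before is, by construction, newly written since the last wrap.
 */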

static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_squeues ||
	    (sc->submit_queues[qid].qbase == NULL)) {
		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
		    __func__, qid, sc->num_squeues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->submit_queues[qid].qbase = NULL;
	sc->submit_queues[qid].cqid = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues) ||
		    (sc->submit_queues[qid].qbase != NULL)) {
			WPRINTF("%s queue index %u > num_squeues %u",
			    __func__, qid, sc->num_squeues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
			/*
			 * Queues must specify at least two entries
			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
			 */
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
			return (1);
		}
		nsq->head = nsq->tail = 0;

		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_COMPLETION_QUEUE_INVALID);
			return (1);
		}

		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);

		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF("%s completed creating IOSQ qid %u",
		    __func__, qid);
	} else {
		/*
		 * Guest sent non-cont submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
	return (1);
}

static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;
	uint16_t sqid;

	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_cqueues ||
	    (sc->compl_queues[qid].qbase == NULL)) {
		WPRINTF("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	/* Deleting an Active CQ is an error */
	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
		if (sc->submit_queues[sqid].cqid == qid) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_DELETION);
			return (1);
		}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	struct nvme_completion_queue *ncq;
	uint16_t qid = command->cdw10 & 0xffff;

	/* Only support Physically Contiguous queues */
	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o completion queue",
		    __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if ((qid == 0) || (qid > sc->num_cqueues) ||
	    (sc->compl_queues[qid].qbase != NULL)) {
		WPRINTF("%s queue index %u > num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	ncq = &sc->compl_queues[qid];
	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
	if (ncq->intr_vec > (sc->max_queues + 1)) {
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_INTERRUPT_VECTOR);
		return (1);
	}

	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
	if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
		/*
		 * Queues must specify at least two entries
		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
		 */
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
		return (1);
	}
	ncq->head = ncq->tail = 0;
	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    command->prp1,
	    sizeof(struct nvme_command) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);


	return (1);
}
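/*
 * Taken together, the four handlers above enforce the NVMe queue-pair life
 * cycle: a guest must create an IO CQ before any IO SQ that refers to it
 * (nvme_opc_create_io_sq fails with Completion Queue Invalid otherwise),
 * and must delete all SQs bound to a CQ before deleting that CQ
 * (nvme_opc_delete_io_cq fails with Invalid Queue Deletion otherwise).
 */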

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint64_t logoff;
	uint32_t logsize;
	uint8_t logpage;

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logpage = command->cdw10 & 0xFF;
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);
	logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);

	switch (logpage) {
	case NVME_LOG_ERROR:
		if (logoff >= sizeof(sc->err_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->err_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		if (logoff >= sizeof(sc->health_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		pthread_mutex_lock(&sc->mtx);
		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
		    sizeof(sc->health_log.data_units_read));
		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
		    sizeof(sc->health_log.data_units_written));
		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
		    sizeof(sc->health_log.host_read_commands));
		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
		    sizeof(sc->health_log.host_write_commands));
		pthread_mutex_unlock(&sc->mtx);

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->health_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		if (logoff >= sizeof(sc->fw_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->fw_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_CHANGED_NAMESPACE:
		if (logoff >= sizeof(sc->ns_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ns_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->ns_log)),
		    NVME_COPY_TO_PRP);
		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
		break;
	default:
		DPRINTF("%s get log page %x command not supported",
		    __func__, logpage);

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}

	return (1);
}
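/*
 * A quick example of the size math above: a host fetching the 512-byte
 * SMART/Health page sets NUMDL (cdw10 bits 31:16) to 127, i.e. 128 dwords
 * zero-based, and NUMDU (cdw11 bits 15:0) to 0, so with cdw10 = 0x007F0002
 * the handler computes logsize = (127 + 1) * 4 = 512 bytes for logpage 2.
 */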

static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	void *dest;
	uint16_t status;

	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid);

	status = 0;
	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		/* Global NS only valid with NS Management */
		if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All unused entries shall be zero */
		memset(dest, 0, sizeof(uint32_t) * 1024);
		((uint32_t *)dest)[0] = 1;
		break;
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
		if (command->nsid != 1) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All bytes after the descriptor shall be zero */
		memset(dest, 0, sizeof(uint32_t) * 1024);

		/* Return NIDT=1 (i.e. EUI64) descriptor */
		((uint8_t *)dest)[0] = 1;
		((uint8_t *)dest)[1] = sizeof(uint64_t);
		memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t));
		break;
	case 0x13:
		/*
		 * Controller list is optional but used by UNH tests. Return
		 * a valid but empty list.
		 */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint16_t) * 2048);
		memset(dest, 0, sizeof(uint16_t) * 2048);
		break;
	default:
		DPRINTF("%s unsupported identify command requested 0x%x",
		    __func__, command->cdw10 & 0xFF);
		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
		break;
	}

	compl->status = status;
	return (1);
}

static const char *
nvme_fid_to_name(uint8_t fid)
{
	const char *name;

	switch (fid) {
	case NVME_FEAT_ARBITRATION:
		name = "Arbitration";
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		name = "Power Management";
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		name = "LBA Range Type";
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		name = "Temperature Threshold";
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		name = "Error Recovery";
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		name = "Volatile Write Cache";
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		name = "Number of Queues";
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		name = "Interrupt Coalescing";
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		name = "Interrupt Vector Configuration";
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		name = "Write Atomicity Normal";
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		name = "Asynchronous Event Configuration";
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		name = "Autonomous Power State Transition";
		break;
	case NVME_FEAT_HOST_MEMORY_BUFFER:
		name = "Host Memory Buffer";
		break;
	case NVME_FEAT_TIMESTAMP:
		name = "Timestamp";
		break;
	case NVME_FEAT_KEEP_ALIVE_TIMER:
		name = "Keep Alive Timer";
		break;
	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
		name = "Host Controlled Thermal Management";
		break;
	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
		name = "Non-Operation Power State Config";
		break;
	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
		name = "Read Recovery Level Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		name = "Predictable Latency Mode Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
		name = "Predictable Latency Mode Window";
		break;
	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
		name = "LBA Status Information Report Interval";
		break;
	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		name = "Host Behavior Support";
		break;
	case NVME_FEAT_SANITIZE_CONFIG:
		name = "Sanitize Config";
		break;
	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
		name = "Endurance Group Event Configuration";
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		name = "Software Progress Marker";
		break;
	case NVME_FEAT_HOST_IDENTIFIER:
		name = "Host Identifier";
		break;
	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
		name = "Reservation Notification Mask";
		break;
	case NVME_FEAT_RESERVATION_PERSISTENCE:
		name = "Reservation Persistence";
		break;
	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
		name = "Namespace Write Protection Config";
		break;
	default:
		name = "Unknown";
		break;
	}

	return (name);
}

static void
nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command __unused,
    struct nvme_completion *compl)
{
	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}

static void
nvme_feature_iv_config(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint32_t i;
	uint32_t cdw11 = command->cdw11;
	uint16_t iv;
	bool cd;

	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	iv = cdw11 & 0xffff;
	cd = cdw11 & (1 << 16);

	if (iv > (sc->max_queues + 1)) {
		return;
	}

	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
	if ((iv == 0) && !cd)
		return;

	/* Requested Interrupt Vector must be used by a CQ */
	for (i = 0; i < sc->num_cqueues + 1; i++) {
		if (sc->compl_queues[i].intr_vec == iv) {
			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
		}
	}
}

#define	NVME_ASYNC_EVENT_ENDURANCE_GROUP	(0x4000)
static void
nvme_feature_async_event(struct pci_nvme_softc *sc __unused,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP)
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}

#define	NVME_TEMP_THRESH_OVER	0
#define	NVME_TEMP_THRESH_UNDER	1
static void
nvme_feature_temperature(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t	tmpth;	/* Temperature Threshold */
	uint8_t		tmpsel;	/* Threshold Temperature Select */
	uint8_t		thsel;	/* Threshold Type Select */
	bool		set_crit = false;
	bool		report_crit;

	tmpth  = command->cdw11 & 0xffff;
	tmpsel = (command->cdw11 >> 16) & 0xf;
	thsel  = (command->cdw11 >> 20) & 0x3;

	DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel);

	/* Check for unsupported values */
	if (((tmpsel != 0) && (tmpsel != 0xf)) ||
	    (thsel > NVME_TEMP_THRESH_UNDER)) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	if (((thsel == NVME_TEMP_THRESH_OVER)  && (NVME_TEMPERATURE >= tmpth)) ||
	    ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))
		set_crit = true;

	pthread_mutex_lock(&sc->mtx);
	if (set_crit)
		sc->health_log.critical_warning |=
		    NVME_CRIT_WARN_ST_TEMPERATURE;
	else
		sc->health_log.critical_warning &=
		    ~NVME_CRIT_WARN_ST_TEMPERATURE;
	pthread_mutex_unlock(&sc->mtx);

	report_crit = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 &
	    NVME_CRIT_WARN_ST_TEMPERATURE;

	if (set_crit && report_crit)
		pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
		    sc->health_log.critical_warning);

	DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status);
}

static void
nvme_feature_num_queues(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat __unused,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t nqr;	/* Number of Queues Requested */

	if (sc->num_q_is_set) {
		WPRINTF("%s: Number of Queues already set", __func__);
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		return;
	}

	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
		    sc->max_queues);
		sc->num_squeues = sc->max_queues;
	}

	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
		    sc->max_queues);
		sc->num_cqueues = sc->max_queues;
	}

	/* Patch the command value which will be saved on callback's return */
	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

	sc->num_q_is_set = true;
}
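/*
 * Worked example of the handler above: a host asking for 8 SQs and 4 CQs
 * sends cdw11 = 0x00030007 (NSQR=7, NCQR=3, both zero-based). Assuming
 * max_queues allows it, num_squeues becomes 8 and num_cqueues becomes 4,
 * and NVME_FEATURE_NUM_QUEUES() re-encodes the granted zero-based counts
 * back into cdw0 as 0x00030007. A host requesting more than max_queues is
 * silently granted the clamped count rather than an error.
 */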

static int
nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	struct nvme_feature_obj *feat;
	uint32_t nsid = command->nsid;
	uint8_t fid = NVMEV(NVME_FEAT_SET_FID, command->cdw10);
	bool sv = NVMEV(NVME_FEAT_SET_SV, command->cdw10);

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (sv) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_SAVEABLE);
		return (1);
	}

	feat = &sc->feat[fid];

	if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (!feat->namespace_specific &&
	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
		return (1);
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	if (feat->set)
		feat->set(sc, feat, command, compl);
	else {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_CHANGEABLE);
		return (1);
	}

	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
	if (compl->status == NVME_SC_SUCCESS) {
		feat->cdw11 = command->cdw11;
		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
		    (command->cdw11 != 0))
			pci_nvme_aen_notify(sc);
	}

	return (0);
}

#define NVME_FEATURES_SEL_SUPPORTED	0x3
#define NVME_FEATURES_NS_SPECIFIC	(1 << 1)

static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	struct nvme_feature_obj *feat;
	uint8_t fid = command->cdw10 & 0xFF;
	uint8_t sel = (command->cdw10 >> 8) & 0x7;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	feat = &sc->feat[fid];
	if (feat->get) {
		feat->get(sc, feat, command, compl);
	}

	if (compl->status == NVME_SC_SUCCESS) {
		if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific)
			compl->cdw0 = NVME_FEATURES_NS_SPECIFIC;
		else
			compl->cdw0 = feat->cdw11;
	}

	return (0);
}
sc->ctrldata.aerl, command->cid); 2009 2010 /* Don't exceed the Async Event Request Limit (AERL). */ 2011 if (pci_nvme_aer_limit_reached(sc)) { 2012 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, 2013 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); 2014 return (1); 2015 } 2016 2017 if (pci_nvme_aer_add(sc, command->cid)) { 2018 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC, 2019 NVME_SC_INTERNAL_DEVICE_ERROR); 2020 return (1); 2021 } 2022 2023 /* 2024 * Raise events when they happen based on the Set Features cmd. 2025 * These events happen async, so only set completion successful if 2026 * there is an event reflective of the request to get event. 2027 */ 2028 compl->status = NVME_NO_STATUS; 2029 pci_nvme_aen_notify(sc); 2030 2031 return (0); 2032 } 2033 2034 static void 2035 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) 2036 { 2037 struct nvme_completion compl; 2038 struct nvme_command *cmd; 2039 struct nvme_submission_queue *sq; 2040 struct nvme_completion_queue *cq; 2041 uint16_t sqhead; 2042 2043 DPRINTF("%s index %u", __func__, (uint32_t)value); 2044 2045 sq = &sc->submit_queues[0]; 2046 cq = &sc->compl_queues[0]; 2047 2048 pthread_mutex_lock(&sq->mtx); 2049 2050 sqhead = sq->head; 2051 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail); 2052 2053 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2054 cmd = &(sq->qbase)[sqhead]; 2055 compl.cdw0 = 0; 2056 compl.status = 0; 2057 2058 switch (cmd->opc) { 2059 case NVME_OPC_DELETE_IO_SQ: 2060 DPRINTF("%s command DELETE_IO_SQ", __func__); 2061 nvme_opc_delete_io_sq(sc, cmd, &compl); 2062 break; 2063 case NVME_OPC_CREATE_IO_SQ: 2064 DPRINTF("%s command CREATE_IO_SQ", __func__); 2065 nvme_opc_create_io_sq(sc, cmd, &compl); 2066 break; 2067 case NVME_OPC_DELETE_IO_CQ: 2068 DPRINTF("%s command DELETE_IO_CQ", __func__); 2069 nvme_opc_delete_io_cq(sc, cmd, &compl); 2070 break; 2071 case NVME_OPC_CREATE_IO_CQ: 2072 DPRINTF("%s command CREATE_IO_CQ", __func__); 2073 nvme_opc_create_io_cq(sc, cmd, &compl); 2074 break; 2075 case NVME_OPC_GET_LOG_PAGE: 2076 DPRINTF("%s command GET_LOG_PAGE", __func__); 2077 nvme_opc_get_log_page(sc, cmd, &compl); 2078 break; 2079 case NVME_OPC_IDENTIFY: 2080 DPRINTF("%s command IDENTIFY", __func__); 2081 nvme_opc_identify(sc, cmd, &compl); 2082 break; 2083 case NVME_OPC_ABORT: 2084 DPRINTF("%s command ABORT", __func__); 2085 nvme_opc_abort(sc, cmd, &compl); 2086 break; 2087 case NVME_OPC_SET_FEATURES: 2088 DPRINTF("%s command SET_FEATURES", __func__); 2089 nvme_opc_set_features(sc, cmd, &compl); 2090 break; 2091 case NVME_OPC_GET_FEATURES: 2092 DPRINTF("%s command GET_FEATURES", __func__); 2093 nvme_opc_get_features(sc, cmd, &compl); 2094 break; 2095 case NVME_OPC_FIRMWARE_ACTIVATE: 2096 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__); 2097 pci_nvme_status_tc(&compl.status, 2098 NVME_SCT_COMMAND_SPECIFIC, 2099 NVME_SC_INVALID_FIRMWARE_SLOT); 2100 break; 2101 case NVME_OPC_ASYNC_EVENT_REQUEST: 2102 DPRINTF("%s command ASYNC_EVENT_REQ", __func__); 2103 nvme_opc_async_event_req(sc, cmd, &compl); 2104 break; 2105 case NVME_OPC_FORMAT_NVM: 2106 DPRINTF("%s command FORMAT_NVM", __func__); 2107 if ((sc->ctrldata.oacs & 2108 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) { 2109 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2110 break; 2111 } 2112 nvme_opc_format_nvm(sc, cmd, &compl); 2113 break; 2114 case NVME_OPC_SECURITY_SEND: 2115 case NVME_OPC_SECURITY_RECEIVE: 2116 case NVME_OPC_SANITIZE: 2117 case NVME_OPC_GET_LBA_STATUS: 2118 DPRINTF("%s command OPC=%#x 
(unsupported)", __func__, 2119 cmd->opc); 2120 /* Valid but unsupported opcodes */ 2121 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD); 2122 break; 2123 default: 2124 DPRINTF("%s command OPC=%#X (not implemented)", 2125 __func__, 2126 cmd->opc); 2127 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE); 2128 } 2129 sqhead = (sqhead + 1) % sq->size; 2130 2131 if (NVME_COMPLETION_VALID(compl)) { 2132 pci_nvme_cq_update(sc, &sc->compl_queues[0], 2133 compl.cdw0, 2134 cmd->cid, 2135 0, /* SQID */ 2136 compl.status); 2137 } 2138 } 2139 2140 DPRINTF("setting sqhead %u", sqhead); 2141 sq->head = sqhead; 2142 2143 if (cq->head != cq->tail) 2144 pci_generate_msix(sc->nsc_pi, 0); 2145 2146 pthread_mutex_unlock(&sq->mtx); 2147 } 2148 2149 /* 2150 * Update the Write and Read statistics reported in SMART data 2151 * 2152 * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up. 2153 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000 2154 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999. 2155 */ 2156 static void 2157 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc, 2158 size_t bytes, uint16_t status) 2159 { 2160 2161 pthread_mutex_lock(&sc->mtx); 2162 switch (opc) { 2163 case NVME_OPC_WRITE: 2164 sc->write_commands++; 2165 if (status != NVME_SC_SUCCESS) 2166 break; 2167 sc->write_dunits_remainder += (bytes / 512); 2168 while (sc->write_dunits_remainder >= 1000) { 2169 sc->write_data_units++; 2170 sc->write_dunits_remainder -= 1000; 2171 } 2172 break; 2173 case NVME_OPC_READ: 2174 sc->read_commands++; 2175 if (status != NVME_SC_SUCCESS) 2176 break; 2177 sc->read_dunits_remainder += (bytes / 512); 2178 while (sc->read_dunits_remainder >= 1000) { 2179 sc->read_data_units++; 2180 sc->read_dunits_remainder -= 1000; 2181 } 2182 break; 2183 default: 2184 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc); 2185 break; 2186 } 2187 pthread_mutex_unlock(&sc->mtx); 2188 } 2189 2190 /* 2191 * Check if the combination of Starting LBA (slba) and number of blocks 2192 * exceeds the range of the underlying storage. 2193 * 2194 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores 2195 * the capacity in bytes as a uint64_t, care must be taken to avoid integer 2196 * overflow. 2197 */ 2198 static bool 2199 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba, 2200 uint32_t nblocks) 2201 { 2202 size_t offset, bytes; 2203 2204 /* Overflow check of multiplying Starting LBA by the sector size */ 2205 if (slba >> (64 - nvstore->sectsz_bits)) 2206 return (true); 2207 2208 offset = slba << nvstore->sectsz_bits; 2209 bytes = nblocks << nvstore->sectsz_bits; 2210 2211 /* Overflow check of Number of Logical Blocks */ 2212 if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes)) 2213 return (true); 2214 2215 return (false); 2216 } 2217 2218 static int 2219 pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused, 2220 struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset) 2221 { 2222 int iovidx; 2223 bool range_is_contiguous; 2224 2225 if (req == NULL) 2226 return (-1); 2227 2228 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { 2229 return (-1); 2230 } 2231 2232 /* 2233 * Minimize the number of IOVs by concatenating contiguous address 2234 * ranges. If the IOV count is zero, there is no previous range to 2235 * concatenate. 
2236 */ 2237 if (req->io_req.br_iovcnt == 0) 2238 range_is_contiguous = false; 2239 else 2240 range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr; 2241 2242 if (range_is_contiguous) { 2243 iovidx = req->io_req.br_iovcnt - 1; 2244 2245 req->io_req.br_iov[iovidx].iov_base = 2246 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2247 req->prev_gpaddr, size); 2248 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2249 return (-1); 2250 2251 req->prev_size += size; 2252 req->io_req.br_resid += size; 2253 2254 req->io_req.br_iov[iovidx].iov_len = req->prev_size; 2255 } else { 2256 iovidx = req->io_req.br_iovcnt; 2257 if (iovidx == 0) { 2258 req->io_req.br_offset = offset; 2259 req->io_req.br_resid = 0; 2260 req->io_req.br_param = req; 2261 } 2262 2263 req->io_req.br_iov[iovidx].iov_base = 2264 paddr_guest2host(req->sc->nsc_pi->pi_vmctx, 2265 gpaddr, size); 2266 if (req->io_req.br_iov[iovidx].iov_base == NULL) 2267 return (-1); 2268 2269 req->io_req.br_iov[iovidx].iov_len = size; 2270 2271 req->prev_gpaddr = gpaddr; 2272 req->prev_size = size; 2273 req->io_req.br_resid += size; 2274 2275 req->io_req.br_iovcnt++; 2276 } 2277 2278 return (0); 2279 } 2280 2281 static void 2282 pci_nvme_set_completion(struct pci_nvme_softc *sc, 2283 struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status) 2284 { 2285 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; 2286 2287 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x", 2288 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), 2289 NVME_STATUS_GET_SC(status)); 2290 2291 pci_nvme_cq_update(sc, cq, 0, cid, sqid, status); 2292 2293 if (cq->head != cq->tail) { 2294 if (cq->intr_en & NVME_CQ_INTEN) { 2295 pci_generate_msix(sc->nsc_pi, cq->intr_vec); 2296 } else { 2297 DPRINTF("%s: CQ%u interrupt disabled", 2298 __func__, sq->cqid); 2299 } 2300 } 2301 } 2302 2303 static void 2304 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) 2305 { 2306 req->sc = NULL; 2307 req->nvme_sq = NULL; 2308 req->sqid = 0; 2309 2310 pthread_mutex_lock(&sc->mtx); 2311 2312 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link); 2313 sc->pending_ios--; 2314 2315 /* when no more IO pending, can set to ready if device reset/enabled */ 2316 if (sc->pending_ios == 0 && 2317 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) 2318 sc->regs.csts |= NVME_CSTS_RDY; 2319 2320 pthread_mutex_unlock(&sc->mtx); 2321 2322 sem_post(&sc->iosemlock); 2323 } 2324 2325 static struct pci_nvme_ioreq * 2326 pci_nvme_get_ioreq(struct pci_nvme_softc *sc) 2327 { 2328 struct pci_nvme_ioreq *req = NULL; 2329 2330 sem_wait(&sc->iosemlock); 2331 pthread_mutex_lock(&sc->mtx); 2332 2333 req = STAILQ_FIRST(&sc->ioreqs_free); 2334 assert(req != NULL); 2335 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link); 2336 2337 req->sc = sc; 2338 2339 sc->pending_ios++; 2340 2341 pthread_mutex_unlock(&sc->mtx); 2342 2343 req->io_req.br_iovcnt = 0; 2344 req->io_req.br_offset = 0; 2345 req->io_req.br_resid = 0; 2346 req->io_req.br_param = req; 2347 req->prev_gpaddr = 0; 2348 req->prev_size = 0; 2349 2350 return req; 2351 } 2352 2353 static void 2354 pci_nvme_io_done(struct blockif_req *br, int err) 2355 { 2356 struct pci_nvme_ioreq *req = br->br_param; 2357 struct nvme_submission_queue *sq = req->nvme_sq; 2358 uint16_t code, status; 2359 2360 DPRINTF("%s error %d %s", __func__, err, strerror(err)); 2361 2362 /* TODO return correct error */ 2363 code = err ? 
NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; 2364 status = 0; 2365 pci_nvme_status_genc(&status, code); 2366 2367 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status); 2368 pci_nvme_stats_write_read_update(req->sc, req->opc, 2369 req->bytes, status); 2370 pci_nvme_release_ioreq(req->sc, req); 2371 } 2372 2373 /* 2374 * Implements the Flush command. The specification states: 2375 * If a volatile write cache is not present, Flush commands complete 2376 * successfully and have no effect 2377 * in the description of the Volatile Write Cache (VWC) field of the Identify 2378 * Controller data. Therefore, set status to Success if the command is 2379 * not supported (i.e. RAM or as indicated by the blockif). 2380 */ 2381 static bool 2382 nvme_opc_flush(struct pci_nvme_softc *sc __unused, 2383 struct nvme_command *cmd __unused, 2384 struct pci_nvme_blockstore *nvstore, 2385 struct pci_nvme_ioreq *req, 2386 uint16_t *status) 2387 { 2388 bool pending = false; 2389 2390 if (nvstore->type == NVME_STOR_RAM) { 2391 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2392 } else { 2393 int err; 2394 2395 req->io_req.br_callback = pci_nvme_io_done; 2396 2397 err = blockif_flush(nvstore->ctx, &req->io_req); 2398 switch (err) { 2399 case 0: 2400 pending = true; 2401 break; 2402 case EOPNOTSUPP: 2403 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2404 break; 2405 default: 2406 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2407 } 2408 } 2409 2410 return (pending); 2411 } 2412 2413 static uint16_t 2414 nvme_write_read_ram(struct pci_nvme_softc *sc, 2415 struct pci_nvme_blockstore *nvstore, 2416 uint64_t prp1, uint64_t prp2, 2417 size_t offset, uint64_t bytes, 2418 bool is_write) 2419 { 2420 uint8_t *buf = nvstore->ctx; 2421 enum nvme_copy_dir dir; 2422 uint16_t status; 2423 2424 if (is_write) 2425 dir = NVME_COPY_TO_PRP; 2426 else 2427 dir = NVME_COPY_FROM_PRP; 2428 2429 status = 0; 2430 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2, 2431 buf + offset, bytes, dir)) 2432 pci_nvme_status_genc(&status, 2433 NVME_SC_DATA_TRANSFER_ERROR); 2434 else 2435 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2436 2437 return (status); 2438 } 2439 2440 static uint16_t 2441 nvme_write_read_blockif(struct pci_nvme_softc *sc, 2442 struct pci_nvme_blockstore *nvstore, 2443 struct pci_nvme_ioreq *req, 2444 uint64_t prp1, uint64_t prp2, 2445 size_t offset, uint64_t bytes, 2446 bool is_write) 2447 { 2448 uint64_t size; 2449 int err; 2450 uint16_t status = NVME_NO_STATUS; 2451 2452 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes); 2453 if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) { 2454 err = -1; 2455 goto out; 2456 } 2457 2458 offset += size; 2459 bytes -= size; 2460 2461 if (bytes == 0) { 2462 ; 2463 } else if (bytes <= PAGE_SIZE) { 2464 size = bytes; 2465 if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) { 2466 err = -1; 2467 goto out; 2468 } 2469 } else { 2470 void *vmctx = sc->nsc_pi->pi_vmctx; 2471 uint64_t *prp_list = &prp2; 2472 uint64_t *last = prp_list; 2473 2474 /* PRP2 is pointer to a physical region page list */ 2475 while (bytes) { 2476 /* Last entry in list points to the next list */ 2477 if ((prp_list == last) && (bytes > PAGE_SIZE)) { 2478 uint64_t prp = *prp_list; 2479 2480 prp_list = paddr_guest2host(vmctx, prp, 2481 PAGE_SIZE - (prp % PAGE_SIZE)); 2482 if (prp_list == NULL) { 2483 err = -1; 2484 goto out; 2485 } 2486 last = prp_list + (NVME_PRP2_ITEMS - 1); 2487 } 2488 2489 size = MIN(bytes, PAGE_SIZE); 2490 2491 if (pci_nvme_append_iov_req(sc, req, *prp_list, 
size, 2492 offset)) { 2493 err = -1; 2494 goto out; 2495 } 2496 2497 offset += size; 2498 bytes -= size; 2499 2500 prp_list++; 2501 } 2502 } 2503 req->io_req.br_callback = pci_nvme_io_done; 2504 if (is_write) 2505 err = blockif_write(nvstore->ctx, &req->io_req); 2506 else 2507 err = blockif_read(nvstore->ctx, &req->io_req); 2508 out: 2509 if (err) 2510 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR); 2511 2512 return (status); 2513 } 2514 2515 static bool 2516 nvme_opc_write_read(struct pci_nvme_softc *sc, 2517 struct nvme_command *cmd, 2518 struct pci_nvme_blockstore *nvstore, 2519 struct pci_nvme_ioreq *req, 2520 uint16_t *status) 2521 { 2522 uint64_t lba, nblocks, bytes; 2523 size_t offset; 2524 bool is_write = cmd->opc == NVME_OPC_WRITE; 2525 bool pending = false; 2526 2527 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; 2528 nblocks = (cmd->cdw12 & 0xFFFF) + 1; 2529 bytes = nblocks << nvstore->sectsz_bits; 2530 if (bytes > NVME_MAX_DATA_SIZE) { 2531 WPRINTF("%s command would exceed MDTS", __func__); 2532 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); 2533 goto out; 2534 } 2535 2536 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) { 2537 WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)", 2538 __func__, lba, nblocks); 2539 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2540 goto out; 2541 } 2542 2543 offset = lba << nvstore->sectsz_bits; 2544 2545 req->bytes = bytes; 2546 req->io_req.br_offset = lba; 2547 2548 /* PRP bits 1:0 must be zero */ 2549 cmd->prp1 &= ~0x3UL; 2550 cmd->prp2 &= ~0x3UL; 2551 2552 if (nvstore->type == NVME_STOR_RAM) { 2553 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1, 2554 cmd->prp2, offset, bytes, is_write); 2555 } else { 2556 *status = nvme_write_read_blockif(sc, nvstore, req, 2557 cmd->prp1, cmd->prp2, offset, bytes, is_write); 2558 2559 if (*status == NVME_NO_STATUS) 2560 pending = true; 2561 } 2562 out: 2563 if (!pending) 2564 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status); 2565 2566 return (pending); 2567 } 2568 2569 static void 2570 pci_nvme_dealloc_sm(struct blockif_req *br, int err) 2571 { 2572 struct pci_nvme_ioreq *req = br->br_param; 2573 struct pci_nvme_softc *sc = req->sc; 2574 bool done = true; 2575 uint16_t status; 2576 2577 status = 0; 2578 if (err) { 2579 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR); 2580 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) { 2581 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2582 } else { 2583 struct iovec *iov = req->io_req.br_iov; 2584 2585 req->prev_gpaddr++; 2586 iov += req->prev_gpaddr; 2587 2588 /* The iov_* values already include the sector size */ 2589 req->io_req.br_offset = (off_t)iov->iov_base; 2590 req->io_req.br_resid = iov->iov_len; 2591 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) { 2592 pci_nvme_status_genc(&status, 2593 NVME_SC_INTERNAL_DEVICE_ERROR); 2594 } else 2595 done = false; 2596 } 2597 2598 if (done) { 2599 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid, 2600 status); 2601 pci_nvme_release_ioreq(sc, req); 2602 } 2603 } 2604 2605 static bool 2606 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc, 2607 struct nvme_command *cmd, 2608 struct pci_nvme_blockstore *nvstore, 2609 struct pci_nvme_ioreq *req, 2610 uint16_t *status) 2611 { 2612 struct nvme_dsm_range *range = NULL; 2613 uint32_t nr, r, non_zero, dr; 2614 int err; 2615 bool pending = false; 2616 2617 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) { 2618 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE); 2619 goto out; 2620 } 
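	/*
	 * Number of Ranges (NR) is encoded in CDW10 bits 7:0 as a zero
	 * based count (0 means one range), so the loops below run r from
	 * 0 through nr inclusive.
	 */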
2621 2622 nr = cmd->cdw10 & 0xff; 2623 2624 /* copy locally because a range entry could straddle PRPs */ 2625 #ifdef __FreeBSD__ 2626 range = calloc(1, NVME_MAX_DSM_TRIM); 2627 #else 2628 _Static_assert(NVME_MAX_DSM_TRIM % sizeof(struct nvme_dsm_range) == 0, 2629 "NVME_MAX_DSM_TRIM is not a multiple of struct size"); 2630 range = calloc(NVME_MAX_DSM_TRIM / sizeof (*range), sizeof (*range)); 2631 #endif 2632 if (range == NULL) { 2633 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2634 goto out; 2635 } 2636 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2, 2637 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP); 2638 2639 /* Check for invalid ranges and the number of non-zero lengths */ 2640 non_zero = 0; 2641 for (r = 0; r <= nr; r++) { 2642 if (pci_nvme_out_of_range(nvstore, 2643 range[r].starting_lba, range[r].length)) { 2644 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); 2645 goto out; 2646 } 2647 if (range[r].length != 0) 2648 non_zero++; 2649 } 2650 2651 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) { 2652 size_t offset, bytes; 2653 int sectsz_bits = sc->nvstore.sectsz_bits; 2654 2655 /* 2656 * DSM calls are advisory only, and compliant controllers 2657 * may choose to take no actions (i.e. return Success). 2658 */ 2659 if (!nvstore->deallocate) { 2660 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2661 goto out; 2662 } 2663 2664 /* If all ranges have a zero length, return Success */ 2665 if (non_zero == 0) { 2666 pci_nvme_status_genc(status, NVME_SC_SUCCESS); 2667 goto out; 2668 } 2669 2670 if (req == NULL) { 2671 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2672 goto out; 2673 } 2674 2675 offset = range[0].starting_lba << sectsz_bits; 2676 bytes = range[0].length << sectsz_bits; 2677 2678 /* 2679 * If the request is for more than a single range, store 2680 * the ranges in the br_iov. Optimize for the common case 2681 * of a single range. 
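	 * For the multi-range case the iovec array is borrowed as scratch
	 * space: iov_base holds each range's byte offset and iov_len its
	 * byte count, and pci_nvme_dealloc_sm() issues one blockif_delete()
	 * per entry.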
2682 * 2683 * Note that NVMe Number of Ranges is a zero based value 2684 */ 2685 req->io_req.br_iovcnt = 0; 2686 req->io_req.br_offset = offset; 2687 req->io_req.br_resid = bytes; 2688 2689 if (nr == 0) { 2690 req->io_req.br_callback = pci_nvme_io_done; 2691 } else { 2692 struct iovec *iov = req->io_req.br_iov; 2693 2694 for (r = 0, dr = 0; r <= nr; r++) { 2695 offset = range[r].starting_lba << sectsz_bits; 2696 bytes = range[r].length << sectsz_bits; 2697 if (bytes == 0) 2698 continue; 2699 2700 if ((nvstore->size - offset) < bytes) { 2701 pci_nvme_status_genc(status, 2702 NVME_SC_LBA_OUT_OF_RANGE); 2703 goto out; 2704 } 2705 iov[dr].iov_base = (void *)offset; 2706 iov[dr].iov_len = bytes; 2707 dr++; 2708 } 2709 req->io_req.br_callback = pci_nvme_dealloc_sm; 2710 2711 /* 2712 * Use prev_gpaddr to track the current entry and 2713 * prev_size to track the number of entries 2714 */ 2715 req->prev_gpaddr = 0; 2716 req->prev_size = dr; 2717 } 2718 2719 err = blockif_delete(nvstore->ctx, &req->io_req); 2720 if (err) 2721 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR); 2722 else 2723 pending = true; 2724 } 2725 out: 2726 free(range); 2727 return (pending); 2728 } 2729 2730 static void 2731 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) 2732 { 2733 struct nvme_submission_queue *sq; 2734 uint16_t status; 2735 uint16_t sqhead; 2736 2737 /* handle all submissions up to sq->tail index */ 2738 sq = &sc->submit_queues[idx]; 2739 2740 pthread_mutex_lock(&sq->mtx); 2741 2742 sqhead = sq->head; 2743 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p", 2744 idx, sqhead, sq->tail, sq->qbase); 2745 2746 while (sqhead != atomic_load_acq_short(&sq->tail)) { 2747 struct nvme_command *cmd; 2748 struct pci_nvme_ioreq *req; 2749 uint32_t nsid; 2750 bool pending; 2751 2752 pending = false; 2753 req = NULL; 2754 status = 0; 2755 2756 cmd = &sq->qbase[sqhead]; 2757 sqhead = (sqhead + 1) % sq->size; 2758 2759 nsid = le32toh(cmd->nsid); 2760 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) { 2761 pci_nvme_status_genc(&status, 2762 NVME_SC_INVALID_NAMESPACE_OR_FORMAT); 2763 status |= 2764 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT; 2765 goto complete; 2766 } 2767 2768 req = pci_nvme_get_ioreq(sc); 2769 if (req == NULL) { 2770 pci_nvme_status_genc(&status, 2771 NVME_SC_INTERNAL_DEVICE_ERROR); 2772 WPRINTF("%s: unable to allocate IO req", __func__); 2773 goto complete; 2774 } 2775 req->nvme_sq = sq; 2776 req->sqid = idx; 2777 req->opc = cmd->opc; 2778 req->cid = cmd->cid; 2779 req->nsid = cmd->nsid; 2780 2781 switch (cmd->opc) { 2782 case NVME_OPC_FLUSH: 2783 pending = nvme_opc_flush(sc, cmd, &sc->nvstore, 2784 req, &status); 2785 break; 2786 case NVME_OPC_WRITE: 2787 case NVME_OPC_READ: 2788 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, 2789 req, &status); 2790 break; 2791 case NVME_OPC_WRITE_ZEROES: 2792 /* TODO: write zeroes 2793 WPRINTF("%s write zeroes lba 0x%lx blocks %u", 2794 __func__, lba, cmd->cdw12 & 0xFFFF); */ 2795 pci_nvme_status_genc(&status, NVME_SC_SUCCESS); 2796 break; 2797 case NVME_OPC_DATASET_MANAGEMENT: 2798 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, 2799 req, &status); 2800 break; 2801 default: 2802 WPRINTF("%s unhandled io command 0x%x", 2803 __func__, cmd->opc); 2804 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); 2805 } 2806 complete: 2807 if (!pending) { 2808 pci_nvme_set_completion(sc, sq, idx, cmd->cid, status); 2809 if (req != NULL) 2810 pci_nvme_release_ioreq(sc, req); 2811 } 2812 } 2813 2814 sq->head = sqhead; 2815 2816 
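		/* All submissions up to the doorbell tail have been consumed */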
pthread_mutex_unlock(&sq->mtx); 2817 } 2818 2819 static void 2820 pci_nvme_handle_doorbell(struct pci_nvme_softc* sc, 2821 uint64_t idx, int is_sq, uint64_t value) 2822 { 2823 DPRINTF("nvme doorbell %lu, %s, val 0x%lx", 2824 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF); 2825 2826 if (is_sq) { 2827 if (idx > sc->num_squeues) { 2828 WPRINTF("%s queue index %lu overflow from " 2829 "guest (max %u)", 2830 __func__, idx, sc->num_squeues); 2831 return; 2832 } 2833 2834 atomic_store_short(&sc->submit_queues[idx].tail, 2835 (uint16_t)value); 2836 2837 if (idx == 0) { 2838 pci_nvme_handle_admin_cmd(sc, value); 2839 } else { 2840 /* submission queue; handle new entries in SQ */ 2841 if (idx > sc->num_squeues) { 2842 WPRINTF("%s SQ index %lu overflow from " 2843 "guest (max %u)", 2844 __func__, idx, sc->num_squeues); 2845 return; 2846 } 2847 pci_nvme_handle_io_cmd(sc, (uint16_t)idx); 2848 } 2849 } else { 2850 if (idx > sc->num_cqueues) { 2851 WPRINTF("%s queue index %lu overflow from " 2852 "guest (max %u)", 2853 __func__, idx, sc->num_cqueues); 2854 return; 2855 } 2856 2857 atomic_store_short(&sc->compl_queues[idx].head, 2858 (uint16_t)value); 2859 } 2860 } 2861 2862 static void 2863 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) 2864 { 2865 const char *s = iswrite ? "WRITE" : "READ"; 2866 2867 switch (offset) { 2868 case NVME_CR_CAP_LOW: 2869 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s); 2870 break; 2871 case NVME_CR_CAP_HI: 2872 DPRINTF("%s %s NVME_CR_CAP_HI", func, s); 2873 break; 2874 case NVME_CR_VS: 2875 DPRINTF("%s %s NVME_CR_VS", func, s); 2876 break; 2877 case NVME_CR_INTMS: 2878 DPRINTF("%s %s NVME_CR_INTMS", func, s); 2879 break; 2880 case NVME_CR_INTMC: 2881 DPRINTF("%s %s NVME_CR_INTMC", func, s); 2882 break; 2883 case NVME_CR_CC: 2884 DPRINTF("%s %s NVME_CR_CC", func, s); 2885 break; 2886 case NVME_CR_CSTS: 2887 DPRINTF("%s %s NVME_CR_CSTS", func, s); 2888 break; 2889 case NVME_CR_NSSR: 2890 DPRINTF("%s %s NVME_CR_NSSR", func, s); 2891 break; 2892 case NVME_CR_AQA: 2893 DPRINTF("%s %s NVME_CR_AQA", func, s); 2894 break; 2895 case NVME_CR_ASQ_LOW: 2896 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s); 2897 break; 2898 case NVME_CR_ASQ_HI: 2899 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s); 2900 break; 2901 case NVME_CR_ACQ_LOW: 2902 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s); 2903 break; 2904 case NVME_CR_ACQ_HI: 2905 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s); 2906 break; 2907 default: 2908 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset); 2909 } 2910 2911 } 2912 2913 static void 2914 pci_nvme_write_bar_0(struct pci_nvme_softc *sc, uint64_t offset, int size, 2915 uint64_t value) 2916 { 2917 uint32_t ccreg; 2918 2919 if (offset >= NVME_DOORBELL_OFFSET) { 2920 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; 2921 uint64_t idx = belloffset / 8; /* door bell size = 2*int */ 2922 int is_sq = (belloffset % 8) < 4; 2923 2924 if ((sc->regs.csts & NVME_CSTS_RDY) == 0) { 2925 WPRINTF("doorbell write prior to RDY (offset=%#lx)\n", 2926 offset); 2927 return; 2928 } 2929 2930 if (belloffset > ((sc->max_queues+1) * 8 - 4)) { 2931 WPRINTF("guest attempted an overflow write offset " 2932 "0x%lx, val 0x%lx in %s", 2933 offset, value, __func__); 2934 return; 2935 } 2936 2937 if (is_sq) { 2938 if (sc->submit_queues[idx].qbase == NULL) 2939 return; 2940 } else if (sc->compl_queues[idx].qbase == NULL) 2941 return; 2942 2943 pci_nvme_handle_doorbell(sc, idx, is_sq, value); 2944 return; 2945 } 2946 2947 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx", 2948 offset, size, value); 2949 2950 if 
(size != 4) { 2951 WPRINTF("guest wrote invalid size %d (offset 0x%lx, " 2952 "val 0x%lx) to bar0 in %s", 2953 size, offset, value, __func__); 2954 /* TODO: shutdown device */ 2955 return; 2956 } 2957 2958 pci_nvme_bar0_reg_dumps(__func__, offset, 1); 2959 2960 pthread_mutex_lock(&sc->mtx); 2961 2962 switch (offset) { 2963 case NVME_CR_CAP_LOW: 2964 case NVME_CR_CAP_HI: 2965 /* readonly */ 2966 break; 2967 case NVME_CR_VS: 2968 /* readonly */ 2969 break; 2970 case NVME_CR_INTMS: 2971 /* MSI-X, so ignore */ 2972 break; 2973 case NVME_CR_INTMC: 2974 /* MSI-X, so ignore */ 2975 break; 2976 case NVME_CR_CC: 2977 ccreg = (uint32_t)value; 2978 2979 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u " 2980 "iocqes %u", 2981 __func__, 2982 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), 2983 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), 2984 NVME_CC_GET_IOCQES(ccreg)); 2985 2986 if (NVME_CC_GET_SHN(ccreg)) { 2987 /* perform shutdown - flush out data to backend */ 2988 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << 2989 NVME_CSTS_REG_SHST_SHIFT); 2990 sc->regs.csts |= NVME_SHST_COMPLETE << 2991 NVME_CSTS_REG_SHST_SHIFT; 2992 } 2993 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { 2994 if (NVME_CC_GET_EN(ccreg) == 0) 2995 /* transition 1->0 causes controller reset */ 2996 pci_nvme_reset_locked(sc); 2997 else 2998 pci_nvme_init_controller(sc); 2999 } 3000 3001 /* Insert the iocqes, iosqes and en bits from the write */ 3002 sc->regs.cc &= ~NVME_CC_WRITE_MASK; 3003 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; 3004 if (NVME_CC_GET_EN(ccreg) == 0) { 3005 /* Insert the ams, mps and css bit fields */ 3006 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; 3007 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; 3008 sc->regs.csts &= ~NVME_CSTS_RDY; 3009 } else if ((sc->pending_ios == 0) && 3010 !(sc->regs.csts & NVME_CSTS_CFS)) { 3011 sc->regs.csts |= NVME_CSTS_RDY; 3012 } 3013 break; 3014 case NVME_CR_CSTS: 3015 break; 3016 case NVME_CR_NSSR: 3017 /* ignore writes; don't support subsystem reset */ 3018 break; 3019 case NVME_CR_AQA: 3020 sc->regs.aqa = (uint32_t)value; 3021 break; 3022 case NVME_CR_ASQ_LOW: 3023 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | 3024 (0xFFFFF000 & value); 3025 break; 3026 case NVME_CR_ASQ_HI: 3027 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | 3028 (value << 32); 3029 break; 3030 case NVME_CR_ACQ_LOW: 3031 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | 3032 (0xFFFFF000 & value); 3033 break; 3034 case NVME_CR_ACQ_HI: 3035 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | 3036 (value << 32); 3037 break; 3038 default: 3039 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d", 3040 __func__, offset, value, size); 3041 } 3042 pthread_mutex_unlock(&sc->mtx); 3043 } 3044 3045 static void 3046 pci_nvme_write(struct pci_devinst *pi, int baridx, uint64_t offset, int size, 3047 uint64_t value) 3048 { 3049 struct pci_nvme_softc* sc = pi->pi_arg; 3050 3051 if (baridx == pci_msix_table_bar(pi) || 3052 baridx == pci_msix_pba_bar(pi)) { 3053 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, " 3054 " value 0x%lx", baridx, offset, size, value); 3055 3056 pci_emul_msix_twrite(pi, offset, size, value); 3057 return; 3058 } 3059 3060 switch (baridx) { 3061 case 0: 3062 pci_nvme_write_bar_0(sc, offset, size, value); 3063 break; 3064 3065 default: 3066 DPRINTF("%s unknown baridx %d, val 0x%lx", 3067 __func__, baridx, value); 3068 } 3069 } 3070 3071 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, 3072 uint64_t offset, int size) 3073 { 3074 uint64_t
value; 3075 3076 pci_nvme_bar0_reg_dumps(__func__, offset, 0); 3077 3078 if (offset < NVME_DOORBELL_OFFSET) { 3079 void *p = &(sc->regs); 3080 pthread_mutex_lock(&sc->mtx); 3081 memcpy(&value, (void *)((uintptr_t)p + offset), size); 3082 pthread_mutex_unlock(&sc->mtx); 3083 } else { 3084 value = 0; 3085 WPRINTF("pci_nvme: read invalid offset %ld", offset); 3086 } 3087 3088 switch (size) { 3089 case 1: 3090 value &= 0xFF; 3091 break; 3092 case 2: 3093 value &= 0xFFFF; 3094 break; 3095 case 4: 3096 value &= 0xFFFFFFFF; 3097 break; 3098 } 3099 3100 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x", 3101 offset, size, (uint32_t)value); 3102 3103 return (value); 3104 } 3105 3106 3107 3108 static uint64_t 3109 pci_nvme_read(struct pci_devinst *pi, int baridx, uint64_t offset, int size) 3110 { 3111 struct pci_nvme_softc* sc = pi->pi_arg; 3112 3113 if (baridx == pci_msix_table_bar(pi) || 3114 baridx == pci_msix_pba_bar(pi)) { 3115 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d", 3116 baridx, offset, size); 3117 3118 return pci_emul_msix_tread(pi, offset, size); 3119 } 3120 3121 switch (baridx) { 3122 case 0: 3123 return pci_nvme_read_bar_0(sc, offset, size); 3124 3125 default: 3126 DPRINTF("unknown bar %d, 0x%lx", baridx, offset); 3127 } 3128 3129 return (0); 3130 } 3131 3132 static int 3133 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl) 3134 { 3135 char bident[sizeof("XXX:XXX")]; 3136 const char *value; 3137 uint32_t sectsz; 3138 3139 sc->max_queues = NVME_QUEUES; 3140 sc->max_qentries = NVME_MAX_QENTRIES; 3141 sc->ioslots = NVME_IOSLOTS; 3142 sc->num_squeues = sc->max_queues; 3143 sc->num_cqueues = sc->max_queues; 3144 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3145 sectsz = 0; 3146 #ifdef __FreeBSD__ 3147 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 3148 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3149 #else 3150 snprintf((char *)sc->ctrldata.sn, sizeof(sc->ctrldata.sn), 3151 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3152 #endif 3153 3154 value = get_config_value_node(nvl, "maxq"); 3155 if (value != NULL) 3156 sc->max_queues = atoi(value); 3157 value = get_config_value_node(nvl, "qsz"); 3158 if (value != NULL) { 3159 sc->max_qentries = atoi(value); 3160 if (sc->max_qentries <= 0) { 3161 EPRINTLN("nvme: Invalid qsz option %d", 3162 sc->max_qentries); 3163 return (-1); 3164 } 3165 } 3166 value = get_config_value_node(nvl, "ioslots"); 3167 if (value != NULL) { 3168 sc->ioslots = atoi(value); 3169 if (sc->ioslots <= 0) { 3170 EPRINTLN("Invalid ioslots option %d", sc->ioslots); 3171 return (-1); 3172 } 3173 } 3174 value = get_config_value_node(nvl, "sectsz"); 3175 if (value != NULL) 3176 sectsz = atoi(value); 3177 value = get_config_value_node(nvl, "ser"); 3178 if (value != NULL) { 3179 /* 3180 * This field indicates the Product Serial Number in 3181 * 7-bit ASCII, unused bytes should be space characters. 3182 * Ref: NVMe v1.3c. 
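	 * cpywithpad() below copies the configured value and fills the
	 * remainder of the field with spaces.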
3183 */ 3184 cpywithpad((char *)sc->ctrldata.sn, 3185 sizeof(sc->ctrldata.sn), value, ' '); 3186 } 3187 value = get_config_value_node(nvl, "eui64"); 3188 if (value != NULL) 3189 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0)); 3190 value = get_config_value_node(nvl, "dsm"); 3191 if (value != NULL) { 3192 if (strcmp(value, "auto") == 0) 3193 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO; 3194 else if (strcmp(value, "enable") == 0) 3195 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE; 3196 else if (strcmp(value, "disable") == 0) 3197 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE; 3198 } 3199 3200 value = get_config_value_node(nvl, "bootindex"); 3201 if (value != NULL) { 3202 if (pci_emul_add_boot_device(sc->nsc_pi, atoi(value))) { 3203 EPRINTLN("Invalid bootindex %d", atoi(value)); 3204 return (-1); 3205 } 3206 } 3207 3208 value = get_config_value_node(nvl, "ram"); 3209 if (value != NULL) { 3210 uint64_t sz = strtoull(value, NULL, 10); 3211 3212 sc->nvstore.type = NVME_STOR_RAM; 3213 sc->nvstore.size = sz * 1024 * 1024; 3214 sc->nvstore.ctx = calloc(1, sc->nvstore.size); 3215 sc->nvstore.sectsz = 4096; 3216 sc->nvstore.sectsz_bits = 12; 3217 if (sc->nvstore.ctx == NULL) { 3218 EPRINTLN("nvme: Unable to allocate RAM"); 3219 return (-1); 3220 } 3221 } else { 3222 snprintf(bident, sizeof(bident), "%u:%u", 3223 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); 3224 sc->nvstore.ctx = blockif_open(nvl, bident); 3225 if (sc->nvstore.ctx == NULL) { 3226 EPRINTLN("nvme: Could not open backing file: %s", 3227 strerror(errno)); 3228 return (-1); 3229 } 3230 sc->nvstore.type = NVME_STOR_BLOCKIF; 3231 sc->nvstore.size = blockif_size(sc->nvstore.ctx); 3232 } 3233 3234 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) 3235 sc->nvstore.sectsz = sectsz; 3236 else if (sc->nvstore.type != NVME_STOR_RAM) 3237 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); 3238 for (sc->nvstore.sectsz_bits = 9; 3239 (1U << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; 3240 sc->nvstore.sectsz_bits++); 3241 3242 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) 3243 sc->max_queues = NVME_QUEUES; 3244 3245 return (0); 3246 } 3247 3248 static void 3249 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg, 3250 size_t new_size) 3251 { 3252 struct pci_nvme_softc *sc; 3253 struct pci_nvme_blockstore *nvstore; 3254 struct nvme_namespace_data *nd; 3255 3256 sc = arg; 3257 nvstore = &sc->nvstore; 3258 nd = &sc->nsdata; 3259 3260 nvstore->size = new_size; 3261 pci_nvme_init_nsdata_size(nvstore, nd); 3262 3263 /* Add changed NSID to list */ 3264 sc->ns_log.ns[0] = 1; 3265 sc->ns_log.ns[1] = 0; 3266 3267 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE, 3268 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED); 3269 } 3270 3271 static int 3272 pci_nvme_init(struct pci_devinst *pi, nvlist_t *nvl) 3273 { 3274 struct pci_nvme_softc *sc; 3275 uint32_t pci_membar_sz; 3276 int error; 3277 3278 error = 0; 3279 3280 sc = calloc(1, sizeof(struct pci_nvme_softc)); 3281 pi->pi_arg = sc; 3282 sc->nsc_pi = pi; 3283 3284 error = pci_nvme_parse_config(sc, nvl); 3285 if (error < 0) 3286 goto done; 3287 else 3288 error = 0; 3289 3290 STAILQ_INIT(&sc->ioreqs_free); 3291 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); 3292 for (uint32_t i = 0; i < sc->ioslots; i++) { 3293 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); 3294 } 3295 3296 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); 3297 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); 3298 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); 3299 
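	/*
	 * 0xFB5D/0x0A0A are bhyve's synthetic vendor/device IDs; together
	 * with the storage class above and the NVM subclass/prog-if below,
	 * they identify the device to the guest as an NVMe controller.
	 */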
pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); 3300 pci_set_cfgdata8(pi, PCIR_PROGIF, 3301 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); 3302 3303 /* 3304 * Allocate size of NVMe registers + doorbell space for all queues. 3305 * 3306 * The specification requires a minimum memory I/O window size of 16K. 3307 * The Windows driver will refuse to start a device with a smaller 3308 * window. 3309 */ 3310 pci_membar_sz = sizeof(struct nvme_registers) + 3311 2 * sizeof(uint32_t) * (sc->max_queues + 1); 3312 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); 3313 3314 DPRINTF("nvme membar size: %u", pci_membar_sz); 3315 3316 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); 3317 if (error) { 3318 WPRINTF("%s pci alloc mem bar failed", __func__); 3319 goto done; 3320 } 3321 3322 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); 3323 if (error) { 3324 WPRINTF("%s pci add msixcap failed", __func__); 3325 goto done; 3326 } 3327 3328 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP); 3329 if (error) { 3330 WPRINTF("%s pci add Express capability failed", __func__); 3331 goto done; 3332 } 3333 3334 pthread_mutex_init(&sc->mtx, NULL); 3335 sem_init(&sc->iosemlock, 0, sc->ioslots); 3336 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc); 3337 3338 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues); 3339 /* 3340 * Controller data depends on Namespace data so initialize Namespace 3341 * data first. 3342 */ 3343 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore); 3344 pci_nvme_init_ctrldata(sc); 3345 pci_nvme_init_logpages(sc); 3346 pci_nvme_init_features(sc); 3347 3348 pci_nvme_aer_init(sc); 3349 pci_nvme_aen_init(sc); 3350 3351 pci_nvme_reset(sc); 3352 done: 3353 return (error); 3354 } 3355 3356 static int 3357 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts) 3358 { 3359 char *cp, *ram; 3360 3361 if (opts == NULL) 3362 return (0); 3363 3364 if (strncmp(opts, "ram=", 4) == 0) { 3365 cp = strchr(opts, ','); 3366 if (cp == NULL) { 3367 set_config_value_node(nvl, "ram", opts + 4); 3368 return (0); 3369 } 3370 ram = strndup(opts + 4, cp - opts - 4); 3371 set_config_value_node(nvl, "ram", ram); 3372 free(ram); 3373 return (pci_parse_legacy_config(nvl, cp + 1)); 3374 } else 3375 return (blockif_legacy_config(nvl, opts)); 3376 } 3377 3378 static const struct pci_devemu pci_de_nvme = { 3379 .pe_emu = "nvme", 3380 .pe_init = pci_nvme_init, 3381 .pe_legacy_config = pci_nvme_legacy_config, 3382 .pe_barwrite = pci_nvme_write, 3383 .pe_barread = pci_nvme_read 3384 }; 3385 PCI_EMUL_SET(pci_de_nvme); 3386
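
/*
 * PCI_EMUL_SET() places pci_de_nvme in the linker set of device
 * emulations that bhyve's generic PCI code scans when matching the
 * emulation name from the command line; no explicit registration
 * call is needed.
 */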