/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  accepted devpath:
 *    /dev/blockdev
 *    /path/to/image
 *    ram=size_in_MiB
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = Dataset Management support. Option is one of auto, enable, disable
 *
 */

/* TODO:
    - create async event for smart and log
    - intr coalesce
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/errno.h>
#include <sys/types.h>
#include <net/ieee_oui.h>
#ifndef __FreeBSD__
#include <endian.h>
#endif

#include <assert.h>
#include <pthread.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "config.h"
#include "debug.h"
#include "pci_emul.h"


static int nvme_debug = 0;
#define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
#define	WPRINTF(fmt, args...) \
	PRINTLN(fmt, ##args)

/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define	NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048
/* Memory Page size Minimum reported in CAP register */
#define	NVME_MPSMIN		0
/* MPSMIN converted to bytes */
#define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))

#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
#define	NVME_MDTS		9
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
#define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)

/* This is a synthetic status code to indicate there is no status */
#define	NVME_NO_STATUS		0xffff
#define	NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)

/* helpers */

/* Convert a zero-based value into a one-based value */
#define	ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define	ZERO_BASED(one)		((one) - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define	NVME_FEATURE_NUM_QUEUES(sc) \
	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	 ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))

#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};

enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

enum nvme_copy_dir {
	NVME_COPY_TO_PRP,
	NVME_COPY_FROM_PRP,
};

#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02

struct nvme_completion_queue {
	struct nvme_completion *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	tail; /* nvme progress */
	uint16_t	head; /* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	head; /* nvme progress */
	uint16_t	tail; /* guest progress */
	uint16_t	cqid; /* completion queue id */
	int		qpriority;
};

enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;
	uint64_t	size;
	uint32_t	sectsz;
	uint32_t	sectsz_bits;
	uint64_t	eui64;
	uint32_t	deallocate:1;
};

/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer (MDTS) and given the number of
 * default iovec's in a struct blockif_req.
 *
 * Note the + 1 allows for the initial descriptor to not be page aligned.
 */
#define	MDTS_PAD_SIZE \
	(NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	 NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
	 0)

struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	STAILQ_ENTRY(pci_nvme_ioreq) link;
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;
	size_t		bytes;

	struct blockif_req io_req;

	struct iovec	iovpadding[MDTS_PAD_SIZE];
};

enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};

struct pci_nvme_softc;
struct nvme_feature_obj;

typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

struct nvme_feature_obj {
	uint32_t	cdw11;
	nvme_feature_cb	set;
	nvme_feature_cb	get;
	bool		namespace_specific;
};

#define	NVME_FID_MAX	(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)

struct pci_nvme_aer {
	STAILQ_ENTRY(pci_nvme_aer) link;
	uint16_t	cid;	/* Command ID of the submitted AER */
};

struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;
	bool		num_q_is_set;	/* Has host set Number of Queues */

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	uint32_t	ioslots;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	struct nvme_feature_obj feat[NVME_FID_MAX];

	enum nvme_dsm_type dataset_management;

	/* Accounting for SMART data */
	__uint128_t	read_data_units;
	__uint128_t	write_data_units;
	__uint128_t	read_commands;
	__uint128_t	write_commands;
	uint32_t	read_dunits_remainder;
	uint32_t	write_dunits_remainder;

	STAILQ_HEAD(, pci_nvme_aer) aer_list;
	uint32_t	aer_count;
};


static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);

/* Controller Configuration utils */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT \
	    & NVME_CC_REG_IOCQES_MASK)

#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define	NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)

static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}

static __inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}

/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
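 *
 * Note: index 0 of both queue arrays is the Admin queue, which is why each
 * allocation below is sized one larger than the requested IO queue count.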
 */
static void
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
	uint32_t i;

	/*
	 * Allocate and initialize the Submission Queues
	 */
	if (nsq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of SQ from %u to %u",
		    __func__, nsq, NVME_QUEUES);
		nsq = NVME_QUEUES;
	}

	sc->num_squeues = nsq;

	sc->submit_queues = calloc(sc->num_squeues + 1,
	    sizeof(struct nvme_submission_queue));
	if (sc->submit_queues == NULL) {
		WPRINTF("%s: SQ allocation failed", __func__);
		sc->num_squeues = 0;
	} else {
		struct nvme_submission_queue *sq = sc->submit_queues;

		for (i = 0; i < sc->num_squeues; i++)
			pthread_mutex_init(&sq[i].mtx, NULL);
	}

	/*
	 * Allocate and initialize the Completion Queues
	 */
	if (ncq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of CQ from %u to %u",
		    __func__, ncq, NVME_QUEUES);
		ncq = NVME_QUEUES;
	}

	sc->num_cqueues = ncq;

	sc->compl_queues = calloc(sc->num_cqueues + 1,
	    sizeof(struct nvme_completion_queue));
	if (sc->compl_queues == NULL) {
		WPRINTF("%s: CQ allocation failed", __func__);
		sc->num_cqueues = 0;
	} else {
		struct nvme_completion_queue *cq = sc->compl_queues;

		for (i = 0; i < sc->num_cqueues; i++)
			pthread_mutex_init(&cq[i].mtx, NULL);
	}
}

static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	cd->vid = 0xFB5D;
	cd->ssvid = 0x0000;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->rab = 4;

	/* FreeBSD OUI */
	cd->ieee[0] = 0x58;
	cd->ieee[1] = 0x9c;
	cd->ieee[2] = 0xfc;

	cd->mic = 0;

	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = 0x00010300;

	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
	cd->acl = 2;
	cd->aerl = 4;

	/* Advertise 1, Read-only firmware slot */
	cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	cd->npss = 1;	/* number of power states supported */

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;

	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	cd->oncs = 0;
	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
		break;
	case NVME_DATASET_MANAGEMENT_ENABLE:
		cd->oncs |= NVME_ONCS_DSM;
		break;
	default:
		break;
	}

	cd->fna = 0x03;

	cd->power_state[0].mp = 10;
}

/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	const unsigned char *cp = buffer;
	/*
	 * CRC table for the CRC-16. The poly is 0x8005
	 * (x^16 + x^15 + x^2 + 1).
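	 * Each table entry is the CRC-16 of a single byte value; the loop
	 * below folds one input byte into the running CRC per table lookup.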
	 */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};

	while (len--)
		crc = (((crc >> 8) & 0xffU) ^
		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
	return crc;
}

static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{

	/* Get capacity and block size information from backing store */
	nd->nsze = nvstore->size / nvstore->sectsz;
	nd->ncap = nd->nsze;
	nd->nuse = nd->nsze;

	if (nvstore->type == NVME_STOR_BLOCKIF)
		nvstore->deallocate = blockif_candelete(nvstore->ctx);

	nd->nlbaf = 0; /* NLBAF is a zero-based value (i.e. 1 LBA Format) */
	nd->flbas = 0;

	/* Create an EUI-64 if user did not provide one */
	if (nvstore->eui64 == 0) {
		char *data = NULL;
		uint64_t eui64 = nvstore->eui64;

		asprintf(&data, "%s%u%u%u", get_config_value("name"),
		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
		    sc->nsc_pi->pi_func);

		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}
		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, nvstore->eui64);

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}

static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));

	/* Set read/write remainder to round up according to spec */
	sc->read_dunits_remainder = 999;
	sc->write_dunits_remainder = 999;

	/* Set nominal Health values checked by implementations */
	sc->health_log.temperature = 310;
	sc->health_log.available_spare = 100;
	sc->health_log.available_spare_threshold = 10;
}

static void
pci_nvme_init_features(struct pci_nvme_softc *sc)
{

	sc->feat[0].set = nvme_feature_invalid_cb;
	sc->feat[0].get = nvme_feature_invalid_cb;

	sc->feat[NVME_FEAT_LBA_RANGE_TYPE].namespace_specific = true;
	sc->feat[NVME_FEAT_ERROR_RECOVERY].namespace_specific = true;
	sc->feat[NVME_FEAT_NUMBER_OF_QUEUES].set = nvme_feature_num_queues;
	sc->feat[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION].set =
	    nvme_feature_iv_config;
	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG].get =
	    nvme_feature_invalid_cb;
	sc->feat[NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW].get =
	    nvme_feature_invalid_cb;
}

static void
pci_nvme_aer_init(struct pci_nvme_softc *sc)
{

	STAILQ_INIT(&sc->aer_list);
	sc->aer_count = 0;
}

static void
pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	while (!STAILQ_EMPTY(&sc->aer_list)) {
		aer = STAILQ_FIRST(&sc->aer_list);
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		free(aer);
	}

	pci_nvme_aer_init(sc);
}

#ifdef __FreeBSD__
static bool
pci_nvme_aer_available(struct pci_nvme_softc *sc)
{

	return (!STAILQ_EMPTY(&sc->aer_list));
}
#else
/* This is kept behind an ifdef while it's unused to appease the compiler. */
#endif

static bool
pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	/* AERL is a zero-based value while aer_count is one-based */
	return (sc->aer_count == (cd->aerl + 1));
}

/*
 * Add an Async Event Request
 *
 * Stores an AER to be returned later if the Controller needs to notify the
 * host of an event.
 * Note that while the NVMe spec doesn't require Controllers to return AER's
 * in order, this implementation does preserve the order.
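 * Ordering falls out of the STAILQ usage: requests are appended at the tail
 * here and consumed from the head in pci_nvme_aer_get() (i.e. FIFO).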
 */
static int
pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
{
	struct pci_nvme_aer *aer = NULL;

	if (pci_nvme_aer_limit_reached(sc))
		return (-1);

	aer = calloc(1, sizeof(struct pci_nvme_aer));
	if (aer == NULL)
		return (-1);

	sc->aer_count++;

	/* Save the Command ID for use in the completion message */
	aer->cid = cid;
	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);

	return (0);
}

/*
 * Get an Async Event Request structure
 *
 * Returns a pointer to an AER previously submitted by the host or NULL if
 * no AER's exist. Caller is responsible for freeing the returned struct.
 */
#ifdef __FreeBSD__
static struct pci_nvme_aer *
pci_nvme_aer_get(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	aer = STAILQ_FIRST(&sc->aer_list);
	if (aer != NULL) {
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		sc->aer_count--;
	}

	return (aer);
}
#else
/* This is kept behind an ifdef while it's unused to appease the compiler. */
#endif

static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	uint32_t i;

	DPRINTF("%s", __func__);

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

	sc->regs.vs = 0x00010300;	/* NVMe v1.3 */

	sc->regs.cc = 0;
	sc->regs.csts = 0;

	assert(sc->submit_queues != NULL);

	for (i = 0; i < sc->num_squeues + 1; i++) {
		sc->submit_queues[i].qbase = NULL;
		sc->submit_queues[i].size = 0;
		sc->submit_queues[i].cqid = 0;
		sc->submit_queues[i].tail = 0;
		sc->submit_queues[i].head = 0;
	}

	assert(sc->compl_queues != NULL);

	for (i = 0; i < sc->num_cqueues + 1; i++) {
		sc->compl_queues[i].qbase = NULL;
		sc->compl_queues[i].size = 0;
		sc->compl_queues[i].tail = 0;
		sc->compl_queues[i].head = 0;
	}

	sc->num_q_is_set = false;

	pci_nvme_aer_destroy(sc);
}

static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{
	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}

static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF("%s", __func__);

	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
	    sizeof(struct nvme_command) * asqs);

	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase);

	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK) + 1;
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
	    sizeof(struct nvme_completion) * acqs);
	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;

	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase);
}

static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
    size_t len, enum nvme_copy_dir dir)
{
	uint8_t *p;
	size_t bytes;

	if (len > (8 * 1024)) {
		return (-1);
	}

	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	p = vm_map_gpa(ctx, prp1, bytes);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, bytes);
	else
		memcpy(b, p, bytes);

	b += bytes;

	len -= bytes;
	if (len == 0) {
		return (0);
	}

	len = MIN(len, PAGE_SIZE);

	p = vm_map_gpa(ctx, prp2, len);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, len);
	else
		memcpy(b, p, len);

	return (0);
}

/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
static void
pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status)
{
	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
	struct nvme_completion *cqe;

	assert(cq->qbase != NULL);

	pthread_mutex_lock(&cq->mtx);

	cqe = &cq->qbase[cq->tail];

	/* Flip the phase bit */
	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;

	cqe->cdw0 = cdw0;
	cqe->sqhd = sq->head;
	cqe->sqid = sqid;
	cqe->cid = cid;
	cqe->status = status;

	cq->tail++;
	if (cq->tail >= cq->size) {
		cq->tail = 0;
	}

	pthread_mutex_unlock(&cq->mtx);
}

static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_squeues ||
	    (sc->submit_queues[qid].qbase == NULL)) {
		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
		    __func__, qid, sc->num_squeues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->submit_queues[qid].qbase = NULL;
	sc->submit_queues[qid].cqid = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues) ||
		    (sc->submit_queues[qid].qbase != NULL)) {
			WPRINTF("%s queue index %u > num_squeues %u",
			    __func__, qid, sc->num_squeues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
			/*
			 * Queues must specify at least two entries
			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
			 */
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
			return (1);
		}
		nsq->head = nsq->tail = 0;

		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_COMPLETION_QUEUE_INVALID);
			return (1);
		}

		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);

		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF("%s completed creating IOSQ qid %u",
		    __func__, qid);
	} else {
		/*
		 * Guest sent a non-contiguous submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
	return (1);
}

static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;
	uint16_t sqid;

	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_cqueues ||
	    (sc->compl_queues[qid].qbase == NULL)) {
		WPRINTF("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	/* Deleting an Active CQ is an error */
	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
		if (sc->submit_queues[sqid].cqid == qid) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_DELETION);
			return (1);
		}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}

static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	struct nvme_completion_queue *ncq;
	uint16_t qid = command->cdw10 & 0xffff;

	/* Only support Physically Contiguous queues */
	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o completion queue",
		    __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if ((qid == 0) || (qid > sc->num_cqueues) ||
	    (sc->compl_queues[qid].qbase != NULL)) {
		WPRINTF("%s queue index %u > num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	ncq = &sc->compl_queues[qid];
	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
	if (ncq->intr_vec > (sc->max_queues + 1)) {
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_INTERRUPT_VECTOR);
		return (1);
	}

	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
	if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
		/*
		 * Queues must specify at least two entries
		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
		 */
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
		return (1);
	}
	ncq->head = ncq->tail = 0;
	/* CQ entries are struct nvme_completion, not struct nvme_command */
	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    command->prp1,
	    sizeof(struct nvme_completion) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	return (1);
}

static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint32_t logsize;
	uint8_t logpage = command->cdw10 & 0xFF;

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	switch (logpage) {
	case NVME_LOG_ERROR:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log,
		    MIN(logsize, sizeof(sc->err_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		pthread_mutex_lock(&sc->mtx);
		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
		    sizeof(sc->health_log.data_units_read));
		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
		    sizeof(sc->health_log.data_units_written));
		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
		    sizeof(sc->health_log.host_read_commands));
		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
		    sizeof(sc->health_log.host_write_commands));
		pthread_mutex_unlock(&sc->mtx);

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log,
		    MIN(logsize, sizeof(sc->health_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log,
		    MIN(logsize, sizeof(sc->fw_log)),
		    NVME_COPY_TO_PRP);
		break;
	default:
		DPRINTF("%s get log page %x command not supported",
		    __func__, logpage);

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}

	return (1);
}

static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	void *dest;
	uint16_t status;

#ifndef __FreeBSD__
	status = 0;
#endif

	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid);

	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All unused entries shall be zero */
		bzero(dest, sizeof(uint32_t) * 1024);
		((uint32_t *)dest)[0] = 1;
		break;
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
		if (command->nsid != 1) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All bytes after the descriptor shall be zero */
		bzero(dest, sizeof(uint32_t) * 1024);

		/* Return NIDT=1 (i.e. EUI64) descriptor */
		((uint8_t *)dest)[0] = 1;
		((uint8_t *)dest)[1] = sizeof(uint64_t);
		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
		break;
	default:
		DPRINTF("%s unsupported identify command requested 0x%x",
		    __func__, command->cdw10 & 0xFF);
		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
		break;
	}

	compl->status = status;
	return (1);
}

static const char *
nvme_fid_to_name(uint8_t fid)
{
	const char *name;

	switch (fid) {
	case NVME_FEAT_ARBITRATION:
		name = "Arbitration";
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		name = "Power Management";
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		name = "LBA Range Type";
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		name = "Temperature Threshold";
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		name = "Error Recovery";
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		name = "Volatile Write Cache";
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		name = "Number of Queues";
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		name = "Interrupt Coalescing";
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		name = "Interrupt Vector Configuration";
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		name = "Write Atomicity Normal";
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		name = "Asynchronous Event Configuration";
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		name = "Autonomous Power State Transition";
		break;
	case NVME_FEAT_HOST_MEMORY_BUFFER:
		name = "Host Memory Buffer";
		break;
	case NVME_FEAT_TIMESTAMP:
		name = "Timestamp";
		break;
	case NVME_FEAT_KEEP_ALIVE_TIMER:
		name = "Keep Alive Timer";
		break;
	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
		name = "Host Controlled Thermal Management";
		break;
	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
		name = "Non-Operational Power State Config";
		break;
	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
		name = "Read Recovery Level Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		name = "Predictable Latency Mode Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
		name = "Predictable Latency Mode Window";
		break;
	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
		name = "LBA Status Information Report Interval";
		break;
	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		name = "Host Behavior Support";
		break;
	case NVME_FEAT_SANITIZE_CONFIG:
		name = "Sanitize Config";
		break;
	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
		name = "Endurance Group Event Configuration";
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		name = "Software Progress Marker";
		break;
	case NVME_FEAT_HOST_IDENTIFIER:
		name = "Host Identifier";
		break;
	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
		name = "Reservation Notification Mask";
		break;
	case NVME_FEAT_RESERVATION_PERSISTENCE:
		name = "Reservation Persistence";
		break;
	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
		name = "Namespace Write Protection Config";
		break;
	default:
		name = "Unknown";
		break;
	}

	return (name);
}

static void
nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{

	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}

static void
nvme_feature_iv_config(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint32_t i;
	uint32_t cdw11 = command->cdw11;
	uint16_t iv;
	bool cd;

	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	iv = cdw11 & 0xffff;
	cd = cdw11 & (1 << 16);

	if (iv > (sc->max_queues + 1)) {
		return;
	}

	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
	if ((iv == 0) && !cd)
		return;

	/* Requested Interrupt Vector must be used by a CQ */
	for (i = 0; i < sc->num_cqueues + 1; i++) {
		if (sc->compl_queues[i].intr_vec == iv) {
			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
		}
	}
}

static void
nvme_feature_num_queues(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t nqr;	/* Number of Queues Requested */

	if (sc->num_q_is_set) {
		WPRINTF("%s: Number of Queues already set", __func__);
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		return;
	}

	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
		    sc->max_queues);
		sc->num_squeues = sc->max_queues;
	}

	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
		    sc->max_queues);
		sc->num_cqueues = sc->max_queues;
	}

	/* Patch the command value which will be saved on callback's return */
	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

	sc->num_q_is_set = true;
}

static int
nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	struct nvme_feature_obj *feat;
	uint32_t nsid = command->nsid;
	uint8_t fid = command->cdw10 & 0xFF;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}
	feat = &sc->feat[fid];

	if (!feat->namespace_specific &&
	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
		return (1);
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	if (feat->set)
		feat->set(sc, feat, command, compl);

	if (compl->status == NVME_SC_SUCCESS)
		feat->cdw11 = command->cdw11;

	return (0);
}

static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	struct nvme_feature_obj *feat;
	uint8_t fid = command->cdw10 & 0xFF;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	feat = &sc->feat[fid];
	if (feat->get) {
		feat->get(sc, feat, command, compl);
	}

	if (compl->status == NVME_SC_SUCCESS) {
		compl->cdw0 = feat->cdw11;
	}

	return (0);
}

static int
nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint8_t	ses, lbaf, pi;

	/* Only supports Secure Erase Setting - User Data Erase */
	ses = (command->cdw10 >> 9) & 0x7;
	if (ses > 0x1) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	/* Only supports a single LBA Format */
	lbaf = command->cdw10 & 0xf;
	if (lbaf != 0) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_FORMAT);
		return (1);
	}

	/* Doesn't support Protection Information */
	pi = (command->cdw10 >> 5) & 0x7;
	if (pi != 0) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (sc->nvstore.type == NVME_STOR_RAM) {
		if (sc->nvstore.ctx)
			free(sc->nvstore.ctx);
		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		struct pci_nvme_ioreq *req;
		int err;

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			return (1);
		}
		req->nvme_sq = &sc->submit_queues[0];
		req->sqid = 0;
		req->opc = command->opc;
		req->cid = command->cid;
		req->nsid = command->nsid;

		req->io_req.br_offset = 0;
		req->io_req.br_resid = sc->nvstore.size;
		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
		if (err) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			pci_nvme_release_ioreq(sc, req);
		}
	}

	return (1);
}

static int
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
	    command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

	/* TODO: search for the command ID and abort it */

	compl->cdw0 = 1;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
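
/*
 * Async Event Requests do not complete synchronously: pci_nvme_aer_add()
 * queues the Command ID, and the handler below sets compl->status to
 * NVME_NO_STATUS so no completion queue entry is posted until an event
 * actually occurs.
 */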
static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
    struct nvme_command* command, struct nvme_completion* compl)
{
	DPRINTF("%s async event request 0x%x", __func__, command->cdw11);

	/* Don't exceed the Async Event Request Limit (AERL). */
	if (pci_nvme_aer_limit_reached(sc)) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
		return (1);
	}

	if (pci_nvme_aer_add(sc, command->cid)) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
		    NVME_SC_INTERNAL_DEVICE_ERROR);
		return (1);
	}

	/*
	 * Raise events when they happen based on the Set Features cmd.
	 * These events happen async, so only set completion successful if
	 * there is an event reflective of the request to get event.
	 */
	compl->status = NVME_NO_STATUS;

	return (0);
}

static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
	struct nvme_completion compl;
	struct nvme_command *cmd;
	struct nvme_submission_queue *sq;
	struct nvme_completion_queue *cq;
	uint16_t sqhead;

	DPRINTF("%s index %u", __func__, (uint32_t)value);

	sq = &sc->submit_queues[0];
	cq = &sc->compl_queues[0];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		cmd = &(sq->qbase)[sqhead];
		compl.cdw0 = 0;
		compl.status = 0;

		switch (cmd->opc) {
		case NVME_OPC_DELETE_IO_SQ:
			DPRINTF("%s command DELETE_IO_SQ", __func__);
			nvme_opc_delete_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_SQ:
			DPRINTF("%s command CREATE_IO_SQ", __func__);
			nvme_opc_create_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_DELETE_IO_CQ:
			DPRINTF("%s command DELETE_IO_CQ", __func__);
			nvme_opc_delete_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_CQ:
			DPRINTF("%s command CREATE_IO_CQ", __func__);
			nvme_opc_create_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_LOG_PAGE:
			DPRINTF("%s command GET_LOG_PAGE", __func__);
			nvme_opc_get_log_page(sc, cmd, &compl);
			break;
		case NVME_OPC_IDENTIFY:
			DPRINTF("%s command IDENTIFY", __func__);
			nvme_opc_identify(sc, cmd, &compl);
			break;
		case NVME_OPC_ABORT:
			DPRINTF("%s command ABORT", __func__);
			nvme_opc_abort(sc, cmd, &compl);
			break;
		case NVME_OPC_SET_FEATURES:
			DPRINTF("%s command SET_FEATURES", __func__);
			nvme_opc_set_features(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_FEATURES:
			DPRINTF("%s command GET_FEATURES", __func__);
			nvme_opc_get_features(sc, cmd, &compl);
			break;
		case NVME_OPC_FIRMWARE_ACTIVATE:
			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
			pci_nvme_status_tc(&compl.status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_FIRMWARE_SLOT);
			break;
		case NVME_OPC_ASYNC_EVENT_REQUEST:
			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
			nvme_opc_async_event_req(sc, cmd, &compl);
			break;
		case NVME_OPC_FORMAT_NVM:
			DPRINTF("%s command FORMAT_NVM", __func__);
			if ((sc->ctrldata.oacs &
			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
				pci_nvme_status_genc(&compl.status,
				    NVME_SC_INVALID_OPCODE);
				break;
			}
			compl.status = NVME_NO_STATUS;
			nvme_opc_format_nvm(sc, cmd, &compl);
			break;
		default:
			DPRINTF("0x%x command is not implemented",
			    cmd->opc);
			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
		}
		sqhead = (sqhead + 1) % sq->size;

		if (NVME_COMPLETION_VALID(compl)) {
			pci_nvme_cq_update(sc, &sc->compl_queues[0],
			    compl.cdw0,
			    cmd->cid,
			    0,		/* SQID */
			    compl.status);
		}
	}

	DPRINTF("setting sqhead %u", sqhead);
	sq->head = sqhead;

	if (cq->head != cq->tail)
		pci_generate_msix(sc->nsc_pi, 0);

	pthread_mutex_unlock(&sq->mtx);
}

/*
 * Update the Write and Read statistics reported in SMART data
 *
 * NVMe defines "data unit" as thousands of 512 byte blocks and is rounded up.
 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
 */
static void
pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
    size_t bytes, uint16_t status)
{

	pthread_mutex_lock(&sc->mtx);
	switch (opc) {
	case NVME_OPC_WRITE:
		sc->write_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->write_dunits_remainder += (bytes / 512);
		while (sc->write_dunits_remainder >= 1000) {
			sc->write_data_units++;
			sc->write_dunits_remainder -= 1000;
		}
		break;
	case NVME_OPC_READ:
		sc->read_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->read_dunits_remainder += (bytes / 512);
		while (sc->read_dunits_remainder >= 1000) {
			sc->read_data_units++;
			sc->read_dunits_remainder -= 1000;
		}
		break;
	default:
		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
		break;
	}
	pthread_mutex_unlock(&sc->mtx);
}

/*
 * Check if the combination of Starting LBA (slba) and Number of Logical
 * Blocks (nlb) exceeds the range of the underlying storage.
 *
 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
 * the capacity in bytes as a uint64_t, care must be taken to avoid integer
 * overflow.
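 *
 * For example, with 512 byte sectors (sectsz_bits = 9), slba << 9 wraps
 * whenever any of the top 9 bits of slba are set; (slba >> (64 -
 * sectsz_bits)) is non-zero in exactly those cases, so the first check
 * below rejects them before the shift is performed.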
 */
static bool
pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
    uint32_t nlb)
{
	size_t	offset, bytes;

	/* Overflow check of multiplying Starting LBA by the sector size */
	if (slba >> (64 - nvstore->sectsz_bits))
		return (true);

	offset = slba << nvstore->sectsz_bits;
	bytes = nlb << nvstore->sectsz_bits;

	/* Overflow check of Number of Logical Blocks */
	if ((nvstore->size - offset) < bytes)
		return (true);

	return (false);
}

static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
    uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
	int iovidx;

	if (req == NULL)
		return (-1);

	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
		return (-1);
	}

	/* concatenate contig block-iovs to minimize number of iovs */
	if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
		iovidx = req->io_req.br_iovcnt - 1;

		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
		    req->prev_gpaddr, size);

		req->prev_size += size;
		req->io_req.br_resid += size;

		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
	} else {
		iovidx = req->io_req.br_iovcnt;
		if (iovidx == 0) {
			req->io_req.br_offset = lba;
			req->io_req.br_resid = 0;
			req->io_req.br_param = req;
		}

		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
		    gpaddr, size);

		req->io_req.br_iov[iovidx].iov_len = size;

		req->prev_gpaddr = gpaddr;
		req->prev_size = size;
		req->io_req.br_resid += size;

		req->io_req.br_iovcnt++;
	}

	return (0);
}

static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
    struct nvme_submission_queue *sq, int sqid, uint16_t cid,
    uint32_t cdw0, uint16_t status)
{
	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];

	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
	    __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
	    NVME_STATUS_GET_SC(status));

	pci_nvme_cq_update(sc, cq,
	    cdw0,
	    cid,
	    sqid,
	    status);

	if (cq->head != cq->tail) {
		if (cq->intr_en & NVME_CQ_INTEN) {
			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
		} else {
			DPRINTF("%s: CQ%u interrupt disabled",
			    __func__, sq->cqid);
		}
	}
}

static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
	req->sc = NULL;
	req->nvme_sq = NULL;
	req->sqid = 0;

	pthread_mutex_lock(&sc->mtx);

	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
	sc->pending_ios--;

	/* when no more IO pending, can set to ready if device reset/enabled */
	if (sc->pending_ios == 0 &&
	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
		sc->regs.csts |= NVME_CSTS_RDY;

	pthread_mutex_unlock(&sc->mtx);

	sem_post(&sc->iosemlock);
}

static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
	struct pci_nvme_ioreq *req = NULL;

	sem_wait(&sc->iosemlock);
	pthread_mutex_lock(&sc->mtx);

	req = STAILQ_FIRST(&sc->ioreqs_free);
	assert(req != NULL);
	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);

	req->sc = sc;

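	/* pending_ios gates setting CSTS.RDY; see pci_nvme_release_ioreq() */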
	sc->pending_ios++;

	pthread_mutex_unlock(&sc->mtx);

	req->io_req.br_iovcnt = 0;
	req->io_req.br_offset = 0;
	req->io_req.br_resid = 0;
	req->io_req.br_param = req;
	req->prev_gpaddr = 0;
	req->prev_size = 0;

	return (req);
}

static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct nvme_submission_queue *sq = req->nvme_sq;
	uint16_t code, status;

#ifndef __FreeBSD__
	status = 0;
#endif

	DPRINTF("%s error %d %s", __func__, err, strerror(err));

	/* TODO return correct error */
	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
	pci_nvme_status_genc(&status, code);

	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
	pci_nvme_stats_write_read_update(req->sc, req->opc,
	    req->bytes, status);
	pci_nvme_release_ioreq(req->sc, req);
}

/*
 * Implements the Flush command. The specification states:
 *    If a volatile write cache is not present, Flush commands complete
 *    successfully and have no effect
 * in the description of the Volatile Write Cache (VWC) field of the Identify
 * Controller data. Therefore, set status to Success if the command is
 * not supported (i.e. RAM or as indicated by the blockif).
 */
static bool
nvme_opc_flush(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	bool pending = false;

	if (nvstore->type == NVME_STOR_RAM) {
		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
	} else {
		int err;

		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_flush(nvstore->ctx, &req->io_req);
		switch (err) {
		case 0:
			pending = true;
			break;
		case EOPNOTSUPP:
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			break;
		default:
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		}
	}

	return (pending);
}

static uint16_t
nvme_write_read_ram(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
	uint8_t *buf = nvstore->ctx;
	enum nvme_copy_dir dir;
	uint16_t status;

#ifndef __FreeBSD__
	status = 0;
#endif

	if (is_write)
		dir = NVME_COPY_TO_PRP;
	else
		dir = NVME_COPY_FROM_PRP;

	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
	    buf + offset, bytes, dir))
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);
	else
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	return (status);
}

static uint16_t
nvme_write_read_blockif(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
	uint64_t size;
	int err;
	uint16_t status = NVME_NO_STATUS;

	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
	if (pci_nvme_append_iov_req(sc, req, prp1,
	    size, is_write, offset)) {
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);
		goto out;
	}

	offset += size;
	bytes -= size;

	if (bytes == 0) {
		;
	} else if (bytes <= PAGE_SIZE) {
		size = bytes;
static uint16_t
nvme_write_read_blockif(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
	uint64_t size;
	int err;
	uint16_t status = NVME_NO_STATUS;

	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
	if (pci_nvme_append_iov_req(sc, req, prp1,
	    size, is_write, offset)) {
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);
		goto out;
	}

	offset += size;
	bytes -= size;

	if (bytes == 0) {
		;
	} else if (bytes <= PAGE_SIZE) {
		size = bytes;
		if (pci_nvme_append_iov_req(sc, req, prp2,
		    size, is_write, offset)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_DATA_TRANSFER_ERROR);
			goto out;
		}
	} else {
		void *vmctx = sc->nsc_pi->pi_vmctx;
		uint64_t *prp_list = &prp2;
		uint64_t *last = prp_list;

		/* PRP2 points to a physical region page list */
		while (bytes) {
			/* Last entry in list points to the next list */
			if (prp_list == last) {
				uint64_t prp = *prp_list;

				prp_list = paddr_guest2host(vmctx, prp,
				    PAGE_SIZE - (prp % PAGE_SIZE));
				last = prp_list + (NVME_PRP2_ITEMS - 1);
			}

			size = MIN(bytes, PAGE_SIZE);

			if (pci_nvme_append_iov_req(sc, req, *prp_list,
			    size, is_write, offset)) {
				pci_nvme_status_genc(&status,
				    NVME_SC_DATA_TRANSFER_ERROR);
				goto out;
			}

			offset += size;
			bytes -= size;

			prp_list++;
		}
	}
	req->io_req.br_callback = pci_nvme_io_done;
	if (is_write)
		err = blockif_write(nvstore->ctx, &req->io_req);
	else
		err = blockif_read(nvstore->ctx, &req->io_req);

	if (err)
		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
out:
	return (status);
}

static bool
nvme_opc_write_read(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	uint64_t lba, nblocks, bytes;
	size_t offset;
	bool is_write = cmd->opc == NVME_OPC_WRITE;
	bool pending = false;

#ifndef __FreeBSD__
	bytes = 0;
#endif

	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
	nblocks = (cmd->cdw12 & 0xFFFF) + 1;
	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
		WPRINTF("%s command would exceed LBA range", __func__);
		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
		goto out;
	}

	bytes = nblocks << nvstore->sectsz_bits;
	if (bytes > NVME_MAX_DATA_SIZE) {
		WPRINTF("%s command would exceed MDTS", __func__);
		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
		goto out;
	}

	offset = lba << nvstore->sectsz_bits;

	req->bytes = bytes;
	req->io_req.br_offset = lba;

	/* PRP bits 1:0 must be zero */
	cmd->prp1 &= ~0x3UL;
	cmd->prp2 &= ~0x3UL;

	if (nvstore->type == NVME_STOR_RAM) {
		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
		    cmd->prp2, offset, bytes, is_write);
	} else {
		*status = nvme_write_read_blockif(sc, nvstore, req,
		    cmd->prp1, cmd->prp2, offset, bytes, is_write);

		if (*status == NVME_NO_STATUS)
			pending = true;
	}
out:
	if (!pending)
		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);

	return (pending);
}
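/*
 * Command decode recap for nvme_opc_write_read() above: the starting
 * LBA spans CDW10 (low dword) and CDW11 (high dword), and the Number
 * of Logical Blocks in CDW12 is zero-based. A worked example with
 * hypothetical values and 512-byte sectors (sectsz_bits == 9):
 *
 *	cdw10 = 0x00001000, cdw11 = 0x0, cdw12 = 0x0007
 *	lba     = 0x1000
 *	nblocks = (0x0007 & 0xFFFF) + 1 = 8
 *	bytes   = 8 << 9 = 4096
 *	offset  = 0x1000 << 9 = 0x200000
 */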
static void
pci_nvme_dealloc_sm(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct pci_nvme_softc *sc = req->sc;
	bool done = true;
	uint16_t status;

#ifndef __FreeBSD__
	status = 0;
#endif

	if (err) {
		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
	} else {
		struct iovec *iov = req->io_req.br_iov;

		req->prev_gpaddr++;
		iov += req->prev_gpaddr;

		/* The iov_* values already include the sector size */
		req->io_req.br_offset = (off_t)iov->iov_base;
		req->io_req.br_resid = iov->iov_len;
		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
		} else
			done = false;
	}

	if (done) {
		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
		    req->cid, 0, status);
		pci_nvme_release_ioreq(sc, req);
	}
}
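/*
 * The callback above is effectively a small state machine for
 * multi-range deallocates: prev_gpaddr is repurposed as the index of
 * the range currently being deleted and prev_size as the total number
 * of ranges, so a request carrying N ranges re-arms blockif_delete()
 * N - 1 times from completion context. In outline (illustrative
 * pseudo-code, not compiled):
 *
 *	if (err)
 *		complete(NVME_SC_INTERNAL_DEVICE_ERROR);
 *	else if (++cur == count)
 *		complete(NVME_SC_SUCCESS);
 *	else
 *		resubmit(iov[cur]);	and stay pending
 */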
static bool
nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	struct nvme_dsm_range *range;
	uint32_t nr, r, non_zero, dr;
	int err;
	bool pending = false;

#ifndef __FreeBSD__
	range = NULL;
#endif

	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
		goto out;
	}

	nr = cmd->cdw10 & 0xff;

	/* copy locally because a range entry could straddle PRPs */
	range = calloc(1, NVME_MAX_DSM_TRIM);
	if (range == NULL) {
		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		goto out;
	}
	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);

	/* Check for invalid ranges and the number of non-zero lengths */
	non_zero = 0;
	for (r = 0; r <= nr; r++) {
		if (pci_nvme_out_of_range(nvstore,
		    range[r].starting_lba, range[r].length)) {
			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
			goto out;
		}
		if (range[r].length != 0)
			non_zero++;
	}

	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
		size_t offset, bytes;
		int sectsz_bits = sc->nvstore.sectsz_bits;

		/*
		 * DSM calls are advisory only, and compliant controllers
		 * may choose to take no actions (i.e. return Success).
		 */
		if (!nvstore->deallocate) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		/* If all ranges have a zero length, return Success */
		if (non_zero == 0) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		if (req == NULL) {
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
			goto out;
		}

		offset = range[0].starting_lba << sectsz_bits;
		bytes = range[0].length << sectsz_bits;

		/*
		 * If the request is for more than a single range, store
		 * the ranges in the br_iov. Optimize for the common case
		 * of a single range.
		 *
		 * Note that NVMe Number of Ranges is a zero-based value.
		 */
		req->io_req.br_iovcnt = 0;
		req->io_req.br_offset = offset;
		req->io_req.br_resid = bytes;

		if (nr == 0) {
			req->io_req.br_callback = pci_nvme_io_done;
		} else {
			struct iovec *iov = req->io_req.br_iov;

			for (r = 0, dr = 0; r <= nr; r++) {
				offset = range[r].starting_lba << sectsz_bits;
				bytes = range[r].length << sectsz_bits;
				if (bytes == 0)
					continue;

				if ((nvstore->size - offset) < bytes) {
					pci_nvme_status_genc(status,
					    NVME_SC_LBA_OUT_OF_RANGE);
					goto out;
				}
				iov[dr].iov_base = (void *)offset;
				iov[dr].iov_len = bytes;
				dr++;
			}
			req->io_req.br_callback = pci_nvme_dealloc_sm;

			/*
			 * Use prev_gpaddr to track the current entry and
			 * prev_size to track the number of entries
			 */
			req->prev_gpaddr = 0;
			req->prev_size = dr;
		}

		err = blockif_delete(nvstore->ctx, &req->io_req);
		if (err)
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		else
			pending = true;
	}
out:
	free(range);
	return (pending);
}
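/*
 * DSM wire-format recap for the function above: the guest passes an
 * array of struct nvme_dsm_range entries (attributes, length in LBAs,
 * starting LBA) via PRP, with the zero-based Number of Ranges in
 * CDW10[7:0]. A hypothetical two-range trim thus arrives as nr == 1
 * with, e.g.:
 *
 *	range[0] = { .length = 16, .starting_lba = 0 }
 *	range[1] = { .length = 0,  .starting_lba = 1024 }	(skipped)
 *
 * which the range loop above collapses to a single br_iov entry of
 * 16 << sectsz_bits bytes at byte offset 0 before calling
 * blockif_delete().
 */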
static void
pci_nvme_handle_io_cmd(struct pci_nvme_softc *sc, uint16_t idx)
{
	struct nvme_submission_queue *sq;
	uint16_t status;
	uint16_t sqhead;

#ifndef __FreeBSD__
	status = 0;
#endif

	/* handle all submissions up to sq->tail index */
	sq = &sc->submit_queues[idx];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
	    idx, sqhead, sq->tail, sq->qbase);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		struct nvme_command *cmd;
		struct pci_nvme_ioreq *req;
		uint32_t nsid;
		bool pending;

		pending = false;
		req = NULL;
		status = 0;

		cmd = &sq->qbase[sqhead];
		sqhead = (sqhead + 1) % sq->size;

		nsid = le32toh(cmd->nsid);
		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			status |=
			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
			goto complete;
		}

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			goto complete;
		}
		req->nvme_sq = sq;
		req->sqid = idx;
		req->opc = cmd->opc;
		req->cid = cmd->cid;
		req->nsid = cmd->nsid;

		switch (cmd->opc) {
		case NVME_OPC_FLUSH:
			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE:
		case NVME_OPC_READ:
			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE_ZEROES:
			/* TODO: write zeroes
			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
			    __func__, lba, cmd->cdw12 & 0xFFFF); */
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
			break;
		case NVME_OPC_DATASET_MANAGEMENT:
			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		default:
			WPRINTF("%s unhandled io command 0x%x",
			    __func__, cmd->opc);
			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
		}
complete:
		if (!pending) {
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status);
			if (req != NULL)
				pci_nvme_release_ioreq(sc, req);
		}
	}

	sq->head = sqhead;

	pthread_mutex_unlock(&sq->mtx);
}

static void
pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc *sc,
    uint64_t idx, int is_sq, uint64_t value)
{
	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
	    idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);

	if (is_sq) {
		if (idx > sc->num_squeues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_squeues);
			return;
		}

		atomic_store_short(&sc->submit_queues[idx].tail,
		    (uint16_t)value);

		if (idx == 0) {
			pci_nvme_handle_admin_cmd(sc, value);
		} else {
			/*
			 * I/O submission queue; handle new entries. The
			 * queue index was already bounds-checked above.
			 */
			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
		}
	} else {
		if (idx > sc->num_cqueues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_cqueues);
			return;
		}

		atomic_store_short(&sc->compl_queues[idx].head,
		    (uint16_t)value);
	}
}
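/*
 * Doorbell layout recap: past NVME_DOORBELL_OFFSET each queue pair
 * owns eight bytes, the SQ tail doorbell followed by the CQ head
 * doorbell (four bytes each), which is what the decode in
 * pci_nvme_write_bar_0() below relies on. A worked example, assuming
 * the spec's 0x1000 doorbell base and zero doorbell stride:
 *
 *	guest writes 4 bytes to BAR0 offset 0x100c
 *	belloffset = 0x100c - 0x1000 = 0xc
 *	idx   = 0xc / 8 = 1			(queue pair 1)
 *	is_sq = (0xc % 8) < 4 = false		(CQ head doorbell)
 */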
static void
pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
{
	const char *s = iswrite ? "WRITE" : "READ";

	switch (offset) {
	case NVME_CR_CAP_LOW:
		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
		break;
	case NVME_CR_CAP_HI:
		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
		break;
	case NVME_CR_VS:
		DPRINTF("%s %s NVME_CR_VS", func, s);
		break;
	case NVME_CR_INTMS:
		DPRINTF("%s %s NVME_CR_INTMS", func, s);
		break;
	case NVME_CR_INTMC:
		DPRINTF("%s %s NVME_CR_INTMC", func, s);
		break;
	case NVME_CR_CC:
		DPRINTF("%s %s NVME_CR_CC", func, s);
		break;
	case NVME_CR_CSTS:
		DPRINTF("%s %s NVME_CR_CSTS", func, s);
		break;
	case NVME_CR_NSSR:
		DPRINTF("%s %s NVME_CR_NSSR", func, s);
		break;
	case NVME_CR_AQA:
		DPRINTF("%s %s NVME_CR_AQA", func, s);
		break;
	case NVME_CR_ASQ_LOW:
		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
		break;
	case NVME_CR_ASQ_HI:
		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
		break;
	case NVME_CR_ACQ_LOW:
		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
		break;
	case NVME_CR_ACQ_HI:
		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
		break;
	default:
		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
	}
}

static void
pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc *sc,
    uint64_t offset, int size, uint64_t value)
{
	uint32_t ccreg;

	if (offset >= NVME_DOORBELL_OFFSET) {
		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
		uint64_t idx = belloffset / 8; /* doorbell: 2 x 4-byte regs */
		int is_sq = (belloffset % 8) < 4;

		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
			WPRINTF("guest attempted an overflow write offset "
			    "0x%lx, val 0x%lx in %s",
			    offset, value, __func__);
			return;
		}

		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
		return;
	}

	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
	    offset, size, value);

	if (size != 4) {
		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
		    "val 0x%lx) to bar0 in %s",
		    size, offset, value, __func__);
		/* TODO: shutdown device */
		return;
	}

	pci_nvme_bar0_reg_dumps(__func__, offset, 1);

	pthread_mutex_lock(&sc->mtx);

	switch (offset) {
	case NVME_CR_CAP_LOW:
	case NVME_CR_CAP_HI:
		/* readonly */
		break;
	case NVME_CR_VS:
		/* readonly */
		break;
	case NVME_CR_INTMS:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_INTMC:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_CC:
		ccreg = (uint32_t)value;

		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
		    "iocqes %u",
		    __func__,
		    NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
		    NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
		    NVME_CC_GET_IOCQES(ccreg));

		if (NVME_CC_GET_SHN(ccreg)) {
			/* perform shutdown - flush out data to backend */
			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
			    NVME_CSTS_REG_SHST_SHIFT);
			sc->regs.csts |= NVME_SHST_COMPLETE <<
			    NVME_CSTS_REG_SHST_SHIFT;
		}
		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
			if (NVME_CC_GET_EN(ccreg) == 0)
				/* transition 1->0 causes controller reset */
				pci_nvme_reset_locked(sc);
			else
				pci_nvme_init_controller(ctx, sc);
		}

		/* Insert the iocqes, iosqes and en bits from the write */
		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
		if (NVME_CC_GET_EN(ccreg) == 0) {
			/* Insert the ams, mps and css bit fields */
			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
			sc->regs.csts &= ~NVME_CSTS_RDY;
		} else if (sc->pending_ios == 0) {
			sc->regs.csts |= NVME_CSTS_RDY;
		}
		break;
	case NVME_CR_CSTS:
		break;
	case NVME_CR_NSSR:
		/* ignore writes; don't support subsystem reset */
		break;
	case NVME_CR_AQA:
		sc->regs.aqa = (uint32_t)value;
		break;
	case NVME_CR_ASQ_LOW:
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ASQ_HI:
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	case NVME_CR_ACQ_LOW:
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ACQ_HI:
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	default:
		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
		    __func__, offset, value, size);
	}
	pthread_mutex_unlock(&sc->mtx);
}
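/*
 * Admin queue bring-up as seen by the write path above, with
 * hypothetical guest-chosen values: AQA carries the zero-based admin
 * queue sizes, ASQ/ACQ take page-aligned base addresses (the
 * 0xFFFFF000 masks on the low-dword writes keep bits 11:0 clear), and
 * setting CC.EN last triggers pci_nvme_init_controller():
 *
 *	AQA = (ZERO_BASED(32) << 16) | ZERO_BASED(32)	32-entry queues
 *	ASQ = 0x12345000				page aligned
 *	ACQ = 0x12346000
 *	CC  = CC | (1 << NVME_CC_REG_EN_SHIFT)
 */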
static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
    int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_nvme_softc *sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
		    " value 0x%lx", baridx, offset, size, value);

		pci_emul_msix_twrite(pi, offset, size, value);
		return;
	}

	switch (baridx) {
	case 0:
		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
		break;

	default:
		DPRINTF("%s unknown baridx %d, val 0x%lx",
		    __func__, baridx, value);
	}
}

static uint64_t
pci_nvme_read_bar_0(struct pci_nvme_softc *sc, uint64_t offset, int size)
{
	uint64_t value;

	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
		void *p = &(sc->regs);

		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
		pthread_mutex_unlock(&sc->mtx);
	} else {
		value = 0;
		WPRINTF("pci_nvme: read invalid offset %ld", offset);
	}

	switch (size) {
	case 1:
		value &= 0xFF;
		break;
	case 2:
		value &= 0xFFFF;
		break;
	case 4:
		value &= 0xFFFFFFFF;
		break;
	}

	DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x",
	    offset, size, (uint32_t)value);

	return (value);
}

static uint64_t
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
	struct pci_nvme_softc *sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
		    baridx, offset, size);

		return pci_emul_msix_tread(pi, offset, size);
	}

	switch (baridx) {
	case 0:
		return pci_nvme_read_bar_0(sc, offset, size);

	default:
		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
	}

	return (0);
}
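/*
 * BAR0 read recap: offsets below the doorbell area are copied straight
 * out of the shadow register file and masked to the access width. For
 * example, a hypothetical 4-byte guest read at NVME_CR_VS copies four
 * bytes from &sc->regs + 0x08 and masks with 0xFFFFFFFF, returning the
 * advertised controller version; reads at or past the doorbell area
 * return 0, since the doorbells are treated as write-only here.
 */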
static int
pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
{
	char bident[sizeof("XX:X:X")];
	const char *value;
	uint32_t sectsz;

	sc->max_queues = NVME_QUEUES;
	sc->max_qentries = NVME_MAX_QENTRIES;
	sc->ioslots = NVME_IOSLOTS;
	sc->num_squeues = sc->max_queues;
	sc->num_cqueues = sc->max_queues;
	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
	sectsz = 0;
	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
	    "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

	value = get_config_value_node(nvl, "maxq");
	if (value != NULL)
		sc->max_queues = atoi(value);
	value = get_config_value_node(nvl, "qsz");
	if (value != NULL) {
		sc->max_qentries = atoi(value);
		if (sc->max_qentries <= 0) {
			EPRINTLN("nvme: Invalid qsz option %d",
			    sc->max_qentries);
			return (-1);
		}
	}
	value = get_config_value_node(nvl, "ioslots");
	if (value != NULL) {
		sc->ioslots = atoi(value);
		if (sc->ioslots <= 0) {
			EPRINTLN("Invalid ioslots option %d", sc->ioslots);
			return (-1);
		}
	}
	value = get_config_value_node(nvl, "sectsz");
	if (value != NULL)
		sectsz = atoi(value);
	value = get_config_value_node(nvl, "ser");
	if (value != NULL) {
		/*
		 * This field indicates the Product Serial Number in
		 * 7-bit ASCII; unused bytes should be space characters.
		 * Ref: NVMe v1.3c.
		 */
		cpywithpad((char *)sc->ctrldata.sn,
		    sizeof(sc->ctrldata.sn), value, ' ');
	}
	value = get_config_value_node(nvl, "eui64");
	if (value != NULL)
		sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
	value = get_config_value_node(nvl, "dsm");
	if (value != NULL) {
		if (strcmp(value, "auto") == 0)
			sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
		else if (strcmp(value, "enable") == 0)
			sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
		else if (strcmp(value, "disable") == 0)
			sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
	}

	value = get_config_value_node(nvl, "ram");
	if (value != NULL) {
		uint64_t sz = strtoull(value, NULL, 10);

		sc->nvstore.type = NVME_STOR_RAM;
		sc->nvstore.size = sz * 1024 * 1024;
		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
		sc->nvstore.sectsz = 4096;
		sc->nvstore.sectsz_bits = 12;
		if (sc->nvstore.ctx == NULL) {
			EPRINTLN("nvme: Unable to allocate RAM");
			return (-1);
		}
	} else {
		snprintf(bident, sizeof(bident), "%d:%d",
		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
		sc->nvstore.ctx = blockif_open(nvl, bident);
		if (sc->nvstore.ctx == NULL) {
			EPRINTLN("nvme: Could not open backing file: %s",
			    strerror(errno));
			return (-1);
		}
		sc->nvstore.type = NVME_STOR_BLOCKIF;
		sc->nvstore.size = blockif_size(sc->nvstore.ctx);
	}

	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
		sc->nvstore.sectsz = sectsz;
	else if (sc->nvstore.type != NVME_STOR_RAM)
		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
	for (sc->nvstore.sectsz_bits = 9;
	    (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
	    sc->nvstore.sectsz_bits++);

	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
		sc->max_queues = NVME_QUEUES;

	return (0);
}
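/*
 * Example configuration (hypothetical values): a RAM-backed namespace
 * can be requested with
 *
 *	-s 4,nvme,ram=512,maxq=8,ioslots=16,ser=SAMPLE01
 *
 * which the parser above turns into a 512 MiB zeroed backing store
 * with a fixed 4096-byte sector size, 8 queue pairs, 16 concurrent
 * I/O slots, and a space-padded serial number; block-backed devices
 * instead take their size (and default sector size) from blockif.
 */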
static int
pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
{
	struct pci_nvme_softc *sc;
	uint32_t pci_membar_sz;
	int error;

	error = 0;

	sc = calloc(1, sizeof(struct pci_nvme_softc));
	pi->pi_arg = sc;
	sc->nsc_pi = pi;

	error = pci_nvme_parse_config(sc, nvl);
	if (error < 0)
		goto done;
	else
		error = 0;

	STAILQ_INIT(&sc->ioreqs_free);
	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
	for (int i = 0; i < sc->ioslots; i++) {
		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
	}

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
	pci_set_cfgdata8(pi, PCIR_PROGIF,
	    PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

	/*
	 * Allocate size of NVMe registers + doorbell space for all queues.
	 *
	 * The specification requires a minimum memory I/O window size of
	 * 16K. The Windows driver will refuse to start a device with a
	 * smaller window.
	 */
	pci_membar_sz = sizeof(struct nvme_registers) +
	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);

	DPRINTF("nvme membar size: %u", pci_membar_sz);

	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
	if (error) {
		WPRINTF("%s pci alloc mem bar failed", __func__);
		goto done;
	}

	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
	if (error) {
		WPRINTF("%s pci add msixcap failed", __func__);
		goto done;
	}

	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
	if (error) {
		WPRINTF("%s pci add Express capability failed", __func__);
		goto done;
	}

	pthread_mutex_init(&sc->mtx, NULL);
	sem_init(&sc->iosemlock, 0, sc->ioslots);

	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
	/*
	 * Controller data depends on Namespace data so initialize Namespace
	 * data first.
	 */
	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
	pci_nvme_init_ctrldata(sc);
	pci_nvme_init_logpages(sc);
	pci_nvme_init_features(sc);

	pci_nvme_aer_init(sc);

	pci_nvme_reset(sc);

	pci_lintr_request(pi);

done:
	return (error);
}

struct pci_devemu pci_de_nvme = {
	.pe_emu = "nvme",
	.pe_init = pci_nvme_init,
	.pe_legacy_config = blockif_legacy_config,
	.pe_barwrite = pci_nvme_write,
	.pe_barread = pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);