xref: /illumos-gate/usr/src/lib/fm/topo/modules/common/disk/disk_nvme.c (revision dd23d762c65e503874085a3893fbd3df9688da30)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2020 Joyent, Inc.
14  * Copyright 2022 Tintri by DDN, Inc. All rights reserved.
15  * Copyright 2023 Oxide Computer Company
16  */
17 
18 /*
19  * This file drives topo node enumeration of NVMe controllers.  A single "nvme"
20  * node is enumerated for each NVMe controller.   Child "disk" nodes are then
21  * enumerated for each active or attached NVMe namespace.
22  *
23  * nvme nodes are expected to be enumerated under either a "bay" node (for U.2
24  * devices) or a "slot" node (for M.2 devices) or a "pciexfn" node (for AIC
25  * devices).
26  *
27  * Enumeration of NVMe controllers on PCIe add-in cards is automatically driven
28  * by the pcibus topo module.
29  *
30  * In order to allow for associating a given NVMe controller with a physical
31  * location, enumeration of U.2 and M.2 devices should be driven by a
32  * platform-specific topo map which statically sets the following two
33  * properties on the parent "bay" or "slot" node:
34  *
35  * propgroup        property        description
36  * ---------        --------        ------------
37  * binding          driver          "nvme"
38  * binding          parent-device   devpath of parent PCIe device
39  *
40  * for example:
41  *
42  * <propgroup name="binding" version="1" name-stability="Private"
43  *   data-stability="Private">
44  *     <propval name="driver" type="string" value="nvme"/>
45  *     <propval name="parent-device" type="string"
46  *       value="/pci@0,0/pci8086,6f09@3,1"/>
47  * </propgroup>
48  * <dependents grouping="children">
49  *     <range name="nvme" min="0" max="0">
50  *         <enum-method name="disk" version="1"/>
51  *     </range>
52  * </dependents>
53  */
54 #include <stdlib.h>
55 #include <sys/types.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <unistd.h>
59 #include <string.h>
60 #include <strings.h>
61 #include <stdbool.h>
62 
63 #include <sys/fm/protocol.h>
64 #include <fm/topo_hc.h>
65 #include <fm/topo_mod.h>
66 #include <topo_ufm.h>
67 
68 #include <sys/dkio.h>
69 #include <sys/scsi/generic/inquiry.h>
70 
71 #include <sys/nvme.h>
72 #include "disk.h"
73 #include "disk_drivers.h"
74 
75 typedef struct nvme_enum_info {
76 	topo_mod_t		*nei_mod;
77 	di_node_t		nei_dinode;
78 	nvme_identify_ctrl_t	*nei_idctl;
79 	nvme_version_t		nei_vers;
80 	tnode_t			*nei_parent;
81 	tnode_t			*nei_nvme;
82 	nvlist_t		*nei_nvme_fmri;
83 	const char		*nei_nvme_path;
84 	int			nei_fd;
85 } nvme_enum_info_t;
86 
87 typedef struct devlink_arg {
88 	topo_mod_t		*dla_mod;
89 	char			*dla_logical_disk;
90 	uint_t			dla_strsz;
91 } devlink_arg_t;
92 
93 static int
94 devlink_cb(di_devlink_t dl, void *arg)
95 {
96 	devlink_arg_t *dlarg = (devlink_arg_t *)arg;
97 	topo_mod_t *mod = dlarg->dla_mod;
98 	const char *devpath;
99 	char *slice, *ctds;
100 
101 	if ((devpath = di_devlink_path(dl)) == NULL ||
102 	    (dlarg->dla_logical_disk = topo_mod_strdup(mod, devpath)) ==
103 	    NULL) {
104 		return (DI_WALK_TERMINATE);
105 	}
106 
107 	/*
108 	 * We need to keep track of the original string size before we
109 	 * truncate it with a NUL, so that we can free the right number of
110 	 * bytes when we're done, otherwise libumem will complain.
111 	 */
112 	dlarg->dla_strsz = strlen(dlarg->dla_logical_disk) + 1;
113 
114 	/* trim the slice off the public name */
115 	if (((ctds = strrchr(dlarg->dla_logical_disk, '/')) != NULL) &&
116 	    ((slice = strchr(ctds, 's')) != NULL))
117 		*slice = '\0';
118 
119 	return (DI_WALK_TERMINATE);
120 }
121 
122 static char *
123 get_logical_disk(topo_mod_t *mod, const char *devpath, uint_t *bufsz)
124 {
125 	di_devlink_handle_t devhdl;
126 	devlink_arg_t dlarg = { 0 };
127 	char *minorpath = NULL;
128 
129 	if (asprintf(&minorpath, "%s:a", devpath) < 0) {
130 		return (NULL);
131 	}
132 
133 	if ((devhdl = di_devlink_init(NULL, 0)) == DI_NODE_NIL) {
134 		topo_mod_dprintf(mod, "%s: di_devlink_init failed", __func__);
135 		free(minorpath);
136 		return (NULL);
137 	}
138 
139 	dlarg.dla_mod = mod;
140 
141 	(void) di_devlink_walk(devhdl, "^dsk/", minorpath, DI_PRIMARY_LINK,
142 	    &dlarg, devlink_cb);
143 
144 	(void) di_devlink_fini(&devhdl);
145 	free(minorpath);
146 
147 	*bufsz = dlarg.dla_strsz;
148 	return (dlarg.dla_logical_disk);
149 }
150 
151 static bool
152 disk_nvme_make_ns_serial(topo_mod_t *mod, const nvme_identify_nsid_t *id,
153     uint32_t nsid, char *buf, size_t buflen)
154 {
155 	uint8_t zero_guid[16] = { 0 };
156 	int ret;
157 
158 	if (bcmp(zero_guid, id->id_nguid, sizeof (id->id_nguid)) != 0) {
159 		ret = snprintf(buf, buflen, "%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X"
160 		    "%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X",
161 		    id->id_nguid[0], id->id_nguid[1], id->id_nguid[2],
162 		    id->id_nguid[3], id->id_nguid[4], id->id_nguid[5],
163 		    id->id_nguid[6], id->id_nguid[7], id->id_nguid[8],
164 		    id->id_nguid[9], id->id_nguid[10], id->id_nguid[11],
165 		    id->id_nguid[12], id->id_nguid[13], id->id_nguid[14],
166 		    id->id_nguid[15]);
167 	} else if (bcmp(zero_guid, id->id_eui64, sizeof (id->id_eui64)) != 0) {
168 		ret = snprintf(buf, buflen,
169 		    "%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X",
170 		    id->id_eui64[0], id->id_eui64[1], id->id_eui64[2],
171 		    id->id_eui64[3], id->id_eui64[4], id->id_eui64[5],
172 		    id->id_eui64[6], id->id_eui64[7]);
173 	} else {
174 		ret = snprintf(buf, buflen, "%u", nsid);
175 	}
176 
177 	if ((size_t)ret >= buflen) {
178 		topo_mod_dprintf(mod, "overflowed serial number for nsid %u: "
179 		    "needed %zu bytes, got %d", nsid, buflen, ret);
180 		return (false);
181 	}
182 
183 	return (true);
184 }
185 
186 /*
187  * Create the common I/O property group properties that are shared between
188  * controllers and namespaces. We assume the property group was already created.
189  */
190 static bool
191 disk_nvme_common_io(topo_mod_t *mod, tnode_t *tn, di_node_t di)
192 {
193 	int err;
194 	int inst = di_instance(di);
195 	const char *drv = di_driver_name(di);
196 	char *path;
197 	const char *ppaths[1];
198 
199 	if (inst != -1 && topo_prop_set_uint32(tn, TOPO_PGROUP_IO,
200 	    TOPO_IO_INSTANCE, TOPO_PROP_IMMUTABLE, (uint32_t)inst, &err) != 0) {
201 		topo_mod_dprintf(mod, "failed to set %s:%s on %s[%" PRIu64 "]: "
202 		    "%s", TOPO_PGROUP_IO, TOPO_IO_INSTANCE, topo_node_name(tn),
203 		    topo_node_instance(tn), topo_strerror(err));
204 		return (false);
205 	}
206 
207 	if (drv != NULL && topo_prop_set_string(tn, TOPO_PGROUP_IO,
208 	    TOPO_IO_DRIVER, TOPO_PROP_IMMUTABLE, drv, &err) != 0) {
209 		topo_mod_dprintf(mod, "failed to set %s:%s on %s[%" PRIu64 "]: "
210 		    "%s", TOPO_PGROUP_IO, TOPO_IO_DRIVER, topo_node_name(tn),
211 		    topo_node_instance(tn), topo_strerror(err));
212 		return (false);
213 	}
214 
215 	if (drv != NULL) {
216 		nvlist_t *fmri = topo_mod_modfmri(mod, FM_MOD_SCHEME_VERSION,
217 		    drv);
218 		if (mod != NULL && topo_prop_set_fmri(tn, TOPO_PGROUP_IO,
219 		    TOPO_IO_MODULE, TOPO_PROP_IMMUTABLE, fmri, &err) != 0) {
220 			topo_mod_dprintf(mod, "failed to set %s:%s on %s[%"
221 			    PRIu64 "]: %s", TOPO_PGROUP_IO, TOPO_IO_MODULE,
222 			    topo_node_name(tn), topo_node_instance(tn),
223 			    topo_strerror(err));
224 			nvlist_free(fmri);
225 			return (false);
226 		}
227 		nvlist_free(fmri);
228 	}
229 
230 	path = di_devfs_path(di);
231 	ppaths[0] = path;
232 	if (path != NULL && topo_prop_set_string(tn, TOPO_PGROUP_IO,
233 	    TOPO_IO_DEV_PATH, TOPO_PROP_IMMUTABLE, path, &err) != 0) {
234 		topo_mod_dprintf(mod, "failed to set %s:%s on %s[%" PRIu64 "]: "
235 		    "%s", TOPO_PGROUP_IO, TOPO_IO_DRIVER, topo_node_name(tn),
236 		    topo_node_instance(tn), topo_strerror(err));
237 		di_devfs_path_free(path);
238 		return (false);
239 	}
240 
241 	if (path != NULL && topo_prop_set_string_array(tn, TOPO_PGROUP_IO,
242 	    TOPO_IO_PHYS_PATH, TOPO_PROP_IMMUTABLE, ppaths, 1, &err) != 0) {
243 		topo_mod_dprintf(mod, "failed to set %s:%s on %s[%" PRIu64 "]: "
244 		    "%s", TOPO_PGROUP_IO, TOPO_IO_PHYS_PATH, topo_node_name(tn),
245 		    topo_node_instance(tn), topo_strerror(err));
246 		di_devfs_path_free(path);
247 		return (false);
248 	}
249 	di_devfs_path_free(path);
250 
251 	return (true);
252 }
253 
254 /*
255  * Add the various storage and I/O property group items that are appropriate
256  * given that we have a devinfo node. The storage property group has already
257  * been created, but the I/O property group has not.
258  */
259 static void
260 disk_nvme_make_ns_di_props(topo_mod_t *mod, tnode_t *tn, di_node_t di)
261 {
262 	int err;
263 	char *devid, *mfg, *model, *rev, *serial, *log, *path;
264 	uint_t buflen;
265 
266 	if (di_prop_lookup_strings(DDI_DEV_T_ANY, di, DEVID_PROP_NAME,
267 	    &devid) != 1 ||
268 	    di_prop_lookup_strings(DDI_DEV_T_ANY, di, INQUIRY_VENDOR_ID,
269 	    &mfg) != 1 ||
270 	    di_prop_lookup_strings(DDI_DEV_T_ANY, di, INQUIRY_PRODUCT_ID,
271 	    &model) != 1 ||
272 	    di_prop_lookup_strings(DDI_DEV_T_ANY, di, INQUIRY_REVISION_ID,
273 	    &rev) != 1 ||
274 	    di_prop_lookup_strings(DDI_DEV_T_ANY, di, INQUIRY_SERIAL_NO,
275 	    &serial) != 1) {
276 		topo_mod_dprintf(mod, "failed to get devinfo props for %s[%"
277 		    PRIu64 "]", topo_node_name(tn), topo_node_instance(tn));
278 		return;
279 	}
280 
281 	/*
282 	 * Set the basic storage manufacturer information. Yes, this is
283 	 * information really about the NVMe controller and not the namespace.
284 	 * That's how the storage property group basically works here.
285 	 */
286 	if (topo_prop_set_string(tn, TOPO_PGROUP_STORAGE,
287 	    TOPO_STORAGE_MANUFACTURER, TOPO_PROP_IMMUTABLE, mfg, &err) != 0 ||
288 	    topo_prop_set_string(tn, TOPO_PGROUP_STORAGE,
289 	    TOPO_STORAGE_SERIAL_NUM, TOPO_PROP_IMMUTABLE, serial, &err) != 0 ||
290 	    topo_prop_set_string(tn, TOPO_PGROUP_STORAGE,
291 	    TOPO_STORAGE_FIRMWARE_REV, TOPO_PROP_IMMUTABLE, rev, &err) != 0 ||
292 	    topo_prop_set_string(tn, TOPO_PGROUP_STORAGE,
293 	    TOPO_STORAGE_MODEL, TOPO_PROP_IMMUTABLE, model, &err) != 0) {
294 		topo_mod_dprintf(mod, "failed to set storage properties on "
295 		    "%s[%" PRIu64 "]: %s", topo_node_name(tn),
296 		    topo_node_instance(tn), topo_strerror(err));
297 		return;
298 	}
299 
300 	if (topo_pgroup_create(tn, &io_pgroup, &err) != 0) {
301 		topo_mod_dprintf(mod, "failed to create I/O property "
302 		    "group on %s[%" PRIu64 "]: %s",  topo_node_name(tn),
303 		    topo_node_instance(tn), topo_strerror(err));
304 	}
305 
306 	if (!disk_nvme_common_io(mod, tn, di)) {
307 		return;
308 	}
309 
310 	/*
311 	 * The last property that we'd like to attempt to create for a namespace
312 	 * is a mapping back to its corresponding logical disk entry in /dev.
313 	 * The logical disk will be everything past the trailing /, i.e. a
314 	 * cXtXdX value.
315 	 */
316 	path = di_devfs_path(di);
317 	if (path == NULL) {
318 		return;
319 	}
320 	log = get_logical_disk(mod, path, &buflen);
321 	di_devfs_path_free(path);
322 	if (log == NULL) {
323 		return;
324 	}
325 	path = strrchr(log, '/');
326 	if (path != NULL && path[1] != '\0' &&
327 	    topo_prop_set_string(tn, TOPO_PGROUP_STORAGE,
328 	    TOPO_STORAGE_LOGICAL_DISK_NAME, TOPO_PROP_IMMUTABLE, path + 1,
329 	    &err) != 0) {
330 		topo_mod_dprintf(mod, "failed to set %s:%s on %s[%"
331 		    PRIu64 "]: %s", TOPO_PGROUP_STORAGE,
332 		    TOPO_STORAGE_LOGICAL_DISK_NAME, topo_node_name(tn),
333 		    topo_node_instance(tn), topo_strerror(err));
334 	}
335 	topo_mod_free(mod, log, buflen);
336 }
337 
338 static void
339 disk_nvme_make_ns(nvme_enum_info_t *nei, uint32_t nsid)
340 {
341 	topo_mod_t *mod = nei->nei_mod;
342 	nvlist_t *auth = NULL, *fmri = NULL;
343 	const topo_instance_t inst = nsid - 1;
344 	nvme_ns_info_t info;
345 	nvme_ioctl_t ioc;
346 	char serial[64], capstr[64];
347 	uint64_t cap, blksz;
348 	tnode_t *tn;
349 	uint8_t lba;
350 	int err;
351 
352 	bzero(&ioc, sizeof (ioc));
353 	bzero(&info, sizeof (info));
354 	ioc.n_len = sizeof (nvme_ns_info_t);
355 	ioc.n_buf = (uintptr_t)&info;
356 	ioc.n_arg = nsid;
357 
358 	if (ioctl(nei->nei_fd, NVME_IOC_NS_INFO, &ioc) != 0) {
359 		topo_mod_dprintf(mod, "failed to get namespace info for ns %u: "
360 		    "%s", nsid, strerror(errno));
361 		return;
362 	}
363 
364 	if ((info.nni_state & NVME_NS_STATE_IGNORED) != 0) {
365 		return;
366 	}
367 
368 	if ((info.nni_state &
369 	    (NVME_NS_STATE_ACTIVE | NVME_NS_STATE_ATTACHED)) == 0) {
370 		topo_mod_dprintf(mod, "skipping nsid %u because it is not "
371 		    "active or attached (state: 0x%x)", nsid, info.nni_state);
372 		return;
373 	}
374 
375 	auth = topo_mod_auth(mod, nei->nei_nvme);
376 	if (auth == NULL) {
377 		topo_mod_dprintf(mod, "failed to get auth for nsid %u from "
378 		    "parent %s[%" PRIu64 "]: %s", nsid,
379 		    topo_node_name(nei->nei_nvme),
380 		    topo_node_instance(nei->nei_nvme), topo_mod_errmsg(mod));
381 		goto done;
382 	}
383 
384 	/*
385 	 * We want to construct the FMRI for the namespace. The namespace is a
386 	 * little awkward in terms of things like the model, revision, and
387 	 * serial. While blkdev sets up standard inquiry properties to map these
388 	 * to the parent device which makes sense in the context of trying to
389 	 * use this as a normal block device, it's not really appropriate here.
390 	 * The namespace is not the NVMe controller. We construct the namespace
391 	 * serial number from the preferential ordering of information that
392 	 * we're given of the NGUID, EUI64, and then fall back to the namespace
393 	 * number.
394 	 */
395 	if (!disk_nvme_make_ns_serial(mod, &info.nni_id, nsid, serial,
396 	    sizeof (serial))) {
397 		goto done;
398 	}
399 	fmri = topo_mod_hcfmri(mod, nei->nei_nvme, FM_HC_SCHEME_VERSION,
400 	    DISK, inst, NULL, auth, NULL, NULL, serial);
401 	if (fmri == NULL) {
402 		topo_mod_dprintf(mod, "failed to make fmri for %s[%" PRIu64
403 		    "] on nsid %u: %s", DISK, inst, nsid, topo_mod_errmsg(mod));
404 		goto done;
405 	}
406 
407 	tn = topo_node_bind(mod, nei->nei_nvme, DISK, inst, fmri);
408 	if (tn == NULL) {
409 		topo_mod_dprintf(mod, "failed to bind fmri for %s[%" PRIu64
410 		    "] on nsid %u: %s", DISK, inst, nsid, topo_mod_errmsg(mod));
411 		goto done;
412 	}
413 
414 	/*
415 	 * Always inherit our parent's FRU. The namespace is just a part of the
416 	 * device in reality.
417 	 */
418 	if (topo_node_fru_set(tn, NULL, 0, &err) != 0) {
419 		topo_mod_dprintf(mod, "failed to set FRU for %s[%" PRIu64
420 		    "] on nsid %u: %s", DISK, inst, nsid, topo_strerror(err));
421 		goto done;
422 
423 	}
424 
425 	/*
426 	 * Our namespace may or may not be attached. From the namespace we will
427 	 * always get the capacity and block information. The rest of it will
428 	 * end up being filled in if we find a devinfo node.
429 	 */
430 	if (topo_pgroup_create(tn, &storage_pgroup, &err) != 0) {
431 		topo_mod_dprintf(mod, "failed to create storage property "
432 		    "group on %s[%" PRIu64 "]: %s", DISK, inst,
433 		    topo_strerror(err));
434 	}
435 
436 	lba = info.nni_id.id_flbas.lba_format;
437 	blksz = 1ULL << info.nni_id.id_lbaf[lba].lbaf_lbads;
438 	if (blksz != 0 && topo_prop_set_uint64(tn, TOPO_PGROUP_STORAGE,
439 	    TOPO_STORAGE_LOG_BLOCK_SIZE, TOPO_PROP_IMMUTABLE, blksz, &err) !=
440 	    0) {
441 		topo_mod_dprintf(mod, "failed to create property %s:%s on %s[%"
442 		    PRIu64 "]: %s", TOPO_PGROUP_STORAGE,
443 		    TOPO_STORAGE_LOG_BLOCK_SIZE, DISK, inst,
444 		    topo_strerror(err));
445 		goto done;
446 	}
447 
448 	cap = blksz * info.nni_id.id_nsize;
449 	if (snprintf(capstr, sizeof (capstr), "%" PRIu64, cap) >=
450 	    sizeof (capstr)) {
451 		topo_mod_dprintf(mod, "overflowed capacity calculation on "
452 		    "nsid %u", nsid);
453 		goto done;
454 	}
455 
456 	/*
457 	 * Finally attempt to find a child node that has a matching name and go
458 	 * from there. Sorry, this does result in node creation being O(n^2),
459 	 * but at least n is usually small today.
460 	 */
461 	for (di_node_t di = di_child_node(nei->nei_dinode); di != DI_NODE_NIL;
462 	    di = di_sibling_node(di)) {
463 		const char *addr = di_bus_addr(di);
464 		if (addr != NULL && strcmp(addr, info.nni_addr) == 0) {
465 			disk_nvme_make_ns_di_props(mod, tn, di);
466 		}
467 	}
468 
469 done:
470 	nvlist_free(auth);
471 	nvlist_free(fmri);
472 }
473 
474 /*
475  * Attempt to make a ufm node, but swallow the error so we can try to get as
476  * much of the disk information as possible.
477  */
478 static void
479 disk_nvme_make_ufm(topo_mod_t *mod, nvme_enum_info_t *nei)
480 {
481 	topo_ufm_devinfo_t tud;
482 	char *path = di_devfs_path(nei->nei_dinode);
483 	if (path == NULL) {
484 		return;
485 	}
486 
487 	tud.tud_method = TOPO_UFM_M_DEVINFO;
488 	tud.tud_path = path;
489 	if (topo_mod_load(mod, TOPO_MOD_UFM, TOPO_VERSION) == NULL) {
490 		topo_mod_dprintf(mod, "disk enum could not load ufm module");
491 		di_devfs_path_free(path);
492 		return;
493 	}
494 
495 	(void) topo_mod_enumerate(mod, nei->nei_nvme, TOPO_MOD_UFM, UFM, 0, 0,
496 	    &tud);
497 	di_devfs_path_free(path);
498 }
499 
500 static const topo_pgroup_info_t nvme_pgroup = {
501 	TOPO_PGROUP_NVME,
502 	TOPO_STABILITY_PRIVATE,
503 	TOPO_STABILITY_PRIVATE,
504 	1
505 };
506 
507 static int
508 make_nvme_node(nvme_enum_info_t *nvme_info)
509 {
510 	topo_mod_t *mod = nvme_info->nei_mod;
511 	nvlist_t *auth = NULL, *fmri = NULL, *fru;
512 	tnode_t *nvme;
513 	char *rev = NULL, *model = NULL, *serial = NULL, *vers = NULL;
514 	char *pname = topo_node_name(nvme_info->nei_parent);
515 	char *label = NULL;
516 	topo_instance_t pinst = topo_node_instance(nvme_info->nei_parent);
517 	int err = 0, ret = -1;
518 
519 	/*
520 	 * Next we pass the strings through a function that sanitizes them of
521 	 * any characters that can't be used in an FMRI string. This also takes
522 	 * care of making them properly terminated.
523 	 */
524 	rev = topo_mod_clean_strn(mod, nvme_info->nei_idctl->id_fwrev,
525 	    NVME_FWVER_SZ);
526 	model = topo_mod_clean_strn(mod, nvme_info->nei_idctl->id_model,
527 	    NVME_MODEL_SZ);
528 	serial = topo_mod_clean_strn(mod, nvme_info->nei_idctl->id_serial,
529 	    NVME_SERIAL_SZ);
530 
531 	auth = topo_mod_auth(mod, nvme_info->nei_parent);
532 	fmri = topo_mod_hcfmri(mod, nvme_info->nei_parent, FM_HC_SCHEME_VERSION,
533 	    NVME, 0, NULL, auth, model, rev, serial);
534 
535 	if (fmri == NULL) {
536 		/* errno set */
537 		topo_mod_dprintf(mod, "%s: hcfmri failed for %s=%" PRIu64
538 		    "/%s=0", __func__, pname, pinst, NVME);
539 		goto error;
540 	}
541 
542 	/*
543 	 * If our parent is a pciexfn node, then we need to create a nvme range
544 	 * underneath it to hold the nvme hierarchy.  For other cases, where
545 	 * enumeration is being driven by a topo map file, this range will have
546 	 * already been statically defined in the XML.
547 	 */
548 	if (strcmp(pname, PCIEX_FUNCTION) == 0) {
549 		if (topo_node_range_create(mod, nvme_info->nei_parent, NVME, 0,
550 		    0) < 0) {
551 			/* errno set */
552 			topo_mod_dprintf(mod, "%s: error creating %s range",
553 			    __func__, NVME);
554 			goto error;
555 		}
556 	}
557 
558 	/*
559 	 * Create a new topo node to represent the NVMe controller and bind it
560 	 * to the parent node.
561 	 */
562 	if ((nvme = topo_node_bind(mod, nvme_info->nei_parent, NVME, 0,
563 	    fmri)) == NULL) {
564 		/* errno set */
565 		topo_mod_dprintf(mod, "%s: bind failed for %s=%" PRIu64
566 		    "/%s=0", __func__, pname, pinst, NVME);
567 		goto error;
568 	}
569 	nvme_info->nei_nvme = nvme;
570 	nvme_info->nei_nvme_fmri = fmri;
571 
572 	/*
573 	 * If our parent node is a "pciexfn" node then this is a NVMe device on
574 	 * a PCIe AIC, so we inherit our parent's FRU.  Otherwise, we set the
575 	 * FRU to ourself.
576 	 */
577 	if (strcmp(topo_node_name(nvme_info->nei_parent), PCIEX_FUNCTION) == 0)
578 		fru = NULL;
579 	else
580 		fru = fmri;
581 
582 	if (topo_node_fru_set(nvme, fru, 0, &err) != 0) {
583 		topo_mod_dprintf(mod, "%s: failed to set FRU: %s", __func__,
584 		    topo_strerror(err));
585 		(void) topo_mod_seterrno(mod, err);
586 		goto error;
587 	}
588 
589 	/*
590 	 * Clone the label from our parent node.  We can't inherit the property
591 	 * because the label prop is mutable on bay nodes and only immutable
592 	 * properties can be inherited.
593 	 */
594 	if ((topo_node_label(nvme_info->nei_parent, &label, &err) != 0 &&
595 	    err != ETOPO_PROP_NOENT) ||
596 	    topo_node_label_set(nvme, label, &err) != 0) {
597 		topo_mod_dprintf(mod, "%s: failed to set label: %s",
598 		    __func__, topo_strerror(err));
599 		(void) topo_mod_seterrno(mod, err);
600 		goto error;
601 	}
602 
603 	/*
604 	 * Ensure that we have a UFM property set based on our devinfo path.
605 	 * This is a little repetitive if our parent actually did so as well,
606 	 * but given that the majority of such nodes are under bays and slots
607 	 * right now, it's a worthwhile tradeoff.
608 	 */
609 	disk_nvme_make_ufm(mod, nvme_info);
610 
611 	if (topo_pgroup_create(nvme, &nvme_pgroup, &err) != 0) {
612 		topo_mod_dprintf(mod, "%s: failed to create %s pgroup: %s",
613 		    __func__, TOPO_PGROUP_NVME, topo_strerror(err));
614 		(void) topo_mod_seterrno(mod, err);
615 		goto error;
616 	}
617 
618 	if (asprintf(&vers, "%u.%u", nvme_info->nei_vers.v_major,
619 	    nvme_info->nei_vers.v_minor) < 0) {
620 		topo_mod_dprintf(mod, "%s: failed to alloc string", __func__);
621 		(void) topo_mod_seterrno(mod, EMOD_NOMEM);
622 		goto error;
623 	}
624 	if (topo_prop_set_string(nvme, TOPO_PGROUP_NVME, TOPO_PROP_NVME_VER,
625 	    TOPO_PROP_IMMUTABLE, vers, &err) != 0) {
626 		topo_mod_dprintf(mod, "%s: failed to set %s/%s property",
627 		    __func__, TOPO_PGROUP_NVME, TOPO_PROP_NVME_VER);
628 		(void) topo_mod_seterrno(mod, err);
629 		goto error;
630 	}
631 
632 	if (topo_pgroup_create(nvme, &io_pgroup, &err) != 0) {
633 		topo_mod_dprintf(mod, "%s: failed to create %s pgroup: %s",
634 		    __func__, TOPO_PGROUP_IO, topo_strerror(err));
635 		(void) topo_mod_seterrno(mod, err);
636 		goto error;
637 	}
638 
639 	if (!disk_nvme_common_io(mod, nvme, nvme_info->nei_dinode)) {
640 		goto error;
641 	}
642 
643 	/*
644 	 * Create a child disk node for each namespace.
645 	 */
646 	if (topo_node_range_create(mod, nvme, DISK, 0,
647 	    (nvme_info->nei_idctl->id_nn - 1)) < 0) {
648 		/* errno set */
649 		topo_mod_dprintf(mod, "%s: error creating %s range", __func__,
650 		    DISK);
651 		goto error;
652 	}
653 
654 	/*
655 	 * Iterate over each namespace to see if it's a candidate for inclusion.
656 	 * Namespaces start at index 1 and not every namespace will be included.
657 	 * We map things such that a disk instance is always namespace - 1 to
658 	 * fit into the above mapping.
659 	 */
660 	for (uint32_t i = 1; i <= nvme_info->nei_idctl->id_nn; i++) {
661 		disk_nvme_make_ns(nvme_info, i);
662 	}
663 	ret = 0;
664 
665 error:
666 	free(vers);
667 	nvlist_free(auth);
668 	nvlist_free(fmri);
669 	topo_mod_strfree(mod, rev);
670 	topo_mod_strfree(mod, model);
671 	topo_mod_strfree(mod, serial);
672 	topo_mod_strfree(mod, label);
673 	return (ret);
674 }
675 
676 struct diwalk_arg {
677 	topo_mod_t	*diwk_mod;
678 	tnode_t		*diwk_parent;
679 };
680 
681 /*
682  * This function gathers identity information from the NVMe controller and
683  * stores it in a struct.  This struct is passed to make_nvme_node(), which
684  * does the actual topo node creation.
685  */
686 static int
687 discover_nvme_ctl(di_node_t node, di_minor_t minor, void *arg)
688 {
689 	struct diwalk_arg *wkarg = arg;
690 	topo_mod_t *mod = wkarg->diwk_mod;
691 	char *path = NULL, *devctl = NULL;
692 	nvme_ioctl_t nioc = { 0 };
693 	nvme_identify_ctrl_t *idctl = NULL;
694 	nvme_enum_info_t nvme_info = { 0 };
695 	int fd = -1, ret = DI_WALK_TERMINATE;
696 
697 	if ((path = di_devfs_minor_path(minor)) == NULL) {
698 		topo_mod_dprintf(mod, "failed to get minor path");
699 		(void) topo_mod_seterrno(mod, EMOD_UNKNOWN);
700 		return (ret);
701 	}
702 
703 	topo_mod_dprintf(mod, "%s=%" PRIu64 ": found nvme controller: %s",
704 	    topo_node_name(wkarg->diwk_parent),
705 	    topo_node_instance(wkarg->diwk_parent), path);
706 
707 	if (asprintf(&devctl, "/devices%s", path) < 0) {
708 		topo_mod_dprintf(mod, "failed to alloc string");
709 		(void) topo_mod_seterrno(mod, EMOD_NOMEM);
710 		goto error;
711 	}
712 
713 	if ((fd = open(devctl, O_RDWR)) < 0) {
714 		topo_mod_dprintf(mod, "failed to open %s", devctl);
715 		(void) topo_mod_seterrno(mod, EMOD_UNKNOWN);
716 		goto error;
717 	}
718 	if ((idctl = topo_mod_zalloc(mod, NVME_IDENTIFY_BUFSIZE)) == NULL) {
719 		topo_mod_dprintf(mod, "zalloc failed");
720 		(void) topo_mod_seterrno(mod, EMOD_NOMEM);
721 		goto error;
722 	}
723 	nioc.n_len = NVME_IDENTIFY_BUFSIZE;
724 	nioc.n_buf = (uintptr_t)idctl;
725 	nioc.n_arg = NVME_IDENTIFY_CTRL;
726 
727 	if (ioctl(fd, NVME_IOC_IDENTIFY, &nioc) != 0) {
728 		topo_mod_dprintf(mod, "NVME_IOC_IDENTIFY ioctl "
729 		    "failed: %s", strerror(errno));
730 		(void) topo_mod_seterrno(mod, EMOD_UNKNOWN);
731 		goto error;
732 	}
733 
734 	nioc.n_len = sizeof (nvme_version_t);
735 	nioc.n_buf = (uintptr_t)&nvme_info.nei_vers;
736 	nioc.n_arg = 0;
737 
738 	if (ioctl(fd, NVME_IOC_VERSION, &nioc) != 0) {
739 		topo_mod_dprintf(mod, "NVME_IOC_VERSION ioctl failed: %s",
740 		    strerror(errno));
741 		(void) topo_mod_seterrno(mod, EMOD_UNKNOWN);
742 		goto error;
743 	}
744 
745 	nvme_info.nei_mod = mod;
746 	nvme_info.nei_nvme_path = path;
747 	nvme_info.nei_dinode = node;
748 	nvme_info.nei_idctl = idctl;
749 	nvme_info.nei_parent = wkarg->diwk_parent;
750 	nvme_info.nei_fd = fd;
751 
752 	if (make_nvme_node(&nvme_info) != 0) {
753 		/* errno set */
754 		goto error;
755 	}
756 
757 	ret = DI_WALK_CONTINUE;
758 
759 error:
760 	if (fd > 0)
761 		(void) close(fd);
762 	di_devfs_path_free(path);
763 	free(devctl);
764 	if (idctl != NULL)
765 		topo_mod_free(mod, idctl, NVME_IDENTIFY_BUFSIZE);
766 	return (ret);
767 }
768 
769 int
770 disk_nvme_enum_disk(topo_mod_t *mod, tnode_t *pnode)
771 {
772 	char *parent = NULL;
773 	int err;
774 	di_node_t devtree;
775 	di_node_t dnode;
776 	struct diwalk_arg wkarg = { 0 };
777 	int ret = -1;
778 
779 	/*
780 	 * Lookup a property containing the devfs path of the parent PCIe
781 	 * device of the NVMe device we're attempting to enumerate.  This
782 	 * property is hard-coded in per-platform topo XML maps that are
783 	 * delivered with the OS.  This hard-coded path allows topo to map a
784 	 * given NVMe controller to a physical location (bay or slot) on the
785 	 * platform, when generating the topo snapshot.
786 	 */
787 	if (topo_prop_get_string(pnode, TOPO_PGROUP_BINDING,
788 	    TOPO_BINDING_PARENT_DEV, &parent, &err) != 0) {
789 		topo_mod_dprintf(mod, "parent node was missing nvme binding "
790 		    "properties\n");
791 		(void) topo_mod_seterrno(mod, err);
792 		goto out;
793 	}
794 	if ((devtree = topo_mod_devinfo(mod)) == DI_NODE_NIL) {
795 		topo_mod_dprintf(mod, "failed to get devinfo snapshot");
796 		(void) topo_mod_seterrno(mod, EMOD_UNKNOWN);
797 		goto out;
798 	}
799 
800 	/*
801 	 * Walk the devinfo tree looking NVMe devices. For each NVMe device,
802 	 * check if the devfs path of the parent matches the one specified in
803 	 * TOPO_BINDING_PARENT_DEV.
804 	 */
805 	wkarg.diwk_mod = mod;
806 	wkarg.diwk_parent = pnode;
807 	dnode = di_drv_first_node(NVME_DRV, devtree);
808 	while (dnode != DI_NODE_NIL) {
809 		char *path;
810 
811 		if ((path = di_devfs_path(di_parent_node(dnode))) == NULL) {
812 			topo_mod_dprintf(mod, "failed to get dev path");
813 			(void) topo_mod_seterrno(mod, EMOD_UNKNOWN);
814 			goto out;
815 		}
816 		if (strcmp(parent, path) == 0) {
817 			if (di_walk_minor(dnode, DDI_NT_NVME_NEXUS, 0,
818 			    &wkarg, discover_nvme_ctl) < 0) {
819 				di_devfs_path_free(path);
820 				goto out;
821 			}
822 		}
823 		di_devfs_path_free(path);
824 		dnode = di_drv_next_node(dnode);
825 	}
826 	ret = 0;
827 
828 out:
829 	topo_mod_strfree(mod, parent);
830 	return (ret);
831 }
832