/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2024 Oxide Computer Company
 */

/*
 * Programmatic interface to NVMe Devices
 *
 * libnvme exists to provide a means of performing non-I/O related operations
 * on an NVMe device. This is intended to allow software, regardless of whether
 * it is part of illumos or not, to operate on NVMe devices and perform most of
 * the administrative and operator tasks that might come up. This library does
 * not provide a stable interface yet. The rest of this block comment describes
 * the library's organization and the background on why it looks the way it
 * does.
 *
 * --------------------
 * Library Organization
 * --------------------
 *
 * There are two large classes of source files that make up this library
 * currently:
 *
 *   1. Source code that implements the library's interfaces is found alongside
 *      this file in lib/libnvme/common. This code is generally organized based
 *      around the portion of the NVMe specification that it implements. So for
 *      example, code that implements logic related to features is found in
 *      libnvme_feature.c, formatting namespaces in libnvme_format.c, log
 *      pages in libnvme_log.c, etc. All files in the library begin with
 *      'libnvme_' as a way to help namespace the file names from the second
 *      set of files.
 *
 *   2. Validation logic that is shared between libnvme and the kernel is found
 *      in common/nvme/. While the kernel must validate requests regardless, we
 *      leverage this shared information as a means for trying to ensure that
 *      we have useful errors early. That code is factored in a way to
 *      facilitate easier unit testing.
 *
 * Because of the nature of this split, all of the opaque structures that we
 * create and their relationships are maintained in the library (group 1). All
 * of the logic in group 2 is designed as constant data tables and functions
 * that are fed information about the controller they are operating on in
 * order to answer questions about it.
 *
 * There are several general classes of interfaces and related structures that
 * we have in the library. We break them into the following general categories
 * based on their purpose:
 *
 * DISCOVERY
 *
 * One of the large responsibilities of this library is helping someone
 * discover information about something, whether that be a controller, a
 * namespace, a log page, a feature, a unique command, etc. Information about
 * one of these items is contained in a generally opaque discovery structure,
 * for example the nvme_log_disc_t.
 *
 * The goal of these structures is to contain all of the metadata for working
 * with the object in question. Continuing the log page discovery example, it
 * can tell us information about what fields are required, whether or not the
 * log might be supported, whether it operates on a controller, a namespace,
 * or something else, as well as more human-usable things such as names and
 * descriptions.
 *
 * Discovery objects are both for humans and for programmatic consumption.
 * There are several cases where requests can be created directly from
 * discovery objects. A well designed discovery object can allow a general
 * implementation of a consumer such as nvmeadm to build up a request without
 * having to hardcode everything about what is needed for each request (though
 * most consumers still need to have information about the actual contents,
 * meaning, and semantics of a log or feature).
 *
 * Discovery objects are obtained in two general ways. The first is using one
 * of the iterator/callback based functions to discover a given class of data.
 * The second path is that several of the functions which operate based on the
 * name of something, e.g. nvme_log_req_init_by_name(),
 * nvme_get_feat_req_init_by_name(), etc. will return a discovery object.
 *
 * When a discovery object is returned based on iteration (more below), the
 * memory is owned by the iterator. When it is returned by a request
 * initialization function, then it has its own lifetime and must be freed.
 * We try to make this distinction clear in the API based on whether or not
 * the discovery object is 'const'.
 *
 * All discovery objects should be fully filled out before they are handed
 * back to a caller. It is an explicit design goal that every function that
 * gets data from the discovery structure operates on a const version of the
 * pointer. This is the hint that you cannot perform additional I/O or related
 * operations after handing out the discovery structure. Attempts to loosen
 * this constraint should be considered carefully due to how we communicate
 * ownership.
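 *
 * As a minimal sketch (error handling elided), a callback matching
 * nvme_ns_disc_f receives a const discovery object and can only read from it
 * through its accessors:
 *
 *	static bool
 *	print_ns_cb(nvme_ctrl_t *ctrl, const nvme_ns_disc_t *disc, void *arg)
 *	{
 *		(void) printf("nsid %u (%s)\n", nvme_ns_disc_nsid(disc),
 *		    nvme_nsleveltostr(nvme_ns_disc_level(disc)));
 *		return (true);
 *	}
 *
 *	(void) nvme_ns_discover(ctrl, NVME_NS_DISC_F_ACTIVE, print_ns_cb,
 *	    NULL);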
 *
 * ITERATORS
 *
 * A common pattern of the library is iterating over items. This includes
 * controllers and namespaces, but also as part of discovering what specific
 * logs, commands, features, etc. are actually supported by the device.
 * Iteration always follows the same general pattern:
 *
 * 1. An iterator is initialized with a call to nvme_<name>_discover_init().
 * This will generally return a structure of the form nvme_<name>_iter_t. This
 * structure contains the memory for the corresponding value that is returned
 * when stepping in (2).
 *
 * 2. To actually pull values out of an iterator, one must call the
 * nvme_<name>_step() function for the iterator. This will return a
 * corresponding nvme_<name>_disc_t structure that is opaque and has a suite
 * of functions that are usable for getting information out of it. This
 * structure is valid only until the next time nvme_<name>_step() is called.
 * The return value of stepping indicates whether there was an error, the
 * iterator has finished, or we successfully stepped and the data is filled
 * out.
 *
 * If discovery data needs to outlive a given iteration, then it can be
 * duplicated, which will give it a separate lifetime, though that comes with
 * the responsibility that it must then be freed.
 *
 * 3. To finish using iterators, one finally calls the corresponding
 * nvme_<name>_discover_fini(). That will deallocate the iterator structure
 * and finish everything up. A concrete sketch of this pattern follows.
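 *
 * As a minimal sketch of the pattern using controller discovery (error
 * handling elided, given an nvme_t *nvme from nvme_init()):
 *
 *	nvme_ctrl_iter_t *iter;
 *	const nvme_ctrl_disc_t *disc;
 *	nvme_iter_t ret;
 *
 *	if (!nvme_ctrl_discover_init(nvme, &iter)) {
 *		return (false);
 *	}
 *	while ((ret = nvme_ctrl_discover_step(iter, &disc)) ==
 *	    NVME_ITER_VALID) {
 *		di_node_t devi = nvme_ctrl_disc_devi(disc);
 *		... use devi before the next step invalidates disc ...
 *	}
 *	nvme_ctrl_discover_fini(iter);
 *	return (ret != NVME_ITER_ERROR);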
 *
 * REQUESTS
 *
 * One of the chief goals of this library is to be able to perform requests.
 * Each request has a structure that can be initialized, filled out, and then
 * executed. A request structure can be reused multiple times with minor
 * adjustments in-between (though changes aren't required). Request structures
 * are either initialized in a blank mode where every value must be filled out
 * or they can be initialized through their discovery object (or the common
 * name of such an object).
 *
 * When a request structure is initialized through a discovery object, it
 * automatically sets several of the fields, knows which ones are still
 * required to be set, and which fields cannot be set. For example, if you
 * create a get log page request from a log discovery object, it will not
 * allow you to change the log page you're requesting; however, in return you
 * don't have to specify the command set interface or log identifier.
 *
 * Request objects are tied to a controller. See 'Parallelism, Thread Safety,
 * and Errors' for more information.
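 *
 * As illustrative pseudocode only (the exact prototypes live in libnvme.h and
 * the per-request source files), the flow for a discovery-initialized request
 * looks roughly like:
 *
 *	initialize via nvme_log_req_init_by_name(ctrl, "health", ...)
 *	    -> the log page ID and CSI are pre-set and may not be changed
 *	set the remaining required fields, e.g. the output buffer
 *	execute the request against the controller
 *	optionally adjust fields and execute it again
 *	free the request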
 *
 * INFORMATION SNAPSHOTS
 *
 * To get information about a namespace or controller, one has to take an
 * information snapshot. Once an information snapshot is obtained, this
 * snapshot answers all questions about the controller with a mostly
 * consistent set of point-in-time data. The main reason for this design was
 * to try and simplify where errors can occur and to provide a straightforward
 * serialization point so that the raw underlying data could be gathered on
 * one system and then interpreted later on another.
 *
 * The only fallible operations on a snapshot are those that ask for things
 * that are not guaranteed to exist for all NVMe controllers.
 *
 * LIBRARY, CONTROLLER, NAMESPACE and SNAPSHOT HANDLES
 *
 * The last major set of types used in this library are opaque handles. As you
 * might have guessed given the request structures, all of the objects which
 * represent something are opaque. Each library handle is independent of one
 * another and each controller handle is independent of one another. In
 * general, it is expected that only a single controller handle is used at a
 * given time for a given library handle, but this is not currently enforced.
 * Error information and parallelism are tied into this; see 'Parallelism,
 * Thread Safety, and Errors' for more information.
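 *
 * As a minimal sketch of handle lifetimes (error handling elided), opening
 * and closing a controller by instance number looks like:
 *
 *	nvme_t *nvme = nvme_init();
 *	nvme_ctrl_t *ctrl;
 *
 *	if (nvme != NULL &&
 *	    nvme_ctrl_init_by_instance(nvme, 0, &ctrl)) {
 *		... operate on ctrl ...
 *		nvme_ctrl_fini(ctrl);
 *	}
 *	nvme_fini(nvme);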
 *
 * -----------------
 * Opaque Structures
 * -----------------
 *
 * One of the things that might stand out in libnvme is the use of opaque
 * structures everywhere with functions to access every arbitrary piece of
 * data. This and the function pattern around building up a request were done
 * to try and deal with the evolutionary nature of the NVMe specification. If
 * you look at the various requests, with the exception of firmware download,
 * almost every request has added additional features through the spec
 * revisions. NVMe 2.0 changed most things again with the requirement to
 * specify the command set interface.
 *
 * While the way that the NVMe specification has done this is quite
 * reasonable, it makes it much more difficult to use a traditional series of
 * arguments to functions or a structure without having to try to version the
 * symbol through clever games. If instead we accept that the specification
 * will change and that the specification is always taking these additional
 * arguments out of values that must be zero, then an opaque request structure
 * where you have to make an explicit function call and recompile to get
 * slightly different behavior is mostly reasonable. We may not be able to be
 * perfect given we're at the mercy of the specification, but at least this is
 * better than the alternative.
 *
 * This is ultimately why all the request structures are opaque and use a
 * pseudo-builder pattern to fill out the request information. Further
 * evidence to this point is that there was no way to avoid changing every
 * kernel structure here while retaining semantic operations. No one wants to
 * manually assemble cdw12-15 here. That's not how we can add value for the
 * library.
 *
 * Similarly, for all discovery objects we ended up utilizing opaque objects.
 * The main reason here is that we want to be able to embed this library as a
 * committed interface in other languages and having the discovery structures
 * be something that everyone can see means it'll be harder to extend it.
 * While this concern is somewhat more theoretical given the iterator pattern,
 * given the other bits in the request structure we decided to lean into the
 * opaqueness.
 *
 * --------------------------------------
 * Parallelism, Thread Safety, and Errors
 * --------------------------------------
 *
 * One of the library's major design points is how we achieve thread safety,
 * how ownership works, where errors appear, and what degree of parallelism is
 * achievable. To work through this we look at a few different things:
 *
 * 1. The degree to which the hardware allows for parallelism
 * 2. The degree to which users might desire parallelism
 * 3. The ergonomics of getting and storing errors
 *
 * The NVMe specification allows for different degrees of admin command
 * parallelism on a per-command basis. This is discoverable, but the main
 * point is that there are a class of commands where only one can be
 * outstanding at a time, which likely fall into the case of most of the
 * destructive commands like Format NVM, Activate Firmware, etc. Our
 * expectation to some extent is that most admin queue commands don't need to
 * be issued in parallel; however, beyond how we structure the library and
 * error handling, we don't try to enforce that here. The kernel does do some
 * enforcement through requiring mandatory write locks to perform some
 * operations.
 *
 * When we get to how folks want to use this, during the initial design phase
 * we mostly theorized based on how nvmeadm is using it today and how various
 * daemons like a FRU monitor or an appliance kit's software might want to
 * interact with it. Our general starting assumption is that it's very
 * reasonable for each discovered controller to be handled in parallel, but
 * that operations on a controller itself are likely serial given that we're
 * not issuing I/O through this mechanism. If we were, then that'd be an
 * entirely different set of constraints.
 *
 * To discuss the perceived ergonomics, we need to first discuss what error
 * information we want to be able to have. It's an important goal of both the
 * NVMe driver and this library to give useful semantic errors. In particular,
 * for any operation we want to make sure that we include the following
 * information:
 *
 *   o A hopefully distinguishable semantic error
 *   o Saving errno as a system error if relevant (e.g. if open(2) failed)
 *   o A message for humans that gives more specifics about what happened and
 *     is intended to be passed along to the output of a command or another
 *     error message.
 *   o If a controller error occurs, we want to be able to provide the
 *     controller's sc (status code) and sct (status code type).
 *
 * With this we get to the questions around ergonomics, which are entirely
 * subjective: given that we want to capture that information, how do we best
 * do so with the tooling that we have? When the library was first being
 * prototyped all errors were on the nvme_t, basically the top-level handle.
 * This meant that each operation on a controller had to be done serially or
 * you would have to use different handles. However, the simplicity was that
 * there was one thing to check.
 *
 * This evolution changed slightly when we introduced information snapshots.
 * Because the information snapshots are meant to be separate entities whose
 * lifetime can extend beyond the nvme_t library handle, they ended up
 * developing their own error codes and functions. This has been okay because
 * there aren't too many use cases there, though the need to duplicate error
 * handling functions is a bit painful.
 *
 * From there, we did consider what if each request had its own error
 * information that could be extracted. That would turn into a lot of
 * functions to get at that data. The controller's allowed parallelism for
 * admin commands varies based on each command. Some commands must occur when
 * there are no other admin commands on the controller and others when there
 * is nothing on the namespace. However, due to that nuance, it would lead to
 * forcing the consumer to understand the controller's specifics more than is
 * often necessary for a given request. To add to that, it'd also just be a
 * pain to try to get all the error information out in a different way, and
 * the consumers we started writing in this fashion were not looking good.
 *
 * We also considered whether we could consolidate all the error functions on
 * each request into one structure that we get, but that didn't move the
 * needle too much. It also raised some more concerns around how we minimize
 * races and how data changes around that.
 *
 * So all of this led us to our current compromise position: we allow for
 * parallelism at the controller level. More specifically:
 *
 * 1. Operations which take the nvme_t handle set errors on it and must
 *    operate serially. That is, the nvme_t should only be used from one
 *    thread at any time, but may move between threads. Errors are set on it.
 *
 * 2. The nvme_ctrl_t has its own error information. A given nvme_ctrl_t
 *    should only be used serially; however, different ones can be used in
 *    parallel. A controller doesn't guarantee exclusivity. That requires an
 *    explicit locking operation; a sketch follows at the end of this section.
 *
 * 3. Both request structures and namespaces place their errors on the
 *    corresponding controller that they were created from. Therefore the
 *    per-controller serialization in (2) applies here as well. If two
 *    requests are tied to different controllers, they can proceed in
 *    parallel.
 *
 * 4. Once a controller or namespace snapshot is obtained, they fall into a
 *    similar pattern: different snapshots can be used in parallel, but a
 *    given snapshot should only be used serially.
 *
 * Other than the constraints defined above, the library does not care which
 * thread an operation occurs on; handles can be moved wherever they need to
 * be. Locking and related in the kernel is based on the open file descriptor
 * to the controller.
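 *
 * As a minimal sketch of explicit locking (error handling elided), taking and
 * dropping an exclusive controller write lock looks like:
 *
 *	if (!nvme_ctrl_lock(ctrl, NVME_LOCK_L_WRITE,
 *	    NVME_LOCK_F_DONT_BLOCK)) {
 *		... the lock is contended or an error occurred ...
 *	}
 *	... perform destructive operations ...
 *	nvme_ctrl_unlock(ctrl);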
 *
 * ----------------
 * Field Validation
 * ----------------
 *
 * Every request is made up of fields that correspond to parts of the NVMe
 * specification. Our requests operate in terms of the logical fields that we
 * opt to expose and that the kernel knows how to consume. In general, we
 * don't expose the raw cdw values that make up the commands (except for the
 * vendor unique commands or arguments that are explicitly that way ala get
 * features). While operating on raw cdw arguments would be a simple way to
 * create ABI stability, it would leave everyone having to break up all the
 * fields themselves and, we believe, end up somewhat more error prone than
 * the interfaces we expose today.
 *
 * Requests are created in one of two ways today: they are either initialized
 * from corresponding discovery data, e.g. nvme_log_req_init_by_disc() and
 * nvme_get_feat_req_init_by_name(), or one creates a raw request ala
 * nvme_get_feat_req_init(). In the former cases, we fill out a bunch of the
 * fields that would normally need to be set, such as the log or feature ID.
 * We also will note which fields are allowed and expected. For example, the
 * health log page does not take or expect a lsp (log specific parameter) or
 * related and therefore we can flag that with an _UNUSE class error.
 * Conversely, requests that are created from their raw form will not have any
 * such error checking performed until they are finalized and checked by the
 * kernel. The set of fields that can be set in a request is usually tracked
 * in the structure with a member of the form <prefix>_allow.
 *
 * One set of library error checking that is uniform between both types is
 * that of missing fields. There are minimum fields that must be set for
 * different types of requests. That check will always be performed regardless
 * of the path that is taken through the system. Tracking which members must
 * still be set is done by a member of the form <prefix>_need.
 *
 * When we perform validation, we try to push the vast majority of it into the
 * common validation code that is shared between the kernel and userland. This
 * is wrapped up through the nvme_field_check_one() logic. The common code
 * will check if the field is supported by the controller (generating an
 * _UNSUP class error if not) and if the value of the field is within a valid
 * range (generating a _RANGE class error if not).
 *
 * While we try to fold as much of this checking into the common code as
 * possible, it isn't perfect and some things have to be checked outside of
 * it. Those consist of the following general cases:
 *
 * 1) Items that are not semantically fields in the actual command but are
 * things that we are tracking ourselves in the library. An example of this
 * would be fields in the vuc request structure that we are synthesizing
 * ourselves.
 *
 * 2) While the field logic has the specifics of what controller is being
 * operated upon, it doesn't have all the knowledge of what things can be
 * combined or not. It can answer the specifics about its field, but cannot
 * look at the broader request.
 *
 * As a result, there are some duplicated checks in the library and the
 * kernel, though several are left just to the kernel. However, the vast
 * majority of validation does happen through these common routines, which
 * leaves the library's nvme_<type>_req_set_<field> functions generally
 * wrappers around checking common code and updating our tracking around what
 * fields are set or not so we can issue an ioctl. A sketch of that wrapper
 * shape follows.
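 *
 * As illustrative pseudocode only (member and field names vary per request
 * type), a set-field wrapper has roughly this shape:
 *
 *	nvme_<type>_req_set_<field>(req, value)
 *	{
 *		check the field via nvme_field_check_one(); on failure the
 *		    error has already been set on the controller, so fail
 *		store value in the request structure
 *		clear <field> from the <prefix>_need tracking
 *		return success on the controller
 *	}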
 */

#include <stdlib.h>
#include <stdarg.h>
#include <libdevinfo.h>
#include <unistd.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <upanic.h>

#include "libnvme_impl.h"

bool
nvme_vers_ctrl_atleast(const nvme_ctrl_t *ctrl, const nvme_version_t *targ)
{
	return (nvme_vers_atleast(&ctrl->nc_vers, targ));
}

bool
nvme_vers_ctrl_info_atleast(const nvme_ctrl_info_t *ci,
    const nvme_version_t *targ)
{
	return (nvme_vers_atleast(&ci->nci_vers, targ));
}

bool
nvme_vers_ns_info_atleast(const nvme_ns_info_t *info,
    const nvme_version_t *targ)
{
	return (nvme_vers_atleast(&info->nni_vers, targ));
}

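/*
 * Note: an NGUID is only considered valid if the controller is at least NVMe
 * 1.2 and the value is non-zero; nvme_eui64_valid() below applies the same
 * shape of check against NVMe 1.1.
 */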
bool
nvme_guid_valid(const nvme_ctrl_t *ctrl, const uint8_t guid[16])
{
	const uint8_t zero_guid[16] = { 0 };

	return (nvme_vers_ctrl_atleast(ctrl, &nvme_vers_1v2) &&
	    memcmp(zero_guid, guid, sizeof (zero_guid)) != 0);
}

bool
nvme_eui64_valid(const nvme_ctrl_t *ctrl, const uint8_t eui64[8])
{
	const uint8_t zero_eui[8] = { 0 };

	return (nvme_vers_ctrl_atleast(ctrl, &nvme_vers_1v1) &&
	    memcmp(zero_eui, eui64, sizeof (zero_eui)) != 0);
}

int
nvme_format_nguid(const uint8_t nguid[16], char *buf, size_t len)
{
	return (snprintf(buf, len, "%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X"
	    "%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X",
	    nguid[0], nguid[1], nguid[2], nguid[3], nguid[4], nguid[5],
	    nguid[6], nguid[7], nguid[8], nguid[9], nguid[10], nguid[11],
	    nguid[12], nguid[13], nguid[14], nguid[15]));
}

int
nvme_format_eui64(const uint8_t eui64[8], char *buf, size_t len)
{
	return (snprintf(buf, len, "%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X",
	    eui64[0], eui64[1], eui64[2], eui64[3], eui64[4], eui64[5],
	    eui64[6], eui64[7]));
}

void
nvme_fini(nvme_t *nvme)
{
	if (nvme == NULL)
		return;

	if (nvme->nh_devinfo != DI_NODE_NIL) {
		di_fini(nvme->nh_devinfo);
	}

	free(nvme);
}

nvme_t *
nvme_init(void)
{
	nvme_t *nvme;

	nvme = calloc(1, sizeof (nvme_t));
	if (nvme == NULL) {
		return (NULL);
	}

	nvme->nh_devinfo = di_init("/", DINFOCPYALL);
	if (nvme->nh_devinfo == DI_NODE_NIL) {
		nvme_fini(nvme);
		return (NULL);
	}

	return (nvme);
}

void
nvme_ctrl_discover_fini(nvme_ctrl_iter_t *iter)
{
	free(iter);
}

nvme_iter_t
nvme_ctrl_discover_step(nvme_ctrl_iter_t *iter, const nvme_ctrl_disc_t **discp)
{
	di_minor_t m;

	*discp = NULL;
	if (iter->ni_done) {
		return (NVME_ITER_DONE);
	}

	for (;;) {
		if (iter->ni_cur == NULL) {
			iter->ni_cur = di_drv_first_node("nvme",
			    iter->ni_nvme->nh_devinfo);
		} else {
			iter->ni_cur = di_drv_next_node(iter->ni_cur);
		}

		if (iter->ni_cur == NULL) {
			iter->ni_done = true;
			return (NVME_ITER_DONE);
		}

		for (m = di_minor_next(iter->ni_cur, DI_MINOR_NIL);
		    m != DI_MINOR_NIL; m = di_minor_next(iter->ni_cur, m)) {
			if (strcmp(di_minor_nodetype(m),
			    DDI_NT_NVME_NEXUS) == 0) {
				break;
			}
		}

		if (m == DI_MINOR_NIL) {
			continue;
		}

		iter->ni_disc.ncd_devi = iter->ni_cur;
		iter->ni_disc.ncd_minor = m;
		*discp = &iter->ni_disc;
		return (NVME_ITER_VALID);
	}

	return (NVME_ITER_DONE);
}

bool
nvme_ctrl_discover_init(nvme_t *nvme, nvme_ctrl_iter_t **iterp)
{
	nvme_ctrl_iter_t *iter;

	if (iterp == NULL) {
		return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0, "encountered "
		    "invalid nvme_ctrl_iter_t output pointer: %p", iterp));
	}

	iter = calloc(1, sizeof (nvme_ctrl_iter_t));
	if (iter == NULL) {
		int e = errno;
		return (nvme_error(nvme, NVME_ERR_NO_MEM, e, "failed to "
		    "allocate memory for a new nvme_ctrl_iter_t: %s",
		    strerror(e)));
	}
	iter->ni_nvme = nvme;
	*iterp = iter;
	return (nvme_success(nvme));
}

bool
nvme_ctrl_discover(nvme_t *nvme, nvme_ctrl_disc_f func, void *arg)
{
	nvme_ctrl_iter_t *iter;
	const nvme_ctrl_disc_t *disc;
	nvme_iter_t ret;

	if (func == NULL) {
		return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0, "encountered "
		    "invalid nvme_ctrl_disc_f function pointer: %p", func));
	}

	if (!nvme_ctrl_discover_init(nvme, &iter)) {
		return (false);
	}

	while ((ret = nvme_ctrl_discover_step(iter, &disc)) ==
	    NVME_ITER_VALID) {
		if (!func(nvme, disc, arg))
			break;
	}

	nvme_ctrl_discover_fini(iter);
	if (ret == NVME_ITER_ERROR) {
		return (false);
	}

	return (nvme_success(nvme));
}

di_node_t
nvme_ctrl_disc_devi(const nvme_ctrl_disc_t *discp)
{
	return (discp->ncd_devi);
}

di_minor_t
nvme_ctrl_disc_minor(const nvme_ctrl_disc_t *discp)
{
	return (discp->ncd_minor);
}

void
nvme_ctrl_fini(nvme_ctrl_t *ctrl)
{
	if (ctrl == NULL) {
		return;
	}

	if (ctrl->nc_devi_path != NULL) {
		di_devfs_path_free(ctrl->nc_devi_path);
	}

	if (ctrl->nc_fd >= 0) {
		(void) close(ctrl->nc_fd);
		ctrl->nc_fd = -1;
	}

	free(ctrl);
}

bool
nvme_ctrl_init(nvme_t *nvme, di_node_t di, nvme_ctrl_t **outp)
{
	const char *drv;
	int32_t inst;
	di_minor_t minor;
	char *path, buf[PATH_MAX];
	nvme_ctrl_t *ctrl;
	nvme_ioctl_ctrl_info_t ctrl_info;

	if (di == DI_NODE_NIL) {
		return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0, "encountered "
		    "invalid di_node_t: %p", di));
	}

	if (outp == NULL) {
		return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0, "encountered "
		    "invalid nvme_ctrl_t output pointer: %p", outp));
	}
	*outp = NULL;

	drv = di_driver_name(di);
	inst = di_instance(di);
	if (drv == NULL || inst < 0) {
		return (nvme_error(nvme, NVME_ERR_BAD_DEVI, 0, "devi %s has "
		    "no driver attached", di_node_name(di)));
	}

	if (strcmp(drv, "nvme") != 0) {
		return (nvme_error(nvme, NVME_ERR_BAD_DEVI, 0, "devi %s isn't "
		    "attached to nvme, found %s", di_node_name(di), drv));
	}

	/*
	 * We have an NVMe node. Find the right minor that corresponds to the
	 * attachment point. Once we find it, we can go ahead and open a path
	 * to it and construct the device.
	 */
	minor = DI_MINOR_NIL;
	while ((minor = di_minor_next(di, minor)) != DI_MINOR_NIL) {
		if (strcmp(di_minor_nodetype(minor), DDI_NT_NVME_NEXUS) == 0) {
			break;
		}
	}

	if (minor == DI_MINOR_NIL) {
		return (nvme_error(nvme, NVME_ERR_BAD_DEVI, 0, "devi %s isn't "
		    "attached to nvme, found %s", di_node_name(di), drv));
	}

	path = di_devfs_minor_path(minor);
	if (path == NULL) {
		int e = errno;
		return (nvme_error(nvme, NVME_ERR_LIBDEVINFO, e, "failed to "
		    "obtain /devices path for the requested minor: %s",
		    strerror(e)));
	}

	if (snprintf(buf, sizeof (buf), "/devices%s", path) >= sizeof (buf)) {
		di_devfs_path_free(path);
		return (nvme_error(nvme, NVME_ERR_INTERNAL, 0, "failed to "
		    "construct full /devices minor path, would have overflown "
		    "internal buffer"));
	}
	di_devfs_path_free(path);

	ctrl = calloc(1, sizeof (*ctrl));
	if (ctrl == NULL) {
		int e = errno;
		return (nvme_error(nvme, NVME_ERR_NO_MEM, e, "failed to "
		    "allocate memory for a new nvme_ctrl_t: %s", strerror(e)));
	}

	ctrl->nc_nvme = nvme;
	ctrl->nc_devi = di;
	ctrl->nc_minor = minor;
	ctrl->nc_inst = inst;
	ctrl->nc_fd = open(buf, O_RDWR | O_CLOEXEC);
	if (ctrl->nc_fd < 0) {
		int e = errno;
		nvme_ctrl_fini(ctrl);
		return (nvme_error(nvme, NVME_ERR_OPEN_DEV, e, "failed to open "
		    "device path %s: %s", buf, strerror(e)));
	}

	ctrl->nc_devi_path = di_devfs_path(di);
	if (ctrl->nc_devi_path == NULL) {
		int e = errno;
		nvme_ctrl_fini(ctrl);
		return (nvme_error(nvme, NVME_ERR_LIBDEVINFO, e, "failed to "
		    "obtain /devices path for the controller: %s",
		    strerror(e)));
	}

	if (!nvme_ioc_ctrl_info(ctrl, &ctrl_info)) {
		nvme_err_data_t err;

		nvme_ctrl_err_save(ctrl, &err);
		nvme_err_set(nvme, &err);
		nvme_ctrl_fini(ctrl);
		return (false);
	}

	ctrl->nc_vers = ctrl_info.nci_vers;
	ctrl->nc_info = ctrl_info.nci_ctrl_id;

	nvme_vendor_map_ctrl(ctrl);

	*outp = ctrl;
	return (nvme_success(nvme));
}

typedef struct {
	bool ncia_found;
	int32_t ncia_inst;
	nvme_ctrl_t *ncia_ctrl;
	nvme_err_data_t ncia_err;
} nvme_ctrl_init_arg_t;

bool
nvme_ctrl_init_by_instance_cb(nvme_t *nvme, const nvme_ctrl_disc_t *disc,
    void *arg)
{
	nvme_ctrl_init_arg_t *init = arg;

	if (di_instance(disc->ncd_devi) != init->ncia_inst) {
		return (true);
	}

	/*
	 * If we fail to open the controller, we need to save the error
	 * information because it will end up being clobbered: this is a
	 * callback function surrounded by other libnvme callers.
	 */
	init->ncia_found = true;
	if (!nvme_ctrl_init(nvme, disc->ncd_devi, &init->ncia_ctrl)) {
		nvme_err_save(nvme, &init->ncia_err);
	}

	return (false);
}

bool
nvme_ctrl_init_by_instance(nvme_t *nvme, int32_t inst, nvme_ctrl_t **outp)
{
	nvme_ctrl_init_arg_t init;

	if (inst < 0) {
		return (nvme_error(nvme, NVME_ERR_ILLEGAL_INSTANCE, 0,
		    "encountered illegal negative instance number: %d", inst));
	}

	if (outp == NULL) {
		return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0, "encountered "
		    "invalid nvme_ctrl_t output pointer: %p", outp));
	}

	init.ncia_found = false;
	init.ncia_inst = inst;
	init.ncia_ctrl = NULL;

	if (!nvme_ctrl_discover(nvme, nvme_ctrl_init_by_instance_cb, &init)) {
		return (false);
	}

	if (!init.ncia_found) {
		return (nvme_error(nvme, NVME_ERR_BAD_CONTROLLER, 0,
		    "failed to find NVMe controller nvme%d", inst));
	}

	/*
	 * If we don't have an NVMe controller structure but we did find the
	 * instance, then we must have had an error constructing it, which
	 * will be on our handle. We have to reconstruct the error from saved
	 * information as nvme_ctrl_discover will have clobbered it.
	 */
	if (init.ncia_ctrl == NULL) {
		nvme_err_set(nvme, &init.ncia_err);
		return (false);
	}

	*outp = init.ncia_ctrl;
	return (nvme_success(nvme));
}

bool
nvme_ctrl_devi(nvme_ctrl_t *ctrl, di_node_t *devip)
{
	*devip = ctrl->nc_devi;
	return (nvme_ctrl_success(ctrl));
}

bool
nvme_ioc_ctrl_info(nvme_ctrl_t *ctrl, nvme_ioctl_ctrl_info_t *info)
{
	(void) memset(info, 0, sizeof (nvme_ioctl_ctrl_info_t));

	if (ioctl(ctrl->nc_fd, NVME_IOC_CTRL_INFO, info) != 0) {
		int e = errno;
		return (nvme_ioctl_syserror(ctrl, e, "controller info"));
	}

	if (info->nci_common.nioc_drv_err != NVME_IOCTL_E_OK) {
		return (nvme_ioctl_error(ctrl, &info->nci_common,
		    "controller info"));
	}

	return (true);
}

bool
nvme_ioc_ns_info(nvme_ctrl_t *ctrl, uint32_t nsid, nvme_ioctl_ns_info_t *info)
{
	(void) memset(info, 0, sizeof (nvme_ioctl_ns_info_t));
	info->nni_common.nioc_nsid = nsid;

	if (ioctl(ctrl->nc_fd, NVME_IOC_NS_INFO, info) != 0) {
		int e = errno;
		return (nvme_ioctl_syserror(ctrl, e, "namespace info"));
	}

	if (info->nni_common.nioc_drv_err != NVME_IOCTL_E_OK) {
		return (nvme_ioctl_error(ctrl, &info->nni_common,
		    "namespace info"));
	}

	return (true);
}

const char *
nvme_tporttostr(nvme_ctrl_transport_t tport)
{
	switch (tport) {
	case NVME_CTRL_TRANSPORT_PCI:
		return ("PCI");
	case NVME_CTRL_TRANSPORT_TCP:
		return ("TCP");
	case NVME_CTRL_TRANSPORT_RDMA:
		return ("RDMA");
	default:
		return ("unknown transport");
	}
}

static bool
nvme_ns_discover_validate(nvme_ctrl_t *ctrl, nvme_ns_disc_level_t level)
{
	switch (level) {
	case NVME_NS_DISC_F_ALL:
	case NVME_NS_DISC_F_ALLOCATED:
	case NVME_NS_DISC_F_ACTIVE:
	case NVME_NS_DISC_F_NOT_IGNORED:
	case NVME_NS_DISC_F_BLKDEV:
		return (true);
	default:
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_FLAG, 0, "invalid "
		    "namespace discovery level specified: 0x%x", level));
	}
}

void
nvme_ns_discover_fini(nvme_ns_iter_t *iter)
{
	free(iter);
}

const char *
nvme_nsleveltostr(nvme_ns_disc_level_t level)
{
	switch (level) {
	case NVME_NS_DISC_F_ALL:
		return ("unallocated");
	case NVME_NS_DISC_F_ALLOCATED:
		return ("allocated");
	case NVME_NS_DISC_F_ACTIVE:
		return ("active");
	case NVME_NS_DISC_F_NOT_IGNORED:
		return ("not ignored");
	case NVME_NS_DISC_F_BLKDEV:
		return ("blkdev");
	default:
		return ("unknown level");
	}
}

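/*
 * Map the kernel's namespace state bits to the most specific discovery level
 * that the namespace satisfies: unallocated -> allocated -> active -> not
 * ignored -> blkdev (attached).
 */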
nvme_ns_disc_level_t
nvme_ns_state_to_disc_level(nvme_ns_state_t state)
{
	if ((state & NVME_NS_STATE_ALLOCATED) == 0) {
		return (NVME_NS_DISC_F_ALL);
	}

	if ((state & NVME_NS_STATE_ACTIVE) == 0) {
		return (NVME_NS_DISC_F_ALLOCATED);
	}

	if ((state & NVME_NS_STATE_IGNORED) != 0) {
		return (NVME_NS_DISC_F_ACTIVE);
	}

	if ((state & NVME_NS_STATE_ATTACHED) == 0) {
		return (NVME_NS_DISC_F_NOT_IGNORED);
	} else {
		return (NVME_NS_DISC_F_BLKDEV);
	}
}

nvme_iter_t
nvme_ns_discover_step(nvme_ns_iter_t *iter, const nvme_ns_disc_t **discp)
{
	nvme_ctrl_t *ctrl = iter->nni_ctrl;

	if (iter->nni_err) {
		return (NVME_ITER_ERROR);
	}

	if (iter->nni_done) {
		return (NVME_ITER_DONE);
	}

	while (iter->nni_cur_idx <= ctrl->nc_info.id_nn) {
		uint32_t nsid = iter->nni_cur_idx;
		nvme_ioctl_ns_info_t ns_info = { 0 };
		nvme_ns_disc_level_t level;

		if (!nvme_ioc_ns_info(ctrl, nsid, &ns_info)) {
			iter->nni_err = true;
			return (NVME_ITER_ERROR);
		}

		iter->nni_cur_idx++;
		level = nvme_ns_state_to_disc_level(ns_info.nni_state);
		if (iter->nni_level > level) {
			continue;
		}

		(void) memset(&iter->nni_disc, 0, sizeof (nvme_ns_disc_t));
		iter->nni_disc.nnd_nsid = nsid;
		iter->nni_disc.nnd_level = level;

		if (nvme_guid_valid(ctrl, ns_info.nni_id.id_nguid)) {
			iter->nni_disc.nnd_flags |= NVME_NS_DISC_F_NGUID_VALID;
			(void) memcpy(iter->nni_disc.nnd_nguid,
			    ns_info.nni_id.id_nguid,
			    sizeof (ns_info.nni_id.id_nguid));
		}

		if (nvme_eui64_valid(ctrl, ns_info.nni_id.id_eui64)) {
			iter->nni_disc.nnd_flags |= NVME_NS_DISC_F_EUI64_VALID;
			(void) memcpy(iter->nni_disc.nnd_eui64,
			    ns_info.nni_id.id_eui64,
			    sizeof (ns_info.nni_id.id_eui64));
		}

		*discp = &iter->nni_disc;
		return (NVME_ITER_VALID);
	}

	iter->nni_done = true;
	return (NVME_ITER_DONE);
}

bool
nvme_ns_discover_init(nvme_ctrl_t *ctrl, nvme_ns_disc_level_t level,
    nvme_ns_iter_t **iterp)
{
	nvme_ns_iter_t *iter;

	if (!nvme_ns_discover_validate(ctrl, level)) {
		return (false);
	}

	if (iterp == NULL) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_PTR, 0,
		    "encountered invalid nvme_ns_iter_t output pointer: %p",
		    iterp));
	}

	iter = calloc(1, sizeof (nvme_ns_iter_t));
	if (iter == NULL) {
		int e = errno;
		return (nvme_ctrl_error(ctrl, NVME_ERR_NO_MEM, e, "failed to "
		    "allocate memory for a new nvme_ns_iter_t: %s",
		    strerror(e)));
	}

	iter->nni_ctrl = ctrl;
	iter->nni_level = level;
	iter->nni_cur_idx = 1;

	*iterp = iter;
	return (nvme_ctrl_success(ctrl));
}

bool
nvme_ns_discover(nvme_ctrl_t *ctrl, nvme_ns_disc_level_t level,
    nvme_ns_disc_f func, void *arg)
{
	nvme_ns_iter_t *iter;
	nvme_iter_t ret;
	const nvme_ns_disc_t *disc;

	if (!nvme_ns_discover_validate(ctrl, level)) {
		return (false);
	}

	if (func == NULL) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_PTR, 0,
		    "encountered invalid nvme_ns_disc_f function pointer: %p",
		    func));
	}

	if (!nvme_ns_discover_init(ctrl, level, &iter)) {
		return (false);
	}

	while ((ret = nvme_ns_discover_step(iter, &disc)) == NVME_ITER_VALID) {
		if (!func(ctrl, disc, arg))
			break;
	}

	nvme_ns_discover_fini(iter);
	if (ret == NVME_ITER_ERROR) {
		return (false);
	}

	return (nvme_ctrl_success(ctrl));
}

uint32_t
nvme_ns_disc_nsid(const nvme_ns_disc_t *discp)
{
	return (discp->nnd_nsid);
}

nvme_ns_disc_level_t
nvme_ns_disc_level(const nvme_ns_disc_t *discp)
{
	return (discp->nnd_level);
}

nvme_ns_disc_flags_t
nvme_ns_disc_flags(const nvme_ns_disc_t *discp)
{
	return (discp->nnd_flags);
}

const uint8_t *
nvme_ns_disc_eui64(const nvme_ns_disc_t *discp)
{
	if ((discp->nnd_flags & NVME_NS_DISC_F_EUI64_VALID) == 0) {
		return (NULL);
	}

	return (discp->nnd_eui64);
}

const uint8_t *
nvme_ns_disc_nguid(const nvme_ns_disc_t *discp)
{
	if ((discp->nnd_flags & NVME_NS_DISC_F_NGUID_VALID) == 0) {
		return (NULL);
	}

	return (discp->nnd_nguid);
}

void
nvme_ns_fini(nvme_ns_t *ns)
{
	free(ns);
}

bool
nvme_ns_init(nvme_ctrl_t *ctrl, uint32_t nsid, nvme_ns_t **nsp)
{
	nvme_ns_t *ns;

	if (nsp == NULL) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_PTR, 0,
		    "encountered invalid nvme_ns_t output pointer: %p", nsp));
	}

	if (nsid < NVME_NSID_MIN || nsid > ctrl->nc_info.id_nn) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_NS_RANGE, 0, "requested "
		    "namespace 0x%x is invalid, valid namespaces are [0x%x, "
		    "0x%x]", nsid, NVME_NSID_MIN, ctrl->nc_info.id_nn));
	}

	ns = calloc(1, sizeof (nvme_ns_t));
	if (ns == NULL) {
		int e = errno;
		return (nvme_ctrl_error(ctrl, NVME_ERR_NO_MEM, e, "failed to "
		    "allocate memory for a new nvme_ns_t: %s", strerror(e)));
	}

	ns->nn_ctrl = ctrl;
	ns->nn_nsid = nsid;

	*nsp = ns;
	return (nvme_ctrl_success(ctrl));
}

typedef struct {
	nvme_ctrl_t *nnia_ctrl;
	const char *nnia_name;
	bool nnia_found;
	nvme_ns_t *nnia_ns;
	nvme_err_data_t nnia_err;
} nvme_ns_init_arg_t;

static bool
nvme_ns_init_by_name_cb(nvme_ctrl_t *ctrl, const nvme_ns_disc_t *disc,
    void *arg)
{
	nvme_ns_init_arg_t *init = arg;
	char buf[NVME_NGUID_NAMELEN];
	CTASSERT(NVME_NGUID_NAMELEN > NVME_EUI64_NAMELEN);

	if ((disc->nnd_flags & NVME_NS_DISC_F_NGUID_VALID) != 0) {
		(void) nvme_format_nguid(disc->nnd_nguid, buf, sizeof (buf));
		if (strcasecmp(init->nnia_name, buf) == 0)
			goto match;
	}

	if ((disc->nnd_flags & NVME_NS_DISC_F_EUI64_VALID) != 0) {
		(void) nvme_format_eui64(disc->nnd_eui64, buf, sizeof (buf));
		if (strcasecmp(init->nnia_name, buf) == 0)
			goto match;
	}

	(void) snprintf(buf, sizeof (buf), "%u", disc->nnd_nsid);
	if (strcasecmp(init->nnia_name, buf) == 0)
		goto match;

	return (true);

match:
	init->nnia_found = true;
	if (!nvme_ns_init(ctrl, disc->nnd_nsid, &init->nnia_ns)) {
		nvme_ctrl_err_save(ctrl, &init->nnia_err);
	}

	return (false);
}

/*
 * Attempt to find a namespace by 'name'. A name could be the NGUID, EUI64, or
 * just the plain old namespace ID.
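 *
 * For example, namespace 1 on a controller that reports both IDs could be
 * matched (case-insensitively) as "1", as its 16 hex digit EUI64, or as its
 * 32 hex digit NGUID.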
 */
bool
nvme_ns_init_by_name(nvme_ctrl_t *ctrl, const char *ns_name, nvme_ns_t **nsp)
{
	nvme_ns_init_arg_t init;

	if (ns_name == NULL) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_PTR, 0,
		    "encountered invalid namespace name: %p", ns_name));
	}

	if (nsp == NULL) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_PTR, 0,
		    "encountered invalid nvme_ns_t output pointer: %p", nsp));
	}

	init.nnia_ctrl = ctrl;
	init.nnia_name = ns_name;
	init.nnia_found = false;
	init.nnia_ns = NULL;

	if (!nvme_ns_discover(ctrl, NVME_NS_DISC_F_ALL, nvme_ns_init_by_name_cb,
	    &init)) {
		return (false);
	}

	if (!init.nnia_found) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_NS_RANGE, 0, "failed to "
		    "find NVMe namespace %s on nvme%d", ns_name,
		    ctrl->nc_inst));
	}

	if (init.nnia_ns == NULL) {
		nvme_ctrl_err_set(ctrl, &init.nnia_err);
		return (false);
	}

	*nsp = init.nnia_ns;
	return (nvme_ctrl_success(ctrl));
}

bool
nvme_ctrl_ns_init(nvme_t *nvme, const char *name, nvme_ctrl_t **ctrlp,
    nvme_ns_t **nsp)
{
	const char *slash, *ns_name;
	char *eptr;
	nvme_ctrl_t *ctrl;
	nvme_ns_t *ns;
	unsigned long inst;
	size_t ctrl_namelen;

	if (name == NULL) {
		return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0, "encountered "
		    "invalid name to search for: %p", name));
	}

	/*
	 * We require a controller, but the namespace output pointer is only
	 * required if we end up having a namespace present.
	 */
	if (ctrlp == NULL) {
		return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0, "encountered "
		    "invalid nvme_ctrl_t output pointer: %p", ctrlp));
	}

	slash = strchr(name, '/');
	if (slash != NULL) {
		ctrl_namelen = (uintptr_t)slash - (uintptr_t)name;
		ns_name = slash + 1;

		if (nsp == NULL) {
			return (nvme_error(nvme, NVME_ERR_BAD_PTR, 0,
			    "encountered invalid nvme_ns_t output pointer: %p",
			    nsp));
		}

	} else {
		ctrl_namelen = strlen(name);
		ns_name = NULL;
	}

	*ctrlp = NULL;
	if (nsp != NULL) {
		*nsp = NULL;
	}

	if (strncmp(name, "nvme", 4) != 0) {
		return (nvme_error(nvme, NVME_ERR_BAD_CONTROLLER, 0, "unable "
		    "to map controller '%.*s' to a known device class, "
		    "expected the controller to start with 'nvme'",
		    (int)ctrl_namelen, name));
	}

	/*
	 * Before we go ahead and try to parse this with strtoul we need to
	 * manually check two things that strtoul will not:
	 *
	 * 1) If the instance portion is empty (just a null terminator), then
	 * we'll just get a 0 back.
	 * 2) If there are multiple leading zeros in a row then that's an
	 * error. We don't want to conflate 001 and 1 as the same here. The
	 * only valid case is 'nvme0' which is 5 characters long, hence the
	 * check below.
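	 *
	 * For example, "nvme0" and "nvme12/1" parse successfully, while
	 * "nvme", "nvme007", and "nvme1x" are all rejected.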
	 */
	if (ctrl_namelen == 4) {
		return (nvme_error(nvme, NVME_ERR_BAD_CONTROLLER, 0,
		    "no controller instance specified in %.*s",
		    (int)ctrl_namelen, name));
	}

	if (name[4] == '0' && ctrl_namelen > 5) {
		return (nvme_error(nvme, NVME_ERR_BAD_CONTROLLER, 0,
		    "leading zeros aren't allowed for the instance specified "
		    "in %.*s", (int)ctrl_namelen, name));
	}

	errno = 0;
	inst = strtoul(name + 4, &eptr, 10);
	if (errno != 0 || (*eptr != '\0' && eptr != slash)) {
		return (nvme_error(nvme, NVME_ERR_BAD_CONTROLLER, 0,
		    "failed to parse controller instance from %.*s",
		    (int)ctrl_namelen, name));
	}

	if (inst > INT32_MAX) {
		return (nvme_error(nvme, NVME_ERR_ILLEGAL_INSTANCE, 0,
		    "parsed controller instance %lu is outside the valid "
		    "range [0, %d]", inst, INT32_MAX));
	}

	if (!nvme_ctrl_init_by_instance(nvme, (int32_t)inst, &ctrl)) {
		return (false);
	}

	if (ns_name == NULL) {
		*ctrlp = ctrl;
		return (nvme_success(nvme));
	}

	if (!nvme_ns_init_by_name(ctrl, ns_name, &ns)) {
		nvme_err_data_t err;

		nvme_ctrl_err_save(ctrl, &err);
		nvme_err_set(nvme, &err);
		nvme_ctrl_fini(ctrl);
		return (false);
	}

	*ctrlp = ctrl;
	*nsp = ns;

	return (nvme_success(nvme));
}

bool
nvme_ns_bd_attach(nvme_ns_t *ns)
{
	nvme_ctrl_t *ctrl = ns->nn_ctrl;
	nvme_ioctl_common_t com;

	(void) memset(&com, 0, sizeof (com));
	com.nioc_nsid = ns->nn_nsid;

	if (ioctl(ns->nn_ctrl->nc_fd, NVME_IOC_ATTACH, &com) != 0) {
		int e = errno;
		return (nvme_ioctl_syserror(ctrl, e, "namespace attach"));
	}

	if (com.nioc_drv_err != NVME_IOCTL_E_OK) {
		return (nvme_ioctl_error(ctrl, &com, "namespace attach"));
	}

	return (nvme_ctrl_success(ctrl));
}

bool
nvme_ns_bd_detach(nvme_ns_t *ns)
{
	nvme_ctrl_t *ctrl = ns->nn_ctrl;
	nvme_ioctl_common_t com;

	(void) memset(&com, 0, sizeof (com));
	com.nioc_nsid = ns->nn_nsid;

	if (ioctl(ns->nn_ctrl->nc_fd, NVME_IOC_DETACH, &com) != 0) {
		int e = errno;
		return (nvme_ioctl_syserror(ctrl, e, "namespace detach"));
	}

	if (com.nioc_drv_err != NVME_IOCTL_E_OK) {
		return (nvme_ioctl_error(ctrl, &com, "namespace detach"));
	}

	return (nvme_ctrl_success(ctrl));
}

/*
 * Check for a lock programming error and upanic() if so.
 */
static void
nvme_lock_check(nvme_ctrl_t *ctrl)
{
	char msg[1024];
	int ret;
	const char *up;
	size_t ulen;
	const char *base = "fatal libnvme locking error detected";

	if (ctrl->nc_err.ne_err != NVME_ERR_LOCK_PROG) {
		return;
	}

	ret = snprintf(msg, sizeof (msg), "%s: %s (controller %p)", base,
	    ctrl->nc_err.ne_errmsg, ctrl);
	if (ret >= sizeof (msg)) {
		ulen = sizeof (msg);
		up = msg;
	} else if (ret <= 0) {
		ulen = strlen(base) + 1;
		up = base;
	} else {
		ulen = (size_t)ret + 1;
		up = msg;
	}

	upanic(up, ulen);
}

static bool
nvme_lock_common(nvme_ctrl_t *ctrl, uint32_t nsid, nvme_lock_level_t level,
    nvme_lock_flags_t flags)
{
	nvme_ioctl_lock_t lock;
	const nvme_lock_flags_t all_flags = NVME_LOCK_F_DONT_BLOCK;

	if (level != NVME_LOCK_L_READ && level != NVME_LOCK_L_WRITE) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_FLAG, 0, "unknown "
		    "lock level: 0x%x", level));
	}

	if ((flags & ~all_flags) != 0) {
		return (nvme_ctrl_error(ctrl, NVME_ERR_BAD_FLAG, 0, "unknown "
		    "lock flags: 0x%x", flags & ~all_flags));
	}

	(void) memset(&lock, 0, sizeof (lock));
	lock.nil_common.nioc_nsid = nsid;
	if (nsid != 0) {
		lock.nil_ent = NVME_LOCK_E_NS;
	} else {
		lock.nil_ent = NVME_LOCK_E_CTRL;
	}
	lock.nil_level = level;
	lock.nil_flags = flags;

	if (ioctl(ctrl->nc_fd, NVME_IOC_LOCK, &lock) != 0) {
		int e = errno;
		return (nvme_ioctl_syserror(ctrl, e, "lock"));
	}

	if (lock.nil_common.nioc_drv_err != NVME_IOCTL_E_OK) {
		(void) nvme_ioctl_error(ctrl, &lock.nil_common, "lock");
		nvme_lock_check(ctrl);
		return (false);
	}

	return (nvme_ctrl_success(ctrl));
}

/*
 * You may reasonably be wondering why this returns void and why we basically
 * panic everywhere. The reality is twofold. The first part of this is that we
 * know from experience in libc that error-checking mutexes are not the most
 * common and the simplicity of the kernel's mutex_enter() and mutex_exit() is
 * really a boon. The second piece here is that, given the way the ioctl path
 * works, only programming errors or mischief in the library could cause this
 * to fail at the raw ioctl / errno level. That is, EBADF/EFAULT, etc. are our
 * fault, and if you cannot unlock because of that you're not going to get
 * much further.
 */
void
nvme_unlock_common(nvme_ctrl_t *ctrl, uint32_t nsid)
{
	nvme_ioctl_unlock_t unlock;

	(void) memset(&unlock, 0, sizeof (unlock));
	unlock.niu_common.nioc_nsid = nsid;
	if (nsid != 0) {
		unlock.niu_ent = NVME_LOCK_E_NS;
	} else {
		unlock.niu_ent = NVME_LOCK_E_CTRL;
	}

	/*
	 * Because all unlock ioctl errors are promoted to a fatal lock
	 * programming error, we don't bother calling nvme_ioctl_syserror()
	 * here.
	 */
	if (ioctl(ctrl->nc_fd, NVME_IOC_UNLOCK, &unlock) != 0) {
		int e = errno;
		(void) nvme_ctrl_error(ctrl, NVME_ERR_LOCK_PROG, e, "internal "
		    "programming error: failed to issue unlock ioctl: %s",
		    strerror(e));
		nvme_lock_check(ctrl);
		return;
	}

	if (unlock.niu_common.nioc_drv_err != NVME_IOCTL_E_OK) {
		(void) nvme_ioctl_error(ctrl, &unlock.niu_common, "unlock");
		/*
		 * Promote any other failure to a new fatal failure. Consumers
		 * expect this to have worked.
		 */
		if (ctrl->nc_err.ne_err != NVME_ERR_LOCK_PROG) {
			nvme_err_data_t err;
			nvme_ctrl_err_save(ctrl, &err);
			(void) nvme_ctrl_error(ctrl, NVME_ERR_LOCK_PROG, 0,
			    "internal programming error: received unexpected "
			    "libnvme error 0x%x: %s", err.ne_err,
			    err.ne_errmsg);
		}
		nvme_lock_check(ctrl);
		return;
	}

	(void) nvme_ctrl_success(ctrl);
}

bool
nvme_ctrl_lock(nvme_ctrl_t *ctrl, nvme_lock_level_t level,
    nvme_lock_flags_t flags)
{
	return (nvme_lock_common(ctrl, 0, level, flags));
}

bool
nvme_ns_lock(nvme_ns_t *ns, nvme_lock_level_t level,
    nvme_lock_flags_t flags)
{
	return (nvme_lock_common(ns->nn_ctrl, ns->nn_nsid, level, flags));
}

void
nvme_ctrl_unlock(nvme_ctrl_t *ctrl)
{
	nvme_unlock_common(ctrl, 0);
}

void
nvme_ns_unlock(nvme_ns_t *ns)
{
	nvme_unlock_common(ns->nn_ctrl, ns->nn_nsid);
}