xref: /illumos-gate/usr/src/uts/common/io/mlxcx/mlxcx.c (revision c093b3ec6d35e1fe023174ed7f6ca6b90690d526)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2021, The University of Queensland
14  * Copyright (c) 2018, Joyent, Inc.
15  * Copyright 2023 RackTop Systems, Inc.
16  * Copyright 2023 MNX Cloud, Inc.
17  */
18 
19 /*
20  * Mellanox Connect-X 4/5/6 driver.
21  */
22 
23 /*
24  * The PRM for this family of parts was freely available at:
25  *
26  * https://www.mellanox.com/related-docs/user_manuals/ \
27  *   Ethernet_Adapters_Programming_Manual.pdf
28  *
29  * but has since disappeared.
30  */
31 /*
32  * ConnectX glossary
33  * -----------------
34  *
35  * WR		Work Request: something we've asked the hardware to do by
36  *		creating a Work Queue Entry (WQE), e.g. send or recv a packet
37  *
38  * WQE		Work Queue Entry: a descriptor on a work queue descriptor ring
39  *
40  * WQ		Work Queue: a descriptor ring that we can place WQEs on, usually
41  *		either a Send Queue (SQ) or Receive Queue (RQ). Different WQ
42  *		types have different WQE structures, different commands for
43  *		creating and destroying them, etc, but share a common context
44  *		structure, counter setup and state graph.
45  * SQ		Send Queue, a specific type of WQ that sends packets
46  * RQ		Receive Queue, a specific type of WQ that receives packets
47  *
48  * CQ		Completion Queue: completion of WRs from a WQ are reported to
49  *		one of these, as a CQE on its entry ring.
50  * CQE		Completion Queue Entry: an entry in a CQ ring. Contains error
51  *		info, as well as packet size, the ID of the WQ, and the index
52  *		of the WQE which completed. Does not contain any packet data.
53  *
54  * EQ		Event Queue: a ring of event structs from the hardware informing
55  *		us when particular events happen. Many events can point at a
 56  *		particular CQ which we should then go look at.
57  * EQE		Event Queue Entry: an entry on the EQ ring
58  *
59  * UAR		User Access Region, a page of the device's PCI BAR which is
60  *		tied to particular EQ/CQ/WQ sets and contains doorbells to
61  *		ring to arm them for interrupts or wake them up for new work
62  *
63  * RQT		RQ Table, a collection of indexed RQs used to refer to the group
64  *		as a single unit (for e.g. hashing/RSS).
65  *
 66  * TIR		Transport Interface Receive, a bucket of resources for the
67  *		reception of packets. TIRs have to point at either a single RQ
68  *		or a table of RQs (RQT). They then serve as a target for flow
69  *		table entries (FEs). TIRs that point at an RQT also contain the
70  *		settings for hashing for RSS.
71  *
72  * TIS		Transport Interface Send, a bucket of resources associated with
73  *		the transmission of packets. In particular, the temporary
74  *		resources used for LSO internally in the card are accounted to
75  *		a TIS.
76  *
77  * FT		Flow Table, a collection of FEs and FGs that can be referred to
78  *		as a single entity (e.g. used as a target from another flow
79  *		entry or set as the "root" table to handle incoming or outgoing
80  *		packets). Packets arriving at a FT are matched against the
81  *		FEs in the table until either one matches with a terminating
82  *		action or all FEs are exhausted (it's first-match-wins but with
83  *		some actions that are non-terminal, like counting actions).
84  *
85  * FG		Flow Group, a group of FEs which share a common "mask" (i.e.
86  *		they match on the same attributes of packets coming into the
87  *		flow).
88  *
89  * FE		Flow Entry, an individual set of values to match against
90  *		packets entering the flow table, combined with an action to
91  *		take upon a successful match. The action we use most is
92  *		"forward", which sends the packets to a TIR or another flow
93  *		table and then stops further processing within the FE's FT.
94  *
95  * lkey/mkey	A reference to something similar to a page table but in the
96  *		device's internal onboard MMU. Since Connect-X parts double as
97  *		IB cards (lots of RDMA) they have extensive onboard memory mgmt
98  *		features which we try very hard not to use. For our WQEs we use
99  *		the "reserved" lkey, which is a special value which indicates
100  *		that addresses we give are linear addresses and should not be
101  *		translated.
102  *
103  * PD		Protection Domain, an IB concept. We have to allocate one to
104  *		provide as a parameter for new WQs, but we don't do anything
105  *		with it.
106  *
107  * TDOM/TD	Transport Domain, an IB concept. We allocate one in order to
108  *		provide it as a parameter to TIR/TIS creation, but we don't do
109  *		anything with it.
110  */
111 /*
112  *
113  * Data flow overview
114  * ------------------
115  *
116  * This driver is a MAC ring-enabled driver which maps rings to send and recv
117  * queues in hardware on the device.
118  *
119  * Each SQ and RQ is set up to report to its own individual CQ, to ensure
120  * sufficient space, and simplify the logic needed to work out which buffer
121  * was completed.
122  *
123  * The CQs are then round-robin allocated onto EQs, of which we set up one per
124  * interrupt that the system gives us for the device. Normally this means we
125  * have 8 EQs.
126  *
127  * When we have >= 8 EQs available, we try to allocate only RX or only TX
128  * CQs on each one. The EQs are chosen for RX and TX in an alternating fashion.
129  *
130  * EQ #0 is reserved for all event types other than completion events, and has
131  * no CQs associated with it at any time. EQs #1 and upwards are only used for
132  * handling CQ completion events.
133  *
134  * +------+     +------+           +------+        +---------+
135  * | SQ 0 |---->| CQ 0 |-----+     | EQ 0 |------> | MSI-X 0 |     mlxcx_intr_0
136  * +------+     +------+     |     +------+        +---------+
137  *                           |
138  * +------+     +------+     |
139  * | SQ 1 |---->| CQ 1 |---+ |     +------+
140  * +------+     +------+   | +---> |      |
141  *                         |       |      |
142  * +------+     +------+   |       | EQ 1 |        +---------+
143  * | SQ 2 |---->| CQ 2 |---------> |      |------> | MSI-X 1 |     mlxcx_intr_n
144  * +------+     +------+   | +---> |      |        +---------+
145  *                         | |     +------+
146  *                         | |
147  *   ...                   | |
148  *                         | |     +------+
149  * +------+     +------+   +-----> |      |
150  * | RQ 0 |---->| CQ 3 |---------> |      |        +---------+
151  * +------+     +------+     |     | EQ 2 |------> | MSI-X 2 |     mlxcx_intr_n
152  *                           |     |      |        +---------+
153  * +------+     +------+     | +-> |      |
154  * | RQ 1 |---->| CQ 4 |-----+ |   +------+
155  * +------+     +------+       |
156  *                             |     ....
157  * +------+     +------+       |
158  * | RQ 2 |---->| CQ 5 |-------+
159  * +------+     +------+
160  *
161  *   ... (note this diagram does not show RX-only or TX-only EQs)
162  *
163  * For TX, we advertise all of the SQs we create as plain rings to MAC with
164  * no TX groups. This puts MAC in "virtual group" mode where it will allocate
165  * and use the rings as it sees fit.
166  *
167  * For RX, we advertise actual groups in order to make use of hardware
168  * classification.
169  *
170  * The hardware classification we use is based around Flow Tables, and we
171  * currently ignore all of the eswitch features of the card. The NIC VPORT
172  * is always set to promisc mode so that the eswitch sends us all of the
173  * traffic that arrives on the NIC, and we use flow entries to manage
174  * everything.
175  *
176  * We use 2 layers of flow tables for classification: traffic arrives at the
177  * root RX flow table which contains MAC address filters. Those then send
178  * matched traffic to the per-group L1 VLAN filter tables which contain VLAN
179  * presence and VID filters.
180  *
181  * Since these parts only support doing RSS hashing on a single protocol at a
182  * time, we have to use a third layer of flow tables as well to break traffic
183  * down by L4 and L3 protocol (TCPv6, TCPv4, UDPv6, UDPv4, IPv6, IPv4 etc)
184  * so that it can be sent to the appropriate TIR for hashing.
185  *
186  * Incoming packets
187  *        +           +---------+      +---------+
188  *        |        +->| group 0 |      | group 0 |
189  *        |        |  | vlan ft |  +-->| hash ft |
190  *        v        |  |   L1    |  |   |   L2    |
191  *   +----+----+   |  +---------+  |   +---------+    +-----+    +-----+------+
192  *   | eswitch |   |  |         |  |   |  TCPv6  |--->| TIR |--->|     |  RQ0 |
193  *   +----+----+   |  |         |  |   +---------+    +-----+    |     +------+
194  *        |        |  |         |  |   |  UDPv6  |--->| TIR |--->|     |  RQ1 |
195  *        |        |  |         |  |   +---------+    +-----+    |     +------+
196  *        |        |  |         |  |   |  TCPv4  |--->| TIR |--->|     |  RQ2 |
197  *        v        |  |         |  |   +---------+    +-----+    | RQT +------+
198  *   +----+----+   |  +---------+  |   |  UDPv4  |--->| TIR |--->|     |  ... |
199  *   | root rx |   |  | default |--+   +---------+    +-----+    |     |      |
200  *   | flow tb |   |  +---------+  |   |  IPv6   |--->| TIR |--->|     |      |
201  *   |    L0   |   |  | promisc |--+   +---------+    +-----+    |     |      |
202  *   +---------+   |  +---------+  ^   |  IPv4   |--->| TIR |--->|     |      |
203  *   |  bcast  |---|---------------+   +---------+    +-----+    +-----+------+
204  *   +---------+   |               ^   |  other  |-+
205  *   |  MAC 0  |---+               |   +---------+ |  +-----+    +-----+
206  *   +---------+                   |               +->| TIR |--->| RQ0 |
207  *   |  MAC 1  |-+                 |                  +-----+    +-----+
208  *   +---------+ | +---------------+
209  *   |  MAC 2  |-+ |               ^
210  *   +---------+ | |               |
211  *   |  MAC 3  |-+ |  +---------+  |   +---------+
212  *   +---------+ | |  | group 1 |  |   | group 1 |
213  *   |  .....  | +--->| vlan ft |  | +>| hash ft |
214  *   |         |   |  |   L1    |  | | |   L2    |
215  *   +---------+   |  +---------+  | | +---------+    +-----+    +-----+------+
216  *   | promisc |---+  | VLAN 0  |----+ |  TCPv6  |--->| TIR |--->|     |  RQ3 |
217  *   +---------+      +---------+  |   +---------+    +-----+    |     +------+
218  *                    |  .....  |  |   |  UDPv6  |--->| TIR |--->|     |  RQ4 |
219  *                    |         |  |   +---------+    +-----+    |     +------+
220  *                    |         |  |   |  TCPv4  |--->| TIR |--->|     |  RQ5 |
221  *                    |         |  |   +---------+    +-----+    | RQT +------+
222  *                    +---------+  |   |  UDPv4  |--->| TIR |--->|     |  ... |
223  *                    |         |  |   +---------+    +-----+    |     |      |
224  *                    +---------+  |   |  IPv6   |--->| TIR |--->|     |      |
225  *                    | promisc |--+   +---------+    +-----+    |     |      |
226  *                    +---------+      |  IPv4   |--->| TIR |--->|     |      |
227  *                                     +---------+    +-----+    +-----+------+
228  *                                     |  other  |-+
229  *                                     +---------+ |
230  *                      .......                    |  +-----+    +-----+
231  *                                                 +->| TIR |--->| RQ3 |
232  *                                                    +-----+    +-----+
233  *
234  * Note that the "promisc" flow entries are only set/enabled when promisc
235  * mode is enabled for the NIC. All promisc flow entries point directly at
236  * group 0's hashing flowtable (so all promisc-only traffic lands on group 0,
237  * the "default group" in MAC).
238  *
239  * The "default" entry in the L1 VLAN filter flow tables is used when there
240  * are no VLANs set for the group, to accept any traffic regardless of tag. It
241  * is deleted as soon as a VLAN filter is added (and re-instated if the
242  * last VLAN filter is removed).
243  *
244  * The actual descriptor ring structures for RX on Connect-X4 don't contain any
245  * space for packet data (they're a collection of scatter pointers only). TX
246  * descriptors contain some space for "inline headers" (and the card requires
247  * us to put at least the L2 Ethernet headers there for the eswitch to look at)
248  * but all the rest of the data comes from the gather pointers.
249  *
250  * When we get completions back they simply contain the ring index number of
251  * the WR (work request) which completed. So, we manage the buffers for actual
252  * packet data completely independently of the descriptors in this driver. When
253  * a WR is enqueued in a WQE (work queue entry), we stamp the packet data buffer
254  * with the WQE index that we put it at, and therefore don't have to look at
255  * the original descriptor at all when handling completions.
256  *
257  * For RX, we create sufficient packet data buffers to fill 150% of the
258  * available descriptors for each ring. These all are pre-set-up for DMA and
259  * have an mblk_t associated with them (with desballoc()).
260  *
261  * For TX we either borrow the mblk's memory and DMA bind it (if the packet is
262  * large enough), or we copy it into a pre-allocated buffer set up in the same
 263  * way as for RX.
264  */
265 
266 /*
267  * Buffer lifecycle: RX
268  * --------------------
269  *
270  * The lifecycle of an mlxcx_buffer_t (packet buffer) used for RX is pretty
271  * straightforward.
272  *
273  * It is created (and has all its memory allocated) at the time of starting up
274  * the RX ring it belongs to. Then it is placed on the "free" list in the
275  * mlxcx_buffer_shard_t associated with its RQ. When mlxcx_rq_refill() wants
276  * more buffers to add to the RQ, it takes one off and marks it as "on WQ"
277  * before making a WQE for it.
278  *
279  * After a completion event occurs, the packet is either discarded (and the
280  * buffer_t returned to the free list), or it is readied for loaning to MAC
281  * and placed on the "loaned" list in the mlxcx_buffer_shard_t.
282  *
283  * Once MAC and the rest of the system have finished with the packet, they call
284  * freemsg() on its mblk, which will call mlxcx_buf_mp_return. At this point
285  * the fate of the buffer_t is determined by the state of the
286  * mlxcx_buffer_shard_t. When the shard is in its normal state the buffer_t
287  * will be returned to the free list, potentially to be recycled and used
288  * again. But if the shard is draining (E.g. after a ring stop) there will be
289  * no recycling and the buffer_t is immediately destroyed.
290  *
 291  * At detach/teardown time, buffers are only ever destroyed from the free list.
292  *
293  *
294  *                         +
295  *                         |
296  *                         | mlxcx_buf_create
297  *                         |
298  *                         v
299  *                    +----+----+
300  *                    | created |
301  *                    +----+----+                        +------+
302  *                         |                             | dead |
303  *                         |                             +------+
304  *                         | mlxcx_buf_return                ^
305  *                         |                                 |
306  *                         v                                 | mlxcx_buf_destroy
307  * mlxcx_buf_destroy  +----+----+          +-----------+     |
308  *          +---------|  free   |<------no-| draining? |-yes-+
309  *          |         +----+----+          +-----------+
310  *          |              |                     ^
311  *          |              |                     |
312  *          v              | mlxcx_buf_take      | mlxcx_buf_return
313  *      +---+--+           v                     |
314  *      | dead |       +---+---+                 |
315  *      +------+       | on WQ |- - - - - - - - >O
316  *                     +---+---+                 ^
317  *                         |                     |
318  *                         |                     |
319  *                         | mlxcx_buf_loan      | mlxcx_buf_mp_return
320  *                         v                     |
321  *                 +-------+--------+            |
322  *                 | on loan to MAC |----------->O
323  *                 +----------------+  freemsg()
324  *
325  */
326 
327 /*
328  * Buffer lifecycle: TX
329  * --------------------
330  *
331  * mlxcx_buffer_ts used for TX are divided into two kinds: regular buffers, and
332  * "foreign" buffers.
333  *
334  * The former have their memory allocated and DMA bound by this driver, while
335  * the latter (the "foreign" buffers) are on loan from MAC. Their memory is
336  * not owned by us, though we do DMA bind it (and take responsibility for
337  * un-binding it when we're done with them).
338  *
339  * We use separate mlxcx_buf_shard_ts for foreign and local buffers on each
340  * SQ. Thus, there is a separate free list and mutex for each kind.
341  *
342  * Since a TX packet might consist of multiple mblks, we translate each mblk
343  * into exactly one buffer_t. The buffer_ts are chained together in the same
344  * order as the mblks, using the mlb_tx_chain/mlb_tx_chain_entry list_t.
345  *
346  * Each chain of TX buffers may consist of foreign or driver buffers, in any
347  * mixture.
348  *
349  * The head of a TX buffer chain has mlb_tx_head == itself, which distinguishes
350  * it from the rest of the chain buffers.
351  *
352  * TX buffer chains are always returned to the free list by
353  * mlxcx_buf_return_chain(), which takes care of walking the mlb_tx_chain and
354  * freeing all of the members.
355  *
356  * We only call freemsg() once, on the head of the TX buffer chain's original
357  * mblk. This is true whether we copied it or bound it in a foreign buffer.
358  */
359 
360 /*
361  * Startup and command interface
362  * -----------------------------
363  *
364  * The command interface is the primary way in which we give control orders to
365  * the hardware (e.g. actions like "create this queue" or "delete this flow
366  * entry"). The command interface is never used to transmit or receive packets
367  * -- that takes place only on the queues that are set up through it.
368  *
369  * In mlxcx_cmd.c we implement our use of the command interface on top of a
370  * simple taskq. As commands are submitted from the taskq they choose a
371  * "slot", if there are no free slots then execution of the command will
372  * be paused until one is free. The hardware permits up to 32 independent
373  * slots for concurrent command execution.
374  *
375  * Before interrupts are enabled, command completion is polled, once
376  * interrupts are up command completions become asynchronous and are
377  * wired to EQ 0. A caveat to this is commands can not be submitted
378  * directly from EQ 0's completion handler, and any processing resulting from
379  * an asynchronous event which requires further use of the command interface
380  * is posted through a taskq.
381  *
382  * The startup/attach process for this card involves a bunch of different steps
383  * which are summarised pretty well in the PRM. We have to send a number of
384  * commands which do different things to start the card up, give it some pages
385  * of our own memory for it to use, then start creating all the entities that
386  * we need to use like EQs, CQs, WQs, as well as their dependencies like PDs
387  * and TDoms.
388  */
389 
390 /*
391  * UARs
392  * ----
393  *
394  * The pages of the PCI BAR other than the first few are reserved for use as
395  * "UAR" sections in this device. Each UAR section can be used as a set of
396  * doorbells for our queues.
397  *
398  * Currently we just make one single UAR for all of our queues. It doesn't
399  * seem to be a major limitation yet.
400  *
 401  * When we're sending packets through an SQ, the PRM is not awfully clear about
402  * exactly how we're meant to use the first 16 bytes of the Blueflame buffers
403  * (it's clear on the pattern of alternation you're expected to use between
404  * even and odd for Blueflame sends, but not for regular doorbells).
405  *
406  * Currently we don't do the even-odd alternating pattern for ordinary
407  * doorbells, and we don't use Blueflame at all. This seems to work fine, at
408  * least on Connect-X4 Lx.
409  */
410 
411 /*
412  * Lock ordering
413  * -------------
414  *
415  * Interrupt side:
416  *
417  *  - mleq_mtx
418  *    - mlcq_arm_mtx
419  *      - mlcq_mtx
420  *        - mlcq_bufbmtx
421  *        - mlwq_mtx
422  *          - mlbs_mtx
423  *    - mlp_mtx
424  *
425  * GLD side:
426  *
427  *  - mlp_mtx
428  *    - mlg_mtx
429  *      - mlg_*.mlft_mtx
430  *    - mlp_*.mlft_mtx
431  *    - mlwq_mtx
432  *      - mlbs_mtx
433  *      - mlcq_bufbmtx
434  *  - mleq_mtx
435  *    - mlcq_arm_mtx
436  *      - mlcq_mtx
437  *
438  */
439 
440 #include <sys/modctl.h>
441 #include <sys/conf.h>
442 #include <sys/devops.h>
443 #include <sys/sysmacros.h>
444 #include <sys/time.h>
445 #include <sys/pci.h>
446 #include <sys/mac_provider.h>
447 
448 #include <mlxcx.h>
449 
450 CTASSERT((1 << MLXCX_RX_HASH_FT_SIZE_SHIFT) >= MLXCX_TIRS_PER_GROUP);
451 
452 #define	MLXCX_MODULE_NAME	"mlxcx"
453 /*
454  * We give this to the firmware, so it has to be in a fixed format that it
455  * understands.
456  */
457 #define	MLXCX_DRIVER_VERSION	"illumos,mlxcx,1.0.0,1,000,000000"
458 
459 /*
460  * Firmware may take a while to reclaim pages. Try a set number of times.
461  */
462 clock_t mlxcx_reclaim_delay = 1000 * 50; /* 50 ms in us */
463 uint_t mlxcx_reclaim_tries = 100; /* Wait at most 5000ms */
464 
465 static void *mlxcx_softstate;
466 
467 /*
468  * Fault detection thresholds.
469  */
470 uint_t mlxcx_doorbell_tries = MLXCX_DOORBELL_TRIES_DFLT;
471 uint_t mlxcx_stuck_intr_count = MLXCX_STUCK_INTR_COUNT_DFLT;
472 
473 static void
474 mlxcx_load_prop_defaults(mlxcx_t *mlxp)
475 {
476 	mlxcx_drv_props_t *p = &mlxp->mlx_props;
477 	mlxcx_port_t *port = &mlxp->mlx_ports[0];
478 
479 	VERIFY((mlxp->mlx_attach & MLXCX_ATTACH_PORTS) != 0);
480 	VERIFY((mlxp->mlx_attach & (MLXCX_ATTACH_CQS | MLXCX_ATTACH_WQS)) == 0);
481 
482 	/*
483 	 * Currently we have different queue size defaults for two
484 	 * categories of queues. One set for devices which support a
485 	 * maximum speed of 10Gb/s, and another for those above that.
486 	 */
487 	if ((port->mlp_max_proto & (MLXCX_PROTO_25G | MLXCX_PROTO_40G |
488 	    MLXCX_PROTO_50G | MLXCX_PROTO_100G)) != 0 ||
489 	    (port->mlp_ext_max_proto & (MLXCX_EXTPROTO_25G |
490 	    MLXCX_EXTPROTO_40G | MLXCX_EXTPROTO_50G | MLXCX_EXTPROTO_100G |
491 	    MLXCX_EXTPROTO_200G | MLXCX_EXTPROTO_400G)) != 0) {
492 		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_25G;
493 		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_25G;
494 		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_25G;
495 	} else if ((port->mlp_max_proto & (MLXCX_PROTO_100M | MLXCX_PROTO_1G |
496 	    MLXCX_PROTO_10G)) != 0 ||
497 	    (port->mlp_ext_max_proto & (MLXCX_EXTPROTO_100M |
498 	    MLXCX_EXTPROTO_5G | MLXCX_EXTPROTO_1G | MLXCX_EXTPROTO_10G)) != 0) {
499 		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
500 		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
501 		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
502 	} else {
503 		mlxcx_warn(mlxp, "Encountered a port with a speed we don't "
504 		    "recognize. Proto: 0x%x", port->mlp_max_proto);
505 		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
506 		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
507 		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
508 	}
509 }
510 
511 /*
512  * Properties which may have different defaults based on hardware
513  * characteristics.
514  */
515 static void
516 mlxcx_load_model_props(mlxcx_t *mlxp)
517 {
518 	mlxcx_drv_props_t *p = &mlxp->mlx_props;
519 
520 	mlxcx_load_prop_defaults(mlxp);
521 
522 	p->mldp_cq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
523 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cq_size_shift",
524 	    p->mldp_cq_size_shift_default);
525 	p->mldp_sq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
526 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "sq_size_shift",
527 	    p->mldp_sq_size_shift_default);
528 	p->mldp_rq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
529 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rq_size_shift",
530 	    p->mldp_rq_size_shift_default);
531 }
532 
533 static void
534 mlxcx_load_props(mlxcx_t *mlxp)
535 {
536 	mlxcx_drv_props_t *p = &mlxp->mlx_props;
537 
538 	p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
539 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift",
540 	    MLXCX_EQ_SIZE_SHIFT_DFLT);
541 	p->mldp_cqemod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
542 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_period_usec",
543 	    MLXCX_CQEMOD_PERIOD_USEC_DFLT);
544 	p->mldp_cqemod_count = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
545 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_count",
546 	    MLXCX_CQEMOD_COUNT_DFLT);
547 	p->mldp_intrmod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
548 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "intrmod_period_usec",
549 	    MLXCX_INTRMOD_PERIOD_USEC_DFLT);
550 
551 	p->mldp_tx_ngroups = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
552 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_ngroups",
553 	    MLXCX_TX_NGROUPS_DFLT);
554 	p->mldp_tx_nrings_per_group = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
555 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_nrings_per_group",
556 	    MLXCX_TX_NRINGS_PER_GROUP_DFLT);
557 
558 	p->mldp_rx_ngroups_large = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
559 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_large",
560 	    MLXCX_RX_NGROUPS_LARGE_DFLT);
561 	p->mldp_rx_ngroups_small = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
562 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_small",
563 	    MLXCX_RX_NGROUPS_SMALL_DFLT);
564 	p->mldp_rx_nrings_per_large_group = ddi_getprop(DDI_DEV_T_ANY,
565 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
566 	    "rx_nrings_per_large_group", MLXCX_RX_NRINGS_PER_LARGE_GROUP_DFLT);
567 	p->mldp_rx_nrings_per_small_group = ddi_getprop(DDI_DEV_T_ANY,
568 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
569 	    "rx_nrings_per_small_group", MLXCX_RX_NRINGS_PER_SMALL_GROUP_DFLT);
570 
571 	p->mldp_ftbl_root_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
572 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_root_size_shift",
573 	    MLXCX_FTBL_ROOT_SIZE_SHIFT_DFLT);
574 
575 	p->mldp_tx_bind_threshold = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
576 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_bind_threshold",
577 	    MLXCX_TX_BIND_THRESHOLD_DFLT);
578 
579 	p->mldp_ftbl_vlan_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
580 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_vlan_size_shift",
581 	    MLXCX_FTBL_VLAN_SIZE_SHIFT_DFLT);
582 
583 	p->mldp_eq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
584 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
585 	    "eq_check_interval_sec", MLXCX_EQ_CHECK_INTERVAL_SEC_DFLT);
586 	p->mldp_cq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
587 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
588 	    "cq_check_interval_sec", MLXCX_CQ_CHECK_INTERVAL_SEC_DFLT);
589 	p->mldp_wq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
590 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
591 	    "wq_check_interval_sec", MLXCX_WQ_CHECK_INTERVAL_SEC_DFLT);
592 
593 	p->mldp_rx_per_cq = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
594 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_limit_per_completion",
595 	    MLXCX_RX_PER_CQ_DEFAULT);
596 
597 	if (p->mldp_rx_per_cq < MLXCX_RX_PER_CQ_MIN ||
598 	    p->mldp_rx_per_cq > MLXCX_RX_PER_CQ_MAX) {
599 		mlxcx_warn(mlxp, "!rx_limit_per_completion = %u is "
600 		    "out of range. Defaulting to: %d. Valid values are from "
601 		    "%d to %d", p->mldp_rx_per_cq, MLXCX_RX_PER_CQ_DEFAULT,
602 		    MLXCX_RX_PER_CQ_MIN, MLXCX_RX_PER_CQ_MAX);
603 		p->mldp_rx_per_cq = MLXCX_RX_PER_CQ_DEFAULT;
604 	}
605 }
606 
607 void
608 mlxcx_note(mlxcx_t *mlxp, const char *fmt, ...)
609 {
610 	va_list ap;
611 
612 	va_start(ap, fmt);
613 	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
614 		vdev_err(mlxp->mlx_dip, CE_NOTE, fmt, ap);
615 	} else {
616 		vcmn_err(CE_NOTE, fmt, ap);
617 	}
618 	va_end(ap);
619 }
620 
621 void
622 mlxcx_warn(mlxcx_t *mlxp, const char *fmt, ...)
623 {
624 	va_list ap;
625 
626 	va_start(ap, fmt);
627 	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
628 		vdev_err(mlxp->mlx_dip, CE_WARN, fmt, ap);
629 	} else {
630 		vcmn_err(CE_WARN, fmt, ap);
631 	}
632 	va_end(ap);
633 }
634 
635 void
636 mlxcx_panic(mlxcx_t *mlxp, const char *fmt, ...)
637 {
638 	va_list ap;
639 
640 	va_start(ap, fmt);
641 	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
642 		vdev_err(mlxp->mlx_dip, CE_PANIC, fmt, ap);
643 	} else {
644 		vcmn_err(CE_PANIC, fmt, ap);
645 	}
646 	va_end(ap);
647 }
648 
649 uint16_t
650 mlxcx_get16(mlxcx_t *mlxp, uintptr_t off)
651 {
652 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
653 	return (ddi_get16(mlxp->mlx_regs_handle, (void *)addr));
654 }
655 
656 uint32_t
657 mlxcx_get32(mlxcx_t *mlxp, uintptr_t off)
658 {
659 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
660 	return (ddi_get32(mlxp->mlx_regs_handle, (void *)addr));
661 }
662 
663 uint64_t
664 mlxcx_get64(mlxcx_t *mlxp, uintptr_t off)
665 {
666 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
667 	return (ddi_get64(mlxp->mlx_regs_handle, (void *)addr));
668 }
669 
670 void
671 mlxcx_put32(mlxcx_t *mlxp, uintptr_t off, uint32_t val)
672 {
673 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
674 	ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val);
675 }
676 
677 void
678 mlxcx_put64(mlxcx_t *mlxp, uintptr_t off, uint64_t val)
679 {
680 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
681 	ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val);
682 }
683 
684 void
685 mlxcx_uar_put32(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint32_t val)
686 {
687 	/*
688 	 * The UAR is always inside the first BAR, which we mapped as
689 	 * mlx_regs
690 	 */
691 	uintptr_t addr = off + (uintptr_t)mlu->mlu_base +
692 	    (uintptr_t)mlxp->mlx_regs_base;
693 	ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val);
694 }
695 
696 void
697 mlxcx_uar_put64(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint64_t val)
698 {
699 	uintptr_t addr = off + (uintptr_t)mlu->mlu_base +
700 	    (uintptr_t)mlxp->mlx_regs_base;
701 	ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val);
702 }
703 
704 static void
705 mlxcx_fm_fini(mlxcx_t *mlxp)
706 {
707 	if (mlxp->mlx_fm_caps == 0)
708 		return;
709 
710 	if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps))
711 		ddi_fm_handler_unregister(mlxp->mlx_dip);
712 
713 	if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) ||
714 	    DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps))
715 		pci_ereport_teardown(mlxp->mlx_dip);
716 
717 	ddi_fm_fini(mlxp->mlx_dip);
718 
719 	mlxp->mlx_fm_caps = 0;
720 }
721 
/*
 * Post a device ereport of class "device.<detail>" (e.g. a DDI_FM_DEVICE_*
 * subclass). Silently does nothing if ereports were not negotiated.
 */
void
mlxcx_fm_ereport(mlxcx_t *mlxp, const char *detail)
{
	uint64_t ena;
	char buf[FM_MAX_CLASS];

	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
		return;

	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail);
	ena = fm_ena_generate(0, FM_ENA_FMT1);
	/* No extra payload beyond the mandatory version member. */
	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
	    NULL);
}
737 
/*
 * FMA error-handler callback, registered via ddi_fm_handler_register()
 * in mlxcx_fm_init().
 */
static int
mlxcx_fm_errcb(dev_info_t *dip, ddi_fm_error_t *err, const void *arg)
{
	/*
	 * as the driver can always deal with an error in any dma or
	 * access handle, we can just return the fme_status value.
	 */
	pci_ereport_post(dip, err, NULL);
	return (err->fme_status);
}
748 
/*
 * Negotiate FMA capabilities with the framework and register for
 * ereports and error callbacks. The "fm_capable" property can be used
 * to restrict the default capability set.
 */
static void
mlxcx_fm_init(mlxcx_t *mlxp)
{
	ddi_iblock_cookie_t iblk;
	int def = DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE |
	    DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE;

	mlxp->mlx_fm_caps = ddi_prop_get_int(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_DONTPASS, "fm_capable", def);

	/* Clamp a bogus (negative) property value, and never exceed 'def'. */
	if (mlxp->mlx_fm_caps < 0) {
		mlxp->mlx_fm_caps = 0;
	}
	mlxp->mlx_fm_caps &= def;

	if (mlxp->mlx_fm_caps == 0)
		return;

	ddi_fm_init(mlxp->mlx_dip, &mlxp->mlx_fm_caps, &iblk);
	if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) ||
	    DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) {
		pci_ereport_setup(mlxp->mlx_dip);
	}
	if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) {
		ddi_fm_handler_register(mlxp->mlx_dip, mlxcx_fm_errcb,
		    (void *)mlxp);
	}
}
777 
/*
 * Destroy one buffer shard. Blocks until all busy and loaned buffers
 * have been returned (returners signal mlbs_free_nonempty), then frees
 * every buffer on the free list and tears down the shard's lists,
 * CV and mutex.
 */
static void
mlxcx_mlbs_teardown(mlxcx_t *mlxp, mlxcx_buf_shard_t *s)
{
	mlxcx_buffer_t *buf;

	mutex_enter(&s->mlbs_mtx);

	while (!list_is_empty(&s->mlbs_busy))
		cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);

	while (!list_is_empty(&s->mlbs_loaned))
		cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);

	/*
	 * mlxcx_buf_destroy() is expected to unlink the buffer from
	 * mlbs_free -- the loop relies on this to terminate.
	 */
	while ((buf = list_head(&s->mlbs_free)) != NULL)
		mlxcx_buf_destroy(mlxp, buf);

	list_destroy(&s->mlbs_free);
	list_destroy(&s->mlbs_busy);
	list_destroy(&s->mlbs_loaned);
	mutex_exit(&s->mlbs_mtx);

	cv_destroy(&s->mlbs_free_nonempty);
	mutex_destroy(&s->mlbs_mtx);
}
802 
/*
 * Tear down every buffer shard, then the shard list and the kmem cache
 * created in mlxcx_setup_bufs().
 */
static void
mlxcx_teardown_bufs(mlxcx_t *mlxp)
{
	mlxcx_buf_shard_t *s;

	while ((s = list_remove_head(&mlxp->mlx_buf_shards)) != NULL) {
		mlxcx_mlbs_teardown(mlxp, s);
		kmem_free(s, sizeof (mlxcx_buf_shard_t));
	}
	list_destroy(&mlxp->mlx_buf_shards);

	kmem_cache_destroy(mlxp->mlx_bufs_cache);
}
816 
/*
 * Reclaim all pages previously given to the HCA. We repeatedly ask the
 * hardware to return up to MLXCX_MANAGE_PAGES_MAX_PAGES at a time and
 * free the matching entries from our page AVL. If the hardware refuses
 * to return pages (command failure, or mlxcx_reclaim_tries consecutive
 * empty responses) we give up and leak the remainder rather than hang.
 */
static void
mlxcx_teardown_pages(mlxcx_t *mlxp)
{
	uint_t nzeros = 0;
	uint64_t *pas;

	/* Scratch array for the PAs the hardware hands back. */
	pas = kmem_alloc(sizeof (*pas) * MLXCX_MANAGE_PAGES_MAX_PAGES,
	    KM_SLEEP);

	mutex_enter(&mlxp->mlx_pagemtx);

	while (mlxp->mlx_npages > 0) {
		int32_t req, ret;

		ASSERT0(avl_is_empty(&mlxp->mlx_pages));
		req = MIN(mlxp->mlx_npages, MLXCX_MANAGE_PAGES_MAX_PAGES);

		if (!mlxcx_cmd_return_pages(mlxp, req, pas, &ret)) {
			mlxcx_warn(mlxp, "hardware refused to return pages, "
			    "leaking %u remaining pages", mlxp->mlx_npages);
			goto out;
		}

		/*
		 * The hardware may return fewer pages than requested; match
		 * each returned PA against our AVL of outstanding pages.
		 */
		for (int32_t i = 0; i < ret; i++) {
			mlxcx_dev_page_t *mdp, probe;
			bzero(&probe, sizeof (probe));
			probe.mxdp_pa = pas[i];

			mdp = avl_find(&mlxp->mlx_pages, &probe, NULL);

			if (mdp != NULL) {
				avl_remove(&mlxp->mlx_pages, mdp);
				mlxp->mlx_npages--;
				mlxcx_dma_free(&mdp->mxdp_dma);
				kmem_free(mdp, sizeof (mlxcx_dev_page_t));
			} else {
				mlxcx_panic(mlxp, "hardware returned a page "
				    "with PA 0x%" PRIx64 " but we have no "
				    "record of giving out such a page", pas[i]);
			}
		}

		/*
		 * If no pages were returned, note that fact.
		 */
		if (ret == 0) {
			nzeros++;
			if (nzeros > mlxcx_reclaim_tries) {
				mlxcx_warn(mlxp, "hardware refused to return "
				    "pages, leaking %u remaining pages",
				    mlxp->mlx_npages);
				goto out;
			}
			delay(drv_usectohz(mlxcx_reclaim_delay));
		}
	}

	avl_destroy(&mlxp->mlx_pages);

out:
	mutex_exit(&mlxp->mlx_pagemtx);
	mutex_destroy(&mlxp->mlx_pagemtx);

	kmem_free(pas, sizeof (*pas) * MLXCX_MANAGE_PAGES_MAX_PAGES);
}
882 
/*
 * Allocate and initialize the DMA ring backing an event queue. The ring
 * size comes from the mldp_eq_size_shift tunable; every entry's owner
 * byte is set to the INIT pattern so the hardware/software ownership
 * handshake starts in a known state. Sets MLXCX_EQ_ALLOC on success.
 */
static boolean_t
mlxcx_eq_alloc_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	boolean_t ret;
	size_t sz, i;

	VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC);

	mleq->mleq_entshift = mlxp->mlx_props.mldp_eq_size_shift;
	mleq->mleq_nents = (1 << mleq->mleq_entshift);
	sz = mleq->mleq_nents * sizeof (mlxcx_eventq_ent_t);
	/* The ring must be a whole number of hardware pages. */
	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);

	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_queue_attr(mlxp, &attr);

	ret = mlxcx_dma_alloc(mlxp, &mleq->mleq_dma, &attr, &acc,
	    B_TRUE, sz, B_TRUE);
	if (!ret) {
		mlxcx_warn(mlxp, "failed to allocate EQ memory");
		return (B_FALSE);
	}

	mleq->mleq_ent = (mlxcx_eventq_ent_t *)mleq->mleq_dma.mxdb_va;

	for (i = 0; i < mleq->mleq_nents; ++i)
		mleq->mleq_ent[i].mleqe_owner = MLXCX_EQ_OWNER_INIT;

	mleq->mleq_state |= MLXCX_EQ_ALLOC;

	return (B_TRUE);
}
917 
/*
 * Release the DMA memory backing an event queue. The EQ must have been
 * allocated, and if it was ever created in the HCA it must already have
 * been destroyed -- the hardware must not still be writing to the ring.
 */
static void
mlxcx_eq_rele_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	VERIFY(mleq->mleq_state & MLXCX_EQ_ALLOC);
	if (mleq->mleq_state & MLXCX_EQ_CREATED)
		VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED);

	mlxcx_dma_free(&mleq->mleq_dma);
	mleq->mleq_ent = NULL;

	mleq->mleq_state &= ~MLXCX_EQ_ALLOC;
}
930 
/*
 * Destroy a flow table: delete its entries (in reverse index order),
 * destroy its groups, destroy the table itself in the HCA, and finally
 * free all associated memory. Called with ft->mlft_mtx held; the mutex
 * is destroyed here along with 'ft', so the caller must not touch
 * either afterwards.
 */
void
mlxcx_teardown_flow_table(mlxcx_t *mlxp, mlxcx_flow_table_t *ft)
{
	mlxcx_flow_group_t *fg;
	mlxcx_flow_entry_t *fe;
	int i;

	ASSERT(mutex_owned(&ft->mlft_mtx));

	for (i = ft->mlft_nents - 1; i >= 0; --i) {
		fe = &ft->mlft_ent[i];
		if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
			if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
				mlxcx_panic(mlxp, "failed to delete flow "
				    "entry %u on table %u", i,
				    ft->mlft_num);
			}
		}
	}

	while ((fg = list_remove_head(&ft->mlft_groups)) != NULL) {
		if (fg->mlfg_state & MLXCX_FLOW_GROUP_CREATED &&
		    !(fg->mlfg_state & MLXCX_FLOW_GROUP_DESTROYED)) {
			if (!mlxcx_cmd_destroy_flow_group(mlxp, fg)) {
				mlxcx_panic(mlxp, "failed to destroy flow "
				    "group %u", fg->mlfg_num);
			}
		}
		kmem_free(fg, sizeof (mlxcx_flow_group_t));
	}
	list_destroy(&ft->mlft_groups);
	if (ft->mlft_state & MLXCX_FLOW_TABLE_CREATED &&
	    !(ft->mlft_state & MLXCX_FLOW_TABLE_DESTROYED)) {
		if (!mlxcx_cmd_destroy_flow_table(mlxp, ft)) {
			mlxcx_panic(mlxp, "failed to destroy flow table %u",
			    ft->mlft_num);
		}
	}
	kmem_free(ft->mlft_ent, ft->mlft_entsize);
	ft->mlft_ent = NULL;
	mutex_exit(&ft->mlft_mtx);
	mutex_destroy(&ft->mlft_mtx);
	kmem_free(ft, sizeof (mlxcx_flow_table_t));
}
975 
/*
 * Tear down every initialized port: destroy its RX flow table (if any),
 * its mutexes and async-event bookkeeping, then free the port array.
 */
static void
mlxcx_teardown_ports(mlxcx_t *mlxp)
{
	uint_t i;
	mlxcx_port_t *p;
	mlxcx_flow_table_t *ft;

	for (i = 0; i < mlxp->mlx_nports; ++i) {
		p = &mlxp->mlx_ports[i];
		if (!(p->mlp_init & MLXCX_PORT_INIT))
			continue;
		mutex_enter(&p->mlp_mtx);
		if ((ft = p->mlp_rx_flow) != NULL) {
			mutex_enter(&ft->mlft_mtx);
			/*
			 * teardown_flow_table() will destroy the mutex, so
			 * we don't release it here.
			 */
			mlxcx_teardown_flow_table(mlxp, ft);
		}
		mutex_exit(&p->mlp_mtx);
		mutex_destroy(&p->mlp_mtx);
		mutex_destroy(&p->mlx_port_event.mla_mtx);
		p->mlx_port_event.mla_mlx = NULL;
		p->mlx_port_event.mla_port = NULL;
		p->mlp_init &= ~MLXCX_PORT_INIT;
	}

	kmem_free(mlxp->mlx_ports, mlxp->mlx_ports_size);
	mlxp->mlx_ports = NULL;
}
1007 
1008 static void
1009 mlxcx_teardown_wqs(mlxcx_t *mlxp)
1010 {
1011 	mlxcx_work_queue_t *mlwq;
1012 
1013 	while ((mlwq = list_head(&mlxp->mlx_wqs)) != NULL) {
1014 		mlxcx_wq_teardown(mlxp, mlwq);
1015 	}
1016 	list_destroy(&mlxp->mlx_wqs);
1017 }
1018 
1019 static void
1020 mlxcx_teardown_cqs(mlxcx_t *mlxp)
1021 {
1022 	mlxcx_completion_queue_t *mlcq;
1023 
1024 	while ((mlcq = list_head(&mlxp->mlx_cqs)) != NULL) {
1025 		mlxcx_cq_teardown(mlxp, mlcq);
1026 	}
1027 	list_destroy(&mlxp->mlx_cqs);
1028 }
1029 
/*
 * Destroy every event queue in the HCA (if created and not already
 * destroyed) and release its DMA ring. There is one EQ per interrupt
 * vector. Note the per-EQ mutex itself is not destroyed here --
 * presumably that belongs to interrupt teardown; TODO confirm.
 */
static void
mlxcx_teardown_eqs(mlxcx_t *mlxp)
{
	mlxcx_event_queue_t *mleq;
	uint_t i;

	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
		mleq = &mlxp->mlx_eqs[i];
		mutex_enter(&mleq->mleq_mtx);
		if ((mleq->mleq_state & MLXCX_EQ_CREATED) &&
		    !(mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
			if (!mlxcx_cmd_destroy_eq(mlxp, mleq)) {
				mlxcx_warn(mlxp, "failed to destroy "
				    "event queue idx %u eqn %u",
				    i, mleq->mleq_num);
			}
		}
		if (mleq->mleq_state & MLXCX_EQ_ALLOC) {
			mlxcx_eq_rele_dma(mlxp, mleq);
		}
		mutex_exit(&mleq->mleq_mtx);
	}
}
1053 
/*
 * Cancel the periodic queue-health check callbacks. Each timer only
 * exists if its interval property was non-zero at setup time (see
 * mlxcx_setup_checktimers()), so the same conditions gate deletion.
 */
static void
mlxcx_teardown_checktimers(mlxcx_t *mlxp)
{
	if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0)
		ddi_periodic_delete(mlxp->mlx_eq_checktimer);
	if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0)
		ddi_periodic_delete(mlxp->mlx_cq_checktimer);
	if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0)
		ddi_periodic_delete(mlxp->mlx_wq_checktimer);
}
1064 
/*
 * Unwind everything mlxcx_attach() set up, in reverse dependency order.
 * Each MLXCX_ATTACH_* flag records a completed attach stage; we tear
 * down only the stages whose flag is set and clear the flag as we go,
 * so this routine also works for partial-attach failure paths. At the
 * end every flag must be clear, and the soft state is freed.
 */
static void
mlxcx_teardown(mlxcx_t *mlxp)
{
	uint_t i;
	dev_info_t *dip = mlxp->mlx_dip;

	if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) {
		/*
		 * Disable interrupts and let any active vectors quiesce.
		 */
		mlxcx_intr_disable(mlxp);
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_SENSORS) {
		mlxcx_teardown_sensors(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_SENSORS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_CHKTIMERS) {
		mlxcx_teardown_checktimers(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_CHKTIMERS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_GROUPS) {
		mlxcx_teardown_groups(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_GROUPS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_WQS) {
		mlxcx_teardown_wqs(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_WQS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_CQS) {
		mlxcx_teardown_cqs(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_CQS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_BUFS) {
		mlxcx_teardown_bufs(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_BUFS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_PORTS) {
		mlxcx_teardown_ports(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_PORTS;
	}

	/* EQs are torn down with the interrupt vectors they service. */
	if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) {
		mlxcx_teardown_eqs(mlxp);
		mlxcx_intr_teardown(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_INTRS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_UAR_PD_TD) {
		if (mlxp->mlx_uar.mlu_allocated) {
			if (!mlxcx_cmd_dealloc_uar(mlxp, &mlxp->mlx_uar)) {
				mlxcx_warn(mlxp, "failed to release UAR");
			}
			for (i = 0; i < MLXCX_BF_PER_UAR; ++i)
				mutex_destroy(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx);
		}
		if (mlxp->mlx_pd.mlpd_allocated &&
		    !mlxcx_cmd_dealloc_pd(mlxp, &mlxp->mlx_pd)) {
			mlxcx_warn(mlxp, "failed to release PD");
		}
		if (mlxp->mlx_tdom.mltd_allocated &&
		    !mlxcx_cmd_dealloc_tdom(mlxp, &mlxp->mlx_tdom)) {
			mlxcx_warn(mlxp, "failed to release TDOM");
		}
		mlxp->mlx_attach &= ~MLXCX_ATTACH_UAR_PD_TD;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_INIT_HCA) {
		if (!mlxcx_cmd_teardown_hca(mlxp)) {
			mlxcx_warn(mlxp, "failed to send teardown HCA "
			    "command during device detach");
		}
		mlxp->mlx_attach &= ~MLXCX_ATTACH_INIT_HCA;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_PAGE_LIST) {
		mlxcx_teardown_pages(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_PAGE_LIST;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_ASYNC_TQ) {
		for (i = 0; i <= MLXCX_FUNC_ID_MAX; i++) {
			mlxp->mlx_npages_req[i].mla_mlx = NULL;
			mutex_destroy(&mlxp->mlx_npages_req[i].mla_mtx);
		}
		taskq_destroy(mlxp->mlx_async_tq);
		mlxp->mlx_async_tq = NULL;
		mlxp->mlx_attach &= ~MLXCX_ATTACH_ASYNC_TQ;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_ENABLE_HCA) {
		if (!mlxcx_cmd_disable_hca(mlxp)) {
			mlxcx_warn(mlxp, "failed to send DISABLE HCA command "
			    "during device detach");
		}
		mlxp->mlx_attach &= ~MLXCX_ATTACH_ENABLE_HCA;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_CMD) {
		mlxcx_cmd_queue_fini(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_CMD;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_CAPS) {
		kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
		mlxp->mlx_caps = NULL;
		mlxp->mlx_attach &= ~MLXCX_ATTACH_CAPS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_REGS) {
		ddi_regs_map_free(&mlxp->mlx_regs_handle);
		mlxp->mlx_regs_handle = NULL;
		mlxp->mlx_attach &= ~MLXCX_ATTACH_REGS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_PCI_CONFIG) {
		pci_config_teardown(&mlxp->mlx_cfg_handle);
		mlxp->mlx_cfg_handle = NULL;
		mlxp->mlx_attach &= ~MLXCX_ATTACH_PCI_CONFIG;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_FM) {
		mlxcx_fm_fini(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_FM;
	}

	/* Every attach stage must have been unwound by now. */
	VERIFY3S(mlxp->mlx_attach, ==, 0);
	ddi_soft_state_free(mlxcx_softstate, mlxp->mlx_inst);
	ddi_set_driver_private(dip, NULL);
}
1201 
/*
 * Classify the device generation (CX4/CX5/CX6) from its PCI vendor and
 * device IDs, recording the result in mlx_type. Unknown IDs are not an
 * error here; they yield MLXCX_DEV_UNKNOWN.
 */
static void
mlxcx_get_model(mlxcx_t *mlxp)
{
	uint16_t venid;
	uint16_t devid;

	venid = pci_config_get16(mlxp->mlx_cfg_handle, PCI_CONF_VENID);
	if (venid != MLXCX_VENDOR_ID) {
		/* Currently, all supported cards have a Mellanox vendor id. */
		mlxp->mlx_type = MLXCX_DEV_UNKNOWN;
		return;
	}

	devid = pci_config_get16(mlxp->mlx_cfg_handle, PCI_CONF_DEVID);
	switch (devid) {
	case MLXCX_CX4_DEVID:
	case MLXCX_CX4_VF_DEVID:
	case MLXCX_CX4_LX_VF_DEVID:
		mlxp->mlx_type = MLXCX_DEV_CX4;
		break;
	case MLXCX_CX5_DEVID:
	case MLXCX_CX5_VF_DEVID:
	case MLXCX_CX5_EX_DEVID:
	case MLXCX_CX5_EX_VF_DEVID:
	case MLXCX_CX5_GEN_VF_DEVID:
		mlxp->mlx_type = MLXCX_DEV_CX5;
		break;
	case MLXCX_CX6_DEVID:
	case MLXCX_CX6_VF_DEVID:
	case MLXCX_CX6_DF_DEVID:
	case MLXCX_CX6_LX_DEVID:
		mlxp->mlx_type = MLXCX_DEV_CX6;
		break;
	default:
		mlxp->mlx_type = MLXCX_DEV_UNKNOWN;
	}
}
1239 
/*
 * Map the device's main register BAR (register set MLXCX_REG_NUMBER)
 * into mlx_regs_base/mlx_regs_handle. Returns B_FALSE on failure after
 * logging a warning.
 */
static boolean_t
mlxcx_regs_map(mlxcx_t *mlxp)
{
	off_t memsize;
	int ret;
	ddi_device_acc_attr_t da;

	if (ddi_dev_regsize(mlxp->mlx_dip, MLXCX_REG_NUMBER, &memsize) !=
	    DDI_SUCCESS) {
		mlxcx_warn(mlxp, "failed to get register set size");
		return (B_FALSE);
	}

	/*
	 * All data in the main BAR is kept in big-endian even though it's a PCI
	 * device.
	 */
	bzero(&da, sizeof (ddi_device_acc_attr_t));
	da.devacc_attr_version = DDI_DEVICE_ATTR_V0;
	da.devacc_attr_endian_flags = DDI_STRUCTURE_BE_ACC;
	da.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
	/* Only request flagged (FMA-checked) access if we negotiated it. */
	if (DDI_FM_ACC_ERR_CAP(mlxp->mlx_fm_caps)) {
		da.devacc_attr_access = DDI_FLAGERR_ACC;
	} else {
		da.devacc_attr_access = DDI_DEFAULT_ACC;
	}

	ret = ddi_regs_map_setup(mlxp->mlx_dip, MLXCX_REG_NUMBER,
	    &mlxp->mlx_regs_base, 0, memsize, &da, &mlxp->mlx_regs_handle);

	if (ret != DDI_SUCCESS) {
		mlxcx_warn(mlxp, "failed to map device registers: %d", ret);
		return (B_FALSE);
	}

	return (B_TRUE);
}
1277 
1278 static boolean_t
1279 mlxcx_check_issi(mlxcx_t *mlxp)
1280 {
1281 	uint32_t issi;
1282 
1283 	if (!mlxcx_cmd_query_issi(mlxp, &issi)) {
1284 		mlxcx_warn(mlxp, "failed to get ISSI");
1285 		return (B_FALSE);
1286 	}
1287 
1288 	if ((issi & (1 << MLXCX_CURRENT_ISSI)) == 0) {
1289 		mlxcx_warn(mlxp, "hardware does not support software ISSI, "
1290 		    "hw vector 0x%x, sw version %u", issi, MLXCX_CURRENT_ISSI);
1291 		return (B_FALSE);
1292 	}
1293 
1294 	if (!mlxcx_cmd_set_issi(mlxp, MLXCX_CURRENT_ISSI)) {
1295 		mlxcx_warn(mlxp, "failed to set ISSI to %u",
1296 		    MLXCX_CURRENT_ISSI);
1297 		return (B_FALSE);
1298 	}
1299 
1300 	return (B_TRUE);
1301 }
1302 
/*
 * Allocate up to 'npages' (capped at MLXCX_MANAGE_PAGES_MAX_PAGES)
 * hardware pages of DMA memory and hand them to the HCA via the MANAGE
 * PAGES command. On success the pages are tracked in the mlx_pages AVL
 * and *ngiven is set to the count actually transferred; on failure all
 * allocated pages are freed and B_FALSE is returned.
 */
boolean_t
mlxcx_give_pages(mlxcx_t *mlxp, int32_t npages, int32_t *ngiven)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	int32_t i;
	list_t plist;
	mlxcx_dev_page_t *mdp;
	mlxcx_dev_page_t **pages;
	const ddi_dma_cookie_t *ck;

	/*
	 * If there are no pages required, then we're done here.
	 */
	if (npages <= 0) {
		*ngiven = 0;
		return (B_TRUE);
	}

	npages = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);

	pages = kmem_alloc(sizeof (*pages) * npages, KM_SLEEP);

	list_create(&plist, sizeof (mlxcx_dev_page_t),
	    offsetof(mlxcx_dev_page_t, mxdp_list));

	for (i = 0; i < npages; i++) {
		mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
		mlxcx_dma_acc_attr(mlxp, &acc);
		mlxcx_dma_page_attr(mlxp, &attr);
		if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
		    B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
			mlxcx_warn(mlxp, "failed to allocate 4k page %u/%u", i,
			    npages);
			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
			goto cleanup_npages;
		}
		/* Record the page's physical address for the command. */
		ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
		mdp->mxdp_pa = ck->dmac_laddress;

		list_insert_tail(&plist, mdp);
	}

	/*
	 * Now that all of the pages have been allocated, given them to hardware
	 * in chunks.
	 */
	for (i = 0; i < npages; i++) {
		pages[i] = list_remove_head(&plist);
	}

	if (!mlxcx_cmd_give_pages(mlxp,
	    MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, npages, pages)) {
		mlxcx_warn(mlxp, "!hardware refused our gift of %u "
		    "pages!", npages);
		/* Move the pages back onto plist so cleanup can free them. */
		for (i = 0; i < npages; i++) {
			list_insert_tail(&plist, pages[i]);
		}
		goto cleanup_npages;
	}

	mutex_enter(&mlxp->mlx_pagemtx);
	for (i = 0; i < npages; i++) {
		avl_add(&mlxp->mlx_pages, pages[i]);
	}
	mlxp->mlx_npages += npages;
	mutex_exit(&mlxp->mlx_pagemtx);

	list_destroy(&plist);
	kmem_free(pages, sizeof (*pages) * npages);

	*ngiven = npages;

	return (B_TRUE);

cleanup_npages:
	kmem_free(pages, sizeof (*pages) * npages);
	while ((mdp = list_remove_head(&plist)) != NULL) {
		mlxcx_dma_free(&mdp->mxdp_dma);
		kmem_free(mdp, sizeof (mlxcx_dev_page_t));
	}
	list_destroy(&plist);
	return (B_FALSE);
}
1387 
1388 static boolean_t
1389 mlxcx_init_pages(mlxcx_t *mlxp, uint_t type)
1390 {
1391 	int32_t npages, given;
1392 
1393 	if (!mlxcx_cmd_query_pages(mlxp, type, &npages)) {
1394 		mlxcx_warn(mlxp, "failed to determine boot pages");
1395 		return (B_FALSE);
1396 	}
1397 
1398 	while (npages > 0) {
1399 		if (!mlxcx_give_pages(mlxp, npages, &given))
1400 			return (B_FALSE);
1401 
1402 		npages -= given;
1403 	}
1404 
1405 	return (B_TRUE);
1406 }
1407 
1408 static int
1409 mlxcx_bufs_cache_constr(void *arg, void *cookie, int kmflags)
1410 {
1411 	mlxcx_t *mlxp = cookie;
1412 	mlxcx_buffer_t *b = arg;
1413 
1414 	bzero(b, sizeof (mlxcx_buffer_t));
1415 	b->mlb_mlx = mlxp;
1416 	b->mlb_state = MLXCX_BUFFER_INIT;
1417 	list_create(&b->mlb_tx_chain, sizeof (mlxcx_buffer_t),
1418 	    offsetof(mlxcx_buffer_t, mlb_tx_chain_entry));
1419 
1420 	return (0);
1421 }
1422 
1423 static void
1424 mlxcx_bufs_cache_destr(void *arg, void *cookie)
1425 {
1426 	mlxcx_t *mlxp = cookie;
1427 	mlxcx_buffer_t *b = arg;
1428 	VERIFY3P(b->mlb_mlx, ==, mlxp);
1429 	VERIFY(b->mlb_state == MLXCX_BUFFER_INIT);
1430 	list_destroy(&b->mlb_tx_chain);
1431 }
1432 
1433 mlxcx_buf_shard_t *
1434 mlxcx_mlbs_create(mlxcx_t *mlxp)
1435 {
1436 	mlxcx_buf_shard_t *s;
1437 
1438 	s = kmem_zalloc(sizeof (mlxcx_buf_shard_t), KM_SLEEP);
1439 
1440 	mutex_init(&s->mlbs_mtx, NULL, MUTEX_DRIVER,
1441 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1442 	list_create(&s->mlbs_busy, sizeof (mlxcx_buffer_t),
1443 	    offsetof(mlxcx_buffer_t, mlb_entry));
1444 	list_create(&s->mlbs_free, sizeof (mlxcx_buffer_t),
1445 	    offsetof(mlxcx_buffer_t, mlb_entry));
1446 	list_create(&s->mlbs_loaned, sizeof (mlxcx_buffer_t),
1447 	    offsetof(mlxcx_buffer_t, mlb_entry));
1448 	cv_init(&s->mlbs_free_nonempty, NULL, CV_DRIVER, NULL);
1449 
1450 	list_insert_tail(&mlxp->mlx_buf_shards, s);
1451 
1452 	return (s);
1453 }
1454 
1455 static boolean_t
1456 mlxcx_setup_bufs(mlxcx_t *mlxp)
1457 {
1458 	char namebuf[KSTAT_STRLEN];
1459 
1460 	(void) snprintf(namebuf, KSTAT_STRLEN, "mlxcx%d_bufs_cache",
1461 	    ddi_get_instance(mlxp->mlx_dip));
1462 	mlxp->mlx_bufs_cache = kmem_cache_create(namebuf,
1463 	    sizeof (mlxcx_buffer_t), sizeof (uint64_t),
1464 	    mlxcx_bufs_cache_constr, mlxcx_bufs_cache_destr,
1465 	    NULL, mlxp, NULL, 0);
1466 
1467 	list_create(&mlxp->mlx_buf_shards, sizeof (mlxcx_buf_shard_t),
1468 	    offsetof(mlxcx_buf_shard_t, mlbs_entry));
1469 
1470 	return (B_TRUE);
1471 }
1472 
/*
 * Post a "qstate.err" ereport for a queue (event/completion/send/
 * receive) found in a bad hardware state, and mark the driver's service
 * state degraded. No-op if ereports were not negotiated.
 */
static void
mlxcx_fm_qstate_ereport(mlxcx_t *mlxp, const char *qtype, uint32_t qnum,
    const char *state, uint8_t statenum)
{
	uint64_t ena;
	char buf[FM_MAX_CLASS];

	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
		return;

	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
	    MLXCX_FM_SERVICE_MLXCX, "qstate.err");
	ena = fm_ena_generate(0, FM_ENA_FMT1);

	/* Payload: both symbolic and raw state, plus queue type and number. */
	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
	    "state", DATA_TYPE_STRING, state,
	    "state_num", DATA_TYPE_UINT8, statenum,
	    "qtype", DATA_TYPE_STRING, qtype,
	    "qnum", DATA_TYPE_UINT32, qnum,
	    NULL);
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED);
}
1496 
1497 /*
1498  * The following set of routines are for monitoring the health of
1499  * event, completion and work queues. They run infrequently peeking at
1500  * the structs to catch stalls and inconsistent state.
1501  *
1502  * They peek at the structs *without* acquiring locks - we don't want
1503  * to impede flow of data. Driver start up and shutdown semantics
1504  * guarantee the structs are present and won't disappear underneath
1505  * these routines.
1506  *
1507  * As previously noted, the routines peek at active data in the structs and
1508  * they will store some values for comparison on next invocation. To
1509  * maintain integrity of the saved values, these values are only modified
1510  * within these routines.
1511  */
/*
 * Periodic (ddi_periodic) health check for event queues: query each
 * created EQ's context from the hardware, report bad status via FMA,
 * and detect EQs that we believe are armed but the hardware says are
 * not -- three consecutive sightings with no consumer-counter progress
 * are treated as a stall.
 */
static void
mlxcx_eq_check(void *arg)
{
	mlxcx_t *mlxp = (mlxcx_t *)arg;
	mlxcx_event_queue_t *eq;
	mlxcx_eventq_ctx_t ctx;
	const char *str;

	uint_t i;

	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
		eq = &mlxp->mlx_eqs[i];

		if ((eq->mleq_state & MLXCX_EQ_CREATED) == 0)
			continue;

		/*
		 * If the event queue was successfully created in the HCA,
		 * then initialization and shutdown sequences guarantee
		 * the queue exists.
		 */
		ASSERT0(eq->mleq_state & MLXCX_EQ_DESTROYED);

		if (!mlxcx_cmd_query_eq(mlxp, eq, &ctx))
			continue;

		str = "???";
		switch (ctx.mleqc_status) {
		case MLXCX_EQ_STATUS_OK:
			break;
		case MLXCX_EQ_STATUS_WRITE_FAILURE:
			str = "WRITE_FAILURE";
			break;
		}

		if (ctx.mleqc_status != MLXCX_EQ_STATUS_OK) {
			mlxcx_fm_qstate_ereport(mlxp, "event",
			    eq->mleq_num, str, ctx.mleqc_status);
			mlxcx_warn(mlxp, "EQ %u is in bad status: %x (%s)",
			    eq->mleq_intr_index, ctx.mleqc_status, str);
		}

		if (ctx.mleqc_state != MLXCX_EQ_ST_ARMED &&
		    (eq->mleq_state & MLXCX_EQ_ARMED)) {
			/*
			 * Only warn once the consumer counter has been seen
			 * unchanged across several consecutive checks.
			 */
			if (eq->mleq_cc == eq->mleq_check_disarm_cc &&
			    ++eq->mleq_check_disarm_cnt >= 3) {
				mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL);
				mlxcx_warn(mlxp, "EQ %u isn't armed",
				    eq->mleq_intr_index);
			}
			eq->mleq_check_disarm_cc = eq->mleq_cc;
		} else {
			eq->mleq_check_disarm_cc = 0;
			eq->mleq_check_disarm_cnt = 0;
		}
	}
}
1569 
/*
 * Periodic health check for completion queues, analogous to
 * mlxcx_eq_check(): query each created CQ's context, report bad status
 * once (mlcq_fm_repd_qstate suppresses repeats), and flag CQs that
 * appear armed in software but not in hardware for several consecutive
 * checks with no consumer-counter progress. CQs in polling mode are
 * deliberately exempt from the arm check.
 */
static void
mlxcx_cq_check(void *arg)
{
	mlxcx_t *mlxp = (mlxcx_t *)arg;
	mlxcx_completion_queue_t *cq;
	mlxcx_completionq_ctx_t ctx;
	const char *str, *type;
	uint_t v;

	for (cq = list_head(&mlxp->mlx_cqs); cq != NULL;
	    cq = list_next(&mlxp->mlx_cqs, cq)) {

		if ((cq->mlcq_state & MLXCX_CQ_CREATED) == 0)
			continue;

		/*
		 * If the completion queue was successfully created in the HCA,
		 * then initialization and shutdown sequences guarantee
		 * the queue exists.
		 */
		ASSERT0(cq->mlcq_state & MLXCX_CQ_DESTROYED);
		ASSERT0(cq->mlcq_state & MLXCX_CQ_TEARDOWN);

		if (cq->mlcq_fm_repd_qstate)
			continue;

		if (!mlxcx_cmd_query_cq(mlxp, cq, &ctx))
			continue;

		/* Label warnings with the direction of the attached WQ. */
		if (cq->mlcq_wq != NULL) {
			mlxcx_work_queue_t *wq = cq->mlcq_wq;
			if (wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ)
				type = "rx ";
			else if (wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ)
				type = "tx ";
			else
				type = "";
		} else {
			type = "";
		}

		str = "???";
		v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATUS);
		switch (v) {
		case MLXCX_CQC_STATUS_OK:
			break;
		case MLXCX_CQC_STATUS_OVERFLOW:
			str = "OVERFLOW";
			break;
		case MLXCX_CQC_STATUS_WRITE_FAIL:
			str = "WRITE_FAIL";
			break;
		case MLXCX_CQC_STATUS_INVALID:
			str = "INVALID";
			break;
		}

		if (v != MLXCX_CQC_STATUS_OK) {
			mlxcx_fm_qstate_ereport(mlxp, "completion",
			    cq->mlcq_num, str, v);
			mlxcx_warn(mlxp, "%sCQ 0x%x is in bad status: %x (%s)",
			    type, cq->mlcq_num, v, str);
			cq->mlcq_fm_repd_qstate = B_TRUE;
		}

		v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATE);
		if (v != MLXCX_CQC_STATE_ARMED &&
		    (cq->mlcq_state & MLXCX_CQ_ARMED) &&
		    !(cq->mlcq_state & MLXCX_CQ_POLLING)) {
			/*
			 * Only warn once the consumer counter has been seen
			 * unchanged across several consecutive checks.
			 */
			if (cq->mlcq_cc == cq->mlcq_check_disarm_cc &&
			    ++cq->mlcq_check_disarm_cnt >= 3) {
				mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL);
				mlxcx_warn(mlxp, "%sCQ 0x%x (%p) isn't armed",
				    type, cq->mlcq_num, cq);
			}
			cq->mlcq_check_disarm_cc = cq->mlcq_cc;
		} else {
			cq->mlcq_check_disarm_cnt = 0;
			cq->mlcq_check_disarm_cc = 0;
		}
	}
}
1652 
/*
 * Query the hardware state of a send queue and raise an FMA ereport if
 * it disagrees with our software state: a started SQ should be RDY, a
 * stopped one RST; ERR and anything unrecognized are always reported.
 */
void
mlxcx_check_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *sq)
{
	mlxcx_sq_ctx_t ctx;
	mlxcx_sq_state_t state;

	if (!mlxcx_cmd_query_sq(mlxp, sq, &ctx))
		return;

	/* The SQ should still be attached to the CQ we gave it. */
	ASSERT3U(from_be24(ctx.mlsqc_cqn), ==, sq->mlwq_cq->mlcq_num);
	state = get_bits32(ctx.mlsqc_flags, MLXCX_SQ_STATE);
	switch (state) {
	case MLXCX_SQ_STATE_RST:
		if (sq->mlwq_state & MLXCX_WQ_STARTED) {
			mlxcx_fm_qstate_ereport(mlxp, "send",
			    sq->mlwq_num, "RST", state);
			sq->mlwq_fm_repd_qstate = B_TRUE;
		}
		break;
	case MLXCX_SQ_STATE_RDY:
		if (!(sq->mlwq_state & MLXCX_WQ_STARTED)) {
			mlxcx_fm_qstate_ereport(mlxp, "send",
			    sq->mlwq_num, "RDY", state);
			sq->mlwq_fm_repd_qstate = B_TRUE;
		}
		break;
	case MLXCX_SQ_STATE_ERR:
		mlxcx_fm_qstate_ereport(mlxp, "send",
		    sq->mlwq_num, "ERR", state);
		sq->mlwq_fm_repd_qstate = B_TRUE;
		break;
	default:
		mlxcx_fm_qstate_ereport(mlxp, "send",
		    sq->mlwq_num, "???", state);
		sq->mlwq_fm_repd_qstate = B_TRUE;
		break;
	}
}
1691 
1692 void
1693 mlxcx_check_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *rq)
1694 {
1695 	mlxcx_rq_ctx_t ctx;
1696 	mlxcx_rq_state_t state;
1697 
1698 
1699 	if (!mlxcx_cmd_query_rq(mlxp, rq, &ctx))
1700 		return;
1701 
1702 	ASSERT3U(from_be24(ctx.mlrqc_cqn), ==, rq->mlwq_cq->mlcq_num);
1703 	state = get_bits32(ctx.mlrqc_flags, MLXCX_RQ_STATE);
1704 	switch (state) {
1705 	case MLXCX_RQ_STATE_RST:
1706 		if (rq->mlwq_state & MLXCX_WQ_STARTED) {
1707 			mlxcx_fm_qstate_ereport(mlxp, "receive",
1708 			    rq->mlwq_num, "RST", state);
1709 			rq->mlwq_fm_repd_qstate = B_TRUE;
1710 		}
1711 		break;
1712 	case MLXCX_RQ_STATE_RDY:
1713 		if (!(rq->mlwq_state & MLXCX_WQ_STARTED)) {
1714 			mlxcx_fm_qstate_ereport(mlxp, "receive",
1715 			    rq->mlwq_num, "RDY", state);
1716 			rq->mlwq_fm_repd_qstate = B_TRUE;
1717 		}
1718 		break;
1719 	case MLXCX_RQ_STATE_ERR:
1720 		mlxcx_fm_qstate_ereport(mlxp, "receive",
1721 		    rq->mlwq_num, "ERR", state);
1722 		rq->mlwq_fm_repd_qstate = B_TRUE;
1723 		break;
1724 	default:
1725 		mlxcx_fm_qstate_ereport(mlxp, "receive",
1726 		    rq->mlwq_num, "???", state);
1727 		rq->mlwq_fm_repd_qstate = B_TRUE;
1728 		break;
1729 	}
1730 }
1731 
/*
 * Periodic health check for work queues: dispatch each created WQ to
 * the type-appropriate checker (mlxcx_check_sq()/mlxcx_check_rq()).
 * Queues that have already been reported (mlwq_fm_repd_qstate) are
 * skipped to avoid repeat ereports.
 */
static void
mlxcx_wq_check(void *arg)
{
	mlxcx_t *mlxp = (mlxcx_t *)arg;
	mlxcx_work_queue_t *wq;

	for (wq = list_head(&mlxp->mlx_wqs); wq != NULL;
	    wq = list_next(&mlxp->mlx_wqs, wq)) {

		if ((wq->mlwq_state & MLXCX_WQ_CREATED) == 0)
			continue;

		/*
		 * If the work queue was successfully created in the HCA,
		 * then initialization and shutdown sequences guarantee
		 * the queue exists.
		 */
		ASSERT0(wq->mlwq_state & MLXCX_WQ_DESTROYED);
		ASSERT0(wq->mlwq_state & MLXCX_WQ_TEARDOWN);

		if (wq->mlwq_fm_repd_qstate)
			continue;

		switch (wq->mlwq_type) {
		case MLXCX_WQ_TYPE_SENDQ:
			mlxcx_check_sq(mlxp, wq);
			break;
		case MLXCX_WQ_TYPE_RECVQ:
			mlxcx_check_rq(mlxp, wq);
			break;
		}
	}
}
1765 
/*
 * Start the periodic queue-health check callbacks. Each timer is only
 * created when its interval property is non-zero (a zero interval
 * disables that check); mlxcx_teardown_checktimers() uses the same
 * conditions to delete them. Always succeeds.
 */
static boolean_t
mlxcx_setup_checktimers(mlxcx_t *mlxp)
{
	if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) {
		mlxp->mlx_eq_checktimer = ddi_periodic_add(mlxcx_eq_check, mlxp,
		    mlxp->mlx_props.mldp_eq_check_interval_sec * NANOSEC,
		    DDI_IPL_0);
	}
	if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) {
		mlxp->mlx_cq_checktimer = ddi_periodic_add(mlxcx_cq_check, mlxp,
		    mlxp->mlx_props.mldp_cq_check_interval_sec * NANOSEC,
		    DDI_IPL_0);
	}
	if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) {
		mlxp->mlx_wq_checktimer = ddi_periodic_add(mlxcx_wq_check, mlxp,
		    mlxp->mlx_props.mldp_wq_check_interval_sec * NANOSEC,
		    DDI_IPL_0);
	}
	return (B_TRUE);
}
1786 
1787 int
1788 mlxcx_dmac_fe_compare(const void *arg0, const void *arg1)
1789 {
1790 	const mlxcx_flow_entry_t *left = arg0;
1791 	const mlxcx_flow_entry_t *right = arg1;
1792 	int bcmpr;
1793 
1794 	bcmpr = memcmp(left->mlfe_dmac, right->mlfe_dmac,
1795 	    sizeof (left->mlfe_dmac));
1796 	if (bcmpr < 0)
1797 		return (-1);
1798 	if (bcmpr > 0)
1799 		return (1);
1800 	if (left->mlfe_vid < right->mlfe_vid)
1801 		return (-1);
1802 	if (left->mlfe_vid > right->mlfe_vid)
1803 		return (1);
1804 	return (0);
1805 }
1806 
1807 int
1808 mlxcx_grmac_compare(const void *arg0, const void *arg1)
1809 {
1810 	const mlxcx_group_mac_t *left = arg0;
1811 	const mlxcx_group_mac_t *right = arg1;
1812 	int bcmpr;
1813 
1814 	bcmpr = memcmp(left->mlgm_mac, right->mlgm_mac,
1815 	    sizeof (left->mlgm_mac));
1816 	if (bcmpr < 0)
1817 		return (-1);
1818 	if (bcmpr > 0)
1819 		return (1);
1820 	return (0);
1821 }
1822 
1823 int
1824 mlxcx_page_compare(const void *arg0, const void *arg1)
1825 {
1826 	const mlxcx_dev_page_t *p0 = arg0;
1827 	const mlxcx_dev_page_t *p1 = arg1;
1828 
1829 	if (p0->mxdp_pa < p1->mxdp_pa)
1830 		return (-1);
1831 	if (p0->mxdp_pa > p1->mxdp_pa)
1832 		return (1);
1833 	return (0);
1834 }
1835 
/*
 * Allocate and initialize per-port state for every port on the HCA, query
 * each port's configuration from hardware, then build each port's root RX
 * flow table with its three flow groups (broadcast, unicast/multicast,
 * and promiscuous).
 *
 * Returns B_TRUE on success.  On any failure, partially constructed port
 * state is released via mlxcx_teardown_ports() and B_FALSE is returned.
 */
static boolean_t
mlxcx_setup_ports(mlxcx_t *mlxp)
{
	uint_t i, j;
	mlxcx_port_t *p;
	mlxcx_flow_table_t *ft;
	mlxcx_flow_group_t *fg;
	mlxcx_flow_entry_t *fe;

	VERIFY3U(mlxp->mlx_nports, >, 0);
	mlxp->mlx_ports_size = mlxp->mlx_nports * sizeof (mlxcx_port_t);
	mlxp->mlx_ports = kmem_zalloc(mlxp->mlx_ports_size, KM_SLEEP);

	/*
	 * First pass: initialize each port's locks and query its current
	 * hardware state (vport context, MTU, link status, speed, FEC).
	 */
	for (i = 0; i < mlxp->mlx_nports; ++i) {
		p = &mlxp->mlx_ports[i];
		p->mlp_num = i;
		p->mlx_port_event.mla_mlx = mlxp;
		p->mlx_port_event.mla_port = p;
		mutex_init(&p->mlx_port_event.mla_mtx, NULL,
		    MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_async_intr_pri));
		p->mlp_init |= MLXCX_PORT_INIT;
		mutex_init(&p->mlp_mtx, NULL, MUTEX_DRIVER,
		    DDI_INTR_PRI(mlxp->mlx_intr_pri));
		mutex_enter(&p->mlp_mtx);
		if (!mlxcx_cmd_query_nic_vport_ctx(mlxp, p)) {
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		if (!mlxcx_cmd_query_port_mtu(mlxp, p)) {
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		if (!mlxcx_cmd_query_port_status(mlxp, p)) {
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		if (!mlxcx_cmd_query_port_speed(mlxp, p)) {
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		if (!mlxcx_cmd_modify_nic_vport_ctx(mlxp, p,
		    MLXCX_MODIFY_NIC_VPORT_CTX_PROMISC)) {
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		if (!mlxcx_cmd_query_port_fec(mlxp, p)) {
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		p->mlp_fec_requested = LINK_FEC_AUTO;

		mutex_exit(&p->mlp_mtx);
	}

	/*
	 * Second pass: create each port's root RX flow table in the HCA
	 * and populate it with its flow groups.
	 */
	for (i = 0; i < mlxp->mlx_nports; ++i) {
		p = &mlxp->mlx_ports[i];
		mutex_enter(&p->mlp_mtx);
		p->mlp_rx_flow = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
		    KM_SLEEP));
		mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
		    DDI_INTR_PRI(mlxp->mlx_intr_pri));

		mutex_enter(&ft->mlft_mtx);

		ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
		ft->mlft_port = p;
		/* Table size is configurable, clamped by the HCA's max. */
		ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_root_size_shift;
		if (ft->mlft_entshift > mlxp->mlx_caps->mlc_max_rx_ft_shift)
			ft->mlft_entshift = mlxp->mlx_caps->mlc_max_rx_ft_shift;
		ft->mlft_nents = (1 << ft->mlft_entshift);
		ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
		ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
		list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
		    offsetof(mlxcx_flow_group_t, mlfg_entry));

		for (j = 0; j < ft->mlft_nents; ++j) {
			ft->mlft_ent[j].mlfe_table = ft;
			ft->mlft_ent[j].mlfe_index = j;
		}

		if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
			mutex_exit(&ft->mlft_mtx);
			mutex_exit(&p->mlp_mtx);
			goto err;
		}

		if (!mlxcx_cmd_set_flow_table_root(mlxp, ft)) {
			mutex_exit(&ft->mlft_mtx);
			mutex_exit(&p->mlp_mtx);
			goto err;
		}

		/*
		 * We match broadcast at the top of the root flow table, then
		 * all multicast/unicast MACs, then the promisc entry is down
		 * the very bottom.
		 *
		 * This way when promisc is on, that entry simply catches any
		 * remaining traffic that earlier flows haven't matched.
		 */
		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
		list_insert_tail(&ft->mlft_groups, fg);
		fg->mlfg_table = ft;
		fg->mlfg_size = 1;
		fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC;
		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
			mutex_exit(&ft->mlft_mtx);
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		p->mlp_bcast = fg;
		/* The broadcast entry matches DMAC ff:ff:ff:ff:ff:ff. */
		fe = list_head(&fg->mlfg_entries);
		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
		(void) memset(fe->mlfe_dmac, 0xff, sizeof (fe->mlfe_dmac));
		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;

		/* Unicast/multicast group: all remaining entries bar one. */
		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
		list_insert_tail(&ft->mlft_groups, fg);
		fg->mlfg_table = ft;
		fg->mlfg_size = ft->mlft_nents - 2;
		fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC;
		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
			mutex_exit(&ft->mlft_mtx);
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		p->mlp_umcast = fg;

		/* Promisc group: a single match-anything entry at the end. */
		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
		list_insert_tail(&ft->mlft_groups, fg);
		fg->mlfg_table = ft;
		fg->mlfg_size = 1;
		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
			mutex_exit(&ft->mlft_mtx);
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		p->mlp_promisc = fg;
		fe = list_head(&fg->mlfg_entries);
		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;

		/* Index of flow entries by (DMAC, VID); see dmac_fe_compare. */
		avl_create(&p->mlp_dmac_fe, mlxcx_dmac_fe_compare,
		    sizeof (mlxcx_flow_entry_t), offsetof(mlxcx_flow_entry_t,
		    mlfe_dmac_entry));

		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&p->mlp_mtx);
	}

	return (B_TRUE);

err:
	mlxcx_teardown_ports(mlxp);
	return (B_FALSE);
}
1992 
/*
 * Remove every VLAN filter entry attached to ring group "g", returning
 * the group to its default state where the catch-all entry accepts any
 * VLAN.  Caller must hold g->mlg_mtx.
 */
void
mlxcx_remove_all_vlan_entries(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
	mlxcx_flow_entry_t *fe;
	mlxcx_group_vlan_t *v;

	ASSERT(mutex_owned(&g->mlg_mtx));

	mutex_enter(&ft->mlft_mtx);

	/*
	 * Re-enable the default (catch-all) entry before deleting the
	 * specific VLAN entries, so traffic is not dropped in between.
	 */
	if (!list_is_empty(&g->mlg_rx_vlans)) {
		fe = list_head(&dfg->mlfg_entries);
		(void) mlxcx_cmd_set_flow_table_entry(mlxp, fe);
	}

	while ((v = list_remove_head(&g->mlg_rx_vlans)) != NULL) {
		fe = v->mlgv_fe;
		ASSERT3P(fe->mlfe_table, ==, ft);
		ASSERT3P(fe->mlfe_group, ==, fg);
		kmem_free(v, sizeof (mlxcx_group_vlan_t));

		/* Delete from hardware, then release the entry for reuse. */
		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
	}

	mutex_exit(&ft->mlft_mtx);
}
2023 
/*
 * Remove the VLAN filter entry matching (tagged, vid) from ring group "g".
 * If it is the last VLAN entry, the group's default catch-all entry is
 * re-enabled first so the group goes back to accepting all VLANs without
 * a window where traffic is dropped.  Caller must hold g->mlg_mtx.
 *
 * Returns B_FALSE if no matching entry exists or a hardware update failed
 * (in which case the entry is restored).
 */
boolean_t
mlxcx_remove_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
    boolean_t tagged, uint16_t vid)
{
	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
	mlxcx_flow_entry_t *fe;
	mlxcx_group_vlan_t *v;
	boolean_t found = B_FALSE;

	ASSERT(mutex_owned(&g->mlg_mtx));

	mutex_enter(&ft->mlft_mtx);

	/* Find the software state tracking this (tagged, vid) pair. */
	for (v = list_head(&g->mlg_rx_vlans); v != NULL;
	    v = list_next(&g->mlg_rx_vlans, v)) {
		if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) {
			found = B_TRUE;
			break;
		}
	}
	if (!found) {
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}

	list_remove(&g->mlg_rx_vlans, v);

	/*
	 * If this is the last VLAN entry, we have to go back to accepting
	 * any VLAN (which means re-enabling the default entry).
	 *
	 * Do this before we remove the flow entry for the last specific
	 * VLAN so that we don't lose any traffic in the transition.
	 */
	if (list_is_empty(&g->mlg_rx_vlans)) {
		fe = list_head(&dfg->mlfg_entries);
		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
			/* Roll back: put the VLAN back on the list. */
			list_insert_tail(&g->mlg_rx_vlans, v);
			mutex_exit(&ft->mlft_mtx);
			return (B_FALSE);
		}
	}

	fe = v->mlgv_fe;
	ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED);
	ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED);
	ASSERT3P(fe->mlfe_table, ==, ft);
	ASSERT3P(fe->mlfe_group, ==, fg);

	if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
		/*
		 * Roll back: restore the VLAN and undo the default-entry
		 * enable that may have happened just above.
		 */
		list_insert_tail(&g->mlg_rx_vlans, v);
		fe = list_head(&dfg->mlfg_entries);
		if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
			(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
		}
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}

	fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;

	kmem_free(v, sizeof (mlxcx_group_vlan_t));

	mutex_exit(&ft->mlft_mtx);
	return (B_TRUE);
}
2092 
/*
 * Add a VLAN filter entry to ring group "g" so it receives traffic for
 * the given VLAN ID ("tagged" distinguishes a real tag from untagged
 * traffic).  Once the first specific VLAN entry exists, the group's
 * default catch-all entry is disabled.  Caller must hold g->mlg_mtx.
 *
 * Returns B_TRUE if the entry exists on return (including when it was
 * already present); B_FALSE if no free flow entry was available or the
 * hardware update failed.
 */
boolean_t
mlxcx_add_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g, boolean_t tagged,
    uint16_t vid)
{
	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
	mlxcx_flow_entry_t *fe;
	mlxcx_group_vlan_t *v;
	boolean_t found = B_FALSE;
	boolean_t first = B_FALSE;

	ASSERT(mutex_owned(&g->mlg_mtx));

	mutex_enter(&ft->mlft_mtx);

	/* Already present?  Nothing to do. */
	for (v = list_head(&g->mlg_rx_vlans); v != NULL;
	    v = list_next(&g->mlg_rx_vlans, v)) {
		if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) {
			mutex_exit(&ft->mlft_mtx);
			return (B_TRUE);
		}
	}
	if (list_is_empty(&g->mlg_rx_vlans))
		first = B_TRUE;

	/* Find an unreserved flow entry in the group's VLAN flow group. */
	for (fe = list_head(&fg->mlfg_entries); fe != NULL;
	    fe = list_next(&fg->mlfg_entries, fe)) {
		if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) {
			found = B_TRUE;
			break;
		}
	}
	if (!found) {
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}

	v = kmem_zalloc(sizeof (mlxcx_group_vlan_t), KM_SLEEP);
	v->mlgv_fe = fe;
	v->mlgv_tagged = tagged;
	v->mlgv_vid = vid;

	fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED;
	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
	fe->mlfe_vid = vid;
	if (tagged) {
		fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_CVLAN;
	} else {
		fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_NONE;
	}

	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		/* Roll back the reservation on hardware failure. */
		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY;
		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
		kmem_free(v, sizeof (mlxcx_group_vlan_t));
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}

	list_insert_tail(&g->mlg_rx_vlans, v);

	/*
	 * If the vlan list was empty for this group before adding this one,
	 * then we no longer want the "default" entry to allow all VLANs
	 * through.
	 */
	if (first) {
		fe = list_head(&dfg->mlfg_entries);
		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
	}

	mutex_exit(&ft->mlft_mtx);
	return (B_TRUE);
}
2168 
/*
 * Detach every unicast/multicast MAC address owned by "group" from the
 * port's root flow table.  Each affected flow entry loses this group as a
 * destination; entries left with no destinations at all are deleted from
 * hardware and released.  Caller must hold both the port and group
 * mutexes.
 */
void
mlxcx_remove_all_umcast_entries(mlxcx_t *mlxp, mlxcx_port_t *port,
    mlxcx_ring_group_t *group)
{
	mlxcx_flow_entry_t *fe;
	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
	mlxcx_group_mac_t *gm, *ngm;

	ASSERT(mutex_owned(&port->mlp_mtx));
	ASSERT(mutex_owned(&group->mlg_mtx));

	mutex_enter(&ft->mlft_mtx);

	/*
	 * Walk with an explicit "next" pointer since each iteration removes
	 * gm from the AVL (and gm is reused for the inner list walk below).
	 */
	gm = avl_first(&group->mlg_rx_macs);
	for (; gm != NULL; gm = ngm) {
		ngm = AVL_NEXT(&group->mlg_rx_macs, gm);

		ASSERT3P(gm->mlgm_group, ==, group);
		fe = gm->mlgm_fe;
		ASSERT3P(fe->mlfe_table, ==, ft);

		avl_remove(&group->mlg_rx_macs, gm);
		list_remove(&fe->mlfe_ring_groups, gm);
		kmem_free(gm, sizeof (mlxcx_group_mac_t));

		/* Rebuild the destination list from the remaining groups. */
		fe->mlfe_ndest = 0;
		for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL;
		    gm = list_next(&fe->mlfe_ring_groups, gm)) {
			fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow =
			    gm->mlgm_group->mlg_rx_vlan_ft;
		}
		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;

		if (fe->mlfe_ndest > 0) {
			(void) mlxcx_cmd_set_flow_table_entry(mlxp, fe);
			continue;
		}

		/*
		 * There are no more ring groups left for this MAC (it wasn't
		 * attached to any other groups since ndest == 0), so clean up
		 * its flow entry.
		 */
		avl_remove(&port->mlp_dmac_fe, fe);
		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
		list_destroy(&fe->mlfe_ring_groups);
		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
	}

	mutex_exit(&ft->mlft_mtx);
}
2220 
/*
 * Remove a single unicast/multicast MAC address from "group".  The group
 * is dropped as a destination from the port's flow entry for that MAC; if
 * no other group still uses the MAC, the flow entry itself is deleted
 * from hardware and released.  Caller must hold both the port and group
 * mutexes.
 *
 * Returns B_FALSE if the MAC is not present on this group or the hardware
 * update failed.
 */
boolean_t
mlxcx_remove_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port,
    mlxcx_ring_group_t *group, const uint8_t *macaddr)
{
	mlxcx_flow_entry_t *fe;
	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
	mlxcx_group_mac_t *gm, probe;

	ASSERT(mutex_owned(&port->mlp_mtx));
	ASSERT(mutex_owned(&group->mlg_mtx));

	/* Stack-allocated key for the AVL lookup. */
	bzero(&probe, sizeof (probe));
	bcopy(macaddr, probe.mlgm_mac, sizeof (probe.mlgm_mac));

	mutex_enter(&ft->mlft_mtx);

	gm = avl_find(&group->mlg_rx_macs, &probe, NULL);
	if (gm == NULL) {
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}
	ASSERT3P(gm->mlgm_group, ==, group);
	ASSERT0(bcmp(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac)));

	fe = gm->mlgm_fe;
	ASSERT3P(fe->mlfe_table, ==, ft);
	ASSERT0(bcmp(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac)));

	list_remove(&fe->mlfe_ring_groups, gm);
	avl_remove(&group->mlg_rx_macs, gm);
	kmem_free(gm, sizeof (mlxcx_group_mac_t));

	/* Rebuild the destination list from the remaining groups. */
	fe->mlfe_ndest = 0;
	for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL;
	    gm = list_next(&fe->mlfe_ring_groups, gm)) {
		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow =
		    gm->mlgm_group->mlg_rx_vlan_ft;
	}
	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;

	/* Other groups still use this MAC: just update the entry. */
	if (fe->mlfe_ndest > 0) {
		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
			mutex_exit(&ft->mlft_mtx);
			return (B_FALSE);
		}
		mutex_exit(&ft->mlft_mtx);
		return (B_TRUE);
	}

	/*
	 * There are no more ring groups left for this MAC (it wasn't attached
	 * to any other groups since ndest == 0), so clean up its flow entry.
	 */
	avl_remove(&port->mlp_dmac_fe, fe);
	(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
	list_destroy(&fe->mlfe_ring_groups);

	fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;

	mutex_exit(&ft->mlft_mtx);

	return (B_TRUE);
}
2284 
/*
 * Direct traffic for a unicast/multicast MAC address to "group", by adding
 * the group's VLAN flow table as a destination on the port's flow entry
 * for that MAC.  If the MAC was not yet present on the port, a fresh
 * entry is allocated from the port's umcast flow group.  Caller must hold
 * both the port and group mutexes.
 *
 * Returns B_FALSE if no free flow entry remained or the hardware update
 * failed.
 */
boolean_t
mlxcx_add_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port,
    mlxcx_ring_group_t *group, const uint8_t *macaddr)
{
	mlxcx_flow_group_t *fg;
	mlxcx_flow_entry_t *fe, probe;
	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
	mlxcx_group_mac_t *gm;
	boolean_t found = B_FALSE;

	ASSERT(mutex_owned(&port->mlp_mtx));
	ASSERT(mutex_owned(&group->mlg_mtx));

	/* Stack-allocated key for the AVL lookup. */
	bzero(&probe, sizeof (probe));
	bcopy(macaddr, probe.mlfe_dmac, sizeof (probe.mlfe_dmac));

	mutex_enter(&ft->mlft_mtx);

	fe = avl_find(&port->mlp_dmac_fe, &probe, NULL);

	if (fe == NULL) {
		/* New MAC: reserve an unused entry in the umcast group. */
		fg = port->mlp_umcast;
		for (fe = list_head(&fg->mlfg_entries); fe != NULL;
		    fe = list_next(&fg->mlfg_entries, fe)) {
			if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) {
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			mutex_exit(&ft->mlft_mtx);
			return (B_FALSE);
		}
		list_create(&fe->mlfe_ring_groups, sizeof (mlxcx_group_mac_t),
		    offsetof(mlxcx_group_mac_t, mlgm_fe_entry));
		fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED;
		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
		bcopy(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac));

		avl_add(&port->mlp_dmac_fe, fe);
	}

	/* Add this group's VLAN table as a destination for the entry. */
	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = group->mlg_rx_vlan_ft;
	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;

	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		/* Roll back; release the entry if we were its only user. */
		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY;
		if (--fe->mlfe_ndest == 0) {
			fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
		}
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}

	/* Track the MAC on both the group (AVL) and the entry (list). */
	gm = kmem_zalloc(sizeof (mlxcx_group_mac_t), KM_SLEEP);
	gm->mlgm_group = group;
	gm->mlgm_fe = fe;
	bcopy(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac));
	avl_add(&group->mlg_rx_macs, gm);
	list_insert_tail(&fe->mlfe_ring_groups, gm);

	mutex_exit(&ft->mlft_mtx);

	return (B_TRUE);
}
2350 
2351 boolean_t
2352 mlxcx_setup_flow_group(mlxcx_t *mlxp, mlxcx_flow_table_t *ft,
2353     mlxcx_flow_group_t *fg)
2354 {
2355 	mlxcx_flow_entry_t *fe;
2356 	uint_t i, idx;
2357 
2358 	ASSERT(mutex_owned(&ft->mlft_mtx));
2359 	ASSERT(ft->mlft_state & MLXCX_FLOW_TABLE_CREATED);
2360 	ASSERT3P(fg->mlfg_table, ==, ft);
2361 
2362 	if (ft->mlft_next_ent + fg->mlfg_size > ft->mlft_nents)
2363 		return (B_FALSE);
2364 	fg->mlfg_start_idx = ft->mlft_next_ent;
2365 
2366 	if (!mlxcx_cmd_create_flow_group(mlxp, fg)) {
2367 		return (B_FALSE);
2368 	}
2369 
2370 	list_create(&fg->mlfg_entries, sizeof (mlxcx_flow_entry_t),
2371 	    offsetof(mlxcx_flow_entry_t, mlfe_group_entry));
2372 	for (i = 0; i < fg->mlfg_size; ++i) {
2373 		idx = fg->mlfg_start_idx + i;
2374 		fe = &ft->mlft_ent[idx];
2375 		fe->mlfe_group = fg;
2376 		list_insert_tail(&fg->mlfg_entries, fe);
2377 	}
2378 	fg->mlfg_avail = fg->mlfg_size;
2379 	ft->mlft_next_ent += fg->mlfg_size;
2380 
2381 	return (B_TRUE);
2382 }
2383 
/*
 * Allocate, create and enable a single event queue on interrupt vector
 * "vec", subscribed to the event types in the "events" bitmask.
 *
 * Returns B_FALSE on failure; any DMA memory or HCA state already set up
 * here is deliberately left in place for mlxcx_teardown_eqs() to clean up.
 */
static boolean_t
mlxcx_setup_eq(mlxcx_t *mlxp, uint_t vec, uint64_t events)
{
	mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[vec];

	mutex_enter(&mleq->mleq_mtx);
	if (!mlxcx_eq_alloc_dma(mlxp, mleq)) {
		/* mlxcx_teardown_eqs() will clean this up */
		mutex_exit(&mleq->mleq_mtx);
		return (B_FALSE);
	}
	mleq->mleq_mlx = mlxp;
	mleq->mleq_uar = &mlxp->mlx_uar;
	mleq->mleq_events = events;
	mleq->mleq_intr_index = vec;

	if (!mlxcx_cmd_create_eq(mlxp, mleq)) {
		/* mlxcx_teardown_eqs() will clean this up */
		mutex_exit(&mleq->mleq_mtx);
		return (B_FALSE);
	}

	if (ddi_intr_enable(mlxp->mlx_intr_handles[vec]) != DDI_SUCCESS) {
		/*
		 * mlxcx_teardown_eqs() will handle calling cmd_destroy_eq and
		 * eq_rele_dma
		 */
		mutex_exit(&mleq->mleq_mtx);
		return (B_FALSE);
	}
	/*
	 * ATTACHING is cleared later by mlxcx_eq_set_attached() once the
	 * whole attach sequence has completed.
	 */
	mleq->mleq_state |= MLXCX_EQ_INTR_ENABLED;
	mleq->mleq_state |= MLXCX_EQ_ATTACHING;
	mlxcx_arm_eq(mlxp, mleq);
	mutex_exit(&mleq->mleq_mtx);

	return (B_TRUE);
}
2421 
2422 static void
2423 mlxcx_eq_set_attached(mlxcx_t *mlxp)
2424 {
2425 	uint_t vec;
2426 	mlxcx_event_queue_t *mleq;
2427 
2428 	for (vec = 0; vec < mlxp->mlx_intr_count; ++vec) {
2429 		mleq = &mlxp->mlx_eqs[vec];
2430 
2431 		mutex_enter(&mleq->mleq_mtx);
2432 		mleq->mleq_state &= ~MLXCX_EQ_ATTACHING;
2433 		mutex_exit(&mleq->mleq_mtx);
2434 	}
2435 }
2436 
2437 static boolean_t
2438 mlxcx_setup_async_eqs(mlxcx_t *mlxp)
2439 {
2440 	boolean_t ret;
2441 
2442 	ret = mlxcx_setup_eq(mlxp, 0,
2443 	    (1ULL << MLXCX_EVENT_CMD_COMPLETION) |
2444 	    (1ULL << MLXCX_EVENT_PAGE_REQUEST) |
2445 	    (1ULL << MLXCX_EVENT_PORT_STATE) |
2446 	    (1ULL << MLXCX_EVENT_INTERNAL_ERROR) |
2447 	    (1ULL << MLXCX_EVENT_PORT_MODULE) |
2448 	    (1ULL << MLXCX_EVENT_SENDQ_DRAIN) |
2449 	    (1ULL << MLXCX_EVENT_LAST_WQE) |
2450 	    (1ULL << MLXCX_EVENT_CQ_ERROR) |
2451 	    (1ULL << MLXCX_EVENT_WQ_CATASTROPHE) |
2452 	    (1ULL << MLXCX_EVENT_PAGE_FAULT) |
2453 	    (1ULL << MLXCX_EVENT_WQ_INVALID_REQ) |
2454 	    (1ULL << MLXCX_EVENT_WQ_ACCESS_VIOL) |
2455 	    (1ULL << MLXCX_EVENT_NIC_VPORT) |
2456 	    (1ULL << MLXCX_EVENT_DOORBELL_CONGEST));
2457 
2458 	if (ret)
2459 		mlxcx_cmd_eq_enable(mlxp);
2460 
2461 	return (ret);
2462 }
2463 
2464 int
2465 mlxcx_cq_compare(const void *arg0, const void *arg1)
2466 {
2467 	const mlxcx_completion_queue_t *left = arg0;
2468 	const mlxcx_completion_queue_t *right = arg1;
2469 
2470 	if (left->mlcq_num < right->mlcq_num) {
2471 		return (-1);
2472 	}
2473 	if (left->mlcq_num > right->mlcq_num) {
2474 		return (1);
2475 	}
2476 	return (0);
2477 }
2478 
/*
 * Create the completion event queues: one per interrupt vector, starting
 * at mlx_intr_cq0 (vectors below that are reserved for async events; see
 * mlxcx_setup_async_eqs()).  Each EQ gets optional interrupt moderation
 * and has its interrupt enabled before being armed.
 *
 * Returns B_FALSE on failure; already-created DMA/HCA state is left for
 * the teardown path to release.
 */
static boolean_t
mlxcx_setup_eqs(mlxcx_t *mlxp)
{
	uint_t i;
	mlxcx_event_queue_t *mleq;

	ASSERT3S(mlxp->mlx_intr_count, >, 0);

	for (i = mlxp->mlx_intr_cq0; i < mlxp->mlx_intr_count; ++i) {
		mleq = &mlxp->mlx_eqs[i];
		mutex_enter(&mleq->mleq_mtx);
		if (!mlxcx_eq_alloc_dma(mlxp, mleq)) {
			mutex_exit(&mleq->mleq_mtx);
			return (B_FALSE);
		}
		mleq->mleq_uar = &mlxp->mlx_uar;
		if (!mlxcx_cmd_create_eq(mlxp, mleq)) {
			/* mlxcx_teardown() will handle calling eq_rele_dma */
			mutex_exit(&mleq->mleq_mtx);
			return (B_FALSE);
		}
		/* Interrupt moderation is optional (0 disables it). */
		if (mlxp->mlx_props.mldp_intrmod_period_usec != 0 &&
		    !mlxcx_cmd_set_int_mod(mlxp, i,
		    mlxp->mlx_props.mldp_intrmod_period_usec)) {
			mutex_exit(&mleq->mleq_mtx);
			return (B_FALSE);
		}
		if (ddi_intr_enable(mlxp->mlx_intr_handles[i]) != DDI_SUCCESS) {
			mutex_exit(&mleq->mleq_mtx);
			return (B_FALSE);
		}
		mleq->mleq_state |= MLXCX_EQ_INTR_ENABLED;
		mlxcx_arm_eq(mlxp, mleq);
		mutex_exit(&mleq->mleq_mtx);
	}

	/*
	 * NOTE(review): mlx_next_eq appears to track the next EQ to hand
	 * a completion queue; the consumer is elsewhere in this file.
	 */
	mlxp->mlx_next_eq = mlxp->mlx_intr_cq0;

	return (B_TRUE);
}
2519 
2520 /*
2521  * A more recent ConnectX part will have the Port CApability Mask register.
2522  * Explore it and note things here.
2523  */
2524 static void
2525 mlxcx_explore_pcam(mlxcx_t *mlxp, mlxcx_caps_t *c)
2526 {
2527 	mlxcx_register_data_t data;
2528 	mlxcx_reg_pcam_t *pcam = &data.mlrd_pcam;
2529 
2530 	ASSERT(c->mlc_pcam);
2531 	bzero(&data, sizeof (data));
2532 
2533 	/*
2534 	 * Okay, so we have access the the Ports CApability Mask (PCAM).
2535 	 * There are various things we need to check about it.
2536 	 */
2537 
2538 	VERIFY(mlxcx_cmd_access_register(mlxp, MLXCX_CMD_ACCESS_REGISTER_READ,
2539 	    MLXCX_REG_PCAM, &data));
2540 
2541 	/*
2542 	 * NOTE: These ASSERT()s may change in future mlxcx(4D) parts.
2543 	 * As of now, only 0 is valid, and 1-255 are reserved.  A future part
2544 	 * may return non-zero in these fields.
2545 	 */
2546 	ASSERT0(pcam->mlrd_pcam_feature_group);
2547 	ASSERT0(pcam->mlrd_pcam_access_reg_group);
2548 
2549 	c->mlc_ext_ptys = get_bit64(pcam->mlrd_pcam_feature_cap_mask_low,
2550 	    MLXCX_PCAM_LOW_FFLAGS_PTYS_EXTENDED);
2551 }
2552 
2553 /*
2554  * Snapshot all of the hardware capabilities that we care about and then modify
2555  * the HCA capabilities to get things moving.
2556  */
2557 static boolean_t
2558 mlxcx_init_caps(mlxcx_t *mlxp)
2559 {
2560 	mlxcx_caps_t *c;
2561 
2562 	mlxp->mlx_caps = c = kmem_zalloc(sizeof (mlxcx_caps_t), KM_SLEEP);
2563 
2564 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
2565 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_hca_cur)) {
2566 		mlxcx_warn(mlxp, "failed to obtain current HCA general caps");
2567 	}
2568 
2569 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
2570 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_hca_max)) {
2571 		mlxcx_warn(mlxp, "failed to obtain maximum HCA general caps");
2572 	}
2573 
2574 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
2575 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_ether_cur)) {
2576 		mlxcx_warn(mlxp, "failed to obtain current HCA eth caps");
2577 	}
2578 
2579 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
2580 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_ether_max)) {
2581 		mlxcx_warn(mlxp, "failed to obtain maximum HCA eth caps");
2582 	}
2583 
2584 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
2585 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_nic_flow_cur)) {
2586 		mlxcx_warn(mlxp, "failed to obtain current HCA flow caps");
2587 	}
2588 
2589 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
2590 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_nic_flow_max)) {
2591 		mlxcx_warn(mlxp, "failed to obtain maximum HCA flow caps");
2592 	}
2593 
2594 	/*
2595 	 * Check the caps meet our requirements.
2596 	 */
2597 	const mlxcx_hca_cap_general_caps_t *gen = &c->mlc_hca_cur.mhc_general;
2598 
2599 	if (gen->mlcap_general_log_pg_sz != 12) {
2600 		mlxcx_warn(mlxp, "!hardware has page size != 4k "
2601 		    "(log_pg_sz = %u)", (uint_t)gen->mlcap_general_log_pg_sz);
2602 		goto err;
2603 	}
2604 	if (gen->mlcap_general_cqe_version != 1) {
2605 		mlxcx_warn(mlxp, "!hardware does not support CQE v1 "
2606 		    "(cqe_ver = %u)", (uint_t)gen->mlcap_general_cqe_version);
2607 		goto err;
2608 	}
2609 	if (gen->mlcap_general_port_type !=
2610 	    MLXCX_CAP_GENERAL_PORT_TYPE_ETHERNET) {
2611 		mlxcx_warn(mlxp, "!hardware has non-ethernet ports");
2612 		goto err;
2613 	}
2614 	mlxp->mlx_nports = gen->mlcap_general_num_ports;
2615 	mlxp->mlx_max_sdu = (1 << (gen->mlcap_general_log_max_msg & 0x1F));
2616 
2617 	if (mlxp->mlx_type >= MLXCX_DEV_CX5 &&
2618 	    get_bit16(gen->mlcap_general_flags_c,
2619 	    MLXCX_CAP_GENERAL_FLAGS_C_PCAM_REG)) {
2620 		c->mlc_pcam = B_TRUE;
2621 	}
2622 
2623 	c->mlc_max_tir = (1 << gen->mlcap_general_log_max_tir);
2624 
2625 	c->mlc_checksum = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags,
2626 	    MLXCX_ETH_CAP_CSUM_CAP);
2627 	c->mlc_vxlan = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags,
2628 	    MLXCX_ETH_CAP_TUNNEL_STATELESS_VXLAN);
2629 
2630 	c->mlc_max_lso_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth.
2631 	    mlcap_eth_flags, MLXCX_ETH_CAP_MAX_LSO_CAP));
2632 	if (c->mlc_max_lso_size == 1) {
2633 		c->mlc_max_lso_size = 0;
2634 		c->mlc_lso = B_FALSE;
2635 	} else {
2636 		c->mlc_lso = B_TRUE;
2637 	}
2638 
2639 	c->mlc_max_rqt_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth.
2640 	    mlcap_eth_flags, MLXCX_ETH_CAP_RSS_IND_TBL_CAP));
2641 
2642 	if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2643 	    mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_SUPPORT)) {
2644 		mlxcx_warn(mlxp, "!hardware does not support rx flow tables");
2645 		goto err;
2646 	}
2647 	if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2648 	    mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_MODIFY)) {
2649 		mlxcx_warn(mlxp, "!hardware does not support modifying rx "
2650 		    "flow table entries");
2651 		goto err;
2652 	}
2653 
2654 	c->mlc_max_rx_ft_shift = c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2655 	    mlcap_flow_prop_log_max_ft_size;
2656 	c->mlc_max_rx_flows = (1 << c->mlc_nic_flow_cur.mhc_flow.
2657 	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_flow);
2658 	c->mlc_max_rx_ft = (1 << c->mlc_nic_flow_cur.mhc_flow.
2659 	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_ft_num);
2660 	c->mlc_max_rx_fe_dest = (1 << c->mlc_nic_flow_cur.mhc_flow.
2661 	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_destination);
2662 
2663 	return (B_TRUE);
2664 
2665 err:
2666 	kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
2667 	return (B_FALSE);
2668 }
2669 
2670 static int
2671 mlxcx_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2672 {
2673 	mlxcx_t *mlxp;
2674 
2675 	if (cmd != DDI_DETACH)
2676 		return (DDI_FAILURE);
2677 
2678 	mlxp = ddi_get_driver_private(dip);
2679 	if (mlxp == NULL) {
2680 		mlxcx_warn(NULL, "asked to detach, but missing instance "
2681 		    "private data");
2682 		return (DDI_FAILURE);
2683 	}
2684 
2685 	if (mlxp->mlx_attach & MLXCX_ATTACH_MAC_HDL) {
2686 		if (mac_unregister(mlxp->mlx_mac_hdl) != DDI_SUCCESS) {
2687 			return (DDI_FAILURE);
2688 		}
2689 		mlxp->mlx_attach &= ~MLXCX_ATTACH_MAC_HDL;
2690 	}
2691 
2692 	mlxcx_teardown(mlxp);
2693 	return (DDI_SUCCESS);
2694 }
2695 
2696 static size_t
2697 mlxcx_calc_rx_ngroups(mlxcx_t *mlxp)
2698 {
2699 	size_t ngroups = mlxp->mlx_props.mldp_rx_ngroups_large +
2700 	    mlxp->mlx_props.mldp_rx_ngroups_small;
2701 	size_t tirlim, flowlim, gflowlim;
2702 
2703 	tirlim = mlxp->mlx_caps->mlc_max_tir / MLXCX_TIRS_PER_GROUP;
2704 	if (tirlim < ngroups) {
2705 		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2706 		    "on number of TIRs available", tirlim);
2707 		ngroups = tirlim;
2708 	}
2709 
2710 	flowlim = (1 << mlxp->mlx_caps->mlc_max_rx_ft_shift) - 2;
2711 	if (flowlim < ngroups) {
2712 		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2713 		    "on max size of RX flow tables", flowlim);
2714 		ngroups = flowlim;
2715 	}
2716 
2717 	/*
2718 	 * Restrict the number of groups not to exceed the max flow
2719 	 * table number from the devices capabilities.
2720 	 * There is one root table entry per port and 2 entries per
2721 	 * group.
2722 	 */
2723 	flowlim = (mlxp->mlx_caps->mlc_max_rx_ft - mlxp->mlx_nports) / 2;
2724 	if (flowlim < ngroups) {
2725 		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2726 		    "on max number of RX flow tables",
2727 		    flowlim);
2728 		ngroups = flowlim;
2729 	}
2730 
2731 	do {
2732 		gflowlim = mlxp->mlx_caps->mlc_max_rx_flows - 16 * ngroups - 2;
2733 		if (gflowlim < ngroups) {
2734 			mlxcx_note(mlxp, "limiting number of rx groups to %u "
2735 			    "based on max total RX flows", gflowlim);
2736 			--ngroups;
2737 		}
2738 	} while (gflowlim < ngroups);
2739 
2740 	return (ngroups);
2741 }
2742 
2743 static int
2744 mlxcx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2745 {
2746 	mlxcx_t *mlxp;
2747 	char tq_name[TASKQ_NAMELEN];
2748 	uint_t i;
2749 	int inst, ret;
2750 
2751 	if (cmd != DDI_ATTACH)
2752 		return (DDI_FAILURE);
2753 
2754 	inst = ddi_get_instance(dip);
2755 	ret = ddi_soft_state_zalloc(mlxcx_softstate, inst);
2756 	if (ret != 0)
2757 		return (ret);
2758 
2759 	mlxp = ddi_get_soft_state(mlxcx_softstate, inst);
2760 	if (mlxp == NULL)
2761 		return (DDI_FAILURE);
2762 	mlxp->mlx_dip = dip;
2763 	mlxp->mlx_inst = inst;
2764 	ddi_set_driver_private(dip, mlxp);
2765 
2766 	mlxcx_load_props(mlxp);
2767 
2768 	mlxcx_fm_init(mlxp);
2769 	mlxp->mlx_attach |= MLXCX_ATTACH_FM;
2770 
2771 	if (pci_config_setup(mlxp->mlx_dip, &mlxp->mlx_cfg_handle) !=
2772 	    DDI_SUCCESS) {
2773 		mlxcx_warn(mlxp, "failed to initial PCI config space");
2774 		goto err;
2775 	}
2776 	mlxcx_get_model(mlxp);
2777 	mlxp->mlx_attach |= MLXCX_ATTACH_PCI_CONFIG;
2778 
2779 	if (!mlxcx_regs_map(mlxp)) {
2780 		goto err;
2781 	}
2782 	mlxp->mlx_attach |= MLXCX_ATTACH_REGS;
2783 
2784 	if (!mlxcx_cmd_queue_init(mlxp)) {
2785 		goto err;
2786 	}
2787 	mlxp->mlx_attach |= MLXCX_ATTACH_CMD;
2788 
2789 	if (!mlxcx_cmd_enable_hca(mlxp)) {
2790 		goto err;
2791 	}
2792 	mlxp->mlx_attach |= MLXCX_ATTACH_ENABLE_HCA;
2793 
2794 	if (!mlxcx_check_issi(mlxp)) {
2795 		goto err;
2796 	}
2797 
2798 	/*
2799 	 * We have to get our interrupts now so we know what priority to
2800 	 * create pagemtx with.
2801 	 */
2802 	if (!mlxcx_intr_setup(mlxp)) {
2803 		goto err;
2804 	}
2805 	mlxp->mlx_attach |= MLXCX_ATTACH_INTRS;
2806 
2807 	mutex_init(&mlxp->mlx_pagemtx, NULL, MUTEX_DRIVER,
2808 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
2809 	avl_create(&mlxp->mlx_pages, mlxcx_page_compare,
2810 	    sizeof (mlxcx_dev_page_t), offsetof(mlxcx_dev_page_t, mxdp_tree));
2811 	mlxp->mlx_attach |= MLXCX_ATTACH_PAGE_LIST;
2812 
2813 	/*
2814 	 * Taskq for asynchronous events which may interact with the HCA
2815 	 * via the command interface. Single threaded FIFO.
2816 	 */
2817 	(void) snprintf(tq_name, sizeof (tq_name), "%s_async_%d",
2818 	    ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst);
2819 	mlxp->mlx_async_tq = taskq_create(tq_name, 1, minclsyspri, 1, INT_MAX,
2820 	    TASKQ_PREPOPULATE);
2821 	/*
2822 	 * Initialize any pre-allocated taskq param structs.
2823 	 */
2824 	for (i = 0; i <= MLXCX_FUNC_ID_MAX; i++) {
2825 		mlxp->mlx_npages_req[i].mla_mlx = mlxp;
2826 		mutex_init(&mlxp->mlx_npages_req[i].mla_mtx, NULL,
2827 		    MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_async_intr_pri));
2828 	}
2829 	mlxp->mlx_attach |= MLXCX_ATTACH_ASYNC_TQ;
2830 
2831 	if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_BOOT)) {
2832 		goto err;
2833 	}
2834 
2835 	if (!mlxcx_init_caps(mlxp)) {
2836 		goto err;
2837 	}
2838 	mlxp->mlx_attach |= MLXCX_ATTACH_CAPS;
2839 
2840 	if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_INIT)) {
2841 		goto err;
2842 	}
2843 
2844 	if (!mlxcx_cmd_init_hca(mlxp)) {
2845 		goto err;
2846 	}
2847 	mlxp->mlx_attach |= MLXCX_ATTACH_INIT_HCA;
2848 
2849 	if (!mlxcx_cmd_set_driver_version(mlxp, MLXCX_DRIVER_VERSION)) {
2850 		goto err;
2851 	}
2852 
2853 	if (mlxp->mlx_caps->mlc_pcam) {
2854 		mlxcx_explore_pcam(mlxp, mlxp->mlx_caps);
2855 	}
2856 
2857 	/*
2858 	 * The User Access Region (UAR) is needed so we can ring EQ and CQ
2859 	 * doorbells.
2860 	 */
2861 	if (!mlxcx_cmd_alloc_uar(mlxp, &mlxp->mlx_uar)) {
2862 		goto err;
2863 	}
2864 	for (i = 0; i < MLXCX_BF_PER_UAR; ++i) {
2865 		mutex_init(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx, NULL,
2866 		    MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_intr_pri));
2867 	}
2868 	mlxp->mlx_attach |= MLXCX_ATTACH_UAR_PD_TD;
2869 
2870 	/*
2871 	 * Set up asynchronous event queue which handles control type events
2872 	 * like PAGE_REQUEST and CMD completion events.
2873 	 *
2874 	 * This will enable and arm the interrupt on EQ 0. Note that only page
2875 	 * reqs and cmd completions will be handled until we call
2876 	 * mlxcx_eq_set_attached further down (this way we don't need an extra
2877 	 * set of locks over the mlxcx_t sub-structs not allocated yet)
2878 	 */
2879 	if (!mlxcx_setup_async_eqs(mlxp)) {
2880 		goto err;
2881 	}
2882 
2883 	/*
2884 	 * Allocate a protection and transport domain. These don't really do
2885 	 * anything for us (they're IB concepts), but we need to give their
2886 	 * ID numbers in other commands.
2887 	 */
2888 	if (!mlxcx_cmd_alloc_pd(mlxp, &mlxp->mlx_pd)) {
2889 		goto err;
2890 	}
2891 	if (!mlxcx_cmd_alloc_tdom(mlxp, &mlxp->mlx_tdom)) {
2892 		goto err;
2893 	}
2894 	/*
2895 	 * Fetch the "reserved" lkey that lets us give linear addresses in
2896 	 * work queue entries, rather than having to mess with the NIC's
2897 	 * internal MMU.
2898 	 */
2899 	if (!mlxcx_cmd_query_special_ctxs(mlxp)) {
2900 		goto err;
2901 	}
2902 
2903 	/*
2904 	 * Query our port information and current state, populate the
2905 	 * mlxcx_port_t structs.
2906 	 *
2907 	 * This also sets up the root flow tables and flow groups.
2908 	 */
2909 	if (!mlxcx_setup_ports(mlxp)) {
2910 		goto err;
2911 	}
2912 	mlxp->mlx_attach |= MLXCX_ATTACH_PORTS;
2913 
2914 	mlxcx_load_model_props(mlxp);
2915 
2916 	/*
2917 	 * Set up, enable and arm the rest of the interrupt EQs which will
2918 	 * service events from CQs.
2919 	 *
2920 	 * The MLXCX_ATTACH_INTRS flag covers checking if these need to be
2921 	 * cleaned up.
2922 	 */
2923 	if (!mlxcx_setup_eqs(mlxp)) {
2924 		goto err;
2925 	}
2926 
2927 	/* Completion queues */
2928 	list_create(&mlxp->mlx_cqs, sizeof (mlxcx_completion_queue_t),
2929 	    offsetof(mlxcx_completion_queue_t, mlcq_entry));
2930 	mlxp->mlx_attach |= MLXCX_ATTACH_CQS;
2931 
2932 	/* Work queues (send queues, receive queues) */
2933 	list_create(&mlxp->mlx_wqs, sizeof (mlxcx_work_queue_t),
2934 	    offsetof(mlxcx_work_queue_t, mlwq_entry));
2935 	mlxp->mlx_attach |= MLXCX_ATTACH_WQS;
2936 
2937 	/*
2938 	 * Construct our arrays of mlxcx_ring_group_ts, which represent the
2939 	 * "groups" we advertise to MAC.
2940 	 */
2941 	mlxp->mlx_rx_ngroups = mlxcx_calc_rx_ngroups(mlxp);
2942 	mlxp->mlx_rx_groups_size = mlxp->mlx_rx_ngroups *
2943 	    sizeof (mlxcx_ring_group_t);
2944 	mlxp->mlx_rx_groups = kmem_zalloc(mlxp->mlx_rx_groups_size, KM_SLEEP);
2945 
2946 	mlxp->mlx_tx_ngroups = mlxp->mlx_props.mldp_tx_ngroups;
2947 	mlxp->mlx_tx_groups_size = mlxp->mlx_tx_ngroups *
2948 	    sizeof (mlxcx_ring_group_t);
2949 	mlxp->mlx_tx_groups = kmem_zalloc(mlxp->mlx_tx_groups_size, KM_SLEEP);
2950 
2951 	mlxp->mlx_attach |= MLXCX_ATTACH_GROUPS;
2952 
2953 	/*
2954 	 * Sets up the free/busy buffers list for keeping track of packet
2955 	 * buffers.
2956 	 */
2957 	if (!mlxcx_setup_bufs(mlxp))
2958 		goto err;
2959 	mlxp->mlx_attach |= MLXCX_ATTACH_BUFS;
2960 
2961 	/*
2962 	 * Before we tell MAC about our rings/groups, we need to do enough
2963 	 * setup on them to be sure about the numbers and configuration that
2964 	 * we have. This will do basically everything short of allocating
2965 	 * packet buffers and starting the rings up.
2966 	 */
2967 	for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) {
2968 		if (!mlxcx_tx_group_setup(mlxp, &mlxp->mlx_tx_groups[i]))
2969 			goto err;
2970 	}
2971 	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
2972 		if (!mlxcx_rx_group_setup(mlxp, &mlxp->mlx_rx_groups[i]))
2973 			goto err;
2974 	}
2975 
2976 	/*
2977 	 * Set up periodic fault check timers which check the queue states,
2978 	 * set up should be after all the queues have been initialized and
2979 	 * consequently the teardown of timers must happen before
2980 	 * queue teardown.
2981 	 */
2982 	if (!mlxcx_setup_checktimers(mlxp)) {
2983 		goto err;
2984 	}
2985 	mlxp->mlx_attach |= MLXCX_ATTACH_CHKTIMERS;
2986 
2987 	/*
2988 	 * Some devices may not have a working temperature sensor; however,
2989 	 * there isn't a great way for us to know. We shouldn't fail attach if
2990 	 * this doesn't work.
2991 	 */
2992 	if (mlxcx_setup_sensors(mlxp)) {
2993 		mlxp->mlx_attach |= MLXCX_ATTACH_SENSORS;
2994 	}
2995 
2996 	/*
2997 	 * Finally, tell MAC that we exist!
2998 	 */
2999 	if (!mlxcx_register_mac(mlxp)) {
3000 		goto err;
3001 	}
3002 	mlxp->mlx_attach |= MLXCX_ATTACH_MAC_HDL;
3003 
3004 	/*
3005 	 * This tells the interrupt handlers they can start processing events
3006 	 * other than cmd completions and page requests.
3007 	 */
3008 	mlxcx_eq_set_attached(mlxp);
3009 
3010 	return (DDI_SUCCESS);
3011 
3012 err:
3013 	mlxcx_teardown(mlxp);
3014 	return (DDI_FAILURE);
3015 }
3016 
3017 static struct cb_ops mlxcx_cb_ops = {
3018 	.cb_open = nulldev,
3019 	.cb_close = nulldev,
3020 	.cb_strategy = nodev,
3021 	.cb_print = nodev,
3022 	.cb_dump = nodev,
3023 	.cb_read = nodev,
3024 	.cb_write = nodev,
3025 	.cb_ioctl = nodev,
3026 	.cb_devmap = nodev,
3027 	.cb_mmap = nodev,
3028 	.cb_segmap = nodev,
3029 	.cb_chpoll = nochpoll,
3030 	.cb_prop_op = ddi_prop_op,
3031 	.cb_flag = D_MP,
3032 	.cb_rev = CB_REV,
3033 	.cb_aread = nodev,
3034 	.cb_awrite = nodev
3035 };
3036 
3037 static struct dev_ops mlxcx_dev_ops = {
3038 	.devo_rev = DEVO_REV,
3039 	.devo_refcnt = 0,
3040 	.devo_getinfo = NULL,
3041 	.devo_identify = nulldev,
3042 	.devo_probe = nulldev,
3043 	.devo_attach = mlxcx_attach,
3044 	.devo_detach = mlxcx_detach,
3045 	.devo_reset = nodev,
3046 	.devo_quiesce = ddi_quiesce_not_supported,
3047 	.devo_cb_ops = &mlxcx_cb_ops
3048 };
3049 
/*
 * Driver module linkage: identifies this module as a device driver
 * and ties the human-readable link info string to our dev_ops.
 */
static struct modldrv mlxcx_modldrv = {
	.drv_modops = &mod_driverops,
	.drv_linkinfo = "Mellanox Connect-X 4/5/6",
	.drv_dev_ops = &mlxcx_dev_ops
};
3055 
/*
 * Module linkage passed to mod_install()/mod_remove()/mod_info();
 * this module contains exactly one linkage element (the driver).
 */
static struct modlinkage mlxcx_modlinkage = {
	.ml_rev = MODREV_1,
	.ml_linkage = { &mlxcx_modldrv, NULL }
};
3060 
3061 int
3062 _init(void)
3063 {
3064 	int ret;
3065 
3066 	ret = ddi_soft_state_init(&mlxcx_softstate, sizeof (mlxcx_t), 0);
3067 	if (ret != 0) {
3068 		return (ret);
3069 	}
3070 
3071 	mac_init_ops(&mlxcx_dev_ops, MLXCX_MODULE_NAME);
3072 
3073 	if ((ret = mod_install(&mlxcx_modlinkage)) != DDI_SUCCESS) {
3074 		mac_fini_ops(&mlxcx_dev_ops);
3075 		ddi_soft_state_fini(&mlxcx_softstate);
3076 		return (ret);
3077 	}
3078 
3079 	return (DDI_SUCCESS);
3080 }
3081 
/*
 * Module information entry point; simply delegates to mod_info().
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&mlxcx_modlinkage, modinfop));
}
3087 
3088 int
3089 _fini(void)
3090 {
3091 	int ret;
3092 
3093 	if ((ret = mod_remove(&mlxcx_modlinkage)) != DDI_SUCCESS) {
3094 		return (ret);
3095 	}
3096 
3097 	mac_fini_ops(&mlxcx_dev_ops);
3098 
3099 	ddi_soft_state_fini(&mlxcx_softstate);
3100 
3101 	return (DDI_SUCCESS);
3102 }
3103