/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2020, The University of Queensland
 * Copyright (c) 2018, Joyent, Inc.
 */

/*
 * Mellanox Connect-X 4/5/6 driver.
 */

/*
 * The PRM for this family of parts is freely available, and can be found at:
 * https://www.mellanox.com/related-docs/user_manuals/ \
 *   Ethernet_Adapters_Programming_Manual.pdf
 */
/*
 * ConnectX glossary
 * -----------------
 *
 * WR		Work Request: something we've asked the hardware to do by
 *		creating a Work Queue Entry (WQE), e.g. send or recv a packet
 *
 * WQE		Work Queue Entry: a descriptor on a work queue descriptor ring
 *
 * WQ		Work Queue: a descriptor ring that we can place WQEs on, usually
 *		either a Send Queue (SQ) or Receive Queue (RQ). Different WQ
 *		types have different WQE structures, different commands for
 *		creating and destroying them, etc, but share a common context
 *		structure, counter setup and state graph.
 * SQ		Send Queue, a specific type of WQ that sends packets
 * RQ		Receive Queue, a specific type of WQ that receives packets
 *
 * CQ		Completion Queue: completion of WRs from a WQ are reported to
 *		one of these, as a CQE on its entry ring.
 * CQE		Completion Queue Entry: an entry in a CQ ring. Contains error
 *		info, as well as packet size, the ID of the WQ, and the index
 *		of the WQE which completed. Does not contain any packet data.
 *
 * EQ		Event Queue: a ring of event structs from the hardware informing
 *		us when particular events happen. Many events can point at a
 *		particular CQ which we should then go look at.
 * EQE		Event Queue Entry: an entry on the EQ ring
 *
 * UAR		User Access Region, a page of the device's PCI BAR which is
 *		tied to particular EQ/CQ/WQ sets and contains doorbells to
 *		ring to arm them for interrupts or wake them up for new work
 *
 * RQT		RQ Table, a collection of indexed RQs used to refer to the group
 *		as a single unit (for e.g. hashing/RSS).
 *
 * TIR		Transport Interface Receive, a bucket of resources for the
 *		reception of packets. TIRs have to point at either a single RQ
 *		or a table of RQs (RQT). They then serve as a target for flow
 *		table entries (FEs). TIRs that point at an RQT also contain the
 *		settings for hashing for RSS.
 *
 * TIS		Transport Interface Send, a bucket of resources associated with
 *		the transmission of packets. In particular, the temporary
 *		resources used for LSO internally in the card are accounted to
 *		a TIS.
 *
 * FT		Flow Table, a collection of FEs and FGs that can be referred to
 *		as a single entity (e.g. used as a target from another flow
 *		entry or set as the "root" table to handle incoming or outgoing
 *		packets). Packets arriving at a FT are matched against the
 *		FEs in the table until either one matches with a terminating
 *		action or all FEs are exhausted (it's first-match-wins but with
 *		some actions that are non-terminal, like counting actions).
 *
 * FG		Flow Group, a group of FEs which share a common "mask" (i.e.
 *		they match on the same attributes of packets coming into the
 *		flow).
 *
 * FE		Flow Entry, an individual set of values to match against
 *		packets entering the flow table, combined with an action to
 *		take upon a successful match. The action we use most is
 *		"forward", which sends the packets to a TIR or another flow
 *		table and then stops further processing within the FE's FT.
 *
 * lkey/mkey	A reference to something similar to a page table but in the
 *		device's internal onboard MMU. Since Connect-X parts double as
 *		IB cards (lots of RDMA) they have extensive onboard memory mgmt
 *		features which we try very hard not to use. For our WQEs we use
 *		the "reserved" lkey, which is a special value which indicates
 *		that addresses we give are linear addresses and should not be
 *		translated.
 *
 * PD		Protection Domain, an IB concept. We have to allocate one to
 *		provide as a parameter for new WQs, but we don't do anything
 *		with it.
 *
 * TDOM/TD	Transport Domain, an IB concept. We allocate one in order to
 *		provide it as a parameter to TIR/TIS creation, but we don't do
 *		anything with it.
 */
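/*
 * To make the reserved lkey concrete: when we fill out a WQE data segment,
 * we put the reserved lkey alongside a plain DMA cookie address, roughly
 * like the hypothetical fragment below (the field and variable names here
 * are illustrative only, not the exact ones from mlxcx_reg.h):
 *
 *	seg->mlds_lkey = to_be32(mlxp->mlx_rsvd_lkey);
 *	seg->mlds_address = to_be64(cookie->dmac_laddress);
 *	seg->mlds_byte_count = to_be32(len);
 *
 * With the reserved lkey set, the device treats the address as linear and
 * skips its onboard MMU entirely, which is exactly what we want here.
 */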
/*
 *
 * Data flow overview
 * ------------------
 *
 * This driver is a MAC ring-enabled driver which maps rings to send and recv
 * queues in hardware on the device.
 *
 * Each SQ and RQ is set up to report to its own individual CQ, to ensure
 * sufficient space, and simplify the logic needed to work out which buffer
 * was completed.
 *
 * The CQs are then round-robin allocated onto EQs, of which we set up one per
 * interrupt that the system gives us for the device. Normally this means we
 * have 8 EQs.
 *
 * When we have >= 8 EQs available, we try to allocate only RX or only TX
 * CQs on each one. The EQs are chosen for RX and TX in an alternating fashion.
 *
 * EQ #0 is reserved for all event types other than completion events, and has
 * no CQs associated with it at any time. EQs #1 and upwards are only used for
 * handling CQ completion events.
 *
 * +------+     +------+           +------+        +---------+
 * | SQ 0 |---->| CQ 0 |-----+     | EQ 0 |------> | MSI-X 0 |     mlxcx_intr_0
 * +------+     +------+     |     +------+        +---------+
 *                           |
 * +------+     +------+     |
 * | SQ 1 |---->| CQ 1 |---+ |     +------+
 * +------+     +------+   | +---> |      |
 *                         |       |      |
 * +------+     +------+   |       | EQ 1 |        +---------+
 * | SQ 2 |---->| CQ 2 |---------> |      |------> | MSI-X 1 |     mlxcx_intr_n
 * +------+     +------+   | +---> |      |        +---------+
 *                         | |     +------+
 *                         | |
 *   ...                   | |
 *                         | |     +------+
 * +------+     +------+   +-----> |      |
 * | RQ 0 |---->| CQ 3 |---------> |      |        +---------+
 * +------+     +------+     |     | EQ 2 |------> | MSI-X 2 |     mlxcx_intr_n
 *                           |     |      |        +---------+
 * +------+     +------+     | +-> |      |
 * | RQ 1 |---->| CQ 4 |-----+ |   +------+
 * +------+     +------+       |
 *                             |     ....
 * +------+     +------+       |
 * | RQ 2 |---->| CQ 5 |-------+
 * +------+     +------+
 *
 *   ... (note this diagram does not show RX-only or TX-only EQs)
 *
 * For TX, we advertise all of the SQs we create as plain rings to MAC with
 * no TX groups. This puts MAC in "virtual group" mode where it will allocate
 * and use the rings as it sees fit.
 *
 * For RX, we advertise actual groups in order to make use of hardware
 * classification.
 *
 * The hardware classification we use is based around Flow Tables, and we
 * currently ignore all of the eswitch features of the card. The NIC VPORT
 * is always set to promisc mode so that the eswitch sends us all of the
 * traffic that arrives on the NIC, and we use flow entries to manage
 * everything.
 *
 * We use 2 layers of flow tables for classification: traffic arrives at the
 * root RX flow table which contains MAC address filters. Those then send
 * matched traffic to the per-group L1 VLAN filter tables which contain VLAN
 * presence and VID filters.
 *
 * Since these parts only support doing RSS hashing on a single protocol at a
 * time, we have to use a third layer of flow tables as well to break traffic
 * down by L4 and L3 protocol (TCPv6, TCPv4, UDPv6, UDPv4, IPv6, IPv4 etc)
 * so that it can be sent to the appropriate TIR for hashing.
 *
 * Incoming packets
 *        +           +---------+      +---------+
 *        |        +->| group 0 |      | group 0 |
 *        |        |  | vlan ft |  +-->| hash ft |
 *        v        |  |   L1    |  |   |   L2    |
 *   +----+----+   |  +---------+  |   +---------+    +-----+    +-----+------+
 *   | eswitch |   |  |         |  |   |  TCPv6  |--->| TIR |--->|     |  RQ0 |
 *   +----+----+   |  |         |  |   +---------+    +-----+    |     +------+
 *        |        |  |         |  |   |  UDPv6  |--->| TIR |--->|     |  RQ1 |
 *        |        |  |         |  |   +---------+    +-----+    |     +------+
 *        |        |  |         |  |   |  TCPv4  |--->| TIR |--->|     |  RQ2 |
 *        v        |  |         |  |   +---------+    +-----+    | RQT +------+
 *   +----+----+   |  +---------+  |   |  UDPv4  |--->| TIR |--->|     |  ... |
 *   | root rx |   |  | default |--+   +---------+    +-----+    |     |      |
 *   | flow tb |   |  +---------+  |   |  IPv6   |--->| TIR |--->|     |      |
 *   |    L0   |   |  | promisc |--+   +---------+    +-----+    |     |      |
 *   +---------+   |  +---------+  ^   |  IPv4   |--->| TIR |--->|     |      |
 *   |  bcast  |---|---------------+   +---------+    +-----+    +-----+------+
 *   +---------+   |               ^   |  other  |-+
 *   |  MAC 0  |---+               |   +---------+ |  +-----+    +-----+
 *   +---------+                   |               +->| TIR |--->| RQ0 |
 *   |  MAC 1  |-+                 |                  +-----+    +-----+
 *   +---------+ | +---------------+
 *   |  MAC 2  |-+ |               ^
 *   +---------+ | |               |
 *   |  MAC 3  |-+ |  +---------+  |   +---------+
 *   +---------+ | |  | group 1 |  |   | group 1 |
 *   |  .....  | +--->| vlan ft |  | +>| hash ft |
 *   |         |   |  |   L1    |  | | |   L2    |
 *   +---------+   |  +---------+  | | +---------+    +-----+    +-----+------+
 *   | promisc |---+  | VLAN 0  |----+ |  TCPv6  |--->| TIR |--->|     |  RQ3 |
 *   +---------+      +---------+  |   +---------+    +-----+    |     +------+
 *                    |  .....  |  |   |  UDPv6  |--->| TIR |--->|     |  RQ4 |
 *                    |         |  |   +---------+    +-----+    |     +------+
 *                    |         |  |   |  TCPv4  |--->| TIR |--->|     |  RQ5 |
 *                    |         |  |   +---------+    +-----+    | RQT +------+
 *                    +---------+  |   |  UDPv4  |--->| TIR |--->|     |  ... |
 *                    |         |  |   +---------+    +-----+    |     |      |
 *                    +---------+  |   |  IPv6   |--->| TIR |--->|     |      |
 *                    | promisc |--+   +---------+    +-----+    |     |      |
 *                    +---------+      |  IPv4   |--->| TIR |--->|     |      |
 *                                     +---------+    +-----+    +-----+------+
 *                                     |  other  |-+
 *                                     +---------+ |
 *                      .......                    |  +-----+    +-----+
 *                                                 +->| TIR |--->| RQ3 |
 *                                                    +-----+    +-----+
 *
 * Note that the "promisc" flow entries are only set/enabled when promisc
 * mode is enabled for the NIC. All promisc flow entries point directly at
 * group 0's hashing flowtable (so all promisc-only traffic lands on group 0,
 * the "default group" in MAC).
 *
 * The "default" entry in the L1 VLAN filter flow tables is used when there
 * are no VLANs set for the group, to accept any traffic regardless of tag. It
 * is deleted as soon as a VLAN filter is added (and re-instated if the
 * last VLAN filter is removed).
 *
 * The actual descriptor ring structures for RX on Connect-X4 don't contain any
 * space for packet data (they're a collection of scatter pointers only). TX
 * descriptors contain some space for "inline headers" (and the card requires
 * us to put at least the L2 Ethernet headers there for the eswitch to look at)
 * but all the rest of the data comes from the gather pointers.
 *
 * When we get completions back they simply contain the ring index number of
 * the WR (work request) which completed. So, we manage the buffers for actual
 * packet data completely independently of the descriptors in this driver. When
 * a WR is enqueued in a WQE (work queue entry), we stamp the packet data buffer
 * with the WQE index that we put it at, and therefore don't have to look at
 * the original descriptor at all when handling completions.
 *
 * For RX, we create sufficient packet data buffers to fill 150% of the
 * available descriptors for each ring. These all are pre-set-up for DMA and
 * have an mblk_t associated with them (with desballoc()).
 *
 * For TX we either borrow the mblk's memory and DMA bind it (if the packet is
 * large enough), or we copy it into a pre-allocated buffer set up in the same
 * way as for RX.
 */
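/*
 * A minimal sketch of the buffer "stamping" described above, with
 * illustrative names only (the real enqueue and completion paths live in
 * mlxcx_ring.c):
 *
 *	enqueue (WQ lock held):
 *		idx = wq->mlwq_pc & (wq->mlwq_nents - 1);
 *		buf->mlb_wqe_index = idx;
 *		remember buf under idx;
 *		fill out the WQE at idx and advance the producer counter;
 *
 *	completion:
 *		buf = lookup by the WQE index carried in the CQE;
 *
 * Because the CQE carries the WQE index, the completion path never has to
 * read the original descriptor back out of the ring.
 */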

/*
 * Buffer lifecycle: RX
 * --------------------
 *
 * The lifecycle of an mlxcx_buffer_t (packet buffer) used for RX is pretty
 * straightforward.
 *
 * It is created (and has all its memory allocated) at the time of starting up
 * the RX ring it belongs to. Then it is placed on the "free" list in the
 * mlxcx_buffer_shard_t associated with its RQ. When mlxcx_rq_refill() wants
 * more buffers to add to the RQ, it takes one off and marks it as "on WQ"
 * before making a WQE for it.
 *
 * After a completion event occurs, the packet is either discarded (and the
 * buffer_t returned to the free list), or it is readied for loaning to MAC.
 *
 * Once MAC and the rest of the system have finished with the packet, they call
 * freemsg() on its mblk, which will call mlxcx_buf_mp_return and return the
 * buffer_t to the free list.
 *
 * At detach/teardown time, buffers are only ever destroyed from the free list.
 *
 *
 *                         +
 *                         |
 *                         | mlxcx_buf_create
 *                         |
 *                         v
 *                    +----+----+
 *                    | created |
 *                    +----+----+
 *                         |
 *                         |
 *                         | mlxcx_buf_return
 *                         |
 *                         v
 * mlxcx_buf_destroy  +----+----+
 *          +---------|  free   |<---------------+
 *          |         +----+----+                |
 *          |              |                     |
 *          |              |                     | mlxcx_buf_return
 *          v              | mlxcx_buf_take      |
 *      +---+--+           v                     |
 *      | dead |       +---+---+                 |
 *      +------+       | on WQ |- - - - - - - - >O
 *                     +---+---+                 ^
 *                         |                     |
 *                         |                     |
 *                         | mlxcx_buf_loan      | mlxcx_buf_mp_return
 *                         v                     |
 *                 +-------+--------+            |
 *                 | on loan to MAC |----------->O
 *                 +----------------+  freemsg()
 *
 */
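/*
 * In code terms, the free-list transitions above reduce to something like
 * the following sketch (simplified; the real functions in mlxcx_ring.c
 * also assert buffer states and handle shutdown):
 *
 *	mutex_enter(&s->mlbs_mtx);
 *	buf = list_remove_head(&s->mlbs_free);
 *	if (buf != NULL) {
 *		buf->mlb_state = on WQ;
 *		list_insert_tail(&s->mlbs_busy, buf);
 *	}
 *	mutex_exit(&s->mlbs_mtx);
 *
 * The return path does the inverse: move the buffer from mlbs_busy back to
 * mlbs_free and cv_broadcast() mlbs_free_nonempty, which is what lets
 * mlxcx_mlbs_teardown() (below) wait for all busy buffers to drain.
 */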

/*
 * Buffer lifecycle: TX
 * --------------------
 *
 * mlxcx_buffer_ts used for TX are divided into two kinds: regular buffers, and
 * "foreign" buffers.
 *
 * The former have their memory allocated and DMA bound by this driver, while
 * the latter (the "foreign" buffers) are on loan from MAC. Their memory is
 * not owned by us, though we do DMA bind it (and take responsibility for
 * un-binding it when we're done with them).
 *
 * We use separate mlxcx_buf_shard_ts for foreign and local buffers on each
 * SQ. Thus, there is a separate free list and mutex for each kind.
 *
 * Since a TX packet might consist of multiple mblks, we translate each mblk
 * into exactly one buffer_t. The buffer_ts are chained together in the same
 * order as the mblks, using the mlb_tx_chain/mlb_tx_chain_entry list_t.
 *
 * Each chain of TX buffers may consist of foreign or driver buffers, in any
 * mixture.
 *
 * The head of a TX buffer chain has mlb_tx_head == itself, which distinguishes
 * it from the rest of the chain buffers.
 *
 * TX buffer chains are always returned to the free list by
 * mlxcx_buf_return_chain(), which takes care of walking the mlb_tx_chain and
 * freeing all of the members.
 *
 * We only call freemsg() once, on the head of the TX buffer chain's original
 * mblk. This is true whether we copied it or bound it in a foreign buffer.
 */
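/*
 * Returning a whole chain is then just a list walk. A simplified sketch of
 * what mlxcx_buf_return_chain() (in mlxcx_ring.c) has to do:
 *
 *	head = b->mlb_tx_head;
 *	while ((n = list_remove_head(&head->mlb_tx_chain)) != NULL)
 *		mlxcx_buf_return(mlxp, n);
 *	mlxcx_buf_return(mlxp, head);
 *
 * The single freemsg() of the original mblk happens as part of returning
 * the head buffer, regardless of how many members the chain had.
 */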

/*
 * Startup and command interface
 * -----------------------------
 *
 * The command interface is the primary way in which we give control orders to
 * the hardware (e.g. actions like "create this queue" or "delete this flow
 * entry"). The command interface is never used to transmit or receive packets
 * -- that takes place only on the queues that are set up through it.
 *
 * In mlxcx_cmd.c we implement our use of the command interface on top of a
 * simple taskq. Since it's not performance critical, we busy-wait on command
 * completions and only process a single command at a time.
 *
 * If this becomes a problem later we can wire command completions up to EQ 0
 * once we have interrupts running.
 *
 * The startup/attach process for this card involves a bunch of different steps
 * which are summarised pretty well in the PRM. We have to send a number of
 * commands which do different things to start the card up, give it some pages
 * of our own memory for it to use, then start creating all the entities that
 * we need to use like EQs, CQs, WQs, as well as their dependencies like PDs
 * and TDoms.
 */
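/*
 * In outline, executing one command then looks like the following sketch
 * (hypothetical names; the real implementation is in mlxcx_cmd.c):
 *
 *	write command input into the command queue entry;
 *	ring the command doorbell;
 *	while (hardware still owns the entry)
 *		delay(drv_usectohz(poll_interval));
 *	read status and output back out of the entry;
 *
 * With only one command in flight at a time and no hot paths using the
 * interface, this is simple and adequate for now.
 */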

/*
 * UARs
 * ----
 *
 * The pages of the PCI BAR other than the first few are reserved for use as
 * "UAR" sections in this device. Each UAR section can be used as a set of
 * doorbells for our queues.
 *
 * Currently we just make one single UAR for all of our queues. It doesn't
 * seem to be a major limitation yet.
 *
 * When we're sending packets through an SQ, the PRM is not awfully clear about
 * exactly how we're meant to use the first 16 bytes of the Blueflame buffers
 * (it's clear on the pattern of alternation you're expected to use between
 * even and odd for Blueflame sends, but not for regular doorbells).
 *
 * Currently we don't do the even-odd alternating pattern for ordinary
 * doorbells, and we don't use Blueflame at all. This seems to work fine, at
 * least on Connect-X4 Lx.
 */
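/*
 * Concretely, ringing a doorbell is just a store into the UAR region via
 * the accessor functions defined later in this file, e.g. (the offset name
 * and value encoding here are illustrative -- see mlxcx_reg.h for the real
 * layout):
 *
 *	mlxcx_uar_put32(mlxp, &mlxp->mlx_uar, MLXCX_UAR_CQ_ARM, val);
 *
 * where val encodes the CQ number and the current consumer counter so the
 * device knows which CQ is being armed and how far we have read.
 */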

/*
 * Lock ordering
 * -------------
 *
 * Interrupt side:
 *
 *  - mleq_mtx
 *    - mlcq_mtx
 *      - mlcq_bufbmtx
 *      - mlwq_mtx
 *        - mlbs_mtx
 *    - mlp_mtx
 *
 * GLD side:
 *
 *  - mlp_mtx
 *    - mlg_mtx
 *      - mlg_*.mlft_mtx
 *    - mlp_*.mlft_mtx
 *    - mlwq_mtx
 *      - mlbs_mtx
 *      - mlcq_bufbmtx
 *  - mleq_mtx
 *    - mlcq_mtx
 *
 */
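/*
 * As a worked example of the GLD-side hierarchy above, a code path that
 * needed a port, a group, a WQ and its buffer shard would nest strictly in
 * the documented order (sketch only):
 *
 *	mutex_enter(&p->mlp_mtx);
 *	mutex_enter(&g->mlg_mtx);
 *	mutex_enter(&wq->mlwq_mtx);
 *	mutex_enter(&s->mlbs_mtx);
 *	...
 *	(and release in the reverse order)
 *
 * Acquiring, say, mlbs_mtx before mlwq_mtx anywhere would invert this
 * hierarchy and risk deadlock against the interrupt side.
 */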

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/sysmacros.h>
#include <sys/time.h>

#include <sys/mac_provider.h>

#include <mlxcx.h>

CTASSERT((1 << MLXCX_RX_HASH_FT_SIZE_SHIFT) >= MLXCX_TIRS_PER_GROUP);

#define	MLXCX_MODULE_NAME	"mlxcx"
/*
 * We give this to the firmware, so it has to be in a fixed format that it
 * understands.
 */
#define	MLXCX_DRIVER_VERSION	"illumos,mlxcx,1.0.0,1,000,000000"

/*
 * Firmware may take a while to reclaim pages. Try a set number of times.
 */
clock_t mlxcx_reclaim_delay = 1000 * 50; /* 50 ms in us */
uint_t mlxcx_reclaim_tries = 100; /* Wait at most 5000ms */

static void *mlxcx_softstate;

/*
 * Fault detection thresholds.
 */
uint_t mlxcx_doorbell_tries = MLXCX_DOORBELL_TRIES_DFLT;
uint_t mlxcx_stuck_intr_count = MLXCX_STUCK_INTR_COUNT_DFLT;

static void
mlxcx_load_props(mlxcx_t *mlxp)
{
	mlxcx_drv_props_t *p = &mlxp->mlx_props;

	p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift",
	    MLXCX_EQ_SIZE_SHIFT_DFLT);
	p->mldp_cq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cq_size_shift",
	    MLXCX_CQ_SIZE_SHIFT_DFLT);
	p->mldp_sq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "sq_size_shift",
	    MLXCX_SQ_SIZE_SHIFT_DFLT);
	p->mldp_rq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rq_size_shift",
	    MLXCX_RQ_SIZE_SHIFT_DFLT);

	p->mldp_cqemod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_period_usec",
	    MLXCX_CQEMOD_PERIOD_USEC_DFLT);
	p->mldp_cqemod_count = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_count",
	    MLXCX_CQEMOD_COUNT_DFLT);
	p->mldp_intrmod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "intrmod_period_usec",
	    MLXCX_INTRMOD_PERIOD_USEC_DFLT);

	p->mldp_tx_ngroups = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_ngroups",
	    MLXCX_TX_NGROUPS_DFLT);
	p->mldp_tx_nrings_per_group = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_nrings_per_group",
	    MLXCX_TX_NRINGS_PER_GROUP_DFLT);

	p->mldp_rx_ngroups_large = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_large",
	    MLXCX_RX_NGROUPS_LARGE_DFLT);
	p->mldp_rx_ngroups_small = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_small",
	    MLXCX_RX_NGROUPS_SMALL_DFLT);
	p->mldp_rx_nrings_per_large_group = ddi_getprop(DDI_DEV_T_ANY,
	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
	    "rx_nrings_per_large_group", MLXCX_RX_NRINGS_PER_LARGE_GROUP_DFLT);
	p->mldp_rx_nrings_per_small_group = ddi_getprop(DDI_DEV_T_ANY,
	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
	    "rx_nrings_per_small_group", MLXCX_RX_NRINGS_PER_SMALL_GROUP_DFLT);

	p->mldp_ftbl_root_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_root_size_shift",
	    MLXCX_FTBL_ROOT_SIZE_SHIFT_DFLT);

	p->mldp_tx_bind_threshold = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_bind_threshold",
	    MLXCX_TX_BIND_THRESHOLD_DFLT);

	p->mldp_ftbl_vlan_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_vlan_size_shift",
	    MLXCX_FTBL_VLAN_SIZE_SHIFT_DFLT);

	p->mldp_eq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
	    "eq_check_interval_sec", MLXCX_EQ_CHECK_INTERVAL_SEC_DFLT);
	p->mldp_cq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
	    "cq_check_interval_sec", MLXCX_CQ_CHECK_INTERVAL_SEC_DFLT);
	p->mldp_wq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
	    "wq_check_interval_sec", MLXCX_WQ_CHECK_INTERVAL_SEC_DFLT);
}
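
/*
 * All of the properties read above can be overridden from mlxcx.conf. For
 * example, to set the CQ ring size shift to 9 and check the EQs every 10
 * seconds, a hypothetical mlxcx.conf entry might read:
 *
 *	cq_size_shift = 9;
 *	eq_check_interval_sec = 10;
 *
 * These are read once here at attach time via ddi_getprop(), so changes
 * only take effect on the next attach of the driver.
 */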

void
mlxcx_note(mlxcx_t *mlxp, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
		vdev_err(mlxp->mlx_dip, CE_NOTE, fmt, ap);
	} else {
		vcmn_err(CE_NOTE, fmt, ap);
	}
	va_end(ap);
}

void
mlxcx_warn(mlxcx_t *mlxp, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
		vdev_err(mlxp->mlx_dip, CE_WARN, fmt, ap);
	} else {
		vcmn_err(CE_WARN, fmt, ap);
	}
	va_end(ap);
}

void
mlxcx_panic(mlxcx_t *mlxp, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
		vdev_err(mlxp->mlx_dip, CE_PANIC, fmt, ap);
	} else {
		vcmn_err(CE_PANIC, fmt, ap);
	}
	va_end(ap);
}

uint16_t
mlxcx_get16(mlxcx_t *mlxp, uintptr_t off)
{
	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
	return (ddi_get16(mlxp->mlx_regs_handle, (void *)addr));
}

uint32_t
mlxcx_get32(mlxcx_t *mlxp, uintptr_t off)
{
	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
	return (ddi_get32(mlxp->mlx_regs_handle, (void *)addr));
}

uint64_t
mlxcx_get64(mlxcx_t *mlxp, uintptr_t off)
{
	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
	return (ddi_get64(mlxp->mlx_regs_handle, (void *)addr));
}

void
mlxcx_put32(mlxcx_t *mlxp, uintptr_t off, uint32_t val)
{
	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
	ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val);
}

void
mlxcx_put64(mlxcx_t *mlxp, uintptr_t off, uint64_t val)
{
	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
	ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val);
}

void
mlxcx_uar_put32(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint32_t val)
{
	/*
	 * The UAR is always inside the first BAR, which we mapped as
	 * mlx_regs
	 */
	uintptr_t addr = off + (uintptr_t)mlu->mlu_base +
	    (uintptr_t)mlxp->mlx_regs_base;
	ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val);
}

void
mlxcx_uar_put64(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint64_t val)
{
	uintptr_t addr = off + (uintptr_t)mlu->mlu_base +
	    (uintptr_t)mlxp->mlx_regs_base;
	ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val);
}

static void
mlxcx_fm_fini(mlxcx_t *mlxp)
{
	if (mlxp->mlx_fm_caps == 0)
		return;

	if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps))
		ddi_fm_handler_unregister(mlxp->mlx_dip);

	if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) ||
	    DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps))
		pci_ereport_teardown(mlxp->mlx_dip);

	ddi_fm_fini(mlxp->mlx_dip);

	mlxp->mlx_fm_caps = 0;
}

void
mlxcx_fm_ereport(mlxcx_t *mlxp, const char *detail)
{
	uint64_t ena;
	char buf[FM_MAX_CLASS];

	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
		return;

	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail);
	ena = fm_ena_generate(0, FM_ENA_FMT1);
	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
	    NULL);
}

static int
mlxcx_fm_errcb(dev_info_t *dip, ddi_fm_error_t *err, const void *arg)
{
	/*
	 * as the driver can always deal with an error in any dma or
	 * access handle, we can just return the fme_status value.
	 */
	pci_ereport_post(dip, err, NULL);
	return (err->fme_status);
}

static void
mlxcx_fm_init(mlxcx_t *mlxp)
{
	ddi_iblock_cookie_t iblk;
	int def = DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE |
	    DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE;

	mlxp->mlx_fm_caps = ddi_prop_get_int(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_DONTPASS, "fm_capable", def);

	if (mlxp->mlx_fm_caps < 0) {
		mlxp->mlx_fm_caps = 0;
	}
	mlxp->mlx_fm_caps &= def;

	if (mlxp->mlx_fm_caps == 0)
		return;

	ddi_fm_init(mlxp->mlx_dip, &mlxp->mlx_fm_caps, &iblk);
	if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) ||
	    DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) {
		pci_ereport_setup(mlxp->mlx_dip);
	}
	if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) {
		ddi_fm_handler_register(mlxp->mlx_dip, mlxcx_fm_errcb,
		    (void *)mlxp);
	}
}

static void
mlxcx_mlbs_teardown(mlxcx_t *mlxp, mlxcx_buf_shard_t *s)
{
	mlxcx_buffer_t *buf;

	mutex_enter(&s->mlbs_mtx);
	while (!list_is_empty(&s->mlbs_busy))
		cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
	while ((buf = list_head(&s->mlbs_free)) != NULL) {
		mlxcx_buf_destroy(mlxp, buf);
	}
	list_destroy(&s->mlbs_free);
	list_destroy(&s->mlbs_busy);
	mutex_exit(&s->mlbs_mtx);

	cv_destroy(&s->mlbs_free_nonempty);
	mutex_destroy(&s->mlbs_mtx);
}

static void
mlxcx_teardown_bufs(mlxcx_t *mlxp)
{
	mlxcx_buf_shard_t *s;

	while ((s = list_remove_head(&mlxp->mlx_buf_shards)) != NULL) {
		mlxcx_mlbs_teardown(mlxp, s);
		kmem_free(s, sizeof (mlxcx_buf_shard_t));
	}
	list_destroy(&mlxp->mlx_buf_shards);

	kmem_cache_destroy(mlxp->mlx_bufs_cache);
}

static void
mlxcx_teardown_pages(mlxcx_t *mlxp)
{
	uint_t nzeros = 0;

	mutex_enter(&mlxp->mlx_pagemtx);

	while (mlxp->mlx_npages > 0) {
		int32_t req, ret;
		uint64_t pas[MLXCX_MANAGE_PAGES_MAX_PAGES];

		ASSERT0(avl_is_empty(&mlxp->mlx_pages));
		req = MIN(mlxp->mlx_npages, MLXCX_MANAGE_PAGES_MAX_PAGES);

		if (!mlxcx_cmd_return_pages(mlxp, req, pas, &ret)) {
			mlxcx_warn(mlxp, "hardware refused to return pages, "
			    "leaking %u remaining pages", mlxp->mlx_npages);
			goto out;
		}

		for (int32_t i = 0; i < ret; i++) {
			mlxcx_dev_page_t *mdp, probe;
			bzero(&probe, sizeof (probe));
			probe.mxdp_pa = pas[i];

			mdp = avl_find(&mlxp->mlx_pages, &probe, NULL);

			if (mdp != NULL) {
				avl_remove(&mlxp->mlx_pages, mdp);
				mlxp->mlx_npages--;
				mlxcx_dma_free(&mdp->mxdp_dma);
				kmem_free(mdp, sizeof (mlxcx_dev_page_t));
			} else {
				mlxcx_panic(mlxp, "hardware returned a page "
				    "with PA 0x%" PRIx64 " but we have no "
				    "record of giving out such a page", pas[i]);
			}
		}

		/*
		 * If no pages were returned, note that fact.
		 */
		if (ret == 0) {
			nzeros++;
			if (nzeros > mlxcx_reclaim_tries) {
				mlxcx_warn(mlxp, "hardware refused to return "
				    "pages, leaking %u remaining pages",
				    mlxp->mlx_npages);
				goto out;
			}
			delay(drv_usectohz(mlxcx_reclaim_delay));
		}
	}

	avl_destroy(&mlxp->mlx_pages);

out:
	mutex_exit(&mlxp->mlx_pagemtx);
	mutex_destroy(&mlxp->mlx_pagemtx);
}
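
/*
 * Note the bound on the reclaim loop above: if the firmware keeps
 * returning zero pages, we give up after mlxcx_reclaim_tries (100) sleeps
 * of mlxcx_reclaim_delay (50000us each), i.e. about 5 seconds in total,
 * and deliberately leak the remaining pages rather than blocking detach
 * forever.
 */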

static boolean_t
mlxcx_eq_alloc_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	boolean_t ret;
	size_t sz, i;

	VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC);

	mleq->mleq_entshift = mlxp->mlx_props.mldp_eq_size_shift;
	mleq->mleq_nents = (1 << mleq->mleq_entshift);
	sz = mleq->mleq_nents * sizeof (mlxcx_eventq_ent_t);
	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);

	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_queue_attr(mlxp, &attr);

	ret = mlxcx_dma_alloc(mlxp, &mleq->mleq_dma, &attr, &acc,
	    B_TRUE, sz, B_TRUE);
	if (!ret) {
		mlxcx_warn(mlxp, "failed to allocate EQ memory");
		return (B_FALSE);
	}

	mleq->mleq_ent = (mlxcx_eventq_ent_t *)mleq->mleq_dma.mxdb_va;

	for (i = 0; i < mleq->mleq_nents; ++i)
		mleq->mleq_ent[i].mleqe_owner = MLXCX_EQ_OWNER_INIT;

	mleq->mleq_state |= MLXCX_EQ_ALLOC;

	return (B_TRUE);
}

static void
mlxcx_eq_rele_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	VERIFY(mleq->mleq_state & MLXCX_EQ_ALLOC);
	if (mleq->mleq_state & MLXCX_EQ_CREATED)
		VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED);

	mlxcx_dma_free(&mleq->mleq_dma);
	mleq->mleq_ent = NULL;

	mleq->mleq_state &= ~MLXCX_EQ_ALLOC;
}

void
mlxcx_teardown_flow_table(mlxcx_t *mlxp, mlxcx_flow_table_t *ft)
{
	mlxcx_flow_group_t *fg;
	mlxcx_flow_entry_t *fe;
	int i;

	ASSERT(mutex_owned(&ft->mlft_mtx));

	for (i = ft->mlft_nents - 1; i >= 0; --i) {
		fe = &ft->mlft_ent[i];
		if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
			if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
				mlxcx_panic(mlxp, "failed to delete flow "
				    "entry %u on table %u", i,
				    ft->mlft_num);
			}
		}
	}

	while ((fg = list_remove_head(&ft->mlft_groups)) != NULL) {
		if (fg->mlfg_state & MLXCX_FLOW_GROUP_CREATED &&
		    !(fg->mlfg_state & MLXCX_FLOW_GROUP_DESTROYED)) {
			if (!mlxcx_cmd_destroy_flow_group(mlxp, fg)) {
				mlxcx_panic(mlxp, "failed to destroy flow "
				    "group %u", fg->mlfg_num);
			}
		}
		kmem_free(fg, sizeof (mlxcx_flow_group_t));
	}
	list_destroy(&ft->mlft_groups);
	if (ft->mlft_state & MLXCX_FLOW_TABLE_CREATED &&
	    !(ft->mlft_state & MLXCX_FLOW_TABLE_DESTROYED)) {
		if (!mlxcx_cmd_destroy_flow_table(mlxp, ft)) {
			mlxcx_panic(mlxp, "failed to destroy flow table %u",
			    ft->mlft_num);
		}
	}
	kmem_free(ft->mlft_ent, ft->mlft_entsize);
	ft->mlft_ent = NULL;
	mutex_exit(&ft->mlft_mtx);
	mutex_destroy(&ft->mlft_mtx);
	kmem_free(ft, sizeof (mlxcx_flow_table_t));
}

static void
mlxcx_teardown_ports(mlxcx_t *mlxp)
{
	uint_t i;
	mlxcx_port_t *p;
	mlxcx_flow_table_t *ft;

	for (i = 0; i < mlxp->mlx_nports; ++i) {
		p = &mlxp->mlx_ports[i];
		if (!(p->mlp_init & MLXCX_PORT_INIT))
			continue;
		mutex_enter(&p->mlp_mtx);
		if ((ft = p->mlp_rx_flow) != NULL) {
			mutex_enter(&ft->mlft_mtx);
			/*
			 * teardown_flow_table() will destroy the mutex, so
			 * we don't release it here.
			 */
			mlxcx_teardown_flow_table(mlxp, ft);
		}
		mutex_exit(&p->mlp_mtx);
		mutex_destroy(&p->mlp_mtx);
		p->mlp_init &= ~MLXCX_PORT_INIT;
	}

	kmem_free(mlxp->mlx_ports, mlxp->mlx_ports_size);
	mlxp->mlx_ports = NULL;
}

static void
mlxcx_teardown_wqs(mlxcx_t *mlxp)
{
	mlxcx_work_queue_t *mlwq;

	while ((mlwq = list_head(&mlxp->mlx_wqs)) != NULL) {
		mlxcx_wq_teardown(mlxp, mlwq);
	}
	list_destroy(&mlxp->mlx_wqs);
}

static void
mlxcx_teardown_cqs(mlxcx_t *mlxp)
{
	mlxcx_completion_queue_t *mlcq;

	while ((mlcq = list_head(&mlxp->mlx_cqs)) != NULL) {
		mlxcx_cq_teardown(mlxp, mlcq);
	}
	list_destroy(&mlxp->mlx_cqs);
}

static void
mlxcx_teardown_eqs(mlxcx_t *mlxp)
{
	mlxcx_event_queue_t *mleq;
	uint_t i;

	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
		mleq = &mlxp->mlx_eqs[i];
		mutex_enter(&mleq->mleq_mtx);
		if ((mleq->mleq_state & MLXCX_EQ_CREATED) &&
		    !(mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
			if (!mlxcx_cmd_destroy_eq(mlxp, mleq)) {
				mlxcx_warn(mlxp, "failed to destroy "
				    "event queue idx %u eqn %u",
				    i, mleq->mleq_num);
			}
		}
		if (mleq->mleq_state & MLXCX_EQ_ALLOC) {
			mlxcx_eq_rele_dma(mlxp, mleq);
		}
		mutex_exit(&mleq->mleq_mtx);
	}
}

static void
mlxcx_teardown_checktimers(mlxcx_t *mlxp)
{
	if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0)
		ddi_periodic_delete(mlxp->mlx_eq_checktimer);
	if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0)
		ddi_periodic_delete(mlxp->mlx_cq_checktimer);
	if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0)
		ddi_periodic_delete(mlxp->mlx_wq_checktimer);
}

static void
mlxcx_teardown(mlxcx_t *mlxp)
{
	uint_t i;
	dev_info_t *dip = mlxp->mlx_dip;

	if (mlxp->mlx_attach & MLXCX_ATTACH_GROUPS) {
		mlxcx_teardown_groups(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_GROUPS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_CHKTIMERS) {
		mlxcx_teardown_checktimers(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_CHKTIMERS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_WQS) {
		mlxcx_teardown_wqs(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_WQS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_CQS) {
		mlxcx_teardown_cqs(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_CQS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_BUFS) {
		mlxcx_teardown_bufs(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_BUFS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_PORTS) {
		mlxcx_teardown_ports(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_PORTS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) {
		mlxcx_teardown_eqs(mlxp);
		mlxcx_intr_teardown(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_INTRS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_UAR_PD_TD) {
		if (mlxp->mlx_uar.mlu_allocated) {
			if (!mlxcx_cmd_dealloc_uar(mlxp, &mlxp->mlx_uar)) {
				mlxcx_warn(mlxp, "failed to release UAR");
			}
			for (i = 0; i < MLXCX_BF_PER_UAR; ++i)
				mutex_destroy(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx);
		}
		if (mlxp->mlx_pd.mlpd_allocated &&
		    !mlxcx_cmd_dealloc_pd(mlxp, &mlxp->mlx_pd)) {
			mlxcx_warn(mlxp, "failed to release PD");
		}
		if (mlxp->mlx_tdom.mltd_allocated &&
		    !mlxcx_cmd_dealloc_tdom(mlxp, &mlxp->mlx_tdom)) {
			mlxcx_warn(mlxp, "failed to release TDOM");
		}
		mlxp->mlx_attach &= ~MLXCX_ATTACH_UAR_PD_TD;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_INIT_HCA) {
		if (!mlxcx_cmd_teardown_hca(mlxp)) {
			mlxcx_warn(mlxp, "failed to send teardown HCA "
			    "command during device detach");
		}
		mlxp->mlx_attach &= ~MLXCX_ATTACH_INIT_HCA;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_PAGE_LIST) {
		mlxcx_teardown_pages(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_PAGE_LIST;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_ENABLE_HCA) {
		if (!mlxcx_cmd_disable_hca(mlxp)) {
			mlxcx_warn(mlxp, "failed to send DISABLE HCA command "
			    "during device detach");
		}
		mlxp->mlx_attach &= ~MLXCX_ATTACH_ENABLE_HCA;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_CMD) {
		mlxcx_cmd_queue_fini(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_CMD;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_CAPS) {
		kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
		mlxp->mlx_caps = NULL;
		mlxp->mlx_attach &= ~MLXCX_ATTACH_CAPS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_REGS) {
		ddi_regs_map_free(&mlxp->mlx_regs_handle);
		mlxp->mlx_regs_handle = NULL;
		mlxp->mlx_attach &= ~MLXCX_ATTACH_REGS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_PCI_CONFIG) {
		pci_config_teardown(&mlxp->mlx_cfg_handle);
		mlxp->mlx_cfg_handle = NULL;
		mlxp->mlx_attach &= ~MLXCX_ATTACH_PCI_CONFIG;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_FM) {
		mlxcx_fm_fini(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_FM;
	}

	VERIFY3S(mlxp->mlx_attach, ==, 0);
	ddi_soft_state_free(mlxcx_softstate, mlxp->mlx_inst);
	ddi_set_driver_private(dip, NULL);
}

static boolean_t
mlxcx_regs_map(mlxcx_t *mlxp)
{
	off_t memsize;
	int ret;
	ddi_device_acc_attr_t da;

	if (ddi_dev_regsize(mlxp->mlx_dip, MLXCX_REG_NUMBER, &memsize) !=
	    DDI_SUCCESS) {
		mlxcx_warn(mlxp, "failed to get register set size");
		return (B_FALSE);
	}

	/*
	 * All data in the main BAR is kept in big-endian even though it's a PCI
	 * device.
	 */
	bzero(&da, sizeof (ddi_device_acc_attr_t));
	da.devacc_attr_version = DDI_DEVICE_ATTR_V0;
	da.devacc_attr_endian_flags = DDI_STRUCTURE_BE_ACC;
	da.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
	if (DDI_FM_ACC_ERR_CAP(mlxp->mlx_fm_caps)) {
		da.devacc_attr_access = DDI_FLAGERR_ACC;
	} else {
		da.devacc_attr_access = DDI_DEFAULT_ACC;
	}

	ret = ddi_regs_map_setup(mlxp->mlx_dip, MLXCX_REG_NUMBER,
	    &mlxp->mlx_regs_base, 0, memsize, &da, &mlxp->mlx_regs_handle);

	if (ret != DDI_SUCCESS) {
		mlxcx_warn(mlxp, "failed to map device registers: %d", ret);
		return (B_FALSE);
	}

	return (B_TRUE);
}

static boolean_t
mlxcx_check_issi(mlxcx_t *mlxp)
{
	uint32_t issi;

	if (!mlxcx_cmd_query_issi(mlxp, &issi)) {
		mlxcx_warn(mlxp, "failed to get ISSI");
		return (B_FALSE);
	}

	if ((issi & (1 << MLXCX_CURRENT_ISSI)) == 0) {
		mlxcx_warn(mlxp, "hardware does not support software ISSI, "
		    "hw vector 0x%x, sw version %u", issi, MLXCX_CURRENT_ISSI);
		return (B_FALSE);
	}

	if (!mlxcx_cmd_set_issi(mlxp, MLXCX_CURRENT_ISSI)) {
		mlxcx_warn(mlxp, "failed to set ISSI to %u",
		    MLXCX_CURRENT_ISSI);
		return (B_FALSE);
	}

	return (B_TRUE);
}

boolean_t
mlxcx_give_pages(mlxcx_t *mlxp, int32_t npages)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	int32_t i;
	list_t plist;
	mlxcx_dev_page_t *mdp;
	const ddi_dma_cookie_t *ck;

	/*
	 * If there are no pages required, then we're done here.
	 */
	if (npages <= 0) {
		return (B_TRUE);
	}

	list_create(&plist, sizeof (mlxcx_dev_page_t),
	    offsetof(mlxcx_dev_page_t, mxdp_list));

	for (i = 0; i < npages; i++) {
		mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
		mlxcx_dma_acc_attr(mlxp, &acc);
		mlxcx_dma_page_attr(mlxp, &attr);
		if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
		    B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
			mlxcx_warn(mlxp, "failed to allocate 4k page %u/%u", i,
			    npages);
			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
			goto cleanup_npages;
		}
		ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
		mdp->mxdp_pa = ck->dmac_laddress;

		list_insert_tail(&plist, mdp);
	}

	/*
	 * Now that all of the pages have been allocated, give them to hardware
1187 	 * in chunks.
1188 	 */
1189 	while (npages > 0) {
1190 		mlxcx_dev_page_t *pages[MLXCX_MANAGE_PAGES_MAX_PAGES];
1191 		int32_t togive = MIN(MLXCX_MANAGE_PAGES_MAX_PAGES, npages);
1192 
1193 		for (i = 0; i < togive; i++) {
1194 			pages[i] = list_remove_head(&plist);
1195 		}
1196 
1197 		if (!mlxcx_cmd_give_pages(mlxp,
1198 		    MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, togive, pages)) {
1199 			mlxcx_warn(mlxp, "!hardware refused our gift of %u "
1200 			    "pages!", togive);
1201 			for (i = 0; i < togive; i++) {
1202 				list_insert_tail(&plist, pages[i]);
1203 			}
1204 			goto cleanup_npages;
1205 		}
1206 
1207 		mutex_enter(&mlxp->mlx_pagemtx);
1208 		for (i = 0; i < togive; i++) {
1209 			avl_add(&mlxp->mlx_pages, pages[i]);
1210 		}
1211 		mlxp->mlx_npages += togive;
1212 		mutex_exit(&mlxp->mlx_pagemtx);
1213 		npages -= togive;
1214 	}
1215 
1216 	list_destroy(&plist);
1217 
1218 	return (B_TRUE);
1219 
1220 cleanup_npages:
1221 	while ((mdp = list_remove_head(&plist)) != NULL) {
1222 		mlxcx_dma_free(&mdp->mxdp_dma);
1223 		kmem_free(mdp, sizeof (mlxcx_dev_page_t));
1224 	}
1225 	list_destroy(&plist);
1226 	return (B_FALSE);
1227 }
1228 
1229 static boolean_t
1230 mlxcx_init_pages(mlxcx_t *mlxp, uint_t type)
1231 {
1232 	int32_t npages;
1233 
1234 	if (!mlxcx_cmd_query_pages(mlxp, type, &npages)) {
1235 		mlxcx_warn(mlxp, "failed to determine boot pages");
1236 		return (B_FALSE);
1237 	}
1238 
1239 	return (mlxcx_give_pages(mlxp, npages));
1240 }
1241 
1242 static int
1243 mlxcx_bufs_cache_constr(void *arg, void *cookie, int kmflags)
1244 {
1245 	mlxcx_t *mlxp = cookie;
1246 	mlxcx_buffer_t *b = arg;
1247 
1248 	bzero(b, sizeof (mlxcx_buffer_t));
1249 	b->mlb_mlx = mlxp;
1250 	b->mlb_state = MLXCX_BUFFER_INIT;
1251 	list_create(&b->mlb_tx_chain, sizeof (mlxcx_buffer_t),
1252 	    offsetof(mlxcx_buffer_t, mlb_tx_chain_entry));
1253 
1254 	return (0);
1255 }
1256 
1257 static void
1258 mlxcx_bufs_cache_destr(void *arg, void *cookie)
1259 {
1260 	mlxcx_t *mlxp = cookie;
1261 	mlxcx_buffer_t *b = arg;
1262 	VERIFY3P(b->mlb_mlx, ==, mlxp);
1263 	VERIFY(b->mlb_state == MLXCX_BUFFER_INIT);
1264 	list_destroy(&b->mlb_tx_chain);
1265 }
1266 
1267 mlxcx_buf_shard_t *
1268 mlxcx_mlbs_create(mlxcx_t *mlxp)
1269 {
1270 	mlxcx_buf_shard_t *s;
1271 
1272 	s = kmem_zalloc(sizeof (mlxcx_buf_shard_t), KM_SLEEP);
1273 
1274 	mutex_init(&s->mlbs_mtx, NULL, MUTEX_DRIVER,
1275 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1276 	list_create(&s->mlbs_busy, sizeof (mlxcx_buffer_t),
1277 	    offsetof(mlxcx_buffer_t, mlb_entry));
1278 	list_create(&s->mlbs_free, sizeof (mlxcx_buffer_t),
1279 	    offsetof(mlxcx_buffer_t, mlb_entry));
1280 	cv_init(&s->mlbs_free_nonempty, NULL, CV_DRIVER, NULL);
1281 
1282 	list_insert_tail(&mlxp->mlx_buf_shards, s);
1283 
1284 	return (s);
1285 }
1286 
1287 static boolean_t
1288 mlxcx_setup_bufs(mlxcx_t *mlxp)
1289 {
1290 	char namebuf[KSTAT_STRLEN];
1291 
1292 	(void) snprintf(namebuf, KSTAT_STRLEN, "mlxcx%d_bufs_cache",
1293 	    ddi_get_instance(mlxp->mlx_dip));
1294 	mlxp->mlx_bufs_cache = kmem_cache_create(namebuf,
1295 	    sizeof (mlxcx_buffer_t), sizeof (uint64_t),
1296 	    mlxcx_bufs_cache_constr, mlxcx_bufs_cache_destr,
1297 	    NULL, mlxp, NULL, 0);
1298 
1299 	list_create(&mlxp->mlx_buf_shards, sizeof (mlxcx_buf_shard_t),
1300 	    offsetof(mlxcx_buf_shard_t, mlbs_entry));
1301 
1302 	return (B_TRUE);
1303 }
1304 
1305 static void
1306 mlxcx_fm_qstate_ereport(mlxcx_t *mlxp, const char *qtype, uint32_t qnum,
1307     const char *state, uint8_t statenum)
1308 {
1309 	uint64_t ena;
1310 	char buf[FM_MAX_CLASS];
1311 
1312 	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
1313 		return;
1314 
1315 	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
1316 	    MLXCX_FM_SERVICE_MLXCX, "qstate.err");
1317 	ena = fm_ena_generate(0, FM_ENA_FMT1);
1318 
1319 	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
1320 	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
1321 	    "state", DATA_TYPE_STRING, state,
1322 	    "state_num", DATA_TYPE_UINT8, statenum,
1323 	    "qtype", DATA_TYPE_STRING, qtype,
1324 	    "qnum", DATA_TYPE_UINT32, qnum,
1325 	    NULL);
1326 	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED);
1327 }
1328 
1329 static void
1330 mlxcx_eq_check(void *arg)
1331 {
1332 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1333 	mlxcx_event_queue_t *eq;
1334 	mlxcx_eventq_ctx_t ctx;
1335 	const char *str;
1336 
1337 	uint_t i;
1338 
1339 	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
1340 		eq = &mlxp->mlx_eqs[i];
1341 		if (!(eq->mleq_state & MLXCX_EQ_CREATED) ||
1342 		    (eq->mleq_state & MLXCX_EQ_DESTROYED))
1343 			continue;
1344 		mutex_enter(&eq->mleq_mtx);
1345 		if (!mlxcx_cmd_query_eq(mlxp, eq, &ctx)) {
1346 			mutex_exit(&eq->mleq_mtx);
1347 			continue;
1348 		}
1349 
1350 		str = "???";
1351 		switch (ctx.mleqc_status) {
1352 		case MLXCX_EQ_STATUS_OK:
1353 			break;
1354 		case MLXCX_EQ_STATUS_WRITE_FAILURE:
1355 			str = "WRITE_FAILURE";
1356 			break;
1357 		}
1358 		if (ctx.mleqc_status != MLXCX_EQ_STATUS_OK) {
1359 			mlxcx_fm_qstate_ereport(mlxp, "event",
1360 			    eq->mleq_num, str, ctx.mleqc_status);
1361 			mlxcx_warn(mlxp, "EQ %u is in bad status: %x (%s)",
1362 			    eq->mleq_intr_index, ctx.mleqc_status, str);
1363 		}
1364 
1365 		if (ctx.mleqc_state != MLXCX_EQ_ST_ARMED &&
1366 		    (eq->mleq_state & MLXCX_EQ_ARMED)) {
1367 			if (eq->mleq_cc == eq->mleq_check_disarm_cc &&
1368 			    ++eq->mleq_check_disarm_cnt >= 3) {
1369 				mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL);
1370 				mlxcx_warn(mlxp, "EQ %u isn't armed",
1371 				    eq->mleq_intr_index);
1372 			}
1373 			eq->mleq_check_disarm_cc = eq->mleq_cc;
1374 		} else {
1375 			eq->mleq_check_disarm_cc = 0;
1376 			eq->mleq_check_disarm_cnt = 0;
1377 		}
1378 
1379 		mutex_exit(&eq->mleq_mtx);
1380 	}
1381 }
1382 
1383 static void
1384 mlxcx_cq_check(void *arg)
1385 {
1386 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1387 	mlxcx_completion_queue_t *cq;
1388 	mlxcx_completionq_ctx_t ctx;
1389 	const char *str, *type;
1390 	uint_t v;
1391 
1392 	for (cq = list_head(&mlxp->mlx_cqs); cq != NULL;
1393 	    cq = list_next(&mlxp->mlx_cqs, cq)) {
1394 		mutex_enter(&cq->mlcq_mtx);
1395 		if (!(cq->mlcq_state & MLXCX_CQ_CREATED) ||
1396 		    (cq->mlcq_state & MLXCX_CQ_DESTROYED) ||
1397 		    (cq->mlcq_state & MLXCX_CQ_TEARDOWN)) {
1398 			mutex_exit(&cq->mlcq_mtx);
1399 			continue;
1400 		}
1401 		if (cq->mlcq_fm_repd_qstate) {
1402 			mutex_exit(&cq->mlcq_mtx);
1403 			continue;
1404 		}
1405 		if (!mlxcx_cmd_query_cq(mlxp, cq, &ctx)) {
1406 			mutex_exit(&cq->mlcq_mtx);
1407 			continue;
1408 		}
1409 		if (cq->mlcq_wq != NULL) {
1410 			mlxcx_work_queue_t *wq = cq->mlcq_wq;
1411 			if (wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ)
1412 				type = "rx ";
1413 			else if (wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ)
1414 				type = "tx ";
1415 			else
1416 				type = "";
1417 		} else {
1418 			type = "";
1419 		}
1420 
1421 		str = "???";
1422 		v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATUS);
1423 		switch (v) {
1424 		case MLXCX_CQC_STATUS_OK:
1425 			break;
1426 		case MLXCX_CQC_STATUS_OVERFLOW:
1427 			str = "OVERFLOW";
1428 			break;
1429 		case MLXCX_CQC_STATUS_WRITE_FAIL:
1430 			str = "WRITE_FAIL";
1431 			break;
1432 		case MLXCX_CQC_STATUS_INVALID:
1433 			str = "INVALID";
1434 			break;
1435 		}
1436 		if (v != MLXCX_CQC_STATUS_OK) {
1437 			mlxcx_fm_qstate_ereport(mlxp, "completion",
1438 			    cq->mlcq_num, str, v);
1439 			mlxcx_warn(mlxp, "%sCQ 0x%x is in bad status: %x (%s)",
1440 			    type, cq->mlcq_num, v, str);
1441 			cq->mlcq_fm_repd_qstate = B_TRUE;
1442 		}
1443 
1444 		v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATE);
1445 		if (v != MLXCX_CQC_STATE_ARMED &&
1446 		    (cq->mlcq_state & MLXCX_CQ_ARMED) &&
1447 		    !(cq->mlcq_state & MLXCX_CQ_POLLING)) {
1448 			if (cq->mlcq_cc == cq->mlcq_check_disarm_cc &&
1449 			    ++cq->mlcq_check_disarm_cnt >= 3) {
1450 				mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL);
1451 				mlxcx_warn(mlxp, "%sCQ 0x%x (%p) isn't armed",
1452 				    type, cq->mlcq_num, cq);
1453 			}
1454 			cq->mlcq_check_disarm_cc = cq->mlcq_cc;
1455 		} else {
1456 			cq->mlcq_check_disarm_cnt = 0;
1457 			cq->mlcq_check_disarm_cc = 0;
1458 		}
1459 		mutex_exit(&cq->mlcq_mtx);
1460 	}
1461 }
1462 
1463 void
1464 mlxcx_check_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *sq)
1465 {
1466 	mlxcx_sq_ctx_t ctx;
1467 	mlxcx_sq_state_t state;
1468 
1469 	ASSERT(mutex_owned(&sq->mlwq_mtx));
1470 
1471 	if (!mlxcx_cmd_query_sq(mlxp, sq, &ctx))
1472 		return;
1473 
1474 	ASSERT3U(from_be24(ctx.mlsqc_cqn), ==, sq->mlwq_cq->mlcq_num);
1475 	state = get_bits32(ctx.mlsqc_flags, MLXCX_SQ_STATE);
1476 	switch (state) {
1477 	case MLXCX_SQ_STATE_RST:
1478 		if (sq->mlwq_state & MLXCX_WQ_STARTED) {
1479 			mlxcx_fm_qstate_ereport(mlxp, "send",
1480 			    sq->mlwq_num, "RST", state);
1481 			sq->mlwq_fm_repd_qstate = B_TRUE;
1482 		}
1483 		break;
1484 	case MLXCX_SQ_STATE_RDY:
1485 		if (!(sq->mlwq_state & MLXCX_WQ_STARTED)) {
1486 			mlxcx_fm_qstate_ereport(mlxp, "send",
1487 			    sq->mlwq_num, "RDY", state);
1488 			sq->mlwq_fm_repd_qstate = B_TRUE;
1489 		}
1490 		break;
1491 	case MLXCX_SQ_STATE_ERR:
1492 		mlxcx_fm_qstate_ereport(mlxp, "send",
1493 		    sq->mlwq_num, "ERR", state);
1494 		sq->mlwq_fm_repd_qstate = B_TRUE;
1495 		break;
1496 	default:
1497 		mlxcx_fm_qstate_ereport(mlxp, "send",
1498 		    sq->mlwq_num, "???", state);
1499 		sq->mlwq_fm_repd_qstate = B_TRUE;
1500 		break;
1501 	}
1502 }
1503 
1504 void
1505 mlxcx_check_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *rq)
1506 {
1507 	mlxcx_rq_ctx_t ctx;
1508 	mlxcx_rq_state_t state;
1509 
1510 	ASSERT(mutex_owned(&rq->mlwq_mtx));
1511 
1512 	if (!mlxcx_cmd_query_rq(mlxp, rq, &ctx))
1513 		return;
1514 
1515 	ASSERT3U(from_be24(ctx.mlrqc_cqn), ==, rq->mlwq_cq->mlcq_num);
1516 	state = get_bits32(ctx.mlrqc_flags, MLXCX_RQ_STATE);
1517 	switch (state) {
1518 	case MLXCX_RQ_STATE_RST:
1519 		if (rq->mlwq_state & MLXCX_WQ_STARTED) {
1520 			mlxcx_fm_qstate_ereport(mlxp, "receive",
1521 			    rq->mlwq_num, "RST", state);
1522 			rq->mlwq_fm_repd_qstate = B_TRUE;
1523 		}
1524 		break;
1525 	case MLXCX_RQ_STATE_RDY:
1526 		if (!(rq->mlwq_state & MLXCX_WQ_STARTED)) {
1527 			mlxcx_fm_qstate_ereport(mlxp, "receive",
1528 			    rq->mlwq_num, "RDY", state);
1529 			rq->mlwq_fm_repd_qstate = B_TRUE;
1530 		}
1531 		break;
1532 	case MLXCX_RQ_STATE_ERR:
1533 		mlxcx_fm_qstate_ereport(mlxp, "receive",
1534 		    rq->mlwq_num, "ERR", state);
1535 		rq->mlwq_fm_repd_qstate = B_TRUE;
1536 		break;
1537 	default:
1538 		mlxcx_fm_qstate_ereport(mlxp, "receive",
1539 		    rq->mlwq_num, "???", state);
1540 		rq->mlwq_fm_repd_qstate = B_TRUE;
1541 		break;
1542 	}
1543 }
1544 
1545 static void
1546 mlxcx_wq_check(void *arg)
1547 {
1548 	mlxcx_t *mlxp = (mlxcx_t *)arg;
1549 	mlxcx_work_queue_t *wq;
1550 
1551 	for (wq = list_head(&mlxp->mlx_wqs); wq != NULL;
1552 	    wq = list_next(&mlxp->mlx_wqs, wq)) {
1553 		mutex_enter(&wq->mlwq_mtx);
1554 		if (!(wq->mlwq_state & MLXCX_WQ_CREATED) ||
1555 		    (wq->mlwq_state & MLXCX_WQ_DESTROYED) ||
1556 		    (wq->mlwq_state & MLXCX_WQ_TEARDOWN)) {
1557 			mutex_exit(&wq->mlwq_mtx);
1558 			continue;
1559 		}
1560 		if (wq->mlwq_fm_repd_qstate) {
1561 			mutex_exit(&wq->mlwq_mtx);
1562 			continue;
1563 		}
1564 		switch (wq->mlwq_type) {
1565 		case MLXCX_WQ_TYPE_SENDQ:
1566 			mlxcx_check_sq(mlxp, wq);
1567 			break;
1568 		case MLXCX_WQ_TYPE_RECVQ:
1569 			mlxcx_check_rq(mlxp, wq);
1570 			break;
1571 		}
1572 		mutex_exit(&wq->mlwq_mtx);
1573 	}
1574 }
1575 
1576 static boolean_t
1577 mlxcx_setup_checktimers(mlxcx_t *mlxp)
1578 {
1579 	if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) {
1580 		mlxp->mlx_eq_checktimer = ddi_periodic_add(mlxcx_eq_check, mlxp,
1581 		    mlxp->mlx_props.mldp_eq_check_interval_sec * NANOSEC,
1582 		    DDI_IPL_0);
1583 	}
1584 	if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) {
1585 		mlxp->mlx_cq_checktimer = ddi_periodic_add(mlxcx_cq_check, mlxp,
1586 		    mlxp->mlx_props.mldp_cq_check_interval_sec * NANOSEC,
1587 		    DDI_IPL_0);
1588 	}
1589 	if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) {
1590 		mlxp->mlx_wq_checktimer = ddi_periodic_add(mlxcx_wq_check, mlxp,
1591 		    mlxp->mlx_props.mldp_wq_check_interval_sec * NANOSEC,
1592 		    DDI_IPL_0);
1593 	}
1594 	return (B_TRUE);
1595 }
1596 
1597 int
1598 mlxcx_dmac_fe_compare(const void *arg0, const void *arg1)
1599 {
1600 	const mlxcx_flow_entry_t *left = arg0;
1601 	const mlxcx_flow_entry_t *right = arg1;
1602 	int bcmpr;
1603 
1604 	bcmpr = memcmp(left->mlfe_dmac, right->mlfe_dmac,
1605 	    sizeof (left->mlfe_dmac));
1606 	if (bcmpr < 0)
1607 		return (-1);
1608 	if (bcmpr > 0)
1609 		return (1);
1610 	if (left->mlfe_vid < right->mlfe_vid)
1611 		return (-1);
1612 	if (left->mlfe_vid > right->mlfe_vid)
1613 		return (1);
1614 	return (0);
1615 }
1616 
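/*
 * AVL comparator for group MAC entries: ordered by MAC address.
 */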
1617 int
1618 mlxcx_grmac_compare(const void *arg0, const void *arg1)
1619 {
1620 	const mlxcx_group_mac_t *left = arg0;
1621 	const mlxcx_group_mac_t *right = arg1;
1622 	int bcmpr;
1623 
1624 	bcmpr = memcmp(left->mlgm_mac, right->mlgm_mac,
1625 	    sizeof (left->mlgm_mac));
1626 	if (bcmpr < 0)
1627 		return (-1);
1628 	if (bcmpr > 0)
1629 		return (1);
1630 	return (0);
1631 }
1632 
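/*
 * AVL comparator for device pages: ordered by physical address.
 */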
1633 int
1634 mlxcx_page_compare(const void *arg0, const void *arg1)
1635 {
1636 	const mlxcx_dev_page_t *p0 = arg0;
1637 	const mlxcx_dev_page_t *p1 = arg1;
1638 
1639 	if (p0->mxdp_pa < p1->mxdp_pa)
1640 		return (-1);
1641 	if (p0->mxdp_pa > p1->mxdp_pa)
1642 		return (1);
1643 	return (0);
1644 }
1645 
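/*
 * Allocate the per-port state, query each port's initial configuration
 * from the hardware, and build its root RX flow table along with the
 * broadcast, unicast/multicast and promiscuous flow groups.
 */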
1646 static boolean_t
1647 mlxcx_setup_ports(mlxcx_t *mlxp)
1648 {
1649 	uint_t i, j;
1650 	mlxcx_port_t *p;
1651 	mlxcx_flow_table_t *ft;
1652 	mlxcx_flow_group_t *fg;
1653 	mlxcx_flow_entry_t *fe;
1654 
1655 	VERIFY3U(mlxp->mlx_nports, >, 0);
1656 	mlxp->mlx_ports_size = mlxp->mlx_nports * sizeof (mlxcx_port_t);
1657 	mlxp->mlx_ports = kmem_zalloc(mlxp->mlx_ports_size, KM_SLEEP);
1658 
1659 	for (i = 0; i < mlxp->mlx_nports; ++i) {
1660 		p = &mlxp->mlx_ports[i];
1661 		p->mlp_num = i;
1662 		p->mlp_init |= MLXCX_PORT_INIT;
1663 		mutex_init(&p->mlp_mtx, NULL, MUTEX_DRIVER,
1664 		    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1665 		mutex_enter(&p->mlp_mtx);
1666 		if (!mlxcx_cmd_query_nic_vport_ctx(mlxp, p)) {
1667 			mutex_exit(&p->mlp_mtx);
1668 			goto err;
1669 		}
1670 		if (!mlxcx_cmd_query_port_mtu(mlxp, p)) {
1671 			mutex_exit(&p->mlp_mtx);
1672 			goto err;
1673 		}
1674 		if (!mlxcx_cmd_query_port_status(mlxp, p)) {
1675 			mutex_exit(&p->mlp_mtx);
1676 			goto err;
1677 		}
1678 		if (!mlxcx_cmd_query_port_speed(mlxp, p)) {
1679 			mutex_exit(&p->mlp_mtx);
1680 			goto err;
1681 		}
1682 		if (!mlxcx_cmd_modify_nic_vport_ctx(mlxp, p,
1683 		    MLXCX_MODIFY_NIC_VPORT_CTX_PROMISC)) {
1684 			mutex_exit(&p->mlp_mtx);
1685 			goto err;
1686 		}
1687 
1688 		mutex_exit(&p->mlp_mtx);
1689 	}
1690 
1691 	for (i = 0; i < mlxp->mlx_nports; ++i) {
1692 		p = &mlxp->mlx_ports[i];
1693 		mutex_enter(&p->mlp_mtx);
1694 		p->mlp_rx_flow = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
1695 		    KM_SLEEP));
1696 		mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
1697 		    DDI_INTR_PRI(mlxp->mlx_intr_pri));
1698 
1699 		mutex_enter(&ft->mlft_mtx);
1700 
1701 		ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
1702 		ft->mlft_port = p;
1703 		ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_root_size_shift;
1704 		if (ft->mlft_entshift > mlxp->mlx_caps->mlc_max_rx_ft_shift)
1705 			ft->mlft_entshift = mlxp->mlx_caps->mlc_max_rx_ft_shift;
1706 		ft->mlft_nents = (1 << ft->mlft_entshift);
1707 		ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
1708 		ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
1709 		list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
1710 		    offsetof(mlxcx_flow_group_t, mlfg_entry));
1711 
1712 		for (j = 0; j < ft->mlft_nents; ++j) {
1713 			ft->mlft_ent[j].mlfe_table = ft;
1714 			ft->mlft_ent[j].mlfe_index = j;
1715 		}
1716 
1717 		if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
1718 			mutex_exit(&ft->mlft_mtx);
1719 			mutex_exit(&p->mlp_mtx);
1720 			goto err;
1721 		}
1722 
1723 		if (!mlxcx_cmd_set_flow_table_root(mlxp, ft)) {
1724 			mutex_exit(&ft->mlft_mtx);
1725 			mutex_exit(&p->mlp_mtx);
1726 			goto err;
1727 		}
1728 
1729 		/*
1730 		 * We match broadcast at the top of the root flow table, then
1731 		 * all multicast/unicast MACs, then the promisc entry is down
1732 		 * all multicast/unicast MACs, then the promisc entry is at
1733 		 * the very bottom.
1734 		 * This way when promisc is on, that entry simply catches any
1735 		 * remaining traffic that earlier flows haven't matched.
1736 		 */
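		/*
		 * Groups are carved out of the table in order (see
		 * mlxcx_setup_flow_group()), so the resulting layout is
		 * roughly:
		 *
		 *	[0]			broadcast (ff:ff:ff:ff:ff:ff)
		 *	[1 .. nents - 2]	unicast/multicast DMACs
		 *	[nents - 1]		promisc (catch-all)
		 */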
1737 		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1738 		list_insert_tail(&ft->mlft_groups, fg);
1739 		fg->mlfg_table = ft;
1740 		fg->mlfg_size = 1;
1741 		fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC;
1742 		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1743 			mutex_exit(&ft->mlft_mtx);
1744 			mutex_exit(&p->mlp_mtx);
1745 			goto err;
1746 		}
1747 		p->mlp_bcast = fg;
1748 		fe = list_head(&fg->mlfg_entries);
1749 		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1750 		(void) memset(fe->mlfe_dmac, 0xff, sizeof (fe->mlfe_dmac));
1751 		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
1752 
1753 		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1754 		list_insert_tail(&ft->mlft_groups, fg);
1755 		fg->mlfg_table = ft;
1756 		fg->mlfg_size = ft->mlft_nents - 2;
1757 		fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC;
1758 		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1759 			mutex_exit(&ft->mlft_mtx);
1760 			mutex_exit(&p->mlp_mtx);
1761 			goto err;
1762 		}
1763 		p->mlp_umcast = fg;
1764 
1765 		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
1766 		list_insert_tail(&ft->mlft_groups, fg);
1767 		fg->mlfg_table = ft;
1768 		fg->mlfg_size = 1;
1769 		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
1770 			mutex_exit(&ft->mlft_mtx);
1771 			mutex_exit(&p->mlp_mtx);
1772 			goto err;
1773 		}
1774 		p->mlp_promisc = fg;
1775 		fe = list_head(&fg->mlfg_entries);
1776 		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
1777 		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
1778 
1779 		avl_create(&p->mlp_dmac_fe, mlxcx_dmac_fe_compare,
1780 		    sizeof (mlxcx_flow_entry_t), offsetof(mlxcx_flow_entry_t,
1781 		    mlfe_dmac_entry));
1782 
1783 		mutex_exit(&ft->mlft_mtx);
1784 		mutex_exit(&p->mlp_mtx);
1785 	}
1786 
1787 	return (B_TRUE);
1788 
1789 err:
1790 	mlxcx_teardown_ports(mlxp);
1791 	return (B_FALSE);
1792 }
1793 
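/*
 * Remove every VLAN filter entry from a ring group. The default
 * (any-VLAN) entry is re-enabled first so that traffic keeps flowing
 * while the specific entries are torn down.
 */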
1794 void
1795 mlxcx_remove_all_vlan_entries(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
1796 {
1797 	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
1798 	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
1799 	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
1800 	mlxcx_flow_entry_t *fe;
1801 	mlxcx_group_vlan_t *v;
1802 
1803 	ASSERT(mutex_owned(&g->mlg_mtx));
1804 
1805 	mutex_enter(&ft->mlft_mtx);
1806 
1807 	if (!list_is_empty(&g->mlg_rx_vlans)) {
1808 		fe = list_head(&dfg->mlfg_entries);
1809 		(void) mlxcx_cmd_set_flow_table_entry(mlxp, fe);
1810 	}
1811 
1812 	while ((v = list_remove_head(&g->mlg_rx_vlans)) != NULL) {
1813 		fe = v->mlgv_fe;
1814 		ASSERT3P(fe->mlfe_table, ==, ft);
1815 		ASSERT3P(fe->mlfe_group, ==, fg);
1816 		kmem_free(v, sizeof (mlxcx_group_vlan_t));
1817 
1818 		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
1819 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
1820 	}
1821 
1822 	mutex_exit(&ft->mlft_mtx);
1823 }
1824 
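/*
 * Remove a single VLAN filter entry from a ring group, re-enabling the
 * default (any-VLAN) entry first if this was the last one, so no
 * traffic is dropped during the transition.
 */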
1825 boolean_t
1826 mlxcx_remove_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
1827     boolean_t tagged, uint16_t vid)
1828 {
1829 	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
1830 	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
1831 	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
1832 	mlxcx_flow_entry_t *fe;
1833 	mlxcx_group_vlan_t *v;
1834 	boolean_t found = B_FALSE;
1835 
1836 	ASSERT(mutex_owned(&g->mlg_mtx));
1837 
1838 	mutex_enter(&ft->mlft_mtx);
1839 
1840 	for (v = list_head(&g->mlg_rx_vlans); v != NULL;
1841 	    v = list_next(&g->mlg_rx_vlans, v)) {
1842 		if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) {
1843 			found = B_TRUE;
1844 			break;
1845 		}
1846 	}
1847 	if (!found) {
1848 		mutex_exit(&ft->mlft_mtx);
1849 		return (B_FALSE);
1850 	}
1851 
1852 	list_remove(&g->mlg_rx_vlans, v);
1853 
1854 	/*
1855 	 * If this is the last VLAN entry, we have to go back to accepting
1856 	 * any VLAN (which means re-enabling the default entry).
1857 	 *
1858 	 * Do this before we remove the flow entry for the last specific
1859 	 * VLAN so that we don't lose any traffic in the transition.
1860 	 */
1861 	if (list_is_empty(&g->mlg_rx_vlans)) {
1862 		fe = list_head(&dfg->mlfg_entries);
1863 		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1864 			list_insert_tail(&g->mlg_rx_vlans, v);
1865 			mutex_exit(&ft->mlft_mtx);
1866 			return (B_FALSE);
1867 		}
1868 	}
1869 
1870 	fe = v->mlgv_fe;
1871 	ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED);
1872 	ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED);
1873 	ASSERT3P(fe->mlfe_table, ==, ft);
1874 	ASSERT3P(fe->mlfe_group, ==, fg);
1875 
1876 	if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
1877 		list_insert_tail(&g->mlg_rx_vlans, v);
1878 		fe = list_head(&dfg->mlfg_entries);
1879 		if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
1880 			(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
1881 		}
1882 		mutex_exit(&ft->mlft_mtx);
1883 		return (B_FALSE);
1884 	}
1885 
1886 	fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
1887 
1888 	kmem_free(v, sizeof (mlxcx_group_vlan_t));
1889 
1890 	mutex_exit(&ft->mlft_mtx);
1891 	return (B_TRUE);
1892 }
1893 
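/*
 * Add a VLAN filter entry for a ring group, reserving a free entry in
 * the group's VLAN flow group. Adding the first VLAN also disables the
 * default entry that otherwise lets all VLANs through.
 */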
1894 boolean_t
1895 mlxcx_add_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g, boolean_t tagged,
1896     uint16_t vid)
1897 {
1898 	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
1899 	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
1900 	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
1901 	mlxcx_flow_entry_t *fe;
1902 	mlxcx_group_vlan_t *v;
1903 	boolean_t found = B_FALSE;
1904 	boolean_t first = B_FALSE;
1905 
1906 	ASSERT(mutex_owned(&g->mlg_mtx));
1907 
1908 	mutex_enter(&ft->mlft_mtx);
1909 
1910 	for (v = list_head(&g->mlg_rx_vlans); v != NULL;
1911 	    v = list_next(&g->mlg_rx_vlans, v)) {
1912 		if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) {
1913 			mutex_exit(&ft->mlft_mtx);
1914 			return (B_TRUE);
1915 		}
1916 	}
1917 	if (list_is_empty(&g->mlg_rx_vlans))
1918 		first = B_TRUE;
1919 
1920 	for (fe = list_head(&fg->mlfg_entries); fe != NULL;
1921 	    fe = list_next(&fg->mlfg_entries, fe)) {
1922 		if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) {
1923 			found = B_TRUE;
1924 			break;
1925 		}
1926 	}
1927 	if (!found) {
1928 		mutex_exit(&ft->mlft_mtx);
1929 		return (B_FALSE);
1930 	}
1931 
1932 	v = kmem_zalloc(sizeof (mlxcx_group_vlan_t), KM_SLEEP);
1933 	v->mlgv_fe = fe;
1934 	v->mlgv_tagged = tagged;
1935 	v->mlgv_vid = vid;
1936 
1937 	fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED;
1938 	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
1939 	fe->mlfe_vid = vid;
1940 	if (tagged) {
1941 		fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_CVLAN;
1942 	} else {
1943 		fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_NONE;
1944 	}
1945 
1946 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
1947 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY;
1948 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
1949 		kmem_free(v, sizeof (mlxcx_group_vlan_t));
1950 		mutex_exit(&ft->mlft_mtx);
1951 		return (B_FALSE);
1952 	}
1953 
1954 	list_insert_tail(&g->mlg_rx_vlans, v);
1955 
1956 	/*
1957 	 * If the vlan list was empty for this group before adding this one,
1958 	 * then we no longer want the "default" entry to allow all VLANs
1959 	 * through.
1960 	 */
1961 	if (first) {
1962 		fe = list_head(&dfg->mlfg_entries);
1963 		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
1964 	}
1965 
1966 	mutex_exit(&ft->mlft_mtx);
1967 	return (B_TRUE);
1968 }
1969 
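/*
 * Detach a ring group from every MAC address it is registered for,
 * rewriting each affected flow entry's destination list and deleting
 * entries that are left with no destinations at all.
 */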
1970 void
1971 mlxcx_remove_all_umcast_entries(mlxcx_t *mlxp, mlxcx_port_t *port,
1972     mlxcx_ring_group_t *group)
1973 {
1974 	mlxcx_flow_entry_t *fe;
1975 	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
1976 	mlxcx_group_mac_t *gm, *ngm;
1977 
1978 	ASSERT(mutex_owned(&port->mlp_mtx));
1979 	ASSERT(mutex_owned(&group->mlg_mtx));
1980 
1981 	mutex_enter(&ft->mlft_mtx);
1982 
1983 	gm = avl_first(&group->mlg_rx_macs);
1984 	for (; gm != NULL; gm = ngm) {
1985 		ngm = AVL_NEXT(&group->mlg_rx_macs, gm);
1986 
1987 		ASSERT3P(gm->mlgm_group, ==, group);
1988 		fe = gm->mlgm_fe;
1989 		ASSERT3P(fe->mlfe_table, ==, ft);
1990 
1991 		avl_remove(&group->mlg_rx_macs, gm);
1992 		list_remove(&fe->mlfe_ring_groups, gm);
1993 		kmem_free(gm, sizeof (mlxcx_group_mac_t));
1994 
1995 		fe->mlfe_ndest = 0;
1996 		for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL;
1997 		    gm = list_next(&fe->mlfe_ring_groups, gm)) {
1998 			fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow =
1999 			    gm->mlgm_group->mlg_rx_vlan_ft;
2000 		}
2001 		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2002 
2003 		if (fe->mlfe_ndest > 0) {
2004 			(void) mlxcx_cmd_set_flow_table_entry(mlxp, fe);
2005 			continue;
2006 		}
2007 
2008 		/*
2009 		 * There are no more ring groups left for this MAC (it wasn't
2010 		 * attached to any other groups since ndest == 0), so clean up
2011 		 * its flow entry.
2012 		 */
2013 		avl_remove(&port->mlp_dmac_fe, fe);
2014 		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
2015 		list_destroy(&fe->mlfe_ring_groups);
2016 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2017 	}
2018 
2019 	mutex_exit(&ft->mlft_mtx);
2020 }
2021 
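/*
 * Detach a ring group from a unicast/multicast MAC address, deleting
 * the underlying flow entry entirely if this group was its last
 * remaining destination.
 */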
2022 boolean_t
2023 mlxcx_remove_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port,
2024     mlxcx_ring_group_t *group, const uint8_t *macaddr)
2025 {
2026 	mlxcx_flow_entry_t *fe;
2027 	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
2028 	mlxcx_group_mac_t *gm, probe;
2029 
2030 	ASSERT(mutex_owned(&port->mlp_mtx));
2031 	ASSERT(mutex_owned(&group->mlg_mtx));
2032 
2033 	bzero(&probe, sizeof (probe));
2034 	bcopy(macaddr, probe.mlgm_mac, sizeof (probe.mlgm_mac));
2035 
2036 	mutex_enter(&ft->mlft_mtx);
2037 
2038 	gm = avl_find(&group->mlg_rx_macs, &probe, NULL);
2039 	if (gm == NULL) {
2040 		mutex_exit(&ft->mlft_mtx);
2041 		return (B_FALSE);
2042 	}
2043 	ASSERT3P(gm->mlgm_group, ==, group);
2044 	ASSERT0(bcmp(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac)));
2045 
2046 	fe = gm->mlgm_fe;
2047 	ASSERT3P(fe->mlfe_table, ==, ft);
2048 	ASSERT0(bcmp(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac)));
2049 
2050 	list_remove(&fe->mlfe_ring_groups, gm);
2051 	avl_remove(&group->mlg_rx_macs, gm);
2052 	kmem_free(gm, sizeof (mlxcx_group_mac_t));
2053 
2054 	fe->mlfe_ndest = 0;
2055 	for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL;
2056 	    gm = list_next(&fe->mlfe_ring_groups, gm)) {
2057 		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow =
2058 		    gm->mlgm_group->mlg_rx_vlan_ft;
2059 	}
2060 	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2061 
2062 	if (fe->mlfe_ndest > 0) {
2063 		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
2064 			mutex_exit(&ft->mlft_mtx);
2065 			return (B_FALSE);
2066 		}
2067 		mutex_exit(&ft->mlft_mtx);
2068 		return (B_TRUE);
2069 	}
2070 
2071 	/*
2072 	 * There are no more ring groups left for this MAC (it wasn't attached
2073 	 * to any other groups since ndest == 0), so clean up its flow entry.
2074 	 */
2075 	avl_remove(&port->mlp_dmac_fe, fe);
2076 	(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
2077 	list_destroy(&fe->mlfe_ring_groups);
2078 
2079 	fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2080 
2081 	mutex_exit(&ft->mlft_mtx);
2082 
2083 	return (B_TRUE);
2084 }
2085 
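/*
 * Attach a ring group to a unicast/multicast MAC address. If a flow
 * entry already exists for the MAC, the group's VLAN flow table is
 * appended to its destination list; otherwise a fresh entry is
 * reserved from the port's umcast flow group.
 */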
2086 boolean_t
2087 mlxcx_add_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port,
2088     mlxcx_ring_group_t *group, const uint8_t *macaddr)
2089 {
2090 	mlxcx_flow_group_t *fg;
2091 	mlxcx_flow_entry_t *fe, probe;
2092 	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
2093 	mlxcx_group_mac_t *gm;
2094 	boolean_t found = B_FALSE;
2095 
2096 	ASSERT(mutex_owned(&port->mlp_mtx));
2097 	ASSERT(mutex_owned(&group->mlg_mtx));
2098 
2099 	bzero(&probe, sizeof (probe));
2100 	bcopy(macaddr, probe.mlfe_dmac, sizeof (probe.mlfe_dmac));
2101 
2102 	mutex_enter(&ft->mlft_mtx);
2103 
2104 	fe = avl_find(&port->mlp_dmac_fe, &probe, NULL);
2105 
2106 	if (fe == NULL) {
2107 		fg = port->mlp_umcast;
2108 		for (fe = list_head(&fg->mlfg_entries); fe != NULL;
2109 		    fe = list_next(&fg->mlfg_entries, fe)) {
2110 			if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) {
2111 				found = B_TRUE;
2112 				break;
2113 			}
2114 		}
2115 		if (!found) {
2116 			mutex_exit(&ft->mlft_mtx);
2117 			return (B_FALSE);
2118 		}
2119 		list_create(&fe->mlfe_ring_groups, sizeof (mlxcx_group_mac_t),
2120 		    offsetof(mlxcx_group_mac_t, mlgm_fe_entry));
2121 		fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED;
2122 		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
2123 		bcopy(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac));
2124 
2125 		avl_add(&port->mlp_dmac_fe, fe);
2126 	}
2127 
2128 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = group->mlg_rx_vlan_ft;
2129 	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2130 
2131 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
2132 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY;
2133 		if (--fe->mlfe_ndest == 0) {
2134 			fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2135 		}
2136 		mutex_exit(&ft->mlft_mtx);
2137 		return (B_FALSE);
2138 	}
2139 
2140 	gm = kmem_zalloc(sizeof (mlxcx_group_mac_t), KM_SLEEP);
2141 	gm->mlgm_group = group;
2142 	gm->mlgm_fe = fe;
2143 	bcopy(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac));
2144 	avl_add(&group->mlg_rx_macs, gm);
2145 	list_insert_tail(&fe->mlfe_ring_groups, gm);
2146 
2147 	mutex_exit(&ft->mlft_mtx);
2148 
2149 	return (B_TRUE);
2150 }
2151 
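/*
 * Carve out the next mlfg_size entries of the flow table for this flow
 * group, create the group on the hardware and thread the entries onto
 * the group's list.
 */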
2152 boolean_t
2153 mlxcx_setup_flow_group(mlxcx_t *mlxp, mlxcx_flow_table_t *ft,
2154     mlxcx_flow_group_t *fg)
2155 {
2156 	mlxcx_flow_entry_t *fe;
2157 	uint_t i, idx;
2158 
2159 	ASSERT(mutex_owned(&ft->mlft_mtx));
2160 	ASSERT(ft->mlft_state & MLXCX_FLOW_TABLE_CREATED);
2161 	ASSERT3P(fg->mlfg_table, ==, ft);
2162 
2163 	if (ft->mlft_next_ent + fg->mlfg_size > ft->mlft_nents)
2164 		return (B_FALSE);
2165 	fg->mlfg_start_idx = ft->mlft_next_ent;
2166 
2167 	if (!mlxcx_cmd_create_flow_group(mlxp, fg)) {
2168 		return (B_FALSE);
2169 	}
2170 
2171 	list_create(&fg->mlfg_entries, sizeof (mlxcx_flow_entry_t),
2172 	    offsetof(mlxcx_flow_entry_t, mlfe_group_entry));
2173 	for (i = 0; i < fg->mlfg_size; ++i) {
2174 		idx = fg->mlfg_start_idx + i;
2175 		fe = &ft->mlft_ent[idx];
2176 		fe->mlfe_group = fg;
2177 		list_insert_tail(&fg->mlfg_entries, fe);
2178 	}
2179 	fg->mlfg_avail = fg->mlfg_size;
2180 	ft->mlft_next_ent += fg->mlfg_size;
2181 
2182 	return (B_TRUE);
2183 }
2184 
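/*
 * Set up event queue #0, which is reserved for control events (page
 * requests, port state changes, errors and the like) rather than
 * completions, and enable and arm its interrupt.
 */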
2185 static boolean_t
2186 mlxcx_setup_eq0(mlxcx_t *mlxp)
2187 {
2188 	mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[0];
2189 
2190 	mutex_enter(&mleq->mleq_mtx);
2191 	if (!mlxcx_eq_alloc_dma(mlxp, mleq)) {
2192 		/* mlxcx_teardown_eqs() will clean this up */
2193 		mutex_exit(&mleq->mleq_mtx);
2194 		return (B_FALSE);
2195 	}
2196 	mleq->mleq_mlx = mlxp;
2197 	mleq->mleq_uar = &mlxp->mlx_uar;
2198 	mleq->mleq_events =
2199 	    (1ULL << MLXCX_EVENT_PAGE_REQUEST) |
2200 	    (1ULL << MLXCX_EVENT_PORT_STATE) |
2201 	    (1ULL << MLXCX_EVENT_INTERNAL_ERROR) |
2202 	    (1ULL << MLXCX_EVENT_PORT_MODULE) |
2203 	    (1ULL << MLXCX_EVENT_SENDQ_DRAIN) |
2204 	    (1ULL << MLXCX_EVENT_LAST_WQE) |
2205 	    (1ULL << MLXCX_EVENT_CQ_ERROR) |
2206 	    (1ULL << MLXCX_EVENT_WQ_CATASTROPHE) |
2207 	    (1ULL << MLXCX_EVENT_PAGE_FAULT) |
2208 	    (1ULL << MLXCX_EVENT_WQ_INVALID_REQ) |
2209 	    (1ULL << MLXCX_EVENT_WQ_ACCESS_VIOL) |
2210 	    (1ULL << MLXCX_EVENT_NIC_VPORT) |
2211 	    (1ULL << MLXCX_EVENT_DOORBELL_CONGEST);
2212 	if (!mlxcx_cmd_create_eq(mlxp, mleq)) {
2213 		/* mlxcx_teardown_eqs() will clean this up */
2214 		mutex_exit(&mleq->mleq_mtx);
2215 		return (B_FALSE);
2216 	}
2217 	if (ddi_intr_enable(mlxp->mlx_intr_handles[0]) != DDI_SUCCESS) {
2218 		/*
2219 		 * mlxcx_teardown_eqs() will handle calling cmd_destroy_eq and
2220 		 * eq_rele_dma
2221 		 */
2222 		mutex_exit(&mleq->mleq_mtx);
2223 		return (B_FALSE);
2224 	}
2225 	mlxcx_arm_eq(mlxp, mleq);
2226 	mutex_exit(&mleq->mleq_mtx);
2227 	return (B_TRUE);
2228 }
2229 
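/*
 * AVL comparator for completion queues: ordered by CQ number.
 */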
2230 int
2231 mlxcx_cq_compare(const void *arg0, const void *arg1)
2232 {
2233 	const mlxcx_completion_queue_t *left = arg0;
2234 	const mlxcx_completion_queue_t *right = arg1;
2235 
2236 	if (left->mlcq_num < right->mlcq_num) {
2237 		return (-1);
2238 	}
2239 	if (left->mlcq_num > right->mlcq_num) {
2240 		return (1);
2241 	}
2242 	return (0);
2243 }
2244 
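/*
 * Set up the remaining EQs (1 and up), which service completion events
 * for our CQs, applying interrupt moderation if configured and then
 * enabling and arming each vector.
 */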
2245 static boolean_t
2246 mlxcx_setup_eqs(mlxcx_t *mlxp)
2247 {
2248 	uint_t i;
2249 	mlxcx_event_queue_t *mleq;
2250 
2251 	ASSERT3S(mlxp->mlx_intr_count, >, 0);
2252 
2253 	for (i = 1; i < mlxp->mlx_intr_count; ++i) {
2254 		mleq = &mlxp->mlx_eqs[i];
2255 		mutex_enter(&mleq->mleq_mtx);
2256 		if (!mlxcx_eq_alloc_dma(mlxp, mleq)) {
2257 			mutex_exit(&mleq->mleq_mtx);
2258 			return (B_FALSE);
2259 		}
2260 		mleq->mleq_uar = &mlxp->mlx_uar;
2261 		if (!mlxcx_cmd_create_eq(mlxp, mleq)) {
2262 			/* mlxcx_teardown() will handle calling eq_rele_dma */
2263 			mutex_exit(&mleq->mleq_mtx);
2264 			return (B_FALSE);
2265 		}
2266 		if (mlxp->mlx_props.mldp_intrmod_period_usec != 0 &&
2267 		    !mlxcx_cmd_set_int_mod(mlxp, i,
2268 		    mlxp->mlx_props.mldp_intrmod_period_usec)) {
2269 			mutex_exit(&mleq->mleq_mtx);
2270 			return (B_FALSE);
2271 		}
2272 		if (ddi_intr_enable(mlxp->mlx_intr_handles[i]) != DDI_SUCCESS) {
2273 			mutex_exit(&mleq->mleq_mtx);
2274 			return (B_FALSE);
2275 		}
2276 		mlxcx_arm_eq(mlxp, mleq);
2277 		mutex_exit(&mleq->mleq_mtx);
2278 	}
2279 
2280 	mlxp->mlx_next_eq = 1;
2281 
2282 	return (B_TRUE);
2283 }
2284 
2285 /*
2286  * Snapshot all of the hardware capabilities that we care about and then modify
2287  * the HCA capabilities to get things moving.
2288  */
2289 static boolean_t
2290 mlxcx_init_caps(mlxcx_t *mlxp)
2291 {
2292 	mlxcx_caps_t *c;
2293 
2294 	mlxp->mlx_caps = c = kmem_zalloc(sizeof (mlxcx_caps_t), KM_SLEEP);
2295 
2296 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
2297 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_hca_cur)) {
2298 		mlxcx_warn(mlxp, "failed to obtain current HCA general caps");
2299 	}
2300 
2301 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
2302 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_hca_max)) {
2303 		mlxcx_warn(mlxp, "failed to obtain maximum HCA general caps");
2304 	}
2305 
2306 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
2307 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_ether_cur)) {
2308 		mlxcx_warn(mlxp, "failed to obtain current HCA eth caps");
2309 	}
2310 
2311 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
2312 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_ether_max)) {
2313 		mlxcx_warn(mlxp, "failed to obtain maximum HCA eth caps");
2314 	}
2315 
2316 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
2317 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_nic_flow_cur)) {
2318 		mlxcx_warn(mlxp, "failed to obtain current HCA flow caps");
2319 	}
2320 
2321 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
2322 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_nic_flow_max)) {
2323 		mlxcx_warn(mlxp, "failed to obtain maximum HCA flow caps");
2324 	}
2325 
2326 	/*
2327 	 * Check the caps meet our requirements.
2328 	 */
2329 	const mlxcx_hca_cap_general_caps_t *gen = &c->mlc_hca_cur.mhc_general;
2330 
2331 	if (gen->mlcap_general_log_pg_sz != 12) {
2332 		mlxcx_warn(mlxp, "!hardware has page size != 4k "
2333 		    "(log_pg_sz = %u)", (uint_t)gen->mlcap_general_log_pg_sz);
2334 		goto err;
2335 	}
2336 	if (gen->mlcap_general_cqe_version != 1) {
2337 		mlxcx_warn(mlxp, "!hardware does not support CQE v1 "
2338 		    "(cqe_ver = %u)", (uint_t)gen->mlcap_general_cqe_version);
2339 		goto err;
2340 	}
2341 	if (gen->mlcap_general_port_type !=
2342 	    MLXCX_CAP_GENERAL_PORT_TYPE_ETHERNET) {
2343 		mlxcx_warn(mlxp, "!hardware has non-ethernet ports");
2344 		goto err;
2345 	}
2346 	mlxp->mlx_nports = gen->mlcap_general_num_ports;
2347 	mlxp->mlx_max_sdu = (1 << (gen->mlcap_general_log_max_msg & 0x1F));
2348 
2349 	c->mlc_max_tir = (1 << gen->mlcap_general_log_max_tir);
2350 
2351 	c->mlc_checksum = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags,
2352 	    MLXCX_ETH_CAP_CSUM_CAP);
2353 	c->mlc_vxlan = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags,
2354 	    MLXCX_ETH_CAP_TUNNEL_STATELESS_VXLAN);
2355 
2356 	c->mlc_max_lso_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth.
2357 	    mlcap_eth_flags, MLXCX_ETH_CAP_MAX_LSO_CAP));
2358 	if (c->mlc_max_lso_size == 1) {
2359 		c->mlc_max_lso_size = 0;
2360 		c->mlc_lso = B_FALSE;
2361 	} else {
2362 		c->mlc_lso = B_TRUE;
2363 	}
2364 
2365 	c->mlc_max_rqt_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth.
2366 	    mlcap_eth_flags, MLXCX_ETH_CAP_RSS_IND_TBL_CAP));
2367 
2368 	if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2369 	    mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_SUPPORT)) {
2370 		mlxcx_warn(mlxp, "!hardware does not support rx flow tables");
2371 		goto err;
2372 	}
2373 	if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2374 	    mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_MODIFY)) {
2375 		mlxcx_warn(mlxp, "!hardware does not support modifying rx "
2376 		    "flow table entries");
2377 		goto err;
2378 	}
2379 
2380 	c->mlc_max_rx_ft_shift = c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2381 	    mlcap_flow_prop_log_max_ft_size;
2382 	c->mlc_max_rx_flows = (1 << c->mlc_nic_flow_cur.mhc_flow.
2383 	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_flow);
2384 	c->mlc_max_rx_fe_dest = (1 << c->mlc_nic_flow_cur.mhc_flow.
2385 	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_destination);
2386 
2387 	return (B_TRUE);
2388 
2389 err:
2390 	kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
2391 	return (B_FALSE);
2392 }
2393 
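/*
 * detach(9E) entry point. We have to unregister from MAC first; if that
 * fails, the detach is aborted and the rest of our state stays intact.
 */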
2394 static int
2395 mlxcx_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2396 {
2397 	mlxcx_t *mlxp;
2398 
2399 	if (cmd != DDI_DETACH)
2400 		return (DDI_FAILURE);
2401 
2402 	mlxp = ddi_get_driver_private(dip);
2403 	if (mlxp == NULL) {
2404 		mlxcx_warn(NULL, "asked to detach, but missing instance "
2405 		    "private data");
2406 		return (DDI_FAILURE);
2407 	}
2408 
2409 	if (mlxp->mlx_attach & MLXCX_ATTACH_MAC_HDL) {
2410 		if (mac_unregister(mlxp->mlx_mac_hdl) != DDI_SUCCESS) {
2411 			return (DDI_FAILURE);
2412 		}
2413 		mlxp->mlx_attach &= ~MLXCX_ATTACH_MAC_HDL;
2414 	}
2415 
2416 	mlxcx_teardown(mlxp);
2417 	return (DDI_SUCCESS);
2418 }
2419 
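/*
 * Work out how many RX ring groups we can actually support: start from
 * the configured count and clamp it by the number of TIRs available,
 * by the root flow table size (less the two entries reserved for the
 * broadcast and promisc groups), and finally by the total number of RX
 * flows the hardware can track.
 */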
2420 static size_t
2421 mlxcx_calc_rx_ngroups(mlxcx_t *mlxp)
2422 {
2423 	size_t ngroups = mlxp->mlx_props.mldp_rx_ngroups_large +
2424 	    mlxp->mlx_props.mldp_rx_ngroups_small;
2425 	size_t tirlim, flowlim, gflowlim;
2426 
2427 	tirlim = mlxp->mlx_caps->mlc_max_tir / MLXCX_TIRS_PER_GROUP;
2428 	if (tirlim < ngroups) {
2429 		mlxcx_note(mlxp, "limiting number of rx groups to %lu based "
2430 		    "on number of TIRs available", (ulong_t)tirlim);
2431 		ngroups = tirlim;
2432 	}
2433 
2434 	flowlim = (1 << mlxp->mlx_caps->mlc_max_rx_ft_shift) - 2;
2435 	if (flowlim < ngroups) {
2436 		mlxcx_note(mlxp, "limiting number of rx groups to %lu based "
2437 		    "on max size of RX flow tables", (ulong_t)flowlim);
2438 		ngroups = flowlim;
2439 	}
2440 
2441 	do {
2442 		gflowlim = mlxp->mlx_caps->mlc_max_rx_flows - 16 * ngroups - 2;
2443 		if (gflowlim < ngroups) {
2444 			mlxcx_note(mlxp, "limiting number of rx groups to %lu "
2445 			    "based on max total RX flows", (ulong_t)gflowlim);
2446 			--ngroups;
2447 		}
2448 	} while (gflowlim < ngroups);
2449 
2450 	return (ngroups);
2451 }
2452 
2453 static int
2454 mlxcx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2455 {
2456 	mlxcx_t *mlxp;
2457 	uint_t i;
2458 	int inst, ret;
2459 
2460 	if (cmd != DDI_ATTACH)
2461 		return (DDI_FAILURE);
2462 
2463 	inst = ddi_get_instance(dip);
2464 	ret = ddi_soft_state_zalloc(mlxcx_softstate, inst);
2465 	if (ret != 0)
2466 		return (ret);
2467 
2468 	mlxp = ddi_get_soft_state(mlxcx_softstate, inst);
2469 	if (mlxp == NULL)
2470 		return (DDI_FAILURE);
2471 	mlxp->mlx_dip = dip;
2472 	mlxp->mlx_inst = inst;
2473 	ddi_set_driver_private(dip, mlxp);
2474 
2475 	mlxcx_load_props(mlxp);
2476 
2477 	mlxcx_fm_init(mlxp);
2478 	mlxp->mlx_attach |= MLXCX_ATTACH_FM;
2479 
2480 	if (pci_config_setup(mlxp->mlx_dip, &mlxp->mlx_cfg_handle) !=
2481 	    DDI_SUCCESS) {
2482 		mlxcx_warn(mlxp, "failed to initialize PCI config space");
2483 		goto err;
2484 	}
2485 	mlxp->mlx_attach |= MLXCX_ATTACH_PCI_CONFIG;
2486 
2487 	if (!mlxcx_regs_map(mlxp)) {
2488 		goto err;
2489 	}
2490 	mlxp->mlx_attach |= MLXCX_ATTACH_REGS;
2491 
2492 	if (!mlxcx_cmd_queue_init(mlxp)) {
2493 		goto err;
2494 	}
2495 	mlxp->mlx_attach |= MLXCX_ATTACH_CMD;
2496 
2497 	if (!mlxcx_cmd_enable_hca(mlxp)) {
2498 		goto err;
2499 	}
2500 	mlxp->mlx_attach |= MLXCX_ATTACH_ENABLE_HCA;
2501 
2502 	if (!mlxcx_check_issi(mlxp)) {
2503 		goto err;
2504 	}
2505 
2506 	/*
2507 	 * We have to get our interrupts now so we know what priority to
2508 	 * create pagemtx with.
2509 	 */
2510 	if (!mlxcx_intr_setup(mlxp)) {
2511 		goto err;
2512 	}
2513 	mlxp->mlx_attach |= MLXCX_ATTACH_INTRS;
2514 
2515 	mutex_init(&mlxp->mlx_pagemtx, NULL, MUTEX_DRIVER,
2516 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
2517 	avl_create(&mlxp->mlx_pages, mlxcx_page_compare,
2518 	    sizeof (mlxcx_dev_page_t), offsetof(mlxcx_dev_page_t, mxdp_tree));
2519 	mlxp->mlx_attach |= MLXCX_ATTACH_PAGE_LIST;
2520 
2521 	if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_BOOT)) {
2522 		goto err;
2523 	}
2524 
2525 	if (!mlxcx_init_caps(mlxp)) {
2526 		goto err;
2527 	}
2528 	mlxp->mlx_attach |= MLXCX_ATTACH_CAPS;
2529 
2530 	if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_INIT)) {
2531 		goto err;
2532 	}
2533 
2534 	if (!mlxcx_cmd_init_hca(mlxp)) {
2535 		goto err;
2536 	}
2537 	mlxp->mlx_attach |= MLXCX_ATTACH_INIT_HCA;
2538 
2539 	if (!mlxcx_cmd_set_driver_version(mlxp, MLXCX_DRIVER_VERSION)) {
2540 		goto err;
2541 	}
2542 
2543 	/*
2544 	 * The User Access Region (UAR) is needed so we can ring EQ and CQ
2545 	 * doorbells.
2546 	 */
2547 	if (!mlxcx_cmd_alloc_uar(mlxp, &mlxp->mlx_uar)) {
2548 		goto err;
2549 	}
2550 	for (i = 0; i < MLXCX_BF_PER_UAR; ++i) {
2551 		mutex_init(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx, NULL,
2552 		    MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_intr_pri));
2553 	}
2554 	mlxp->mlx_attach |= MLXCX_ATTACH_UAR_PD_TD;
2555 
2556 	/*
2557 	 * Set up event queue #0 -- it's special and only handles control
2558 	 * type events, like PAGE_REQUEST (which we will probably get during
2559 	 * the commands below).
2560 	 *
2561 	 * This will enable and arm the interrupt on EQ 0, too.
2562 	 */
2563 	if (!mlxcx_setup_eq0(mlxp)) {
2564 		goto err;
2565 	}
2566 
2567 	/*
2568 	 * Allocate a protection and transport domain. These don't really do
2569 	 * anything for us (they're IB concepts), but we need to supply their
2570 	 * ID numbers in other commands.
2571 	 */
2572 	if (!mlxcx_cmd_alloc_pd(mlxp, &mlxp->mlx_pd)) {
2573 		goto err;
2574 	}
2575 	if (!mlxcx_cmd_alloc_tdom(mlxp, &mlxp->mlx_tdom)) {
2576 		goto err;
2577 	}
2578 	/*
2579 	 * Fetch the "reserved" lkey that lets us give linear addresses in
2580 	 * work queue entries, rather than having to mess with the NIC's
2581 	 * internal MMU.
2582 	 */
2583 	if (!mlxcx_cmd_query_special_ctxs(mlxp)) {
2584 		goto err;
2585 	}
2586 
2587 	/*
2588 	 * Query our port information and current state, populate the
2589 	 * mlxcx_port_t structs.
2590 	 *
2591 	 * This also sets up the root flow tables and flow groups.
2592 	 */
2593 	if (!mlxcx_setup_ports(mlxp)) {
2594 		goto err;
2595 	}
2596 	mlxp->mlx_attach |= MLXCX_ATTACH_PORTS;
2597 
2598 	/*
2599 	 * Set up, enable and arm the rest of the interrupt EQs which will
2600 	 * service events from CQs.
2601 	 *
2602 	 * The MLXCX_ATTACH_INTRS flag covers checking if these need to be
2603 	 * cleaned up.
2604 	 */
2605 	if (!mlxcx_setup_eqs(mlxp)) {
2606 		goto err;
2607 	}
2608 
2609 	/* Completion queues */
2610 	list_create(&mlxp->mlx_cqs, sizeof (mlxcx_completion_queue_t),
2611 	    offsetof(mlxcx_completion_queue_t, mlcq_entry));
2612 	mlxp->mlx_attach |= MLXCX_ATTACH_CQS;
2613 
2614 	/* Work queues (send queues, receive queues) */
2615 	list_create(&mlxp->mlx_wqs, sizeof (mlxcx_work_queue_t),
2616 	    offsetof(mlxcx_work_queue_t, mlwq_entry));
2617 	mlxp->mlx_attach |= MLXCX_ATTACH_WQS;
2618 
2619 	/* Set up periodic fault check timers which check the queue states */
2620 	if (!mlxcx_setup_checktimers(mlxp)) {
2621 		goto err;
2622 	}
2623 	mlxp->mlx_attach |= MLXCX_ATTACH_CHKTIMERS;
2624 
2625 	/*
2626 	 * Construct our arrays of mlxcx_ring_group_ts, which represent the
2627 	 * "groups" we advertise to MAC.
2628 	 */
2629 	mlxp->mlx_rx_ngroups = mlxcx_calc_rx_ngroups(mlxp);
2630 	mlxp->mlx_rx_groups_size = mlxp->mlx_rx_ngroups *
2631 	    sizeof (mlxcx_ring_group_t);
2632 	mlxp->mlx_rx_groups = kmem_zalloc(mlxp->mlx_rx_groups_size, KM_SLEEP);
2633 
2634 	mlxp->mlx_tx_ngroups = mlxp->mlx_props.mldp_tx_ngroups;
2635 	mlxp->mlx_tx_groups_size = mlxp->mlx_tx_ngroups *
2636 	    sizeof (mlxcx_ring_group_t);
2637 	mlxp->mlx_tx_groups = kmem_zalloc(mlxp->mlx_tx_groups_size, KM_SLEEP);
2638 
2639 	mlxp->mlx_attach |= MLXCX_ATTACH_GROUPS;
2640 
2641 	/*
2642 	 * Set up the free/busy buffer lists used to keep track of packet
2643 	 * buffers.
2644 	 */
2645 	if (!mlxcx_setup_bufs(mlxp))
2646 		goto err;
2647 	mlxp->mlx_attach |= MLXCX_ATTACH_BUFS;
2648 
2649 	/*
2650 	 * Before we tell MAC about our rings/groups, we need to do enough
2651 	 * setup on them to be sure about the numbers and configuration that
2652 	 * we have. This will do basically everything short of allocating
2653 	 * packet buffers and starting the rings up.
2654 	 */
2655 	for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) {
2656 		if (!mlxcx_tx_group_setup(mlxp, &mlxp->mlx_tx_groups[i]))
2657 			goto err;
2658 	}
2659 	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
2660 		if (!mlxcx_rx_group_setup(mlxp, &mlxp->mlx_rx_groups[i]))
2661 			goto err;
2662 	}
2663 
2664 	/*
2665 	 * Finally, tell MAC that we exist!
2666 	 */
2667 	if (!mlxcx_register_mac(mlxp)) {
2668 		goto err;
2669 	}
2670 	mlxp->mlx_attach |= MLXCX_ATTACH_MAC_HDL;
2671 
2672 	return (DDI_SUCCESS);
2673 
2674 err:
2675 	mlxcx_teardown(mlxp);
2676 	return (DDI_FAILURE);
2677 }
2678 
2679 static struct cb_ops mlxcx_cb_ops = {
2680 	.cb_open = nulldev,
2681 	.cb_close = nulldev,
2682 	.cb_strategy = nodev,
2683 	.cb_print = nodev,
2684 	.cb_dump = nodev,
2685 	.cb_read = nodev,
2686 	.cb_write = nodev,
2687 	.cb_ioctl = nodev,
2688 	.cb_devmap = nodev,
2689 	.cb_mmap = nodev,
2690 	.cb_segmap = nodev,
2691 	.cb_chpoll = nochpoll,
2692 	.cb_prop_op = ddi_prop_op,
2693 	.cb_flag = D_MP,
2694 	.cb_rev = CB_REV,
2695 	.cb_aread = nodev,
2696 	.cb_awrite = nodev
2697 };
2698 
2699 static struct dev_ops mlxcx_dev_ops = {
2700 	.devo_rev = DEVO_REV,
2701 	.devo_refcnt = 0,
2702 	.devo_getinfo = NULL,
2703 	.devo_identify = nulldev,
2704 	.devo_probe = nulldev,
2705 	.devo_attach = mlxcx_attach,
2706 	.devo_detach = mlxcx_detach,
2707 	.devo_reset = nodev,
2708 	.devo_power = ddi_power,
2709 	.devo_quiesce = ddi_quiesce_not_supported,
2710 	.devo_cb_ops = &mlxcx_cb_ops
2711 };
2712 
2713 static struct modldrv mlxcx_modldrv = {
2714 	.drv_modops = &mod_driverops,
2715 	.drv_linkinfo = "Mellanox Connect-X 4/5/6",
2716 	.drv_dev_ops = &mlxcx_dev_ops
2717 };
2718 
2719 static struct modlinkage mlxcx_modlinkage = {
2720 	.ml_rev = MODREV_1,
2721 	.ml_linkage = { &mlxcx_modldrv, NULL }
2722 };
2723 
2724 int
2725 _init(void)
2726 {
2727 	int ret;
2728 
2729 	ret = ddi_soft_state_init(&mlxcx_softstate, sizeof (mlxcx_t), 0);
2730 	if (ret != 0) {
2731 		return (ret);
2732 	}
2733 
2734 	mac_init_ops(&mlxcx_dev_ops, MLXCX_MODULE_NAME);
2735 
2736 	if ((ret = mod_install(&mlxcx_modlinkage)) != DDI_SUCCESS) {
2737 		mac_fini_ops(&mlxcx_dev_ops);
2738 		ddi_soft_state_fini(&mlxcx_softstate);
2739 		return (ret);
2740 	}
2741 
2742 	return (DDI_SUCCESS);
2743 }
2744 
2745 int
2746 _info(struct modinfo *modinfop)
2747 {
2748 	return (mod_info(&mlxcx_modlinkage, modinfop));
2749 }
2750 
2751 int
2752 _fini(void)
2753 {
2754 	int ret;
2755 
2756 	if ((ret = mod_remove(&mlxcx_modlinkage)) != DDI_SUCCESS) {
2757 		return (ret);
2758 	}
2759 
2760 	mac_fini_ops(&mlxcx_dev_ops);
2761 
2762 	ddi_soft_state_fini(&mlxcx_softstate);
2763 
2764 	return (DDI_SUCCESS);
2765 }
2766