xref: /illumos-gate/usr/src/uts/common/io/mlxcx/mlxcx.c (revision c94be9439c4f0773ef60e2cec21d548359cfea20)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2020, The University of Queensland
14  * Copyright (c) 2018, Joyent, Inc.
15  * Copyright 2020 RackTop Systems, Inc.
16  */
17 
18 /*
19  * Mellanox Connect-X 4/5/6 driver.
20  */
21 
22 /*
23  * The PRM for this family of parts is freely available, and can be found at:
24  * https://www.mellanox.com/related-docs/user_manuals/ \
25  *   Ethernet_Adapters_Programming_Manual.pdf
26  */
27 /*
28  * ConnectX glossary
29  * -----------------
30  *
31  * WR		Work Request: something we've asked the hardware to do by
32  *		creating a Work Queue Entry (WQE), e.g. send or recv a packet
33  *
34  * WQE		Work Queue Entry: a descriptor on a work queue descriptor ring
35  *
36  * WQ		Work Queue: a descriptor ring that we can place WQEs on, usually
37  *		either a Send Queue (SQ) or Receive Queue (RQ). Different WQ
38  *		types have different WQE structures, different commands for
39  *		creating and destroying them, etc, but share a common context
40  *		structure, counter setup and state graph.
41  * SQ		Send Queue, a specific type of WQ that sends packets
42  * RQ		Receive Queue, a specific type of WQ that receives packets
43  *
44  * CQ		Completion Queue: completion of WRs from a WQ are reported to
45  *		one of these, as a CQE on its entry ring.
46  * CQE		Completion Queue Entry: an entry in a CQ ring. Contains error
47  *		info, as well as packet size, the ID of the WQ, and the index
48  *		of the WQE which completed. Does not contain any packet data.
49  *
50  * EQ		Event Queue: a ring of event structs from the hardware informing
51  *		us when particular events happen. Many events can point at
52  *		a particular CQ which we should then go look at.
53  * EQE		Event Queue Entry: an entry on the EQ ring
54  *
55  * UAR		User Access Region, a page of the device's PCI BAR which is
56  *		tied to particular EQ/CQ/WQ sets and contains doorbells to
57  *		ring to arm them for interrupts or wake them up for new work
58  *
59  * RQT		RQ Table, a collection of indexed RQs used to refer to the group
60  *		as a single unit (for e.g. hashing/RSS).
61  *
62  * TIR		Transport Interface Receive, a bucket of resources for the
63  *		reception of packets. TIRs have to point at either a single RQ
64  *		or a table of RQs (RQT). They then serve as a target for flow
65  *		table entries (FEs). TIRs that point at an RQT also contain the
66  *		settings for hashing for RSS.
67  *
68  * TIS		Transport Interface Send, a bucket of resources associated with
69  *		the transmission of packets. In particular, the temporary
70  *		resources used for LSO internally in the card are accounted to
71  *		a TIS.
72  *
73  * FT		Flow Table, a collection of FEs and FGs that can be referred to
74  *		as a single entity (e.g. used as a target from another flow
75  *		entry or set as the "root" table to handle incoming or outgoing
76  *		packets). Packets arriving at a FT are matched against the
77  *		FEs in the table until either one matches with a terminating
78  *		action or all FEs are exhausted (it's first-match-wins but with
79  *		some actions that are non-terminal, like counting actions).
80  *
81  * FG		Flow Group, a group of FEs which share a common "mask" (i.e.
82  *		they match on the same attributes of packets coming into the
83  *		flow).
84  *
85  * FE		Flow Entry, an individual set of values to match against
86  *		packets entering the flow table, combined with an action to
87  *		take upon a successful match. The action we use most is
88  *		"forward", which sends the packets to a TIR or another flow
89  *		table and then stops further processing within the FE's FT.
90  *
91  * lkey/mkey	A reference to something similar to a page table but in the
92  *		device's internal onboard MMU. Since Connect-X parts double as
93  *		IB cards (lots of RDMA) they have extensive onboard memory mgmt
94  *		features which we try very hard not to use. For our WQEs we use
95  *		the "reserved" lkey, which is a special value which indicates
96  *		that addresses we give are linear addresses and should not be
97  *		translated.
98  *
99  * PD		Protection Domain, an IB concept. We have to allocate one to
100  *		provide as a parameter for new WQs, but we don't do anything
101  *		with it.
102  *
103  * TDOM/TD	Transport Domain, an IB concept. We allocate one in order to
104  *		provide it as a parameter to TIR/TIS creation, but we don't do
105  *		anything with it.
106  */
107 /*
108  *
109  * Data flow overview
110  * ------------------
111  *
112  * This driver is a MAC ring-enabled driver which maps rings to send and recv
113  * queues in hardware on the device.
114  *
115  * Each SQ and RQ is set up to report to its own individual CQ, to ensure
116  * sufficient space, and simplify the logic needed to work out which buffer
117  * was completed.
118  *
119  * The CQs are then round-robin allocated onto EQs, of which we set up one per
120  * interrupt that the system gives us for the device. Normally this means we
121  * have 8 EQs.
122  *
123  * When we have >= 8 EQs available, we try to allocate only RX or only TX
124  * CQs on each one. The EQs are chosen for RX and TX in an alternating fashion.
125  *
126  * EQ #0 is reserved for all event types other than completion events, and has
127  * no CQs associated with it at any time. EQs #1 and upwards are only used for
128  * handling CQ completion events.
129  *
130  * +------+     +------+           +------+        +---------+
131  * | SQ 0 |---->| CQ 0 |-----+     | EQ 0 |------> | MSI-X 0 |     mlxcx_intr_0
132  * +------+     +------+     |     +------+        +---------+
133  *                           |
134  * +------+     +------+     |
135  * | SQ 1 |---->| CQ 1 |---+ |     +------+
136  * +------+     +------+   | +---> |      |
137  *                         |       |      |
138  * +------+     +------+   |       | EQ 1 |        +---------+
139  * | SQ 2 |---->| CQ 2 |---------> |      |------> | MSI-X 1 |     mlxcx_intr_n
140  * +------+     +------+   | +---> |      |        +---------+
141  *                         | |     +------+
142  *                         | |
143  *   ...                   | |
144  *                         | |     +------+
145  * +------+     +------+   +-----> |      |
146  * | RQ 0 |---->| CQ 3 |---------> |      |        +---------+
147  * +------+     +------+     |     | EQ 2 |------> | MSI-X 2 |     mlxcx_intr_n
148  *                           |     |      |        +---------+
149  * +------+     +------+     | +-> |      |
150  * | RQ 1 |---->| CQ 4 |-----+ |   +------+
151  * +------+     +------+       |
152  *                             |     ....
153  * +------+     +------+       |
154  * | RQ 2 |---->| CQ 5 |-------+
155  * +------+     +------+
156  *
157  *   ... (note this diagram does not show RX-only or TX-only EQs)
158  *
159  * For TX, we advertise all of the SQs we create as plain rings to MAC with
160  * no TX groups. This puts MAC in "virtual group" mode where it will allocate
161  * and use the rings as it sees fit.
162  *
163  * For RX, we advertise actual groups in order to make use of hardware
164  * classification.
165  *
166  * The hardware classification we use is based around Flow Tables, and we
167  * currently ignore all of the eswitch features of the card. The NIC VPORT
168  * is always set to promisc mode so that the eswitch sends us all of the
169  * traffic that arrives on the NIC, and we use flow entries to manage
170  * everything.
171  *
172  * We use 2 layers of flow tables for classification: traffic arrives at the
173  * root RX flow table which contains MAC address filters. Those then send
174  * matched traffic to the per-group L1 VLAN filter tables which contain VLAN
175  * presence and VID filters.
176  *
177  * Since these parts only support doing RSS hashing on a single protocol at a
178  * time, we have to use a third layer of flow tables as well to break traffic
179  * down by L4 and L3 protocol (TCPv6, TCPv4, UDPv6, UDPv4, IPv6, IPv4 etc)
180  * so that it can be sent to the appropriate TIR for hashing.
181  *
182  * Incoming packets
183  *        +           +---------+      +---------+
184  *        |        +->| group 0 |      | group 0 |
185  *        |        |  | vlan ft |  +-->| hash ft |
186  *        v        |  |   L1    |  |   |   L2    |
187  *   +----+----+   |  +---------+  |   +---------+    +-----+    +-----+------+
188  *   | eswitch |   |  |         |  |   |  TCPv6  |--->| TIR |--->|     |  RQ0 |
189  *   +----+----+   |  |         |  |   +---------+    +-----+    |     +------+
190  *        |        |  |         |  |   |  UDPv6  |--->| TIR |--->|     |  RQ1 |
191  *        |        |  |         |  |   +---------+    +-----+    |     +------+
192  *        |        |  |         |  |   |  TCPv4  |--->| TIR |--->|     |  RQ2 |
193  *        v        |  |         |  |   +---------+    +-----+    | RQT +------+
194  *   +----+----+   |  +---------+  |   |  UDPv4  |--->| TIR |--->|     |  ... |
195  *   | root rx |   |  | default |--+   +---------+    +-----+    |     |      |
196  *   | flow tb |   |  +---------+  |   |  IPv6   |--->| TIR |--->|     |      |
197  *   |    L0   |   |  | promisc |--+   +---------+    +-----+    |     |      |
198  *   +---------+   |  +---------+  ^   |  IPv4   |--->| TIR |--->|     |      |
199  *   |  bcast  |---|---------------+   +---------+    +-----+    +-----+------+
200  *   +---------+   |               ^   |  other  |-+
201  *   |  MAC 0  |---+               |   +---------+ |  +-----+    +-----+
202  *   +---------+                   |               +->| TIR |--->| RQ0 |
203  *   |  MAC 1  |-+                 |                  +-----+    +-----+
204  *   +---------+ | +---------------+
205  *   |  MAC 2  |-+ |               ^
206  *   +---------+ | |               |
207  *   |  MAC 3  |-+ |  +---------+  |   +---------+
208  *   +---------+ | |  | group 1 |  |   | group 1 |
209  *   |  .....  | +--->| vlan ft |  | +>| hash ft |
210  *   |         |   |  |   L1    |  | | |   L2    |
211  *   +---------+   |  +---------+  | | +---------+    +-----+    +-----+------+
212  *   | promisc |---+  | VLAN 0  |----+ |  TCPv6  |--->| TIR |--->|     |  RQ3 |
213  *   +---------+      +---------+  |   +---------+    +-----+    |     +------+
214  *                    |  .....  |  |   |  UDPv6  |--->| TIR |--->|     |  RQ4 |
215  *                    |         |  |   +---------+    +-----+    |     +------+
216  *                    |         |  |   |  TCPv4  |--->| TIR |--->|     |  RQ5 |
217  *                    |         |  |   +---------+    +-----+    | RQT +------+
218  *                    +---------+  |   |  UDPv4  |--->| TIR |--->|     |  ... |
219  *                    |         |  |   +---------+    +-----+    |     |      |
220  *                    +---------+  |   |  IPv6   |--->| TIR |--->|     |      |
221  *                    | promisc |--+   +---------+    +-----+    |     |      |
222  *                    +---------+      |  IPv4   |--->| TIR |--->|     |      |
223  *                                     +---------+    +-----+    +-----+------+
224  *                                     |  other  |-+
225  *                                     +---------+ |
226  *                      .......                    |  +-----+    +-----+
227  *                                                 +->| TIR |--->| RQ3 |
228  *                                                    +-----+    +-----+
229  *
230  * Note that the "promisc" flow entries are only set/enabled when promisc
231  * mode is enabled for the NIC. All promisc flow entries point directly at
232  * group 0's hashing flowtable (so all promisc-only traffic lands on group 0,
233  * the "default group" in MAC).
234  *
235  * The "default" entry in the L1 VLAN filter flow tables is used when there
236  * are no VLANs set for the group, to accept any traffic regardless of tag. It
237  * is deleted as soon as a VLAN filter is added (and re-instated if the
238  * last VLAN filter is removed).
239  *
240  * The actual descriptor ring structures for RX on Connect-X4 don't contain any
241  * space for packet data (they're a collection of scatter pointers only). TX
242  * descriptors contain some space for "inline headers" (and the card requires
243  * us to put at least the L2 Ethernet headers there for the eswitch to look at)
244  * but all the rest of the data comes from the gather pointers.
245  *
246  * When we get completions back they simply contain the ring index number of
247  * the WR (work request) which completed. So, we manage the buffers for actual
248  * packet data completely independently of the descriptors in this driver. When
249  * a WR is enqueued in a WQE (work queue entry), we stamp the packet data buffer
250  * with the WQE index that we put it at, and therefore don't have to look at
251  * the original descriptor at all when handling completions.
252  *
253  * For RX, we create sufficient packet data buffers to fill 150% of the
254  * available descriptors for each ring. These all are pre-set-up for DMA and
255  * have an mblk_t associated with them (with desballoc()).
256  *
257  * For TX we either borrow the mblk's memory and DMA bind it (if the packet is
258  * large enough), or we copy it into a pre-allocated buffer set up in the same
259  * way as for RX.
260  */
261 
262 /*
263  * Buffer lifecycle: RX
264  * --------------------
265  *
266  * The lifecycle of an mlxcx_buffer_t (packet buffer) used for RX is pretty
267  * straightforward.
268  *
269  * It is created (and has all its memory allocated) at the time of starting up
270  * the RX ring it belongs to. Then it is placed on the "free" list in the
271  * mlxcx_buffer_shard_t associated with its RQ. When mlxcx_rq_refill() wants
272  * more buffers to add to the RQ, it takes one off and marks it as "on WQ"
273  * before making a WQE for it.
274  *
275  * After a completion event occurs, the packet is either discarded (and the
276  * buffer_t returned to the free list), or it is readied for loaning to MAC
277  * and placed on the "loaned" list in the mlxcx_buffer_shard_t.
278  *
279  * Once MAC and the rest of the system have finished with the packet, they call
280  * freemsg() on its mblk, which will call mlxcx_buf_mp_return. At this point
281  * the fate of the buffer_t is determined by the state of the
282  * mlxcx_buffer_shard_t. When the shard is in its normal state the buffer_t
283  * will be returned to the free list, potentially to be recycled and used
284  * again. But if the shard is draining (E.g. after a ring stop) there will be
285  * no recycling and the buffer_t is immediately destroyed.
286  *
287  * At detach/teardown time, buffers are only ever destroyed from the free list.
288  *
289  *
290  *                         +
291  *                         |
292  *                         | mlxcx_buf_create
293  *                         |
294  *                         v
295  *                    +----+----+
296  *                    | created |
297  *                    +----+----+                        +------+
298  *                         |                             | dead |
299  *                         |                             +------+
300  *                         | mlxcx_buf_return                ^
301  *                         |                                 |
302  *                         v                                 | mlxcx_buf_destroy
303  * mlxcx_buf_destroy  +----+----+          +-----------+     |
304  *          +---------|  free   |<------no-| draining? |-yes-+
305  *          |         +----+----+          +-----------+
306  *          |              |                     ^
307  *          |              |                     |
308  *          v              | mlxcx_buf_take      | mlxcx_buf_return
309  *      +---+--+           v                     |
310  *      | dead |       +---+---+                 |
311  *      +------+       | on WQ |- - - - - - - - >O
312  *                     +---+---+                 ^
313  *                         |                     |
314  *                         |                     |
315  *                         | mlxcx_buf_loan      | mlxcx_buf_mp_return
316  *                         v                     |
317  *                 +-------+--------+            |
318  *                 | on loan to MAC |----------->O
319  *                 +----------------+  freemsg()
320  *
321  */
322 
323 /*
324  * Buffer lifecycle: TX
325  * --------------------
326  *
327  * mlxcx_buffer_ts used for TX are divided into two kinds: regular buffers, and
328  * "foreign" buffers.
329  *
330  * The former have their memory allocated and DMA bound by this driver, while
331  * the latter (the "foreign" buffers) are on loan from MAC. Their memory is
332  * not owned by us, though we do DMA bind it (and take responsibility for
333  * un-binding it when we're done with them).
334  *
335  * We use separate mlxcx_buf_shard_ts for foreign and local buffers on each
336  * SQ. Thus, there is a separate free list and mutex for each kind.
337  *
338  * Since a TX packet might consist of multiple mblks, we translate each mblk
339  * into exactly one buffer_t. The buffer_ts are chained together in the same
340  * order as the mblks, using the mlb_tx_chain/mlb_tx_chain_entry list_t.
341  *
342  * Each chain of TX buffers may consist of foreign or driver buffers, in any
343  * mixture.
344  *
345  * The head of a TX buffer chain has mlb_tx_head == itself, which distinguishes
346  * it from the rest of the chain buffers.
347  *
348  * TX buffer chains are always returned to the free list by
349  * mlxcx_buf_return_chain(), which takes care of walking the mlb_tx_chain and
350  * freeing all of the members.
351  *
352  * We only call freemsg() once, on the head of the TX buffer chain's original
353  * mblk. This is true whether we copied it or bound it in a foreign buffer.
354  */
355 
356 /*
357  * Startup and command interface
358  * -----------------------------
359  *
360  * The command interface is the primary way in which we give control orders to
361  * the hardware (e.g. actions like "create this queue" or "delete this flow
362  * entry"). The command interface is never used to transmit or receive packets
363  * -- that takes place only on the queues that are set up through it.
364  *
365  * In mlxcx_cmd.c we implement our use of the command interface on top of a
366  * simple taskq. As commands are submitted from the taskq they choose a
367  * "slot", if there are no free slots then execution of the command will
368  * be paused until one is free. The hardware permits up to 32 independent
369  * slots for concurrent command execution.
370  *
371  * Before interrupts are enabled, command completion is polled, once
372  * interrupts are up command completions become asynchronous and are
373  * wired to EQ 0. A caveat to this is commands can not be submitted
374  * directly from EQ 0's completion handler, and any processing resulting from
375  * an asynchronous event which requires further use of the command interface
376  * is posted through a taskq.
377  *
378  * The startup/attach process for this card involves a bunch of different steps
379  * which are summarised pretty well in the PRM. We have to send a number of
380  * commands which do different things to start the card up, give it some pages
381  * of our own memory for it to use, then start creating all the entities that
382  * we need to use like EQs, CQs, WQs, as well as their dependencies like PDs
383  * and TDoms.
384  */
385 
386 /*
387  * UARs
388  * ----
389  *
390  * The pages of the PCI BAR other than the first few are reserved for use as
391  * "UAR" sections in this device. Each UAR section can be used as a set of
392  * doorbells for our queues.
393  *
394  * Currently we just make one single UAR for all of our queues. It doesn't
395  * seem to be a major limitation yet.
396  *
397  * When we're sending packets through an SQ, the PRM is not awfully clear about
398  * exactly how we're meant to use the first 16 bytes of the Blueflame buffers
399  * (it's clear on the pattern of alternation you're expected to use between
400  * even and odd for Blueflame sends, but not for regular doorbells).
401  *
402  * Currently we don't do the even-odd alternating pattern for ordinary
403  * doorbells, and we don't use Blueflame at all. This seems to work fine, at
404  * least on Connect-X4 Lx.
405  */
406 
407 /*
408  * Lock ordering
409  * -------------
410  *
411  * Interrupt side:
412  *
413  *  - mleq_mtx
414  *    - mlcq_arm_mtx
415  *      - mlcq_mtx
416  *        - mlcq_bufbmtx
417  *        - mlwq_mtx
418  *          - mlbs_mtx
419  *    - mlp_mtx
420  *
421  * GLD side:
422  *
423  *  - mlp_mtx
424  *    - mlg_mtx
425  *      - mlg_*.mlft_mtx
426  *    - mlp_*.mlft_mtx
427  *    - mlwq_mtx
428  *      - mlbs_mtx
429  *      - mlcq_bufbmtx
430  *  - mleq_mtx
431  *    - mlcq_arm_mtx
432  *      - mlcq_mtx
433  *
434  */
435 
436 #include <sys/modctl.h>
437 #include <sys/conf.h>
438 #include <sys/devops.h>
439 #include <sys/sysmacros.h>
440 #include <sys/time.h>
441 
442 #include <sys/mac_provider.h>
443 
444 #include <mlxcx.h>
445 
446 CTASSERT((1 << MLXCX_RX_HASH_FT_SIZE_SHIFT) >= MLXCX_TIRS_PER_GROUP);
447 
448 #define	MLXCX_MODULE_NAME	"mlxcx"
449 /*
450  * We give this to the firmware, so it has to be in a fixed format that it
451  * understands.
452  */
453 #define	MLXCX_DRIVER_VERSION	"illumos,mlxcx,1.0.0,1,000,000000"
454 
455 /*
456  * Firmware may take a while to reclaim pages. Try a set number of times.
457  */
458 clock_t mlxcx_reclaim_delay = 1000 * 50; /* 50 ms in us */
459 uint_t mlxcx_reclaim_tries = 100; /* Wait at most 5000ms */
460 
461 static void *mlxcx_softstate;
462 
463 /*
464  * Fault detection thresholds.
465  */
466 uint_t mlxcx_doorbell_tries = MLXCX_DOORBELL_TRIES_DFLT;
467 uint_t mlxcx_stuck_intr_count = MLXCX_STUCK_INTR_COUNT_DFLT;
468 
469 static void
470 mlxcx_load_prop_defaults(mlxcx_t *mlxp)
471 {
472 	mlxcx_drv_props_t *p = &mlxp->mlx_props;
473 	mlxcx_port_t *port = &mlxp->mlx_ports[0];
474 
475 	VERIFY((mlxp->mlx_attach & MLXCX_ATTACH_PORTS) != 0);
476 	VERIFY((mlxp->mlx_attach & (MLXCX_ATTACH_CQS | MLXCX_ATTACH_WQS)) == 0);
477 
478 	/*
479 	 * Currently we have different queue size defaults for two
480 	 * categories of queues. One set for devices which support a
481 	 * maximum speed of 10Gb/s, and another for those above that.
482 	 */
483 	if ((port->mlp_max_proto & (MLXCX_PROTO_25G | MLXCX_PROTO_40G |
484 	    MLXCX_PROTO_50G | MLXCX_PROTO_100G)) != 0) {
485 		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_25G;
486 		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_25G;
487 		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_25G;
488 	} else if ((port->mlp_max_proto & (MLXCX_PROTO_100M | MLXCX_PROTO_1G |
489 	    MLXCX_PROTO_10G)) != 0) {
490 		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
491 		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
492 		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
493 	} else {
494 		mlxcx_warn(mlxp, "Encountered a port with a speed we don't "
495 		    "recognize. Proto: 0x%x", port->mlp_max_proto);
496 		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
497 		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
498 		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
499 	}
500 }
501 
502 /*
503  * Properties which may have different defaults based on hardware
504  * characteristics.
505  */
506 static void
507 mlxcx_load_model_props(mlxcx_t *mlxp)
508 {
509 	mlxcx_drv_props_t *p = &mlxp->mlx_props;
510 
511 	mlxcx_load_prop_defaults(mlxp);
512 
513 	p->mldp_cq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
514 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cq_size_shift",
515 	    p->mldp_cq_size_shift_default);
516 	p->mldp_sq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
517 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "sq_size_shift",
518 	    p->mldp_sq_size_shift_default);
519 	p->mldp_rq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
520 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rq_size_shift",
521 	    p->mldp_rq_size_shift_default);
522 }
523 
524 static void
525 mlxcx_load_props(mlxcx_t *mlxp)
526 {
527 	mlxcx_drv_props_t *p = &mlxp->mlx_props;
528 
529 	p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
530 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift",
531 	    MLXCX_EQ_SIZE_SHIFT_DFLT);
532 	p->mldp_cqemod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
533 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_period_usec",
534 	    MLXCX_CQEMOD_PERIOD_USEC_DFLT);
535 	p->mldp_cqemod_count = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
536 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_count",
537 	    MLXCX_CQEMOD_COUNT_DFLT);
538 	p->mldp_intrmod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
539 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "intrmod_period_usec",
540 	    MLXCX_INTRMOD_PERIOD_USEC_DFLT);
541 
542 	p->mldp_tx_ngroups = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
543 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_ngroups",
544 	    MLXCX_TX_NGROUPS_DFLT);
545 	p->mldp_tx_nrings_per_group = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
546 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_nrings_per_group",
547 	    MLXCX_TX_NRINGS_PER_GROUP_DFLT);
548 
549 	p->mldp_rx_ngroups_large = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
550 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_large",
551 	    MLXCX_RX_NGROUPS_LARGE_DFLT);
552 	p->mldp_rx_ngroups_small = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
553 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_small",
554 	    MLXCX_RX_NGROUPS_SMALL_DFLT);
555 	p->mldp_rx_nrings_per_large_group = ddi_getprop(DDI_DEV_T_ANY,
556 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
557 	    "rx_nrings_per_large_group", MLXCX_RX_NRINGS_PER_LARGE_GROUP_DFLT);
558 	p->mldp_rx_nrings_per_small_group = ddi_getprop(DDI_DEV_T_ANY,
559 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
560 	    "rx_nrings_per_small_group", MLXCX_RX_NRINGS_PER_SMALL_GROUP_DFLT);
561 
562 	p->mldp_ftbl_root_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
563 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_root_size_shift",
564 	    MLXCX_FTBL_ROOT_SIZE_SHIFT_DFLT);
565 
566 	p->mldp_tx_bind_threshold = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
567 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_bind_threshold",
568 	    MLXCX_TX_BIND_THRESHOLD_DFLT);
569 
570 	p->mldp_ftbl_vlan_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
571 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_vlan_size_shift",
572 	    MLXCX_FTBL_VLAN_SIZE_SHIFT_DFLT);
573 
574 	p->mldp_eq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
575 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
576 	    "eq_check_interval_sec", MLXCX_EQ_CHECK_INTERVAL_SEC_DFLT);
577 	p->mldp_cq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
578 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
579 	    "cq_check_interval_sec", MLXCX_CQ_CHECK_INTERVAL_SEC_DFLT);
580 	p->mldp_wq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
581 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
582 	    "wq_check_interval_sec", MLXCX_WQ_CHECK_INTERVAL_SEC_DFLT);
583 
584 	p->mldp_rx_per_cq = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
585 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_limit_per_completion",
586 	    MLXCX_RX_PER_CQ_DEFAULT);
587 
588 	if (p->mldp_rx_per_cq < MLXCX_RX_PER_CQ_MIN ||
589 	    p->mldp_rx_per_cq > MLXCX_RX_PER_CQ_MAX) {
590 		mlxcx_warn(mlxp, "!rx_limit_per_completion = %u is "
591 		    "out of range. Defaulting to: %d. Valid values are from "
592 		    "%d to %d", p->mldp_rx_per_cq, MLXCX_RX_PER_CQ_DEFAULT,
593 		    MLXCX_RX_PER_CQ_MIN, MLXCX_RX_PER_CQ_MAX);
594 		p->mldp_rx_per_cq = MLXCX_RX_PER_CQ_DEFAULT;
595 	}
596 }
597 
598 void
599 mlxcx_note(mlxcx_t *mlxp, const char *fmt, ...)
600 {
601 	va_list ap;
602 
603 	va_start(ap, fmt);
604 	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
605 		vdev_err(mlxp->mlx_dip, CE_NOTE, fmt, ap);
606 	} else {
607 		vcmn_err(CE_NOTE, fmt, ap);
608 	}
609 	va_end(ap);
610 }
611 
612 void
613 mlxcx_warn(mlxcx_t *mlxp, const char *fmt, ...)
614 {
615 	va_list ap;
616 
617 	va_start(ap, fmt);
618 	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
619 		vdev_err(mlxp->mlx_dip, CE_WARN, fmt, ap);
620 	} else {
621 		vcmn_err(CE_WARN, fmt, ap);
622 	}
623 	va_end(ap);
624 }
625 
626 void
627 mlxcx_panic(mlxcx_t *mlxp, const char *fmt, ...)
628 {
629 	va_list ap;
630 
631 	va_start(ap, fmt);
632 	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
633 		vdev_err(mlxp->mlx_dip, CE_PANIC, fmt, ap);
634 	} else {
635 		vcmn_err(CE_PANIC, fmt, ap);
636 	}
637 	va_end(ap);
638 }
639 
640 uint16_t
641 mlxcx_get16(mlxcx_t *mlxp, uintptr_t off)
642 {
643 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
644 	return (ddi_get16(mlxp->mlx_regs_handle, (void *)addr));
645 }
646 
647 uint32_t
648 mlxcx_get32(mlxcx_t *mlxp, uintptr_t off)
649 {
650 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
651 	return (ddi_get32(mlxp->mlx_regs_handle, (void *)addr));
652 }
653 
654 uint64_t
655 mlxcx_get64(mlxcx_t *mlxp, uintptr_t off)
656 {
657 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
658 	return (ddi_get64(mlxp->mlx_regs_handle, (void *)addr));
659 }
660 
661 void
662 mlxcx_put32(mlxcx_t *mlxp, uintptr_t off, uint32_t val)
663 {
664 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
665 	ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val);
666 }
667 
668 void
669 mlxcx_put64(mlxcx_t *mlxp, uintptr_t off, uint64_t val)
670 {
671 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
672 	ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val);
673 }
674 
675 void
676 mlxcx_uar_put32(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint32_t val)
677 {
678 	/*
679 	 * The UAR is always inside the first BAR, which we mapped as
680 	 * mlx_regs
681 	 */
682 	uintptr_t addr = off + (uintptr_t)mlu->mlu_base +
683 	    (uintptr_t)mlxp->mlx_regs_base;
684 	ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val);
685 }
686 
687 void
688 mlxcx_uar_put64(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint64_t val)
689 {
690 	uintptr_t addr = off + (uintptr_t)mlu->mlu_base +
691 	    (uintptr_t)mlxp->mlx_regs_base;
692 	ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val);
693 }
694 
695 static void
696 mlxcx_fm_fini(mlxcx_t *mlxp)
697 {
698 	if (mlxp->mlx_fm_caps == 0)
699 		return;
700 
701 	if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps))
702 		ddi_fm_handler_unregister(mlxp->mlx_dip);
703 
704 	if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) ||
705 	    DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps))
706 		pci_ereport_teardown(mlxp->mlx_dip);
707 
708 	ddi_fm_fini(mlxp->mlx_dip);
709 
710 	mlxp->mlx_fm_caps = 0;
711 }
712 
/*
 * Post an FMA ereport against this device with class
 * "<DDI_FM_DEVICE>.<detail>" (e.g. detail = DDI_FM_DEVICE_STALL).
 * Does nothing if ereports are not enabled in mlx_fm_caps.
 */
void
mlxcx_fm_ereport(mlxcx_t *mlxp, const char *detail)
{
	uint64_t ena;
	char buf[FM_MAX_CLASS];

	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
		return;

	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail);
	ena = fm_ena_generate(0, FM_ENA_FMT1);
	/* DDI_NOSLEEP: this may be called from interrupt context. */
	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
	    NULL);
}
728 
/*
 * FMA error-handler callback, registered via ddi_fm_handler_register()
 * in mlxcx_fm_init() when the ERRCB capability is enabled.
 */
static int
mlxcx_fm_errcb(dev_info_t *dip, ddi_fm_error_t *err, const void *arg)
{
	/*
	 * as the driver can always deal with an error in any dma or
	 * access handle, we can just return the fme_status value.
	 */
	pci_ereport_post(dip, err, NULL);
	return (err->fme_status);
}
739 
/*
 * Negotiate and enable FMA support for this instance.  The effective
 * capability set is the intersection of what we support (`def') and
 * the "fm_capable" property (which defaults to everything in `def').
 * The result lands in mlx_fm_caps; mlxcx_fm_fini() undoes all of this.
 */
static void
mlxcx_fm_init(mlxcx_t *mlxp)
{
	ddi_iblock_cookie_t iblk;
	int def = DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE |
	    DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE;

	mlxp->mlx_fm_caps = ddi_prop_get_int(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_DONTPASS, "fm_capable", def);

	if (mlxp->mlx_fm_caps < 0) {
		mlxp->mlx_fm_caps = 0;
	}
	/* Never claim capabilities beyond what the driver implements. */
	mlxp->mlx_fm_caps &= def;

	if (mlxp->mlx_fm_caps == 0)
		return;

	ddi_fm_init(mlxp->mlx_dip, &mlxp->mlx_fm_caps, &iblk);
	if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) ||
	    DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) {
		pci_ereport_setup(mlxp->mlx_dip);
	}
	if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) {
		ddi_fm_handler_register(mlxp->mlx_dip, mlxcx_fm_errcb,
		    (void *)mlxp);
	}
}
768 
/*
 * Destroy a single buffer shard.  Blocks until every buffer on the
 * busy and loaned lists has been returned to the free list, then
 * destroys all free buffers and the shard's lists, cv and mutex.
 * The shard structure itself is freed by the caller
 * (mlxcx_teardown_bufs()).
 */
static void
mlxcx_mlbs_teardown(mlxcx_t *mlxp, mlxcx_buf_shard_t *s)
{
	mlxcx_buffer_t *buf;

	mutex_enter(&s->mlbs_mtx);

	/* Wait for busy buffers to be released back to the free list. */
	while (!list_is_empty(&s->mlbs_busy))
		cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);

	/* Likewise for buffers currently loaned out. */
	while (!list_is_empty(&s->mlbs_loaned))
		cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);

	while ((buf = list_head(&s->mlbs_free)) != NULL)
		mlxcx_buf_destroy(mlxp, buf);

	list_destroy(&s->mlbs_free);
	list_destroy(&s->mlbs_busy);
	list_destroy(&s->mlbs_loaned);
	mutex_exit(&s->mlbs_mtx);

	cv_destroy(&s->mlbs_free_nonempty);
	mutex_destroy(&s->mlbs_mtx);
}
793 
794 static void
795 mlxcx_teardown_bufs(mlxcx_t *mlxp)
796 {
797 	mlxcx_buf_shard_t *s;
798 
799 	while ((s = list_remove_head(&mlxp->mlx_buf_shards)) != NULL) {
800 		mlxcx_mlbs_teardown(mlxp, s);
801 		kmem_free(s, sizeof (mlxcx_buf_shard_t));
802 	}
803 	list_destroy(&mlxp->mlx_buf_shards);
804 
805 	kmem_cache_destroy(mlxp->mlx_bufs_cache);
806 }
807 
/*
 * Reclaim from the hardware all of the DMA pages we previously gave it
 * (see mlxcx_give_pages()), free them, and destroy the page-tracking
 * AVL tree.  If the hardware refuses to return pages, or stalls for
 * more than mlxcx_reclaim_tries rounds, the remaining pages are
 * deliberately leaked rather than freed while the device might still
 * be using them.
 */
static void
mlxcx_teardown_pages(mlxcx_t *mlxp)
{
	uint_t nzeros = 0;
	uint64_t *pas;

	pas = kmem_alloc(sizeof (*pas) * MLXCX_MANAGE_PAGES_MAX_PAGES,
	    KM_SLEEP);

	mutex_enter(&mlxp->mlx_pagemtx);

	while (mlxp->mlx_npages > 0) {
		int32_t req, ret;

		ASSERT0(avl_is_empty(&mlxp->mlx_pages));
		/* We can only ask for MLXCX_MANAGE_PAGES_MAX_PAGES at once. */
		req = MIN(mlxp->mlx_npages, MLXCX_MANAGE_PAGES_MAX_PAGES);

		if (!mlxcx_cmd_return_pages(mlxp, req, pas, &ret)) {
			mlxcx_warn(mlxp, "hardware refused to return pages, "
			    "leaking %u remaining pages", mlxp->mlx_npages);
			goto out;
		}

		/*
		 * The hardware told us the PAs of the pages it returned;
		 * look each one up in our tree and free it.
		 */
		for (int32_t i = 0; i < ret; i++) {
			mlxcx_dev_page_t *mdp, probe;
			bzero(&probe, sizeof (probe));
			probe.mxdp_pa = pas[i];

			mdp = avl_find(&mlxp->mlx_pages, &probe, NULL);

			if (mdp != NULL) {
				avl_remove(&mlxp->mlx_pages, mdp);
				mlxp->mlx_npages--;
				mlxcx_dma_free(&mdp->mxdp_dma);
				kmem_free(mdp, sizeof (mlxcx_dev_page_t));
			} else {
				mlxcx_panic(mlxp, "hardware returned a page "
				    "with PA 0x%" PRIx64 " but we have no "
				    "record of giving out such a page", pas[i]);
			}
		}

		/*
		 * If no pages were returned, note that fact.
		 */
		if (ret == 0) {
			nzeros++;
			if (nzeros > mlxcx_reclaim_tries) {
				mlxcx_warn(mlxp, "hardware refused to return "
				    "pages, leaking %u remaining pages",
				    mlxp->mlx_npages);
				goto out;
			}
			/* Give the hardware a moment before retrying. */
			delay(drv_usectohz(mlxcx_reclaim_delay));
		}
	}

	avl_destroy(&mlxp->mlx_pages);

out:
	mutex_exit(&mlxp->mlx_pagemtx);
	mutex_destroy(&mlxp->mlx_pagemtx);

	kmem_free(pas, sizeof (*pas) * MLXCX_MANAGE_PAGES_MAX_PAGES);
}
873 
/*
 * Allocate the DMA ring backing an event queue, sized by the
 * mldp_eq_size_shift property, and initialize every entry's owner
 * field.  Sets MLXCX_EQ_ALLOC on success; undone by
 * mlxcx_eq_rele_dma().
 */
static boolean_t
mlxcx_eq_alloc_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	boolean_t ret;
	size_t sz, i;

	VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC);

	mleq->mleq_entshift = mlxp->mlx_props.mldp_eq_size_shift;
	mleq->mleq_nents = (1 << mleq->mleq_entshift);
	sz = mleq->mleq_nents * sizeof (mlxcx_eventq_ent_t);
	/* The ring must be a whole number of hardware pages. */
	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);

	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_queue_attr(mlxp, &attr);

	ret = mlxcx_dma_alloc(mlxp, &mleq->mleq_dma, &attr, &acc,
	    B_TRUE, sz, B_TRUE);
	if (!ret) {
		mlxcx_warn(mlxp, "failed to allocate EQ memory");
		return (B_FALSE);
	}

	mleq->mleq_ent = (mlxcx_eventq_ent_t *)mleq->mleq_dma.mxdb_va;

	/*
	 * Pre-set every entry's owner bit so we can later tell which
	 * entries the hardware has written.
	 */
	for (i = 0; i < mleq->mleq_nents; ++i)
		mleq->mleq_ent[i].mleqe_owner = MLXCX_EQ_OWNER_INIT;

	mleq->mleq_state |= MLXCX_EQ_ALLOC;

	return (B_TRUE);
}
908 
/*
 * Release the DMA memory backing an event queue.  The EQ must have
 * been allocated, and if it was ever created in the HCA it must have
 * been destroyed there first.  Clears MLXCX_EQ_ALLOC.
 */
static void
mlxcx_eq_rele_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	VERIFY(mleq->mleq_state & MLXCX_EQ_ALLOC);
	if (mleq->mleq_state & MLXCX_EQ_CREATED)
		VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED);

	mlxcx_dma_free(&mleq->mleq_dma);
	mleq->mleq_ent = NULL;

	mleq->mleq_state &= ~MLXCX_EQ_ALLOC;
}
921 
/*
 * Tear down a flow table: delete all created entries (in reverse
 * order), destroy all flow groups, destroy the table itself in the
 * HCA, then free its memory.
 *
 * Must be called with ft->mlft_mtx held.  The mutex is exited and
 * destroyed and `ft' freed before returning, so the caller must not
 * touch `ft' afterwards.
 */
void
mlxcx_teardown_flow_table(mlxcx_t *mlxp, mlxcx_flow_table_t *ft)
{
	mlxcx_flow_group_t *fg;
	mlxcx_flow_entry_t *fe;
	int i;

	ASSERT(mutex_owned(&ft->mlft_mtx));

	for (i = ft->mlft_nents - 1; i >= 0; --i) {
		fe = &ft->mlft_ent[i];
		if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
			if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
				mlxcx_panic(mlxp, "failed to delete flow "
				    "entry %u on table %u", i,
				    ft->mlft_num);
			}
		}
	}

	while ((fg = list_remove_head(&ft->mlft_groups)) != NULL) {
		if (fg->mlfg_state & MLXCX_FLOW_GROUP_CREATED &&
		    !(fg->mlfg_state & MLXCX_FLOW_GROUP_DESTROYED)) {
			if (!mlxcx_cmd_destroy_flow_group(mlxp, fg)) {
				mlxcx_panic(mlxp, "failed to destroy flow "
				    "group %u", fg->mlfg_num);
			}
		}
		kmem_free(fg, sizeof (mlxcx_flow_group_t));
	}
	list_destroy(&ft->mlft_groups);
	if (ft->mlft_state & MLXCX_FLOW_TABLE_CREATED &&
	    !(ft->mlft_state & MLXCX_FLOW_TABLE_DESTROYED)) {
		if (!mlxcx_cmd_destroy_flow_table(mlxp, ft)) {
			mlxcx_panic(mlxp, "failed to destroy flow table %u",
			    ft->mlft_num);
		}
	}
	kmem_free(ft->mlft_ent, ft->mlft_entsize);
	ft->mlft_ent = NULL;
	mutex_exit(&ft->mlft_mtx);
	mutex_destroy(&ft->mlft_mtx);
	kmem_free(ft, sizeof (mlxcx_flow_table_t));
}
966 
/*
 * Tear down every initialized port: destroy its root RX flow table
 * (if any), its mutexes and its async-event state, then free the port
 * array itself.
 */
static void
mlxcx_teardown_ports(mlxcx_t *mlxp)
{
	uint_t i;
	mlxcx_port_t *p;
	mlxcx_flow_table_t *ft;

	for (i = 0; i < mlxp->mlx_nports; ++i) {
		p = &mlxp->mlx_ports[i];
		if (!(p->mlp_init & MLXCX_PORT_INIT))
			continue;
		mutex_enter(&p->mlp_mtx);
		if ((ft = p->mlp_rx_flow) != NULL) {
			mutex_enter(&ft->mlft_mtx);
			/*
			 * teardown_flow_table() will destroy the mutex, so
			 * we don't release it here.
			 */
			mlxcx_teardown_flow_table(mlxp, ft);
		}
		mutex_exit(&p->mlp_mtx);
		mutex_destroy(&p->mlp_mtx);
		mutex_destroy(&p->mlx_port_event.mla_mtx);
		p->mlx_port_event.mla_mlx = NULL;
		p->mlx_port_event.mla_port = NULL;
		p->mlp_init &= ~MLXCX_PORT_INIT;
	}

	kmem_free(mlxp->mlx_ports, mlxp->mlx_ports_size);
	mlxp->mlx_ports = NULL;
}
998 
999 static void
1000 mlxcx_teardown_wqs(mlxcx_t *mlxp)
1001 {
1002 	mlxcx_work_queue_t *mlwq;
1003 
1004 	while ((mlwq = list_head(&mlxp->mlx_wqs)) != NULL) {
1005 		mlxcx_wq_teardown(mlxp, mlwq);
1006 	}
1007 	list_destroy(&mlxp->mlx_wqs);
1008 }
1009 
1010 static void
1011 mlxcx_teardown_cqs(mlxcx_t *mlxp)
1012 {
1013 	mlxcx_completion_queue_t *mlcq;
1014 
1015 	while ((mlcq = list_head(&mlxp->mlx_cqs)) != NULL) {
1016 		mlxcx_cq_teardown(mlxp, mlcq);
1017 	}
1018 	list_destroy(&mlxp->mlx_cqs);
1019 }
1020 
/*
 * Destroy every event queue in the HCA (if created and not yet
 * destroyed) and release its DMA ring.  There is one EQ per interrupt
 * vector; interrupts are disabled earlier in mlxcx_teardown().
 */
static void
mlxcx_teardown_eqs(mlxcx_t *mlxp)
{
	mlxcx_event_queue_t *mleq;
	uint_t i;

	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
		mleq = &mlxp->mlx_eqs[i];
		mutex_enter(&mleq->mleq_mtx);
		if ((mleq->mleq_state & MLXCX_EQ_CREATED) &&
		    !(mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
			if (!mlxcx_cmd_destroy_eq(mlxp, mleq)) {
				mlxcx_warn(mlxp, "failed to destroy "
				    "event queue idx %u eqn %u",
				    i, mleq->mleq_num);
			}
		}
		if (mleq->mleq_state & MLXCX_EQ_ALLOC) {
			mlxcx_eq_rele_dma(mlxp, mleq);
		}
		mutex_exit(&mleq->mleq_mtx);
	}
}
1044 
/*
 * Delete the periodic EQ/CQ/WQ health-check timers.  Each timer only
 * exists if its interval property was non-zero when
 * mlxcx_setup_checktimers() ran, so the same conditions are applied
 * here.
 */
static void
mlxcx_teardown_checktimers(mlxcx_t *mlxp)
{
	if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0)
		ddi_periodic_delete(mlxp->mlx_eq_checktimer);
	if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0)
		ddi_periodic_delete(mlxp->mlx_cq_checktimer);
	if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0)
		ddi_periodic_delete(mlxp->mlx_wq_checktimer);
}
1055 
/*
 * Unwind everything attach set up, in reverse order, driven by the
 * MLXCX_ATTACH_* flags in mlx_attach.  Each stage is only torn down
 * when its flag is set, so this is safe to call both from a partially
 * failed attach and from a full detach.  At the end all flags must be
 * clear and the soft state is freed.
 */
static void
mlxcx_teardown(mlxcx_t *mlxp)
{
	uint_t i;
	dev_info_t *dip = mlxp->mlx_dip;

	if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) {
		/*
		 * Disable interrupts and let any active vectors quiesce.
		 */
		mlxcx_intr_disable(mlxp);
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_CHKTIMERS) {
		mlxcx_teardown_checktimers(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_CHKTIMERS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_GROUPS) {
		mlxcx_teardown_groups(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_GROUPS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_WQS) {
		mlxcx_teardown_wqs(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_WQS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_CQS) {
		mlxcx_teardown_cqs(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_CQS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_BUFS) {
		mlxcx_teardown_bufs(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_BUFS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_PORTS) {
		mlxcx_teardown_ports(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_PORTS;
	}

	/*
	 * The EQs are destroyed here (after the queues that feed them)
	 * and the interrupt handlers are released.  Note the vectors
	 * themselves were already disabled at the top of this function.
	 */
	if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) {
		mlxcx_teardown_eqs(mlxp);
		mlxcx_intr_teardown(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_INTRS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_UAR_PD_TD) {
		if (mlxp->mlx_uar.mlu_allocated) {
			if (!mlxcx_cmd_dealloc_uar(mlxp, &mlxp->mlx_uar)) {
				mlxcx_warn(mlxp, "failed to release UAR");
			}
			for (i = 0; i < MLXCX_BF_PER_UAR; ++i)
				mutex_destroy(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx);
		}
		if (mlxp->mlx_pd.mlpd_allocated &&
		    !mlxcx_cmd_dealloc_pd(mlxp, &mlxp->mlx_pd)) {
			mlxcx_warn(mlxp, "failed to release PD");
		}
		if (mlxp->mlx_tdom.mltd_allocated &&
		    !mlxcx_cmd_dealloc_tdom(mlxp, &mlxp->mlx_tdom)) {
			mlxcx_warn(mlxp, "failed to release TDOM");
		}
		mlxp->mlx_attach &= ~MLXCX_ATTACH_UAR_PD_TD;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_INIT_HCA) {
		if (!mlxcx_cmd_teardown_hca(mlxp)) {
			mlxcx_warn(mlxp, "failed to send teardown HCA "
			    "command during device detach");
		}
		mlxp->mlx_attach &= ~MLXCX_ATTACH_INIT_HCA;
	}

	/* Pages can only be reclaimed after the HCA teardown command. */
	if (mlxp->mlx_attach & MLXCX_ATTACH_PAGE_LIST) {
		mlxcx_teardown_pages(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_PAGE_LIST;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_ASYNC_TQ) {
		for (i = 0; i <= MLXCX_FUNC_ID_MAX; i++) {
			mlxp->mlx_npages_req[i].mla_mlx = NULL;
			mutex_destroy(&mlxp->mlx_npages_req[i].mla_mtx);
		}
		taskq_destroy(mlxp->mlx_async_tq);
		mlxp->mlx_async_tq = NULL;
		mlxp->mlx_attach &= ~MLXCX_ATTACH_ASYNC_TQ;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_ENABLE_HCA) {
		if (!mlxcx_cmd_disable_hca(mlxp)) {
			mlxcx_warn(mlxp, "failed to send DISABLE HCA command "
			    "during device detach");
		}
		mlxp->mlx_attach &= ~MLXCX_ATTACH_ENABLE_HCA;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_CMD) {
		mlxcx_cmd_queue_fini(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_CMD;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_CAPS) {
		kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
		mlxp->mlx_caps = NULL;
		mlxp->mlx_attach &= ~MLXCX_ATTACH_CAPS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_REGS) {
		ddi_regs_map_free(&mlxp->mlx_regs_handle);
		mlxp->mlx_regs_handle = NULL;
		mlxp->mlx_attach &= ~MLXCX_ATTACH_REGS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_PCI_CONFIG) {
		pci_config_teardown(&mlxp->mlx_cfg_handle);
		mlxp->mlx_cfg_handle = NULL;
		mlxp->mlx_attach &= ~MLXCX_ATTACH_PCI_CONFIG;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_FM) {
		mlxcx_fm_fini(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_FM;
	}

	/* Every stage must have been unwound by this point. */
	VERIFY3S(mlxp->mlx_attach, ==, 0);
	ddi_soft_state_free(mlxcx_softstate, mlxp->mlx_inst);
	ddi_set_driver_private(dip, NULL);
}
1187 
/*
 * Map the device's main register BAR (register set MLXCX_REG_NUMBER)
 * into mlx_regs_base/mlx_regs_handle.  Returns B_FALSE (after a
 * warning) on failure.  Must run after mlxcx_fm_init() so that the
 * access-error capability can be honoured in the access attributes.
 */
static boolean_t
mlxcx_regs_map(mlxcx_t *mlxp)
{
	off_t memsize;
	int ret;
	ddi_device_acc_attr_t da;

	if (ddi_dev_regsize(mlxp->mlx_dip, MLXCX_REG_NUMBER, &memsize) !=
	    DDI_SUCCESS) {
		mlxcx_warn(mlxp, "failed to get register set size");
		return (B_FALSE);
	}

	/*
	 * All data in the main BAR is kept in big-endian even though it's a PCI
	 * device.
	 */
	bzero(&da, sizeof (ddi_device_acc_attr_t));
	da.devacc_attr_version = DDI_DEVICE_ATTR_V0;
	da.devacc_attr_endian_flags = DDI_STRUCTURE_BE_ACC;
	da.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
	if (DDI_FM_ACC_ERR_CAP(mlxp->mlx_fm_caps)) {
		da.devacc_attr_access = DDI_FLAGERR_ACC;
	} else {
		da.devacc_attr_access = DDI_DEFAULT_ACC;
	}

	ret = ddi_regs_map_setup(mlxp->mlx_dip, MLXCX_REG_NUMBER,
	    &mlxp->mlx_regs_base, 0, memsize, &da, &mlxp->mlx_regs_handle);

	if (ret != DDI_SUCCESS) {
		mlxcx_warn(mlxp, "failed to map device registers: %d", ret);
		return (B_FALSE);
	}

	return (B_TRUE);
}
1225 
1226 static boolean_t
1227 mlxcx_check_issi(mlxcx_t *mlxp)
1228 {
1229 	uint32_t issi;
1230 
1231 	if (!mlxcx_cmd_query_issi(mlxp, &issi)) {
1232 		mlxcx_warn(mlxp, "failed to get ISSI");
1233 		return (B_FALSE);
1234 	}
1235 
1236 	if ((issi & (1 << MLXCX_CURRENT_ISSI)) == 0) {
1237 		mlxcx_warn(mlxp, "hardware does not support software ISSI, "
1238 		    "hw vector 0x%x, sw version %u", issi, MLXCX_CURRENT_ISSI);
1239 		return (B_FALSE);
1240 	}
1241 
1242 	if (!mlxcx_cmd_set_issi(mlxp, MLXCX_CURRENT_ISSI)) {
1243 		mlxcx_warn(mlxp, "failed to set ISSI to %u",
1244 		    MLXCX_CURRENT_ISSI);
1245 		return (B_FALSE);
1246 	}
1247 
1248 	return (B_TRUE);
1249 }
1250 
/*
 * Allocate up to `npages' (capped at MLXCX_MANAGE_PAGES_MAX_PAGES)
 * hardware pages of DMA memory and give them to the HCA with the
 * MANAGE_PAGES command.  On success the pages are recorded in the
 * mlx_pages AVL tree (keyed by PA, see mlxcx_page_compare()) and
 * *ngiven is set to the number transferred.  On failure, everything
 * allocated here is freed and B_FALSE is returned.
 */
boolean_t
mlxcx_give_pages(mlxcx_t *mlxp, int32_t npages, int32_t *ngiven)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	int32_t i;
	list_t plist;
	mlxcx_dev_page_t *mdp;
	mlxcx_dev_page_t **pages;
	const ddi_dma_cookie_t *ck;

	/*
	 * If there are no pages required, then we're done here.
	 */
	if (npages <= 0) {
		*ngiven = 0;
		return (B_TRUE);
	}

	npages = MIN(npages, MLXCX_MANAGE_PAGES_MAX_PAGES);

	pages = kmem_alloc(sizeof (*pages) * npages, KM_SLEEP);

	list_create(&plist, sizeof (mlxcx_dev_page_t),
	    offsetof(mlxcx_dev_page_t, mxdp_list));

	for (i = 0; i < npages; i++) {
		mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
		mlxcx_dma_acc_attr(mlxp, &acc);
		mlxcx_dma_page_attr(mlxp, &attr);
		if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
		    B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
			mlxcx_warn(mlxp, "failed to allocate 4k page %u/%u", i,
			    npages);
			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
			goto cleanup_npages;
		}
		/* Record the physical address the hardware will use. */
		ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
		mdp->mxdp_pa = ck->dmac_laddress;

		list_insert_tail(&plist, mdp);
	}

	/*
	 * Now that all of the pages have been allocated, given them to hardware
	 * in chunks.
	 */
	for (i = 0; i < npages; i++) {
		pages[i] = list_remove_head(&plist);
	}

	if (!mlxcx_cmd_give_pages(mlxp,
	    MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, npages, pages)) {
		mlxcx_warn(mlxp, "!hardware refused our gift of %u "
		    "pages!", npages);
		/* Put them back on plist so the cleanup path frees them. */
		for (i = 0; i < npages; i++) {
			list_insert_tail(&plist, pages[i]);
		}
		goto cleanup_npages;
	}

	mutex_enter(&mlxp->mlx_pagemtx);
	for (i = 0; i < npages; i++) {
		avl_add(&mlxp->mlx_pages, pages[i]);
	}
	mlxp->mlx_npages += npages;
	mutex_exit(&mlxp->mlx_pagemtx);

	list_destroy(&plist);
	kmem_free(pages, sizeof (*pages) * npages);

	*ngiven = npages;

	return (B_TRUE);

cleanup_npages:
	kmem_free(pages, sizeof (*pages) * npages);
	while ((mdp = list_remove_head(&plist)) != NULL) {
		mlxcx_dma_free(&mdp->mxdp_dma);
		kmem_free(mdp, sizeof (mlxcx_dev_page_t));
	}
	list_destroy(&plist);
	return (B_FALSE);
}
1335 
1336 static boolean_t
1337 mlxcx_init_pages(mlxcx_t *mlxp, uint_t type)
1338 {
1339 	int32_t npages, given;
1340 
1341 	if (!mlxcx_cmd_query_pages(mlxp, type, &npages)) {
1342 		mlxcx_warn(mlxp, "failed to determine boot pages");
1343 		return (B_FALSE);
1344 	}
1345 
1346 	while (npages > 0) {
1347 		if (!mlxcx_give_pages(mlxp, npages, &given))
1348 			return (B_FALSE);
1349 
1350 		npages -= given;
1351 	}
1352 
1353 	return (B_TRUE);
1354 }
1355 
1356 static int
1357 mlxcx_bufs_cache_constr(void *arg, void *cookie, int kmflags)
1358 {
1359 	mlxcx_t *mlxp = cookie;
1360 	mlxcx_buffer_t *b = arg;
1361 
1362 	bzero(b, sizeof (mlxcx_buffer_t));
1363 	b->mlb_mlx = mlxp;
1364 	b->mlb_state = MLXCX_BUFFER_INIT;
1365 	list_create(&b->mlb_tx_chain, sizeof (mlxcx_buffer_t),
1366 	    offsetof(mlxcx_buffer_t, mlb_tx_chain_entry));
1367 
1368 	return (0);
1369 }
1370 
1371 static void
1372 mlxcx_bufs_cache_destr(void *arg, void *cookie)
1373 {
1374 	mlxcx_t *mlxp = cookie;
1375 	mlxcx_buffer_t *b = arg;
1376 	VERIFY3P(b->mlb_mlx, ==, mlxp);
1377 	VERIFY(b->mlb_state == MLXCX_BUFFER_INIT);
1378 	list_destroy(&b->mlb_tx_chain);
1379 }
1380 
/*
 * Allocate and initialize a new buffer shard (free/busy/loaned lists,
 * cv and mutex) and link it onto the instance's shard list, so that
 * mlxcx_teardown_bufs() will find and destroy it at detach time.
 */
mlxcx_buf_shard_t *
mlxcx_mlbs_create(mlxcx_t *mlxp)
{
	mlxcx_buf_shard_t *s;

	s = kmem_zalloc(sizeof (mlxcx_buf_shard_t), KM_SLEEP);

	mutex_init(&s->mlbs_mtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	list_create(&s->mlbs_busy, sizeof (mlxcx_buffer_t),
	    offsetof(mlxcx_buffer_t, mlb_entry));
	list_create(&s->mlbs_free, sizeof (mlxcx_buffer_t),
	    offsetof(mlxcx_buffer_t, mlb_entry));
	list_create(&s->mlbs_loaned, sizeof (mlxcx_buffer_t),
	    offsetof(mlxcx_buffer_t, mlb_entry));
	cv_init(&s->mlbs_free_nonempty, NULL, CV_DRIVER, NULL);

	list_insert_tail(&mlxp->mlx_buf_shards, s);

	return (s);
}
1402 
/*
 * Create the per-instance buffer kmem cache and the (initially empty)
 * list of buffer shards; shards are added later via
 * mlxcx_mlbs_create().  Always returns B_TRUE.
 */
static boolean_t
mlxcx_setup_bufs(mlxcx_t *mlxp)
{
	char namebuf[KSTAT_STRLEN];

	(void) snprintf(namebuf, KSTAT_STRLEN, "mlxcx%d_bufs_cache",
	    ddi_get_instance(mlxp->mlx_dip));
	mlxp->mlx_bufs_cache = kmem_cache_create(namebuf,
	    sizeof (mlxcx_buffer_t), sizeof (uint64_t),
	    mlxcx_bufs_cache_constr, mlxcx_bufs_cache_destr,
	    NULL, mlxp, NULL, 0);

	list_create(&mlxp->mlx_buf_shards, sizeof (mlxcx_buf_shard_t),
	    offsetof(mlxcx_buf_shard_t, mlbs_entry));

	return (B_TRUE);
}
1420 
/*
 * Post a "qstate.err" ereport recording that a queue (identified by
 * qtype/qnum) was found in an unexpected state, and mark the service
 * as degraded.  No-op when ereports are not enabled.
 */
static void
mlxcx_fm_qstate_ereport(mlxcx_t *mlxp, const char *qtype, uint32_t qnum,
    const char *state, uint8_t statenum)
{
	uint64_t ena;
	char buf[FM_MAX_CLASS];

	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
		return;

	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
	    MLXCX_FM_SERVICE_MLXCX, "qstate.err");
	ena = fm_ena_generate(0, FM_ENA_FMT1);

	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
	    "state", DATA_TYPE_STRING, state,
	    "state_num", DATA_TYPE_UINT8, statenum,
	    "qtype", DATA_TYPE_STRING, qtype,
	    "qnum", DATA_TYPE_UINT32, qnum,
	    NULL);
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED);
}
1444 
1445 /*
1446  * The following set of routines are for monitoring the health of
1447  * event, completion and work queues. They run infrequently peeking at
1448  * the structs to catch stalls and inconsistent state.
1449  *
1450  * They peek at the structs *without* acquiring locks - we don't want
1451  * to impede flow of data. Driver start up and shutdown semantics
1452  * guarantee the structs are present and won't disappear underneath
1453  * these routines.
1454  *
1455  * As previously noted, the routines peek at active data in the structs and
1456  * they will store some values for comparison on next invocation. To
1457  * maintain integrity of the saved values, these values are only modified
1458  * within these routines.
1459  */
/*
 * Periodic health check for event queues (scheduled as
 * mlx_eq_checktimer).  For each created EQ: query the hardware
 * context, raise an ereport if the status is bad, and watch for EQs
 * that we believe are armed but which the hardware reports otherwise
 * while the consumer counter has not moved for three consecutive
 * checks — the signature of a stall.  Runs lockless; see the block
 * comment above.
 */
static void
mlxcx_eq_check(void *arg)
{
	mlxcx_t *mlxp = (mlxcx_t *)arg;
	mlxcx_event_queue_t *eq;
	mlxcx_eventq_ctx_t ctx;
	const char *str;

	uint_t i;

	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
		eq = &mlxp->mlx_eqs[i];

		if ((eq->mleq_state & MLXCX_EQ_CREATED) == 0)
			continue;

		/*
		 * If the event queue was successfully created in the HCA,
		 * then initialization and shutdown sequences guarantee
		 * the queue exists.
		 */
		ASSERT0(eq->mleq_state & MLXCX_EQ_DESTROYED);

		if (!mlxcx_cmd_query_eq(mlxp, eq, &ctx))
			continue;

		str = "???";
		switch (ctx.mleqc_status) {
		case MLXCX_EQ_STATUS_OK:
			break;
		case MLXCX_EQ_STATUS_WRITE_FAILURE:
			str = "WRITE_FAILURE";
			break;
		}

		if (ctx.mleqc_status != MLXCX_EQ_STATUS_OK) {
			mlxcx_fm_qstate_ereport(mlxp, "event",
			    eq->mleq_num, str, ctx.mleqc_status);
			mlxcx_warn(mlxp, "EQ %u is in bad status: %x (%s)",
			    eq->mleq_intr_index, ctx.mleqc_status, str);
		}

		/* Hardware says not armed, but we think it should be. */
		if (ctx.mleqc_state != MLXCX_EQ_ST_ARMED &&
		    (eq->mleq_state & MLXCX_EQ_ARMED)) {
			if (eq->mleq_cc == eq->mleq_check_disarm_cc &&
			    ++eq->mleq_check_disarm_cnt >= 3) {
				mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL);
				mlxcx_warn(mlxp, "EQ %u isn't armed",
				    eq->mleq_intr_index);
			}
			eq->mleq_check_disarm_cc = eq->mleq_cc;
		} else {
			eq->mleq_check_disarm_cc = 0;
			eq->mleq_check_disarm_cnt = 0;
		}
	}
}
1517 
/*
 * Periodic health check for completion queues (scheduled as
 * mlx_cq_checktimer).  Analogous to mlxcx_eq_check(): query each
 * created CQ's hardware context, ereport bad status once per queue
 * (mlcq_fm_repd_qstate suppresses repeats), and detect CQs that
 * should be armed but appear stalled (same consumer counter over
 * three checks, and not in polling mode).  Runs lockless; see the
 * block comment above.
 */
static void
mlxcx_cq_check(void *arg)
{
	mlxcx_t *mlxp = (mlxcx_t *)arg;
	mlxcx_completion_queue_t *cq;
	mlxcx_completionq_ctx_t ctx;
	const char *str, *type;
	uint_t v;

	for (cq = list_head(&mlxp->mlx_cqs); cq != NULL;
	    cq = list_next(&mlxp->mlx_cqs, cq)) {

		if ((cq->mlcq_state & MLXCX_CQ_CREATED) == 0)
			continue;

		/*
		 * If the completion queue was successfully created in the HCA,
		 * then initialization and shutdown sequences guarantee
		 * the queue exists.
		 */
		ASSERT0(cq->mlcq_state & MLXCX_CQ_DESTROYED);
		ASSERT0(cq->mlcq_state & MLXCX_CQ_TEARDOWN);

		if (cq->mlcq_fm_repd_qstate)
			continue;

		if (!mlxcx_cmd_query_cq(mlxp, cq, &ctx))
			continue;

		/* Label the queue by the WQ it serves, for warnings. */
		if (cq->mlcq_wq != NULL) {
			mlxcx_work_queue_t *wq = cq->mlcq_wq;
			if (wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ)
				type = "rx ";
			else if (wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ)
				type = "tx ";
			else
				type = "";
		} else {
			type = "";
		}

		str = "???";
		v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATUS);
		switch (v) {
		case MLXCX_CQC_STATUS_OK:
			break;
		case MLXCX_CQC_STATUS_OVERFLOW:
			str = "OVERFLOW";
			break;
		case MLXCX_CQC_STATUS_WRITE_FAIL:
			str = "WRITE_FAIL";
			break;
		case MLXCX_CQC_STATUS_INVALID:
			str = "INVALID";
			break;
		}

		if (v != MLXCX_CQC_STATUS_OK) {
			mlxcx_fm_qstate_ereport(mlxp, "completion",
			    cq->mlcq_num, str, v);
			mlxcx_warn(mlxp, "%sCQ 0x%x is in bad status: %x (%s)",
			    type, cq->mlcq_num, v, str);
			cq->mlcq_fm_repd_qstate = B_TRUE;
		}

		/* Stall detection: armed in software, not armed in hardware. */
		v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATE);
		if (v != MLXCX_CQC_STATE_ARMED &&
		    (cq->mlcq_state & MLXCX_CQ_ARMED) &&
		    !(cq->mlcq_state & MLXCX_CQ_POLLING)) {
			if (cq->mlcq_cc == cq->mlcq_check_disarm_cc &&
			    ++cq->mlcq_check_disarm_cnt >= 3) {
				mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL);
				mlxcx_warn(mlxp, "%sCQ 0x%x (%p) isn't armed",
				    type, cq->mlcq_num, cq);
			}
			cq->mlcq_check_disarm_cc = cq->mlcq_cc;
		} else {
			cq->mlcq_check_disarm_cnt = 0;
			cq->mlcq_check_disarm_cc = 0;
		}
	}
}
1600 
/*
 * Health check for a single send queue, called from mlxcx_wq_check().
 * Queries the SQ's hardware context and ereports any state that
 * disagrees with our MLXCX_WQ_STARTED bookkeeping: RST while started,
 * RDY while stopped, ERR or an unknown state at any time.
 */
void
mlxcx_check_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *sq)
{
	mlxcx_sq_ctx_t ctx;
	mlxcx_sq_state_t state;

	if (!mlxcx_cmd_query_sq(mlxp, sq, &ctx))
		return;

	/* The SQ should still be attached to the CQ we created it with. */
	ASSERT3U(from_be24(ctx.mlsqc_cqn), ==, sq->mlwq_cq->mlcq_num);
	state = get_bits32(ctx.mlsqc_flags, MLXCX_SQ_STATE);
	switch (state) {
	case MLXCX_SQ_STATE_RST:
		if (sq->mlwq_state & MLXCX_WQ_STARTED) {
			mlxcx_fm_qstate_ereport(mlxp, "send",
			    sq->mlwq_num, "RST", state);
			sq->mlwq_fm_repd_qstate = B_TRUE;
		}
		break;
	case MLXCX_SQ_STATE_RDY:
		if (!(sq->mlwq_state & MLXCX_WQ_STARTED)) {
			mlxcx_fm_qstate_ereport(mlxp, "send",
			    sq->mlwq_num, "RDY", state);
			sq->mlwq_fm_repd_qstate = B_TRUE;
		}
		break;
	case MLXCX_SQ_STATE_ERR:
		mlxcx_fm_qstate_ereport(mlxp, "send",
		    sq->mlwq_num, "ERR", state);
		sq->mlwq_fm_repd_qstate = B_TRUE;
		break;
	default:
		mlxcx_fm_qstate_ereport(mlxp, "send",
		    sq->mlwq_num, "???", state);
		sq->mlwq_fm_repd_qstate = B_TRUE;
		break;
	}
}
1639 
1640 void
1641 mlxcx_check_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *rq)
1642 {
1643 	mlxcx_rq_ctx_t ctx;
1644 	mlxcx_rq_state_t state;
1645 
1646 
1647 	if (!mlxcx_cmd_query_rq(mlxp, rq, &ctx))
1648 		return;
1649 
1650 	ASSERT3U(from_be24(ctx.mlrqc_cqn), ==, rq->mlwq_cq->mlcq_num);
1651 	state = get_bits32(ctx.mlrqc_flags, MLXCX_RQ_STATE);
1652 	switch (state) {
1653 	case MLXCX_RQ_STATE_RST:
1654 		if (rq->mlwq_state & MLXCX_WQ_STARTED) {
1655 			mlxcx_fm_qstate_ereport(mlxp, "receive",
1656 			    rq->mlwq_num, "RST", state);
1657 			rq->mlwq_fm_repd_qstate = B_TRUE;
1658 		}
1659 		break;
1660 	case MLXCX_RQ_STATE_RDY:
1661 		if (!(rq->mlwq_state & MLXCX_WQ_STARTED)) {
1662 			mlxcx_fm_qstate_ereport(mlxp, "receive",
1663 			    rq->mlwq_num, "RDY", state);
1664 			rq->mlwq_fm_repd_qstate = B_TRUE;
1665 		}
1666 		break;
1667 	case MLXCX_RQ_STATE_ERR:
1668 		mlxcx_fm_qstate_ereport(mlxp, "receive",
1669 		    rq->mlwq_num, "ERR", state);
1670 		rq->mlwq_fm_repd_qstate = B_TRUE;
1671 		break;
1672 	default:
1673 		mlxcx_fm_qstate_ereport(mlxp, "receive",
1674 		    rq->mlwq_num, "???", state);
1675 		rq->mlwq_fm_repd_qstate = B_TRUE;
1676 		break;
1677 	}
1678 }
1679 
/*
 * Periodic health check for work queues (scheduled as
 * mlx_wq_checktimer).  Dispatches each created WQ to
 * mlxcx_check_sq() or mlxcx_check_rq() by type; queues that have
 * already been reported (mlwq_fm_repd_qstate) are skipped.  Runs
 * lockless; see the block comment above mlxcx_eq_check().
 */
static void
mlxcx_wq_check(void *arg)
{
	mlxcx_t *mlxp = (mlxcx_t *)arg;
	mlxcx_work_queue_t *wq;

	for (wq = list_head(&mlxp->mlx_wqs); wq != NULL;
	    wq = list_next(&mlxp->mlx_wqs, wq)) {

		if ((wq->mlwq_state & MLXCX_WQ_CREATED) == 0)
			continue;

		/*
		 * If the work queue was successfully created in the HCA,
		 * then initialization and shutdown sequences guarantee
		 * the queue exists.
		 */
		ASSERT0(wq->mlwq_state & MLXCX_WQ_DESTROYED);
		ASSERT0(wq->mlwq_state & MLXCX_WQ_TEARDOWN);

		if (wq->mlwq_fm_repd_qstate)
			continue;

		switch (wq->mlwq_type) {
		case MLXCX_WQ_TYPE_SENDQ:
			mlxcx_check_sq(mlxp, wq);
			break;
		case MLXCX_WQ_TYPE_RECVQ:
			mlxcx_check_rq(mlxp, wq);
			break;
		}
	}
}
1713 
/*
 * Start the periodic EQ/CQ/WQ health-check timers (mlxcx_eq_check()
 * and friends above).  A timer is only created when its interval
 * property is non-zero; mlxcx_teardown_checktimers() applies the same
 * conditions when deleting them.  Always returns B_TRUE.
 */
static boolean_t
mlxcx_setup_checktimers(mlxcx_t *mlxp)
{
	if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) {
		mlxp->mlx_eq_checktimer = ddi_periodic_add(mlxcx_eq_check, mlxp,
		    mlxp->mlx_props.mldp_eq_check_interval_sec * NANOSEC,
		    DDI_IPL_0);
	}
	if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) {
		mlxp->mlx_cq_checktimer = ddi_periodic_add(mlxcx_cq_check, mlxp,
		    mlxp->mlx_props.mldp_cq_check_interval_sec * NANOSEC,
		    DDI_IPL_0);
	}
	if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) {
		mlxp->mlx_wq_checktimer = ddi_periodic_add(mlxcx_wq_check, mlxp,
		    mlxp->mlx_props.mldp_wq_check_interval_sec * NANOSEC,
		    DDI_IPL_0);
	}
	return (B_TRUE);
}
1734 
1735 int
1736 mlxcx_dmac_fe_compare(const void *arg0, const void *arg1)
1737 {
1738 	const mlxcx_flow_entry_t *left = arg0;
1739 	const mlxcx_flow_entry_t *right = arg1;
1740 	int bcmpr;
1741 
1742 	bcmpr = memcmp(left->mlfe_dmac, right->mlfe_dmac,
1743 	    sizeof (left->mlfe_dmac));
1744 	if (bcmpr < 0)
1745 		return (-1);
1746 	if (bcmpr > 0)
1747 		return (1);
1748 	if (left->mlfe_vid < right->mlfe_vid)
1749 		return (-1);
1750 	if (left->mlfe_vid > right->mlfe_vid)
1751 		return (1);
1752 	return (0);
1753 }
1754 
1755 int
1756 mlxcx_grmac_compare(const void *arg0, const void *arg1)
1757 {
1758 	const mlxcx_group_mac_t *left = arg0;
1759 	const mlxcx_group_mac_t *right = arg1;
1760 	int bcmpr;
1761 
1762 	bcmpr = memcmp(left->mlgm_mac, right->mlgm_mac,
1763 	    sizeof (left->mlgm_mac));
1764 	if (bcmpr < 0)
1765 		return (-1);
1766 	if (bcmpr > 0)
1767 		return (1);
1768 	return (0);
1769 }
1770 
1771 int
1772 mlxcx_page_compare(const void *arg0, const void *arg1)
1773 {
1774 	const mlxcx_dev_page_t *p0 = arg0;
1775 	const mlxcx_dev_page_t *p1 = arg1;
1776 
1777 	if (p0->mxdp_pa < p1->mxdp_pa)
1778 		return (-1);
1779 	if (p0->mxdp_pa > p1->mxdp_pa)
1780 		return (1);
1781 	return (0);
1782 }
1783 
/*
 * Allocate and initialize the per-port state (mlxcx_port_t) for every
 * port reported by the HCA, then build each port's root receive flow
 * table along with its broadcast, unicast/multicast and promiscuous
 * flow groups.  Called once during attach.
 *
 * Returns B_TRUE on success; on any failure, all partially constructed
 * port state is released via mlxcx_teardown_ports() and B_FALSE is
 * returned.
 */
static boolean_t
mlxcx_setup_ports(mlxcx_t *mlxp)
{
	uint_t i, j;
	mlxcx_port_t *p;
	mlxcx_flow_table_t *ft;
	mlxcx_flow_group_t *fg;
	mlxcx_flow_entry_t *fe;

	VERIFY3U(mlxp->mlx_nports, >, 0);
	mlxp->mlx_ports_size = mlxp->mlx_nports * sizeof (mlxcx_port_t);
	mlxp->mlx_ports = kmem_zalloc(mlxp->mlx_ports_size, KM_SLEEP);

	/*
	 * First pass: initialize each port's mutexes and query its
	 * current hardware state (vport context, MTU, link status,
	 * speed, FEC) from the HCA.
	 */
	for (i = 0; i < mlxp->mlx_nports; ++i) {
		p = &mlxp->mlx_ports[i];
		p->mlp_num = i;
		p->mlx_port_event.mla_mlx = mlxp;
		p->mlx_port_event.mla_port = p;
		/* Async event taskq code takes this mutex from interrupt. */
		mutex_init(&p->mlx_port_event.mla_mtx, NULL,
		    MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_async_intr_pri));
		p->mlp_init |= MLXCX_PORT_INIT;
		mutex_init(&p->mlp_mtx, NULL, MUTEX_DRIVER,
		    DDI_INTR_PRI(mlxp->mlx_intr_pri));
		mutex_enter(&p->mlp_mtx);
		if (!mlxcx_cmd_query_nic_vport_ctx(mlxp, p)) {
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		if (!mlxcx_cmd_query_port_mtu(mlxp, p)) {
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		if (!mlxcx_cmd_query_port_status(mlxp, p)) {
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		if (!mlxcx_cmd_query_port_speed(mlxp, p)) {
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		if (!mlxcx_cmd_modify_nic_vport_ctx(mlxp, p,
		    MLXCX_MODIFY_NIC_VPORT_CTX_PROMISC)) {
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		if (!mlxcx_cmd_query_port_fec(mlxp, p)) {
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		p->mlp_fec_requested = LINK_FEC_AUTO;

		mutex_exit(&p->mlp_mtx);
	}

	/*
	 * Second pass: create each port's root RX flow table and its
	 * flow groups.  Lock order is port mutex, then flow table mutex.
	 */
	for (i = 0; i < mlxp->mlx_nports; ++i) {
		p = &mlxp->mlx_ports[i];
		mutex_enter(&p->mlp_mtx);
		p->mlp_rx_flow = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
		    KM_SLEEP));
		mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
		    DDI_INTR_PRI(mlxp->mlx_intr_pri));

		mutex_enter(&ft->mlft_mtx);

		ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
		ft->mlft_port = p;
		/* Clamp the configured root table size to the HW maximum. */
		ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_root_size_shift;
		if (ft->mlft_entshift > mlxp->mlx_caps->mlc_max_rx_ft_shift)
			ft->mlft_entshift = mlxp->mlx_caps->mlc_max_rx_ft_shift;
		ft->mlft_nents = (1 << ft->mlft_entshift);
		ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
		ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
		list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
		    offsetof(mlxcx_flow_group_t, mlfg_entry));

		for (j = 0; j < ft->mlft_nents; ++j) {
			ft->mlft_ent[j].mlfe_table = ft;
			ft->mlft_ent[j].mlfe_index = j;
		}

		if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
			mutex_exit(&ft->mlft_mtx);
			mutex_exit(&p->mlp_mtx);
			goto err;
		}

		if (!mlxcx_cmd_set_flow_table_root(mlxp, ft)) {
			mutex_exit(&ft->mlft_mtx);
			mutex_exit(&p->mlp_mtx);
			goto err;
		}

		/*
		 * We match broadcast at the top of the root flow table, then
		 * all multicast/unicast MACs, then the promisc entry is down
		 * the very bottom.
		 *
		 * This way when promisc is on, that entry simply catches any
		 * remaining traffic that earlier flows haven't matched.
		 */
		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
		list_insert_tail(&ft->mlft_groups, fg);
		fg->mlfg_table = ft;
		fg->mlfg_size = 1;
		fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC;
		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
			mutex_exit(&ft->mlft_mtx);
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		p->mlp_bcast = fg;
		fe = list_head(&fg->mlfg_entries);
		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
		/* Broadcast entry matches the all-ones MAC. */
		(void) memset(fe->mlfe_dmac, 0xff, sizeof (fe->mlfe_dmac));
		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;

		/*
		 * Unicast/multicast group: everything except the broadcast
		 * and promisc entries.
		 */
		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
		list_insert_tail(&ft->mlft_groups, fg);
		fg->mlfg_table = ft;
		fg->mlfg_size = ft->mlft_nents - 2;
		fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC;
		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
			mutex_exit(&ft->mlft_mtx);
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		p->mlp_umcast = fg;

		/* Promisc group: a single catch-all entry (no match mask). */
		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
		list_insert_tail(&ft->mlft_groups, fg);
		fg->mlfg_table = ft;
		fg->mlfg_size = 1;
		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
			mutex_exit(&ft->mlft_mtx);
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		p->mlp_promisc = fg;
		fe = list_head(&fg->mlfg_entries);
		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;

		/* Index of flow entries by destination MAC + VLAN. */
		avl_create(&p->mlp_dmac_fe, mlxcx_dmac_fe_compare,
		    sizeof (mlxcx_flow_entry_t), offsetof(mlxcx_flow_entry_t,
		    mlfe_dmac_entry));

		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&p->mlp_mtx);
	}

	return (B_TRUE);

err:
	mlxcx_teardown_ports(mlxp);
	return (B_FALSE);
}
1940 
/*
 * Remove every VLAN filter entry from a ring group's VLAN flow table.
 * The "default" (match-any-VLAN) entry is re-installed first so that
 * traffic is not dropped during the transition.  Command failures are
 * deliberately ignored: this is a best-effort teardown path.
 */
void
mlxcx_remove_all_vlan_entries(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
	mlxcx_flow_entry_t *fe;
	mlxcx_group_vlan_t *v;

	ASSERT(mutex_owned(&g->mlg_mtx));

	mutex_enter(&ft->mlft_mtx);

	/*
	 * If any specific VLAN entries exist, the default entry is
	 * currently disabled; turn it back on before deleting them.
	 */
	if (!list_is_empty(&g->mlg_rx_vlans)) {
		fe = list_head(&dfg->mlfg_entries);
		(void) mlxcx_cmd_set_flow_table_entry(mlxp, fe);
	}

	while ((v = list_remove_head(&g->mlg_rx_vlans)) != NULL) {
		fe = v->mlgv_fe;
		ASSERT3P(fe->mlfe_table, ==, ft);
		ASSERT3P(fe->mlfe_group, ==, fg);
		kmem_free(v, sizeof (mlxcx_group_vlan_t));

		/* Delete from hardware, then release the entry slot. */
		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
	}

	mutex_exit(&ft->mlft_mtx);
}
1971 
/*
 * Remove a single VLAN filter (tagged/untagged + vid) from a ring
 * group's VLAN flow table.  If this was the last VLAN, the default
 * match-any entry is re-enabled first so no traffic is lost in the
 * transition.  Returns B_FALSE if the VLAN was not found or a hardware
 * command failed (in which case the VLAN is restored to the list).
 */
boolean_t
mlxcx_remove_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
    boolean_t tagged, uint16_t vid)
{
	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
	mlxcx_flow_entry_t *fe;
	mlxcx_group_vlan_t *v;
	boolean_t found = B_FALSE;

	ASSERT(mutex_owned(&g->mlg_mtx));

	mutex_enter(&ft->mlft_mtx);

	/* Locate the matching VLAN on this group's list. */
	for (v = list_head(&g->mlg_rx_vlans); v != NULL;
	    v = list_next(&g->mlg_rx_vlans, v)) {
		if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) {
			found = B_TRUE;
			break;
		}
	}
	if (!found) {
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}

	list_remove(&g->mlg_rx_vlans, v);

	/*
	 * If this is the last VLAN entry, we have to go back to accepting
	 * any VLAN (which means re-enabling the default entry).
	 *
	 * Do this before we remove the flow entry for the last specific
	 * VLAN so that we don't lose any traffic in the transition.
	 */
	if (list_is_empty(&g->mlg_rx_vlans)) {
		fe = list_head(&dfg->mlfg_entries);
		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
			/* Roll back: put the VLAN back on the list. */
			list_insert_tail(&g->mlg_rx_vlans, v);
			mutex_exit(&ft->mlft_mtx);
			return (B_FALSE);
		}
	}

	fe = v->mlgv_fe;
	ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED);
	ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED);
	ASSERT3P(fe->mlfe_table, ==, ft);
	ASSERT3P(fe->mlfe_group, ==, fg);

	if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
		/*
		 * Roll back: restore the VLAN and disable the default
		 * entry again if we enabled it above.
		 */
		list_insert_tail(&g->mlg_rx_vlans, v);
		fe = list_head(&dfg->mlfg_entries);
		if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
			(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
		}
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}

	/* Release the flow entry slot for reuse. */
	fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;

	kmem_free(v, sizeof (mlxcx_group_vlan_t));

	mutex_exit(&ft->mlft_mtx);
	return (B_TRUE);
}
2040 
/*
 * Add a VLAN filter (tagged/untagged + vid) to a ring group's VLAN flow
 * table.  If this is the first specific VLAN for the group, the default
 * match-any entry is disabled afterwards so that only the listed VLANs
 * are accepted.  Returns B_TRUE if the VLAN is (or already was) present;
 * B_FALSE if no free flow entries remain or the hardware command failed.
 */
boolean_t
mlxcx_add_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g, boolean_t tagged,
    uint16_t vid)
{
	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
	mlxcx_flow_entry_t *fe;
	mlxcx_group_vlan_t *v;
	boolean_t found = B_FALSE;
	boolean_t first = B_FALSE;

	ASSERT(mutex_owned(&g->mlg_mtx));

	mutex_enter(&ft->mlft_mtx);

	/* Adding a VLAN that is already present is a no-op success. */
	for (v = list_head(&g->mlg_rx_vlans); v != NULL;
	    v = list_next(&g->mlg_rx_vlans, v)) {
		if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) {
			mutex_exit(&ft->mlft_mtx);
			return (B_TRUE);
		}
	}
	if (list_is_empty(&g->mlg_rx_vlans))
		first = B_TRUE;

	/* Find an unreserved flow entry in the VLAN flow group. */
	for (fe = list_head(&fg->mlfg_entries); fe != NULL;
	    fe = list_next(&fg->mlfg_entries, fe)) {
		if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) {
			found = B_TRUE;
			break;
		}
	}
	if (!found) {
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}

	v = kmem_zalloc(sizeof (mlxcx_group_vlan_t), KM_SLEEP);
	v->mlgv_fe = fe;
	v->mlgv_tagged = tagged;
	v->mlgv_vid = vid;

	fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED;
	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
	fe->mlfe_vid = vid;
	if (tagged) {
		fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_CVLAN;
	} else {
		fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_NONE;
	}

	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		/* Roll back the reservation on command failure. */
		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY;
		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
		kmem_free(v, sizeof (mlxcx_group_vlan_t));
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}

	list_insert_tail(&g->mlg_rx_vlans, v);

	/*
	 * If the vlan list was empty for this group before adding this one,
	 * then we no longer want the "default" entry to allow all VLANs
	 * through.
	 */
	if (first) {
		fe = list_head(&dfg->mlfg_entries);
		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
	}

	mutex_exit(&ft->mlft_mtx);
	return (B_TRUE);
}
2116 
/*
 * Detach every unicast/multicast MAC address belonging to a ring group
 * from the port's root flow table.  Each flow entry's destination list
 * is rebuilt from the remaining groups; entries left with no
 * destinations are deleted from hardware and released.  Command
 * failures are ignored (best-effort teardown path).
 */
void
mlxcx_remove_all_umcast_entries(mlxcx_t *mlxp, mlxcx_port_t *port,
    mlxcx_ring_group_t *group)
{
	mlxcx_flow_entry_t *fe;
	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
	mlxcx_group_mac_t *gm, *ngm;

	ASSERT(mutex_owned(&port->mlp_mtx));
	ASSERT(mutex_owned(&group->mlg_mtx));

	mutex_enter(&ft->mlft_mtx);

	gm = avl_first(&group->mlg_rx_macs);
	for (; gm != NULL; gm = ngm) {
		/* Capture the successor before gm is freed below. */
		ngm = AVL_NEXT(&group->mlg_rx_macs, gm);

		ASSERT3P(gm->mlgm_group, ==, group);
		fe = gm->mlgm_fe;
		ASSERT3P(fe->mlfe_table, ==, ft);

		avl_remove(&group->mlg_rx_macs, gm);
		list_remove(&fe->mlfe_ring_groups, gm);
		kmem_free(gm, sizeof (mlxcx_group_mac_t));

		/*
		 * Rebuild the entry's destination list from the ring
		 * groups that still reference this MAC.  Note: gm is
		 * reused here as the iterator over mlfe_ring_groups.
		 */
		fe->mlfe_ndest = 0;
		for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL;
		    gm = list_next(&fe->mlfe_ring_groups, gm)) {
			fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow =
			    gm->mlgm_group->mlg_rx_vlan_ft;
		}
		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;

		if (fe->mlfe_ndest > 0) {
			(void) mlxcx_cmd_set_flow_table_entry(mlxp, fe);
			continue;
		}

		/*
		 * There are no more ring groups left for this MAC (it wasn't
		 * attached to any other groups since ndest == 0), so clean up
		 * its flow entry.
		 */
		avl_remove(&port->mlp_dmac_fe, fe);
		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
		list_destroy(&fe->mlfe_ring_groups);
		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
	}

	mutex_exit(&ft->mlft_mtx);
}
2168 
/*
 * Detach one unicast/multicast MAC address from a ring group.  The
 * flow entry's destination list is rebuilt from the remaining groups;
 * if none remain, the entry is deleted from hardware and released.
 * Returns B_FALSE if the MAC is not registered to this group or the
 * hardware update failed.
 */
boolean_t
mlxcx_remove_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port,
    mlxcx_ring_group_t *group, const uint8_t *macaddr)
{
	mlxcx_flow_entry_t *fe;
	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
	mlxcx_group_mac_t *gm, probe;

	ASSERT(mutex_owned(&port->mlp_mtx));
	ASSERT(mutex_owned(&group->mlg_mtx));

	/* Stack-allocated key for the AVL lookup. */
	bzero(&probe, sizeof (probe));
	bcopy(macaddr, probe.mlgm_mac, sizeof (probe.mlgm_mac));

	mutex_enter(&ft->mlft_mtx);

	gm = avl_find(&group->mlg_rx_macs, &probe, NULL);
	if (gm == NULL) {
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}
	ASSERT3P(gm->mlgm_group, ==, group);
	ASSERT0(bcmp(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac)));

	fe = gm->mlgm_fe;
	ASSERT3P(fe->mlfe_table, ==, ft);
	ASSERT0(bcmp(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac)));

	list_remove(&fe->mlfe_ring_groups, gm);
	avl_remove(&group->mlg_rx_macs, gm);
	kmem_free(gm, sizeof (mlxcx_group_mac_t));

	/*
	 * Rebuild the destination list from the groups that still
	 * reference this MAC (gm is reused as the list iterator).
	 */
	fe->mlfe_ndest = 0;
	for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL;
	    gm = list_next(&fe->mlfe_ring_groups, gm)) {
		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow =
		    gm->mlgm_group->mlg_rx_vlan_ft;
	}
	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;

	if (fe->mlfe_ndest > 0) {
		/* Other groups still use this MAC: just update hardware. */
		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
			mutex_exit(&ft->mlft_mtx);
			return (B_FALSE);
		}
		mutex_exit(&ft->mlft_mtx);
		return (B_TRUE);
	}

	/*
	 * There are no more ring groups left for this MAC (it wasn't attached
	 * to any other groups since ndest == 0), so clean up its flow entry.
	 */
	avl_remove(&port->mlp_dmac_fe, fe);
	(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
	list_destroy(&fe->mlfe_ring_groups);

	fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;

	mutex_exit(&ft->mlft_mtx);

	return (B_TRUE);
}
2232 
2233 boolean_t
2234 mlxcx_add_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port,
2235     mlxcx_ring_group_t *group, const uint8_t *macaddr)
2236 {
2237 	mlxcx_flow_group_t *fg;
2238 	mlxcx_flow_entry_t *fe, probe;
2239 	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
2240 	mlxcx_group_mac_t *gm;
2241 	boolean_t found = B_FALSE;
2242 
2243 	ASSERT(mutex_owned(&port->mlp_mtx));
2244 	ASSERT(mutex_owned(&group->mlg_mtx));
2245 
2246 	bzero(&probe, sizeof (probe));
2247 	bcopy(macaddr, probe.mlfe_dmac, sizeof (probe.mlfe_dmac));
2248 
2249 	mutex_enter(&ft->mlft_mtx);
2250 
2251 	fe = avl_find(&port->mlp_dmac_fe, &probe, NULL);
2252 
2253 	if (fe == NULL) {
2254 		fg = port->mlp_umcast;
2255 		for (fe = list_head(&fg->mlfg_entries); fe != NULL;
2256 		    fe = list_next(&fg->mlfg_entries, fe)) {
2257 			if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) {
2258 				found = B_TRUE;
2259 				break;
2260 			}
2261 		}
2262 		if (!found) {
2263 			mutex_exit(&ft->mlft_mtx);
2264 			return (B_FALSE);
2265 		}
2266 		list_create(&fe->mlfe_ring_groups, sizeof (mlxcx_group_mac_t),
2267 		    offsetof(mlxcx_group_mac_t, mlgm_fe_entry));
2268 		fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED;
2269 		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
2270 		bcopy(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac));
2271 
2272 		avl_add(&port->mlp_dmac_fe, fe);
2273 	}
2274 
2275 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = group->mlg_rx_vlan_ft;
2276 	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2277 
2278 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
2279 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY;
2280 		if (--fe->mlfe_ndest == 0) {
2281 			fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2282 		}
2283 		mutex_exit(&ft->mlft_mtx);
2284 		return (B_FALSE);
2285 	}
2286 
2287 	gm = kmem_zalloc(sizeof (mlxcx_group_mac_t), KM_SLEEP);
2288 	gm->mlgm_group = group;
2289 	gm->mlgm_fe = fe;
2290 	bcopy(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac));
2291 	avl_add(&group->mlg_rx_macs, gm);
2292 	list_insert_tail(&fe->mlfe_ring_groups, gm);
2293 
2294 	mutex_exit(&ft->mlft_mtx);
2295 
2296 	return (B_TRUE);
2297 }
2298 
2299 boolean_t
2300 mlxcx_setup_flow_group(mlxcx_t *mlxp, mlxcx_flow_table_t *ft,
2301     mlxcx_flow_group_t *fg)
2302 {
2303 	mlxcx_flow_entry_t *fe;
2304 	uint_t i, idx;
2305 
2306 	ASSERT(mutex_owned(&ft->mlft_mtx));
2307 	ASSERT(ft->mlft_state & MLXCX_FLOW_TABLE_CREATED);
2308 	ASSERT3P(fg->mlfg_table, ==, ft);
2309 
2310 	if (ft->mlft_next_ent + fg->mlfg_size > ft->mlft_nents)
2311 		return (B_FALSE);
2312 	fg->mlfg_start_idx = ft->mlft_next_ent;
2313 
2314 	if (!mlxcx_cmd_create_flow_group(mlxp, fg)) {
2315 		return (B_FALSE);
2316 	}
2317 
2318 	list_create(&fg->mlfg_entries, sizeof (mlxcx_flow_entry_t),
2319 	    offsetof(mlxcx_flow_entry_t, mlfe_group_entry));
2320 	for (i = 0; i < fg->mlfg_size; ++i) {
2321 		idx = fg->mlfg_start_idx + i;
2322 		fe = &ft->mlft_ent[idx];
2323 		fe->mlfe_group = fg;
2324 		list_insert_tail(&fg->mlfg_entries, fe);
2325 	}
2326 	fg->mlfg_avail = fg->mlfg_size;
2327 	ft->mlft_next_ent += fg->mlfg_size;
2328 
2329 	return (B_TRUE);
2330 }
2331 
/*
 * Allocate, create and enable a single event queue on interrupt vector
 * `vec`, subscribed to the event types in the `events` bitmask.  On any
 * failure the partially constructed EQ is left for mlxcx_teardown_eqs()
 * to clean up (each failing step notes this below), and B_FALSE is
 * returned.
 */
static boolean_t
mlxcx_setup_eq(mlxcx_t *mlxp, uint_t vec, uint64_t events)
{
	mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[vec];

	mutex_enter(&mleq->mleq_mtx);
	if (!mlxcx_eq_alloc_dma(mlxp, mleq)) {
		/* mlxcx_teardown_eqs() will clean this up */
		mutex_exit(&mleq->mleq_mtx);
		return (B_FALSE);
	}
	mleq->mleq_mlx = mlxp;
	mleq->mleq_uar = &mlxp->mlx_uar;
	mleq->mleq_events = events;
	mleq->mleq_intr_index = vec;

	if (!mlxcx_cmd_create_eq(mlxp, mleq)) {
		/* mlxcx_teardown_eqs() will clean this up */
		mutex_exit(&mleq->mleq_mtx);
		return (B_FALSE);
	}

	if (ddi_intr_enable(mlxp->mlx_intr_handles[vec]) != DDI_SUCCESS) {
		/*
		 * mlxcx_teardown_eqs() will handle calling cmd_destroy_eq and
		 * eq_rele_dma
		 */
		mutex_exit(&mleq->mleq_mtx);
		return (B_FALSE);
	}
	/* Arm the EQ so the hardware will raise the first interrupt. */
	mleq->mleq_state |= MLXCX_EQ_INTR_ENABLED;
	mlxcx_arm_eq(mlxp, mleq);
	mutex_exit(&mleq->mleq_mtx);

	return (B_TRUE);
}
2368 
2369 static boolean_t
2370 mlxcx_setup_async_eqs(mlxcx_t *mlxp)
2371 {
2372 	boolean_t ret;
2373 
2374 	ret = mlxcx_setup_eq(mlxp, 0,
2375 	    (1ULL << MLXCX_EVENT_CMD_COMPLETION) |
2376 	    (1ULL << MLXCX_EVENT_PAGE_REQUEST) |
2377 	    (1ULL << MLXCX_EVENT_PORT_STATE) |
2378 	    (1ULL << MLXCX_EVENT_INTERNAL_ERROR) |
2379 	    (1ULL << MLXCX_EVENT_PORT_MODULE) |
2380 	    (1ULL << MLXCX_EVENT_SENDQ_DRAIN) |
2381 	    (1ULL << MLXCX_EVENT_LAST_WQE) |
2382 	    (1ULL << MLXCX_EVENT_CQ_ERROR) |
2383 	    (1ULL << MLXCX_EVENT_WQ_CATASTROPHE) |
2384 	    (1ULL << MLXCX_EVENT_PAGE_FAULT) |
2385 	    (1ULL << MLXCX_EVENT_WQ_INVALID_REQ) |
2386 	    (1ULL << MLXCX_EVENT_WQ_ACCESS_VIOL) |
2387 	    (1ULL << MLXCX_EVENT_NIC_VPORT) |
2388 	    (1ULL << MLXCX_EVENT_DOORBELL_CONGEST));
2389 
2390 	if (ret)
2391 		mlxcx_cmd_eq_enable(mlxp);
2392 
2393 	return (ret);
2394 }
2395 
2396 int
2397 mlxcx_cq_compare(const void *arg0, const void *arg1)
2398 {
2399 	const mlxcx_completion_queue_t *left = arg0;
2400 	const mlxcx_completion_queue_t *right = arg1;
2401 
2402 	if (left->mlcq_num < right->mlcq_num) {
2403 		return (-1);
2404 	}
2405 	if (left->mlcq_num > right->mlcq_num) {
2406 		return (1);
2407 	}
2408 	return (0);
2409 }
2410 
/*
 * Set up the remaining event queues (vectors mlx_intr_cq0 onwards),
 * which service completion queue events.  Each EQ gets DMA memory, is
 * created in hardware, optionally has interrupt moderation configured,
 * and is then enabled and armed.  Partially constructed EQs are left
 * for teardown to release on failure.
 */
static boolean_t
mlxcx_setup_eqs(mlxcx_t *mlxp)
{
	uint_t i;
	mlxcx_event_queue_t *mleq;

	ASSERT3S(mlxp->mlx_intr_count, >, 0);

	for (i = mlxp->mlx_intr_cq0; i < mlxp->mlx_intr_count; ++i) {
		mleq = &mlxp->mlx_eqs[i];
		mutex_enter(&mleq->mleq_mtx);
		if (!mlxcx_eq_alloc_dma(mlxp, mleq)) {
			mutex_exit(&mleq->mleq_mtx);
			return (B_FALSE);
		}
		mleq->mleq_uar = &mlxp->mlx_uar;
		if (!mlxcx_cmd_create_eq(mlxp, mleq)) {
			/* mlxcx_teardown() will handle calling eq_rele_dma */
			mutex_exit(&mleq->mleq_mtx);
			return (B_FALSE);
		}
		/* Interrupt moderation is optional (period of 0 disables). */
		if (mlxp->mlx_props.mldp_intrmod_period_usec != 0 &&
		    !mlxcx_cmd_set_int_mod(mlxp, i,
		    mlxp->mlx_props.mldp_intrmod_period_usec)) {
			mutex_exit(&mleq->mleq_mtx);
			return (B_FALSE);
		}
		if (ddi_intr_enable(mlxp->mlx_intr_handles[i]) != DDI_SUCCESS) {
			mutex_exit(&mleq->mleq_mtx);
			return (B_FALSE);
		}
		mleq->mleq_state |= MLXCX_EQ_INTR_ENABLED;
		mlxcx_arm_eq(mlxp, mleq);
		mutex_exit(&mleq->mleq_mtx);
	}

	/* Round-robin CQ-to-EQ assignment starts at the first CQ vector. */
	mlxp->mlx_next_eq = mlxp->mlx_intr_cq0;

	return (B_TRUE);
}
2451 
2452 /*
2453  * Snapshot all of the hardware capabilities that we care about and then modify
2454  * the HCA capabilities to get things moving.
2455  */
2456 static boolean_t
2457 mlxcx_init_caps(mlxcx_t *mlxp)
2458 {
2459 	mlxcx_caps_t *c;
2460 
2461 	mlxp->mlx_caps = c = kmem_zalloc(sizeof (mlxcx_caps_t), KM_SLEEP);
2462 
2463 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
2464 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_hca_cur)) {
2465 		mlxcx_warn(mlxp, "failed to obtain current HCA general caps");
2466 	}
2467 
2468 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
2469 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_hca_max)) {
2470 		mlxcx_warn(mlxp, "failed to obtain maximum HCA general caps");
2471 	}
2472 
2473 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
2474 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_ether_cur)) {
2475 		mlxcx_warn(mlxp, "failed to obtain current HCA eth caps");
2476 	}
2477 
2478 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
2479 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_ether_max)) {
2480 		mlxcx_warn(mlxp, "failed to obtain maximum HCA eth caps");
2481 	}
2482 
2483 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
2484 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_nic_flow_cur)) {
2485 		mlxcx_warn(mlxp, "failed to obtain current HCA flow caps");
2486 	}
2487 
2488 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
2489 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_nic_flow_max)) {
2490 		mlxcx_warn(mlxp, "failed to obtain maximum HCA flow caps");
2491 	}
2492 
2493 	/*
2494 	 * Check the caps meet our requirements.
2495 	 */
2496 	const mlxcx_hca_cap_general_caps_t *gen = &c->mlc_hca_cur.mhc_general;
2497 
2498 	if (gen->mlcap_general_log_pg_sz != 12) {
2499 		mlxcx_warn(mlxp, "!hardware has page size != 4k "
2500 		    "(log_pg_sz = %u)", (uint_t)gen->mlcap_general_log_pg_sz);
2501 		goto err;
2502 	}
2503 	if (gen->mlcap_general_cqe_version != 1) {
2504 		mlxcx_warn(mlxp, "!hardware does not support CQE v1 "
2505 		    "(cqe_ver = %u)", (uint_t)gen->mlcap_general_cqe_version);
2506 		goto err;
2507 	}
2508 	if (gen->mlcap_general_port_type !=
2509 	    MLXCX_CAP_GENERAL_PORT_TYPE_ETHERNET) {
2510 		mlxcx_warn(mlxp, "!hardware has non-ethernet ports");
2511 		goto err;
2512 	}
2513 	mlxp->mlx_nports = gen->mlcap_general_num_ports;
2514 	mlxp->mlx_max_sdu = (1 << (gen->mlcap_general_log_max_msg & 0x1F));
2515 
2516 	c->mlc_max_tir = (1 << gen->mlcap_general_log_max_tir);
2517 
2518 	c->mlc_checksum = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags,
2519 	    MLXCX_ETH_CAP_CSUM_CAP);
2520 	c->mlc_vxlan = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags,
2521 	    MLXCX_ETH_CAP_TUNNEL_STATELESS_VXLAN);
2522 
2523 	c->mlc_max_lso_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth.
2524 	    mlcap_eth_flags, MLXCX_ETH_CAP_MAX_LSO_CAP));
2525 	if (c->mlc_max_lso_size == 1) {
2526 		c->mlc_max_lso_size = 0;
2527 		c->mlc_lso = B_FALSE;
2528 	} else {
2529 		c->mlc_lso = B_TRUE;
2530 	}
2531 
2532 	c->mlc_max_rqt_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth.
2533 	    mlcap_eth_flags, MLXCX_ETH_CAP_RSS_IND_TBL_CAP));
2534 
2535 	if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2536 	    mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_SUPPORT)) {
2537 		mlxcx_warn(mlxp, "!hardware does not support rx flow tables");
2538 		goto err;
2539 	}
2540 	if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2541 	    mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_MODIFY)) {
2542 		mlxcx_warn(mlxp, "!hardware does not support modifying rx "
2543 		    "flow table entries");
2544 		goto err;
2545 	}
2546 
2547 	c->mlc_max_rx_ft_shift = c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2548 	    mlcap_flow_prop_log_max_ft_size;
2549 	c->mlc_max_rx_flows = (1 << c->mlc_nic_flow_cur.mhc_flow.
2550 	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_flow);
2551 	c->mlc_max_rx_ft = (1 << c->mlc_nic_flow_cur.mhc_flow.
2552 	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_ft_num);
2553 	c->mlc_max_rx_fe_dest = (1 << c->mlc_nic_flow_cur.mhc_flow.
2554 	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_destination);
2555 
2556 	return (B_TRUE);
2557 
2558 err:
2559 	kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
2560 	return (B_FALSE);
2561 }
2562 
2563 static int
2564 mlxcx_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2565 {
2566 	mlxcx_t *mlxp;
2567 
2568 	if (cmd != DDI_DETACH)
2569 		return (DDI_FAILURE);
2570 
2571 	mlxp = ddi_get_driver_private(dip);
2572 	if (mlxp == NULL) {
2573 		mlxcx_warn(NULL, "asked to detach, but missing instance "
2574 		    "private data");
2575 		return (DDI_FAILURE);
2576 	}
2577 
2578 	if (mlxp->mlx_attach & MLXCX_ATTACH_MAC_HDL) {
2579 		if (mac_unregister(mlxp->mlx_mac_hdl) != DDI_SUCCESS) {
2580 			return (DDI_FAILURE);
2581 		}
2582 		mlxp->mlx_attach &= ~MLXCX_ATTACH_MAC_HDL;
2583 	}
2584 
2585 	mlxcx_teardown(mlxp);
2586 	return (DDI_SUCCESS);
2587 }
2588 
2589 static size_t
2590 mlxcx_calc_rx_ngroups(mlxcx_t *mlxp)
2591 {
2592 	size_t ngroups = mlxp->mlx_props.mldp_rx_ngroups_large +
2593 	    mlxp->mlx_props.mldp_rx_ngroups_small;
2594 	size_t tirlim, flowlim, gflowlim;
2595 
2596 	tirlim = mlxp->mlx_caps->mlc_max_tir / MLXCX_TIRS_PER_GROUP;
2597 	if (tirlim < ngroups) {
2598 		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2599 		    "on number of TIRs available", tirlim);
2600 		ngroups = tirlim;
2601 	}
2602 
2603 	flowlim = (1 << mlxp->mlx_caps->mlc_max_rx_ft_shift) - 2;
2604 	if (flowlim < ngroups) {
2605 		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2606 		    "on max size of RX flow tables", flowlim);
2607 		ngroups = flowlim;
2608 	}
2609 
2610 	/*
2611 	 * Restrict the number of groups not to exceed the max flow
2612 	 * table number from the devices capabilities.
2613 	 * There is one root table entry per port and 2 entries per
2614 	 * group.
2615 	 */
2616 	flowlim = (mlxp->mlx_caps->mlc_max_rx_ft - mlxp->mlx_nports) / 2;
2617 	if (flowlim < ngroups) {
2618 		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2619 		    "on max number of RX flow tables",
2620 		    flowlim);
2621 		ngroups = flowlim;
2622 	}
2623 
2624 	do {
2625 		gflowlim = mlxp->mlx_caps->mlc_max_rx_flows - 16 * ngroups - 2;
2626 		if (gflowlim < ngroups) {
2627 			mlxcx_note(mlxp, "limiting number of rx groups to %u "
2628 			    "based on max total RX flows", gflowlim);
2629 			--ngroups;
2630 		}
2631 	} while (gflowlim < ngroups);
2632 
2633 	return (ngroups);
2634 }
2635 
2636 static int
2637 mlxcx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2638 {
2639 	mlxcx_t *mlxp;
2640 	char tq_name[TASKQ_NAMELEN];
2641 	uint_t i;
2642 	int inst, ret;
2643 
2644 	if (cmd != DDI_ATTACH)
2645 		return (DDI_FAILURE);
2646 
2647 	inst = ddi_get_instance(dip);
2648 	ret = ddi_soft_state_zalloc(mlxcx_softstate, inst);
2649 	if (ret != 0)
2650 		return (ret);
2651 
2652 	mlxp = ddi_get_soft_state(mlxcx_softstate, inst);
2653 	if (mlxp == NULL)
2654 		return (DDI_FAILURE);
2655 	mlxp->mlx_dip = dip;
2656 	mlxp->mlx_inst = inst;
2657 	ddi_set_driver_private(dip, mlxp);
2658 
2659 	mlxcx_load_props(mlxp);
2660 
2661 	mlxcx_fm_init(mlxp);
2662 	mlxp->mlx_attach |= MLXCX_ATTACH_FM;
2663 
2664 	if (pci_config_setup(mlxp->mlx_dip, &mlxp->mlx_cfg_handle) !=
2665 	    DDI_SUCCESS) {
2666 		mlxcx_warn(mlxp, "failed to initial PCI config space");
2667 		goto err;
2668 	}
2669 	mlxp->mlx_attach |= MLXCX_ATTACH_PCI_CONFIG;
2670 
2671 	if (!mlxcx_regs_map(mlxp)) {
2672 		goto err;
2673 	}
2674 	mlxp->mlx_attach |= MLXCX_ATTACH_REGS;
2675 
2676 	if (!mlxcx_cmd_queue_init(mlxp)) {
2677 		goto err;
2678 	}
2679 	mlxp->mlx_attach |= MLXCX_ATTACH_CMD;
2680 
2681 	if (!mlxcx_cmd_enable_hca(mlxp)) {
2682 		goto err;
2683 	}
2684 	mlxp->mlx_attach |= MLXCX_ATTACH_ENABLE_HCA;
2685 
2686 	if (!mlxcx_check_issi(mlxp)) {
2687 		goto err;
2688 	}
2689 
2690 	/*
2691 	 * We have to get our interrupts now so we know what priority to
2692 	 * create pagemtx with.
2693 	 */
2694 	if (!mlxcx_intr_setup(mlxp)) {
2695 		goto err;
2696 	}
2697 	mlxp->mlx_attach |= MLXCX_ATTACH_INTRS;
2698 
2699 	mutex_init(&mlxp->mlx_pagemtx, NULL, MUTEX_DRIVER,
2700 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
2701 	avl_create(&mlxp->mlx_pages, mlxcx_page_compare,
2702 	    sizeof (mlxcx_dev_page_t), offsetof(mlxcx_dev_page_t, mxdp_tree));
2703 	mlxp->mlx_attach |= MLXCX_ATTACH_PAGE_LIST;
2704 
2705 	/*
2706 	 * Taskq for asynchronous events which may interact with the HCA
2707 	 * via the command interface. Single threaded FIFO.
2708 	 */
2709 	(void) snprintf(tq_name, sizeof (tq_name), "%s_async_%d",
2710 	    ddi_driver_name(mlxp->mlx_dip), mlxp->mlx_inst);
2711 	mlxp->mlx_async_tq = taskq_create(tq_name, 1, minclsyspri, 1, INT_MAX,
2712 	    TASKQ_PREPOPULATE);
2713 	/*
2714 	 * Initialize any pre-allocated taskq param structs.
2715 	 */
2716 	for (i = 0; i <= MLXCX_FUNC_ID_MAX; i++) {
2717 		mlxp->mlx_npages_req[i].mla_mlx = mlxp;
2718 		mutex_init(&mlxp->mlx_npages_req[i].mla_mtx, NULL,
2719 		    MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_async_intr_pri));
2720 	}
2721 	mlxp->mlx_attach |= MLXCX_ATTACH_ASYNC_TQ;
2722 
2723 	if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_BOOT)) {
2724 		goto err;
2725 	}
2726 
2727 	if (!mlxcx_init_caps(mlxp)) {
2728 		goto err;
2729 	}
2730 	mlxp->mlx_attach |= MLXCX_ATTACH_CAPS;
2731 
2732 	if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_INIT)) {
2733 		goto err;
2734 	}
2735 
2736 	if (!mlxcx_cmd_init_hca(mlxp)) {
2737 		goto err;
2738 	}
2739 	mlxp->mlx_attach |= MLXCX_ATTACH_INIT_HCA;
2740 
2741 	if (!mlxcx_cmd_set_driver_version(mlxp, MLXCX_DRIVER_VERSION)) {
2742 		goto err;
2743 	}
2744 
2745 	/*
2746 	 * The User Access Region (UAR) is needed so we can ring EQ and CQ
2747 	 * doorbells.
2748 	 */
2749 	if (!mlxcx_cmd_alloc_uar(mlxp, &mlxp->mlx_uar)) {
2750 		goto err;
2751 	}
2752 	for (i = 0; i < MLXCX_BF_PER_UAR; ++i) {
2753 		mutex_init(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx, NULL,
2754 		    MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_intr_pri));
2755 	}
2756 	mlxp->mlx_attach |= MLXCX_ATTACH_UAR_PD_TD;
2757 
2758 	/*
2759 	 * Set up asynchronous event queue which handles control type events
2760 	 * like PAGE_REQUEST and CMD completion events.
2761 	 *
2762 	 * This will enable and arm the interrupt on EQ 0.
2763 	 */
2764 	if (!mlxcx_setup_async_eqs(mlxp)) {
2765 		goto err;
2766 	}
2767 
2768 	/*
2769 	 * Allocate a protection and transport domain. These don't really do
2770 	 * anything for us (they're IB concepts), but we need to give their
2771 	 * ID numbers in other commands.
2772 	 */
2773 	if (!mlxcx_cmd_alloc_pd(mlxp, &mlxp->mlx_pd)) {
2774 		goto err;
2775 	}
2776 	if (!mlxcx_cmd_alloc_tdom(mlxp, &mlxp->mlx_tdom)) {
2777 		goto err;
2778 	}
2779 	/*
2780 	 * Fetch the "reserved" lkey that lets us give linear addresses in
2781 	 * work queue entries, rather than having to mess with the NIC's
2782 	 * internal MMU.
2783 	 */
2784 	if (!mlxcx_cmd_query_special_ctxs(mlxp)) {
2785 		goto err;
2786 	}
2787 
2788 	/*
2789 	 * Query our port information and current state, populate the
2790 	 * mlxcx_port_t structs.
2791 	 *
2792 	 * This also sets up the root flow tables and flow groups.
2793 	 */
2794 	if (!mlxcx_setup_ports(mlxp)) {
2795 		goto err;
2796 	}
2797 	mlxp->mlx_attach |= MLXCX_ATTACH_PORTS;
2798 
2799 	mlxcx_load_model_props(mlxp);
2800 
2801 	/*
2802 	 * Set up, enable and arm the rest of the interrupt EQs which will
2803 	 * service events from CQs.
2804 	 *
2805 	 * The MLXCX_ATTACH_INTRS flag covers checking if these need to be
2806 	 * cleaned up.
2807 	 */
2808 	if (!mlxcx_setup_eqs(mlxp)) {
2809 		goto err;
2810 	}
2811 
2812 	/* Completion queues */
2813 	list_create(&mlxp->mlx_cqs, sizeof (mlxcx_completion_queue_t),
2814 	    offsetof(mlxcx_completion_queue_t, mlcq_entry));
2815 	mlxp->mlx_attach |= MLXCX_ATTACH_CQS;
2816 
2817 	/* Work queues (send queues, receive queues) */
2818 	list_create(&mlxp->mlx_wqs, sizeof (mlxcx_work_queue_t),
2819 	    offsetof(mlxcx_work_queue_t, mlwq_entry));
2820 	mlxp->mlx_attach |= MLXCX_ATTACH_WQS;
2821 
2822 	/*
2823 	 * Construct our arrays of mlxcx_ring_group_ts, which represent the
2824 	 * "groups" we advertise to MAC.
2825 	 */
2826 	mlxp->mlx_rx_ngroups = mlxcx_calc_rx_ngroups(mlxp);
2827 	mlxp->mlx_rx_groups_size = mlxp->mlx_rx_ngroups *
2828 	    sizeof (mlxcx_ring_group_t);
2829 	mlxp->mlx_rx_groups = kmem_zalloc(mlxp->mlx_rx_groups_size, KM_SLEEP);
2830 
2831 	mlxp->mlx_tx_ngroups = mlxp->mlx_props.mldp_tx_ngroups;
2832 	mlxp->mlx_tx_groups_size = mlxp->mlx_tx_ngroups *
2833 	    sizeof (mlxcx_ring_group_t);
2834 	mlxp->mlx_tx_groups = kmem_zalloc(mlxp->mlx_tx_groups_size, KM_SLEEP);
2835 
2836 	mlxp->mlx_attach |= MLXCX_ATTACH_GROUPS;
2837 
2838 	/*
2839 	 * Sets up the free/busy buffers list for keeping track of packet
2840 	 * buffers.
2841 	 */
2842 	if (!mlxcx_setup_bufs(mlxp))
2843 		goto err;
2844 	mlxp->mlx_attach |= MLXCX_ATTACH_BUFS;
2845 
2846 	/*
2847 	 * Before we tell MAC about our rings/groups, we need to do enough
2848 	 * setup on them to be sure about the numbers and configuration that
2849 	 * we have. This will do basically everything short of allocating
2850 	 * packet buffers and starting the rings up.
2851 	 */
2852 	for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) {
2853 		if (!mlxcx_tx_group_setup(mlxp, &mlxp->mlx_tx_groups[i]))
2854 			goto err;
2855 	}
2856 	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
2857 		if (!mlxcx_rx_group_setup(mlxp, &mlxp->mlx_rx_groups[i]))
2858 			goto err;
2859 	}
2860 
2861 	/*
2862 	 * Set up periodic fault check timers which check the queue states,
2863 	 * set up should be after all the queues have been initialized and
2864 	 * consequently the teardown of timers must happen before
2865 	 * queue teardown.
2866 	 */
2867 	if (!mlxcx_setup_checktimers(mlxp)) {
2868 		goto err;
2869 	}
2870 	mlxp->mlx_attach |= MLXCX_ATTACH_CHKTIMERS;
2871 
2872 	/*
2873 	 * Finally, tell MAC that we exist!
2874 	 */
2875 	if (!mlxcx_register_mac(mlxp)) {
2876 		goto err;
2877 	}
2878 	mlxp->mlx_attach |= MLXCX_ATTACH_MAC_HDL;
2879 
2880 	return (DDI_SUCCESS);
2881 
2882 err:
2883 	mlxcx_teardown(mlxp);
2884 	return (DDI_FAILURE);
2885 }
2886 
/*
 * Character/block device entry points. mlxcx is a pure GLDv3 NIC driver and
 * exposes no meaningful cdev interface of its own, so every I/O entry point
 * is stubbed out with nulldev/nodev; MAC fills in what it needs when we call
 * mac_init_ops() in _init().
 */
static struct cb_ops mlxcx_cb_ops = {
	.cb_open = nulldev,
	.cb_close = nulldev,
	.cb_strategy = nodev,
	.cb_print = nodev,
	.cb_dump = nodev,
	.cb_read = nodev,
	.cb_write = nodev,
	.cb_ioctl = nodev,
	.cb_devmap = nodev,
	.cb_mmap = nodev,
	.cb_segmap = nodev,
	.cb_chpoll = nochpoll,
	.cb_prop_op = ddi_prop_op,
	.cb_flag = D_MP,	/* driver is fully MT-safe */
	.cb_rev = CB_REV,
	.cb_aread = nodev,
	.cb_awrite = nodev
};
2906 
/*
 * Device operations vector. Only attach/detach are provided here; the
 * getinfo/probe/identify slots are left as stubs (mac_init_ops() adjusts
 * this structure for GLDv3 use before mod_install()).
 *
 * NOTE(review): devo_quiesce is ddi_quiesce_not_supported, so fast reboot
 * will fall back to a full reset path on systems with this device.
 */
static struct dev_ops mlxcx_dev_ops = {
	.devo_rev = DEVO_REV,
	.devo_refcnt = 0,
	.devo_getinfo = NULL,
	.devo_identify = nulldev,
	.devo_probe = nulldev,
	.devo_attach = mlxcx_attach,
	.devo_detach = mlxcx_detach,
	.devo_reset = nodev,
	.devo_quiesce = ddi_quiesce_not_supported,
	.devo_cb_ops = &mlxcx_cb_ops
};
2919 
/* Loadable driver module description, reported by modinfo(8) via _info(). */
static struct modldrv mlxcx_modldrv = {
	.drv_modops = &mod_driverops,
	.drv_linkinfo = "Mellanox Connect-X 4/5/6",
	.drv_dev_ops = &mlxcx_dev_ops
};
2925 
/* Module linkage: a single driver module, NULL-terminated list. */
static struct modlinkage mlxcx_modlinkage = {
	.ml_rev = MODREV_1,
	.ml_linkage = { &mlxcx_modldrv, NULL }
};
2930 
2931 int
2932 _init(void)
2933 {
2934 	int ret;
2935 
2936 	ret = ddi_soft_state_init(&mlxcx_softstate, sizeof (mlxcx_t), 0);
2937 	if (ret != 0) {
2938 		return (ret);
2939 	}
2940 
2941 	mac_init_ops(&mlxcx_dev_ops, MLXCX_MODULE_NAME);
2942 
2943 	if ((ret = mod_install(&mlxcx_modlinkage)) != DDI_SUCCESS) {
2944 		mac_fini_ops(&mlxcx_dev_ops);
2945 		ddi_soft_state_fini(&mlxcx_softstate);
2946 		return (ret);
2947 	}
2948 
2949 	return (DDI_SUCCESS);
2950 }
2951 
/* Module information entry point: delegate to mod_info(9F). */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&mlxcx_modlinkage, modinfop));
}
2957 
2958 int
2959 _fini(void)
2960 {
2961 	int ret;
2962 
2963 	if ((ret = mod_remove(&mlxcx_modlinkage)) != DDI_SUCCESS) {
2964 		return (ret);
2965 	}
2966 
2967 	mac_fini_ops(&mlxcx_dev_ops);
2968 
2969 	ddi_soft_state_fini(&mlxcx_softstate);
2970 
2971 	return (DDI_SUCCESS);
2972 }
2973