xref: /illumos-gate/usr/src/uts/common/io/ib/clients/ibd/ibd.c (revision 581cede61ac9c14d8d4ea452562a567189eead78)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * An implementation of the IPoIB standard based on PSARC 2001/289.
29  */
30 
31 #include <sys/types.h>
32 #include <sys/conf.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/modctl.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strsun.h>
39 #include <sys/strsubr.h>
40 #include <sys/dlpi.h>
41 #include <sys/mac_provider.h>
42 
43 #include <sys/pattr.h>		/* for HCK_FULLCKSUM */
44 #include <sys/sysmacros.h>	/* for offsetof */
45 #include <sys/disp.h>		/* for async thread pri */
46 #include <sys/atomic.h>		/* for atomic_add*() */
47 #include <sys/ethernet.h>	/* for ETHERTYPE_IP */
48 #include <netinet/in.h>		/* for netinet/ip.h below */
49 #include <netinet/ip.h>		/* for struct ip */
50 #include <netinet/udp.h>	/* for struct udphdr */
51 #include <inet/common.h>	/* for inet/ip.h below */
52 #include <inet/ip.h>		/* for ipha_t */
53 #include <inet/ip_if.h>		/* for IP6_DL_SAP */
54 #include <inet/ip6.h>		/* for ip6_t */
55 #include <inet/tcp.h>		/* for tcph_t */
56 #include <netinet/icmp6.h>	/* for icmp6_t */
57 #include <sys/callb.h>
58 #include <sys/modhash.h>
59 
60 #include <sys/ib/clients/ibd/ibd.h>
61 #include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
62 #include <sys/note.h>
63 #include <sys/multidata.h>
64 
65 #include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */
66 
67 /*
68  * Per-interface tunables
69  *
70  * ibd_tx_copy_thresh
71  *     This sets the threshold at which ibd will attempt to do a bcopy of the
72  *     outgoing data into a pre-mapped buffer. The IPoIB driver's send behavior
73  *     is restricted by various parameters, so setting of this value must be
74  *     made after careful considerations only.  For instance, IB HCAs currently
75  *     impose a relatively small limit (when compared to ethernet NICs) on the
76  *     length of the SGL for transmit. On the other hand, the ip stack could
77  *     send down mp chains that are quite long when LSO is enabled.
78  *
79  * ibd_num_swqe
80  *     Number of "send WQE" elements that will be allocated and used by ibd.
81  *     When tuning this parameter, the size of pre-allocated, pre-mapped copy
82  *     buffer in each of these send wqes must be taken into account. This
83  *     copy buffer size is determined by the value of IBD_TX_BUF_SZ (this is
84  *     currently set to the same value of ibd_tx_copy_thresh, but may be
85  *     changed independently if needed).
86  *
87  * ibd_num_rwqe
88  *     Number of "receive WQE" elements that will be allocated and used by
89  *     ibd. This parameter is limited by the maximum channel size of the HCA.
90  *     Each buffer in the receive wqe will be of MTU size.
91  *
92  * ibd_num_lso_bufs
93  *     Number of "larger-than-MTU" copy buffers to use for cases when the
94  *     outgoing mblk chain is too fragmented to be used with ibt_map_mem_iov()
95  *     and too large to be used with regular MTU-sized copy buffers. It is
96  *     not recommended to tune this variable without understanding the
97  *     application environment and/or memory resources. The size of each of
98  *     these lso buffers is determined by the value of IBD_LSO_BUFSZ.
99  *
100  * ibd_num_ah
101  *     Number of AH cache entries to allocate
102  *
103  * ibd_hash_size
104  *     Hash table size for the active AH list
105  *
106  * ibd_separate_cqs
107  * ibd_txcomp_poll
108  *     These boolean variables (1 or 0) may be used to tune the behavior of
109  *     ibd in managing the send and receive completion queues and in deciding
110  *     whether or not transmit completions should be polled or interrupt
111  *     driven (when the completion queues are separate). If both the completion
112  *     queues are interrupt driven, it may not be possible for the handlers to
113  *     be invoked concurrently, depending on how the interrupts are tied on
114  *     the PCI intr line.  Note that some combination of these two parameters
115  *     may not be meaningful (and therefore not allowed).
116  *
117  * ibd_tx_softintr
118  * ibd_rx_softintr
119  *     The softintr mechanism allows ibd to avoid event queue overflows if
120  *     the receive/completion handlers are to be expensive. These are enabled
121  *     by default.
122  *
123  * ibd_log_sz
124  *     This specifies the size of the ibd log buffer in bytes. The buffer is
125  *     allocated and logging is enabled only when IBD_LOGGING is defined.
126  *
127  */
128 uint_t ibd_tx_copy_thresh = 0x1000;
129 uint_t ibd_num_swqe = 4000;
130 uint_t ibd_num_rwqe = 4000;
131 uint_t ibd_num_lso_bufs = 0x400;
132 uint_t ibd_num_ah = 64;
133 uint_t ibd_hash_size = 32;
134 uint_t ibd_separate_cqs = 1;
135 uint_t ibd_txcomp_poll = 0;
136 uint_t ibd_rx_softintr = 1;
137 uint_t ibd_tx_softintr = 1;
138 uint_t ibd_create_broadcast_group = 1;
139 uint_t ibd_force_lso_disable = 1;
140 #ifdef IBD_LOGGING
141 uint_t ibd_log_sz = 0x20000;
142 #endif
143 
144 #define	IBD_TX_COPY_THRESH		ibd_tx_copy_thresh
145 #define	IBD_TX_BUF_SZ			ibd_tx_copy_thresh
146 #define	IBD_NUM_SWQE			ibd_num_swqe
147 #define	IBD_NUM_RWQE			ibd_num_rwqe
148 #define	IBD_NUM_LSO_BUFS		ibd_num_lso_bufs
149 #define	IBD_NUM_AH			ibd_num_ah
150 #define	IBD_HASH_SIZE			ibd_hash_size
151 #ifdef IBD_LOGGING
152 #define	IBD_LOG_SZ			ibd_log_sz
153 #endif
154 
155 /*
156  * Receive CQ moderation parameters: NOT tunables
157  */
158 static uint_t ibd_rxcomp_count = 4;
159 static uint_t ibd_rxcomp_usec = 10;
160 
161 /*
162  * Send CQ moderation parameters: NOT tunables
163  */
164 #define	IBD_TXCOMP_COUNT		10
165 #define	IBD_TXCOMP_USEC			300
166 
167 /*
168  * Thresholds
169  *
170  * When waiting for resources (swqes or lso buffers) to become available,
171  * the first two thresholds below determine how long to wait before informing
172  * the network layer to start sending packets again. The IBD_TX_POLL_THRESH
173  * determines how low the available swqes should go before we start polling
174  * the completion queue.
175  */
176 #define	IBD_FREE_LSOS_THRESH		8
177 #define	IBD_FREE_SWQES_THRESH		20
178 #define	IBD_TX_POLL_THRESH		80
179 
180 /*
181  * When doing multiple-send-wr or multiple-recv-wr posts, this value
182  * determines how many to do at a time (in a single ibt_post_send/recv).
183  */
184 #define	IBD_MAX_POST_MULTIPLE		4
185 
186 /*
187  * Maximum length for returning chained mps back to crossbow
188  */
189 #define	IBD_MAX_RX_MP_LEN		16
190 
191 /*
192  * LSO parameters
193  */
194 #define	IBD_LSO_MAXLEN			65536
195 #define	IBD_LSO_BUFSZ			8192
196 #define	IBD_PROP_LSO_POLICY		"lso-policy"
197 
198 /*
199  * Completion queue polling control
200  */
201 #define	IBD_RX_CQ_POLLING		0x1
202 #define	IBD_TX_CQ_POLLING		0x2
203 #define	IBD_REDO_RX_CQ_POLLING		0x4
204 #define	IBD_REDO_TX_CQ_POLLING		0x8
205 
206 /*
207  * Flag bits for resources to reap
208  */
209 #define	IBD_RSRC_SWQE			0x1
210 #define	IBD_RSRC_LSOBUF			0x2
211 
212 /*
213  * Async operation types
214  */
215 #define	IBD_ASYNC_GETAH			1
216 #define	IBD_ASYNC_JOIN			2
217 #define	IBD_ASYNC_LEAVE			3
218 #define	IBD_ASYNC_PROMON		4
219 #define	IBD_ASYNC_PROMOFF		5
220 #define	IBD_ASYNC_REAP			6
221 #define	IBD_ASYNC_TRAP			7
222 #define	IBD_ASYNC_SCHED			8
223 #define	IBD_ASYNC_LINK			9
224 #define	IBD_ASYNC_EXIT			10
225 
226 /*
227  * Async operation states
228  */
229 #define	IBD_OP_NOTSTARTED		0
230 #define	IBD_OP_ONGOING			1
231 #define	IBD_OP_COMPLETED		2
232 #define	IBD_OP_ERRORED			3
233 #define	IBD_OP_ROUTERED			4
234 
235 /*
236  * State of IBD driver initialization during attach/m_start
237  */
238 #define	IBD_DRV_STATE_INITIALIZED	0x00001
239 #define	IBD_DRV_RXINTR_ADDED		0x00002
240 #define	IBD_DRV_TXINTR_ADDED		0x00004
241 #define	IBD_DRV_IBTL_ATTACH_DONE	0x00008
242 #define	IBD_DRV_HCA_OPENED		0x00010
243 #define	IBD_DRV_PD_ALLOCD		0x00020
244 #define	IBD_DRV_MAC_REGISTERED		0x00040
245 #define	IBD_DRV_PORT_DETAILS_OBTAINED	0x00080
246 #define	IBD_DRV_BCAST_GROUP_FOUND	0x00100
247 #define	IBD_DRV_ACACHE_INITIALIZED	0x00200
248 #define	IBD_DRV_CQS_ALLOCD		0x00400
249 #define	IBD_DRV_UD_CHANNEL_SETUP	0x00800
250 #define	IBD_DRV_TXLIST_ALLOCD		0x01000
251 #define	IBD_DRV_SCQ_NOTIFY_ENABLED	0x02000
252 #define	IBD_DRV_RXLIST_ALLOCD		0x04000
253 #define	IBD_DRV_BCAST_GROUP_JOINED	0x08000
254 #define	IBD_DRV_ASYNC_THR_CREATED	0x10000
255 #define	IBD_DRV_RCQ_NOTIFY_ENABLED	0x20000
256 #define	IBD_DRV_SM_NOTICES_REGISTERED	0x40000
257 #define	IBD_DRV_STARTED			0x80000
258 
259 /*
260  * Start/stop in-progress flags; note that restart must always remain
261  * the OR of start and stop flag values.
262  */
263 #define	IBD_DRV_START_IN_PROGRESS	0x10000000
264 #define	IBD_DRV_STOP_IN_PROGRESS	0x20000000
265 #define	IBD_DRV_RESTART_IN_PROGRESS	0x30000000
266 
267 /*
268  * Miscellaneous constants
269  */
270 #define	IBD_SEND			0
271 #define	IBD_RECV			1
272 #define	IB_MGID_IPV4_LOWGRP_MASK	0xFFFFFFFF
273 #define	IBD_DEF_MAX_SDU			2044
274 #define	IBD_DEFAULT_QKEY		0xB1B
275 #ifdef IBD_LOGGING
276 #define	IBD_DMAX_LINE			100
277 #endif
278 
279 /*
280  * Enumerations for link states
281  */
282 typedef enum {
283 	IBD_LINK_DOWN,
284 	IBD_LINK_UP,
285 	IBD_LINK_UP_ABSENT
286 } ibd_link_op_t;
287 
288 /*
289  * Driver State Pointer
290  */
291 void *ibd_list;
292 
293 /*
294  * Logging
295  */
296 #ifdef IBD_LOGGING
297 kmutex_t ibd_lbuf_lock;
298 uint8_t *ibd_lbuf;
299 uint32_t ibd_lbuf_ndx;
300 #endif
301 
302 /*
303  * Required system entry points
304  */
305 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
306 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
307 
308 /*
309  * Required driver entry points for GLDv3
310  */
311 static int ibd_m_stat(void *, uint_t, uint64_t *);
312 static int ibd_m_start(void *);
313 static void ibd_m_stop(void *);
314 static int ibd_m_promisc(void *, boolean_t);
315 static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
316 static int ibd_m_unicst(void *, const uint8_t *);
317 static mblk_t *ibd_m_tx(void *, mblk_t *);
318 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
319 
320 /*
321  * Private driver entry points for GLDv3
322  */
323 
324 /*
325  * Initialization
326  */
327 static int ibd_state_init(ibd_state_t *, dev_info_t *);
328 static int ibd_init_txlist(ibd_state_t *);
329 static int ibd_init_rxlist(ibd_state_t *);
330 static int ibd_acache_init(ibd_state_t *);
331 #ifdef IBD_LOGGING
332 static void ibd_log_init(void);
333 #endif
334 
335 /*
336  * Termination/cleanup
337  */
338 static void ibd_state_fini(ibd_state_t *);
339 static void ibd_fini_txlist(ibd_state_t *);
340 static void ibd_fini_rxlist(ibd_state_t *);
341 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
342 static void ibd_acache_fini(ibd_state_t *);
343 #ifdef IBD_LOGGING
344 static void ibd_log_fini(void);
345 #endif
346 
347 /*
348  * Allocation/acquire/map routines
349  */
350 static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **, int, ibt_lkey_t);
351 static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **);
352 static int ibd_alloc_tx_copybufs(ibd_state_t *);
353 static int ibd_alloc_tx_lsobufs(ibd_state_t *);
354 static int ibd_acquire_swqe(ibd_state_t *, ibd_swqe_t **);
355 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
356     uint32_t *);
357 
358 /*
359  * Free/release/unmap routines
360  */
361 static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *);
362 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
363 static void ibd_delete_rwqe(ibd_state_t *, ibd_rwqe_t *);
364 static void ibd_free_tx_copybufs(ibd_state_t *);
365 static void ibd_free_tx_lsobufs(ibd_state_t *);
366 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *);
367 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
368 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
369 static void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *);
370 
371 /*
372  * Handlers/callback routines
373  */
374 static uint_t ibd_intr(char *);
375 static uint_t ibd_tx_recycle(char *);
376 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
377 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
378 static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t);
379 static uint_t ibd_drain_cq(ibd_state_t *, ibt_cq_hdl_t, ibt_wc_t *, uint_t);
380 static void ibd_freemsg_cb(char *);
381 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
382     ibt_async_event_t *);
383 static void ibd_snet_notices_handler(void *, ib_gid_t,
384     ibt_subnet_event_code_t, ibt_subnet_event_t *);
385 
386 /*
387  * Send/receive routines
388  */
389 static boolean_t ibd_send(ibd_state_t *, mblk_t *);
390 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
391 static int ibd_post_recv(ibd_state_t *, ibd_rwqe_t *, boolean_t);
392 static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
393 static void ibd_flush_rx(ibd_state_t *, mblk_t *);
394 
395 /*
396  * Threads
397  */
398 static void ibd_async_work(ibd_state_t *);
399 
400 /*
401  * Async tasks
402  */
403 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
404 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
405 static void ibd_async_setprom(ibd_state_t *);
406 static void ibd_async_unsetprom(ibd_state_t *);
407 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
408 static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
409 static void ibd_async_txsched(ibd_state_t *);
410 static void ibd_async_link(ibd_state_t *, ibd_req_t *);
411 
412 /*
413  * Async task helpers
414  */
415 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
416 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
417 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
418 static boolean_t ibd_get_allroutergroup(ibd_state_t *,
419     ipoib_mac_t *, ipoib_mac_t *);
420 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
421 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
422 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
423 static ibt_status_t ibd_find_bgroup(ibd_state_t *);
424 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
425 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
426 static uint64_t ibd_get_portspeed(ibd_state_t *);
427 static boolean_t ibd_async_safe(ibd_state_t *);
428 static void ibd_async_done(ibd_state_t *);
429 static ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int);
430 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
431 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
432 static boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t);
433 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
434 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);
435 
436 /*
437  * Helpers for attach/start routines
438  */
439 static int ibd_register_mac(ibd_state_t *, dev_info_t *);
440 static int ibd_record_capab(ibd_state_t *, dev_info_t *);
441 static int ibd_unattach(ibd_state_t *, dev_info_t *);
442 static int ibd_get_port_details(ibd_state_t *);
443 static int ibd_alloc_cqs(ibd_state_t *);
444 static int ibd_setup_ud_channel(ibd_state_t *);
445 static int ibd_start(ibd_state_t *);
446 static int ibd_undo_start(ibd_state_t *, link_state_t);
447 static void ibd_set_mac_progress(ibd_state_t *, uint_t);
448 static void ibd_clr_mac_progress(ibd_state_t *, uint_t);
449 
450 
451 /*
452  * Miscellaneous helpers
453  */
454 static int ibd_sched_poll(ibd_state_t *, int, int);
455 static void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int);
456 static int ibd_resume_transmission(ibd_state_t *);
457 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
458 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
459 static void *list_get_head(list_t *);
460 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
461 static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
462 static void ibd_print_warn(ibd_state_t *, char *, ...);
463 #ifdef IBD_LOGGING
464 static void ibd_log(const char *, ...);
465 #endif
466 
467 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
468     nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);
469 
470 /* Module Driver Info */
471 static struct modldrv ibd_modldrv = {
472 	&mod_driverops,			/* This one is a driver */
473 	"InfiniBand GLDv3 Driver",	/* short description */
474 	&ibd_dev_ops			/* driver specific ops */
475 };
476 
477 /* Module Linkage */
478 static struct modlinkage ibd_modlinkage = {
479 	MODREV_1, (void *)&ibd_modldrv, NULL
480 };
481 
482 /*
483  * Module (static) info passed to IBTL during ibt_attach
484  */
485 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
486 	IBTI_V_CURR,
487 	IBT_NETWORK,
488 	ibd_async_handler,
489 	NULL,
490 	"IPIB"
491 };
492 
493 /*
494  * GLDv3 entry points
495  */
496 #define	IBD_M_CALLBACK_FLAGS	(MC_GETCAPAB)
static mac_callbacks_t ibd_m_callbacks = {
	IBD_M_CALLBACK_FLAGS,	/* mc_callbacks: optional entry points present */
	ibd_m_stat,		/* mc_getstat */
	ibd_m_start,		/* mc_start */
	ibd_m_stop,		/* mc_stop */
	ibd_m_promisc,		/* mc_setpromisc */
	ibd_m_multicst,		/* mc_multicst */
	ibd_m_unicst,		/* mc_unicst */
	ibd_m_tx,		/* mc_tx */
	NULL,			/* mc_ioctl: none -- presumably no private ioctls; confirm against mac_callbacks(9S) */
	ibd_m_getcapab		/* mc_getcapab (enabled via MC_GETCAPAB above) */
};
509 
510 /*
511  * Fill/clear <scope> and <p_key> in multicast/broadcast address
512  */
513 #define	IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)		\
514 {							\
515 	*(uint32_t *)((char *)(maddr) + 4) |=		\
516 	    htonl((uint32_t)(scope) << 16);		\
517 	*(uint32_t *)((char *)(maddr) + 8) |=		\
518 	    htonl((uint32_t)(pkey) << 16);		\
519 }
520 
521 #define	IBD_CLEAR_SCOPE_PKEY(maddr)			\
522 {							\
523 	*(uint32_t *)((char *)(maddr) + 4) &=		\
524 	    htonl(~((uint32_t)0xF << 16));		\
525 	*(uint32_t *)((char *)(maddr) + 8) &=		\
526 	    htonl(~((uint32_t)0xFFFF << 16));		\
527 }
528 
529 /*
530  * Rudimentary debugging support
531  */
532 #ifdef DEBUG
533 int ibd_debuglevel = 100;
534 static void
535 debug_print(int l, char *fmt, ...)
536 {
537 	va_list ap;
538 
539 	if (l < ibd_debuglevel)
540 		return;
541 	va_start(ap, fmt);
542 	vcmn_err(CE_CONT, fmt, ap);
543 	va_end(ap);
544 }
545 #define	DPRINT		debug_print
546 #else
547 #define	DPRINT
548 #endif
549 
550 /*
551  * Common routine to print warning messages; adds in hca guid, port number
552  * and pkey to be able to identify the IBA interface.
553  */
554 static void
555 ibd_print_warn(ibd_state_t *state, char *fmt, ...)
556 {
557 	ib_guid_t hca_guid;
558 	char ibd_print_buf[256];
559 	int len;
560 	va_list ap;
561 
562 	hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
563 	    0, "hca-guid", 0);
564 	len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
565 	    "%s%d: HCA GUID %016llx port %d PKEY %02x ",
566 	    ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
567 	    (u_longlong_t)hca_guid, state->id_port, state->id_pkey);
568 	va_start(ap, fmt);
569 	(void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
570 	    fmt, ap);
571 	cmn_err(CE_NOTE, "!%s", ibd_print_buf);
572 	va_end(ap);
573 }
574 
575 /*
576  * Warlock directives
577  */
578 
579 /*
580  * id_lso_lock
581  *
582  * state->id_lso->bkt_nfree may be accessed without a lock to
583  * determine the threshold at which we have to ask the nw layer
584  * to resume transmission (see ibd_resume_transmission()).
585  */
586 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
587     ibd_state_t::id_lso))
588 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
589 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))
590 
591 /*
592  * id_cq_poll_lock
593  */
594 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_cq_poll_lock,
595     ibd_state_t::id_cq_poll_busy))
596 
597 /*
598  * id_txpost_lock
599  */
600 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
601     ibd_state_t::id_tx_head))
602 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
603     ibd_state_t::id_tx_busy))
604 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
605     ibd_state_t::id_tx_tailp))
606 
607 /*
608  * id_rxpost_lock
609  */
610 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock,
611     ibd_state_t::id_rx_head))
612 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock,
613     ibd_state_t::id_rx_busy))
614 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock,
615     ibd_state_t::id_rx_tailp))
616 
617 /*
618  * id_acache_req_lock
619  */
620 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
621     ibd_state_t::id_acache_req_cv))
622 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
623     ibd_state_t::id_req_list))
624 
625 /*
626  * id_ac_mutex
627  *
628  * This mutex is actually supposed to protect id_ah_op as well,
629  * but this path of the code isn't clean (see update of id_ah_op
630  * in ibd_async_acache(), immediately after the call to
631  * ibd_async_mcache()). For now, we'll skip this check by
632  * declaring that id_ah_op is protected by some internal scheme
633  * that warlock isn't aware of.
634  */
635 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
636     ibd_state_t::id_ah_active))
637 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
638     ibd_state_t::id_ah_free))
639 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
640     ibd_state_t::id_ah_addr))
641 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this",
642     ibd_state_t::id_ah_op))
643 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
644     ibd_state_t::id_ah_error))
645 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))
646 
647 /*
648  * id_mc_mutex
649  */
650 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
651     ibd_state_t::id_mc_full))
652 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
653     ibd_state_t::id_mc_non))
654 
655 /*
656  * id_trap_lock
657  */
658 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
659     ibd_state_t::id_trap_cv))
660 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
661     ibd_state_t::id_trap_stop))
662 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
663     ibd_state_t::id_trap_inprog))
664 
665 /*
666  * id_prom_op
667  */
668 _NOTE(SCHEME_PROTECTS_DATA("only by async thread",
669     ibd_state_t::id_prom_op))
670 
671 /*
672  * id_sched_lock
673  */
674 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock,
675     ibd_state_t::id_sched_needed))
676 
677 /*
678  * id_link_mutex
679  */
680 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
681     ibd_state_t::id_link_state))
682 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
683 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
684     ibd_state_t::id_link_speed))
685 
686 /*
687  * id_tx_list.dl_mutex
688  */
689 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
690     ibd_state_t::id_tx_list.dl_head))
691 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
692     ibd_state_t::id_tx_list.dl_tail))
693 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
694     ibd_state_t::id_tx_list.dl_pending_sends))
695 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
696     ibd_state_t::id_tx_list.dl_cnt))
697 
698 /*
699  * id_rx_list.dl_mutex
700  */
701 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex,
702     ibd_state_t::id_rx_list.dl_head))
703 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex,
704     ibd_state_t::id_rx_list.dl_tail))
705 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
706     ibd_state_t::id_rx_list.dl_bufs_outstanding))
707 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
708     ibd_state_t::id_rx_list.dl_cnt))
709 
710 
711 /*
712  * Items protected by atomic updates
713  */
714 _NOTE(SCHEME_PROTECTS_DATA("atomic update only",
715     ibd_state_s::id_brd_rcv
716     ibd_state_s::id_brd_xmt
717     ibd_state_s::id_multi_rcv
718     ibd_state_s::id_multi_xmt
719     ibd_state_s::id_num_intrs
720     ibd_state_s::id_rcv_bytes
721     ibd_state_s::id_rcv_pkt
722     ibd_state_s::id_tx_short
723     ibd_state_s::id_xmt_bytes
724     ibd_state_s::id_xmt_pkt))
725 
726 /*
727  * Non-mutex protection schemes for data elements. Almost all of
728  * these are non-shared items.
729  */
730 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded",
731     callb_cpr
732     ib_gid_s
733     ib_header_info
734     ibd_acache_rq
735     ibd_acache_s::ac_mce
736     ibd_mcache::mc_fullreap
737     ibd_mcache::mc_jstate
738     ibd_mcache::mc_req
739     ibd_rwqe_s
740     ibd_swqe_s
741     ibd_wqe_s
742     ibt_wr_ds_s::ds_va
743     ibt_wr_lso_s
744     ipoib_mac::ipoib_qpn
745     mac_capab_lso_s
746     msgb::b_next
747     msgb::b_rptr
748     msgb::b_wptr))
749 
/*
 * Loadable-module load entry point: validate tunables, create the
 * soft-state anchor and register the driver with the MAC layer.
 * Ordering matters: mac_init_ops() must precede mod_install(), and
 * the failure path below mirrors the successful setup steps.
 */
int
_init()
{
	int status;

	/*
	 * Sanity check some parameter settings. Tx completion polling
	 * only makes sense with separate CQs for Tx and Rx.
	 */
	if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) {
		cmn_err(CE_NOTE, "!ibd: %s",
		    "Setting ibd_txcomp_poll = 0 for combined CQ");
		ibd_txcomp_poll = 0;
	}

	/* Anchor for all per-instance ibd_state_t allocations */
	status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0);
	if (status != 0) {
		DPRINT(10, "_init:failed in ddi_soft_state_init()");
		return (status);
	}

	mac_init_ops(&ibd_dev_ops, "ibd");
	status = mod_install(&ibd_modlinkage);
	if (status != 0) {
		DPRINT(10, "_init:failed in mod_install()");
		/* Undo the two setup steps above before bailing out */
		ddi_soft_state_fini(&ibd_list);
		mac_fini_ops(&ibd_dev_ops);
		return (status);
	}

#ifdef IBD_LOGGING
	ibd_log_init();
#endif
	return (0);
}
785 
/*
 * Loadable-module information entry point; reports module linkage
 * details to the system.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&ibd_modlinkage, modinfop));
}
791 
792 int
793 _fini()
794 {
795 	int status;
796 
797 	status = mod_remove(&ibd_modlinkage);
798 	if (status != 0)
799 		return (status);
800 
801 	mac_fini_ops(&ibd_dev_ops);
802 	ddi_soft_state_fini(&ibd_list);
803 #ifdef IBD_LOGGING
804 	ibd_log_fini();
805 #endif
806 	return (0);
807 }
808 
809 /*
810  * Convert the GID part of the mac address from network byte order
811  * to host order.
812  */
813 static void
814 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
815 {
816 	ib_sn_prefix_t nbopref;
817 	ib_guid_t nboguid;
818 
819 	bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
820 	bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
821 	dgid->gid_prefix = b2h64(nbopref);
822 	dgid->gid_guid = b2h64(nboguid);
823 }
824 
825 /*
826  * Create the IPoIB address in network byte order from host order inputs.
827  */
828 static void
829 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
830     ib_guid_t guid)
831 {
832 	ib_sn_prefix_t nbopref;
833 	ib_guid_t nboguid;
834 
835 	mac->ipoib_qpn = htonl(qpn);
836 	nbopref = h2b64(prefix);
837 	nboguid = h2b64(guid);
838 	bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
839 	bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
840 }
841 
842 /*
843  * Send to the appropriate all-routers group when the IBA multicast group
844  * does not exist, based on whether the target group is v4 or v6.
845  */
846 static boolean_t
847 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
848     ipoib_mac_t *rmac)
849 {
850 	boolean_t retval = B_TRUE;
851 	uint32_t adjscope = state->id_scope << 16;
852 	uint32_t topword;
853 
854 	/*
855 	 * Copy the first 4 bytes in without assuming any alignment of
856 	 * input mac address; this will have IPoIB signature, flags and
857 	 * scope bits.
858 	 */
859 	bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
860 	topword = ntohl(topword);
861 
862 	/*
863 	 * Generate proper address for IPv4/v6, adding in the Pkey properly.
864 	 */
865 	if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
866 	    (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
867 		ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
868 		    ((uint32_t)(state->id_pkey << 16))),
869 		    (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
870 	else
871 		/*
872 		 * Does not have proper bits in the mgid address.
873 		 */
874 		retval = B_FALSE;
875 
876 	return (retval);
877 }
878 
879 /*
880  * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at
881  * front of optional src/tgt link layer address. Right now Solaris inserts
882  * padding by default at the end. The routine which does this is nce_xmit()
883  * in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. So when
884  * the packet comes down from IP layer to the IBD driver, it is in the
885  * following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T]
886  * This size is 2 bytes followed by [22 bytes of ipoib_machdr]. As a result
887  * machdr is not 4 byte aligned and has 2 bytes of padding at the end.
888  *
889  * The send routine at IBD driver changes this packet as follows:
890  * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
891  * followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte
892  * aligned.
893  *
894  * At the receiving side again ibd_process_rx takes the above packet and
895  * removes the two bytes of front padding and inserts it at the end. This
896  * is since the IP layer does not understand padding at the front.
897  */
898 #define	IBD_PAD_NSNA(ip6h, len, type) {					\
899 	uchar_t 	*nd_lla_ptr;					\
900 	icmp6_t 	*icmp6;						\
901 	nd_opt_hdr_t	*opt;						\
902 	int 		i;						\
903 									\
904 	icmp6 = (icmp6_t *)&ip6h[1];					\
905 	len -= sizeof (nd_neighbor_advert_t);				\
906 	if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) ||		\
907 	    (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) &&		\
908 	    (len != 0)) {						\
909 		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h			\
910 		    + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t));	\
911 		ASSERT(opt != NULL);					\
912 		nd_lla_ptr = (uchar_t *)&opt[1];			\
913 		if (type == IBD_SEND) {					\
914 			for (i = IPOIB_ADDRL; i > 0; i--)		\
915 				*(nd_lla_ptr + i + 1) =			\
916 				    *(nd_lla_ptr + i - 1);		\
917 		} else {						\
918 			for (i = 0; i < IPOIB_ADDRL; i++)		\
919 				*(nd_lla_ptr + i) =			\
920 				    *(nd_lla_ptr + i + 2);		\
921 		}							\
922 		*(nd_lla_ptr + i) = 0;					\
923 		*(nd_lla_ptr + i + 1) = 0;				\
924 	}								\
925 }
926 
927 /*
928  * Address handle entries maintained by the driver are kept in the
929  * free and active lists. Each entry starts out in the free list;
930  * it migrates to the active list when primed using ibt_get_paths()
931  * and ibt_modify_ud_dest() for transmission to a specific destination.
932  * In the active list, the entry has a reference count indicating the
933  * number of ongoing/uncompleted transmits that reference it. The
934  * entry is left in the active list even after the reference count
935  * goes to 0, since successive transmits can find it there and do
936  * not need to set up another entry (ie the path information is
937  * cached using the active list). Entries on the active list are
938  * also hashed using the destination link address as a key for faster
939  * lookups during transmits.
940  *
941  * For any destination address (unicast or multicast, whatever the
942  * join states), there will be at most one entry in the active list.
943  * Entries with a 0 reference count on the active list can be reused
944  * for a transmit to a new destination, if the free list is empty.
945  *
946  * The AH free list insertion/deletion is protected with the id_ac_mutex,
947  * since the async thread and Tx callback handlers insert/delete. The
948  * active list does not need a lock (all operations are done by the
949  * async thread) but updates to the reference count are atomically
950  * done (increments done by Tx path, decrements by the Tx callback handler).
951  */
/*
 * Insert an unused ace at the head of the free list.
 */
#define	IBD_ACACHE_INSERT_FREE(state, ce) \
	list_insert_head(&state->id_ah_free, ce)
/*
 * Pull an ace off the free list; evaluates to NULL when empty.
 */
#define	IBD_ACACHE_GET_FREE(state) \
	list_get_head(&state->id_ah_free)
/*
 * Put an ace on the active list and into the hash keyed by its
 * destination mac. The hash insert must succeed, since at most one
 * active entry exists per destination (ASSERTed).
 */
#define	IBD_ACACHE_INSERT_ACTIVE(state, ce) {			\
	int _ret_;						\
	list_insert_head(&state->id_ah_active, ce);		\
	_ret_ = mod_hash_insert(state->id_ah_active_hash,	\
	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
	ASSERT(_ret_ == 0);					\
}
/*
 * Take an ace off both the active list and the active hash.
 */
#define	IBD_ACACHE_PULLOUT_ACTIVE(state, ce) {			\
	list_remove(&state->id_ah_active, ce);			\
	(void) mod_hash_remove(state->id_ah_active_hash,	\
	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
}
/*
 * Detach the head ace from the active list. Note this does not
 * remove it from the hash (teardown path; see ibd_acache_fini()).
 */
#define	IBD_ACACHE_GET_ACTIVE(state) \
	list_get_head(&state->id_ah_active)
970 
971 /*
972  * Membership states for different mcg's are tracked by two lists:
973  * the "non" list is used for promiscuous mode, when all mcg traffic
974  * needs to be inspected. This type of membership is never used for
975  * transmission, so there can not be an AH in the active list
976  * corresponding to a member in this list. This list does not need
977  * any protection, since all operations are performed by the async
978  * thread.
979  *
980  * "Full" and "SendOnly" membership is tracked using a single list,
981  * the "full" list. This is because this single list can then be
982  * searched during transmit to a multicast group (if an AH for the
983  * mcg is not found in the active list), since at least one type
984  * of membership must be present before initiating the transmit.
985  * This list is also emptied during driver detach, since sendonly
986  * membership acquired during transmit is dropped at detach time
 * along with ipv4 broadcast full membership. Inserts/deletes to
988  * this list are done only by the async thread, but it is also
989  * searched in program context (see multicast disable case), thus
990  * the id_mc_mutex protects the list. The driver detach path also
991  * deconstructs the "full" list, but it ensures that the async
992  * thread will not be accessing the list (by blocking out mcg
993  * trap handling and making sure no more Tx reaping will happen).
994  *
995  * Currently, an IBA attach is done in the SendOnly case too,
996  * although this is not required.
997  */
/*
 * Track a FullMember or SendOnlyNonMember join on the "full" list
 * (protected by id_mc_mutex; see the block comment above).
 */
#define	IBD_MCACHE_INSERT_FULL(state, mce) \
	list_insert_head(&state->id_mc_full, mce)
/*
 * Track a NonMember (promiscuous mode) join on the "non" list.
 */
#define	IBD_MCACHE_INSERT_NON(state, mce) \
	list_insert_head(&state->id_mc_non, mce)
/*
 * Look up membership for mgid on the respective list; NULL if absent.
 */
#define	IBD_MCACHE_FIND_FULL(state, mgid) \
	ibd_mcache_find(mgid, &state->id_mc_full)
#define	IBD_MCACHE_FIND_NON(state, mgid) \
	ibd_mcache_find(mgid, &state->id_mc_non)
/*
 * Remove a membership entry from the respective list.
 */
#define	IBD_MCACHE_PULLOUT_FULL(state, mce) \
	list_remove(&state->id_mc_full, mce)
#define	IBD_MCACHE_PULLOUT_NON(state, mce) \
	list_remove(&state->id_mc_non, mce)
1010 
1011 /*
1012  * AH and MCE active list manipulation:
1013  *
1014  * Multicast disable requests and MCG delete traps are two cases
1015  * where the active AH entry for the mcg (if any unreferenced one exists)
1016  * will be moved to the free list (to force the next Tx to the mcg to
1017  * join the MCG in SendOnly mode). Port up handling will also move AHs
1018  * from active to free list.
1019  *
1020  * In the case when some transmits are still pending on an entry
1021  * for an mcg, but a multicast disable has already been issued on the
1022  * mcg, there are some options to consider to preserve the join state
1023  * to ensure the emitted packet is properly routed on the IBA fabric.
1024  * For the AH, we can
1025  * 1. take out of active list at multicast disable time.
1026  * 2. take out of active list only when last pending Tx completes.
1027  * For the MCE, we can
1028  * 3. take out of active list at multicast disable time.
1029  * 4. take out of active list only when last pending Tx completes.
1030  * 5. move from active list to stale list at multicast disable time.
1031  * We choose to use 2,4. We use option 4 so that if a multicast enable
1032  * is tried before the pending Tx completes, the enable code finds the
1033  * mce in the active list and just has to make sure it will not be reaped
1034  * (ie the mcg leave done) when the pending Tx does complete. Alternatively,
1035  * a stale list (#5) that would be checked in the enable code would need
1036  * to be implemented. Option 2 is used, because otherwise, a Tx attempt
1037  * after the multicast disable would try to put an AH in the active list,
1038  * and associate the mce it finds in the active list to this new AH,
1039  * whereas the mce is already associated with the previous AH (taken off
1040  * the active list), and will be removed once the pending Tx's complete
1041  * (unless a reference count on mce's is implemented). One implication of
1042  * using 2,4 is that new Tx's posted before the pending Tx's complete will
1043  * grab new references on the AH, further delaying the leave.
1044  *
1045  * In the case of mcg delete (or create) trap when the port is sendonly
1046  * joined, the AH and MCE handling is different: the AH and MCE has to be
1047  * immediately taken off the active lists (forcing a join and path lookup
1048  * at the next Tx is the only guaranteed means of ensuring a proper Tx
1049  * to an mcg as it is repeatedly created and deleted and goes thru
1050  * reincarnations).
1051  *
1052  * When a port is already sendonly joined, and a multicast enable is
1053  * attempted, the same mce structure is promoted; this ensures only a
1054  * single mce on the active list tracks the most powerful join state.
1055  *
1056  * In the case of port up event handling, the MCE for sendonly membership
1057  * is freed up, and the ACE is put into the free list as soon as possible
1058  * (depending on whether posted Tx's have completed). For fullmembership
1059  * MCE's though, the ACE is similarly handled; but the MCE is kept around
1060  * (a re-JOIN is attempted) only if the DLPI leave has not already been
1061  * done; else the mce is deconstructed (mc_fullreap case).
1062  *
1063  * MCG creation and deletion trap handling:
1064  *
1065  * These traps are unreliable (meaning sometimes the trap might never
1066  * be delivered to the subscribed nodes) and may arrive out-of-order
1067  * since they use UD transport. An alternative to relying on these
1068  * unreliable traps is to poll for mcg presence every so often, but
1069  * instead of doing that, we try to be as conservative as possible
1070  * while handling the traps, and hope that the traps do arrive at
1071  * the subscribed nodes soon. Note that if a node is fullmember
1072  * joined to an mcg, it can not possibly receive a mcg create/delete
1073  * trap for that mcg (by fullmember definition); if it does, it is
1074  * an old trap from a previous incarnation of the mcg.
1075  *
1076  * Whenever a trap is received, the driver cleans up its sendonly
1077  * membership to the group; we choose to do a sendonly leave even
1078  * on a creation trap to handle the case of a prior deletion of the mcg
1079  * having gone unnoticed. Consider an example scenario:
1080  * T1: MCG M is deleted, and fires off deletion trap D1.
1081  * T2: MCG M is recreated, fires off creation trap C1, which is lost.
1082  * T3: Node N tries to transmit to M, joining in sendonly mode.
1083  * T4: MCG M is deleted, and fires off deletion trap D2.
1084  * T5: N receives a deletion trap, but can not distinguish D1 from D2.
1085  *     If the trap is D2, then a LEAVE is not required, since the mcg
1086  *     is already deleted; but if it is D1, a LEAVE is required. A safe
1087  *     approach is to always LEAVE, but the SM may be confused if it
1088  *     receives a LEAVE without a prior JOIN.
1089  *
1090  * Management of the non-membership to an mcg is similar to the above,
1091  * except that if the interface is in promiscuous mode, it is required
1092  * to attempt to re-join the mcg after receiving a trap. Unfortunately,
1093  * if the re-join attempt fails (in which case a warning message needs
1094  * to be printed), it is not clear whether it failed due to the mcg not
1095  * existing, or some fabric/hca issues, due to the delayed nature of
1096  * trap delivery. Querying the SA to establish presence/absence of the
1097  * mcg is also racy at best. Thus, the driver just prints a warning
1098  * message when it can not rejoin after receiving a create trap, although
 * this might be (on rare occasions) a mis-warning if the create trap is
1100  * received after the mcg was deleted.
1101  */
1102 
1103 /*
1104  * Implementation of atomic "recycle" bits and reference count
1105  * on address handles. This utilizes the fact that max reference
1106  * count on any handle is limited by number of send wqes, thus
1107  * high bits in the ac_ref field can be used as the recycle bits,
1108  * and only the low bits hold the number of pending Tx requests.
1109  * This atomic AH reference counting allows the Tx completion
1110  * handler not to acquire the id_ac_mutex to process every completion,
1111  * thus reducing lock contention problems between completion and
1112  * the Tx path.
1113  */
/*
 * The "recycle" marker occupies bit 19 of ac_ref; the bits below it
 * hold the count of pending Tx requests. The count is bounded by the
 * number of send wqes (see block comment above), so the two never
 * collide.
 */
#define	CYCLEVAL		0x80000
/* Reset both the recycle bit and the reference count to 0. */
#define	CLEAR_REFCYCLE(ace)	(ace)->ac_ref = 0
/* Is the recycle bit set on this ace? */
#define	CYCLE_SET(ace)		(((ace)->ac_ref & CYCLEVAL) == CYCLEVAL)
/* Raw ac_ref value: pending Tx count plus possible recycle bit. */
#define	GET_REF(ace)		((ace)->ac_ref)
/*
 * Reference count of an ace already marked for recycling (the
 * recycle bit is ASSERTed, then masked off).
 */
#define	GET_REF_CYCLE(ace) (				\
	/*						\
	 * Make sure "cycle" bit is set.		\
	 */						\
	ASSERT(CYCLE_SET(ace)),				\
	((ace)->ac_ref & ~(CYCLEVAL))			\
)
/* Atomically grab num references on the ace (Tx fast path). */
#define	INC_REF(ace, num) {				\
	atomic_add_32(&(ace)->ac_ref, num);		\
}
/*
 * Mark the ace for recycling iff Tx's are still pending on it.
 * Evaluates to B_TRUE when the recycle bit is (or was already) set
 * with references outstanding; B_FALSE when the count was 0, in
 * which case the bit is cleared again and the caller may reclaim
 * the ace immediately.
 */
#define	SET_CYCLE_IF_REF(ace) (				\
	CYCLE_SET(ace) ? B_TRUE :			\
	    atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) ==	\
		CYCLEVAL ?				\
		/*					\
		 * Clear the "cycle" bit we just set;	\
		 * ref count known to be 0 from above.	\
		 */					\
		CLEAR_REFCYCLE(ace), B_FALSE :		\
		/*					\
		 * We set "cycle" bit; let caller know.	\
		 */					\
		B_TRUE					\
)
/*
 * Drop one Tx reference; evaluates to B_TRUE when this was the last
 * reference on an ace marked for recycling (caller must reap it).
 */
#define	DEC_REF_DO_CYCLE(ace) (				\
	atomic_add_32_nv(&ace->ac_ref, -1) ==		\
	    CYCLEVAL ?					\
		/*					\
		 * Ref count known to be 0 from above.	\
		 */					\
		B_TRUE :				\
		B_FALSE					\
)
1151 
1152 static void *
1153 list_get_head(list_t *list)
1154 {
1155 	list_node_t *lhead = list_head(list);
1156 
1157 	if (lhead != NULL)
1158 		list_remove(list, lhead);
1159 	return (lhead);
1160 }
1161 
1162 /*
1163  * This is always guaranteed to be able to queue the work.
1164  */
static void
ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
{
	/* Initialize request */
	DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
	/*
	 * Tag the caller-supplied request with the operation code; the
	 * async thread dispatches on rq_op (see ibd_async_work()) and
	 * may free the request once it has been processed.
	 */
	ptr->rq_op = op;

	/*
	 * Queue provided slot onto request pool.
	 */
	mutex_enter(&state->id_acache_req_lock);
	list_insert_tail(&state->id_req_list, ptr);

	/* Go, fetch, async thread */
	cv_signal(&state->id_acache_req_cv);
	mutex_exit(&state->id_acache_req_lock);
}
1182 
1183 /*
1184  * Main body of the per interface async thread.
1185  */
static void
ibd_async_work(ibd_state_t *state)
{
	ibd_req_t *ptr;
	callb_cpr_t cprinfo;

	mutex_enter(&state->id_acache_req_lock);
	/*
	 * Register this daemon with the CPR (checkpoint/resume)
	 * framework; the thread marks itself safe for suspension
	 * around the cv_wait below.
	 */
	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
	    callb_generic_cpr, "ibd_async_work");

	/*
	 * Loop forever, draining requests queued on id_req_list by
	 * ibd_queue_work_slot(), until an IBD_ASYNC_EXIT request
	 * terminates the thread.
	 */
	for (;;) {
		ptr = list_get_head(&state->id_req_list);
		if (ptr != NULL) {
			/* Drop the lock while performing the operation. */
			mutex_exit(&state->id_acache_req_lock);

			/*
			 * Once we have done the operation, there is no
			 * guarantee the request slot is going to be valid,
			 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
			 * TRAP).
			 *
			 * Perform the request.
			 */
			switch (ptr->rq_op) {
				case IBD_ASYNC_GETAH:
					ibd_async_acache(state, &ptr->rq_mac);
					break;
				case IBD_ASYNC_JOIN:
				case IBD_ASYNC_LEAVE:
					ibd_async_multicast(state,
					    ptr->rq_gid, ptr->rq_op);
					break;
				case IBD_ASYNC_PROMON:
					ibd_async_setprom(state);
					break;
				case IBD_ASYNC_PROMOFF:
					ibd_async_unsetprom(state);
					break;
				case IBD_ASYNC_REAP:
					ibd_async_reap_group(state,
					    ptr->rq_ptr, ptr->rq_gid,
					    IB_MC_JSTATE_FULL);
					/*
					 * the req buf contains in mce
					 * structure, so we do not need
					 * to free it here.
					 */
					ptr = NULL;
					break;
				case IBD_ASYNC_TRAP:
					ibd_async_trap(state, ptr);
					break;
				case IBD_ASYNC_SCHED:
					ibd_async_txsched(state);
					break;
				case IBD_ASYNC_LINK:
					ibd_async_link(state, ptr);
					break;
				case IBD_ASYNC_EXIT:
					mutex_enter(&state->id_acache_req_lock);
#ifndef __lock_lint
					/*
					 * CALLB_CPR_EXIT releases the
					 * associated lock for us.
					 */
					CALLB_CPR_EXIT(&cprinfo);
#else
					mutex_exit(&state->id_acache_req_lock);
#endif
					return;
			}
			/* Return the request slot unless a case took it. */
			if (ptr != NULL)
				kmem_cache_free(state->id_req_kmc, ptr);

			mutex_enter(&state->id_acache_req_lock);
		} else {
#ifndef __lock_lint
			/*
			 * Nothing to do: wait till new request arrives.
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&state->id_acache_req_cv,
			    &state->id_acache_req_lock);
			CALLB_CPR_SAFE_END(&cprinfo,
			    &state->id_acache_req_lock);
#endif
		}
	}

	/*NOTREACHED*/
	_NOTE(NOT_REACHED)
}
1274 
1275 /*
1276  * Return when it is safe to queue requests to the async daemon; primarily
1277  * for subnet trap and async event handling. Disallow requests before the
 * daemon is created, and when interface deinitialization starts.
1279  */
1280 static boolean_t
1281 ibd_async_safe(ibd_state_t *state)
1282 {
1283 	mutex_enter(&state->id_trap_lock);
1284 	if (state->id_trap_stop) {
1285 		mutex_exit(&state->id_trap_lock);
1286 		return (B_FALSE);
1287 	}
1288 	state->id_trap_inprog++;
1289 	mutex_exit(&state->id_trap_lock);
1290 	return (B_TRUE);
1291 }
1292 
1293 /*
1294  * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
1295  * trap or event handling to complete to kill the async thread and deconstruct
1296  * the mcg/ace list.
1297  */
1298 static void
1299 ibd_async_done(ibd_state_t *state)
1300 {
1301 	mutex_enter(&state->id_trap_lock);
1302 	if (--state->id_trap_inprog == 0)
1303 		cv_signal(&state->id_trap_cv);
1304 	mutex_exit(&state->id_trap_lock);
1305 }
1306 
1307 /*
1308  * Hash functions:
1309  * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
1310  * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1.
1311  * These operate on mac addresses input into ibd_send, but there is no
1312  * guarantee on the alignment of the ipoib_mac_t structure.
1313  */
1314 /*ARGSUSED*/
1315 static uint_t
1316 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
1317 {
1318 	ulong_t ptraddr = (ulong_t)key;
1319 	uint_t hval;
1320 
1321 	/*
1322 	 * If the input address is 4 byte aligned, we can just dereference
1323 	 * it. This is most common, since IP will send in a 4 byte aligned
1324 	 * IP header, which implies the 24 byte IPoIB psuedo header will be
1325 	 * 4 byte aligned too.
1326 	 */
1327 	if ((ptraddr & 3) == 0)
1328 		return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
1329 
1330 	bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
1331 	return (hval);
1332 }
1333 
1334 static int
1335 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1336 {
1337 	if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
1338 		return (0);
1339 	else
1340 		return (1);
1341 }
1342 
1343 /*
1344  * Initialize all the per interface caches and lists; AH cache,
1345  * MCG list etc.
1346  */
static int
ibd_acache_init(ibd_state_t *state)
{
	ibd_ace_t *ce;
	int i;

	/* Lock and cv used by the async request queue. */
	mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);

	/* AH cache lists/hash and the mcg membership lists. */
	mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
	list_create(&state->id_ah_free, sizeof (ibd_ace_t),
	    offsetof(ibd_ace_t, ac_list));
	list_create(&state->id_ah_active, sizeof (ibd_ace_t),
	    offsetof(ibd_ace_t, ac_list));
	/*
	 * Active entries are additionally hashed by destination mac for
	 * fast Tx-path lookups; the custom hash/cmp functions tolerate
	 * unaligned keys (see ibd_hash_by_id/ibd_hash_key_cmp).
	 */
	state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
	    IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
	    ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
	list_create(&state->id_mc_full, sizeof (ibd_mce_t),
	    offsetof(ibd_mce_t, mc_list));
	list_create(&state->id_mc_non, sizeof (ibd_mce_t),
	    offsetof(ibd_mce_t, mc_list));
	list_create(&state->id_req_list, sizeof (ibd_req_t),
	    offsetof(ibd_req_t, rq_list));

	/*
	 * Preallocate the fixed pool of IBD_NUM_AH address handle
	 * entries, give each a UD destination, and seed the free list.
	 */
	state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
	    IBD_NUM_AH, KM_SLEEP);
	for (i = 0; i < IBD_NUM_AH; i++, ce++) {
		if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
		    state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
			/*
			 * Partial failure: tear down what was set up so
			 * far; entries already on the free list get their
			 * UD destinations freed in ibd_acache_fini().
			 */
			ibd_acache_fini(state);
			return (DDI_FAILURE);
		} else {
			CLEAR_REFCYCLE(ce);
			ce->ac_mce = NULL;
			IBD_ACACHE_INSERT_FREE(state, ce);
		}
	}
	return (DDI_SUCCESS);
}
1387 
static void
ibd_acache_fini(ibd_state_t *state)
{
	ibd_ace_t *ptr;

	mutex_enter(&state->id_ac_mutex);

	/*
	 * Drain both AH lists, releasing each entry's UD destination;
	 * the ASSERTs document that no Tx may still hold a reference
	 * on any ace at this point.
	 */
	while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
		ASSERT(GET_REF(ptr) == 0);
		(void) ibt_free_ud_dest(ptr->ac_dest);
	}

	while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
		ASSERT(GET_REF(ptr) == 0);
		(void) ibt_free_ud_dest(ptr->ac_dest);
	}

	/*
	 * Tear down the now-empty lists, the ace pool and the locks.
	 * NOTE(review): id_ah_active_hash is not destroyed here;
	 * presumably torn down elsewhere — confirm.
	 */
	list_destroy(&state->id_ah_free);
	list_destroy(&state->id_ah_active);
	list_destroy(&state->id_mc_full);
	list_destroy(&state->id_mc_non);
	list_destroy(&state->id_req_list);
	kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH);
	mutex_exit(&state->id_ac_mutex);
	mutex_destroy(&state->id_ac_mutex);
	mutex_destroy(&state->id_mc_mutex);
	mutex_destroy(&state->id_acache_req_lock);
	cv_destroy(&state->id_acache_req_cv);
}
1417 
1418 /*
1419  * Search AH active hash list for a cached path to input destination.
1420  * If we are "just looking", hold == F. When we are in the Tx path,
1421  * we set hold == T to grab a reference on the AH so that it can not
1422  * be recycled to a new destination while the Tx request is posted.
1423  */
1424 static ibd_ace_t *
1425 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
1426 {
1427 	ibd_ace_t *ptr;
1428 
1429 	ASSERT(mutex_owned(&state->id_ac_mutex));
1430 
1431 	/*
1432 	 * Do hash search.
1433 	 */
1434 	if (mod_hash_find(state->id_ah_active_hash,
1435 	    (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
1436 		if (hold)
1437 			INC_REF(ptr, num);
1438 		return (ptr);
1439 	}
1440 	return (NULL);
1441 }
1442 
1443 /*
1444  * This is called by the tx side; if an initialized AH is found in
1445  * the active list, it is locked down and can be used; if no entry
1446  * is found, an async request is queued to do path resolution.
1447  */
static ibd_ace_t *
ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
{
	ibd_ace_t *ptr;
	ibd_req_t *req;

	/*
	 * Returns the locked-down ace on a cache hit. On a miss,
	 * returns NULL with *err set to EAGAIN (retry later; a path
	 * lookup may have been queued) or EFAULT (the lookup queued
	 * earlier for this address failed).
	 */

	/*
	 * Only attempt to print when we can; in the mdt pattr case, the
	 * address is not aligned properly.
	 */
	if (((ulong_t)mac & 3) == 0) {
		DPRINT(4,
		    "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
		    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
		    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
		    htonl(mac->ipoib_gidsuff[1]));
	}

	mutex_enter(&state->id_ac_mutex);

	/* Fast path: cached AH found; reference grabbed under the lock. */
	if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) {
		mutex_exit(&state->id_ac_mutex);
		return (ptr);
	}

	/*
	 * Implementation of a single outstanding async request; if
	 * the operation is not started yet, queue a request and move
	 * to ongoing state. Remember in id_ah_addr for which address
	 * we are queueing the request, in case we need to flag an error;
	 * Any further requests, for the same or different address, until
	 * the operation completes, is sent back to GLDv3 to be retried.
	 * The async thread will update id_ah_op with an error indication
	 * or will set it to indicate the next look up can start; either
	 * way, it will mac_tx_update() so that all blocked requests come
	 * back here.
	 */
	*err = EAGAIN;
	if (state->id_ah_op == IBD_OP_NOTSTARTED) {
		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
		if (req != NULL) {
			/*
			 * We did not even find the entry; queue a request
			 * for it.
			 */
			bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
			ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
			state->id_ah_op = IBD_OP_ONGOING;
			bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
		}
	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
		/*
		 * Check the status of the pathrecord lookup request
		 * we had queued before.
		 */
		if (state->id_ah_op == IBD_OP_ERRORED) {
			*err = EFAULT;
			state->id_ah_error++;
		} else {
			/*
			 * IBD_OP_ROUTERED case: We need to send to the
			 * all-router MCG. If we can find the AH for
			 * the mcg, the Tx will be attempted. If we
			 * do not find the AH, we return NORESOURCES
			 * to retry.
			 */
			ipoib_mac_t routermac;

			(void) ibd_get_allroutergroup(state, mac, &routermac);
			ptr = ibd_acache_find(state, &routermac, B_TRUE,
			    numwqe);
		}
		/* Either way, this completed request's cycle is over. */
		state->id_ah_op = IBD_OP_NOTSTARTED;
	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
		/*
		 * This case can happen when we get a higher band
		 * packet. The easiest way is to reset the state machine
		 * to accommodate the higher priority packet.
		 */
		state->id_ah_op = IBD_OP_NOTSTARTED;
	}
	mutex_exit(&state->id_ac_mutex);

	return (ptr);
}
1535 
1536 /*
1537  * Grab a not-currently-in-use AH/PathRecord from the active
1538  * list to recycle to a new destination. Only the async thread
1539  * executes this code.
1540  */
1541 static ibd_ace_t *
1542 ibd_acache_get_unref(ibd_state_t *state)
1543 {
1544 	ibd_ace_t *ptr = list_head(&state->id_ah_active);
1545 
1546 	ASSERT(mutex_owned(&state->id_ac_mutex));
1547 
1548 	/*
1549 	 * Do plain linear search.
1550 	 */
1551 	while (ptr != NULL) {
1552 		/*
1553 		 * Note that it is possible that the "cycle" bit
1554 		 * is set on the AH w/o any reference count. The
1555 		 * mcg must have been deleted, and the tx cleanup
1556 		 * just decremented the reference count to 0, but
1557 		 * hasn't gotten around to grabbing the id_ac_mutex
1558 		 * to move the AH into the free list.
1559 		 */
1560 		if (GET_REF(ptr) == 0) {
1561 			IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
1562 			break;
1563 		}
1564 		ptr = list_next(&state->id_ah_active, ptr);
1565 	}
1566 	return (ptr);
1567 }
1568 
1569 /*
1570  * Invoked to clean up AH from active list in case of multicast
1571  * disable and to handle sendonly memberships during mcg traps.
1572  * And for port up processing for multicast and unicast AHs.
1573  * Normally, the AH is taken off the active list, and put into
1574  * the free list to be recycled for a new destination. In case
1575  * Tx requests on the AH have not completed yet, the AH is marked
1576  * for reaping (which will put the AH on the free list) once the Tx's
1577  * complete; in this case, depending on the "force" input, we take
1578  * out the AH from the active list right now, or leave it also for
1579  * the reap operation. Returns TRUE if the AH is taken off the active
1580  * list (and either put into the free list right now, or arranged for
1581  * later), FALSE otherwise.
1582  */
static boolean_t
ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
{
	ibd_ace_t *acactive;
	boolean_t ret = B_TRUE;

	ASSERT(mutex_owned(&state->id_ac_mutex));

	/*
	 * NB: when no active entry exists for acmac, ret stays B_TRUE:
	 * there is nothing left on the active list for this address.
	 */
	if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {

		/*
		 * Note that the AH might already have the cycle bit set
		 * on it; this might happen if sequences of multicast
		 * enables and disables are coming so fast, that posted
		 * Tx's to the mcg have not completed yet, and the cycle
		 * bit is set successively by each multicast disable.
		 */
		if (SET_CYCLE_IF_REF(acactive)) {
			/* Tx's are still pending on this AH. */
			if (!force) {
				/*
				 * The ace is kept on the active list, further
				 * Tx's can still grab a reference on it; the
				 * ace is reaped when all pending Tx's
				 * referencing the AH complete.
				 */
				ret = B_FALSE;
			} else {
				/*
				 * In the mcg trap case, we always pull the
				 * AH from the active list. And also the port
				 * up multi/unicast case.
				 */
				IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
				acactive->ac_mce = NULL;
			}
		} else {
			/*
			 * Determined the ref count is 0, thus reclaim
			 * immediately after pulling out the ace from
			 * the active list.
			 */
			IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
			acactive->ac_mce = NULL;
			IBD_ACACHE_INSERT_FREE(state, acactive);
		}

	}
	return (ret);
}
1632 
1633 /*
1634  * Helper function for async path record lookup. If we are trying to
1635  * Tx to a MCG, check our membership, possibly trying to join the
1636  * group if required. If that fails, try to send the packet to the
1637  * all router group (indicated by the redirect output), pointing
1638  * the input mac address to the router mcg address.
1639  */
static ibd_mce_t *
ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
{
	ib_gid_t mgid;
	ibd_mce_t *mce;
	ipoib_mac_t routermac;

	/*
	 * On success the returned mce is the membership to use for the
	 * Tx. *redirect is set iff the caller should transmit to the
	 * all-router group instead, in which case mac has been
	 * rewritten to the router mcg address.
	 */
	*redirect = B_FALSE;
	ibd_n2h_gid(mac, &mgid);

	/*
	 * Check the FullMember+SendOnlyNonMember list.
	 * Since we are the only one who manipulates the
	 * id_mc_full list, no locks are needed.
	 */
	mce = IBD_MCACHE_FIND_FULL(state, mgid);
	if (mce != NULL) {
		DPRINT(4, "ibd_async_mcache : already joined to group");
		return (mce);
	}

	/*
	 * Not found; try to join(SendOnlyNonMember) and attach.
	 */
	DPRINT(4, "ibd_async_mcache : not joined to group");
	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
	    NULL) {
		DPRINT(4, "ibd_async_mcache : nonmem joined to group");
		return (mce);
	}

	/*
	 * MCGroup not present; try to join the all-router group. If
	 * any of the following steps succeed, we will be redirecting
	 * to the all router group.
	 */
	DPRINT(4, "ibd_async_mcache : nonmem join failed");
	if (!ibd_get_allroutergroup(state, mac, &routermac))
		return (NULL);
	*redirect = B_TRUE;
	ibd_n2h_gid(&routermac, &mgid);
	/* Point the caller's mac at the router group from here on. */
	bcopy(&routermac, mac, IPOIB_ADDRL);
	DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
	    mgid.gid_prefix, mgid.gid_guid);

	/*
	 * Are we already joined to the router group?
	 */
	if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
		DPRINT(4, "ibd_async_mcache : using already joined router"
		    "group\n");
		return (mce);
	}

	/*
	 * Can we join(SendOnlyNonMember) the router group?
	 */
	DPRINT(4, "ibd_async_mcache : attempting join to router grp");
	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
	    NULL) {
		DPRINT(4, "ibd_async_mcache : joined to router grp");
		return (mce);
	}

	return (NULL);
}
1706 
1707 /*
1708  * Async path record lookup code.
1709  */
1710 static void
1711 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
1712 {
1713 	ibd_ace_t *ce;
1714 	ibd_mce_t *mce = NULL;
1715 	ibt_path_attr_t path_attr;
1716 	ibt_path_info_t path_info;
1717 	ib_gid_t destgid;
1718 	char ret = IBD_OP_NOTSTARTED;
1719 
1720 	DPRINT(4, "ibd_async_acache :  %08X:%08X:%08X:%08X:%08X",
1721 	    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1722 	    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1723 	    htonl(mac->ipoib_gidsuff[1]));
1724 
1725 	/*
1726 	 * Check whether we are trying to transmit to a MCG.
1727 	 * In that case, we need to make sure we are a member of
1728 	 * the MCG.
1729 	 */
1730 	if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1731 		boolean_t redirected;
1732 
1733 		/*
1734 		 * If we can not find or join the group or even
1735 		 * redirect, error out.
1736 		 */
1737 		if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1738 		    NULL) {
1739 			state->id_ah_op = IBD_OP_ERRORED;
1740 			return;
1741 		}
1742 
1743 		/*
1744 		 * If we got redirected, we need to determine whether
1745 		 * the AH for the new mcg is in the cache already, and
1746 		 * not pull it in then; otherwise proceed to get the
1747 		 * path for the new mcg. There is no guarantee that
1748 		 * if the AH is currently in the cache, it will still be
1749 		 * there when we look in ibd_acache_lookup(), but that's
1750 		 * okay, we will come back here.
1751 		 */
1752 		if (redirected) {
1753 			ret = IBD_OP_ROUTERED;
1754 			DPRINT(4, "ibd_async_acache :  redirected to "
1755 			    "%08X:%08X:%08X:%08X:%08X",
1756 			    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1757 			    htonl(mac->ipoib_gidpref[1]),
1758 			    htonl(mac->ipoib_gidsuff[0]),
1759 			    htonl(mac->ipoib_gidsuff[1]));
1760 
1761 			mutex_enter(&state->id_ac_mutex);
1762 			if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1763 				state->id_ah_op = IBD_OP_ROUTERED;
1764 				mutex_exit(&state->id_ac_mutex);
1765 				DPRINT(4, "ibd_async_acache : router AH found");
1766 				return;
1767 			}
1768 			mutex_exit(&state->id_ac_mutex);
1769 		}
1770 	}
1771 
1772 	/*
1773 	 * Get an AH from the free list.
1774 	 */
1775 	mutex_enter(&state->id_ac_mutex);
1776 	if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1777 		/*
1778 		 * No free ones; try to grab an unreferenced active
1779 		 * one. Maybe we need to make the active list LRU,
1780 		 * but that will create more work for Tx callbacks.
1781 		 * Is there a way of not having to pull out the
1782 		 * entry from the active list, but just indicate it
1783 		 * is being recycled? Yes, but that creates one more
1784 		 * check in the fast lookup path.
1785 		 */
1786 		if ((ce = ibd_acache_get_unref(state)) == NULL) {
1787 			/*
1788 			 * Pretty serious shortage now.
1789 			 */
1790 			state->id_ah_op = IBD_OP_NOTSTARTED;
1791 			mutex_exit(&state->id_ac_mutex);
1792 			DPRINT(10, "ibd_async_acache : failed to find AH "
1793 			    "slot\n");
1794 			return;
1795 		}
1796 		/*
1797 		 * We could check whether ac_mce points to a SendOnly
1798 		 * member and drop that membership now. Or do it lazily
1799 		 * at detach time.
1800 		 */
1801 		ce->ac_mce = NULL;
1802 	}
1803 	mutex_exit(&state->id_ac_mutex);
1804 	ASSERT(ce->ac_mce == NULL);
1805 
1806 	/*
1807 	 * Update the entry.
1808 	 */
1809 	bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
1810 
1811 	bzero(&path_info, sizeof (path_info));
1812 	bzero(&path_attr, sizeof (ibt_path_attr_t));
1813 	path_attr.pa_sgid = state->id_sgid;
1814 	path_attr.pa_num_dgids = 1;
1815 	ibd_n2h_gid(&ce->ac_mac, &destgid);
1816 	path_attr.pa_dgids = &destgid;
1817 	path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
1818 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
1819 	    &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) {
1820 		DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
1821 		goto error;
1822 	}
1823 	if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
1824 	    ntohl(ce->ac_mac.ipoib_qpn),
1825 	    &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
1826 		DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
1827 		goto error;
1828 	}
1829 
1830 	/*
1831 	 * mce is set whenever an AH is being associated with a
1832 	 * MCG; this will come in handy when we leave the MCG. The
1833 	 * lock protects Tx fastpath from scanning the active list.
1834 	 */
1835 	if (mce != NULL)
1836 		ce->ac_mce = mce;
1837 	mutex_enter(&state->id_ac_mutex);
1838 	IBD_ACACHE_INSERT_ACTIVE(state, ce);
1839 	state->id_ah_op = ret;
1840 	mutex_exit(&state->id_ac_mutex);
1841 	return;
1842 error:
1843 	/*
1844 	 * We might want to drop SendOnly membership here if we
1845 	 * joined above. The lock protects Tx callbacks inserting
1846 	 * into the free list.
1847 	 */
1848 	mutex_enter(&state->id_ac_mutex);
1849 	state->id_ah_op = IBD_OP_ERRORED;
1850 	IBD_ACACHE_INSERT_FREE(state, ce);
1851 	mutex_exit(&state->id_ac_mutex);
1852 }
1853 
1854 /*
1855  * While restoring port's presence on the subnet on a port up, it is possible
1856  * that the port goes down again.
1857  */
1858 static void
1859 ibd_async_link(ibd_state_t *state, ibd_req_t *req)
1860 {
1861 	ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
1862 	link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
1863 	    LINK_STATE_UP;
1864 	ibd_mce_t *mce, *pmce;
1865 	ibd_ace_t *ace, *pace;
1866 
1867 	DPRINT(10, "ibd_async_link(): %d", opcode);
1868 
1869 	/*
1870 	 * On a link up, revalidate the link speed/width. No point doing
1871 	 * this on a link down, since we will be unable to do SA operations,
1872 	 * defaulting to the lowest speed. Also notice that we update our
1873 	 * notion of speed before calling mac_link_update(), which will do
1874 	 * neccesary higher level notifications for speed changes.
1875 	 */
1876 	if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
1877 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
1878 		state->id_link_speed = ibd_get_portspeed(state);
1879 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
1880 	}
1881 
1882 	/*
1883 	 * Do all the work required to establish our presence on
1884 	 * the subnet.
1885 	 */
1886 	if (opcode == IBD_LINK_UP_ABSENT) {
1887 		/*
1888 		 * If in promiscuous mode ...
1889 		 */
1890 		if (state->id_prom_op == IBD_OP_COMPLETED) {
1891 			/*
1892 			 * Drop all nonmembership.
1893 			 */
1894 			ibd_async_unsetprom(state);
1895 
1896 			/*
1897 			 * Then, try to regain nonmembership to all mcg's.
1898 			 */
1899 			ibd_async_setprom(state);
1900 
1901 		}
1902 
1903 		/*
1904 		 * Drop all sendonly membership (which also gets rid of the
1905 		 * AHs); try to reacquire all full membership.
1906 		 */
1907 		mce = list_head(&state->id_mc_full);
1908 		while ((pmce = mce) != NULL) {
1909 			mce = list_next(&state->id_mc_full, mce);
1910 			if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
1911 				ibd_leave_group(state,
1912 				    pmce->mc_info.mc_adds_vect.av_dgid,
1913 				    IB_MC_JSTATE_SEND_ONLY_NON);
1914 			else
1915 				ibd_reacquire_group(state, pmce);
1916 		}
1917 
1918 		/*
1919 		 * Recycle all active AHs to free list (and if there are
1920 		 * pending posts, make sure they will go into the free list
1921 		 * once the Tx's complete). Grab the lock to prevent
1922 		 * concurrent Tx's as well as Tx cleanups.
1923 		 */
1924 		mutex_enter(&state->id_ac_mutex);
1925 		ace = list_head(&state->id_ah_active);
1926 		while ((pace = ace) != NULL) {
1927 			boolean_t cycled;
1928 
1929 			ace = list_next(&state->id_ah_active, ace);
1930 			mce = pace->ac_mce;
1931 			cycled = ibd_acache_recycle(state, &pace->ac_mac,
1932 			    B_TRUE);
1933 			/*
1934 			 * If this is for an mcg, it must be for a fullmember,
1935 			 * since we got rid of send-only members above when
1936 			 * processing the mce list.
1937 			 */
1938 			ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
1939 			    IB_MC_JSTATE_FULL)));
1940 
1941 			/*
1942 			 * Check if the fullmember mce needs to be torn down,
1943 			 * ie whether the DLPI disable has already been done.
1944 			 * If so, do some of the work of tx_cleanup, namely
1945 			 * causing leave (which will fail), detach and
1946 			 * mce-freeing. tx_cleanup will put the AH into free
1947 			 * list. The reason to duplicate some of this
1948 			 * tx_cleanup work is because we want to delete the
1949 			 * AH right now instead of waiting for tx_cleanup, to
1950 			 * force subsequent Tx's to reacquire an AH.
1951 			 */
1952 			if ((mce != NULL) && (mce->mc_fullreap))
1953 				ibd_async_reap_group(state, mce,
1954 				    mce->mc_info.mc_adds_vect.av_dgid,
1955 				    mce->mc_jstate);
1956 		}
1957 		mutex_exit(&state->id_ac_mutex);
1958 	}
1959 
1960 	/*
1961 	 * mac handle is guaranteed to exist since driver does ibt_close_hca()
1962 	 * (which stops further events from being delivered) before
1963 	 * mac_unregister(). At this point, it is guaranteed that mac_register
1964 	 * has already been done.
1965 	 */
1966 	mutex_enter(&state->id_link_mutex);
1967 	state->id_link_state = lstate;
1968 	mac_link_update(state->id_mh, lstate);
1969 	mutex_exit(&state->id_link_mutex);
1970 
1971 	ibd_async_done(state);
1972 }
1973 
1974 /*
1975  * Check the pkey table to see if we can find the pkey we're looking for.
1976  * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on
1977  * failure.
1978  */
1979 static int
1980 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey,
1981     uint16_t *pkix)
1982 {
1983 	uint16_t ndx;
1984 
1985 	ASSERT(pkix != NULL);
1986 
1987 	for (ndx = 0; ndx < pkey_tbl_sz; ndx++) {
1988 		if (pkey_tbl[ndx] == pkey) {
1989 			*pkix = ndx;
1990 			return (0);
1991 		}
1992 	}
1993 	return (-1);
1994 }
1995 
1996 /*
1997  * When the link is notified up, we need to do a few things, based
1998  * on the port's current p_init_type_reply claiming a reinit has been
1999  * done or not. The reinit steps are:
2000  * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
2001  *    the old Pkey and GID0 are correct.
2002  * 2. Register for mcg traps (already done by ibmf).
2003  * 3. If PreservePresenceReply indicates the SM has restored port's presence
2004  *    in subnet, nothing more to do. Else go to next steps (on async daemon).
2005  * 4. Give up all sendonly memberships.
2006  * 5. Acquire all full memberships.
2007  * 6. In promiscuous mode, acquire all non memberships.
2008  * 7. Recycle all AHs to free list.
2009  */
2010 static void
2011 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
2012 {
2013 	ibt_hca_portinfo_t *port_infop = NULL;
2014 	ibt_status_t ibt_status;
2015 	uint_t psize, port_infosz;
2016 	ibd_link_op_t opcode;
2017 	ibd_req_t *req;
2018 	link_state_t new_link_state = LINK_STATE_UP;
2019 	uint8_t itreply;
2020 	uint16_t pkix;
2021 	int ret;
2022 
2023 	/*
2024 	 * Let's not race with a plumb or an unplumb; if we detect a
2025 	 * pkey relocation event later on here, we may have to restart.
2026 	 */
2027 	ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2028 
2029 	mutex_enter(&state->id_link_mutex);
2030 
2031 	/*
2032 	 * If the init code in ibd_m_start hasn't yet set up the
2033 	 * pkey/gid, nothing to do; that code will set the link state.
2034 	 */
2035 	if (state->id_link_state == LINK_STATE_UNKNOWN) {
2036 		mutex_exit(&state->id_link_mutex);
2037 		goto link_mod_return;
2038 	}
2039 
2040 	/*
2041 	 * If this routine was called in response to a port down event,
2042 	 * we just need to see if this should be informed.
2043 	 */
2044 	if (code == IBT_ERROR_PORT_DOWN) {
2045 		new_link_state = LINK_STATE_DOWN;
2046 		goto update_link_state;
2047 	}
2048 
2049 	/*
2050 	 * If it's not a port down event we've received, try to get the port
2051 	 * attributes first. If we fail here, the port is as good as down.
2052 	 * Otherwise, if the link went down by the time the handler gets
2053 	 * here, give up - we cannot even validate the pkey/gid since those
2054 	 * are not valid and this is as bad as a port down anyway.
2055 	 */
2056 	ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
2057 	    &port_infop, &psize, &port_infosz);
2058 	if ((ibt_status != IBT_SUCCESS) || (psize != 1) ||
2059 	    (port_infop->p_linkstate != IBT_PORT_ACTIVE)) {
2060 		new_link_state = LINK_STATE_DOWN;
2061 		goto update_link_state;
2062 	}
2063 
2064 	/*
2065 	 * Check the SM InitTypeReply flags. If both NoLoadReply and
2066 	 * PreserveContentReply are 0, we don't know anything about the
2067 	 * data loaded into the port attributes, so we need to verify
2068 	 * if gid0 and pkey are still valid.
2069 	 */
2070 	itreply = port_infop->p_init_type_reply;
2071 	if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
2072 	    ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) {
2073 		/*
2074 		 * Check to see if the subnet part of GID0 has changed. If
2075 		 * not, check the simple case first to see if the pkey
2076 		 * index is the same as before; finally check to see if the
2077 		 * pkey has been relocated to a different index in the table.
2078 		 */
2079 		if (bcmp(port_infop->p_sgid_tbl,
2080 		    &state->id_sgid, sizeof (ib_gid_t)) != 0) {
2081 
2082 			new_link_state = LINK_STATE_DOWN;
2083 
2084 		} else if (port_infop->p_pkey_tbl[state->id_pkix] ==
2085 		    state->id_pkey) {
2086 
2087 			new_link_state = LINK_STATE_UP;
2088 
2089 		} else if (ibd_locate_pkey(port_infop->p_pkey_tbl,
2090 		    port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {
2091 
2092 			ibt_free_portinfo(port_infop, port_infosz);
2093 			mutex_exit(&state->id_link_mutex);
2094 
2095 			/*
2096 			 * Currently a restart is required if our pkey has moved
2097 			 * in the pkey table. If we get the ibt_recycle_ud() to
2098 			 * work as documented (expected), we may be able to
2099 			 * avoid a complete restart.  Note that we've already
2100 			 * marked both the start and stop 'in-progress' flags,
2101 			 * so it is ok to go ahead and do this restart.
2102 			 */
2103 			ibd_undo_start(state, LINK_STATE_DOWN);
2104 			if ((ret = ibd_start(state)) != 0) {
2105 				DPRINT(10, "ibd_restart: cannot restart, "
2106 				    "ret=%d", ret);
2107 			}
2108 
2109 			goto link_mod_return;
2110 		} else {
2111 			new_link_state = LINK_STATE_DOWN;
2112 		}
2113 	}
2114 
2115 update_link_state:
2116 	if (port_infop) {
2117 		ibt_free_portinfo(port_infop, port_infosz);
2118 	}
2119 
2120 	/*
2121 	 * If the old state is the same as the new state, nothing to do
2122 	 */
2123 	if (state->id_link_state == new_link_state) {
2124 		mutex_exit(&state->id_link_mutex);
2125 		goto link_mod_return;
2126 	}
2127 
2128 	/*
2129 	 * Ok, so there was a link state change; see if it's safe to ask
2130 	 * the async thread to do the work
2131 	 */
2132 	if (!ibd_async_safe(state)) {
2133 		state->id_link_state = new_link_state;
2134 		mutex_exit(&state->id_link_mutex);
2135 		goto link_mod_return;
2136 	}
2137 
2138 	mutex_exit(&state->id_link_mutex);
2139 
2140 	/*
2141 	 * If we're reporting a link up, check InitTypeReply to see if
2142 	 * the SM has ensured that the port's presence in mcg, traps,
2143 	 * etc. is intact.
2144 	 */
2145 	if (new_link_state == LINK_STATE_DOWN) {
2146 		opcode = IBD_LINK_DOWN;
2147 	} else {
2148 		if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
2149 		    SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) {
2150 			opcode = IBD_LINK_UP;
2151 		} else {
2152 			opcode = IBD_LINK_UP_ABSENT;
2153 		}
2154 	}
2155 
2156 	/*
2157 	 * Queue up a request for ibd_async_link() to handle this link
2158 	 * state change event
2159 	 */
2160 	req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
2161 	req->rq_ptr = (void *)opcode;
2162 	ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);
2163 
2164 link_mod_return:
2165 	ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2166 }
2167 
2168 /*
2169  * For the port up/down events, IBTL guarantees there will not be concurrent
2170  * invocations of the handler. IBTL might coalesce link transition events,
2171  * and not invoke the handler for _each_ up/down transition, but it will
2172  * invoke the handler with last known state
2173  */
2174 static void
2175 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
2176     ibt_async_code_t code, ibt_async_event_t *event)
2177 {
2178 	ibd_state_t *state = (ibd_state_t *)clnt_private;
2179 
2180 	switch (code) {
2181 	case IBT_ERROR_CATASTROPHIC_CHAN:
2182 		ibd_print_warn(state, "catastrophic channel error");
2183 		break;
2184 	case IBT_ERROR_CQ:
2185 		ibd_print_warn(state, "completion queue error");
2186 		break;
2187 	case IBT_PORT_CHANGE_EVENT:
2188 		/*
2189 		 * Events will be delivered to all instances that have
2190 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2191 		 * Only need to do work for our port; IBTF will deliver
2192 		 * events for other ports on the hca we have ibt_open_hca'ed
2193 		 * too. Note that id_port is initialized in ibd_attach()
2194 		 * before we do an ibt_open_hca() in ibd_attach().
2195 		 */
2196 		ASSERT(state->id_hca_hdl == hca_hdl);
2197 		if (state->id_port != event->ev_port)
2198 			break;
2199 
2200 		if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
2201 		    IBT_PORT_CHANGE_PKEY) {
2202 			ibd_link_mod(state, code);
2203 		}
2204 		break;
2205 	case IBT_ERROR_PORT_DOWN:
2206 	case IBT_CLNT_REREG_EVENT:
2207 	case IBT_EVENT_PORT_UP:
2208 		/*
2209 		 * Events will be delivered to all instances that have
2210 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2211 		 * Only need to do work for our port; IBTF will deliver
2212 		 * events for other ports on the hca we have ibt_open_hca'ed
2213 		 * too. Note that id_port is initialized in ibd_attach()
2214 		 * before we do an ibt_open_hca() in ibd_attach().
2215 		 */
2216 		ASSERT(state->id_hca_hdl == hca_hdl);
2217 		if (state->id_port != event->ev_port)
2218 			break;
2219 
2220 		ibd_link_mod(state, code);
2221 		break;
2222 
2223 	case IBT_HCA_ATTACH_EVENT:
2224 	case IBT_HCA_DETACH_EVENT:
2225 		/*
2226 		 * When a new card is plugged to the system, attach_event is
2227 		 * invoked. Additionally, a cfgadm needs to be run to make the
2228 		 * card known to the system, and an ifconfig needs to be run to
2229 		 * plumb up any ibd interfaces on the card. In the case of card
2230 		 * unplug, a cfgadm is run that will trigger any RCM scripts to
2231 		 * unplumb the ibd interfaces on the card; when the card is
2232 		 * actually unplugged, the detach_event is invoked;
2233 		 * additionally, if any ibd instances are still active on the
2234 		 * card (eg there were no associated RCM scripts), driver's
2235 		 * detach routine is invoked.
2236 		 */
2237 		break;
2238 	default:
2239 		break;
2240 	}
2241 }
2242 
2243 static int
2244 ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
2245 {
2246 	mac_register_t *macp;
2247 	int ret;
2248 
2249 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2250 		DPRINT(10, "ibd_register_mac: mac_alloc() failed");
2251 		return (DDI_FAILURE);
2252 	}
2253 
2254 	/*
2255 	 * Note that when we register with mac during attach, we don't
2256 	 * have the id_macaddr yet, so we'll simply be registering a
2257 	 * zero macaddr that we'll overwrite later during plumb (in
2258 	 * ibd_m_start()). Similar is the case with id_mtu - we'll
2259 	 * update the mac layer with the correct mtu during plumb.
2260 	 */
2261 	macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
2262 	macp->m_driver = state;
2263 	macp->m_dip = dip;
2264 	macp->m_src_addr = (uint8_t *)&state->id_macaddr;
2265 	macp->m_callbacks = &ibd_m_callbacks;
2266 	macp->m_min_sdu = 0;
2267 	macp->m_max_sdu = IBD_DEF_MAX_SDU;
2268 
2269 	/*
2270 	 *  Register ourselves with the GLDv3 interface
2271 	 */
2272 	if ((ret = mac_register(macp, &state->id_mh)) != 0) {
2273 		mac_free(macp);
2274 		DPRINT(10,
2275 		    "ibd_register_mac: mac_register() failed, ret=%d", ret);
2276 		return (DDI_FAILURE);
2277 	}
2278 
2279 	mac_free(macp);
2280 	return (DDI_SUCCESS);
2281 }
2282 
2283 static int
2284 ibd_record_capab(ibd_state_t *state, dev_info_t *dip)
2285 {
2286 	ibt_hca_attr_t hca_attrs;
2287 	ibt_status_t ibt_status;
2288 
2289 	/*
2290 	 * Query the HCA and fetch its attributes
2291 	 */
2292 	ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
2293 	ASSERT(ibt_status == IBT_SUCCESS);
2294 
2295 	/*
2296 	 * 1. Set the Hardware Checksum capability. Currently we only consider
2297 	 *    full checksum offload.
2298 	 */
2299 	if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) == IBT_HCA_CKSUM_FULL) {
2300 		state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
2301 	}
2302 
2303 	/*
2304 	 * 2. Set LSO policy, capability and maximum length
2305 	 */
2306 	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
2307 	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) {
2308 		state->id_lso_policy = B_TRUE;
2309 	} else {
2310 		state->id_lso_policy = B_FALSE;
2311 	}
2312 
2313 	/*
2314 	 * Work-around for Bug 6866957. Ignore policy from ibd.conf.
2315 	 * Turn off LSO forcibly. Remove it when the work-around is no longer
2316 	 * needed.
2317 	 */
2318 	if (ibd_force_lso_disable) {
2319 		state->id_lso_policy = B_FALSE;
2320 	}
2321 	/* End of Workaround */
2322 
2323 	if (hca_attrs.hca_max_lso_size > 0) {
2324 		state->id_lso_capable = B_TRUE;
2325 		if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
2326 			state->id_lso_maxlen = IBD_LSO_MAXLEN;
2327 		else
2328 			state->id_lso_maxlen = hca_attrs.hca_max_lso_size;
2329 	} else {
2330 		state->id_lso_capable = B_FALSE;
2331 		state->id_lso_maxlen = 0;
2332 	}
2333 
2334 	/*
2335 	 * 3. Set Reserved L_Key capability
2336 	 */
2337 	if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
2338 		state->id_hca_res_lkey_capab = 1;
2339 		state->id_res_lkey = hca_attrs.hca_reserved_lkey;
2340 	}
2341 
2342 	/*
2343 	 * 4. Set maximum sqseg value after checking to see if extended sgl
2344 	 *    size information is provided by the hca
2345 	 */
2346 	if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) {
2347 		state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
2348 	} else {
2349 		state->id_max_sqseg = hca_attrs.hca_max_sgl;
2350 	}
2351 	if (state->id_max_sqseg > IBD_MAX_SQSEG) {
2352 		state->id_max_sqseg = IBD_MAX_SQSEG;
2353 	} else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
2354 		ibd_print_warn(state, "Set #sgl = %d instead of default %d",
2355 		    state->id_max_sqseg, IBD_MAX_SQSEG);
2356 	}
2357 
2358 	/*
2359 	 * 5. Set number of recv and send wqes after checking hca maximum
2360 	 *    channel size
2361 	 */
2362 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) {
2363 		state->id_num_rwqe = hca_attrs.hca_max_chan_sz;
2364 	} else {
2365 		state->id_num_rwqe = IBD_NUM_RWQE;
2366 	}
2367 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) {
2368 		state->id_num_swqe = hca_attrs.hca_max_chan_sz;
2369 	} else {
2370 		state->id_num_swqe = IBD_NUM_SWQE;
2371 	}
2372 
2373 	return (DDI_SUCCESS);
2374 }
2375 
/*
 * Undo whatever pieces of ibd_attach() have completed, as recorded by
 * the progress flags in id_mac_state. Called from ibd_detach() and on
 * the ibd_attach() failure path; teardown runs in the reverse order of
 * setup, and each flag is cleared as its resource is released. Failures
 * during teardown are warned about but do not stop the teardown.
 */
static int
ibd_unattach(ibd_state_t *state, dev_info_t *dip)
{
	int instance;
	/* Snapshot of what ibd_attach() managed to set up */
	uint32_t progress = state->id_mac_state;
	ibt_status_t ret;

	if (progress & IBD_DRV_MAC_REGISTERED) {
		(void) mac_unregister(state->id_mh);
		state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
	}

	if (progress & IBD_DRV_PD_ALLOCD) {
		if ((ret = ibt_free_pd(state->id_hca_hdl,
		    state->id_pd_hdl)) != IBT_SUCCESS) {
			ibd_print_warn(state, "failed to free "
			    "protection domain, ret=%d", ret);
		}
		state->id_pd_hdl = NULL;
		state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
	}

	if (progress & IBD_DRV_HCA_OPENED) {
		if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
		    IBT_SUCCESS) {
			ibd_print_warn(state, "failed to close "
			    "HCA device, ret=%d", ret);
		}
		state->id_hca_hdl = NULL;
		state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
	}

	if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
		if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) {
			ibd_print_warn(state,
			    "ibt_detach() failed, ret=%d", ret);
		}
		state->id_ibt_hdl = NULL;
		state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
	}

	if (progress & IBD_DRV_TXINTR_ADDED) {
		ddi_remove_softintr(state->id_tx);
		state->id_tx = NULL;
		state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
	}

	if (progress & IBD_DRV_RXINTR_ADDED) {
		ddi_remove_softintr(state->id_rx);
		state->id_rx = NULL;
		state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
	}

	if (progress & IBD_DRV_STATE_INITIALIZED) {
		ibd_state_fini(state);
		state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
	}

	/* The softstate itself goes last; 'state' is invalid after this */
	instance = ddi_get_instance(dip);
	ddi_soft_state_free(ibd_list, instance);

	return (DDI_SUCCESS);
}
2439 
2440 /*
2441  * Attach device to the IO framework.
2442  */
2443 static int
2444 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2445 {
2446 	ibd_state_t *state = NULL;
2447 	ib_guid_t hca_guid;
2448 	int instance;
2449 	ibt_status_t ret;
2450 	int rv;
2451 
2452 	/*
2453 	 * IBD doesn't support suspend/resume
2454 	 */
2455 	if (cmd != DDI_ATTACH)
2456 		return (DDI_FAILURE);
2457 
2458 	/*
2459 	 * Allocate softstate structure
2460 	 */
2461 	instance = ddi_get_instance(dip);
2462 	if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE)
2463 		return (DDI_FAILURE);
2464 	state = ddi_get_soft_state(ibd_list, instance);
2465 
2466 	/*
2467 	 * Initialize mutexes and condition variables
2468 	 */
2469 	if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2470 		DPRINT(10, "ibd_attach: failed in ibd_state_init()");
2471 		goto attach_fail;
2472 	}
2473 	state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;
2474 
2475 	/*
2476 	 * Allocate rx,tx softintr
2477 	 */
2478 	if (ibd_rx_softintr == 1) {
2479 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
2480 		    NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
2481 			DPRINT(10, "ibd_attach: failed in "
2482 			    "ddi_add_softintr(id_rx),  ret=%d", rv);
2483 			goto attach_fail;
2484 		}
2485 		state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
2486 	}
2487 	if (ibd_tx_softintr == 1) {
2488 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
2489 		    NULL, NULL, ibd_tx_recycle,
2490 		    (caddr_t)state)) != DDI_SUCCESS) {
2491 			DPRINT(10, "ibd_attach: failed in "
2492 			    "ddi_add_softintr(id_tx), ret=%d", rv);
2493 			goto attach_fail;
2494 		}
2495 		state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
2496 	}
2497 
2498 	/*
2499 	 * Obtain IBA P_Key, port number and HCA guid and validate
2500 	 * them (for P_Key, only full members are allowed as per
2501 	 * IPoIB specification; neither port number nor HCA guid
2502 	 * can be zero)
2503 	 */
2504 	if ((state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
2505 	    "port-pkey", IB_PKEY_INVALID_LIMITED)) <= IB_PKEY_INVALID_FULL) {
2506 		DPRINT(10, "ibd_attach: port device has wrong partition (0x%x)",
2507 		    state->id_pkey);
2508 		goto attach_fail;
2509 	}
2510 	if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
2511 	    "port-number", 0)) == 0) {
2512 		DPRINT(10, "ibd_attach: invalid port number (%d)",
2513 		    state->id_port);
2514 		goto attach_fail;
2515 	}
2516 	if ((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
2517 	    "hca-guid", 0)) == 0) {
2518 		DPRINT(10, "ibd_attach: port hca has invalid guid (0x%llx)",
2519 		    hca_guid);
2520 		goto attach_fail;
2521 	}
2522 
2523 	/*
2524 	 * Attach to IBTL
2525 	 */
2526 	if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2527 	    &state->id_ibt_hdl)) != IBT_SUCCESS) {
2528 		DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", ret);
2529 		goto attach_fail;
2530 	}
2531 	state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
2532 
2533 	/*
2534 	 * Open the HCA
2535 	 */
2536 	if ((ret = ibt_open_hca(state->id_ibt_hdl, hca_guid,
2537 	    &state->id_hca_hdl)) != IBT_SUCCESS) {
2538 		DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret);
2539 		goto attach_fail;
2540 	}
2541 	state->id_mac_state |= IBD_DRV_HCA_OPENED;
2542 
2543 	/*
2544 	 * Record capabilities
2545 	 */
2546 	(void) ibd_record_capab(state, dip);
2547 
2548 	/*
2549 	 * Allocate a protection domain on the HCA
2550 	 */
2551 	if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
2552 	    &state->id_pd_hdl)) != IBT_SUCCESS) {
2553 		DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret);
2554 		goto attach_fail;
2555 	}
2556 	state->id_mac_state |= IBD_DRV_PD_ALLOCD;
2557 
2558 
2559 	/*
2560 	 * Register ibd interfaces with the Nemo framework
2561 	 */
2562 	if (ibd_register_mac(state, dip) != IBT_SUCCESS) {
2563 		DPRINT(10, "ibd_attach: failed in ibd_register_mac()");
2564 		goto attach_fail;
2565 	}
2566 	state->id_mac_state |= IBD_DRV_MAC_REGISTERED;
2567 
2568 	/*
2569 	 * We're done with everything we could to make the attach
2570 	 * succeed.  All the buffer allocations and IPoIB broadcast
2571 	 * group joins are deferred to when the interface instance
2572 	 * is actually plumbed to avoid wasting memory.
2573 	 */
2574 	return (DDI_SUCCESS);
2575 
2576 attach_fail:
2577 	(void) ibd_unattach(state, dip);
2578 	return (DDI_FAILURE);
2579 }
2580 
2581 /*
2582  * Detach device from the IO framework.
2583  */
2584 static int
2585 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2586 {
2587 	ibd_state_t *state;
2588 	int instance;
2589 
2590 	/*
2591 	 * IBD doesn't support suspend/resume
2592 	 */
2593 	if (cmd != DDI_DETACH)
2594 		return (DDI_FAILURE);
2595 
2596 	/*
2597 	 * Get the instance softstate
2598 	 */
2599 	instance = ddi_get_instance(dip);
2600 	state = ddi_get_soft_state(ibd_list, instance);
2601 
2602 	/*
2603 	 * Release all resources we're holding still.  Note that if we'd
2604 	 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly
2605 	 * so far, we should find all the flags we need in id_mac_state.
2606 	 */
2607 	(void) ibd_unattach(state, dip);
2608 
2609 	return (DDI_SUCCESS);
2610 }
2611 
2612 /*
2613  * Pre ibt_attach() driver initialization
2614  */
2615 static int
2616 ibd_state_init(ibd_state_t *state, dev_info_t *dip)
2617 {
2618 	char buf[64];
2619 
2620 	mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
2621 	state->id_link_state = LINK_STATE_UNKNOWN;
2622 
2623 	mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
2624 	cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
2625 	state->id_trap_stop = B_TRUE;
2626 	state->id_trap_inprog = 0;
2627 
2628 	mutex_init(&state->id_cq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2629 	state->id_dip = dip;
2630 
2631 	mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
2632 
2633 	state->id_tx_list.dl_head = NULL;
2634 	state->id_tx_list.dl_tail = NULL;
2635 	state->id_tx_list.dl_pending_sends = B_FALSE;
2636 	state->id_tx_list.dl_cnt = 0;
2637 	mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2638 	mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
2639 	state->id_tx_busy = 0;
2640 
2641 	state->id_rx_list.dl_head = NULL;
2642 	state->id_rx_list.dl_tail = NULL;
2643 	state->id_rx_list.dl_bufs_outstanding = 0;
2644 	state->id_rx_list.dl_cnt = 0;
2645 	mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2646 	mutex_init(&state->id_rxpost_lock, NULL, MUTEX_DRIVER, NULL);
2647 
2648 	(void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip));
2649 	state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
2650 	    0, NULL, NULL, NULL, NULL, NULL, 0);
2651 
2652 	mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL);
2653 	cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL);
2654 
2655 	return (DDI_SUCCESS);
2656 }
2657 
2658 /*
2659  * Post ibt_detach() driver deconstruction
2660  */
2661 static void
2662 ibd_state_fini(ibd_state_t *state)
2663 {
2664 	cv_destroy(&state->id_macst_cv);
2665 	mutex_destroy(&state->id_macst_lock);
2666 
2667 	kmem_cache_destroy(state->id_req_kmc);
2668 
2669 	mutex_destroy(&state->id_rxpost_lock);
2670 	mutex_destroy(&state->id_rx_list.dl_mutex);
2671 
2672 	mutex_destroy(&state->id_txpost_lock);
2673 	mutex_destroy(&state->id_tx_list.dl_mutex);
2674 
2675 	mutex_destroy(&state->id_sched_lock);
2676 	mutex_destroy(&state->id_cq_poll_lock);
2677 
2678 	cv_destroy(&state->id_trap_cv);
2679 	mutex_destroy(&state->id_trap_lock);
2680 	mutex_destroy(&state->id_link_mutex);
2681 }
2682 
2683 /*
2684  * Fetch link speed from SA for snmp ifspeed reporting.
2685  */
2686 static uint64_t
2687 ibd_get_portspeed(ibd_state_t *state)
2688 {
2689 	int			ret;
2690 	ibt_path_info_t		path;
2691 	ibt_path_attr_t		path_attr;
2692 	uint8_t			num_paths;
2693 	uint64_t		ifspeed;
2694 
2695 	/*
2696 	 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire
2697 	 * translates to 2 Gbps data rate. Thus, 1X single data rate is
2698 	 * 2000000000. Start with that as default.
2699 	 */
2700 	ifspeed = 2000000000;
2701 
2702 	bzero(&path_attr, sizeof (path_attr));
2703 
2704 	/*
2705 	 * Get the port speed from Loopback path information.
2706 	 */
2707 	path_attr.pa_dgids = &state->id_sgid;
2708 	path_attr.pa_num_dgids = 1;
2709 	path_attr.pa_sgid = state->id_sgid;
2710 
2711 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
2712 	    &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
2713 		goto earlydone;
2714 
2715 	if (num_paths < 1)
2716 		goto earlydone;
2717 
2718 	/*
2719 	 * In case SA does not return an expected value, report the default
2720 	 * speed as 1X.
2721 	 */
2722 	ret = 1;
2723 	switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
2724 		case IBT_SRATE_2:	/*  1X SDR i.e 2.5 Gbps */
2725 			ret = 1;
2726 			break;
2727 		case IBT_SRATE_10:	/*  4X SDR or 1X QDR i.e 10 Gbps */
2728 			ret = 4;
2729 			break;
2730 		case IBT_SRATE_30:	/* 12X SDR i.e 30 Gbps */
2731 			ret = 12;
2732 			break;
2733 		case IBT_SRATE_5:	/*  1X DDR i.e  5 Gbps */
2734 			ret = 2;
2735 			break;
2736 		case IBT_SRATE_20:	/*  4X DDR or 8X SDR i.e 20 Gbps */
2737 			ret = 8;
2738 			break;
2739 		case IBT_SRATE_40:	/*  8X DDR or 4X QDR i.e 40 Gbps */
2740 			ret = 16;
2741 			break;
2742 		case IBT_SRATE_60:	/* 12X DDR i.e 60 Gbps */
2743 			ret = 24;
2744 			break;
2745 		case IBT_SRATE_80:	/*  8X QDR i.e 80 Gbps */
2746 			ret = 32;
2747 			break;
2748 		case IBT_SRATE_120:	/* 12X QDR i.e 120 Gbps */
2749 			ret = 48;
2750 			break;
2751 	}
2752 
2753 	ifspeed *= ret;
2754 
2755 earlydone:
2756 	return (ifspeed);
2757 }
2758 
2759 /*
2760  * Search input mcg list (id_mc_full or id_mc_non) for an entry
2761  * representing the input mcg mgid.
2762  */
2763 static ibd_mce_t *
2764 ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
2765 {
2766 	ibd_mce_t *ptr = list_head(mlist);
2767 
2768 	/*
2769 	 * Do plain linear search.
2770 	 */
2771 	while (ptr != NULL) {
2772 		if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
2773 		    sizeof (ib_gid_t)) == 0)
2774 			return (ptr);
2775 		ptr = list_next(mlist, ptr);
2776 	}
2777 	return (NULL);
2778 }
2779 
2780 /*
2781  * Execute IBA JOIN.
2782  */
2783 static ibt_status_t
2784 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
2785 {
2786 	ibt_mcg_attr_t mcg_attr;
2787 
2788 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
2789 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
2790 	mcg_attr.mc_mgid = mgid;
2791 	mcg_attr.mc_join_state = mce->mc_jstate;
2792 	mcg_attr.mc_scope = state->id_scope;
2793 	mcg_attr.mc_pkey = state->id_pkey;
2794 	mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
2795 	mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
2796 	mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
2797 	return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
2798 	    NULL, NULL));
2799 }
2800 
/*
 * This code JOINs the port in the proper way (depending on the join
 * state) so that IBA fabric will forward mcg packets to/from the port.
 * It also attaches the QPN to the mcg so it can receive those mcg
 * packets. This code makes sure not to attach the mcg to the QP if
 * that has been previously done due to the mcg being joined with a
 * different join state, even though this is not required by SWG_0216,
 * refid 3610.
 *
 * Returns the ibd_mce_t tracking the membership on success, or NULL
 * if either the IBA join or the QP attach fails.
 */
static ibd_mce_t *
ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
{
	ibt_status_t ibt_status;
	ibd_mce_t *mce, *tmce, *omce = NULL;
	boolean_t do_attach = B_TRUE;

	DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
	    jstate, mgid.gid_prefix, mgid.gid_guid);

	/*
	 * For enable_multicast Full member joins, we need to do some
	 * extra work. If there is already an mce on the list that
	 * indicates full membership, that means the membership has
	 * not yet been dropped (since the disable_multicast was issued)
	 * because there are pending Tx's to the mcg; in that case, just
	 * mark the mce not to be reaped when the Tx completion queues
	 * an async reap operation.
	 *
	 * If there is already an mce on the list indicating sendonly
	 * membership, try to promote to full membership. Be careful
	 * not to deallocate the old mce, since there might be an AH
	 * pointing to it; instead, update the old mce with new data
	 * that tracks the full membership.
	 */
	if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
	    IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
		if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
			ASSERT(omce->mc_fullreap);
			omce->mc_fullreap = B_FALSE;
			return (omce);
		} else {
			ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
		}
	}

	/*
	 * Allocate the ibd_mce_t to track this JOIN.
	 */
	mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
	mce->mc_fullreap = B_FALSE;
	mce->mc_jstate = jstate;

	if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
		DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
		    ibt_status);
		kmem_free(mce, sizeof (ibd_mce_t));
		return (NULL);
	}

	/*
	 * Is an IBA attach required? Not if the interface is already joined
	 * to the mcg in a different appropriate join state.
	 */
	if (jstate == IB_MC_JSTATE_NON) {
		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
			do_attach = B_FALSE;
	} else if (jstate == IB_MC_JSTATE_FULL) {
		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
			do_attach = B_FALSE;
	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
		/* A sendonly join never needs the QP attached for Rx */
		do_attach = B_FALSE;
	}

	if (do_attach) {
		/*
		 * Do the IBA attach.
		 */
		DPRINT(10, "ibd_join_group: ibt_attach_mcg \n");
		if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
		    &mce->mc_info)) != IBT_SUCCESS) {
			DPRINT(10, "ibd_join_group : failed qp attachment "
			    "%d\n", ibt_status);
			/*
			 * NOTE that we should probably preserve the join info
			 * in the list and later try to leave again at detach
			 * time.
			 */
			(void) ibt_leave_mcg(state->id_sgid, mgid,
			    state->id_sgid, jstate);
			kmem_free(mce, sizeof (ibd_mce_t));
			return (NULL);
		}
	}

	/*
	 * Insert the ibd_mce_t in the proper list. Note that sendonly
	 * and full-member mce's both go on the "full" list.
	 */
	if (jstate == IB_MC_JSTATE_NON) {
		IBD_MCACHE_INSERT_NON(state, mce);
	} else {
		/*
		 * Set up the mc_req fields used for reaping the
		 * mcg in case of delayed tx completion (see
		 * ibd_tx_cleanup()). Also done for sendonly join in
		 * case we are promoted to fullmembership later and
		 * keep using the same mce.
		 */
		mce->mc_req.rq_gid = mgid;
		mce->mc_req.rq_ptr = mce;
		/*
		 * Check whether this is the case of trying to join
		 * full member, and we were already joined send only.
		 * We try to drop our SendOnly membership, but it is
		 * possible that the mcg does not exist anymore (and
		 * the subnet trap never reached us), so the leave
		 * operation might fail.
		 */
		if (omce != NULL) {
			(void) ibt_leave_mcg(state->id_sgid, mgid,
			    state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
			omce->mc_jstate = IB_MC_JSTATE_FULL;
			bcopy(&mce->mc_info, &omce->mc_info,
			    sizeof (ibt_mcg_info_t));
			kmem_free(mce, sizeof (ibd_mce_t));
			return (omce);
		}
		mutex_enter(&state->id_mc_mutex);
		IBD_MCACHE_INSERT_FULL(state, mce);
		mutex_exit(&state->id_mc_mutex);
	}

	return (mce);
}
2935 
2936 /*
2937  * Called during port up event handling to attempt to reacquire full
2938  * membership to an mcg. Stripped down version of ibd_join_group().
2939  * Note that it is possible that the mcg might have gone away, and
2940  * gets recreated at this point.
2941  */
2942 static void
2943 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
2944 {
2945 	ib_gid_t mgid;
2946 
2947 	/*
2948 	 * If the mc_fullreap flag is set, or this join fails, a subsequent
2949 	 * reap/leave is going to try to leave the group. We could prevent
2950 	 * that by adding a boolean flag into ibd_mce_t, if required.
2951 	 */
2952 	if (mce->mc_fullreap)
2953 		return;
2954 
2955 	mgid = mce->mc_info.mc_adds_vect.av_dgid;
2956 
2957 	DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
2958 	    mgid.gid_guid);
2959 
2960 	if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
2961 		ibd_print_warn(state, "Failure on port up to rejoin "
2962 		    "multicast gid %016llx:%016llx",
2963 		    (u_longlong_t)mgid.gid_prefix,
2964 		    (u_longlong_t)mgid.gid_guid);
2965 }
2966 
/*
 * This code handles delayed Tx completion cleanups for mcg's to which
 * disable_multicast has been issued, regular mcg related cleanups during
 * disable_multicast, disable_promiscous and mcg traps, as well as
 * cleanups during driver detach time. Depending on the join state,
 * it deletes the mce from the appropriate list and issues the IBA
 * leave/detach; except in the disable_multicast case when the mce
 * is left on the active list for a subsequent Tx completion cleanup.
 *
 * Unless this routine returns early (fullreap check below), the mce
 * is freed here; the caller must not reference it afterwards.
 */
static void
ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
    uint8_t jstate)
{
	ibd_mce_t *tmce;
	boolean_t do_detach = B_TRUE;

	/*
	 * Before detaching, we must check whether the other list
	 * contains the mcg; if we detach blindly, the consumer
	 * who set up the other list will also stop receiving
	 * traffic.
	 */
	if (jstate == IB_MC_JSTATE_FULL) {
		/*
		 * The following check is only relevant while coming
		 * from the Tx completion path in the reap case.
		 */
		if (!mce->mc_fullreap)
			return;
		mutex_enter(&state->id_mc_mutex);
		IBD_MCACHE_PULLOUT_FULL(state, mce);
		mutex_exit(&state->id_mc_mutex);
		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
			do_detach = B_FALSE;
	} else if (jstate == IB_MC_JSTATE_NON) {
		IBD_MCACHE_PULLOUT_NON(state, mce);
		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
			do_detach = B_FALSE;
	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
		/* Sendonly mce's live on the full list (see ibd_join_group) */
		mutex_enter(&state->id_mc_mutex);
		IBD_MCACHE_PULLOUT_FULL(state, mce);
		mutex_exit(&state->id_mc_mutex);
		do_detach = B_FALSE;
	}

	/*
	 * If we are reacting to a mcg trap and leaving our sendonly or
	 * non membership, the mcg is possibly already gone, so attempting
	 * to leave might fail. On the other hand, we must try to leave
	 * anyway, since this might be a trap from long ago, and we could
	 * have potentially sendonly joined to a recent incarnation of
	 * the mcg and are about to loose track of this information.
	 */
	if (do_detach) {
		DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
		(void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
	}

	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
	kmem_free(mce, sizeof (ibd_mce_t));
}
3030 
/*
 * Async code executed due to multicast and promiscuous disable requests
 * and mcg trap handling; also executed during driver detach. Mostly, a
 * leave and detach is done; except for the fullmember case when Tx
 * requests are pending, whence arrangements are made for subsequent
 * cleanup on Tx completion.
 */
static void
ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
{
	ipoib_mac_t mcmac;
	boolean_t recycled;
	ibd_mce_t *mce;

	DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
	    jstate, mgid.gid_prefix, mgid.gid_guid);

	if (jstate == IB_MC_JSTATE_NON) {
		/* NON membership has no AH; it can always be reaped now */
		recycled = B_TRUE;
		mce = IBD_MCACHE_FIND_NON(state, mgid);
		/*
		 * In case we are handling a mcg trap, we might not find
		 * the mcg in the non list.
		 */
		if (mce == NULL) {
			return;
		}
	} else {
		mce = IBD_MCACHE_FIND_FULL(state, mgid);

		/*
		 * In case we are handling a mcg trap, make sure the trap
		 * is not arriving late; if we have an mce that indicates
		 * that we are already a fullmember, that would be a clear
		 * indication that the trap arrived late (ie, is for a
		 * previous incarnation of the mcg).
		 */
		if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
			if ((mce == NULL) || (mce->mc_jstate ==
			    IB_MC_JSTATE_FULL)) {
				return;
			}
		} else {
			ASSERT(jstate == IB_MC_JSTATE_FULL);

			/*
			 * If join group failed, mce will be NULL here.
			 * This is because in GLDv3 driver, set multicast
			 *  will always return success.
			 */
			if (mce == NULL) {
				return;
			}

			/* Flag the mce for reaping on last Tx completion */
			mce->mc_fullreap = B_TRUE;
		}

		/*
		 * If no pending Tx's remain that reference the AH
		 * for the mcg, recycle it from active to free list.
		 * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
		 * so the last completing Tx will cause an async reap
		 * operation to be invoked, at which time we will drop our
		 * membership to the mcg so that the pending Tx's complete
		 * successfully. Refer to comments on "AH and MCE active
		 * list manipulation" at top of this file. The lock protects
		 * against Tx fast path and Tx cleanup code.
		 */
		mutex_enter(&state->id_ac_mutex);
		ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
		recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
		    IB_MC_JSTATE_SEND_ONLY_NON));
		mutex_exit(&state->id_ac_mutex);
	}

	/*
	 * Only reap (leave/detach and free the mce) right away if the
	 * AH could be recycled; otherwise the reap happens later from
	 * the Tx completion path.
	 */
	if (recycled) {
		DPRINT(2, "ibd_leave_group : leave_group reaping : "
		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
		ibd_async_reap_group(state, mce, mgid, jstate);
	}
}
3112 
/*
 * Find the broadcast address as defined by IPoIB; implicitly
 * determines the IBA scope, mtu, tclass etc of the link the
 * interface is going to be a member of.
 *
 * On success, state->id_mcinfo, id_mgid, id_scope and id_mtu are
 * filled in. If ibd_create_broadcast_group is set and no broadcast
 * group exists, one is created here (and left joined on failure paths
 * only after an explicit leave).
 */
static ibt_status_t
ibd_find_bgroup(ibd_state_t *state)
{
	ibt_mcg_attr_t mcg_attr;
	uint_t numg;
	uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
	    IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
	    IB_MC_SCOPE_GLOBAL };
	int i, mcgmtu;
	boolean_t found = B_FALSE;
	int ret;
	ibt_mcg_info_t mcg_info;

	/* Set when we create the group ourselves; checked on failure paths */
	state->id_bgroup_created = B_FALSE;

query_bcast_grp:
	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
	mcg_attr.mc_pkey = state->id_pkey;
	state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;

	/* Try each scope in turn until the broadcast group is found */
	for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
		state->id_scope = mcg_attr.mc_scope = scopes[i];

		/*
		 * Look for the IPoIB broadcast group.
		 */
		state->id_mgid.gid_prefix =
		    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
		    ((uint64_t)state->id_scope << 48) |
		    ((uint32_t)(state->id_pkey << 16)));
		mcg_attr.mc_mgid = state->id_mgid;
		if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
		    &state->id_mcinfo, &numg) == IBT_SUCCESS) {
			found = B_TRUE;
			break;
		}
	}

	if (!found) {
		if (ibd_create_broadcast_group) {
			/*
			 * If we created the broadcast group, but failed to
			 * find it, we can't do anything except leave the
			 * one we created and return failure.
			 */
			if (state->id_bgroup_created) {
				ibd_print_warn(state, "IPoIB broadcast group "
				    "absent. Unable to query after create.");
				goto find_bgroup_fail;
			}

			/*
			 * Create the ipoib broadcast group if it didn't exist
			 */
			bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
			mcg_attr.mc_qkey = IBD_DEFAULT_QKEY;
			mcg_attr.mc_join_state = IB_MC_JSTATE_FULL;
			mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL;
			mcg_attr.mc_pkey = state->id_pkey;
			mcg_attr.mc_flow = 0;
			mcg_attr.mc_sl = 0;
			mcg_attr.mc_tclass = 0;
			state->id_mgid.gid_prefix =
			    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
			    ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) |
			    ((uint32_t)(state->id_pkey << 16)));
			mcg_attr.mc_mgid = state->id_mgid;

			if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr,
			    &mcg_info, NULL, NULL)) != IBT_SUCCESS) {
				ibd_print_warn(state, "IPoIB broadcast group "
				    "absent, create failed: ret = %d\n", ret);
				state->id_bgroup_created = B_FALSE;
				return (IBT_FAILURE);
			}
			state->id_bgroup_created = B_TRUE;
			/* Re-run the query now that the group exists */
			goto query_bcast_grp;
		} else {
			ibd_print_warn(state, "IPoIB broadcast group absent");
			return (IBT_FAILURE);
		}
	}

	/*
	 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu.
	 */
	mcgmtu = (128 << state->id_mcinfo->mc_mtu);
	if (state->id_mtu < mcgmtu) {
		ibd_print_warn(state, "IPoIB broadcast group MTU %d "
		    "greater than port's maximum MTU %d", mcgmtu,
		    state->id_mtu);
		ibt_free_mcg_info(state->id_mcinfo, 1);
		goto find_bgroup_fail;
	}
	state->id_mtu = mcgmtu;

	return (IBT_SUCCESS);

find_bgroup_fail:
	/* Undo a group we created ourselves before reporting failure */
	if (state->id_bgroup_created) {
		(void) ibt_leave_mcg(state->id_sgid,
		    mcg_info.mc_adds_vect.av_dgid, state->id_sgid,
		    IB_MC_JSTATE_FULL);
	}

	return (IBT_FAILURE);
}
3225 
3226 static int
3227 ibd_alloc_tx_copybufs(ibd_state_t *state)
3228 {
3229 	ibt_mr_attr_t mem_attr;
3230 
3231 	/*
3232 	 * Allocate one big chunk for all regular tx copy bufs
3233 	 */
3234 	state->id_tx_buf_sz = state->id_mtu;
3235 	if (state->id_lso_policy && state->id_lso_capable &&
3236 	    (IBD_TX_BUF_SZ > state->id_mtu)) {
3237 		state->id_tx_buf_sz = IBD_TX_BUF_SZ;
3238 	}
3239 
3240 	state->id_tx_bufs = kmem_zalloc(state->id_num_swqe *
3241 	    state->id_tx_buf_sz, KM_SLEEP);
3242 
3243 	/*
3244 	 * Do one memory registration on the entire txbuf area
3245 	 */
3246 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs;
3247 	mem_attr.mr_len = state->id_num_swqe * state->id_tx_buf_sz;
3248 	mem_attr.mr_as = NULL;
3249 	mem_attr.mr_flags = IBT_MR_SLEEP;
3250 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3251 	    &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) {
3252 		DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed");
3253 		kmem_free(state->id_tx_bufs,
3254 		    state->id_num_swqe * state->id_tx_buf_sz);
3255 		state->id_tx_bufs = NULL;
3256 		return (DDI_FAILURE);
3257 	}
3258 
3259 	return (DDI_SUCCESS);
3260 }
3261 
3262 static int
3263 ibd_alloc_tx_lsobufs(ibd_state_t *state)
3264 {
3265 	ibt_mr_attr_t mem_attr;
3266 	ibd_lsobuf_t *buflist;
3267 	ibd_lsobuf_t *lbufp;
3268 	ibd_lsobuf_t *tail;
3269 	ibd_lsobkt_t *bktp;
3270 	uint8_t *membase;
3271 	uint8_t *memp;
3272 	uint_t memsz;
3273 	int i;
3274 
3275 	/*
3276 	 * Allocate the lso bucket
3277 	 */
3278 	bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP);
3279 
3280 	/*
3281 	 * Allocate the entire lso memory and register it
3282 	 */
3283 	memsz = IBD_NUM_LSO_BUFS * IBD_LSO_BUFSZ;
3284 	membase = kmem_zalloc(memsz, KM_SLEEP);
3285 
3286 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase;
3287 	mem_attr.mr_len = memsz;
3288 	mem_attr.mr_as = NULL;
3289 	mem_attr.mr_flags = IBT_MR_SLEEP;
3290 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl,
3291 	    &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) {
3292 		DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed");
3293 		kmem_free(membase, memsz);
3294 		kmem_free(bktp, sizeof (ibd_lsobkt_t));
3295 		return (DDI_FAILURE);
3296 	}
3297 
3298 	/*
3299 	 * Now allocate the buflist.  Note that the elements in the buflist and
3300 	 * the buffers in the lso memory have a permanent 1-1 relation, so we
3301 	 * can always derive the address of a buflist entry from the address of
3302 	 * an lso buffer.
3303 	 */
3304 	buflist = kmem_zalloc(IBD_NUM_LSO_BUFS * sizeof (ibd_lsobuf_t),
3305 	    KM_SLEEP);
3306 
3307 	/*
3308 	 * Set up the lso buf chain
3309 	 */
3310 	memp = membase;
3311 	lbufp = buflist;
3312 	for (i = 0; i < IBD_NUM_LSO_BUFS; i++) {
3313 		lbufp->lb_isfree = 1;
3314 		lbufp->lb_buf = memp;
3315 		lbufp->lb_next = lbufp + 1;
3316 
3317 		tail = lbufp;
3318 
3319 		memp += IBD_LSO_BUFSZ;
3320 		lbufp++;
3321 	}
3322 	tail->lb_next = NULL;
3323 
3324 	/*
3325 	 * Set up the LSO buffer information in ibd state
3326 	 */
3327 	bktp->bkt_bufl = buflist;
3328 	bktp->bkt_free_head = buflist;
3329 	bktp->bkt_mem = membase;
3330 	bktp->bkt_nelem = IBD_NUM_LSO_BUFS;
3331 	bktp->bkt_nfree = bktp->bkt_nelem;
3332 
3333 	state->id_lso = bktp;
3334 
3335 	return (DDI_SUCCESS);
3336 }
3337 
3338 /*
3339  * Statically allocate Tx buffer list(s).
3340  */
3341 static int
3342 ibd_init_txlist(ibd_state_t *state)
3343 {
3344 	ibd_swqe_t *swqe;
3345 	ibt_lkey_t lkey;
3346 	int i;
3347 
3348 	if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS)
3349 		return (DDI_FAILURE);
3350 
3351 	if (state->id_lso_policy && state->id_lso_capable) {
3352 		if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS)
3353 			state->id_lso_policy = B_FALSE;
3354 	}
3355 
3356 	/*
3357 	 * Allocate and setup the swqe list
3358 	 */
3359 	lkey = state->id_tx_mr_desc.md_lkey;
3360 	for (i = 0; i < state->id_num_swqe; i++) {
3361 		if (ibd_alloc_swqe(state, &swqe, i, lkey) != DDI_SUCCESS) {
3362 			DPRINT(10, "ibd_init_txlist: ibd_alloc_swqe failed");
3363 			ibd_fini_txlist(state);
3364 			return (DDI_FAILURE);
3365 		}
3366 
3367 		/* add to list */
3368 		state->id_tx_list.dl_cnt++;
3369 		if (state->id_tx_list.dl_head == NULL) {
3370 			swqe->swqe_prev = NULL;
3371 			swqe->swqe_next = NULL;
3372 			state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
3373 			state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
3374 		} else {
3375 			swqe->swqe_prev = state->id_tx_list.dl_tail;
3376 			swqe->swqe_next = NULL;
3377 			state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe);
3378 			state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
3379 		}
3380 	}
3381 
3382 	return (DDI_SUCCESS);
3383 }
3384 
3385 static int
3386 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p,
3387     uint32_t *nds_p)
3388 {
3389 	ibd_lsobkt_t *bktp;
3390 	ibd_lsobuf_t *lbufp;
3391 	ibd_lsobuf_t *nextp;
3392 	ibt_lkey_t lso_lkey;
3393 	uint_t frag_sz;
3394 	uint_t num_needed;
3395 	int i;
3396 
3397 	ASSERT(sgl_p != NULL);
3398 	ASSERT(nds_p != NULL);
3399 	ASSERT(req_sz != 0);
3400 
3401 	/*
3402 	 * Determine how many bufs we'd need for the size requested
3403 	 */
3404 	num_needed = req_sz / IBD_LSO_BUFSZ;
3405 	if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0)
3406 		num_needed++;
3407 
3408 	mutex_enter(&state->id_lso_lock);
3409 
3410 	/*
3411 	 * If we don't have enough lso bufs, return failure
3412 	 */
3413 	ASSERT(state->id_lso != NULL);
3414 	bktp = state->id_lso;
3415 	if (bktp->bkt_nfree < num_needed) {
3416 		mutex_exit(&state->id_lso_lock);
3417 		return (-1);
3418 	}
3419 
3420 	/*
3421 	 * Pick the first 'num_needed' bufs from the free list
3422 	 */
3423 	lso_lkey = bktp->bkt_mr_desc.md_lkey;
3424 	lbufp = bktp->bkt_free_head;
3425 	for (i = 0; i < num_needed; i++) {
3426 		ASSERT(lbufp->lb_isfree != 0);
3427 		ASSERT(lbufp->lb_buf != NULL);
3428 
3429 		nextp = lbufp->lb_next;
3430 
3431 		sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf;
3432 		sgl_p[i].ds_key = lso_lkey;
3433 		sgl_p[i].ds_len = IBD_LSO_BUFSZ;
3434 
3435 		lbufp->lb_isfree = 0;
3436 		lbufp->lb_next = NULL;
3437 
3438 		lbufp = nextp;
3439 	}
3440 	bktp->bkt_free_head = lbufp;
3441 
3442 	/*
3443 	 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need
3444 	 * to adjust the last sgl entry's length. Since we know we need atleast
3445 	 * one, the i-1 use below is ok.
3446 	 */
3447 	if (frag_sz) {
3448 		sgl_p[i-1].ds_len = frag_sz;
3449 	}
3450 
3451 	/*
3452 	 * Update nfree count and return
3453 	 */
3454 	bktp->bkt_nfree -= num_needed;
3455 
3456 	mutex_exit(&state->id_lso_lock);
3457 
3458 	*nds_p = num_needed;
3459 
3460 	return (0);
3461 }
3462 
3463 static void
3464 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds)
3465 {
3466 	ibd_lsobkt_t *bktp;
3467 	ibd_lsobuf_t *lbufp;
3468 	uint8_t *lso_mem_end;
3469 	uint_t ndx;
3470 	int i;
3471 
3472 	mutex_enter(&state->id_lso_lock);
3473 
3474 	bktp = state->id_lso;
3475 	ASSERT(bktp != NULL);
3476 
3477 	lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ;
3478 	for (i = 0; i < nds; i++) {
3479 		uint8_t *va;
3480 
3481 		va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va;
3482 		ASSERT(va >= bktp->bkt_mem && va < lso_mem_end);
3483 
3484 		/*
3485 		 * Figure out the buflist element this sgl buffer corresponds
3486 		 * to and put it back at the head
3487 		 */
3488 		ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
3489 		lbufp = bktp->bkt_bufl + ndx;
3490 
3491 		ASSERT(lbufp->lb_isfree == 0);
3492 		ASSERT(lbufp->lb_buf == va);
3493 
3494 		lbufp->lb_isfree = 1;
3495 		lbufp->lb_next = bktp->bkt_free_head;
3496 		bktp->bkt_free_head = lbufp;
3497 	}
3498 	bktp->bkt_nfree += nds;
3499 
3500 	mutex_exit(&state->id_lso_lock);
3501 }
3502 
3503 static void
3504 ibd_free_tx_copybufs(ibd_state_t *state)
3505 {
3506 	/*
3507 	 * Unregister txbuf mr
3508 	 */
3509 	if (ibt_deregister_mr(state->id_hca_hdl,
3510 	    state->id_tx_mr_hdl) != IBT_SUCCESS) {
3511 		DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed");
3512 	}
3513 	state->id_tx_mr_hdl = NULL;
3514 
3515 	/*
3516 	 * Free txbuf memory
3517 	 */
3518 	kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz);
3519 	state->id_tx_bufs = NULL;
3520 }
3521 
3522 static void
3523 ibd_free_tx_lsobufs(ibd_state_t *state)
3524 {
3525 	ibd_lsobkt_t *bktp;
3526 
3527 	mutex_enter(&state->id_lso_lock);
3528 
3529 	if ((bktp = state->id_lso) == NULL) {
3530 		mutex_exit(&state->id_lso_lock);
3531 		return;
3532 	}
3533 
3534 	/*
3535 	 * First, free the buflist
3536 	 */
3537 	ASSERT(bktp->bkt_bufl != NULL);
3538 	kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t));
3539 
3540 	/*
3541 	 * Unregister the LSO memory and free it
3542 	 */
3543 	ASSERT(bktp->bkt_mr_hdl != NULL);
3544 	if (ibt_deregister_mr(state->id_hca_hdl,
3545 	    bktp->bkt_mr_hdl) != IBT_SUCCESS) {
3546 		DPRINT(10,
3547 		    "ibd_free_lsobufs: ibt_deregister_mr failed");
3548 	}
3549 	ASSERT(bktp->bkt_mem);
3550 	kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ);
3551 
3552 	/*
3553 	 * Finally free the bucket
3554 	 */
3555 	kmem_free(bktp, sizeof (ibd_lsobkt_t));
3556 	state->id_lso = NULL;
3557 
3558 	mutex_exit(&state->id_lso_lock);
3559 }
3560 
3561 /*
3562  * Free the statically allocated Tx buffer list.
3563  */
3564 static void
3565 ibd_fini_txlist(ibd_state_t *state)
3566 {
3567 	ibd_swqe_t *node;
3568 
3569 	/*
3570 	 * Free the allocated swqes
3571 	 */
3572 	mutex_enter(&state->id_tx_list.dl_mutex);
3573 	while (state->id_tx_list.dl_head != NULL) {
3574 		node = WQE_TO_SWQE(state->id_tx_list.dl_head);
3575 		state->id_tx_list.dl_head = node->swqe_next;
3576 		ASSERT(state->id_tx_list.dl_cnt > 0);
3577 		state->id_tx_list.dl_cnt--;
3578 		ibd_free_swqe(state, node);
3579 	}
3580 	mutex_exit(&state->id_tx_list.dl_mutex);
3581 
3582 	ibd_free_tx_lsobufs(state);
3583 	ibd_free_tx_copybufs(state);
3584 }
3585 
3586 /*
3587  * Allocate a single send wqe and register it so it is almost
3588  * ready to be posted to the hardware.
3589  */
3590 static int
3591 ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe, int ndx, ibt_lkey_t lkey)
3592 {
3593 	ibd_swqe_t *swqe;
3594 
3595 	swqe = kmem_zalloc(sizeof (ibd_swqe_t), KM_SLEEP);
3596 	*wqe = swqe;
3597 
3598 	swqe->swqe_type = IBD_WQE_SEND;
3599 	swqe->swqe_next = NULL;
3600 	swqe->swqe_prev = NULL;
3601 	swqe->swqe_im_mblk = NULL;
3602 
3603 	swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
3604 	    (state->id_tx_bufs + ndx * state->id_tx_buf_sz);
3605 	swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
3606 	swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
3607 
3608 	swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
3609 	swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL;
3610 	swqe->w_swr.wr_trans = IBT_UD_SRV;
3611 
3612 	/* These are set in send */
3613 	swqe->w_swr.wr_nds = 0;
3614 	swqe->w_swr.wr_sgl = NULL;
3615 	swqe->w_swr.wr_opcode = IBT_WRC_SEND;
3616 
3617 	return (DDI_SUCCESS);
3618 }
3619 
3620 /*
3621  * Free an allocated send wqe.
3622  */
3623 /*ARGSUSED*/
3624 static void
3625 ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe)
3626 {
3627 	kmem_free(swqe, sizeof (ibd_swqe_t));
3628 }
3629 
3630 /*
3631  * Post a rwqe to the hardware and add it to the Rx list. The
3632  * "recycle" parameter indicates whether an old rwqe is being
3633  * recycled, or this is a new one.
3634  */
3635 static int
3636 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle)
3637 {
3638 	ibt_status_t ibt_status;
3639 
3640 	if (recycle == B_FALSE) {
3641 		mutex_enter(&state->id_rx_list.dl_mutex);
3642 		if (state->id_rx_list.dl_head == NULL) {
3643 			rwqe->rwqe_prev = NULL;
3644 			rwqe->rwqe_next = NULL;
3645 			state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe);
3646 			state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe);
3647 		} else {
3648 			rwqe->rwqe_prev = state->id_rx_list.dl_tail;
3649 			rwqe->rwqe_next = NULL;
3650 			state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe);
3651 			state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe);
3652 		}
3653 		mutex_exit(&state->id_rx_list.dl_mutex);
3654 	}
3655 
3656 	mutex_enter(&state->id_rxpost_lock);
3657 	if (state->id_rx_busy) {
3658 		rwqe->w_post_link = NULL;
3659 		if (state->id_rx_head)
3660 			*(state->id_rx_tailp) = (ibd_wqe_t *)rwqe;
3661 		else
3662 			state->id_rx_head = rwqe;
3663 		state->id_rx_tailp = &(rwqe->w_post_link);
3664 	} else {
3665 		state->id_rx_busy = 1;
3666 		do {
3667 			mutex_exit(&state->id_rxpost_lock);
3668 
3669 			/*
3670 			 * Here we should add dl_cnt before post recv, because
3671 			 * we would have to make sure dl_cnt is updated before
3672 			 * the corresponding ibd_process_rx() is called.
3673 			 */
3674 			atomic_add_32(&state->id_rx_list.dl_cnt, 1);
3675 
3676 			ibt_status = ibt_post_recv(state->id_chnl_hdl,
3677 			    &rwqe->w_rwr, 1, NULL);
3678 			if (ibt_status != IBT_SUCCESS) {
3679 				(void) atomic_add_32_nv(
3680 				    &state->id_rx_list.dl_cnt, -1);
3681 				ibd_print_warn(state, "ibd_post_recv: "
3682 				    "posting failed, ret=%d", ibt_status);
3683 				return (DDI_FAILURE);
3684 			}
3685 
3686 			mutex_enter(&state->id_rxpost_lock);
3687 			rwqe = state->id_rx_head;
3688 			if (rwqe) {
3689 				state->id_rx_head =
3690 				    (ibd_rwqe_t *)(rwqe->w_post_link);
3691 			}
3692 		} while (rwqe);
3693 		state->id_rx_busy = 0;
3694 	}
3695 	mutex_exit(&state->id_rxpost_lock);
3696 
3697 	return (DDI_SUCCESS);
3698 }
3699 
3700 /*
3701  * Allocate the statically allocated Rx buffer list.
3702  */
3703 static int
3704 ibd_init_rxlist(ibd_state_t *state)
3705 {
3706 	ibd_rwqe_t *rwqe;
3707 	int i;
3708 
3709 	for (i = 0; i < state->id_num_rwqe; i++) {
3710 		if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) {
3711 			ibd_fini_rxlist(state);
3712 			return (DDI_FAILURE);
3713 		}
3714 
3715 		if (ibd_post_recv(state, rwqe, B_FALSE) == DDI_FAILURE) {
3716 			ibd_free_rwqe(state, rwqe);
3717 			ibd_fini_rxlist(state);
3718 			return (DDI_FAILURE);
3719 		}
3720 	}
3721 
3722 	return (DDI_SUCCESS);
3723 }
3724 
3725 /*
3726  * Free the statically allocated Rx buffer list.
3727  *
3728  */
3729 static void
3730 ibd_fini_rxlist(ibd_state_t *state)
3731 {
3732 	ibd_rwqe_t *node;
3733 
3734 	mutex_enter(&state->id_rx_list.dl_mutex);
3735 	while (state->id_rx_list.dl_head != NULL) {
3736 		node = WQE_TO_RWQE(state->id_rx_list.dl_head);
3737 		state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next;
3738 		ASSERT(state->id_rx_list.dl_cnt > 0);
3739 		state->id_rx_list.dl_cnt--;
3740 
3741 		ibd_free_rwqe(state, node);
3742 	}
3743 	mutex_exit(&state->id_rx_list.dl_mutex);
3744 }
3745 
3746 /*
3747  * Allocate a single recv wqe and register it so it is almost
3748  * ready to be posted to the hardware.
3749  */
3750 static int
3751 ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe)
3752 {
3753 	ibt_mr_attr_t mem_attr;
3754 	ibd_rwqe_t *rwqe;
3755 
3756 	if ((rwqe = kmem_zalloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) {
3757 		DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc");
3758 		return (DDI_FAILURE);
3759 	}
3760 	*wqe = rwqe;
3761 	rwqe->rwqe_type = IBD_WQE_RECV;
3762 	rwqe->w_state = state;
3763 	rwqe->rwqe_next = NULL;
3764 	rwqe->rwqe_prev = NULL;
3765 	rwqe->w_freeing_wqe = B_FALSE;
3766 	rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
3767 	rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
3768 
3769 	rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu +
3770 	    IPOIB_GRH_SIZE, KM_NOSLEEP);
3771 	if (rwqe->rwqe_copybuf.ic_bufaddr == NULL) {
3772 		DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc");
3773 		kmem_free(rwqe, sizeof (ibd_rwqe_t));
3774 		return (DDI_FAILURE);
3775 	}
3776 
3777 	if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
3778 	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) ==
3779 	    NULL) {
3780 		DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()");
3781 		kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
3782 		    state->id_mtu + IPOIB_GRH_SIZE);
3783 		rwqe->rwqe_copybuf.ic_bufaddr = NULL;
3784 		kmem_free(rwqe, sizeof (ibd_rwqe_t));
3785 		return (DDI_FAILURE);
3786 	}
3787 
3788 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr;
3789 	mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE;
3790 	mem_attr.mr_as = NULL;
3791 	mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3792 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3793 	    &rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) !=
3794 	    IBT_SUCCESS) {
3795 		DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mem()");
3796 		rwqe->w_freeing_wqe = B_TRUE;
3797 		freemsg(rwqe->rwqe_im_mblk);
3798 		kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
3799 		    state->id_mtu + IPOIB_GRH_SIZE);
3800 		rwqe->rwqe_copybuf.ic_bufaddr = NULL;
3801 		kmem_free(rwqe, sizeof (ibd_rwqe_t));
3802 		return (DDI_FAILURE);
3803 	}
3804 
3805 	rwqe->rwqe_copybuf.ic_sgl.ds_va =
3806 	    (ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr;
3807 	rwqe->rwqe_copybuf.ic_sgl.ds_key =
3808 	    rwqe->rwqe_copybuf.ic_mr_desc.md_lkey;
3809 	rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE;
3810 	rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
3811 	rwqe->w_rwr.wr_nds = 1;
3812 	rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
3813 
3814 	return (DDI_SUCCESS);
3815 }
3816 
/*
 * Free an allocated recv wqe: deregister its memory region, free
 * the mblk wrapping the copy buffer, then the buffer itself and the
 * wqe structure. If deregistration fails we return without freeing
 * anything, since the buffer may still be registered with the HCA.
 */
static void
ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
{
	if (ibt_deregister_mr(state->id_hca_hdl,
	    rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) {
		DPRINT(10, "ibd_free_rwqe: failed in ibt_deregister_mr()");
		return;
	}

	/*
	 * Indicate to the callback function that this rwqe/mblk
	 * should not be recycled. The freemsg() will invoke
	 * ibd_freemsg_cb().
	 */
	if (rwqe->rwqe_im_mblk != NULL) {
		rwqe->w_freeing_wqe = B_TRUE;
		freemsg(rwqe->rwqe_im_mblk);
	}
	/* Buffer was sized to hold the GRH plus the port MTU */
	kmem_free(rwqe->rwqe_copybuf.ic_bufaddr,
	    state->id_mtu + IPOIB_GRH_SIZE);
	rwqe->rwqe_copybuf.ic_bufaddr = NULL;
	kmem_free(rwqe, sizeof (ibd_rwqe_t));
}
3843 
3844 /*
3845  * Delete the rwqe being freed from the rx list.
3846  */
3847 static void
3848 ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
3849 {
3850 	mutex_enter(&state->id_rx_list.dl_mutex);
3851 	if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe))
3852 		state->id_rx_list.dl_head = rwqe->rwqe_next;
3853 	else
3854 		rwqe->rwqe_prev->w_next = rwqe->rwqe_next;
3855 	if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe))
3856 		state->id_rx_list.dl_tail = rwqe->rwqe_prev;
3857 	else
3858 		rwqe->rwqe_next->w_prev = rwqe->rwqe_prev;
3859 	mutex_exit(&state->id_rx_list.dl_mutex);
3860 }
3861 
3862 /*
3863  * IBA Rx/Tx completion queue handler. Guaranteed to be single
3864  * threaded and nonreentrant for this CQ. When using combined CQ,
3865  * this handles Tx and Rx completions. With separate CQs, this handles
3866  * only Rx completions.
3867  */
3868 /* ARGSUSED */
3869 static void
3870 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
3871 {
3872 	ibd_state_t *state = (ibd_state_t *)arg;
3873 
3874 	atomic_add_64(&state->id_num_intrs, 1);
3875 
3876 	if (ibd_rx_softintr == 1)
3877 		ddi_trigger_softintr(state->id_rx);
3878 	else
3879 		(void) ibd_intr((char *)state);
3880 }
3881 
3882 /*
3883  * Separate CQ handler for Tx completions, when the Tx CQ is in
3884  * interrupt driven mode.
3885  */
3886 /* ARGSUSED */
3887 static void
3888 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
3889 {
3890 	ibd_state_t *state = (ibd_state_t *)arg;
3891 
3892 	atomic_add_64(&state->id_num_intrs, 1);
3893 
3894 	if (ibd_tx_softintr == 1)
3895 		ddi_trigger_softintr(state->id_tx);
3896 	else
3897 		(void) ibd_tx_recycle((char *)state);
3898 }
3899 
3900 /*
3901  * Multicast group create/delete trap handler. These will be delivered
3902  * on a kernel thread (handling can thus block) and can be invoked
3903  * concurrently. The handler can be invoked anytime after it is
3904  * registered and before ibt_detach().
3905  */
3906 /* ARGSUSED */
3907 static void
3908 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
3909     ibt_subnet_event_t *event)
3910 {
3911 	ibd_state_t *state = (ibd_state_t *)arg;
3912 	ibd_req_t *req;
3913 
3914 	/*
3915 	 * The trap handler will get invoked once for every event for
3916 	 * evert port. The input "gid" is the GID0 of the port the
3917 	 * trap came in on; we just need to act on traps that came
3918 	 * to our port, meaning the port on which the ipoib interface
3919 	 * resides. Since ipoib uses GID0 of the port, we just match
3920 	 * the gids to check whether we need to handle the trap.
3921 	 */
3922 	if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
3923 		return;
3924 
3925 	DPRINT(10, "ibd_notices_handler : %d\n", code);
3926 
3927 	switch (code) {
3928 		case IBT_SM_EVENT_UNAVAILABLE:
3929 			/*
3930 			 * If we are in promiscuous mode or have
3931 			 * sendnonmembers, we need to print a warning
3932 			 * message right now. Else, just store the
3933 			 * information, print when we enter promiscuous
3934 			 * mode or attempt nonmember send. We might
3935 			 * also want to stop caching sendnonmember.
3936 			 */
3937 			ibd_print_warn(state, "IBA multicast support "
3938 			    "degraded due to unavailability of multicast "
3939 			    "traps");
3940 			break;
3941 		case IBT_SM_EVENT_AVAILABLE:
3942 			/*
3943 			 * If we printed a warning message above or
3944 			 * while trying to nonmember send or get into
3945 			 * promiscuous mode, print an okay message.
3946 			 */
3947 			ibd_print_warn(state, "IBA multicast support "
3948 			    "restored due to availability of multicast "
3949 			    "traps");
3950 			break;
3951 		case IBT_SM_EVENT_MCG_CREATED:
3952 		case IBT_SM_EVENT_MCG_DELETED:
3953 			/*
3954 			 * Common processing of creation/deletion traps.
3955 			 * First check if the instance is being
3956 			 * [de]initialized; back off then, without doing
3957 			 * anything more, since we are not sure if the
3958 			 * async thread is around, or whether we might
3959 			 * be racing with the detach code in ibd_m_stop()
3960 			 * that scans the mcg list.
3961 			 */
3962 			if (!ibd_async_safe(state))
3963 				return;
3964 
3965 			req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
3966 			req->rq_gid = event->sm_notice_gid;
3967 			req->rq_ptr = (void *)code;
3968 			ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP);
3969 			break;
3970 	}
3971 }
3972 
3973 static void
3974 ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
3975 {
3976 	ib_gid_t mgid = req->rq_gid;
3977 	ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;
3978 
3979 	DPRINT(10, "ibd_async_trap : %d\n", code);
3980 
3981 	/*
3982 	 * Atomically search the nonmember and sendonlymember lists and
3983 	 * delete.
3984 	 */
3985 	ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
3986 
3987 	if (state->id_prom_op == IBD_OP_COMPLETED) {
3988 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
3989 
3990 		/*
3991 		 * If in promiscuous mode, try to join/attach to the new
3992 		 * mcg. Given the unreliable out-of-order mode of trap
3993 		 * delivery, we can never be sure whether it is a problem
3994 		 * if the join fails. Thus, we warn the admin of a failure
3995 		 * if this was a creation trap. Note that the trap might
3996 		 * actually be reporting a long past event, and the mcg
3997 		 * might already have been deleted, thus we might be warning
3998 		 * in vain.
3999 		 */
4000 		if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
4001 		    NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
4002 			ibd_print_warn(state, "IBA promiscuous mode missed "
4003 			    "new multicast gid %016llx:%016llx",
4004 			    (u_longlong_t)mgid.gid_prefix,
4005 			    (u_longlong_t)mgid.gid_guid);
4006 	}
4007 
4008 	/*
4009 	 * Free the request slot allocated by the subnet event thread.
4010 	 */
4011 	ibd_async_done(state);
4012 }
4013 
4014 /*
4015  * GLDv3 entry point to get capabilities.
4016  */
4017 static boolean_t
4018 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
4019 {
4020 	ibd_state_t *state = arg;
4021 
4022 	switch (cap) {
4023 	case MAC_CAPAB_HCKSUM: {
4024 		uint32_t *txflags = cap_data;
4025 
4026 		/*
4027 		 * We either do full checksum or not do it at all
4028 		 */
4029 		if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL)
4030 			*txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
4031 		else
4032 			return (B_FALSE);
4033 		break;
4034 	}
4035 
4036 	case MAC_CAPAB_LSO: {
4037 		mac_capab_lso_t *cap_lso = cap_data;
4038 
4039 		/*
4040 		 * In addition to the capability and policy, since LSO
4041 		 * relies on hw checksum, we'll not enable LSO if we
4042 		 * don't have hw checksum.  Of course, if the HCA doesn't
4043 		 * provide the reserved lkey capability, enabling LSO will
4044 		 * actually affect performance adversely, so we'll disable
4045 		 * LSO even for that case.
4046 		 */
4047 		if (!state->id_lso_policy || !state->id_lso_capable)
4048 			return (B_FALSE);
4049 
4050 		if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0)
4051 			return (B_FALSE);
4052 
4053 		if (state->id_hca_res_lkey_capab == 0) {
4054 			ibd_print_warn(state, "no reserved-lkey capability, "
4055 			    "disabling LSO");
4056 			return (B_FALSE);
4057 		}
4058 
4059 		cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
4060 		cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1;
4061 		break;
4062 	}
4063 
4064 	default:
4065 		return (B_FALSE);
4066 	}
4067 
4068 	return (B_TRUE);
4069 }
4070 
/*
 * Query the HCA for the current details of our port and cache them
 * in the soft state: port MTU (id_mtu), source gid (id_sgid), pkey
 * index (id_pkix) and link speed (id_link_speed); id_link_state is
 * set to LINK_STATE_UP on success.
 *
 * Returns 0 on success; ENETDOWN if the port cannot be queried or
 * its link is not active; ENONET if the configured pkey cannot be
 * resolved to a pkey index.
 */
static int
ibd_get_port_details(ibd_state_t *state)
{
	ibt_hca_portinfo_t *port_infop;
	ibt_status_t ret;
	uint_t psize, port_infosz;

	/* id_link_mutex protects the link details updated below */
	mutex_enter(&state->id_link_mutex);

	/*
	 * Query for port information
	 */
	ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
	    &port_infop, &psize, &port_infosz);
	if ((ret != IBT_SUCCESS) || (psize != 1)) {
		mutex_exit(&state->id_link_mutex);
		DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() "
		    "failed, ret=%d", ret);
		return (ENETDOWN);
	}

	/*
	 * If the link already went down by the time we get here,
	 * give up
	 */
	if (port_infop->p_linkstate != IBT_PORT_ACTIVE) {
		mutex_exit(&state->id_link_mutex);
		ibt_free_portinfo(port_infop, port_infosz);
		DPRINT(10, "ibd_get_port_details: port is not active");
		return (ENETDOWN);
	}

	/*
	 * If the link is active, verify the pkey
	 */
	if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port,
	    state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) {
		mutex_exit(&state->id_link_mutex);
		ibt_free_portinfo(port_infop, port_infosz);
		DPRINT(10, "ibd_get_port_details: ibt_pkey2index "
		    "failed, ret=%d", ret);
		return (ENONET);
	}

	/* p_mtu is an IB MTU enum; 128 << p_mtu yields the MTU in bytes */
	state->id_mtu = (128 << port_infop->p_mtu);
	state->id_sgid = *port_infop->p_sgid_tbl;
	state->id_link_state = LINK_STATE_UP;

	mutex_exit(&state->id_link_mutex);
	ibt_free_portinfo(port_infop, port_infosz);

	/*
	 * Now that the port is active, record the port speed
	 */
	state->id_link_speed = ibd_get_portspeed(state);

	return (0);
}
4129 
/*
 * Allocate the Rx and Tx completion queues and the work-completion
 * arrays used when polling them. With ibd_separate_cqs set, distinct
 * Rx and Tx CQs are created; otherwise a single combined CQ serves
 * both. If the HCA cannot support a CQ large enough for the requested
 * number of wqe's, id_num_rwqe/id_num_swqe are scaled down to fit and
 * a warning is printed.
 */
static int
ibd_alloc_cqs(ibd_state_t *state)
{
	ibt_hca_attr_t hca_attrs;
	ibt_cq_attr_t cq_attr;
	ibt_status_t ret;
	uint32_t real_size;

	ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
	ASSERT(ret == IBT_SUCCESS);

	/*
	 * Allocate Rx/combined CQ:
	 * Theoretically, there is no point in having more than #rwqe
	 * plus #swqe cqe's, except that the CQ will be signalled for
	 * overflow when the last wqe completes, if none of the previous
	 * cqe's have been polled. Thus, we allocate just a few less wqe's
	 * to make sure such overflow does not occur.
	 */
	cq_attr.cq_sched = NULL;
	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;

	if (ibd_separate_cqs == 1) {
		/*
		 * Allocate Receive CQ.
		 */
		if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) {
			cq_attr.cq_size = state->id_num_rwqe + 1;
		} else {
			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
			state->id_num_rwqe = cq_attr.cq_size - 1;
		}

		if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
		    &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
			DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) "
			    "failed, ret=%d\n", ret);
			return (DDI_FAILURE);
		}

		/* Interrupt-moderation failure is non-fatal; just note it */
		if ((ret = ibt_modify_cq(state->id_rcq_hdl,
		    ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) {
			DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt "
			    "moderation failed, ret=%d\n", ret);
		}

		state->id_rxwcs_size = state->id_num_rwqe + 1;
		state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
		    state->id_rxwcs_size, KM_SLEEP);

		/*
		 * Allocate Send CQ.
		 */
		if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) {
			cq_attr.cq_size = state->id_num_swqe + 1;
		} else {
			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
			state->id_num_swqe = cq_attr.cq_size - 1;
		}

		if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
		    &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) {
			DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) "
			    "failed, ret=%d\n", ret);
			/* Undo the Rx CQ setup done above */
			kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) *
			    state->id_rxwcs_size);
			(void) ibt_free_cq(state->id_rcq_hdl);
			return (DDI_FAILURE);
		}
		if ((ret = ibt_modify_cq(state->id_scq_hdl,
		    IBD_TXCOMP_COUNT, IBD_TXCOMP_USEC, 0)) != IBT_SUCCESS) {
			DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt "
			    "moderation failed, ret=%d\n", ret);
		}

		state->id_txwcs_size = state->id_num_swqe + 1;
		state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
		    state->id_txwcs_size, KM_SLEEP);
	} else {
		/*
		 * Allocate combined Send/Receive CQ.
		 */
		if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe +
		    state->id_num_swqe + 1)) {
			cq_attr.cq_size = state->id_num_rwqe +
			    state->id_num_swqe + 1;
		} else {
			/*
			 * Scale both wqe counts down in proportion to
			 * their requested ratio so they fit in the
			 * largest CQ the HCA supports.
			 */
			cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
			state->id_num_rwqe = ((cq_attr.cq_size - 1) *
			    state->id_num_rwqe) / (state->id_num_rwqe +
			    state->id_num_swqe);
			state->id_num_swqe = cq_attr.cq_size - 1 -
			    state->id_num_rwqe;
		}

		state->id_rxwcs_size = cq_attr.cq_size;
		state->id_txwcs_size = state->id_rxwcs_size;

		if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
		    &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
			DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rscq) "
			    "failed, ret=%d\n", ret);
			return (DDI_FAILURE);
		}
		/* Tx shares the Rx CQ handle and wc array in combined mode */
		state->id_scq_hdl = state->id_rcq_hdl;
		state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
		    state->id_rxwcs_size, KM_SLEEP);
		state->id_txwcs = state->id_rxwcs;
	}

	/*
	 * Print message in case we could not allocate as many wqe's
	 * as was requested.
	 */
	if (state->id_num_rwqe != IBD_NUM_RWQE) {
		ibd_print_warn(state, "Setting #rwqe = %d instead of default "
		    "%d", state->id_num_rwqe, IBD_NUM_RWQE);
	}
	if (state->id_num_swqe != IBD_NUM_SWQE) {
		ibd_print_warn(state, "Setting #swqe = %d instead of default "
		    "%d", state->id_num_swqe, IBD_NUM_SWQE);
	}

	return (DDI_SUCCESS);
}
4255 
4256 static int
4257 ibd_setup_ud_channel(ibd_state_t *state)
4258 {
4259 	ibt_ud_chan_alloc_args_t ud_alloc_attr;
4260 	ibt_ud_chan_query_attr_t ud_chan_attr;
4261 	ibt_status_t ret;
4262 
4263 	ud_alloc_attr.ud_flags  = IBT_WR_SIGNALED;
4264 	if (state->id_hca_res_lkey_capab)
4265 		ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
4266 	if (state->id_lso_policy && state->id_lso_capable)
4267 		ud_alloc_attr.ud_flags |= IBT_USES_LSO;
4268 
4269 	ud_alloc_attr.ud_hca_port_num	= state->id_port;
4270 	ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
4271 	ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
4272 	ud_alloc_attr.ud_sizes.cs_sq    = state->id_num_swqe;
4273 	ud_alloc_attr.ud_sizes.cs_rq    = state->id_num_rwqe;
4274 	ud_alloc_attr.ud_qkey		= state->id_mcinfo->mc_qkey;
4275 	ud_alloc_attr.ud_scq		= state->id_scq_hdl;
4276 	ud_alloc_attr.ud_rcq		= state->id_rcq_hdl;
4277 	ud_alloc_attr.ud_pd		= state->id_pd_hdl;
4278 	ud_alloc_attr.ud_pkey_ix	= state->id_pkix;
4279 	ud_alloc_attr.ud_clone_chan	= NULL;
4280 
4281 	if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
4282 	    &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) {
4283 		DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() "
4284 		    "failed, ret=%d\n", ret);
4285 		return (DDI_FAILURE);
4286 	}
4287 
4288 	if ((ret = ibt_query_ud_channel(state->id_chnl_hdl,
4289 	    &ud_chan_attr)) != IBT_SUCCESS) {
4290 		DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() "
4291 		    "failed, ret=%d\n", ret);
4292 		(void) ibt_free_channel(state->id_chnl_hdl);
4293 		return (DDI_FAILURE);
4294 	}
4295 
4296 	state->id_qpnum = ud_chan_attr.ud_qpn;
4297 
4298 	return (DDI_SUCCESS);
4299 }
4300 
/*
 * Tear down, in roughly the reverse order, everything that
 * ibd_start() set up, as recorded by the progress bits in
 * id_mac_state. cur_link_state is the link state to report upward:
 * LINK_STATE_DOWN is reported as-is; anything else becomes
 * LINK_STATE_UNKNOWN (matching other ethernet drivers' behavior on
 * stop).
 *
 * Returns DDI_FAILURE — with the remaining teardown left undone and
 * Rx notification re-enabled — if the network layer still holds
 * receive buffers after a 5 second wait; DDI_SUCCESS otherwise.
 */
static int
ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state)
{
	uint32_t progress = state->id_mac_state;
	uint_t attempts;
	ibt_status_t ret;
	ib_gid_t mgid;
	ibd_mce_t *mce;
	uint8_t jstate;

	/*
	 * Before we try to stop/undo whatever we did in ibd_start(),
	 * we need to mark the link state appropriately to prevent the
	 * ip layer from using this instance for any new transfers. Note
	 * that if the original state of the link was "up" when we're
	 * here, we'll set the final link state to "unknown", to behave
	 * in the same fashion as other ethernet drivers.
	 */
	mutex_enter(&state->id_link_mutex);
	if (cur_link_state == LINK_STATE_DOWN) {
		state->id_link_state = cur_link_state;
	} else {
		state->id_link_state = LINK_STATE_UNKNOWN;
	}
	mutex_exit(&state->id_link_mutex);
	mac_link_update(state->id_mh, state->id_link_state);

	state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED);
	if (progress & IBD_DRV_STARTED) {
		state->id_mac_state &= (~IBD_DRV_STARTED);
	}

	/*
	 * First, stop receive interrupts; this stops the driver from
	 * handing up buffers to higher layers.  Wait for receive buffers
	 * to be returned and give up after 5 seconds.
	 */
	if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) {

		ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);

		/* 50 iterations x 100ms delay = 5 second timeout */
		attempts = 50;
		while (state->id_rx_list.dl_bufs_outstanding > 0) {
			delay(drv_usectohz(100000));
			if (--attempts == 0) {
				/*
				 * There are pending bufs with the network
				 * layer and we have no choice but to wait
				 * for them to be done with. Reap all the
				 * Tx/Rx completions that were posted since
				 * we turned off the notification and
				 * return failure.
				 */
				DPRINT(2, "ibd_undo_start: "
				    "reclaiming failed");
				ibd_poll_compq(state, state->id_rcq_hdl);
				ibt_set_cq_handler(state->id_rcq_hdl,
				    ibd_rcq_handler, state);
				return (DDI_FAILURE);
			}
		}
		state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED);
	}

	if (progress & IBD_DRV_SM_NOTICES_REGISTERED) {
		/* Deregister the trap handler, then drain in-flight traps */
		ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);

		mutex_enter(&state->id_trap_lock);
		state->id_trap_stop = B_TRUE;
		while (state->id_trap_inprog > 0)
			cv_wait(&state->id_trap_cv, &state->id_trap_lock);
		mutex_exit(&state->id_trap_lock);

		state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED);
	}

	if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) {
		/*
		 * Flushing the channel ensures that all pending WQE's
		 * are marked with flush_error and handed to the CQ. It
		 * does not guarantee the invocation of the CQ handler.
		 * This call is guaranteed to return successfully for
		 * UD QPNs.
		 */
		if ((ret = ibt_flush_channel(state->id_chnl_hdl)) !=
		    IBT_SUCCESS) {
			DPRINT(10, "ibd_undo_start: flush_channel "
			    "failed, ret=%d", ret);
		}

		/*
		 * Turn off Tx interrupts and poll. By the time the polling
		 * returns an empty indicator, we are sure we have seen all
		 * pending Tx callbacks. Note that after the call to
		 * ibt_set_cq_handler() returns, the old handler is
		 * guaranteed not to be invoked anymore.
		 */
		if (ibd_separate_cqs == 1) {
			ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
		}
		ibd_poll_compq(state, state->id_scq_hdl);

		state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED);
	}

	if (progress & IBD_DRV_ASYNC_THR_CREATED) {
		/*
		 * No new async requests will be posted since the device
		 * link state has been marked as unknown; completion handlers
		 * have been turned off, so Tx handler will not cause any
		 * more IBD_ASYNC_REAP requests.
		 *
		 * Queue a request for the async thread to exit, which will
		 * be serviced after any pending ones. This can take a while,
		 * specially if the SM is unreachable, since IBMF will slowly
		 * timeout each SM request issued by the async thread.  Reap
		 * the thread before continuing on, we do not want it to be
		 * lingering in modunloaded code (or we could move the reap
		 * to ibd_detach(), provided we keep track of the current
		 * id_async_thrid somewhere safe).
		 */
		ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
		thread_join(state->id_async_thrid);

		state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
	}

	if (progress & IBD_DRV_BCAST_GROUP_JOINED) {
		/*
		 * Drop all residual full/non membership. This includes full
		 * membership to the broadcast group, and any nonmembership
		 * acquired during transmits. We do this after the Tx completion
		 * handlers are done, since those might result in some late
		 * leaves; this also eliminates a potential race with that
		 * path wrt the mc full list insert/delete. Trap handling
		 * has also been suppressed at this point. Thus, no locks
		 * are required while traversing the mc full list.
		 */
		DPRINT(2, "ibd_undo_start: clear full cache entries");
		mce = list_head(&state->id_mc_full);
		while (mce != NULL) {
			/* Capture details before the leave invalidates mce */
			mgid = mce->mc_info.mc_adds_vect.av_dgid;
			jstate = mce->mc_jstate;
			mce = list_next(&state->id_mc_full, mce);
			ibd_leave_group(state, mgid, jstate);
		}
		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED);
	}

	if (progress & IBD_DRV_RXLIST_ALLOCD) {
		ibd_fini_rxlist(state);
		state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD);
	}

	if (progress & IBD_DRV_TXLIST_ALLOCD) {
		ibd_fini_txlist(state);
		state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD);
	}

	if (progress & IBD_DRV_UD_CHANNEL_SETUP) {
		if ((ret = ibt_free_channel(state->id_chnl_hdl)) !=
		    IBT_SUCCESS) {
			DPRINT(10, "ibd_undo_start: free_channel "
			    "failed, ret=%d", ret);
		}

		state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP);
	}

	if (progress & IBD_DRV_CQS_ALLOCD) {
		/* The Tx CQ/wc array are distinct only with separate CQs */
		if (ibd_separate_cqs == 1) {
			kmem_free(state->id_txwcs,
			    sizeof (ibt_wc_t) * state->id_txwcs_size);
			if ((ret = ibt_free_cq(state->id_scq_hdl)) !=
			    IBT_SUCCESS) {
				DPRINT(10, "ibd_undo_start: free_cq(scq) "
				    "failed, ret=%d", ret);
			}
		}

		kmem_free(state->id_rxwcs,
		    sizeof (ibt_wc_t) * state->id_rxwcs_size);
		if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) {
			DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, "
			    "ret=%d", ret);
		}

		state->id_txwcs = NULL;
		state->id_rxwcs = NULL;
		state->id_scq_hdl = NULL;
		state->id_rcq_hdl = NULL;

		state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD);
	}

	if (progress & IBD_DRV_ACACHE_INITIALIZED) {
		mod_hash_destroy_hash(state->id_ah_active_hash);
		ibd_acache_fini(state);

		state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED);
	}

	if (progress & IBD_DRV_BCAST_GROUP_FOUND) {
		/*
		 * If we'd created the ipoib broadcast group and had
		 * successfully joined it, leave it now
		 */
		if (state->id_bgroup_created) {
			mgid = state->id_mcinfo->mc_adds_vect.av_dgid;
			jstate = IB_MC_JSTATE_FULL;
			(void) ibt_leave_mcg(state->id_sgid, mgid,
			    state->id_sgid, jstate);
		}
		ibt_free_mcg_info(state->id_mcinfo, 1);

		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND);
	}

	return (DDI_SUCCESS);
}
4521 
4522 /*
4523  * These pair of routines are used to set/clear the condition that
4524  * the caller is likely to do something to change the id_mac_state.
4525  * If there's already someone doing either a start or a stop (possibly
4526  * due to the async handler detecting a pkey relocation event, a plumb
4527  * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until
4528  * that's done.
4529  */
4530 static void
4531 ibd_set_mac_progress(ibd_state_t *state, uint_t flag)
4532 {
4533 	mutex_enter(&state->id_macst_lock);
4534 	while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS)
4535 		cv_wait(&state->id_macst_cv, &state->id_macst_lock);
4536 
4537 	state->id_mac_state |= flag;
4538 	mutex_exit(&state->id_macst_lock);
4539 }
4540 
4541 static void
4542 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag)
4543 {
4544 	mutex_enter(&state->id_macst_lock);
4545 	state->id_mac_state &= (~flag);
4546 	cv_signal(&state->id_macst_cv);
4547 	mutex_exit(&state->id_macst_lock);
4548 }
4549 
4550 /*
4551  * GLDv3 entry point to start hardware.
4552  */
4553 /*ARGSUSED*/
4554 static int
4555 ibd_m_start(void *arg)
4556 {
4557 	ibd_state_t *state = arg;
4558 	int	ret;
4559 
4560 	ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
4561 
4562 	ret = ibd_start(state);
4563 
4564 	ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
4565 
4566 	return (ret);
4567 }
4568 
4569 static int
4570 ibd_start(ibd_state_t *state)
4571 {
4572 	kthread_t *kht;
4573 	int err;
4574 	ibt_status_t ret;
4575 
4576 	if (state->id_mac_state & IBD_DRV_STARTED)
4577 		return (DDI_SUCCESS);
4578 
4579 	/*
4580 	 * Get port details; if we fail here, very likely the port
4581 	 * state is inactive or the pkey can't be verified.
4582 	 */
4583 	if ((err = ibd_get_port_details(state)) != 0) {
4584 		DPRINT(10, "ibd_start: ibd_get_port_details() failed");
4585 		goto start_fail;
4586 	}
4587 	state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED;
4588 
4589 	/*
4590 	 * Find the IPoIB broadcast group
4591 	 */
4592 	if (ibd_find_bgroup(state) != IBT_SUCCESS) {
4593 		DPRINT(10, "ibd_start: ibd_find_bgroup() failed");
4594 		err = ENOTACTIVE;
4595 		goto start_fail;
4596 	}
4597 	state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND;
4598 
4599 	/*
4600 	 * Initialize per-interface caches and lists; if we fail here,
4601 	 * it is most likely due to a lack of resources
4602 	 */
4603 	if (ibd_acache_init(state) != DDI_SUCCESS) {
4604 		DPRINT(10, "ibd_start: ibd_acache_init() failed");
4605 		err = ENOMEM;
4606 		goto start_fail;
4607 	}
4608 	state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED;
4609 
4610 	/*
4611 	 * Allocate send and receive completion queues
4612 	 */
4613 	if (ibd_alloc_cqs(state) != DDI_SUCCESS) {
4614 		DPRINT(10, "ibd_start: ibd_alloc_cqs() failed");
4615 		err = ENOMEM;
4616 		goto start_fail;
4617 	}
4618 	state->id_mac_state |= IBD_DRV_CQS_ALLOCD;
4619 
4620 	/*
4621 	 * Setup a UD channel
4622 	 */
4623 	if (ibd_setup_ud_channel(state) != DDI_SUCCESS) {
4624 		err = ENOMEM;
4625 		DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed");
4626 		goto start_fail;
4627 	}
4628 	state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP;
4629 
4630 	/*
4631 	 * Allocate and initialize the tx buffer list
4632 	 */
4633 	if (ibd_init_txlist(state) != DDI_SUCCESS) {
4634 		DPRINT(10, "ibd_start: ibd_init_txlist() failed");
4635 		err = ENOMEM;
4636 		goto start_fail;
4637 	}
4638 	state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD;
4639 
4640 	/*
4641 	 * If we have separate cqs, create the send cq handler here
4642 	 */
4643 	if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) {
4644 		ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
4645 		if ((ret = ibt_enable_cq_notify(state->id_scq_hdl,
4646 		    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
4647 			DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) "
4648 			    "failed, ret=%d", ret);
4649 			err = EINVAL;
4650 			goto start_fail;
4651 		}
4652 		state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;
4653 	}
4654 
4655 	/*
4656 	 * Allocate and initialize the rx buffer list
4657 	 */
4658 	if (ibd_init_rxlist(state) != DDI_SUCCESS) {
4659 		DPRINT(10, "ibd_start: ibd_init_rxlist() failed");
4660 		err = ENOMEM;
4661 		goto start_fail;
4662 	}
4663 	state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD;
4664 
4665 	/*
4666 	 * Join IPoIB broadcast group
4667 	 */
4668 	if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
4669 		DPRINT(10, "ibd_start: ibd_join_group() failed");
4670 		err = ENOTACTIVE;
4671 		goto start_fail;
4672 	}
4673 	state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED;
4674 
4675 	/*
4676 	 * Create the async thread; thread_create never fails.
4677 	 */
4678 	kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
4679 	    TS_RUN, minclsyspri);
4680 	state->id_async_thrid = kht->t_did;
4681 	state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;
4682 
4683 	/*
4684 	 * When we did mac_register() in ibd_attach(), we didn't register
4685 	 * the real macaddr and we didn't have the true port mtu. Now that
4686 	 * we're almost ready, set the local mac address and broadcast
4687 	 * addresses and update gldv3 about the real values of these
4688 	 * parameters.
4689 	 */
4690 	ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
4691 	    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
4692 	ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK,
4693 	    state->id_mgid.gid_prefix, state->id_mgid.gid_guid);
4694 
4695 	mac_maxsdu_update(state->id_mh, state->id_mtu - IPOIB_HDRSIZE);
4696 	mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
4697 
4698 	/*
4699 	 * Setup the receive cq handler
4700 	 */
4701 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
4702 	if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl,
4703 	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
4704 		DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) "
4705 		    "failed, ret=%d", ret);
4706 		err = EINVAL;
4707 		goto start_fail;
4708 	}
4709 	state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED;
4710 
4711 	/*
4712 	 * Setup the subnet notices handler after we've initialized the acache/
4713 	 * mcache and started the async thread, both of which are required for
4714 	 * the trap handler to function properly.
4715 	 *
4716 	 * Now that the async thread has been started (and we've already done
4717 	 * a mac_register() during attach so mac_tx_update() can be called
4718 	 * if necessary without any problem), we can enable the trap handler
4719 	 * to queue requests to the async thread.
4720 	 */
4721 	ibt_register_subnet_notices(state->id_ibt_hdl,
4722 	    ibd_snet_notices_handler, state);
4723 	mutex_enter(&state->id_trap_lock);
4724 	state->id_trap_stop = B_FALSE;
4725 	mutex_exit(&state->id_trap_lock);
4726 	state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED;
4727 
4728 	/*
4729 	 * Indicate link status to GLDv3 and higher layers. By default,
4730 	 * we assume we are in up state (which must have been true at
4731 	 * least at the time the broadcast mcg's were probed); if there
4732 	 * were any up/down transitions till the time we come here, the
4733 	 * async handler will have updated last known state, which we
4734 	 * use to tell GLDv3. The async handler will not send any
4735 	 * notifications to GLDv3 till we reach here in the initialization
4736 	 * sequence.
4737 	 */
4738 	state->id_mac_state |= IBD_DRV_STARTED;
4739 	mac_link_update(state->id_mh, state->id_link_state);
4740 
4741 	return (DDI_SUCCESS);
4742 
4743 start_fail:
4744 	/*
4745 	 * If we ran into a problem during ibd_start() and ran into
4746 	 * some other problem during undoing our partial work, we can't
4747 	 * do anything about it.  Ignore any errors we might get from
4748 	 * ibd_undo_start() and just return the original error we got.
4749 	 */
4750 	(void) ibd_undo_start(state, LINK_STATE_DOWN);
4751 	return (err);
4752 }
4753 
4754 /*
4755  * GLDv3 entry point to stop hardware from receiving packets.
4756  */
4757 /*ARGSUSED*/
4758 static void
4759 ibd_m_stop(void *arg)
4760 {
4761 	ibd_state_t *state = (ibd_state_t *)arg;
4762 
4763 	ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
4764 
4765 	(void) ibd_undo_start(state, state->id_link_state);
4766 
4767 	ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
4768 }
4769 
4770 /*
4771  * GLDv3 entry point to modify device's mac address. We do not
4772  * allow address modifications.
4773  */
4774 static int
4775 ibd_m_unicst(void *arg, const uint8_t *macaddr)
4776 {
4777 	ibd_state_t *state = arg;
4778 
4779 	/*
4780 	 * Don't bother even comparing the macaddr if we haven't
4781 	 * completed ibd_m_start().
4782 	 */
4783 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
4784 		return (0);
4785 
4786 	if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
4787 		return (0);
4788 	else
4789 		return (EINVAL);
4790 }
4791 
4792 /*
4793  * The blocking part of the IBA join/leave operations are done out
4794  * of here on the async thread.
4795  */
4796 static void
4797 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
4798 {
4799 	DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
4800 	    "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
4801 
4802 	if (op == IBD_ASYNC_JOIN) {
4803 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
4804 			ibd_print_warn(state, "Joint multicast group failed :"
4805 			"%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
4806 		}
4807 	} else {
4808 		/*
4809 		 * Here, we must search for the proper mcg_info and
4810 		 * use that to leave the group.
4811 		 */
4812 		ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
4813 	}
4814 }
4815 
4816 /*
4817  * GLDv3 entry point for multicast enable/disable requests.
4818  * This function queues the operation to the async thread and
4819  * return success for a valid multicast address.
4820  */
4821 static int
4822 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
4823 {
4824 	ibd_state_t *state = (ibd_state_t *)arg;
4825 	ipoib_mac_t maddr, *mcast;
4826 	ib_gid_t mgid;
4827 	ibd_req_t *req;
4828 
4829 	/*
4830 	 * If we haven't completed ibd_m_start(), async thread wouldn't
4831 	 * have been started and id_bcaddr wouldn't be set, so there's
4832 	 * no point in continuing.
4833 	 */
4834 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
4835 		return (0);
4836 
4837 	/*
4838 	 * The incoming multicast address might not be aligned properly
4839 	 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
4840 	 * it to look like one though, to get the offsets of the mc gid,
4841 	 * since we know we are not going to dereference any values with
4842 	 * the ipoib_mac_t pointer.
4843 	 */
4844 	bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
4845 	mcast = &maddr;
4846 
4847 	/*
4848 	 * Check validity of MCG address. We could additionally check
4849 	 * that a enable/disable is not being issued on the "broadcast"
4850 	 * mcg, but since this operation is only invokable by priviledged
4851 	 * programs anyway, we allow the flexibility to those dlpi apps.
4852 	 * Note that we do not validate the "scope" of the IBA mcg.
4853 	 */
4854 	if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
4855 		return (EINVAL);
4856 
4857 	/*
4858 	 * fill in multicast pkey and scope
4859 	 */
4860 	IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
4861 
4862 	/*
4863 	 * If someone is trying to JOIN/LEAVE the broadcast group, we do
4864 	 * nothing (i.e. we stay JOINed to the broadcast group done in
4865 	 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
4866 	 * requires to be joined to broadcast groups at all times.
4867 	 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
4868 	 * depends on this.
4869 	 */
4870 	if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
4871 		return (0);
4872 
4873 	ibd_n2h_gid(mcast, &mgid);
4874 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
4875 	if (req == NULL)
4876 		return (ENOMEM);
4877 
4878 	req->rq_gid = mgid;
4879 
4880 	if (add) {
4881 		DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
4882 		    mgid.gid_prefix, mgid.gid_guid);
4883 		ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
4884 	} else {
4885 		DPRINT(1, "ibd_m_multicst : unset_multicast : "
4886 		    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
4887 		ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
4888 	}
4889 	return (0);
4890 }
4891 
4892 /*
4893  * The blocking part of the IBA promiscuous operations are done
4894  * out of here on the async thread. The dlpireq parameter indicates
4895  * whether this invocation is due to a dlpi request or due to
4896  * a port up/down event.
4897  */
4898 static void
4899 ibd_async_unsetprom(ibd_state_t *state)
4900 {
4901 	ibd_mce_t *mce = list_head(&state->id_mc_non);
4902 	ib_gid_t mgid;
4903 
4904 	DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
4905 
4906 	while (mce != NULL) {
4907 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
4908 		mce = list_next(&state->id_mc_non, mce);
4909 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4910 	}
4911 	state->id_prom_op = IBD_OP_NOTSTARTED;
4912 }
4913 
4914 /*
4915  * The blocking part of the IBA promiscuous operations are done
4916  * out of here on the async thread. The dlpireq parameter indicates
4917  * whether this invocation is due to a dlpi request or due to
4918  * a port up/down event.
4919  */
4920 static void
4921 ibd_async_setprom(ibd_state_t *state)
4922 {
4923 	ibt_mcg_attr_t mcg_attr;
4924 	ibt_mcg_info_t *mcg_info;
4925 	ib_gid_t mgid;
4926 	uint_t numg;
4927 	int i;
4928 	char ret = IBD_OP_COMPLETED;
4929 
4930 	DPRINT(2, "ibd_async_setprom : async_set_promisc");
4931 
4932 	/*
4933 	 * Obtain all active MC groups on the IB fabric with
4934 	 * specified criteria (scope + Pkey + Qkey + mtu).
4935 	 */
4936 	bzero(&mcg_attr, sizeof (mcg_attr));
4937 	mcg_attr.mc_pkey = state->id_pkey;
4938 	mcg_attr.mc_scope = state->id_scope;
4939 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
4940 	mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
4941 	mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
4942 	if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
4943 	    IBT_SUCCESS) {
4944 		ibd_print_warn(state, "Could not get list of IBA multicast "
4945 		    "groups");
4946 		ret = IBD_OP_ERRORED;
4947 		goto done;
4948 	}
4949 
4950 	/*
4951 	 * Iterate over the returned mcg's and join as NonMember
4952 	 * to the IP mcg's.
4953 	 */
4954 	for (i = 0; i < numg; i++) {
4955 		/*
4956 		 * Do a NonMember JOIN on the MC group.
4957 		 */
4958 		mgid = mcg_info[i].mc_adds_vect.av_dgid;
4959 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
4960 			ibd_print_warn(state, "IBA promiscuous mode missed "
4961 			    "multicast gid %016llx:%016llx",
4962 			    (u_longlong_t)mgid.gid_prefix,
4963 			    (u_longlong_t)mgid.gid_guid);
4964 	}
4965 
4966 	ibt_free_mcg_info(mcg_info, numg);
4967 	DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
4968 done:
4969 	state->id_prom_op = ret;
4970 }
4971 
4972 /*
4973  * GLDv3 entry point for multicast promiscuous enable/disable requests.
4974  * GLDv3 assumes phys state receives more packets than multi state,
4975  * which is not true for IPoIB. Thus, treat the multi and phys
4976  * promiscuous states the same way to work with GLDv3's assumption.
4977  */
4978 static int
4979 ibd_m_promisc(void *arg, boolean_t on)
4980 {
4981 	ibd_state_t *state = (ibd_state_t *)arg;
4982 	ibd_req_t *req;
4983 
4984 	/*
4985 	 * Async thread wouldn't have been started if we haven't
4986 	 * passed ibd_m_start()
4987 	 */
4988 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
4989 		return (0);
4990 
4991 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
4992 	if (req == NULL)
4993 		return (ENOMEM);
4994 	if (on) {
4995 		DPRINT(1, "ibd_m_promisc : set_promisc : %d", on);
4996 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON);
4997 	} else {
4998 		DPRINT(1, "ibd_m_promisc : unset_promisc");
4999 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF);
5000 	}
5001 
5002 	return (0);
5003 }
5004 
5005 /*
5006  * GLDv3 entry point for gathering statistics.
5007  */
5008 static int
5009 ibd_m_stat(void *arg, uint_t stat, uint64_t *val)
5010 {
5011 	ibd_state_t *state = (ibd_state_t *)arg;
5012 
5013 	switch (stat) {
5014 	case MAC_STAT_IFSPEED:
5015 		*val = state->id_link_speed;
5016 		break;
5017 	case MAC_STAT_MULTIRCV:
5018 		*val = state->id_multi_rcv;
5019 		break;
5020 	case MAC_STAT_BRDCSTRCV:
5021 		*val = state->id_brd_rcv;
5022 		break;
5023 	case MAC_STAT_MULTIXMT:
5024 		*val = state->id_multi_xmt;
5025 		break;
5026 	case MAC_STAT_BRDCSTXMT:
5027 		*val = state->id_brd_xmt;
5028 		break;
5029 	case MAC_STAT_RBYTES:
5030 		*val = state->id_rcv_bytes;
5031 		break;
5032 	case MAC_STAT_IPACKETS:
5033 		*val = state->id_rcv_pkt;
5034 		break;
5035 	case MAC_STAT_OBYTES:
5036 		*val = state->id_xmt_bytes;
5037 		break;
5038 	case MAC_STAT_OPACKETS:
5039 		*val = state->id_xmt_pkt;
5040 		break;
5041 	case MAC_STAT_OERRORS:
5042 		*val = state->id_ah_error;	/* failed AH translation */
5043 		break;
5044 	case MAC_STAT_IERRORS:
5045 		*val = 0;
5046 		break;
5047 	case MAC_STAT_NOXMTBUF:
5048 		*val = state->id_tx_short;
5049 		break;
5050 	case MAC_STAT_NORCVBUF:
5051 	default:
5052 		return (ENOTSUP);
5053 	}
5054 
5055 	return (0);
5056 }
5057 
5058 static void
5059 ibd_async_txsched(ibd_state_t *state)
5060 {
5061 	ibd_req_t *req;
5062 	int ret;
5063 
5064 	if (ibd_txcomp_poll)
5065 		ibd_poll_compq(state, state->id_scq_hdl);
5066 
5067 	ret = ibd_resume_transmission(state);
5068 	if (ret && ibd_txcomp_poll) {
5069 		if (req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP))
5070 			ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
5071 		else {
5072 			ibd_print_warn(state, "ibd_async_txsched: "
5073 			    "no memory, can't schedule work slot");
5074 		}
5075 	}
5076 }
5077 
5078 static int
5079 ibd_resume_transmission(ibd_state_t *state)
5080 {
5081 	int flag;
5082 	int met_thresh = 0;
5083 	int ret = -1;
5084 
5085 	mutex_enter(&state->id_sched_lock);
5086 	if (state->id_sched_needed & IBD_RSRC_SWQE) {
5087 		met_thresh = (state->id_tx_list.dl_cnt >
5088 		    IBD_FREE_SWQES_THRESH);
5089 		flag = IBD_RSRC_SWQE;
5090 	} else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
5091 		ASSERT(state->id_lso != NULL);
5092 		met_thresh = (state->id_lso->bkt_nfree >
5093 		    IBD_FREE_LSOS_THRESH);
5094 		flag = IBD_RSRC_LSOBUF;
5095 	}
5096 	if (met_thresh) {
5097 		state->id_sched_needed &= ~flag;
5098 		ret = 0;
5099 	}
5100 	mutex_exit(&state->id_sched_lock);
5101 
5102 	if (ret == 0)
5103 		mac_tx_update(state->id_mh);
5104 
5105 	return (ret);
5106 }
5107 
5108 /*
5109  * Release the send wqe back into free list.
5110  */
5111 static void
5112 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *swqe)
5113 {
5114 	/*
5115 	 * Add back on Tx list for reuse.
5116 	 */
5117 	swqe->swqe_next = NULL;
5118 	mutex_enter(&state->id_tx_list.dl_mutex);
5119 	if (state->id_tx_list.dl_pending_sends) {
5120 		state->id_tx_list.dl_pending_sends = B_FALSE;
5121 	}
5122 	if (state->id_tx_list.dl_head == NULL) {
5123 		state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
5124 	} else {
5125 		state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe);
5126 	}
5127 	state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe);
5128 	state->id_tx_list.dl_cnt++;
5129 	mutex_exit(&state->id_tx_list.dl_mutex);
5130 }
5131 
5132 /*
5133  * Acquire a send wqe from free list.
5134  * Returns error number and send wqe pointer.
5135  */
5136 static int
5137 ibd_acquire_swqe(ibd_state_t *state, ibd_swqe_t **swqe)
5138 {
5139 	int rc = 0;
5140 	ibd_swqe_t *wqe;
5141 
5142 	/*
5143 	 * Check and reclaim some of the completed Tx requests.
5144 	 * If someone else is already in this code and pulling Tx
5145 	 * completions, no need to poll, since the current lock holder
5146 	 * will do the work anyway. Normally, we poll for completions
5147 	 * every few Tx attempts, but if we are short on Tx descriptors,
5148 	 * we always try to poll.
5149 	 */
5150 	if ((ibd_txcomp_poll == 1) &&
5151 	    (state->id_tx_list.dl_cnt < IBD_TX_POLL_THRESH)) {
5152 		ibd_poll_compq(state, state->id_scq_hdl);
5153 	}
5154 
5155 	/*
5156 	 * Grab required transmit wqes.
5157 	 */
5158 	mutex_enter(&state->id_tx_list.dl_mutex);
5159 	wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
5160 	if (wqe != NULL) {
5161 		state->id_tx_list.dl_cnt -= 1;
5162 		state->id_tx_list.dl_head = wqe->swqe_next;
5163 		if (state->id_tx_list.dl_tail == SWQE_TO_WQE(wqe))
5164 			state->id_tx_list.dl_tail = NULL;
5165 	} else {
5166 		/*
5167 		 * If we did not find the number we were looking for, flag
5168 		 * no resource. Adjust list appropriately in either case.
5169 		 */
5170 		rc = ENOENT;
5171 		state->id_tx_list.dl_pending_sends = B_TRUE;
5172 		DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
5173 		atomic_add_64(&state->id_tx_short, 1);
5174 	}
5175 	mutex_exit(&state->id_tx_list.dl_mutex);
5176 	*swqe = wqe;
5177 
5178 	return (rc);
5179 }
5180 
/*
 * Prepare the LSO portion of a send work request: set the ud dest,
 * mss and header size/pointer in the wr's ud_lso structure, and copy
 * the IPoIB+IP+TCP headers into a private buffer if they don't fit
 * in the first mblk fragment.  Returns 0 on success, -1 if a header
 * buffer allocation fails (in which case the lso fields are zeroed).
 *
 * NOTE(review): assumes the packet is IPv4/TCP with an IPoIB header
 * at b_rptr — callers should only invoke this for HW_LSO sends.
 */
static int
ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss,
    ibt_ud_dest_hdl_t ud_dest)
{
	mblk_t	*nmp;
	int iph_len, tcph_len;
	ibt_wr_lso_t *lso;
	uintptr_t ip_start, tcp_start;
	uint8_t *dst;
	uint_t pending, mblen;

	/*
	 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
	 * we need to adjust it here for lso.
	 */
	lso = &(node->w_swr.wr.ud_lso);
	lso->lso_ud_dest = ud_dest;
	lso->lso_mss = mss;

	/*
	 * Calculate the LSO header size and set it in the UD LSO structure.
	 * Note that the only assumption we make is that each of the IPoIB,
	 * IP and TCP headers will be contained in a single mblk fragment;
	 * together, the headers may span multiple mblk fragments.
	 */
	nmp = mp;
	ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE;
	if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
		/* IP header starts in the next fragment; carry the offset */
		ip_start = (uintptr_t)nmp->b_cont->b_rptr
		    + (ip_start - (uintptr_t)(nmp->b_wptr));
		nmp = nmp->b_cont;

	}
	iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start);

	tcp_start = ip_start + iph_len;
	if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
		/* Likewise, the TCP header may begin in the next fragment */
		tcp_start = (uintptr_t)nmp->b_cont->b_rptr
		    + (tcp_start - (uintptr_t)(nmp->b_wptr));
		nmp = nmp->b_cont;
	}
	tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);
	lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len;

	/*
	 * If the lso header fits entirely within a single mblk fragment,
	 * we'll avoid an additional copy of the lso header here and just
	 * pass the b_rptr of the mblk directly.
	 *
	 * If this isn't true, we'd have to allocate for it explicitly.
	 */
	if (lso->lso_hdr_sz <= MBLKL(mp)) {
		lso->lso_hdr = mp->b_rptr;
	} else {
		/* On work completion, remember to free this allocated hdr */
		lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP);
		if (lso->lso_hdr == NULL) {
			DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, "
			    "sz = %d", lso->lso_hdr_sz);
			/* Zero the fields so cleanup won't free garbage */
			lso->lso_hdr_sz = 0;
			lso->lso_mss = 0;
			return (-1);
		}
	}

	/*
	 * Copy in the lso header only if we need to
	 */
	if (lso->lso_hdr != mp->b_rptr) {
		dst = lso->lso_hdr;
		pending = lso->lso_hdr_sz;

		/* Gather the header bytes from however many fragments */
		for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
			mblen = MBLKL(nmp);
			if (pending > mblen) {
				bcopy(nmp->b_rptr, dst, mblen);
				dst += mblen;
				pending -= mblen;
			} else {
				bcopy(nmp->b_rptr, dst, pending);
				break;
			}
		}
	}

	return (0);
}
5268 
5269 static void
5270 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp)
5271 {
5272 	ibt_wr_lso_t *lso;
5273 
5274 	if ((!node) || (!mp))
5275 		return;
5276 
5277 	/*
5278 	 * Free any header space that we might've allocated if we
5279 	 * did an LSO
5280 	 */
5281 	if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) {
5282 		lso = &(node->w_swr.wr.ud_lso);
5283 		if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) {
5284 			kmem_free(lso->lso_hdr, lso->lso_hdr_sz);
5285 			lso->lso_hdr = NULL;
5286 			lso->lso_hdr_sz = 0;
5287 		}
5288 	}
5289 }
5290 
/*
 * Queue a send wqe on the tx chain and, unless another thread is
 * already dispatching (id_tx_busy), drain the chain by posting wrs
 * to the hca in batches of up to IBD_MAX_POST_MULTIPLE.  Any wr
 * that fails to post is cleaned up immediately since no completion
 * will ever arrive for it.
 */
static void
ibd_post_send(ibd_state_t *state, ibd_swqe_t *node)
{
	uint_t		i;
	uint_t		num_posted;
	uint_t		n_wrs;
	ibt_status_t	ibt_status;
	ibt_send_wr_t	wrs[IBD_MAX_POST_MULTIPLE];
	ibd_swqe_t	*elem;
	ibd_swqe_t	*nodes[IBD_MAX_POST_MULTIPLE];

	node->swqe_next = NULL;

	mutex_enter(&state->id_txpost_lock);

	/*
	 * Enqueue the new node in chain of wqes to send
	 */
	if (state->id_tx_head) {
		*(state->id_tx_tailp) = (ibd_wqe_t *)node;
	} else {
		state->id_tx_head = node;
	}
	state->id_tx_tailp = &(node->swqe_next);

	/*
	 * If someone else is helping out with the sends,
	 * just go back
	 */
	if (state->id_tx_busy) {
		mutex_exit(&state->id_txpost_lock);
		return;
	}

	/*
	 * Otherwise, mark the flag to indicate that we'll be
	 * doing the dispatch of what's there in the wqe chain
	 */
	state->id_tx_busy = 1;

	while (state->id_tx_head) {
		/*
		 * Collect pending requests, IBD_MAX_POST_MULTIPLE wrs
		 * at a time if possible, and keep posting them.
		 */
		for (n_wrs = 0, elem = state->id_tx_head;
		    (elem) && (n_wrs < IBD_MAX_POST_MULTIPLE);
		    elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {

			nodes[n_wrs] = elem;
			wrs[n_wrs] = elem->w_swr;
		}
		/* elem is the first entry NOT copied into this batch */
		state->id_tx_head = elem;

		/*
		 * Release the txpost lock before posting the
		 * send request to the hca; if the posting fails
		 * for some reason, we'll never receive completion
		 * intimation, so we'll need to cleanup.
		 */
		mutex_exit(&state->id_txpost_lock);

		ASSERT(n_wrs != 0);

		/*
		 * If posting fails for some reason, we'll never receive
		 * completion intimation, so we'll need to cleanup. But
		 * we need to make sure we don't clean up nodes whose
		 * wrs have been successfully posted. We assume that the
		 * hca driver returns on the first failure to post and
		 * therefore the first 'num_posted' entries don't need
		 * cleanup here.
		 */
		num_posted = 0;
		ibt_status = ibt_post_send(state->id_chnl_hdl,
		    wrs, n_wrs, &num_posted);
		if (ibt_status != IBT_SUCCESS) {

			ibd_print_warn(state, "ibd_post_send: "
			    "posting multiple wrs failed: "
			    "requested=%d, done=%d, ret=%d",
			    n_wrs, num_posted, ibt_status);

			for (i = num_posted; i < n_wrs; i++)
				ibd_tx_cleanup(state, nodes[i]);
		}

		/*
		 * Grab the mutex before we go and check the tx Q again
		 */
		mutex_enter(&state->id_txpost_lock);
	}

	/* Chain drained; let the next caller take over dispatch */
	state->id_tx_busy = 0;
	mutex_exit(&state->id_txpost_lock);
}
5387 
/*
 * Build the scatter/gather list for a send: either map the mblk
 * fragments directly with the Reserved LKey (ibt_map_mem_iov) when
 * the packet is large enough and has few enough fragments, or fall
 * back to copying the payload into the wqe's pre-mapped tx buffer
 * (or a set of pre-mapped LSO buffers for oversized packets).
 *
 * For LSO sends, 'lsohdr_sz' bytes of header at the front of 'mp'
 * are carried separately in the wr and must be skipped here.
 * Returns 0 on success, -1 if LSO buffers cannot be acquired.
 */
static int
ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node,
    uint_t lsohdr_sz)
{
	ibt_wr_ds_t *sgl;
	ibt_status_t ibt_status;
	mblk_t *nmp;
	mblk_t *data_mp;
	uchar_t *bufp;
	size_t blksize;
	size_t skip;
	size_t avail;
	uint_t pktsize;
	uint_t frag_len;
	uint_t pending_hdr;
	uint_t hiwm;
	int nmblks;
	int i;

	/*
	 * Let's skip ahead to the data if this is LSO
	 */
	data_mp = mp;
	pending_hdr = 0;
	if (lsohdr_sz) {
		pending_hdr = lsohdr_sz;
		for (nmp = mp; nmp; nmp = nmp->b_cont) {
			frag_len = nmp->b_wptr - nmp->b_rptr;
			if (frag_len > pending_hdr)
				break;
			pending_hdr -= frag_len;
		}
		/* pending_hdr is now the header residue inside data_mp */
		data_mp = nmp;	/* start of data past lso header */
		ASSERT(data_mp != NULL);
	}

	/*
	 * Calculate the size of message data and number of msg blocks
	 */
	pktsize = 0;
	for (nmblks = 0, nmp = data_mp; nmp != NULL;
	    nmp = nmp->b_cont, nmblks++) {
		pktsize += MBLKL(nmp);
	}
	pktsize -= pending_hdr;

	/*
	 * Translating the virtual address regions into physical regions
	 * for using the Reserved LKey feature results in a wr sgl that
	 * is a little longer. Since failing ibt_map_mem_iov() is costly,
	 * we'll fix a high-water mark (65%) for when we should stop.
	 */
	hiwm = (state->id_max_sqseg * 65) / 100;

	/*
	 * We only do ibt_map_mem_iov() if the pktsize is above the
	 * "copy-threshold", and if the number of mp fragments is less than
	 * the maximum acceptable.
	 */
	if ((state->id_hca_res_lkey_capab) &&
	    (pktsize > IBD_TX_COPY_THRESH) &&
	    (nmblks < hiwm)) {
		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
		ibt_iov_attr_t iov_attr;

		iov_attr.iov_as = NULL;
		iov_attr.iov = iov_arr;
		iov_attr.iov_buf = NULL;
		iov_attr.iov_list_len = nmblks;
		iov_attr.iov_wr_nds = state->id_max_sqseg;
		iov_attr.iov_lso_hdr_sz = lsohdr_sz;
		iov_attr.iov_flags = IBT_IOV_SLEEP;

		/* One iov entry per fragment; skip any lso header residue */
		for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) {
			iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
			iov_arr[i].iov_len = MBLKL(nmp);
			if (i == 0) {
				iov_arr[i].iov_addr += pending_hdr;
				iov_arr[i].iov_len -= pending_hdr;
			}
		}

		node->w_buftype = IBD_WQE_MAPPED;
		node->w_swr.wr_sgl = node->w_sgl;

		ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr,
		    (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl);
		if (ibt_status != IBT_SUCCESS) {
			/* Mapping failed; fall back to the copy path */
			ibd_print_warn(state, "ibd_send: ibt_map_mem_iov "
			    "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status);
			goto ibd_copy_path;
		}

		return (0);
	}

ibd_copy_path:
	if (pktsize <= state->id_tx_buf_sz) {
		node->swqe_copybuf.ic_sgl.ds_len = pktsize;
		node->w_swr.wr_nds = 1;
		node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
		node->w_buftype = IBD_WQE_TXBUF;

		/*
		 * Even though this is the copy path for transfers less than
		 * id_tx_buf_sz, it could still be an LSO packet.  If so, it
		 * is possible the first data mblk fragment (data_mp) still
		 * contains part of the LSO header that we need to skip.
		 */
		bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
		for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
			blksize = MBLKL(nmp) - pending_hdr;
			bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
			bufp += blksize;
			/* Only the first fragment can hold header residue */
			pending_hdr = 0;
		}

		return (0);
	}

	/*
	 * Copy path for transfers greater than id_tx_buf_sz
	 */
	node->w_swr.wr_sgl = node->w_sgl;
	if (ibd_acquire_lsobufs(state, pktsize,
	    node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) {
		DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed");
		return (-1);
	}
	node->w_buftype = IBD_WQE_LSOBUF;

	/*
	 * Copy the larger-than-id_tx_buf_sz packet into a set of
	 * fixed-sized, pre-mapped LSO buffers. Note that we might
	 * need to skip part of the LSO header in the first fragment
	 * as before.
	 */
	nmp = data_mp;
	skip = pending_hdr;
	for (i = 0; i < node->w_swr.wr_nds; i++) {
		sgl = node->w_swr.wr_sgl + i;
		bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
		avail = IBD_LSO_BUFSZ;
		/* Fill this buffer from as many fragments as fit */
		while (nmp && avail) {
			blksize = MBLKL(nmp) - skip;
			if (blksize > avail) {
				/* Fragment spills into the next buffer */
				bcopy(nmp->b_rptr + skip, bufp, avail);
				skip += avail;
				avail = 0;
			} else {
				bcopy(nmp->b_rptr + skip, bufp, blksize);
				skip = 0;
				avail -= blksize;
				bufp += blksize;
				nmp = nmp->b_cont;
			}
		}
	}

	return (0);
}
5549 
5550 /*
5551  * Schedule a completion queue polling to reap the resource we're
5552  * short on.  If we implement the change to reap tx completions
5553  * in a separate thread, we'll need to wake up that thread here.
5554  */
5555 static int
5556 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag)
5557 {
5558 	ibd_req_t *req;
5559 
5560 	mutex_enter(&state->id_sched_lock);
5561 	state->id_sched_needed |= resource_type;
5562 	mutex_exit(&state->id_sched_lock);
5563 
5564 	/*
5565 	 * If we are asked to queue a work entry, we need to do it
5566 	 */
5567 	if (q_flag) {
5568 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
5569 		if (req == NULL)
5570 			return (-1);
5571 
5572 		ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
5573 	}
5574 
5575 	return (0);
5576 }
5577 
5578 /*
5579  * The passed in packet has this format:
5580  * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
5581  */
5582 static boolean_t
5583 ibd_send(ibd_state_t *state, mblk_t *mp)
5584 {
5585 	ibd_ace_t *ace;
5586 	ibd_swqe_t *node;
5587 	ipoib_mac_t *dest;
5588 	ib_header_info_t *ipibp;
5589 	ip6_t *ip6h;
5590 	uint_t pktsize;
5591 	uint32_t mss;
5592 	uint32_t hckflags;
5593 	uint32_t lsoflags = 0;
5594 	uint_t lsohdr_sz = 0;
5595 	int ret, len;
5596 	boolean_t dofree = B_FALSE;
5597 	boolean_t rc;
5598 
5599 	/*
5600 	 * If we aren't done with the device initialization and start,
5601 	 * we shouldn't be here.
5602 	 */
5603 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
5604 		return (B_FALSE);
5605 
5606 	node = NULL;
5607 	if (ibd_acquire_swqe(state, &node) != 0) {
5608 		/*
5609 		 * If we don't have an swqe available, schedule a transmit
5610 		 * completion queue cleanup and hold off on sending more
5611 		 * more packets until we have some free swqes
5612 		 */
5613 		if (ibd_sched_poll(state, IBD_RSRC_SWQE, ibd_txcomp_poll) == 0)
5614 			return (B_FALSE);
5615 
5616 		/*
5617 		 * If a poll cannot be scheduled, we have no choice but
5618 		 * to drop this packet
5619 		 */
5620 		ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
5621 		return (B_TRUE);
5622 	}
5623 
5624 	/*
5625 	 * Initialize the commonly used fields in swqe to NULL to protect
5626 	 * against ibd_tx_cleanup accidentally misinterpreting these on a
5627 	 * failure.
5628 	 */
5629 	node->swqe_im_mblk = NULL;
5630 	node->w_swr.wr_nds = 0;
5631 	node->w_swr.wr_sgl = NULL;
5632 	node->w_swr.wr_opcode = IBT_WRC_SEND;
5633 
5634 	/*
5635 	 * Obtain an address handle for the destination.
5636 	 */
5637 	ipibp = (ib_header_info_t *)mp->b_rptr;
5638 	dest = (ipoib_mac_t *)&ipibp->ib_dst;
5639 	if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
5640 		IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);
5641 
5642 	pktsize = msgsize(mp);
5643 
5644 	atomic_add_64(&state->id_xmt_bytes, pktsize);
5645 	atomic_inc_64(&state->id_xmt_pkt);
5646 	if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
5647 		atomic_inc_64(&state->id_brd_xmt);
5648 	else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
5649 		atomic_inc_64(&state->id_multi_xmt);
5650 
5651 	if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) {
5652 		node->w_ahandle = ace;
5653 		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
5654 	} else {
5655 		DPRINT(5,
5656 		    "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
5657 		    ((ret == EFAULT) ? "failed" : "queued"),
5658 		    htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
5659 		    htonl(dest->ipoib_gidpref[1]),
5660 		    htonl(dest->ipoib_gidsuff[0]),
5661 		    htonl(dest->ipoib_gidsuff[1]));
5662 		node->w_ahandle = NULL;
5663 
5664 		/*
5665 		 * for the poll mode, it is probably some cqe pending in the
5666 		 * cq. So ibd has to poll cq here, otherwise acache probably
5667 		 * may not be recycled.
5668 		 */
5669 		if (ibd_txcomp_poll == 1)
5670 			ibd_poll_compq(state, state->id_scq_hdl);
5671 
5672 		/*
5673 		 * Here if ibd_acache_lookup() returns EFAULT, it means ibd
5674 		 * can not find a path for the specific dest address. We
5675 		 * should get rid of this kind of packet.  We also should get
5676 		 * rid of the packet if we cannot schedule a poll via the
5677 		 * async thread.  For the normal case, ibd will return the
5678 		 * packet to upper layer and wait for AH creating.
5679 		 *
5680 		 * Note that we always queue a work slot entry for the async
5681 		 * thread when we fail AH lookup (even in intr mode); this is
5682 		 * due to the convoluted way the code currently looks for AH.
5683 		 */
5684 		if (ret == EFAULT) {
5685 			dofree = B_TRUE;
5686 			rc = B_TRUE;
5687 		} else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
5688 			dofree = B_TRUE;
5689 			rc = B_TRUE;
5690 		} else {
5691 			dofree = B_FALSE;
5692 			rc = B_FALSE;
5693 		}
5694 		goto ibd_send_fail;
5695 	}
5696 
5697 	/*
5698 	 * For ND6 packets, padding is at the front of the source lladdr.
5699 	 * Insert the padding at front.
5700 	 */
5701 	if (ntohs(ipibp->ipib_rhdr.ipoib_type) == IP6_DL_SAP) {
5702 		if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
5703 			if (!pullupmsg(mp, IPV6_HDR_LEN +
5704 			    sizeof (ib_header_info_t))) {
5705 				DPRINT(10, "ibd_send: pullupmsg failure ");
5706 				dofree = B_TRUE;
5707 				rc = B_TRUE;
5708 				goto ibd_send_fail;
5709 			}
5710 			ipibp = (ib_header_info_t *)mp->b_rptr;
5711 		}
5712 		ip6h = (ip6_t *)((uchar_t *)ipibp +
5713 		    sizeof (ib_header_info_t));
5714 		len = ntohs(ip6h->ip6_plen);
5715 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
5716 			mblk_t	*pad;
5717 
5718 			pad = allocb(4, 0);
5719 			pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
5720 			linkb(mp, pad);
5721 			if (MBLKL(mp) < sizeof (ib_header_info_t) +
5722 			    IPV6_HDR_LEN + len + 4) {
5723 				if (!pullupmsg(mp, sizeof (ib_header_info_t) +
5724 				    IPV6_HDR_LEN + len + 4)) {
5725 					DPRINT(10, "ibd_send: pullupmsg "
5726 					    "failure ");
5727 					dofree = B_TRUE;
5728 					rc = B_TRUE;
5729 					goto ibd_send_fail;
5730 				}
5731 				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
5732 				    sizeof (ib_header_info_t));
5733 			}
5734 
5735 			/* LINTED: E_CONSTANT_CONDITION */
5736 			IBD_PAD_NSNA(ip6h, len, IBD_SEND);
5737 		}
5738 	}
5739 
5740 	mp->b_rptr += sizeof (ib_addrs_t);
5741 
5742 	/*
5743 	 * Do LSO and checksum related work here.  For LSO send, adjust the
5744 	 * ud destination, the opcode and the LSO header information to the
5745 	 * work request.
5746 	 */
5747 	lso_info_get(mp, &mss, &lsoflags);
5748 	if ((lsoflags & HW_LSO) != HW_LSO) {
5749 		node->w_swr.wr_opcode = IBT_WRC_SEND;
5750 		lsohdr_sz = 0;
5751 	} else {
5752 		if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
5753 			/*
5754 			 * The routine can only fail if there's no memory; we
5755 			 * can only drop the packet if this happens
5756 			 */
5757 			ibd_print_warn(state,
5758 			    "ibd_send: no memory, lso posting failed");
5759 			dofree = B_TRUE;
5760 			rc = B_TRUE;
5761 			goto ibd_send_fail;
5762 		}
5763 
5764 		node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
5765 		lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
5766 	}
5767 
5768 	hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags);
5769 	if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
5770 		node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
5771 	else
5772 		node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;
5773 
5774 	/*
5775 	 * Prepare the sgl for posting; the routine can only fail if there's
5776 	 * no lso buf available for posting. If this is the case, we should
5777 	 * probably resched for lso bufs to become available and then try again.
5778 	 */
5779 	if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
5780 		if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
5781 			dofree = B_TRUE;
5782 			rc = B_TRUE;
5783 		} else {
5784 			dofree = B_FALSE;
5785 			rc = B_FALSE;
5786 		}
5787 		goto ibd_send_fail;
5788 	}
5789 	node->swqe_im_mblk = mp;
5790 
5791 	/*
5792 	 * Queue the wqe to hardware; since we can now simply queue a
5793 	 * post instead of doing it serially, we cannot assume anything
5794 	 * about the 'node' after ibd_post_send() returns.
5795 	 */
5796 	ibd_post_send(state, node);
5797 
5798 	return (B_TRUE);
5799 
5800 ibd_send_fail:
5801 	if (node && mp)
5802 		ibd_free_lsohdr(node, mp);
5803 
5804 	if (dofree)
5805 		freemsg(mp);
5806 
5807 	if (node != NULL)
5808 		ibd_tx_cleanup(state, node);
5809 
5810 	return (rc);
5811 }
5812 
5813 /*
5814  * GLDv3 entry point for transmitting datagram.
5815  */
5816 static mblk_t *
5817 ibd_m_tx(void *arg, mblk_t *mp)
5818 {
5819 	ibd_state_t *state = (ibd_state_t *)arg;
5820 	mblk_t *next;
5821 
5822 	if (state->id_link_state != LINK_STATE_UP) {
5823 		freemsgchain(mp);
5824 		mp = NULL;
5825 	}
5826 
5827 	while (mp != NULL) {
5828 		next = mp->b_next;
5829 		mp->b_next = NULL;
5830 		if (ibd_send(state, mp) == B_FALSE) {
5831 			/* Send fail */
5832 			mp->b_next = next;
5833 			break;
5834 		}
5835 		mp = next;
5836 	}
5837 
5838 	return (mp);
5839 }
5840 
5841 /*
5842  * this handles Tx and Rx completions. With separate CQs, this handles
5843  * only Rx completions.
5844  */
5845 static uint_t
5846 ibd_intr(char *arg)
5847 {
5848 	ibd_state_t *state = (ibd_state_t *)arg;
5849 
5850 	ibd_poll_compq(state, state->id_rcq_hdl);
5851 
5852 	return (DDI_INTR_CLAIMED);
5853 }
5854 
5855 /*
5856  * Poll and drain the cq
5857  */
5858 static uint_t
5859 ibd_drain_cq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl, ibt_wc_t *wcs,
5860     uint_t numwcs)
5861 {
5862 	ibd_wqe_t *wqe;
5863 	ibt_wc_t *wc;
5864 	uint_t total_polled = 0;
5865 	uint_t num_polled;
5866 	int i;
5867 
5868 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
5869 		total_polled += num_polled;
5870 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
5871 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
5872 			ASSERT((wqe->w_type == IBD_WQE_SEND) ||
5873 			    (wqe->w_type == IBD_WQE_RECV));
5874 			if (wc->wc_status != IBT_WC_SUCCESS) {
5875 				/*
5876 				 * Channel being torn down.
5877 				 */
5878 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
5879 					DPRINT(5, "ibd_drain_cq: flush error");
5880 					/*
5881 					 * Only invoke the Tx handler to
5882 					 * release possibly held resources
5883 					 * like AH refcount etc. Can not
5884 					 * invoke Rx handler because it might
5885 					 * try adding buffers to the Rx pool
5886 					 * when we are trying to deinitialize.
5887 					 */
5888 					if (wqe->w_type == IBD_WQE_RECV) {
5889 						continue;
5890 					} else {
5891 						DPRINT(10, "ibd_drain_cq: Bad "
5892 						    "status %d", wc->wc_status);
5893 					}
5894 				}
5895 			}
5896 			if (wqe->w_type == IBD_WQE_SEND) {
5897 				ibd_tx_cleanup(state, WQE_TO_SWQE(wqe));
5898 			} else {
5899 				ibd_process_rx(state, WQE_TO_RWQE(wqe), wc);
5900 			}
5901 		}
5902 	}
5903 
5904 	return (total_polled);
5905 }
5906 
5907 /*
5908  * Common code for interrupt handling as well as for polling
5909  * for all completed wqe's while detaching.
5910  */
5911 static void
5912 ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
5913 {
5914 	ibt_wc_t *wcs;
5915 	uint_t numwcs;
5916 	int flag, redo_flag;
5917 	int redo = 1;
5918 	uint_t num_polled = 0;
5919 
5920 	if (ibd_separate_cqs == 1) {
5921 		if (cq_hdl == state->id_rcq_hdl) {
5922 			flag = IBD_RX_CQ_POLLING;
5923 			redo_flag = IBD_REDO_RX_CQ_POLLING;
5924 		} else {
5925 			flag = IBD_TX_CQ_POLLING;
5926 			redo_flag = IBD_REDO_TX_CQ_POLLING;
5927 		}
5928 	} else {
5929 		flag = IBD_RX_CQ_POLLING | IBD_TX_CQ_POLLING;
5930 		redo_flag = IBD_REDO_RX_CQ_POLLING | IBD_REDO_TX_CQ_POLLING;
5931 	}
5932 
5933 	mutex_enter(&state->id_cq_poll_lock);
5934 	if (state->id_cq_poll_busy & flag) {
5935 		state->id_cq_poll_busy |= redo_flag;
5936 		mutex_exit(&state->id_cq_poll_lock);
5937 		return;
5938 	}
5939 	state->id_cq_poll_busy |= flag;
5940 	mutex_exit(&state->id_cq_poll_lock);
5941 
5942 	/*
5943 	 * In some cases (eg detaching), this code can be invoked on
5944 	 * any cpu after disabling cq notification (thus no concurrency
5945 	 * exists). Apart from that, the following applies normally:
5946 	 * The receive completion handling is always on the Rx interrupt
5947 	 * cpu. Transmit completion handling could be from any cpu if
5948 	 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
5949 	 * is interrupt driven. Combined completion handling is always
5950 	 * on the interrupt cpu. Thus, lock accordingly and use the
5951 	 * proper completion array.
5952 	 */
5953 	if (ibd_separate_cqs == 1) {
5954 		if (cq_hdl == state->id_rcq_hdl) {
5955 			wcs = state->id_rxwcs;
5956 			numwcs = state->id_rxwcs_size;
5957 		} else {
5958 			wcs = state->id_txwcs;
5959 			numwcs = state->id_txwcs_size;
5960 		}
5961 	} else {
5962 		wcs = state->id_rxwcs;
5963 		numwcs = state->id_rxwcs_size;
5964 	}
5965 
5966 	/*
5967 	 * Poll and drain the CQ
5968 	 */
5969 	num_polled = ibd_drain_cq(state, cq_hdl, wcs, numwcs);
5970 
5971 	/*
5972 	 * Enable CQ notifications and redrain the cq to catch any
5973 	 * completions we might have missed after the ibd_drain_cq()
5974 	 * above and before the ibt_enable_cq_notify() that follows.
5975 	 * Finally, service any new requests to poll the cq that
5976 	 * could've come in after the ibt_enable_cq_notify().
5977 	 */
5978 	do {
5979 		if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
5980 		    IBT_SUCCESS) {
5981 			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
5982 		}
5983 
5984 		num_polled += ibd_drain_cq(state, cq_hdl, wcs, numwcs);
5985 
5986 		mutex_enter(&state->id_cq_poll_lock);
5987 		if (state->id_cq_poll_busy & redo_flag)
5988 			state->id_cq_poll_busy &= ~redo_flag;
5989 		else {
5990 			state->id_cq_poll_busy &= ~flag;
5991 			redo = 0;
5992 		}
5993 		mutex_exit(&state->id_cq_poll_lock);
5994 
5995 	} while (redo);
5996 
5997 	/*
5998 	 * If we polled the receive cq and found anything, we need to flush
5999 	 * it out to the nw layer here.
6000 	 */
6001 	if ((flag & IBD_RX_CQ_POLLING) && (num_polled > 0)) {
6002 		ibd_flush_rx(state, NULL);
6003 	}
6004 }
6005 
6006 /*
6007  * Unmap the memory area associated with a given swqe.
6008  */
6009 static void
6010 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
6011 {
6012 	ibt_status_t stat;
6013 
6014 	DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);
6015 
6016 	if (swqe->w_mi_hdl) {
6017 		if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
6018 		    swqe->w_mi_hdl)) != IBT_SUCCESS) {
6019 			DPRINT(10,
6020 			    "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
6021 		}
6022 		swqe->w_mi_hdl = NULL;
6023 	}
6024 	swqe->w_swr.wr_nds = 0;
6025 }
6026 
6027 /*
6028  * Common code that deals with clean ups after a successful or
6029  * erroneous transmission attempt.
6030  */
6031 static void
6032 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
6033 {
6034 	ibd_ace_t *ace = swqe->w_ahandle;
6035 
6036 	DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
6037 
6038 	/*
6039 	 * If this was a dynamic mapping in ibd_send(), we need to
6040 	 * unmap here. If this was an lso buffer we'd used for sending,
6041 	 * we need to release the lso buf to the pool, since the resource
6042 	 * is scarce. However, if this was simply a normal send using
6043 	 * the copybuf (present in each swqe), we don't need to release it.
6044 	 */
6045 	if (swqe->swqe_im_mblk != NULL) {
6046 		if (swqe->w_buftype == IBD_WQE_MAPPED) {
6047 			ibd_unmap_mem(state, swqe);
6048 		} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
6049 			ibd_release_lsobufs(state,
6050 			    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
6051 		}
6052 		ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
6053 		freemsg(swqe->swqe_im_mblk);
6054 		swqe->swqe_im_mblk = NULL;
6055 	}
6056 
6057 	/*
6058 	 * Drop the reference count on the AH; it can be reused
6059 	 * now for a different destination if there are no more
6060 	 * posted sends that will use it. This can be eliminated
6061 	 * if we can always associate each Tx buffer with an AH.
6062 	 * The ace can be null if we are cleaning up from the
6063 	 * ibd_send() error path.
6064 	 */
6065 	if (ace != NULL) {
6066 		/*
6067 		 * The recycling logic can be eliminated from here
6068 		 * and put into the async thread if we create another
6069 		 * list to hold ACE's for unjoined mcg's.
6070 		 */
6071 		if (DEC_REF_DO_CYCLE(ace)) {
6072 			ibd_mce_t *mce;
6073 
6074 			/*
6075 			 * Check with the lock taken: we decremented
6076 			 * reference count without the lock, and some
6077 			 * transmitter might alreay have bumped the
6078 			 * reference count (possible in case of multicast
6079 			 * disable when we leave the AH on the active
6080 			 * list). If not still 0, get out, leaving the
6081 			 * recycle bit intact.
6082 			 *
6083 			 * Atomically transition the AH from active
6084 			 * to free list, and queue a work request to
6085 			 * leave the group and destroy the mce. No
6086 			 * transmitter can be looking at the AH or
6087 			 * the MCE in between, since we have the
6088 			 * ac_mutex lock. In the SendOnly reap case,
6089 			 * it is not neccesary to hold the ac_mutex
6090 			 * and recheck the ref count (since the AH was
6091 			 * taken off the active list), we just do it
6092 			 * to have uniform processing with the Full
6093 			 * reap case.
6094 			 */
6095 			mutex_enter(&state->id_ac_mutex);
6096 			mce = ace->ac_mce;
6097 			if (GET_REF_CYCLE(ace) == 0) {
6098 				CLEAR_REFCYCLE(ace);
6099 				/*
6100 				 * Identify the case of fullmember reap as
6101 				 * opposed to mcg trap reap. Also, port up
6102 				 * might set ac_mce to NULL to indicate Tx
6103 				 * cleanup should do no more than put the
6104 				 * AH in the free list (see ibd_async_link).
6105 				 */
6106 				if (mce != NULL) {
6107 					ace->ac_mce = NULL;
6108 					IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
6109 					/*
6110 					 * mc_req was initialized at mce
6111 					 * creation time.
6112 					 */
6113 					ibd_queue_work_slot(state,
6114 					    &mce->mc_req, IBD_ASYNC_REAP);
6115 				}
6116 				IBD_ACACHE_INSERT_FREE(state, ace);
6117 			}
6118 			mutex_exit(&state->id_ac_mutex);
6119 		}
6120 	}
6121 
6122 	/*
6123 	 * Release the send wqe for reuse.
6124 	 */
6125 	ibd_release_swqe(state, swqe);
6126 }
6127 
6128 /*
6129  * Hand off the processed rx mp chain to mac_rx()
6130  */
6131 static void
6132 ibd_flush_rx(ibd_state_t *state, mblk_t *mpc)
6133 {
6134 	if (mpc == NULL) {
6135 		mutex_enter(&state->id_rx_lock);
6136 
6137 		mpc = state->id_rx_mp;
6138 
6139 		state->id_rx_mp = NULL;
6140 		state->id_rx_mp_tail = NULL;
6141 		state->id_rx_mp_len = 0;
6142 
6143 		mutex_exit(&state->id_rx_lock);
6144 	}
6145 
6146 	if (mpc) {
6147 		mac_rx(state->id_mh, state->id_rh, mpc);
6148 	}
6149 }
6150 
6151 /*
6152  * Processing to be done after receipt of a packet; hand off to GLD
6153  * in the format expected by GLD.  The received packet has this
6154  * format: 2b sap :: 00 :: data.
6155  */
6156 static void
6157 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
6158 {
6159 	ib_header_info_t *phdr;
6160 	mblk_t *mp;
6161 	mblk_t *mpc = NULL;
6162 	ipoib_hdr_t *ipibp;
6163 	ipha_t *iphap;
6164 	ip6_t *ip6h;
6165 	int rxcnt, len;
6166 
6167 	/*
6168 	 * Track number handed to upper layer, and number still
6169 	 * available to receive packets.
6170 	 */
6171 	rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1);
6172 	ASSERT(rxcnt >= 0);
6173 	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1);
6174 
6175 	/*
6176 	 * Adjust write pointer depending on how much data came in.
6177 	 */
6178 	mp = rwqe->rwqe_im_mblk;
6179 	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer;
6180 
6181 	/*
6182 	 * Make sure this is NULL or we're in trouble.
6183 	 */
6184 	if (mp->b_next != NULL) {
6185 		ibd_print_warn(state,
6186 		    "ibd_process_rx: got duplicate mp from rcq?");
6187 		mp->b_next = NULL;
6188 	}
6189 
6190 	/*
6191 	 * the IB link will deliver one of the IB link layer
6192 	 * headers called, the Global Routing Header (GRH).
6193 	 * ibd driver uses the information in GRH to build the
6194 	 * Header_info structure and pass it with the datagram up
6195 	 * to GLDv3.
6196 	 * If the GRH is not valid, indicate to GLDv3 by setting
6197 	 * the VerTcFlow field to 0.
6198 	 */
6199 	phdr = (ib_header_info_t *)mp->b_rptr;
6200 	if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
6201 		phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);
6202 
6203 		/* if it is loop back packet, just drop it. */
6204 		if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
6205 		    IPOIB_ADDRL) == 0) {
6206 			freemsg(mp);
6207 			return;
6208 		}
6209 
6210 		ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
6211 		    sizeof (ipoib_mac_t));
6212 		if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
6213 			phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
6214 			IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
6215 		} else {
6216 			phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
6217 		}
6218 	} else {
6219 		/*
6220 		 * It can not be a IBA multicast packet. Must have been
6221 		 * unicast for us. Just copy the interface address to dst.
6222 		 */
6223 		phdr->ib_grh.ipoib_vertcflow = 0;
6224 		ovbcopy(&state->id_macaddr, &phdr->ib_dst,
6225 		    sizeof (ipoib_mac_t));
6226 	}
6227 
6228 	/*
6229 	 * For ND6 packets, padding is at the front of the source/target
6230 	 * lladdr. However the inet6 layer is not aware of it, hence remove
6231 	 * the padding from such packets.
6232 	 */
6233 	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
6234 	if (ntohs(ipibp->ipoib_type) == IP6_DL_SAP) {
6235 		if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) {
6236 			if (!pullupmsg(mp, IPV6_HDR_LEN +
6237 			    sizeof (ipoib_hdr_t))) {
6238 				DPRINT(10, "ibd_process_rx: pullupmsg failed");
6239 				freemsg(mp);
6240 				return;
6241 			}
6242 			ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr +
6243 			    sizeof (ipoib_pgrh_t));
6244 		}
6245 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
6246 		len = ntohs(ip6h->ip6_plen);
6247 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
6248 			if (MBLKL(mp) < sizeof (ipoib_hdr_t) +
6249 			    IPV6_HDR_LEN + len) {
6250 				if (!pullupmsg(mp, sizeof (ipoib_hdr_t) +
6251 				    IPV6_HDR_LEN + len)) {
6252 					DPRINT(10, "ibd_process_rx: pullupmsg"
6253 					    " failed");
6254 					freemsg(mp);
6255 					return;
6256 				}
6257 				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
6258 				    sizeof (ipoib_pgrh_t) +
6259 				    sizeof (ipoib_hdr_t));
6260 			}
6261 			/* LINTED: E_CONSTANT_CONDITION */
6262 			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
6263 		}
6264 	}
6265 
6266 	/*
6267 	 * Update statistics
6268 	 */
6269 	atomic_add_64(&state->id_rcv_bytes, wc->wc_bytes_xfer);
6270 	atomic_inc_64(&state->id_rcv_pkt);
6271 	if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
6272 		atomic_inc_64(&state->id_brd_rcv);
6273 	else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
6274 		atomic_inc_64(&state->id_multi_rcv);
6275 
6276 	iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
6277 	/*
6278 	 * Set receive checksum status in mp
6279 	 * Hardware checksumming can be considered valid only if:
6280 	 * 1. CQE.IP_OK bit is set
6281 	 * 2. CQE.CKSUM = 0xffff
6282 	 * 3. IPv6 routing header is not present in the packet
6283 	 * 4. If there are no IP_OPTIONS in the IP HEADER
6284 	 */
6285 
6286 	if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
6287 	    (wc->wc_cksum == 0xFFFF) &&
6288 	    (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
6289 		(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
6290 		    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
6291 	}
6292 
6293 	/*
6294 	 * Add this mp to the list of processed mp's to send to
6295 	 * the nw layer
6296 	 */
6297 	mutex_enter(&state->id_rx_lock);
6298 	if (state->id_rx_mp) {
6299 		ASSERT(state->id_rx_mp_tail != NULL);
6300 		state->id_rx_mp_tail->b_next = mp;
6301 	} else {
6302 		ASSERT(state->id_rx_mp_tail == NULL);
6303 		state->id_rx_mp = mp;
6304 	}
6305 
6306 	state->id_rx_mp_tail = mp;
6307 	state->id_rx_mp_len++;
6308 
6309 	if (state->id_rx_mp_len  >= IBD_MAX_RX_MP_LEN) {
6310 		mpc = state->id_rx_mp;
6311 
6312 		state->id_rx_mp = NULL;
6313 		state->id_rx_mp_tail = NULL;
6314 		state->id_rx_mp_len = 0;
6315 	}
6316 
6317 	mutex_exit(&state->id_rx_lock);
6318 
6319 	if (mpc) {
6320 		ibd_flush_rx(state, mpc);
6321 	}
6322 }
6323 
6324 /*
6325  * Callback code invoked from STREAMs when the receive data buffer is
6326  * free for recycling.
6327  */
6328 static void
6329 ibd_freemsg_cb(char *arg)
6330 {
6331 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
6332 	ibd_state_t *state = rwqe->w_state;
6333 
6334 	/*
6335 	 * If the wqe is being destructed, do not attempt recycling.
6336 	 */
6337 	if (rwqe->w_freeing_wqe == B_TRUE) {
6338 		DPRINT(6, "ibd_freemsg: wqe being freed");
6339 		return;
6340 	} else {
6341 		/*
6342 		 * Upper layer has released held mblk, so we have
6343 		 * no more use for keeping the old pointer in
6344 		 * our rwqe.
6345 		 */
6346 		rwqe->rwqe_im_mblk = NULL;
6347 	}
6348 
6349 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
6350 	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
6351 	if (rwqe->rwqe_im_mblk == NULL) {
6352 		ibd_delete_rwqe(state, rwqe);
6353 		ibd_free_rwqe(state, rwqe);
6354 		DPRINT(6, "ibd_freemsg: desballoc failed");
6355 		return;
6356 	}
6357 
6358 	if (ibd_post_recv(state, rwqe, B_TRUE) == DDI_FAILURE) {
6359 		ibd_delete_rwqe(state, rwqe);
6360 		ibd_free_rwqe(state, rwqe);
6361 		return;
6362 	}
6363 
6364 	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1);
6365 }
6366 
6367 static uint_t
6368 ibd_tx_recycle(char *arg)
6369 {
6370 	ibd_state_t *state = (ibd_state_t *)arg;
6371 
6372 	/*
6373 	 * Poll for completed entries
6374 	 */
6375 	ibd_poll_compq(state, state->id_scq_hdl);
6376 
6377 	/*
6378 	 * Resume any blocked transmissions if possible
6379 	 */
6380 	(void) ibd_resume_transmission(state);
6381 
6382 	return (DDI_INTR_CLAIMED);
6383 }
6384 
6385 #ifdef IBD_LOGGING
6386 static void
6387 ibd_log_init(void)
6388 {
6389 	ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
6390 	ibd_lbuf_ndx = 0;
6391 
6392 	mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
6393 }
6394 
6395 static void
6396 ibd_log_fini(void)
6397 {
6398 	if (ibd_lbuf)
6399 		kmem_free(ibd_lbuf, IBD_LOG_SZ);
6400 	ibd_lbuf_ndx = 0;
6401 	ibd_lbuf = NULL;
6402 
6403 	mutex_destroy(&ibd_lbuf_lock);
6404 }
6405 
6406 static void
6407 ibd_log(const char *fmt, ...)
6408 {
6409 	va_list	ap;
6410 	uint32_t off;
6411 	uint32_t msglen;
6412 	char tmpbuf[IBD_DMAX_LINE];
6413 
6414 	if (ibd_lbuf == NULL)
6415 		return;
6416 
6417 	va_start(ap, fmt);
6418 	msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
6419 	va_end(ap);
6420 
6421 	if (msglen >= IBD_DMAX_LINE)
6422 		msglen = IBD_DMAX_LINE - 1;
6423 
6424 	mutex_enter(&ibd_lbuf_lock);
6425 
6426 	off = ibd_lbuf_ndx;		/* current msg should go here */
6427 	if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
6428 		ibd_lbuf[ibd_lbuf_ndx-1] = '\n';
6429 
6430 	ibd_lbuf_ndx += msglen;		/* place where next msg should start */
6431 	ibd_lbuf[ibd_lbuf_ndx] = 0;	/* current msg should terminate */
6432 
6433 	if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
6434 		ibd_lbuf_ndx = 0;
6435 
6436 	mutex_exit(&ibd_lbuf_lock);
6437 
6438 	bcopy(tmpbuf, ibd_lbuf+off, msglen);	/* no lock needed for this */
6439 }
6440 #endif
6441