xref: /illumos-gate/usr/src/uts/common/io/ib/adapters/tavor/tavor_wr.c (revision 581cede61ac9c14d8d4ea452562a567189eead78)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * tavor_wr.c
29  *    Tavor Work Request Processing Routines
30  *
31  *    Implements all the routines necessary to provide the PostSend(),
32  *    PostRecv() and PostSRQ() verbs.  Also contains all the code
33  *    necessary to implement the Tavor WRID tracking mechanism.
34  */
35 
36 #include <sys/types.h>
37 #include <sys/conf.h>
38 #include <sys/ddi.h>
39 #include <sys/sunddi.h>
40 #include <sys/modctl.h>
41 #include <sys/avl.h>
42 
43 #include <sys/ib/adapters/tavor/tavor.h>
44 
45 static void tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda,
46     uint32_t nds, uint32_t qpn, uint32_t fence, uint32_t nopcode);
47 #pragma inline(tavor_qp_send_doorbell)
48 static void tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda,
49     uint32_t nds, uint32_t qpn, uint32_t credits);
50 #pragma inline(tavor_qp_recv_doorbell)
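/*
 * Note: the "#pragma inline" directives above ask the compiler to inline
 * the doorbell routines, since they are called from the performance
 * critical work request post paths below.
 */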
51 static uint32_t tavor_wr_get_immediate(ibt_send_wr_t *wr);
52 static int tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr);
53 static int tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
54     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
55 static void tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr,
56     ibt_send_wr_t *prev_wr, uint64_t *curr_desc, uint_t curr_descsz,
57     uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp);
58 static int tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp,
59     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
60 static void tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
61     uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
62     tavor_qphdl_t qp);
63 static int tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
64     ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size);
65 static void tavor_wqe_recv_linknext(uint64_t *desc, uint_t desc_sz,
66     uint64_t *prev, tavor_qphdl_t qp);
67 static int tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq,
68     ibt_recv_wr_t *wr, uint64_t *desc);
69 static void tavor_wqe_srq_linknext(uint64_t *desc, uint64_t *prev,
70     tavor_srqhdl_t srq);
71 static void tavor_wqe_sync(void *hdl, uint_t sync_from,
72     uint_t sync_to, uint_t sync_type, uint_t flag);
73 static tavor_wrid_entry_t *tavor_wrid_find_match(tavor_workq_hdr_t *wq,
74     tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe);
75 static void tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq);
76 static tavor_workq_hdr_t *tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn,
77     uint_t send_or_recv);
78 static tavor_workq_hdr_t *tavor_wrid_wqhdr_create(tavor_state_t *state,
79     tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type, uint_t create_wql);
80 static uint32_t tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq);
81 static void tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr,
82     tavor_wrid_list_hdr_t *wrid_list);
83 static void tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr,
84     tavor_wrid_list_hdr_t *wrid_list);
85 static tavor_workq_hdr_t *tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wq);
86 static void tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp);
87 static void tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp);
88 static void tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);
89 static void tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);
90 
91 /*
92  * tavor_post_send()
93  *    Context: Can be called from interrupt or base context.
94  */
95 int
96 tavor_post_send(tavor_state_t *state, tavor_qphdl_t qp,
97     ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
98 {
99 	tavor_sw_wqe_dbinfo_t		dbinfo;
100 	tavor_wrid_list_hdr_t		*wridlist;
101 	tavor_wrid_entry_t		*wre_last;
102 	uint64_t			*desc, *prev, *first;
103 	uint32_t			desc_sz, first_sz;
104 	uint32_t			wqeaddrsz, signaled_dbd;
105 	uint32_t			head, tail, next_tail, qsize_msk;
106 	uint32_t			sync_from, sync_to;
107 	uint_t				currindx, wrindx, numremain;
108 	uint_t				chainlen, chainbegin, posted_cnt;
109 	uint_t				maxdb = TAVOR_QP_MAXDESC_PER_DB;
110 	int				status;
111 
112 	TAVOR_TNF_ENTER(tavor_post_send);
113 
114 	/*
115 	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
116 	 * clients to post to QP memory that is accessible directly by the
117 	 * user.  If the QP memory is user accessible, then return an error.
118 	 */
119 	if (qp->qp_is_umap) {
120 		TNF_PROBE_0(tavor_post_send_inv_usrmapped_type,
121 		    TAVOR_TNF_ERROR, "");
122 		TAVOR_TNF_EXIT(tavor_post_send);
123 		return (IBT_QP_HDL_INVALID);
124 	}
125 
126 	/* Initialize posted_cnt */
127 	posted_cnt = 0;
128 
129 	mutex_enter(&qp->qp_lock);
130 
131 	/*
132 	 * Check QP state.  Can not post Send requests from the "Reset",
133 	 * "Init", or "RTR" states
134 	 */
135 	if ((qp->qp_state == TAVOR_QP_RESET) ||
136 	    (qp->qp_state == TAVOR_QP_INIT) ||
137 	    (qp->qp_state == TAVOR_QP_RTR)) {
138 		mutex_exit(&qp->qp_lock);
139 		TNF_PROBE_0(tavor_post_send_inv_qpstate_fail,
140 		    TAVOR_TNF_ERROR, "");
141 		TAVOR_TNF_EXIT(tavor_post_send);
142 		return (IBT_QP_STATE_INVALID);
143 	}
144 
145 	/* Grab the lock for the WRID list */
146 	mutex_enter(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
147 	wridlist  = qp->qp_sq_wqhdr->wq_wrid_post;
148 
149 	/* Save away some initial QP state */
150 	qsize_msk = qp->qp_sq_wqhdr->wq_size - 1;
151 	tail	  = qp->qp_sq_wqhdr->wq_tail;
152 	head	  = qp->qp_sq_wqhdr->wq_head;
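	/*
	 * Note: the work queue size is a power-of-2, so "qsize_msk"
	 * (i.e. size - 1) can be used below to wrap the "head" and "tail"
	 * indices around the end of the queue with a simple bitwise AND.
	 */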
153 
154 	/*
155 	 * For each ibt_send_wr_t in the wr[] list passed in, parse the
156 	 * request and build a Send WQE.  Note:  Because we are potentially
157 	 * building a chain of WQEs, we want to link them all together.
158 	 * However, we do not want to link the first one to the previous
159 	 * WQE until the entire chain has been linked.  Then in the last
160 	 * step we ring the appropriate doorbell.  Note:  It is possible for
161 	 * more Work Requests to be posted than the HW will support in one
162 	 * shot.  If this happens, we need to be able to post and ring
163 	 * several chains here until the entire request is complete.
164 	 */
165 	wrindx = 0;
166 	numremain = num_wr;
167 	status	  = DDI_SUCCESS;
168 	while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
169 		/*
170 		 * For the first WQE on a new chain we need "prev" to point
171 		 * to the current descriptor.  As we begin to process
172 		 * further, "prev" will be updated to point to the previous
173 		 * WQE on the current chain (see below).
174 		 */
175 		prev = TAVOR_QP_SQ_ENTRY(qp, tail);
176 
177 		/*
178 		 * Before we begin, save the current "tail index" for later
179 		 * DMA sync
180 		 */
181 		sync_from = tail;
182 
183 		/*
184 		 * Break the request up into chains that are less than or
185 		 * equal to the maximum number of WQEs that can be posted
186 		 * per doorbell ring
187 		 */
188 		chainlen   = (numremain > maxdb) ? maxdb : numremain;
189 		numremain -= chainlen;
190 		chainbegin = wrindx;
191 		for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
192 			/*
193 			 * Check for "queue full" condition.  If the queue
194 			 * is already full, then no more WQEs can be posted.
195 			 * So break out, ring a doorbell (if necessary) and
196 			 * return an error
197 			 */
198 			if (qp->qp_sq_wqhdr->wq_full != 0) {
199 				status = IBT_QP_FULL;
200 				TNF_PROBE_0_DEBUG(tavor_post_send_sqfull,
201 				    TAVOR_TNF_TRACE, "");
202 				break;
203 			}
204 
205 			/*
206 			 * Increment the "tail index" and check for "queue
207 			 * full" condition.  If we detect that the current
208 			 * work request is going to fill the work queue, then
209 			 * we mark this condition and continue.
210 			 */
211 			next_tail = (tail + 1) & qsize_msk;
212 			if (next_tail == head) {
213 				qp->qp_sq_wqhdr->wq_full = 1;
214 			}
215 
216 			/*
217 			 * Get the address of the location where the next
218 			 * Send WQE should be built
219 			 */
220 			desc = TAVOR_QP_SQ_ENTRY(qp, tail);
221 
222 			/*
223 			 * Call tavor_wqe_send_build() to build the WQE
224 			 * at the given address.  This routine uses the
225 			 * information in the ibt_send_wr_t list (wr[]) and
226 			 * returns the size of the WQE when it returns.
227 			 */
228 			status = tavor_wqe_send_build(state, qp,
229 			    &wr[wrindx], desc, &desc_sz);
230 			if (status != DDI_SUCCESS) {
231 				TNF_PROBE_0(tavor_post_send_bldwqe_fail,
232 				    TAVOR_TNF_ERROR, "");
233 				break;
234 			}
235 
236 			/*
237 			 * Add a WRID entry to the WRID list.  Need to
238 			 * calculate the "wqeaddrsz" and "signaled_dbd"
239 			 * values to pass to tavor_wrid_add_entry()
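			 * ("wqeaddrsz" packs the WQE's hardware address,
			 * i.e. its kernel virtual address less the
			 * "qp_desc_off" offset, together with its size
			 * into a single 32-bit value.)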
240 			 */
241 			wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
242 			    ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
243 			    desc_sz);
244 			if ((qp->qp_sq_sigtype == TAVOR_QP_SQ_ALL_SIGNALED) ||
245 			    (wr[wrindx].wr_flags & IBT_WR_SEND_SIGNAL)) {
246 				signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
247 			} else {
248 				signaled_dbd = 0;
249 			}
250 			tavor_wrid_add_entry(qp->qp_sq_wqhdr,
251 			    wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);
252 
253 			/*
254 			 * If this is not the first descriptor on the current
255 			 * chain, then link it to the previous WQE.  Otherwise,
256 			 * save the address and size of this descriptor (in
257 			 * "first" and "first_sz" respectively) and continue.
258 			 * Note: Linking a WQE to the previous one will
259 			 * depend on whether the two WQEs are from "special
260 			 * QPs" (i.e. MLX transport WQEs) or whether they are
261 			 * normal Send WQEs.
262 			 */
263 			if (currindx != 0) {
264 				if (qp->qp_is_special) {
265 					tavor_wqe_mlx_linknext(&wr[wrindx - 1],
266 					    desc, desc_sz, prev, NULL, qp);
267 				} else {
268 					tavor_wqe_send_linknext(&wr[wrindx],
269 					    &wr[wrindx - 1], desc, desc_sz,
270 					    prev, NULL, qp);
271 				}
272 				prev = desc;
273 			} else {
274 				first	 = desc;
275 				first_sz = desc_sz;
276 			}
277 
278 			/*
279 			 * Update the current "tail index" and increment
280 			 * "posted_cnt"
281 			 */
282 			tail = next_tail;
283 			posted_cnt++;
284 		}
285 
286 		/*
287 		 * If we reach here and there are one or more WQEs which have
288 		 * been successfully chained together, then we need to link
289 		 * the current chain to the previously executing chain of
290 		 * descriptors (if there is one) and ring the doorbell for the
291 		 * send work queue.
292 		 */
293 		if (currindx != 0) {
294 			/*
295 			 * Before we link the chain, we need to ensure that the
296 			 * "next" field on the last WQE is set to NULL (to
297 			 * indicate the end of the chain).  Note: Just as it
298 			 * did above, the format for the "next" fields in a
299 			 * given WQE depends on whether the WQE is MLX
300 			 * transport or not.
301 			 */
302 			if (qp->qp_is_special) {
303 				tavor_wqe_mlx_linknext(&wr[chainbegin +
304 				    currindx - 1], NULL, 0, prev, NULL, qp);
305 			} else {
306 				tavor_wqe_send_linknext(NULL,
307 				    &wr[chainbegin + currindx - 1], NULL, 0,
308 				    prev, NULL, qp);
309 			}
310 
311 			/* Save away updated "tail index" for the DMA sync */
312 			sync_to = tail;
313 
314 			/* Do a DMA sync for current send WQE(s) */
315 			tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_SEND,
316 			    DDI_DMA_SYNC_FORDEV);
317 
318 			/*
319 			 * Now link the chain to the old chain (if there was
320 			 * one).  Note: we still need to pay attention to whether
321 			 * the QP used MLX transport WQEs or not.
322 			 */
323 			if (qp->qp_is_special) {
324 				tavor_wqe_mlx_linknext(NULL, first, first_sz,
325 				    qp->qp_sq_lastwqeaddr, &dbinfo, qp);
326 			} else {
327 				tavor_wqe_send_linknext(&wr[chainbegin], NULL,
328 				    first, first_sz, qp->qp_sq_lastwqeaddr,
329 				    &dbinfo, qp);
330 			}
331 
332 			/*
333 			 * If there was a valid previous WQE (i.e. non-NULL),
334 			 * then sync it too.  This is because we have updated
335 			 * its "next" fields and we want to ensure that the
336 			 * hardware can see the changes.
337 			 */
338 			if (qp->qp_sq_lastwqeaddr != NULL) {
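				/*
				 * Back the sync range up by one entry (with
				 * wraparound) so that it covers the previous
				 * WQE whose "next" fields were just updated.
				 */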
339 				sync_to   = sync_from;
340 				sync_from = (sync_from - 1) & qsize_msk;
341 				tavor_wqe_sync(qp, sync_from, sync_to,
342 				    TAVOR_WR_SEND, DDI_DMA_SYNC_FORDEV);
343 			}
344 
345 			/*
346 			 * Now if the WRID tail entry is non-NULL, then this
347 			 * represents the entry to which we are chaining the
348 			 * new entries.  Since we are going to ring the
349 			 * doorbell for this WQE, we want to set its "dbd" bit.
350 			 *
351 			 * On the other hand, if the tail is NULL, even though
352 			 * we will have rung the doorbell for the previous WQE
353 			 * (for the hardware's sake) it is irrelevant to our
354 			 * purposes (for tracking WRIDs) because we know the
355 			 * request must have already completed.
356 			 */
357 			wre_last = wridlist->wl_wre_old_tail;
358 			if (wre_last != NULL) {
359 				wre_last->wr_signaled_dbd |=
360 				    TAVOR_WRID_ENTRY_DOORBELLED;
361 			}
362 
363 			/* Update some of the state in the QP */
364 			qp->qp_sq_lastwqeaddr	 = desc;
365 			qp->qp_sq_wqhdr->wq_tail = tail;
366 
367 			/* Ring the doorbell */
368 			tavor_qp_send_doorbell(state,
369 			    (uint32_t)((uintptr_t)first - qp->qp_desc_off),
370 			    first_sz, qp->qp_qpnum, dbinfo.db_fence,
371 			    dbinfo.db_nopcode);
372 		}
373 	}
374 
375 	/*
376 	 * Update the "num_posted" return value (if necessary).  Then drop
377 	 * the locks and return success.
378 	 */
379 	if (num_posted != NULL) {
380 		*num_posted = posted_cnt;
381 	}
382 
383 	mutex_exit(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
384 	mutex_exit(&qp->qp_lock);
385 
386 	TAVOR_TNF_EXIT(tavor_post_send);
387 	return (status);
388 }
389 
390 
391 /*
392  * tavor_post_recv()
393  *    Context: Can be called from interrupt or base context.
394  */
395 int
396 tavor_post_recv(tavor_state_t *state, tavor_qphdl_t qp,
397     ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
398 {
399 	tavor_wrid_list_hdr_t		*wridlist;
400 	tavor_wrid_entry_t		*wre_last;
401 	uint64_t			*desc, *prev, *first;
402 	uint32_t			desc_sz, first_sz;
403 	uint32_t			wqeaddrsz, signaled_dbd;
404 	uint32_t			head, tail, next_tail, qsize_msk;
405 	uint32_t			sync_from, sync_to;
406 	uint_t				currindx, wrindx, numremain;
407 	uint_t				chainlen, posted_cnt;
408 	uint_t				maxdb = TAVOR_QP_MAXDESC_PER_DB;
409 	int				status;
410 
411 	TAVOR_TNF_ENTER(tavor_post_recv);
412 
413 	/*
414 	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
415 	 * clients to post to QP memory that is accessible directly by the
416 	 * user.  If the QP memory is user accessible, then return an error.
417 	 */
418 	if (qp->qp_is_umap) {
419 		TNF_PROBE_0(tavor_post_recv_inv_usrmapped_type,
420 		    TAVOR_TNF_ERROR, "");
421 		TAVOR_TNF_EXIT(tavor_post_recv);
422 		return (IBT_QP_HDL_INVALID);
423 	}
424 
425 	/* Initialize posted_cnt */
426 	posted_cnt = 0;
427 
428 	mutex_enter(&qp->qp_lock);
429 
430 	/*
431 	 * Check if QP is associated with an SRQ
432 	 */
433 	if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
434 		mutex_exit(&qp->qp_lock);
435 		TNF_PROBE_0(tavor_post_recv_fail_qp_on_srq,
436 		    TAVOR_TNF_ERROR, "");
437 		TAVOR_TNF_EXIT(tavor_post_recv);
438 		return (IBT_SRQ_IN_USE);
439 	}
440 
441 	/*
442 	 * Check QP state.  Can not post Recv requests from the "Reset" state
443 	 */
444 	if (qp->qp_state == TAVOR_QP_RESET) {
445 		mutex_exit(&qp->qp_lock);
446 		TNF_PROBE_0(tavor_post_recv_inv_qpstate_fail,
447 		    TAVOR_TNF_ERROR, "");
448 		TAVOR_TNF_EXIT(tavor_post_recv);
449 		return (IBT_QP_STATE_INVALID);
450 	}
451 
452 	/* Grab the lock for the WRID list */
453 	mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
454 	wridlist  = qp->qp_rq_wqhdr->wq_wrid_post;
455 
456 	/* Save away some initial QP state */
457 	qsize_msk = qp->qp_rq_wqhdr->wq_size - 1;
458 	tail	  = qp->qp_rq_wqhdr->wq_tail;
459 	head	  = qp->qp_rq_wqhdr->wq_head;
460 
461 	/*
462 	 * For each ibt_recv_wr_t in the wr[] list passed in, parse the
463 	 * request and build a Recv WQE.  Note:  Because we are potentially
464 	 * building a chain of WQEs, we want to link them all together.
465 	 * However, we do not want to link the first one to the previous
466 	 * WQE until the entire chain has been linked.  Then in the last
467 	 * step we ring the appropriate doorbell.  Note:  It is possible for
468 	 * more Work Requests to be posted than the HW will support in one
469 	 * shot.  If this happens, we need to be able to post and ring
470 	 * several chains here until the entire request is complete.
471 	 */
472 	wrindx = 0;
473 	numremain = num_wr;
474 	status	  = DDI_SUCCESS;
475 	while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
476 		/*
477 		 * For the first WQE on a new chain we need "prev" to point
478 		 * to the current descriptor.  As we begin to process
479 		 * further, "prev" will be updated to point to the previous
480 		 * WQE on the current chain (see below).
481 		 */
482 		prev = TAVOR_QP_RQ_ENTRY(qp, tail);
483 
484 		/*
485 		 * Before we begin, save the current "tail index" for later
486 		 * DMA sync
487 		 */
488 		sync_from = tail;
489 
490 		/*
491 		 * Break the request up into chains that are less than or
492 		 * equal to the maximum number of WQEs that can be posted
493 		 * per doorbell ring
494 		 */
495 		chainlen = (numremain > maxdb) ? maxdb : numremain;
496 		numremain -= chainlen;
497 		for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
498 			/*
499 			 * Check for "queue full" condition.  If the queue
500 			 * is already full, then no more WQEs can be posted.
501 			 * So break out, ring a doorbell (if necessary) and
502 			 * return an error
503 			 */
504 			if (qp->qp_rq_wqhdr->wq_full != 0) {
505 				status = IBT_QP_FULL;
506 				TNF_PROBE_0_DEBUG(tavor_post_recv_rqfull,
507 				    TAVOR_TNF_TRACE, "");
508 				break;
509 			}
510 
511 			/*
512 			 * Increment the "tail index" and check for "queue
513 			 * full" condition.  If we detect that the current
514 			 * work request is going to fill the work queue, then
515 			 * we mark this condition and continue.
516 			 */
517 			next_tail = (tail + 1) & qsize_msk;
518 			if (next_tail == head) {
519 				qp->qp_rq_wqhdr->wq_full = 1;
520 			}
521 
522 			/*
523 			 * Get the address of the location where the next
524 			 * Recv WQE should be built
525 			 */
526 			desc = TAVOR_QP_RQ_ENTRY(qp, tail);
527 
528 			/*
529 			 * Call tavor_wqe_recv_build() to build the WQE
530 			 * at the given address.  This routine uses the
531 			 * information in the ibt_recv_wr_t list (wr[]) and
532 			 * returns the size of the WQE when it returns.
533 			 */
534 			status = tavor_wqe_recv_build(state, qp, &wr[wrindx],
535 			    desc, &desc_sz);
536 			if (status != DDI_SUCCESS) {
537 				TNF_PROBE_0(tavor_post_recv_bldwqe_fail,
538 				    TAVOR_TNF_ERROR, "");
539 				break;
540 			}
541 
542 			/*
543 			 * Add a WRID entry to the WRID list.  Need to
544 			 * calculate the "wqeaddrsz" and "signaled_dbd"
545 			 * values to pass to tavor_wrid_add_entry().  Note:
546 			 * all Recv WQEs are essentially "signaled"
547 			 */
548 			wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
549 			    ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
550 			    desc_sz);
551 			signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
552 			tavor_wrid_add_entry(qp->qp_rq_wqhdr,
553 			    wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);
554 
555 			/*
556 			 * If this is not the first descriptor on the current
557 			 * chain, then link it to the previous WQE.  Otherwise,
558 			 * save the address and size of this descriptor (in
559 			 * "first" and "first_sz" respectively) and continue.
560 			 */
561 			if (currindx != 0) {
562 				tavor_wqe_recv_linknext(desc, desc_sz, prev,
563 				    qp);
564 				prev = desc;
565 			} else {
566 				first	 = desc;
567 				first_sz = desc_sz;
568 			}
569 
570 			/*
571 			 * Update the current "tail index" and increment
572 			 * "posted_cnt"
573 			 */
574 			tail = next_tail;
575 			posted_cnt++;
576 		}
577 
578 		/*
579 		 * If we reach here and there are one or more WQEs which have
580 		 * been successfully chained together, then we need to link
581 		 * the current chain to the previously executing chain of
582 		 * descriptors (if there is one) and ring the doorbell for the
583 		 * recv work queue.
584 		 */
585 		if (currindx != 0) {
586 			/*
587 			 * Before we link the chain, we need to ensure that the
588 			 * "next" field on the last WQE is set to NULL (to
589 			 * indicate the end of the chain).
590 			 */
591 			tavor_wqe_recv_linknext(NULL, 0, prev, qp);
592 
593 			/* Save away updated "tail index" for the DMA sync */
594 			sync_to = tail;
595 
596 			/* Do a DMA sync for current recv WQE(s) */
597 			tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_RECV,
598 			    DDI_DMA_SYNC_FORDEV);
599 
600 			/*
601 			 * Now link the chain to the old chain (if there was
602 			 * one).
603 			 */
604 			tavor_wqe_recv_linknext(first, first_sz,
605 			    qp->qp_rq_lastwqeaddr, qp);
606 
607 			/*
608 			 * If there was a valid previous WQE (i.e. non-NULL),
609 			 * then sync it too.  This is because we have updated
610 			 * its "next" fields and we want to ensure that the
611 			 * hardware can see the changes.
612 			 */
613 			if (qp->qp_rq_lastwqeaddr != NULL) {
614 				sync_to	  = sync_from;
615 				sync_from = (sync_from - 1) & qsize_msk;
616 				tavor_wqe_sync(qp, sync_from, sync_to,
617 				    TAVOR_WR_RECV, DDI_DMA_SYNC_FORDEV);
618 			}
619 
620 			/*
621 			 * Now if the WRID tail entry is non-NULL, then this
622 			 * represents the entry to which we are chaining the
623 			 * new entries.  Since we are going to ring the
624 			 * doorbell for this WQE, we want to set its "dbd" bit.
625 			 *
626 			 * On the other hand, if the tail is NULL, even though
627 			 * we will have rung the doorbell for the previous WQE
628 			 * (for the hardware's sake) it is irrelevant to our
629 			 * purposes (for tracking WRIDs) because we know the
630 			 * request must have already completed.
631 			 */
632 			wre_last = wridlist->wl_wre_old_tail;
633 			if (wre_last != NULL) {
634 				wre_last->wr_signaled_dbd |=
635 				    TAVOR_WRID_ENTRY_DOORBELLED;
636 			}
637 
638 			/* Update some of the state in the QP */
639 			qp->qp_rq_lastwqeaddr	 = desc;
640 			qp->qp_rq_wqhdr->wq_tail = tail;
641 
642 			/* Ring the doorbell */
643 			tavor_qp_recv_doorbell(state,
644 			    (uint32_t)((uintptr_t)first - qp->qp_desc_off),
645 			    first_sz, qp->qp_qpnum, (chainlen % maxdb));
646 		}
647 	}
648 
649 	/*
650 	 * Update the "num_posted" return value (if necessary).  Then drop
651 	 * the locks and return success.
652 	 */
653 	if (num_posted != NULL) {
654 		*num_posted = posted_cnt;
655 	}
656 
657 	mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
658 	mutex_exit(&qp->qp_lock);
659 
660 	TAVOR_TNF_EXIT(tavor_post_recv);
661 	return (status);
662 }
663 
664 /*
665  * tavor_post_srq()
666  *    Context: Can be called from interrupt or base context.
667  */
668 int
669 tavor_post_srq(tavor_state_t *state, tavor_srqhdl_t srq,
670     ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
671 {
672 	uint64_t			*desc, *prev, *first, *last_wqe_addr;
673 	uint32_t			signaled_dbd;
674 	uint32_t			sync_indx;
675 	uint_t				currindx, wrindx, numremain;
676 	uint_t				chainlen, posted_cnt;
677 	uint_t				maxdb = TAVOR_QP_MAXDESC_PER_DB;
678 	int				status;
679 
680 	TAVOR_TNF_ENTER(tavor_post_srq);
681 
682 	/*
683 	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
684 	 * clients to post to QP memory that is accessible directly by the
685 	 * user.  If the QP memory is user accessible, then return an error.
686 	 */
687 	if (srq->srq_is_umap) {
688 		TNF_PROBE_0(tavor_post_srq_inv_usrmapped_type,
689 		    TAVOR_TNF_ERROR, "");
690 		TAVOR_TNF_EXIT(tavor_post_srq);
691 		return (IBT_SRQ_HDL_INVALID);
692 	}
693 
694 	/* Initialize posted_cnt */
695 	posted_cnt = 0;
696 
697 	mutex_enter(&srq->srq_lock);
698 
699 	/*
700 	 * Check SRQ state.  Can not post Recv requests when SRQ is in error
701 	 */
702 	if (srq->srq_state == TAVOR_SRQ_STATE_ERROR) {
703 		mutex_exit(&srq->srq_lock);
704 		TNF_PROBE_0(tavor_post_srq_inv_srqstate_fail,
705 		    TAVOR_TNF_ERROR, "");
706 		TAVOR_TNF_EXIT(tavor_post_srq);
707 		return (IBT_QP_STATE_INVALID);
708 	}
709 
710 	/* Grab the lock for the WRID list */
711 	mutex_enter(&srq->srq_wrid_wql->wql_lock);
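	/*
	 * Note: unlike the QP work queues above, the SRQ manages its WQEs
	 * on a free list.  A "wl_free_list_indx" of -1 (checked below)
	 * indicates that no free WQEs remain.
	 */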
712 
713 	/*
714 	 * For each ibt_recv_wr_t in the wr[] list passed in, parse the
715 	 * request and build a Recv WQE.  Note:  Because we are potentially
716 	 * building a chain of WQEs, we want to link them all together.
717 	 * However, we do not want to link the first one to the previous
718 	 * WQE until the entire chain has been linked.  Then in the last
719 	 * step we ring the appropriate doorbell.  Note:  It is possible for
720 	 * more Work Requests to be posted than the HW will support in one
721 	 * shot.  If this happens, we need to be able to post and ring
722 	 * several chains here until the entire request is complete.
723 	 */
724 	wrindx = 0;
725 	numremain = num_wr;
726 	status	  = DDI_SUCCESS;
727 	while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
728 		/*
729 		 * For the first WQE on a new chain we need "prev" to point
730 		 * to the current descriptor.  As we begin to process
731 		 * further, "prev" will be updated to point to the previous
732 		 * WQE on the current chain (see below).
733 		 */
734 		if (srq->srq_wq_lastwqeindx == -1) {
735 			prev = NULL;
736 		} else {
737 			prev = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wq_lastwqeindx);
738 		}
739 
740 		/*
741 		 * Break the request up into chains that are less than or
742 		 * equal to the maximum number of WQEs that can be posted
743 		 * per doorbell ring
744 		 */
745 		chainlen = (numremain > maxdb) ? maxdb : numremain;
746 		numremain -= chainlen;
747 		for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
748 
749 			/*
750 			 * Check for "queue full" condition.  If the queue
751 			 * is already full, then no more WQEs can be posted.
752 			 * So break out, ring a doorbell (if necessary) and
753 			 * return an error
754 			 */
755 			if (srq->srq_wridlist->wl_free_list_indx == -1) {
756 				status = IBT_QP_FULL;
757 				TNF_PROBE_0_DEBUG(tavor_post_srq_wqfull,
758 				    TAVOR_TNF_TRACE, "");
759 				break;
760 			}
761 
762 			/*
763 			 * Get the address of the location where the next
764 			 * Recv WQE should be built
765 			 */
766 			desc = TAVOR_SRQ_WQE_ADDR(srq,
767 			    srq->srq_wridlist->wl_free_list_indx);
768 
769 			/*
770 			 * Add a WRID entry to the WRID list.  Need to
771 			 * set the "signaled_dbd" values to pass to
772 			 * tavor_wrid_add_entry().  Note: all Recv WQEs are
773 			 * essentially "signaled"
774 			 *
775 			 * The 'size' is stored at srq_alloc time, in the
776 			 * srq_wq_stride.  This is a constant value required
777 			 * for SRQ.
778 			 */
779 			signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
780 			tavor_wrid_add_entry_srq(srq, wr[wrindx].wr_id,
781 			    signaled_dbd);
782 
783 			/*
784 			 * Call tavor_wqe_srq_build() to build the WQE
785 			 * at the given address.  This routine uses the
786 			 * information in the ibt_recv_wr_t list (wr[]) and
787 			 * returns the size of the WQE when it returns.
788 			 */
789 			status = tavor_wqe_srq_build(state, srq, &wr[wrindx],
790 			    desc);
791 			if (status != DDI_SUCCESS) {
792 				TNF_PROBE_0(tavor_post_recv_bldwqe_fail,
793 				    TAVOR_TNF_ERROR, "");
794 				break;
795 			}
796 
797 			/*
798 			 * If this is not the first descriptor on the current
799 			 * chain, then link it to the previous WQE.  Otherwise,
800 			 * save the address of this descriptor (in "first") and
801 			 * continue.
802 			 */
803 			if (currindx != 0) {
804 				tavor_wqe_srq_linknext(desc, prev, srq);
805 				sync_indx = TAVOR_SRQ_WQE_INDEX(
806 				    srq->srq_wq_buf, prev,
807 				    srq->srq_wq_log_wqesz);
808 
809 				/* Do a DMA sync for previous recv WQE */
810 				tavor_wqe_sync(srq, sync_indx, sync_indx+1,
811 				    TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
812 
813 				prev = desc;
814 			} else {
815 
816 				/*
817 				 * In this case, the last WQE on the chain is
818 				 * also considered 'first'.  So set prev to
819 				 * first, here.
820 				 */
821 				first = prev = desc;
822 			}
823 
824 			/*
825 			 * Increment "posted_cnt"
826 			 */
827 			posted_cnt++;
828 		}
829 
830 		/*
831 		 * If we reach here and there are one or more WQEs which have
832 		 * been successfully chained together, then we need to link
833 		 * the current chain to the previously executing chain of
834 	 * descriptors (if there is one) and ring the doorbell for the
835 		 * recv work queue.
836 		 */
837 		if (currindx != 0) {
838 			/*
839 			 * Before we link the chain, we need to ensure that the
840 			 * "next" field on the last WQE is set to NULL (to
841 			 * indicate the end of the chain).
842 			 */
843 			tavor_wqe_srq_linknext(NULL, prev, srq);
844 
845 			sync_indx = TAVOR_SRQ_WQE_INDEX(srq->srq_wq_buf, prev,
846 			    srq->srq_wq_log_wqesz);
847 
848 			/* Do a DMA sync for current recv WQE */
849 			tavor_wqe_sync(srq, sync_indx, sync_indx+1,
850 			    TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
851 
852 			/*
853 			 * Now link the chain to the old chain (if there was
854 			 * one).
855 			 */
856 			if (srq->srq_wq_lastwqeindx == -1) {
857 				last_wqe_addr = NULL;
858 			} else {
859 				last_wqe_addr = TAVOR_SRQ_WQE_ADDR(srq,
860 				    srq->srq_wq_lastwqeindx);
861 			}
862 			tavor_wqe_srq_linknext(first, last_wqe_addr, srq);
863 
864 			/*
865 			 * If there was a valid previous WQE (i.e. valid index),
866 			 * then sync it too.  This is because we have updated
867 			 * its "next" fields and we want to ensure that the
868 			 * hardware can see the changes.
869 			 */
870 			if (srq->srq_wq_lastwqeindx != -1) {
871 				sync_indx = srq->srq_wq_lastwqeindx;
872 				tavor_wqe_sync(srq, sync_indx, sync_indx+1,
873 				    TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
874 			}
875 
876 			/* Update some of the state in the SRQ */
877 			srq->srq_wq_lastwqeindx = TAVOR_SRQ_WQE_INDEX(
878 			    srq->srq_wq_buf, desc,
879 			    srq->srq_wq_log_wqesz);
880 
881 			/* Ring the doorbell */
882 			/* SRQ needs NDS of 0 */
883 			tavor_qp_recv_doorbell(state,
884 			    (uint32_t)((uintptr_t)first - srq->srq_desc_off),
885 			    0, srq->srq_srqnum, (chainlen % maxdb));
886 		}
887 	}
888 
889 	/*
890 	 * Update the "num_posted" return value (if necessary).  Then drop
891 	 * the locks and return success.
892 	 */
893 	if (num_posted != NULL) {
894 		*num_posted = posted_cnt;
895 	}
896 
897 	mutex_exit(&srq->srq_wrid_wql->wql_lock);
898 	mutex_exit(&srq->srq_lock);
899 
900 	TAVOR_TNF_EXIT(tavor_post_srq);
901 	return (status);
902 }
903 
904 
905 /*
906  * tavor_qp_send_doorbell()
907  *    Context: Can be called from interrupt or base context.
908  */
909 static void
910 tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
911     uint32_t qpn, uint32_t fence, uint32_t nopcode)
912 {
913 	uint64_t	doorbell = 0;
914 
915 	/* Build the doorbell from the parameters */
916 	doorbell = (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) <<
917 	    TAVOR_QPSNDDB_NDA_SHIFT) |
918 	    ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) |
919 	    ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) |
920 	    ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds;
921 
922 	TNF_PROBE_1_DEBUG(tavor_qp_send_doorbell, TAVOR_TNF_TRACE, "",
923 	    tnf_ulong, doorbell, doorbell);
924 
925 	/* Write the doorbell to UAR */
926 	TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->send,
927 	    doorbell);
928 }
929 
930 
931 /*
932  * tavor_qp_recv_doorbell()
933  *    Context: Can be called from interrupt or base context.
934  */
935 static void
936 tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
937     uint32_t qpn, uint32_t credits)
938 {
939 	uint64_t	doorbell = 0;
940 
941 	/* Build the doorbell from the parameters */
942 	doorbell = (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) <<
943 	    TAVOR_QPRCVDB_NDA_SHIFT) |
944 	    ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) |
945 	    ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits;
946 
947 	TNF_PROBE_1_DEBUG(tavor_qp_recv_doorbell, TAVOR_TNF_TRACE, "",
948 	    tnf_ulong, doorbell, doorbell);
949 
950 	/* Write the doorbell to UAR */
951 	TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->recv,
952 	    doorbell);
953 }
954 
955 
956 /*
957  * tavor_wqe_send_build()
958  *    Context: Can be called from interrupt or base context.
959  */
960 static int
961 tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
962     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
963 {
964 	tavor_hw_snd_wqe_ud_t		*ud;
965 	tavor_hw_snd_wqe_remaddr_t	*rc;
966 	tavor_hw_snd_wqe_atomic_t	*at;
967 	tavor_hw_snd_wqe_remaddr_t	*uc;
968 	tavor_hw_snd_wqe_bind_t		*bn;
969 	tavor_hw_wqe_sgl_t		*ds;
970 	ibt_wr_ds_t			*sgl;
971 	tavor_ahhdl_t			ah;
972 	uint32_t			nds;
973 	int				i, num_ds, status;
974 
975 	TAVOR_TNF_ENTER(tavor_wqe_send_build);
976 
977 	ASSERT(MUTEX_HELD(&qp->qp_lock));
978 
979 	/* Initialize the information for the Data Segments */
980 	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
981 	    sizeof (tavor_hw_snd_wqe_nextctrl_t));
982 	nds = wr->wr_nds;
983 	sgl = wr->wr_sgl;
984 	num_ds = 0;
985 
986 	/*
987 	 * Building a Send WQE depends first and foremost on the transport
988 	 * type of the Work Request (i.e. UD, RC, or UC)
989 	 */
990 	switch (wr->wr_trans) {
991 	case IBT_UD_SRV:
992 		/* Ensure that work request transport type matches QP type */
993 		if (qp->qp_serv_type != TAVOR_QP_UD) {
994 			TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail,
995 			    TAVOR_TNF_ERROR, "");
996 			TAVOR_TNF_EXIT(tavor_wqe_send_build);
997 			return (IBT_QP_SRV_TYPE_INVALID);
998 		}
999 
1000 		/*
1001 		 * Validate the operation type.  For UD requests, only the
1002 		 * "Send" operation is valid
1003 		 */
1004 		if (wr->wr_opcode != IBT_WRC_SEND) {
1005 			TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail,
1006 			    TAVOR_TNF_ERROR, "");
1007 			TAVOR_TNF_EXIT(tavor_wqe_send_build);
1008 			return (IBT_QP_OP_TYPE_INVALID);
1009 		}
1010 
1011 		/*
1012 		 * If this is a Special QP (QP0 or QP1), then we need to
1013 		 * build MLX WQEs instead.  So jump to tavor_wqe_mlx_build()
1014 		 * and return whatever status it returns
1015 		 */
1016 		if (qp->qp_is_special) {
1017 			status = tavor_wqe_mlx_build(state, qp, wr, desc, size);
1018 			TAVOR_TNF_EXIT(tavor_wqe_send_build);
1019 			return (status);
1020 		}
1021 
1022 		/*
1023 		 * Otherwise, if this is a normal UD Send request, then fill
1024 		 * all the fields in the Tavor UD header for the WQE.  Note:
1025 		 * to do this we'll need to extract some information from the
1026 		 * Address Handle passed with the work request.
1027 		 */
1028 		ud = (tavor_hw_snd_wqe_ud_t *)((uintptr_t)desc +
1029 		    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1030 		ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
1031 		if (ah == NULL) {
1032 			TNF_PROBE_0(tavor_wqe_send_build_invahhdl_fail,
1033 			    TAVOR_TNF_ERROR, "");
1034 			TAVOR_TNF_EXIT(tavor_wqe_send_build);
1035 			return (IBT_AH_HDL_INVALID);
1036 		}
1037 
1038 		/*
1039 		 * Build the Unreliable Datagram Segment for the WQE, using
1040 		 * the information from the address handle and the work
1041 		 * request.
1042 		 */
1043 		mutex_enter(&ah->ah_lock);
1044 		TAVOR_WQE_BUILD_UD(qp, ud, ah, wr);
1045 		mutex_exit(&ah->ah_lock);
1046 
1047 		/* Update "ds" for filling in Data Segments (below) */
1048 		ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)ud +
1049 		    sizeof (tavor_hw_snd_wqe_ud_t));
1050 		break;
1051 
1052 	case IBT_RC_SRV:
1053 		/* Ensure that work request transport type matches QP type */
1054 		if (qp->qp_serv_type != TAVOR_QP_RC) {
1055 			TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail,
1056 			    TAVOR_TNF_ERROR, "");
1057 			TAVOR_TNF_EXIT(tavor_wqe_send_build);
1058 			return (IBT_QP_SRV_TYPE_INVALID);
1059 		}
1060 
1061 		/*
1062 		 * Validate the operation type.  For RC requests, we allow
1063 		 * "Send", "RDMA Read", "RDMA Write", various "Atomic"
1064 		 * operations, and memory window "Bind"
1065 		 */
1066 		if ((wr->wr_opcode != IBT_WRC_SEND) &&
1067 		    (wr->wr_opcode != IBT_WRC_RDMAR) &&
1068 		    (wr->wr_opcode != IBT_WRC_RDMAW) &&
1069 		    (wr->wr_opcode != IBT_WRC_CSWAP) &&
1070 		    (wr->wr_opcode != IBT_WRC_FADD) &&
1071 		    (wr->wr_opcode != IBT_WRC_BIND)) {
1072 			TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail,
1073 			    TAVOR_TNF_ERROR, "");
1074 			TAVOR_TNF_EXIT(tavor_wqe_send_build);
1075 			return (IBT_QP_OP_TYPE_INVALID);
1076 		}
1077 
1078 		/*
1079 		 * If this is a Send request, then all we need to do is break
1080 		 * out here and begin the Data Segment processing below
1081 		 */
1082 		if (wr->wr_opcode == IBT_WRC_SEND) {
1083 			break;
1084 		}
1085 
1086 		/*
1087 		 * If this is an RDMA Read or RDMA Write request, then fill
1088 		 * in the "Remote Address" header fields.
1089 		 */
1090 		if ((wr->wr_opcode == IBT_WRC_RDMAR) ||
1091 		    (wr->wr_opcode == IBT_WRC_RDMAW)) {
1092 			rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1093 			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1094 
1095 			/*
1096 			 * Build the Remote Address Segment for the WQE, using
1097 			 * the information from the RC work request.
1098 			 */
1099 			TAVOR_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
1100 
1101 			/* Update "ds" for filling in Data Segments (below) */
1102 			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc +
1103 			    sizeof (tavor_hw_snd_wqe_remaddr_t));
1104 			break;
1105 		}
1106 
1107 		/*
1108 		 * If this is one of the Atomic type operations (i.e.
1109 		 * Compare-Swap or Fetch-Add), then fill in both the "Remote
1110 		 * Address" header fields and the "Atomic" header fields.
1111 		 */
1112 		if ((wr->wr_opcode == IBT_WRC_CSWAP) ||
1113 		    (wr->wr_opcode == IBT_WRC_FADD)) {
1114 			rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1115 			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1116 			at = (tavor_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
1117 			    sizeof (tavor_hw_snd_wqe_remaddr_t));
1118 
1119 			/*
1120 			 * Build the Remote Address and Atomic Segments for
1121 			 * the WQE, using the information from the RC Atomic
1122 			 * work request.
1123 			 */
1124 			TAVOR_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
1125 			TAVOR_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
1126 
1127 			/* Update "ds" for filling in Data Segments (below) */
1128 			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)at +
1129 			    sizeof (tavor_hw_snd_wqe_atomic_t));
1130 
1131 			/*
1132 			 * Update "nds" and "sgl" because Atomic requests have
1133 			 * only a single Data Segment (and they are encoded
1134 			 * somewhat differently in the work request).
1135 			 */
1136 			nds = 1;
1137 			sgl = wr->wr_sgl;
1138 			break;
1139 		}
1140 
1141 		/*
1142 		 * If this is memory window Bind operation, then we call the
1143 		 * tavor_wr_bind_check() routine to validate the request and
1144 		 * to generate the updated RKey.  If this is successful, then
1145 		 * we fill in the WQE's "Bind" header fields.
1146 		 */
1147 		if (wr->wr_opcode == IBT_WRC_BIND) {
1148 			status = tavor_wr_bind_check(state, wr);
1149 			if (status != DDI_SUCCESS) {
1150 				TNF_PROBE_0(tavor_wqe_send_build_bind_fail,
1151 				    TAVOR_TNF_ERROR, "");
1152 				TAVOR_TNF_EXIT(tavor_wqe_send_build);
1153 				return (status);
1154 			}
1155 
1156 			bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1157 			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1158 
1159 			/*
1160 			 * Build the Bind Memory Window Segments for the WQE,
1161 			 * using the information from the RC Bind memory
1162 			 * window work request.
1163 			 */
1164 			TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
1165 
1166 			/*
1167 			 * Update the "ds" pointer.  Even though the "bind"
1168 			 * operation requires no SGLs, this is necessary to
1169 			 * facilitate the correct descriptor size calculations
1170 			 * (below).
1171 			 */
1172 			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
1173 			    sizeof (tavor_hw_snd_wqe_bind_t));
1174 			nds = 0;
1175 		}
1176 		break;
1177 
1178 	case IBT_UC_SRV:
1179 		/* Ensure that work request transport type matches QP type */
1180 		if (qp->qp_serv_type != TAVOR_QP_UC) {
1181 			TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail,
1182 			    TAVOR_TNF_ERROR, "");
1183 			TAVOR_TNF_EXIT(tavor_wqe_send_build);
1184 			return (IBT_QP_SRV_TYPE_INVALID);
1185 		}
1186 
1187 		/*
1188 		 * Validate the operation type.  For UC requests, we only
1189 		 * allow "Send", "RDMA Write", and memory window "Bind".
1190 		 * Note: Unlike RC, UC does not allow "RDMA Read" or "Atomic"
1191 		 * operations
1192 		 */
1193 		if ((wr->wr_opcode != IBT_WRC_SEND) &&
1194 		    (wr->wr_opcode != IBT_WRC_RDMAW) &&
1195 		    (wr->wr_opcode != IBT_WRC_BIND)) {
1196 			TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail,
1197 			    TAVOR_TNF_ERROR, "");
1198 			TAVOR_TNF_EXIT(tavor_wqe_send_build);
1199 			return (IBT_QP_OP_TYPE_INVALID);
1200 		}
1201 
1202 		/*
1203 		 * If this is a Send request, then all we need to do is break
1204 		 * out here and begin the Data Segment processing below
1205 		 */
1206 		if (wr->wr_opcode == IBT_WRC_SEND) {
1207 			break;
1208 		}
1209 
1210 		/*
1211 		 * If this is an RDMA Write request, then fill in the "Remote
1212 		 * Address" header fields.
1213 		 */
1214 		if (wr->wr_opcode == IBT_WRC_RDMAW) {
1215 			uc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1216 			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1217 
1218 			/*
1219 			 * Build the Remote Address Segment for the WQE, using
1220 			 * the information from the UC work request.
1221 			 */
1222 			TAVOR_WQE_BUILD_REMADDR(qp, uc, &wr->wr.uc.ucwr.rdma);
1223 
1224 			/* Update "ds" for filling in Data Segments (below) */
1225 			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)uc +
1226 			    sizeof (tavor_hw_snd_wqe_remaddr_t));
1227 			break;
1228 		}
1229 
1230 		/*
1231 		 * If this is memory window Bind operation, then we call the
1232 		 * tavor_wr_bind_check() routine to validate the request and
1233 		 * to generate the updated RKey.  If this is successful, then
1234 		 * we fill in the WQE's "Bind" header fields.
1235 		 */
1236 		if (wr->wr_opcode == IBT_WRC_BIND) {
1237 			status = tavor_wr_bind_check(state, wr);
1238 			if (status != DDI_SUCCESS) {
1239 				TNF_PROBE_0(tavor_wqe_send_build_bind_fail,
1240 				    TAVOR_TNF_ERROR, "");
1241 				TAVOR_TNF_EXIT(tavor_wqe_send_build);
1242 				return (status);
1243 			}
1244 
1245 			bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1246 			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1247 
1248 			/*
1249 			 * Build the Bind Memory Window Segments for the WQE,
1250 			 * using the information from the UC Bind memory
1251 			 * window work request.
1252 			 */
1253 			TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.uc.ucwr.bind);
1254 
1255 			/*
1256 			 * Update the "ds" pointer.  Even though the "bind"
1257 			 * operation requires no SGLs, this is necessary to
1258 			 * facilitate the correct descriptor size calculations
1259 			 * (below).
1260 			 */
1261 			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
1262 			    sizeof (tavor_hw_snd_wqe_bind_t));
1263 			nds = 0;
1264 		}
1265 		break;
1266 
1267 	default:
1268 		TNF_PROBE_0(tavor_wqe_send_build_inv_tranport_fail,
1269 		    TAVOR_TNF_ERROR, "");
1270 		TAVOR_TNF_EXIT(tavor_wqe_send_build);
1271 		return (IBT_QP_SRV_TYPE_INVALID);
1272 	}
1273 
1274 	/*
1275 	 * Now fill in the Data Segments (SGL) for the Send WQE based on
1276 	 * the values set up above (i.e. "sgl", "nds", and the "ds" pointer).
1277 	 * Start by checking for a valid number of SGL entries
1278 	 */
1279 	if (nds > qp->qp_sq_sgl) {
1280 		TNF_PROBE_0(tavor_wqe_send_build_toomanysgl_fail,
1281 		    TAVOR_TNF_ERROR, "");
1282 		TAVOR_TNF_EXIT(tavor_wqe_send_build);
1283 		return (IBT_QP_SGL_LEN_INVALID);
1284 	}
1285 
1286 	/*
1287 	 * For each SGL in the Send Work Request, fill in the Send WQE's data
1288 	 * segments.  Note: We skip any SGL with zero size because Tavor
1289 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1290 	 * the encoding for zero means a 2GB transfer.  Because of this special
1291 	 * encoding in the hardware, we mask the requested length with
1292 	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1293 	 * zero.)
1294 	 */
1295 	for (i = 0; i < nds; i++) {
1296 		if (sgl[i].ds_len == 0) {
1297 			continue;
1298 		}
1299 
1300 		/*
1301 		 * Fill in the Data Segment(s) for the current WQE, using the
1302 		 * information contained in the scatter-gather list of the
1303 		 * work request.
1304 		 */
1305 		TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);
1306 		num_ds++;
1307 	}
1308 
1309 	/* Return the size of descriptor (in 16-byte chunks) */
1310 	*size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 4;
1311 
1312 	TAVOR_TNF_EXIT(tavor_wqe_send_build);
1313 	return (DDI_SUCCESS);
1314 }
1315 
1316 
1317 /*
1318  * tavor_wqe_send_linknext()
1319  *    Context: Can be called from interrupt or base context.
1320  */
1321 static void
1322 tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr, ibt_send_wr_t *prev_wr,
1323     uint64_t *curr_desc, uint_t curr_descsz, uint64_t *prev_desc,
1324     tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp)
1325 {
1326 	uint64_t	next, ctrl;
1327 	uint32_t	nopcode, fence;
1328 
1329 	/*
1330 	 * Calculate the "next" field of the descriptor.  This amounts to
1331 	 * setting up the "next_wqe_addr", "nopcode", "fence", and "nds"
1332 	 * fields (see tavor_hw.h for more).  Note:  If there is no next
1333 	 * descriptor (i.e. if the current descriptor is the last WQE on
1334 	 * the chain), then set "next" to zero.
1335 	 */
1336 	if (curr_desc != NULL) {
1337 		/*
1338 		 * Determine the value for the Tavor WQE "nopcode" field
1339 		 * by using the IBTF opcode from the work request
1340 		 */
1341 		switch (curr_wr->wr_opcode) {
1342 		case IBT_WRC_RDMAW:
1343 			if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
1344 				nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAWI;
1345 			} else {
1346 				nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAW;
1347 			}
1348 			break;
1349 
1350 		case IBT_WRC_SEND:
1351 			if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
1352 				nopcode = TAVOR_WQE_SEND_NOPCODE_SENDI;
1353 			} else {
1354 				nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
1355 			}
1356 			break;
1357 
1358 		case IBT_WRC_RDMAR:
1359 			nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAR;
1360 			break;
1361 
1362 		case IBT_WRC_CSWAP:
1363 			nopcode = TAVOR_WQE_SEND_NOPCODE_ATMCS;
1364 			break;
1365 
1366 		case IBT_WRC_FADD:
1367 			nopcode = TAVOR_WQE_SEND_NOPCODE_ATMFA;
1368 			break;
1369 
1370 		case IBT_WRC_BIND:
1371 			nopcode = TAVOR_WQE_SEND_NOPCODE_BIND;
1372 			break;
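		/*
		 * Note: no "default" case is necessary here because the
		 * opcode was already validated in tavor_wqe_send_build(),
		 * so "nopcode" is always set by one of the cases above.
		 */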
1373 		}
1374 
1375 		curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc
1376 		    - qp->qp_desc_off);
1377 		next  = ((uint64_t)(uintptr_t)curr_desc &
1378 		    TAVOR_WQE_NDA_MASK) << 32;
1379 		next  = next | ((uint64_t)nopcode << 32);
1380 		fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
1381 		if (fence) {
1382 			next = next | TAVOR_WQE_SEND_FENCE_MASK;
1383 		}
1384 		next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);
1385 
1386 		/*
1387 		 * If a send queue doorbell will be rung for the next
1388 		 * WQE on the chain, then set the current WQE's "dbd" bit.
1389 		 * Note: We also update the "dbinfo" structure here to pass
1390 		 * back information about what should (later) be included
1391 		 * in the send queue doorbell.
1392 		 */
1393 		if (dbinfo) {
1394 			next = next | TAVOR_WQE_DBD_MASK;
1395 			dbinfo->db_nopcode = nopcode;
1396 			dbinfo->db_fence   = fence;
1397 		}
1398 	} else {
1399 		next = 0;
1400 	}
1401 
1402 	/*
1403 	 * If this WQE is supposed to be linked to the previous descriptor,
1404 	 * then we need to update not only the previous WQE's "next" fields
1405 	 * but we must also update this WQE's "ctrl" fields (i.e. the "c", "e",
1406 	 * "s", "i" and "immediate" fields - see tavor_hw.h for more).  Note:
1407 	 * the "e" bit is always hardcoded to zero.
1408 	 */
1409 	if (prev_desc != NULL) {
1410 		/*
1411 		 * If a send queue doorbell will be rung for the next WQE on
1412 		 * the chain, then update the current WQE's "next" field and
1413 		 * return.
1414 		 * Note: We don't want to modify the "ctrl" field here because
1415 		 * that portion of the previous WQE has already been set
1416 		 * correctly at some previous point in time.
1417 		 */
1418 		if (dbinfo) {
1419 			TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
1420 			return;
1421 		}
1422 
1423 		ctrl = 0;
1424 
1425 		/* Set the "c" (i.e. "signaled") bit appropriately */
1426 		if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
1427 			ctrl = ctrl | TAVOR_WQE_SEND_SIGNALED_MASK;
1428 		}
1429 
1430 		/* Set the "s" (i.e. "solicited") bit appropriately */
1431 		if (prev_wr->wr_flags & IBT_WR_SEND_SOLICIT) {
1432 			ctrl = ctrl | TAVOR_WQE_SEND_SOLICIT_MASK;
1433 		}
1434 
1435 		/* Set the "i" bit and the immediate data appropriately */
1436 		if (prev_wr->wr_flags & IBT_WR_SEND_IMMED) {
1437 			ctrl = ctrl | TAVOR_WQE_SEND_IMMEDIATE_MASK;
1438 			ctrl = ctrl | tavor_wr_get_immediate(prev_wr);
1439 		}
1440 
1441 		TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
1442 	}
1443 }
1444 
1445 
1446 /*
1447  * tavor_wqe_mlx_build()
1448  *    Context: Can be called from interrupt or base context.
1449  */
1450 static int
1451 tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp,
1452     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1453 {
1454 	tavor_hw_udav_t		udav;
1455 	tavor_ahhdl_t		ah;
1456 	ib_lrh_hdr_t		*lrh;
1457 	ib_grh_t		*grh;
1458 	ib_bth_hdr_t		*bth;
1459 	ib_deth_hdr_t		*deth;
1460 	tavor_hw_wqe_sgl_t	*ds;
1461 	ibt_wr_ds_t		*sgl;
1462 	uint8_t			*mgmtclass, *hpoint, *hcount;
1463 	uint64_t		data;
1464 	uint32_t		nds, offset, pktlen;
1465 	uint32_t		desc_sz, udav_sz;
1466 	int			i, num_ds;
1467 
1468 	TAVOR_TNF_ENTER(tavor_wqe_mlx_build);
1469 
1470 	ASSERT(MUTEX_HELD(&qp->qp_lock));
1471 
1472 	/* Initialize the information for the Data Segments */
1473 	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1474 	    sizeof (tavor_hw_mlx_wqe_nextctrl_t));
1475 
1476 	/*
1477 	 * Pull the address handle from the work request and read in
1478 	 * the contents of the UDAV.  This will be used to answer some
1479 	 * questions about the request.
1480 	 */
1481 	ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
1482 	if (ah == NULL) {
1483 		TNF_PROBE_0(tavor_wqe_mlx_build_invahhdl_fail,
1484 		    TAVOR_TNF_ERROR, "");
1485 		TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1486 		return (IBT_AH_HDL_INVALID);
1487 	}
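	/*
	 * Copy the UDAV contents out of the address handle 64 bits at a
	 * time, holding "ah_lock" so that we operate on a consistent
	 * snapshot of the address vector.
	 */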
1488 	mutex_enter(&ah->ah_lock);
1489 	udav_sz = sizeof (tavor_hw_udav_t) >> 3;
1490 	for (i = 0; i < udav_sz; i++) {
1491 		data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl,
1492 		    ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i));
1493 		((uint64_t *)&udav)[i] = data;
1494 	}
1495 	mutex_exit(&ah->ah_lock);
1496 
1497 	/*
1498 	 * If the request is for QP1 and the destination LID is equal to
1499 	 * the Permissive LID, then return an error.  This combination is
1500 	 * not allowed
1501 	 */
1502 	if ((udav.rlid == IB_LID_PERMISSIVE) &&
1503 	    (qp->qp_is_special == TAVOR_QP_GSI)) {
1504 		TNF_PROBE_0(tavor_wqe_mlx_build_permissiveLIDonQP1_fail,
1505 		    TAVOR_TNF_ERROR, "");
1506 		TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1507 		return (IBT_AH_HDL_INVALID);
1508 	}
1509 
1510 	/*
1511 	 * Calculate the size of the packet headers, including the GRH
1512 	 * (if necessary)
1513 	 */
1514 	desc_sz = sizeof (ib_lrh_hdr_t) + sizeof (ib_bth_hdr_t) +
1515 	    sizeof (ib_deth_hdr_t);
1516 	if (udav.grh) {
1517 		desc_sz += sizeof (ib_grh_t);
1518 	}
1519 
1520 	/*
1521 	 * Begin to build the first "inline" data segment for the packet
1522 	 * headers.  Note:  By specifying "inline" we can build the contents
1523 	 * of the MAD packet headers directly into the work queue (as part of
1524 	 * the descriptor).  This has the advantage of both speeding things up
1525 	 * and of not requiring the driver to allocate/register any additional
1526 	 * memory for the packet headers.
1527 	 */
1528 	TAVOR_WQE_BUILD_INLINE(qp, &ds[0], desc_sz);
1529 	desc_sz += 4;
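	/*
	 * Note: the four bytes added to "desc_sz" above account for the
	 * inline segment's own header word, which precedes the packet
	 * headers built at ((uintptr_t)&ds[0] + 4) below.
	 */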
1530 
1531 	/*
1532 	 * Build Local Route Header (LRH)
1533 	 *    We start here by building the LRH into a temporary location.
1534 	 *    When we have finished we copy the LRH data into the descriptor.
1535 	 *
1536 	 *    Notice that the VL values are hardcoded.  This is not a problem
1537 	 *    because VL15 is decided later based on the value in the MLX
1538 	 *    transport "next/ctrl" header (see the "vl15" bit below), and it
1539 	 *    is otherwise (meaning for QP1) chosen from the SL-to-VL table
1540 	 *    values.  This rule does not hold for loopback packets however
1541 	 *    (all of which bypass the SL-to-VL tables) and it is the reason
1542 	 *    that non-QP0 MADs are set up with VL hardcoded to zero below.
1543 	 *
1544 	 *    Notice also that Source LID is hardcoded to the Permissive LID
1545 	 *    (0xFFFF).  This is also not a problem because if the Destination
1546 	 *    LID is not the Permissive LID, then the "slr" value in the MLX
1547 	 *    transport "next/ctrl" header will be set to zero and the hardware
1548 	 *    will pull the LID from the value in the port.
1549 	 */
1550 	lrh = (ib_lrh_hdr_t *)((uintptr_t)&ds[0] + 4);
1551 	pktlen = (desc_sz + 0x100) >> 2;
1552 	TAVOR_WQE_BUILD_MLX_LRH(lrh, qp, udav, pktlen);
1553 
1554 	/*
1555 	 * Build Global Route Header (GRH)
1556 	 *    This is only built if necessary as defined by the "grh" bit in
1557 	 *    the address vector.  Note:  We also calculate the offset to the
1558 	 *    next header (BTH) based on whether or not the "grh" bit is set.
1559 	 */
1560 	if (udav.grh) {
1561 		/*
1562 		 * If the request is for QP0, then return an error.  The
1563 		 * combination of global routing (GRH) and QP0 is not allowed.
1564 		 */
1565 		if (qp->qp_is_special == TAVOR_QP_SMI) {
1566 			TNF_PROBE_0(tavor_wqe_mlx_build_GRHonQP0_fail,
1567 			    TAVOR_TNF_ERROR, "");
1568 			TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1569 			return (IBT_AH_HDL_INVALID);
1570 		}
1571 		grh = (ib_grh_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1572 		TAVOR_WQE_BUILD_MLX_GRH(state, grh, qp, udav, pktlen);
1573 
1574 		bth = (ib_bth_hdr_t *)((uintptr_t)grh + sizeof (ib_grh_t));
1575 	} else {
1576 		bth = (ib_bth_hdr_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1577 	}
1578 
1579 
1580 	/*
1581 	 * Build Base Transport Header (BTH)
1582 	 *    Notice that the M, PadCnt, and TVer fields are all set
1583 	 *    to zero implicitly.  This is true for all Management Datagram
1584 	 *    (MAD) packets, whether GSI or SMI.
1585 	 */
1586 	TAVOR_WQE_BUILD_MLX_BTH(state, bth, qp, wr);
1587 
1588 	/*
1589 	 * Build Datagram Extended Transport Header (DETH)
1590 	 */
1591 	deth = (ib_deth_hdr_t *)((uintptr_t)bth + sizeof (ib_bth_hdr_t));
1592 	TAVOR_WQE_BUILD_MLX_DETH(deth, qp);
1593 
1594 	/* Ensure that the Data Segment is aligned on a 16-byte boundary */
1595 	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)deth + sizeof (ib_deth_hdr_t));
1596 	ds = (tavor_hw_wqe_sgl_t *)(((uintptr_t)ds + 0xF) & ~0xF);
1597 	nds = wr->wr_nds;
1598 	sgl = wr->wr_sgl;
1599 	num_ds = 0;
1600 
1601 	/*
1602 	 * Now fill in the Data Segments (SGL) for the MLX WQE based on the
1603 	 * values set up above (i.e. "sgl", "nds", and the "ds" pointer).
1604 	 * Start by checking for a valid number of SGL entries
1605 	 */
1606 	if (nds > qp->qp_sq_sgl) {
1607 		TNF_PROBE_0(tavor_wqe_mlx_build_toomanysgl_fail,
1608 		    TAVOR_TNF_ERROR, "");
1609 		TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1610 		return (IBT_QP_SGL_LEN_INVALID);
1611 	}
1612 
1613 	/*
1614 	 * For each SGL in the Send Work Request, fill in the MLX WQE's data
1615 	 * segments.  Note: We skip any SGL with zero size because Tavor
1616 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1617 	 * the encoding for zero means a 2GB transfer.  Because of this special
1618 	 * encoding in the hardware, we mask the requested length with
1619 	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1620 	 * zero.)
1621 	 */
1622 	mgmtclass = hpoint = hcount = NULL;
1623 	offset = 0;
1624 	for (i = 0; i < nds; i++) {
1625 		if (sgl[i].ds_len == 0) {
1626 			continue;
1627 		}
1628 
1629 		/*
1630 		 * Fill in the Data Segment(s) for the MLX send WQE, using
1631 		 * the information contained in the scatter-gather list of
1632 		 * the work request.
1633 		 */
1634 		TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);
1635 
1636 		/*
1637 		 * Search through the contents of all MADs posted to QP0 to
1638 		 * initialize pointers to the places where Directed Route "hop
1639 		 * pointer", "hop count", and "mgmtclass" would be.  Tavor
1640 		 * needs these updated (i.e. incremented or decremented, as
1641 		 * necessary) by software.
1642 		 */
1643 		if (qp->qp_is_special == TAVOR_QP_SMI) {
1644 
1645 			TAVOR_SPECIAL_QP_DRMAD_GET_MGMTCLASS(mgmtclass,
1646 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1647 
1648 			TAVOR_SPECIAL_QP_DRMAD_GET_HOPPOINTER(hpoint,
1649 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1650 
1651 			TAVOR_SPECIAL_QP_DRMAD_GET_HOPCOUNT(hcount,
1652 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1653 
1654 			offset += sgl[i].ds_len;
1655 		}
1656 		num_ds++;
1657 	}
1658 
1659 	/*
1660 	 * Tavor's Directed Route MADs need to have the "hop pointer"
1661 	 * incremented/decremented (as necessary) depending on whether it is
1662 	 * currently less than or greater than the "hop count" (i.e. whether
1663 	 * the MAD is a request or a response.)
1664 	 */
1665 	if (qp->qp_is_special == TAVOR_QP_SMI) {
1666 		TAVOR_SPECIAL_QP_DRMAD_DO_HOPPOINTER_MODIFY(*mgmtclass,
1667 		    *hpoint, *hcount);
1668 	}
1669 
1670 	/*
1671 	 * Now fill in the ICRC Data Segment.  This data segment is inlined
1672 	 * just like the packet headers above, but it is only four bytes and
1673 	 * set to zero (to indicate that we wish the hardware to generate the ICRC).
1674 	 */
1675 	TAVOR_WQE_BUILD_INLINE_ICRC(qp, &ds[num_ds], 4, 0);
1676 	num_ds++;
1677 
1678 	/* Return the size of descriptor (in 16-byte chunks) */
1679 	*size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4;
1680 
1681 	TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1682 	return (DDI_SUCCESS);
1683 }
1684 
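/*
 * Illustrative sketch (not part of the driver): the SGL "byte_cnt"
 * handling described in the loops of tavor_wqe_mlx_build() and
 * tavor_wqe_recv_build() above and below.  Tavor treats a byte count
 * of zero as a 2GB transfer, so zero-length SGL entries must be
 * skipped by the caller, and a requested length of exactly 2GB is
 * masked down to the hardware's zero encoding.
 * EXAMPLE_SGL_BYTE_CNT_MASK is a hypothetical stand-in for the
 * driver's TAVOR_WQE_SGL_BYTE_CNT_MASK, with its value chosen here
 * only so that exactly 2GB masks to zero.
 */
#define	EXAMPLE_SGL_BYTE_CNT_MASK	0x7FFFFFFFul

static uint32_t
example_sgl_byte_cnt_encode(uint64_t ds_len)
{
	/* 2GB (0x80000000) masks to zero, Tavor's special 2GB encoding */
	return ((uint32_t)(ds_len & EXAMPLE_SGL_BYTE_CNT_MASK));
}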
1685 
1686 /*
1687  * tavor_wqe_mlx_linknext()
1688  *    Context: Can be called from interrupt or base context.
1689  */
1690 static void
1691 tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
1692     uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
1693     tavor_qphdl_t qp)
1694 {
1695 	tavor_hw_udav_t		udav;
1696 	tavor_ahhdl_t		ah;
1697 	uint64_t		next, ctrl, data;
1698 	uint_t			nopcode;
1699 	uint_t			udav_sz;
1700 	int			i;
1701 
1702 	/*
1703 	 * Calculate the "next" field of the descriptor.  This amounts to
1704 	 * setting up the "next_wqe_addr", "nopcode", and "nds" fields (see
1705 	 * tavor_hw.h for more).  Note:  If there is no next descriptor (i.e.
1706 	 * if the current descriptor is the last WQE on the chain), then set
1707 	 * "next" to zero.
1708 	 */
1709 	if (curr_desc != NULL) {
1710 		/*
1711 		 * The only valid Tavor WQE "nopcode" for MLX transport
1712 		 * requests is the "Send" code.
1713 		 */
1714 		nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
1715 		curr_desc = (uint64_t *)(uintptr_t)((uint64_t)
1716 		    (uintptr_t)curr_desc - qp->qp_desc_off);
1717 		next = (uint64_t)((uintptr_t)curr_desc &
1718 		    TAVOR_WQE_NDA_MASK) << 32;
1719 		next = next | ((uint64_t)nopcode << 32);
1720 		next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);
1721 
1722 		/*
1723 		 * If a send queue doorbell will be rung for the next
1724 		 * WQE on the chain, then set the current WQE's "dbd" bit.
1725 		 * Note: We also update the "dbinfo" structure here to pass
1726 		 * back information about what should (later) be included
1727 		 * in the send queue doorbell.
1728 		 */
1729 		if (dbinfo) {
1730 			next = next | TAVOR_WQE_DBD_MASK;
1731 			dbinfo->db_nopcode = nopcode;
1732 			dbinfo->db_fence   = 0;
1733 		}
1734 	} else {
1735 		next = 0;
1736 	}
1737 
1738 	/*
1739 	 * If this WQE is supposed to be linked to the previous descriptor,
1740 	 * then we need to update not only the previous WQE's "next" fields
1741 	 * but we must also update this WQE's "ctrl" fields (i.e. the "vl15",
1742 	 * "slr", "max_srate", "sl", "c", "e", "rlid", and "vcrc" fields -
1743 	 * see tavor_hw.h for more).  Note: the "e" bit and "vcrc" fields are
1744 	 * always hardcoded to zero.
1745 	 */
1746 	if (prev_desc != NULL) {
1747 		/*
1748 		 * If a send queue doorbell will be rung for the next WQE on
1749 		 * the chain, then update the current WQE's "next" field and
1750 		 * return.
1751 		 * Note: We don't want to modify the "ctrl" field here because
1752 		 * that portion of the previous WQE has already been set
1753 		 * correctly at some previous point in time.
1754 		 */
1755 		if (dbinfo) {
1756 			TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
1757 			return;
1758 		}
1759 
1760 		/*
1761 		 * Pull the address handle from the work request and read in
1762 		 * the contents of the UDAV.  This will be used to answer some
1763 		 * questions about the request.
1764 		 */
1765 		ah = (tavor_ahhdl_t)prev_wr->wr.ud.udwr_dest->ud_ah;
1766 		mutex_enter(&ah->ah_lock);
1767 		udav_sz = sizeof (tavor_hw_udav_t) >> 3;
1768 		for (i = 0; i < udav_sz; i++) {
1769 			data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl,
1770 			    ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i));
1771 			((uint64_t *)&udav)[i] = data;
1772 		}
1773 		mutex_exit(&ah->ah_lock);
1774 
1775 		ctrl = 0;
1776 
1777 		/* Only QP0 uses VL15, otherwise use VL in the packet */
1778 		if (qp->qp_is_special == TAVOR_QP_SMI) {
1779 			ctrl = ctrl | TAVOR_WQE_MLXHDR_VL15_MASK;
1780 		}
1781 
1782 		/*
1783 		 * The SLR (Source LID Replace) bit determines whether the
1784 		 * source LID for an outgoing MLX packet should come from the
1785 		 * PortInfo (SLR = 0) or should be left as it is in the
1786 		 * descriptor (SLR = 1).  The latter is necessary for packets
1787 		 * to be sent with the Permissive LID.
1788 		 */
1789 		if (udav.rlid == IB_LID_PERMISSIVE) {
1790 			ctrl = ctrl | TAVOR_WQE_MLXHDR_SLR_MASK;
1791 		}
1792 
1793 		/* Fill in the max static rate from the address handle */
1794 		ctrl = ctrl | ((uint64_t)udav.max_stat_rate <<
1795 		    TAVOR_WQE_MLXHDR_SRATE_SHIFT);
1796 
1797 		/* All VL15 (i.e. SMI) traffic is required to use SL 0 */
1798 		if (qp->qp_is_special != TAVOR_QP_SMI) {
1799 			ctrl = ctrl | ((uint64_t)udav.sl <<
1800 			    TAVOR_WQE_MLXHDR_SL_SHIFT);
1801 		}
1802 
1803 		/* Set the "c" (i.e. "signaled") bit appropriately */
1804 		if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
1805 			ctrl = ctrl | TAVOR_WQE_MLXHDR_SIGNALED_MASK;
1806 		}
1807 
1808 		/* Fill in the destination LID from the address handle */
1809 		ctrl = ctrl | ((uint64_t)udav.rlid <<
1810 		    TAVOR_WQE_MLXHDR_RLID_SHIFT);
1811 
1812 		TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
1813 	}
1814 }
1815 
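/*
 * Illustrative sketch (not part of the driver): how the 64-bit "next"
 * doubleword built by the *_linknext() routines is packed.  The upper
 * 32 bits hold the 64-byte-aligned next-WQE address together with the
 * nopcode; the lower 32 bits hold the "dbd" bit and the size of the
 * next descriptor (NDS).  The EXAMPLE_* constants are hypothetical
 * stand-ins for the driver's TAVOR_WQE_{NDA,NDS,DBD}_MASK values; the
 * DBD value in particular is an assumption for illustration only.
 */
#define	EXAMPLE_NDA_MASK	0xFFFFFFC0u
#define	EXAMPLE_NDS_MASK	0x3Fu
#define	EXAMPLE_DBD_MASK	0x80000000u

static uint64_t
example_pack_next(uint32_t next_wqe_addr, uint32_t nopcode, uint32_t nds,
    int dbd)
{
	uint64_t next;

	/* Next-WQE address (64-byte aligned) and nopcode: upper word */
	next = ((uint64_t)(next_wqe_addr & EXAMPLE_NDA_MASK)) << 32;
	next = next | ((uint64_t)nopcode << 32);

	/* NDS (in 16-byte chunks) and optional doorbell bit: lower word */
	next = next | (nds & EXAMPLE_NDS_MASK);
	if (dbd) {
		next = next | EXAMPLE_DBD_MASK;
	}
	return (next);
}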
1816 
1817 /*
1818  * tavor_wqe_recv_build()
1819  *    Context: Can be called from interrupt or base context.
1820  */
1821 /* ARGSUSED */
1822 static int
1823 tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
1824     ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size)
1825 {
1826 	tavor_hw_wqe_sgl_t	*ds;
1827 	int			i, num_ds;
1828 
1829 	TAVOR_TNF_ENTER(tavor_wqe_recv_build);
1830 
1831 	ASSERT(MUTEX_HELD(&qp->qp_lock));
1832 
1833 	/* Check that work request transport type is valid */
1834 	if ((qp->qp_serv_type != TAVOR_QP_UD) &&
1835 	    (qp->qp_serv_type != TAVOR_QP_RC) &&
1836 	    (qp->qp_serv_type != TAVOR_QP_UC)) {
1837 		TNF_PROBE_0(tavor_build_recv_wqe_inv_servtype_fail,
1838 		    TAVOR_TNF_ERROR, "");
1839 		TAVOR_TNF_EXIT(tavor_wqe_recv_build);
1840 		return (IBT_QP_SRV_TYPE_INVALID);
1841 	}
1842 
1843 	/* Fill in the Data Segments (SGL) for the Recv WQE */
1844 	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1845 	    sizeof (tavor_hw_rcv_wqe_nextctrl_t));
1846 	num_ds = 0;
1847 
1848 	/* Check for valid number of SGL entries */
1849 	if (wr->wr_nds > qp->qp_rq_sgl) {
1850 		TNF_PROBE_0(tavor_wqe_recv_build_toomanysgl_fail,
1851 		    TAVOR_TNF_ERROR, "");
1852 		TAVOR_TNF_EXIT(tavor_wqe_recv_build);
1853 		return (IBT_QP_SGL_LEN_INVALID);
1854 	}
1855 
1856 	/*
1857 	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1858 	 * segments.  Note: We skip any SGL with zero size because Tavor
1859 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1860 	 * the encoding for zero means a 2GB transfer.  Because of this special
1861 	 * encoding in the hardware, we mask the requested length with
1862 	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1863 	 * zero.)
1864 	 */
1865 	for (i = 0; i < wr->wr_nds; i++) {
1866 		if (wr->wr_sgl[i].ds_len == 0) {
1867 			continue;
1868 		}
1869 
1870 		/*
1871 		 * Fill in the Data Segment(s) for the receive WQE, using the
1872 		 * information contained in the scatter-gather list of the
1873 		 * work request.
1874 		 */
1875 		TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &wr->wr_sgl[i]);
1876 		num_ds++;
1877 	}
1878 
1879 	/* Return the size of descriptor (in 16-byte chunks) */
1880 	*size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4;
1881 
1882 	TAVOR_TNF_EXIT(tavor_wqe_recv_build);
1883 	return (DDI_SUCCESS);
1884 }
1885 
1886 
1887 /*
1888  * tavor_wqe_recv_linknext()
1889  *    Context: Can be called from interrupt or base context.
1890  */
1891 static void
1892 tavor_wqe_recv_linknext(uint64_t *curr_desc, uint_t curr_descsz,
1893     uint64_t *prev_desc, tavor_qphdl_t qp)
1894 {
1895 	uint64_t	next;
1896 
1897 	/*
1898 	 * Calculate the "next" field of the descriptor.  This amounts to
1899 	 * setting up the "next_wqe_addr", "dbd", and "nds" fields (see
1900 	 * tavor_hw.h for more).  Note:  If there is no next descriptor (i.e.
1901 	 * if the current descriptor is the last WQE on the chain), then set
1902 	 * "next" field to TAVOR_WQE_DBD_MASK.  This is because the Tavor
1903 	 * hardware requires the "dbd" bit to be set to one for all Recv WQEs.
1904 	 * In either case, we must add a single bit in the "reserved" field
1905 	 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.  This is the
1906 	 * workaround for a known Tavor erratum that can cause Recv WQEs with
1907 	 * zero in the NDA field to behave improperly.
1908 	 */
1909 	if (curr_desc != NULL) {
1910 		curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc -
1911 		    qp->qp_desc_off);
1912 		next = (uint64_t)((uintptr_t)curr_desc &
1913 		    TAVOR_WQE_NDA_MASK) << 32;
1914 		next = next | (curr_descsz & TAVOR_WQE_NDS_MASK) |
1915 		    TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1916 	} else {
1917 		next = TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1918 	}
1919 
1920 	/*
1921 	 * If this WQE is supposed to be linked to the previous descriptor,
1922 	 * then we need to update not only the previous WQE's "next" fields
1923 	 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and
1924 	 * "e" bits - see tavor_hw.h for more).  Note: both the "c" and "e"
1925 	 * bits are always hardcoded to zero.
1926 	 */
1927 	if (prev_desc != NULL) {
1928 		TAVOR_WQE_LINKNEXT(qp, prev_desc, 0, next);
1929 	}
1930 }
1931 
1932 
1933 /*
1934  * tavor_wqe_srq_build()
1935  *    Context: Can be called from interrupt or base context.
1936  */
1937 /* ARGSUSED */
1938 static int
1939 tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq,
1940     ibt_recv_wr_t *wr, uint64_t *desc)
1941 {
1942 	tavor_hw_wqe_sgl_t	*ds;
1943 	ibt_wr_ds_t		end_sgl;
1944 	int			i, num_ds;
1945 
1946 	TAVOR_TNF_ENTER(tavor_wqe_srq_build);
1947 
1948 	ASSERT(MUTEX_HELD(&srq->srq_lock));
1949 
1950 	/* Fill in the Data Segments (SGL) for the Recv WQE */
1951 	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1952 	    sizeof (tavor_hw_rcv_wqe_nextctrl_t));
1953 	num_ds = 0;
1954 
1955 	/* Check for valid number of SGL entries */
1956 	if (wr->wr_nds > srq->srq_wq_sgl) {
1957 		TNF_PROBE_0(tavor_wqe_srq_build_toomanysgl_fail,
1958 		    TAVOR_TNF_ERROR, "");
1959 		TAVOR_TNF_EXIT(tavor_wqe_srq_build);
1960 		return (IBT_QP_SGL_LEN_INVALID);
1961 	}
1962 
1963 	/*
1964 	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1965 	 * segments.  Note: We skip any SGL with zero size because Tavor
1966 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1967 	 * the encoding for zero means a 2GB transfer.  Because of this special
1968 	 * encoding in the hardware, we mask the requested length with
1969 	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1970 	 * zero.)
1971 	 */
1972 	for (i = 0; i < wr->wr_nds; i++) {
1973 		if (wr->wr_sgl[i].ds_len == 0) {
1974 			continue;
1975 		}
1976 
1977 		/*
1978 		 * Fill in the Data Segment(s) for the receive WQE, using the
1979 		 * information contained in the scatter-gather list of the
1980 		 * work request.
1981 		 */
1982 		TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &wr->wr_sgl[i]);
1983 		num_ds++;
1984 	}
1985 
1986 	/*
1987 	 * For SRQ, if the number of data segments is less than the maximum
1988 	 * specified at alloc time, then we have to fill in a special "key"
1989 	 * entry in the SGL position after the last valid one in this post
1990 	 * request.  We do that here.
1991 	 */
1992 	if (num_ds < srq->srq_wq_sgl) {
1993 		end_sgl.ds_va  = 0;
1994 		end_sgl.ds_len = 0;
1995 		end_sgl.ds_key = 0x1;
1996 		TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &end_sgl);
1997 	}
1998 
1999 	TAVOR_TNF_EXIT(tavor_wqe_srq_build);
2000 	return (DDI_SUCCESS);
2001 }
2002 
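/*
 * Illustrative sketch (not part of the driver): the special "key"
 * entry that tavor_wqe_srq_build() writes when a post request uses
 * fewer data segments than the SRQ was sized for.  Per the comment in
 * that routine, a zero-address, zero-length entry with key 0x1 is
 * filled into the SGL position after the last valid segment.  The
 * struct below is a simplified stand-in for ibt_wr_ds_t.
 */
struct example_sgl_ds {
	uint64_t	ds_va;		/* I/O virtual address */
	uint32_t	ds_key;		/* L_Key (or special key) */
	uint32_t	ds_len;		/* transfer length in bytes */
};

static void
example_srq_sgl_terminate(struct example_sgl_ds *ds)
{
	ds->ds_va  = 0;
	ds->ds_len = 0;
	ds->ds_key = 0x1;	/* the special key, as in the code above */
}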
2003 
2004 /*
2005  * tavor_wqe_srq_linknext()
2006  *    Context: Can be called from interrupt or base context.
2007  */
2008 static void
2009 tavor_wqe_srq_linknext(uint64_t *curr_desc, uint64_t *prev_desc,
2010     tavor_srqhdl_t srq)
2011 {
2012 	uint64_t	next;
2013 
2014 	/*
2015 	 * Calculate the "next" field of the descriptor.  This amounts to
2016 	 * setting up the "next_wqe_addr", "dbd", and "nds" fields (see
2017 	 * tavor_hw.h for more).  Note:  If there is no next descriptor (i.e.
2018 	 * if the current descriptor is the last WQE on the chain), then set
2019 	 * "next" field to TAVOR_WQE_DBD_MASK.  This is because the Tavor
2020 	 * hardware requires the "dbd" bit to be set to one for all Recv WQEs.
2021 	 * In either case, we must add a single bit in the "reserved" field
2022 	 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.  This is the
2023 	 * workaround for a known Tavor erratum that can cause Recv WQEs with
2024 	 * zero in the NDA field to behave improperly.
2025 	 */
2026 	if (curr_desc != NULL) {
2027 		curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc -
2028 		    srq->srq_desc_off);
2029 		next = (uint64_t)((uintptr_t)curr_desc &
2030 		    TAVOR_WQE_NDA_MASK) << 32;
2031 		next = next | TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
2032 	} else {
2033 		next = TAVOR_RCV_WQE_NDA0_WA_MASK;
2034 	}
2035 
2036 	/*
2037 	 * If this WQE is supposed to be linked to the previous descriptor,
2038 	 * then we need to update not only the previous WQE's "next" fields
2039 	 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and
2040 	 * "e" bits - see tavor_hw.h for more).  Note: both the "c" and "e"
2041 	 * bits are always hardcoded to zero.
2042 	 */
2043 	if (prev_desc != NULL) {
2044 		TAVOR_WQE_LINKNEXT_SRQ(srq, prev_desc, 0, next);
2045 	}
2046 }
2047 
2048 
2049 /*
2050  * tavor_wr_get_immediate()
2051  *    Context: Can be called from interrupt or base context.
2052  */
2053 static uint32_t
2054 tavor_wr_get_immediate(ibt_send_wr_t *wr)
2055 {
2056 	/*
2057 	 * This routine extracts the "immediate data" from the appropriate
2058 	 * location in the IBTF work request.  Because of the way the
2059 	 * work request structure is defined, the location for this data
2060 	 * depends on the actual work request operation type.
2061 	 */
2062 
2063 	/* For RDMA Write, test if RC or UC */
2064 	if (wr->wr_opcode == IBT_WRC_RDMAW) {
2065 		if (wr->wr_trans == IBT_RC_SRV) {
2066 			return (wr->wr.rc.rcwr.rdma.rdma_immed);
2067 		} else {  /* IBT_UC_SRV */
2068 			return (wr->wr.uc.ucwr.rdma.rdma_immed);
2069 		}
2070 	}
2071 
2072 	/* For Send, test if RC, UD, or UC */
2073 	if (wr->wr_opcode == IBT_WRC_SEND) {
2074 		if (wr->wr_trans == IBT_RC_SRV) {
2075 			return (wr->wr.rc.rcwr.send_immed);
2076 		} else if (wr->wr_trans == IBT_UD_SRV) {
2077 			return (wr->wr.ud.udwr_immed);
2078 		} else {  /* IBT_UC_SRV */
2079 			return (wr->wr.uc.ucwr.send_immed);
2080 		}
2081 	}
2082 
2083 	/*
2084 	 * If any other type of request, then immediate is undefined
2085 	 */
2086 	return (0);
2087 }
2088 
2089 
2090 /*
2091  * tavor_wqe_sync()
2092  *    Context: Can be called from interrupt or base context.
2093  */
2094 static void
2095 tavor_wqe_sync(void *hdl, uint_t sync_from, uint_t sync_to,
2096     uint_t sync_type, uint_t flag)
2097 {
2098 	tavor_qphdl_t		qp;
2099 	tavor_srqhdl_t		srq;
2100 	uint_t			is_sync_req;
2101 	uint64_t		*wqe_from, *wqe_to, *wqe_base, *wqe_top;
2102 	ddi_dma_handle_t	dmahdl;
2103 	off_t			offset;
2104 	size_t			length;
2105 	uint32_t		qsize;
2106 	int			status;
2107 
2108 	TAVOR_TNF_ENTER(tavor_wqe_sync);
2109 
2110 	if (sync_type == TAVOR_WR_SRQ) {
2111 		srq = (tavor_srqhdl_t)hdl;
2112 		is_sync_req = srq->srq_sync;
2113 		/* Get the DMA handle from SRQ context */
2114 		dmahdl = srq->srq_mrhdl->mr_bindinfo.bi_dmahdl;
2115 	} else {
2116 		qp = (tavor_qphdl_t)hdl;
2117 		is_sync_req = qp->qp_sync;
2118 		/* Get the DMA handle from QP context */
2119 		dmahdl = qp->qp_mrhdl->mr_bindinfo.bi_dmahdl;
2120 	}
2121 
2122 	/* Determine if the work queues need to be synced or not */
2123 	if (is_sync_req == 0) {
2124 		TAVOR_TNF_EXIT(tavor_wqe_sync);
2125 		return;
2126 	}
2127 
2128 	/*
2129 	 * Depending on the type of the work queue, we grab information
2130 	 * about the address ranges we need to DMA sync.
2131 	 */
2132 	if (sync_type == TAVOR_WR_SEND) {
2133 		wqe_from = TAVOR_QP_SQ_ENTRY(qp, sync_from);
2134 		wqe_to   = TAVOR_QP_SQ_ENTRY(qp, sync_to);
2135 		qsize	 = qp->qp_sq_bufsz;
2136 
2137 		wqe_base = TAVOR_QP_SQ_ENTRY(qp, 0);
2138 		wqe_top	 = TAVOR_QP_SQ_ENTRY(qp, qsize);
2139 	} else if (sync_type == TAVOR_WR_RECV) {
2140 		wqe_from = TAVOR_QP_RQ_ENTRY(qp, sync_from);
2141 		wqe_to   = TAVOR_QP_RQ_ENTRY(qp, sync_to);
2142 		qsize	 = qp->qp_rq_bufsz;
2143 
2144 		wqe_base = TAVOR_QP_RQ_ENTRY(qp, 0);
2145 		wqe_top	 = TAVOR_QP_RQ_ENTRY(qp, qsize);
2146 	} else {
2147 		wqe_from = TAVOR_SRQ_WQ_ENTRY(srq, sync_from);
2148 		wqe_to   = TAVOR_SRQ_WQ_ENTRY(srq, sync_to);
2149 		qsize	 = srq->srq_wq_bufsz;
2150 
2151 		wqe_base = TAVOR_SRQ_WQ_ENTRY(srq, 0);
2152 		wqe_top	 = TAVOR_SRQ_WQ_ENTRY(srq, qsize);
2153 	}
2154 
2155 	/*
2156 	 * There are two possible cases for the beginning and end of the WQE
2157 	 * chain we are trying to sync.  Either this is the simple case, where
2158 	 * the end of the chain is below the beginning of the chain, or it is
2159 	 * the "wrap-around" case, where the end of the chain has wrapped over
2160 	 * the end of the queue.  In the former case, we simply need to
2161 	 * calculate the span from beginning to end and sync it.  In the latter
2162 	 * case, however, we need to calculate the span from the top of the
2163 	 * work queue to the end of the chain and sync that, and then we need
2164 	 * to find the other portion (from beginning of chain to end of queue)
2165 	 * and sync that as well.  Note: if the "top to end" span is actually
2166 	 * zero length, then we don't do a DMA sync because a zero length DMA
2167 	 * sync unnecessarily syncs the entire work queue.
2168 	 */
2169 	if (wqe_to > wqe_from) {
2170 		/* "From Beginning to End" */
2171 		offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base);
2172 		length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_from);
2173 
2174 		status = ddi_dma_sync(dmahdl, offset, length, flag);
2175 		if (status != DDI_SUCCESS) {
2176 			TNF_PROBE_0(tavor_wqe_sync_fail, TAVOR_TNF_ERROR, "");
2177 			TAVOR_TNF_EXIT(tavor_wqe_sync);
2178 			return;
2179 		}
2180 	} else {
2181 		/* "From Top to End" */
2182 		offset = (off_t)0;
2183 		length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_base);
2184 		if (length) {
2185 			status = ddi_dma_sync(dmahdl, offset, length, flag);
2186 			if (status != DDI_SUCCESS) {
2187 				TNF_PROBE_0(tavor_wqe_sync_fail,
2188 				    TAVOR_TNF_ERROR, "");
2189 				TAVOR_TNF_EXIT(tavor_wqe_sync);
2190 				return;
2191 			}
2192 		}
2193 
2194 		/* "From Beginning to Bottom" */
2195 		offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base);
2196 		length = (size_t)((uintptr_t)wqe_top - (uintptr_t)wqe_from);
2197 		status = ddi_dma_sync(dmahdl, offset, length, flag);
2198 		if (status != DDI_SUCCESS) {
2199 			TNF_PROBE_0(tavor_wqe_sync_fail, TAVOR_TNF_ERROR, "");
2200 			TAVOR_TNF_EXIT(tavor_wqe_sync);
2201 			return;
2202 		}
2203 	}
2204 
2205 	TAVOR_TNF_EXIT(tavor_wqe_sync);
2206 }
2207 
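/*
 * Illustrative sketch (not part of the driver): the span arithmetic
 * used by tavor_wqe_sync() above.  In the simple case a single
 * (offset, length) pair covers "from" up to "to"; in the wrap-around
 * case the chain is synced as two spans, top-of-queue to "to" and
 * "from" to bottom-of-queue, skipping the first span when it is empty
 * (a zero-length ddi_dma_sync() would sync the entire work queue).
 * The callback type is a hypothetical stand-in for the driver's
 * ddi_dma_sync() call so that the sketch stays self-contained.
 */
typedef void (*example_sync_fn_t)(off_t offset, size_t length);

static void
example_wqe_sync_spans(uintptr_t base, uintptr_t top, uintptr_t from,
    uintptr_t to, example_sync_fn_t sync)
{
	if (to > from) {
		/* "From Beginning to End": one contiguous span */
		sync((off_t)(from - base), (size_t)(to - from));
	} else {
		/* "From Top to End", skipped when zero length */
		if (to != base) {
			sync((off_t)0, (size_t)(to - base));
		}
		/* "From Beginning to Bottom" */
		sync((off_t)(from - base), (size_t)(top - from));
	}
}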
2208 
2209 /*
2210  * tavor_wr_bind_check()
2211  *    Context: Can be called from interrupt or base context.
2212  */
2213 static int
2214 tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr)
2215 {
2216 	ibt_bind_flags_t	bind_flags;
2217 	uint64_t		vaddr, len;
2218 	uint64_t		reg_start_addr, reg_end_addr;
2219 	tavor_mwhdl_t		mw;
2220 	tavor_mrhdl_t		mr;
2221 	tavor_rsrc_t		*mpt;
2222 	uint32_t		new_rkey;
2223 
2224 	TAVOR_TNF_ENTER(tavor_wr_bind_check);
2225 
2226 	/* Check for a valid Memory Window handle in the WR */
2227 	mw = (tavor_mwhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mw_hdl;
2228 	if (mw == NULL) {
2229 		TNF_PROBE_0(tavor_wr_bind_check_invmwhdl_fail,
2230 		    TAVOR_TNF_ERROR, "");
2231 		TAVOR_TNF_EXIT(tavor_wr_bind_check);
2232 		return (IBT_MW_HDL_INVALID);
2233 	}
2234 
2235 	/* Check for a valid Memory Region handle in the WR */
2236 	mr = (tavor_mrhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mr_hdl;
2237 	if (mr == NULL) {
2238 		TNF_PROBE_0(tavor_wr_bind_check_invmrhdl_fail,
2239 		    TAVOR_TNF_ERROR, "");
2240 		TAVOR_TNF_EXIT(tavor_wr_bind_check);
2241 		return (IBT_MR_HDL_INVALID);
2242 	}
2243 
2244 	mutex_enter(&mr->mr_lock);
2245 	mutex_enter(&mw->mr_lock);
2246 
2247 	/*
2248 	 * Check here to see if the memory region has already been partially
2249 	 * deregistered as a result of a tavor_umap_umemlock_cb() callback.
2250 	 * If so, this is an error; return failure.
2251 	 */
2252 	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
2253 		mutex_exit(&mr->mr_lock);
2254 		mutex_exit(&mw->mr_lock);
2255 		TNF_PROBE_0(tavor_wr_bind_check_invmrhdl2_fail,
2256 		    TAVOR_TNF_ERROR, "");
2257 		TAVOR_TNF_EXIT(tavor_wr_bind_check);
2258 		return (IBT_MR_HDL_INVALID);
2259 	}
2260 
2261 	/* Check for a valid Memory Window RKey (i.e. a matching RKey) */
2262 	if (mw->mr_rkey != wr->wr.rc.rcwr.bind->bind_rkey) {
2263 		mutex_exit(&mr->mr_lock);
2264 		mutex_exit(&mw->mr_lock);
2265 		TNF_PROBE_0(tavor_wr_bind_check_invrkey_fail,
2266 		    TAVOR_TNF_ERROR, "");
2267 		TAVOR_TNF_EXIT(tavor_wr_bind_check);
2268 		return (IBT_MR_RKEY_INVALID);
2269 	}
2270 
2271 	/* Check for a valid Memory Region LKey (i.e. a matching LKey) */
2272 	if (mr->mr_lkey != wr->wr.rc.rcwr.bind->bind_lkey) {
2273 		mutex_exit(&mr->mr_lock);
2274 		mutex_exit(&mw->mr_lock);
2275 		TNF_PROBE_0(tavor_wr_bind_check_invlkey_fail,
2276 		    TAVOR_TNF_ERROR, "");
2277 		TAVOR_TNF_EXIT(tavor_wr_bind_check);
2278 		return (IBT_MR_LKEY_INVALID);
2279 	}
2280 
2281 	/*
2282 	 * Now check for valid "vaddr" and "len".  Note:  We don't check the
2283 	 * "vaddr" range when "len == 0" (i.e. on unbind operations)
2284 	 */
2285 	len = wr->wr.rc.rcwr.bind->bind_len;
2286 	if (len != 0) {
2287 		vaddr = wr->wr.rc.rcwr.bind->bind_va;
2288 		reg_start_addr = mr->mr_bindinfo.bi_addr;
2289 		reg_end_addr   = mr->mr_bindinfo.bi_addr +
2290 		    (mr->mr_bindinfo.bi_len - 1);
2291 		if ((vaddr < reg_start_addr) || (vaddr > reg_end_addr)) {
2292 			mutex_exit(&mr->mr_lock);
2293 			mutex_exit(&mw->mr_lock);
2294 			TNF_PROBE_0(tavor_wr_bind_check_inv_vaddr_fail,
2295 			    TAVOR_TNF_ERROR, "");
2296 			TAVOR_TNF_EXIT(tavor_wr_bind_check);
2297 			return (IBT_MR_VA_INVALID);
2298 		}
2299 		vaddr = (vaddr + len) - 1;
2300 		if (vaddr > reg_end_addr) {
2301 			mutex_exit(&mr->mr_lock);
2302 			mutex_exit(&mw->mr_lock);
2303 			TNF_PROBE_0(tavor_wr_bind_check_invlen_fail,
2304 			    TAVOR_TNF_ERROR, "");
2305 			TAVOR_TNF_EXIT(tavor_wr_bind_check);
2306 			return (IBT_MR_LEN_INVALID);
2307 		}
2308 	}
2309 
2310 	/*
2311 	 * Validate the bind access flags.  Remote Write and Atomic access for
2312 	 * the Memory Window require that Local Write access be set in the
2313 	 * corresponding Memory Region.
2314 	 */
2315 	bind_flags = wr->wr.rc.rcwr.bind->bind_flags;
2316 	if (((bind_flags & IBT_WR_BIND_WRITE) ||
2317 	    (bind_flags & IBT_WR_BIND_ATOMIC)) &&
2318 	    !(mr->mr_accflag & IBT_MR_LOCAL_WRITE)) {
2319 		mutex_exit(&mr->mr_lock);
2320 		mutex_exit(&mw->mr_lock);
2321 		TNF_PROBE_0(tavor_wr_bind_check_invflags_fail,
2322 		    TAVOR_TNF_ERROR, "");
2323 		TAVOR_TNF_EXIT(tavor_wr_bind_check);
2324 		return (IBT_MR_ACCESS_REQ_INVALID);
2325 	}
2326 
2327 	/* Calculate the new RKey for the Memory Window */
2328 	mpt = mw->mr_mptrsrcp;
2329 	tavor_mr_keycalc(state, mpt->tr_indx, &new_rkey);
2330 
2331 	wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
2332 	mw->mr_rkey = new_rkey;
2333 
2334 	mutex_exit(&mr->mr_lock);
2335 	mutex_exit(&mw->mr_lock);
2336 	TAVOR_TNF_EXIT(tavor_wr_bind_check);
2337 	return (DDI_SUCCESS);
2338 }
2339 
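/*
 * Illustrative sketch (not part of the driver): the address range
 * check at the heart of tavor_wr_bind_check() above.  A bind of
 * (vaddr, len) is valid only if it falls entirely within the memory
 * region's registered range, and a zero length (i.e. an unbind
 * operation) skips the range check entirely.  Returns 1 when the
 * requested bind range is acceptable.
 */
static int
example_bind_range_ok(uint64_t vaddr, uint64_t len, uint64_t reg_start,
    uint64_t reg_len)
{
	uint64_t reg_end = reg_start + (reg_len - 1);

	if (len == 0) {
		return (1);	/* unbind: no range check */
	}
	if ((vaddr < reg_start) || (vaddr > reg_end)) {
		return (0);	/* start falls outside the region */
	}
	if (((vaddr + len) - 1) > reg_end) {
		return (0);	/* end falls outside the region */
	}
	return (1);
}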
2340 
2341 /*
2342  * tavor_wrid_from_reset_handling()
2343  *    Context: Can be called from interrupt or base context.
2344  */
2345 int
2346 tavor_wrid_from_reset_handling(tavor_state_t *state, tavor_qphdl_t qp)
2347 {
2348 	tavor_workq_hdr_t	*swq, *rwq;
2349 	tavor_wrid_list_hdr_t	*s_wridlist, *r_wridlist;
2350 	uint_t			create_new_swq = 0, create_new_rwq = 0;
2351 	uint_t			create_wql = 0;
2352 	uint_t			qp_srq_en;
2353 
2354 	TAVOR_TNF_ENTER(tavor_wrid_from_reset_handling);
2355 
2356 	/*
2357 	 * For each of this QP's Work Queues, make sure we have a (properly
2358 	 * initialized) Work Request ID list attached to the relevant
2359 	 * completion queue.  Grab the CQ lock(s) before manipulating the
2360 	 * lists.
2361 	 */
2362 	tavor_wrid_wqhdr_lock_both(qp);
2363 	swq = tavor_wrid_wqhdr_find(qp->qp_sq_cqhdl, qp->qp_qpnum,
2364 	    TAVOR_WR_SEND);
2365 	if (swq == NULL) {
2366 		/* Couldn't find matching work queue header, create it */
2367 		create_new_swq = create_wql = 1;
2368 		swq = tavor_wrid_wqhdr_create(state, qp->qp_sq_cqhdl,
2369 		    qp->qp_qpnum, TAVOR_WR_SEND, create_wql);
2370 		if (swq == NULL) {
2371 			/*
2372 			 * If we couldn't find/allocate space for the workq
2373 			 * header, then drop the lock(s) and return failure.
2374 			 */
2375 			tavor_wrid_wqhdr_unlock_both(qp);
2376 			TNF_PROBE_0(tavor_wrid_from_reset_handling_wqhdr_fail,
2377 			    TAVOR_TNF_ERROR, "");
2378 			TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2379 			return (ibc_get_ci_failure(0));
2380 		}
2381 	}
2382 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swq))
2383 	qp->qp_sq_wqhdr = swq;
2384 	swq->wq_size = qp->qp_sq_bufsz;
2385 	swq->wq_head = 0;
2386 	swq->wq_tail = 0;
2387 	swq->wq_full = 0;
2388 
2389 	/*
2390 	 * Allocate space for the tavor_wrid_entry_t container
2391 	 */
2392 	s_wridlist = tavor_wrid_get_list(swq->wq_size);
2393 	if (s_wridlist == NULL) {
2394 		/*
2395 		 * If we couldn't allocate space for tracking the WRID
2396 		 * entries, then cleanup the workq header from above (if
2397 		 * necessary, i.e. if we created the workq header).  Then
2398 		 * drop the lock(s) and return failure.
2399 		 */
2400 		if (create_new_swq) {
2401 			tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
2402 		}
2403 
2404 		tavor_wrid_wqhdr_unlock_both(qp);
2405 		TNF_PROBE_0(tavor_wrid_from_reset_handling_wridlist_fail,
2406 		    TAVOR_TNF_ERROR, "");
2407 		TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2408 		return (ibc_get_ci_failure(0));
2409 	}
2410 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*s_wridlist))
2411 	s_wridlist->wl_wqhdr = swq;
2412 
2413 	/* Chain the new WRID list container to the workq hdr list */
2414 	mutex_enter(&swq->wq_wrid_wql->wql_lock);
2415 	tavor_wrid_wqhdr_add(swq, s_wridlist);
2416 	mutex_exit(&swq->wq_wrid_wql->wql_lock);
2417 
2418 	qp_srq_en = qp->qp_srq_en;
2419 
2420 #ifdef __lock_lint
2421 	mutex_enter(&qp->qp_srqhdl->srq_lock);
2422 #else
2423 	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2424 		mutex_enter(&qp->qp_srqhdl->srq_lock);
2425 	}
2426 #endif
2427 	/*
2428 	 * Now we repeat all the above operations for the receive work queue,
2429 	 * or shared receive work queue.
2430 	 *
2431 	 * Note: We still use the 'qp_rq_cqhdl' even in the SRQ case.
2432 	 */
2433 	rwq = tavor_wrid_wqhdr_find(qp->qp_rq_cqhdl, qp->qp_qpnum,
2434 	    TAVOR_WR_RECV);
2435 	if (rwq == NULL) {
2436 		create_new_rwq = create_wql = 1;
2437 
2438 		/*
2439 		 * If this QP is associated with an SRQ, and this isn't the
2440 		 * first QP on the SRQ, then the 'srq_wrid_wql' will already be
2441 		 * created.  Since the WQL is created at 'wqhdr_create' time, we
2442 		 * pass in the 'create_wql' flag here as 0 if we have
2443 		 * already created it.  Later on below, we set up the WQL and
2444 		 * rwq information based on the existing SRQ info.
2445 		 */
2446 		if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2447 		    qp->qp_srqhdl->srq_wrid_wql != NULL) {
2448 			create_wql = 0;
2449 		}
2450 
2451 		rwq = tavor_wrid_wqhdr_create(state, qp->qp_rq_cqhdl,
2452 		    qp->qp_qpnum, TAVOR_WR_RECV, create_wql);
2453 		if (rwq == NULL) {
2454 			/*
2455 			 * If we couldn't find/allocate space for the workq
2456 			 * header, then free all the send queue resources we
2457 			 * just allocated and setup (above), drop the lock(s)
2458 			 * and return failure.
2459 			 */
2460 			mutex_enter(&swq->wq_wrid_wql->wql_lock);
2461 			tavor_wrid_wqhdr_remove(swq, s_wridlist);
2462 			mutex_exit(&swq->wq_wrid_wql->wql_lock);
2463 			if (create_new_swq) {
2464 				tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl,
2465 				    swq);
2466 			}
2467 
2468 #ifdef __lock_lint
2469 			mutex_exit(&qp->qp_srqhdl->srq_lock);
2470 #else
2471 			if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2472 				mutex_exit(&qp->qp_srqhdl->srq_lock);
2473 			}
2474 #endif
2475 
2476 			tavor_wrid_wqhdr_unlock_both(qp);
2477 			TNF_PROBE_0(tavor_wrid_from_reset_handling_wqhdr_fail,
2478 			    TAVOR_TNF_ERROR, "");
2479 			TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2480 			return (ibc_get_ci_failure(0));
2481 		}
2482 	}
2483 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*rwq))
2484 
2485 	/*
2486 	 * Setup receive workq hdr
2487 	 *
2488 	 * If the QP is on an SRQ, we set up the SRQ-specific fields:
2489 	 * keeping a copy of the rwq pointer, setting the rwq bufsize
2490 	 * appropriately, and initializing our part of the WQLock.
2491 	 *
2492 	 * In the normal QP case, the QP recv queue bufsize is used.
2493 	 */
2494 	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2495 		rwq->wq_size = qp->qp_srqhdl->srq_wq_bufsz;
2496 		if (qp->qp_srqhdl->srq_wrid_wql == NULL) {
2497 			qp->qp_srqhdl->srq_wrid_wql = rwq->wq_wrid_wql;
2498 		} else {
2499 			rwq->wq_wrid_wql = qp->qp_srqhdl->srq_wrid_wql;
2500 		}
2501 		tavor_wql_refcnt_inc(qp->qp_srqhdl->srq_wrid_wql);
2502 
2503 	} else {
2504 		rwq->wq_size = qp->qp_rq_bufsz;
2505 	}
2506 
2507 	qp->qp_rq_wqhdr = rwq;
2508 	rwq->wq_head = 0;
2509 	rwq->wq_tail = 0;
2510 	rwq->wq_full = 0;
2511 
2512 	/*
2513 	 * Allocate space for the tavor_wrid_entry_t container.
2514 	 *
2515 	 * If the QP is on an SRQ, and the srq_wridlist is NULL, then we must
2516 	 * allocate the wridlist normally.  However, if the srq_wridlist is
2517 	 * non-NULL, then we know this SRQ (and hence its wridlist) has
2518 	 * already been initialized.  So we re-use the
2519 	 * srq_wridlist as the r_wridlist for this QP in this case.
2520 	 */
2521 	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2522 	    qp->qp_srqhdl->srq_wridlist != NULL) {
2523 		/* Use existing srq_wridlist pointer */
2524 		r_wridlist = qp->qp_srqhdl->srq_wridlist;
2525 		ASSERT(r_wridlist != NULL);
2526 	} else {
2527 		/* Allocate memory for the r_wridlist */
2528 		r_wridlist = tavor_wrid_get_list(rwq->wq_size);
2529 	}
2530 
2531 	/*
2532 	 * If the memory allocation failed for r_wridlist (or the SRQ pointer
2533 	 * is mistakenly NULL), we clean up our previous swq allocation from
2534 	 * above.
2535 	 */
2536 	if (r_wridlist == NULL) {
2537 		/*
2538 		 * If we couldn't allocate space for tracking the WRID
2539 		 * entries, then cleanup all the stuff from above.  Then
2540 		 * drop the lock(s) and return failure.
2541 		 */
2542 		mutex_enter(&swq->wq_wrid_wql->wql_lock);
2543 		tavor_wrid_wqhdr_remove(swq, s_wridlist);
2544 		mutex_exit(&swq->wq_wrid_wql->wql_lock);
2545 		if (create_new_swq) {
2546 			tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
2547 		}
2548 		if (create_new_rwq) {
2549 			tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, rwq);
2550 		}
2551 
2552 #ifdef __lock_lint
2553 		mutex_exit(&qp->qp_srqhdl->srq_lock);
2554 #else
2555 		if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2556 			mutex_exit(&qp->qp_srqhdl->srq_lock);
2557 		}
2558 #endif
2559 
2560 		tavor_wrid_wqhdr_unlock_both(qp);
2561 		TNF_PROBE_0(tavor_wrid_from_reset_handling_wridlist_fail,
2562 		    TAVOR_TNF_ERROR, "");
2563 		TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2564 		return (ibc_get_ci_failure(0));
2565 	}
2566 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*r_wridlist))
2567 
2568 	/*
2569 	 * Initialize the wridlist
2570 	 *
2571 	 * In the normal QP case, there is no special initialization needed.
2572 	 * We simply setup the wridlist backpointer to be the receive wqhdr
2573 	 * (rwq).
2574 	 *
2575 	 * But in the SRQ case, no backpointer to a single wqhdr is possible.
2576 	 * Instead we set 'wl_srq_en', specifying this wridlist is on an SRQ
2577 	 * and thus potentially shared across multiple QPs using the SRQ.  We
2578 	 * also set up the srq_wridlist pointer to be the r_wridlist, and
2579 	 * initialize the freelist to an invalid index.  This srq_wridlist
2580 	 * pointer is used above on future moves from_reset to let us know that
2581 	 * the srq_wridlist has already been initialized.
2582 	 *
2583 	 * And finally, if we are in a non-UMAP case, we set up the srq wrid
2584 	 * free list.
2585 	 */
2586 	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2587 	    qp->qp_srqhdl->srq_wridlist == NULL) {
2588 		r_wridlist->wl_srq_en = 1;
2589 		r_wridlist->wl_free_list_indx = -1;
2590 		qp->qp_srqhdl->srq_wridlist = r_wridlist;
2591 
2592 		/* Initialize srq wrid free list */
2593 		if (qp->qp_srqhdl->srq_is_umap == 0) {
2594 			mutex_enter(&rwq->wq_wrid_wql->wql_lock);
2595 			tavor_wrid_list_srq_init(r_wridlist, qp->qp_srqhdl, 0);
2596 			mutex_exit(&rwq->wq_wrid_wql->wql_lock);
2597 		}
2598 	} else {
2599 		r_wridlist->wl_wqhdr = rwq;
2600 	}
2601 
2602 	/* Chain the WRID list "container" to the workq hdr list */
2603 	mutex_enter(&rwq->wq_wrid_wql->wql_lock);
2604 	tavor_wrid_wqhdr_add(rwq, r_wridlist);
2605 	mutex_exit(&rwq->wq_wrid_wql->wql_lock);
2606 
2607 #ifdef __lock_lint
2608 	mutex_exit(&qp->qp_srqhdl->srq_lock);
2609 #else
2610 	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2611 		mutex_exit(&qp->qp_srqhdl->srq_lock);
2612 	}
2613 #endif
2614 
2615 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*r_wridlist))
2616 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*rwq))
2617 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*s_wridlist))
2618 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*swq))
2619 
2620 	tavor_wrid_wqhdr_unlock_both(qp);
2621 	TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2622 	return (DDI_SUCCESS);
2623 }
2624 
2625 
2626 /*
2627  * tavor_wrid_to_reset_handling()
2628  *    Context: Can be called from interrupt or base context.
2629  */
2630 void
2631 tavor_wrid_to_reset_handling(tavor_state_t *state, tavor_qphdl_t qp)
2632 {
2633 	uint_t		free_wqhdr = 0;
2634 
2635 	TAVOR_TNF_ENTER(tavor_wrid_to_reset_handling);
2636 
2637 	/*
2638 	 * For each of this QP's Work Queues, move the WRID "container" to
2639 	 * the "reapable" list.  Although there may still be unpolled
2640 	 * entries in these containers, it is not a big deal.  We will not
2641 	 * reap the list until either the Poll CQ command detects an empty
2642 	 * condition or the CQ itself is freed.  Grab the CQ lock(s) before
2643 	 * manipulating the lists.
2644 	 */
2645 	mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
2646 	tavor_wrid_wqhdr_lock_both(qp);
2647 	tavor_wrid_reaplist_add(qp->qp_sq_cqhdl, qp->qp_sq_wqhdr);
2648 
2649 	/*
2650 	 * Add the receive work queue header on to the reaplist.  But if we are
2651 	 * on an SRQ, then don't add anything to the reaplist.  Instead we flush
2652 	 * the SRQ entries on the CQ, remove wridlist from WQHDR, and free the
2653 	 * WQHDR (if needed).  We must hold the WQL for these operations, yet
2654 	 * the call to tavor_cq_wqhdr_remove grabs the WQL internally.  So we
2655 	 * drop WQL before that call.  Then release the CQ WQHDR locks and the
2656 	 * CQ lock and return.
2657 	 */
2658 	if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2659 
2660 		/*
2661 		 * Pull off all (if any) entries for this QP from the CQ.  This
2662 		 * only includes entries that have not yet been polled.
2663 		 */
2664 		mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
2665 		tavor_cq_srq_entries_flush(state, qp);
2666 
2667 		/* Remove wridlist from WQHDR */
2668 		tavor_wrid_wqhdr_remove(qp->qp_rq_wqhdr,
2669 		    qp->qp_rq_wqhdr->wq_wrid_post);
2670 
2671 		/* If wridlist chain is now empty, remove the wqhdr as well */
2672 		if (qp->qp_rq_wqhdr->wq_wrid_post == NULL) {
2673 			free_wqhdr = 1;
2674 		} else {
2675 			free_wqhdr = 0;
2676 		}
2677 
2678 		mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
2679 
2680 		/* Free the WQHDR */
2681 		if (free_wqhdr) {
2682 			tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
2683 		}
2684 	} else {
2685 		tavor_wrid_reaplist_add(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
2686 	}
2687 	tavor_wrid_wqhdr_unlock_both(qp);
2688 	mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
2689 
2690 	TAVOR_TNF_EXIT(tavor_wrid_to_reset_handling);
2691 }
2692 
2693 
2694 /*
2695  * tavor_wrid_add_entry()
2696  *    Context: Can be called from interrupt or base context.
2697  */
2698 void
2699 tavor_wrid_add_entry(tavor_workq_hdr_t *wq, uint64_t wrid, uint32_t wqeaddrsz,
2700     uint_t signaled_dbd)
2701 {
2702 	tavor_wrid_entry_t	*wre_tmp;
2703 	uint32_t		head, tail, size;
2704 
2705 	TAVOR_TNF_ENTER(tavor_wrid_add_entry);
2706 
2707 	ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock));
2708 
2709 	/*
2710 	 * Find the entry in the container pointed to by the "tail" index.
2711 	 * Add all of the relevant information to that entry, including WRID,
2712 	 * "wqeaddrsz" parameter, and whether it was signaled/unsignaled
2713 	 * and/or doorbelled.
2714 	 */
2715 	head = wq->wq_wrid_post->wl_head;
2716 	tail = wq->wq_wrid_post->wl_tail;
2717 	size = wq->wq_wrid_post->wl_size;
2718 	wre_tmp = &wq->wq_wrid_post->wl_wre[tail];
2719 	wre_tmp->wr_wrid	  = wrid;
2720 	wre_tmp->wr_wqeaddrsz	  = wqeaddrsz;
2721 	wre_tmp->wr_signaled_dbd  = signaled_dbd;
2722 
2723 	/*
2724 	 * Update the "wrid_old_tail" pointer to point to the entry we just
2725 	 * inserted into the queue.  By tracking this pointer (the pointer to
2726 	 * the most recently inserted entry) it will be possible later in the
2727 	 * PostSend() and PostRecv() code paths to find the entry that needs
2728 	 * its "doorbelled" flag set (see comment in tavor_post_recv() and/or
2729 	 * tavor_post_send()).
2730 	 */
2731 	wq->wq_wrid_post->wl_wre_old_tail = wre_tmp;
2732 
2733 	/* Update the tail index */
2734 	tail = ((tail + 1) & (size - 1));
2735 	wq->wq_wrid_post->wl_tail = tail;
2736 
2737 	/*
2738 	 * If the "tail" index has just wrapped over into the "head" index,
2739 	 * then we have filled the container.  We use the "full" flag to
2740 	 * indicate this condition and to distinguish it from the "empty"
2741 	 * condition (where head and tail are also equal).
2742 	 */
2743 	if (head == tail) {
2744 		wq->wq_wrid_post->wl_full = 1;
2745 	}
2746 	TAVOR_TNF_EXIT(tavor_wrid_add_entry);
2747 }
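
/*
 * Illustrative sketch (not part of the driver): the power-of-two ring
 * arithmetic used by tavor_wrid_add_entry() above.  Since
 * "head == tail" alone is ambiguous (it holds both when the ring is
 * empty and when it is full), a separate "full" flag disambiguates;
 * it is set when advancing the tail lands back on the head.
 */
static uint32_t
example_ring_advance_tail(uint32_t head, uint32_t tail, uint32_t size,
    int *full)
{
	/* "size" must be a power of two for the mask to work */
	tail = ((tail + 1) & (size - 1));
	*full = (head == tail);
	return (tail);
}
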
2748 
2749 /*
2750  * tavor_wrid_add_entry_srq()
2751  * Context: Can be called from interrupt or base context
2752  */
2753 void
2754 tavor_wrid_add_entry_srq(tavor_srqhdl_t srq, uint64_t wrid, uint_t signaled_dbd)
2755 {
2756 	tavor_wrid_entry_t	*wre;
2757 	uint64_t		*wl_wqe;
2758 	uint32_t		wqe_index;
2759 
2760 	TAVOR_TNF_ENTER(tavor_wrid_add_entry_srq);
2761 
2762 	/*
2763 	 * Find the next available WQE from the SRQ free_list.  Then update the
2764 	 * free_list to point to the next entry.
2765 	 */
2766 	wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wridlist->wl_free_list_indx);
2767 
2768 	wqe_index = srq->srq_wridlist->wl_free_list_indx;
2769 
2770 	/* ASSERT on impossible wqe_index values */
2771 	ASSERT(wqe_index < srq->srq_wq_bufsz);
2772 
2773 	/*
2774 	 * Setup the WRE.
2775 	 *
2776 	 * Given the 'wqe_index' value, we store the WRID at this WRE offset.
2777 	 * And we set the WRE to be signaled_dbd so that at poll CQ time we can
2778 	 * find this information and associate the WRID with the WQE on the CQE.
2779 	 */
2780 	wre = &srq->srq_wridlist->wl_wre[wqe_index];
2781 	wre->wr_wrid = wrid;
2782 	wre->wr_signaled_dbd  = signaled_dbd;
2783 
2784 	/* Update the free list index */
2785 	srq->srq_wridlist->wl_free_list_indx = ddi_get32(
2786 	    srq->srq_wridlist->wl_acchdl, (uint32_t *)wl_wqe);
2787 
2788 	TAVOR_TNF_EXIT(tavor_wrid_add_entry_srq);
2789 }
2790 
2791 
2792 /*
2793  * tavor_wrid_get_entry()
2794  *    Context: Can be called from interrupt or base context.
2795  */
2796 uint64_t
2797 tavor_wrid_get_entry(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
2798     tavor_wrid_entry_t *wre)
2799 {
2800 	tavor_workq_hdr_t	*wq;
2801 	tavor_wrid_entry_t	*wre_tmp;
2802 	uint64_t		wrid;
2803 	uint_t			send_or_recv, qpnum, error, opcode;
2804 
2805 	TAVOR_TNF_ENTER(tavor_wrid_get_entry);
2806 
2807 	/* Lock the list of work queues associated with this CQ */
2808 	mutex_enter(&cq->cq_wrid_wqhdr_lock);
2809 
2810 	/*
2811 	 * Determine whether this CQE is a send or receive completion (and
2812 	 * whether it was a "successful" completion or not)
2813 	 */
2814 	opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
2815 	if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
2816 	    (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
2817 		error = 1;
2818 		send_or_recv = (opcode == TAVOR_CQE_SEND_ERR_OPCODE) ?
2819 		    TAVOR_COMPLETION_SEND : TAVOR_COMPLETION_RECV;
2820 	} else {
2821 		error = 0;
2822 		send_or_recv = TAVOR_CQE_SENDRECV_GET(cq, cqe);
2823 	}
2824 
2825 	/* Find the work queue for this QP number (send or receive side) */
2826 	qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
2827 	wq = tavor_wrid_wqhdr_find(cq, qpnum, send_or_recv);
2828 	ASSERT(wq != NULL);
2829 
2830 	/*
2831 	 * Regardless of whether the completion is the result of a "success"
2832 	 * or a "failure", we lock the list of "containers" and attempt to
2833 	 * search for the first matching completion (i.e. the first WR
2834 	 * with a matching WQE addr and size).  Once we find it, we pull out
2835 	 * the "wrid" field and return it (see below).  Note: One possible
2836 	 * future enhancement would be to enable this routine to skip over
2837 	 * any "unsignaled" completions to go directly to the next "signaled"
2838 	 * entry on success. XXX
2839 	 */
2840 	mutex_enter(&wq->wq_wrid_wql->wql_lock);
2841 	wre_tmp = tavor_wrid_find_match(wq, cq, cqe);
2842 
2843 	/*
2844 	 * If this is a "successful" completion, then we assert that this
2845 	 * completion must be a "signaled" completion.
2846 	 */
2847 	ASSERT(error || (wre_tmp->wr_signaled_dbd & TAVOR_WRID_ENTRY_SIGNALED));
2848 
2849 	/*
2850 	 * If the completion is a "failed" completion, then we save away the
2851 	 * contents of the entry (into the "wre" field passed in) for use
2852 	 * in later CQE processing. Note: We use the tavor_wrid_get_wqeaddrsz()
2853 	 * function to grab "wqeaddrsz" from the next entry in the container.
2854 	 * This is required for error processing (where updating these fields
2855 	 * properly is necessary for correct handling of the "error" CQE).
2856 	 */
2857 	if (error && (wre != NULL)) {
2858 		*wre = *wre_tmp;
2859 		wre->wr_wqeaddrsz = tavor_wrid_get_wqeaddrsz(wq);
2860 	}
2861 
2862 	/* Pull out the WRID and return it */
2863 	wrid = wre_tmp->wr_wrid;
2864 
2865 	mutex_exit(&wq->wq_wrid_wql->wql_lock);
2866 	mutex_exit(&cq->cq_wrid_wqhdr_lock);
2867 
2868 	TAVOR_TNF_EXIT(tavor_wrid_get_entry);
2869 	return (wrid);
2870 }
2871 
2872 
2873 /*
2874  * tavor_wrid_find_match()
2875  *    Context: Can be called from interrupt or base context.
2876  */
2877 static tavor_wrid_entry_t *
2878 tavor_wrid_find_match(tavor_workq_hdr_t *wq, tavor_cqhdl_t cq,
2879     tavor_hw_cqe_t *cqe)
2880 {
2881 	tavor_wrid_entry_t	*curr = NULL;
2882 	tavor_wrid_list_hdr_t	*container;
2883 	uint32_t		wqeaddr_size;
2884 	uint32_t		head, tail, size;
2885 	int			found = 0, last_container;
2886 
2887 	TAVOR_TNF_ENTER(tavor_wrid_find_match);
2888 
2889 	ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock));
2890 
2891 	/* Pull the "wqeaddrsz" information from the CQE */
2892 	wqeaddr_size = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe);
2893 
2894 	/*
2895 	 * Walk the "containers" list(s) and find the first WR with a matching
2896 	 * WQE addr.  If the current "container" is not the last one on the list,
2897 	 * i.e. not the current one to which we are posting new WRID entries,
2898 	 * then we do not attempt to update the "q_head", "q_tail", and
2899 	 * "q_full" indicators on the main work queue header.  We do, however,
2900 	 * update the "head" and "full" indicators on the individual containers
2901 	 * as we go.  This is imperative because we need to be able to
2902 	 * determine when the current container has been emptied (so that we
2903 	 * can move on to the next container).
2904 	 */
2905 	container = wq->wq_wrid_poll;
2906 	while (container != NULL) {
2907 		/* Is this the last/only "container" on the list */
2908 		last_container = (container != wq->wq_wrid_post) ? 0 : 1;
2909 
2910 		/*
2911 		 * First check if we are on an SRQ.  If so, we grab the entry
2912 		 * and break out.  Since SRQ wridlists are never added to the
2913 		 * reaplist, they can only be the last container.
2914 		 */
2915 		if (container->wl_srq_en) {
2916 			ASSERT(last_container == 1);
2917 			curr = tavor_wrid_find_match_srq(container, cq, cqe);
2918 			break;
2919 		}
2920 
2921 		/*
2922 		 * Grab the current "head", "tail" and "size" fields before
2923 		 * walking the list in the current container. Note: the "size"
2924 		 * field here must always be a power-of-2.  The "full"
2925 		 * parameter is checked (and updated) here to distinguish the
2926 		 * "queue full" condition from "queue empty".
2927 		 */
2928 		head = container->wl_head;
2929 		tail = container->wl_tail;
2930 		size = container->wl_size;
2931 		while ((head != tail) || (container->wl_full)) {
2932 			container->wl_full = 0;
2933 			curr = &container->wl_wre[head];
2934 			head = ((head + 1) & (size - 1));
2935 
2936 			/*
2937 			 * If the current entry's "wqeaddrsz" matches the one
2938 			 * we're searching for, then this must correspond to
2939 			 * the work request that caused the completion.  Set
2940 			 * the "found" flag and bail out.
2941 			 */
2942 			if (curr->wr_wqeaddrsz == wqeaddr_size) {
2943 				found = 1;
2944 				break;
2945 			}
2946 		}
2947 
2948 		/*
2949 		 * If the current container is empty (having reached here the
2950 		 * "head == tail" condition can only mean that the container
2951 		 * is empty), then NULL out the "wrid_old_tail" field (see
2952 		 * tavor_post_send() and tavor_post_recv() for more details)
2953 		 * and (potentially) remove the current container from future
2954 		 * searches.
2955 		 */
2956 		if (head == tail) {
2957 
2958 			container->wl_wre_old_tail = NULL;
2959 			/*
2960 			 * If this wasn't the last "container" on the chain,
2961 			 * i.e. the one to which new WRID entries will be
2962 			 * added, then remove it from the list.
2963 			 * Note: we don't "lose" the memory pointed to by this
2964 			 * because we should have already put this container
2965 			 * on the "reapable" list (from where it will later be
2966 			 * pulled).
2967 			 */
2968 			if (!last_container) {
2969 				wq->wq_wrid_poll = container->wl_next;
2970 			}
2971 		}
2972 
2973 		/* Update the head index for the container */
2974 		container->wl_head = head;
2975 
2976 		/*
2977 		 * If the entry was found in this container, then bail out.  Else
2978 		 * reset the "curr" pointer and move on to the
2979 		 * next container (if there is one).  Note: the only real
2980 		 * reason for setting "curr = NULL" here is so that the ASSERT
2981 		 * below can catch the case where no matching entry was found
2982 		 * on any of the lists.
2983 		 */
2984 		if (found) {
2985 			break;
2986 		} else {
2987 			curr = NULL;
2988 			container = container->wl_next;
2989 		}
2990 	}
2991 
2992 	/*
2993 	 * Update work queue header's "head" and "full" conditions to match
2994 	 * the last entry on the container list.  (Note: Only if we're pulling
2995 	 * entries from the last work queue portion of the list, i.e. not from
2996 	 * the previous portions that may be the "reapable" list.)
2997 	 */
2998 	if (last_container) {
2999 		wq->wq_head = wq->wq_wrid_post->wl_head;
3000 		wq->wq_full = wq->wq_wrid_post->wl_full;
3001 	}
3002 
3003 	/* Ensure that we've actually found what we were searching for */
3004 	ASSERT(curr != NULL);
3005 
3006 	TAVOR_TNF_EXIT(tavor_wrid_find_match);
3007 	return (curr);
3008 }
3009 
3010 
3011 /*
3012  * tavor_wrid_find_match_srq()
3013  *    Context: Can be called from interrupt or base context.
3014  */
3015 tavor_wrid_entry_t *
3016 tavor_wrid_find_match_srq(tavor_wrid_list_hdr_t *wl, tavor_cqhdl_t cq,
3017     tavor_hw_cqe_t *cqe)
3018 {
3019 	tavor_wrid_entry_t	*wre;
3020 	uint64_t		*wl_wqe;
3021 	uint32_t		wqe_index;
3022 	uint64_t		wqe_addr;
3023 	uint32_t		cqe_wqe_addr;
3024 
3025 	/* Grab the WQE addr out of the CQE */
3026 	cqe_wqe_addr = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe) & 0xFFFFFFC0;
3027 
3028 	/*
3029 	 * Use the WQE addr as the lower 32 bits, adding back on the
3030 	 * 'wl_srq_desc_off' because we have a zero-based queue.  Then OR'ing
3031 	 * on the upper 32 bits of 'wl_srq_wq_buf' gives us the WQE addr in
3032 	 * the SRQ Work Queue itself.  We use this address as the index to find
3033 	 * out which Work Queue Entry this CQE corresponds with.
3034 	 *
3035 	 * We also use this address below to add the WQE back on to the free
3036 	 * list.
3037 	 */
3038 	wqe_addr = ((uintptr_t)wl->wl_srq_wq_buf & 0xFFFFFFFF00000000ull) |
3039 	    (cqe_wqe_addr + wl->wl_srq_desc_off);
3040 
3041 	/*
3042 	 * Given the 'wqe_addr' just calculated and the srq buf address, we
3043 	 * find the 'wqe_index'.  The 'wre' returned below contains the WRID
3044 	 * that we are looking for.  This indexes into the wre_list for this
3045 	 * specific WQE.
3046 	 */
3047 	wqe_index = TAVOR_SRQ_WQE_INDEX(wl->wl_srq_wq_buf, wqe_addr,
3048 	    wl->wl_srq_log_wqesz);
3049 
3050 	/* ASSERT on impossible wqe_index values */
3051 	ASSERT(wqe_index < wl->wl_srq_wq_bufsz);
3052 
3053 	/* Get the pointer to this WQE */
3054 	wl_wqe = (uint64_t *)(uintptr_t)wqe_addr;
3055 
3056 	/* Put this WQE index back on the free list */
3057 	ddi_put32(wl->wl_acchdl, (uint32_t *)wl_wqe, wl->wl_free_list_indx);
3058 	wl->wl_free_list_indx = wqe_index;
3059 
3060 	/* Using the index, return the Work Request ID Entry (wre) */
3061 	wre = &wl->wl_wre[wqe_index];
3062 
3063 	return (wre);
3064 }
3065 
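/*
 * Illustrative sketch (not part of the driver): the 64-bit WQE
 * address reconstruction performed by tavor_wrid_find_match_srq()
 * above.  The CQE supplies only a 32-bit, 64-byte-aligned WQE
 * address; adding back the queue's descriptor offset and OR'ing in
 * the upper 32 bits of the kernel virtual address of the SRQ work
 * queue buffer recovers a pointer into the work queue itself.
 */
static uint64_t
example_srq_wqe_addr(uint64_t wq_buf_kva, uint32_t cqe_wqeaddrsz,
    uint32_t desc_off)
{
	/* Mask off the size bits, leaving the 64-byte-aligned address */
	uint32_t cqe_wqe_addr = cqe_wqeaddrsz & 0xFFFFFFC0;

	return ((wq_buf_kva & 0xFFFFFFFF00000000ull) |
	    (uint64_t)(cqe_wqe_addr + desc_off));
}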
3066 
3067 /*
3068  * tavor_wrid_cq_reap()
3069  *    Context: Can be called from interrupt or base context.
3070  */
3071 void
3072 tavor_wrid_cq_reap(tavor_cqhdl_t cq)
3073 {
3074 	tavor_workq_hdr_t	*consume_wqhdr;
3075 	tavor_wrid_list_hdr_t	*container, *to_free;
3076 
3077 	ASSERT(MUTEX_HELD(&cq->cq_lock));
3078 
3079 	TAVOR_TNF_ENTER(tavor_wrid_cq_reap);
3080 
3081 	/* Lock the list of work queues associated with this CQ */
3082 	mutex_enter(&cq->cq_wrid_wqhdr_lock);
3083 
3084 	/* Walk the "reapable" list and free up containers */
3085 	container = cq->cq_wrid_reap_head;
3086 	while (container != NULL) {
3087 		to_free	  = container;
3088 		container = container->wl_reap_next;
3089 		/*
3090 		 * If reaping the WRID list containers pulls the last
3091 		 * container from the given work queue header, then we free
3092 		 * the work queue header as well.
3093 		 */
3094 		consume_wqhdr = tavor_wrid_list_reap(to_free);
3095 		if (consume_wqhdr != NULL) {
3096 			tavor_cq_wqhdr_remove(cq, consume_wqhdr);
3097 		}
3098 	}
3099 
3100 	/* Once finished reaping, we reset the CQ's reap list */
3101 	cq->cq_wrid_reap_head = cq->cq_wrid_reap_tail = NULL;
3102 
3103 	mutex_exit(&cq->cq_wrid_wqhdr_lock);
3104 	TAVOR_TNF_EXIT(tavor_wrid_cq_reap);
3105 }
3106 
3107 
3108 /*
3109  * tavor_wrid_cq_force_reap()
3110  *    Context: Can be called from interrupt or base context.
3111  */
3112 void
3113 tavor_wrid_cq_force_reap(tavor_cqhdl_t cq)
3114 {
3115 	tavor_workq_hdr_t	*curr;
3116 	tavor_wrid_list_hdr_t	*container, *to_free;
3117 	avl_tree_t		*treep;
3118 	void			*cookie = NULL;
3119 
3120 	ASSERT(MUTEX_HELD(&cq->cq_lock));
3121 
3122 	TAVOR_TNF_ENTER(tavor_wrid_cq_force_reap);
3123 
3124 	/*
3125 	 * The first step is to walk the "reapable" list and free up those
3126 	 * containers.  This is necessary because the containers on the
3127 	 * reapable list are not otherwise connected to the work queue headers
3128 	 * anymore.
3129 	 */
3130 	tavor_wrid_cq_reap(cq);
3131 
3132 	/* Now lock the list of work queues associated with this CQ */
3133 	mutex_enter(&cq->cq_wrid_wqhdr_lock);
3134 
3135 	/*
3136 	 * Walk the list of work queue headers and free up all the WRID list
3137 	 * containers chained to it.  Note: We don't need to grab the locks
3138 	 * for each of the individual WRID lists here because the only way
3139 	 * things can be added or removed from the list at this point would be
3140 	 * by posting a work request to a QP.  But if we've come this far,
3141 	 * then we can be assured that there are no longer any QPs associated
3142 	 * with the CQ that we are trying to free.
3143 	 */
3144 #ifdef __lock_lint
3145 	tavor_wrid_wqhdr_compare(NULL, NULL);
3146 #endif
3147 	treep = &cq->cq_wrid_wqhdr_avl_tree;
3148 	while ((curr = avl_destroy_nodes(treep, &cookie)) != NULL) {
3149 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*curr))
3150 		container = curr->wq_wrid_poll;
3151 		while (container != NULL) {
3152 			to_free	  = container;
3153 			container = container->wl_next;
3154 			/*
3155 			 * If reaping the WRID list containers pulls the last
3156 			 * container from the given work queue header, then
3157 			 * we free the work queue header as well.  Note: we
3158 			 * ignore the return value because we know that the
3159 			 * work queue header should always be freed once the
3160 			 * list of containers has come to an end.
3161 			 */
3162 			(void) tavor_wrid_list_reap(to_free);
3163 			if (container == NULL) {
3164 				tavor_cq_wqhdr_remove(cq, curr);
3165 			}
3166 		}
3167 	}
3168 	avl_destroy(treep);
3169 
3170 	mutex_exit(&cq->cq_wrid_wqhdr_lock);
3171 	TAVOR_TNF_EXIT(tavor_wrid_cq_force_reap);
3172 }
3173 
3174 
3175 /*
3176  * tavor_wrid_get_list()
3177  *    Context: Can be called from interrupt or base context.
3178  */
3179 tavor_wrid_list_hdr_t *
3180 tavor_wrid_get_list(uint32_t qsize)
3181 {
3182 	tavor_wrid_list_hdr_t	*wridlist;
3183 	uint32_t		size;
3184 
3185 	/*
3186 	 * The WRID list "container" consists of the tavor_wrid_list_hdr_t,
3187 	 * which holds the pointers necessary for maintaining the "reapable"
3188 	 * list, chaining together multiple "containers" old and new, and
3189 	 * tracking the head, tail, size, etc. for each container.
3190 	 *
3191 	 * The "container" also holds all the tavor_wrid_entry_t's, which are
3192 	 * allocated separately, one for each entry on the corresponding work
3193 	 * queue.
3194 	 */
3195 	size = sizeof (tavor_wrid_list_hdr_t);
3196 
3197 	/*
3198 	 * Note that this allocation has to be a NOSLEEP operation here
3199 	 * because we are holding the "wqhdr_list_lock" and, therefore,
3200 	 * could get raised to the interrupt level.
3201 	 */
3202 	wridlist = (tavor_wrid_list_hdr_t *)kmem_zalloc(size, KM_NOSLEEP);
3203 	if (wridlist == NULL) {
3204 		return (NULL);
3205 	}
3206 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wridlist))
3207 
3208 	/* Complete the "container" initialization */
3209 	wridlist->wl_size = qsize;
3210 	wridlist->wl_full = 0;
3211 	wridlist->wl_head = 0;
3212 	wridlist->wl_tail = 0;
3213 	wridlist->wl_wre = (tavor_wrid_entry_t *)kmem_zalloc(qsize *
3214 	    sizeof (tavor_wrid_entry_t), KM_NOSLEEP);
3215 	if (wridlist->wl_wre == NULL) {
3216 		kmem_free(wridlist, size);
3217 		return (NULL);
3218 	}
3219 	wridlist->wl_wre_old_tail  = NULL;
3220 	wridlist->wl_reap_next = NULL;
3221 	wridlist->wl_next  = NULL;
3222 	wridlist->wl_prev  = NULL;
3223 	wridlist->wl_srq_en = 0;
3224 
3225 	return (wridlist);
3226 }
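
/*
 * Illustrative sketch (not part of the original driver): because
 * tavor_wrid_get_list() uses KM_NOSLEEP, it can return NULL and every
 * caller must be prepared to unwind.  The calling pattern would look
 * roughly like the following; the error handling shown is an assumption
 * for illustration, not the driver's actual caller.
 */
#if 0
	tavor_wrid_list_hdr_t	*wridlist;

	wridlist = tavor_wrid_get_list(qsize);
	if (wridlist == NULL) {
		/* NOSLEEP allocation failed; report lack of resources */
		return (IBT_INSUFF_RESOURCE);
	}
#endif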
3227 
3228 /*
3229  * tavor_wrid_list_srq_init()
3230  *    Context: Can be called from interrupt or base context.
3231  */
3232 void
3233 tavor_wrid_list_srq_init(tavor_wrid_list_hdr_t *wridlist, tavor_srqhdl_t srq,
3234     uint_t wq_start)
3235 {
3236 	uint64_t *wl_wqe;
3237 	int wqe_index;
3238 
3239 	ASSERT(MUTEX_HELD(&srq->srq_wrid_wql->wql_lock));
3240 
3241 	/* Setup pointers for use later when we are polling the CQ */
3242 	wridlist->wl_srq_wq_buf = srq->srq_wq_buf;
3243 	wridlist->wl_srq_wq_bufsz = srq->srq_wq_bufsz;
3244 	wridlist->wl_srq_log_wqesz = srq->srq_wq_log_wqesz;
3245 	wridlist->wl_srq_desc_off = srq->srq_desc_off;
3246 	wridlist->wl_acchdl = srq->srq_wqinfo.qa_acchdl;
3247 
3248 	/* Verify that 'wq_start', where buffer initialization begins, is sane */
3249 	ASSERT(wq_start < srq->srq_wq_bufsz);
3250 
3251 	/*
3252 	 * Initialize wridlist free list
3253 	 *
3254 	 * For each WQE up to the size of our queue, we store an index in the
3255 	 * WQE memory itself, representing the next available free entry.  The
3256 	 * 'wl_free_list_indx' always holds the index of the next available
3257 	 * free entry in the WQ.  If 'wl_free_list_indx' is -1, then we are
3258 	 * completely full.  This gives us the advantage of being able to have
3259 	 * entries complete or be polled off the WQ out-of-order.
3260 	 *
3261 	 * For now, we write the free_list entries inside the WQ itself.  It
3262 	 * may be useful in the future to store this information in a separate
3263 	 * structure for debugging purposes.
3264 	 */
3265 	for (wqe_index = wq_start; wqe_index < srq->srq_wq_bufsz; wqe_index++) {
3266 		wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, wqe_index);
3267 		ddi_put32(wridlist->wl_acchdl, (uint32_t *)wl_wqe,
3268 		    wridlist->wl_free_list_indx);
3269 		wridlist->wl_free_list_indx = wqe_index;
3270 	}
3271 }
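
/*
 * Illustrative sketch (not part of the original driver): the loop above
 * threads a free list through the WQE memory itself.  Each free WQE's
 * first word holds the index of the next free WQE, and "wl_free_list_indx"
 * names the head of the list (-1 when no entries are free, i.e. the WQ is
 * completely full).  A standalone model of the same scheme over a plain
 * array, with hypothetical names:
 */
#if 0
#define	FREELIST_EMPTY	((uint32_t)-1)

/* Posting: pop the next free WQE index off the embedded free list */
static uint32_t
freelist_pop(uint32_t *wq, uint32_t *head)
{
	uint32_t indx = *head;

	if (indx == FREELIST_EMPTY)
		return (FREELIST_EMPTY);	/* WQ is completely full */
	*head = wq[indx];	/* next-free index was stored in the WQE */
	return (indx);
}

/* Completion: push a WQE index back; any (out-of-order) order is fine */
static void
freelist_push(uint32_t *wq, uint32_t *head, uint32_t indx)
{
	wq[indx] = *head;
	*head = indx;
}
#endif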
3272 
3273 
3274 /*
3275  * tavor_wrid_reaplist_add()
3276  *    Context: Can be called from interrupt or base context.
3277  */
3278 static void
3279 tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq)
3280 {
3281 	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3282 
3283 	TAVOR_TNF_ENTER(tavor_wrid_reaplist_add);
3284 
3285 	mutex_enter(&wq->wq_wrid_wql->wql_lock);
3286 
3287 	/*
3288 	 * Add the "post" container (the last one on the current chain) to
3289 	 * the CQ's "reapable" list
3290 	 */
3291 	if ((cq->cq_wrid_reap_head == NULL) &&
3292 	    (cq->cq_wrid_reap_tail == NULL)) {
3293 		cq->cq_wrid_reap_head = wq->wq_wrid_post;
3294 		cq->cq_wrid_reap_tail = wq->wq_wrid_post;
3295 	} else {
3296 		cq->cq_wrid_reap_tail->wl_reap_next = wq->wq_wrid_post;
3297 		cq->cq_wrid_reap_tail = wq->wq_wrid_post;
3298 	}
3299 
3300 	mutex_exit(&wq->wq_wrid_wql->wql_lock);
3301 }
3302 
3303 
3304 int
3305 tavor_wrid_wqhdr_compare(const void *p1, const void *p2)
3306 {
3307 	tavor_workq_compare_t	*cmpp;
3308 	tavor_workq_hdr_t	*curr;
3309 
3310 	cmpp = (tavor_workq_compare_t *)p1;
3311 	curr = (tavor_workq_hdr_t *)p2;
3312 
3313 	if (cmpp->cmp_qpn < curr->wq_qpn)
3314 		return (-1);
3315 	else if (cmpp->cmp_qpn > curr->wq_qpn)
3316 		return (+1);
3317 	else if (cmpp->cmp_type < curr->wq_type)
3318 		return (-1);
3319 	else if (cmpp->cmp_type > curr->wq_type)
3320 		return (+1);
3321 	else
3322 		return (0);
3323 }
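
/*
 * Illustrative sketch (not part of the original driver): the comparator
 * above implements a two-key ordering (QP number first, then work queue
 * type) and returns only -1, 0, or +1, as the AVL code requires.  A tree
 * using it would be created roughly as below; the "wq_avl_link" linkage
 * field name is an assumption for illustration.
 */
#if 0
	avl_create(&cq->cq_wrid_wqhdr_avl_tree, tavor_wrid_wqhdr_compare,
	    sizeof (tavor_workq_hdr_t),
	    offsetof(tavor_workq_hdr_t, wq_avl_link));
#endif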
3324 
3325 
3326 /*
3327  * tavor_wrid_wqhdr_find()
3328  *    Context: Can be called from interrupt or base context.
3329  */
3330 static tavor_workq_hdr_t *
3331 tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type)
3332 {
3333 	tavor_workq_hdr_t	*curr;
3334 	tavor_workq_compare_t	cmp;
3335 
3336 	TAVOR_TNF_ENTER(tavor_wrid_wqhdr_find);
3337 
3338 	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3339 
3340 	/*
3341 	 * Search the CQ's AVL tree of work queue headers, trying to find a
3342 	 * send or recv queue entry with the same QP number and work queue
3343 	 * type.  If no matching entry exists, avl_find() returns NULL and
3344 	 * the caller may then create and insert a new work queue header.
3345 	 */
3346 	cmp.cmp_qpn = qpn;
3347 	cmp.cmp_type = wq_type;
3348 #ifdef __lock_lint
3349 	tavor_wrid_wqhdr_compare(NULL, NULL);
3350 #endif
3351 	curr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL);
3352 
3353 	TAVOR_TNF_EXIT(tavor_wrid_wqhdr_find);
3354 	return (curr);
3355 }
3356 
3357 
3358 /*
3359  * tavor_wrid_wqhdr_create()
3360  *    Context: Can be called from interrupt or base context.
3361  */
3362 static tavor_workq_hdr_t *
3363 tavor_wrid_wqhdr_create(tavor_state_t *state, tavor_cqhdl_t cq, uint_t qpn,
3364     uint_t wq_type, uint_t create_wql)
3365 {
3366 	tavor_workq_hdr_t	*wqhdr_tmp;
3367 
3368 	TAVOR_TNF_ENTER(tavor_wrid_wqhdr_create);
3369 
3370 	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3371 
3372 	/*
3373 	 * Allocate space for a work queue header structure and initialize it.
3374 	 * Each work queue header structure includes a "wq_wrid_wql"
3375 	 * which needs to be initialized.  Note that this allocation has to be
3376 	 * a NOSLEEP operation because we are holding the "cq_wrid_wqhdr_lock"
3377 	 * and, therefore, could get raised to the interrupt level.
3378 	 */
3379 	wqhdr_tmp = (tavor_workq_hdr_t *)kmem_zalloc(
3380 	    sizeof (tavor_workq_hdr_t), KM_NOSLEEP);
3381 	if (wqhdr_tmp == NULL) {
3382 		TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create);
3383 		return (NULL);
3384 	}
3385 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wqhdr_tmp))
3386 	wqhdr_tmp->wq_qpn	= qpn;
3387 	wqhdr_tmp->wq_type	= wq_type;
3388 
3389 	if (create_wql) {
3390 		wqhdr_tmp->wq_wrid_wql = tavor_wrid_wql_create(state);
3391 		if (wqhdr_tmp->wq_wrid_wql == NULL) {
3392 			kmem_free(wqhdr_tmp, sizeof (tavor_workq_hdr_t));
3393 			TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create);
3394 			return (NULL);
3395 		}
3396 	}
3397 
3398 	wqhdr_tmp->wq_wrid_poll = NULL;
3399 	wqhdr_tmp->wq_wrid_post = NULL;
3400 
3401 	/* Chain the newly allocated work queue header to the CQ's list */
3402 	tavor_cq_wqhdr_add(cq, wqhdr_tmp);
3403 
3404 	TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create);
3405 	return (wqhdr_tmp);
3406 }
3407 
3408 
3409 /*
3410  * tavor_wrid_wql_create()
3411  *    Context: Can be called from interrupt or base context.
3412  */
3413 tavor_wq_lock_t *
3414 tavor_wrid_wql_create(tavor_state_t *state)
3415 {
3416 	tavor_wq_lock_t *wql;
3417 
3418 	TAVOR_TNF_ENTER(tavor_wrid_wql_create);
3419 
3420 	/*
3421 	 * Allocate the WQL and initialize it.
3422 	 */
3423 	wql = kmem_zalloc(sizeof (tavor_wq_lock_t), KM_NOSLEEP);
3424 	if (wql == NULL) {
3425 		TAVOR_TNF_EXIT(tavor_wrid_wql_create);
3426 		return (NULL);
3427 	}
3428 
3429 	mutex_init(&wql->wql_lock, NULL, MUTEX_DRIVER,
3430 	    DDI_INTR_PRI(state->ts_intrmsi_pri));
3431 
3432 	/* Add refcount to WQL */
3433 	tavor_wql_refcnt_inc(wql);
3434 
3435 	TAVOR_TNF_EXIT(tavor_wrid_wql_create);
3436 	return (wql);
3437 }
3438 
3439 
3440 /*
3441  * tavor_wrid_get_wqeaddrsz()
3442  *    Context: Can be called from interrupt or base context.
3443  */
3444 static uint32_t
3445 tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq)
3446 {
3447 	tavor_wrid_entry_t	*wre;
3448 	uint32_t		wqeaddrsz;
3449 	uint32_t		head;
3450 
3451 	/*
3452 	 * If the container is empty, then there is no next entry. So just
3453 	 * return zero.  Note: the "head == tail" condition here can only
3454 	 * mean that the container is empty because we have previously pulled
3455 	 * something from the container.
3456 	 *
3457 	 * If the container is not empty, then find the next entry and return
3458 	 * the contents of its "wqeaddrsz" field.
3459 	 */
3460 	if (wq->wq_wrid_poll->wl_head == wq->wq_wrid_poll->wl_tail) {
3461 		wqeaddrsz = 0;
3462 	} else {
3463 		/*
3464 		 * We don't need to calculate the "next" head pointer here
3465 		 * because "head" should already point to the next entry on
3466 		 * the list (since we just pulled something off - in
3467 		 * tavor_wrid_find_match() - and moved the head index forward.)
3468 		 */
3469 		head = wq->wq_wrid_poll->wl_head;
3470 		wre = &wq->wq_wrid_poll->wl_wre[head];
3471 		wqeaddrsz = wre->wr_wqeaddrsz;
3472 	}
3473 	return (wqeaddrsz);
3474 }
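
/*
 * Illustrative sketch (not part of the original driver): for the
 * "head == tail" test above to mean "empty", the container must be able
 * to tell an empty ring from a full one, since both states have equal
 * indices.  That is the role of the "wl_full" flag initialized in
 * tavor_wrid_get_list().  A minimal model of the disambiguation, using
 * hypothetical names:
 */
#if 0
typedef struct ring {
	uint32_t	r_head;		/* next entry to consume */
	uint32_t	r_tail;		/* next entry to fill */
	int		r_full;		/* disambiguates head == tail */
} ring_t;

static int
ring_is_empty(ring_t *r)
{
	return (r->r_head == r->r_tail && !r->r_full);
}

static int
ring_is_full(ring_t *r)
{
	return (r->r_head == r->r_tail && r->r_full);
}
#endif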
3475 
3476 
3477 /*
3478  * tavor_wrid_wqhdr_add()
3479  *    Context: Can be called from interrupt or base context.
3480  */
3481 static void
3482 tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr,
3483     tavor_wrid_list_hdr_t *wridlist)
3484 {
3485 	ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock));
3486 
3487 	/* Chain the new WRID list "container" to the work queue list */
3488 	if ((wqhdr->wq_wrid_post == NULL) &&
3489 	    (wqhdr->wq_wrid_poll == NULL)) {
3490 		wqhdr->wq_wrid_poll = wridlist;
3491 		wqhdr->wq_wrid_post = wridlist;
3492 	} else {
3493 		wqhdr->wq_wrid_post->wl_next = wridlist;
3494 		wridlist->wl_prev = wqhdr->wq_wrid_post;
3495 		wqhdr->wq_wrid_post = wridlist;
3496 	}
3497 }
3498 
3499 
3500 /*
3501  * tavor_wrid_wqhdr_remove()
3502  *    Context: Can be called from interrupt or base context.
3503  *
3504  *    Note: this is only called to remove the most recently added WRID list
3505  *    container (i.e. by tavor_from_reset())
3506  */
3507 static void
3508 tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr,
3509     tavor_wrid_list_hdr_t *wridlist)
3510 {
3511 	tavor_wrid_list_hdr_t	*prev, *next;
3512 
3513 	ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock));
3514 
3515 	/* Unlink the WRID list "container" from the work queue list */
3516 	prev = wridlist->wl_prev;
3517 	next = wridlist->wl_next;
3518 	if (prev != NULL) {
3519 		prev->wl_next = next;
3520 	}
3521 	if (next != NULL) {
3522 		next->wl_prev = prev;
3523 	}
3524 
3525 	/*
3526 	 * Update any pointers in the work queue hdr that may point to this
3527 	 * WRID list container
3528 	 */
3529 	if (wqhdr->wq_wrid_post == wridlist) {
3530 		wqhdr->wq_wrid_post = prev;
3531 	}
3532 	if (wqhdr->wq_wrid_poll == wridlist) {
3533 		wqhdr->wq_wrid_poll = NULL;
3534 	}
3535 }
3536 
3537 
3538 /*
3539  * tavor_wrid_list_reap()
3540  *    Context: Can be called from interrupt or base context.
3541  *    Note: The "wqhdr_list_lock" must be held.
3542  */
3543 static tavor_workq_hdr_t *
3544 tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wridlist)
3545 {
3546 	tavor_workq_hdr_t	*wqhdr, *consume_wqhdr = NULL;
3547 	tavor_wrid_list_hdr_t	*prev, *next;
3548 	uint32_t		size;
3549 
3550 	TAVOR_TNF_ENTER(tavor_wrid_list_reap);
3551 
3552 	/* Get the back pointer to the work queue header (see below) */
3553 	wqhdr = wridlist->wl_wqhdr;
3554 	mutex_enter(&wqhdr->wq_wrid_wql->wql_lock);
3555 
3556 	/* Unlink the WRID list "container" from the work queue list */
3557 	prev = wridlist->wl_prev;
3558 	next = wridlist->wl_next;
3559 	if (prev != NULL) {
3560 		prev->wl_next = next;
3561 	}
3562 	if (next != NULL) {
3563 		next->wl_prev = prev;
3564 	}
3565 
3566 	/*
3567 	 * If both the "poll" and "post" pointers in the work queue header
3568 	 * point to the entry we are about to remove (i.e. this was its last
3569 	 * container), then the work queue header is reapable as well.
3570 	 */
3571 	if ((wqhdr->wq_wrid_poll == wridlist) &&
3572 	    (wqhdr->wq_wrid_post == wridlist)) {
3573 		consume_wqhdr = wqhdr;
3574 	}
3575 
3576 	/* Be sure to update the "poll" and "post" container pointers */
3577 	if (wqhdr->wq_wrid_poll == wridlist) {
3578 		wqhdr->wq_wrid_poll = next;
3579 	}
3580 	if (wqhdr->wq_wrid_post == wridlist) {
3581 		wqhdr->wq_wrid_post = NULL;
3582 	}
3583 
3584 	/* Calculate the size and free the container */
3585 	size = (wridlist->wl_size * sizeof (tavor_wrid_entry_t));
3586 	kmem_free(wridlist->wl_wre, size);
3587 	kmem_free(wridlist, sizeof (tavor_wrid_list_hdr_t));
3588 
3589 	mutex_exit(&wqhdr->wq_wrid_wql->wql_lock);
3590 
3591 	TAVOR_TNF_EXIT(tavor_wrid_list_reap);
3592 	return (consume_wqhdr);
3593 }
3594 
3595 
3596 /*
3597  * tavor_wrid_wqhdr_lock_both()
3598  *    Context: Can be called from interrupt or base context.
3599  */
3600 static void
3601 tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp)
3602 {
3603 	tavor_cqhdl_t	sq_cq, rq_cq;
3604 
3605 	sq_cq = qp->qp_sq_cqhdl;
3606 	rq_cq = qp->qp_rq_cqhdl;
3607 
3608 _NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))
3609 _NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))
3610 
3611 	/*
3612 	 * If both work queues (send and recv) share a completion queue, then
3613 	 * grab the common lock.  If they use different CQs (hence different
3614 	 * "cq_wrid_wqhdr_list" locks), then grab the send one first, then the
3615 	 * receive.  We do this consistently and correctly in
3616 	 * tavor_wrid_wqhdr_unlock_both() below to avoid introducing any kind
3617 	 * of deadlock condition.  Note:  We add the "__lock_lint" code here
3618 	 * to fake out warlock into thinking we've grabbed both locks (when,
3619 	 * in fact, we only needed the one).
3620 	 */
3621 	if (sq_cq == rq_cq) {
3622 		mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
3623 #ifdef	__lock_lint
3624 		mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
3625 #endif
3626 	} else {
3627 		mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
3628 		mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
3629 	}
3630 }
3631 
3632 /*
3633  * tavor_wrid_wqhdr_unlock_both()
3634  *    Context: Can be called from interrupt or base context.
3635  */
3636 static void
3637 tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp)
3638 {
3639 	tavor_cqhdl_t	sq_cq, rq_cq;
3640 
3641 	sq_cq = qp->qp_sq_cqhdl;
3642 	rq_cq = qp->qp_rq_cqhdl;
3643 
3644 _NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))
3645 _NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))
3646 
3647 	/*
3648 	 * See tavor_wrid_wqhdr_lock_both() above for more detail
3649 	 */
3650 	if (sq_cq == rq_cq) {
3651 #ifdef	__lock_lint
3652 		mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
3653 #endif
3654 		mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
3655 	} else {
3656 		mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
3657 		mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
3658 	}
3659 }
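
/*
 * Illustrative sketch (not part of the original driver): the discipline
 * implemented by the two functions above, where every thread acquires the
 * send CQ's lock before the receive CQ's lock, is the classic fixed-order
 * approach to taking a pair of locks without risking deadlock.  The same
 * idea in generic form, with hypothetical names:
 */
#if 0
static void
lock_pair(kmutex_t *a, kmutex_t *b)
{
	if (a == b) {
		mutex_enter(a);		/* same lock: take it only once */
	} else {
		/*
		 * All threads must agree on a single acquisition order
		 * (here, always 'a' before 'b') to rule out deadlock.
		 */
		mutex_enter(a);
		mutex_enter(b);
	}
}

static void
unlock_pair(kmutex_t *a, kmutex_t *b)
{
	if (a == b) {
		mutex_exit(a);
	} else {
		mutex_exit(b);
		mutex_exit(a);
	}
}
#endif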
3660 
3661 
3662 /*
3663  * tavor_cq_wqhdr_add()
3664  *    Context: Can be called from interrupt or base context.
3665  */
3666 static void
3667 tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
3668 {
3669 	tavor_workq_compare_t	cmp;
3670 	avl_index_t		where;
3671 
3672 	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3673 
3674 	cmp.cmp_qpn = wqhdr->wq_qpn;
3675 	cmp.cmp_type = wqhdr->wq_type;
3676 #ifdef __lock_lint
3677 	tavor_wrid_wqhdr_compare(NULL, NULL);
3678 #endif
3679 	(void) avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, &where);
3680 	/*
3681 	 * Insert the new work queue header into the AVL tree at the
3682 	 * position computed by the avl_find() call above.
3683 	 */
3684 	avl_insert(&cq->cq_wrid_wqhdr_avl_tree, wqhdr, where);
3685 }
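
/*
 * Illustrative sketch (not part of the original driver): the
 * avl_find()/avl_insert() pair used above is the standard AVL idiom for
 * "insert at a known position": avl_find() computes the insertion point
 * into 'where' (and would return non-NULL if a duplicate key already
 * existed), and avl_insert() links the new node there without a second
 * search.  The node and key names below are hypothetical.
 */
#if 0
	my_node_t	*dup;
	avl_index_t	where;

	dup = avl_find(&tree, &search_key, &where);
	ASSERT(dup == NULL);	/* caller must guarantee no duplicate */
	avl_insert(&tree, new_node, where);
#endif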
3686 
3687 
3688 /*
3689  * tavor_cq_wqhdr_remove()
3690  *    Context: Can be called from interrupt or base context.
3691  */
3692 static void
3693 tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
3694 {
3695 	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3696 
3697 #ifdef __lock_lint
3698 	tavor_wrid_wqhdr_compare(NULL, NULL);
3699 #endif
3700 	/* Remove "wqhdr" from the work queue header list on "cq" */
3701 	avl_remove(&cq->cq_wrid_wqhdr_avl_tree, wqhdr);
3702 
3703 	/*
3704 	 * Release reference to WQL; If this is the last reference, this call
3705 	 * also has the side effect of freeing up the 'wq_wrid_wql' memory.
3706 	 */
3707 	tavor_wql_refcnt_dec(wqhdr->wq_wrid_wql);
3708 
3709 	/* Free the memory associated with "wqhdr" */
3710 	kmem_free(wqhdr, sizeof (tavor_workq_hdr_t));
3711 }
3712 
3713 
3714 /*
3715  * tavor_wql_refcnt_inc()
3716  *    Context: Can be called from interrupt or base context.
3717  */
3718 void
3719 tavor_wql_refcnt_inc(tavor_wq_lock_t *wql)
3720 {
3721 	ASSERT(wql != NULL);
3722 
3723 	mutex_enter(&wql->wql_lock);
3724 	wql->wql_refcnt++;
3725 	mutex_exit(&wql->wql_lock);
3726 }
3727 
3728 /*
3729  * tavor_wql_refcnt_dec()
3730  *    Context: Can be called from interrupt or base context.
3731  */
3732 void
3733 tavor_wql_refcnt_dec(tavor_wq_lock_t *wql)
3734 {
3735 	int	refcnt;
3736 
3737 	ASSERT(wql != NULL);
3738 
3739 	mutex_enter(&wql->wql_lock);
3740 	wql->wql_refcnt--;
3741 	refcnt = wql->wql_refcnt;
3742 	mutex_exit(&wql->wql_lock);
3743 
3744 	/*
3745 	 * Free up WQL memory if we're the last one associated with this
3746 	 * structure.  The mutex must be destroyed before the memory that
3747 	 * contains it is freed.
3748 	 */
3749 	if (refcnt == 0) {
3750 		mutex_destroy(&wql->wql_lock);
3751 		kmem_free(wql, sizeof (tavor_wq_lock_t));
3752 	}
3753 }
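
/*
 * Illustrative sketch (not part of the original driver): taken together,
 * tavor_wrid_wql_create(), tavor_wql_refcnt_inc() and
 * tavor_wql_refcnt_dec() implement simple shared ownership of the WQL:
 * creation returns the structure holding one reference, each additional
 * sharer takes another, and the final decrement destroys the mutex and
 * frees the memory.  The lifecycle from a hypothetical caller:
 */
#if 0
	tavor_wq_lock_t	*wql;

	wql = tavor_wrid_wql_create(state);	/* refcnt == 1 */
	if (wql == NULL)
		return (IBT_INSUFF_RESOURCE);

	tavor_wql_refcnt_inc(wql);		/* second sharer: refcnt == 2 */

	/* ... both queues now serialize on wql->wql_lock ... */

	tavor_wql_refcnt_dec(wql);		/* refcnt == 1 */
	tavor_wql_refcnt_dec(wql);		/* refcnt == 0: wql is freed */
#endif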
3754