/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * tavor_cq.c
 *    Tavor Completion Queue Processing Routines
 *
 *    Implements all the routines necessary for allocating, freeing, resizing,
 *    and handling the completion type events that the Tavor hardware can
 *    generate.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>

#include <sys/ib/adapters/tavor/tavor.h>

static void tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd,
    uint32_t cqn, uint32_t cq_param);
#pragma inline(tavor_cq_doorbell)
static int tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
static int tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
static void tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
    uint_t flag);
static void tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
    uint32_t old_cons_indx, uint32_t num_newcqe);

/*
 * tavor_cq_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_cq_alloc(tavor_state_t *state, ibt_cq_hdl_t ibt_cqhdl,
    ibt_cq_attr_t *cq_attr, uint_t *actual_size, tavor_cqhdl_t *cqhdl,
    uint_t sleepflag)
{
	tavor_rsrc_t		*cqc, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	tavor_hw_cqc_t		cqc_entry;
	tavor_cqhdl_t		cq;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	op;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr;
	tavor_hw_cqe_t		*buf;
	uint64_t		addr, value;
	uint32_t		log_cq_size, lkey, uarpg;
	uint_t			dma_xfer_mode, cq_sync, cq_is_umap;
	int			status, i, flag;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_cq_alloc);

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq_attr))

	/*
	 * Determine whether CQ is being allocated for userland access or
	 * whether it is being allocated for kernel access.  If the CQ is
	 * being allocated for userland access, then lookup the UAR doorbell
	 * page number for the current process.  Note:  If this is not found
	 * (e.g. if the process has not previously open()'d the Tavor driver),
	 * then an error is returned.
	 */
	cq_is_umap = (cq_attr->cq_flags & IBT_CQ_USER_MAP) ? 1 : 0;
	if (cq_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
		    MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
		if (status != DDI_SUCCESS) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page");
			goto cqalloc_fail;
		}
		uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
	}

	/* Use the internal protection domain (PD) for setting up CQs */
	pd = state->ts_pdhdl_internal;

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Allocate a CQ context entry.  This will be filled in with all
	 * the necessary parameters to define the Completion Queue.  And then
	 * ownership will be passed to the hardware in the final step
	 * below.  If we fail here, we must undo the protection domain
	 * reference count.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_CQC, 1, sleepflag, &cqc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed CQ context");
		goto cqalloc_fail1;
	}

	/*
	 * Allocate the software structure for tracking the completion queue
	 * (i.e. the Tavor Completion Queue handle).  If we fail here, we must
	 * undo the protection domain reference count and the previous
	 * resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_CQHDL, 1, sleepflag, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed CQ handle");
		goto cqalloc_fail2;
	}
	cq = (tavor_cqhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))
	cq->cq_is_umap = cq_is_umap;

	/* Use the index as CQ number */
	cq->cq_cqnum = cqc->tr_indx;

	/*
	 * If this will be a user-mappable CQ, then allocate an entry for
	 * the "userland resources database".  This will later be added to
	 * the database (after all further CQ operations are successful).
	 * If we fail here, we must undo the reference counts and the
	 * previous resource allocation.
	 */
	if (cq->cq_is_umap) {
		umapdb = tavor_umap_db_alloc(state->ts_instance, cq->cq_cqnum,
		    MLNX_UMAP_CQMEM_RSRC, (uint64_t)(uintptr_t)rsrc);
		if (umapdb == NULL) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
			goto cqalloc_fail3;
		}
	}

	/*
	 * Calculate the appropriate size for the completion queue.
	 * Note:  All Tavor CQs must be a power-of-2 minus 1 in size.  Also
	 * they may not be any smaller than TAVOR_CQ_MIN_SIZE.  This step is
	 * to round the requested size up to the next highest power-of-2
	 */
	cq_attr->cq_size = max(cq_attr->cq_size, TAVOR_CQ_MIN_SIZE);
	log_cq_size = highbit(cq_attr->cq_size);
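
	/*
	 * Worked example of the rounding above (a sketch, not driver
	 * logic): with the DDI highbit() semantics, highbit(9) == 4, so
	 * a requested cq_size of 9 gives log_cq_size == 4, a 16-entry
	 * (1 << 4) CQ, and an "actual_size" of (1 << 4) - 1 == 15
	 * reported back to the caller below.  An exact power-of-2
	 * request (e.g. 8, where highbit(8) == 4) rounds up the same
	 * way, which keeps the usable size at a power-of-2 minus 1.
	 */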

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits)
	 */
	if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_CQ_EXCEEDED, "max CQ size");
		goto cqalloc_fail4;
	}

	/*
	 * Allocate the memory for Completion Queue.
	 *
	 * Note: Although we use the common queue allocation routine, we
	 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
	 * kernel system memory) for kernel CQs because it would be
	 * inefficient to have CQs located in DDR memory.  This is primarily
	 * because CQs are read from (by software) more than they are written
	 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
	 * user-mappable CQs for a similar reason.)
	 * It is also worth noting that, unlike Tavor QP work queues,
	 * completion queues do not have the same strict alignment
	 * requirements.  It is sufficient for the CQ memory to be both
	 * aligned to and bound to addresses which are a multiple of CQE size.
	 */
	cq->cq_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
	cq->cq_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
	cq->cq_cqinfo.qa_bind_align  = sizeof (tavor_hw_cqe_t);
	if (cq->cq_is_umap) {
		cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
	}
	status = tavor_queue_alloc(state, &cq->cq_cqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed completion queue");
		goto cqalloc_fail4;
	}
	buf = (tavor_hw_cqe_t *)cq->cq_cqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Initialize each of the Completion Queue Entries (CQE) by setting
	 * their ownership to hardware ("owner" bit set to HW).  This is in
	 * preparation for the final transfer of ownership (below) of the
	 * CQ context itself.
	 */
	for (i = 0; i < (1 << log_cq_size); i++) {
		TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
	}

	/*
	 * Register the memory for the CQ.  The memory for the CQ must
	 * be registered in the Tavor TPT tables.  This gives us the LKey
	 * to specify in the CQ context below.  Note: If this is a user-
	 * mappable CQ, then we will force DDI_DMA_CONSISTENT mapping.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
	mr_attr.mr_len	 = cq->cq_cqinfo.qa_size;
	mr_attr.mr_as	 = NULL;
	mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
	if (cq->cq_is_umap) {
		dma_xfer_mode = DDI_DMA_CONSISTENT;
	} else {
		dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
	}
	if (dma_xfer_mode == DDI_DMA_STREAMING) {
		mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
	}
	op.mro_bind_type   = state->ts_cfg_profile->cp_iommu_bypass;
	op.mro_bind_dmahdl = cq->cq_cqinfo.qa_dmahdl;
	op.mro_bind_override_addr = 0;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
		goto cqalloc_fail5;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
	addr = mr->mr_bindinfo.bi_addr;
	lkey = mr->mr_lkey;

	/* Determine if later ddi_dma_sync will be necessary */
	cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, cq->cq_cqinfo);

	/* Sync entire CQ for use by the hardware (if necessary). */
	if (cq_sync) {
		(void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
		    cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Fill in the CQC entry.  This is the final step before passing
	 * ownership of the CQC entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the CQC.  Note: If this CQ is going to be
	 * used for userland access, then we need to set the UAR page number
	 * appropriately (otherwise it's a "don't care")
	 */
	bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
	cq->cq_eqnum		= TAVOR_CQ_EQNUM_GET(cq->cq_cqnum);
	cq->cq_erreqnum		= TAVOR_CQ_ERREQNUM_GET(cq->cq_cqnum);
	cqc_entry.xlat		= TAVOR_VA2PA_XLAT_ENABLED;
	cqc_entry.state		= TAVOR_CQ_DISARMED;
	cqc_entry.start_addr_h	= (addr >> 32);
	cqc_entry.start_addr_l	= (addr & 0xFFFFFFFF);
	cqc_entry.log_cq_sz	= log_cq_size;
	if (cq->cq_is_umap) {
		cqc_entry.usr_page = uarpg;
	} else {
		cqc_entry.usr_page = 0;
	}
	cqc_entry.pd		= pd->pd_pdnum;
	cqc_entry.lkey		= lkey;
	cqc_entry.e_eqn		= cq->cq_erreqnum;
	cqc_entry.c_eqn		= cq->cq_eqnum;
	cqc_entry.cqn		= cq->cq_cqnum;

	/*
	 * Write the CQC entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware (using the Tavor SW2HW_CQ firmware
	 * command).  Note: In general, this operation shouldn't fail.  But
	 * if it does, we have to undo everything we've done above before
	 * returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_CQ, &cqc_entry,
	    sizeof (tavor_hw_cqc_t), cq->cq_cqnum, sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_CQ command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_cq_alloc_sw2hw_cq_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "tavor SW2HW_CQ command");
		goto cqalloc_fail6;
	}

	/*
	 * Fill in the rest of the Tavor Completion Queue handle.  Having
	 * successfully transferred ownership of the CQC, we can update the
	 * following fields for use in further operations on the CQ.
	 */
	cq->cq_cqcrsrcp	  = cqc;
	cq->cq_rsrcp	  = rsrc;
	cq->cq_consindx	  = 0;
	cq->cq_buf	  = buf;
	cq->cq_bufsz	  = (1 << log_cq_size);
	cq->cq_mrhdl	  = mr;
	cq->cq_sync	  = cq_sync;
	cq->cq_refcnt	  = 0;
	cq->cq_is_special = 0;
	cq->cq_uarpg	  = uarpg;
	cq->cq_umap_dhp	  = (devmap_cookie_t)NULL;
	avl_create(&cq->cq_wrid_wqhdr_avl_tree, tavor_wrid_wqhdr_compare,
	    sizeof (struct tavor_workq_hdr_s),
	    offsetof(struct tavor_workq_hdr_s, wq_avl_link));

	cq->cq_wrid_reap_head  = NULL;
	cq->cq_wrid_reap_tail  = NULL;
	cq->cq_hdlrarg	  = (void *)ibt_cqhdl;

	/*
	 * Put CQ handle in Tavor CQNum-to-CQHdl list.  Then fill in the
	 * "actual_size" and "cqhdl" and return success
	 */
	ASSERT(state->ts_cqhdl[cqc->tr_indx] == NULL);
	state->ts_cqhdl[cqc->tr_indx] = cq;

	/*
	 * If this is a user-mappable CQ, then we need to insert the previously
	 * allocated entry into the "userland resources database".  This will
	 * allow for later lookup during devmap() (i.e. mmap()) calls.
	 */
	if (cq->cq_is_umap) {
		tavor_umap_db_add(umapdb);
	}

	/*
	 * Fill in the return arguments (if necessary).  This includes the
	 * real completion queue size.
	 */
	if (actual_size != NULL) {
		*actual_size = (1 << log_cq_size) - 1;
	}
	*cqhdl = cq;

	TAVOR_TNF_EXIT(tavor_cq_alloc);
	return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
cqalloc_fail6:
	if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    sleepflag) != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister CQ memory");
	}
cqalloc_fail5:
	tavor_queue_free(state, &cq->cq_cqinfo);
cqalloc_fail4:
	if (cq_is_umap) {
		tavor_umap_db_free(umapdb);
	}
cqalloc_fail3:
	tavor_rsrc_free(state, &rsrc);
cqalloc_fail2:
	tavor_rsrc_free(state, &cqc);
cqalloc_fail1:
	tavor_pd_refcnt_dec(pd);
cqalloc_fail:
	TNF_PROBE_1(tavor_cq_alloc_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_cq_alloc);
	return (status);
}
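
/*
 * A note on the error-handling style above: the cqalloc_fail* labels
 * form a "goto unwind ladder", where each label releases exactly the
 * resources acquired before the failing step, in reverse order.  The
 * following is a minimal standalone sketch of the idiom (the alloc_a(),
 * alloc_b(), and free_a() helpers are hypothetical, not driver
 * functions); it is compiled out and included for illustration only.
 */
#if 0
static int
unwind_ladder_sketch(void)
{
	void	*a, *b;

	if ((a = alloc_a()) == NULL)
		goto fail;
	if ((b = alloc_b()) == NULL)
		goto fail_a;		/* undo only what succeeded */
	/* ... use "a" and "b" ... */
	return (0);

fail_a:
	free_a(a);
fail:
	return (-1);
}
#endif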


/*
 * tavor_cq_free()
 *    Context: Can be called only from user or kernel context.
 */
/* ARGSUSED */
int
tavor_cq_free(tavor_state_t *state, tavor_cqhdl_t *cqhdl, uint_t sleepflag)
{
	tavor_rsrc_t		*cqc, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	tavor_hw_cqc_t		cqc_entry;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr;
	tavor_cqhdl_t		cq;
	uint32_t		cqnum;
	uint64_t		value;
	uint_t			maxprot;
	int			status;

	TAVOR_TNF_ENTER(tavor_cq_free);

	/*
	 * Pull all the necessary information from the Tavor Completion Queue
	 * handle.  This is necessary here because the resource for the
	 * CQ handle is going to be freed up as part of this operation.
	 */
	cq	= *cqhdl;
	mutex_enter(&cq->cq_lock);
	cqc	= cq->cq_cqcrsrcp;
	rsrc	= cq->cq_rsrcp;
	pd	= state->ts_pdhdl_internal;
	mr	= cq->cq_mrhdl;
	cqnum	= cq->cq_cqnum;

	/*
	 * If there are work queues still associated with the CQ, then return
	 * an error.  Otherwise, we will be holding the CQ lock.
	 */
	if (cq->cq_refcnt != 0) {
		mutex_exit(&cq->cq_lock);
		TNF_PROBE_1(tavor_cq_free_refcnt_fail, TAVOR_TNF_ERROR, "",
		    tnf_int, refcnt, cq->cq_refcnt);
		TAVOR_TNF_EXIT(tavor_cq_free);
		return (IBT_CQ_BUSY);
	}

	/*
	 * If this was a user-mappable CQ, then we need to remove its entry
	 * from the "userland resources database".  If it is also currently
	 * mmap()'d out to a user process, then we need to call
	 * devmap_devmem_remap() to remap the CQ memory to an invalid mapping.
	 * We also need to invalidate the CQ tracking information for the
	 * user mapping.
	 */
	if (cq->cq_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, cqnum,
		    MLNX_UMAP_CQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
		    &umapdb);
		if (status != DDI_SUCCESS) {
			mutex_exit(&cq->cq_lock);
			TAVOR_WARNING(state, "failed to find in database");
			TAVOR_TNF_EXIT(tavor_cq_free);
			return (ibc_get_ci_failure(0));
		}
		tavor_umap_db_free(umapdb);
		if (cq->cq_umap_dhp != NULL) {
			maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
			status = devmap_devmem_remap(cq->cq_umap_dhp,
			    state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size,
			    maxprot, DEVMAP_MAPPING_INVALID, NULL);
			if (status != DDI_SUCCESS) {
				mutex_exit(&cq->cq_lock);
				TAVOR_WARNING(state, "failed in CQ memory "
				    "devmap_devmem_remap()");
				TAVOR_TNF_EXIT(tavor_cq_free);
				return (ibc_get_ci_failure(0));
			}
			cq->cq_umap_dhp = (devmap_cookie_t)NULL;
		}
	}

	/*
	 * Put NULL into the Tavor CQNum-to-CQHdl list.  This will allow any
	 * in-progress events to detect that the CQ corresponding to this
	 * number has been freed.
	 */
	state->ts_cqhdl[cqc->tr_indx] = NULL;

	/*
	 * While we hold the CQ lock, do a "forced reap" of the workQ WRID
	 * list.  This cleans up all the structures associated with the WRID
	 * processing for this CQ.  Once we complete, drop the lock and finish
	 * the deallocation of the CQ.
	 */
	tavor_wrid_cq_force_reap(cq);

	mutex_exit(&cq->cq_lock);
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))

	/*
	 * Reclaim CQC entry from hardware (using the Tavor HW2SW_CQ
	 * firmware command).  If the ownership transfer fails for any reason,
	 * then it is an indication that something (either in HW or SW) has
	 * gone seriously wrong.
	 */
	status = tavor_cmn_ownership_cmd_post(state, HW2SW_CQ, &cqc_entry,
	    sizeof (tavor_hw_cqc_t), cqnum, sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		TAVOR_WARNING(state, "failed to reclaim CQC ownership");
		cmn_err(CE_CONT, "Tavor: HW2SW_CQ command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_cq_free_hw2sw_cq_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		TAVOR_TNF_EXIT(tavor_cq_free);
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Deregister the memory for the Completion Queue.  If this fails
	 * for any reason, then it is an indication that something (either
	 * in HW or SW) has gone seriously wrong.  So we print a warning
	 * message and return.
	 */
	status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    sleepflag);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister CQ memory");
		TNF_PROBE_0(tavor_cq_free_dereg_mr_fail, TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_cq_free);
		return (ibc_get_ci_failure(0));
	}

	/* Free the memory for the CQ */
	tavor_queue_free(state, &cq->cq_cqinfo);

	/* Free the Tavor Completion Queue handle */
	tavor_rsrc_free(state, &rsrc);

	/* Free up the CQC entry resource */
	tavor_rsrc_free(state, &cqc);

	/* Decrement the reference count on the protection domain (PD) */
	tavor_pd_refcnt_dec(pd);

	/* Set the cqhdl pointer to NULL and return success */
	*cqhdl = NULL;

	TAVOR_TNF_EXIT(tavor_cq_free);
	return (DDI_SUCCESS);
}


/*
 * tavor_cq_resize()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_cq_resize(tavor_state_t *state, tavor_cqhdl_t cq, uint_t req_size,
    uint_t *actual_size, uint_t sleepflag)
{
	tavor_hw_cqc_t		cqc_entry;
	tavor_qalloc_info_t	new_cqinfo, old_cqinfo;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	op;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr, mr_old;
	tavor_hw_cqe_t		*buf;
	uint32_t		new_prod_indx, old_cons_indx;
	uint_t			dma_xfer_mode, cq_sync, log_cq_size, maxprot;
	int			status, i, flag;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_cq_resize);

	/* Use the internal protection domain (PD) for CQs */
	pd = state->ts_pdhdl_internal;

	/*
	 * Calculate the appropriate size for the new resized completion queue.
	 * Note:  All Tavor CQs must be a power-of-2 minus 1 in size.  Also
	 * they may not be any smaller than TAVOR_CQ_MIN_SIZE.  This step is
	 * to round the requested size up to the next highest power-of-2
	 */
	req_size = max(req_size, TAVOR_CQ_MIN_SIZE);
	log_cq_size = highbit(req_size);

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits)
	 */
	if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_CQ_EXCEEDED, "max CQ size");
		goto cqresize_fail;
	}

	/*
	 * Allocate the memory for newly resized Completion Queue.
	 *
	 * Note: Although we use the common queue allocation routine, we
	 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
	 * kernel system memory) for kernel CQs because it would be
	 * inefficient to have CQs located in DDR memory.  This is the same
	 * as we do when we first allocate completion queues primarily
	 * because CQs are read from (by software) more than they are written
	 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
	 * user-mappable CQs for a similar reason.)
	 * It is also worth noting that, unlike Tavor QP work queues,
	 * completion queues do not have the same strict alignment
	 * requirements.  It is sufficient for the CQ memory to be both
	 * aligned to and bound to addresses which are a multiple of CQE size.
	 */
	new_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
	new_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
	new_cqinfo.qa_bind_align  = sizeof (tavor_hw_cqe_t);
	if (cq->cq_is_umap) {
		new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
	}
	status = tavor_queue_alloc(state, &new_cqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed completion queue");
		goto cqresize_fail;
	}
	buf = (tavor_hw_cqe_t *)new_cqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Initialize each of the Completion Queue Entries (CQE) by setting
	 * their ownership to hardware ("owner" bit set to HW).  This is in
	 * preparation for the final resize operation (below).
	 */
	for (i = 0; i < (1 << log_cq_size); i++) {
		TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
	}

	/*
	 * Register the memory for the CQ.  The memory for the CQ must
	 * be registered in the Tavor TPT tables.  This gives us the LKey
	 * to specify in the CQ context below.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
	mr_attr.mr_len	 = new_cqinfo.qa_size;
	mr_attr.mr_as	 = NULL;
	mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
	if (cq->cq_is_umap) {
		dma_xfer_mode = DDI_DMA_CONSISTENT;
	} else {
		dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
	}
	if (dma_xfer_mode == DDI_DMA_STREAMING) {
		mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
	}
	op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
	op.mro_bind_dmahdl = new_cqinfo.qa_dmahdl;
	op.mro_bind_override_addr = 0;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
	if (status != DDI_SUCCESS) {
		tavor_queue_free(state, &new_cqinfo);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
		goto cqresize_fail;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))

	/* Determine if later ddi_dma_sync will be necessary */
	cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, new_cqinfo);

	/* Sync entire "new" CQ for use by hardware (if necessary) */
	if (cq_sync) {
		(void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
		    new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Now we grab the CQ lock.  Since we will be updating the actual
	 * CQ location and the producer/consumer indexes, we should hold
	 * the lock.
	 *
	 * We do a TAVOR_NOSLEEP here (and below), though, because we are
	 * holding the "cq_lock" and if we got raised to interrupt level
	 * by priority inversion, we would not want to block in this routine
	 * waiting for success.
	 */
	mutex_enter(&cq->cq_lock);

	/*
	 * Determine the current CQ "consumer index".
	 *
	 * Note:  This will depend on whether the CQ had previously been
	 * mapped for user access or whether it is a kernel CQ.  If this
	 * is a kernel CQ, then all PollCQ() operations have come through
	 * the IBTF and, hence, the driver's CQ state structure will
	 * contain the current consumer index.  If, however, the user has
	 * accessed this CQ by bypassing the driver (OS-bypass), then we
	 * need to query the firmware to determine the current CQ consumer
	 * index.  This also assumes that the user process will not continue
	 * to consume entries while at the same time doing the ResizeCQ()
	 * operation.  If the user process does not guarantee this, then it
	 * may see duplicate or missed completions.  But under no
	 * circumstances should this panic the system.
	 */
	if (cq->cq_is_umap) {
		status = tavor_cmn_query_cmd_post(state, QUERY_CQ,
		    cq->cq_cqnum, &cqc_entry, sizeof (tavor_hw_cqc_t),
		    TAVOR_NOSLEEP);
		if (status != TAVOR_CMD_SUCCESS) {
			/* Query CQ has failed, drop CQ lock and cleanup */
			mutex_exit(&cq->cq_lock);
			if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
			    sleepflag) != DDI_SUCCESS) {
				TAVOR_WARNING(state, "failed to deregister "
				    "CQ memory");
			}
			tavor_queue_free(state, &new_cqinfo);
			TAVOR_WARNING(state, "failed to find in database");

			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
			    "failed umap lookup");
			goto cqresize_fail;
		}
		old_cons_indx = cqc_entry.cons_indx;
	} else {
		old_cons_indx = cq->cq_consindx;
	}

	/*
	 * Fill in the CQC entry.  For the resize operation this is the
	 * final step before attempting the resize operation on the CQC entry.
	 * We use all of the information collected/calculated above to fill
	 * in the requisite portions of the CQC.
	 */
	bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
	cqc_entry.start_addr_h	= (mr->mr_bindinfo.bi_addr >> 32);
	cqc_entry.start_addr_l	= (mr->mr_bindinfo.bi_addr & 0xFFFFFFFF);
	cqc_entry.log_cq_sz	= log_cq_size;
	cqc_entry.lkey		= mr->mr_lkey;

	/*
	 * Write the CQC entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware (using the Tavor RESIZE_CQ firmware
	 * command).  Note: In general, this operation shouldn't fail.  But
	 * if it does, we have to undo everything we've done above before
	 * returning error.  Also note that the status returned may indicate
	 * the code to return to the IBTF.
	 */
	status = tavor_resize_cq_cmd_post(state, &cqc_entry, cq->cq_cqnum,
	    &new_prod_indx, TAVOR_CMD_NOSLEEP_SPIN);
	if (status != TAVOR_CMD_SUCCESS) {
		/* Resize attempt has failed, drop CQ lock and cleanup */
		mutex_exit(&cq->cq_lock);
		if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
		    sleepflag) != DDI_SUCCESS) {
			TAVOR_WARNING(state, "failed to deregister CQ memory");
		}
		tavor_queue_free(state, &new_cqinfo);
		if (status == TAVOR_CMD_BAD_SIZE) {
			TAVOR_TNF_EXIT(tavor_cq_resize);
			return (IBT_CQ_SZ_INSUFFICIENT);
		} else {
			cmn_err(CE_CONT, "Tavor: RESIZE_CQ command failed: "
			    "%08x\n", status);
			TNF_PROBE_1(tavor_cq_resize_cq_cmd_fail,
			    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
			TAVOR_TNF_EXIT(tavor_cq_resize);
			return (ibc_get_ci_failure(0));
		}
	}

	/*
	 * The CQ resize attempt was successful.  Before dropping the CQ lock,
	 * copy all of the CQEs from the "old" CQ into the "new" CQ.  Note:
	 * the Tavor firmware guarantees us that sufficient space is set aside
	 * in the "new" CQ to handle any un-polled CQEs from the "old" CQ.
	 * The two parameters to this helper function ("old_cons_indx" and
	 * "new_prod_indx") essentially indicate the starting index and number
	 * of any CQEs that might remain in the "old" CQ memory.
	 */
	tavor_cq_resize_helper(cq, buf, old_cons_indx, new_prod_indx);

	/* Sync entire "new" CQ for use by hardware (if necessary) */
	if (cq_sync) {
		(void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
		    new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Update the Tavor Completion Queue handle with all the new
	 * information.  At the same time, save away all the necessary
	 * information for freeing up the old resources
	 */
	mr_old		 = cq->cq_mrhdl;
	old_cqinfo	 = cq->cq_cqinfo;
	cq->cq_cqinfo	 = new_cqinfo;
	cq->cq_consindx	 = 0;
	cq->cq_buf	 = buf;
	cq->cq_bufsz	 = (1 << log_cq_size);
	cq->cq_mrhdl	 = mr;
	cq->cq_sync	 = cq_sync;

	/*
	 * If "old" CQ was a user-mappable CQ that is currently mmap()'d out
	 * to a user process, then we need to call devmap_devmem_remap() to
	 * invalidate the mapping to the CQ memory.  We also need to
	 * invalidate the CQ tracking information for the user mapping.
	 */
	if ((cq->cq_is_umap) && (cq->cq_umap_dhp != NULL)) {
		maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
		status = devmap_devmem_remap(cq->cq_umap_dhp,
		    state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size, maxprot,
		    DEVMAP_MAPPING_INVALID, NULL);
		if (status != DDI_SUCCESS) {
			mutex_exit(&cq->cq_lock);
			TAVOR_WARNING(state, "failed in CQ memory "
			    "devmap_devmem_remap()");
			TAVOR_TNF_EXIT(tavor_cq_free);
			return (ibc_get_ci_failure(0));
		}
		cq->cq_umap_dhp = (devmap_cookie_t)NULL;
	}

	/*
	 * Drop the CQ lock now.  The only thing left to do is to free up
	 * the old resources.
	 */
	mutex_exit(&cq->cq_lock);

	/*
	 * Deregister the memory for the old Completion Queue.  Note: We
	 * really can't return error here because we have no good way to
	 * cleanup.  Plus, the deregistration really shouldn't ever fail.
	 * So, if it does, it is an indication that something has gone
	 * seriously wrong.  So we print a warning message and return error
	 * (knowing, of course, that the "old" CQ memory will be leaked)
	 */
	status = tavor_mr_deregister(state, &mr_old, TAVOR_MR_DEREG_ALL,
	    sleepflag);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister old CQ memory");
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
		    "failed deregister mr (old)");
		goto cqresize_fail;
	}

	/* Free the memory for the old CQ */
	tavor_queue_free(state, &old_cqinfo);

	/*
	 * Fill in the return arguments (if necessary).  This includes the
	 * real new completion queue size.
	 */
	if (actual_size != NULL) {
		*actual_size = (1 << log_cq_size) - 1;
	}

	TAVOR_TNF_EXIT(tavor_cq_resize);
	return (DDI_SUCCESS);

cqresize_fail:
	TNF_PROBE_1(tavor_cq_resize_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_cq_resize);
	return (status);
}


/*
 * tavor_cq_notify()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_cq_notify(tavor_state_t *state, tavor_cqhdl_t cq,
    ibt_cq_notify_flags_t flags)
{
	uint_t		cqnum;

	TAVOR_TNF_ENTER(tavor_cq_notify);

	/*
	 * Determine if we are trying to get the next completion or the next
	 * "solicited" completion.  Then hit the appropriate doorbell.
	 *
	 * NOTE: Please see the comment in tavor_event.c:tavor_eq_poll
	 * regarding why we do not have to do an extra PIO read here, and we
	 * will not lose an event after writing this doorbell.
	 */
	cqnum = cq->cq_cqnum;
	if (flags == IBT_NEXT_COMPLETION) {
		tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ, cqnum,
		    TAVOR_CQDB_DEFAULT_PARAM);

	} else if (flags == IBT_NEXT_SOLICITED) {
		tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ_SOLICIT,
		    cqnum, TAVOR_CQDB_DEFAULT_PARAM);

	} else {
		TNF_PROBE_1(tavor_cq_notify_invflags_fail, TAVOR_TNF_ERROR, "",
		    tnf_int, flags, flags);
		TAVOR_TNF_EXIT(tavor_cq_notify);
		return (IBT_CQ_NOTIFY_TYPE_INVALID);
	}

	TAVOR_TNF_EXIT(tavor_cq_notify);
	return (DDI_SUCCESS);
}


/*
 * tavor_cq_poll()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_cq_poll(tavor_state_t *state, tavor_cqhdl_t cq, ibt_wc_t *wc_p,
    uint_t num_wc, uint_t *num_polled)
{
	tavor_hw_cqe_t	*cqe;
	uint32_t	cons_indx, wrap_around_mask;
	uint32_t	polled_cnt, num_to_increment;
	int		status;

	TAVOR_TNF_ENTER(tavor_cq_poll);

	/*
	 * Check for user-mappable CQ memory.  Note:  We do not allow kernel
	 * clients to poll CQ memory that is accessible directly by the user.
	 * If the CQ memory is user accessible, then return an error.
	 */
	if (cq->cq_is_umap) {
		TNF_PROBE_0(tavor_cq_poll_inv_usrmapped_type,
		    TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_cq_poll);
		return (IBT_CQ_HDL_INVALID);
	}

	mutex_enter(&cq->cq_lock);

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/*
	 * Calculate the wrap around mask.  Note: This operation only works
	 * because all Tavor completion queues have power-of-2 sizes
	 */
	wrap_around_mask = (cq->cq_bufsz - 1);
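
	/*
	 * Worked example (sketch): because cq_bufsz is a power of 2, the
	 * mask turns the index increment below into a cheap modulo.  With
	 * cq_bufsz == 64 the mask is 0x3F, so advancing past the last
	 * entry gives (63 + 1) & 0x3F == 0, wrapping to the start.
	 */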

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_buf[cons_indx];

	/* Sync the current CQE to read */
	tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);

	/*
	 * Keep pulling entries from the CQ until we find an entry owned by
	 * the hardware.  As long as the CQEs are owned by SW, process
	 * each entry by calling tavor_cq_cqe_consume() and updating the CQ
	 * consumer index.  Note:  We only update the consumer index if
	 * tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.  Otherwise,
	 * it indicates that we are going to "recycle" the CQE (probably
	 * because it is an error CQE and corresponds to more than one
	 * completion).
	 */
	polled_cnt = 0;
	while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
		status = tavor_cq_cqe_consume(state, cq, cqe,
		    &wc_p[polled_cnt++]);
		if (status == TAVOR_CQ_SYNC_AND_DB) {
			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cq, cqe);

			/* Sync the current CQE for device */
			tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORDEV);

			/* Increment the consumer index */
			cons_indx = (cons_indx + 1) & wrap_around_mask;

			/* Update the pointer to the next CQ entry */
			cqe = &cq->cq_buf[cons_indx];

			/* Sync the next CQE to read */
			tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
		}

		/*
		 * If we have run out of space to store work completions,
		 * then stop and return the ones we have pulled off the CQ.
		 */
		if (polled_cnt >= num_wc) {
			break;
		}
	}

	/*
	 * Now we only ring the doorbell (to update the consumer index) if
	 * we've actually consumed a CQ entry.  If we have, for example,
	 * pulled from a CQE that we are still in the process of "recycling"
	 * for error purposes, then we would not update the consumer index.
	 */
	if ((polled_cnt != 0) && (cq->cq_consindx != cons_indx)) {
		/*
		 * Post doorbell to update the consumer index.  Doorbell
		 * value indicates number of entries consumed (minus 1)
		 */
		if (cons_indx > cq->cq_consindx) {
			num_to_increment = (cons_indx - cq->cq_consindx) - 1;
		} else {
			num_to_increment = ((cons_indx + cq->cq_bufsz) -
			    cq->cq_consindx) - 1;
		}
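
		/*
		 * Worked example (sketch): with cq_bufsz == 64, moving
		 * from cq_consindx == 60 to cons_indx == 3 consumes
		 * entries 60..63 and 0..2 (seven in all), so the wrapped
		 * branch above yields ((3 + 64) - 60) - 1 == 6, i.e. the
		 * number of entries consumed minus one.
		 */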
		cq->cq_consindx = cons_indx;
		tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
		    cq->cq_cqnum, num_to_increment);

	} else if (polled_cnt == 0) {
		/*
		 * If the CQ is empty, we can try to free up some of the WRID
		 * list containers.  See tavor_wr.c for more details on this
		 * operation.
		 */
		tavor_wrid_cq_reap(cq);
	}

	mutex_exit(&cq->cq_lock);

	/* Set "num_polled" (if necessary) */
	if (num_polled != NULL) {
		*num_polled = polled_cnt;
	}

	/* Set CQ_EMPTY condition if needed, otherwise return success */
	if (polled_cnt == 0) {
		status = IBT_CQ_EMPTY;
	} else {
		status = DDI_SUCCESS;
	}

	/*
	 * Check if the system is currently panicking.  If it is, then call
	 * the Tavor interrupt service routine.  This step is necessary here
	 * because we might be in a polled I/O mode and without the call to
	 * tavor_isr() - and its subsequent calls to poll and rearm each
	 * event queue - we might overflow our EQs and render the system
	 * unable to sync/dump.
	 */
	if (ddi_in_panic() != 0) {
		(void) tavor_isr((caddr_t)state, (caddr_t)NULL);
	}

	TAVOR_TNF_EXIT(tavor_cq_poll);
	return (status);
}
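
/*
 * Usage sketch (hypothetical kernel client, not part of the driver):
 * the usual way to avoid losing a completion between emptying a CQ and
 * re-arming it is to poll until IBT_CQ_EMPTY, re-arm via
 * tavor_cq_notify(), and then poll once more.  consume_wc() below is a
 * made-up stand-in for the client's completion handling.  The block is
 * compiled out and included for illustration only.
 */
#if 0
static void
cq_drain_and_rearm_sketch(tavor_state_t *state, tavor_cqhdl_t cq)
{
	ibt_wc_t	wc;

	while (tavor_cq_poll(state, cq, &wc, 1, NULL) == DDI_SUCCESS)
		consume_wc(&wc);
	(void) tavor_cq_notify(state, cq, IBT_NEXT_COMPLETION);
	while (tavor_cq_poll(state, cq, &wc, 1, NULL) == DDI_SUCCESS)
		consume_wc(&wc);
}
#endif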


/*
 * tavor_cq_handler()
 *    Context: Only called from interrupt context
 */
int
tavor_cq_handler(tavor_state_t *state, tavor_eqhdl_t eq,
    tavor_hw_eqe_t *eqe)
{
	tavor_cqhdl_t		cq;
	uint_t			cqnum;
	uint_t			eqe_evttype;

	TAVOR_TNF_ENTER(tavor_cq_handler);

	eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);

	ASSERT(eqe_evttype == TAVOR_EVT_COMPLETION ||
	    eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);

	if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
		TNF_PROBE_0(tavor_cq_handler_eq_overflow_condition,
		    TAVOR_TNF_ERROR, "");
		tavor_eq_overflow_handler(state, eq, eqe);

		TAVOR_TNF_EXIT(tavor_cq_handler);
		return (DDI_FAILURE);
	}


	/* Get the CQ handle from CQ number in event descriptor */
	cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
	cq = tavor_cqhdl_from_cqnum(state, cqnum);

	/*
	 * Post the EQ doorbell to move the CQ to the "disarmed" state.
	 * This operation is to enable subsequent CQ doorbells (e.g. those
	 * that can be rung by tavor_cq_notify() above) to rearm the CQ.
	 */
	tavor_eq_doorbell(state, TAVOR_EQDB_DISARM_CQ, eq->eq_eqnum, cqnum);

	/*
	 * If the CQ handle is NULL, this is probably an indication
	 * that the CQ has been freed already.  In which case, we
	 * should not deliver this event.
	 *
	 * We also check that the CQ number in the handle is the
	 * same as the CQ number in the event queue entry.  This
	 * extra check allows us to handle the case where a CQ was
	 * freed and then allocated again in the time it took to
	 * handle the event queue processing.  By constantly incrementing
	 * the non-constrained portion of the CQ number every time
	 * a new CQ is allocated, we mitigate (somewhat) the chance
	 * that a stale event could be passed to the client's CQ
	 * handler.
	 *
	 * Lastly, we check if "ts_ibtfpriv" is NULL.  If it is then it
	 * means that we have either received this event before we
	 * finished attaching to the IBTF or we've received it while we
	 * are in the process of detaching.
	 */
	if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
	    (state->ts_ibtfpriv != NULL)) {
		TAVOR_DO_IBTF_CQ_CALLB(state, cq);
	} else {
		TNF_PROBE_2(tavor_cq_handler_dropped_event,
		    TAVOR_TNF_ERROR, "", tnf_uint, ev_cqnum, cqnum,
		    tnf_uint, hdl_cqnum, cqnum);
	}

	TAVOR_TNF_EXIT(tavor_cq_handler);
	return (DDI_SUCCESS);
}


/*
 * tavor_cq_err_handler()
 *    Context: Only called from interrupt context
 */
int
tavor_cq_err_handler(tavor_state_t *state, tavor_eqhdl_t eq,
    tavor_hw_eqe_t *eqe)
{
	tavor_cqhdl_t		cq;
	uint_t			cqnum;
	ibc_async_event_t	event;
	ibt_async_code_t	type;
	uint_t			eqe_evttype;

	TAVOR_TNF_ENTER(tavor_cq_err_handler);

	eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);

	ASSERT(eqe_evttype == TAVOR_EVT_CQ_ERRORS ||
	    eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);

	if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
		TNF_PROBE_0(tavor_cq_err_handler_eq_overflow_condition,
		    TAVOR_TNF_ERROR, "");
		tavor_eq_overflow_handler(state, eq, eqe);

		TAVOR_TNF_EXIT(tavor_cq_err_handler);
		return (DDI_FAILURE);
	}

	/* cmn_err(CE_CONT, "CQ Error handler\n"); */

	/* Get the CQ handle from CQ number in event descriptor */
	cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
	cq = tavor_cqhdl_from_cqnum(state, cqnum);

	/*
	 * If the CQ handle is NULL, this is probably an indication
	 * that the CQ has been freed already.  In which case, we
	 * should not deliver this event.
	 *
	 * We also check that the CQ number in the handle is the
	 * same as the CQ number in the event queue entry.  This
	 * extra check allows us to handle the case where a CQ was
	 * freed and then allocated again in the time it took to
	 * handle the event queue processing.  By constantly incrementing
	 * the non-constrained portion of the CQ number every time
	 * a new CQ is allocated, we mitigate (somewhat) the chance
	 * that a stale event could be passed to the client's CQ
	 * handler.
	 *
	 * And then we check if "ts_ibtfpriv" is NULL.  If it is then it
	 * means that we have either received this event before we
	 * finished attaching to the IBTF or we've received it while we
	 * are in the process of detaching.
	 */
	if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
	    (state->ts_ibtfpriv != NULL)) {
		event.ev_cq_hdl = (ibt_cq_hdl_t)cq->cq_hdlrarg;
		type		= IBT_ERROR_CQ;

		TAVOR_DO_IBTF_ASYNC_CALLB(state, type, &event);
	} else {
		TNF_PROBE_2(tavor_cq_err_handler_dropped_event,
		    TAVOR_TNF_ERROR, "", tnf_uint, ev_cqnum, cqnum,
		    tnf_uint, hdl_cqnum, cqnum);
	}

	TAVOR_TNF_EXIT(tavor_cq_err_handler);
	return (DDI_SUCCESS);
}


/*
 * tavor_cq_refcnt_inc()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_cq_refcnt_inc(tavor_cqhdl_t cq, uint_t is_special)
{
	/*
	 * Increment the completion queue's reference count.  Note: In order
	 * to ensure compliance with IBA C11-15, we must ensure that a given
	 * CQ is not used for both special (SMI/GSI) QP and non-special QP.
	 * This is accomplished here by keeping track of how the referenced
	 * CQ is being used.
	 */
	mutex_enter(&cq->cq_lock);
	TNF_PROBE_1_DEBUG(tavor_cq_refcnt_inc, TAVOR_TNF_TRACE, "",
	    tnf_uint, refcnt, cq->cq_refcnt);
	if (cq->cq_refcnt == 0) {
		cq->cq_is_special = is_special;
	} else {
		if (cq->cq_is_special != is_special) {
			mutex_exit(&cq->cq_lock);
			return (DDI_FAILURE);
		}
	}
	cq->cq_refcnt++;
	mutex_exit(&cq->cq_lock);
	return (DDI_SUCCESS);
}
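
/*
 * Sketch of the IBA C11-15 rule enforced above (hypothetical caller,
 * not driver code; compiled out): the first reference on an idle CQ
 * fixes its mode, and a later reference with the opposite mode fails.
 */
#if 0
static void
cq_refcnt_mode_sketch(tavor_cqhdl_t cq)
{
	/* first user of an idle CQ binds it to non-special QP use */
	VERIFY(tavor_cq_refcnt_inc(cq, 0) == DDI_SUCCESS);
	/* a special (SMI/GSI) QP may no longer share this CQ */
	VERIFY(tavor_cq_refcnt_inc(cq, 1) == DDI_FAILURE);
}
#endif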


/*
 * tavor_cq_refcnt_dec()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_cq_refcnt_dec(tavor_cqhdl_t cq)
{
	/* Decrement the completion queue's reference count */
	mutex_enter(&cq->cq_lock);
	cq->cq_refcnt--;
	TNF_PROBE_1_DEBUG(tavor_cq_refcnt_dec, TAVOR_TNF_TRACE, "",
	    tnf_uint, refcnt, cq->cq_refcnt);
	mutex_exit(&cq->cq_lock);
}


/*
 * tavor_cq_doorbell()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd, uint32_t cqn,
    uint32_t cq_param)
{
	uint64_t	doorbell = 0;

	/* Build the doorbell from the parameters */
	doorbell = ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) |
	    ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param;

	TNF_PROBE_1_DEBUG(tavor_cq_doorbell, TAVOR_TNF_TRACE, "",
	    tnf_ulong, doorbell, doorbell);

	/* Write the doorbell to UAR */
	TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->cq,
	    doorbell);
}
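
/*
 * Bit-layout sketch: the 64-bit doorbell written above is just three
 * fields OR'd together.  Standalone restatement with made-up shift
 * values (the real TAVOR_CQDB_* shifts are defined in the Tavor
 * headers); compiled out and included for illustration only.
 */
#if 0
static uint64_t
cq_doorbell_pack_sketch(uint32_t cmd, uint32_t cqn, uint32_t param)
{
	/* e.g. command in the top byte, CQ number above the parameter */
	return (((uint64_t)cmd << 56) | ((uint64_t)cqn << 32) |
	    (uint64_t)param);
}
#endif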


/*
 * tavor_cqhdl_from_cqnum()
 *    Context: Can be called from interrupt or base context.
 *
 *    This routine is important because changing the unconstrained
 *    portion of the CQ number is critical to the detection of a
 *    potential race condition in the CQ handler code (i.e. the case
 *    where a CQ is freed and alloc'd again before an event for the
 *    "old" CQ can be handled).
 *
 *    While this is not a perfect solution (not sure that one exists)
 *    it does help to mitigate the chance that this race condition will
 *    cause us to deliver a "stale" event to the new CQ owner.  Note:
 *    this solution does not scale well because the number of constrained
 *    bits increases (and, hence, the number of unconstrained bits
 *    decreases) as the number of supported CQs grows.  For small and
 *    intermediate values, it should hopefully provide sufficient
 *    protection.
 */
tavor_cqhdl_t
tavor_cqhdl_from_cqnum(tavor_state_t *state, uint_t cqnum)
{
	uint_t	cqindx, cqmask;

	/* Calculate the CQ table index from the cqnum */
	cqmask = (1 << state->ts_cfg_profile->cp_log_num_cq) - 1;
	cqindx = cqnum & cqmask;
	return (state->ts_cqhdl[cqindx]);
}
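
/*
 * Worked example (a sketch, not driver logic): with cp_log_num_cq ==
 * 16 the mask computed above is 0xFFFF.  A freed CQ numbered 0x00042
 * and its later replacement numbered 0x10042 then share table index
 * 0x0042, but their full CQ numbers differ, so the
 * (cq->cq_cqnum == cqnum) test in the handlers above rejects the
 * stale event that was meant for the old CQ.
 */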
1293 
1294 
1295 /*
1296  * tavor_cq_cqe_consume()
1297  *    Context: Can be called from interrupt or base context.
1298  */
1299 static int
1300 tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
1301     tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
1302 {
1303 	uint_t		flags, type, opcode, qpnum, qp1_indx;
1304 	int		status;
1305 
1306 	TAVOR_TNF_ENTER(tavor_cq_cqe_consume);
1307 
1308 	/*
1309 	 * Determine if this is an "error" CQE by examining "opcode".  If it
1310 	 * is an error CQE, then call tavor_cq_errcqe_consume() and return
1311 	 * whatever status it returns.  Otherwise, this is a successful
1312 	 * completion.
1313 	 */
1314 	opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
1315 	if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
1316 	    (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
1317 		status = tavor_cq_errcqe_consume(state, cq, cqe, wc);
1318 		TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
1319 		return (status);
1320 	}
1321 
1322 	/*
1323 	 * Fetch the Work Request ID using the information in the CQE.
1324 	 * See tavor_wr.c for more details.
1325 	 */
1326 	wc->wc_id = tavor_wrid_get_entry(cq, cqe, NULL);
1327 
1328 	/*
1329 	 * Parse the CQE opcode to determine completion type.  This will set
1330 	 * not only the type of the completion, but also any flags that might
1331 	 * be associated with it (e.g. whether immediate data is present).
1332 	 */
1333 	flags = IBT_WC_NO_FLAGS;
1334 	if (TAVOR_CQE_SENDRECV_GET(cq, cqe) != TAVOR_COMPLETION_RECV) {
1335 
1336 		/* Send CQE */
1337 		switch (opcode) {
1338 		case TAVOR_CQE_SND_RDMAWR_IMM:
1339 			flags |= IBT_WC_IMMED_DATA_PRESENT;
1340 			/* FALLTHROUGH */
1341 		case TAVOR_CQE_SND_RDMAWR:
1342 			type = IBT_WRC_RDMAW;
1343 			break;
1344 
1345 		case TAVOR_CQE_SND_SEND_IMM:
1346 			flags |= IBT_WC_IMMED_DATA_PRESENT;
1347 			/* FALLTHROUGH */
1348 		case TAVOR_CQE_SND_SEND:
1349 			type = IBT_WRC_SEND;
1350 			break;
1351 
1352 		case TAVOR_CQE_SND_RDMARD:
1353 			type = IBT_WRC_RDMAR;
1354 			break;
1355 
1356 		case TAVOR_CQE_SND_ATOMIC_CS:
1357 			type = IBT_WRC_CSWAP;
1358 			break;
1359 
1360 		case TAVOR_CQE_SND_ATOMIC_FA:
1361 			type = IBT_WRC_FADD;
1362 			break;
1363 
1364 		case TAVOR_CQE_SND_BIND_MW:
1365 			type = IBT_WRC_BIND;
1366 			break;
1367 
1368 		default:
1369 			TAVOR_WARNING(state, "unknown send CQE type");
1370 			wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
1371 			TNF_PROBE_1(tavor_cq_cqe_consume_unknown_send_type,
1372 			    TAVOR_TNF_ERROR, "", tnf_uint, opcode, opcode);
1373 			TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
1374 			return (TAVOR_CQ_SYNC_AND_DB);
1375 		}
1376 	} else {
1377 
1378 		/* Receive CQE */
1379 		switch (opcode & 0x1F) {
1380 		case TAVOR_CQE_RCV_RECV_IMM:
1381 			/* FALLTHROUGH */
1382 		case TAVOR_CQE_RCV_RECV_IMM2:
1383 			/*
1384 			 * Note:  According to the Tavor PRM, all QP1 recv
1385 			 * completions look like the result of a Send with
1386 			 * Immediate.  They are not, however, (MADs are Send
1387 			 * Only) so we need to check the QP number and set
1388 			 * the flag only if it is non-QP1.
1389 			 */
1390 			qpnum	 = TAVOR_CQE_QPNUM_GET(cq, cqe);
1391 			qp1_indx = state->ts_spec_qp1->tr_indx;
1392 			if ((qpnum < qp1_indx) || (qpnum > qp1_indx + 1)) {
1393 				flags |= IBT_WC_IMMED_DATA_PRESENT;
1394 			}
1395 			/* FALLTHROUGH */
1396 		case TAVOR_CQE_RCV_RECV:
1397 			/* FALLTHROUGH */
1398 		case TAVOR_CQE_RCV_RECV2:
1399 			type = IBT_WRC_RECV;
1400 			break;
1401 
1402 		case TAVOR_CQE_RCV_RDMAWR_IMM:
1403 			/* FALLTHROUGH */
1404 		case TAVOR_CQE_RCV_RDMAWR_IMM2:
1405 			flags |= IBT_WC_IMMED_DATA_PRESENT;
1406 			type = IBT_WRC_RECV_RDMAWI;
1407 			break;
1408 
1409 		default:
1410 			TAVOR_WARNING(state, "unknown recv CQE type");
1411 			wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
1412 			TNF_PROBE_1(tavor_cq_cqe_consume_unknown_rcv_type,
1413 			    TAVOR_TNF_ERROR, "", tnf_uint, opcode, opcode);
1414 			TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
1415 			return (TAVOR_CQ_SYNC_AND_DB);
1416 		}
1417 	}
1418 	wc->wc_type = type;
1419 
1420 	/*
1421 	 * Check for GRH, update the flags, then fill in "wc_flags" field
1422 	 * in the work completion
1423 	 */
1424 	if (TAVOR_CQE_GRH_GET(cq, cqe) != 0) {
1425 		flags |= IBT_WC_GRH_PRESENT;
1426 	}
1427 	wc->wc_flags = flags;
1428 
1429 	/* If we got here, completion status must be success */
1430 	wc->wc_status = IBT_WC_SUCCESS;
1431 
1432 	/*
1433 	 * Parse the remaining contents of the CQE into the work completion.
1434 	 * This means filling in SL, QP number, SLID, immediate data, etc.
1435 	 * Note:  Not all of these fields are valid in a given completion.
1436 	 * Many of them depend on the actual type of completion.  So we fill
1437 	 * in all of the fields and leave it up to the IBTF and consumer to
1438 	 * sort out which are valid based on their context.
1439 	 */
1440 	wc->wc_sl	  = TAVOR_CQE_SL_GET(cq, cqe);
1441 	wc->wc_immed_data = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
1442 	wc->wc_qpn	  = TAVOR_CQE_DQPN_GET(cq, cqe);
1443 	wc->wc_res_hash	  = 0;
1444 	wc->wc_slid	  = TAVOR_CQE_DLID_GET(cq, cqe);
1445 	wc->wc_ethertype  = (wc->wc_immed_data & 0xFFFF);
1446 	wc->wc_pkey_ix	  = (wc->wc_immed_data >> 16);
1447 
1448 	/*
1449 	 * Depending on whether the completion was a receive or a send
1450 	 * completion, fill in "bytes transferred" as appropriate.  Also,
1451 	 * if necessary, fill in the "path bits" field.
1452 	 */
1453 	if (TAVOR_CQE_SENDRECV_GET(cq, cqe) == TAVOR_COMPLETION_RECV) {
1454 		wc->wc_path_bits = TAVOR_CQE_PATHBITS_GET(cq, cqe);
1455 		wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);
1456 
1457 	} else if ((wc->wc_type == IBT_WRC_RDMAR) ||
1458 	    (wc->wc_type == IBT_WRC_CSWAP) || (wc->wc_type == IBT_WRC_FADD)) {
1459 		wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);
1460 	}
1461 
1462 	TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
1463 	return (TAVOR_CQ_SYNC_AND_DB);
1464 }
1465 
1466 
1467 /*
1468  * tavor_cq_errcqe_consume()
1469  *    Context: Can be called from interrupt or base context.
1470  */
1471 static int
1472 tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
1473     tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
1474 {
1475 	uint64_t		next_wqeaddr;
1476 	uint32_t		imm_eth_pkey_cred;
1477 	uint_t			nextwqesize, dbd;
1478 	uint_t			doorbell_cnt, status;
1479 	tavor_wrid_entry_t	wre;
1480 
1481 	TAVOR_TNF_ENTER(tavor_cq_errcqe_consume);
1482 
1483 	/*
1484 	 * Fetch the Work Request ID using the information in the CQE.
1485 	 * See tavor_wr.c for more details.
1486 	 */
1487 	wc->wc_id = tavor_wrid_get_entry(cq, cqe, &wre);
1488 
1489 	/*
1490 	 * Parse the CQE opcode to determine completion type.  We know that
1491 	 * the CQE is an error completion, so we extract only the completion
1492 	 * status here.
1493 	 */
1494 	imm_eth_pkey_cred = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
1495 	status = imm_eth_pkey_cred >> TAVOR_CQE_ERR_STATUS_SHIFT;
1496 	switch (status) {
1497 	case TAVOR_CQE_LOC_LEN_ERR:
1498 		status = IBT_WC_LOCAL_LEN_ERR;
1499 		break;
1500 
1501 	case TAVOR_CQE_LOC_OP_ERR:
1502 		status = IBT_WC_LOCAL_QP_OP_ERR;
1503 		break;
1504 
1505 	case TAVOR_CQE_LOC_PROT_ERR:
1506 		status = IBT_WC_LOCAL_PROTECT_ERR;
1507 		break;
1508 
1509 	case TAVOR_CQE_WR_FLUSHED_ERR:
1510 		status = IBT_WC_WR_FLUSHED_ERR;
1511 		break;
1512 
1513 	case TAVOR_CQE_MW_BIND_ERR:
1514 		status = IBT_WC_MEM_WIN_BIND_ERR;
1515 		break;
1516 
1517 	case TAVOR_CQE_BAD_RESPONSE_ERR:
1518 		status = IBT_WC_BAD_RESPONSE_ERR;
1519 		break;
1520 
1521 	case TAVOR_CQE_LOCAL_ACCESS_ERR:
1522 		status = IBT_WC_LOCAL_ACCESS_ERR;
1523 		break;
1524 
1525 	case TAVOR_CQE_REM_INV_REQ_ERR:
1526 		status = IBT_WC_REMOTE_INVALID_REQ_ERR;
1527 		break;
1528 
1529 	case TAVOR_CQE_REM_ACC_ERR:
1530 		status = IBT_WC_REMOTE_ACCESS_ERR;
1531 		break;
1532 
1533 	case TAVOR_CQE_REM_OP_ERR:
1534 		status = IBT_WC_REMOTE_OP_ERR;
1535 		break;
1536 
1537 	case TAVOR_CQE_TRANS_TO_ERR:
1538 		status = IBT_WC_TRANS_TIMEOUT_ERR;
1539 		break;
1540 
1541 	case TAVOR_CQE_RNRNAK_TO_ERR:
1542 		status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
1543 		break;
1544 
1545 	/*
1546 	 * The following error codes are not supported in the Tavor driver
1547 	 * as they relate only to Reliable Datagram completion statuses:
1548 	 *    case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
1549 	 *    case TAVOR_CQE_REM_INV_RD_REQ_ERR:
1550 	 *    case TAVOR_CQE_EEC_REM_ABORTED_ERR:
1551 	 *    case TAVOR_CQE_INV_EEC_NUM_ERR:
1552 	 *    case TAVOR_CQE_INV_EEC_STATE_ERR:
1553 	 *    case TAVOR_CQE_LOC_EEC_ERR:
1554 	 */
1555 
1556 	default:
1557 		TAVOR_WARNING(state, "unknown error CQE status");
1558 		status = IBT_WC_LOCAL_QP_OP_ERR;
1559 		TNF_PROBE_1(tavor_cq_errcqe_consume_unknown_status,
1560 		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
1561 		break;
1562 	}
1563 	wc->wc_status = status;
1564 
1565 	/*
1566 	 * Now we do all the checking that's necessary to handle completion
1567 	 * queue entry "recycling."
1568 	 *
1569 	 * It is not necessary here to try to sync the WQE, as we are only
1570 	 * attempting to read from the Work Queue (and the hardware does not
1571 	 * write to it).
1572 	 */
1573 
1574 	/*
1575 	 * We can get the doorbell info, WQE address, and size for the next
1576 	 * WQE from the "wre" (which was filled in above by the call to the
1577 	 * tavor_wrid_get_entry() routine).
1578 	 */
1579 	dbd = (wre.wr_signaled_dbd & TAVOR_WRID_ENTRY_DOORBELLED) ? 1 : 0;
1580 	next_wqeaddr = wre.wr_wqeaddrsz;
1581 	nextwqesize  = wre.wr_wqeaddrsz & TAVOR_WQE_NDS_MASK;
1582 
1583 	/*
1584 	 * Get the doorbell count from the CQE.  This indicates how many
1585 	 * completions this one CQE represents.
1586 	 */
1587 	doorbell_cnt = imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
1588 
1589 	/*
1590 	 * Determine if we're ready to consume this CQE yet or not.  If the
1591 	 * next WQE has size zero (i.e. no next WQE) or if the doorbell count
1592 	 * is down to zero, then this is the last/only completion represented
1593 	 * by the current CQE (return TAVOR_CQ_SYNC_AND_DB).  Otherwise, the
1594 	 * current CQE needs to be recycled (see below).
1595 	 */
1596 	if ((nextwqesize == 0) || ((doorbell_cnt == 0) && (dbd == 1))) {
1597 		/*
1598 		 * Consume the CQE
1599 		 *    Return status to indicate that doorbell and sync may be
1600 		 *    necessary.
1601 		 */
1602 		TAVOR_TNF_EXIT(tavor_cq_errcqe_consume);
1603 		return (TAVOR_CQ_SYNC_AND_DB);
1604 
1605 	} else {
1606 		/*
1607 		 * Recycle the CQE for use in the next PollCQ() call
1608 		 *    Decrement the doorbell count, modify the error status,
1609 		 *    and update the WQE address and size (to point to the
1610 		 *    next WQE on the chain).  Put these updated entries back
1611 		 *    into the CQE.
1612 		 *    Despite the fact that we have updated the CQE, it is not
1613 		 *    necessary for us to attempt to sync this entry just yet,
1614 		 *    as we have not changed the "hardware's view" of the
1615 		 *    entry (i.e. we have not modified the "owner" bit, which
1616 		 *    is all that the Tavor hardware really cares about).
1617 		 */
1618 		doorbell_cnt = doorbell_cnt - dbd;
1619 		TAVOR_CQE_IMM_ETH_PKEY_CRED_SET(cq, cqe,
1620 		    ((TAVOR_CQE_WR_FLUSHED_ERR << TAVOR_CQE_ERR_STATUS_SHIFT) |
1621 		    (doorbell_cnt & TAVOR_CQE_ERR_DBDCNT_MASK)));
1622 		TAVOR_CQE_WQEADDRSZ_SET(cq, cqe,
1623 		    TAVOR_QP_WQEADDRSZ(next_wqeaddr, nextwqesize));
1624 
1625 		TAVOR_TNF_EXIT(tavor_cq_errcqe_consume);
1626 		return (TAVOR_CQ_RECYCLE_ENTRY);
1627 	}
1628 }
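/*
 * Illustrative sketch (not part of the driver): the recycling logic above
 * means one error CQE can stand in for several flushed WQEs.  A minimal
 * model of the decision, with hypothetical names, is shown below; each
 * PollCQ() pass consumes one doorbelled WQE until the WQE chain (or the
 * doorbell count) is exhausted.
 */
#if 0
static int
example_consume_error_cqe(uint_t *doorbell_cnt, uint_t dbd,
    uint_t nextwqesize)
{
	/* Last (or only) completion: consume the CQE outright */
	if ((nextwqesize == 0) || ((*doorbell_cnt == 0) && (dbd == 1))) {
		return (TAVOR_CQ_SYNC_AND_DB);
	}

	/* Otherwise leave the CQE in place for the next PollCQ() call */
	*doorbell_cnt = *doorbell_cnt - dbd;
	return (TAVOR_CQ_RECYCLE_ENTRY);
}
#endif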
1629 
1630 
1631 /*
1632  * tavor_cqe_sync()
1633  *    Context: Can be called from interrupt or base context.
1634  */
1635 static void
1636 tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe, uint_t flag)
1637 {
1638 	ddi_dma_handle_t	dmahdl;
1639 	off_t			offset;
1640 	int			status;
1641 
1642 	TAVOR_TNF_ENTER(tavor_cqe_sync);
1643 
1644 	/* Determine if CQ needs to be synced or not */
1645 	if (cq->cq_sync == 0) {
1646 		TAVOR_TNF_EXIT(tavor_cqe_sync);
1647 		return;
1648 	}
1649 
1650 	/* Get the DMA handle from CQ context */
1651 	dmahdl = cq->cq_mrhdl->mr_bindinfo.bi_dmahdl;
1652 
1653 	/* Calculate offset of next CQE */
1654 	offset = (off_t)((uintptr_t)cqe - (uintptr_t)&cq->cq_buf[0]);
1655 	status = ddi_dma_sync(dmahdl, offset, sizeof (tavor_hw_cqe_t), flag);
1656 	if (status != DDI_SUCCESS) {
1657 		TNF_PROBE_0(tavor_cqe_sync_getnextentry_fail,
1658 		    TAVOR_TNF_ERROR, "");
1659 		TAVOR_TNF_EXIT(tavor_cqe_sync);
1660 		return;
1661 	}
1662 
1663 	TAVOR_TNF_EXIT(tavor_cqe_sync);
1664 }
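/*
 * Illustrative sketch (not part of the driver): callers are expected to
 * pass DDI_DMA_SYNC_FORCPU before software reads a CQE and (elsewhere in
 * this file) DDI_DMA_SYNC_FORDEV after software updates one.  With
 * hypothetical "example_" names:
 */
#if 0
	/* About to inspect the owner bit: make the CPU's view current */
	tavor_cqe_sync(example_cq, example_cqe, DDI_DMA_SYNC_FORCPU);

	/* After updating the entry: flush the change back for the device */
	tavor_cqe_sync(example_cq, example_cqe, DDI_DMA_SYNC_FORDEV);
#endif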
1665 
1666 
1667 /*
1668  * tavor_cq_resize_helper()
1669  *    Context: Can be called only from user or kernel context.
1670  */
1671 static void
1672 tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
1673     uint32_t old_cons_indx, uint32_t num_newcqe)
1674 {
1675 	tavor_hw_cqe_t	*old_cqe, *new_cqe;
1676 	uint32_t	new_cons_indx, wrap_around_mask;
1677 	int		i;
1678 
1679 	TAVOR_TNF_ENTER(tavor_cq_resize_helper);
1680 
1681 	ASSERT(MUTEX_HELD(&cq->cq_lock));
1682 
1683 	/* Start the "new" CQ's consumer index at zero */
1684 	new_cons_indx = 0;
1685 
1686 	/*
1687 	 * Calculate the wrap around mask.  Note: This operation only works
1688 	 * because all Tavor completion queues have power-of-2 sizes
1689 	 */
1690 	wrap_around_mask = (cq->cq_bufsz - 1);
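	/*
	 * Worked example: for a cq_bufsz of 64 the mask is 0x3F, so an
	 * index of 63 advances to (63 + 1) & 0x3F == 0, wrapping back to
	 * the start of the ring.
	 */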
1691 
1692 	/*
1693 	 * Calculate the pointers to the first CQ entry (in the "old" CQ)
1694 	 * and the first CQ entry in the "new" CQ
1695 	 */
1696 	old_cqe = &cq->cq_buf[old_cons_indx];
1697 	new_cqe = &new_cqbuf[new_cons_indx];
1698 
1699 	/* Sync entire "old" CQ for use by software (if necessary). */
1700 	if (cq->cq_sync) {
1701 		(void) ddi_dma_sync(cq->cq_mrhdl->mr_bindinfo.bi_dmahdl,
1702 		    0, cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORCPU);
1703 	}
1704 
1705 	/*
1706 	 * Copy the "num_newcqe" outstanding entries (already counted by our
1707 	 * caller) from the "old" CQ into the "new" CQ, updating the
1708 	 * respective indices and pointers in both CQs as we go.
1709 	 */
1710 	for (i = 0; i < num_newcqe; i++) {
1711 
1712 		/* Copy this old CQE into the "new_cqe" pointer */
1713 		bcopy(old_cqe, new_cqe, sizeof (tavor_hw_cqe_t));
1714 
1715 		/* Increment the consumer index (for both CQs) */
1716 		old_cons_indx = (old_cons_indx + 1) & wrap_around_mask;
1717 		new_cons_indx = (new_cons_indx + 1);
1718 
1719 		/* Update the pointer to the next CQ entry */
1720 		old_cqe = &cq->cq_buf[old_cons_indx];
1721 		new_cqe = &new_cqbuf[new_cons_indx];
1722 	}
1723 
1724 	TAVOR_TNF_EXIT(tavor_cq_resize_helper);
1725 }
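/*
 * Illustrative sketch (not part of the driver): the helper above is an
 * instance of copying "n" live entries out of a power-of-2 ring into a
 * fresh buffer.  A generic, hypothetical version might look like:
 */
#if 0
static void
example_ring_copy(tavor_hw_cqe_t *old_buf, uint32_t old_sz, uint32_t first,
    tavor_hw_cqe_t *new_buf, uint32_t n)
{
	uint32_t	mask = old_sz - 1;	/* old_sz must be 2^k */
	uint32_t	i;

	for (i = 0; i < n; i++) {
		bcopy(&old_buf[(first + i) & mask], &new_buf[i],
		    sizeof (tavor_hw_cqe_t));
	}
}
#endif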
1726 
1727 /*
1728  * tavor_cq_srq_entries_flush()
1729  *    Context: Can be called from interrupt or base context.
1730  */
1731 void
1732 tavor_cq_srq_entries_flush(tavor_state_t *state, tavor_qphdl_t qp)
1733 {
1734 	tavor_cqhdl_t		cq;
1735 	tavor_workq_hdr_t	*wqhdr;
1736 	tavor_hw_cqe_t		*cqe;
1737 	tavor_hw_cqe_t		*next_cqe;
1738 	uint32_t		cons_indx, tail_cons_indx, wrap_around_mask;
1739 	uint32_t		new_indx, check_indx, indx;
1740 	uint32_t		num_to_increment;
1741 	int			cqe_qpnum, cqe_type;
1742 	int			outstanding_cqes, removed_cqes;
1743 	int			i;
1744 
1745 	ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock));
1746 
1747 	cq = qp->qp_rq_cqhdl;
1748 	wqhdr = qp->qp_rq_wqhdr;
1749 
1750 	ASSERT(wqhdr->wq_wrid_post != NULL);
1751 	ASSERT(wqhdr->wq_wrid_post->wl_srq_en != 0);
1752 
1753 	/*
1754 	 * Check for user-mapped CQ memory.  Note:  We do not allow kernel
1755 	 * clients to modify any user-mapped CQ.  If the CQ is user-mapped,
1756 	 * then we simply return here, and this "flush" function becomes a
1757 	 * NO-OP.
1758 	 */
1759 	if (cq->cq_is_umap) {
1760 		return;
1761 	}
1762 
1763 	/* Get the consumer index */
1764 	cons_indx = cq->cq_consindx;
1765 
1766 	/*
1767 	 * Calculate the wrap around mask.  Note: This operation only works
1768 	 * because all Tavor completion queues have power-of-2 sizes
1769 	 */
1770 	wrap_around_mask = (cq->cq_bufsz - 1);
1771 
1772 	/* Calculate the pointer to the first CQ entry */
1773 	cqe = &cq->cq_buf[cons_indx];
1774 
1775 	/* Sync the current CQE to read */
1776 	tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
1777 
1778 	/*
1779 	 * Loop through the CQ looking for entries owned by software.  Each
1780 	 * software-owned entry increments the 'outstanding_cqes' count, so
1781 	 * that we know how many entries in total are sitting on the CQ.  We
1782 	 * use this value further down to know how many entries to examine
1783 	 * while looking for our QP number.
1784 	 */
1785 	outstanding_cqes = 0;
1786 	tail_cons_indx = cons_indx;
1787 	while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
1788 		/* increment total cqes count */
1789 		outstanding_cqes++;
1790 
1791 		/* increment the consumer index */
1792 		tail_cons_indx = (tail_cons_indx + 1) & wrap_around_mask;
1793 
1794 		/* update the pointer to the next cq entry */
1795 		cqe = &cq->cq_buf[tail_cons_indx];
1796 
1797 		/* sync the next cqe to read */
1798 		tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
1799 	}
1800 
1801 	/*
1802 	 * Using the 'tail_cons_indx' that was just set, we now know how many
1803 	 * CQEs there are in total.  Set the 'check_indx' and the 'new_indx'
1804 	 * to the last entry identified by 'tail_cons_indx'.
1805 	 */
1806 	check_indx = new_indx = (tail_cons_indx - 1) & wrap_around_mask;
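	/*
	 * Worked example (values hypothetical): with cq_bufsz == 8,
	 * cons_indx == 2, and software-owned CQEs at indices 2, 3 and 4,
	 * the scan above leaves tail_cons_indx == 5, so check_indx and
	 * new_indx both start at 4.  The loop below then walks backwards
	 * (4, 3, 2), compacting the CQEs that are kept toward the tail of
	 * that range.
	 */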
1807 
1808 	for (i = 0; i < outstanding_cqes; i++) {
1809 		cqe = &cq->cq_buf[check_indx];
1810 
1811 		/* Grab QP number from CQE */
1812 		cqe_qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
1813 		cqe_type = TAVOR_CQE_SENDRECV_GET(cq, cqe);
1814 
1815 		/*
1816 		 * If the QP number in the CQE matches the QP on this SRQ,
1817 		 * then we must return the entry to the SRQ free list.  We
1818 		 * also make sure that the completion type is the
1819 		 * 'TAVOR_COMPLETION_RECV' type, so any send completions on
1820 		 * this CQ will be left as-is.  The handling of returning
1821 		 * entries back to HW ownership happens further down.
1822 		 */
1823 		if (cqe_qpnum == qp->qp_qpnum &&
1824 		    cqe_type == TAVOR_COMPLETION_RECV) {
1825 
1826 			/* Add back to SRQ free list */
1827 			(void) tavor_wrid_find_match_srq(wqhdr->wq_wrid_post,
1828 			    cq, cqe);
1829 		} else {
1830 			/* Do Copy */
1831 			if (check_indx != new_indx) {
1832 				next_cqe = &cq->cq_buf[new_indx];
1833 
1834 				/*
1835 				 * Copy the CQE into the "next_cqe"
1836 				 * pointer.
1837 				 */
1838 				bcopy(cqe, next_cqe, sizeof (tavor_hw_cqe_t));
1839 			}
1840 			new_indx = (new_indx - 1) & wrap_around_mask;
1841 		}
1842 		/* Move index to next CQE to check */
1843 		check_indx = (check_indx - 1) & wrap_around_mask;
1844 	}
1845 
1846 	/* Initialize removed cqes count */
1847 	removed_cqes = 0;
1848 
1849 	/* If an entry was removed */
1850 	if (check_indx != new_indx) {
1851 
1852 		/*
1853 		 * Set current pointer back to the beginning consumer index.
1854 		 * At this point, all unclaimed entries have been copied to the
1855 		 * index specified by 'new_indx'.  This 'new_indx' will be used
1856 		 * as the new consumer index after we mark all freed entries as
1857 		 * having HW ownership.  We do that here.
1858 		 */
1859 
1860 		/* Loop through all entries until we reach our new pointer */
1861 		for (indx = cons_indx; indx <= new_indx;
1862 		    indx = (indx + 1) & wrap_around_mask) {
1863 			removed_cqes++;
1864 			cqe = &cq->cq_buf[indx];
1865 
1866 			/* Reset entry to hardware ownership */
1867 			TAVOR_CQE_OWNER_SET_HW(cq, cqe);
1868 		}
1869 	}
1870 
1871 	/*
1872 	 * Update consumer index to be the 'new_indx'.  This moves it past all
1873 	 * removed entries.  Because 'new_indx' is pointing to the last
1874 	 * previously valid SW owned entry, we add 1 to point the cons_indx to
1875 	 * the first HW owned entry.
1876 	 */
1877 	cons_indx = (new_indx + 1) & wrap_around_mask;
1878 
1879 	/*
1880 	 * Now we only ring the doorbell (to update the consumer index) if
1881 	 * we've actually consumed a CQ entry.  If we found no QP number
1882 	 * matches above, then we would not have removed anything.  So only if
1883 	 * something was removed do we ring the doorbell.
1884 	 */
1885 	if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
1886 		/*
1887 		 * Post doorbell to update the consumer index.  Doorbell
1888 		 * value indicates number of entries consumed (minus 1)
1889 		 */
1890 		if (cons_indx > cq->cq_consindx) {
1891 			num_to_increment = (cons_indx - cq->cq_consindx) - 1;
1892 		} else {
1893 			num_to_increment = ((cons_indx + cq->cq_bufsz) -
1894 			    cq->cq_consindx) - 1;
1895 		}
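		/*
		 * Worked example (values hypothetical): with cq_bufsz == 8,
		 * cq_consindx == 6, and a new cons_indx of 2, the index has
		 * wrapped, so num_to_increment = ((2 + 8) - 6) - 1 == 3;
		 * four entries were consumed, and the doorbell carries that
		 * count minus one.
		 */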
1896 		cq->cq_consindx = cons_indx;
1897 
1898 		tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
1899 		    cq->cq_cqnum, num_to_increment);
1900 	}
1901 }
1902