/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * tavor_srq.c
 *    Tavor Shared Receive Queue Processing Routines
 *
 *    Implements all the routines necessary for allocating, freeing, querying,
 *    modifying and posting shared receive queues.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/bitmap.h>

#include <sys/ib/adapters/tavor/tavor.h>

static void tavor_srq_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
    tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl);

/*
 * tavor_srq_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_srq_alloc(tavor_state_t *state, tavor_srq_info_t *srqinfo,
    uint_t sleepflag, tavor_srq_options_t *op)
{
	ibt_srq_hdl_t		ibt_srqhdl;
	tavor_pdhdl_t		pd;
	ibt_srq_sizes_t		*sizes;
	ibt_srq_sizes_t		*real_sizes;
	tavor_srqhdl_t		*srqhdl;
	ibt_srq_flags_t		flags;
	tavor_rsrc_t		*srqc, *rsrc;
	tavor_hw_srqc_t		srqc_entry;
	uint32_t		*buf;
	tavor_srqhdl_t		srq;
	tavor_umap_db_entry_t	*umapdb;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	mr_op;
	tavor_mrhdl_t		mr;
	uint64_t		addr;
	uint64_t		value, srq_desc_off;
	uint32_t		lkey;
	uint32_t		log_srq_size;
	uint32_t		uarpg;
	uint_t			wq_location, dma_xfer_mode, srq_is_umap;
	int			flag, status;
	char			*errormsg;
	uint_t			max_sgl;
	uint_t			wqesz;

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sizes))

	TAVOR_TNF_ENTER(tavor_srq_alloc);

	/*
	 * Check the "options" flag.  Currently this flag tells the driver
	 * whether the SRQ's work queues should come from normal system
	 * memory or be allocated from DDR memory.
	 */
	if (op == NULL) {
		wq_location = TAVOR_QUEUE_LOCATION_NORMAL;
	} else {
		wq_location = op->srqo_wq_loc;
	}

	/*
	 * Extract the necessary info from the tavor_srq_info_t structure
	 */
	real_sizes = srqinfo->srqi_real_sizes;
	sizes	   = srqinfo->srqi_sizes;
	pd	   = srqinfo->srqi_pd;
	ibt_srqhdl = srqinfo->srqi_ibt_srqhdl;
	flags	   = srqinfo->srqi_flags;
	srqhdl	   = srqinfo->srqi_srqhdl;

	/*
	 * Determine whether SRQ is being allocated for userland access or
	 * whether it is being allocated for kernel access.  If the SRQ is
	 * being allocated for userland access, then lookup the UAR doorbell
	 * page number for the current process.  Note:  If this is not found
	 * (e.g. if the process has not previously open()'d the Tavor driver),
	 * then an error is returned.
	 */
	srq_is_umap = (flags & IBT_SRQ_USER_MAP) ? 1 : 0;
	if (srq_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
		    MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
		if (status != DDI_SUCCESS) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page");
			goto srqalloc_fail;
		}
		uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
	}

	/* Increase PD refcnt */
	tavor_pd_refcnt_inc(pd);

	/* Allocate an SRQ context entry */
	status = tavor_rsrc_alloc(state, TAVOR_SRQC, 1, sleepflag, &srqc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ context");
		goto srqalloc_fail1;
	}

	/* Allocate the SRQ Handle entry */
	status = tavor_rsrc_alloc(state, TAVOR_SRQHDL, 1, sleepflag, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ handle");
		goto srqalloc_fail2;
	}

	srq = (tavor_srqhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq))

	srq->srq_srqnum = srqc->tr_indx;	/* just use index */

	/*
	 * If this will be a user-mappable SRQ, then allocate an entry for
	 * the "userland resources database".  This will later be added to
	 * the database (after all further SRQ operations are successful).
	 * If we fail here, we must undo the reference counts and the
	 * previous resource allocation.
	 */
	if (srq_is_umap) {
		umapdb = tavor_umap_db_alloc(state->ts_instance,
		    srq->srq_srqnum, MLNX_UMAP_SRQMEM_RSRC,
		    (uint64_t)(uintptr_t)rsrc);
		if (umapdb == NULL) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
			goto srqalloc_fail3;
		}
	}

	/*
	 * Calculate the appropriate size for the SRQ.
	 * Note:  All Tavor SRQs must be a power-of-2 in size.  Also
	 * they may not be any smaller than TAVOR_SRQ_MIN_SIZE.  This step
	 * is to round the requested size up to the next highest power-of-2
	 */
	sizes->srq_wr_sz = max(sizes->srq_wr_sz, TAVOR_SRQ_MIN_SIZE);
	log_srq_size = highbit(sizes->srq_wr_sz);
	if ((sizes->srq_wr_sz & (sizes->srq_wr_sz - 1)) == 0) {
		log_srq_size = log_srq_size - 1;
	}
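
	/*
	 * For example (illustrative values): a request for 1000 WRs gives
	 * highbit(1000) == 10, and since 1000 is not a power-of-2 the size
	 * is rounded up to 2^10 == 1024 entries.  A request for exactly 512
	 * WRs gives highbit(512) == 10, but the power-of-2 test subtracts
	 * one, leaving log_srq_size == 9 (512 entries, no rounding needed).
	 */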

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits).  If not,
	 * then obviously we have a lot of cleanup to do before returning.
	 */
	if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max SRQ size");
		goto srqalloc_fail4;
	}

	/*
	 * Next we verify that the requested number of SGL is valid (i.e.
	 * consistent with the device limits and/or software-configured
	 * limits).  If not, then obviously the same cleanup needs to be done.
	 */
	max_sgl = state->ts_cfg_profile->cp_srq_max_sgl;
	if (sizes->srq_sgl_sz > max_sgl) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_SGL_EXCEEDED, "max SRQ SGL");
		goto srqalloc_fail4;
	}

	/*
	 * Determine the SRQ's WQE sizes.  This depends on the requested
	 * number of SGLs.  Note: This also has the side-effect of
	 * calculating the real number of SGLs (for the calculated WQE size)
	 */
	tavor_srq_sgl_to_logwqesz(state, sizes->srq_sgl_sz,
	    TAVOR_QP_WQ_TYPE_RECVQ, &srq->srq_wq_log_wqesz,
	    &srq->srq_wq_sgl);

	/*
	 * Allocate the memory for SRQ work queues.  Note:  The location from
	 * which we will allocate these work queues has been passed in through
	 * the tavor_qp_options_t structure.  Since Tavor work queues are not
	 * allowed to cross a 32-bit (4GB) boundary, the alignment of the work
	 * queue memory is very important.  We used to allocate work queues
	 * (the combined receive and send queues) so that they would be aligned
	 * on their combined size.  That alignment guaranteed that they would
	 * never cross the 4GB boundary (Tavor work queues are on the order of
	 * MBs at maximum).  Now we are able to relax this alignment constraint
	 * by ensuring that the IB address assigned to the queue memory (as a
	 * result of the tavor_mr_register() call) is offset from zero.
	 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
	 * guarantee the alignment, but when attempting to use IOMMU bypass
	 * mode we found that we were not allowed to specify any alignment that
	 * was more restrictive than the system page size.  So we avoided this
	 * constraint by passing two alignment values, one for the memory
	 * allocation itself and the other for the DMA handle (for later bind).
	 * This used to cause more memory than necessary to be allocated (in
	 * order to guarantee the more restrictive alignment constraint).  But
	 * by guaranteeing the zero-based IB virtual address for the queue, we
	 * are able to conserve this memory.
	 *
	 * Note: If SRQ is not user-mappable, then it may come from either
	 * kernel system memory or from HCA-attached local DDR memory.
	 *
	 * Note2: We align this queue on a pagesize boundary.  This is required
	 * to make sure that all the resulting IB addresses will start at 0,
	 * for a zero-based queue.  By making sure we are aligned on at least
	 * a page, any offset we use into our queue will be the same as when
	 * we perform tavor_srq_modify() operations later.
	 */
	wqesz = (1 << srq->srq_wq_log_wqesz);
	srq->srq_wqinfo.qa_size = (1 << log_srq_size) * wqesz;
	srq->srq_wqinfo.qa_alloc_align = PAGESIZE;
	srq->srq_wqinfo.qa_bind_align = PAGESIZE;
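
	/*
	 * Illustrative sizing (hypothetical numbers): with log_srq_size == 9
	 * (512 entries) and srq_wq_log_wqesz == 6 (64-byte WQEs), qa_size is
	 * 512 * 64 == 32KB, and both the allocation and the later DMA bind
	 * are page-aligned so that the queue's IB address can be zero-based.
	 */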
	if (srq_is_umap) {
		srq->srq_wqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		srq->srq_wqinfo.qa_location = wq_location;
	}
	status = tavor_queue_alloc(state, &srq->srq_wqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
		goto srqalloc_fail4;
	}
	buf = (uint32_t *)srq->srq_wqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Register the memory for the SRQ work queues.  The memory for the SRQ
	 * must be registered in the Tavor TPT tables.  This gives us the LKey
	 * to specify in the SRQ context later.  Note: If the work queue is to
	 * be allocated from DDR memory, then only a "bypass" mapping is
	 * appropriate.  And if the SRQ memory is user-mappable, then we force
	 * DDI_DMA_CONSISTENT mapping.  Also, in order to meet the alignment
	 * restriction, we pass the "mro_bind_override_addr" flag in the call
	 * to tavor_mr_register().  This guarantees that the resulting IB vaddr
	 * will be zero-based (modulo the offset into the first page).  If we
	 * fail here, we still have the bunch of resource and reference count
	 * cleanup to do.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
	    IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
	mr_attr.mr_len   = srq->srq_wqinfo.qa_size;
	mr_attr.mr_as    = NULL;
	mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
	if (srq_is_umap) {
		mr_op.mro_bind_type   = state->ts_cfg_profile->cp_iommu_bypass;
	} else {
		if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
			mr_op.mro_bind_type =
			    state->ts_cfg_profile->cp_iommu_bypass;
			dma_xfer_mode =
			    state->ts_cfg_profile->cp_streaming_consistent;
			if (dma_xfer_mode == DDI_DMA_STREAMING) {
				mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
			}
		} else {
			mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS;
		}
	}
	mr_op.mro_bind_dmahdl = srq->srq_wqinfo.qa_dmahdl;
	mr_op.mro_bind_override_addr = 1;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
		goto srqalloc_fail5;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
	addr = mr->mr_bindinfo.bi_addr;
	lkey = mr->mr_lkey;

	/*
	 * Calculate the offset between the kernel virtual address space
	 * and the IB virtual address space.  This will be used when
	 * posting work requests to properly initialize each WQE.
	 */
	srq_desc_off = (uint64_t)(uintptr_t)srq->srq_wqinfo.qa_buf_aligned -
	    (uint64_t)mr->mr_bindinfo.bi_addr;
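
	/*
	 * E.g. (hypothetical addresses): if the queue sits at kernel VA
	 * 0xFFFFFD8000123400 and the override-addr bind yields IB vaddr
	 * 0x400 (the offset into the first page is preserved), then
	 * srq_desc_off is 0xFFFFFD8000123000 and a WQE's IB address is
	 * simply its kernel pointer minus srq_desc_off.
	 */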

	/*
	 * Create WQL and Wridlist for use by this SRQ
	 */
	srq->srq_wrid_wql = tavor_wrid_wql_create(state);
	if (srq->srq_wrid_wql == NULL) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wql create");
		goto srqalloc_fail6;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wrid_wql)))

	srq->srq_wridlist = tavor_wrid_get_list(1 << log_srq_size);
	if (srq->srq_wridlist == NULL) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wridlist create");
		goto srqalloc_fail7;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wridlist)))

	srq->srq_wridlist->wl_srq_en = 1;
	srq->srq_wridlist->wl_free_list_indx = -1;
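
	/*
	 * Note (editorial): wl_srq_en marks this wridlist as SRQ-owned, and
	 * the -1 here is presumed to be the "empty" sentinel for the WRID
	 * free list; the list is populated by the tavor_wrid_list_srq_init()
	 * call below (or later, for user-mappable SRQs).
	 */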

	/*
	 * Fill in all the return arguments (if necessary).  This includes
	 * real queue size and real SGLs.
	 */
	if (real_sizes != NULL) {
		real_sizes->srq_wr_sz = (1 << log_srq_size);
		real_sizes->srq_sgl_sz = srq->srq_wq_sgl;
	}

	/*
	 * Fill in the SRQC entry.  This is the final step before passing
	 * ownership of the SRQC entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the SRQC.  Note: If this SRQ is going to be
	 * used for userland access, then we need to set the UAR page number
	 * appropriately (otherwise it's a "don't care")
	 */
	bzero(&srqc_entry, sizeof (tavor_hw_srqc_t));
	srqc_entry.wqe_addr_h	   = (addr >> 32);
	srqc_entry.next_wqe_addr_l = 0;
	srqc_entry.ds		   = (wqesz >> 4);
	srqc_entry.state	   = TAVOR_SRQ_STATE_HW_OWNER;
	srqc_entry.pd		   = pd->pd_pdnum;
	srqc_entry.lkey		   = lkey;
	srqc_entry.wqe_cnt	   = 0;
	if (srq_is_umap) {
		srqc_entry.uar	   = uarpg;
	} else {
		srqc_entry.uar	   = 0;
	}
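
	/*
	 * For example: the "ds" (descriptor size) field is expressed in
	 * 16-byte units, so a 64-byte WQE is encoded as ds == 4.  Likewise,
	 * wqe_addr_h holds only the upper 32 bits of the (zero-based) IB
	 * address computed above, which is why the queue must not cross a
	 * 4GB boundary.
	 */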

	/*
	 * Write the SRQC entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware (using the Tavor SW2HW_SRQ firmware
	 * command).  Note: In general, this operation shouldn't fail.  But
	 * if it does, we have to undo everything we've done above before
	 * returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_SRQ, &srqc_entry,
	    sizeof (tavor_hw_srqc_t), srq->srq_srqnum,
	    sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_SRQ command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_srq_alloc_sw2hw_srq_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_FAILURE, "tavor SW2HW_SRQ command");
		goto srqalloc_fail8;
	}

	/*
	 * Fill in the rest of the Tavor SRQ handle.  We can update
	 * the following fields for use in further operations on the SRQ.
	 */
	srq->srq_srqcrsrcp = srqc;
	srq->srq_rsrcp	   = rsrc;
	srq->srq_mrhdl	   = mr;
	srq->srq_refcnt	   = 0;
	srq->srq_is_umap   = srq_is_umap;
	srq->srq_uarpg	   = (srq->srq_is_umap) ? uarpg : 0;
	srq->srq_umap_dhp  = (devmap_cookie_t)NULL;
	srq->srq_pdhdl	   = pd;
	srq->srq_wq_lastwqeindx = -1;
	srq->srq_wq_bufsz  = (1 << log_srq_size);
	srq->srq_wq_buf	   = buf;
	srq->srq_desc_off  = srq_desc_off;
	srq->srq_hdlrarg   = (void *)ibt_srqhdl;
	srq->srq_state	   = 0;
	srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);
	srq->srq_real_sizes.srq_sgl_sz = srq->srq_wq_sgl;

	/* Determine if later ddi_dma_sync will be necessary */
	srq->srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);

	/*
	 * Put SRQ handle in Tavor SRQNum-to-SRQhdl list.  Then fill in the
	 * "srqhdl" and return success
	 */
	ASSERT(state->ts_srqhdl[srqc->tr_indx] == NULL);
	state->ts_srqhdl[srqc->tr_indx] = srq;
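
	/*
	 * Note (editorial): ts_srqhdl[] is indexed by the SRQC resource
	 * index, which is also the SRQ number assigned above.  This is the
	 * table that tavor_srqhdl_from_srqnum() (below) consults when
	 * mapping an event's SRQ number back to a software handle.
	 */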

	/*
	 * If this is a user-mappable SRQ, then we need to insert the
	 * previously allocated entry into the "userland resources database".
	 * This will allow for later lookup during devmap() (i.e. mmap())
	 * calls.
	 */
	if (srq->srq_is_umap) {
		tavor_umap_db_add(umapdb);
	} else {
		mutex_enter(&srq->srq_wrid_wql->wql_lock);
		tavor_wrid_list_srq_init(srq->srq_wridlist, srq, 0);
		mutex_exit(&srq->srq_wrid_wql->wql_lock);
	}

	*srqhdl = srq;

	TAVOR_TNF_EXIT(tavor_srq_alloc);
	return (status);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
srqalloc_fail8:
	kmem_free(srq->srq_wridlist->wl_wre, srq->srq_wridlist->wl_size *
	    sizeof (tavor_wrid_entry_t));
	kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t));
srqalloc_fail7:
	tavor_wql_refcnt_dec(srq->srq_wrid_wql);
srqalloc_fail6:
	if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    TAVOR_SLEEPFLAG_FOR_CONTEXT()) != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister SRQ memory");
	}
srqalloc_fail5:
	tavor_queue_free(state, &srq->srq_wqinfo);
srqalloc_fail4:
	if (srq_is_umap) {
		tavor_umap_db_free(umapdb);
	}
srqalloc_fail3:
	tavor_rsrc_free(state, &rsrc);
srqalloc_fail2:
	tavor_rsrc_free(state, &srqc);
srqalloc_fail1:
	tavor_pd_refcnt_dec(pd);
srqalloc_fail:
	TNF_PROBE_1(tavor_srq_alloc_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_srq_alloc);
	return (status);
}
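
/*
 * Usage sketch (editorial, not part of the driver): a kernel-context
 * caller would typically fill in a tavor_srq_info_t and invoke
 * tavor_srq_alloc() along these lines.  The variables my_state, my_pd
 * and my_ibt_hdl are hypothetical.
 *
 *	tavor_srq_info_t	srqinfo;
 *	tavor_srqhdl_t		srqhdl;
 *	ibt_srq_sizes_t		sizes, real_sizes;
 *	int			status;
 *
 *	sizes.srq_wr_sz  = 1000;	// rounded up to 1024 internally
 *	sizes.srq_sgl_sz = 4;
 *	srqinfo.srqi_sizes	= &sizes;
 *	srqinfo.srqi_real_sizes	= &real_sizes;
 *	srqinfo.srqi_pd		= my_pd;
 *	srqinfo.srqi_ibt_srqhdl	= my_ibt_hdl;
 *	srqinfo.srqi_flags	= 0;		// kernel (non-umap) SRQ
 *	srqinfo.srqi_srqhdl	= &srqhdl;
 *	status = tavor_srq_alloc(my_state, &srqinfo, TAVOR_SLEEP, NULL);
 */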


/*
 * tavor_srq_free()
 *    Context: Can be called only from user or kernel context.
 */
/* ARGSUSED */
int
tavor_srq_free(tavor_state_t *state, tavor_srqhdl_t *srqhdl, uint_t sleepflag)
{
	tavor_rsrc_t		*srqc, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	uint64_t		value;
	tavor_srqhdl_t		srq;
	tavor_mrhdl_t		mr;
	tavor_pdhdl_t		pd;
	tavor_hw_srqc_t		srqc_entry;
	uint32_t		srqnum;
	uint32_t		size;
	uint_t			maxprot;
	int			status;

	TAVOR_TNF_ENTER(tavor_srq_free);

	/*
	 * Pull all the necessary information from the Tavor Shared Receive
	 * Queue handle.  This is necessary here because the resource for the
	 * SRQ handle is going to be freed up as part of this operation.
	 */
	srq	= *srqhdl;
	mutex_enter(&srq->srq_lock);
	srqc	= srq->srq_srqcrsrcp;
	rsrc	= srq->srq_rsrcp;
	pd	= srq->srq_pdhdl;
	mr	= srq->srq_mrhdl;
	srqnum	= srq->srq_srqnum;

	/*
	 * If there are work queues still associated with the SRQ, then return
	 * an error.  Otherwise, we will be holding the SRQ lock.
	 */
	if (srq->srq_refcnt != 0) {
		mutex_exit(&srq->srq_lock);
		TNF_PROBE_1(tavor_srq_free_refcnt_fail, TAVOR_TNF_ERROR, "",
		    tnf_int, refcnt, srq->srq_refcnt);
		TAVOR_TNF_EXIT(tavor_srq_free);
		return (IBT_SRQ_IN_USE);
	}

	/*
	 * If this was a user-mappable SRQ, then we need to remove its entry
	 * from the "userland resources database".  If it is also currently
	 * mmap()'d out to a user process, then we need to call
	 * devmap_devmem_remap() to remap the SRQ memory to an invalid mapping.
	 * We also need to invalidate the SRQ tracking information for the
	 * user mapping.
	 */
	if (srq->srq_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, srq->srq_srqnum,
		    MLNX_UMAP_SRQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
		    &umapdb);
		if (status != DDI_SUCCESS) {
			mutex_exit(&srq->srq_lock);
			TAVOR_WARNING(state, "failed to find in database");
			TAVOR_TNF_EXIT(tavor_srq_free);
			return (ibc_get_ci_failure(0));
		}
		tavor_umap_db_free(umapdb);
		if (srq->srq_umap_dhp != NULL) {
			maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
			status = devmap_devmem_remap(srq->srq_umap_dhp,
			    state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size,
			    maxprot, DEVMAP_MAPPING_INVALID, NULL);
			if (status != DDI_SUCCESS) {
				mutex_exit(&srq->srq_lock);
				TAVOR_WARNING(state, "failed in SRQ memory "
				    "devmap_devmem_remap()");
				TAVOR_TNF_EXIT(tavor_srq_free);
				return (ibc_get_ci_failure(0));
			}
			srq->srq_umap_dhp = (devmap_cookie_t)NULL;
		}
	}

	/*
	 * Put NULL into the Tavor SRQNum-to-SRQHdl list.  This will allow any
	 * in-progress events to detect that the SRQ corresponding to this
	 * number has been freed.
	 */
	state->ts_srqhdl[srqc->tr_indx] = NULL;

	mutex_exit(&srq->srq_lock);
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq));
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq->srq_wridlist));

	/*
	 * Reclaim SRQC entry from hardware (using the Tavor HW2SW_SRQ
	 * firmware command).  If the ownership transfer fails for any reason,
	 * then it is an indication that something (either in HW or SW) has
	 * gone seriously wrong.
	 */
	status = tavor_cmn_ownership_cmd_post(state, HW2SW_SRQ, &srqc_entry,
	    sizeof (tavor_hw_srqc_t), srqnum, sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		TAVOR_WARNING(state, "failed to reclaim SRQC ownership");
		cmn_err(CE_CONT, "Tavor: HW2SW_SRQ command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_srq_free_hw2sw_srq_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		TAVOR_TNF_EXIT(tavor_srq_free);
		return (IBT_FAILURE);
	}

	/*
	 * Deregister the memory for the Shared Receive Queue.  If this fails
	 * for any reason, then it is an indication that something (either
	 * in HW or SW) has gone seriously wrong.  So we print a warning
	 * message and return.
	 */
	status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    sleepflag);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister SRQ memory");
		TNF_PROBE_0(tavor_srq_free_dereg_mr_fail, TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_srq_free);
		return (IBT_FAILURE);
	}

	/* Calculate the size and free the wridlist container */
	if (srq->srq_wridlist != NULL) {
		size = (srq->srq_wridlist->wl_size *
		    sizeof (tavor_wrid_entry_t));
		kmem_free(srq->srq_wridlist->wl_wre, size);
		kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t));

		/*
		 * Release reference to WQL; If this is the last reference,
		 * this call also has the side effect of freeing up the
		 * 'srq_wrid_wql' memory.
		 */
		tavor_wql_refcnt_dec(srq->srq_wrid_wql);
	}

	/* Free the memory for the SRQ */
	tavor_queue_free(state, &srq->srq_wqinfo);

	/* Free the Tavor SRQ Handle */
	tavor_rsrc_free(state, &rsrc);

	/* Free the SRQC entry resource */
	tavor_rsrc_free(state, &srqc);

	/* Decrement the reference count on the protection domain (PD) */
	tavor_pd_refcnt_dec(pd);

	/* Set the srqhdl pointer to NULL and return success */
	*srqhdl = NULL;

	TAVOR_TNF_EXIT(tavor_srq_free);
	return (DDI_SUCCESS);
}


/*
 * tavor_srq_modify()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_srq_modify(tavor_state_t *state, tavor_srqhdl_t srq, uint_t size,
    uint_t *real_size, uint_t sleepflag)
{
	tavor_qalloc_info_t	new_srqinfo, old_srqinfo;
	tavor_rsrc_t		*mtt, *mpt, *old_mtt;
	tavor_bind_info_t	bind;
	tavor_bind_info_t	old_bind;
	tavor_rsrc_pool_info_t	*rsrc_pool;
	tavor_mrhdl_t		mr;
	tavor_hw_mpt_t		mpt_entry;
	tavor_wrid_entry_t	*wre_new, *wre_old;
	uint64_t		mtt_ddrbaseaddr, mtt_addr;
	uint64_t		srq_desc_off;
	uint32_t		*buf, srq_old_bufsz;
	uint32_t		wqesz;
	uint_t			max_srq_size;
	uint_t			dma_xfer_mode, mtt_pgsize_bits;
	uint_t			srq_sync, log_srq_size, maxprot;
	uint_t			wq_location;
	int			status;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_srq_modify);

	/*
	 * Check the "inddr" flag.  This flag tells the driver whether the
	 * SRQ's work queues should come from normal system memory or be
	 * allocated from DDR memory.
	 */
	wq_location = state->ts_cfg_profile->cp_srq_wq_inddr;

	/*
	 * If size requested is larger than device capability, return
	 * Insufficient Resources
	 */
	max_srq_size = (1 << state->ts_cfg_profile->cp_log_max_srq_sz);
	if (size > max_srq_size) {
		TNF_PROBE_0(tavor_srq_modify_size_larger_than_maxsize,
		    TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_srq_modify);
		return (IBT_HCA_WR_EXCEEDED);
	}

	/*
	 * Calculate the appropriate size for the SRQ.
	 * Note:  All Tavor SRQs must be a power-of-2 in size.  Also
	 * they may not be any smaller than TAVOR_SRQ_MIN_SIZE.  This step
	 * is to round the requested size up to the next highest power-of-2
	 */
	size = max(size, TAVOR_SRQ_MIN_SIZE);
	log_srq_size = highbit(size);
	if ((size & (size - 1)) == 0) {
		log_srq_size = log_srq_size - 1;
	}

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits).
	 */
	if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max SRQ size");
		goto srqmodify_fail;
	}

	/*
	 * Allocate the memory for newly resized Shared Receive Queue.
	 *
	 * Note: If SRQ is not user-mappable, then it may come from either
	 * kernel system memory or from HCA-attached local DDR memory.
	 *
	 * Note2: We align this queue on a pagesize boundary.  This is required
	 * to make sure that all the resulting IB addresses will start at 0,
	 * for a zero-based queue.  By making sure we are aligned on at least a
	 * page, any offset we use into our queue will be the same as it was
	 * when we allocated it at tavor_srq_alloc() time.
	 */
	wqesz = (1 << srq->srq_wq_log_wqesz);
	new_srqinfo.qa_size = (1 << log_srq_size) * wqesz;
	new_srqinfo.qa_alloc_align = PAGESIZE;
	new_srqinfo.qa_bind_align  = PAGESIZE;
	if (srq->srq_is_umap) {
		new_srqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		new_srqinfo.qa_location = wq_location;
	}
	status = tavor_queue_alloc(state, &new_srqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
		goto srqmodify_fail;
	}
	buf = (uint32_t *)new_srqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Allocate the memory for the new WRE list.  This will be used later
	 * when we resize the wridlist based on the new SRQ size.
	 */
	wre_new = (tavor_wrid_entry_t *)kmem_zalloc((1 << log_srq_size) *
	    sizeof (tavor_wrid_entry_t), sleepflag);
	if (wre_new == NULL) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE,
		    "failed wre_new alloc");
		goto srqmodify_fail;
	}

	/*
	 * Fill in the "bind" struct.  This struct provides the majority
	 * of the information that will be used to distinguish between an
	 * "addr" binding (as is the case here) and a "buf" binding (see
	 * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
	 * which does most of the "heavy lifting" for the Tavor memory
	 * registration routines.
	 */
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(bind))
	bzero(&bind, sizeof (tavor_bind_info_t));
	bind.bi_type  = TAVOR_BINDHDL_VADDR;
	bind.bi_addr  = (uint64_t)(uintptr_t)buf;
	bind.bi_len   = new_srqinfo.qa_size;
	bind.bi_as    = NULL;
	bind.bi_flags = ((sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
	    IBT_MR_NOSLEEP) | IBT_MR_ENABLE_LOCAL_WRITE;
	if (srq->srq_is_umap) {
		bind.bi_bypass = state->ts_cfg_profile->cp_iommu_bypass;
	} else {
		if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
			bind.bi_bypass =
			    state->ts_cfg_profile->cp_iommu_bypass;
			dma_xfer_mode =
			    state->ts_cfg_profile->cp_streaming_consistent;
			if (dma_xfer_mode == DDI_DMA_STREAMING) {
				bind.bi_flags |= IBT_MR_NONCOHERENT;
			}
		} else {
			bind.bi_bypass = TAVOR_BINDMEM_BYPASS;
		}
	}
	status = tavor_mr_mtt_bind(state, &bind, new_srqinfo.qa_dmahdl, &mtt,
	    &mtt_pgsize_bits);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(status, "failed mtt bind");
		kmem_free(wre_new, (1 << log_srq_size) *
		    sizeof (tavor_wrid_entry_t));
		tavor_queue_free(state, &new_srqinfo);
		goto srqmodify_fail;
	}

	/*
	 * Calculate the offset between the kernel virtual address space
	 * and the IB virtual address space.  This will be used when
	 * posting work requests to properly initialize each WQE.
	 *
	 * Note: bind addr is zero-based (from alloc) so we calculate the
	 * correct new offset here.
	 */
	bind.bi_addr = bind.bi_addr & ((1 << mtt_pgsize_bits) - 1);
	srq_desc_off = (uint64_t)(uintptr_t)new_srqinfo.qa_buf_aligned -
	    (uint64_t)bind.bi_addr;

	/*
	 * Get the base address for the MTT table.  This will be necessary
	 * below when we are modifying the MPT entry.
	 */
	rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;

	/*
	 * Fill in the MPT entry.  This is the final step before passing
	 * ownership of the MPT entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the MPT.
	 */
	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
	mpt_entry.reg_win_len	= bind.bi_len;
	mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
	mpt_entry.mttseg_addr_h = mtt_addr >> 32;
	mpt_entry.mttseg_addr_l = mtt_addr >> 6;
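
	/*
	 * Note (editorial): the ">> 6" implies MTT segment addresses are
	 * 64-byte aligned, so the MPT stores bits 63:32 in mttseg_addr_h
	 * and bits 31:6 in mttseg_addr_l, with the low six bits implied
	 * zero.
	 */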

	/*
	 * Now we grab the SRQ lock.  Since we will be updating the actual
	 * SRQ location and the producer/consumer indexes, we should hold
	 * the lock.
	 *
	 * We do a TAVOR_NOSLEEP here (and below), though, because we are
	 * holding the "srq_lock" and if we got raised to interrupt level
	 * by priority inversion, we would not want to block in this routine
	 * waiting for success.
	 */
	mutex_enter(&srq->srq_lock);

	/*
	 * Copy old entries to new buffer
	 */
	srq_old_bufsz = srq->srq_wq_bufsz;
	bcopy(srq->srq_wq_buf, buf, srq_old_bufsz * wqesz);
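
	/*
	 * Note (editorial): only the old entries are copied; the tail of
	 * the larger new buffer is presumably initialized by the
	 * tavor_wrid_list_srq_init() call below (non-umap case), which is
	 * handed srq_old_bufsz as its starting index.
	 */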

	/* Determine if later ddi_dma_sync will be necessary */
	srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);

	/* Sync entire "new" SRQ for use by hardware (if necessary) */
	if (srq_sync) {
		(void) ddi_dma_sync(bind.bi_dmahdl, 0,
		    new_srqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Setup MPT information for use in the MODIFY_MPT command
	 */
	mr = srq->srq_mrhdl;
	mutex_enter(&mr->mr_lock);
	mpt = srq->srq_mrhdl->mr_mptrsrcp;

	/*
	 * MODIFY_MPT
	 *
	 * If this fails for any reason, then it is an indication that
	 * something (either in HW or SW) has gone seriously wrong.  So we
	 * print a warning message and return.
	 */
	status = tavor_modify_mpt_cmd_post(state, &mpt_entry, mpt->tr_indx,
	    TAVOR_CMD_MODIFY_MPT_RESIZESRQ, sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: MODIFY_MPT command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_mr_common_reg_sw2hw_mpt_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		TAVOR_TNF_FAIL(status, "MODIFY_MPT command failed");
		(void) tavor_mr_mtt_unbind(state, &srq->srq_mrhdl->mr_bindinfo,
		    srq->srq_mrhdl->mr_mttrsrcp);
		kmem_free(wre_new, (1 << log_srq_size) *
		    sizeof (tavor_wrid_entry_t));
		tavor_queue_free(state, &new_srqinfo);
		mutex_exit(&mr->mr_lock);
		mutex_exit(&srq->srq_lock);
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Update the Tavor Shared Receive Queue handle with all the new
	 * information.  At the same time, save away all the necessary
	 * information for freeing up the old resources
	 */
	old_srqinfo	   = srq->srq_wqinfo;
	old_mtt		   = srq->srq_mrhdl->mr_mttrsrcp;
	bcopy(&srq->srq_mrhdl->mr_bindinfo, &old_bind,
	    sizeof (tavor_bind_info_t));

	/* Now set the new info */
	srq->srq_wqinfo	   = new_srqinfo;
	srq->srq_wq_buf	   = buf;
	srq->srq_wq_bufsz  = (1 << log_srq_size);
	bcopy(&bind, &srq->srq_mrhdl->mr_bindinfo, sizeof (tavor_bind_info_t));
	srq->srq_mrhdl->mr_mttrsrcp = mtt;
	srq->srq_desc_off  = srq_desc_off;
	srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);

	/* Update MR mtt pagesize */
	mr->mr_logmttpgsz = mtt_pgsize_bits;
	mutex_exit(&mr->mr_lock);

#ifdef __lock_lint
	mutex_enter(&srq->srq_wrid_wql->wql_lock);
#else
	if (srq->srq_wrid_wql != NULL) {
		mutex_enter(&srq->srq_wrid_wql->wql_lock);
	}
#endif

	/*
	 * Initialize new wridlist, if needed.
	 *
	 * If a wridlist already is setup on an SRQ (the QP associated with an
	 * SRQ has moved "from_reset") then we must update this wridlist based
	 * on the new SRQ size.  We allocate the new size of Work Request ID
	 * Entries, copy over the old entries to the new list, and
	 * re-initialize the srq wridlist in non-umap case
	 */
	wre_old = NULL;
	if (srq->srq_wridlist != NULL) {
		wre_old = srq->srq_wridlist->wl_wre;

		bcopy(wre_old, wre_new, srq_old_bufsz *
		    sizeof (tavor_wrid_entry_t));

		/* Setup new sizes in wre */
		srq->srq_wridlist->wl_wre = wre_new;
		srq->srq_wridlist->wl_size = srq->srq_wq_bufsz;

		if (!srq->srq_is_umap) {
			tavor_wrid_list_srq_init(srq->srq_wridlist, srq,
			    srq_old_bufsz);
		}
	}

#ifdef __lock_lint
	mutex_exit(&srq->srq_wrid_wql->wql_lock);
#else
	if (srq->srq_wrid_wql != NULL) {
		mutex_exit(&srq->srq_wrid_wql->wql_lock);
	}
#endif

	/*
	 * If "old" SRQ was a user-mappable SRQ that is currently mmap()'d out
	 * to a user process, then we need to call devmap_devmem_remap() to
	 * invalidate the mapping to the SRQ memory.  We also need to
	 * invalidate the SRQ tracking information for the user mapping.
	 *
	 * Note: The remap really shouldn't ever fail.  If it does, it is an
	 * indication that something has gone seriously wrong.  So we print a
	 * warning message and return an error (knowing, of course, that the
	 * "old" SRQ memory will be leaked).
	 */
	if ((srq->srq_is_umap) && (srq->srq_umap_dhp != NULL)) {
		maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
		status = devmap_devmem_remap(srq->srq_umap_dhp,
		    state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size, maxprot,
		    DEVMAP_MAPPING_INVALID, NULL);
		if (status != DDI_SUCCESS) {
			mutex_exit(&srq->srq_lock);
			TAVOR_WARNING(state, "failed in SRQ memory "
			    "devmap_devmem_remap()");
			/* We can, however, free the memory for old wre */
			if (wre_old != NULL) {
				kmem_free(wre_old, srq_old_bufsz *
				    sizeof (tavor_wrid_entry_t));
			}
			TAVOR_TNF_EXIT(tavor_srq_modify);
			return (ibc_get_ci_failure(0));
		}
		srq->srq_umap_dhp = (devmap_cookie_t)NULL;
	}

	/*
	 * Drop the SRQ lock now.  The only thing left to do is to free up
	 * the old resources.
	 */
	mutex_exit(&srq->srq_lock);

	/*
	 * Unbind the MTT entries.
	 */
	status = tavor_mr_mtt_unbind(state, &old_bind, old_mtt);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to unbind old SRQ memory");
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
		    "failed to unbind (old)");
		goto srqmodify_fail;
	}

	/* Free the memory for old wre */
	if (wre_old != NULL) {
		kmem_free(wre_old, srq_old_bufsz *
		    sizeof (tavor_wrid_entry_t));
	}

	/* Free the memory for the old SRQ */
	tavor_queue_free(state, &old_srqinfo);

	/*
	 * Fill in the return arguments (if necessary).  This includes the
	 * real new shared receive queue size.
	 */
	if (real_size != NULL) {
		*real_size = (1 << log_srq_size);
	}

	TAVOR_TNF_EXIT(tavor_srq_modify);
	return (DDI_SUCCESS);

srqmodify_fail:
	TNF_PROBE_1(tavor_srq_modify_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_srq_modify);
	return (status);
}


/*
 * tavor_srq_refcnt_inc()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_srq_refcnt_inc(tavor_srqhdl_t srq)
{
	mutex_enter(&srq->srq_lock);
	TNF_PROBE_1_DEBUG(tavor_srq_refcnt_inc, TAVOR_TNF_TRACE, "",
	    tnf_uint, refcnt, srq->srq_refcnt);
	srq->srq_refcnt++;
	mutex_exit(&srq->srq_lock);
}


/*
 * tavor_srq_refcnt_dec()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_srq_refcnt_dec(tavor_srqhdl_t srq)
{
	mutex_enter(&srq->srq_lock);
	srq->srq_refcnt--;
	TNF_PROBE_1_DEBUG(tavor_srq_refcnt_dec, TAVOR_TNF_TRACE, "",
	    tnf_uint, refcnt, srq->srq_refcnt);
	mutex_exit(&srq->srq_lock);
}


/*
 * tavor_srqhdl_from_srqnum()
 *    Context: Can be called from interrupt or base context.
 *
 *    This routine is important because changing the unconstrained
 *    portion of the SRQ number is critical to the detection of a
 *    potential race condition in the SRQ handler code (i.e. the case
 *    where a SRQ is freed and alloc'd again before an event for the
 *    "old" SRQ can be handled).
 *
 *    While this is not a perfect solution (not sure that one exists)
 *    it does help to mitigate the chance that this race condition will
 *    cause us to deliver a "stale" event to the new SRQ owner.  Note:
 *    this solution does not scale well because the number of constrained
 *    bits increases (and, hence, the number of unconstrained bits
 *    decreases) as the number of supported SRQ grows.  For small and
 *    intermediate values, it should hopefully provide sufficient
 *    protection.
 */
tavor_srqhdl_t
tavor_srqhdl_from_srqnum(tavor_state_t *state, uint_t srqnum)
{
	uint_t	srqindx, srqmask;

	/* Calculate the SRQ table index from the srqnum */
	srqmask = (1 << state->ts_cfg_profile->cp_log_num_srq) - 1;
	srqindx = srqnum & srqmask;
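
	/*
	 * Worked example (illustrative): with cp_log_num_srq == 10 the mask
	 * is 0x3FF, so SRQ numbers 0x004, 0x404, 0x804, ... all map to table
	 * index 0x004; the bits above the mask are the "unconstrained" bits
	 * described above, which change across free/realloc cycles.
	 */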
	return (state->ts_srqhdl[srqindx]);
}


/*
 * tavor_srq_sgl_to_logwqesz()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_srq_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
    tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl)
{
	uint_t	max_size, log2, actual_sgl;

	TAVOR_TNF_ENTER(tavor_srq_sgl_to_logwqesz);

	switch (wq_type) {
	case TAVOR_QP_WQ_TYPE_RECVQ:
		/*
		 * Use requested maximum SGL to calculate max descriptor size
		 * (while guaranteeing that the descriptor size is a
		 * power-of-2 cachelines).
		 */
		max_size = (TAVOR_QP_WQE_MLX_RCV_HDRS + (num_sgl << 4));
		log2 = highbit(max_size);
		if ((max_size & (max_size - 1)) == 0) {
			log2 = log2 - 1;
		}

		/* Make sure descriptor is at least the minimum size */
		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);

		/* Calculate actual number of SGL (given WQE size) */
		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_RCV_HDRS) >> 4;
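
		/*
		 * Worked example (assuming, for illustration, 16-byte
		 * receive headers): num_sgl == 4 gives max_size = 16 + 64 =
		 * 80, log2 = highbit(80) = 7 (a 128-byte WQE), and
		 * actual_sgl = (128 - 16) >> 4 = 7, so the rounded-up WQE
		 * can hold more SGL entries than were requested.
		 */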
		break;

	default:
		TAVOR_WARNING(state, "unexpected work queue type");
		TNF_PROBE_0(tavor_srq_sgl_to_logwqesz_inv_wqtype_fail,
		    TAVOR_TNF_ERROR, "");
		break;
	}

	/* Fill in the return values */
	*logwqesz = log2;
	*max_sgl  = min(state->ts_cfg_profile->cp_srq_max_sgl, actual_sgl);

	TAVOR_TNF_EXIT(tavor_srq_sgl_to_logwqesz);
}