xref: /illumos-gate/usr/src/uts/common/io/ib/clients/iser/iser_ib.c (revision 581cede61ac9c14d8d4ea452562a567189eead78)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/ddi.h>
28 #include <sys/types.h>
29 #include <sys/socket.h>
30 #include <netinet/in.h>
31 #include <sys/sunddi.h>
32 #include <sys/sysmacros.h>
33 #include <sys/iscsi_protocol.h>
34 
35 #include <sys/ib/clients/iser/iser.h>
36 #include <sys/ib/clients/iser/iser_idm.h>
37 
38 /*
39  * iser_ib.c
40  * Routines for InfiniBand transport for iSER
41  *
42  * This file contains the routines to interface with the IBT API to attach and
43  * allocate IB resources, handle async events, and post recv work requests.
44  *
45  */
46 
/*
 * Forward declarations for the file-local (static) helpers.
 */

/* Map a port GID, or an HCA GUID, to an iser_hca_t */
static iser_hca_t *iser_ib_gid2hca(ib_gid_t gid);
static iser_hca_t *iser_ib_guid2hca(ib_guid_t guid);

/* iser_hca_t allocation/teardown and HCA list management */
static iser_hca_t *iser_ib_alloc_hca(ib_guid_t guid);
static int iser_ib_free_hca(iser_hca_t *hca);
static int iser_ib_update_hcaports(iser_hca_t *hca);
static int iser_ib_init_hcas(void);
static int iser_ib_fini_hcas(void);

/* Per-port service binding helpers */
static iser_sbind_t *iser_ib_get_bind(
    iser_svc_t *iser_svc, ib_guid_t hca_guid, ib_gid_t gid);
static int iser_ib_activate_port(
    idm_svc_t *idm_svc, ib_guid_t guid, ib_gid_t gid);
static void iser_ib_deactivate_port(ib_guid_t hca_guid, ib_gid_t gid);

/* iSER QP bookkeeping for a channel */
static void iser_ib_init_qp(iser_chan_t *chan, uint_t sq_size, uint_t rq_size);
static void iser_ib_fini_qp(iser_qp_t *qp);

/* Completion queue setup */
static int iser_ib_setup_cq(ibt_hca_hdl_t hca_hdl, uint_t cq_size,
    ibt_cq_hdl_t *cq_hdl);

/* Fill in the RC channel allocation arguments */
static void iser_ib_setup_chanargs(uint8_t hca_port, ibt_cq_hdl_t scq_hdl,
    ibt_cq_hdl_t rcq_hdl, uint_t sq_size, uint_t rq_size,
    ibt_pd_hdl_t hca_pdhdl, ibt_rc_chan_alloc_args_t *cargs);

/* Handlers for individual async events */
static void iser_ib_handle_portup_event(ibt_hca_hdl_t hdl,
    ibt_async_event_t *event);
static void iser_ib_handle_portdown_event(ibt_hca_hdl_t hdl,
    ibt_async_event_t *event);
static void iser_ib_handle_hca_detach_event(ibt_hca_hdl_t hdl,
    ibt_async_event_t *event);

/* Taskq entry point used by iser_ib_post_recv_async() */
static void iser_ib_post_recv_task(void *arg);
80 
/*
 * Client information passed to ibt_attach() by iser_ib_init().
 * NOTE(review): the per-field roles below are inferred from the initializer
 * values; confirm against the definition of struct ibt_clnt_modinfo_s.
 */
static struct ibt_clnt_modinfo_s iser_ib_modinfo = {
	IBTI_V_CURR,		/* presumably the IBTI version in use */
	IBT_STORAGE_DEV,	/* presumably the client class */
	iser_ib_async_handler,	/* async event handler for this client */
	NULL,
	"iSER"			/* presumably the client name */
};
88 
89 /*
90  * iser_ib_init
91  *
92  * This function registers the HCA drivers with IBTF and registers and binds
93  * iSER as a service with IBTF.
94  */
95 int
96 iser_ib_init(void)
97 {
98 	int		status;
99 
100 	/* Register with IBTF */
101 	status = ibt_attach(&iser_ib_modinfo, iser_state->is_dip, iser_state,
102 	    &iser_state->is_ibhdl);
103 	if (status != DDI_SUCCESS) {
104 		ISER_LOG(CE_NOTE, "iser_ib_init: ibt_attach failed (0x%x)",
105 		    status);
106 		return (DDI_FAILURE);
107 	}
108 
109 	/* Create the global work request kmem_cache */
110 	iser_state->iser_wr_cache = kmem_cache_create("iser_wr_cache",
111 	    sizeof (iser_wr_t), 0, NULL, NULL, NULL,
112 	    iser_state, NULL, KM_SLEEP);
113 
114 	/* Populate our list of HCAs */
115 	status = iser_ib_init_hcas();
116 	if (status != DDI_SUCCESS) {
117 		/* HCAs failed to initialize, tear it down */
118 		kmem_cache_destroy(iser_state->iser_wr_cache);
119 		(void) ibt_detach(iser_state->is_ibhdl);
120 		iser_state->is_ibhdl = NULL;
121 		ISER_LOG(CE_NOTE, "iser_ib_init: failed to initialize HCAs");
122 		return (DDI_FAILURE);
123 	}
124 
125 	/* Target will register iSER as a service with IBTF when required */
126 
127 	/* Target will bind this service when it comes online */
128 
129 	return (DDI_SUCCESS);
130 }
131 
132 /*
133  * iser_ib_fini
134  *
135  * This function unbinds and degisters the iSER service from IBTF
136  */
137 int
138 iser_ib_fini(void)
139 {
140 	/* IDM would have already disabled all the services */
141 
142 	/* Teardown the HCA list and associated resources */
143 	if (iser_ib_fini_hcas() != DDI_SUCCESS)
144 		return (DDI_FAILURE);
145 
146 	/* Teardown the global work request kmem_cache */
147 	kmem_cache_destroy(iser_state->iser_wr_cache);
148 
149 	/* Deregister with IBTF */
150 	if (iser_state->is_ibhdl != NULL) {
151 		(void) ibt_detach(iser_state->is_ibhdl);
152 		iser_state->is_ibhdl = NULL;
153 	}
154 
155 	return (DDI_SUCCESS);
156 }
157 
158 /*
159  * iser_ib_register_service
160  *
161  * This function registers the iSER service using the RDMA-Aware Service ID.
162  */
163 int
164 iser_ib_register_service(idm_svc_t *idm_svc)
165 {
166 	ibt_srv_desc_t	srvdesc;
167 	iser_svc_t	*iser_svc;
168 	int		status;
169 
170 	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
171 
172 	/* Set up IBTI client callback handler from the CM */
173 	srvdesc.sd_handler = iser_ib_cm_handler;
174 
175 	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
176 
177 	iser_svc = (iser_svc_t *)idm_svc->is_iser_svc;
178 
179 	/* Register the service on the specified port */
180 	status = ibt_register_service(
181 	    iser_state->is_ibhdl, &srvdesc,
182 	    iser_svc->is_svcid, 1, &iser_svc->is_srvhdl, NULL);
183 
184 	return (status);
185 }
186 
187 /*
188  * iser_ib_bind_service
189  *
190  * This function binds a given iSER service on all available HCA ports. The
191  * current specification does not allow user to specify transport bindings
192  * for each iscsi target. The ULP invokes this function to bind the target
193  * to all available iser ports after checking for the presence of an IB HCA.
194  * iSER is "configured" whenever an IB-capable IP address exists. The lack
195  * of active IB ports is a less-fatal condition, and sockets would be used
196  * as the transport even though an Infiniband HCA is configured but unusable.
197  *
198  */
199 int
200 iser_ib_bind_service(idm_svc_t *idm_svc)
201 {
202 	iser_hca_t	*hca;
203 	ib_gid_t	gid;
204 	int		num_ports = 0;
205 	int		num_binds = 0;
206 	int		num_inactive_binds = 0; /* if HCA ports inactive */
207 	int		status;
208 	int		i;
209 
210 	ASSERT(idm_svc != NULL);
211 	ASSERT(idm_svc->is_iser_svc != NULL);
212 
213 	/* Register the iSER service on all available ports */
214 	mutex_enter(&iser_state->is_hcalist_lock);
215 
216 	for (hca = list_head(&iser_state->is_hcalist);
217 	    hca != NULL;
218 	    hca = list_next(&iser_state->is_hcalist, hca)) {
219 
220 		for (i = 0; i < hca->hca_num_ports; i++) {
221 			num_ports++;
222 			if (hca->hca_port_info[i].p_linkstate !=
223 			    IBT_PORT_ACTIVE) {
224 				/*
225 				 * Move on. We will attempt to bind service
226 				 * in our async handler if the port comes up
227 				 * at a later time.
228 				 */
229 				num_inactive_binds++;
230 				continue;
231 			}
232 
233 			gid = hca->hca_port_info[i].p_sgid_tbl[0];
234 
235 			/* If the port is already bound, skip */
236 			if (iser_ib_get_bind(
237 			    idm_svc->is_iser_svc, hca->hca_guid, gid) == NULL) {
238 
239 				status = iser_ib_activate_port(
240 				    idm_svc, hca->hca_guid, gid);
241 				if (status != IBT_SUCCESS) {
242 					ISER_LOG(CE_NOTE,
243 					    "iser_ib_bind_service: "
244 					    "iser_ib_activate_port failure "
245 					    "(0x%x)", status);
246 					continue;
247 				}
248 			}
249 			num_binds++;
250 		}
251 	}
252 	mutex_exit(&iser_state->is_hcalist_lock);
253 
254 	if (num_binds) {
255 		ISER_LOG(CE_NOTE, "iser_ib_bind_service: Service available on "
256 		    "(%d) of (%d) ports", num_binds, num_ports);
257 		return (ISER_STATUS_SUCCESS);
258 	} else if (num_inactive_binds) {
259 		ISER_LOG(CE_NOTE, "iser_ib_bind_service: Could not bind "
260 		    "service, HCA ports are not active.");
261 		/*
262 		 * still considered success, the async handler will bind
263 		 * the service when the port comes up at a later time
264 		 */
265 		return (ISER_STATUS_SUCCESS);
266 	} else {
267 		ISER_LOG(CE_NOTE, "iser_ib_bind_service: Did not bind service");
268 		return (ISER_STATUS_FAIL);
269 	}
270 }
271 
272 /*
273  * iser_ib_unbind_service
274  *
275  * This function unbinds a given service on a all HCA ports
276  */
277 void
278 iser_ib_unbind_service(idm_svc_t *idm_svc)
279 {
280 	iser_svc_t	*iser_svc;
281 	iser_sbind_t	*is_sbind, *next_sb;
282 
283 	if (idm_svc != NULL && idm_svc->is_iser_svc != NULL) {
284 
285 		iser_svc = idm_svc->is_iser_svc;
286 
287 		for (is_sbind = list_head(&iser_svc->is_sbindlist);
288 		    is_sbind != NULL;
289 		    is_sbind = next_sb) {
290 			next_sb = list_next(&iser_svc->is_sbindlist, is_sbind);
291 			ibt_unbind_service(iser_svc->is_srvhdl,
292 			    is_sbind->is_sbindhdl);
293 			list_remove(&iser_svc->is_sbindlist, is_sbind);
294 			kmem_free(is_sbind, sizeof (iser_sbind_t));
295 		}
296 	}
297 }
298 
299 /* ARGSUSED */
300 void
301 iser_ib_deregister_service(idm_svc_t *idm_svc)
302 {
303 	iser_svc_t	*iser_svc;
304 
305 	if (idm_svc != NULL && idm_svc->is_iser_svc != NULL) {
306 
307 		iser_svc = (iser_svc_t *)idm_svc->is_iser_svc;
308 		ibt_deregister_service(iser_state->is_ibhdl,
309 		    iser_svc->is_srvhdl);
310 		ibt_release_ip_sid(iser_svc->is_svcid);
311 	}
312 }
313 
314 /*
315  * iser_ib_get_paths
316  * This function finds the IB path between the local and the remote address.
317  *
318  */
319 int
320 iser_ib_get_paths(ibt_ip_addr_t *local_ip, ibt_ip_addr_t *remote_ip,
321     ibt_path_info_t *path, ibt_path_ip_src_t *path_src_ip)
322 {
323 	ibt_ip_path_attr_t	ipattr;
324 	int			status;
325 
326 	(void) bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
327 	ipattr.ipa_dst_ip	= remote_ip;
328 	ipattr.ipa_src_ip	= *local_ip;
329 	ipattr.ipa_max_paths	= 1;
330 	ipattr.ipa_ndst		= 1;
331 
332 	(void) bzero(path, sizeof (ibt_path_info_t));
333 	status = ibt_get_ip_paths(iser_state->is_ibhdl, IBT_PATH_NO_FLAGS,
334 	    &ipattr, path, NULL, path_src_ip);
335 	if (status != IBT_SUCCESS) {
336 		ISER_LOG(CE_NOTE, "ibt_get_ip_paths: ibt_get_ip_paths "
337 		    "failure: status (%d)", status);
338 		return (status);
339 	}
340 
341 	if (local_ip != NULL) {
342 		ISER_LOG(CE_NOTE, "iser_ib_get_paths success: IP[%x to %x]",
343 		    local_ip->un.ip4addr, remote_ip->un.ip4addr);
344 	} else {
345 		ISER_LOG(CE_NOTE, "iser_ib_get_paths success: "
346 		    "IP[INADDR_ANY to %x]", remote_ip->un.ip4addr);
347 	}
348 
349 	return (ISER_STATUS_SUCCESS);
350 }
351 
352 /*
353  * iser_ib_alloc_channel_nopathlookup
354  *
355  * This function allocates a reliable connected channel. This function does
356  * not invoke ibt_get_ip_paths() to do the path lookup. The HCA GUID and
357  * port are input to this function.
358  */
359 iser_chan_t *
360 iser_ib_alloc_channel_nopathlookup(ib_guid_t hca_guid, uint8_t hca_port)
361 {
362 	iser_hca_t	*hca;
363 	iser_chan_t	*chan;
364 
365 	/* Lookup the hca using the gid in the path info */
366 	hca = iser_ib_guid2hca(hca_guid);
367 	if (hca == NULL) {
368 		ISER_LOG(CE_NOTE, "iser_ib_alloc_channel_nopathlookup: failed "
369 		    "to lookup HCA(%llx) handle", (longlong_t)hca_guid);
370 		return (NULL);
371 	}
372 
373 	chan = iser_ib_alloc_rc_channel(hca, hca_port);
374 	if (chan == NULL) {
375 		ISER_LOG(CE_NOTE, "iser_ib_alloc_channel_nopathlookup: failed "
376 		    "to alloc channel on HCA(%llx) %d",
377 		    (longlong_t)hca_guid, hca_port);
378 		return (NULL);
379 	}
380 
381 	ISER_LOG(CE_NOTE, "iser_ib_alloc_channel_pathlookup success: "
382 	    "chanhdl (0x%p), HCA(%llx) %d",
383 	    (void *)chan->ic_chanhdl, (longlong_t)hca_guid, hca_port);
384 
385 	return (chan);
386 }
387 
388 /*
389  * iser_ib_alloc_channel_pathlookup
390  *
391  * This function allocates a reliable connected channel but first invokes
392  * ibt_get_ip_paths() with the given local and remote addres to get the
393  * HCA lgid and the port number.
394  */
395 iser_chan_t *
396 iser_ib_alloc_channel_pathlookup(
397     ibt_ip_addr_t *local_ip, ibt_ip_addr_t *remote_ip)
398 {
399 	ibt_path_info_t		ibt_path;
400 	ibt_path_ip_src_t	path_src_ip;
401 	ib_gid_t		lgid;
402 	uint8_t			hca_port; /* from path */
403 	iser_hca_t		*hca;
404 	iser_chan_t		*chan;
405 	int			status;
406 
407 	/* Lookup a path to the given destination */
408 	status = iser_ib_get_paths(
409 	    local_ip, remote_ip, &ibt_path, &path_src_ip);
410 
411 	if (status != ISER_STATUS_SUCCESS) {
412 		ISER_LOG(CE_NOTE, "iser_ib_alloc_channel_pathlookup: faild "
413 		    "Path lookup IP:[%llx to %llx] failed: status (%d)",
414 		    (longlong_t)local_ip->un.ip4addr,
415 		    (longlong_t)remote_ip->un.ip4addr,
416 		    status);
417 		return (NULL);
418 	}
419 
420 	/* get the local gid from the path info */
421 	lgid = ibt_path.pi_prim_cep_path.cep_adds_vect.av_sgid;
422 
423 	/* get the hca port from the path info */
424 	hca_port = ibt_path.pi_prim_cep_path.cep_hca_port_num;
425 
426 	/* Lookup the hca using the gid in the path info */
427 	hca = iser_ib_gid2hca(lgid);
428 	if (hca == NULL) {
429 		ISER_LOG(CE_NOTE, "iser_ib_alloc_channel_pathlookup: failed "
430 		    "to lookup HCA (%llx) handle",
431 		    (longlong_t)hca->hca_guid);
432 		return (NULL);
433 	}
434 
435 	chan = iser_ib_alloc_rc_channel(hca, hca_port);
436 	if (chan == NULL) {
437 		ISER_LOG(CE_NOTE, "iser_ib_alloc_channel_pathlookup: failed "
438 		    "to alloc channel from IP:[%llx to %llx] on HCA (%llx) %d",
439 		    (longlong_t)local_ip->un.ip4addr,
440 		    (longlong_t)remote_ip->un.ip4addr,
441 		    (longlong_t)hca->hca_guid, hca_port);
442 		return (NULL);
443 	}
444 
445 	ISER_LOG(CE_NOTE, "iser_ib_alloc_channel_pathlookup success: "
446 	    "chanhdl (0x%p), IP:[%llx to %llx], lgid (%llx:%llx), HCA(%llx) %d",
447 	    (void *)chan->ic_chanhdl,
448 	    (longlong_t)local_ip->un.ip4addr,
449 	    (longlong_t)remote_ip->un.ip4addr,
450 	    (longlong_t)lgid.gid_prefix, (longlong_t)lgid.gid_guid,
451 	    (longlong_t)hca->hca_guid, hca_port);
452 
453 	chan->ic_ibt_path	= ibt_path;
454 	chan->ic_localip	= path_src_ip.ip_primary;
455 	chan->ic_remoteip	= *remote_ip;
456 
457 	return (chan);
458 }
459 
460 /*
461  * iser_ib_alloc_rc_channel
462  *
463  * This function allocates a reliable communication channel using the specified
464  * channel attributes.
465  */
466 iser_chan_t *
467 iser_ib_alloc_rc_channel(iser_hca_t *hca, uint8_t hca_port)
468 {
469 
470 	iser_chan_t			*chan;
471 	ibt_rc_chan_alloc_args_t	chanargs;
472 	uint_t				sq_size, rq_size;
473 	int				status;
474 
475 	chan = kmem_zalloc(sizeof (iser_chan_t), KM_SLEEP);
476 
477 	mutex_init(&chan->ic_lock, NULL, MUTEX_DRIVER, NULL);
478 	mutex_init(&chan->ic_sq_post_lock, NULL, MUTEX_DRIVER, NULL);
479 
480 	/* Set up the iSER channel handle with HCA */
481 	chan->ic_hca		= hca;
482 
483 	/*
484 	 * Determine the queue sizes, based upon the HCA query data.
485 	 * For our Work Queues, we will use either our default value,
486 	 * or the HCA's maximum value, whichever is smaller.
487 	 */
488 	sq_size = min(hca->hca_attr.hca_max_chan_sz, ISER_IB_SENDQ_SIZE);
489 	rq_size = min(hca->hca_attr.hca_max_chan_sz, ISER_IB_RECVQ_SIZE);
490 
491 	/*
492 	 * For our Completion Queues, we again check the device maximum.
493 	 * We want to end up with CQs that are the next size up from the
494 	 * WQs they are servicing so that they have some overhead.
495 	 */
496 	if (hca->hca_attr.hca_max_cq_sz >= (sq_size + 1)) {
497 		chan->ic_sendcq_sz = sq_size + 1;
498 	} else {
499 		chan->ic_sendcq_sz = hca->hca_attr.hca_max_cq_sz;
500 		sq_size = chan->ic_sendcq_sz - 1;
501 	}
502 
503 	if (hca->hca_attr.hca_max_cq_sz >= (rq_size + 1)) {
504 		chan->ic_recvcq_sz = rq_size + 1;
505 	} else {
506 		chan->ic_recvcq_sz = hca->hca_attr.hca_max_cq_sz;
507 		rq_size = chan->ic_recvcq_sz - 1;
508 	}
509 
510 	/* Initialize the iSER channel's QP handle */
511 	iser_ib_init_qp(chan, sq_size, rq_size);
512 
513 	/* Set up the Send Completion Queue */
514 	status = iser_ib_setup_cq(hca->hca_hdl, chan->ic_sendcq_sz,
515 	    &chan->ic_sendcq);
516 	if (status != ISER_STATUS_SUCCESS) {
517 		iser_ib_fini_qp(&chan->ic_qp);
518 		mutex_destroy(&chan->ic_lock);
519 		mutex_destroy(&chan->ic_sq_post_lock);
520 		kmem_free(chan, sizeof (iser_chan_t));
521 		return (NULL);
522 	}
523 	ibt_set_cq_handler(chan->ic_sendcq, iser_ib_sendcq_handler, chan);
524 	ibt_enable_cq_notify(chan->ic_sendcq, IBT_NEXT_COMPLETION);
525 
526 	/* Set up the Receive Completion Queue */
527 	status = iser_ib_setup_cq(hca->hca_hdl, chan->ic_recvcq_sz,
528 	    &chan->ic_recvcq);
529 	if (status != ISER_STATUS_SUCCESS) {
530 		(void) ibt_free_cq(chan->ic_sendcq);
531 		iser_ib_fini_qp(&chan->ic_qp);
532 		mutex_destroy(&chan->ic_lock);
533 		mutex_destroy(&chan->ic_sq_post_lock);
534 		kmem_free(chan, sizeof (iser_chan_t));
535 		return (NULL);
536 	}
537 	ibt_set_cq_handler(chan->ic_recvcq, iser_ib_recvcq_handler, chan);
538 	ibt_enable_cq_notify(chan->ic_recvcq, IBT_NEXT_COMPLETION);
539 
540 	/* Setup the channel arguments */
541 	iser_ib_setup_chanargs(hca_port, chan->ic_sendcq, chan->ic_recvcq,
542 	    sq_size, rq_size, hca->hca_pdhdl, &chanargs);
543 
544 	status = ibt_alloc_rc_channel(hca->hca_hdl,
545 	    IBT_ACHAN_NO_FLAGS, &chanargs, &chan->ic_chanhdl, NULL);
546 	if (status != IBT_SUCCESS) {
547 		ISER_LOG(CE_NOTE, "iser_ib_alloc_rc_channel: failed "
548 		    "ibt_alloc_rc_channel: status (%d)", status);
549 		(void) ibt_free_cq(chan->ic_sendcq);
550 		(void) ibt_free_cq(chan->ic_recvcq);
551 		iser_ib_fini_qp(&chan->ic_qp);
552 		mutex_destroy(&chan->ic_lock);
553 		mutex_destroy(&chan->ic_sq_post_lock);
554 		kmem_free(chan, sizeof (iser_chan_t));
555 		return (NULL);
556 	}
557 
558 	/* Set the 'channel' as the client private data */
559 	(void) ibt_set_chan_private(chan->ic_chanhdl, chan);
560 
561 	return (chan);
562 }
563 
564 /*
565  * iser_ib_open_rc_channel
566  * This function opens a RC connection on the given allocated RC channel
567  */
568 int
569 iser_ib_open_rc_channel(iser_chan_t *chan)
570 {
571 	ibt_ip_cm_info_t	ipcm_info;
572 	iser_private_data_t	iser_priv_data;
573 	ibt_chan_open_args_t	ocargs;
574 	ibt_rc_returns_t	ocreturns;
575 	int			status;
576 
577 	mutex_enter(&chan->ic_lock);
578 
579 	/*
580 	 * For connection establishment, the initiator sends a CM REQ using the
581 	 * iSER RDMA-Aware Service ID. Included are the source and destination
582 	 * IP addresses, and the src port.
583 	 */
584 	bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
585 	ipcm_info.src_addr = chan->ic_localip;
586 	ipcm_info.dst_addr = chan->ic_remoteip;
587 	ipcm_info.src_port = chan->ic_lport;
588 
589 	/*
590 	 * The CM Private Data field defines the iSER connection parameters
591 	 * such as zero based virtual address exception (ZBVAE) and Send with
592 	 * invalidate Exception (SIE).
593 	 *
594 	 * Solaris IBT does not currently support ZBVAE or SIE.
595 	 */
596 	iser_priv_data.rsvd1	= 0;
597 	iser_priv_data.sie	= 1;
598 	iser_priv_data.zbvae	= 1;
599 
600 	status = ibt_format_ip_private_data(&ipcm_info,
601 	    sizeof (iser_private_data_t), &iser_priv_data);
602 	if (status != IBT_SUCCESS) {
603 		ISER_LOG(CE_NOTE, "iser_ib_open_rc_channel failed: %d", status);
604 		mutex_exit(&chan->ic_lock);
605 		return (status);
606 	}
607 
608 	/*
609 	 * Set the SID we are attempting to connect to, based upon the
610 	 * remote port number.
611 	 */
612 	chan->ic_ibt_path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, chan->ic_rport);
613 
614 	/* Set up the args for the channel open */
615 	bzero(&ocargs, sizeof (ibt_chan_open_args_t));
616 	ocargs.oc_path			= &chan->ic_ibt_path;
617 	ocargs.oc_cm_handler		= iser_ib_cm_handler;
618 	ocargs.oc_cm_clnt_private	= iser_state;
619 	ocargs.oc_rdma_ra_out		= 4;
620 	ocargs.oc_rdma_ra_in		= 4;
621 	ocargs.oc_path_retry_cnt	= 2;
622 	ocargs.oc_path_rnr_retry_cnt	= 2;
623 	ocargs.oc_priv_data_len		= sizeof (iser_private_data_t);
624 	ocargs.oc_priv_data		= &iser_priv_data;
625 
626 	bzero(&ocreturns, sizeof (ibt_rc_returns_t));
627 
628 	status = ibt_open_rc_channel(chan->ic_chanhdl,
629 	    IBT_OCHAN_NO_FLAGS, IBT_BLOCKING, &ocargs, &ocreturns);
630 
631 	if (status != IBT_SUCCESS) {
632 		ISER_LOG(CE_NOTE, "iser_ib_open_rc_channel failed: %d", status);
633 		mutex_exit(&chan->ic_lock);
634 		return (status);
635 	}
636 
637 	mutex_exit(&chan->ic_lock);
638 	return (IDM_STATUS_SUCCESS);
639 }
640 
641 /*
642  * iser_ib_close_rc_channel
643  * This function closes the RC channel related to this iser_chan handle.
644  * We invoke this in a non-blocking, no callbacks context.
645  */
646 void
647 iser_ib_close_rc_channel(iser_chan_t *chan)
648 {
649 	int			status;
650 
651 	mutex_enter(&chan->ic_lock);
652 	status = ibt_close_rc_channel(chan->ic_chanhdl, IBT_BLOCKING, NULL,
653 	    0, NULL, NULL, 0);
654 	if (status != IBT_SUCCESS) {
655 		ISER_LOG(CE_NOTE, "iser_ib_close_rc_channel: "
656 		    "ibt_close_rc_channel failed: status (%d)", status);
657 	}
658 	mutex_exit(&chan->ic_lock);
659 }
660 
/*
 * iser_ib_free_rc_channel
 *
 * This function tears down an RC channel's QP initialization and frees it.
 * Note that we do not need synchronization here; the channel has been
 * closed already, so we should only have completion polling occurring.  Once
 * complete, we are free to free the IBTF channel, WQ and CQ resources, and
 * our own related resources.
 *
 * The function waits, polling every half second, until the send queue post
 * count and the receive queue level have both dropped to zero.  The
 * iser_chan_t itself is freed at the end, so the caller must not use chan
 * after this returns.
 *
 * NOTE(review): both wait loops drop and re-take chan->ic_conn->ic_lock
 * around the delay, which implies that the caller is expected to enter with
 * that lock held -- confirm against the callers.
 */
void
iser_ib_free_rc_channel(iser_chan_t *chan)
{
	iser_qp_t	*iser_qp;

	iser_qp = &chan->ic_qp;

	/*
	 * Ensure the SQ is empty
	 *
	 * NOTE(review): ic_sq_post_count is read here without taking
	 * ic_sq_post_lock; verify that this is intended.
	 */
	while (chan->ic_sq_post_count != 0) {
		/* Let go of the connection lock while we sleep */
		mutex_exit(&chan->ic_conn->ic_lock);
		delay(drv_usectohz(ISER_DELAY_HALF_SECOND));
		mutex_enter(&chan->ic_conn->ic_lock);
	}
	mutex_destroy(&chan->ic_sq_post_lock);

	/* Ensure the RQ is empty */
	(void) ibt_flush_channel(chan->ic_chanhdl);
	mutex_enter(&iser_qp->qp_lock);
	while (iser_qp->rq_level != 0) {
		/*
		 * Drop both locks for the sleep, then re-take them in the
		 * order connection lock first, QP lock second.
		 */
		mutex_exit(&iser_qp->qp_lock);
		mutex_exit(&chan->ic_conn->ic_lock);
		delay(drv_usectohz(ISER_DELAY_HALF_SECOND));
		mutex_enter(&chan->ic_conn->ic_lock);
		mutex_enter(&iser_qp->qp_lock);
	}

	/* Free our QP handle */
	mutex_exit(&iser_qp->qp_lock);
	(void) iser_ib_fini_qp(iser_qp);

	/* Free the IBT channel resources */
	(void) ibt_free_channel(chan->ic_chanhdl);
	chan->ic_chanhdl = NULL;

	/* Free the CQs */
	ibt_free_cq(chan->ic_sendcq);
	ibt_free_cq(chan->ic_recvcq);

	/* Free the chan handle */
	mutex_destroy(&chan->ic_lock);
	kmem_free(chan, sizeof (iser_chan_t));
}
712 
713 /*
714  * iser_ib_post_recv
715  *
716  * This function handles keeping the RQ full on a given channel.
717  * This routine will mostly be run on a taskq, and will check the
718  * current fill level of the RQ, and post as many WRs as necessary
719  * to fill it again.
720  */
721 
722 int
723 iser_ib_post_recv_async(ibt_channel_hdl_t chanhdl)
724 {
725 	iser_chan_t	*chan;
726 	int		status;
727 
728 	/* Pull our iSER channel handle from the private data */
729 	chan = (iser_chan_t *)ibt_get_chan_private(chanhdl);
730 
731 	/*
732 	 * Caller must check that chan->ic_conn->ic_stage indicates
733 	 * the connection is active (not closing, not closed) and
734 	 * it must hold the mutex cross the check and the call to this function
735 	 */
736 	ASSERT(mutex_owned(&chan->ic_conn->ic_lock));
737 	ASSERT((chan->ic_conn->ic_stage >= ISER_CONN_STAGE_IC_CONNECTED) &&
738 	    (chan->ic_conn->ic_stage <= ISER_CONN_STAGE_LOGGED_IN));
739 	idm_conn_hold(chan->ic_conn->ic_idmc);
740 	status = ddi_taskq_dispatch(iser_taskq, iser_ib_post_recv_task,
741 	    (void *)chanhdl, DDI_NOSLEEP);
742 	if (status != DDI_SUCCESS) {
743 		idm_conn_rele(chan->ic_conn->ic_idmc);
744 	}
745 
746 	return (status);
747 }
748 
749 static void
750 iser_ib_post_recv_task(void *arg)
751 {
752 	ibt_channel_hdl_t	chanhdl = arg;
753 	iser_chan_t		*chan;
754 
755 	/* Pull our iSER channel handle from the private data */
756 	chan = (iser_chan_t *)ibt_get_chan_private(chanhdl);
757 
758 	iser_ib_post_recv(chanhdl);
759 	idm_conn_rele(chan->ic_conn->ic_idmc);
760 }
761 
/*
 * iser_ib_post_recv
 *
 * Refills the receive queue of the channel behind chanhdl.  The number of
 * free RQ slots (rq_depth - rq_level) is computed, that many message
 * handles are requested from iser_msg_get(), and they are posted with
 * ibt_post_recv() in batches of at most ISER_IB_RQ_POST_MAX.  rq_level is
 * then raised by the number of WRs actually posted.
 *
 * Nothing is posted if the connection is closing or closed, if the channel
 * has no HCA, or if the RQ is already full.
 *
 * chan->ic_conn->ic_lock is taken on entry and held until return, including
 * across the half-second delay on the "no messages available" path.
 * iser_qp->qp_lock is taken only around accesses to the RQ bookkeeping
 * fields (rq_level, rq_min_post_level, rq_taskqpending).
 *
 * rq_taskqpending is cleared on every path that neither posts nor
 * successfully re-dispatches this routine through
 * iser_ib_post_recv_async().
 */
void
iser_ib_post_recv(ibt_channel_hdl_t chanhdl)
{
	iser_chan_t	*chan;
	iser_hca_t	*hca;
	iser_msg_t	*msg;
	ibt_recv_wr_t	*wrlist, wr[ISER_IB_RQ_POST_MAX];
	int		rq_space, msg_ret;
	int		total_num, npost;
	uint_t		nposted;
	int		status, i;
	iser_qp_t	*iser_qp;

	/* Pull our iSER channel handle from the private data */
	chan = (iser_chan_t *)ibt_get_chan_private(chanhdl);

	ASSERT(chan != NULL);

	mutex_enter(&chan->ic_conn->ic_lock);

	/* Bail out if the connection is closed; no need for more recv WRs */
	if ((chan->ic_conn->ic_stage == ISER_CONN_STAGE_CLOSING) ||
	    (chan->ic_conn->ic_stage == ISER_CONN_STAGE_CLOSED)) {
		mutex_exit(&chan->ic_conn->ic_lock);
		return;
	}

	/* get the QP handle from the iser_chan */
	iser_qp = &chan->ic_qp;

	hca = chan->ic_hca;

	if (hca == NULL) {
		ISER_LOG(CE_NOTE, "iser_ib_post_recv: unable to retrieve "
		    "HCA handle");
		mutex_exit(&chan->ic_conn->ic_lock);
		return;
	}

	/* check for space to post on the RQ */
	mutex_enter(&iser_qp->qp_lock);
	rq_space = iser_qp->rq_depth - iser_qp->rq_level;
	if (rq_space == 0) {
		/* The RQ is full, clear the pending flag and return */
		iser_qp->rq_taskqpending = B_FALSE;
		mutex_exit(&iser_qp->qp_lock);
		mutex_exit(&chan->ic_conn->ic_lock);
		return;
	}

	/* Keep track of the lowest value for rq_min_post_level */
	if (iser_qp->rq_level < iser_qp->rq_min_post_level)
		iser_qp->rq_min_post_level = iser_qp->rq_level;

	mutex_exit(&iser_qp->qp_lock);

	/* we've room to post, so pull from the msg cache */
	msg = iser_msg_get(hca, rq_space, &msg_ret);
	if (msg == NULL) {
		ISER_LOG(CE_NOTE, "iser_ib_post_recv: no message handles "
		    "available in msg cache currently");
		/*
		 * There are no messages on the cache. Wait a half-
		 * second, then try again.
		 *
		 * NOTE(review): the delay runs with ic_conn->ic_lock still
		 * held; confirm that blocking other users of that lock for
		 * half a second is acceptable.
		 */
		delay(drv_usectohz(ISER_DELAY_HALF_SECOND));
		status = iser_ib_post_recv_async(chanhdl);
		if (status != DDI_SUCCESS) {
			ISER_LOG(CE_NOTE, "iser_ib_post_recv: failed to "
			    "redispatch routine");
			/* Failed to dispatch, clear pending flag */
			mutex_enter(&iser_qp->qp_lock);
			iser_qp->rq_taskqpending = B_FALSE;
			mutex_exit(&iser_qp->qp_lock);
		}
		mutex_exit(&chan->ic_conn->ic_lock);
		return;
	}

	if (msg_ret != rq_space) {
		ISER_LOG(CE_NOTE, "iser_ib_post_recv: requested number of "
		    "messages not allocated: requested (%d) allocated (%d)",
		    rq_space, msg_ret);
		/* We got some, but not all, of our requested depth */
		rq_space = msg_ret;
	}

	/*
	 * Now, walk through the allocated WRs and post them,
	 * ISER_IB_RQ_POST_MAX (or less) at a time.
	 */
	wrlist = &wr[0];
	total_num = rq_space;

	while (total_num) {
		/* determine the number to post on this iteration */
		npost = (total_num > ISER_IB_RQ_POST_MAX) ?
		    ISER_IB_RQ_POST_MAX : total_num;

		/* build a list of WRs from the msg list */
		for (i = 0; i < npost; i++) {
			/* The msg pointer doubles as the work request id */
			wrlist[i].wr_id		= (ibt_wrid_t)(uintptr_t)msg;
			wrlist[i].wr_nds	= ISER_IB_SGLIST_SIZE;
			wrlist[i].wr_sgl	= &msg->msg_ds;
			msg = msg->nextp;
		}

		/* post the list to the RQ */
		nposted = 0;
		status = ibt_post_recv(chanhdl, wrlist, npost, &nposted);
		if ((status != IBT_SUCCESS) || (nposted != npost)) {
			ISER_LOG(CE_NOTE, "iser_ib_post_recv: ibt_post_recv "
			    "failed: requested (%d) posted (%d) status (%d)",
			    npost, nposted, status);
			/*
			 * NOTE(review): the messages that were not posted
			 * (the rest of this batch and the remainder of the
			 * msg list) are not handed back anywhere in this
			 * function -- check whether they are leaked.
			 */
			total_num -= nposted;
			break;
		}

		/* decrement total number to post by the number posted */
		total_num -= nposted;
	}

	/* Account for what was actually posted */
	mutex_enter(&iser_qp->qp_lock);
	if (total_num != 0) {
		ISER_LOG(CE_NOTE, "iser_ib_post_recv: unable to fill RQ, "
		    "failed to post (%d) WRs", total_num);
		iser_qp->rq_level += rq_space - total_num;
	} else {
		iser_qp->rq_level += rq_space;
	}

	/*
	 * Now that we've filled the RQ, check that all of the recv WRs
	 * haven't just been immediately consumed. If so, taskqpending is
	 * still B_TRUE, so we need to fire off a taskq thread to post
	 * more WRs.
	 */
	if (iser_qp->rq_level == 0) {
		mutex_exit(&iser_qp->qp_lock);
		status = iser_ib_post_recv_async(chanhdl);
		if (status != DDI_SUCCESS) {
			ISER_LOG(CE_NOTE, "iser_ib_post_recv: failed to "
			    "dispatch followup routine");
			/* Failed to dispatch, clear pending flag */
			mutex_enter(&iser_qp->qp_lock);
			iser_qp->rq_taskqpending = B_FALSE;
			mutex_exit(&iser_qp->qp_lock);
		}
	} else {
		/*
		 * We're done, we've filled the RQ. Clear the taskq
		 * flag so that we can run again.
		 */
		iser_qp->rq_taskqpending = B_FALSE;
		mutex_exit(&iser_qp->qp_lock);
	}

	mutex_exit(&chan->ic_conn->ic_lock);
}
921 
922 /*
923  * iser_ib_handle_portup_event()
924  * This handles the IBT_EVENT_PORT_UP unaffiliated asynchronous event.
925  *
926  * To facilitate a seamless bringover of the port and configure the CM service
927  * for inbound iSER service requests on this newly active port, the existing
928  * IDM services will be checked for iSER support.
929  * If an iSER service was already created, then this service will simply be
930  * bound to the gid of the newly active port. If on the other hand, the CM
931  * service did not exist, i.e. only socket communication, then a new CM
932  * service will be first registered with the saved service parameters and
933  * then bound to the newly active port.
934  *
935  */
936 /* ARGSUSED */
937 static void
938 iser_ib_handle_portup_event(ibt_hca_hdl_t hdl, ibt_async_event_t *event)
939 {
940 	iser_hca_t		*hca;
941 	ib_gid_t		gid;
942 	idm_svc_t		*idm_svc;
943 	int			status;
944 
945 	ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event: HCA(0x%llx) port(%d)",
946 	    (longlong_t)event->ev_hca_guid, event->ev_port);
947 
948 	/*
949 	 * Query all ports on the HCA and update the port information
950 	 * maintainted in the iser_hca_t structure
951 	 */
952 	hca = iser_ib_guid2hca(event->ev_hca_guid);
953 	if (hca == NULL) {
954 
955 		/* HCA is just made available, first port on that HCA */
956 		hca = iser_ib_alloc_hca(event->ev_hca_guid);
957 
958 		mutex_enter(&iser_state->is_hcalist_lock);
959 		list_insert_tail(&iser_state->is_hcalist, hca);
960 		iser_state->is_num_hcas++;
961 		mutex_exit(&iser_state->is_hcalist_lock);
962 
963 	} else {
964 
965 		status = iser_ib_update_hcaports(hca);
966 
967 		if (status != IBT_SUCCESS) {
968 			ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event "
969 			    "status(0x%x): iser_ib_update_hcaports failed: "
970 			    "HCA(0x%llx) port(%d)", status,
971 			    (longlong_t)event->ev_hca_guid, event->ev_port);
972 			return;
973 		}
974 	}
975 
976 	gid = hca->hca_port_info[event->ev_port - 1].p_sgid_tbl[0];
977 
978 	/*
979 	 * Iterate through the global list of IDM target services
980 	 * and check for existing iSER CM service.
981 	 */
982 	mutex_enter(&idm.idm_global_mutex);
983 	for (idm_svc = list_head(&idm.idm_tgt_svc_list);
984 	    idm_svc != NULL;
985 	    idm_svc = list_next(&idm.idm_tgt_svc_list, idm_svc)) {
986 
987 
988 		if (idm_svc->is_iser_svc == NULL) {
989 
990 			/* Establish a new CM service for iSER requests */
991 			status = iser_tgt_svc_create(
992 			    &idm_svc->is_svc_req, idm_svc);
993 
994 			if (status != IBT_SUCCESS) {
995 				ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event "
996 				    "status(0x%x): iser_tgt_svc_create failed: "
997 				    "HCA(0x%llx) port(%d)", status,
998 				    (longlong_t)event->ev_hca_guid,
999 				    event->ev_port);
1000 
1001 				continue;
1002 			}
1003 		}
1004 
1005 		status = iser_ib_activate_port(
1006 		    idm_svc, event->ev_hca_guid, gid);
1007 		if (status != IBT_SUCCESS) {
1008 
1009 			ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event "
1010 			    "status(0x%x): Bind service on port "
1011 			    "(%llx:%llx) failed",
1012 			    status, (longlong_t)gid.gid_prefix,
1013 			    (longlong_t)gid.gid_guid);
1014 
1015 			continue;
1016 		}
1017 		ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event: service bound "
1018 		    "HCA(0x%llx) port(%d)", (longlong_t)event->ev_hca_guid,
1019 		    event->ev_port);
1020 	}
1021 	mutex_exit(&idm.idm_global_mutex);
1022 
1023 	ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event success: "
1024 	    "HCA(0x%llx) port(%d)", (longlong_t)event->ev_hca_guid,
1025 	    event->ev_port);
1026 }
1027 
1028 /*
1029  * iser_ib_handle_portdown_event()
1030  * This handles the IBT_EVENT_PORT_DOWN unaffiliated asynchronous error.
1031  *
1032  * Unconfigure the CM service on the deactivated port and teardown the
1033  * connections that are using the CM service.
1034  */
1035 /* ARGSUSED */
1036 static void
1037 iser_ib_handle_portdown_event(ibt_hca_hdl_t hdl, ibt_async_event_t *event)
1038 {
1039 	iser_hca_t		*hca;
1040 	ib_gid_t		gid;
1041 	int			status;
1042 
1043 	/*
1044 	 * Query all ports on the HCA and update the port information
1045 	 * maintainted in the iser_hca_t structure
1046 	 */
1047 	hca = iser_ib_guid2hca(event->ev_hca_guid);
1048 	ASSERT(hca != NULL);
1049 
1050 	status = iser_ib_update_hcaports(hca);
1051 	if (status != IBT_SUCCESS) {
1052 		ISER_LOG(CE_NOTE, "iser_ib_handle_portdown_event status(0x%x): "
1053 		    "ibt_ib_update_hcaports failed: HCA(0x%llx) port(%d)",
1054 		    status, (longlong_t)event->ev_hca_guid, event->ev_port);
1055 		return;
1056 	}
1057 
1058 	/* get the gid of the new port */
1059 	gid = hca->hca_port_info[event->ev_port - 1].p_sgid_tbl[0];
1060 	iser_ib_deactivate_port(event->ev_hca_guid, gid);
1061 
1062 	ISER_LOG(CE_NOTE, "iser_ib_handle_portdown_event success: "
1063 	    "HCA(0x%llx) port(%d)", (longlong_t)event->ev_hca_guid,
1064 	    event->ev_port);
1065 }
1066 
1067 /*
1068  * iser_ib_handle_hca_detach_event()
1069  * Quiesce all activity bound for the port, teardown the connection, unbind
1070  * iSER services on all ports and release the HCA handle.
1071  */
1072 /* ARGSUSED */
1073 static void
1074 iser_ib_handle_hca_detach_event(ibt_hca_hdl_t hdl, ibt_async_event_t *event)
1075 {
1076 	iser_hca_t	*nexthca, *hca;
1077 	int		i, status;
1078 
1079 	ISER_LOG(CE_NOTE, "iser_ib_handle_hca_detach_event: HCA(0x%llx)",
1080 	    (longlong_t)event->ev_hca_guid);
1081 
1082 	hca = iser_ib_guid2hca(event->ev_hca_guid);
1083 	for (i = 0; i < hca->hca_num_ports; i++) {
1084 		iser_ib_deactivate_port(hca->hca_guid,
1085 		    hca->hca_port_info[i].p_sgid_tbl[0]);
1086 	}
1087 
1088 	/*
1089 	 * Update the HCA list maintained in the iser_state. Free the
1090 	 * resources allocated to the HCA, i.e. caches, protection domain
1091 	 */
1092 	mutex_enter(&iser_state->is_hcalist_lock);
1093 
1094 	for (hca = list_head(&iser_state->is_hcalist);
1095 	    hca != NULL;
1096 	    hca = nexthca) {
1097 
1098 		nexthca = list_next(&iser_state->is_hcalist, hca);
1099 
1100 		if (hca->hca_guid == event->ev_hca_guid) {
1101 
1102 			list_remove(&iser_state->is_hcalist, hca);
1103 			iser_state->is_num_hcas--;
1104 
1105 			status = iser_ib_free_hca(hca);
1106 			if (status != DDI_SUCCESS) {
1107 				ISER_LOG(CE_WARN, "iser_ib_handle_hca_detach: "
1108 				    "Failed to free hca(%p)", (void *)hca);
1109 				list_insert_tail(&iser_state->is_hcalist, hca);
1110 				iser_state->is_num_hcas++;
1111 			}
1112 			/* No way to return status to IBT if this fails */
1113 		}
1114 	}
1115 	mutex_exit(&iser_state->is_hcalist_lock);
1116 
1117 }
1118 
1119 /*
1120  * iser_ib_async_handler
1121  * An IBT Asynchronous Event handler is registered it with the framework and
1122  * passed via the ibt_attach() routine. This function handles the following
1123  * asynchronous events.
1124  * IBT_EVENT_PORT_UP
1125  * IBT_ERROR_PORT_DOWN
1126  * IBT_HCA_ATTACH_EVENT
1127  * IBT_HCA_DETACH_EVENT
1128  */
1129 /* ARGSUSED */
1130 void
1131 iser_ib_async_handler(void *clntp, ibt_hca_hdl_t hdl, ibt_async_code_t code,
1132     ibt_async_event_t *event)
1133 {
1134 	switch (code) {
1135 	case IBT_EVENT_PORT_UP:
1136 		iser_ib_handle_portup_event(hdl, event);
1137 		break;
1138 
1139 	case IBT_ERROR_PORT_DOWN:
1140 		iser_ib_handle_portdown_event(hdl, event);
1141 		break;
1142 
1143 	case IBT_HCA_ATTACH_EVENT:
1144 		/*
1145 		 * A new HCA device is available for use, ignore this
1146 		 * event because the corresponding IBT_EVENT_PORT_UP
1147 		 * events will get triggered and handled accordingly.
1148 		 */
1149 		break;
1150 
1151 	case IBT_HCA_DETACH_EVENT:
1152 		iser_ib_handle_hca_detach_event(hdl, event);
1153 		break;
1154 
1155 	default:
1156 		break;
1157 	}
1158 }
1159 
1160 /*
1161  * iser_ib_init_hcas
1162  *
1163  * This function opens all the HCA devices, gathers the HCA state information
1164  * and adds the HCA handle for each HCA found in the iser_soft_state.
1165  */
1166 static int
1167 iser_ib_init_hcas(void)
1168 {
1169 	ib_guid_t	*guid;
1170 	int		num_hcas;
1171 	int		i;
1172 	iser_hca_t	*hca;
1173 
1174 	/* Retrieve the HCA list */
1175 	num_hcas = ibt_get_hca_list(&guid);
1176 	if (num_hcas == 0) {
1177 		/*
1178 		 * This shouldn't happen, but might if we have all HCAs
1179 		 * detach prior to initialization.
1180 		 */
1181 		return (DDI_FAILURE);
1182 	}
1183 
1184 	/* Initialize the hcalist lock */
1185 	mutex_init(&iser_state->is_hcalist_lock, NULL, MUTEX_DRIVER, NULL);
1186 
1187 	/* Create the HCA list */
1188 	list_create(&iser_state->is_hcalist, sizeof (iser_hca_t),
1189 	    offsetof(iser_hca_t, hca_node));
1190 
1191 	for (i = 0; i < num_hcas; i++) {
1192 
1193 		ISER_LOG(CE_NOTE, "iser_ib_init_hcas: initializing HCA "
1194 		    "(0x%llx)", (longlong_t)guid[i]);
1195 
1196 		hca = iser_ib_alloc_hca(guid[i]);
1197 		if (hca == NULL) {
1198 			/* This shouldn't happen, teardown and fail */
1199 			(void) iser_ib_fini_hcas();
1200 			(void) ibt_free_hca_list(guid, num_hcas);
1201 			return (DDI_FAILURE);
1202 		}
1203 
1204 		mutex_enter(&iser_state->is_hcalist_lock);
1205 		list_insert_tail(&iser_state->is_hcalist, hca);
1206 		iser_state->is_num_hcas++;
1207 		mutex_exit(&iser_state->is_hcalist_lock);
1208 
1209 	}
1210 
1211 	/* Free the IBT HCA list */
1212 	(void) ibt_free_hca_list(guid, num_hcas);
1213 
1214 	/* Check that we've initialized at least one HCA */
1215 	mutex_enter(&iser_state->is_hcalist_lock);
1216 	if (list_is_empty(&iser_state->is_hcalist)) {
1217 		ISER_LOG(CE_NOTE, "iser_ib_init_hcas: failed to initialize "
1218 		    "any HCAs");
1219 
1220 		mutex_exit(&iser_state->is_hcalist_lock);
1221 		(void) iser_ib_fini_hcas();
1222 		return (DDI_FAILURE);
1223 	}
1224 	mutex_exit(&iser_state->is_hcalist_lock);
1225 
1226 	return (DDI_SUCCESS);
1227 }
1228 
1229 /*
1230  * iser_ib_fini_hcas
1231  *
1232  * Teardown the iSER HCA list initialized above.
1233  */
1234 static int
1235 iser_ib_fini_hcas(void)
1236 {
1237 	iser_hca_t	*nexthca, *hca;
1238 	int		status;
1239 
1240 	mutex_enter(&iser_state->is_hcalist_lock);
1241 	for (hca = list_head(&iser_state->is_hcalist);
1242 	    hca != NULL;
1243 	    hca = nexthca) {
1244 
1245 		nexthca = list_next(&iser_state->is_hcalist, hca);
1246 
1247 		list_remove(&iser_state->is_hcalist, hca);
1248 
1249 		status = iser_ib_free_hca(hca);
1250 		if (status != IBT_SUCCESS) {
1251 			ISER_LOG(CE_NOTE, "iser_ib_fini_hcas: failed to free "
1252 			    "HCA during fini");
1253 			list_insert_tail(&iser_state->is_hcalist, hca);
1254 			return (DDI_FAILURE);
1255 		}
1256 
1257 		iser_state->is_num_hcas--;
1258 
1259 	}
1260 	mutex_exit(&iser_state->is_hcalist_lock);
1261 	list_destroy(&iser_state->is_hcalist);
1262 	mutex_destroy(&iser_state->is_hcalist_lock);
1263 
1264 	return (DDI_SUCCESS);
1265 }
1266 
1267 /*
1268  * iser_ib_alloc_hca
1269  *
1270  * This function opens the given HCA device, gathers the HCA state information
1271  * and adds the HCA handle
1272  */
1273 static iser_hca_t *
1274 iser_ib_alloc_hca(ib_guid_t guid)
1275 {
1276 	iser_hca_t	*hca;
1277 	int		status;
1278 
1279 	/* Allocate an iser_hca_t HCA handle */
1280 	hca = (iser_hca_t *)kmem_zalloc(sizeof (iser_hca_t), KM_SLEEP);
1281 
1282 	/* Open this HCA */
1283 	status = ibt_open_hca(iser_state->is_ibhdl, guid, &hca->hca_hdl);
1284 	if (status != IBT_SUCCESS) {
1285 		ISER_LOG(CE_NOTE, "iser_ib_alloc_hca: ibt_open_hca failed:"
1286 		    " guid (0x%llx) status (0x%x)", (longlong_t)guid, status);
1287 		kmem_free(hca, sizeof (iser_hca_t));
1288 		return (NULL);
1289 	}
1290 
1291 	hca->hca_guid		= guid;
1292 	hca->hca_clnt_hdl	= iser_state->is_ibhdl;
1293 
1294 	/* Query the HCA */
1295 	status = ibt_query_hca(hca->hca_hdl, &hca->hca_attr);
1296 	if (status != IBT_SUCCESS) {
1297 		ISER_LOG(CE_NOTE, "iser_ib_alloc_hca: ibt_query_hca "
1298 		    "failure: guid (0x%llx) status (0x%x)",
1299 		    (longlong_t)guid, status);
1300 		(void) ibt_close_hca(hca->hca_hdl);
1301 		kmem_free(hca, sizeof (iser_hca_t));
1302 		return (NULL);
1303 	}
1304 
1305 	/* Query all ports on the HCA */
1306 	status = ibt_query_hca_ports(hca->hca_hdl, 0,
1307 	    &hca->hca_port_info, &hca->hca_num_ports,
1308 	    &hca->hca_port_info_sz);
1309 	if (status != IBT_SUCCESS) {
1310 		ISER_LOG(CE_NOTE, "iser_ib_alloc_hca: "
1311 		    "ibt_query_hca_ports failure: guid (0x%llx) "
1312 		    "status (0x%x)", (longlong_t)guid, status);
1313 		(void) ibt_close_hca(hca->hca_hdl);
1314 		kmem_free(hca, sizeof (iser_hca_t));
1315 		return (NULL);
1316 	}
1317 
1318 	/* Allocate a single PD on this HCA */
1319 	status = ibt_alloc_pd(hca->hca_hdl, IBT_PD_NO_FLAGS,
1320 	    &hca->hca_pdhdl);
1321 	if (status != IBT_SUCCESS) {
1322 		ISER_LOG(CE_NOTE, "iser_ib_alloc_hca: ibt_alloc_pd "
1323 		    "failure: guid (0x%llx) status (0x%x)",
1324 		    (longlong_t)guid, status);
1325 		(void) ibt_close_hca(hca->hca_hdl);
1326 		ibt_free_portinfo(hca->hca_port_info, hca->hca_port_info_sz);
1327 		kmem_free(hca, sizeof (iser_hca_t));
1328 		return (NULL);
1329 	}
1330 
1331 	/* Initialize the message and data MR caches for this HCA */
1332 	iser_init_hca_caches(hca);
1333 
1334 	return (hca);
1335 }
1336 
1337 static int
1338 iser_ib_free_hca(iser_hca_t *hca)
1339 {
1340 	int			status;
1341 	ibt_hca_portinfo_t	*hca_port_info;
1342 	uint_t			hca_port_info_sz;
1343 
1344 	ASSERT(hca != NULL);
1345 	if (hca->hca_failed)
1346 		return (DDI_FAILURE);
1347 
1348 	hca_port_info = hca->hca_port_info;
1349 	hca_port_info_sz = hca->hca_port_info_sz;
1350 
1351 	/*
1352 	 * Free the memory regions before freeing
1353 	 * the associated protection domain
1354 	 */
1355 	iser_fini_hca_caches(hca);
1356 
1357 	status = ibt_free_pd(hca->hca_hdl, hca->hca_pdhdl);
1358 	if (status != IBT_SUCCESS) {
1359 		ISER_LOG(CE_NOTE, "iser_ib_free_hca: failed to free PD "
1360 		    "status=0x%x", status);
1361 		goto out_caches;
1362 	}
1363 
1364 	status = ibt_close_hca(hca->hca_hdl);
1365 	if (status != IBT_SUCCESS) {
1366 		ISER_LOG(CE_NOTE, "iser_ib_fini_hcas: failed to close HCA "
1367 		    "status=0x%x", status);
1368 		goto out_pd;
1369 	}
1370 
1371 	ibt_free_portinfo(hca_port_info, hca_port_info_sz);
1372 
1373 	kmem_free(hca, sizeof (iser_hca_t));
1374 	return (DDI_SUCCESS);
1375 
1376 	/*
1377 	 * We only managed to partially tear down the HCA, try to put it back
1378 	 * like it was before returning.
1379 	 */
1380 out_pd:
1381 	status = ibt_alloc_pd(hca->hca_hdl, IBT_PD_NO_FLAGS, &hca->hca_pdhdl);
1382 	if (status != IBT_SUCCESS) {
1383 		hca->hca_failed = B_TRUE;
1384 		/* Report error and exit */
1385 		ISER_LOG(CE_NOTE, "iser_ib_free_hca: could not re-alloc PD "
1386 		    "status=0x%x", status);
1387 		return (DDI_FAILURE);
1388 	}
1389 
1390 out_caches:
1391 	iser_init_hca_caches(hca);
1392 
1393 	return (DDI_FAILURE);
1394 }
1395 
1396 static int
1397 iser_ib_update_hcaports(iser_hca_t *hca)
1398 {
1399 	ibt_hca_portinfo_t	*pinfop, *oldpinfop;
1400 	uint_t			size, oldsize, nport;
1401 	int			status;
1402 
1403 	ASSERT(hca != NULL);
1404 
1405 	status = ibt_query_hca_ports(hca->hca_hdl, 0, &pinfop, &nport, &size);
1406 	if (status != IBT_SUCCESS) {
1407 		ISER_LOG(CE_NOTE, "ibt_query_hca_ports failed: %d", status);
1408 		return (status);
1409 	}
1410 
1411 	oldpinfop = hca->hca_port_info;
1412 	oldsize	= hca->hca_port_info_sz;
1413 	hca->hca_port_info = pinfop;
1414 	hca->hca_port_info_sz = size;
1415 
1416 	(void) ibt_free_portinfo(oldpinfop, oldsize);
1417 
1418 	return (IBT_SUCCESS);
1419 }
1420 
1421 /*
1422  * iser_ib_gid2hca
1423  * Given a gid, find the corresponding hca
1424  */
1425 iser_hca_t *
1426 iser_ib_gid2hca(ib_gid_t gid)
1427 {
1428 
1429 	iser_hca_t	*hca;
1430 	int		i;
1431 
1432 	mutex_enter(&iser_state->is_hcalist_lock);
1433 	for (hca = list_head(&iser_state->is_hcalist);
1434 	    hca != NULL;
1435 	    hca = list_next(&iser_state->is_hcalist, hca)) {
1436 
1437 		for (i = 0; i < hca->hca_num_ports; i++) {
1438 			if ((hca->hca_port_info[i].p_sgid_tbl[0].gid_prefix ==
1439 			    gid.gid_prefix) &&
1440 			    (hca->hca_port_info[i].p_sgid_tbl[0].gid_guid ==
1441 			    gid.gid_guid)) {
1442 
1443 				mutex_exit(&iser_state->is_hcalist_lock);
1444 
1445 				return (hca);
1446 			}
1447 		}
1448 	}
1449 	mutex_exit(&iser_state->is_hcalist_lock);
1450 	return (NULL);
1451 }
1452 
1453 /*
1454  * iser_ib_guid2hca
1455  * Given a HCA guid, find the corresponding HCA
1456  */
1457 iser_hca_t *
1458 iser_ib_guid2hca(ib_guid_t guid)
1459 {
1460 
1461 	iser_hca_t	*hca;
1462 
1463 	mutex_enter(&iser_state->is_hcalist_lock);
1464 	for (hca = list_head(&iser_state->is_hcalist);
1465 	    hca != NULL;
1466 	    hca = list_next(&iser_state->is_hcalist, hca)) {
1467 
1468 		if (hca->hca_guid == guid) {
1469 			mutex_exit(&iser_state->is_hcalist_lock);
1470 			return (hca);
1471 		}
1472 	}
1473 	mutex_exit(&iser_state->is_hcalist_lock);
1474 	return (NULL);
1475 }
1476 
1477 /*
1478  * iser_ib_conv_sockaddr2ibtaddr
1479  * This function converts a socket address into the IBT format
1480  */
1481 void iser_ib_conv_sockaddr2ibtaddr(
1482     idm_sockaddr_t *saddr, ibt_ip_addr_t *ibt_addr)
1483 {
1484 	if (saddr == NULL) {
1485 		ibt_addr->family = AF_UNSPEC;
1486 		ibt_addr->un.ip4addr = 0;
1487 	} else {
1488 		switch (saddr->sin.sa_family) {
1489 		case AF_INET:
1490 
1491 			ibt_addr->family	= saddr->sin4.sin_family;
1492 			ibt_addr->un.ip4addr	= saddr->sin4.sin_addr.s_addr;
1493 			break;
1494 
1495 		case AF_INET6:
1496 
1497 			ibt_addr->family	= saddr->sin6.sin6_family;
1498 			ibt_addr->un.ip6addr	= saddr->sin6.sin6_addr;
1499 			break;
1500 
1501 		default:
1502 			ibt_addr->family = AF_UNSPEC;
1503 		}
1504 
1505 	}
1506 }
1507 
1508 /*
1509  * iser_ib_conv_ibtaddr2sockaddr
1510  * This function converts an IBT ip address handle to a sockaddr
1511  */
1512 void iser_ib_conv_ibtaddr2sockaddr(struct sockaddr_storage *ss,
1513     ibt_ip_addr_t *ibt_addr, in_port_t port)
1514 {
1515 	struct sockaddr_in *sin;
1516 	struct sockaddr_in6 *sin6;
1517 
1518 	switch (ibt_addr->family) {
1519 	case AF_INET:
1520 	case AF_UNSPEC:
1521 
1522 		sin = (struct sockaddr_in *)ibt_addr;
1523 		sin->sin_port = ntohs(port);
1524 		bcopy(sin, ss, sizeof (struct sockaddr_in));
1525 		break;
1526 
1527 	case AF_INET6:
1528 
1529 		sin6 = (struct sockaddr_in6 *)ibt_addr;
1530 		sin6->sin6_port = ntohs(port);
1531 		bcopy(sin6, ss, sizeof (struct sockaddr_in6));
1532 		break;
1533 
1534 	default:
1535 		ISER_LOG(CE_NOTE, "iser_ib_conv_ibtaddr2sockaddr: "
1536 		    "unknown family type: 0x%x", ibt_addr->family);
1537 	}
1538 }
1539 
1540 /*
1541  * iser_ib_setup_cq
1542  * This function sets up the Completion Queue size and allocates the specified
1543  * Completion Queue
1544  */
1545 static int
1546 iser_ib_setup_cq(ibt_hca_hdl_t hca_hdl, uint_t cq_size, ibt_cq_hdl_t *cq_hdl)
1547 {
1548 
1549 	ibt_cq_attr_t		cq_attr;
1550 	int			status;
1551 
1552 	cq_attr.cq_size		= cq_size;
1553 	cq_attr.cq_sched	= 0;
1554 	cq_attr.cq_flags	= IBT_CQ_NO_FLAGS;
1555 
1556 	/* Allocate a Completion Queue */
1557 	status = ibt_alloc_cq(hca_hdl, &cq_attr, cq_hdl, NULL);
1558 	if (status != IBT_SUCCESS) {
1559 		ISER_LOG(CE_NOTE, "iser_ib_setup_cq: ibt_alloc_cq failure (%d)",
1560 		    status);
1561 		return (status);
1562 	}
1563 
1564 	return (ISER_STATUS_SUCCESS);
1565 }
1566 
1567 /*
1568  * iser_ib_setup_chanargs
1569  *
1570  */
1571 static void
1572 iser_ib_setup_chanargs(uint8_t hca_port, ibt_cq_hdl_t scq_hdl,
1573     ibt_cq_hdl_t rcq_hdl, uint_t sq_size, uint_t rq_size,
1574     ibt_pd_hdl_t hca_pdhdl, ibt_rc_chan_alloc_args_t *cargs)
1575 {
1576 
1577 	bzero(cargs, sizeof (ibt_rc_chan_alloc_args_t));
1578 
1579 	/*
1580 	 * Set up the size of the channels send queue, receive queue and the
1581 	 * maximum number of elements in a scatter gather list of work requests
1582 	 * posted to the send and receive queues.
1583 	 */
1584 	cargs->rc_sizes.cs_sq		= sq_size;
1585 	cargs->rc_sizes.cs_rq		= rq_size;
1586 	cargs->rc_sizes.cs_sq_sgl	= ISER_IB_SGLIST_SIZE;
1587 	cargs->rc_sizes.cs_rq_sgl	= ISER_IB_SGLIST_SIZE;
1588 
1589 	/*
1590 	 * All Work requests signaled on a WR basis will receive a send
1591 	 * request completion.
1592 	 */
1593 	cargs->rc_flags			= IBT_ALL_SIGNALED;
1594 
1595 	/* Enable RDMA read and RDMA write on the channel end points */
1596 	cargs->rc_control		= IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1597 
1598 	/* Set the local hca port on which the channel is allocated */
1599 	cargs->rc_hca_port_num		= hca_port;
1600 
1601 	/* Set the Send and Receive Completion Queue handles */
1602 	cargs->rc_scq			= scq_hdl;
1603 	cargs->rc_rcq			= rcq_hdl;
1604 
1605 	/* Set the protection domain associated with the channel */
1606 	cargs->rc_pd			= hca_pdhdl;
1607 
1608 	/* No SRQ usage */
1609 	cargs->rc_srq			= NULL;
1610 }
1611 
1612 /*
1613  * iser_ib_init_qp
1614  * Initialize the QP handle
1615  */
1616 void
1617 iser_ib_init_qp(iser_chan_t *chan, uint_t sq_size, uint_t rq_size)
1618 {
1619 	/* Initialize the handle lock */
1620 	mutex_init(&chan->ic_qp.qp_lock, NULL, MUTEX_DRIVER, NULL);
1621 
1622 	/* Record queue sizes */
1623 	chan->ic_qp.sq_size = sq_size;
1624 	chan->ic_qp.rq_size = rq_size;
1625 
1626 	/* Initialize the RQ monitoring data */
1627 	chan->ic_qp.rq_depth  = rq_size;
1628 	chan->ic_qp.rq_level  = 0;
1629 	chan->ic_qp.rq_lwm = (chan->ic_recvcq_sz * ISER_IB_RQ_LWM_PCT) / 100;
1630 
1631 	/* Initialize the taskq flag */
1632 	chan->ic_qp.rq_taskqpending = B_FALSE;
1633 }
1634 
1635 /*
1636  * iser_ib_fini_qp
1637  * Teardown the QP handle
1638  */
1639 void
1640 iser_ib_fini_qp(iser_qp_t *qp)
1641 {
1642 	/* Destroy the handle lock */
1643 	mutex_destroy(&qp->qp_lock);
1644 }
1645 
1646 static int
1647 iser_ib_activate_port(idm_svc_t *idm_svc, ib_guid_t guid, ib_gid_t gid)
1648 {
1649 	iser_svc_t	*iser_svc;
1650 	iser_sbind_t	*is_sbind;
1651 	int		status;
1652 
1653 	iser_svc = idm_svc->is_iser_svc;
1654 
1655 	/*
1656 	 * Save the address of the service bind handle in the
1657 	 * iser_svc_t to undo the service binding at a later time
1658 	 */
1659 	is_sbind = kmem_zalloc(sizeof (iser_sbind_t), KM_SLEEP);
1660 	is_sbind->is_gid	= gid;
1661 	is_sbind->is_guid	= guid;
1662 
1663 	status  = ibt_bind_service(iser_svc->is_srvhdl, gid, NULL,
1664 	    idm_svc, &is_sbind->is_sbindhdl);
1665 
1666 	if (status != IBT_SUCCESS) {
1667 		ISER_LOG(CE_NOTE, "iser_ib_activate_port: status(0x%x): "
1668 		    "Bind service(%llx) on port(%llx:%llx) failed",
1669 		    status, (longlong_t)iser_svc->is_svcid,
1670 		    (longlong_t)gid.gid_prefix, (longlong_t)gid.gid_guid);
1671 
1672 		kmem_free(is_sbind, sizeof (iser_sbind_t));
1673 
1674 		return (status);
1675 	}
1676 
1677 	list_insert_tail(&iser_svc->is_sbindlist, is_sbind);
1678 
1679 	return (IBT_SUCCESS);
1680 }
1681 
1682 static void
1683 iser_ib_deactivate_port(ib_guid_t hca_guid, ib_gid_t gid)
1684 {
1685 	iser_svc_t	*iser_svc;
1686 	iser_conn_t	*iser_conn;
1687 	iser_sbind_t	*is_sbind;
1688 	idm_conn_t	*idm_conn;
1689 
1690 	/*
1691 	 * Iterate through the global list of IDM target connections.
1692 	 * Issue a TRANSPORT_FAIL for any connections on this port, and
1693 	 * if there is a bound service running on the port, tear it down.
1694 	 */
1695 	mutex_enter(&idm.idm_global_mutex);
1696 	for (idm_conn = list_head(&idm.idm_tgt_conn_list);
1697 	    idm_conn != NULL;
1698 	    idm_conn = list_next(&idm.idm_tgt_conn_list, idm_conn)) {
1699 
1700 		if (idm_conn->ic_transport_type != IDM_TRANSPORT_TYPE_ISER) {
1701 			/* this is not an iSER connection, skip it */
1702 			continue;
1703 		}
1704 
1705 		iser_conn = idm_conn->ic_transport_private;
1706 		if (iser_conn->ic_chan->ic_ibt_path.pi_hca_guid != hca_guid) {
1707 			/* this iSER connection is on a different port */
1708 			continue;
1709 		}
1710 
1711 		/* Fail the transport for this connection */
1712 		idm_conn_event(idm_conn, CE_TRANSPORT_FAIL, IDM_STATUS_FAIL);
1713 
1714 		if (idm_conn->ic_conn_type == CONN_TYPE_INI) {
1715 			/* initiator connection, nothing else to do */
1716 			continue;
1717 		}
1718 
1719 		/* Check for a service binding */
1720 		iser_svc = idm_conn->ic_svc_binding->is_iser_svc;
1721 		is_sbind = iser_ib_get_bind(iser_svc, hca_guid, gid);
1722 		if (is_sbind != NULL) {
1723 			/* This service is still bound, tear it down */
1724 			ibt_unbind_service(iser_svc->is_srvhdl,
1725 			    is_sbind->is_sbindhdl);
1726 			list_remove(&iser_svc->is_sbindlist, is_sbind);
1727 			kmem_free(is_sbind, sizeof (iser_sbind_t));
1728 		}
1729 	}
1730 	mutex_exit(&idm.idm_global_mutex);
1731 }
1732 
1733 static iser_sbind_t *
1734 iser_ib_get_bind(iser_svc_t *iser_svc, ib_guid_t hca_guid, ib_gid_t gid)
1735 {
1736 	iser_sbind_t	*is_sbind;
1737 
1738 	for (is_sbind = list_head(&iser_svc->is_sbindlist);
1739 	    is_sbind != NULL;
1740 	    is_sbind = list_next(&iser_svc->is_sbindlist, is_sbind)) {
1741 
1742 		if ((is_sbind->is_guid == hca_guid) &&
1743 		    (is_sbind->is_gid.gid_prefix == gid.gid_prefix) &&
1744 		    (is_sbind->is_gid.gid_guid == gid.gid_guid)) {
1745 			return (is_sbind);
1746 		}
1747 	}
1748 	return (NULL);
1749 }
1750