xref: /illumos-gate/usr/src/uts/common/xen/io/xnf.c (revision c94be9439c4f0773ef60e2cec21d548359cfea20)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
29  * Copyright 2020 RackTop Systems, Inc.
30  */
31 
32 /*
33  *
34  * Copyright (c) 2004 Christian Limpach.
35  * All rights reserved.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. This section intentionally left blank.
46  * 4. The name of the author may not be used to endorse or promote products
47  *    derived from this software without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
50  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
51  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
52  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
53  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
54  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
55  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
56  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
57  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
58  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
59  */
60 /*
61  * Section 3 of the above license was updated in response to bug 6379571.
62  */
63 
64 /*
65  * xnf.c - GLDv3 network driver for domU.
66  */
67 
68 /*
69  * This driver uses four per-instance locks:
70  *
71  * xnf_gref_lock:
72  *
73  *    Protects access to the grant reference list stored in
74  *    xnf_gref_head. Grant references should be acquired and released
75  *    using gref_get() and gref_put() respectively.
76  *
77  * xnf_schedlock:
78  *
79  *    Protects:
80  *    xnf_need_sched - used to record that a previous transmit attempt
81  *       failed (and consequently it will be necessary to call
82  *       mac_tx_update() when transmit resources are available).
83  *    xnf_pending_multicast - the number of multicast requests that
84  *       have been submitted to the backend for which we have not
85  *       processed responses.
86  *
87  * xnf_txlock:
88  *
89  *    Protects the transmit ring (xnf_tx_ring) and associated
90  *    structures (notably xnf_tx_pkt_id and xnf_tx_pkt_id_head).
91  *
92  * xnf_rxlock:
93  *
94  *    Protects the receive ring (xnf_rx_ring) and associated
95  *    structures (notably xnf_rx_pkt_info).
96  *
97  * If driver-global state that affects both the transmit and receive
98  * rings is manipulated, both xnf_txlock and xnf_rxlock should be
99  * held, in that order.
100  *
101  * xnf_schedlock is acquired both whilst holding xnf_txlock and
102  * without. It should always be acquired after xnf_txlock if both are
103  * held.
104  *
105  * Notes:
106  * - atomic_add_64() is used to manipulate counters where we require
107  *   accuracy. For counters intended only for observation by humans,
108  *   post increment/decrement are used instead.
109  */
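
/*
 * Illustrative acquisition order when both xnf_txlock and
 * xnf_schedlock are needed (a sketch of the rule above, not an
 * exhaustive list of callers; cf. xnf_send()):
 *
 *	mutex_enter(&xnfp->xnf_txlock);
 *	mutex_enter(&xnfp->xnf_schedlock);
 *	xnfp->xnf_need_sched = B_TRUE;
 *	mutex_exit(&xnfp->xnf_schedlock);
 *	...
 *	mutex_exit(&xnfp->xnf_txlock);
 */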
110 
111 #include <sys/types.h>
112 #include <sys/errno.h>
113 #include <sys/param.h>
114 #include <sys/sysmacros.h>
115 #include <sys/systm.h>
116 #include <sys/stream.h>
117 #include <sys/strsubr.h>
118 #include <sys/strsun.h>
119 #include <sys/conf.h>
120 #include <sys/ddi.h>
121 #include <sys/devops.h>
122 #include <sys/sunddi.h>
123 #include <sys/sunndi.h>
124 #include <sys/dlpi.h>
125 #include <sys/ethernet.h>
126 #include <sys/strsun.h>
127 #include <sys/pattr.h>
128 #include <inet/ip.h>
129 #include <inet/ip_impl.h>
130 #include <inet/tcp.h>
131 #include <netinet/udp.h>
132 #include <sys/gld.h>
133 #include <sys/modctl.h>
134 #include <sys/mac_provider.h>
135 #include <sys/mac_ether.h>
136 #include <sys/bootinfo.h>
137 #include <sys/mach_mmu.h>
138 #ifdef	XPV_HVM_DRIVER
139 #include <sys/xpv_support.h>
140 #include <sys/hypervisor.h>
141 #else
142 #include <sys/hypervisor.h>
143 #include <sys/evtchn_impl.h>
144 #include <sys/balloon_impl.h>
145 #endif
146 #include <xen/public/io/netif.h>
147 #include <sys/gnttab.h>
148 #include <xen/sys/xendev.h>
149 #include <sys/sdt.h>
150 #include <sys/note.h>
151 #include <sys/debug.h>
152 
153 #include <io/xnf.h>
154 
155 #if defined(DEBUG) || defined(__lint)
156 #define	XNF_DEBUG
157 #endif
158 
159 #ifdef XNF_DEBUG
160 int xnf_debug = 0;
161 xnf_t *xnf_debug_instance = NULL;
162 #endif
163 
164 /*
165  * On a 32-bit PAE system, physical and machine addresses are larger
166  * than 32 bits.  ddi_btop() on such systems takes an unsigned long
167  * argument, and so addresses above 4G are truncated before ddi_btop()
168  * gets to see them.  To avoid this, code the shift operation here.
169  */
170 #define	xnf_btop(addr)	((addr) >> PAGESHIFT)
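
/*
 * For example (a sketch, assuming the usual PAGESHIFT of 12): a machine
 * address of 0x100000000 (4G) shifts to page number 0x100000 here,
 * whereas a 32-bit unsigned long argument to ddi_btop() would first be
 * truncated to 0.
 */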
171 
172 /*
173  * The parameters below should only be changed in /etc/system, never in mdb.
174  */
175 
176 /*
177  * Should we use the multicast control feature if the backend provides
178  * it?
179  */
180 boolean_t xnf_multicast_control = B_TRUE;
181 
182 /*
183  * Should we allow scatter-gather for tx if backend allows it?
184  */
185 boolean_t xnf_enable_tx_sg = B_TRUE;
186 
187 /*
188  * Should we allow scatter-gather for rx if backend allows it?
189  */
190 boolean_t xnf_enable_rx_sg = B_TRUE;
191 
192 /*
193  * Should we allow lso for tx sends if backend allows it?
194  * Requires xnf_enable_tx_sg to be also set to TRUE.
195  */
196 boolean_t xnf_enable_lso = B_TRUE;
197 
198 /*
199  * Should we allow lro on rx if backend supports it?
200  * Requires xnf_enable_rx_sg to also be set to TRUE.
201  *
202  * !! WARNING !!
203  * LRO is not yet supported in the OS so this should be left as FALSE.
204  * !! WARNING !!
205  */
206 boolean_t xnf_enable_lro = B_FALSE;
207 
208 /*
209  * Received packets below this size are copied to a new streams buffer
210  * rather than being desballoc'ed.
211  *
212  * This value is chosen to accommodate traffic where there are a large
213  * number of small packets. For data showing a typical distribution,
214  * see:
215  *
216  * Sinha07a:
217  *	Rishi Sinha, Christos Papadopoulos, and John
218  *	Heidemann. Internet Packet Size Distributions: Some
219  *	Observations. Technical Report ISI-TR-2007-643,
220  *	USC/Information Sciences Institute, May 2007. Originally
221  *	released October 2005 as a web page
222  *	http://netweb.usc.edu/~sinha/pkt-sizes/.
223  *	<http://www.isi.edu/~johnh/PAPERS/Sinha07a.html>.
224  */
225 size_t xnf_rx_copy_limit = 64;
226 
227 #define	INVALID_GRANT_HANDLE	((grant_handle_t)-1)
228 #define	INVALID_GRANT_REF	((grant_ref_t)-1)
229 #define	INVALID_TX_ID		((uint16_t)-1)
230 
231 #define	TX_ID_TO_TXID(p, id) (&((p)->xnf_tx_pkt_id[(id)]))
232 #define	TX_ID_VALID(i) \
233 	(((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE))
234 
235 /*
236  * calculate how many pages are spanned by an mblk fragment
237  */
238 #define	xnf_mblk_pages(mp)	(MBLKL(mp) == 0 ? 0 : \
239     xnf_btop((uintptr_t)mp->b_wptr - 1) - xnf_btop((uintptr_t)mp->b_rptr) + 1)
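
/*
 * A worked example (a sketch, assuming 4k pages): a 100-byte fragment
 * whose b_rptr sits 50 bytes before a page boundary has its last byte
 * (b_wptr - 1) on the following page, so the macro yields 2; a zero
 * length fragment yields 0.
 */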
240 
241 /* Required system entry points */
242 static int	xnf_attach(dev_info_t *, ddi_attach_cmd_t);
243 static int	xnf_detach(dev_info_t *, ddi_detach_cmd_t);
244 
245 /* Required driver entry points for Nemo */
246 static int	xnf_start(void *);
247 static void	xnf_stop(void *);
248 static int	xnf_set_mac_addr(void *, const uint8_t *);
249 static int	xnf_set_multicast(void *, boolean_t, const uint8_t *);
250 static int	xnf_set_promiscuous(void *, boolean_t);
251 static mblk_t	*xnf_send(void *, mblk_t *);
252 static uint_t	xnf_intr(caddr_t);
253 static int	xnf_stat(void *, uint_t, uint64_t *);
254 static boolean_t xnf_getcapab(void *, mac_capab_t, void *);
255 static int xnf_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
256 static int xnf_setprop(void *, const char *, mac_prop_id_t, uint_t,
257     const void *);
258 static void xnf_propinfo(void *, const char *, mac_prop_id_t,
259     mac_prop_info_handle_t);
260 
261 /* Driver private functions */
262 static int xnf_alloc_dma_resources(xnf_t *);
263 static void xnf_release_dma_resources(xnf_t *);
264 static void xnf_release_mblks(xnf_t *);
265 
266 static int xnf_buf_constructor(void *, void *, int);
267 static void xnf_buf_destructor(void *, void *);
268 static xnf_buf_t *xnf_buf_get(xnf_t *, int, boolean_t);
269 #pragma inline(xnf_buf_get)
270 static void xnf_buf_put(xnf_t *, xnf_buf_t *, boolean_t);
271 #pragma inline(xnf_buf_put)
272 static void xnf_buf_refresh(xnf_buf_t *);
273 #pragma inline(xnf_buf_refresh)
274 static void xnf_buf_recycle(xnf_buf_t *);
275 
276 static int xnf_tx_buf_constructor(void *, void *, int);
277 static void xnf_tx_buf_destructor(void *, void *);
278 
279 static grant_ref_t xnf_gref_get(xnf_t *);
280 #pragma inline(xnf_gref_get)
281 static void xnf_gref_put(xnf_t *, grant_ref_t);
282 #pragma inline(xnf_gref_put)
283 
284 static xnf_txid_t *xnf_txid_get(xnf_t *);
285 #pragma inline(xnf_txid_get)
286 static void xnf_txid_put(xnf_t *, xnf_txid_t *);
287 #pragma inline(xnf_txid_put)
288 
289 static void xnf_rxbuf_hang(xnf_t *, xnf_buf_t *);
290 static int xnf_tx_clean_ring(xnf_t  *);
291 static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
292     void *, void *);
293 static boolean_t xnf_kstat_init(xnf_t *);
294 static void xnf_rx_collect(xnf_t *);
295 
296 #define	XNF_CALLBACK_FLAGS	(MC_GETCAPAB | MC_PROPERTIES)
297 
298 static mac_callbacks_t xnf_callbacks = {
299 	.mc_callbacks = XNF_CALLBACK_FLAGS,
300 	.mc_getstat = xnf_stat,
301 	.mc_start = xnf_start,
302 	.mc_stop = xnf_stop,
303 	.mc_setpromisc = xnf_set_promiscuous,
304 	.mc_multicst = xnf_set_multicast,
305 	.mc_unicst = xnf_set_mac_addr,
306 	.mc_tx = xnf_send,
307 	.mc_getcapab = xnf_getcapab,
308 	.mc_setprop = xnf_setprop,
309 	.mc_getprop = xnf_getprop,
310 	.mc_propinfo = xnf_propinfo,
311 };
312 
313 /* DMA attributes for network ring buffer */
314 static ddi_dma_attr_t ringbuf_dma_attr = {
315 	.dma_attr_version = DMA_ATTR_V0,
316 	.dma_attr_addr_lo = 0,
317 	.dma_attr_addr_hi = 0xffffffffffffffffULL,
318 	.dma_attr_count_max = 0x7fffffff,
319 	.dma_attr_align = MMU_PAGESIZE,
320 	.dma_attr_burstsizes = 0x7ff,
321 	.dma_attr_minxfer = 1,
322 	.dma_attr_maxxfer = 0xffffffffU,
323 	.dma_attr_seg = 0xffffffffffffffffULL,
324 	.dma_attr_sgllen = 1,
325 	.dma_attr_granular = 1,
326 	.dma_attr_flags = 0
327 };
328 
329 /* DMA attributes for receive data */
330 static ddi_dma_attr_t rx_buf_dma_attr = {
331 	.dma_attr_version = DMA_ATTR_V0,
332 	.dma_attr_addr_lo = 0,
333 	.dma_attr_addr_hi = 0xffffffffffffffffULL,
334 	.dma_attr_count_max = MMU_PAGEOFFSET,
335 	.dma_attr_align = MMU_PAGESIZE, /* allocation alignment */
336 	.dma_attr_burstsizes = 0x7ff,
337 	.dma_attr_minxfer = 1,
338 	.dma_attr_maxxfer = 0xffffffffU,
339 	.dma_attr_seg = 0xffffffffffffffffULL,
340 	.dma_attr_sgllen = 1,
341 	.dma_attr_granular = 1,
342 	.dma_attr_flags = 0
343 };
344 
345 /* DMA attributes for transmit data */
346 static ddi_dma_attr_t tx_buf_dma_attr = {
347 	.dma_attr_version = DMA_ATTR_V0,
348 	.dma_attr_addr_lo = 0,
349 	.dma_attr_addr_hi = 0xffffffffffffffffULL,
350 	.dma_attr_count_max = MMU_PAGEOFFSET,
351 	.dma_attr_align = 1,
352 	.dma_attr_burstsizes = 0x7ff,
353 	.dma_attr_minxfer = 1,
354 	.dma_attr_maxxfer = 0xffffffffU,
355 	.dma_attr_seg = XEN_DATA_BOUNDARY - 1, /* segment boundary */
356 	.dma_attr_sgllen = XEN_MAX_TX_DATA_PAGES, /* max number of segments */
357 	.dma_attr_granular = 1,
358 	.dma_attr_flags = 0
359 };
360 
361 /* DMA access attributes for registers and descriptors */
362 static ddi_device_acc_attr_t accattr = {
363 	DDI_DEVICE_ATTR_V0,
364 	DDI_STRUCTURE_LE_ACC,	/* This is a little-endian device */
365 	DDI_STRICTORDER_ACC
366 };
367 
368 /* DMA access attributes for data: NOT to be byte swapped. */
369 static ddi_device_acc_attr_t data_accattr = {
370 	DDI_DEVICE_ATTR_V0,
371 	DDI_NEVERSWAP_ACC,
372 	DDI_STRICTORDER_ACC
373 };
374 
375 DDI_DEFINE_STREAM_OPS(xnf_dev_ops, nulldev, nulldev, xnf_attach, xnf_detach,
376     nodev, NULL, D_MP, NULL, ddi_quiesce_not_supported);
377 
378 static struct modldrv xnf_modldrv = {
379 	&mod_driverops,
380 	"Virtual Ethernet driver",
381 	&xnf_dev_ops
382 };
383 
384 static struct modlinkage modlinkage = {
385 	MODREV_1, &xnf_modldrv, NULL
386 };
387 
388 int
389 _init(void)
390 {
391 	int r;
392 
393 	mac_init_ops(&xnf_dev_ops, "xnf");
394 	r = mod_install(&modlinkage);
395 	if (r != DDI_SUCCESS)
396 		mac_fini_ops(&xnf_dev_ops);
397 
398 	return (r);
399 }
400 
401 int
402 _fini(void)
403 {
404 	return (EBUSY); /* XXPV should be removable */
405 }
406 
407 int
408 _info(struct modinfo *modinfop)
409 {
410 	return (mod_info(&modlinkage, modinfop));
411 }
412 
413 /*
414  * Acquire a grant reference.
415  */
416 static grant_ref_t
417 xnf_gref_get(xnf_t *xnfp)
418 {
419 	grant_ref_t gref;
420 
421 	mutex_enter(&xnfp->xnf_gref_lock);
422 
423 	do {
424 		gref = gnttab_claim_grant_reference(&xnfp->xnf_gref_head);
425 
426 	} while ((gref == INVALID_GRANT_REF) &&
427 	    (gnttab_alloc_grant_references(16, &xnfp->xnf_gref_head) == 0));
428 
429 	mutex_exit(&xnfp->xnf_gref_lock);
430 
431 	if (gref == INVALID_GRANT_REF) {
432 		xnfp->xnf_stat_gref_failure++;
433 	} else {
434 		atomic_inc_64(&xnfp->xnf_stat_gref_outstanding);
435 		if (xnfp->xnf_stat_gref_outstanding > xnfp->xnf_stat_gref_peak)
436 			xnfp->xnf_stat_gref_peak =
437 			    xnfp->xnf_stat_gref_outstanding;
438 	}
439 
440 	return (gref);
441 }
442 
443 /*
444  * Release a grant reference.
445  */
446 static void
447 xnf_gref_put(xnf_t *xnfp, grant_ref_t gref)
448 {
449 	ASSERT(gref != INVALID_GRANT_REF);
450 
451 	mutex_enter(&xnfp->xnf_gref_lock);
452 	gnttab_release_grant_reference(&xnfp->xnf_gref_head, gref);
453 	mutex_exit(&xnfp->xnf_gref_lock);
454 
455 	atomic_dec_64(&xnfp->xnf_stat_gref_outstanding);
456 }
457 
458 /*
459  * Acquire a transmit id.
460  */
461 static xnf_txid_t *
462 xnf_txid_get(xnf_t *xnfp)
463 {
464 	xnf_txid_t *tidp;
465 
466 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
467 
468 	if (xnfp->xnf_tx_pkt_id_head == INVALID_TX_ID)
469 		return (NULL);
470 
471 	ASSERT(TX_ID_VALID(xnfp->xnf_tx_pkt_id_head));
472 
473 	tidp = TX_ID_TO_TXID(xnfp, xnfp->xnf_tx_pkt_id_head);
474 	xnfp->xnf_tx_pkt_id_head = tidp->next;
475 	tidp->next = INVALID_TX_ID;
476 
477 	ASSERT(tidp->txbuf == NULL);
478 
479 	return (tidp);
480 }
481 
482 /*
483  * Release a transmit id.
484  */
485 static void
486 xnf_txid_put(xnf_t *xnfp, xnf_txid_t *tidp)
487 {
488 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
489 	ASSERT(TX_ID_VALID(tidp->id));
490 	ASSERT(tidp->next == INVALID_TX_ID);
491 
492 	tidp->txbuf = NULL;
493 	tidp->next = xnfp->xnf_tx_pkt_id_head;
494 	xnfp->xnf_tx_pkt_id_head = tidp->id;
495 }
496 
497 static void
498 xnf_data_txbuf_free(xnf_t *xnfp, xnf_txbuf_t *txp)
499 {
500 	ASSERT3U(txp->tx_type, ==, TX_DATA);
501 
502 	/*
503 	 * We are either using a lookaside buffer or we are mapping existing
504 	 * buffers.
505 	 */
506 	if (txp->tx_bdesc != NULL) {
507 		ASSERT(!txp->tx_handle_bound);
508 		xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE);
509 	} else {
510 		if (txp->tx_txreq.gref != INVALID_GRANT_REF) {
511 			if (gnttab_query_foreign_access(txp->tx_txreq.gref) !=
512 			    0) {
513 				cmn_err(CE_PANIC, "tx grant %d still in use by "
514 				    "backend domain", txp->tx_txreq.gref);
515 			}
516 			(void) gnttab_end_foreign_access_ref(
517 			    txp->tx_txreq.gref, 1);
518 			xnf_gref_put(xnfp, txp->tx_txreq.gref);
519 		}
520 
521 		if (txp->tx_handle_bound)
522 			(void) ddi_dma_unbind_handle(txp->tx_dma_handle);
523 	}
524 
525 	if (txp->tx_mp != NULL)
526 		freemsg(txp->tx_mp);
527 
528 	if (txp->tx_prev != NULL) {
529 		ASSERT3P(txp->tx_prev->tx_next, ==, txp);
530 		txp->tx_prev->tx_next = NULL;
531 	}
532 
533 	if (txp->tx_txreq.id != INVALID_TX_ID) {
534 		/*
535 		 * This should only be possible when resuming from a suspend.
536 		 */
537 		ASSERT(!xnfp->xnf_connected);
538 		xnf_txid_put(xnfp, TX_ID_TO_TXID(xnfp, txp->tx_txreq.id));
539 		txp->tx_txreq.id = INVALID_TX_ID;
540 	}
541 
542 	kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
543 }
544 
545 static void
546 xnf_data_txbuf_free_chain(xnf_t *xnfp, xnf_txbuf_t *txp)
547 {
548 	if (txp == NULL)
549 		return;
550 
551 	while (txp->tx_next != NULL)
552 		txp = txp->tx_next;
553 
554 	/*
555 	 * We free the chain in reverse order so that grants can be released
556 	 * for all dma chunks before unbinding the dma handles. The mblk is
557 	 * freed last, after all its fragments' dma handles are unbound.
558 	 */
559 	xnf_txbuf_t *prev;
560 	for (; txp != NULL; txp = prev) {
561 		prev = txp->tx_prev;
562 		xnf_data_txbuf_free(xnfp, txp);
563 	}
564 }
565 
566 static xnf_txbuf_t *
567 xnf_data_txbuf_alloc(xnf_t *xnfp)
568 {
569 	xnf_txbuf_t *txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
570 	txp->tx_type = TX_DATA;
571 	txp->tx_next = NULL;
572 	txp->tx_prev = NULL;
573 	txp->tx_head = txp;
574 	txp->tx_frags_to_ack = 0;
575 	txp->tx_mp = NULL;
576 	txp->tx_bdesc = NULL;
577 	txp->tx_handle_bound = B_FALSE;
578 	txp->tx_txreq.gref = INVALID_GRANT_REF;
579 	txp->tx_txreq.id = INVALID_TX_ID;
580 
581 	return (txp);
582 }
583 
584 /*
585  * Get `wanted' slots in the transmit ring, waiting for at least that
586  * number if `wait' is B_TRUE. Force the ring to be cleaned by setting
587  * `wanted' to zero.
588  *
589  * Return the number of slots available.
590  */
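
/*
 * A typical caller pattern (a sketch; cf. xnf_send() below):
 *
 *	mutex_enter(&xnfp->xnf_txlock);
 *	if (xnf_tx_slots_get(xnfp, XEN_MAX_SLOTS_PER_TX, B_FALSE) <
 *	    XEN_MAX_SLOTS_PER_TX) {
 *		(not enough slots: record xnf_need_sched and bail)
 *	}
 */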
591 static int
592 xnf_tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait)
593 {
594 	int slotsfree;
595 	boolean_t forced_clean = (wanted == 0);
596 
597 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
598 
599 	/* LINTED: constant in conditional context */
600 	while (B_TRUE) {
601 		slotsfree = RING_FREE_REQUESTS(&xnfp->xnf_tx_ring);
602 
603 		if ((slotsfree < wanted) || forced_clean)
604 			slotsfree = xnf_tx_clean_ring(xnfp);
605 
606 		/*
607 		 * If there are more than we need free, tell other
608 		 * people to come looking again. We hold txlock, so we
609 		 * are able to take our slots before anyone else runs.
610 		 */
611 		if (slotsfree > wanted)
612 			cv_broadcast(&xnfp->xnf_cv_tx_slots);
613 
614 		if (slotsfree >= wanted)
615 			break;
616 
617 		if (!wait)
618 			break;
619 
620 		cv_wait(&xnfp->xnf_cv_tx_slots, &xnfp->xnf_txlock);
621 	}
622 
623 	ASSERT(slotsfree <= RING_SIZE(&(xnfp->xnf_tx_ring)));
624 
625 	return (slotsfree);
626 }
627 
628 static int
629 xnf_setup_rings(xnf_t *xnfp)
630 {
631 	domid_t			oeid;
632 	struct xenbus_device	*xsd;
633 	RING_IDX		i;
634 	int			err;
635 	xnf_txid_t		*tidp;
636 	xnf_buf_t **bdescp;
637 
638 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
639 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
640 
641 	if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
642 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
643 
644 	err = gnttab_grant_foreign_access(oeid,
645 	    xnf_btop(pa_to_ma(xnfp->xnf_tx_ring_phys_addr)), 0);
646 	if (err <= 0) {
647 		err = -err;
648 		xenbus_dev_error(xsd, err, "granting access to tx ring page");
649 		goto out;
650 	}
651 	xnfp->xnf_tx_ring_ref = (grant_ref_t)err;
652 
653 	if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
654 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
655 
656 	err = gnttab_grant_foreign_access(oeid,
657 	    xnf_btop(pa_to_ma(xnfp->xnf_rx_ring_phys_addr)), 0);
658 	if (err <= 0) {
659 		err = -err;
660 		xenbus_dev_error(xsd, err, "granting access to rx ring page");
661 		goto out;
662 	}
663 	xnfp->xnf_rx_ring_ref = (grant_ref_t)err;
664 
665 	mutex_enter(&xnfp->xnf_txlock);
666 
667 	/*
668 	 * We first clean up the TX ring in case we are doing a resume.
669 	 * Note that this can lose packets, but we expect to stagger on.
670 	 */
671 	xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. empty list. */
672 	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
673 	    i < NET_TX_RING_SIZE;
674 	    i++, tidp++) {
675 		xnf_txbuf_t *txp = tidp->txbuf;
676 		if (txp == NULL)
677 			continue;
678 
679 		switch (txp->tx_type) {
680 		case TX_DATA:
681 			/*
682 			 * txid_put() will be called for each txbuf's txid in
683 			 * the chain which will result in clearing tidp->txbuf.
684 			 */
685 			xnf_data_txbuf_free_chain(xnfp, txp);
686 
687 			break;
688 
689 		case TX_MCAST_REQ:
690 			txp->tx_type = TX_MCAST_RSP;
691 			txp->tx_status = NETIF_RSP_DROPPED;
692 			cv_broadcast(&xnfp->xnf_cv_multicast);
693 
694 			/*
695 			 * The request consumed two slots in the ring,
696 			 * yet only a single xnf_txid_t is used. Step
697 			 * over the empty slot.
698 			 */
699 			i++;
700 			ASSERT3U(i, <, NET_TX_RING_SIZE);
701 			break;
702 
703 		case TX_MCAST_RSP:
704 			break;
705 		}
706 	}
707 
708 	/*
709 	 * Now purge old list and add each txid to the new free list.
710 	 */
711 	xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. empty list. */
712 	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
713 	    i < NET_TX_RING_SIZE;
714 	    i++, tidp++) {
715 		tidp->id = i;
716 		ASSERT3P(tidp->txbuf, ==, NULL);
717 		tidp->next = INVALID_TX_ID; /* Appease txid_put(). */
718 		xnf_txid_put(xnfp, tidp);
719 	}
720 
721 	/* LINTED: constant in conditional context */
722 	SHARED_RING_INIT(xnfp->xnf_tx_ring.sring);
723 	/* LINTED: constant in conditional context */
724 	FRONT_RING_INIT(&xnfp->xnf_tx_ring,
725 	    xnfp->xnf_tx_ring.sring, PAGESIZE);
726 
727 	mutex_exit(&xnfp->xnf_txlock);
728 
729 	mutex_enter(&xnfp->xnf_rxlock);
730 
731 	/*
732 	 * Clean out any buffers currently posted to the receive ring
733 	 * before we reset it.
734 	 */
735 	for (i = 0, bdescp = &xnfp->xnf_rx_pkt_info[0];
736 	    i < NET_RX_RING_SIZE;
737 	    i++, bdescp++) {
738 		if (*bdescp != NULL) {
739 			xnf_buf_put(xnfp, *bdescp, B_FALSE);
740 			*bdescp = NULL;
741 		}
742 	}
743 
744 	/* LINTED: constant in conditional context */
745 	SHARED_RING_INIT(xnfp->xnf_rx_ring.sring);
746 	/* LINTED: constant in conditional context */
747 	FRONT_RING_INIT(&xnfp->xnf_rx_ring,
748 	    xnfp->xnf_rx_ring.sring, PAGESIZE);
749 
750 	/*
751 	 * Fill the ring with buffers.
752 	 */
753 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
754 		xnf_buf_t *bdesc;
755 
756 		bdesc = xnf_buf_get(xnfp, KM_SLEEP, B_FALSE);
757 		VERIFY(bdesc != NULL);
758 		xnf_rxbuf_hang(xnfp, bdesc);
759 	}
760 
761 	/* LINTED: constant in conditional context */
762 	RING_PUSH_REQUESTS(&xnfp->xnf_rx_ring);
763 
764 	mutex_exit(&xnfp->xnf_rxlock);
765 
766 	return (0);
767 
768 out:
769 	if (xnfp->xnf_tx_ring_ref != INVALID_GRANT_REF)
770 		gnttab_end_foreign_access(xnfp->xnf_tx_ring_ref, 0, 0);
771 	xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
772 
773 	if (xnfp->xnf_rx_ring_ref != INVALID_GRANT_REF)
774 		gnttab_end_foreign_access(xnfp->xnf_rx_ring_ref, 0, 0);
775 	xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
776 
777 	return (err);
778 }
779 
780 /*
781  * Connect driver to back end, called to set up communication with
782  * back end driver both initially and on resume after restore/migrate.
783  */
784 void
785 xnf_be_connect(xnf_t *xnfp)
786 {
787 	const char	*message;
788 	xenbus_transaction_t xbt;
789 	struct		xenbus_device *xsd;
790 	char		*xsname;
791 	int		err;
792 
793 	ASSERT(!xnfp->xnf_connected);
794 
795 	xsd = xvdi_get_xsd(xnfp->xnf_devinfo);
796 	xsname = xvdi_get_xsname(xnfp->xnf_devinfo);
797 
798 	err = xnf_setup_rings(xnfp);
799 	if (err != 0) {
800 		cmn_err(CE_WARN, "failed to set up tx/rx rings");
801 		xenbus_dev_error(xsd, err, "setting up ring");
802 		return;
803 	}
804 
805 again:
806 	err = xenbus_transaction_start(&xbt);
807 	if (err != 0) {
808 		xenbus_dev_error(xsd, EIO, "starting transaction");
809 		return;
810 	}
811 
812 	err = xenbus_printf(xbt, xsname, "tx-ring-ref", "%u",
813 	    xnfp->xnf_tx_ring_ref);
814 	if (err != 0) {
815 		message = "writing tx ring-ref";
816 		goto abort_transaction;
817 	}
818 
819 	err = xenbus_printf(xbt, xsname, "rx-ring-ref", "%u",
820 	    xnfp->xnf_rx_ring_ref);
821 	if (err != 0) {
822 		message = "writing rx ring-ref";
823 		goto abort_transaction;
824 	}
825 
826 	err = xenbus_printf(xbt, xsname, "event-channel", "%u",
827 	    xnfp->xnf_evtchn);
828 	if (err != 0) {
829 		message = "writing event-channel";
830 		goto abort_transaction;
831 	}
832 
833 	err = xenbus_printf(xbt, xsname, "feature-rx-notify", "%d", 1);
834 	if (err != 0) {
835 		message = "writing feature-rx-notify";
836 		goto abort_transaction;
837 	}
838 
839 	err = xenbus_printf(xbt, xsname, "request-rx-copy", "%d", 1);
840 	if (err != 0) {
841 		message = "writing request-rx-copy";
842 		goto abort_transaction;
843 	}
844 
845 	if (xnfp->xnf_be_mcast_control) {
846 		err = xenbus_printf(xbt, xsname, "request-multicast-control",
847 		    "%d", 1);
848 		if (err != 0) {
849 			message = "writing request-multicast-control";
850 			goto abort_transaction;
851 		}
852 	}
853 
854 	/*
855 	 * Tell backend if we support scatter-gather lists on the rx side.
856 	 */
857 	err = xenbus_printf(xbt, xsname, "feature-sg", "%d",
858 	    xnf_enable_rx_sg ? 1 : 0);
859 	if (err != 0) {
860 		message = "writing feature-sg";
861 		goto abort_transaction;
862 	}
863 
864 	/*
865 	 * Tell backend if we support LRO for IPv4. Scatter-gather on rx is
866 	 * a prerequisite.
867 	 */
868 	err = xenbus_printf(xbt, xsname, "feature-gso-tcpv4", "%d",
869 	    (xnf_enable_rx_sg && xnf_enable_lro) ? 1 : 0);
870 	if (err != 0) {
871 		message = "writing feature-gso-tcpv4";
872 		goto abort_transaction;
873 	}
874 
875 	err = xvdi_switch_state(xnfp->xnf_devinfo, xbt, XenbusStateConnected);
876 	if (err != 0) {
877 		message = "switching state to XenbusStateConnected";
878 		goto abort_transaction;
879 	}
880 
881 	err = xenbus_transaction_end(xbt, 0);
882 	if (err != 0) {
883 		if (err == EAGAIN)
884 			goto again;
885 		xenbus_dev_error(xsd, err, "completing transaction");
886 	}
887 
888 	return;
889 
890 abort_transaction:
891 	(void) xenbus_transaction_end(xbt, 1);
892 	xenbus_dev_error(xsd, err, "%s", message);
893 }
894 
895 /*
896  * Read configuration information from xenstore.
897  */
898 void
899 xnf_read_config(xnf_t *xnfp)
900 {
901 	int err, be_cap;
902 	char mac[ETHERADDRL * 3];
903 	char *oename = xvdi_get_oename(xnfp->xnf_devinfo);
904 
905 	err = xenbus_scanf(XBT_NULL, oename, "mac",
906 	    "%s", (char *)&mac[0]);
907 	if (err != 0) {
908 		/*
909 		 * Bad: we're supposed to have been set up with a proper
910 		 * MAC address by this point.
911 		 */
912 		cmn_err(CE_WARN, "%s%d: no mac address",
913 		    ddi_driver_name(xnfp->xnf_devinfo),
914 		    ddi_get_instance(xnfp->xnf_devinfo));
915 		return;
916 	}
917 	if (ether_aton(mac, xnfp->xnf_mac_addr) != ETHERADDRL) {
918 		err = ENOENT;
919 		xenbus_dev_error(xvdi_get_xsd(xnfp->xnf_devinfo), ENOENT,
920 		    "parsing %s/mac", xvdi_get_xsname(xnfp->xnf_devinfo));
921 		return;
922 	}
923 
924 	err = xenbus_scanf(XBT_NULL, oename,
925 	    "feature-rx-copy", "%d", &be_cap);
926 	/*
927 	 * If we fail to read the store we assume that the key is
928 	 * absent, implying an older domain at the far end.  Older
929 	 * domains cannot do HV copy.
930 	 */
931 	if (err != 0)
932 		be_cap = 0;
933 	xnfp->xnf_be_rx_copy = (be_cap != 0);
934 
935 	err = xenbus_scanf(XBT_NULL, oename,
936 	    "feature-multicast-control", "%d", &be_cap);
937 	/*
938 	 * If we fail to read the store we assume that the key is
939 	 * absent, implying an older domain at the far end.  Older
940 	 * domains do not support multicast control.
941 	 */
942 	if (err != 0)
943 		be_cap = 0;
944 	xnfp->xnf_be_mcast_control = (be_cap != 0) && xnf_multicast_control;
945 
946 	/*
947 	 * See if back-end supports scatter-gather for transmits. If not,
948 	 * we will not support LSO and limit the mtu to 1500.
949 	 */
950 	err = xenbus_scanf(XBT_NULL, oename, "feature-sg", "%d", &be_cap);
951 	if (err != 0) {
952 		be_cap = 0;
953 		dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading "
954 		    "'feature-sg' from backend driver");
955 	}
956 	if (be_cap == 0) {
957 		dev_err(xnfp->xnf_devinfo, CE_WARN, "scatter-gather is not "
958 		    "supported for transmits in the backend driver. LSO is "
959 		    "disabled and MTU is restricted to 1500 bytes.");
960 	}
961 	xnfp->xnf_be_tx_sg = (be_cap != 0) && xnf_enable_tx_sg;
962 
963 	if (xnfp->xnf_be_tx_sg) {
964 		/*
965 		 * Check if LSO is supported. Currently we only check for
966 		 * IPv4 as Illumos doesn't support LSO for IPv6.
967 		 */
968 		err = xenbus_scanf(XBT_NULL, oename, "feature-gso-tcpv4", "%d",
969 		    &be_cap);
970 		if (err != 0) {
971 			be_cap = 0;
972 			dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading "
973 			    "'feature-gso-tcpv4' from backend driver");
974 		}
975 		if (be_cap == 0) {
976 			dev_err(xnfp->xnf_devinfo, CE_WARN, "LSO is not "
977 			    "supported by the backend driver. Performance "
978 			    "will be affected.");
979 		}
980 		xnfp->xnf_be_lso = (be_cap != 0) && xnf_enable_lso;
981 	}
982 }
983 
984 /*
985  *  attach(9E) -- Attach a device to the system
986  */
987 static int
988 xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
989 {
990 	mac_register_t *macp;
991 	xnf_t *xnfp;
992 	int err;
993 	char cachename[32];
994 
995 #ifdef XNF_DEBUG
996 	if (xnf_debug & XNF_DEBUG_DDI)
997 		printf("xnf%d: attach(0x%p)\n", ddi_get_instance(devinfo),
998 		    (void *)devinfo);
999 #endif
1000 
1001 	switch (cmd) {
1002 	case DDI_RESUME:
1003 		xnfp = ddi_get_driver_private(devinfo);
1004 		xnfp->xnf_gen++;
1005 
1006 		(void) xvdi_resume(devinfo);
1007 		(void) xvdi_alloc_evtchn(devinfo);
1008 		xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
1009 #ifdef XPV_HVM_DRIVER
1010 		ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr,
1011 		    xnfp);
1012 #else
1013 		(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr,
1014 		    (caddr_t)xnfp);
1015 #endif
1016 		return (DDI_SUCCESS);
1017 
1018 	case DDI_ATTACH:
1019 		break;
1020 
1021 	default:
1022 		return (DDI_FAILURE);
1023 	}
1024 
1025 	/*
1026 	 *  Allocate the mac_register_t and xnf_t instance structures
1027 	 */
1028 	macp = mac_alloc(MAC_VERSION);
1029 	if (macp == NULL)
1030 		return (DDI_FAILURE);
1031 	xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);
1032 
1033 	xnfp->xnf_tx_pkt_id =
1034 	    kmem_zalloc(sizeof (xnf_txid_t) * NET_TX_RING_SIZE, KM_SLEEP);
1035 
1036 	xnfp->xnf_rx_pkt_info =
1037 	    kmem_zalloc(sizeof (xnf_buf_t *) * NET_RX_RING_SIZE, KM_SLEEP);
1038 
1039 	macp->m_dip = devinfo;
1040 	macp->m_driver = xnfp;
1041 	xnfp->xnf_devinfo = devinfo;
1042 
1043 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1044 	macp->m_src_addr = xnfp->xnf_mac_addr;
1045 	macp->m_callbacks = &xnf_callbacks;
1046 	macp->m_min_sdu = 0;
1047 	xnfp->xnf_mtu = ETHERMTU;
1048 	macp->m_max_sdu = xnfp->xnf_mtu;
1049 
1050 	xnfp->xnf_running = B_FALSE;
1051 	xnfp->xnf_connected = B_FALSE;
1052 	xnfp->xnf_be_rx_copy = B_FALSE;
1053 	xnfp->xnf_be_mcast_control = B_FALSE;
1054 	xnfp->xnf_need_sched = B_FALSE;
1055 
1056 	xnfp->xnf_rx_head = NULL;
1057 	xnfp->xnf_rx_tail = NULL;
1058 	xnfp->xnf_rx_new_buffers_posted = B_FALSE;
1059 
1060 #ifdef XPV_HVM_DRIVER
1061 	/* Report our version to dom0 */
1062 	(void) xenbus_printf(XBT_NULL, "guest/xnf", "version", "%d",
1063 	    HVMPV_XNF_VERS);
1064 #endif
1065 
1066 	/*
1067 	 * Get the iblock cookie with which to initialize the mutexes.
1068 	 */
1069 	if (ddi_get_iblock_cookie(devinfo, 0, &xnfp->xnf_icookie)
1070 	    != DDI_SUCCESS)
1071 		goto failure;
1072 
1073 	mutex_init(&xnfp->xnf_txlock,
1074 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1075 	mutex_init(&xnfp->xnf_rxlock,
1076 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1077 	mutex_init(&xnfp->xnf_schedlock,
1078 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1079 	mutex_init(&xnfp->xnf_gref_lock,
1080 	    NULL, MUTEX_DRIVER, xnfp->xnf_icookie);
1081 
1082 	cv_init(&xnfp->xnf_cv_state, NULL, CV_DEFAULT, NULL);
1083 	cv_init(&xnfp->xnf_cv_multicast, NULL, CV_DEFAULT, NULL);
1084 	cv_init(&xnfp->xnf_cv_tx_slots, NULL, CV_DEFAULT, NULL);
1085 
1086 	(void) sprintf(cachename, "xnf_buf_cache_%d",
1087 	    ddi_get_instance(devinfo));
1088 	xnfp->xnf_buf_cache = kmem_cache_create(cachename,
1089 	    sizeof (xnf_buf_t), 0,
1090 	    xnf_buf_constructor, xnf_buf_destructor,
1091 	    NULL, xnfp, NULL, 0);
1092 	if (xnfp->xnf_buf_cache == NULL)
1093 		goto failure_0;
1094 
1095 	(void) sprintf(cachename, "xnf_tx_buf_cache_%d",
1096 	    ddi_get_instance(devinfo));
1097 	xnfp->xnf_tx_buf_cache = kmem_cache_create(cachename,
1098 	    sizeof (xnf_txbuf_t), 0,
1099 	    xnf_tx_buf_constructor, xnf_tx_buf_destructor,
1100 	    NULL, xnfp, NULL, 0);
1101 	if (xnfp->xnf_tx_buf_cache == NULL)
1102 		goto failure_1;
1103 
1104 	xnfp->xnf_gref_head = INVALID_GRANT_REF;
1105 
1106 	if (xnf_alloc_dma_resources(xnfp) == DDI_FAILURE) {
1107 		cmn_err(CE_WARN, "xnf%d: failed to allocate and initialize "
1108 		    "driver data structures",
1109 		    ddi_get_instance(xnfp->xnf_devinfo));
1110 		goto failure_2;
1111 	}
1112 
1113 	xnfp->xnf_rx_ring.sring->rsp_event =
1114 	    xnfp->xnf_tx_ring.sring->rsp_event = 1;
1115 
1116 	xnfp->xnf_tx_ring_ref = INVALID_GRANT_REF;
1117 	xnfp->xnf_rx_ring_ref = INVALID_GRANT_REF;
1118 
1119 	/* set driver private pointer now */
1120 	ddi_set_driver_private(devinfo, xnfp);
1121 
1122 	if (!xnf_kstat_init(xnfp))
1123 		goto failure_3;
1124 
1125 	/*
1126 	 * Allocate an event channel, add the interrupt handler and
1127 	 * bind it to the event channel.
1128 	 */
1129 	(void) xvdi_alloc_evtchn(devinfo);
1130 	xnfp->xnf_evtchn = xvdi_get_evtchn(devinfo);
1131 #ifdef XPV_HVM_DRIVER
1132 	ec_bind_evtchn_to_handler(xnfp->xnf_evtchn, IPL_VIF, xnf_intr, xnfp);
1133 #else
1134 	(void) ddi_add_intr(devinfo, 0, NULL, NULL, xnf_intr, (caddr_t)xnfp);
1135 #endif
1136 
1137 	err = mac_register(macp, &xnfp->xnf_mh);
1138 	mac_free(macp);
1139 	macp = NULL;
1140 	if (err != 0)
1141 		goto failure_4;
1142 
1143 	if (xvdi_add_event_handler(devinfo, XS_OE_STATE, oe_state_change, NULL)
1144 	    != DDI_SUCCESS)
1145 		goto failure_5;
1146 
1147 #ifdef XPV_HVM_DRIVER
1148 	/*
1149 	 * In the HVM case, this driver essentially replaces a driver for
1150 	 * a 'real' PCI NIC. Without the "model" property set to
1151 	 * "Ethernet controller", like the PCI code does, netbooting does
1152 	 * not work correctly, as strplumb_get_netdev_path() will not find
1153 	 * this interface.
1154 	 */
1155 	(void) ndi_prop_update_string(DDI_DEV_T_NONE, devinfo, "model",
1156 	    "Ethernet controller");
1157 #endif
1158 
1159 #ifdef XNF_DEBUG
1160 	if (xnf_debug_instance == NULL)
1161 		xnf_debug_instance = xnfp;
1162 #endif
1163 
1164 	return (DDI_SUCCESS);
1165 
1166 failure_5:
1167 	(void) mac_unregister(xnfp->xnf_mh);
1168 
1169 failure_4:
1170 #ifdef XPV_HVM_DRIVER
1171 	ec_unbind_evtchn(xnfp->xnf_evtchn);
1172 	xvdi_free_evtchn(devinfo);
1173 #else
1174 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1175 #endif
1176 	xnfp->xnf_evtchn = INVALID_EVTCHN;
1177 	kstat_delete(xnfp->xnf_kstat_aux);
1178 
1179 failure_3:
1180 	xnf_release_dma_resources(xnfp);
1181 
1182 failure_2:
1183 	kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
1184 
1185 failure_1:
1186 	kmem_cache_destroy(xnfp->xnf_buf_cache);
1187 
1188 failure_0:
1189 	cv_destroy(&xnfp->xnf_cv_tx_slots);
1190 	cv_destroy(&xnfp->xnf_cv_multicast);
1191 	cv_destroy(&xnfp->xnf_cv_state);
1192 
1193 	mutex_destroy(&xnfp->xnf_gref_lock);
1194 	mutex_destroy(&xnfp->xnf_schedlock);
1195 	mutex_destroy(&xnfp->xnf_rxlock);
1196 	mutex_destroy(&xnfp->xnf_txlock);
1197 
1198 failure:
1199 	kmem_free(xnfp, sizeof (*xnfp));
1200 	if (macp != NULL)
1201 		mac_free(macp);
1202 
1203 	return (DDI_FAILURE);
1204 }
1205 
1206 /*  detach(9E) -- Detach a device from the system */
1207 static int
1208 xnf_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
1209 {
1210 	xnf_t *xnfp;		/* Our private device info */
1211 
1212 #ifdef XNF_DEBUG
1213 	if (xnf_debug & XNF_DEBUG_DDI)
1214 		printf("xnf_detach(0x%p)\n", (void *)devinfo);
1215 #endif
1216 
1217 	xnfp = ddi_get_driver_private(devinfo);
1218 
1219 	switch (cmd) {
1220 	case DDI_SUSPEND:
1221 #ifdef XPV_HVM_DRIVER
1222 		ec_unbind_evtchn(xnfp->xnf_evtchn);
1223 		xvdi_free_evtchn(devinfo);
1224 #else
1225 		ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1226 #endif
1227 
1228 		xvdi_suspend(devinfo);
1229 
1230 		mutex_enter(&xnfp->xnf_rxlock);
1231 		mutex_enter(&xnfp->xnf_txlock);
1232 
1233 		xnfp->xnf_evtchn = INVALID_EVTCHN;
1234 		xnfp->xnf_connected = B_FALSE;
1235 		mutex_exit(&xnfp->xnf_txlock);
1236 		mutex_exit(&xnfp->xnf_rxlock);
1237 
1238 		/* claim link to be down after disconnect */
1239 		mac_link_update(xnfp->xnf_mh, LINK_STATE_DOWN);
1240 		return (DDI_SUCCESS);
1241 
1242 	case DDI_DETACH:
1243 		break;
1244 
1245 	default:
1246 		return (DDI_FAILURE);
1247 	}
1248 
1249 	if (xnfp->xnf_connected)
1250 		return (DDI_FAILURE);
1251 
1252 	/*
1253 	 * Cannot detach if we have xnf_buf_t buffers outstanding.
1254 	 */
1255 	if (xnfp->xnf_stat_buf_allocated > 0)
1256 		return (DDI_FAILURE);
1257 
1258 	if (mac_unregister(xnfp->xnf_mh) != 0)
1259 		return (DDI_FAILURE);
1260 
1261 	kstat_delete(xnfp->xnf_kstat_aux);
1262 
1263 	/* Stop the receiver */
1264 	xnf_stop(xnfp);
1265 
1266 	xvdi_remove_event_handler(devinfo, XS_OE_STATE);
1267 
1268 	/* Remove the interrupt */
1269 #ifdef XPV_HVM_DRIVER
1270 	ec_unbind_evtchn(xnfp->xnf_evtchn);
1271 	xvdi_free_evtchn(devinfo);
1272 #else
1273 	ddi_remove_intr(devinfo, 0, xnfp->xnf_icookie);
1274 #endif
1275 
1276 	/* Release any pending xmit mblks */
1277 	xnf_release_mblks(xnfp);
1278 
1279 	/* Release all DMA resources */
1280 	xnf_release_dma_resources(xnfp);
1281 
1282 	cv_destroy(&xnfp->xnf_cv_tx_slots);
1283 	cv_destroy(&xnfp->xnf_cv_multicast);
1284 	cv_destroy(&xnfp->xnf_cv_state);
1285 
1286 	kmem_cache_destroy(xnfp->xnf_tx_buf_cache);
1287 	kmem_cache_destroy(xnfp->xnf_buf_cache);
1288 
1289 	mutex_destroy(&xnfp->xnf_gref_lock);
1290 	mutex_destroy(&xnfp->xnf_schedlock);
1291 	mutex_destroy(&xnfp->xnf_rxlock);
1292 	mutex_destroy(&xnfp->xnf_txlock);
1293 
1294 	kmem_free(xnfp, sizeof (*xnfp));
1295 
1296 	return (DDI_SUCCESS);
1297 }
1298 
1299 /*
1300  *  xnf_set_mac_addr() -- set the physical network address on the board.
1301  */
1302 static int
1303 xnf_set_mac_addr(void *arg, const uint8_t *macaddr)
1304 {
1305 	_NOTE(ARGUNUSED(arg, macaddr));
1306 
1307 	/*
1308 	 * We can't set our macaddr.
1309 	 */
1310 	return (ENOTSUP);
1311 }
1312 
1313 /*
1314  *  xnf_set_multicast() -- set (enable) or disable a multicast address.
1315  *
1316  *  Program the hardware to enable/disable the multicast address
1317  *  in "mca".  Enable if "add" is true, disable if false.
1318  */
1319 static int
1320 xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
1321 {
1322 	xnf_t *xnfp = arg;
1323 	xnf_txbuf_t *txp;
1324 	int n_slots;
1325 	RING_IDX slot;
1326 	xnf_txid_t *tidp;
1327 	netif_tx_request_t *txrp;
1328 	struct netif_extra_info *erp;
1329 	boolean_t notify, result;
1330 
1331 	/*
1332 	 * If the backend does not support multicast control then we
1333 	 * must assume that the right packets will just arrive.
1334 	 */
1335 	if (!xnfp->xnf_be_mcast_control)
1336 		return (0);
1337 
1338 	txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
1339 
1340 	mutex_enter(&xnfp->xnf_txlock);
1341 
1342 	/*
1343 	 * If we're not yet connected then claim success. This is
1344 	 * acceptable because we refresh the entire set of multicast
1345 	 * addresses when we get connected.
1346 	 *
1347 	 * We can't wait around here because the MAC layer expects
1348 	 * this to be a non-blocking operation - waiting ends up
1349 	 * causing a deadlock during resume.
1350 	 */
1351 	if (!xnfp->xnf_connected) {
1352 		mutex_exit(&xnfp->xnf_txlock);
1353 		return (0);
1354 	}
1355 
1356 	/*
1357 	 * 1. Acquire two slots in the ring.
1358 	 * 2. Fill in the slots.
1359 	 * 3. Request notification when the operation is done.
1360 	 * 4. Kick the peer.
1361 	 * 5. Wait for the response via xnf_tx_clean_ring().
1362 	 */
1363 
1364 	n_slots = xnf_tx_slots_get(xnfp, 2, B_TRUE);
1365 	ASSERT(n_slots >= 2);
1366 
1367 	slot = xnfp->xnf_tx_ring.req_prod_pvt;
1368 	tidp = xnf_txid_get(xnfp);
1369 	VERIFY(tidp != NULL);
1370 
1371 	txp->tx_type = TX_MCAST_REQ;
1372 	txp->tx_slot = slot;
1373 
1374 	txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1375 	erp = (struct netif_extra_info *)
1376 	    RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot + 1);
1377 
1378 	txrp->gref = 0;
1379 	txrp->size = 0;
1380 	txrp->offset = 0;
1381 	/* Set tx_txreq.id to appease xnf_tx_clean_ring(). */
1382 	txrp->id = txp->tx_txreq.id = tidp->id;
1383 	txrp->flags = NETTXF_extra_info;
1384 
1385 	erp->type = add ? XEN_NETIF_EXTRA_TYPE_MCAST_ADD :
1386 	    XEN_NETIF_EXTRA_TYPE_MCAST_DEL;
1387 	bcopy((void *)mca, &erp->u.mcast.addr, ETHERADDRL);
1388 
1389 	tidp->txbuf = txp;
1390 
1391 	xnfp->xnf_tx_ring.req_prod_pvt = slot + 2;
1392 
1393 	mutex_enter(&xnfp->xnf_schedlock);
1394 	xnfp->xnf_pending_multicast++;
1395 	mutex_exit(&xnfp->xnf_schedlock);
1396 
1397 	/* LINTED: constant in conditional context */
1398 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
1399 	    notify);
1400 	if (notify)
1401 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
1402 
1403 	while (txp->tx_type == TX_MCAST_REQ)
1404 		cv_wait(&xnfp->xnf_cv_multicast, &xnfp->xnf_txlock);
1405 
1406 	ASSERT3U(txp->tx_type, ==, TX_MCAST_RSP);
1407 
1408 	mutex_enter(&xnfp->xnf_schedlock);
1409 	xnfp->xnf_pending_multicast--;
1410 	mutex_exit(&xnfp->xnf_schedlock);
1411 
1412 	result = (txp->tx_status == NETIF_RSP_OKAY);
1413 
1414 	xnf_txid_put(xnfp, tidp);
1415 
1416 	mutex_exit(&xnfp->xnf_txlock);
1417 
1418 	kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
1419 
1420 	return (result ? 0 : 1);
1421 }
1422 
1423 /*
1424  * xnf_set_promiscuous() -- set or reset promiscuous mode on the board
1425  *
1426  *  Program the hardware to enable/disable promiscuous mode.
1427  */
1428 static int
1429 xnf_set_promiscuous(void *arg, boolean_t on)
1430 {
1431 	_NOTE(ARGUNUSED(arg, on));
1432 
1433 	/*
1434 	 * We can't really do this, but we pretend that we can in
1435 	 * order that snoop will work.
1436 	 */
1437 	return (0);
1438 }
1439 
1440 /*
1441  * Clean buffers that we have responses for from the transmit ring.
1442  */
1443 static int
1444 xnf_tx_clean_ring(xnf_t *xnfp)
1445 {
1446 	boolean_t work_to_do;
1447 
1448 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1449 
1450 loop:
1451 	while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_tx_ring)) {
1452 		RING_IDX cons, prod, i;
1453 
1454 		cons = xnfp->xnf_tx_ring.rsp_cons;
1455 		prod = xnfp->xnf_tx_ring.sring->rsp_prod;
1456 		membar_consumer();
1457 		/*
1458 		 * Clean tx requests from ring that we have responses
1459 		 * for.
1460 		 */
1461 		DTRACE_PROBE2(xnf_tx_clean_range, int, cons, int, prod);
1462 		for (i = cons; i != prod; i++) {
1463 			netif_tx_response_t *trp;
1464 			xnf_txid_t *tidp;
1465 			xnf_txbuf_t *txp;
1466 
1467 			trp = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i);
1468 			/*
1469 			 * if this slot was occupied by netif_extra_info_t,
1470 			 * then the response will be NETIF_RSP_NULL. In this
1471 			 * case there are no resources to clean up.
1472 			 */
1473 			if (trp->status == NETIF_RSP_NULL)
1474 				continue;
1475 
1476 			ASSERT(TX_ID_VALID(trp->id));
1477 
1478 			tidp = TX_ID_TO_TXID(xnfp, trp->id);
1479 			ASSERT3U(tidp->id, ==, trp->id);
1480 			ASSERT3U(tidp->next, ==, INVALID_TX_ID);
1481 
1482 			txp = tidp->txbuf;
1483 			ASSERT(txp != NULL);
1484 			ASSERT3U(txp->tx_txreq.id, ==, trp->id);
1485 
1486 			switch (txp->tx_type) {
1487 			case TX_DATA:
1488 				/*
1489 				 * We must put the txid for each response we
1490 				 * acknowledge to make sure that we never have
1491 				 * more free slots than txids. Because of this
1492 				 * we do it here instead of waiting for it to
1493 				 * be done in xnf_data_txbuf_free_chain().
1494 				 */
1495 				xnf_txid_put(xnfp, tidp);
1496 				txp->tx_txreq.id = INVALID_TX_ID;
1497 				ASSERT3S(txp->tx_head->tx_frags_to_ack, >, 0);
1498 				txp->tx_head->tx_frags_to_ack--;
1499 
1500 				/*
1501 				 * We clean the whole chain once we have a
1502 				 * response for each fragment.
1503 				 */
1504 				if (txp->tx_head->tx_frags_to_ack == 0)
1505 					xnf_data_txbuf_free_chain(xnfp, txp);
1506 
1507 				break;
1508 
1509 			case TX_MCAST_REQ:
1510 				txp->tx_type = TX_MCAST_RSP;
1511 				txp->tx_status = trp->status;
1512 				cv_broadcast(&xnfp->xnf_cv_multicast);
1513 
1514 				break;
1515 
1516 			default:
1517 				cmn_err(CE_PANIC, "xnf_tx_clean_ring: "
1518 				    "invalid xnf_txbuf_t type: %d",
1519 				    txp->tx_type);
1520 				break;
1521 			}
1522 		}
1523 		/*
1524 		 * Record the last response we dealt with so that we
1525 		 * know where to start next time around.
1526 		 */
1527 		xnfp->xnf_tx_ring.rsp_cons = prod;
1528 		membar_enter();
1529 	}
1530 
1531 	/* LINTED: constant in conditional context */
1532 	RING_FINAL_CHECK_FOR_RESPONSES(&xnfp->xnf_tx_ring, work_to_do);
1533 	if (work_to_do)
1534 		goto loop;
1535 
1536 	return (RING_FREE_REQUESTS(&xnfp->xnf_tx_ring));
1537 }
1538 
1539 /*
1540  * Allocate and fill in a look-aside buffer for the packet `mp'. Used
1541  * to ensure that the packet is physically contiguous and contained
1542  * within a single page.
1543  */
1544 static xnf_buf_t *
1545 xnf_tx_get_lookaside(xnf_t *xnfp, mblk_t *mp, size_t *plen)
1546 {
1547 	xnf_buf_t *bd;
1548 	caddr_t bp;
1549 
1550 	bd = xnf_buf_get(xnfp, KM_SLEEP, B_TRUE);
1551 	if (bd == NULL)
1552 		return (NULL);
1553 
1554 	bp = bd->buf;
1555 	while (mp != NULL) {
1556 		size_t len = MBLKL(mp);
1557 
1558 		bcopy(mp->b_rptr, bp, len);
1559 		bp += len;
1560 
1561 		mp = mp->b_cont;
1562 	}
1563 
1564 	*plen = bp - bd->buf;
1565 	ASSERT3U(*plen, <=, PAGESIZE);
1566 
1567 	xnfp->xnf_stat_tx_lookaside++;
1568 
1569 	return (bd);
1570 }
1571 
1572 /*
1573  * Insert the pseudo-header checksum into the packet.
1574  * Assumes packet is IPv4, TCP/UDP since we only advertised support for
1575  * HCKSUM_INET_FULL_V4.
1576  */
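
/*
 * The partial sum folded into the L4 checksum field below is the
 * standard TCP/UDP pseudo-header: source address, destination address,
 * protocol (contributed by IP_TCP_CSUM_COMP or IP_UDP_CSUM_COMP) and
 * L4 length, with the usual end-around carry folding. The payload
 * portion of the checksum is left for the peer to complete (see
 * NETTXF_csum_blank in xnf_tx_setup_offload()).
 */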
1577 int
1578 xnf_pseudo_cksum(mblk_t *mp)
1579 {
1580 	struct ether_header *ehp;
1581 	uint16_t sap, iplen, *stuff;
1582 	uint32_t cksum;
1583 	size_t len;
1584 	ipha_t *ipha;
1585 	ipaddr_t src, dst;
1586 	uchar_t *ptr;
1587 
1588 	ptr = mp->b_rptr;
1589 	len = MBLKL(mp);
1590 
1591 	/* Each header must fit completely in an mblk. */
1592 	ASSERT3U(len, >=, sizeof (*ehp));
1593 
1594 	ehp = (struct ether_header *)ptr;
1595 
1596 	if (ntohs(ehp->ether_type) == VLAN_TPID) {
1597 		struct ether_vlan_header *evhp;
1598 		ASSERT3U(len, >=, sizeof (*evhp));
1599 		evhp = (struct ether_vlan_header *)ptr;
1600 		sap = ntohs(evhp->ether_type);
1601 		ptr += sizeof (*evhp);
1602 		len -= sizeof (*evhp);
1603 	} else {
1604 		sap = ntohs(ehp->ether_type);
1605 		ptr += sizeof (*ehp);
1606 		len -= sizeof (*ehp);
1607 	}
1608 
1609 	ASSERT3U(sap, ==, ETHERTYPE_IP);
1610 
1611 	/*
1612 	 * Ethernet and IP headers may be in different mblks.
1613 	 */
1614 	ASSERT3P(ptr, <=, mp->b_wptr);
1615 	if (ptr == mp->b_wptr) {
1616 		mp = mp->b_cont;
1617 		ptr = mp->b_rptr;
1618 		len = MBLKL(mp);
1619 	}
1620 
1621 	ASSERT3U(len, >=, sizeof (ipha_t));
1622 	ipha = (ipha_t *)ptr;
1623 
1624 	/*
1625 	 * We assume the IP header has no options. (This is enforced in
1626 	 * ire_send_wire_v4() -- search for IXAF_NO_HW_CKSUM).
1627 	 */
1628 	ASSERT3U(IPH_HDR_LENGTH(ipha), ==, IP_SIMPLE_HDR_LENGTH);
1629 	iplen = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH;
1630 
1631 	ptr += IP_SIMPLE_HDR_LENGTH;
1632 	len -= IP_SIMPLE_HDR_LENGTH;
1633 
1634 	/*
1635 	 * IP and L4 headers may be in different mblks.
1636 	 */
1637 	ASSERT3P(ptr, <=, mp->b_wptr);
1638 	if (ptr == mp->b_wptr) {
1639 		mp = mp->b_cont;
1640 		ptr = mp->b_rptr;
1641 		len = MBLKL(mp);
1642 	}
1643 
1644 	switch (ipha->ipha_protocol) {
1645 	case IPPROTO_TCP:
1646 		ASSERT3U(len, >=, sizeof (tcph_t));
1647 		stuff = (uint16_t *)(ptr + TCP_CHECKSUM_OFFSET);
1648 		cksum = IP_TCP_CSUM_COMP;
1649 		break;
1650 	case IPPROTO_UDP:
1651 		ASSERT3U(len, >=, sizeof (struct udphdr));
1652 		stuff = (uint16_t *)(ptr + UDP_CHECKSUM_OFFSET);
1653 		cksum = IP_UDP_CSUM_COMP;
1654 		break;
1655 	default:
1656 		cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d",
1657 		    ipha->ipha_protocol);
1658 		return (EINVAL);
1659 	}
1660 
1661 	src = ipha->ipha_src;
1662 	dst = ipha->ipha_dst;
1663 
1664 	cksum += (dst >> 16) + (dst & 0xFFFF);
1665 	cksum += (src >> 16) + (src & 0xFFFF);
1666 	cksum += htons(iplen);
1667 
1668 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
1669 	cksum = (cksum >> 16) + (cksum & 0xFFFF);
1670 
1671 	ASSERT(cksum <= 0xFFFF);
1672 
1673 	*stuff = (uint16_t)(cksum ? cksum : ~cksum);
1674 
1675 	return (0);
1676 }
1677 
1678 /*
1679  * Push a packet into the transmit ring.
1680  *
1681  * Note: the format of a tx packet that spans multiple slots is similar to
1682  * what is described in xnf_rx_one_packet().
1683  */
1684 static void
1685 xnf_tx_push_packet(xnf_t *xnfp, xnf_txbuf_t *head)
1686 {
1687 	int nslots = 0;
1688 	int extras = 0;
1689 	RING_IDX slot;
1690 	boolean_t notify;
1691 
1692 	ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
1693 	ASSERT(xnfp->xnf_running);
1694 
1695 	slot = xnfp->xnf_tx_ring.req_prod_pvt;
1696 
1697 	/*
1698 	 * The caller has already checked that we have enough slots to proceed.
1699 	 */
1700 	for (xnf_txbuf_t *txp = head; txp != NULL; txp = txp->tx_next) {
1701 		xnf_txid_t *tidp;
1702 		netif_tx_request_t *txrp;
1703 
1704 		tidp = xnf_txid_get(xnfp);
1705 		VERIFY(tidp != NULL);
1706 		txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1707 
1708 		txp->tx_slot = slot;
1709 		txp->tx_txreq.id = tidp->id;
1710 		*txrp = txp->tx_txreq;
1711 
1712 		tidp->txbuf = txp;
1713 		slot++;
1714 		nslots++;
1715 
1716 		/*
1717 		 * When present, LSO info is placed in a slot after the first
1718 		 * data segment, and doesn't require a txid.
1719 		 */
1720 		if (txp->tx_txreq.flags & NETTXF_extra_info) {
1721 			netif_extra_info_t *extra;
1722 			ASSERT3U(nslots, ==, 1);
1723 
1724 			extra = (netif_extra_info_t *)
1725 			    RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
1726 			*extra = txp->tx_extra;
1727 			slot++;
1728 			nslots++;
1729 			extras = 1;
1730 		}
1731 	}
1732 
1733 	ASSERT3U(nslots, <=, XEN_MAX_SLOTS_PER_TX);
1734 
1735 	/*
1736 	 * Store the number of data fragments.
1737 	 */
1738 	head->tx_frags_to_ack = nslots - extras;
1739 
1740 	xnfp->xnf_tx_ring.req_prod_pvt = slot;
1741 
1742 	/*
1743 	 * Tell the peer that we sent something, if it cares.
1744 	 */
1745 	/* LINTED: constant in conditional context */
1746 	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring, notify);
1747 	if (notify)
1748 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
1749 }
1750 
1751 static xnf_txbuf_t *
1752 xnf_mblk_copy(xnf_t *xnfp, mblk_t *mp)
1753 {
1754 	xnf_txbuf_t *txp = xnf_data_txbuf_alloc(xnfp);
1755 	size_t length;
1756 
1757 	txp->tx_bdesc = xnf_tx_get_lookaside(xnfp, mp, &length);
1758 	if (txp->tx_bdesc == NULL) {
1759 		xnf_data_txbuf_free(xnfp, txp);
1760 		return (NULL);
1761 	}
1762 	txp->tx_mfn = txp->tx_bdesc->buf_mfn;
1763 	txp->tx_txreq.gref = txp->tx_bdesc->grant_ref;
1764 	txp->tx_txreq.size = length;
1765 	txp->tx_txreq.offset = (uintptr_t)txp->tx_bdesc->buf & PAGEOFFSET;
1766 	txp->tx_txreq.flags = 0;
1767 
1768 	return (txp);
1769 }
1770 
1771 static xnf_txbuf_t *
1772 xnf_mblk_map(xnf_t *xnfp, mblk_t *mp, int *countp)
1773 {
1774 	xnf_txbuf_t *head = NULL;
1775 	xnf_txbuf_t *tail = NULL;
1776 	domid_t oeid;
1777 	int nsegs = 0;
1778 
1779 	oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
1780 
1781 	for (mblk_t *ml = mp; ml != NULL; ml = ml->b_cont) {
1782 		ddi_dma_handle_t dma_handle;
1783 		const ddi_dma_cookie_t *dma_cookie, *dma_cookie_prev;
1784 		xnf_txbuf_t *txp;
1785 
1786 		if (MBLKL(ml) == 0)
1787 			continue;
1788 
1789 		txp = xnf_data_txbuf_alloc(xnfp);
1790 
1791 		if (head == NULL) {
1792 			head = txp;
1793 		} else {
1794 			ASSERT(tail != NULL);
1795 			TXBUF_SETNEXT(tail, txp);
1796 			txp->tx_head = head;
1797 		}
1798 
1799 		/*
1800 		 * The necessary segmentation rules (e.g. not crossing a page
1801 		 * boundary) are enforced by the dma attributes of the handle.
1802 		 */
1803 		dma_handle = txp->tx_dma_handle;
1804 		int ret = ddi_dma_addr_bind_handle(dma_handle,
1805 		    NULL, (char *)ml->b_rptr, MBLKL(ml),
1806 		    DDI_DMA_WRITE | DDI_DMA_STREAMING,
1807 		    DDI_DMA_DONTWAIT, 0, NULL, NULL);
1808 		if (ret != DDI_DMA_MAPPED) {
1809 			if (ret != DDI_DMA_NORESOURCES) {
1810 				dev_err(xnfp->xnf_devinfo, CE_WARN,
1811 				    "ddi_dma_addr_bind_handle() failed "
1812 				    "[dma_error=%d]", ret);
1813 			}
1814 			goto error;
1815 		}
1816 		txp->tx_handle_bound = B_TRUE;
1817 
1818 		dma_cookie_prev = NULL;
1819 		while ((dma_cookie = ddi_dma_cookie_iter(dma_handle,
1820 		    dma_cookie_prev)) != NULL) {
1821 			if (nsegs == XEN_MAX_TX_DATA_PAGES) {
1822 				dev_err(xnfp->xnf_devinfo, CE_WARN,
1823 				    "xnf_dmamap_alloc() failed: "
1824 				    "too many segments");
1825 				goto error;
1826 			}
1827 			if (dma_cookie_prev != NULL) {
1828 				txp = xnf_data_txbuf_alloc(xnfp);
1829 				ASSERT(tail != NULL);
1830 				TXBUF_SETNEXT(tail, txp);
1831 				txp->tx_head = head;
1832 			}
1833 
1834 			txp->tx_mfn =
1835 			    xnf_btop(pa_to_ma(dma_cookie->dmac_laddress));
1836 			txp->tx_txreq.gref = xnf_gref_get(xnfp);
1837 			if (txp->tx_txreq.gref == INVALID_GRANT_REF) {
1838 				dev_err(xnfp->xnf_devinfo, CE_WARN,
1839 				    "xnf_dmamap_alloc() failed: "
1840 				    "invalid grant ref");
1841 				goto error;
1842 			}
1843 			gnttab_grant_foreign_access_ref(txp->tx_txreq.gref,
1844 			    oeid, txp->tx_mfn, 1);
1845 			txp->tx_txreq.offset =
1846 			    dma_cookie->dmac_laddress & PAGEOFFSET;
1847 			txp->tx_txreq.size = dma_cookie->dmac_size;
1848 			txp->tx_txreq.flags = 0;
1849 
1850 			nsegs++;
1851 
1852 			if (tail != NULL)
1853 				tail->tx_txreq.flags = NETTXF_more_data;
1854 			tail = txp;
1855 
1856 			dma_cookie_prev = dma_cookie;
1857 		}
1858 	}
1859 
1860 	*countp = nsegs;
1861 	return (head);
1862 
1863 error:
1864 	xnf_data_txbuf_free_chain(xnfp, head);
1865 	return (NULL);
1866 }
1867 
1868 static void
1869 xnf_tx_setup_offload(xnf_t *xnfp, xnf_txbuf_t *head,
1870     uint32_t cksum_flags, uint32_t lso_flags, uint32_t mss)
1871 {
1872 	if (lso_flags != 0) {
1873 		ASSERT3U(lso_flags, ==, HW_LSO);
1874 		ASSERT3P(head->tx_bdesc, ==, NULL);
1875 
1876 		head->tx_txreq.flags |= NETTXF_extra_info;
1877 		netif_extra_info_t *extra = &head->tx_extra;
1878 		extra->type = XEN_NETIF_EXTRA_TYPE_GSO;
1879 		extra->flags = 0;
1880 		extra->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
1881 		extra->u.gso.size = mss;
1882 		extra->u.gso.features = 0;
1883 		extra->u.gso.pad = 0;
1884 	} else if (cksum_flags != 0) {
1885 		ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM);
1886 		/*
1887 		 * If the local protocol stack requests checksum
1888 		 * offload we set the 'checksum blank' flag,
1889 		 * indicating to the peer that we need the checksum
1890 		 * calculated for us.
1891 		 *
1892 		 * We _don't_ set the validated flag, because we haven't
1893 		 * validated that the data and the checksum match.
1894 		 *
1895 		 * Note: we already called xnf_pseudo_cksum() in
1896 		 * xnf_send(), so we just set the txreq flag here.
1897 		 */
1898 		head->tx_txreq.flags |= NETTXF_csum_blank;
1899 		xnfp->xnf_stat_tx_cksum_deferred++;
1900 	}
1901 }
1902 
1903 /*
1904  * Send packet mp. Called by the MAC framework.
1905  */
1906 static mblk_t *
1907 xnf_send(void *arg, mblk_t *mp)
1908 {
1909 	xnf_t *xnfp = arg;
1910 	xnf_txbuf_t *head;
1911 	mblk_t *ml;
1912 	int length;
1913 	int pages, chunks, slots, slots_free;
1914 	uint32_t cksum_flags, lso_flags, mss;
1915 	boolean_t pulledup = B_FALSE;
1916 	boolean_t force_copy = B_FALSE;
1917 
1918 	ASSERT3P(mp->b_next, ==, NULL);
1919 
1920 	mutex_enter(&xnfp->xnf_txlock);
1921 
1922 	/*
1923 	 * Wait until we are connected to the backend.
1924 	 */
1925 	while (!xnfp->xnf_connected)
1926 		cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock);
1927 
1928 	/*
1929 	 * To simplify logic and be in sync with the rescheduling mechanism,
1930 	 * we require the maximum number of slots that could be used by a
1931 	 * transaction to be free before proceeding. The only downside of doing
1932 	 * this is that it slightly reduces the effective size of the ring.
1933 	 */
1934 	slots_free = xnf_tx_slots_get(xnfp, XEN_MAX_SLOTS_PER_TX, B_FALSE);
1935 	if (slots_free < XEN_MAX_SLOTS_PER_TX) {
1936 		/*
1937 		 * We need to ask for a re-schedule later as the ring is full.
1938 		 */
1939 		mutex_enter(&xnfp->xnf_schedlock);
1940 		xnfp->xnf_need_sched = B_TRUE;
1941 		mutex_exit(&xnfp->xnf_schedlock);
1942 
1943 		xnfp->xnf_stat_tx_defer++;
1944 		mutex_exit(&xnfp->xnf_txlock);
1945 		return (mp);
1946 	}
1947 
1948 	/*
1949 	 * Get hw offload parameters.
1950 	 * This must be done before pulling up the mp as those parameters
1951 	 * are not copied over.
1952 	 */
1953 	mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &cksum_flags);
1954 	mac_lso_get(mp, &mss, &lso_flags);
1955 
1956 	/*
1957 	 * XXX: fix MAC framework so that we can advertise support for
1958 	 * partial checksum for IPv4 only. This way we won't need to calculate
1959 	 * the pseudo header checksum ourselves.
1960 	 */
1961 	if (cksum_flags != 0) {
1962 		ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM);
1963 		(void) xnf_pseudo_cksum(mp);
1964 	}
1965 
1966 pulledup:
1967 	for (ml = mp, pages = 0, chunks = 0, length = 0; ml != NULL;
1968 	    ml = ml->b_cont, chunks++) {
1969 		pages += xnf_mblk_pages(ml);
1970 		length += MBLKL(ml);
1971 	}
1972 	DTRACE_PROBE3(packet, int, length, int, chunks, int, pages);
1973 	DTRACE_PROBE3(lso, int, length, uint32_t, lso_flags, uint32_t, mss);
1974 
1975 	/*
1976 	 * If the ethernet header crosses a page boundary the packet
1977 	 * will be dropped by the backend. In practice it seems like
1978 	 * this happens fairly rarely so we'll do nothing unless the
1979 	 * packet is small enough to fit in a look-aside buffer.
1980 	 */
1981 	if (((uintptr_t)mp->b_rptr & PAGEOFFSET) +
1982 	    sizeof (struct ether_header) > PAGESIZE) {
1983 		xnfp->xnf_stat_tx_eth_hdr_split++;
1984 		if (length <= PAGESIZE)
1985 			force_copy = B_TRUE;
1986 	}
1987 
1988 	if (force_copy || (pages > 1 && !xnfp->xnf_be_tx_sg)) {
1989 		/*
1990 		 * If the packet spans several pages and scatter-gather is not
1991 		 * supported then use a look-aside buffer.
1992 		 */
1993 		ASSERT3U(length, <=, PAGESIZE);
1994 		head = xnf_mblk_copy(xnfp, mp);
1995 		if (head == NULL) {
1996 			dev_err(xnfp->xnf_devinfo, CE_WARN,
1997 			    "xnf_mblk_copy() failed");
1998 			goto drop;
1999 		}
2000 	} else {
2001 		/*
2002 		 * There's a limit on how many pages can be passed to the
2003 		 * backend. If we exceed that limit, the packet will be dropped
2004 		 * and some backend implementations (e.g. Linux) could even
2005 		 * offline the interface.
2006 		 */
2007 		if (pages > XEN_MAX_TX_DATA_PAGES) {
2008 			if (pulledup) {
2009 				dev_err(xnfp->xnf_devinfo, CE_WARN,
2010 				    "too many pages, even after pullup: %d.",
2011 				    pages);
2012 				goto drop;
2013 			}
2014 
2015 			/*
2016 			 * Defragment packet if it spans too many pages.
2017 			 */
2018 			mblk_t *newmp = msgpullup(mp, -1);
2019 			freemsg(mp);
2020 			mp = newmp;
			if (mp == NULL)
				goto drop;
2021 			xnfp->xnf_stat_tx_pullup++;
2022 			pulledup = B_TRUE;
2023 			goto pulledup;
2024 		}
2025 
2026 		head = xnf_mblk_map(xnfp, mp, &slots);
2027 		if (head == NULL)
2028 			goto drop;
2029 
2030 		IMPLY(slots > 1, xnfp->xnf_be_tx_sg);
2031 	}
2032 
2033 	/*
2034 	 * Set tx_mp so that the mblk is freed when the txbuf chain is freed.
2035 	 */
2036 	head->tx_mp = mp;
2037 
2038 	xnf_tx_setup_offload(xnfp, head, cksum_flags, lso_flags, mss);
2039 
2040 	/*
2041 	 * The first request must store the total length of the packet.
2042 	 */
2043 	head->tx_txreq.size = length;
2044 
2045 	/*
2046 	 * Push the packet we have prepared into the ring.
2047 	 */
2048 	xnf_tx_push_packet(xnfp, head);
2049 	xnfp->xnf_stat_opackets++;
2050 	xnfp->xnf_stat_obytes += length;
2051 
2052 	mutex_exit(&xnfp->xnf_txlock);
2053 	return (NULL);
2054 
2055 drop:
2056 	freemsg(mp);
2057 	xnfp->xnf_stat_tx_drop++;
2058 	mutex_exit(&xnfp->xnf_txlock);
2059 	return (NULL);
2060 }
2061 
2062 /*
2063  * Notification of RX packets. Currently no TX-complete interrupt is
2064  * used, as we clean the TX ring lazily.
2065  */
2066 static uint_t
2067 xnf_intr(caddr_t arg)
2068 {
2069 	xnf_t *xnfp = (xnf_t *)arg;
2070 	mblk_t *mp;
2071 	boolean_t need_sched, clean_ring;
2072 
2073 	mutex_enter(&xnfp->xnf_rxlock);
2074 
2075 	/*
2076 	 * Interrupts before we are connected are spurious.
2077 	 */
2078 	if (!xnfp->xnf_connected) {
2079 		mutex_exit(&xnfp->xnf_rxlock);
2080 		xnfp->xnf_stat_unclaimed_interrupts++;
2081 		return (DDI_INTR_UNCLAIMED);
2082 	}
2083 
2084 	/*
2085 	 * Receive side processing.
2086 	 */
2087 	do {
2088 		/*
2089 		 * Collect buffers from the ring.
2090 		 */
2091 		xnf_rx_collect(xnfp);
2092 
2093 		/*
2094 		 * Interrupt me when the next receive buffer is consumed.
2095 		 */
2096 		xnfp->xnf_rx_ring.sring->rsp_event =
2097 		    xnfp->xnf_rx_ring.rsp_cons + 1;
2098 		xen_mb();
2099 
2100 	} while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring));
2101 
2102 	if (xnfp->xnf_rx_new_buffers_posted) {
2103 		boolean_t notify;
2104 
2105 		/*
2106 		 * Indicate to the peer that we have re-filled the
2107 		 * receive ring, if it cares.
2108 		 */
2109 		/* LINTED: constant in conditional context */
2110 		RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_rx_ring, notify);
2111 		if (notify)
2112 			ec_notify_via_evtchn(xnfp->xnf_evtchn);
2113 		xnfp->xnf_rx_new_buffers_posted = B_FALSE;
2114 	}
2115 
2116 	mp = xnfp->xnf_rx_head;
2117 	xnfp->xnf_rx_head = xnfp->xnf_rx_tail = NULL;
2118 
2119 	xnfp->xnf_stat_interrupts++;
2120 	mutex_exit(&xnfp->xnf_rxlock);
2121 
2122 	if (mp != NULL)
2123 		mac_rx(xnfp->xnf_mh, NULL, mp);
2124 
2125 	/*
2126 	 * Transmit side processing.
2127 	 *
2128 	 * If a previous transmit attempt failed or we have pending
2129 	 * multicast requests, clean the ring.
2130 	 *
2131 	 * If we previously stalled transmission and cleaning produces
2132 	 * some free slots, tell upstream to attempt sending again.
2133 	 *
2134 	 * The odd style is to avoid acquiring xnf_txlock unless we
2135 	 * will actually look inside the tx machinery.
2136 	 */
2137 	mutex_enter(&xnfp->xnf_schedlock);
2138 	need_sched = xnfp->xnf_need_sched;
2139 	clean_ring = need_sched || (xnfp->xnf_pending_multicast > 0);
2140 	mutex_exit(&xnfp->xnf_schedlock);
2141 
2142 	if (clean_ring) {
2143 		int free_slots;
2144 
2145 		mutex_enter(&xnfp->xnf_txlock);
2146 		free_slots = xnf_tx_slots_get(xnfp, 0, B_FALSE);
2147 
2148 		if (need_sched && (free_slots >= XEN_MAX_SLOTS_PER_TX)) {
2149 			mutex_enter(&xnfp->xnf_schedlock);
2150 			xnfp->xnf_need_sched = B_FALSE;
2151 			mutex_exit(&xnfp->xnf_schedlock);
2152 
2153 			mac_tx_update(xnfp->xnf_mh);
2154 		}
2155 		mutex_exit(&xnfp->xnf_txlock);
2156 	}
2157 
2158 	return (DDI_INTR_CLAIMED);
2159 }
2160 
2161 /*
2162  *  xnf_start() -- start the board receiving and enable interrupts.
2163  *  xnf_start() -- start the device (begin accepting packets).
2164 static int
2165 xnf_start(void *arg)
2166 {
2167 	xnf_t *xnfp = arg;
2168 
2169 #ifdef XNF_DEBUG
2170 	if (xnf_debug & XNF_DEBUG_TRACE)
2171 		printf("xnf%d start(0x%p)\n",
2172 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
2173 #endif
2174 
2175 	mutex_enter(&xnfp->xnf_rxlock);
2176 	mutex_enter(&xnfp->xnf_txlock);
2177 
2178 	/* Accept packets from above. */
2179 	xnfp->xnf_running = B_TRUE;
2180 
2181 	mutex_exit(&xnfp->xnf_txlock);
2182 	mutex_exit(&xnfp->xnf_rxlock);
2183 
2184 	return (0);
2185 }
2186 
2187 /* xnf_stop() - stop the device (stop accepting packets) */
2188 static void
2189 xnf_stop(void *arg)
2190 {
2191 	xnf_t *xnfp = arg;
2192 
2193 #ifdef XNF_DEBUG
2194 	if (xnf_debug & XNF_DEBUG_TRACE)
2195 		printf("xnf%d stop(0x%p)\n",
2196 		    ddi_get_instance(xnfp->xnf_devinfo), (void *)xnfp);
2197 #endif
2198 
2199 	mutex_enter(&xnfp->xnf_rxlock);
2200 	mutex_enter(&xnfp->xnf_txlock);
2201 
2202 	xnfp->xnf_running = B_FALSE;
2203 
2204 	mutex_exit(&xnfp->xnf_txlock);
2205 	mutex_exit(&xnfp->xnf_rxlock);
2206 }
2207 
2208 /*
2209  * Hang buffer `bdesc' on the RX ring.
2210  */
2211 static void
2212 xnf_rxbuf_hang(xnf_t *xnfp, xnf_buf_t *bdesc)
2213 {
2214 	netif_rx_request_t *reqp;
2215 	RING_IDX hang_ix;
2216 
2217 	ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
2218 
2219 	reqp = RING_GET_REQUEST(&xnfp->xnf_rx_ring,
2220 	    xnfp->xnf_rx_ring.req_prod_pvt);
2221 	hang_ix = (RING_IDX) (reqp - RING_GET_REQUEST(&xnfp->xnf_rx_ring, 0));
2222 	ASSERT(xnfp->xnf_rx_pkt_info[hang_ix] == NULL);
2223 
2224 	reqp->id = bdesc->id = hang_ix;
2225 	reqp->gref = bdesc->grant_ref;
2226 
2227 	xnfp->xnf_rx_pkt_info[hang_ix] = bdesc;
2228 	xnfp->xnf_rx_ring.req_prod_pvt++;
2229 
2230 	xnfp->xnf_rx_new_buffers_posted = B_TRUE;
2231 }
2232 
2233 /*
2234  * Receive an entire packet from the ring, starting from slot *consp.
2235  * prod is the producer index, i.e. one past the slot of the latest response.
2236  * On return, *consp will point to the head of the next packet.
2237  *
2238  * Note: If slot prod was reached before we could gather a full packet, we will
2239  * drop the partial packet; this would most likely indicate a bug in either
2240  * the front-end or the back-end driver.
2241  *
2242  * An rx packet can consist of several fragments and thus span multiple slots.
2243  * Each fragment can contain up to 4k of data.
2244  *
2245  * A typical 9000 MTU packet will look like this:
2246  * +------+---------------------+-------------------+-----------------------+
2247  * | SLOT | TYPE                | CONTENTS          | FLAGS                 |
2248  * +------+---------------------+-------------------+-----------------------+
2249  * | 1    | netif_rx_response_t | 1st data fragment | more_data             |
2250  * +------+---------------------+-------------------+-----------------------+
2251  * | 2    | netif_rx_response_t | 2nd data fragment | more_data             |
2252  * +------+---------------------+-------------------+-----------------------+
2253  * | 3    | netif_rx_response_t | 3rd data fragment | [none]                |
2254  * +------+---------------------+-------------------+-----------------------+
2255  *
2256  * Fragments are chained by setting NETRXF_more_data in the previous
2257  * response's flags. If there are additional flags, such as
2258  * NETRXF_data_validated or NETRXF_extra_info, those should be set on the
2259  * first fragment.
2260  *
2261  * Sometimes extra info can be present. If so, it will follow the first
2262  * fragment, and the NETRXF_extra_info flag will be set on the first
2263  * response. If LRO applies to a packet, that is recorded in the extra info.
2264  * According to the spec, extra info entries can also be chained, but they
2265  * must all appear right after the first fragment.
2266  *
2267  * Example of a packet with 2 extra infos:
2268  * +------+---------------------+-------------------+-----------------------+
2269  * | SLOT | TYPE                | CONTENTS          | FLAGS                 |
2270  * +------+---------------------+-------------------+-----------------------+
2271  * | 1    | netif_rx_response_t | 1st data fragment | extra_info, more_data |
2272  * +------+---------------------+-------------------+-----------------------+
2273  * | 2    | netif_extra_info_t  | 1st extra info    | EXTRA_FLAG_MORE       |
2274  * +------+---------------------+-------------------+-----------------------+
2275  * | 3    | netif_extra_info_t  | 2nd extra info    | [none]                |
2276  * +------+---------------------+-------------------+-----------------------+
2277  * | 4    | netif_rx_response_t | 2nd data fragment | more_data             |
2278  * +------+---------------------+-------------------+-----------------------+
2279  * | 5    | netif_rx_response_t | 3rd data fragment | more_data             |
2280  * +------+---------------------+-------------------+-----------------------+
2281  * | 6    | netif_rx_response_t | 4th data fragment | [none]                |
2282  * +------+---------------------+-------------------+-----------------------+
2283  *
2284  * In practice, the only extra we expect is for LRO, but only if we advertise
2285  * that we support it to the backend (xnf_enable_lro == TRUE).
2286  */
2287 static int
2288 xnf_rx_one_packet(xnf_t *xnfp, RING_IDX prod, RING_IDX *consp, mblk_t **mpp)
2289 {
2290 	mblk_t *head = NULL;
2291 	mblk_t *tail = NULL;
2292 	mblk_t *mp;
2293 	int error = 0;
2294 	RING_IDX cons = *consp;
2295 	netif_extra_info_t lro;
2296 	boolean_t is_lro = B_FALSE;
2297 	boolean_t is_extra = B_FALSE;
2298 
2299 	netif_rx_response_t rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons);
2300 
2301 	boolean_t hwcsum = (rsp.flags & NETRXF_data_validated) != 0;
2302 	boolean_t more_data = (rsp.flags & NETRXF_more_data) != 0;
2303 	boolean_t more_extra = (rsp.flags & NETRXF_extra_info) != 0;
2304 
2305 	IMPLY(more_data, xnf_enable_rx_sg);
2306 
2307 	while (cons != prod) {
2308 		xnf_buf_t *bdesc;
2309 		int len, off;
2310 		int rxidx = cons & (NET_RX_RING_SIZE - 1);
2311 
2312 		bdesc = xnfp->xnf_rx_pkt_info[rxidx];
2313 		xnfp->xnf_rx_pkt_info[rxidx] = NULL;
2314 
2315 		if (is_extra) {
2316 			netif_extra_info_t *extra = (netif_extra_info_t *)&rsp;
2317 			/*
2318 			 * The only extra we expect is for LRO, and it should
2319 			 * only be present once.
2320 			 */
2321 			if (extra->type == XEN_NETIF_EXTRA_TYPE_GSO &&
2322 			    !is_lro) {
2323 				ASSERT(xnf_enable_lro);
2324 				lro = *extra;
2325 				is_lro = B_TRUE;
2326 				DTRACE_PROBE1(lro, netif_extra_info_t *, &lro);
2327 			} else {
2328 				dev_err(xnfp->xnf_devinfo, CE_WARN, "rx packet "
2329 				    "contains unexpected extra info of type %d",
2330 				    extra->type);
2331 				error = EINVAL;
2332 			}
2333 			more_extra =
2334 			    (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE) != 0;
2335 
2336 			goto hang_buf;
2337 		}
2338 
2339 		ASSERT3U(bdesc->id, ==, rsp.id);
2340 
2341 		/*
2342 		 * status: packet length when >= 0, an error code when < 0.
2343 		 */
2344 		len = rsp.status;
2345 		off = rsp.offset;
2346 		more_data = (rsp.flags & NETRXF_more_data) != 0;
2347 
2348 		/*
2349 		 * sanity checks.
2350 		 */
2351 		if (!xnfp->xnf_running) {
2352 			error = EBUSY;
2353 		} else if (len <= 0) {
2354 			xnfp->xnf_stat_errrx++;
2355 
2356 			switch (len) {
2357 			case 0:
2358 				xnfp->xnf_stat_runt++;
2359 				break;
2360 			case NETIF_RSP_ERROR:
2361 				xnfp->xnf_stat_mac_rcv_error++;
2362 				break;
2363 			case NETIF_RSP_DROPPED:
2364 				xnfp->xnf_stat_norxbuf++;
2365 				break;
2366 			}
2367 			error = EINVAL;
2368 		} else if (bdesc->grant_ref == INVALID_GRANT_REF) {
2369 			dev_err(xnfp->xnf_devinfo, CE_WARN,
2370 			    "Bad rx grant reference, rsp id %d", rsp.id);
2371 			error = EINVAL;
2372 		} else if ((off + len) > PAGESIZE) {
2373 			dev_err(xnfp->xnf_devinfo, CE_WARN, "Rx packet crosses "
2374 			    "page boundary (offset %d, length %d)", off, len);
2375 			error = EINVAL;
2376 		}
2377 
2378 		if (error != 0) {
2379 			/*
2380 			 * If an error has been detected, we do not attempt
2381 			 * to read the data but we still need to replace
2382 			 * the rx bufs.
2383 			 */
2384 			goto hang_buf;
2385 		}
2386 
2387 		xnf_buf_t *nbuf = NULL;
2388 
2389 		/*
2390 		 * If the packet is below a pre-determined size we will
2391 		 * copy data out of the buf rather than replace it.
2392 		 */
2393 		if (len > xnf_rx_copy_limit)
2394 			nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE);
2395 
2396 		if (nbuf != NULL) {
2397 			mp = desballoc((unsigned char *)bdesc->buf,
2398 			    bdesc->len, 0, &bdesc->free_rtn);
2399 
2400 			if (mp == NULL) {
2401 				xnfp->xnf_stat_rx_desballoc_fail++;
2402 				xnfp->xnf_stat_norxbuf++;
2403 				error = ENOMEM;
2404 				/*
2405 				 * we free the buf we just allocated as we
2406 				 * will re-hang the old buf.
2407 				 */
2408 				xnf_buf_put(xnfp, nbuf, B_FALSE);
2409 				goto hang_buf;
2410 			}
2411 
2412 			mp->b_rptr = mp->b_rptr + off;
2413 			mp->b_wptr = mp->b_rptr + len;
2414 
2415 			/*
2416 			 * Release the grant as the backend doesn't need to
2417 			 * access this buffer anymore and grants are scarce.
2418 			 */
2419 			(void) gnttab_end_foreign_access_ref(bdesc->grant_ref,
2420 			    0);
2421 			xnf_gref_put(xnfp, bdesc->grant_ref);
2422 			bdesc->grant_ref = INVALID_GRANT_REF;
2423 
2424 			bdesc = nbuf;
2425 		} else {
2426 			/*
2427 			 * We failed to allocate a new buf or decided to reuse
2428 			 * the old one. In either case we copy the data off it
2429 			 * and put it back into the ring.
2430 			 */
2431 			mp = allocb(len, 0);
2432 			if (mp == NULL) {
2433 				xnfp->xnf_stat_rx_allocb_fail++;
2434 				xnfp->xnf_stat_norxbuf++;
2435 				error = ENOMEM;
2436 				goto hang_buf;
2437 			}
2438 			bcopy(bdesc->buf + off, mp->b_wptr, len);
2439 			mp->b_wptr += len;
2440 		}
2441 
2442 		if (head == NULL)
2443 			head = mp;
2444 		else
2445 			tail->b_cont = mp;
2446 		tail = mp;
2447 
2448 hang_buf:
2449 		/*
2450 		 * No matter what happens, for each response we need to hang
2451 		 * a buf back on the rx ring: either the old one, or a fresh
2452 		 * one if the old buf was loaned upstream via desballoc().
2453 		 */
2454 		xnf_rxbuf_hang(xnfp, bdesc);
2455 		cons++;
2456 
2457 		/* next response is an extra */
2458 		is_extra = more_extra;
2459 
2460 		if (!more_data && !more_extra)
2461 			break;
2462 
2463 		/*
2464 		 * Note that since requests and responses are union'd on the
2465 		 * same ring, we copy the response to a local variable instead
2466 		 * of keeping a pointer. Otherwise xnf_rxbuf_hang() would have
2467 		 * overwritten the contents of rsp.
2468 		 */
2469 		rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons);
2470 	}
2471 
2472 	/*
2473 	 * Check that we do not get stuck in a loop.
2474 	 */
2475 	ASSERT3U(*consp, !=, cons);
2476 	*consp = cons;
2477 
2478 	/*
2479 	 * We ran out of responses but the flags indicate there is more data.
2480 	 */
2481 	if (more_data) {
2482 		dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments.");
2483 		error = EINVAL;
2484 	}
2485 	if (more_extra) {
2486 		dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments "
2487 		    "(extras).");
2488 		error = EINVAL;
2489 	}
2490 
2491 	/*
2492 	 * An error means the packet must be dropped. If we have already formed
2493 	 * a partial packet, then discard it.
2494 	 */
2495 	if (error != 0) {
2496 		if (head != NULL)
2497 			freemsg(head);
2498 		xnfp->xnf_stat_rx_drop++;
2499 		return (error);
2500 	}
2501 
2502 	ASSERT(head != NULL);
2503 
2504 	if (hwcsum) {
2505 		/*
2506 		 * If the peer says that the data has been validated then we
2507 		 * declare that the full checksum has been verified.
2508 		 *
2509 		 * We don't look at the "checksum blank" flag, and hence could
2510 		 * have a packet here that we are asserting is good with
2511 		 * a blank checksum.
2512 		 */
2513 		mac_hcksum_set(head, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
2514 		xnfp->xnf_stat_rx_cksum_no_need++;
2515 	}
2516 
2517 	/* XXX: set lro info for packet once LRO is supported in OS. */
2518 
2519 	*mpp = head;
2520 
2521 	return (0);
2522 }
2523 
2524 /*
2525  * Collect packets from the RX ring, storing them in `xnfp' for later use.
2526  */
2527 static void
2528 xnf_rx_collect(xnf_t *xnfp)
2529 {
2530 	RING_IDX prod;
2531 
2532 	ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
2533 
2534 	prod = xnfp->xnf_rx_ring.sring->rsp_prod;
2535 	/*
2536 	 * Ensure we see queued responses up to 'prod'.
2537 	 */
2538 	membar_consumer();
2539 
2540 	while (xnfp->xnf_rx_ring.rsp_cons != prod) {
2541 		mblk_t *mp;
2542 
2543 		/*
2544 		 * Collect a packet.
2545 		 * rsp_cons is updated inside xnf_rx_one_packet().
2546 		 */
2547 		int error = xnf_rx_one_packet(xnfp, prod,
2548 		    &xnfp->xnf_rx_ring.rsp_cons, &mp);
2549 		if (error == 0) {
2550 			xnfp->xnf_stat_ipackets++;
2551 			xnfp->xnf_stat_rbytes += xmsgsize(mp);
2552 
2553 			/*
2554 			 * Append the mblk to the rx list.
2555 			 */
2556 			if (xnfp->xnf_rx_head == NULL) {
2557 				ASSERT3P(xnfp->xnf_rx_tail, ==, NULL);
2558 				xnfp->xnf_rx_head = mp;
2559 			} else {
2560 				ASSERT(xnfp->xnf_rx_tail != NULL);
2561 				xnfp->xnf_rx_tail->b_next = mp;
2562 			}
2563 			xnfp->xnf_rx_tail = mp;
2564 		}
2565 	}
2566 }
2567 
2568 /*
2569  *  xnf_alloc_dma_resources() -- initialize the driver's DMA structures
2570  */
2571 static int
2572 xnf_alloc_dma_resources(xnf_t *xnfp)
2573 {
2574 	dev_info_t 		*devinfo = xnfp->xnf_devinfo;
2575 	size_t			len;
2576 	ddi_dma_cookie_t	dma_cookie;
2577 	uint_t			ncookies;
2578 	int			rc;
2579 	caddr_t			rptr;
2580 
2581 	/*
2582 	 * The code below allocates all the DMA data structures that
2583 	 * need to be released when the driver is detached.
2584 	 *
2585 	 * Allocate page for the transmit descriptor ring.
2586 	 */
2587 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2588 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_tx_ring_dma_handle) != DDI_SUCCESS)
2589 		goto alloc_error;
2590 
2591 	if (ddi_dma_mem_alloc(xnfp->xnf_tx_ring_dma_handle,
2592 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2593 	    DDI_DMA_SLEEP, 0, &rptr, &len,
2594 	    &xnfp->xnf_tx_ring_dma_acchandle) != DDI_SUCCESS) {
2595 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2596 		xnfp->xnf_tx_ring_dma_handle = NULL;
2597 		goto alloc_error;
2598 	}
2599 
2600 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_tx_ring_dma_handle, NULL,
2601 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2602 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2603 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2604 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2605 		xnfp->xnf_tx_ring_dma_handle = NULL;
2606 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
2607 		if (rc == DDI_DMA_NORESOURCES)
2608 			goto alloc_error;
2609 		else
2610 			goto error;
2611 	}
2612 
2613 	ASSERT(ncookies == 1);
2614 	bzero(rptr, PAGESIZE);
2615 	/* LINTED: constant in conditional context */
2616 	SHARED_RING_INIT((netif_tx_sring_t *)rptr);
2617 	/* LINTED: constant in conditional context */
2618 	FRONT_RING_INIT(&xnfp->xnf_tx_ring, (netif_tx_sring_t *)rptr, PAGESIZE);
2619 	xnfp->xnf_tx_ring_phys_addr = dma_cookie.dmac_laddress;
2620 
2621 	/*
2622 	 * Allocate page for the receive descriptor ring.
2623 	 */
2624 	if (ddi_dma_alloc_handle(devinfo, &ringbuf_dma_attr,
2625 	    DDI_DMA_SLEEP, 0, &xnfp->xnf_rx_ring_dma_handle) != DDI_SUCCESS)
2626 		goto alloc_error;
2627 
2628 	if (ddi_dma_mem_alloc(xnfp->xnf_rx_ring_dma_handle,
2629 	    PAGESIZE, &accattr, DDI_DMA_CONSISTENT,
2630 	    DDI_DMA_SLEEP, 0, &rptr, &len,
2631 	    &xnfp->xnf_rx_ring_dma_acchandle) != DDI_SUCCESS) {
2632 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2633 		xnfp->xnf_rx_ring_dma_handle = NULL;
2634 		goto alloc_error;
2635 	}
2636 
2637 	if ((rc = ddi_dma_addr_bind_handle(xnfp->xnf_rx_ring_dma_handle, NULL,
2638 	    rptr, PAGESIZE, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
2639 	    DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies)) != DDI_DMA_MAPPED) {
2640 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2641 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2642 		xnfp->xnf_rx_ring_dma_handle = NULL;
2643 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
2644 		if (rc == DDI_DMA_NORESOURCES)
2645 			goto alloc_error;
2646 		else
2647 			goto error;
2648 	}
2649 
2650 	ASSERT(ncookies == 1);
2651 	bzero(rptr, PAGESIZE);
2652 	/* LINTED: constant in conditional context */
2653 	SHARED_RING_INIT((netif_rx_sring_t *)rptr);
2654 	/* LINTED: constant in conditional context */
2655 	FRONT_RING_INIT(&xnfp->xnf_rx_ring, (netif_rx_sring_t *)rptr, PAGESIZE);
2656 	xnfp->xnf_rx_ring_phys_addr = dma_cookie.dmac_laddress;
2657 
2658 	return (DDI_SUCCESS);
2659 
2660 alloc_error:
2661 	cmn_err(CE_WARN, "xnf%d: could not allocate enough DMA memory",
2662 	    ddi_get_instance(xnfp->xnf_devinfo));
2663 error:
2664 	xnf_release_dma_resources(xnfp);
2665 	return (DDI_FAILURE);
2666 }
2667 
2668 /*
2669  * Release all DMA resources in the opposite order from acquisition
2670  */
2671 static void
2672 xnf_release_dma_resources(xnf_t *xnfp)
2673 {
2674 	int i;
2675 
2676 	/*
2677 	 * Free receive buffers which are currently associated with
2678 	 * descriptors.
2679 	 */
2680 	mutex_enter(&xnfp->xnf_rxlock);
2681 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
2682 		xnf_buf_t *bp;
2683 
2684 		if ((bp = xnfp->xnf_rx_pkt_info[i]) == NULL)
2685 			continue;
2686 		xnfp->xnf_rx_pkt_info[i] = NULL;
2687 		xnf_buf_put(xnfp, bp, B_FALSE);
2688 	}
2689 	mutex_exit(&xnfp->xnf_rxlock);
2690 
2691 	/* Free the receive ring buffer. */
2692 	if (xnfp->xnf_rx_ring_dma_acchandle != NULL) {
2693 		(void) ddi_dma_unbind_handle(xnfp->xnf_rx_ring_dma_handle);
2694 		ddi_dma_mem_free(&xnfp->xnf_rx_ring_dma_acchandle);
2695 		ddi_dma_free_handle(&xnfp->xnf_rx_ring_dma_handle);
2696 		xnfp->xnf_rx_ring_dma_acchandle = NULL;
2697 	}
2698 	/* Free the transmit ring buffer. */
2699 	if (xnfp->xnf_tx_ring_dma_acchandle != NULL) {
2700 		(void) ddi_dma_unbind_handle(xnfp->xnf_tx_ring_dma_handle);
2701 		ddi_dma_mem_free(&xnfp->xnf_tx_ring_dma_acchandle);
2702 		ddi_dma_free_handle(&xnfp->xnf_tx_ring_dma_handle);
2703 		xnfp->xnf_tx_ring_dma_acchandle = NULL;
2704 	}
2706 }
2707 
2708 /*
2709  * Release any packets and associated structures used by the TX ring.
2710  */
2711 static void
2712 xnf_release_mblks(xnf_t *xnfp)
2713 {
2714 	RING_IDX i;
2715 	xnf_txid_t *tidp;
2716 
2717 	for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
2718 	    i < NET_TX_RING_SIZE;
2719 	    i++, tidp++) {
2720 		xnf_txbuf_t *txp = tidp->txbuf;
2721 
2722 		if (txp != NULL) {
2723 			ASSERT(txp->tx_mp != NULL);
2724 			freemsg(txp->tx_mp);
2725 
2726 			xnf_txid_put(xnfp, tidp);
2727 			kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
2728 		}
2729 	}
2730 }
2731 
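/*
 * kmem cache constructor for xnf_buf_t: allocate a page of DMA-able
 * memory, bind it and record its machine frame number so that it can
 * later be granted to the backend.
 */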
2732 static int
2733 xnf_buf_constructor(void *buf, void *arg, int kmflag)
2734 {
2735 	int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
2736 	xnf_buf_t *bdesc = buf;
2737 	xnf_t *xnfp = arg;
2738 	ddi_dma_cookie_t dma_cookie;
2739 	uint_t ncookies;
2740 	size_t len;
2741 
2742 	if (kmflag & KM_NOSLEEP)
2743 		ddiflags = DDI_DMA_DONTWAIT;
2744 
2745 	/* Allocate a DMA access handle for the buffer. */
2746 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buf_dma_attr,
2747 	    ddiflags, 0, &bdesc->dma_handle) != DDI_SUCCESS)
2748 		goto failure;
2749 
2750 	/* Allocate DMA-able memory for buffer. */
2751 	if (ddi_dma_mem_alloc(bdesc->dma_handle,
2752 	    PAGESIZE, &data_accattr, DDI_DMA_STREAMING, ddiflags, 0,
2753 	    &bdesc->buf, &len, &bdesc->acc_handle) != DDI_SUCCESS)
2754 		goto failure_1;
2755 
2756 	/* Bind to virtual address of buffer to get physical address. */
2757 	if (ddi_dma_addr_bind_handle(bdesc->dma_handle, NULL,
2758 	    bdesc->buf, len, DDI_DMA_RDWR | DDI_DMA_STREAMING,
2759 	    ddiflags, 0, &dma_cookie, &ncookies) != DDI_DMA_MAPPED)
2760 		goto failure_2;
2761 	ASSERT(ncookies == 1);
2762 
2763 	bdesc->free_rtn.free_func = xnf_buf_recycle;
2764 	bdesc->free_rtn.free_arg = (caddr_t)bdesc;
2765 	bdesc->xnfp = xnfp;
2766 	bdesc->buf_phys = dma_cookie.dmac_laddress;
2767 	bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
2768 	bdesc->len = dma_cookie.dmac_size;
2769 	bdesc->grant_ref = INVALID_GRANT_REF;
2770 	bdesc->gen = xnfp->xnf_gen;
2771 
2772 	atomic_inc_64(&xnfp->xnf_stat_buf_allocated);
2773 
2774 	return (0);
2775 
2776 failure_2:
2777 	ddi_dma_mem_free(&bdesc->acc_handle);
2778 
2779 failure_1:
2780 	ddi_dma_free_handle(&bdesc->dma_handle);
2781 
2782 failure:
2783 
2784 	ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
2785 	return (-1);
2786 }
2787 
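/*
 * kmem cache destructor for xnf_buf_t: release the DMA resources
 * acquired by xnf_buf_constructor().
 */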
2788 static void
2789 xnf_buf_destructor(void *buf, void *arg)
2790 {
2791 	xnf_buf_t *bdesc = buf;
2792 	xnf_t *xnfp = arg;
2793 
2794 	(void) ddi_dma_unbind_handle(bdesc->dma_handle);
2795 	ddi_dma_mem_free(&bdesc->acc_handle);
2796 	ddi_dma_free_handle(&bdesc->dma_handle);
2797 
2798 	atomic_dec_64(&xnfp->xnf_stat_buf_allocated);
2799 }
2800 
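/*
 * Get a buffer from the cache and grant the backend access to its
 * page. Returns NULL if no grant reference or buffer is available.
 */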
2801 static xnf_buf_t *
2802 xnf_buf_get(xnf_t *xnfp, int flags, boolean_t readonly)
2803 {
2804 	grant_ref_t gref;
2805 	xnf_buf_t *bufp;
2806 
2807 	/*
2808 	 * Usually grant references are more scarce than memory, so we
2809 	 * attempt to acquire a grant reference first.
2810 	 */
2811 	gref = xnf_gref_get(xnfp);
2812 	if (gref == INVALID_GRANT_REF)
2813 		return (NULL);
2814 
2815 	bufp = kmem_cache_alloc(xnfp->xnf_buf_cache, flags);
2816 	if (bufp == NULL) {
2817 		xnf_gref_put(xnfp, gref);
2818 		return (NULL);
2819 	}
2820 
2821 	ASSERT3U(bufp->grant_ref, ==, INVALID_GRANT_REF);
2822 
2823 	bufp->grant_ref = gref;
2824 
2825 	if (bufp->gen != xnfp->xnf_gen)
2826 		xnf_buf_refresh(bufp);
2827 
2828 	gnttab_grant_foreign_access_ref(bufp->grant_ref,
2829 	    xvdi_get_oeid(bufp->xnfp->xnf_devinfo),
2830 	    bufp->buf_mfn, readonly ? 1 : 0);
2831 
2832 	atomic_inc_64(&xnfp->xnf_stat_buf_outstanding);
2833 
2834 	return (bufp);
2835 }
2836 
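/*
 * Revoke the backend's access to a buffer and return it to the cache.
 */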
2837 static void
2838 xnf_buf_put(xnf_t *xnfp, xnf_buf_t *bufp, boolean_t readonly)
2839 {
2840 	if (bufp->grant_ref != INVALID_GRANT_REF) {
2841 		(void) gnttab_end_foreign_access_ref(
2842 		    bufp->grant_ref, readonly ? 1 : 0);
2843 		xnf_gref_put(xnfp, bufp->grant_ref);
2844 		bufp->grant_ref = INVALID_GRANT_REF;
2845 	}
2846 
2847 	kmem_cache_free(xnfp->xnf_buf_cache, bufp);
2848 
2849 	atomic_dec_64(&xnfp->xnf_stat_buf_outstanding);
2850 }
2851 
2852 /*
2853  * Refresh any cached data about a buffer after resume.
2854  */
2855 static void
2856 xnf_buf_refresh(xnf_buf_t *bdesc)
2857 {
2858 	bdesc->buf_mfn = pfn_to_mfn(xnf_btop(bdesc->buf_phys));
2859 	bdesc->gen = bdesc->xnfp->xnf_gen;
2860 }
2861 
2862 /*
2863  * Streams `freeb' routine for `xnf_buf_t' when used as transmit
2864  * look-aside buffers.
2865  */
2866 static void
2867 xnf_buf_recycle(xnf_buf_t *bdesc)
2868 {
2869 	xnf_t *xnfp = bdesc->xnfp;
2870 
2871 	xnf_buf_put(xnfp, bdesc, B_TRUE);
2872 }
2873 
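/*
 * kmem cache constructor for xnf_txbuf_t: pre-allocate the DMA handle
 * used to map mblk data for transmission.
 */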
2874 static int
2875 xnf_tx_buf_constructor(void *buf, void *arg, int kmflag)
2876 {
2877 	int (*ddiflags)(caddr_t) = DDI_DMA_SLEEP;
2878 	xnf_txbuf_t *txp = buf;
2879 	xnf_t *xnfp = arg;
2880 
2881 	if (kmflag & KM_NOSLEEP)
2882 		ddiflags = DDI_DMA_DONTWAIT;
2883 
2884 	if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buf_dma_attr,
2885 	    ddiflags, 0, &txp->tx_dma_handle) != DDI_SUCCESS) {
2886 		ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
2887 		return (-1);
2888 	}
2889 
2890 	return (0);
2891 }
2892 
2893 static void
2894 xnf_tx_buf_destructor(void *buf, void *arg)
2895 {
2896 	_NOTE(ARGUNUSED(arg));
2897 	xnf_txbuf_t *txp = buf;
2898 
2899 	ddi_dma_free_handle(&txp->tx_dma_handle);
2900 }
2901 
2902 /*
2903  * Statistics.
2904  */
2905 static char *xnf_aux_statistics[] = {
2906 	"tx_cksum_deferred",
2907 	"rx_cksum_no_need",
2908 	"interrupts",
2909 	"unclaimed_interrupts",
2910 	"tx_pullup",
2911 	"tx_lookaside",
2912 	"tx_drop",
2913 	"tx_eth_hdr_split",
2914 	"buf_allocated",
2915 	"buf_outstanding",
2916 	"gref_outstanding",
2917 	"gref_failure",
2918 	"gref_peak",
2919 	"rx_allocb_fail",
2920 	"rx_desballoc_fail",
2921 };
2922 
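/*
 * kstat update routine for the auxiliary statistics listed above.
 */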
2923 static int
2924 xnf_kstat_aux_update(kstat_t *ksp, int flag)
2925 {
2926 	xnf_t *xnfp;
2927 	kstat_named_t *knp;
2928 
2929 	if (flag != KSTAT_READ)
2930 		return (EACCES);
2931 
2932 	xnfp = ksp->ks_private;
2933 	knp = ksp->ks_data;
2934 
2935 	/*
2936 	 * Assignment order must match that of the names in
2937 	 * xnf_aux_statistics.
2938 	 */
2939 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_cksum_deferred;
2940 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_cksum_no_need;
2941 
2942 	(knp++)->value.ui64 = xnfp->xnf_stat_interrupts;
2943 	(knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts;
2944 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup;
2945 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_lookaside;
2946 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_drop;
2947 	(knp++)->value.ui64 = xnfp->xnf_stat_tx_eth_hdr_split;
2948 
2949 	(knp++)->value.ui64 = xnfp->xnf_stat_buf_allocated;
2950 	(knp++)->value.ui64 = xnfp->xnf_stat_buf_outstanding;
2951 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_outstanding;
2952 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_failure;
2953 	(knp++)->value.ui64 = xnfp->xnf_stat_gref_peak;
2954 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_allocb_fail;
2955 	(knp++)->value.ui64 = xnfp->xnf_stat_rx_desballoc_fail;
2956 
2957 	return (0);
2958 }
2959 
2960 static boolean_t
2961 xnf_kstat_init(xnf_t *xnfp)
2962 {
2963 	int nstat = sizeof (xnf_aux_statistics) /
2964 	    sizeof (xnf_aux_statistics[0]);
2965 	char **cp = xnf_aux_statistics;
2966 	kstat_named_t *knp;
2967 
2968 	/*
2969 	 * Create and initialise kstats.
2970 	 */
2971 	if ((xnfp->xnf_kstat_aux = kstat_create("xnf",
2972 	    ddi_get_instance(xnfp->xnf_devinfo),
2973 	    "aux_statistics", "net", KSTAT_TYPE_NAMED,
2974 	    nstat, 0)) == NULL)
2975 		return (B_FALSE);
2976 
2977 	xnfp->xnf_kstat_aux->ks_private = xnfp;
2978 	xnfp->xnf_kstat_aux->ks_update = xnf_kstat_aux_update;
2979 
2980 	knp = xnfp->xnf_kstat_aux->ks_data;
2981 	while (nstat > 0) {
2982 		kstat_named_init(knp, *cp, KSTAT_DATA_UINT64);
2983 
2984 		knp++;
2985 		cp++;
2986 		nstat--;
2987 	}
2988 
2989 	kstat_install(xnfp->xnf_kstat_aux);
2990 
2991 	return (B_TRUE);
2992 }
2993 
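/*
 * Report interface statistics to the MAC layer.
 */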
2994 static int
2995 xnf_stat(void *arg, uint_t stat, uint64_t *val)
2996 {
2997 	xnf_t *xnfp = arg;
2998 
2999 	mutex_enter(&xnfp->xnf_rxlock);
3000 	mutex_enter(&xnfp->xnf_txlock);
3001 
3002 #define	mac_stat(q, r)				\
3003 	case (MAC_STAT_##q):			\
3004 		*val = xnfp->xnf_stat_##r;	\
3005 		break
3006 
3007 #define	ether_stat(q, r)			\
3008 	case (ETHER_STAT_##q):			\
3009 		*val = xnfp->xnf_stat_##r;	\
3010 		break
3011 
3012 	switch (stat) {
3013 
3014 	mac_stat(IPACKETS, ipackets);
3015 	mac_stat(OPACKETS, opackets);
3016 	mac_stat(RBYTES, rbytes);
3017 	mac_stat(OBYTES, obytes);
3018 	mac_stat(NORCVBUF, norxbuf);
3019 	mac_stat(IERRORS, errrx);
3020 	mac_stat(NOXMTBUF, tx_defer);
3021 
3022 	ether_stat(MACRCV_ERRORS, mac_rcv_error);
3023 	ether_stat(TOOSHORT_ERRORS, runt);
3024 
3025 	/* always claim to be in full duplex mode */
3026 	case ETHER_STAT_LINK_DUPLEX:
3027 		*val = LINK_DUPLEX_FULL;
3028 		break;
3029 
3030 	/* always claim to be at 1Gb/s link speed */
3031 	case MAC_STAT_IFSPEED:
3032 		*val = 1000000000ull;
3033 		break;
3034 
3035 	default:
3036 		mutex_exit(&xnfp->xnf_txlock);
3037 		mutex_exit(&xnfp->xnf_rxlock);
3038 
3039 		return (ENOTSUP);
3040 	}
3041 
3042 #undef mac_stat
3043 #undef ether_stat
3044 
3045 	mutex_exit(&xnfp->xnf_txlock);
3046 	mutex_exit(&xnfp->xnf_rxlock);
3047 
3048 	return (0);
3049 }
3050 
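/*
 * Validate a requested MTU change and inform the MAC layer of the new
 * maximum SDU. MTUs above ETHERMTU require scatter-gather support in
 * both the driver settings and the backend.
 */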
3051 static int
3052 xnf_change_mtu(xnf_t *xnfp, uint32_t mtu)
3053 {
3054 	if (mtu > ETHERMTU) {
3055 		if (!xnf_enable_tx_sg) {
3056 			dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3057 			    "because scatter-gather is disabled for transmit "
3058 			    "in driver settings", ETHERMTU);
3059 			return (EINVAL);
3060 		} else if (!xnf_enable_rx_sg) {
3061 			dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3062 			    "because scatter-gather is disabled for receive "
3063 			    "in driver settings", ETHERMTU);
3064 			return (EINVAL);
3065 		} else if (!xnfp->xnf_be_tx_sg) {
3066 			dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
3067 			    "because backend doesn't support scatter-gather",
3068 			    ETHERMTU);
3069 			return (EINVAL);
3070 		}
3071 		if (mtu > XNF_MAXPKT)
3072 			return (EINVAL);
3073 	}
3074 	int error = mac_maxsdu_update(xnfp->xnf_mh, mtu);
3075 	if (error == 0)
3076 		xnfp->xnf_mtu = mtu;
3077 
3078 	return (error);
3079 }
3080 
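/*
 * MAC property callbacks. Only MAC_PROP_MTU is currently supported.
 */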
3081 /*ARGSUSED*/
3082 static int
3083 xnf_getprop(void *data, const char *prop_name, mac_prop_id_t prop_id,
3084     uint_t prop_val_size, void *prop_val)
3085 {
3086 	xnf_t *xnfp = data;
3087 
3088 	switch (prop_id) {
3089 	case MAC_PROP_MTU:
3090 		ASSERT(prop_val_size >= sizeof (uint32_t));
3091 		bcopy(&xnfp->xnf_mtu, prop_val, sizeof (uint32_t));
3092 		break;
3093 	default:
3094 		return (ENOTSUP);
3095 	}
3096 	return (0);
3097 }
3098 
3099 /*ARGSUSED*/
3100 static int
3101 xnf_setprop(void *data, const char *prop_name, mac_prop_id_t prop_id,
3102     uint_t prop_val_size, const void *prop_val)
3103 {
3104 	xnf_t *xnfp = data;
3105 	uint32_t new_mtu;
3106 	int error;
3107 
3108 	switch (prop_id) {
3109 	case MAC_PROP_MTU:
3110 		ASSERT(prop_val_size >= sizeof (uint32_t));
3111 		bcopy(prop_val, &new_mtu, sizeof (new_mtu));
3112 		error = xnf_change_mtu(xnfp, new_mtu);
3113 		break;
3114 	default:
3115 		return (ENOTSUP);
3116 	}
3117 
3118 	return (error);
3119 }
3120 
3121 /*ARGSUSED*/
3122 static void
3123 xnf_propinfo(void *data, const char *prop_name, mac_prop_id_t prop_id,
3124     mac_prop_info_handle_t prop_handle)
3125 {
3126 	switch (prop_id) {
3127 	case MAC_PROP_MTU:
3128 		mac_prop_info_set_range_uint32(prop_handle, 0, XNF_MAXPKT);
3129 		break;
3130 	default:
3131 		break;
3132 	}
3133 }
3134 
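/*
 * Report hardware offload capabilities (checksum and LSO) to the MAC
 * layer. LSO is only advertised when the backend supports it.
 */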
3135 static boolean_t
3136 xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
3137 {
3138 	xnf_t *xnfp = arg;
3139 
3140 	switch (cap) {
3141 	case MAC_CAPAB_HCKSUM: {
3142 		uint32_t *capab = cap_data;
3143 
3144 		/*
3145 		 * Whilst the flag used to communicate with the IO
3146 		 * domain is called "NETTXF_csum_blank", the checksum
3147 		 * in the packet must contain the pseudo-header
3148 		 * checksum and not zero.
3149 		 *
3150 		 * To help out the IO domain, we might use
3151 		 * HCKSUM_INET_PARTIAL. Unfortunately our stack will
3152 		 * then use checksum offload for IPv6 packets, which
3153 		 * the IO domain can't handle.
3154 		 *
3155 		 * As a result, we declare ourselves capable of
3156 		 * HCKSUM_INET_FULL_V4. This means that we receive
3157 		 * IPv4 packets from the stack with a blank checksum
3158 		 * field and must insert the pseudo-header checksum
3159 		 * before passing the packet to the IO domain.
3160 		 */
3161 		*capab = HCKSUM_INET_FULL_V4;
3162 
3163 		/*
3164 		 * TODO: query the "feature-ipv6-csum-offload" capability.
3165 		 * If enabled, that could allow us to use HCKSUM_INET_PARTIAL.
3166 		 */
3167 
3168 		break;
3169 	}
3170 	case MAC_CAPAB_LSO: {
3171 		if (!xnfp->xnf_be_lso)
3172 			return (B_FALSE);
3173 
3174 		mac_capab_lso_t *lso = cap_data;
3175 		lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
3176 		lso->lso_basic_tcp_ipv4.lso_max = IP_MAXPACKET;
3177 		break;
3178 	}
3179 	default:
3180 		return (B_FALSE);
3181 	}
3182 
3183 	return (B_TRUE);
3184 }
3185 
3186 /*
3187  * The state of the peer has changed - react accordingly.
3188  */
3189 static void
3190 oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
3191     void *arg, void *impl_data)
3192 {
3193 	_NOTE(ARGUNUSED(id, arg));
3194 	xnf_t *xnfp = ddi_get_driver_private(dip);
3195 	XenbusState new_state = *(XenbusState *)impl_data;
3196 
3197 	ASSERT(xnfp != NULL);
3198 
3199 	switch (new_state) {
3200 	case XenbusStateUnknown:
3201 	case XenbusStateInitialising:
3202 	case XenbusStateInitialised:
3203 	case XenbusStateClosing:
3204 	case XenbusStateClosed:
3205 	case XenbusStateReconfiguring:
3206 	case XenbusStateReconfigured:
3207 		break;
3208 
3209 	case XenbusStateInitWait:
3210 		xnf_read_config(xnfp);
3211 
3212 		if (!xnfp->xnf_be_rx_copy) {
3213 			cmn_err(CE_WARN,
3214 			    "The xnf driver requires a dom0 that "
3215 			    "supports 'feature-rx-copy'.");
3216 			(void) xvdi_switch_state(xnfp->xnf_devinfo,
3217 			    XBT_NULL, XenbusStateClosed);
3218 			break;
3219 		}
3220 
3221 		/*
3222 		 * Connect to the backend.
3223 		 */
3224 		xnf_be_connect(xnfp);
3225 
3226 		/*
3227 		 * Our MAC address as discovered by xnf_read_config().
3228 		 */
3229 		mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
3230 
3231 		/*
3232 		 * We do not know if some features such as LSO are supported
3233 		 * until we connect to the backend. We request the MAC layer
3234 		 * to poll our capabilities again.
3235 		 */
3236 		mac_capab_update(xnfp->xnf_mh);
3237 
3238 		break;
3239 
3240 	case XenbusStateConnected:
3241 		mutex_enter(&xnfp->xnf_rxlock);
3242 		mutex_enter(&xnfp->xnf_txlock);
3243 
3244 		xnfp->xnf_connected = B_TRUE;
3245 		/*
3246 		 * Wake up any threads waiting to send data to
3247 		 * backend.
3248 		 */
3249 		cv_broadcast(&xnfp->xnf_cv_state);
3250 
3251 		mutex_exit(&xnfp->xnf_txlock);
3252 		mutex_exit(&xnfp->xnf_rxlock);
3253 
3254 		/*
3255 		 * Kick the peer in case it missed any transmit
3256 		 * requests in the TX ring.
3257 		 */
3258 		ec_notify_via_evtchn(xnfp->xnf_evtchn);
3259 
3260 		/*
3261 		 * There may already be completed receive requests in
3262 		 * the ring sent by backend after it gets connected
3263 		 * but before we see its state change here, so we call
3264 		 * xnf_intr() to handle them, if any.
3265 		 */
3266 		(void) xnf_intr((caddr_t)xnfp);
3267 
3268 		/*
3269 		 * Mark the link up now that we are connected.
3270 		 */
3271 		mac_link_update(xnfp->xnf_mh, LINK_STATE_UP);
3272 
3273 		/*
3274 		 * Tell the backend about the multicast addresses in
3275 		 * which we are interested.
3276 		 */
3277 		mac_multicast_refresh(xnfp->xnf_mh, NULL, xnfp, B_TRUE);
3278 
3279 		break;
3280 
3281 	default:
3282 		break;
3283 	}
3284 }
3285