xref: /illumos-gate/usr/src/uts/common/io/igc/igc_ring.c (revision 533affcbc7fc4d0c8132976ea454aaa715fe2307)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2024 Oxide Computer Company
14  */
15 
16 /*
17  * igc ring related functions. This is where the bulk of our I/O occurs.
18  */
19 
20 #include <sys/stddef.h>
21 #include <sys/strsubr.h>
22 #include <sys/strsun.h>
23 #include <sys/sysmacros.h>
24 #include <sys/sdt.h>
25 
26 #include "igc.h"
27 
28 /*
29  * Structure used to consolidate TX information about a given packet.
30  */
31 typedef struct igc_tx_state {
32 	list_t itx_bufs;
33 	mac_ether_offload_info_t itx_meoi;
34 	uint32_t itx_cksum;
35 	uint32_t itx_mss;
36 	uint32_t itx_lso;
37 	igc_tx_buffer_t *itx_cur_buf;
38 	size_t itx_buf_rem;
39 	mblk_t *itx_free_mp;
40 	uint32_t itx_ndescs;
41 } igc_tx_state_t;
42 
43 /*
44  * DMA attributes that are used for descriptor rings.
45  */
46 static const ddi_dma_attr_t igc_desc_dma_attr = {
47 	.dma_attr_version = DMA_ATTR_V0,
48 	/*
49 	 * DMA descriptor rings can show up anywhere in the address space. The
50 	 * card supports a 64-bit address for this.
51 	 */
52 	.dma_attr_addr_lo = 0,
53 	.dma_attr_addr_hi = UINT64_MAX,
54 	/*
55 	 * The I210 datasheet says that the ring descriptor length can support
56 	 * at most 32K entries that are each 16 bytes long. Hence the following
57 	 * max.
58 	 */
59 	.dma_attr_count_max = 0x80000,
60 	/*
61 	 * The I210 datasheet, which is the closest we have for the I225,
62 	 * requires 128 byte alignment for rings. Note, igb and e1000g default
63 	 * to a 4KiB alignment here.
64 	 */
65 	.dma_attr_align = 0x80,
66 	/*
67 	 * Borrowed from igb(4D).
68 	 */
69 	.dma_attr_burstsizes = 0xfff,
70 	/*
71 	 * We set the minimum and maximum based upon what the RDLEN/TDLEN
72 	 * register will actually support.
73 	 */
74 	.dma_attr_minxfer = 0x80,
75 	.dma_attr_maxxfer = 0x80000,
76 	/*
77 	 * The descriptor ring must be physically contiguous, hence an sgllen
78 	 * of one. Given that, there are no meaningful boundary crossing
79 	 * constraints, so dma_attr_seg is left wide open.
80 	 */
81 	.dma_attr_seg = UINT64_MAX,
82 	.dma_attr_sgllen = 1,
83 	/*
84 	 * For descriptor rings, hardware asks for the size in 128 byte chunks,
85 	 * so we set that here again.
86 	 */
87 	.dma_attr_granular = 0x80,
88 	.dma_attr_flags = 0
89 };
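/*
 * Illustrative sketch, not part of the driver: the 128 byte alignment and
 * granularity above line up with how the RDLEN/TDLEN registers are programmed
 * later on. With a hypothetical ring of 256 descriptors (IGC_EXAMPLE_NDESC is
 * a made-up name), the ring occupies 256 * 16 = 4096 bytes, which is a
 * multiple of 128 and therefore a value those registers will accept.
 */
#define	IGC_EXAMPLE_NDESC	256
CTASSERT((IGC_EXAMPLE_NDESC * sizeof (union igc_adv_rx_desc)) % 0x80 == 0);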
90 
91 /*
92  * DMA attributes that cover pre-allocated data buffers. Note, RX buffers are
93  * slightly more constrained than TX buffers because the RX buffer addr[0] can
94  * sometimes be used as a no snoop enable bit. Therefore we purposefully avoid
95  * that in our allocations here to allow for use of that in the future if
96  * desired.
97  */
98 static const ddi_dma_attr_t igc_data_dma_attr = {
99 	.dma_attr_version = DMA_ATTR_V0,
100 	/*
101 	 * Packet data can go anywhere in memory.
102 	 */
103 	.dma_attr_addr_lo = 0,
104 	.dma_attr_addr_hi = UINT64_MAX,
105 	/*
106 	 * The SRRCTL register caps an RX packet at 127 KiB. For TX, the
107 	 * maximum value is a 16-bit quantity because that is the width of the
108 	 * length field in a TX descriptor, so we cap it at that.
109 	 */
110 	.dma_attr_count_max = UINT16_MAX,
111 	/*
112 	 * The hardware strictly requires only 2 byte alignment for RX buffers
113 	 * (in case no snoop is enabled) and places no such constraint on TX.
114 	 * We increase this to a request for 16 byte alignment so that we can
115 	 * guarantee the IP header alignment and offsetting that needs to
116 	 * happen for all RX buffers.
117 	 */
118 	.dma_attr_align = 0x10,
119 	/*
120 	 * We're not constrained here at least via PCIe, so we use the wider
121 	 * setting here. Similarly to the ring descriptors we just set the
122 	 * granularity widely.
123 	 */
124 	.dma_attr_minxfer = 0x1,
125 	.dma_attr_maxxfer = UINT32_MAX,
126 	.dma_attr_seg = UINT64_MAX,
127 	/*
128 	 * The hardware allows for arbitrary chaining of descriptors; however,
129 	 * we want to move to a world where we are allocating page sized buffers
130 	 * at most and therefore constrain the number of cookies for these
131 	 * buffers. Transmit caps the buffer allocation size at the page size,
132 	 * but receive does not today. We set the granularity to 1 to reflect
133 	 * the device's flexibility.
134 	 */
135 	.dma_attr_sgllen = 1,
136 	.dma_attr_granular = 1,
137 	.dma_attr_flags = 0
138 };
139 
140 /*
141  * These are the DMA attributes we use when performing DMA TX binding for an
142  * mblk_t.
143  */
144 static const ddi_dma_attr_t igc_tx_dma_attr = {
145 	.dma_attr_version = DMA_ATTR_V0,
146 	/*
147 	 * Packet data can go anywhere in memory.
148 	 */
149 	.dma_attr_addr_lo = 0,
150 	.dma_attr_addr_hi = UINT64_MAX,
151 	/*
152 	 * For TX, the maximum value is a 16-bit quantity because that is the
153 	 * width of the length field in a TX descriptor.
154 	 */
155 	.dma_attr_count_max = UINT16_MAX,
156 	/*
157 	 * TX data can go anywhere, but we ask for 16 byte alignment just to
158 	 * keep things somewhat aligned in the system.
159 	 */
160 	.dma_attr_align = 0x10,
161 	/*
162 	 * We're not constrained here at least via PCIe, so we use the wider
163 	 * setting here. Similarly to the ring descriptors we just set the
164 	 * granularity widely.
165 	 */
166 	.dma_attr_minxfer = 0x1,
167 	.dma_attr_maxxfer = UINT32_MAX,
168 	.dma_attr_seg = UINT64_MAX,
169 	/*
170 	 * We size our transmit cookies so that the maximum sized LSO packet can
171 	 * go through here.
172 	 */
173 	.dma_attr_sgllen = IGC_MAX_TX_COOKIES,
174 	.dma_attr_granular = 1,
175 	.dma_attr_flags = 0
176 
177 };
178 
179 /*
180  * All of these wrappers exist so that we only have one place to hook in
181  * FMA-related settings in the future.
182  */
183 static void
184 igc_dma_acc_attr(igc_t *igc, ddi_device_acc_attr_t *accp)
185 {
186 	bzero(accp, sizeof (ddi_device_acc_attr_t));
187 
188 	accp->devacc_attr_version = DDI_DEVICE_ATTR_V1;
189 	accp->devacc_attr_endian_flags = DDI_NEVERSWAP_ACC;
190 	accp->devacc_attr_dataorder = DDI_STRICTORDER_ACC;
191 	accp->devacc_attr_access = DDI_DEFAULT_ACC;
192 }
193 
194 static void
195 igc_dma_desc_attr(igc_t *igc, ddi_dma_attr_t *attrp)
196 {
197 	bcopy(&igc_desc_dma_attr, attrp, sizeof (ddi_dma_attr_t));
198 }
199 
200 static void
201 igc_dma_data_attr(igc_t *igc, ddi_dma_attr_t *attrp)
202 {
203 	bcopy(&igc_data_dma_attr, attrp, sizeof (ddi_dma_attr_t));
204 }
205 
206 static void
207 igc_dma_tx_attr(igc_t *igc, ddi_dma_attr_t *attrp)
208 {
209 	bcopy(&igc_tx_dma_attr, attrp, sizeof (ddi_dma_attr_t));
210 }
211 
212 static void
213 igc_dma_free(igc_dma_buffer_t *idb)
214 {
215 	/* Proxy for DMA handle bound */
216 	if (idb->idb_size != 0) {
217 		(void) ddi_dma_unbind_handle(idb->idb_hdl);
218 		idb->idb_size = 0;
219 	}
220 
221 	if (idb->idb_acc != NULL) {
222 		ddi_dma_mem_free(&idb->idb_acc);
223 		idb->idb_acc = NULL;
224 		idb->idb_va = NULL;
225 		idb->idb_alloc_len = 0;
226 	}
227 
228 	if (idb->idb_hdl != NULL) {
229 		ddi_dma_free_handle(&idb->idb_hdl);
230 		idb->idb_hdl = NULL;
231 	}
232 
233 	ASSERT0(idb->idb_size);
234 	ASSERT0(idb->idb_alloc_len);
235 	ASSERT3P(idb->idb_acc, ==, NULL);
236 	ASSERT3P(idb->idb_hdl, ==, NULL);
237 	ASSERT3P(idb->idb_va, ==, NULL);
238 }
239 
240 static bool
241 igc_dma_alloc(igc_t *igc, igc_dma_buffer_t *idb, ddi_dma_attr_t *attrp,
242     size_t size)
243 {
244 	int ret;
245 	ddi_device_acc_attr_t acc;
246 	uint_t flags = DDI_DMA_STREAMING;
247 
248 	bzero(idb, sizeof (igc_dma_buffer_t));
249 	ret = ddi_dma_alloc_handle(igc->igc_dip, attrp, DDI_DMA_DONTWAIT, NULL,
250 	    &idb->idb_hdl);
251 	if (ret != DDI_SUCCESS) {
252 		dev_err(igc->igc_dip, CE_WARN, "!failed to allocate DMA "
253 		    "handle: %d", ret);
254 		return (false);
255 	}
256 
257 	igc_dma_acc_attr(igc, &acc);
258 	ret = ddi_dma_mem_alloc(idb->idb_hdl, size, &acc, flags,
259 	    DDI_DMA_DONTWAIT, NULL, &idb->idb_va, &idb->idb_alloc_len,
260 	    &idb->idb_acc);
261 	if (ret != DDI_SUCCESS) {
262 		dev_err(igc->igc_dip, CE_WARN, "!failed to allocate %lu bytes "
263 		    "of DMA memory: %d", size, ret);
264 		igc_dma_free(idb);
265 		return (false);
266 	}
267 
268 	bzero(idb->idb_va, idb->idb_alloc_len);
269 	ret = ddi_dma_addr_bind_handle(idb->idb_hdl, NULL, idb->idb_va,
270 	    idb->idb_alloc_len, DDI_DMA_RDWR | flags, DDI_DMA_DONTWAIT, NULL,
271 	    NULL, NULL);
272 	if (ret != DDI_SUCCESS) {
273 		dev_err(igc->igc_dip, CE_WARN, "!failed to bind %lu bytes of "
274 		    "DMA memory: %d", idb->idb_alloc_len, ret);
275 		igc_dma_free(idb);
276 		return (false);
277 	}
278 
279 	idb->idb_size = size;
280 	return (true);
281 }
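/*
 * Illustrative sketch, not part of the driver: igc_dma_alloc() and
 * igc_dma_free() are always used as a pair, with the attribute template chosen
 * to match the use. The hypothetical function below only shows the calling
 * pattern that the real ring allocation code further down follows.
 */
static bool
igc_dma_example(igc_t *igc, size_t ndesc)
{
	igc_dma_buffer_t idb;
	ddi_dma_attr_t attr;

	igc_dma_desc_attr(igc, &attr);
	if (!igc_dma_alloc(igc, &idb, &attr,
	    ndesc * sizeof (union igc_adv_rx_desc))) {
		return (false);
	}

	/* ... use idb.idb_va and the handle's single bound cookie ... */

	igc_dma_free(&idb);
	return (true);
}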
282 
283 static void
284 igc_rx_recycle(caddr_t arg)
285 {
286 	igc_rx_buffer_t *buf = (igc_rx_buffer_t *)arg;
287 	igc_rx_ring_t *ring = buf->irb_ring;
288 	caddr_t mblk_va;
289 	size_t mblk_len;
290 
291 	/*
292 	 * The mblk is free regardless of what happens next, so make sure we
293 	 * clean up.
294 	 */
295 	buf->irb_mp = NULL;
296 
297 	/*
298 	 * The mblk_t is pre-created ahead of binding. If the loaned flag is
299 	 * not set, then this buffer is being torn down as part of tearing down
300 	 * the device rather than being returned from the rest of the stack,
301 	 * so there's nothing else to do.
302 	 */
303 	if (!buf->irb_loaned) {
304 		return;
305 	}
306 
307 	/*
308 	 * Ensure we mark this buffer as no longer loaned and then insert it
309 	 * onto the free list.
310 	 */
311 	buf->irb_loaned = false;
312 
313 	/*
314 	 * Create a new mblk and insert it on the free list.
315 	 */
316 	mblk_va = buf->irb_dma.idb_va + IGC_RX_BUF_IP_ALIGN;
317 	mblk_len = buf->irb_dma.idb_size - IGC_RX_BUF_IP_ALIGN;
318 	buf->irb_mp = desballoc((uchar_t *)mblk_va, mblk_len, 0,
319 	    &buf->irb_free_rtn);
320 
321 	mutex_enter(&ring->irr_free_lock);
322 	ring->irr_free_list[ring->irr_nfree] = buf;
323 	ring->irr_nfree++;
324 #ifdef	DEBUG
325 	igc_t *igc = ring->irr_igc;
326 	ASSERT3U(ring->irr_nfree, <=, igc->igc_rx_nfree);
327 #endif
328 	cv_signal(&ring->irr_free_cv);
329 	mutex_exit(&ring->irr_free_lock);
330 }
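/*
 * Illustrative sketch, not part of the driver: the recycle callback above is
 * the standard desballoc()/frtn_t loaning pattern. Stripped of the driver's
 * free list and loaned-state tracking, and using a made-up buffer type, the
 * core of the pattern looks like this:
 */
typedef struct example_loan_buf {
	frtn_t	elb_frtn;
	uchar_t	elb_data[2048];
} example_loan_buf_t;

static mblk_t *
example_loan_buf_mblk(example_loan_buf_t *elb, void (*cb)(caddr_t))
{
	/* cb fires when the stack eventually frees the loaned mblk_t. */
	elb->elb_frtn.free_func = cb;
	elb->elb_frtn.free_arg = (caddr_t)elb;
	return (desballoc(elb->elb_data, sizeof (elb->elb_data), 0,
	    &elb->elb_frtn));
}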
331 
332 static void
333 igc_rx_bufs_free(igc_t *igc, igc_rx_ring_t *ring)
334 {
335 	for (uint32_t i = 0; i < igc->igc_rx_nbuf; i++) {
336 		igc_rx_buffer_t *buf = &ring->irr_arena[i];
337 
338 		ASSERT3U(buf->irb_loaned, ==, false);
339 		freemsg(buf->irb_mp);
340 		buf->irb_mp = NULL;
341 		igc_dma_free(&buf->irb_dma);
342 	}
343 }
344 
345 static bool
346 igc_rx_bufs_alloc(igc_t *igc, igc_rx_ring_t *ring)
347 {
348 	for (uint32_t i = 0; i < igc->igc_rx_nbuf; i++) {
349 		igc_rx_buffer_t *buf = &ring->irr_arena[i];
350 		ddi_dma_attr_t attr;
351 		caddr_t mblk_va;
352 		size_t mblk_len;
353 
354 		buf->irb_ring = ring;
355 		igc_dma_data_attr(igc, &attr);
356 		if (!igc_dma_alloc(igc, &buf->irb_dma, &attr,
357 		    igc->igc_rx_buf_size)) {
358 			dev_err(igc->igc_dip, CE_WARN, "!failed to allocate RX "
359 			    "ring %u buffer %u", ring->irr_idx, i);
360 			return (false);
361 		}
362 
363 		buf->irb_free_rtn.free_func = igc_rx_recycle;
364 		buf->irb_free_rtn.free_arg = (caddr_t)buf;
365 
366 		/*
367 		 * We ignore whether or not this succeeds because we already
368 		 * have to handle the case of buffers without an mblk_t due to
369 		 * loaning and similar situations.
370 		 */
371 		mblk_va = buf->irb_dma.idb_va + IGC_RX_BUF_IP_ALIGN;
372 		mblk_len = buf->irb_dma.idb_size - IGC_RX_BUF_IP_ALIGN;
373 		buf->irb_mp = desballoc((uchar_t *)mblk_va, mblk_len, 0,
374 		    &buf->irb_free_rtn);
375 
376 		if (i < igc->igc_rx_ndesc) {
377 			ring->irr_work_list[i] = buf;
378 		} else {
379 			ring->irr_free_list[ring->irr_nfree] = buf;
380 			ring->irr_nfree++;
381 		}
382 	}
383 
384 	return (true);
385 }
386 
387 void
388 igc_rx_data_free(igc_t *igc)
389 {
390 	for (uint32_t i = 0; i < igc->igc_nrx_rings; i++) {
391 		igc_rx_ring_t *ring = &igc->igc_rx_rings[i];
392 
393 		if (ring->irr_arena != NULL) {
394 			igc_rx_bufs_free(igc, ring);
395 			kmem_free(ring->irr_arena, sizeof (igc_rx_buffer_t) *
396 			    igc->igc_rx_nbuf);
397 			ring->irr_arena = NULL;
398 		}
399 
400 		if (ring->irr_free_list != NULL) {
401 			kmem_free(ring->irr_free_list, igc->igc_rx_nfree *
402 			    sizeof (igc_rx_buffer_t *));
403 			ring->irr_free_list = NULL;
404 		}
405 
406 		if (ring->irr_work_list != NULL) {
407 			kmem_free(ring->irr_work_list, igc->igc_rx_ndesc *
408 			    sizeof (igc_rx_buffer_t *));
409 			ring->irr_work_list = NULL;
410 		}
411 
412 		if (ring->irr_ring != NULL) {
413 			igc_dma_free(&ring->irr_desc_dma);
414 			ring->irr_ring = NULL;
415 			ring->irr_next = 0;
416 		}
417 	}
418 }
419 
420 bool
421 igc_rx_data_alloc(igc_t *igc)
422 {
423 	for (uint32_t i = 0; i < igc->igc_nrx_rings; i++) {
424 		igc_rx_ring_t *ring = &igc->igc_rx_rings[i];
425 		ddi_dma_attr_t desc_attr;
426 		size_t desc_len;
427 
428 		igc_dma_desc_attr(igc, &desc_attr);
429 		desc_len = sizeof (union igc_adv_rx_desc) *
430 		    igc->igc_rx_ndesc;
431 		if (!igc_dma_alloc(igc, &ring->irr_desc_dma, &desc_attr,
432 		    desc_len)) {
433 			dev_err(igc->igc_dip, CE_WARN, "!failed to allocate "
434 			    "RX descriptor ring %u", i);
435 			goto cleanup;
436 		}
437 		ring->irr_ring = (void *)ring->irr_desc_dma.idb_va;
438 
439 		ring->irr_work_list = kmem_zalloc(sizeof (igc_rx_buffer_t *) *
440 		    igc->igc_rx_ndesc, KM_NOSLEEP);
441 		if (ring->irr_work_list == NULL) {
442 			dev_err(igc->igc_dip, CE_WARN, "!failed to allocate "
443 			    "RX descriptor ring %u rx work list", i);
444 			goto cleanup;
445 		}
446 
447 		ring->irr_free_list = kmem_zalloc(sizeof (igc_rx_buffer_t *) *
448 		    igc->igc_rx_nfree, KM_NOSLEEP);
449 		if (ring->irr_free_list == NULL) {
450 			dev_err(igc->igc_dip, CE_WARN, "!failed to allocate "
451 			    "RX descriptor ring %u rx free list", i);
452 			goto cleanup;
453 		}
454 
455 
456 		ring->irr_arena = kmem_zalloc(sizeof (igc_rx_buffer_t) *
457 		    igc->igc_rx_nbuf, KM_NOSLEEP);
458 		if (ring->irr_arena == NULL) {
459 			dev_err(igc->igc_dip, CE_WARN, "!failed to allocate "
460 			    "RX descriptor ring %u rx buf arena", i);
461 			goto cleanup;
462 		}
463 
464 		if (!igc_rx_bufs_alloc(igc, ring)) {
465 			goto cleanup;
466 		}
467 	}
468 
469 	return (true);
470 
471 cleanup:
472 	igc_rx_data_free(igc);
473 	return (false);
474 }
475 
476 /*
477  * Write / update a descriptor ring entry. This had been implemented in a few
478  * places, so this was intended as a consolidation of those.
479  */
480 static inline void
481 igc_rx_ring_desc_write(igc_rx_ring_t *ring, uint32_t idx)
482 {
483 	const ddi_dma_cookie_t *cookie;
484 	uint64_t addr;
485 	igc_dma_buffer_t *irb = &ring->irr_work_list[idx]->irb_dma;
486 
487 	cookie = ddi_dma_cookie_one(irb->idb_hdl);
488 	addr = cookie->dmac_laddress + IGC_RX_BUF_IP_ALIGN;
489 	ring->irr_ring[idx].read.pkt_addr = LE_64(addr);
490 	ring->irr_ring[idx].read.hdr_addr = LE_64(0);
491 }
492 
493 /*
494  * Fully initialize a receive ring. This involves:
495  *
496  *  - Doing an initial programming and sync of the descriptor ring
497  *  - Programming the base and length registers
498  *  - Programming the ring's buffer size and descriptor type
499  *  - Programming the queue's receive control register
500  */
501 static void
502 igc_rx_ring_hw_init(igc_t *igc, igc_rx_ring_t *ring)
503 {
504 	uint32_t val, high, low;
505 	const ddi_dma_cookie_t *desc;
506 
507 	for (uint32_t i = 0; i < igc->igc_rx_ndesc; i++) {
508 		igc_rx_ring_desc_write(ring, i);
509 	}
510 	IGC_DMA_SYNC(&ring->irr_desc_dma, DDI_DMA_SYNC_FORDEV);
511 
512 	/*
513 	 * Program the ring's address.
514 	 */
515 	desc = ddi_dma_cookie_one(ring->irr_desc_dma.idb_hdl);
516 	high = (uint32_t)(desc->dmac_laddress >> 32);
517 	low = (uint32_t)desc->dmac_laddress;
518 	igc_write32(igc, IGC_RDBAH(ring->irr_idx), high);
519 	igc_write32(igc, IGC_RDBAL(ring->irr_idx), low);
520 
521 	/*
522 	 * Program the ring length.
523 	 */
524 	val = igc->igc_rx_ndesc * sizeof (union igc_adv_rx_desc);
525 	igc_write32(igc, IGC_RDLEN(ring->irr_idx), val);
526 
527 	/*
528 	 * Program the descriptor type and buffer length.
529 	 */
530 	val = (igc->igc_rx_buf_size >> IGC_SRRCTL_BSIZEPKT_SHIFT) |
531 	    IGC_SRRCTL_DESCTYPE_ADV_ONEBUF;
532 	igc_write32(igc, IGC_SRRCTL(ring->irr_idx), val);
533 
534 	/*
535 	 * Program the ring control register itself. Note, we crib the threshold
536 	 * values directly from igb and didn't think much harder than that.
537 	 */
538 	val = igc_read32(igc, IGC_RXDCTL(ring->irr_idx));
539 	val &= IGC_RXDCTL_PRESERVE;
540 	val |= IGC_RXDCTL_QUEUE_ENABLE;
541 	val = IGC_RXDCTL_SET_PTHRESH(val, 16);
542 	val = IGC_RXDCTL_SET_HTHRESH(val, 8);
543 	val = IGC_RXDCTL_SET_WTHRESH(val, 1);
544 	igc_write32(igc, IGC_RXDCTL(ring->irr_idx), val);
545 }
546 
547 void
548 igc_rx_hw_init(igc_t *igc)
549 {
550 	uint32_t rctl, rxcsum;
551 
552 	/*
553 	 * Start by setting up the receive control register.
554 	 *
555 	 * We clear out any bits in the multicast shift portion. This'll leave
556 	 * it so [47:36] of the address are used as part of the look up. We also
557 	 * don't want to receive bad packets, so make sure that's cleared out.
558 	 * In addition, we clear out loopback mode.
559 	 */
560 	rctl = igc_read32(igc, IGC_RCTL);
561 	rctl &= ~(3 << IGC_RCTL_MO_SHIFT);
562 	rctl &= ~IGC_RCTL_SBP;
563 	rctl &= ~(IGC_RCTL_LBM_MAC | IGC_RCTL_LBM_TCVR);
564 
565 	/*
566 	 * Set things up such that we're enabled, we receive broadcast packets,
567 	 * and we allow for large packets. We leave the rx descriptor threshold
568 	 * at 2048 bytes and make sure to always strip the Ethernet CRC as mac
569 	 * doesn't want it.
570 	 */
571 	rctl |= IGC_RCTL_EN | IGC_RCTL_BAM | IGC_RCTL_LPE |
572 	    IGC_RCTL_RDMTS_HALF | IGC_RCTL_SECRC;
573 
574 	/*
575 	 * Set the multicast filter based on hardware.
576 	 */
577 	rctl |= igc->igc_hw.mac.mc_filter_type << IGC_RCTL_MO_SHIFT;
578 
579 	/*
580 	 * Make sure each ring is set up and its registers are programmed.
581 	 */
582 	for (uint32_t i = 0; i < igc->igc_nrx_rings; i++) {
583 		igc_rx_ring_hw_init(igc, &igc->igc_rx_rings[i]);
584 	}
585 
586 	/*
587 	 * As we always set LPE (large packet enable) in the receive control
588 	 * register, we must go through and explicitly update the maximum frame
589 	 * size.
590 	 */
591 	igc_write32(igc, IGC_RLPML, igc->igc_max_frame);
592 
593 	/*
594 	 * Explicitly enable IPv4 and TCP checksums. We leave PCSD set to zero
595 	 * for the moment as we're not enabling RSS, which is what would be
596 	 * required to get that. After this is where we would set up the VMDq
597 	 * mode and RSS if we supported multiple RX rings.
598 	 */
599 	rxcsum = IGC_RXCSUM_IPOFL | IGC_RXCSUM_TUOFL;
600 	igc_write32(igc, IGC_RXCSUM, rxcsum);
601 
602 	/*
603 	 * Finally, enable the receive unit.
604 	 */
605 	igc_write32(igc, IGC_RCTL, rctl);
606 
607 	/*
608 	 * Only after the receive unit is initialized can we actually set up the
609 	 * ring head and tail pointers.
610 	 */
611 	for (uint32_t i = 0; i < igc->igc_nrx_rings; i++) {
612 		igc_write32(igc, IGC_RDH(igc->igc_rx_rings[i].irr_idx), 0);
613 		igc_write32(igc, IGC_RDT(igc->igc_rx_rings[i].irr_idx),
614 		    igc->igc_rx_ndesc - 1);
615 	}
616 }
617 
618 static inline uint32_t
619 igc_next_desc(uint32_t cur, uint32_t count, uint32_t size)
620 {
621 	uint32_t out;
622 
623 	if (cur + count < size) {
624 		out = cur + count;
625 	} else {
626 		out = cur + count - size;
627 	}
628 
629 	return (out);
630 }
631 
632 static inline uint32_t
633 igc_prev_desc(uint32_t cur, uint32_t count, uint32_t size)
634 {
635 	uint32_t out;
636 
637 	if (cur >= count) {
638 		out = cur - count;
639 	} else {
640 		out = cur - count + size;
641 	}
642 
643 	return (out);
644 }
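/*
 * Illustrative sketch, not part of the driver: the two helpers above are
 * branch-based modular arithmetic on ring indices. For a hypothetical
 * 512-entry ring, advancing 3 entries from index 510 wraps around to 1, and
 * stepping back one entry from index 0 wraps around to 511.
 */
static void
igc_desc_wrap_example(void)
{
	ASSERT3U(igc_next_desc(510, 3, 512), ==, 1);
	ASSERT3U(igc_prev_desc(0, 1, 512), ==, 511);
}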
645 
646 
647 static mblk_t *
648 igc_rx_copy(igc_rx_ring_t *ring, uint32_t idx, uint32_t len)
649 {
650 	const igc_rx_buffer_t *buf = ring->irr_work_list[idx];
651 	mblk_t *mp;
652 
653 	IGC_DMA_SYNC(&buf->irb_dma, DDI_DMA_SYNC_FORKERNEL);
654 	mp = allocb(len + IGC_RX_BUF_IP_ALIGN, 0);
655 	if (mp == NULL) {
656 		ring->irr_stat.irs_copy_nomem.value.ui64++;
657 		return (NULL);
658 	}
659 
660 	mp->b_rptr += IGC_RX_BUF_IP_ALIGN;
661 	bcopy(buf->irb_dma.idb_va + IGC_RX_BUF_IP_ALIGN, mp->b_rptr, len);
662 	mp->b_wptr = mp->b_rptr + len;
663 	ring->irr_stat.irs_ncopy.value.ui64++;
664 	return (mp);
665 }
666 
667 static mblk_t *
668 igc_rx_bind(igc_rx_ring_t *ring, uint32_t idx, uint32_t len)
669 {
670 	igc_rx_buffer_t *buf = ring->irr_work_list[idx];
671 	igc_rx_buffer_t *sub;
672 
673 	ASSERT(MUTEX_HELD(&ring->irr_lock));
674 
675 	/*
676 	 * If there are no free buffers, we can't bind. Try to grab this now so
677 	 * we can minimize free list contention.
678 	 */
679 	mutex_enter(&ring->irr_free_lock);
680 	if (ring->irr_nfree == 0) {
681 		ring->irr_stat.irs_bind_nobuf.value.ui64++;
682 		mutex_exit(&ring->irr_free_lock);
683 		return (NULL);
684 	}
685 	ring->irr_nfree--;
686 	sub = ring->irr_free_list[ring->irr_nfree];
687 	mutex_exit(&ring->irr_free_lock);
688 
689 	/*
690 	 * Check if we have an mblk_t here. If not, we'll need to allocate one
691 	 * again. If that fails, we'll fail this and fall back to copy, though
692 	 * the odds of that working are small.
693 	 */
694 	if (buf->irb_mp == NULL) {
695 		caddr_t mblk_va = buf->irb_dma.idb_va + IGC_RX_BUF_IP_ALIGN;
696 		size_t mblk_len = buf->irb_dma.idb_size - IGC_RX_BUF_IP_ALIGN;
697 		buf->irb_mp = desballoc((uchar_t *)mblk_va, mblk_len, 0,
698 		    &buf->irb_free_rtn);
699 		if (buf->irb_mp == NULL) {
700 			ring->irr_stat.irs_bind_nomp.value.ui64++;
701 			mutex_enter(&ring->irr_free_lock);
702 			ring->irr_free_list[ring->irr_nfree] = sub;
703 			ring->irr_nfree++;
704 			mutex_exit(&ring->irr_free_lock);
705 			return (NULL);
706 		}
707 	}
708 	buf->irb_mp->b_wptr = buf->irb_mp->b_rptr + len;
709 	IGC_DMA_SYNC(&buf->irb_dma, DDI_DMA_SYNC_FORKERNEL);
710 
711 	/*
712 	 * Swap an entry on the free list to replace this on the work list.
713 	 */
714 	ring->irr_work_list[idx] = sub;
715 	ring->irr_stat.irs_nbind.value.ui64++;
716 
717 	/*
718 	 * Update the buffer to make sure that we indicate it's been loaned for
719 	 * future recycling.
720 	 */
721 	buf->irb_loaned = true;
722 
723 	return (buf->irb_mp);
724 }
725 
726 /*
727  * Go through the status bits defined in hardware to see if we can set checksum
728  * information.
729  */
730 static void
731 igc_rx_hcksum(igc_rx_ring_t *ring, mblk_t *mp, uint32_t status)
732 {
733 	uint32_t cksum = 0;
734 	const uint32_t l4_valid = IGC_RXD_STAT_TCPCS | IGC_RXD_STAT_UDPCS;
735 	const uint32_t l4_invalid = IGC_RXDEXT_STATERR_L4E;
736 
737 	if ((status & IGC_RXD_STAT_IXSM) != 0) {
738 		ring->irr_stat.irs_ixsm.value.ui64++;
739 		return;
740 	}
741 
742 	if ((status & l4_invalid) != 0) {
743 		ring->irr_stat.irs_l4cksum_err.value.ui64++;
744 	} else if ((status & l4_valid) != 0) {
745 		cksum |= HCK_FULLCKSUM_OK;
746 	}
747 
748 	if ((status & IGC_RXDEXT_STATERR_IPE) != 0) {
749 		ring->irr_stat.irs_l3cksum_err.value.ui64++;
750 	} else if ((status & IGC_RXD_STAT_IPCS) != 0) {
751 		cksum |= HCK_IPV4_HDRCKSUM_OK;
752 	}
753 
754 	if (cksum != 0) {
755 		ring->irr_stat.irs_hcksum_hit.value.ui64++;
756 		mac_hcksum_set(mp, 0, 0, 0, 0, cksum);
757 	} else {
758 		ring->irr_stat.irs_hcksum_miss.value.ui64++;
759 	}
760 }
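/*
 * Illustrative sketch, not part of the driver: consumers further up the stack
 * read the flags set above with mac_hcksum_get(). A hypothetical check for
 * "the hardware already verified this frame" would look like:
 */
static bool
igc_rx_cksum_verified_example(mblk_t *mp)
{
	uint32_t flags;

	mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &flags);
	return ((flags & (HCK_IPV4_HDRCKSUM_OK | HCK_FULLCKSUM_OK)) != 0);
}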
761 
762 mblk_t *
763 igc_ring_rx(igc_rx_ring_t *ring, int poll_bytes)
764 {
765 	union igc_adv_rx_desc *cur_desc;
766 	uint32_t cur_status, cur_head;
767 	uint64_t rx_bytes = 0, rx_frames = 0;
768 	igc_t *igc = ring->irr_igc;
769 	mblk_t *mp_head = NULL, **mp_tail = NULL;
770 
771 	ASSERT(MUTEX_HELD(&ring->irr_lock));
772 	IGC_DMA_SYNC(&ring->irr_desc_dma, DDI_DMA_SYNC_FORKERNEL);
773 
774 	/*
775 	 * Set up the invariants that we will maintain for the loop and then set
776 	 * up our mblk queue.
777 	 */
778 	cur_head = ring->irr_next;
779 	cur_desc = &ring->irr_ring[cur_head];
780 	cur_status = LE_32(cur_desc->wb.upper.status_error);
781 	mp_head = NULL;
782 	mp_tail = &mp_head;
783 
784 	while ((cur_status & IGC_RXD_STAT_DD) != 0) {
785 		uint16_t cur_length = 0;
786 		mblk_t *mp;
787 
788 		/*
789 		 * Check that we have no errors on this packet. This packet
790 		 * should also have EOP set because we only use a single
791 		 * descriptor today. We primarily just check for the RXE error.
792 		 * Most other error types were dropped in the extended format.
793 		 */
794 		if ((cur_status & IGC_RXDEXT_STATERR_RXE) != 0 ||
795 		    (cur_status & IGC_RXD_STAT_EOP) == 0) {
796 			ring->irr_stat.irs_desc_error.value.ui64++;
797 			goto discard;
798 		}
799 
800 
801 		/*
802 		 * We don't bump rx_frames here, because we do that at the end,
803 		 * even if we've discarded frames so we can know to write the
804 		 * tail register.
805 		 */
806 		cur_length = LE_16(cur_desc->wb.upper.length);
807 		rx_bytes += cur_length;
808 
809 		mp = NULL;
810 		if (cur_length > igc->igc_rx_bind_thresh) {
811 			mp = igc_rx_bind(ring, cur_head, cur_length);
812 		}
813 
814 		if (mp == NULL) {
815 			mp = igc_rx_copy(ring, cur_head, cur_length);
816 		}
817 
818 		if (mp != NULL) {
819 			igc_rx_hcksum(ring, mp, cur_status);
820 			*mp_tail = mp;
821 			mp_tail = &mp->b_next;
822 		}
823 
824 discard:
825 		/*
826 		 * Prepare the frame for use again. Note, we can't assume that
827 		 * the memory in the buffer is valid.
828 		 */
829 		igc_rx_ring_desc_write(ring, cur_head);
830 
831 		/*
832 		 * Go through and update the values that our loop is using now.
833 		 */
834 		cur_head = igc_next_desc(cur_head, 1, igc->igc_rx_ndesc);
835 		cur_desc = &ring->irr_ring[cur_head];
836 		cur_status = LE_32(cur_desc->wb.upper.status_error);
837 
838 		/*
839 		 * If we're polling, we need to check against the number of
840 		 * received bytes. If we're in interrupt mode, we have a maximum
841 		 * number of frames we're allowed to check.
842 		 */
843 		rx_frames++;
844 		if (poll_bytes != IGC_RX_POLL_INTR &&
845 		    (cur_length + rx_bytes) > poll_bytes) {
846 			break;
847 		} else if (poll_bytes == IGC_RX_POLL_INTR &&
848 		    rx_frames >= igc->igc_rx_intr_nframes) {
849 			break;
850 		}
851 	}
852 
853 	/*
854 	 * Go ahead and re-arm the ring and update our stats along the way as
855 	 * long as we received at least one frame. Because we modified the
856 	 * descriptor ring as part of resetting frames, we must resync.
857 	 */
858 	if (rx_frames != 0) {
859 		uint32_t tail;
860 
861 		IGC_DMA_SYNC(&ring->irr_desc_dma, DDI_DMA_SYNC_FORDEV);
862 		ring->irr_next = cur_head;
863 		tail = igc_prev_desc(cur_head, 1, igc->igc_rx_ndesc);
864 		igc_write32(igc, IGC_RDT(ring->irr_idx), tail);
865 
866 		ring->irr_stat.irs_rbytes.value.ui64 += rx_bytes;
867 		ring->irr_stat.irs_ipackets.value.ui64 += rx_frames;
868 	}
869 
870 #ifdef	DEBUG
871 	if (rx_frames == 0) {
872 		ASSERT0(rx_bytes);
873 	}
874 #endif
875 
876 	return (mp_head);
877 }
878 
879 /*
880  * This is called from the stop entry point after the hardware has been reset.
881  * After the hardware has been reset, the only other possible consumers of rx
882  * buffers are those that have been loaned up the stack. As such, we need to
883  * wait on each free list until the number of free entries has returned to the
884  * expected number.
885  */
886 void
887 igc_rx_drain(igc_t *igc)
888 {
889 	for (uint32_t i = 0; i < igc->igc_nrx_rings; i++) {
890 		igc_rx_ring_t *ring = &igc->igc_rx_rings[i];
891 
892 		mutex_enter(&ring->irr_free_lock);
893 		while (ring->irr_nfree < igc->igc_rx_nfree) {
894 			cv_wait(&ring->irr_free_cv, &ring->irr_free_lock);
895 		}
896 		mutex_exit(&ring->irr_free_lock);
897 	}
898 }
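/*
 * Illustrative sketch, not part of the driver: the wait above pairs with the
 * cv_signal() in igc_rx_recycle() and re-checks its predicate after every
 * wakeup, since cv_wait() makes no guarantee about why it returned. A
 * hypothetical helper naming that predicate:
 */
static bool
igc_rx_ring_drained_example(igc_t *igc, igc_rx_ring_t *ring)
{
	ASSERT(MUTEX_HELD(&ring->irr_free_lock));
	return (ring->irr_nfree >= igc->igc_rx_nfree);
}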
899 
900 static void
901 igc_tx_bufs_free(igc_t *igc, igc_tx_ring_t *ring)
902 {
903 	for (uint32_t i = 0; i < igc->igc_tx_nbuf; i++) {
904 		igc_tx_buffer_t *buf = &ring->itr_arena[i];
905 
906 		/*
907 		 * While we try to clean up the ring reasonably well, if for
908 		 * some reason we insert descriptors that the device doesn't
909 		 * like, then parts of the ring may not end up cleaned up. In
910 		 * such cases we'll need to free the mblk here ourselves and
911 		 * clean up any binding.
912 		 */
913 		if (buf->itb_bind) {
914 			buf->itb_bind = false;
915 			(void) ddi_dma_unbind_handle(buf->itb_bind_hdl);
916 		}
917 		freemsgchain(buf->itb_mp);
918 		igc_dma_free(&buf->itb_dma);
919 		if (buf->itb_bind_hdl != NULL) {
920 			ddi_dma_free_handle(&buf->itb_bind_hdl);
921 		}
922 	}
923 }
924 
925 static bool
926 igc_tx_bufs_alloc(igc_t *igc, igc_tx_ring_t *ring)
927 {
928 	for (uint32_t i = 0; i < igc->igc_tx_nbuf; i++) {
929 		igc_tx_buffer_t *buf = &ring->itr_arena[i];
930 		ddi_dma_attr_t attr;
931 		int ret;
932 
933 		igc_dma_data_attr(igc, &attr);
934 		if (!igc_dma_alloc(igc, &buf->itb_dma, &attr,
935 		    igc->igc_tx_buf_size)) {
936 			dev_err(igc->igc_dip, CE_WARN, "!failed to allocate TX "
937 			    "ring %u buffer %u", ring->itr_idx, i);
938 			return (false);
939 		}
940 
941 		igc_dma_tx_attr(igc, &attr);
942 		if ((ret = ddi_dma_alloc_handle(igc->igc_dip, &attr,
943 		    DDI_DMA_DONTWAIT, NULL, &buf->itb_bind_hdl)) !=
944 		    DDI_SUCCESS) {
945 			dev_err(igc->igc_dip, CE_WARN, "!failed to allocate TX "
946 			    "ring %u TX DMA handle %u: %d", ring->itr_idx, i,
947 			    ret);
948 			return (false);
949 		}
950 
951 		list_insert_tail(&ring->itr_free_list, buf);
952 	}
953 
954 	return (true);
955 }
956 
957 void
958 igc_tx_data_free(igc_t *igc)
959 {
960 	for (uint32_t i = 0; i < igc->igc_ntx_rings; i++) {
961 		igc_tx_ring_t *ring = &igc->igc_tx_rings[i];
962 
963 		/*
964 		 * Empty the free list before we destroy the list to avoid
965 		 * blowing an assertion.
966 		 */
967 		while (list_remove_head(&ring->itr_free_list) != NULL)
968 			;
969 
970 		if (ring->itr_arena != NULL) {
971 			igc_tx_bufs_free(igc, ring);
972 			kmem_free(ring->itr_arena, sizeof (igc_tx_buffer_t) *
973 			    igc->igc_tx_nbuf);
974 			ring->itr_arena = NULL;
975 		}
976 
977 		list_destroy(&ring->itr_free_list);
978 
979 		if (ring->itr_work_list != NULL) {
980 			kmem_free(ring->itr_work_list, igc->igc_tx_ndesc *
981 			    sizeof (igc_tx_buffer_t *));
982 			ring->itr_work_list = NULL;
983 		}
984 
985 		if (ring->itr_ring != NULL) {
986 			igc_dma_free(&ring->itr_desc_dma);
987 			ring->itr_ring = NULL;
988 			ring->itr_ring_head = 0;
989 			ring->itr_ring_tail = 0;
990 			ring->itr_ring_free = 0;
991 		}
992 	}
993 }
994 
995 bool
996 igc_tx_data_alloc(igc_t *igc)
997 {
998 	for (uint32_t i = 0; i < igc->igc_ntx_rings; i++) {
999 		igc_tx_ring_t *ring = &igc->igc_tx_rings[i];
1000 		ddi_dma_attr_t desc_attr;
1001 		size_t desc_len;
1002 
1003 		igc_dma_desc_attr(igc, &desc_attr);
1004 		desc_len = sizeof (union igc_adv_tx_desc) *
1005 		    igc->igc_tx_ndesc;
1006 		if (!igc_dma_alloc(igc, &ring->itr_desc_dma, &desc_attr,
1007 		    desc_len)) {
1008 			dev_err(igc->igc_dip, CE_WARN, "!failed to allocate "
1009 			    "TX descriptor ring %u", i);
1010 			goto cleanup;
1011 		}
1012 		ring->itr_ring = (void *)ring->itr_desc_dma.idb_va;
1013 
1014 		ring->itr_work_list = kmem_zalloc(sizeof (igc_tx_buffer_t *) *
1015 		    igc->igc_tx_ndesc, KM_NOSLEEP);
1016 		if (ring->itr_work_list == NULL) {
1017 			dev_err(igc->igc_dip, CE_WARN, "!failed to allocate "
1018 			    "TX descriptor ring %u tx work list", i);
1019 			goto cleanup;
1020 		}
1021 
1022 		list_create(&ring->itr_free_list, sizeof (igc_tx_buffer_t),
1023 		    offsetof(igc_tx_buffer_t, itb_node));
1024 
1025 		ring->itr_arena = kmem_zalloc(sizeof (igc_tx_buffer_t) *
1026 		    igc->igc_tx_nbuf, KM_NOSLEEP);
1027 		if (ring->itr_arena == NULL) {
1028 			dev_err(igc->igc_dip, CE_WARN, "!failed to allocate "
1029 			    "TX descriptor ring %u tx buf arena", i);
1030 			goto cleanup;
1031 		}
1032 
1033 		if (!igc_tx_bufs_alloc(igc, ring)) {
1034 			goto cleanup;
1035 		}
1036 	}
1037 
1038 	return (true);
1039 
1040 cleanup:
1041 	igc_tx_data_free(igc);
1042 	return (false);
1043 }
1044 
1045 static void
1046 igc_tx_ring_hw_init(igc_t *igc, igc_tx_ring_t *ring)
1047 {
1048 	uint32_t val, high, low;
1049 	const ddi_dma_cookie_t *desc;
1050 
1051 	/*
1052 	 * Program the ring's address.
1053 	 */
1054 	desc = ddi_dma_cookie_one(ring->itr_desc_dma.idb_hdl);
1055 	high = (uint32_t)(desc->dmac_laddress >> 32);
1056 	low = (uint32_t)desc->dmac_laddress;
1057 	igc_write32(igc, IGC_TDBAH(ring->itr_idx), high);
1058 	igc_write32(igc, IGC_TDBAL(ring->itr_idx), low);
1059 
1060 	/*
1061 	 * Program the ring length.
1062 	 */
1063 	val = igc->igc_tx_ndesc * sizeof (union igc_adv_tx_desc);
1064 	igc_write32(igc, IGC_TDLEN(ring->itr_idx), val);
1065 
1066 	/*
1067 	 * Initialize the head and tail pointers that are in use. We can do this
1068 	 * for TX unlike RX because we don't want the device to transmit
1069 	 * anything.
1070 	 */
1071 	igc_write32(igc, IGC_TDH(ring->itr_idx), 0);
1072 	igc_write32(igc, IGC_TDT(ring->itr_idx), 0);
1073 	ring->itr_ring_head = 0;
1074 	ring->itr_ring_tail = 0;
1075 	ring->itr_ring_free = igc->igc_tx_ndesc;
1076 
1077 	/*
1078 	 * Ensure that a tx queue is disabled prior to taking any action. We do
1079 	 * a subsequent read just in case relaxed ordering is enabled. We are
1080 	 * required to set the various thresholds for when prefetch should
1081 	 * occur, how many valid descriptors it waits before prefetch, and then
1082 	 * what the write back granularity is. Picking these numbers is a bit
1083 	 * weird.
1084 	 *
1085 	 * igb historically didn't modify these values. e1000g varied based on
1086 	 * the hardware type and has done any number of different things here.
1087 	 * The generic datasheet recommendation in the I210 is to set WTHRESH to
1088 	 * 1 and leave everything else at zero. Drivers in other systems vary
1089 	 * their settings.
1090 	 *
1091 	 * Right now we end up basically just following the datasheet and also
1092 	 * rely on the ITR that we set. This can probably be improved upon at
1093 	 * some point.
1094 	 */
1095 	igc_write32(igc, IGC_TXDCTL(0), 0);
1096 	(void) igc_read32(igc, IGC_STATUS);
1097 	val = 0;
1098 	val = IGC_TXDCTL_SET_PTHRESH(val, 0);
1099 	val = IGC_TXDCTL_SET_HTHRESH(val, 0);
1100 	val = IGC_TXDCTL_SET_WTHRESH(val, 1);
1101 	val |= IGC_TXDCTL_QUEUE_ENABLE;
1102 	igc_write32(igc, IGC_TXDCTL(0), val);
1103 }
1104 
1105 void
1106 igc_tx_hw_init(igc_t *igc)
1107 {
1108 	uint32_t val;
1109 
1110 	for (uint32_t i = 0; i < igc->igc_ntx_rings; i++) {
1111 		igc_tx_ring_hw_init(igc, &igc->igc_tx_rings[i]);
1112 	}
1113 
1114 	val = igc_read32(igc, IGC_TCTL);
1115 	val &= ~IGC_TCTL_CT;
1116 	val |= IGC_TCTL_PSP | IGC_TCTL_RTLC | IGC_TCTL_EN |
1117 	    (IGC_COLLISION_THRESHOLD << IGC_CT_SHIFT);
1118 	igc_write32(igc, IGC_TCTL, val);
1119 }
1120 
1121 static void
1122 igc_tx_buf_reset(igc_tx_buffer_t *buf)
1123 {
1124 	buf->itb_mp = NULL;
1125 	buf->itb_len = 0;
1126 	buf->itb_last_desc = 0;
1127 	buf->itb_first = false;
1128 	if (buf->itb_bind) {
1129 		(void) ddi_dma_unbind_handle(buf->itb_bind_hdl);
1130 	}
1131 	buf->itb_bind = false;
1132 }
1133 
1134 /*
1135  * When we are recycling packets, we need to sync the ring and then walk from
1136  * what we last processed up to what is in the tail or the first entry that is
1137  * not done. It is not clear that the I225 hardware has the separate write back
1138  * feature that igb does, so instead we have to look for the packet being noted
1139  * as done in the descriptor.
1140  */
1141 void
1142 igc_tx_recycle(igc_t *igc, igc_tx_ring_t *ring)
1143 {
1144 	uint32_t head, tail, ndesc = 0;
1145 	list_t to_free;
1146 	mblk_t *mp = NULL;
1147 	bool notify = false;
1148 
1149 	/*
1150 	 * Snapshot the current head and tail before we do more processing. The
1151 	 * driver bumps the tail when transmitting and bumps the head only here,
1152 	 * so we know that anything in the region of [head, tail) is safe for us
1153 	 * to touch (if the hardware is done) while anything in the region of
1154 	 * [tail, head) is not.
1155 	 */
1156 	mutex_enter(&ring->itr_lock);
1157 	if (ring->itr_recycle) {
1158 		mutex_exit(&ring->itr_lock);
1159 		return;
1160 	}
1161 	ring->itr_recycle = true;
1162 	head = ring->itr_ring_head;
1163 	tail = ring->itr_ring_tail;
1164 	mutex_exit(&ring->itr_lock);
1165 
1166 	list_create(&to_free, sizeof (igc_tx_buffer_t),
1167 	    offsetof(igc_tx_buffer_t, itb_node));
1168 
1169 	IGC_DMA_SYNC(&ring->itr_desc_dma, DDI_DMA_SYNC_FORKERNEL);
1170 
1171 	/*
1172 	 * We need to walk the transmit descriptors to see what we can free.
1173 	 * Here is where we need to deal with the wrinkle the theory statement
1174 	 * discusses (see 'TX Data Path Design' in igc.c). We look at the head
1175 	 * of the ring and see what item has the tail that we expect to be done
1176 	 * and use that to determine if we are done with the entire packet. If
1177 	 * we're done with the entire packet, then we walk the rest of the
1178 	 * descriptors and will proceed.
1179 	 */
1180 	while (head != tail) {
1181 		uint32_t status, last_desc, next_desc;
1182 		igc_tx_buffer_t *check_buf = ring->itr_work_list[head];
1183 
1184 		ASSERT3P(check_buf, !=, NULL);
1185 		ASSERT3U(check_buf->itb_first, ==, true);
1186 
1187 		last_desc = check_buf->itb_last_desc;
1188 		status = LE_32(ring->itr_ring[last_desc].wb.status);
1189 		if ((status & IGC_TXD_STAT_DD) == 0) {
1190 			break;
1191 		}
1192 
1193 		/*
1194 		 * We need to clean up this packet. This involves walking each
1195 		 * descriptor, resetting it, finding each tx buffer, and mblk,
1196 		 * and cleaning that up. A descriptor may or may not have a tx
1197 		 * buffer associated with it.
1198 		 */
1199 		next_desc = igc_next_desc(last_desc, 1, igc->igc_tx_ndesc);
1200 		for (uint32_t desc = head; desc != next_desc;
1201 		    desc = igc_next_desc(desc, 1, igc->igc_tx_ndesc)) {
1202 			igc_tx_buffer_t *buf;
1203 			bzero(&ring->itr_ring[desc],
1204 			    sizeof (union igc_adv_tx_desc));
1205 			ndesc++;
1206 			buf = ring->itr_work_list[desc];
1207 			if (buf == NULL)
1208 				continue;
1209 			ring->itr_work_list[desc] = NULL;
1210 
1211 			if (buf->itb_mp != NULL) {
1212 				buf->itb_mp->b_next = mp;
1213 				mp = buf->itb_mp;
1214 			}
1215 			igc_tx_buf_reset(buf);
1216 			list_insert_tail(&to_free, buf);
1217 		}
1218 
1219 		head = next_desc;
1220 	}
1221 
1222 	mutex_enter(&ring->itr_lock);
1223 	ring->itr_ring_head = head;
1224 	ring->itr_ring_free += ndesc;
1225 	list_move_tail(&ring->itr_free_list, &to_free);
1226 	if (ring->itr_mac_blocked && ring->itr_ring_free >
1227 	    igc->igc_tx_notify_thresh) {
1228 		ring->itr_mac_blocked = false;
1229 		notify = true;
1230 	}
1231 	ring->itr_recycle = false;
1232 	mutex_exit(&ring->itr_lock);
1233 
1234 	if (notify) {
1235 		mac_tx_ring_update(igc->igc_mac_hdl, ring->itr_rh);
1236 	}
1237 
1238 	freemsgchain(mp);
1239 	list_destroy(&to_free);
1240 }
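/*
 * Illustrative sketch, not part of the driver: given the [head, tail)
 * convention described above, the number of descriptors currently handed to
 * hardware (and so awaiting recycling) is the modular distance between the two
 * indices. Note that a completely full ring and a completely empty one both
 * have head == tail, which is part of why the driver tracks itr_ring_free
 * explicitly and keeps igc_tx_gap descriptors in reserve. The hypothetical
 * helper below just restates that bookkeeping.
 */
static uint32_t
igc_tx_descs_outstanding_example(const igc_tx_ring_t *ring, uint32_t ndesc)
{
	return ((ring->itr_ring_tail + ndesc - ring->itr_ring_head) % ndesc);
}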
1241 
1242 static igc_tx_buffer_t *
1243 igc_tx_buffer_alloc(igc_tx_ring_t *ring)
1244 {
1245 	igc_tx_buffer_t *buf;
1246 	mutex_enter(&ring->itr_lock);
1247 	buf = list_remove_head(&ring->itr_free_list);
1248 	if (buf == NULL) {
1249 		ring->itr_stat.its_no_tx_bufs.value.ui64++;
1250 	}
1251 	mutex_exit(&ring->itr_lock);
1252 
1253 	return (buf);
1254 }
1255 
1256 /*
1257  * Utilize a new tx buffer to perform a DMA binding for this mblk.
1258  */
1259 static bool
1260 igc_tx_ring_bind(igc_tx_ring_t *ring, mblk_t *mp, igc_tx_state_t *tx)
1261 {
1262 	size_t len = MBLKL(mp);
1263 	igc_tx_buffer_t *buf;
1264 	int ret;
1265 	uint_t ncookie;
1266 
1267 	buf = igc_tx_buffer_alloc(ring);
1268 	if (buf == NULL) {
1269 		return (false);
1270 	}
1271 
1272 	ret = ddi_dma_addr_bind_handle(buf->itb_bind_hdl, NULL,
1273 	    (void *)mp->b_rptr, len, DDI_DMA_WRITE | DDI_DMA_STREAMING,
1274 	    DDI_DMA_DONTWAIT, NULL, NULL, &ncookie);
1275 	if (ret != DDI_DMA_MAPPED) {
1276 		/*
1277 		 * Binding failed. Give this buffer back.
1278 		 */
1279 		ring->itr_stat.its_tx_bind_fail.value.ui64++;
1280 		mutex_enter(&ring->itr_lock);
1281 		list_insert_tail(&ring->itr_free_list, buf);
1282 		mutex_exit(&ring->itr_lock);
1283 		return (false);
1284 	}
1285 
1286 	/*
1287 	 * Now that this is successful, we append it to the list and update our
1288 	 * tracking structure. We don't do this earlier so we can keep using
1289 	 * the existing buffer for copying, as that's the fallback path.
1290 	 */
1291 	buf->itb_len = len;
1292 	buf->itb_bind = true;
1293 	tx->itx_ndescs += ncookie;
1294 	tx->itx_buf_rem = 0;
1295 	tx->itx_cur_buf = buf;
1296 	list_insert_tail(&tx->itx_bufs, tx->itx_cur_buf);
1297 	ring->itr_stat.its_tx_bind.value.ui64++;
1298 	return (true);
1299 }
1300 
1301 /*
1302  * Copy the current mblk into a series of one or more tx buffers depending on
1303  * what's available.
1304  */
1305 static bool
1306 igc_tx_ring_copy(igc_tx_ring_t *ring, mblk_t *mp, igc_tx_state_t *tx)
1307 {
1308 	size_t len = MBLKL(mp);
1309 	size_t off = 0;
1310 
1311 	while (len > 0) {
1312 		const void *src;
1313 		void *dest;
1314 		size_t to_copy;
1315 
1316 		/*
1317 		 * If the current buffer is used for binding, then we must get a
1318 		 * new one. If it is used for copying, we can keep going until
1319 		 * it is full.
1320 		 */
1321 		if (tx->itx_cur_buf != NULL && (tx->itx_cur_buf->itb_bind ||
1322 		    tx->itx_buf_rem == 0)) {
1323 			tx->itx_cur_buf = NULL;
1324 			tx->itx_buf_rem = 0;
1325 		}
1326 
1327 		if (tx->itx_cur_buf == NULL) {
1328 			tx->itx_cur_buf = igc_tx_buffer_alloc(ring);
1329 			if (tx->itx_cur_buf == NULL) {
1330 				return (false);
1331 			}
1332 			list_insert_tail(&tx->itx_bufs, tx->itx_cur_buf);
1333 			tx->itx_buf_rem = tx->itx_cur_buf->itb_dma.idb_size;
1334 			/*
1335 			 * Each DMA buffer used for TX only requires a single
1336 			 * cookie. So note that descriptor requirement here and
1337 			 * flag this tx buffer as being used for copying.
1338 			 */
1339 			tx->itx_ndescs++;
1340 			tx->itx_cur_buf->itb_bind = false;
1341 		}
1342 
1343 		to_copy = MIN(len, tx->itx_buf_rem);
1344 		src = mp->b_rptr + off;
1345 		dest = tx->itx_cur_buf->itb_dma.idb_va +
1346 		    tx->itx_cur_buf->itb_len;
1347 		bcopy(src, dest, to_copy);
1348 
1349 		tx->itx_buf_rem -= to_copy;
1350 		tx->itx_cur_buf->itb_len += to_copy;
1351 		len -= to_copy;
1352 		off += to_copy;
1353 	}
1354 
1355 	ring->itr_stat.its_tx_copy.value.ui64++;
1356 	return (true);
1357 }
1358 
1359 /*
1360  * We only need to load a context descriptor if what we're loading has changed.
1361  * This checks if it has and if so, updates the fields that have changed. Note,
1362  * a packet that doesn't require offloads won't end up taking us through this
1363  * path.
1364  */
1365 static bool
1366 igc_tx_ring_context_changed(igc_tx_ring_t *ring, igc_tx_state_t *tx)
1367 {
1368 	bool change = false;
1369 	igc_tx_context_data_t *data = &ring->itr_tx_ctx;
1370 
1371 	if (data->itc_l2hlen != tx->itx_meoi.meoi_l2hlen) {
1372 		change = true;
1373 		data->itc_l2hlen = tx->itx_meoi.meoi_l2hlen;
1374 	}
1375 
1376 	if (data->itc_l3hlen != tx->itx_meoi.meoi_l3hlen) {
1377 		change = true;
1378 		data->itc_l3hlen = tx->itx_meoi.meoi_l3hlen;
1379 	}
1380 
1381 	if (data->itc_l3proto != tx->itx_meoi.meoi_l3proto) {
1382 		change = true;
1383 		data->itc_l3proto = tx->itx_meoi.meoi_l3proto;
1384 	}
1385 
1386 	if (data->itc_l4proto != tx->itx_meoi.meoi_l4proto) {
1387 		change = true;
1388 		data->itc_l4proto = tx->itx_meoi.meoi_l4proto;
1389 	}
1390 
1391 	if (data->itc_l4hlen != tx->itx_meoi.meoi_l4hlen) {
1392 		change = true;
1393 		data->itc_l4hlen = tx->itx_meoi.meoi_l4hlen;
1394 	}
1395 
1396 	if (data->itc_mss != tx->itx_mss) {
1397 		change = true;
1398 		data->itc_mss = tx->itx_mss;
1399 	}
1400 
1401 	if (data->itc_cksum != tx->itx_cksum) {
1402 		change = true;
1403 		data->itc_cksum = tx->itx_cksum;
1404 	}
1405 
1406 	if (data->itc_lso != tx->itx_lso) {
1407 		change = true;
1408 		data->itc_lso = tx->itx_lso;
1409 	}
1410 
1411 	return (change);
1412 }
1413 
1414 /*
1415  * Fill out common descriptor information. First and last descriptor information
1416  * is handled after this.
1417  */
1418 static void
1419 igc_tx_ring_write_buf_descs(igc_t *igc, igc_tx_ring_t *ring,
1420     igc_tx_buffer_t *buf)
1421 {
1422 	ddi_dma_handle_t hdl = buf->itb_bind ? buf->itb_bind_hdl :
1423 	    buf->itb_dma.idb_hdl;
1424 	uint_t nc = ddi_dma_ncookies(hdl);
1425 	size_t rem_len = buf->itb_len;
1426 
1427 	ASSERT(MUTEX_HELD(&ring->itr_lock));
1428 	ASSERT3U(rem_len, !=, 0);
1429 
1430 	for (uint_t i = 0; i < nc; i++, ring->itr_ring_tail =
1431 	    igc_next_desc(ring->itr_ring_tail, 1, igc->igc_tx_ndesc)) {
1432 		const ddi_dma_cookie_t *c = ddi_dma_cookie_get(hdl, i);
1433 		union igc_adv_tx_desc *desc;
1434 		uint32_t type = IGC_ADVTXD_DTYP_DATA | IGC_ADVTXD_DCMD_DEXT |
1435 		    IGC_ADVTXD_DCMD_IFCS;
1436 		uint32_t desc_len = MIN(rem_len, c->dmac_size);
1437 
1438 		/* Quick sanity check on max data descriptor */
1439 		ASSERT3U(desc_len, <, 0x10000);
1440 		ASSERT3U(desc_len, >, 0x0);
1441 		type |= desc_len;
1442 		rem_len -= desc_len;
1443 		desc = &ring->itr_ring[ring->itr_ring_tail];
1444 		desc->read.buffer_addr = LE_64(c->dmac_laddress);
1445 		desc->read.cmd_type_len = LE_32(type);
1446 		desc->read.olinfo_status = LE_32(0);
1447 
1448 		/*
1449 		 * Save the transmit buffer in the first descriptor entry that
1450 		 * we use for this.
1451 		 */
1452 		if (i == 0) {
1453 			ring->itr_work_list[ring->itr_ring_tail] = buf;
1454 		}
1455 	}
1456 }
1457 
1458 /*
1459  * We have created our chain of tx buffers that have been copied and bound. Now
1460  * insert them into place and insert a context descriptor if one is required.
1461  * We only emit a new context descriptor when the offload information differs
1462  * from what the ring last loaded (see igc_tx_ring_context_changed()).
1463  */
1464 static bool
1465 igc_tx_ring_write_descs(igc_t *igc, igc_tx_ring_t *ring, mblk_t *mp,
1466     igc_tx_state_t *tx)
1467 {
1468 	bool do_ctx = false;
1469 	igc_tx_buffer_t *buf;
1470 	uint32_t ctx_desc, first_desc, last_desc, flags, status;
1471 
1472 	/*
1473 	 * If either checksumming or LSO is set, we may need a context
1474 	 * descriptor. We assume we will and then if not will adjust that.
1475 	 */
1476 	if (tx->itx_cksum != 0 || tx->itx_lso != 0) {
1477 		do_ctx = true;
1478 		tx->itx_ndescs++;
1479 	}
1480 
1481 	mutex_enter(&ring->itr_lock);
1482 	if (tx->itx_ndescs + igc->igc_tx_gap > ring->itr_ring_free) {
1483 		/*
1484 		 * Attempt to recycle descriptors before we give up.
1485 		 */
1486 		mutex_exit(&ring->itr_lock);
1487 		igc_tx_recycle(igc, ring);
1488 		mutex_enter(&ring->itr_lock);
1489 		if (tx->itx_ndescs + igc->igc_tx_gap > ring->itr_ring_free) {
1490 			mutex_exit(&ring->itr_lock);
1491 			return (false);
1492 		}
1493 	}
1494 
1495 	/*
1496 	 * Now see if the context descriptor has changed, if one is required at
1497 	 * all. If not, then we can reduce the number of descriptors required.
1498 	 * We want to do this only after we've checked for free descriptors
1499 	 * because the check updates the context data tracked on the ring.
1500 	 */
1501 	if (do_ctx && !igc_tx_ring_context_changed(ring, tx)) {
1502 		do_ctx = false;
1503 		tx->itx_ndescs--;
1504 	}
1505 
1506 	ring->itr_ring_free -= tx->itx_ndescs;
1507 	ctx_desc = ring->itr_ring_tail;
1508 	if (do_ctx) {
1509 		struct igc_adv_tx_context_desc *ctx;
1510 		uint32_t len = tx->itx_meoi.meoi_l3hlen |
1511 		    (tx->itx_meoi.meoi_l2hlen << IGC_ADVTXD_MACLEN_SHIFT);
1512 		uint32_t tucmd = IGC_ADVTXD_DCMD_DEXT | IGC_ADVTXD_DTYP_CTXT;
1513 		uint32_t l4idx = 0;
1514 
1515 		if ((tx->itx_lso & HW_LSO) != 0 ||
1516 		    (tx->itx_cksum & HCK_IPV4_HDRCKSUM) != 0) {
1517 			if (tx->itx_meoi.meoi_l3proto == ETHERTYPE_IP) {
1518 				tucmd |= IGC_ADVTXD_TUCMD_IPV4;
1519 			} else {
1520 				ASSERT3U(tx->itx_meoi.meoi_l3proto, ==,
1521 				    ETHERTYPE_IPV6);
1522 				tucmd |= IGC_ADVTXD_TUCMD_IPV6;
1523 			}
1524 		}
1525 
1526 		if ((tx->itx_lso & HW_LSO) != 0 ||
1527 		    (tx->itx_cksum & HCK_PARTIALCKSUM) != 0) {
1528 			if (tx->itx_meoi.meoi_l4proto == IPPROTO_TCP) {
1529 				tucmd |= IGC_ADVTXD_TUCMD_L4T_TCP;
1530 			} else if (tx->itx_meoi.meoi_l4proto == IPPROTO_UDP) {
1531 				tucmd |= IGC_ADVTXD_TUCMD_L4T_UDP;
1532 			}
1533 		}
1534 
1535 		/*
1536 		 * The L4LEN and MSS fields are only required if we're
1537 		 * performing TSO. The index is always zero regardless because
1538 		 * the I225 only has one context per queue.
1539 		 */
1540 		if ((tx->itx_lso & HW_LSO) != 0) {
1541 			l4idx |= tx->itx_meoi.meoi_l4hlen <<
1542 			    IGC_ADVTXD_L4LEN_SHIFT;
1543 			l4idx |= tx->itx_mss << IGC_ADVTXD_MSS_SHIFT;
1544 		}
1545 
1546 		ctx = (void *)&ring->itr_ring[ctx_desc];
1547 		ctx->vlan_macip_lens = LE_32(len);
1548 		ctx->launch_time = 0;
1549 		ctx->type_tucmd_mlhl = LE_32(tucmd);
1550 		ctx->mss_l4len_idx = LE_32(l4idx);
1551 		ring->itr_ring_tail = igc_next_desc(ring->itr_ring_tail, 1,
1552 		    igc->igc_tx_ndesc);
1553 		DTRACE_PROBE4(igc__context__desc, igc_t *, igc, igc_tx_ring_t *,
1554 		    ring, igc_tx_state_t *, tx,
1555 		    struct igc_adv_tx_context_desc *, ctx);
1556 	}
1557 
1558 	first_desc = ring->itr_ring_tail;
1559 
1560 	while ((buf = list_remove_head(&tx->itx_bufs)) != NULL) {
1561 		igc_tx_ring_write_buf_descs(igc, ring, buf);
1562 	}
1563 
1564 	/*
1565 	 * The last descriptor must have end of packet set and is the entry that
1566 	 * we ask for status on. That is, we don't actually ask for the status
1567 	 * of each transmit buffer, only the final one so we can more easily
1568 	 * collect everything including the context descriptor if present.
1569 	 */
1570 	last_desc = igc_prev_desc(ring->itr_ring_tail, 1, igc->igc_tx_ndesc);
1571 	flags = IGC_ADVTXD_DCMD_EOP | IGC_ADVTXD_DCMD_RS;
1572 	ring->itr_ring[last_desc].read.cmd_type_len |= LE_32(flags);
1573 
1574 	/*
1575 	 * We must now go back and set settings on the first data descriptor to
1576 	 * indicate what checksumming and offload features we require. Note, we
1577 	 * keep the IDX field as zero because there is only one context field
1578 	 * per queue in the I225.
1579 	 *
1580 	 * We also save the mblk_t on the first tx buffer in the set which
1581 	 * should always be saved with the first descriptor we use, which may
1582 	 * include the context descriptor. Because this descriptor tracks when
1583 	 * the entire packet is sent and we won't collect it until we're done
1584 	 * with the entire packet, it's okay to leave this on the start.
1585 	 */
1586 	flags = 0;
1587 	status = 0;
1588 	if ((tx->itx_cksum & HCK_IPV4_HDRCKSUM) != 0) {
1589 		status |= IGC_TXD_POPTS_IXSM << 8;
1590 	}
1591 
1592 	if ((tx->itx_cksum & HCK_PARTIALCKSUM) != 0) {
1593 		status |= IGC_TXD_POPTS_TXSM << 8;
1594 	}
1595 
1596 	if ((tx->itx_lso & HW_LSO) != 0) {
1597 		size_t payload = tx->itx_meoi.meoi_len -
1598 		    tx->itx_meoi.meoi_l2hlen - tx->itx_meoi.meoi_l3hlen -
1599 		    tx->itx_meoi.meoi_l4hlen;
1600 		flags |= IGC_ADVTXD_DCMD_TSE;
1601 		status |= payload << IGC_ADVTXD_PAYLEN_SHIFT;
1602 	} else {
1603 		status |= tx->itx_meoi.meoi_len << IGC_ADVTXD_PAYLEN_SHIFT;
1604 	}
1605 
1606 	ring->itr_ring[first_desc].read.cmd_type_len |= LE_32(flags);
1607 	ring->itr_ring[first_desc].read.olinfo_status |= LE_32(status);
1608 	ring->itr_work_list[first_desc]->itb_mp = mp;
1609 	ring->itr_work_list[first_desc]->itb_first = true;
1610 	ring->itr_work_list[first_desc]->itb_last_desc = last_desc;
1611 
1612 	/*
1613 	 * If we have a context descriptor, we must adjust the first work list
1614 	 * item to point to the context descriptor. See 'TX Data Path Design' in
1615 	 * the theory statement for more information.
1616 	 */
1617 	if (do_ctx) {
1618 		ring->itr_work_list[ctx_desc] = ring->itr_work_list[first_desc];
1619 		ring->itr_work_list[first_desc] = NULL;
1620 	}
1621 
1622 	ring->itr_stat.its_obytes.value.ui64 += tx->itx_meoi.meoi_len;
1623 	ring->itr_stat.its_opackets.value.ui64++;
1624 
1625 	IGC_DMA_SYNC(&ring->itr_desc_dma, DDI_DMA_SYNC_FORDEV);
1626 	igc_write32(igc, IGC_TDT(ring->itr_idx), ring->itr_ring_tail);
1627 	mutex_exit(&ring->itr_lock);
1628 	return (true);
1629 }
1630 
1631 mblk_t *
1632 igc_ring_tx(void *arg, mblk_t *mp)
1633 {
1634 	igc_tx_ring_t *ring = arg;
1635 	igc_t *igc = ring->itr_igc;
1636 	igc_tx_state_t tx = { 0 };
1637 
1638 	ASSERT3P(mp->b_next, ==, NULL);
1639 
1640 	if (mac_ether_offload_info(mp, &tx.itx_meoi) != 0) {
1641 		freemsg(mp);
1642 		ring->itr_stat.its_bad_meo.value.ui64++;
1643 		return (NULL);
1644 	}
1645 
1646 	mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &tx.itx_cksum);
1647 	mac_lso_get(mp, &tx.itx_mss, &tx.itx_lso);
1648 
1649 	/*
1650 	 * Note, we don't really care that the following check of the number of
1651 	 * free descriptors may race with other threads since we don't hold
1652 	 * the ring lock here.
1653 	 */
1654 	if (ring->itr_ring_free < igc->igc_tx_recycle_thresh) {
1655 		igc_tx_recycle(igc, ring);
1656 	}
1657 
1658 	mutex_enter(&ring->itr_lock);
1659 	if (ring->itr_ring_free < igc->igc_tx_notify_thresh) {
1660 		ring->itr_stat.its_ring_full.value.ui64++;
1661 		ring->itr_mac_blocked = true;
1662 		mutex_exit(&ring->itr_lock);
1663 		return (mp);
1664 	}
1665 	mutex_exit(&ring->itr_lock);
1666 
1667 	/*
1668 	 * If we end up some day supporting lso and it was requested, then we
1669 	 * need to check that the header and the payload are all in one
1670 	 * contiguous block. If they're not then we'll need to force a copy into
1671 	 * the descriptor for the headers.
1672 	 */
1673 
1674 	/*
1675 	 * This list tracks the various tx buffers that we've allocated and will
1676 	 * use.
1677 	 */
1678 	list_create(&tx.itx_bufs, sizeof (igc_tx_buffer_t),
1679 	    offsetof(igc_tx_buffer_t, itb_node));
1680 
1681 	for (mblk_t *cur_mp = mp; cur_mp != NULL; cur_mp = cur_mp->b_cont) {
1682 		size_t len = MBLKL(cur_mp);
1683 
1684 		if (len == 0) {
1685 			continue;
1686 		}
1687 
1688 		if (len > igc->igc_tx_bind_thresh &&
1689 		    igc_tx_ring_bind(ring, cur_mp, &tx)) {
1690 			continue;
1691 		}
1692 
1693 		if (!igc_tx_ring_copy(ring, cur_mp, &tx))
1694 			goto tx_failure;
1695 	}
1696 
1697 	if (!igc_tx_ring_write_descs(igc, ring, mp, &tx)) {
1698 		goto tx_failure;
1699 	}
1700 
1701 	list_destroy(&tx.itx_bufs);
1702 	return (NULL);
1703 
1704 tx_failure:
1705 	/*
1706 	 * We are out of descriptors. Clean up and give the mblk back to MAC.
1707 	 */
1708 	for (igc_tx_buffer_t *buf = list_head(&tx.itx_bufs); buf != NULL;
1709 	    buf = list_next(&tx.itx_bufs, buf)) {
1710 		igc_tx_buf_reset(buf);
1711 	}
1712 
1713 	mutex_enter(&ring->itr_lock);
1714 	list_move_tail(&ring->itr_free_list, &tx.itx_bufs);
1715 	ring->itr_mac_blocked = true;
1716 	mutex_exit(&ring->itr_lock);
1717 	list_destroy(&tx.itx_bufs);
1718 
1719 	return (mp);
1720 }
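/*
 * Illustrative sketch, not part of the driver: returning a non-NULL mblk_t
 * from this entry point tells MAC that the frame was not consumed and should
 * be retried after mac_tx_ring_update() is called (see igc_tx_recycle()). A
 * hypothetical caller working through a chain one frame at a time would look
 * like:
 */
static mblk_t *
igc_ring_tx_chain_example(igc_tx_ring_t *ring, mblk_t *chain)
{
	while (chain != NULL) {
		mblk_t *next = chain->b_next;

		chain->b_next = NULL;
		if (igc_ring_tx(ring, chain) != NULL) {
			/* The ring is full; give back what remains. */
			chain->b_next = next;
			return (chain);
		}
		chain = next;
	}

	return (NULL);
}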
1721