xref: /illumos-gate/usr/src/uts/common/io/aggr/aggr_send.c (revision bfed486ad8de8b8ebc6345a8e10accae08bf2f45)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * IEEE 802.3ad Link Aggregation - Send code.
28  *
29  * Implements the Distributor function.
30  */
31 
32 #include <sys/conf.h>
33 #include <sys/modctl.h>
34 #include <sys/sunddi.h>
35 #include <sys/vlan.h>
36 #include <sys/strsun.h>
37 #include <sys/strsubr.h>
38 
39 #include <inet/common.h>
40 #include <inet/led.h>
41 #include <inet/ip.h>
42 #include <inet/ip6.h>
43 #include <inet/tcp.h>
44 #include <netinet/udp.h>
45 #include <inet/ipsec_impl.h>
46 #include <inet/sadb.h>
47 #include <inet/ipsecesp.h>
48 #include <inet/ipsecah.h>
49 
50 #include <sys/aggr.h>
51 #include <sys/aggr_impl.h>
52 
/* XOR of the first four bytes of the byte array x. */
#define	HASH_4BYTES(x) (((x)[0] ^ (x)[1]) ^ ((x)[2] ^ (x)[3]))
/* XOR of the six bytes of the byte array x (an Ethernet MAC address). */
#define	HASH_MAC(x) (HASH_4BYTES(x) ^ (x)[4] ^ (x)[5])
55 
56 static uint16_t aggr_send_ip6_hdr_len(mblk_t *, ip6_t *);
57 
58 static uint64_t
59 aggr_send_hash(aggr_grp_t *grp, mblk_t *mp)
60 {
61 	struct ether_header *ehp;
62 	uint16_t sap;
63 	uint_t skip_len;
64 	uint8_t proto;
65 	uint32_t policy = grp->lg_tx_policy;
66 	uint64_t hash = 0;
67 
68 	ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
69 	ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
70 	ASSERT(RW_READ_HELD(&grp->lg_tx_lock));
71 
72 	/* compute MAC hash */
73 
74 	ehp = (struct ether_header *)mp->b_rptr;
75 
76 	if (policy & AGGR_POLICY_L2) {
77 		uchar_t *mac_src = ehp->ether_shost.ether_addr_octet;
78 		uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet;
79 		hash = HASH_MAC(mac_src) ^ HASH_MAC(mac_dst);
80 		policy &= ~AGGR_POLICY_L2;
81 	}
82 
83 	if (policy == 0)
84 		goto done;
85 
86 	/* skip ethernet header */
87 
88 	if (ntohs(ehp->ether_type) == ETHERTYPE_VLAN) {
89 		struct ether_vlan_header *evhp;
90 		mblk_t *newmp = NULL;
91 
92 		skip_len = sizeof (struct ether_vlan_header);
93 		if (MBLKL(mp) < skip_len) {
94 			/* the vlan tag is the payload, pull up first */
95 			newmp = msgpullup(mp, -1);
96 			if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
97 				goto done;
98 			}
99 			evhp = (struct ether_vlan_header *)newmp->b_rptr;
100 		} else {
101 			evhp = (struct ether_vlan_header *)mp->b_rptr;
102 		}
103 
104 		sap = ntohs(evhp->ether_type);
105 		freemsg(newmp);
106 	} else {
107 		sap = ntohs(ehp->ether_type);
108 		skip_len = sizeof (struct ether_header);
109 	}
110 
111 	/* if ethernet header is in its own mblk, skip it */
112 	if (MBLKL(mp) <= skip_len) {
113 		skip_len -= MBLKL(mp);
114 		mp = mp->b_cont;
115 	}
116 
117 	sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
118 
119 	/* compute IP src/dst addresses hash and skip IPv{4,6} header */
120 
121 	switch (sap) {
122 	case ETHERTYPE_IP: {
123 		ipha_t *iphp;
124 
125 		if (MBLKL(mp) < (skip_len + sizeof (ipha_t)))
126 			goto done;
127 
128 		iphp = (ipha_t *)(mp->b_rptr + skip_len);
129 		proto = iphp->ipha_protocol;
130 		skip_len += IPH_HDR_LENGTH(iphp);
131 
132 		if (policy & AGGR_POLICY_L3) {
133 			uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src);
134 			uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst);
135 
136 			hash ^= (HASH_4BYTES(ip_src) ^ HASH_4BYTES(ip_dst));
137 			policy &= ~AGGR_POLICY_L3;
138 		}
139 		break;
140 	}
141 	case ETHERTYPE_IPV6: {
142 		ip6_t *ip6hp;
143 
144 		/*
145 		 * if ipv6 packet has options, the proto will not be one of the
146 		 * ones handled by the ULP processor below, and will return 0
147 		 * as the index
148 		 */
149 		if (MBLKL(mp) < (skip_len + sizeof (ip6_t)))
150 			goto done;
151 
152 		ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
153 		proto = ip6hp->ip6_nxt;
154 		skip_len += aggr_send_ip6_hdr_len(mp, ip6hp);
155 
156 		if (policy & AGGR_POLICY_L3) {
157 			uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]);
158 			uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]);
159 
160 			hash ^= (HASH_4BYTES(ip_src) ^ HASH_4BYTES(ip_dst));
161 			policy &= ~AGGR_POLICY_L3;
162 		}
163 		break;
164 	}
165 	default:
166 		goto done;
167 	}
168 
169 	if (!(policy & AGGR_POLICY_L4))
170 		goto done;
171 
172 	/* if ip header is in its own mblk, skip it */
173 	if (MBLKL(mp) <= skip_len) {
174 		skip_len -= MBLKL(mp);
175 		mp = mp->b_cont;
176 	}
177 
178 	/* parse ULP header */
179 again:
180 	switch (proto) {
181 	case IPPROTO_TCP:
182 	case IPPROTO_UDP:
183 	case IPPROTO_ESP:
184 	case IPPROTO_SCTP:
185 		/*
186 		 * These Internet Protocols are intentionally designed
187 		 * for hashing from the git-go.  Port numbers are in the first
188 		 * word for transports, SPI is first for ESP.
189 		 */
190 		hash ^= HASH_4BYTES((mp->b_rptr + skip_len));
191 		break;
192 
193 	case IPPROTO_AH: {
194 		ah_t *ah = (ah_t *)(mp->b_rptr + skip_len);
195 
196 		uint_t ah_length = AH_TOTAL_LEN(ah);
197 		proto = ah->ah_nexthdr;
198 		skip_len += ah_length;
199 
200 		/* if ip header is in its own mblk, skip it */
201 		if (MBLKL(mp) <= skip_len) {
202 			skip_len -= MBLKL(mp);
203 			mp = mp->b_cont;
204 		}
205 
206 		goto again;
207 	}
208 	}
209 
210 done:
211 	return (hash);
212 }
213 
214 /*
215  * Update the TX load balancing policy of the specified group.
216  */
217 void
218 aggr_send_update_policy(aggr_grp_t *grp, uint32_t policy)
219 {
220 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
221 
222 	grp->lg_tx_policy = policy;
223 }
224 
225 /*
226  * Send function invoked by the MAC service module.
227  */
228 mblk_t *
229 aggr_m_tx(void *arg, mblk_t *mp)
230 {
231 	aggr_grp_t *grp = arg;
232 	aggr_port_t *port;
233 	mblk_t *nextp;
234 	mac_tx_cookie_t	cookie;
235 	uint64_t hash;
236 	void	*mytx_handle;
237 
238 	for (;;) {
239 		rw_enter(&grp->lg_tx_lock, RW_READER);
240 		if (grp->lg_ntx_ports == 0) {
241 			/*
242 			 * We could have returned from aggr_m_start() before
243 			 * the ports were actually attached. Drop the chain.
244 			 */
245 			rw_exit(&grp->lg_tx_lock);
246 			freemsgchain(mp);
247 			return (NULL);
248 		}
249 
250 		nextp = mp->b_next;
251 		mp->b_next = NULL;
252 
253 		hash = aggr_send_hash(grp, mp);
254 		port = grp->lg_tx_ports[hash % grp->lg_ntx_ports];
255 
256 		/*
257 		 * Bump the active Tx ref count so that the port won't
258 		 * be deleted. The reference count will be dropped in mac_tx().
259 		 */
260 		mytx_handle = mac_tx_hold(port->lp_mch);
261 		rw_exit(&grp->lg_tx_lock);
262 
263 		if (mytx_handle == NULL) {
264 			/*
265 			 * The port is quiesced.
266 			 */
267 			freemsg(mp);
268 		} else {
269 			mblk_t	*ret_mp;
270 
271 			/*
272 			 * It is fine that the port state changes now.
273 			 * Set MAC_TX_NO_HOLD to inform mac_tx() not to bump
274 			 * the active Tx ref again. Use hash as the hint so
275 			 * to direct traffic to different TX rings. Note below
276 			 * bit operation is needed to get the most benefit
277 			 * from the mac_tx() hash algorithm.
278 			 */
279 			hash = (hash << 24 | hash << 16 | hash);
280 			hash = (hash << 32 | hash);
281 			cookie = mac_tx(port->lp_mch, mp, (uintptr_t)hash,
282 			    MAC_TX_NO_ENQUEUE | MAC_TX_NO_HOLD, &ret_mp);
283 
284 			mac_tx_rele(port->lp_mch, mytx_handle);
285 
286 			if (cookie != NULL) {
287 				ret_mp->b_next = nextp;
288 				mp = ret_mp;
289 				break;
290 			}
291 		}
292 
293 		if ((mp = nextp) == NULL)
294 			break;
295 	}
296 	return (mp);
297 }
298 
299 /*
300  * Enable sending on the specified port.
301  */
302 void
303 aggr_send_port_enable(aggr_port_t *port)
304 {
305 	aggr_grp_t *grp = port->lp_grp;
306 
307 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
308 
309 	if (port->lp_tx_enabled || (port->lp_state !=
310 	    AGGR_PORT_STATE_ATTACHED)) {
311 		/* already enabled or port not yet attached */
312 		return;
313 	}
314 
315 	/*
316 	 * Add to group's array of tx ports.
317 	 */
318 	rw_enter(&grp->lg_tx_lock, RW_WRITER);
319 	if (grp->lg_tx_ports_size < grp->lg_ntx_ports+1) {
320 		/* current array too small */
321 		aggr_port_t **new_ports;
322 		uint_t new_size;
323 
324 		new_size = grp->lg_ntx_ports+1;
325 		new_ports = kmem_zalloc(new_size * sizeof (aggr_port_t *),
326 		    KM_SLEEP);
327 
328 		if (grp->lg_tx_ports_size > 0) {
329 			ASSERT(grp->lg_tx_ports != NULL);
330 			bcopy(grp->lg_tx_ports, new_ports,
331 			    grp->lg_ntx_ports * sizeof (aggr_port_t *));
332 			kmem_free(grp->lg_tx_ports,
333 			    grp->lg_tx_ports_size * sizeof (aggr_port_t *));
334 		}
335 
336 		grp->lg_tx_ports = new_ports;
337 		grp->lg_tx_ports_size = new_size;
338 	}
339 
340 	grp->lg_tx_ports[grp->lg_ntx_ports++] = port;
341 	port->lp_tx_idx = grp->lg_ntx_ports-1;
342 	rw_exit(&grp->lg_tx_lock);
343 
344 	port->lp_tx_enabled = B_TRUE;
345 }
346 
347 /*
348  * Disable sending from the specified port.
349  */
350 void
351 aggr_send_port_disable(aggr_port_t *port)
352 {
353 	uint_t idx, ntx;
354 	aggr_grp_t *grp = port->lp_grp;
355 
356 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
357 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
358 
359 	if (!port->lp_tx_enabled) {
360 		/* not yet enabled */
361 		return;
362 	}
363 
364 	rw_enter(&grp->lg_tx_lock, RW_WRITER);
365 	idx = port->lp_tx_idx;
366 	ntx = grp->lg_ntx_ports;
367 	ASSERT(idx < ntx);
368 
369 	/* remove from array of attached ports */
370 	if (idx == (ntx - 1)) {
371 		grp->lg_tx_ports[idx] = NULL;
372 	} else {
373 		/* not the last entry, replace with last one */
374 		aggr_port_t *victim;
375 
376 		victim = grp->lg_tx_ports[ntx - 1];
377 		grp->lg_tx_ports[ntx - 1] = NULL;
378 		victim->lp_tx_idx = idx;
379 		grp->lg_tx_ports[idx] = victim;
380 	}
381 
382 	port->lp_tx_idx = 0;
383 	grp->lg_ntx_ports--;
384 	rw_exit(&grp->lg_tx_lock);
385 
386 	port->lp_tx_enabled = B_FALSE;
387 }
388 
389 static uint16_t
390 aggr_send_ip6_hdr_len(mblk_t *mp, ip6_t *ip6h)
391 {
392 	uint16_t length;
393 	uint_t	ehdrlen;
394 	uint8_t	*nexthdrp;
395 	uint8_t *whereptr;
396 	uint8_t *endptr;
397 	ip6_dest_t *desthdr;
398 	ip6_rthdr_t *rthdr;
399 	ip6_frag_t *fraghdr;
400 
401 	length = IPV6_HDR_LEN;
402 	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
403 	endptr = mp->b_wptr;
404 
405 	nexthdrp = &ip6h->ip6_nxt;
406 	while (whereptr < endptr) {
407 		switch (*nexthdrp) {
408 		case IPPROTO_HOPOPTS:
409 		case IPPROTO_DSTOPTS:
410 			/* Assumes the headers are identical for hbh and dst */
411 			desthdr = (ip6_dest_t *)whereptr;
412 			ehdrlen = 8 * (desthdr->ip6d_len + 1);
413 			nexthdrp = &desthdr->ip6d_nxt;
414 			break;
415 		case IPPROTO_ROUTING:
416 			rthdr = (ip6_rthdr_t *)whereptr;
417 			ehdrlen =  8 * (rthdr->ip6r_len + 1);
418 			nexthdrp = &rthdr->ip6r_nxt;
419 			break;
420 		case IPPROTO_FRAGMENT:
421 			fraghdr = (ip6_frag_t *)whereptr;
422 			ehdrlen = sizeof (ip6_frag_t);
423 			nexthdrp = &fraghdr->ip6f_nxt;
424 			break;
425 		case IPPROTO_NONE:
426 			/* No next header means we're finished */
427 		default:
428 			return (length);
429 		}
430 		length += ehdrlen;
431 		whereptr += ehdrlen;
432 	}
433 
434 	return (length);
435 }
436