xref: /illumos-gate/usr/src/uts/common/io/overlay/overlay_mux.c (revision a4955f4fa65e38d70c07d38e657a9aff43fa155f)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019 Joyent, Inc.
14  */
15 
16 /*
17  * Overlay device ksocket multiplexer.
18  *
19  * For more information, see the big theory statement in
20  * uts/common/io/overlay/overlay.c
21  */
22 
23 #include <sys/types.h>
24 #include <sys/socket.h>
25 #include <sys/ksynch.h>
26 #include <sys/ksocket.h>
27 #include <sys/avl.h>
28 #include <sys/list.h>
29 #include <sys/pattr.h>
30 #include <sys/sysmacros.h>
31 #include <sys/strsubr.h>
32 #include <sys/strsun.h>
33 #include <sys/tihdr.h>
34 
35 #include <sys/overlay_impl.h>
36 
37 #include <sys/sdt.h>
38 
39 static list_t overlay_mux_list;
40 static kmutex_t overlay_mux_lock;
41 
42 void
43 overlay_mux_init(void)
44 {
45 	list_create(&overlay_mux_list, sizeof (overlay_mux_t),
46 	    offsetof(overlay_mux_t, omux_lnode));
47 	mutex_init(&overlay_mux_lock, NULL, MUTEX_DRIVER, NULL);
48 }
49 
50 void
51 overlay_mux_fini(void)
52 {
53 	mutex_destroy(&overlay_mux_lock);
54 	list_destroy(&overlay_mux_list);
55 }
56 
57 static int
58 overlay_mux_comparator(const void *a, const void *b)
59 {
60 	const overlay_dev_t *odl, *odr;
61 	odl = a;
62 	odr = b;
63 	if (odl->odd_vid > odr->odd_vid)
64 		return (1);
65 	else if (odl->odd_vid < odr->odd_vid)
66 		return (-1);
67 	else
68 		return (0);
69 }
70 
71 /*
72  * This is the central receive data path. We need to decode the packet, if we
73  * can, and then deliver it to the appropriate overlay.
74  */
75 /* ARGSUSED */
76 static boolean_t
77 overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
78     void *arg)
79 {
80 	mblk_t *mp, *nmp, *fmp;
81 	overlay_mux_t *mux = arg;
82 
83 	/*
84 	 * We may have a received a chain of messages. Each message in the
85 	 * chain will likely have a T_unitdata_ind attached to it as an M_PROTO.
86 	 * If we aren't getting that, we should probably drop that for the
87 	 * moment.
88 	 */
89 	for (mp = mpchain; mp != NULL; mp = nmp) {
90 		struct T_unitdata_ind *tudi;
91 		ovep_encap_info_t infop;
92 		overlay_dev_t od, *odd;
93 		int ret;
94 
95 		nmp = mp->b_next;
96 		mp->b_next = NULL;
97 
98 		if (DB_TYPE(mp) != M_PROTO) {
99 			OVERLAY_FREEMSG(mp, "first one isn't M_PROTO");
100 			freemsg(mp);
101 			continue;
102 		}
103 
104 		if (mp->b_cont == NULL) {
105 			OVERLAY_FREEMSG(mp, "missing a b_cont");
106 			freemsg(mp);
107 			continue;
108 		}
109 
110 		tudi = (struct T_unitdata_ind *)mp->b_rptr;
111 		if (tudi->PRIM_type != T_UNITDATA_IND) {
112 			OVERLAY_FREEMSG(mp, "Not a T_unitdata_ind *");
113 			freemsg(mp);
114 			continue;
115 		}
116 
117 		/*
118 		 * In the future, we'll care about the source information
119 		 * for purposes of telling varpd for oob invalidation. But for
120 		 * now, just drop that block.
121 		 */
122 		fmp = mp;
123 		mp = fmp->b_cont;
124 		freeb(fmp);
125 
126 		/*
127 		 * Until we have VXLAN-or-other-decap HW acceleration support
128 		 * (e.g.  we support NICs that reach into VXLAN-encapsulated
129 		 * packets and check the inside-VXLAN IP packets' checksums,
130 		 * or do LSO with VXLAN), we should clear any HW-accelerated-
131 		 * performed bits.
132 		 */
133 		DB_CKSUMFLAGS(mp) = 0;
134 
135 		/*
136 		 * Decap and deliver.
137 		 */
138 		bzero(&infop, sizeof (ovep_encap_info_t));
139 		ret = mux->omux_plugin->ovp_ops->ovpo_decap(NULL, mp, &infop);
140 		if (ret != 0) {
141 			OVERLAY_FREEMSG(mp, "decap failed");
142 			freemsg(mp);
143 			continue;
144 		}
145 		if (MBLKL(mp) > infop.ovdi_hdr_size) {
146 			mp->b_rptr += infop.ovdi_hdr_size;
147 		} else {
148 			while (infop.ovdi_hdr_size != 0) {
149 				size_t rem, blkl;
150 
151 				if (mp == NULL)
152 					break;
153 
154 				blkl = MBLKL(mp);
155 				rem = MIN(infop.ovdi_hdr_size, blkl);
156 				infop.ovdi_hdr_size -= rem;
157 				mp->b_rptr += rem;
158 				if (rem == blkl) {
159 					fmp = mp;
160 					mp = fmp->b_cont;
161 					fmp->b_cont = NULL;
162 					OVERLAY_FREEMSG(mp,
163 					    "freed a fmp block");
164 					freemsg(fmp);
165 				}
166 			}
167 			if (mp == NULL) {
168 				OVERLAY_FREEMSG(mp, "freed it all...");
169 				continue;
170 			}
171 		}
172 
173 
174 		od.odd_vid = infop.ovdi_id;
175 		mutex_enter(&mux->omux_lock);
176 		odd = avl_find(&mux->omux_devices, &od, NULL);
177 		if (odd == NULL) {
178 			mutex_exit(&mux->omux_lock);
179 			OVERLAY_FREEMSG(mp, "no matching vid");
180 			freemsg(mp);
181 			continue;
182 		}
183 		mutex_enter(&odd->odd_lock);
184 		if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
185 		    !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
186 			mutex_exit(&odd->odd_lock);
187 			mutex_exit(&mux->omux_lock);
188 			OVERLAY_FREEMSG(mp, "dev dropped");
189 			freemsg(mp);
190 			continue;
191 		}
192 		overlay_io_start(odd, OVERLAY_F_IN_RX);
193 		mutex_exit(&odd->odd_lock);
194 		mutex_exit(&mux->omux_lock);
195 
196 		mac_rx(odd->odd_mh, NULL, mp);
197 
198 		mutex_enter(&odd->odd_lock);
199 		overlay_io_done(odd, OVERLAY_F_IN_RX);
200 		mutex_exit(&odd->odd_lock);
201 	}
202 
203 	return (B_TRUE);
204 }
205 
206 /*
207  * Register a given device with a socket backend. If no such device socket
208  * exists, create a new one.
209  */
210 overlay_mux_t *
211 overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol,
212     struct sockaddr *addr, socklen_t len, int *errp)
213 {
214 	int err;
215 	overlay_mux_t *mux;
216 	ksocket_t ksock;
217 
218 	if (errp == NULL)
219 		errp = &err;
220 
221 	mutex_enter(&overlay_mux_lock);
222 	for (mux = list_head(&overlay_mux_list); mux != NULL;
223 	    mux = list_next(&overlay_mux_list, mux)) {
224 		if (domain == mux->omux_domain &&
225 		    family == mux->omux_family &&
226 		    protocol == mux->omux_protocol &&
227 		    len == mux->omux_alen &&
228 		    bcmp(addr, mux->omux_addr, len) == 0) {
229 
230 			if (opp != mux->omux_plugin) {
231 				*errp = EEXIST;
232 				return (NULL);
233 			}
234 
235 			mutex_enter(&mux->omux_lock);
236 			mux->omux_count++;
237 			mutex_exit(&mux->omux_lock);
238 			mutex_exit(&overlay_mux_lock);
239 			*errp = 0;
240 			return (mux);
241 		}
242 	}
243 
244 	/*
245 	 * Today we aren't zone-aware and only exist in the global zone. When we
246 	 * allow for things to exist in the non-global zone, we'll want to use a
247 	 * credential that's actually specific to the zone.
248 	 */
249 	*errp = ksocket_socket(&ksock, domain, family, protocol, KSOCKET_SLEEP,
250 	    kcred);
251 	if (*errp != 0) {
252 		mutex_exit(&overlay_mux_lock);
253 		return (NULL);
254 	}
255 
256 	*errp = ksocket_bind(ksock, addr, len, kcred);
257 	if (*errp != 0) {
258 		mutex_exit(&overlay_mux_lock);
259 		ksocket_close(ksock, kcred);
260 		return (NULL);
261 	}
262 
263 	/*
264 	 * Ask our lower layer to optionally toggle anything they need on this
265 	 * socket. Because a socket is owned by a single type of plugin, we can
266 	 * then ask it to perform any additional socket set up it'd like to do.
267 	 */
268 	if (opp->ovp_ops->ovpo_sockopt != NULL &&
269 	    (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) {
270 		mutex_exit(&overlay_mux_lock);
271 		ksocket_close(ksock, kcred);
272 		return (NULL);
273 	}
274 
275 	mux = kmem_alloc(sizeof (overlay_mux_t), KM_SLEEP);
276 	list_link_init(&mux->omux_lnode);
277 	mux->omux_ksock = ksock;
278 	mux->omux_plugin = opp;
279 	mux->omux_domain = domain;
280 	mux->omux_family = family;
281 	mux->omux_protocol = protocol;
282 	mux->omux_addr = kmem_alloc(len, KM_SLEEP);
283 	bcopy(addr, mux->omux_addr, len);
284 	mux->omux_alen = len;
285 	mux->omux_count = 1;
286 	avl_create(&mux->omux_devices, overlay_mux_comparator,
287 	    sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_muxnode));
288 	mutex_init(&mux->omux_lock, NULL, MUTEX_DRIVER, NULL);
289 
290 
291 	/* Once this is called, we need to expect to rx data */
292 	*errp = ksocket_krecv_set(ksock, overlay_mux_recv, mux);
293 	if (*errp != 0) {
294 		ksocket_close(ksock, kcred);
295 		mutex_destroy(&mux->omux_lock);
296 		avl_destroy(&mux->omux_devices);
297 		kmem_free(mux->omux_addr, len);
298 		kmem_free(mux, sizeof (overlay_mux_t));
299 		return (NULL);
300 	}
301 
302 	list_insert_tail(&overlay_mux_list, mux);
303 	mutex_exit(&overlay_mux_lock);
304 
305 	*errp = 0;
306 	return (mux);
307 }
308 
309 void
310 overlay_mux_close(overlay_mux_t *mux)
311 {
312 	mutex_enter(&overlay_mux_lock);
313 	mutex_enter(&mux->omux_lock);
314 	mux->omux_count--;
315 	if (mux->omux_count != 0) {
316 		mutex_exit(&mux->omux_lock);
317 		mutex_exit(&overlay_mux_lock);
318 		return;
319 	}
320 	list_remove(&overlay_mux_list, mux);
321 	mutex_exit(&mux->omux_lock);
322 	mutex_exit(&overlay_mux_lock);
323 
324 	ksocket_close(mux->omux_ksock, kcred);
325 	avl_destroy(&mux->omux_devices);
326 	kmem_free(mux->omux_addr, mux->omux_alen);
327 	kmem_free(mux, sizeof (overlay_mux_t));
328 }
329 
330 void
331 overlay_mux_add_dev(overlay_mux_t *mux, overlay_dev_t *odd)
332 {
333 	mutex_enter(&mux->omux_lock);
334 	avl_add(&mux->omux_devices, odd);
335 	mutex_exit(&mux->omux_lock);
336 }
337 
338 void
339 overlay_mux_remove_dev(overlay_mux_t *mux, overlay_dev_t *odd)
340 {
341 	mutex_enter(&mux->omux_lock);
342 	avl_remove(&mux->omux_devices, odd);
343 	mutex_exit(&mux->omux_lock);
344 }
345 
346 int
347 overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp)
348 {
349 	int ret;
350 
351 	/*
352 	 * It'd be nice to be able to use MSG_MBLK_QUICKRELE, unfortunately,
353 	 * that isn't actually supported by UDP at this time.
354 	 */
355 	ret = ksocket_sendmblk(mux->omux_ksock, hdr, 0, &mp, kcred);
356 	if (ret != 0)
357 		freemsg(mp);
358 
359 	return (ret);
360 }
361