/*
 * Copyright (c) 2013  Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2023 Oxide Computer Company
 */

/*
 * viona - VirtIO-Net, Accelerated
 *
 * The purpose of viona is to provide high performance virtio-net devices to
 * bhyve guests.  It does so by sitting directly atop MAC, skipping all of the
 * DLS/DLD stack.
 *
 * --------------------
 * General Architecture
 * --------------------
 *
 * A single viona instance is composed of a "link" handle and two "rings".
 * After opening the viona device, it must be associated with a MAC network
 * interface and a bhyve (vmm) instance to form its link resource.  This is
 * done with the VNA_IOC_CREATE ioctl, where the datalink ID and vmm fd are
 * passed in to perform the initialization.  With the MAC client opened, and a
 * driver handle to the vmm instance established, the device is ready to be
 * configured by the guest.
 *
 * The userspace portion of bhyve, which interfaces with the PCI device
 * emulation framework, is meant to stay out of the datapath if at all
 * possible.  Configuration changes made via PCI are mapped to actions which
 * will steer the operation of the in-kernel logic.
 *
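 * As an illustrative sketch of that creation sequence (the device path,
 * headers, and surrounding error handling are assumptions here, not details
 * taken from the bhyve sources), userspace might do:
 *
 *	int ctl = open("/dev/viona", O_RDWR);
 *	vioc_create_t vc = {
 *		.c_linkid = linkid,
 *		.c_vmfd = vmfd,
 *	};
 *
 *	if (ctl < 0 || ioctl(ctl, VNA_IOC_CREATE, &vc) != 0)
 *		err(EXIT_FAILURE, "failed to create viona link");
 *
 * where 'linkid' is the datalink ID of the underlying NIC and 'vmfd' is an
 * open descriptor for the vmm instance.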
 *
 * -----------
 * Ring Basics
 * -----------
 *
 * Each viona link has two viona_vring_t entities, RX and TX, for handling data
 * transfers to and from the guest.  They represent an interface to the
 * standard virtio ring structures.  When initialized and active, each ring is
 * backed by a kernel worker thread (parented to the bhyve process for the
 * instance) which handles ring events.  The RX worker has the simple task of
 * watching for ring shutdown conditions.  The TX worker does that in addition
 * to processing all requests to transmit data.  Data destined for the guest is
 * delivered directly by MAC to viona_rx() when the ring is active.
 *
 *
 * -----------
 * Ring States
 * -----------
 *
 * The viona_vring_t instances follow a simple path through the possible state
 * values represented in virtio_vring_t`vr_state:
 *
 *        +<--------------------------------------------+
 *        |						|
 *        V						^
 *  +-----------+	This is the initial state when a link is created or
 *  | VRS_RESET |	when the ring has been explicitly reset.
 *  +-----------+
 *        |						^
 *        |---* ioctl(VNA_IOC_RING_INIT) issued		|
 *        |						|
 *        |						^
 *        V
 *  +-----------+	The ring parameters (size, guest physical addresses)
 *  | VRS_SETUP |	have been set and start-up of the ring worker thread
 *  +-----------+	has begun.
 *        |						^
 *        |						|
 *        |---* ring worker thread begins execution	|
 *        |						|
 *        +-------------------------------------------->+
 *        |	      |					^
 *        |	      |
 *        |	      *	If ring shutdown is requested (by ioctl or impending
 *        |		bhyve process death) while the worker thread is
 *        |		starting, the worker will transition the ring to
 *        |		VRS_RESET and exit.
 *        |						^
 *        |						|
 *        |<-------------------------------------------<+
 *        |	      |					|
 *        |	      |					^
 *        |	      *	If ring is requested to pause (but not stop) from the
 *        |		VRS_RUN state, it will return to the VRS_INIT state.
 *        |
 *        |						^
 *        |						|
 *        |						^
 *        V
 *  +-----------+	The worker thread associated with the ring has started
 *  | VRS_INIT  |	executing.  It has allocated any extra resources needed
 *  +-----------+	for the ring to operate.
 *        |						^
 *        |						|
 *        +-------------------------------------------->+
 *        |	      |					^
 *        |	      |
 *        |	      *	If ring shutdown is requested while the worker is
 *        |		waiting in VRS_INIT, it will free any extra resources
 *        |		and transition to VRS_RESET.
 *        |						^
 *        |						|
 *        |---* ioctl(VNA_IOC_RING_KICK) issued		|
 *        |						^
 *        V
 *  +-----------+	The worker thread associated with the ring is executing
 *  | VRS_RUN   |	workload specific to that ring.
 *  +-----------+
 *        |						^
 *        |---* ioctl(VNA_IOC_RING_RESET) issued	|
 *        |	(or bhyve process begins exit)		^
 *        |
 *  +-----------+	The worker thread associated with the ring is in the
 *  | VRS_STOP  |	process of exiting. All outstanding TX and RX
 *  +-----------+	requests are allowed to complete, but new requests
 *        |		must be ignored.
 *        |						^
 *        |						|
 *        +-------------------------------------------->+
 *
 *
 * While the worker thread is not running, changes to vr_state are only made by
 * viona_ioc_ring_init() under vr_lock.  There, it initializes the ring, starts
 * the worker, and sets the ring state to VRS_SETUP.  Once the worker thread
 * has been started, only it may perform ring state transitions (still under
 * the protection of vr_lock), when requested by outside consumers via
 * vr_state_flags or when the containing bhyve process initiates an exit.
 *
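 * As a sketch of how a consumer might drive those transitions (the ioctl
 * names and vioc_ring_init_t fields are those used later in this file, while
 * 'vfd', 'qsize', and 'qpa' are assumed), with the resulting state changes
 * noted in parentheses:
 *
 *	vioc_ring_init_t ri = {
 *		.ri_index = VIONA_VQ_TX,
 *		.ri_qsize = qsize,
 *		.ri_qaddr = qpa,
 *	};
 *
 *	ioctl(vfd, VNA_IOC_RING_INIT, &ri);	     (VRS_RESET -> VRS_SETUP)
 *	ioctl(vfd, VNA_IOC_RING_KICK, VIONA_VQ_TX);  (VRS_INIT -> VRS_RUN)
 *	ioctl(vfd, VNA_IOC_RING_PAUSE, VIONA_VQ_TX); (VRS_RUN -> VRS_INIT)
 *	ioctl(vfd, VNA_IOC_RING_RESET, VIONA_VQ_TX); (any state -> VRS_RESET)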
 *
 * ----------------------------
 * Transmission mblk_t Handling
 * ----------------------------
 *
 * For incoming frames destined for a bhyve guest, the data must first land in
 * a host OS buffer from the physical NIC before it is copied into the awaiting
 * guest buffer(s).  Outbound frames transmitted by the guest are not bound by
 * this limitation and can avoid extra copying before the buffers are accessed
 * directly by the NIC.  When a guest designates buffers to be transmitted,
 * viona translates the guest-physical addresses contained in the ring
 * descriptors to host-virtual addresses via viona_hold_page().  That pointer is
 * wrapped in an mblk_t using a preallocated viona_desb_t for the desballoc().
 * Doing so increments vr_xfer_outstanding, preventing the ring from being
 * reset (allowing the link to drop its vmm handle to the guest) until all
 * transmit mblks referencing guest memory have been processed.  Allocation of
 * the viona_desb_t entries is done during the VRS_INIT stage of the ring
 * worker thread.  The ring size informs that allocation as the number of
 * concurrent transmissions is limited by the number of descriptors in the
 * ring.  This minimizes allocation in the transmit hot-path by acquiring those
 * fixed-size resources during initialization.
 *
 * This optimization depends on the underlying NIC driver freeing the mblks in
 * a timely manner after they have been transmitted by the hardware.  Some
 * drivers have been found to flush TX descriptors only when new transmissions
 * are initiated.  This means that there is no upper bound on the time needed
 * for an mblk to be flushed, which can stall bhyve guests attempting to shut
 * down, since their memory must be free of viona TX references prior to
 * clean-up.
 *
 * This expectation of deterministic mblk_t processing is likely the reason
 * behind the notable exception to the zero-copy TX path: systems with 'bnxe'
 * loaded will copy transmit data into fresh buffers rather than passing up
 * zero-copy mblks.  It is a hold-over from the original viona sources provided
 * by Pluribus and its continued necessity has not been confirmed.
 *
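 * In outline, the zero-copy wrapping follows the standard desballoc(9F)
 * pattern.  The sketch below is illustrative rather than a copy of the real
 * TX code: viona_desb_t's actual layout lives elsewhere, viona_desb_release()
 * is a hypothetical name for the free callback, and error handling is elided:
 *
 *	typedef struct viona_desb {
 *		frtn_t		d_frtn;
 *		viona_vring_t	*d_ring;
 *	} viona_desb_t;
 *
 *	dp->d_frtn.free_func = viona_desb_release;
 *	dp->d_frtn.free_arg = (caddr_t)dp;
 *	atomic_inc_uint(&ring->vr_xfer_outstanding);
 *	mp = desballoc(host_va, len, 0, &dp->d_frtn);
 *
 * When the NIC driver eventually frees the mblk, the callback fires and can
 * decrement vr_xfer_outstanding, allowing a pending ring reset to proceed.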
 *
 * ----------------------------
 * Ring Notification Fast-paths
 * ----------------------------
 *
 * Device operation for viona requires that notifications flow to and from the
 * guest to indicate certain ring conditions.  In order to minimize latency and
 * processing overhead, the notification procedures are kept in-kernel whenever
 * possible.
 *
 * Guest-to-host notifications, when new available descriptors have been placed
 * in the ring, are posted via the 'queue notify' address in the virtio BAR.
 * The vmm_drv_ioport_hook() interface was added to bhyve to allow viona to
 * install a callback hook on an ioport address.  Guest exits for accesses to
 * viona-hooked ioport addresses result in direct calls to notify the
 * appropriate ring worker, without a trip to userland.
 *
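 * Sketch of the userspace side of that arrangement, using the ioctl handled
 * below (the 'notify_port' value, chosen by the PCI emulation when it
 * configures the BAR, is assumed):
 *
 *	if (ioctl(vfd, VNA_IOC_SET_NOTIFY_IOP, notify_port) != 0)
 *		err(EXIT_FAILURE, "failed to hook notify ioport");
 *
 * Thereafter, a guest write of a queue index to that port lands in
 * viona_notify_iop() below, which wakes the matching ring worker.
 *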
 * Host-to-guest notifications in the form of interrupts enjoy similar
 * acceleration.  Each viona ring can be configured to send MSI notifications
 * to the guest as virtio conditions dictate.  This in-kernel interrupt
 * configuration is kept synchronized through viona ioctls which are utilized
 * during writes to the associated PCI config registers or MSI-X BAR.
 *
 * Guests which do not utilize MSI-X cause viona to fall back to the slow path
 * for interrupts: userspace will poll(2) the viona handle, receiving
 * notification when ring events necessitate the assertion of an interrupt.
 *
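 * A sketch of that slow path, relying on viona_chpoll() and the
 * VNA_IOC_INTR_POLL/VNA_IOC_RING_INTR_CLR handlers below (the loop structure
 * and the interrupt injection step are assumed):
 *
 *	struct pollfd pfd = { .fd = vfd, .events = POLLRDBAND };
 *	vioc_intr_poll_t vip;
 *
 *	(void) poll(&pfd, 1, -1);
 *	(void) ioctl(vfd, VNA_IOC_INTR_POLL, &vip);
 *	for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
 *		if (vip.vip_status[i] != 0) {
 *			inject_interrupt_for_ring(i);
 *			(void) ioctl(vfd, VNA_IOC_RING_INTR_CLR, i);
 *		}
 *	}
 *
 * where inject_interrupt_for_ring() stands in for the device model's
 * interrupt assertion logic.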
 *
 * ---------------
 * Nethook Support
 * ---------------
 *
 * Viona provides four nethook events that consumers (e.g. ipf) can hook into
 * to intercept packets as they go up or down the stack.  Unfortunately,
 * the nethook framework does not understand raw packets, so we can only
 * generate events (in, out) for IPv4 and IPv6 packets.  At driver attach,
 * we register callbacks with the neti (netinfo) module that will be invoked
 * for each netstack already present, as well as for any additional netstack
 * instances created as the system operates.  These callbacks will
 * register/unregister the hooks with the nethook framework for each
 * netstack instance.  This registration occurs prior to creating any
 * viona instances for a given netstack, and the unregistration for a netstack
 * instance occurs after all viona instances of the netstack instance have
 * been deleted.
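 *
 * The shape of that registration, as a sketch: the netinfo interfaces
 * (net_instance_alloc(), net_instance_register()) are real, but the callback
 * names below are placeholders for those in viona's nethook code:
 *
 *	net_instance_t *ni = net_instance_alloc(NETINFO_VERSION);
 *
 *	ni->nin_name = "viona";
 *	ni->nin_create = viona_neti_create;
 *	ni->nin_shutdown = viona_neti_shutdown;
 *	ni->nin_destroy = viona_neti_destroy;
 *	(void) net_instance_register(ni);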
 */

#include <sys/conf.h>
#include <sys/file.h>
#include <sys/stat.h>

#include <sys/dlpi.h>
#include <sys/vlan.h>

#include "viona_impl.h"


#define	VIONA_NAME		"Virtio Network Accelerator"
#define	VIONA_CTL_MINOR		0
#define	VIONA_CLI_NAME		"viona"		/* MAC client name */


/*
 * Host capabilities.
 */
#define	VIONA_S_HOSTCAPS	(	\
	VIRTIO_NET_F_GUEST_CSUM |	\
	VIRTIO_NET_F_MAC |		\
	VIRTIO_NET_F_GUEST_TSO4 |	\
	VIRTIO_NET_F_MRG_RXBUF |	\
	VIRTIO_NET_F_STATUS |		\
	VIRTIO_F_RING_NOTIFY_ON_EMPTY |	\
	VIRTIO_F_RING_INDIRECT_DESC)

/* MAC_CAPAB_HCKSUM specifics of interest */
#define	VIONA_CAP_HCKSUM_INTEREST	\
	(HCKSUM_INET_PARTIAL |		\
	HCKSUM_INET_FULL_V4 |		\
	HCKSUM_INET_FULL_V6)

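/*
 * Feature negotiation happens over the link fd via a get/mask/set pair of
 * ioctls, handled in viona_ioctl() below.  A sketch from the userspace side
 * ('vfd' and the 'wanted' feature mask are assumed):
 *
 *	int feat;
 *
 *	(void) ioctl(vfd, VNA_IOC_GET_FEATURES, &feat);
 *	feat &= wanted;
 *	(void) ioctl(vfd, VNA_IOC_SET_FEATURES, &feat);
 */
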
static void		*viona_state;
static dev_info_t	*viona_dip;
static id_space_t	*viona_minors;


static int viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg,
    void **result);
static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp);
static int viona_close(dev_t dev, int flag, int otype, cred_t *credp);
static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode,
    cred_t *credp, int *rval);
static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp);

static int viona_ioc_create(viona_soft_state_t *, void *, int, cred_t *);
static int viona_ioc_delete(viona_soft_state_t *, boolean_t);

static int viona_ioc_set_notify_ioport(viona_link_t *, uint16_t);
static int viona_ioc_ring_init(viona_link_t *, void *, int);
static int viona_ioc_ring_set_state(viona_link_t *, void *, int);
static int viona_ioc_ring_get_state(viona_link_t *, void *, int);
static int viona_ioc_ring_reset(viona_link_t *, uint_t);
static int viona_ioc_ring_kick(viona_link_t *, uint_t);
static int viona_ioc_ring_pause(viona_link_t *, uint_t);
static int viona_ioc_ring_set_msi(viona_link_t *, void *, int);
static int viona_ioc_ring_intr_clear(viona_link_t *, uint_t);
static int viona_ioc_intr_poll(viona_link_t *, void *, int, int *);

static struct cb_ops viona_cb_ops = {
	viona_open,
	viona_close,
	nodev,
	nodev,
	nodev,
	nodev,
	nodev,
	viona_ioctl,
	nodev,
	nodev,
	nodev,
	viona_chpoll,
	ddi_prop_op,
	0,
	D_MP | D_NEW | D_HOTPLUG,
	CB_REV,
	nodev,
	nodev
};

static struct dev_ops viona_ops = {
	DEVO_REV,
	0,
	viona_info,
	nulldev,
	nulldev,
	viona_attach,
	viona_detach,
	nodev,
	&viona_cb_ops,
	NULL,
	ddi_power,
	ddi_quiesce_not_needed
};

static struct modldrv modldrv = {
	&mod_driverops,
	VIONA_NAME,
	&viona_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};

int
_init(void)
{
	int ret;

	ret = ddi_soft_state_init(&viona_state, sizeof (viona_soft_state_t), 0);
	if (ret != 0) {
		return (ret);
	}

	viona_minors = id_space_create("viona_minors",
	    VIONA_CTL_MINOR + 1, UINT16_MAX);
	viona_rx_init();
	mutex_init(&viona_force_copy_lock, NULL, MUTEX_DRIVER, NULL);

	ret = mod_install(&modlinkage);
	if (ret != 0) {
		ddi_soft_state_fini(&viona_state);
		id_space_destroy(viona_minors);
		viona_rx_fini();
		mutex_destroy(&viona_force_copy_lock);
	}

	return (ret);
}

int
_fini(void)
{
	int ret;

	ret = mod_remove(&modlinkage);
	if (ret != 0) {
		return (ret);
	}

	ddi_soft_state_fini(&viona_state);
	id_space_destroy(viona_minors);
	viona_rx_fini();
	mutex_destroy(&viona_force_copy_lock);

	return (ret);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/* ARGSUSED */
static int
viona_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
	int error;

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)viona_dip;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
		break;
	}
	return (error);
}

static int
viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	if (ddi_create_minor_node(dip, "viona", S_IFCHR, VIONA_CTL_MINOR,
	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
		return (DDI_FAILURE);
	}

	viona_neti_attach();

	viona_dip = dip;
	ddi_report_dev(viona_dip);

	return (DDI_SUCCESS);
}

static int
viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	dev_info_t *old_dip = viona_dip;

	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	VERIFY(old_dip != NULL);

	viona_neti_detach();
	viona_dip = NULL;
	ddi_remove_minor_node(old_dip, NULL);

	return (DDI_SUCCESS);
}

static int
viona_open(dev_t *devp, int flag, int otype, cred_t *credp)
{
	int	minor;
	viona_soft_state_t *ss;

	if (otype != OTYP_CHR) {
		return (EINVAL);
	}
#if 0
	/*
	 * XXX-mg: drv_priv() is wrong, but I'm not sure what is right.
	 * Should the check be at open() or ioctl()?
	 */
	if (drv_priv(credp) != 0) {
		return (EPERM);
	}
#endif
	if (getminor(*devp) != VIONA_CTL_MINOR) {
		return (ENXIO);
	}

	minor = id_alloc_nosleep(viona_minors);
	if (minor == -1) {
		/* All minors are busy */
		return (EBUSY);
	}
	if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) {
		id_free(viona_minors, minor);
		return (ENOMEM);
	}

	ss = ddi_get_soft_state(viona_state, minor);
	mutex_init(&ss->ss_lock, NULL, MUTEX_DEFAULT, NULL);
	*devp = makedevice(getmajor(*devp), minor);

	return (0);
}

static int
viona_close(dev_t dev, int flag, int otype, cred_t *credp)
{
	int			minor;
	viona_soft_state_t	*ss;

	if (otype != OTYP_CHR) {
		return (EINVAL);
	}

	minor = getminor(dev);

	ss = ddi_get_soft_state(viona_state, minor);
	if (ss == NULL) {
		return (ENXIO);
	}

	VERIFY0(viona_ioc_delete(ss, B_TRUE));
	VERIFY(!list_link_active(&ss->ss_node));
	ddi_soft_state_free(viona_state, minor);
	id_free(viona_minors, minor);

	return (0);
}

static int
viona_ioctl(dev_t dev, int cmd, intptr_t data, int md, cred_t *cr, int *rv)
{
	viona_soft_state_t *ss;
	void *dptr = (void *)data;
	int err = 0, val;
	viona_link_t *link;

	ss = ddi_get_soft_state(viona_state, getminor(dev));
	if (ss == NULL) {
		return (ENXIO);
	}

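	/* Handle the ioctls which do not require an established link */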
	switch (cmd) {
	case VNA_IOC_CREATE:
		return (viona_ioc_create(ss, dptr, md, cr));
	case VNA_IOC_DELETE:
		return (viona_ioc_delete(ss, B_FALSE));
	case VNA_IOC_VERSION:
		*rv = VIONA_CURRENT_INTERFACE_VERSION;
		return (0);
	default:
		break;
	}

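	/*
	 * All remaining ioctls require an established link which is not
	 * undergoing destruction, and whose vmm hold is not being released.
	 */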
	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL || link->l_destroyed ||
	    vmm_drv_release_reqd(link->l_vm_hold)) {
		mutex_exit(&ss->ss_lock);
		return (ENXIO);
	}

	switch (cmd) {
	case VNA_IOC_GET_FEATURES:
		val = VIONA_S_HOSTCAPS | link->l_features_hw;
		if (ddi_copyout(&val, dptr, sizeof (val), md) != 0) {
			err = EFAULT;
		}
		break;
	case VNA_IOC_SET_FEATURES:
		if (ddi_copyin(dptr, &val, sizeof (val), md) != 0) {
			err = EFAULT;
			break;
		}
		val &= (VIONA_S_HOSTCAPS | link->l_features_hw);

		if ((val & VIRTIO_NET_F_CSUM) == 0)
			val &= ~VIRTIO_NET_F_HOST_TSO4;

		if ((val & VIRTIO_NET_F_GUEST_CSUM) == 0)
			val &= ~VIRTIO_NET_F_GUEST_TSO4;

		link->l_features = val;
		break;
	case VNA_IOC_RING_INIT:
		err = viona_ioc_ring_init(link, dptr, md);
		break;
	case VNA_IOC_RING_RESET:
		err = viona_ioc_ring_reset(link, (uint_t)data);
		break;
	case VNA_IOC_RING_KICK:
		err = viona_ioc_ring_kick(link, (uint_t)data);
		break;
	case VNA_IOC_RING_SET_MSI:
		err = viona_ioc_ring_set_msi(link, dptr, md);
		break;
	case VNA_IOC_RING_INTR_CLR:
		err = viona_ioc_ring_intr_clear(link, (uint_t)data);
		break;
	case VNA_IOC_RING_SET_STATE:
		err = viona_ioc_ring_set_state(link, dptr, md);
		break;
	case VNA_IOC_RING_GET_STATE:
		err = viona_ioc_ring_get_state(link, dptr, md);
		break;
	case VNA_IOC_RING_PAUSE:
		err = viona_ioc_ring_pause(link, (uint_t)data);
		break;

	case VNA_IOC_INTR_POLL:
		err = viona_ioc_intr_poll(link, dptr, md, rv);
		break;
	case VNA_IOC_SET_NOTIFY_IOP:
		if (data < 0 || data > UINT16_MAX) {
			err = EINVAL;
			break;
		}
		err = viona_ioc_set_notify_ioport(link, (uint16_t)data);
		break;
	default:
		err = ENOTTY;
		break;
	}

	mutex_exit(&ss->ss_lock);
	return (err);
}

static int
viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	viona_soft_state_t *ss;
	viona_link_t *link;

	ss = ddi_get_soft_state(viona_state, getminor(dev));
	if (ss == NULL) {
		return (ENXIO);
	}

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL || link->l_destroyed) {
		mutex_exit(&ss->ss_lock);
		return (ENXIO);
	}

	*reventsp = 0;
	if ((events & POLLRDBAND) != 0) {
		for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
			if (link->l_vrings[i].vr_intr_enabled != 0) {
				*reventsp |= POLLRDBAND;
				break;
			}
		}
	}
	if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
		*phpp = &link->l_pollhead;
	}
	mutex_exit(&ss->ss_lock);

	return (0);
}

static void
viona_get_mac_capab(viona_link_t *link)
{
	mac_handle_t mh = link->l_mh;
	uint32_t cap = 0;
	mac_capab_lso_t lso_cap;

	link->l_features_hw = 0;
	if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap)) {
		/*
		 * Only report HW checksum ability if the underlying MAC
		 * resource is capable of populating the L4 header.
		 */
		if ((cap & VIONA_CAP_HCKSUM_INTEREST) != 0) {
			link->l_features_hw |= VIRTIO_NET_F_CSUM;
		}
		link->l_cap_csum = cap;
	}

	if ((link->l_features_hw & VIRTIO_NET_F_CSUM) &&
	    mac_capab_get(mh, MAC_CAPAB_LSO, &lso_cap)) {
		/*
		 * Virtio doesn't allow for negotiating a maximum LSO
		 * packet size. We have to assume that the guest may
		 * send a maximum length IP packet. Make sure the
		 * underlying MAC can handle an LSO of this size.
		 */
		if ((lso_cap.lso_flags & LSO_TX_BASIC_TCP_IPV4) &&
		    lso_cap.lso_basic_tcp_ipv4.lso_max >= IP_MAXPACKET)
			link->l_features_hw |= VIRTIO_NET_F_HOST_TSO4;
	}
}

static int
viona_ioc_create(viona_soft_state_t *ss, void *dptr, int md, cred_t *cr)
{
	vioc_create_t	kvc;
	viona_link_t	*link = NULL;
	char		cli_name[MAXNAMELEN];
	int		err = 0;
	file_t		*fp;
	vmm_hold_t	*hold = NULL;
	viona_neti_t	*nip = NULL;
	zoneid_t	zid;
	mac_diag_t	mac_diag = MAC_DIAG_NONE;

	ASSERT(MUTEX_NOT_HELD(&ss->ss_lock));

	if (ddi_copyin(dptr, &kvc, sizeof (kvc), md) != 0) {
		return (EFAULT);
	}

	zid = crgetzoneid(cr);
	nip = viona_neti_lookup_by_zid(zid);
	if (nip == NULL) {
		return (EIO);
	}

	if (!nip->vni_nethook.vnh_hooked) {
		viona_neti_rele(nip);
		return (EIO);
	}

	mutex_enter(&ss->ss_lock);
	if (ss->ss_link != NULL) {
		mutex_exit(&ss->ss_lock);
		viona_neti_rele(nip);
		return (EEXIST);
	}

	if ((fp = getf(kvc.c_vmfd)) == NULL) {
		err = EBADF;
		goto bail;
	}
	err = vmm_drv_hold(fp, cr, &hold);
	releasef(kvc.c_vmfd);
	if (err != 0) {
		goto bail;
	}

	link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP);
	link->l_linkid = kvc.c_linkid;
	link->l_vm_hold = hold;

	err = mac_open_by_linkid(link->l_linkid, &link->l_mh);
	if (err != 0) {
		goto bail;
	}

	viona_get_mac_capab(link);

	(void) snprintf(cli_name, sizeof (cli_name), "%s-%d", VIONA_CLI_NAME,
	    link->l_linkid);
	err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0);
	if (err != 0) {
		goto bail;
	}

	err = mac_unicast_add(link->l_mch, NULL, MAC_UNICAST_PRIMARY,
	    &link->l_muh, VLAN_ID_NONE, &mac_diag);
	if (err != 0) {
		goto bail;
	}

	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_RX]);
	viona_ring_alloc(link, &link->l_vrings[VIONA_VQ_TX]);

	if ((err = viona_rx_set(link)) != 0) {
		viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
		viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
		goto bail;
	}

	link->l_neti = nip;
	ss->ss_link = link;
	mutex_exit(&ss->ss_lock);

	mutex_enter(&nip->vni_lock);
	list_insert_tail(&nip->vni_dev_list, ss);
	mutex_exit(&nip->vni_lock);

	return (0);

bail:
	if (link != NULL) {
		if (link->l_mch != NULL) {
			if (link->l_muh != NULL) {
				VERIFY0(mac_unicast_remove(link->l_mch,
				    link->l_muh));
				link->l_muh = NULL;
			}
			mac_client_close(link->l_mch, 0);
		}
		if (link->l_mh != NULL) {
			mac_close(link->l_mh);
		}
		kmem_free(link, sizeof (viona_link_t));
	}
	if (hold != NULL) {
		vmm_drv_rele(hold);
	}
	viona_neti_rele(nip);

	mutex_exit(&ss->ss_lock);
	return (err);
}

static int
viona_ioc_delete(viona_soft_state_t *ss, boolean_t on_close)
{
	viona_link_t *link;
	viona_neti_t *nip = NULL;

	mutex_enter(&ss->ss_lock);
	if ((link = ss->ss_link) == NULL) {
		/* Link destruction already complete */
		mutex_exit(&ss->ss_lock);
		return (0);
	}

	if (link->l_destroyed) {
		/*
		 * Link destruction has been started by another thread, but has
		 * not completed.  This condition should be impossible to
		 * encounter when performing the on-close destroy of the link,
		 * since racing ioctl accessors must necessarily be absent.
		 */
		VERIFY(!on_close);
		mutex_exit(&ss->ss_lock);
		return (EAGAIN);
	}
	/*
	 * The link deletion cannot fail after this point, continuing until its
	 * successful completion is reached.
	 */
	link->l_destroyed = B_TRUE;

	/*
	 * Tear down the IO port hook so it cannot be used to kick any of the
	 * rings which are about to be reset and stopped.
	 */
	VERIFY0(viona_ioc_set_notify_ioport(link, 0));
	mutex_exit(&ss->ss_lock);

	/*
	 * Return the rings to their reset state, ignoring any possible
	 * interruptions from signals.
	 */
	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_RX], B_FALSE));
	VERIFY0(viona_ring_reset(&link->l_vrings[VIONA_VQ_TX], B_FALSE));

	mutex_enter(&ss->ss_lock);
	if (link->l_mch != NULL) {
		/* Unhook the receive callbacks and close out the client */
		viona_rx_clear(link);
		if (link->l_muh != NULL) {
			VERIFY0(mac_unicast_remove(link->l_mch, link->l_muh));
			link->l_muh = NULL;
		}
		mac_client_close(link->l_mch, 0);
	}
	if (link->l_mh != NULL) {
		mac_close(link->l_mh);
	}
	if (link->l_vm_hold != NULL) {
		vmm_drv_rele(link->l_vm_hold);
		link->l_vm_hold = NULL;
	}

	nip = link->l_neti;
	link->l_neti = NULL;

	viona_ring_free(&link->l_vrings[VIONA_VQ_RX]);
	viona_ring_free(&link->l_vrings[VIONA_VQ_TX]);
	pollhead_clean(&link->l_pollhead);
	ss->ss_link = NULL;
	mutex_exit(&ss->ss_lock);

	mutex_enter(&nip->vni_lock);
	list_remove(&nip->vni_dev_list, ss);
	mutex_exit(&nip->vni_lock);

	viona_neti_rele(nip);

	kmem_free(link, sizeof (viona_link_t));
	return (0);
}

static int
viona_ioc_ring_init(viona_link_t *link, void *udata, int md)
{
	vioc_ring_init_t kri;
	int err;

	if (ddi_copyin(udata, &kri, sizeof (kri), md) != 0) {
		return (EFAULT);
	}
	const struct viona_ring_params params = {
		.vrp_pa = kri.ri_qaddr,
		.vrp_size = kri.ri_qsize,
		.vrp_avail_idx = 0,
		.vrp_used_idx = 0,
	};

	err = viona_ring_init(link, kri.ri_index, &params);

	return (err);
}

static int
viona_ioc_ring_set_state(viona_link_t *link, void *udata, int md)
{
	vioc_ring_state_t krs;
	int err;

	if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
		return (EFAULT);
	}
	const struct viona_ring_params params = {
		.vrp_pa = krs.vrs_qaddr,
		.vrp_size = krs.vrs_qsize,
		.vrp_avail_idx = krs.vrs_avail_idx,
		.vrp_used_idx = krs.vrs_used_idx,
	};

	err = viona_ring_init(link, krs.vrs_index, &params);

	return (err);
}

static int
viona_ioc_ring_get_state(viona_link_t *link, void *udata, int md)
{
	vioc_ring_state_t krs;

	if (ddi_copyin(udata, &krs, sizeof (krs), md) != 0) {
		return (EFAULT);
	}

	struct viona_ring_params params;
	int err = viona_ring_get_state(link, krs.vrs_index, &params);
	if (err != 0) {
		return (err);
	}
	krs.vrs_qsize = params.vrp_size;
	krs.vrs_qaddr = params.vrp_pa;
	krs.vrs_avail_idx = params.vrp_avail_idx;
	krs.vrs_used_idx = params.vrp_used_idx;

	if (ddi_copyout(&krs, udata, sizeof (krs), md) != 0) {
		return (EFAULT);
	}
	return (0);
}

static int
viona_ioc_ring_reset(viona_link_t *link, uint_t idx)
{
	viona_vring_t *ring;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}
	ring = &link->l_vrings[idx];

	return (viona_ring_reset(ring, B_TRUE));
}

static int
viona_ioc_ring_kick(viona_link_t *link, uint_t idx)
{
	viona_vring_t *ring;
	int err;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}
	ring = &link->l_vrings[idx];

	mutex_enter(&ring->vr_lock);
	switch (ring->vr_state) {
	case VRS_SETUP:
		/*
		 * An early kick to a ring which is starting its worker thread
		 * is fine.  Once that thread is active, it will process the
		 * start-up request immediately.
		 */
		/* FALLTHROUGH */
	case VRS_INIT:
		ring->vr_state_flags |= VRSF_REQ_START;
		/* FALLTHROUGH */
	case VRS_RUN:
		cv_broadcast(&ring->vr_cv);
		err = 0;
		break;
	default:
		err = EBUSY;
		break;
	}
	mutex_exit(&ring->vr_lock);

	return (err);
}

static int
viona_ioc_ring_pause(viona_link_t *link, uint_t idx)
{
	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	viona_vring_t *ring = &link->l_vrings[idx];
	return (viona_ring_pause(ring));
}

static int
viona_ioc_ring_set_msi(viona_link_t *link, void *data, int md)
{
	vioc_ring_msi_t vrm;
	viona_vring_t *ring;

	if (ddi_copyin(data, &vrm, sizeof (vrm), md) != 0) {
		return (EFAULT);
	}
	if (vrm.rm_index >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	ring = &link->l_vrings[vrm.rm_index];
	mutex_enter(&ring->vr_lock);
	ring->vr_msi_addr = vrm.rm_addr;
	ring->vr_msi_msg = vrm.rm_msg;
	mutex_exit(&ring->vr_lock);

	return (0);
}

static int
viona_notify_iop(void *arg, bool in, uint16_t port, uint8_t bytes,
    uint32_t *val)
{
	viona_link_t *link = (viona_link_t *)arg;

	/*
	 * If the request is a read (in/ins), or directed at a port other
	 * than the one we expect to be registered on, ignore it.
	 */
	if (in || port != link->l_notify_ioport) {
		return (ESRCH);
	}

	/* Let userspace handle notifications for rings other than RX/TX. */
	const uint16_t vq = *val;
	if (vq >= VIONA_VQ_MAX) {
		return (ESRCH);
	}

	viona_vring_t *ring = &link->l_vrings[vq];
	int res = 0;

	mutex_enter(&ring->vr_lock);
	if (ring->vr_state == VRS_RUN) {
		cv_broadcast(&ring->vr_cv);
	} else {
		res = ESRCH;
	}
	mutex_exit(&ring->vr_lock);

	return (res);
}

static int
viona_ioc_set_notify_ioport(viona_link_t *link, uint16_t ioport)
{
	int err = 0;

	if (link->l_notify_ioport != 0) {
		vmm_drv_ioport_unhook(link->l_vm_hold, &link->l_notify_cookie);
		link->l_notify_ioport = 0;
	}

	if (ioport != 0) {
		err = vmm_drv_ioport_hook(link->l_vm_hold, ioport,
		    viona_notify_iop, (void *)link, &link->l_notify_cookie);
		if (err == 0) {
			link->l_notify_ioport = ioport;
		}
	}
	return (err);
}

static int
viona_ioc_ring_intr_clear(viona_link_t *link, uint_t idx)
{
	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	link->l_vrings[idx].vr_intr_enabled = 0;
	return (0);
}

static int
viona_ioc_intr_poll(viona_link_t *link, void *udata, int md, int *rv)
{
	uint_t cnt = 0;
	vioc_intr_poll_t vip;

	for (uint_t i = 0; i < VIONA_VQ_MAX; i++) {
		uint_t val = link->l_vrings[i].vr_intr_enabled;

		vip.vip_status[i] = val;
		if (val != 0) {
			cnt++;
		}
	}

	if (ddi_copyout(&vip, udata, sizeof (vip), md) != 0) {
		return (EFAULT);
	}
	*rv = (int)cnt;
	return (0);
}
1123