xref: /illumos-gate/usr/src/cmd/bhyve/pci_passthru.c (revision f96a0cef040313f6281fbc014a0b63d5c5cc760f)
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/pciio.h>
#include <sys/ioctl.h>

#include <sys/pci.h>

#include <dev/io/iodev.h>
#include <dev/pci/pcireg.h>

#include <machine/iodev.h>

#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <sysexits.h>
#include <unistd.h>

#include <machine/vmm.h>
#include <vmmapi.h>
#include <sys/ppt_dev.h>

#include "config.h"
#include "debug.h"
#include "pci_emul.h"
#include "mem.h"

#define	LEGACY_SUPPORT	1

#define	MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1)
#define	MSIX_CAPLEN 12

struct passthru_softc {
	struct pci_devinst *psc_pi;
	struct pcibar psc_bar[PCI_BARMAX + 1];
	struct {
		int		capoff;
		int		msgctrl;
		int		emulated;
	} psc_msi;
	struct {
		int		capoff;
	} psc_msix;
	int pptfd;
	int msi_limit;
	int msix_limit;
};

static int
msi_caplen(int msgctrl)
{
	int len;

	len = 10;		/* minimum length of msi capability */

	if (msgctrl & PCIM_MSICTRL_64BIT)
		len += 4;

#if 0
	/*
	 * Ignore the 'mask' and 'pending' bits in the MSI capability.
	 * We'll let the guest manipulate them directly.
	 */
	if (msgctrl & PCIM_MSICTRL_VECTOR)
		len += 10;
#endif

	return (len);
}

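/*
 * Config space accesses are brokered by the ppt driver: the PPT_CFG_READ and
 * PPT_CFG_WRITE ioctls read or write 'width' bytes at offset 'reg' in the
 * physical device's config space.  A failed read is reported as all-zeroes.
 */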
static uint32_t
read_config(const struct passthru_softc *sc, long reg, int width)
{
	struct ppt_cfg_io pi;

	pi.pci_off = reg;
	pi.pci_width = width;

	if (ioctl(sc->pptfd, PPT_CFG_READ, &pi) != 0) {
		return (0);
	}
	return (pi.pci_data);
}

static void
write_config(const struct passthru_softc *sc, long reg, int width,
    uint32_t data)
{
	struct ppt_cfg_io pi;

	pi.pci_off = reg;
	pi.pci_width = width;
	pi.pci_data = data;

	(void) ioctl(sc->pptfd, PPT_CFG_WRITE, &pi);
}

static int
passthru_get_bar(struct passthru_softc *sc, int bar, enum pcibar_type *type,
    uint64_t *base, uint64_t *size)
{
	struct ppt_bar_query pb;

	pb.pbq_baridx = bar;

	if (ioctl(sc->pptfd, PPT_BAR_QUERY, &pb) != 0) {
		return (-1);
	}

	switch (pb.pbq_type) {
	case PCI_ADDR_IO:
		*type = PCIBAR_IO;
		break;
	case PCI_ADDR_MEM32:
		*type = PCIBAR_MEM32;
		break;
	case PCI_ADDR_MEM64:
		*type = PCIBAR_MEM64;
		break;
	default:
		errx(1, "unrecognized BAR type: %u", pb.pbq_type);
		break;
	}

	*base = pb.pbq_base;
	*size = pb.pbq_size;
	return (0);
}

static int
passthru_dev_open(const char *path, int *pptfdp)
{
	int pptfd;

	if ((pptfd = open(path, O_RDWR)) < 0) {
		return (errno);
	}

	/* XXX: verify fd with ioctl? */
	*pptfdp = pptfd;
	return (0);
}

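/*
 * With LEGACY_SUPPORT, a synthetic MSI capability is crafted for devices
 * which lack one, so that their legacy (INTx) interrupt can be surfaced to
 * the guest as a virtual MSI (see cfginitmsi() and passthru_cfgwrite()).
 */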
#ifdef LEGACY_SUPPORT
static int
passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
{
	int capoff, i;
	struct msicap msicap;
	u_char *capdata;

	pci_populate_msicap(&msicap, msgnum, nextptr);

	/*
	 * XXX
	 * Copy the msi capability structure in the last 16 bytes of the
	 * config space. This is wrong because it could shadow something
	 * useful to the device.
	 */
	capoff = 256 - roundup(sizeof(msicap), 4);
	capdata = (u_char *)&msicap;
	for (i = 0; i < sizeof(msicap); i++)
		pci_set_cfgdata8(pi, capoff + i, capdata[i]);

	return (capoff);
}
#endif	/* LEGACY_SUPPORT */

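/*
 * The host may cap the number of MSI/MSI-X vectors a passthru device can
 * use.  Clamp the vector counts advertised in the emulated capability
 * registers to those limits; a limit of -1 means "no limit".
 */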
static void
passthru_intr_limit(struct passthru_softc *sc, struct msixcap *msixcap)
{
	struct pci_devinst *pi = sc->psc_pi;
	int off;

	/* Reduce the number of MSI vectors if higher than OS limit */
	if ((off = sc->psc_msi.capoff) != 0 && sc->msi_limit != -1) {
		int msi_limit, mmc;

		msi_limit =
		    sc->msi_limit > 16 ? PCIM_MSICTRL_MMC_32 :
		    sc->msi_limit > 8 ? PCIM_MSICTRL_MMC_16 :
		    sc->msi_limit > 4 ? PCIM_MSICTRL_MMC_8 :
		    sc->msi_limit > 2 ? PCIM_MSICTRL_MMC_4 :
		    sc->msi_limit > 1 ? PCIM_MSICTRL_MMC_2 :
		    PCIM_MSICTRL_MMC_1;
		mmc = sc->psc_msi.msgctrl & PCIM_MSICTRL_MMC_MASK;

		if (mmc > msi_limit) {
			sc->psc_msi.msgctrl &= ~PCIM_MSICTRL_MMC_MASK;
			sc->psc_msi.msgctrl |= msi_limit;
			pci_set_cfgdata16(pi, off + 2, sc->psc_msi.msgctrl);
		}
	}

	/* Reduce the number of MSI-X vectors if higher than OS limit */
	if ((off = sc->psc_msix.capoff) != 0 && sc->msix_limit != -1) {
		if (MSIX_TABLE_COUNT(msixcap->msgctrl) > sc->msix_limit) {
			msixcap->msgctrl &= ~PCIM_MSIXCTRL_TABLE_SIZE;
			msixcap->msgctrl |= sc->msix_limit - 1;
			pci_set_cfgdata16(pi, off + 2, msixcap->msgctrl);
		}
	}
}

static int
cfginitmsi(struct passthru_softc *sc)
{
	int i, ptr, capptr, cap, sts, caplen, table_size;
	uint32_t u32;
	struct pci_devinst *pi = sc->psc_pi;
	struct msixcap msixcap;
	uint32_t *msixcap_ptr;

	/*
	 * Parse the capabilities and cache the location of the MSI
	 * and MSI-X capabilities.
	 */
	sts = read_config(sc, PCIR_STATUS, 2);
	if (sts & PCIM_STATUS_CAPPRESENT) {
		ptr = read_config(sc, PCIR_CAP_PTR, 1);
		while (ptr != 0 && ptr != 0xff) {
			cap = read_config(sc, ptr + PCICAP_ID, 1);
			if (cap == PCIY_MSI) {
				/*
				 * Copy the MSI capability into the config
				 * space of the emulated pci device.
				 */
				sc->psc_msi.capoff = ptr;
				sc->psc_msi.msgctrl = read_config(sc,
				    ptr + 2, 2);
				sc->psc_msi.emulated = 0;
				caplen = msi_caplen(sc->psc_msi.msgctrl);
				capptr = ptr;
				while (caplen > 0) {
					u32 = read_config(sc, capptr, 4);
					pci_set_cfgdata32(pi, capptr, u32);
					caplen -= 4;
					capptr += 4;
				}
			} else if (cap == PCIY_MSIX) {
				/*
				 * Copy the MSI-X capability, keeping a local
				 * copy for later field extraction.
				 */
				sc->psc_msix.capoff = ptr;
				caplen = MSIX_CAPLEN;
				msixcap_ptr = (uint32_t *)&msixcap;
				capptr = ptr;
				while (caplen > 0) {
					u32 = read_config(sc, capptr, 4);
					*msixcap_ptr = u32;
					pci_set_cfgdata32(pi, capptr, u32);
					caplen -= 4;
					capptr += 4;
					msixcap_ptr++;
				}
			}
			ptr = read_config(sc, ptr + PCICAP_NEXTPTR, 1);
		}
	}

	passthru_intr_limit(sc, &msixcap);

	if (sc->psc_msix.capoff != 0) {
		pi->pi_msix.pba_bar =
		    msixcap.pba_info & PCIM_MSIX_BIR_MASK;
		pi->pi_msix.pba_offset =
		    msixcap.pba_info & ~PCIM_MSIX_BIR_MASK;
		pi->pi_msix.table_bar =
		    msixcap.table_info & PCIM_MSIX_BIR_MASK;
		pi->pi_msix.table_offset =
		    msixcap.table_info & ~PCIM_MSIX_BIR_MASK;
		pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
		pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count);

		/* Allocate the emulated MSI-X table array */
		table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
		pi->pi_msix.table = calloc(1, table_size);
		if (pi->pi_msix.table == NULL)
			return (-1);

		/* Mask all table entries */
		for (i = 0; i < pi->pi_msix.table_count; i++) {
			pi->pi_msix.table[i].vector_control |=
			    PCIM_MSIX_VCTRL_MASK;
		}
	}

#ifdef LEGACY_SUPPORT
	/*
	 * If the passthrough device does not support MSI then craft an
	 * MSI capability for it. We link the new MSI capability at the
	 * head of the list of capabilities.
	 */
	if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
		int origptr, msiptr;

		origptr = read_config(sc, PCIR_CAP_PTR, 1);
		msiptr = passthru_add_msicap(pi, 1, origptr);
		sc->psc_msi.capoff = msiptr;
		sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
		sc->psc_msi.emulated = 1;
		pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
	}
#endif

	/* Make sure one of the capabilities is present */
	if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0) {
		return (-1);
	} else {
		return (0);
	}
}

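/*
 * Reads of the MSI-X table BAR are satisfied from the emulated copy of the
 * table, except for accesses to the PBA, which are serviced from the page
 * of the physical device mapped at pba_page.
 */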
static uint64_t
passthru_msix_table_read(struct passthru_softc *sc, uint64_t offset, int size)
{
	struct pci_devinst *pi;
	struct msix_table_entry *entry;
	uint8_t *src8;
	uint16_t *src16;
	uint32_t *src32;
	uint64_t *src64;
	uint64_t data;
	size_t entry_offset;
	int index;

	pi = sc->psc_pi;
	if (offset >= pi->pi_msix.pba_offset &&
	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
		switch (size) {
		case 1:
			src8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			data = *src8;
			break;
		case 2:
			src16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			data = *src16;
			break;
		case 4:
			src32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			data = *src32;
			break;
		case 8:
			src64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			data = *src64;
			break;
		default:
			return (-1);
		}
		return (data);
	}

	if (offset < pi->pi_msix.table_offset)
		return (-1);

	offset -= pi->pi_msix.table_offset;
	index = offset / MSIX_TABLE_ENTRY_SIZE;
	if (index >= pi->pi_msix.table_count)
		return (-1);

	entry = &pi->pi_msix.table[index];
	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;

	switch (size) {
	case 1:
		src8 = (uint8_t *)((void *)entry + entry_offset);
		data = *src8;
		break;
	case 2:
		src16 = (uint16_t *)((void *)entry + entry_offset);
		data = *src16;
		break;
	case 4:
		src32 = (uint32_t *)((void *)entry + entry_offset);
		data = *src32;
		break;
	case 8:
		src64 = (uint64_t *)((void *)entry + entry_offset);
		data = *src64;
		break;
	default:
		return (-1);
	}

	return (data);
}

static void
passthru_msix_table_write(struct vmctx *ctx, int vcpu,
    struct passthru_softc *sc, uint64_t offset, int size, uint64_t data)
{
	struct pci_devinst *pi;
	struct msix_table_entry *entry;
	uint8_t *dest8;
	uint16_t *dest16;
	uint32_t *dest32;
	uint64_t *dest64;
	size_t entry_offset;
	uint32_t vector_control;
	int index;

	pi = sc->psc_pi;
	if (offset >= pi->pi_msix.pba_offset &&
	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
		switch (size) {
		case 1:
			dest8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest8 = data;
			break;
		case 2:
			dest16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest16 = data;
			break;
		case 4:
			dest32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest32 = data;
			break;
		case 8:
			dest64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
			    pi->pi_msix.pba_page_offset);
			*dest64 = data;
			break;
		default:
			break;
		}
		return;
	}

	if (offset < pi->pi_msix.table_offset)
		return;

	offset -= pi->pi_msix.table_offset;
	index = offset / MSIX_TABLE_ENTRY_SIZE;
	if (index >= pi->pi_msix.table_count)
		return;

	entry = &pi->pi_msix.table[index];
	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;

	/* Only 4 byte naturally-aligned writes are supported */
	assert(size == 4);
	assert(entry_offset % 4 == 0);

	vector_control = entry->vector_control;
	dest32 = (uint32_t *)((void *)entry + entry_offset);
	*dest32 = data;
	/* If MSI-X hasn't been enabled, do nothing */
	if (pi->pi_msix.enabled) {
		/*
		 * Forward the update to the device unless the entry was
		 * masked both before and after this write.
		 */
		if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
		    (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
			(void) vm_setup_pptdev_msix(ctx, vcpu, sc->pptfd,
			    index, entry->addr, entry->msg_data,
			    entry->vector_control);
		}
	}
}

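/*
 * The MSI-X table itself is fully emulated, but the PBA must reflect real
 * device state.  If the PBA shares a page with the table, map that page
 * from the ppt device so PBA accesses can be passed through.
 */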
static int
init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base)
{
	int idx;
	size_t remaining __unused;
	uint32_t table_size, table_offset;
	uint32_t pba_size, pba_offset;
	vm_paddr_t start __unused;
	struct pci_devinst *pi = sc->psc_pi;

	assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0);

	/*
	 * If the MSI-X table BAR maps memory intended for
	 * other uses, it is at least assured that the table
	 * either resides in its own page within the region,
	 * or it resides in a page shared with only the PBA.
	 */
	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);

	table_size = pi->pi_msix.table_offset - table_offset;
	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
	table_size = roundup2(table_size, 4096);

	idx = pi->pi_msix.table_bar;
	start = pi->pi_bar[idx].addr;
	remaining = pi->pi_bar[idx].size;

	if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) {
		pba_offset = pi->pi_msix.pba_offset;
		pba_size = pi->pi_msix.pba_size;
		if (pba_offset >= table_offset + table_size ||
		    table_offset >= pba_offset + pba_size) {
			/*
			 * If the PBA does not share a page with the MSI-X
			 * table, no PBA emulation is required.
			 */
			pi->pi_msix.pba_page = NULL;
			pi->pi_msix.pba_page_offset = 0;
		} else {
			/*
			 * The PBA overlaps with either the first or last
			 * page of the MSI-X table region.  Map the
			 * appropriate page.
			 */
			if (pba_offset <= table_offset)
				pi->pi_msix.pba_page_offset = table_offset;
			else
				pi->pi_msix.pba_page_offset = table_offset +
				    table_size - 4096;
			pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ |
			    PROT_WRITE, MAP_SHARED, sc->pptfd,
			    pi->pi_msix.pba_page_offset);
			if (pi->pi_msix.pba_page == MAP_FAILED) {
				warn("Failed to map PBA page for MSI-X on %d",
				    sc->pptfd);
				return (-1);
			}
		}
	}

	return (0);
}

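/*
 * Query each BAR of the physical device through the ppt driver and allocate
 * a matching BAR in the guest's address space.
 */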
static int
cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
{
	struct pci_devinst *pi = sc->psc_pi;
	uint_t i;

	/*
	 * Initialize BAR registers
	 */
	for (i = 0; i <= PCI_BARMAX; i++) {
		enum pcibar_type bartype;
		uint64_t base, size;
		int error;

		if (passthru_get_bar(sc, i, &bartype, &base, &size) != 0) {
			continue;
		}

		if (bartype != PCIBAR_IO) {
			if (((base | size) & PAGE_MASK) != 0) {
				warnx("passthru device %d BAR %u: "
				    "base %#lx or size %#lx not page aligned",
				    sc->pptfd, i, base, size);
				return (-1);
			}
		}

		/* Cache information about the "real" BAR */
		sc->psc_bar[i].type = bartype;
		sc->psc_bar[i].size = size;
		sc->psc_bar[i].addr = base;

		/* Allocate the BAR in the guest I/O or MMIO space */
		error = pci_emul_alloc_bar(pi, i, bartype, size);
		if (error)
			return (-1);

		/* The MSI-X table needs special handling */
		if (i == pci_msix_table_bar(pi)) {
			error = init_msix_table(ctx, sc, base);
			if (error)
				return (-1);
		}

		/*
		 * 64-bit BAR takes up two slots so skip the next one.
		 */
		if (bartype == PCIBAR_MEM64) {
			i++;
			assert(i <= PCI_BARMAX);
			sc->psc_bar[i].type = PCIBAR_MEMHI64;
		}
	}
	return (0);
}

static int
cfginit(struct vmctx *ctx, struct passthru_softc *sc)
{
	struct pci_devinst *pi = sc->psc_pi;

	if (cfginitmsi(sc) != 0) {
		warnx("failed to initialize MSI for passthru device %d",
		    sc->pptfd);
		return (-1);
	}

	if (cfginitbar(ctx, sc) != 0) {
		warnx("failed to initialize BARs for passthru device %d",
		    sc->pptfd);
		return (-1);
	}

	pci_set_cfgdata16(pi, PCIR_COMMAND, read_config(sc, PCIR_COMMAND, 2));

	return (0);
}

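/*
 * The legacy option string for a passthru device is simply a ppt device
 * path (a string beginning with "/dev/ppt"); store it as the "path" value
 * in the config tree.
 */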
static int
passthru_legacy_config(nvlist_t *nvl, const char *opts)
{
	if (opts == NULL)
		return (0);

	if (strncmp(opts, "/dev/ppt", 8) == 0)
		set_config_value_node(nvl, "path", opts);

	return (0);
}

static int
passthru_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
{
	int error, memflags, pptfd;
	struct passthru_softc *sc;
	const char *path;

	sc = NULL;
	error = 1;
	pptfd = -1;

	memflags = vm_get_memflags(ctx);
	if (!(memflags & VM_MEM_F_WIRED)) {
		warnx("passthru requires guest memory to be wired");
		goto done;
	}

	path = get_config_value_node(nvl, "path");
	if (path == NULL || passthru_dev_open(path, &pptfd) != 0) {
		warnx("invalid passthru options");
		goto done;
	}

	if (vm_assign_pptdev(ctx, pptfd) != 0) {
		warnx("PCI device at %d is not using the ppt driver", pptfd);
		goto done;
	}

	sc = calloc(1, sizeof(struct passthru_softc));
	if (sc == NULL) {
		warnx("failed to allocate passthru softc");
		goto done;
	}

	pi->pi_arg = sc;
	sc->psc_pi = pi;
	sc->pptfd = pptfd;

	if ((error = vm_get_pptdev_limits(ctx, pptfd, &sc->msi_limit,
	    &sc->msix_limit)) != 0)
		goto done;

	/* initialize config space */
	if ((error = cfginit(ctx, sc)) != 0)
		goto done;

	error = 0;		/* success */
done:
	if (error) {
		free(sc);
		if (pptfd >= 0)
			vm_unassign_pptdev(ctx, pptfd);
	}
	return (error);
}

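/*
 * Helpers to determine whether a config space access falls within one of
 * the emulated regions: the BAR registers or the MSI/MSI-X capabilities.
 */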
static int
bar_access(int coff)
{
	if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
		return (1);
	else
		return (0);
}

static int
msicap_access(struct passthru_softc *sc, int coff)
{
	int caplen;

	if (sc->psc_msi.capoff == 0)
		return (0);

	caplen = msi_caplen(sc->psc_msi.msgctrl);

	if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
		return (1);
	else
		return (0);
}

static int
msixcap_access(struct passthru_softc *sc, int coff)
{
	if (sc->psc_msix.capoff == 0)
		return (0);

	return (coff >= sc->psc_msix.capoff &&
	    coff < sc->psc_msix.capoff + MSIX_CAPLEN);
}

static int
passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
    int coff, int bytes, uint32_t *rv)
{
	struct passthru_softc *sc;

	sc = pi->pi_arg;

	/*
	 * PCI BARs and the MSI capability are emulated.
	 */
	if (bar_access(coff) || msicap_access(sc, coff))
		return (-1);

	/*
	 * MSI-X is also emulated since a limit on interrupts may be imposed
	 * by the OS, altering the perceived register state.
	 */
	if (msixcap_access(sc, coff))
		return (-1);

#ifdef LEGACY_SUPPORT
	/*
	 * Emulate PCIR_CAP_PTR if this device does not support MSI capability
	 * natively.
	 */
	if (sc->psc_msi.emulated) {
		if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
			return (-1);
	}
#endif

	/*
	 * Emulate the command register.  If a single read reads both the
	 * command and status registers, read the status register from the
	 * device's config space.
	 */
	if (coff == PCIR_COMMAND) {
		if (bytes <= 2)
			return (-1);
		*rv = read_config(sc, PCIR_STATUS, 2) << 16 |
		    pci_get_cfgdata16(pi, PCIR_COMMAND);
		return (0);
	}

	/* Everything else just read from the device's config space */
	*rv = read_config(sc, coff, bytes);

	return (0);
}

static int
passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
    int coff, int bytes, uint32_t val)
{
	int error, msix_table_entries, i;
	struct passthru_softc *sc;
	uint16_t cmd_old;

	sc = pi->pi_arg;

	/*
	 * PCI BARs are emulated
	 */
	if (bar_access(coff))
		return (-1);

	/*
	 * MSI capability is emulated
	 */
	if (msicap_access(sc, coff)) {
		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msi.capoff,
		    PCIY_MSI);
		error = vm_setup_pptdev_msi(ctx, vcpu, sc->pptfd,
		    pi->pi_msi.addr, pi->pi_msi.msg_data, pi->pi_msi.maxmsgnum);
		if (error != 0)
			err(1, "vm_setup_pptdev_msi");
		return (0);
	}

	if (msixcap_access(sc, coff)) {
		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msix.capoff,
		    PCIY_MSIX);
		if (pi->pi_msix.enabled) {
			msix_table_entries = pi->pi_msix.table_count;
			for (i = 0; i < msix_table_entries; i++) {
				error = vm_setup_pptdev_msix(ctx, vcpu,
				    sc->pptfd, i,
				    pi->pi_msix.table[i].addr,
				    pi->pi_msix.table[i].msg_data,
				    pi->pi_msix.table[i].vector_control);

				if (error)
					err(1, "vm_setup_pptdev_msix");
			}
		} else {
			error = vm_disable_pptdev_msix(ctx, sc->pptfd);
			if (error)
				err(1, "vm_disable_pptdev_msix");
		}
		return (0);
	}

#ifdef LEGACY_SUPPORT
	/*
	 * If this device does not support MSI natively then we cannot let
	 * the guest disable legacy interrupts from the device. It is the
	 * legacy interrupt that is triggering the virtual MSI to the guest.
	 */
	if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
		if (coff == PCIR_COMMAND && bytes == 2)
			val &= ~PCIM_CMD_INTxDIS;
	}
#endif

	write_config(sc, coff, bytes, val);
	if (coff == PCIR_COMMAND) {
		cmd_old = pci_get_cfgdata16(pi, PCIR_COMMAND);
		if (bytes == 1)
			pci_set_cfgdata8(pi, PCIR_COMMAND, val);
		else if (bytes == 2)
			pci_set_cfgdata16(pi, PCIR_COMMAND, val);
		pci_emul_cmd_changed(pi, cmd_old);
	}

	return (0);
}

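/*
 * BAR access handlers.  Writes and reads of the MSI-X table BAR are
 * intercepted and serviced from the emulated table, while I/O-port BARs are
 * forwarded to the physical device via the PPT_BAR_WRITE/PPT_BAR_READ
 * ioctls.  Other MMIO BARs are mapped straight through and never trap here.
 */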
static void
passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size, uint64_t value)
{
	struct passthru_softc *sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi)) {
		passthru_msix_table_write(ctx, vcpu, sc, offset, size, value);
	} else {
		struct ppt_bar_io pbi;

		assert(pi->pi_bar[baridx].type == PCIBAR_IO);

		pbi.pbi_bar = baridx;
		pbi.pbi_width = size;
		pbi.pbi_off = offset;
		pbi.pbi_data = value;
		(void) ioctl(sc->pptfd, PPT_BAR_WRITE, &pbi);
	}
}

static uint64_t
passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
	struct passthru_softc *sc = pi->pi_arg;
	uint64_t val;

	if (baridx == pci_msix_table_bar(pi)) {
		val = passthru_msix_table_read(sc, offset, size);
	} else {
		struct ppt_bar_io pbi;

		assert(pi->pi_bar[baridx].type == PCIBAR_IO);

		pbi.pbi_bar = baridx;
		pbi.pbi_width = size;
		pbi.pbi_off = offset;
		if (ioctl(sc->pptfd, PPT_BAR_READ, &pbi) == 0) {
			val = pbi.pbi_data;
		} else {
			val = 0;
		}
	}

	return (val);
}

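/*
 * When the guest programs the MSI-X table BAR, map the portions of the BAR
 * that lie outside the (emulated) table pages directly into the guest so
 * that only table and PBA accesses trap to userspace.
 */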
static void
passthru_msix_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
    int enabled, uint64_t address)
{
	struct passthru_softc *sc;
	size_t remaining;
	uint32_t table_size, table_offset;

	sc = pi->pi_arg;
	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);
	if (table_offset > 0) {
		if (!enabled) {
			if (vm_unmap_pptdev_mmio(ctx, sc->pptfd, address,
			    table_offset) != 0)
				warnx("pci_passthru: unmap_pptdev_mmio failed");
		} else {
			if (vm_map_pptdev_mmio(ctx, sc->pptfd, address,
			    table_offset, sc->psc_bar[baridx].addr) != 0)
				warnx("pci_passthru: map_pptdev_mmio failed");
		}
	}
	table_size = pi->pi_msix.table_offset - table_offset;
	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
	table_size = roundup2(table_size, 4096);
	remaining = pi->pi_bar[baridx].size - table_offset - table_size;
	if (remaining > 0) {
		address += table_offset + table_size;
		if (!enabled) {
			if (vm_unmap_pptdev_mmio(ctx, sc->pptfd, address,
			    remaining) != 0)
				warnx("pci_passthru: unmap_pptdev_mmio failed");
		} else {
			if (vm_map_pptdev_mmio(ctx, sc->pptfd, address,
			    remaining, sc->psc_bar[baridx].addr +
			    table_offset + table_size) != 0)
				warnx("pci_passthru: map_pptdev_mmio failed");
		}
	}
}

static void
passthru_mmio_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
    int enabled, uint64_t address)
{
	struct passthru_softc *sc;

	sc = pi->pi_arg;
	if (!enabled) {
		if (vm_unmap_pptdev_mmio(ctx, sc->pptfd, address,
		    sc->psc_bar[baridx].size) != 0)
			warnx("pci_passthru: unmap_pptdev_mmio failed");
	} else {
		if (vm_map_pptdev_mmio(ctx, sc->pptfd, address,
		    sc->psc_bar[baridx].size, sc->psc_bar[baridx].addr) != 0)
			warnx("pci_passthru: map_pptdev_mmio failed");
	}
}

static void
passthru_addr(struct vmctx *ctx, struct pci_devinst *pi, int baridx,
    int enabled, uint64_t address)
{

	if (pi->pi_bar[baridx].type == PCIBAR_IO)
		return;
	if (baridx == pci_msix_table_bar(pi))
		passthru_msix_addr(ctx, pi, baridx, enabled, address);
	else
		passthru_mmio_addr(ctx, pi, baridx, enabled, address);
}

struct pci_devemu passthru = {
	.pe_emu		= "passthru",
	.pe_init	= passthru_init,
	.pe_legacy_config = passthru_legacy_config,
	.pe_cfgwrite	= passthru_cfgwrite,
	.pe_cfgread	= passthru_cfgread,
	.pe_barwrite	= passthru_write,
	.pe_barread	= passthru_read,
	.pe_baraddr	= passthru_addr,
};
PCI_EMUL_SET(passthru);