xref: /illumos-gate/usr/src/cmd/bhyve/pci_passthru.c (revision fdb2a7e9480266dfaa0b5aaa0e1237456552f332)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include <sys/param.h>
35 #ifndef WITHOUT_CAPSICUM
36 #include <sys/capsicum.h>
37 #endif
38 #include <sys/types.h>
39 #include <sys/mman.h>
40 #include <sys/pciio.h>
41 #include <sys/ioctl.h>
42 
43 #include <sys/pci.h>
44 
45 #include <dev/io/iodev.h>
46 #include <dev/pci/pcireg.h>
47 
48 #include <machine/iodev.h>
49 
50 #ifndef WITHOUT_CAPSICUM
51 #include <capsicum_helpers.h>
52 #endif
53 #include <stdio.h>
54 #include <stdlib.h>
55 #include <string.h>
56 #include <err.h>
57 #include <errno.h>
58 #include <fcntl.h>
59 #include <sysexits.h>
60 #include <unistd.h>
61 
62 #include <machine/vmm.h>
63 #include <vmmapi.h>
64 #include <sys/ppt_dev.h>
65 #include "pci_emul.h"
66 #include "mem.h"
67 
/*
 * When defined, a device that exposes a capability list but no MSI
 * capability gets an MSI capability crafted for it in the emulated
 * config space (see the LEGACY_SUPPORT sections below).
 */
#define	LEGACY_SUPPORT	1

/*
 * Number of MSI-X table entries encoded in a message control value: the
 * table-size field holds the count minus one.
 */
#define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1)
/* Number of config-space bytes treated as the MSI-X capability. */
#define MSIX_CAPLEN 12
72 
/*
 * Per-device state for one passed-through PCI function.
 */
struct passthru_softc {
	struct pci_devinst *psc_pi;	/* emulated device this state belongs to */
	struct pcibar psc_bar[PCI_BARMAX + 1];	/* cached info on the "real" BARs */
	struct {
		int		capoff;		/* config offset of MSI cap, 0 if none */
		int		msgctrl;	/* cached MSI message control value */
		int		emulated;	/* 1 if the MSI cap was crafted by us */
	} psc_msi;
	struct {
		int		capoff;		/* config offset of MSI-X cap, 0 if none */
	} psc_msix;
	int pptfd;		/* descriptor used for all ppt ioctls */
	int msi_limit;		/* MSI vector limit; -1 disables clamping */
	int msix_limit;		/* MSI-X vector limit; -1 disables clamping */
};
88 
89 static int
90 msi_caplen(int msgctrl)
91 {
92 	int len;
93 
94 	len = 10;		/* minimum length of msi capability */
95 
96 	if (msgctrl & PCIM_MSICTRL_64BIT)
97 		len += 4;
98 
99 #if 0
100 	/*
101 	 * Ignore the 'mask' and 'pending' bits in the MSI capability.
102 	 * We'll let the guest manipulate them directly.
103 	 */
104 	if (msgctrl & PCIM_MSICTRL_VECTOR)
105 		len += 10;
106 #endif
107 
108 	return (len);
109 }
110 
111 static uint32_t
112 read_config(const struct passthru_softc *sc, long reg, int width)
113 {
114 	struct ppt_cfg_io pi;
115 
116 	pi.pci_off = reg;
117 	pi.pci_width = width;
118 
119 	if (ioctl(sc->pptfd, PPT_CFG_READ, &pi) != 0) {
120 		return (0);
121 	}
122 	return (pi.pci_data);
123 }
124 
125 static void
126 write_config(const struct passthru_softc *sc, long reg, int width,
127     uint32_t data)
128 {
129 	struct ppt_cfg_io pi;
130 
131 	pi.pci_off = reg;
132 	pi.pci_width = width;
133 	pi.pci_data = data;
134 
135 	(void) ioctl(sc->pptfd, PPT_CFG_WRITE, &pi);
136 }
137 
138 static int
139 passthru_get_bar(struct passthru_softc *sc, int bar, enum pcibar_type *type,
140     uint64_t *base, uint64_t *size)
141 {
142 	struct ppt_bar_query pb;
143 
144 	pb.pbq_baridx = bar;
145 
146 	if (ioctl(sc->pptfd, PPT_BAR_QUERY, &pb) != 0) {
147 		return (-1);
148 	}
149 
150 	switch (pb.pbq_type) {
151 	case PCI_ADDR_IO:
152 		*type = PCIBAR_IO;
153 		break;
154 	case PCI_ADDR_MEM32:
155 		*type = PCIBAR_MEM32;
156 		break;
157 	case PCI_ADDR_MEM64:
158 		*type = PCIBAR_MEM64;
159 		break;
160 	default:
161 		err(1, "unrecognized BAR type: %u\n", pb.pbq_type);
162 		break;
163 	}
164 
165 	*base = pb.pbq_base;
166 	*size = pb.pbq_size;
167 	return (0);
168 }
169 
/*
 * Open the ppt device node at 'path' for reading and writing.
 *
 * Returns 0 and stores the descriptor through 'pptfdp' on success.  On
 * failure returns the errno from open(2) and leaves '*pptfdp' untouched.
 */
static int
passthru_dev_open(const char *path, int *pptfdp)
{
	const int fd = open(path, O_RDWR);

	if (fd < 0)
		return (errno);

	/* XXX: verify fd with ioctl? */
	*pptfdp = fd;
	return (0);
}
183 
#ifdef LEGACY_SUPPORT
/*
 * Build an MSI capability with pci_populate_msicap() and copy it, byte by
 * byte, into the emulated config space of 'pi'.
 *
 * XXX
 * The capability is placed at the very end of the 256-byte config space
 * (its size rounded up to a multiple of 4).  This is wrong because it
 * could shadow something useful to the device.
 *
 * Returns the config-space offset at which the capability was placed.
 */
static int
passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr)
{
	struct msicap cap;
	const u_char *bytes = (const u_char *)&cap;
	int base, idx;

	pci_populate_msicap(&cap, msgnum, nextptr);

	base = 256 - roundup(sizeof(cap), 4);
	for (idx = 0; idx < sizeof(cap); idx++)
		pci_set_cfgdata8(pi, base + idx, bytes[idx]);

	return (base);
}
#endif	/* LEGACY_SUPPORT */
208 
209 static void
210 passthru_intr_limit(struct passthru_softc *sc, struct msixcap *msixcap)
211 {
212 	struct pci_devinst *pi = sc->psc_pi;
213 	int off;
214 
215 	/* Reduce the number of MSI vectors if higher than OS limit */
216 	if ((off = sc->psc_msi.capoff) != 0 && sc->msi_limit != -1) {
217 		int msi_limit, mmc;
218 
219 		msi_limit =
220 		    sc->msi_limit > 16 ? PCIM_MSICTRL_MMC_32 :
221 		    sc->msi_limit > 8 ? PCIM_MSICTRL_MMC_16 :
222 		    sc->msi_limit > 4 ? PCIM_MSICTRL_MMC_8 :
223 		    sc->msi_limit > 2 ? PCIM_MSICTRL_MMC_4 :
224 		    sc->msi_limit > 1 ? PCIM_MSICTRL_MMC_2 :
225 		    PCIM_MSICTRL_MMC_1;
226 		mmc = sc->psc_msi.msgctrl & PCIM_MSICTRL_MMC_MASK;
227 
228 		if (mmc > msi_limit) {
229 			sc->psc_msi.msgctrl &= ~PCIM_MSICTRL_MMC_MASK;
230 			sc->psc_msi.msgctrl |= msi_limit;
231 			pci_set_cfgdata16(pi, off + 2, sc->psc_msi.msgctrl);
232 		}
233 	}
234 
235 	/* Reduce the number of MSI-X vectors if higher than OS limit */
236 	if ((off = sc->psc_msix.capoff) != 0 && sc->msix_limit != -1) {
237 		if (MSIX_TABLE_COUNT(msixcap->msgctrl) > sc->msix_limit) {
238 			msixcap->msgctrl &= ~PCIM_MSIXCTRL_TABLE_SIZE;
239 			msixcap->msgctrl |= sc->msix_limit - 1;
240 			pci_set_cfgdata16(pi, off + 2, msixcap->msgctrl);
241 		}
242 	}
243 }
244 
245 static int
246 cfginitmsi(struct passthru_softc *sc)
247 {
248 	int i, ptr, capptr, cap, sts, caplen, table_size;
249 	uint32_t u32;
250 	struct pci_devinst *pi = sc->psc_pi;
251 	struct msixcap msixcap;
252 	uint32_t *msixcap_ptr;
253 
254 	/*
255 	 * Parse the capabilities and cache the location of the MSI
256 	 * and MSI-X capabilities.
257 	 */
258 	sts = read_config(sc, PCIR_STATUS, 2);
259 	if (sts & PCIM_STATUS_CAPPRESENT) {
260 		ptr = read_config(sc, PCIR_CAP_PTR, 1);
261 		while (ptr != 0 && ptr != 0xff) {
262 			cap = read_config(sc, ptr + PCICAP_ID, 1);
263 			if (cap == PCIY_MSI) {
264 				/*
265 				 * Copy the MSI capability into the config
266 				 * space of the emulated pci device
267 				 */
268 				sc->psc_msi.capoff = ptr;
269 				sc->psc_msi.msgctrl = read_config(sc,
270 				    ptr + 2, 2);
271 				sc->psc_msi.emulated = 0;
272 				caplen = msi_caplen(sc->psc_msi.msgctrl);
273 				capptr = ptr;
274 				while (caplen > 0) {
275 					u32 = read_config(sc, capptr, 4);
276 					pci_set_cfgdata32(pi, capptr, u32);
277 					caplen -= 4;
278 					capptr += 4;
279 				}
280 			} else if (cap == PCIY_MSIX) {
281 				/*
282 				 * Copy the MSI-X capability
283 				 */
284 				sc->psc_msix.capoff = ptr;
285 				caplen = 12;
286 				msixcap_ptr = (uint32_t*) &msixcap;
287 				capptr = ptr;
288 				while (caplen > 0) {
289 					u32 = read_config(sc, capptr, 4);
290 					*msixcap_ptr = u32;
291 					pci_set_cfgdata32(pi, capptr, u32);
292 					caplen -= 4;
293 					capptr += 4;
294 					msixcap_ptr++;
295 				}
296 			}
297 			ptr = read_config(sc, ptr + PCICAP_NEXTPTR, 1);
298 		}
299 	}
300 
301 	passthru_intr_limit(sc, &msixcap);
302 
303 	if (sc->psc_msix.capoff != 0) {
304 		pi->pi_msix.pba_bar =
305 		    msixcap.pba_info & PCIM_MSIX_BIR_MASK;
306 		pi->pi_msix.pba_offset =
307 		    msixcap.pba_info & ~PCIM_MSIX_BIR_MASK;
308 		pi->pi_msix.table_bar =
309 		    msixcap.table_info & PCIM_MSIX_BIR_MASK;
310 		pi->pi_msix.table_offset =
311 		    msixcap.table_info & ~PCIM_MSIX_BIR_MASK;
312 		pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl);
313 		pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count);
314 
315 		/* Allocate the emulated MSI-X table array */
316 		table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
317 		pi->pi_msix.table = calloc(1, table_size);
318 
319 		/* Mask all table entries */
320 		for (i = 0; i < pi->pi_msix.table_count; i++) {
321 			pi->pi_msix.table[i].vector_control |=
322 						PCIM_MSIX_VCTRL_MASK;
323 		}
324 	}
325 
326 #ifdef LEGACY_SUPPORT
327 	/*
328 	 * If the passthrough device does not support MSI then craft a
329 	 * MSI capability for it. We link the new MSI capability at the
330 	 * head of the list of capabilities.
331 	 */
332 	if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) {
333 		int origptr, msiptr;
334 		origptr = read_config(sc, PCIR_CAP_PTR, 1);
335 		msiptr = passthru_add_msicap(pi, 1, origptr);
336 		sc->psc_msi.capoff = msiptr;
337 		sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2);
338 		sc->psc_msi.emulated = 1;
339 		pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr);
340 	}
341 #endif
342 
343 	/* Make sure one of the capabilities is present */
344 	if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0) {
345 		return (-1);
346 	} else {
347 		return (0);
348 	}
349 }
350 
351 static uint64_t
352 passthru_msix_table_read(struct passthru_softc *sc, uint64_t offset, int size)
353 {
354 	struct pci_devinst *pi;
355 	struct msix_table_entry *entry;
356 	uint8_t *src8;
357 	uint16_t *src16;
358 	uint32_t *src32;
359 	uint64_t *src64;
360 	uint64_t data;
361 	size_t entry_offset;
362 	int index;
363 
364 	pi = sc->psc_pi;
365 	if (offset >= pi->pi_msix.pba_offset &&
366 	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
367 		switch(size) {
368 		case 1:
369 			src8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
370 			    pi->pi_msix.pba_page_offset);
371 			data = *src8;
372 			break;
373 		case 2:
374 			src16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
375 			    pi->pi_msix.pba_page_offset);
376 			data = *src16;
377 			break;
378 		case 4:
379 			src32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
380 			    pi->pi_msix.pba_page_offset);
381 			data = *src32;
382 			break;
383 		case 8:
384 			src64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
385 			    pi->pi_msix.pba_page_offset);
386 			data = *src64;
387 			break;
388 		default:
389 			return (-1);
390 		}
391 		return (data);
392 	}
393 
394 	if (offset < pi->pi_msix.table_offset)
395 		return (-1);
396 
397 	offset -= pi->pi_msix.table_offset;
398 	index = offset / MSIX_TABLE_ENTRY_SIZE;
399 	if (index >= pi->pi_msix.table_count)
400 		return (-1);
401 
402 	entry = &pi->pi_msix.table[index];
403 	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
404 
405 	switch(size) {
406 	case 1:
407 		src8 = (uint8_t *)((void *)entry + entry_offset);
408 		data = *src8;
409 		break;
410 	case 2:
411 		src16 = (uint16_t *)((void *)entry + entry_offset);
412 		data = *src16;
413 		break;
414 	case 4:
415 		src32 = (uint32_t *)((void *)entry + entry_offset);
416 		data = *src32;
417 		break;
418 	case 8:
419 		src64 = (uint64_t *)((void *)entry + entry_offset);
420 		data = *src64;
421 		break;
422 	default:
423 		return (-1);
424 	}
425 
426 	return (data);
427 }
428 
429 static void
430 passthru_msix_table_write(struct vmctx *ctx, int vcpu,
431     struct passthru_softc *sc, uint64_t offset, int size, uint64_t data)
432 {
433 	struct pci_devinst *pi;
434 	struct msix_table_entry *entry;
435 	uint8_t *dest8;
436 	uint16_t *dest16;
437 	uint32_t *dest32;
438 	uint64_t *dest64;
439 	size_t entry_offset;
440 	uint32_t vector_control;
441 	int index;
442 
443 	pi = sc->psc_pi;
444 	if (offset >= pi->pi_msix.pba_offset &&
445 	    offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
446 		switch(size) {
447 		case 1:
448 			dest8 = (uint8_t *)(pi->pi_msix.pba_page + offset -
449 			    pi->pi_msix.pba_page_offset);
450 			*dest8 = data;
451 			break;
452 		case 2:
453 			dest16 = (uint16_t *)(pi->pi_msix.pba_page + offset -
454 			    pi->pi_msix.pba_page_offset);
455 			*dest16 = data;
456 			break;
457 		case 4:
458 			dest32 = (uint32_t *)(pi->pi_msix.pba_page + offset -
459 			    pi->pi_msix.pba_page_offset);
460 			*dest32 = data;
461 			break;
462 		case 8:
463 			dest64 = (uint64_t *)(pi->pi_msix.pba_page + offset -
464 			    pi->pi_msix.pba_page_offset);
465 			*dest64 = data;
466 			break;
467 		default:
468 			break;
469 		}
470 		return;
471 	}
472 
473 	if (offset < pi->pi_msix.table_offset)
474 		return;
475 
476 	offset -= pi->pi_msix.table_offset;
477 	index = offset / MSIX_TABLE_ENTRY_SIZE;
478 	if (index >= pi->pi_msix.table_count)
479 		return;
480 
481 	entry = &pi->pi_msix.table[index];
482 	entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
483 
484 	/* Only 4 byte naturally-aligned writes are supported */
485 	assert(size == 4);
486 	assert(entry_offset % 4 == 0);
487 
488 	vector_control = entry->vector_control;
489 	dest32 = (uint32_t *)((void *)entry + entry_offset);
490 	*dest32 = data;
491 	/* If MSI-X hasn't been enabled, do nothing */
492 	if (pi->pi_msix.enabled) {
493 		/* If the entry is masked, don't set it up */
494 		if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 ||
495 		    (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
496 			(void) vm_setup_pptdev_msix(ctx, vcpu, sc->pptfd,
497 			    index, entry->addr, entry->msg_data,
498 			    entry->vector_control);
499 		}
500 	}
501 }
502 
503 static int
504 init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base)
505 {
506 	int error, idx;
507 	size_t len, remaining;
508 	uint32_t table_size, table_offset;
509 	uint32_t pba_size, pba_offset;
510 	vm_paddr_t start;
511 	struct pci_devinst *pi = sc->psc_pi;
512 
513 	assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0);
514 
515 	/*
516 	 * If the MSI-X table BAR maps memory intended for
517 	 * other uses, it is at least assured that the table
518 	 * either resides in its own page within the region,
519 	 * or it resides in a page shared with only the PBA.
520 	 */
521 	table_offset = rounddown2(pi->pi_msix.table_offset, 4096);
522 
523 	table_size = pi->pi_msix.table_offset - table_offset;
524 	table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE;
525 	table_size = roundup2(table_size, 4096);
526 
527 	idx = pi->pi_msix.table_bar;
528 	start = pi->pi_bar[idx].addr;
529 	remaining = pi->pi_bar[idx].size;
530 
531 	if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) {
532 		pba_offset = pi->pi_msix.pba_offset;
533 		pba_size = pi->pi_msix.pba_size;
534 		if (pba_offset >= table_offset + table_size ||
535 		    table_offset >= pba_offset + pba_size) {
536 			/*
537 			 * If the PBA does not share a page with the MSI-x
538 			 * tables, no PBA emulation is required.
539 			 */
540 			pi->pi_msix.pba_page = NULL;
541 			pi->pi_msix.pba_page_offset = 0;
542 		} else {
543 			/*
544 			 * The PBA overlaps with either the first or last
545 			 * page of the MSI-X table region.  Map the
546 			 * appropriate page.
547 			 */
548 			if (pba_offset <= table_offset)
549 				pi->pi_msix.pba_page_offset = table_offset;
550 			else
551 				pi->pi_msix.pba_page_offset = table_offset +
552 				    table_size - 4096;
553 			pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ |
554 			    PROT_WRITE, MAP_SHARED, sc->pptfd,
555 			    pi->pi_msix.pba_page_offset);
556 			if (pi->pi_msix.pba_page == MAP_FAILED) {
557 				warn("Failed to map PBA page for MSI-X on %d",
558 				    sc->pptfd);
559 				return (-1);
560 			}
561 		}
562 	}
563 
564 	/* Map everything before the MSI-X table */
565 	if (table_offset > 0) {
566 		len = table_offset;
567 		error = vm_map_pptdev_mmio(ctx, sc->pptfd, start, len, base);
568 		if (error)
569 			return (error);
570 
571 		base += len;
572 		start += len;
573 		remaining -= len;
574 	}
575 
576 	/* Skip the MSI-X table */
577 	base += table_size;
578 	start += table_size;
579 	remaining -= table_size;
580 
581 	/* Map everything beyond the end of the MSI-X table */
582 	if (remaining > 0) {
583 		len = remaining;
584 		error = vm_map_pptdev_mmio(ctx, sc->pptfd, start, len, base);
585 		if (error)
586 			return (error);
587 	}
588 
589 	return (0);
590 }
591 
592 static int
593 cfginitbar(struct vmctx *ctx, struct passthru_softc *sc)
594 {
595 	struct pci_devinst *pi = sc->psc_pi;
596 	uint_t i;
597 
598 	/*
599 	 * Initialize BAR registers
600 	 */
601 	for (i = 0; i <= PCI_BARMAX; i++) {
602 		enum pcibar_type bartype;
603 		uint64_t base, size;
604 		int error;
605 
606 		if (passthru_get_bar(sc, i, &bartype, &base, &size) != 0) {
607 			continue;
608 		}
609 
610 		if (bartype != PCIBAR_IO) {
611 			if (((base | size) & PAGE_MASK) != 0) {
612 				warnx("passthru device %d BAR %d: "
613 				    "base %#lx or size %#lx not page aligned\n",
614 				    sc->pptfd, i, base, size);
615 				return (-1);
616 			}
617 		}
618 
619 		/* Cache information about the "real" BAR */
620 		sc->psc_bar[i].type = bartype;
621 		sc->psc_bar[i].size = size;
622 		sc->psc_bar[i].addr = base;
623 
624 		/* Allocate the BAR in the guest I/O or MMIO space */
625 		error = pci_emul_alloc_pbar(pi, i, base, bartype, size);
626 		if (error)
627 			return (-1);
628 
629 		/* The MSI-X table needs special handling */
630 		if (i == pci_msix_table_bar(pi)) {
631 			error = init_msix_table(ctx, sc, base);
632 			if (error)
633 				return (-1);
634 		} else if (bartype != PCIBAR_IO) {
635 			/* Map the physical BAR in the guest MMIO space */
636 			error = vm_map_pptdev_mmio(ctx, sc->pptfd,
637 			    pi->pi_bar[i].addr, pi->pi_bar[i].size, base);
638 			if (error)
639 				return (-1);
640 		}
641 
642 		/*
643 		 * 64-bit BAR takes up two slots so skip the next one.
644 		 */
645 		if (bartype == PCIBAR_MEM64) {
646 			i++;
647 			assert(i <= PCI_BARMAX);
648 			sc->psc_bar[i].type = PCIBAR_MEMHI64;
649 		}
650 	}
651 	return (0);
652 }
653 
654 static int
655 cfginit(struct vmctx *ctx, struct passthru_softc *sc)
656 {
657 	struct pci_devinst *pi = sc->psc_pi;
658 
659 	if (cfginitmsi(sc) != 0) {
660 		warnx("failed to initialize MSI for PCI %d", sc->pptfd);
661 		return (-1);
662 	}
663 
664 	if (cfginitbar(ctx, sc) != 0) {
665 		warnx("failed to initialize BARs for PCI %d", sc->pptfd);
666 		return (-1);
667 	}
668 
669 	pci_set_cfgdata16(pi, PCIR_COMMAND, read_config(sc, PCIR_COMMAND, 2));
670 
671 	return (0);
672 }
673 
674 static int
675 passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
676 {
677 	int error, memflags, pptfd;
678 	struct passthru_softc *sc;
679 
680 	sc = NULL;
681 	error = 1;
682 
683 	memflags = vm_get_memflags(ctx);
684 	if (!(memflags & VM_MEM_F_WIRED)) {
685 		warnx("passthru requires guest memory to be wired");
686 		goto done;
687 	}
688 
689 	if (opts == NULL || passthru_dev_open(opts, &pptfd) != 0) {
690 		warnx("invalid passthru options");
691 		goto done;
692 	}
693 
694 	if (vm_assign_pptdev(ctx, pptfd) != 0) {
695 		warnx("PCI device at %d is not using the ppt driver", pptfd);
696 		goto done;
697 	}
698 
699 	sc = calloc(1, sizeof(struct passthru_softc));
700 
701 	pi->pi_arg = sc;
702 	sc->psc_pi = pi;
703 	sc->pptfd = pptfd;
704 
705 	if ((error = vm_get_pptdev_limits(ctx, pptfd, &sc->msi_limit,
706 	    &sc->msix_limit)) != 0)
707 		goto done;
708 
709 	/* initialize config space */
710 	if ((error = cfginit(ctx, sc)) != 0)
711 		goto done;
712 
713 	error = 0;		/* success */
714 done:
715 	if (error) {
716 		free(sc);
717 		vm_unassign_pptdev(ctx, pptfd);
718 	}
719 	return (error);
720 }
721 
722 static int
723 bar_access(int coff)
724 {
725 	if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1))
726 		return (1);
727 	else
728 		return (0);
729 }
730 
731 static int
732 msicap_access(struct passthru_softc *sc, int coff)
733 {
734 	int caplen;
735 
736 	if (sc->psc_msi.capoff == 0)
737 		return (0);
738 
739 	caplen = msi_caplen(sc->psc_msi.msgctrl);
740 
741 	if (coff >= sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen)
742 		return (1);
743 	else
744 		return (0);
745 }
746 
747 static int
748 msixcap_access(struct passthru_softc *sc, int coff)
749 {
750 	if (sc->psc_msix.capoff == 0)
751 		return (0);
752 
753 	return (coff >= sc->psc_msix.capoff &&
754 	        coff < sc->psc_msix.capoff + MSIX_CAPLEN);
755 }
756 
757 static int
758 passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
759     int coff, int bytes, uint32_t *rv)
760 {
761 	struct passthru_softc *sc;
762 
763 	sc = pi->pi_arg;
764 
765 	/*
766 	 * PCI BARs and MSI capability is emulated.
767 	 */
768 	if (bar_access(coff) || msicap_access(sc, coff))
769 		return (-1);
770 
771 	/*
772 	 * MSI-X is also emulated since a limit on interrupts may be imposed by
773 	 * the OS, altering the perceived register state.
774 	 */
775 	if (msixcap_access(sc, coff))
776 		return (-1);
777 
778 #ifdef LEGACY_SUPPORT
779 	/*
780 	 * Emulate PCIR_CAP_PTR if this device does not support MSI capability
781 	 * natively.
782 	 */
783 	if (sc->psc_msi.emulated) {
784 		if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4)
785 			return (-1);
786 	}
787 #endif
788 
789 	/*
790 	 * Emulate the command register.  If a single read reads both the
791 	 * command and status registers, read the status register from the
792 	 * device's config space.
793 	 */
794 	if (coff == PCIR_COMMAND) {
795 		if (bytes <= 2)
796 			return (-1);
797 		*rv = pci_get_cfgdata16(pi, PCIR_COMMAND) << 16 |
798 		    read_config(sc, PCIR_STATUS, 2);
799 		return (0);
800 	}
801 
802 	/* Everything else just read from the device's config space */
803 	*rv = read_config(sc, coff, bytes);
804 
805 	return (0);
806 }
807 
808 static int
809 passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
810     int coff, int bytes, uint32_t val)
811 {
812 	int error, msix_table_entries, i;
813 	struct passthru_softc *sc;
814 	uint16_t cmd_old;
815 
816 	sc = pi->pi_arg;
817 
818 	/*
819 	 * PCI BARs are emulated
820 	 */
821 	if (bar_access(coff))
822 		return (-1);
823 
824 	/*
825 	 * MSI capability is emulated
826 	 */
827 	if (msicap_access(sc, coff)) {
828 		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msi.capoff,
829 		    PCIY_MSI);
830 		error = vm_setup_pptdev_msi(ctx, vcpu, sc->pptfd,
831 		    pi->pi_msi.addr, pi->pi_msi.msg_data, pi->pi_msi.maxmsgnum);
832 		if (error != 0)
833 			err(1, "vm_setup_pptdev_msi");
834 		return (0);
835 	}
836 
837 	if (msixcap_access(sc, coff)) {
838 		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msix.capoff,
839 		    PCIY_MSIX);
840 		if (pi->pi_msix.enabled) {
841 			msix_table_entries = pi->pi_msix.table_count;
842 			for (i = 0; i < msix_table_entries; i++) {
843 				error = vm_setup_pptdev_msix(ctx, vcpu,
844 				    sc->pptfd, i,
845 				    pi->pi_msix.table[i].addr,
846 				    pi->pi_msix.table[i].msg_data,
847 				    pi->pi_msix.table[i].vector_control);
848 
849 				if (error)
850 					err(1, "vm_setup_pptdev_msix");
851 			}
852 		}
853 		return (0);
854 	}
855 
856 #ifdef LEGACY_SUPPORT
857 	/*
858 	 * If this device does not support MSI natively then we cannot let
859 	 * the guest disable legacy interrupts from the device. It is the
860 	 * legacy interrupt that is triggering the virtual MSI to the guest.
861 	 */
862 	if (sc->psc_msi.emulated && pci_msi_enabled(pi)) {
863 		if (coff == PCIR_COMMAND && bytes == 2)
864 			val &= ~PCIM_CMD_INTxDIS;
865 	}
866 #endif
867 
868 	write_config(sc, coff, bytes, val);
869 	if (coff == PCIR_COMMAND) {
870 		cmd_old = pci_get_cfgdata16(pi, PCIR_COMMAND);
871 		if (bytes == 1)
872 			pci_set_cfgdata8(pi, PCIR_COMMAND, val);
873 		else if (bytes == 2)
874 			pci_set_cfgdata16(pi, PCIR_COMMAND, val);
875 		pci_emul_cmd_changed(pi, cmd_old);
876 	}
877 
878 	return (0);
879 }
880 
881 static void
882 passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
883     uint64_t offset, int size, uint64_t value)
884 {
885 	struct passthru_softc *sc = pi->pi_arg;
886 
887 	if (baridx == pci_msix_table_bar(pi)) {
888 		passthru_msix_table_write(ctx, vcpu, sc, offset, size, value);
889 	} else {
890 		struct ppt_bar_io pbi;
891 
892 		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
893 
894 		pbi.pbi_bar = baridx;
895 		pbi.pbi_width = size;
896 		pbi.pbi_off = offset;
897 		pbi.pbi_data = value;
898 		(void) ioctl(sc->pptfd, PPT_BAR_WRITE, &pbi);
899 	}
900 }
901 
902 static uint64_t
903 passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
904     uint64_t offset, int size)
905 {
906 	struct passthru_softc *sc = pi->pi_arg;
907 	uint64_t val;
908 
909 	if (baridx == pci_msix_table_bar(pi)) {
910 		val = passthru_msix_table_read(sc, offset, size);
911 	} else {
912 		struct ppt_bar_io pbi;
913 
914 		assert(pi->pi_bar[baridx].type == PCIBAR_IO);
915 
916 		pbi.pbi_bar = baridx;
917 		pbi.pbi_width = size;
918 		pbi.pbi_off = offset;
919 		if (ioctl(sc->pptfd, PPT_BAR_READ, &pbi) == 0) {
920 			val = pbi.pbi_data;
921 		} else {
922 			val = 0;
923 		}
924 	}
925 
926 	return (val);
927 }
928 
929 struct pci_devemu passthru = {
930 	.pe_emu		= "passthru",
931 	.pe_init	= passthru_init,
932 	.pe_cfgwrite	= passthru_cfgwrite,
933 	.pe_cfgread	= passthru_cfgread,
934 	.pe_barwrite 	= passthru_write,
935 	.pe_barread    	= passthru_read,
936 };
937 PCI_EMUL_SET(passthru);
938