/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * Copyright 2020 Joyent, Inc.
 * Copyright 2021 Oxide Computer Company
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>

#include <machine/clock.h>
#include <machine/cpufunc.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/vmm.h>
#include <sys/vmm_kernel.h>

#include "vmx.h"
#include "vmx_msr.h"

static bool
vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos)
{

	return ((msr_val & (1UL << (bitpos + 32))) != 0);
}

static bool
vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos)
{

	return ((msr_val & (1UL << bitpos)) == 0);
}

/*
 * Generate a bitmask to be used for the VMCS execution control fields.
 *
 * The caller specifies what bits should be set to one in 'ones_mask'
 * and what bits should be set to zero in 'zeros_mask'. The don't-care
 * bits are set to the default value. The default values are obtained
 * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining
 * VMX Capabilities".
 *
 * Returns zero on success and non-zero on error.
 */
int
vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
    uint32_t zeros_mask, uint32_t *retval)
{
	int i;
	uint64_t val, trueval;
	bool true_ctls_avail, one_allowed, zero_allowed;

	/* We cannot ask the same bit to be set to both '1' and '0' */
	if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask))
		return (EINVAL);

	true_ctls_avail = (rdmsr(MSR_VMX_BASIC) & (1UL << 55)) != 0;

	val = rdmsr(ctl_reg);
	if (true_ctls_avail)
		trueval = rdmsr(true_ctl_reg);		/* step c */
	else
		trueval = val;				/* step a */

	for (i = 0; i < 32; i++) {
		one_allowed = vmx_ctl_allows_one_setting(trueval, i);
		zero_allowed = vmx_ctl_allows_zero_setting(trueval, i);

		KASSERT(one_allowed || zero_allowed,
		    ("invalid zero/one setting for bit %d of ctl 0x%0x, "
		    "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg));

		if (zero_allowed && !one_allowed) {		/* b(i),c(i) */
			if (ones_mask & (1 << i))
				return (EINVAL);
			*retval &= ~(1 << i);
		} else if (one_allowed && !zero_allowed) {	/* b(i),c(i) */
			if (zeros_mask & (1 << i))
				return (EINVAL);
			*retval |= 1 << i;
		} else {
			if (zeros_mask & (1 << i)) {
				/* b(ii),c(ii) */
				*retval &= ~(1 << i);
			} else if (ones_mask & (1 << i)) {
				/* b(ii), c(ii) */
				*retval |= 1 << i;
			} else if (!true_ctls_avail) {
				/* b(iii) */
				*retval &= ~(1 << i);
			} else if (vmx_ctl_allows_zero_setting(val, i)) {
				/* c(iii) */
				*retval &= ~(1 << i);
			} else if (vmx_ctl_allows_one_setting(val, i)) {
				/* c(iv) */
				*retval |= 1 << i;
			} else {
				panic("vmx_set_ctlreg: unable to determine "
				    "correct value of ctl bit %d for msr "
				    "0x%0x and true msr 0x%0x", i, ctl_reg,
				    true_ctl_reg);
			}
		}
	}

	return (0);
}
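/*
 * Illustrative sketch (hypothetical, not part of the original source): a
 * caller computing the pin-based VM-execution controls might use the helper
 * above as follows, requiring external-interrupt and NMI exiting while
 * leaving everything else at the algorithm's defaults.  The MSR_VMX_* and
 * PINBASED_* constants are the ones defined in specialreg.h and
 * vmx_controls.h:
 *
 *	uint32_t pinbased_ctls = 0;
 *	int error;
 *
 *	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
 *	    MSR_VMX_TRUE_PINBASED_CTLS,
 *	    PINBASED_EXTINT_EXITING | PINBASED_NMI_EXITING, 0,
 *	    &pinbased_ctls);
 *	if (error != 0)
 *		return (error);	// host cannot satisfy a required setting
 */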

void
vmx_msr_bitmap_initialize(struct vmx *vmx)
{
	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		uint8_t *bitmap;

		bitmap = kmem_alloc(PAGESIZE, KM_SLEEP);
		VERIFY3U((uintptr_t)bitmap & PAGEOFFSET, ==, 0);
		memset(bitmap, 0xff, PAGESIZE);

		vmx->msr_bitmap[i] = bitmap;
	}
}

void
vmx_msr_bitmap_destroy(struct vmx *vmx)
{
	for (uint_t i = 0; i < VM_MAXCPU; i++) {
		VERIFY3P(vmx->msr_bitmap[i], !=, NULL);
		kmem_free(vmx->msr_bitmap[i], PAGESIZE);
		vmx->msr_bitmap[i] = NULL;
	}
}

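/*
 * Each per-vCPU MSR bitmap is a single 4KB page with the layout the CPU
 * consults for the "MSR bitmaps" VM-execution control (Intel SDM), which
 * the byte/bit arithmetic below indexes:
 *
 *	bytes    0-1023: read bitmap for low MSRs   (0x00000000-0x00001FFF)
 *	bytes 1024-2047: read bitmap for high MSRs  (0xC0000000-0xC0001FFF)
 *	bytes 2048-3071: write bitmap for low MSRs
 *	bytes 3072-4095: write bitmap for high MSRs
 *
 * A 1 bit forces a VM exit for the corresponding access; a 0 bit lets the
 * guest perform it directly (hence the all-1s initialization above).
 */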
void
vmx_msr_bitmap_change_access(struct vmx *vmx, int vcpuid, uint_t msr, int acc)
{
	uint8_t *bitmap = vmx->msr_bitmap[vcpuid];
	int byte, bit;

	if (msr <= 0x00001FFF) {
		byte = msr / 8;
	} else if (msr >= 0xC0000000 && msr <= 0xC0001FFF) {
		byte = 1024 + (msr - 0xC0000000) / 8;
	} else {
		panic("Invalid MSR for bitmap: %x", msr);
	}

	bit = msr & 0x7;

	if (acc & MSR_BITMAP_ACCESS_READ) {
		bitmap[byte] &= ~(1 << bit);
	} else {
		bitmap[byte] |= 1 << bit;
	}

	byte += 2048;
	if (acc & MSR_BITMAP_ACCESS_WRITE) {
		bitmap[byte] &= ~(1 << bit);
	} else {
		bitmap[byte] |= 1 << bit;
	}
}
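/*
 * The guest_msr_rw()/guest_msr_ro() helpers used later in this file are
 * defined elsewhere, presumably as thin wrappers around the function above
 * passing MSR_BITMAP_ACCESS_RW and MSR_BITMAP_ACCESS_READ respectively;
 * e.g. guest_msr_rw(vmx, vcpuid, MSR_EFER) opens MSR_EFER for direct guest
 * read/write access.
 */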

static uint64_t misc_enable;
static uint64_t platform_info;
static uint64_t turbo_ratio_limit;

static bool
nehalem_cpu(void)
{
	uint_t family, model;

	/*
	 * The family:model numbers belonging to the Nehalem microarchitecture
	 * are documented in Section 35.5, Intel SDM dated Feb 2014.
	 */
	family = CPUID_TO_FAMILY(cpu_id);
	model = CPUID_TO_MODEL(cpu_id);
	if (family == 0x6) {
		switch (model) {
		case 0x1A:
		case 0x1E:
		case 0x1F:
		case 0x2E:
			return (true);
		default:
			break;
		}
	}
	return (false);
}

static bool
westmere_cpu(void)
{
	uint_t family, model;

	/*
	 * The family:model numbers belonging to the Westmere microarchitecture
	 * are documented in Section 35.6, Intel SDM dated Feb 2014.
	 */
	family = CPUID_TO_FAMILY(cpu_id);
	model = CPUID_TO_MODEL(cpu_id);
	if (family == 0x6) {
		switch (model) {
		case 0x25:
		case 0x2C:
			return (true);
		default:
			break;
		}
	}
	return (false);
}

static bool
pat_valid(uint64_t val)
{
	int i, pa;

	/*
	 * From Intel SDM: Table "Memory Types That Can Be Encoded With PAT"
	 *
	 * Extract PA0 through PA7 and validate that each one encodes a
	 * valid memory type.
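	 *
	 * The valid encodings are 0 (UC), 1 (WC), 4 (WT), 5 (WP), 6 (WB)
	 * and 7 (UC-); 2, 3, and anything above 7 are reserved, which is
	 * what the check below rejects.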
	 */
	for (i = 0; i < 8; i++) {
		pa = (val >> (i * 8)) & 0xff;
		if (pa == 2 || pa == 3 || pa >= 8)
			return (false);
	}
	return (true);
}

void
vmx_msr_init(void)
{
	uint64_t bus_freq, ratio;
	int i;

	/*
	 * Initialize emulated MSRs
	 */
	misc_enable = rdmsr(MSR_IA32_MISC_ENABLE);
	/*
	 * Set mandatory bits
	 *  11:   branch trace disabled
	 *  12:   PEBS unavailable
	 * Clear unsupported features
	 *  16:   SpeedStep enable
	 *  18:   enable MONITOR FSM
	 */
	misc_enable |= (1 << 12) | (1 << 11);
	misc_enable &= ~((1 << 18) | (1 << 16));

	if (nehalem_cpu() || westmere_cpu())
		bus_freq = 133330000;		/* 133 MHz */
	else
		bus_freq = 100000000;		/* 100 MHz */

	/*
	 * XXXtime
	 * The ratio should really be based on the virtual TSC frequency as
	 * opposed to the host TSC.
	 */
	ratio = (tsc_freq / bus_freq) & 0xff;

	/*
	 * The register definition is based on the micro-architecture
	 * but the following bits are always the same:
	 * [15:8]  Maximum Non-Turbo Ratio
	 * [28]    Programmable Ratio Limit for Turbo Mode
	 * [29]    Programmable TDC-TDP Limit for Turbo Mode
	 * [47:40] Maximum Efficiency Ratio
	 *
	 * The other bits can be safely set to 0 on all
	 * micro-architectures up to Haswell.
	 */
	platform_info = (ratio << 8) | (ratio << 40);

	/*
	 * The number of valid bits in the MSR_TURBO_RATIO_LIMITx register
	 * depends on the maximum number of cores per package supported by
	 * the micro-architecture. For example, Westmere supports 6 cores
	 * per package and uses the low 48 bits, while Sandy Bridge supports
	 * 8 cores per package and uses all 64 bits.
	 *
	 * However, the unused bits are reserved, so we pretend that all
	 * bits in this MSR are valid.
	 */
	for (i = 0; i < 8; i++)
		turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio;
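
	/*
	 * Worked example (hypothetical host): with a 2.4 GHz tsc_freq and
	 * the 100 MHz bus above, ratio = 24 (0x18), so the guest reads
	 * platform_info == 0x0000180000001800 and
	 * turbo_ratio_limit == 0x1818181818181818.
	 */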
}

void
vmx_msr_guest_init(struct vmx *vmx, int vcpuid)
{
	uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];

	/*
	 * It is safe to allow direct access to MSR_GSBASE and
	 * MSR_FSBASE.  The guest FSBASE and GSBASE are saved and
	 * restored during vm-exit and vm-entry respectively. The host
	 * FSBASE and GSBASE are always restored from the vmcs host
	 * state area on vm-exit.
	 *
	 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
	 * how they are saved/restored, so they can be accessed directly
	 * by the guest.
	 *
	 * MSR_EFER is saved and restored in the guest VMCS area on a VM
	 * exit and entry respectively. It is also restored from the
	 * host VMCS area on a VM exit.
	 *
	 * The TSC MSR is exposed read-only. Writes are disallowed as
	 * that will impact the host TSC.  If the guest does a write, the
	 * "use TSC offsetting" execution control is enabled and the
	 * difference between the host TSC and the guest TSC is written
	 * into the TSC offset in the VMCS.
	 */
	guest_msr_rw(vmx, vcpuid, MSR_GSBASE);
	guest_msr_rw(vmx, vcpuid, MSR_FSBASE);
	guest_msr_rw(vmx, vcpuid, MSR_SYSENTER_CS_MSR);
	guest_msr_rw(vmx, vcpuid, MSR_SYSENTER_ESP_MSR);
	guest_msr_rw(vmx, vcpuid, MSR_SYSENTER_EIP_MSR);
	guest_msr_rw(vmx, vcpuid, MSR_EFER);
	guest_msr_ro(vmx, vcpuid, MSR_TSC);

	/*
	 * The guest may have direct access to these MSRs as they are
	 * saved/restored in vmx_msr_guest_enter() and vmx_msr_guest_exit().
	 */
	guest_msr_rw(vmx, vcpuid, MSR_LSTAR);
	guest_msr_rw(vmx, vcpuid, MSR_CSTAR);
	guest_msr_rw(vmx, vcpuid, MSR_STAR);
	guest_msr_rw(vmx, vcpuid, MSR_SF_MASK);
	guest_msr_rw(vmx, vcpuid, MSR_KGSBASE);

	/*
	 * Initialize the guest IA32_PAT MSR with its architectural
	 * power-on default value, 0x0007040600070406.
	 */
	guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) |
	    PAT_VALUE(1, PAT_WRITE_THROUGH)	|
	    PAT_VALUE(2, PAT_UNCACHED)		|
	    PAT_VALUE(3, PAT_UNCACHEABLE)	|
	    PAT_VALUE(4, PAT_WRITE_BACK)	|
	    PAT_VALUE(5, PAT_WRITE_THROUGH)	|
	    PAT_VALUE(6, PAT_UNCACHED)		|
	    PAT_VALUE(7, PAT_UNCACHEABLE);
}

void
vmx_msr_guest_enter(struct vmx *vmx, int vcpuid)
{
	uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
	uint64_t *host_msrs = vmx->host_msrs[vcpuid];

	/* Save host MSRs */
	host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
	host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
	host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
	host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);

	/* Load guest MSRs */
	wrmsr(MSR_LSTAR, guest_msrs[IDX_MSR_LSTAR]);
	wrmsr(MSR_CSTAR, guest_msrs[IDX_MSR_CSTAR]);
	wrmsr(MSR_STAR, guest_msrs[IDX_MSR_STAR]);
	wrmsr(MSR_SF_MASK, guest_msrs[IDX_MSR_SF_MASK]);
	wrmsr(MSR_KGSBASE, guest_msrs[IDX_MSR_KGSBASE]);
}

void
vmx_msr_guest_exit(struct vmx *vmx, int vcpuid)
{
	uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
	uint64_t *host_msrs = vmx->host_msrs[vcpuid];

	/* Save guest MSRs */
	guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
	guest_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
	guest_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
	guest_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
	guest_msrs[IDX_MSR_KGSBASE] = rdmsr(MSR_KGSBASE);

	/* Restore host MSRs */
	wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]);
	wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]);
	wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]);
	wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]);

	/* MSR_KGSBASE will be restored on the way back to userspace */
}

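/*
 * Illustrative sketch (hypothetical, not from the original source): a
 * VM-exit handler consuming these accessors would dispatch on the
 * vm_msr_result_t roughly as follows:
 *
 *	uint64_t val;
 *
 *	switch (vmx_rdmsr(vmx, vcpuid, num, &val)) {
 *	case VMR_OK:
 *		break;		// store val into the guest's %edx:%eax
 *	case VMR_GP:
 *		break;		// inject #GP into the guest
 *	case VMR_UNHANLDED:
 *		break;		// punt the access to userspace
 *	}
 */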
vm_msr_result_t
vmx_rdmsr(struct vmx *vmx, int vcpuid, uint32_t num, uint64_t *val)
{
	const uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];

	switch (num) {
	case MSR_IA32_FEATURE_CONTROL:
		/*
		 * We don't currently support SGX in guests, so
		 * always report those features as disabled with the MSR
		 * locked so the guest won't attempt to write to it.
		 */
		*val = IA32_FEATURE_CONTROL_LOCK;
		break;
	case MSR_IA32_MISC_ENABLE:
		*val = misc_enable;
		break;
	case MSR_PLATFORM_INFO:
		*val = platform_info;
		break;
	case MSR_TURBO_RATIO_LIMIT:
	case MSR_TURBO_RATIO_LIMIT1:
		*val = turbo_ratio_limit;
		break;
	case MSR_PAT:
		*val = guest_msrs[IDX_MSR_PAT];
		break;
	default:
		return (VMR_UNHANLDED);
	}
	return (VMR_OK);
}

vm_msr_result_t
vmx_wrmsr(struct vmx *vmx, int vcpuid, uint32_t num, uint64_t val)
{
	uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
	uint64_t changed;

	switch (num) {
	case MSR_IA32_MISC_ENABLE:
		changed = val ^ misc_enable;
		/*
		 * If the host has disabled the NX feature then the guest
		 * also cannot use it. However, a Linux guest will try to
		 * enable the NX feature by writing to the MISC_ENABLE MSR.
		 *
		 * This can be safely ignored because the memory management
		 * code looks at CPUID.80000001H:EDX.NX to check if the
		 * functionality is actually enabled.
		 */
		changed &= ~(1UL << 34);

		/*
		 * Punt to userspace if any other bits are being modified.
		 */
		if (changed) {
			return (VMR_UNHANLDED);
		}
		break;
	case MSR_PAT:
		if (!pat_valid(val)) {
			return (VMR_GP);
		}
		guest_msrs[IDX_MSR_PAT] = val;
		break;
	default:
		return (VMR_UNHANLDED);
	}

	return (VMR_OK);
}