acrn-hypervisor/hypervisor/arch/x86/guest/nested.c


/*
* Copyright (C) 2021 Intel Corporation.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#include <types.h>
#include <logmsg.h>
#include <asm/mmu.h>
#include <asm/guest/virq.h>
#include <asm/guest/ept.h>
#include <asm/guest/vcpu.h>
#include <asm/guest/vm.h>
#include <asm/guest/vmcs.h>
#include <asm/guest/nested.h>
#include <asm/guest/vept.h>
/* Cache the content of MSR_IA32_VMX_BASIC */
static uint32_t vmx_basic;
static void disable_vmcs_shadowing(void);
static void clear_vvmcs(struct acrn_vcpu *vcpu, struct acrn_vvmcs *vvmcs);
/* The only purpose of this array is to serve the is_vmx_msr() function */
static const uint32_t vmx_msrs[NUM_VMX_MSRS] = {
LIST_OF_VMX_MSRS
};
bool is_vmx_msr(uint32_t msr)
{
bool found = false;
uint32_t i;
for (i = 0U; i < NUM_VMX_MSRS; i++) {
if (msr == vmx_msrs[i]) {
found = true;
break;
}
}
return found;
}
static uint64_t adjust_vmx_ctrls(uint32_t msr, uint64_t request_bits)
{
union value_64 val64, msr_val;
/*
* ISDM Appendix A.3, A.4, A.5:
* - Bits 31:0 indicate the allowed 0-settings of these controls.
* bit X of the corresponding VM-execution controls field is allowed to be 0
* if bit X in the MSR is cleared to 0
* - Bits 63:32 indicate the allowed 1-settings of these controls.
* VM entry allows control X to be 1 if bit 32+X in the MSR is set to 1
*/
msr_val.full = msr_read(msr);
/*
* The reserved bits in VMCS Control fields could be 0 or 1, determined by the
* corresponding capability MSR. So need to read them from physical MSR.
*
* We consider the bits that are set in the allowed 0-settings group as the
* minimal set of bits that need to be set from the physical processor's perspective.
* Since we shadow this control field, we passthru the allowed 0-settings bits.
*/
val64.u.lo_32 = msr_val.u.lo_32;
/* allowed 1-settings include those bits that are NOT allowed to be 0 */
val64.u.hi_32 = msr_val.u.lo_32;
/* make sure the requested features are supported by hardware */
val64.u.hi_32 |= (msr_val.u.hi_32 & request_bits);
return val64.full;
}
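/*
* Illustrative example (hypothetical MSR value, not from real hardware): if the
* capability MSR reads 0x000000ff00000016, the allowed 0-settings are 0x16 (bits
* that must be 1) and the allowed 1-settings are 0xff. With request_bits = 0x89,
* adjust_vmx_ctrls() returns 0x16 in bits 31:0 and (0x16 | (0xff & 0x89)) = 0x9f
* in bits 63:32, i.e. the guest may only enable features that the hardware
* supports and that ACRN chose to expose.
*/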
/*
* @pre vcpu != NULL
*/
void init_vmx_msrs(struct acrn_vcpu *vcpu)
{
union value_64 val64;
uint64_t request_bits, msr_value;
if (is_nvmx_configured(vcpu->vm)) {
/* MSR_IA32_VMX_BASIC */
val64.full = VMCS12_REVISION_ID /* Bits 30:0 - VMCS revision ID */
| (4096UL << 32U) /* Bits 44:32 - size of VMXON region and VMCS region */
| (6UL << 50U) /* Bits 53:50 - memory type for VMCS etc. (6: Write Back) */
| (1UL << 54U) /* Bit 54: VM-exit instruction-information for INS and OUTS */
| (1UL << 55U); /* Bit 55: VMX controls that default to 1 may be cleared to 0 */
vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_BASIC, val64.full);
/* MSR_IA32_VMX_MISC */
/*
* some bits need to be read from the physical MSR. For example, bits 4:0 report the relationship between
* the rate of the VMX-preemption timer and that of the timestamp counter (TSC).
*/
val64.full = msr_read(MSR_IA32_VMX_MISC);
val64.u.hi_32 = 0U;
/* Don't support Intel® Processor Trace (Intel PT) in VMX operation */
val64.u.lo_32 &= ~(1U << 14U);
/* Don't support SMM in VMX operation */
val64.u.lo_32 &= ~((1U << 15U) | (1U << 28U));
vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_MISC, val64.full);
/*
* TODO: These emulated VMX Control MSRs work for Tiger Lake and Kaby Lake;
* they may have problems if run on other platforms.
*
* We have not made a best effort to enable as many features as
* possible.
*/
/* MSR_IA32_VMX_PINBASED_CTLS */
request_bits = VMX_PINBASED_CTLS_IRQ_EXIT
| VMX_PINBASED_CTLS_NMI_EXIT
| VMX_PINBASED_CTLS_ENABLE_PTMR;
msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_PINBASED_CTLS, request_bits);
vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_PINBASED_CTLS, msr_value);
msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_TRUE_PINBASED_CTLS, request_bits);
vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_TRUE_PINBASED_CTLS, msr_value);
/* MSR_IA32_VMX_PROCBASED_CTLS */
request_bits = VMX_PROCBASED_CTLS_IRQ_WIN | VMX_PROCBASED_CTLS_TSC_OFF
| VMX_PROCBASED_CTLS_HLT | VMX_PROCBASED_CTLS_INVLPG
| VMX_PROCBASED_CTLS_MWAIT | VMX_PROCBASED_CTLS_RDPMC
| VMX_PROCBASED_CTLS_RDTSC | VMX_PROCBASED_CTLS_CR3_LOAD
| VMX_PROCBASED_CTLS_CR3_STORE | VMX_PROCBASED_CTLS_CR8_LOAD
| VMX_PROCBASED_CTLS_CR8_STORE | VMX_PROCBASED_CTLS_NMI_WINEXIT
| VMX_PROCBASED_CTLS_MOV_DR | VMX_PROCBASED_CTLS_UNCOND_IO
| VMX_PROCBASED_CTLS_MSR_BITMAP | VMX_PROCBASED_CTLS_MONITOR
| VMX_PROCBASED_CTLS_PAUSE | VMX_PROCBASED_CTLS_SECONDARY;
msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_PROCBASED_CTLS, request_bits);
vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_PROCBASED_CTLS, msr_value);
msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_TRUE_PROCBASED_CTLS, request_bits);
vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_TRUE_PROCBASED_CTLS, msr_value);
/* MSR_IA32_VMX_PROCBASED_CTLS2 */
request_bits = VMX_PROCBASED_CTLS2_EPT | VMX_PROCBASED_CTLS2_RDTSCP
| VMX_PROCBASED_CTLS2_VPID | VMX_PROCBASED_CTLS2_WBINVD
| VMX_PROCBASED_CTLS2_UNRESTRICT | VMX_PROCBASED_CTLS2_PAUSE_LOOP
| VMX_PROCBASED_CTLS2_RDRAND | VMX_PROCBASED_CTLS2_INVPCID
| VMX_PROCBASED_CTLS2_RDSEED | VMX_PROCBASED_CTLS2_XSVE_XRSTR
| VMX_PROCBASED_CTLS2_TSC_SCALING;
msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_PROCBASED_CTLS2, request_bits);
vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_PROCBASED_CTLS2, msr_value);
/* MSR_IA32_VMX_EXIT_CTLS */
request_bits = VMX_EXIT_CTLS_SAVE_DBG | VMX_EXIT_CTLS_HOST_ADDR64
| VMX_EXIT_CTLS_ACK_IRQ | VMX_EXIT_CTLS_LOAD_PAT
| VMX_EXIT_CTLS_LOAD_EFER;
msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_EXIT_CTLS, request_bits);
vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_EXIT_CTLS, msr_value);
msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_TRUE_EXIT_CTLS, request_bits);
vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_TRUE_EXIT_CTLS, msr_value);
/* MSR_IA32_VMX_ENTRY_CTLS */
request_bits = VMX_ENTRY_CTLS_LOAD_DBG | VMX_ENTRY_CTLS_IA32E_MODE
| VMX_ENTRY_CTLS_LOAD_PERF | VMX_ENTRY_CTLS_LOAD_PAT
| VMX_ENTRY_CTLS_LOAD_EFER;
msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_ENTRY_CTLS, request_bits);
vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_ENTRY_CTLS, msr_value);
msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_TRUE_ENTRY_CTLS, request_bits);
vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_TRUE_ENTRY_CTLS, msr_value);
msr_value = msr_read(MSR_IA32_VMX_EPT_VPID_CAP);
/*
* Hide 5 level EPT capability
* Hide accessed and dirty flags for EPT
*/
msr_value &= ~(VMX_EPT_PAGE_WALK_5 | VMX_EPT_AD | VMX_EPT_2MB_PAGE | VMX_EPT_1GB_PAGE);
vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_EPT_VPID_CAP, msr_value);
/* For now passthru the value from physical MSR to L1 guest */
msr_value = msr_read(MSR_IA32_VMX_CR0_FIXED0);
vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_CR0_FIXED0, msr_value);
msr_value = msr_read(MSR_IA32_VMX_CR0_FIXED1);
vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_CR0_FIXED1, msr_value);
msr_value = msr_read(MSR_IA32_VMX_CR4_FIXED0);
vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_CR4_FIXED0, msr_value);
msr_value = msr_read(MSR_IA32_VMX_CR4_FIXED1);
vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_CR4_FIXED1, msr_value);
msr_value = msr_read(MSR_IA32_VMX_VMCS_ENUM);
vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_VMCS_ENUM, msr_value);
}
}
/*
* @pre vcpu != NULL
*/
int32_t read_vmx_msr(struct acrn_vcpu *vcpu, uint32_t msr, uint64_t *val)
{
uint64_t v = 0UL;
int32_t err = 0;
if (is_nvmx_configured(vcpu->vm)) {
switch (msr) {
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
case MSR_IA32_VMX_PINBASED_CTLS:
case MSR_IA32_VMX_PROCBASED_CTLS:
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
case MSR_IA32_VMX_PROCBASED_CTLS2:
case MSR_IA32_VMX_EXIT_CTLS:
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
case MSR_IA32_VMX_ENTRY_CTLS:
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
case MSR_IA32_VMX_BASIC:
case MSR_IA32_VMX_MISC:
case MSR_IA32_VMX_EPT_VPID_CAP:
case MSR_IA32_VMX_CR0_FIXED0:
case MSR_IA32_VMX_CR0_FIXED1:
case MSR_IA32_VMX_CR4_FIXED0:
case MSR_IA32_VMX_CR4_FIXED1:
case MSR_IA32_VMX_VMCS_ENUM:
{
v = vcpu_get_guest_msr(vcpu, msr);
break;
}
/* Don't support these MSRs yet */
case MSR_IA32_SMBASE:
case MSR_IA32_VMX_PROCBASED_CTLS3:
case MSR_IA32_VMX_VMFUNC:
default:
err = -EACCES;
break;
}
} else {
err = -EACCES;
}
*val = v;
return err;
}
#define MAX_SHADOW_VMCS_FIELDS 113U
/*
* VMCS fields included in the dual-purpose VMCS: as shadow for L1 and
* as hardware VMCS for nested guest (L2).
*
* TODO: This list is for TGL and CFL machines, and the fields
* for advanced APICv features such as Posted Interrupt and Virtual
* Interrupt Delivery are not included, as these are not available
* on those platforms.
*
* Certain fields, e.g. VMX_TSC_MULTIPLIER_FULL, are available only if
* "use TSC scaling" is supported. Thus a static array may not work
* for all platforms.
*/
static const uint32_t vmcs_shadowing_fields[MAX_SHADOW_VMCS_FIELDS] = {
/* 16-bits */
VMX_GUEST_ES_SEL,
VMX_GUEST_CS_SEL,
VMX_GUEST_SS_SEL,
VMX_GUEST_DS_SEL,
VMX_GUEST_FS_SEL,
VMX_GUEST_GS_SEL,
VMX_GUEST_LDTR_SEL,
VMX_GUEST_TR_SEL,
VMX_GUEST_PML_INDEX,
/* 64-bits */
VMX_IO_BITMAP_A_FULL,
VMX_IO_BITMAP_B_FULL,
VMX_EXIT_MSR_STORE_ADDR_FULL,
VMX_EXIT_MSR_LOAD_ADDR_FULL,
VMX_ENTRY_MSR_LOAD_ADDR_FULL,
VMX_EXECUTIVE_VMCS_PTR_FULL,
VMX_TSC_OFFSET_FULL,
VMX_VIRTUAL_APIC_PAGE_ADDR_FULL,
VMX_APIC_ACCESS_ADDR_FULL,
VMX_VMREAD_BITMAP_FULL,
VMX_VMWRITE_BITMAP_FULL,
VMX_XSS_EXITING_BITMAP_FULL,
VMX_TSC_MULTIPLIER_FULL,
VMX_GUEST_PHYSICAL_ADDR_FULL,
VMX_VMS_LINK_PTR_FULL,
VMX_GUEST_IA32_DEBUGCTL_FULL,
VMX_GUEST_IA32_PAT_FULL,
VMX_GUEST_IA32_EFER_FULL,
VMX_GUEST_IA32_PERF_CTL_FULL,
VMX_GUEST_PDPTE0_FULL,
VMX_GUEST_PDPTE1_FULL,
VMX_GUEST_PDPTE2_FULL,
VMX_GUEST_PDPTE3_FULL,
/* 32-bits */
VMX_PIN_VM_EXEC_CONTROLS,
VMX_PROC_VM_EXEC_CONTROLS,
VMX_EXCEPTION_BITMAP,
VMX_PF_ERROR_CODE_MASK,
VMX_PF_ERROR_CODE_MATCH,
VMX_CR3_TARGET_COUNT,
VMX_EXIT_MSR_STORE_COUNT,
VMX_EXIT_MSR_LOAD_COUNT,
VMX_ENTRY_MSR_LOAD_COUNT,
VMX_ENTRY_INT_INFO_FIELD,
VMX_ENTRY_EXCEPTION_ERROR_CODE,
VMX_ENTRY_INSTR_LENGTH,
VMX_TPR_THRESHOLD,
VMX_PROC_VM_EXEC_CONTROLS2,
VMX_PLE_GAP,
VMX_PLE_WINDOW,
VMX_INSTR_ERROR,
VMX_EXIT_REASON,
VMX_EXIT_INT_INFO,
VMX_EXIT_INT_ERROR_CODE,
VMX_IDT_VEC_INFO_FIELD,
VMX_IDT_VEC_ERROR_CODE,
VMX_EXIT_INSTR_LEN,
VMX_INSTR_INFO,
VMX_GUEST_ES_LIMIT,
VMX_GUEST_CS_LIMIT,
VMX_GUEST_SS_LIMIT,
VMX_GUEST_DS_LIMIT,
VMX_GUEST_FS_LIMIT,
VMX_GUEST_GS_LIMIT,
VMX_GUEST_LDTR_LIMIT,
VMX_GUEST_TR_LIMIT,
VMX_GUEST_GDTR_LIMIT,
VMX_GUEST_IDTR_LIMIT,
VMX_GUEST_ES_ATTR,
VMX_GUEST_CS_ATTR,
VMX_GUEST_SS_ATTR,
VMX_GUEST_DS_ATTR,
VMX_GUEST_FS_ATTR,
VMX_GUEST_GS_ATTR,
VMX_GUEST_LDTR_ATTR,
VMX_GUEST_TR_ATTR,
VMX_GUEST_INTERRUPTIBILITY_INFO,
VMX_GUEST_ACTIVITY_STATE,
VMX_GUEST_SMBASE,
VMX_GUEST_IA32_SYSENTER_CS,
VMX_GUEST_TIMER,
/* Natural-width */
VMX_CR0_GUEST_HOST_MASK,
VMX_CR4_GUEST_HOST_MASK,
VMX_CR0_READ_SHADOW,
VMX_CR4_READ_SHADOW,
VMX_CR3_TARGET_0,
VMX_CR3_TARGET_1,
VMX_CR3_TARGET_2,
VMX_CR3_TARGET_3,
VMX_EXIT_QUALIFICATION,
VMX_IO_RCX,
VMX_IO_RSI,
VMX_IO_RDI,
VMX_IO_RIP,
VMX_GUEST_LINEAR_ADDR,
VMX_GUEST_CR0,
VMX_GUEST_CR3,
VMX_GUEST_CR4,
VMX_GUEST_ES_BASE,
VMX_GUEST_CS_BASE,
VMX_GUEST_SS_BASE,
VMX_GUEST_DS_BASE,
VMX_GUEST_FS_BASE,
VMX_GUEST_GS_BASE,
VMX_GUEST_LDTR_BASE,
VMX_GUEST_TR_BASE,
VMX_GUEST_GDTR_BASE,
VMX_GUEST_IDTR_BASE,
VMX_GUEST_DR7,
VMX_GUEST_RSP,
VMX_GUEST_RIP,
VMX_GUEST_RFLAGS,
VMX_GUEST_PENDING_DEBUG_EXCEPT,
VMX_GUEST_IA32_SYSENTER_ESP,
VMX_GUEST_IA32_SYSENTER_EIP
};
/* to be shared by all vCPUs for all nested guests */
static uint64_t vmcs_shadowing_bitmap[PAGE_SIZE / sizeof(uint64_t)] __aligned(PAGE_SIZE);
static void setup_vmcs_shadowing_bitmap(void)
{
uint16_t field_index;
uint32_t array_index;
uint16_t bit_pos;
/*
* Set all the bits to 1 first, then clear the bits for the
* fields that ACRN allows its guests to access in the shadow VMCS
*/
memset((void *)vmcs_shadowing_bitmap, 0xFFU, PAGE_SIZE);
/*
* Refer to ISDM Section 24.6.15 VMCS Shadowing Bitmap Addresses
* and Section 30.3 VMX Instructions - VMWRITE/VMREAD
*/
for (field_index = 0U; field_index < MAX_SHADOW_VMCS_FIELDS; field_index++) {
bit_pos = vmcs_shadowing_fields[field_index] % 64U;
array_index = vmcs_shadowing_fields[field_index] / 64U;
bitmap_clear_nolock(bit_pos, &vmcs_shadowing_bitmap[array_index]);
}
}
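/*
* Example of the bitmap layout (field encodings per ISDM Appendix B):
* VMX_GUEST_CR0 is encoded as 0x6800, so the loop above clears bit
* (0x6800 % 64) = 0 of vmcs_shadowing_bitmap[0x6800 / 64] = vmcs_shadowing_bitmap[416],
* which lets L1 VMREAD/VMWRITE the Guest CR0 field directly in the shadow VMCS
* without causing a VM exit.
*/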
/*
* This is an array of offsets into a structure of type "struct acrn_vmcs12":
* 16 offsets in total, one per GROUP (4 "field widths" times 4 "field types").
* "Field type" is either Control, Read-Only Data, Guest State or Host State.
* Refer to the definition of "struct acrn_vmcs12" on how the fields are
* grouped together for these offsets to work in tandem.
* Refer to Intel SDM Appendix B Field Encoding in VMCS for info on how
* fields are grouped and indexed within a group.
*/
static const uint16_t vmcs12_group_offset_table[16] = {
offsetof(struct acrn_vmcs12, vpid), /* 16-bit Control Fields */
offsetof(struct acrn_vmcs12, padding), /* 16-bit Read-Only Fields */
offsetof(struct acrn_vmcs12, guest_es), /* 16-bit Guest-State Fields */
offsetof(struct acrn_vmcs12, host_es), /* 16-bit Host-State Fields */
offsetof(struct acrn_vmcs12, io_bitmap_a), /* 64-bit Control Fields */
offsetof(struct acrn_vmcs12, guest_phys_addr), /* 64-bit Read-Only Data Fields */
offsetof(struct acrn_vmcs12, vmcs_link_ptr), /* 64-bit Guest-State Fields */
offsetof(struct acrn_vmcs12, host_ia32_pat), /* 64-bit Host-State Fields */
offsetof(struct acrn_vmcs12, pin_based_exec_ctrl), /* 32-bit Control Fields */
offsetof(struct acrn_vmcs12, vm_instr_error), /* 32-bit Read-Only Data Fields */
offsetof(struct acrn_vmcs12, guest_es_limit), /* 32-bit Guest-State Fields */
offsetof(struct acrn_vmcs12, host_ia32_sysenter_cs), /* 32-bit Host-State Fields */
offsetof(struct acrn_vmcs12, cr0_guest_host_mask), /* Natural-width Control Fields */
offsetof(struct acrn_vmcs12, exit_qual), /* Natural-width Read-Only Data Fields */
offsetof(struct acrn_vmcs12, guest_cr0), /* Natural-width Guest-State Fields */
offsetof(struct acrn_vmcs12, host_cr0), /* Natural-width Host-State Fields */
};
/*
* field_idx is the index of the field within the group.
*
* Access-type is 0 for all widths except 64-bit.
* For 64-bit fields, an access-type of 1 moves the offset to the
* high 4 bytes of the field.
*/
#define OFFSET_INTO_VMCS12(group_idx, field_idx, width_in_bytes, access_type) \
(vmcs12_group_offset_table[group_idx] + \
field_idx * width_in_bytes + \
access_type * sizeof(uint32_t))
/* Given a vmcs field, this API returns the offset into "struct acrn_vmcs12" */
static uint16_t vmcs_field_to_vmcs12_offset(uint32_t vmcs_field)
{
/*
* Refer to Appendix B Field Encoding in VMCS in SDM
* A value of group index 0001b is not valid because there are no 16-bit
* Read-Only fields.
*
* TODO: check invalid VMCS field
*/
uint16_t group_idx = (VMX_VMCS_FIELD_WIDTH(vmcs_field) << 2U) | VMX_VMCS_FIELD_TYPE(vmcs_field);
uint8_t field_width = VMX_VMCS_FIELD_WIDTH(vmcs_field);
uint8_t width_in_bytes;
if (field_width == VMX_VMCS_FIELD_WIDTH_16) {
width_in_bytes = 2U;
} else if (field_width == VMX_VMCS_FIELD_WIDTH_32) {
width_in_bytes = 4U;
} else {
/*
* Natural-width or 64-bit
*/
width_in_bytes = 8U;
}
return OFFSET_INTO_VMCS12(group_idx,
VMX_VMCS_FIELD_INDEX(vmcs_field), width_in_bytes, /* field index within the group */
VMX_VMCS_FIELD_ACCESS_HIGH(vmcs_field));
}
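/*
* Worked example (encoding per ISDM Appendix B): VMX_GUEST_ES_LIMIT is encoded
* as 0x4800, i.e. width = 2 (32-bit), type = 2 (guest state), index = 0 and
* access-type = 0. Thus group_idx = (2 << 2) | 2 = 10, which selects
* offsetof(struct acrn_vmcs12, guest_es_limit); with width_in_bytes = 4 and
* field index 0, the returned offset points exactly at the guest_es_limit member.
*/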
/*
* Given a vmcs field and the pointer to the vmcs12, this API returns the
* corresponding value from the VMCS
*/
static uint64_t vmcs12_read_field(void *vmcs_hva, uint32_t field)
{
uint64_t *ptr = (uint64_t *)(vmcs_hva + vmcs_field_to_vmcs12_offset(field));
uint64_t val64 = 0UL;
switch (VMX_VMCS_FIELD_WIDTH(field)) {
case VMX_VMCS_FIELD_WIDTH_16:
val64 = *(uint16_t *)ptr;
break;
case VMX_VMCS_FIELD_WIDTH_32:
val64 = *(uint32_t *)ptr;
break;
case VMX_VMCS_FIELD_WIDTH_64:
if (!!VMX_VMCS_FIELD_ACCESS_HIGH(field)) {
val64 = *(uint32_t *)ptr;
} else {
val64 = *ptr;
}
break;
case VMX_VMCS_FIELD_WIDTH_NATURAL:
default:
val64 = *ptr;
break;
}
return val64;
}
/*
* Write the given VMCS field to the given vmcs12 data structure.
*/
static void vmcs12_write_field(void *vmcs_hva, uint32_t field, uint64_t val64)
{
uint64_t *ptr = (uint64_t *)(vmcs_hva + vmcs_field_to_vmcs12_offset(field));
switch (VMX_VMCS_FIELD_WIDTH(field)) {
case VMX_VMCS_FIELD_WIDTH_16:
*(uint16_t *)ptr = (uint16_t)val64;
break;
case VMX_VMCS_FIELD_WIDTH_32:
*(uint32_t *)ptr = (uint32_t)val64;
break;
case VMX_VMCS_FIELD_WIDTH_64:
if (!!VMX_VMCS_FIELD_ACCESS_HIGH(field)) {
*(uint32_t *)ptr = (uint32_t)val64;
} else {
*ptr = val64;
}
break;
case VMX_VMCS_FIELD_WIDTH_NATURAL:
default:
*ptr = val64;
break;
}
}
void nested_vmx_result(enum VMXResult result, int error_number)
{
uint64_t rflags = exec_vmread(VMX_GUEST_RFLAGS);
/* ISDM: section 30.2 CONVENTIONS */
rflags &= ~(RFLAGS_C | RFLAGS_P | RFLAGS_A | RFLAGS_Z | RFLAGS_S | RFLAGS_O);
if (result == VMfailValid) {
rflags |= RFLAGS_Z;
exec_vmwrite(VMX_INSTR_ERROR, error_number);
} else if (result == VMfailInvalid) {
rflags |= RFLAGS_C;
} else {
/* VMsucceed, do nothing */
}
if (result != VMsucceed) {
pr_err("VMX failed: %d/%d", result, error_number);
}
exec_vmwrite(VMX_GUEST_RFLAGS, rflags);
}
/**
* @brief get the memory-address operand of a vmx instruction
*
* @pre vcpu != NULL
*/
static uint64_t get_vmx_memory_operand(struct acrn_vcpu *vcpu, uint32_t instr_info)
{
uint64_t gva, gpa, seg_base = 0UL;
uint32_t seg, err_code = 0U;
uint64_t offset;
/*
* According to ISDM 3B: Basic VM-Exit Information: For INVEPT, INVPCID, INVVPID, LGDT,
* LIDT, LLDT, LTR, SGDT, SIDT, SLDT, STR, VMCLEAR, VMPTRLD, VMPTRST, VMREAD, VMWRITE,
* VMXON, XRSTORS, and XSAVES, the exit qualification receives the value of the instruction's
* displacement field, which is sign-extended to 64 bits.
*/
offset = vcpu->arch.exit_qualification;
/* TODO: should we consider the cases of address size (bits 9:7 in instr_info) is 16 or 32? */
/*
* refer to ISDM Vol.1-3-24 Operand addressing on how to calculate an effective address
* offset = base + [index * scale] + displacement
* address = segment_base + offset
*/
if (VMX_II_BASE_REG_VALID(instr_info)) {
offset += vcpu_get_gpreg(vcpu, VMX_II_BASE_REG(instr_info));
}
if (VMX_II_IDX_REG_VALID(instr_info)) {
uint64_t val64 = vcpu_get_gpreg(vcpu, VMX_II_IDX_REG(instr_info));
offset += (val64 << VMX_II_SCALING(instr_info));
}
/*
* In 64-bit mode, the processor treats the segment base of CS, DS, ES, SS as zero,
* creating a linear address that is equal to the effective address.
* The exceptions are the FS and GS segments, whose segment registers can be used as
* additional base registers in some linear address calculations.
*/
seg = VMX_II_SEG_REG(instr_info);
if (seg == 4U) {
seg_base = exec_vmread(VMX_GUEST_FS_BASE);
}
if (seg == 5U) {
seg_base = exec_vmread(VMX_GUEST_GS_BASE);
}
gva = seg_base + offset;
(void)gva2gpa(vcpu, gva, &gpa, &err_code);
return gpa;
}
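/*
* Illustrative example: for an L1 instruction such as "vmptrld [rbx + rcx*8 + 0x10]",
* the VM-exit instruction information reports base = RBX, index = RCX and
* scaling = 3, while the exit qualification holds the displacement 0x10, so the
* code above computes offset = RBX + (RCX << 3) + 0x10; outside of FS/GS-relative
* addressing the resulting gva equals this offset.
*/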
/*
* @pre vcpu != NULL
*/
static uint64_t get_vmptr_gpa(struct acrn_vcpu *vcpu)
{
uint64_t gpa, vmptr;
/* get VMX pointer, which points to the VMCS or VMXON region GPA */
gpa = get_vmx_memory_operand(vcpu, exec_vmread(VMX_INSTR_INFO));
/* get the address (GPA) of the VMCS for VMPTRLD/VMCLEAR, or VMXON region for VMXON */
(void)copy_from_gpa(vcpu->vm, (void *)&vmptr, gpa, sizeof(uint64_t));
return vmptr;
}
static bool validate_vmptr_gpa(uint64_t vmptr_gpa)
{
/* We don't emulate CPUID.80000008H for guests, so check against the physical address width */
struct cpuinfo_x86 *cpu_info = get_pcpu_info();
return (mem_aligned_check(vmptr_gpa, PAGE_SIZE) && ((vmptr_gpa >> cpu_info->phys_bits) == 0UL));
}
/**
* @pre vcpu != NULL
*/
static bool validate_vmcs_revision_id(struct acrn_vcpu *vcpu, uint64_t vmptr_gpa)
{
uint32_t revision_id;
(void)copy_from_gpa(vcpu->vm, (void *)&revision_id, vmptr_gpa, sizeof(uint32_t));
/*
* The VMCS revision ID must equal what is reported by the emulated IA32_VMX_BASIC MSR.
* Bit 31 of VMCS12_REVISION_ID is never set, so the following comparison
* implicitly validates revision_id[31] as well.
*/
return (revision_id == VMCS12_REVISION_ID);
}
int32_t get_guest_cpl(void)
{
/*
* We get CPL from SS.DPL because:
*
* CS.DPL may not equal the CPL for conforming code segments. ISDM 5.5 PRIVILEGE LEVELS:
* Conforming code segments can be accessed from any privilege level that is equal to or
* numerically greater (less privileged) than the DPL of the conforming code segment.
*
* ISDM 24.4.1 Guest Register State: The value of the DPL field for SS is always
* equal to the logical processor's current privilege level (CPL).
*/
uint32_t ar = exec_vmread32(VMX_GUEST_SS_ATTR);
return ((ar >> 5) & 3);
}
static bool validate_nvmx_cr0_cr4(uint64_t cr0_4, uint64_t fixed0, uint64_t fixed1)
{
bool valid = true;
/* If bit X is 1 in IA32_VMX_CR0/4_FIXED0, then that bit of CR0/4 is fixed to 1 in VMX operation */
if ((cr0_4 & fixed0) != fixed0) {
valid = false;
}
/* if bit X is 0 in IA32_VMX_CR0/4_FIXED1, then that bit of CR0/4 is fixed to 0 in VMX operation */
/* Bits 63:32 of CR0 and CR4 are reserved and must be written with zeros */
if ((uint32_t)(~cr0_4 & ~fixed1) != (uint32_t)~fixed1) {
valid = false;
}
return valid;
}
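/*
* Illustrative example with hypothetical fixed values: if IA32_VMX_CR0_FIXED0
* were 0x80000021 (PE, NE and PG must be 1) and IA32_VMX_CR0_FIXED1 were
* 0xFFFFFFFF (no bit forced to 0), then CR0 = 0x80000031 would pass both checks,
* while CR0 = 0x00000031 would fail the FIXED0 check because PG is clear.
*/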
/*
* @pre vcpu != NULL
*/
static bool validate_nvmx_cr0(struct acrn_vcpu *vcpu)
{
return validate_nvmx_cr0_cr4(vcpu_get_cr0(vcpu), msr_read(MSR_IA32_VMX_CR0_FIXED0),
msr_read(MSR_IA32_VMX_CR0_FIXED1));
}
/*
* @pre vcpu != NULL
*/
static bool validate_nvmx_cr4(struct acrn_vcpu *vcpu)
{
return validate_nvmx_cr0_cr4(vcpu_get_cr4(vcpu), msr_read(MSR_IA32_VMX_CR4_FIXED0),
msr_read(MSR_IA32_VMX_CR4_FIXED1));
}
/*
* @pre vcpu != NULL
*/
static void reset_vvmcs(struct acrn_vcpu *vcpu)
{
struct acrn_vvmcs *vvmcs;
uint32_t idx;
vcpu->arch.nested.current_vvmcs = NULL;
for (idx = 0U; idx < MAX_ACTIVE_VVMCS_NUM; idx++) {
vvmcs = &vcpu->arch.nested.vvmcs[idx];
vvmcs->host_state_dirty = false;
vvmcs->control_fields_dirty = false;
vvmcs->vmcs12_gpa = INVALID_GPA;
vvmcs->ref_cnt = 0;
(void)memset(vvmcs->vmcs02, 0U, PAGE_SIZE);
(void)memset(&vvmcs->vmcs12, 0U, sizeof(struct acrn_vmcs12));
}
}
/*
* @pre vcpu != NULL
*/
int32_t vmxon_vmexit_handler(struct acrn_vcpu *vcpu)
{
const uint64_t features = MSR_IA32_FEATURE_CONTROL_LOCK | MSR_IA32_FEATURE_CONTROL_VMX_NO_SMX;
uint32_t ar = exec_vmread32(VMX_GUEST_CS_ATTR);
if (is_nvmx_configured(vcpu->vm)) {
if (((vcpu_get_cr0(vcpu) & CR0_PE) == 0UL)
|| ((vcpu_get_cr4(vcpu) & CR4_VMXE) == 0UL)
|| ((vcpu_get_rflags(vcpu) & RFLAGS_VM) != 0U)) {
vcpu_inject_ud(vcpu);
} else if (((vcpu_get_efer(vcpu) & MSR_IA32_EFER_LMA_BIT) == 0U)
|| ((ar & (1U << 13U)) == 0U)) {
/* ACRN currently doesn't support a 32-bit L1 hypervisor */
vcpu_inject_ud(vcpu);
} else if ((get_guest_cpl() != 0)
|| !validate_nvmx_cr0(vcpu)
|| !validate_nvmx_cr4(vcpu)
|| ((vcpu_get_guest_msr(vcpu, MSR_IA32_FEATURE_CONTROL) & features) != features)) {
vcpu_inject_gp(vcpu, 0U);
} else if (vcpu->arch.nested.vmxon == true) {
nested_vmx_result(VMfailValid, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
} else {
uint64_t vmptr_gpa = get_vmptr_gpa(vcpu);
if (!validate_vmptr_gpa(vmptr_gpa)) {
nested_vmx_result(VMfailInvalid, 0);
} else if (!validate_vmcs_revision_id(vcpu, vmptr_gpa)) {
nested_vmx_result(VMfailInvalid, 0);
} else {
vcpu->arch.nested.vmxon = true;
vcpu->arch.nested.in_l2_guest = false;
vcpu->arch.nested.vmxon_ptr = vmptr_gpa;
reset_vvmcs(vcpu);
nested_vmx_result(VMsucceed, 0);
}
}
} else {
vcpu_inject_ud(vcpu);
}
return 0;
}
/*
* @pre vcpu != NULL
*/
bool check_vmx_permission(struct acrn_vcpu *vcpu)
{
bool permit = true;
/* If this VM is not nVMX enabled, it implies that 'vmxon == false' */
if ((vcpu->arch.nested.vmxon == false)
|| ((vcpu_get_cr0(vcpu) & CR0_PE) == 0UL)
|| ((vcpu_get_rflags(vcpu) & RFLAGS_VM) != 0U)) {
/* We rely on hardware to check "IA32_EFER.LMA = 1 and CS.L = 0" */
vcpu_inject_ud(vcpu);
permit = false;
} else if (get_guest_cpl() != 0) {
vcpu_inject_gp(vcpu, 0U);
permit = false;
}
return permit;
}
/*
* @pre vcpu != NULL
* @pre vcpu->vm != NULL
*/
int32_t vmxoff_vmexit_handler(struct acrn_vcpu *vcpu)
{
if (check_vmx_permission(vcpu)) {
disable_vmcs_shadowing();
vcpu->arch.nested.vmxon = false;
vcpu->arch.nested.in_l2_guest = false;
reset_vvmcs(vcpu);
nested_vmx_result(VMsucceed, 0);
}
return 0;
}
/*
* Only VMCS fields of width 64-bit, 32-bit, and natural-width can be
* read-only. A value of 1 in bits [11:10] of these field encodings
* indicates a read-only field. ISDM Appendix B.
*/
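/*
* Example: VMX_EXIT_REASON is encoded as 0x4402 (ISDM Appendix B); its width
* bits 14:13 are 2 (32-bit) and its type bits 11:10 are 1, so it is a read-only
* data field and is_ro_vmcs_field() returns true for it.
*/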
static inline bool is_ro_vmcs_field(uint32_t field)
{
const uint8_t w = VMX_VMCS_FIELD_WIDTH(field);
return (VMX_VMCS_FIELD_WIDTH_16 != w) && (VMX_VMCS_FIELD_TYPE(field) == 1U);
}
/*
* @pre vcpu != NULL
*/
static struct acrn_vvmcs *lookup_vvmcs(struct acrn_vcpu *vcpu, uint64_t vmcs12_gpa)
{
struct acrn_vvmcs *vvmcs = NULL;
uint32_t idx;
for (idx = 0U; idx < MAX_ACTIVE_VVMCS_NUM; idx++) {
if (vcpu->arch.nested.vvmcs[idx].vmcs12_gpa == vmcs12_gpa) {
vvmcs = &vcpu->arch.nested.vvmcs[idx];
break;
}
}
return vvmcs;
}
/*
* @pre vcpu != NULL
*/
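/*
* Replacement policy: ref_cnt counts how many times a cached VMCS12 has been
* VMPTRLDed since the last eviction. When no inactive slot is available, the
* entry with the smallest count is flushed back to L1 memory and reused, and
* all counters restart from zero, approximating a least-frequently-used scheme.
*/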
static struct acrn_vvmcs *get_or_replace_vvmcs_entry(struct acrn_vcpu *vcpu)
{
struct acrn_nested *nested = &vcpu->arch.nested;
struct acrn_vvmcs *vvmcs = NULL;
uint32_t idx, min_cnt = ~0U;
/* look for an inactive entry first */
for (idx = 0U; idx < MAX_ACTIVE_VVMCS_NUM; idx++) {
if (nested->vvmcs[idx].vmcs12_gpa == INVALID_GPA) {
/* found an inactive vvmcs[] entry. */
vvmcs = &nested->vvmcs[idx];
break;
}
}
/* In case we have to release an active entry to make room for the new VMCS12 */
if (vvmcs == NULL) {
for (idx = 0U; idx < MAX_ACTIVE_VVMCS_NUM; idx++) {
/* look for the entry with least reference count */
if (nested->vvmcs[idx].ref_cnt < min_cnt) {
min_cnt = nested->vvmcs[idx].ref_cnt;
vvmcs = &nested->vvmcs[idx];
}
}
clear_vvmcs(vcpu, vvmcs);
}
/* reset ref_cnt for all entries */
for (idx = 0U; idx < MAX_ACTIVE_VVMCS_NUM; idx++) {
nested->vvmcs[idx].ref_cnt = 0U;
}
return vvmcs;
}
/*
* @brief emulate VMREAD instruction from L1
* @pre vcpu != NULL
*/
int32_t vmread_vmexit_handler(struct acrn_vcpu *vcpu)
{
struct acrn_vvmcs *cur_vvmcs = vcpu->arch.nested.current_vvmcs;
const uint32_t info = exec_vmread(VMX_INSTR_INFO);
uint64_t vmcs_value, gpa;
uint32_t vmcs_field;
if (check_vmx_permission(vcpu)) {
if ((cur_vvmcs == NULL) || (cur_vvmcs->vmcs12_gpa == INVALID_GPA)) {
nested_vmx_result(VMfailInvalid, 0);
} else {
/* TODO: VMfailValid for invalid VMCS fields */
vmcs_field = (uint32_t)vcpu_get_gpreg(vcpu, VMX_II_REG2(info));
vmcs_value = vmcs12_read_field(&cur_vvmcs->vmcs12, vmcs_field);
/* Currently ACRN doesn't support a 32-bit L1 hypervisor, so operands are assumed to be 64 bits */
if (VMX_II_IS_REG(info)) {
vcpu_set_gpreg(vcpu, VMX_II_REG1(info), vmcs_value);
} else {
gpa = get_vmx_memory_operand(vcpu, info);
(void)copy_to_gpa(vcpu->vm, &vmcs_value, gpa, 8U);
}
pr_dbg("vmcs_field: %x vmcs_value: %llx", vmcs_field, vmcs_value);
nested_vmx_result(VMsucceed, 0);
}
}
return 0;
}
/*
* @brief emulate VMWRITE instruction from L1
* @pre vcpu != NULL
*/
int32_t vmwrite_vmexit_handler(struct acrn_vcpu *vcpu)
{
struct acrn_vvmcs *cur_vvmcs = vcpu->arch.nested.current_vvmcs;
const uint32_t info = exec_vmread(VMX_INSTR_INFO);
uint64_t vmcs_value, gpa;
uint32_t vmcs_field;
if (check_vmx_permission(vcpu)) {
if ((cur_vvmcs == NULL) || (cur_vvmcs->vmcs12_gpa == INVALID_GPA)) {
nested_vmx_result(VMfailInvalid, 0);
} else {
/* TODO: VMfailValid for invalid VMCS fields */
vmcs_field = (uint32_t)vcpu_get_gpreg(vcpu, VMX_II_REG2(info));
if (is_ro_vmcs_field(vmcs_field) &&
((vcpu_get_guest_msr(vcpu, MSR_IA32_VMX_MISC) & (1UL << 29U)) == 0UL)) {
nested_vmx_result(VMfailValid, VMXERR_VMWRITE_RO_COMPONENT);
} else {
/* Currently a 32-bit L1 hypervisor is not supported, so operands are assumed to be 64 bits */
if (VMX_II_IS_REG(info)) {
vmcs_value = vcpu_get_gpreg(vcpu, VMX_II_REG1(info));
} else {
gpa = get_vmx_memory_operand(vcpu, info);
(void)copy_from_gpa(vcpu->vm, &vmcs_value, gpa, 8U);
}
if (VMX_VMCS_FIELD_TYPE(vmcs_field) == VMX_VMCS_FIELD_TYPE_HOST) {
cur_vvmcs->host_state_dirty = true;
}
if ((vmcs_field == VMX_MSR_BITMAP_FULL)
|| (vmcs_field == VMX_EPT_POINTER_FULL)
|| (vmcs_field == VMX_VPID)
|| (vmcs_field == VMX_ENTRY_CONTROLS)
|| (vmcs_field == VMX_EXIT_CONTROLS)) {
cur_vvmcs->control_fields_dirty = true;
if (vmcs_field == VMX_EPT_POINTER_FULL) {
if (cur_vvmcs->vmcs12.ept_pointer != vmcs_value) {
put_vept_desc(cur_vvmcs->vmcs12.ept_pointer);
get_vept_desc(vmcs_value);
}
}
}
pr_dbg("vmcs_field: %x vmcs_value: %llx", vmcs_field, vmcs_value);
vmcs12_write_field(&cur_vvmcs->vmcs12, vmcs_field, vmcs_value);
nested_vmx_result(VMsucceed, 0);
}
}
}
return 0;
}
/**
* @brief Sync shadow fields from vmcs02 to the cached VMCS12
*
* @pre vmcs12 != NULL
* @pre vmcs02 is current
*/
static void sync_vmcs02_to_vmcs12(struct acrn_vmcs12 *vmcs12)
{
uint64_t val64;
uint32_t idx;
for (idx = 0; idx < MAX_SHADOW_VMCS_FIELDS; idx++) {
val64 = exec_vmread(vmcs_shadowing_fields[idx]);
vmcs12_write_field(vmcs12, vmcs_shadowing_fields[idx], val64);
}
}
/*
* @pre vcpu != NULL
* @pre VMCS02 (as an ordinary VMCS) is current
*/
static void merge_and_sync_control_fields(struct acrn_vcpu *vcpu, struct acrn_vmcs12 *vmcs12)
{
uint64_t value64;
/* Sync VMCS fields that are not shadowed. These fields don't need to be synced back to VMCS12. */
exec_vmwrite(VMX_MSR_BITMAP_FULL, gpa2hpa(vcpu->vm, vmcs12->msr_bitmap));
exec_vmwrite(VMX_EPT_POINTER_FULL, get_shadow_eptp(vmcs12->ept_pointer));
/* For VM-execution, entry and exit controls */
value64 = vmcs12->vm_entry_controls;
if ((value64 & VMX_ENTRY_CTLS_LOAD_EFER) != VMX_ENTRY_CTLS_LOAD_EFER) {
/*
* The L1 hypervisor did not request "load IA32_EFER" at VM entry, i.e. it
* wishes L2 to run with L1's IA32_EFER, so we turn on VMX_ENTRY_CTLS_LOAD_EFER
* in VMCS02 and load L1's current EFER value explicitly.
*/
value64 |= VMX_ENTRY_CTLS_LOAD_EFER;
exec_vmwrite(VMX_GUEST_IA32_EFER_FULL, vcpu_get_efer(vcpu));
}
exec_vmwrite(VMX_ENTRY_CONTROLS, value64);
/* The host always runs in 64-bit mode */
value64 = vmcs12->vm_exit_controls | VMX_EXIT_CTLS_HOST_ADDR64;
exec_vmwrite(VMX_EXIT_CONTROLS, value64);
exec_vmwrite(VMX_VPID, vmcs12->vpid);
}
/**
* @brief Sync shadow fields from vmcs12 to vmcs02
*
* @pre vcpu != NULL
* @pre vmcs02 is current
*/
static void sync_vmcs12_to_vmcs02(struct acrn_vcpu *vcpu, struct acrn_vmcs12 *vmcs12)
{
uint64_t val64;
uint32_t idx;
for (idx = 0; idx < MAX_SHADOW_VMCS_FIELDS; idx++) {
val64 = vmcs12_read_field(vmcs12, vmcs_shadowing_fields[idx]);
exec_vmwrite(vmcs_shadowing_fields[idx], val64);
}
merge_and_sync_control_fields(vcpu, vmcs12);
}
/*
* @pre vvmcs != NULL
*
* The shadow-VMCS indicator is bit 31 of the first 32-bit word of the VMCS
* region, which otherwise holds the VMCS revision identifier (ISDM 24.2).
*/
static void set_vmcs02_shadow_indicator(struct acrn_vvmcs *vvmcs)
{
/* vmcs02 is shadowing */
*((uint32_t*)vvmcs->vmcs02) |= VMCS_SHADOW_BIT_INDICATOR;
}
/*
* @pre vvmcs != NULL
* @pre vmcs01 is current
*/
static void clear_vmcs02_shadow_indicator(struct acrn_vvmcs *vvmcs)
{
/* vmcs02 is an ordinary VMCS */
*((uint32_t*)vvmcs->vmcs02) &= ~VMCS_SHADOW_BIT_INDICATOR;
}
/*
* @pre vvmcs != NULL
* @pre vmcs01 is current
*/
static void enable_vmcs_shadowing(struct acrn_vvmcs *vvmcs)
{
uint32_t val32;
/*
* This method of using the same bitmap for VMREAD and VMWRITE is not typical.
* Here we assume the L1 hypervisor will not erroneously write to read-only fields.
* TODO: may use a different bitmap to exclude read-only fields from the VMWRITE bitmap.
*/
exec_vmwrite(VMX_VMREAD_BITMAP_FULL, hva2hpa(vmcs_shadowing_bitmap));
exec_vmwrite(VMX_VMWRITE_BITMAP_FULL, hva2hpa(vmcs_shadowing_bitmap));
/* Set VMCS shadowing bit in Secondary Proc Exec Controls */
val32 = exec_vmread(VMX_PROC_VM_EXEC_CONTROLS2);
val32 |= VMX_PROCBASED_CTLS2_VMCS_SHADW;
exec_vmwrite32(VMX_PROC_VM_EXEC_CONTROLS2, val32);
/* Set VMCS Link pointer */
exec_vmwrite(VMX_VMS_LINK_PTR_FULL, hva2hpa(vvmcs->vmcs02));
}
/*
* @pre vmcs01 is current
*/
static void disable_vmcs_shadowing(void)
{
uint32_t val32;
/* clear VMCS shadowing bit in Secondary Proc Exec Controls */
val32 = exec_vmread(VMX_PROC_VM_EXEC_CONTROLS2);
val32 &= ~VMX_PROCBASED_CTLS2_VMCS_SHADW;
exec_vmwrite32(VMX_PROC_VM_EXEC_CONTROLS2, val32);
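/*
* Per the ISDM, software should set the VMCS link pointer to FFFFFFFF_FFFFFFFFH
* when VMCS shadowing is not in use, to avoid VM-entry failures.
*/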
exec_vmwrite(VMX_VMS_LINK_PTR_FULL, ~0UL);
}
/*
* @pre vcpu != NULL
* @pre vmcs01 is current
*/
static void clear_vvmcs(struct acrn_vcpu *vcpu, struct acrn_vvmcs *vvmcs)
{
/*
* At this point VMCS02 is active and being used as a shadow VMCS.
* Disable VMCS shadowing so that VMCS02 cannot be loaded by VMPTRLD
* and referenced by VMCS01 as a shadow VMCS at the same time.
*/
disable_vmcs_shadowing();
/* Flush shadow VMCS to memory */
clear_va_vmcs(vvmcs->vmcs02);
/* VMPTRLD the shadow VMCS so that we are able to sync it to VMCS12 */
load_va_vmcs(vvmcs->vmcs02);
sync_vmcs02_to_vmcs12(&vvmcs->vmcs12);
/* flush cached VMCS12 back to L1 guest */
(void)copy_to_gpa(vcpu->vm, (void *)&vvmcs->vmcs12, vvmcs->vmcs12_gpa, sizeof(struct acrn_vmcs12));
/*
* The current VMCS12 has been flushed out, so the active VMCS02
* needs to be VMCLEARed as well
*/
clear_va_vmcs(vvmcs->vmcs02);
/* This VMCS can no longer refer to any shadow EPT */
put_vept_desc(vvmcs->vmcs12.ept_pointer);
/* This vvmcs[] entry doesn't cache a VMCS12 any more */
vvmcs->vmcs12_gpa = INVALID_GPA;
/* Cleanup per VVMCS dirty flags */
vvmcs->host_state_dirty = false;
vvmcs->control_fields_dirty = false;
}
/*
* @pre vcpu != NULL
*/
int32_t vmptrld_vmexit_handler(struct acrn_vcpu *vcpu)
{
struct acrn_nested *nested = &vcpu->arch.nested;
struct acrn_vvmcs *vvmcs;
uint64_t vmcs12_gpa;
if (check_vmx_permission(vcpu)) {
vmcs12_gpa = get_vmptr_gpa(vcpu);
if (!validate_vmptr_gpa(vmcs12_gpa)) {
nested_vmx_result(VMfailValid, VMXERR_VMPTRLD_INVALID_ADDRESS);
} else if (vmcs12_gpa == nested->vmxon_ptr) {
nested_vmx_result(VMfailValid, VMXERR_VMPTRLD_VMXON_POINTER);
} else if (!validate_vmcs_revision_id(vcpu, vmcs12_gpa)) {
nested_vmx_result(VMfailValid, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
} else if ((nested->current_vvmcs != NULL) && (nested->current_vvmcs->vmcs12_gpa == vmcs12_gpa)) {
/* VMPTRLD current VMCS12, do nothing */
nested_vmx_result(VMsucceed, 0);
} else {
vvmcs = lookup_vvmcs(vcpu, vmcs12_gpa);
if (vvmcs == NULL) {
vvmcs = get_or_replace_vvmcs_entry(vcpu);
/* Create the VMCS02 based on this new VMCS12 */
/*
* Initialize VMCS02: its VMCS revision ID must equal what is reported
* by the physical IA32_VMX_BASIC MSR
*/
(void)memcpy_s(vvmcs->vmcs02, 4U, (void *)&vmx_basic, 4U);
/* VMPTRLD VMCS02 so that we can VMWRITE to it */
load_va_vmcs(vvmcs->vmcs02);
init_host_state();
/* Load VMCS12 from L1 guest memory */
(void)copy_from_gpa(vcpu->vm, (void *)&vvmcs->vmcs12, vmcs12_gpa,
sizeof(struct acrn_vmcs12));
/* if needed, create nept_desc and allocate shadow root for the EPTP */
get_vept_desc(vvmcs->vmcs12.ept_pointer);
/* Need to load shadow fields from this new VMCS12 to VMCS02 */
sync_vmcs12_to_vmcs02(vcpu, &vvmcs->vmcs12);
} else {
vvmcs->ref_cnt += 1U;
}
/* Before VMCS02 is being used as a shadow VMCS, VMCLEAR it */
clear_va_vmcs(vvmcs->vmcs02);
/*
* Now VMCS02 is not active, set the shadow-VMCS indicator.
* At L1 VM entry, VMCS02 will be referenced as a shadow VMCS.
*/
set_vmcs02_shadow_indicator(vvmcs);
/* Switch back to vmcs01 */
load_va_vmcs(vcpu->arch.vmcs);
/* VMCS02 is referenced by VMCS01 Link Pointer */
enable_vmcs_shadowing(vvmcs);
vvmcs->vmcs12_gpa = vmcs12_gpa;
nested->current_vvmcs = vvmcs;
nested_vmx_result(VMsucceed, 0);
}
}
return 0;
}
/*
* @pre vcpu != NULL
*/
int32_t vmclear_vmexit_handler(struct acrn_vcpu *vcpu)
{
struct acrn_nested *nested = &vcpu->arch.nested;
struct acrn_vvmcs *vvmcs;
uint64_t vmcs12_gpa;
if (check_vmx_permission(vcpu)) {
vmcs12_gpa = get_vmptr_gpa(vcpu);
if (!validate_vmptr_gpa(vmcs12_gpa)) {
nested_vmx_result(VMfailValid, VMXERR_VMPTRLD_INVALID_ADDRESS);
} else if (vmcs12_gpa == nested->vmxon_ptr) {
nested_vmx_result(VMfailValid, VMXERR_VMCLEAR_VMXON_POINTER);
} else {
vvmcs = lookup_vvmcs(vcpu, vmcs12_gpa);
if (vvmcs != NULL) {
uint64_t current_vmcs12_gpa = INVALID_GPA;
/* Save for comparison */
if (nested->current_vvmcs) {
current_vmcs12_gpa = nested->current_vvmcs->vmcs12_gpa;
}
/* VMCLEAR an active VMCS12, may or may not be current */
vvmcs->vmcs12.launch_state = VMCS12_LAUNCH_STATE_CLEAR;
clear_vvmcs(vcpu, vvmcs);
/* Switch back to vmcs01 (no VMCS shadowing) */
load_va_vmcs(vcpu->arch.vmcs);
if (current_vmcs12_gpa != INVALID_GPA) {
if (current_vmcs12_gpa == vmcs12_gpa) {
/* VMCLEAR current VMCS12 */
nested->current_vvmcs = NULL;
} else {
/*
* VMCLEAR an active but not current VMCS12.
* VMCS shadowing was cleared earlier in clear_vvmcs()
*/
enable_vmcs_shadowing(nested->current_vvmcs);
}
} else {
/* do nothing if there is no current VMCS12 */
}
} else {
/*
* We need to update the VMCS12 launch state in L1 memory in these two cases:
* - the L1 hypervisor VMCLEARs a VMCS12 that has already been flushed by ACRN to L1 guest memory
* - the L1 hypervisor VMCLEARs a VMCS12 that was never VMPTRLDed.
*/
uint32_t launch_state = VMCS12_LAUNCH_STATE_CLEAR;
(void)copy_to_gpa(vcpu->vm, &launch_state, vmcs12_gpa +
offsetof(struct acrn_vmcs12, launch_state), sizeof(launch_state));
}
nested_vmx_result(VMsucceed, 0);
}
}
return 0;
}
/*
* @pre vcpu != NULL
*/
bool is_vcpu_in_l2_guest(struct acrn_vcpu *vcpu)
{
return vcpu->arch.nested.in_l2_guest;
}
/*
* @pre seg != NULL
*/
static void set_segment(struct segment_sel *seg, uint16_t sel, uint64_t b, uint32_t l, uint32_t a)
{
seg->selector = sel;
seg->base = b;
seg->limit = l;
seg->attr = a;
}
/*
* @pre vcpu != NULL
* @pre vmcs01 is current
*/
static void set_vmcs01_guest_state(struct acrn_vcpu *vcpu)
{
/*
* Host-state fields are not shadowed, so every VMWRITE to these fields
* is saved in vmcs12.
*
* Load host state from vmcs12 to vmcs01 guest state before entering
* L1 to emulate VMExit from L2 to L1.
*
* We assume the L1 hypervisor only changes these host-state fields at run time.
*
* Section 27.5 Loading Host State
* 1. Load Control Registers, Debug Registers, MSRs
* 2. Load RSP/RIP/RFLAGS
* 3. Load Segmentation State
* 4. Non-Register state
*/
struct acrn_vmcs12 *vmcs12 = &vcpu->arch.nested.current_vvmcs->vmcs12;
struct segment_sel seg;
if (vcpu->arch.nested.current_vvmcs->host_state_dirty == true) {
vcpu->arch.nested.current_vvmcs->host_state_dirty = false;
/*
* We want vcpu_get_cr0/4() to return the up-to-date values, but we don't
* want to call vcpu_set_cr0/4() to handle the CR0/CR4 write.
*/
exec_vmwrite(VMX_GUEST_CR0, vmcs12->host_cr0);
exec_vmwrite(VMX_GUEST_CR4, vmcs12->host_cr4);
bitmap_clear_nolock(CPU_REG_CR0, &vcpu->reg_cached);
bitmap_clear_nolock(CPU_REG_CR4, &vcpu->reg_cached);
exec_vmwrite(VMX_GUEST_CR3, vmcs12->host_cr3);
exec_vmwrite(VMX_GUEST_DR7, DR7_INIT_VALUE);
exec_vmwrite64(VMX_GUEST_IA32_DEBUGCTL_FULL, 0UL);
exec_vmwrite32(VMX_GUEST_IA32_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
exec_vmwrite(VMX_GUEST_IA32_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
exec_vmwrite(VMX_GUEST_IA32_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
exec_vmwrite(VMX_GUEST_IA32_EFER_FULL, vmcs12->host_ia32_efer);
/*
* type: 11 (Execute/Read, accessed)
* l: 64-bit mode active
*/
set_segment(&seg, vmcs12->host_cs, 0UL, 0xFFFFFFFFU, 0xa09bU);
load_segment(seg, VMX_GUEST_CS);
/*
* type: 3 (Read/Write, accessed)
* D/B: 1 (32-bit segment)
*/
set_segment(&seg, vmcs12->host_ds, 0UL, 0xFFFFFFFFU, 0xc093);
load_segment(seg, VMX_GUEST_DS);
seg.selector = vmcs12->host_ss;
load_segment(seg, VMX_GUEST_SS);
seg.selector = vmcs12->host_es;
load_segment(seg, VMX_GUEST_ES);
seg.selector = vmcs12->host_fs;
seg.base = vmcs12->host_fs_base;
load_segment(seg, VMX_GUEST_FS);
seg.selector = vmcs12->host_gs;
seg.base = vmcs12->host_gs_base;
load_segment(seg, VMX_GUEST_GS);
/*
* ISDM 27.5.2: segment limit for TR is set to 67H
* Type set to 11 and S set to 0 (busy 32-bit task-state segment).
*/
set_segment(&seg, vmcs12->host_tr, vmcs12->host_tr_base, 0x67U, TR_AR);
load_segment(seg, VMX_GUEST_TR);
/*
* ISDM 27.5.2: LDTR is established as follows on all VM exits:
* the selector is cleared to 0000H, the segment is marked unusable
* and is otherwise undefined (although the base address is always canonical).
*/
exec_vmwrite16(VMX_GUEST_LDTR_SEL, 0U);
exec_vmwrite32(VMX_GUEST_LDTR_ATTR, 0x10000U);
}
/*
* Registers that are managed by the vcpu->reg_updated flag need to be
* written with vcpu_set_xxx() so that vcpu_get_xxx() returns the
* correct values.
*/
vcpu_set_rip(vcpu, vmcs12->host_rip);
vcpu_set_rsp(vcpu, vmcs12->host_rsp);
vcpu_set_rflags(vcpu, 0x2U);
}
/**
* @pre vmcs12 != NULL
*/
static void sanitize_l2_vpid(struct acrn_vmcs12 *vmcs12)
{
/* Flush the VPID if the L2 VPID could conflict with any L1 VPID */
if (vmcs12->vpid >= ALLOCATED_MIN_L1_VPID) {
flush_vpid_single(vmcs12->vpid);
}
}
/**
* @brief handler for all VMEXITs from nested guests
*
* @pre vcpu != NULL
* @pre VMCS02 (as an ordinary VMCS) is current
*/
int32_t nested_vmexit_handler(struct acrn_vcpu *vcpu)
{
struct acrn_vvmcs *cur_vvmcs = vcpu->arch.nested.current_vvmcs;
bool is_l1_vmexit = true;
if ((vcpu->arch.exit_reason & 0xFFFFU) == VMX_EXIT_REASON_EPT_VIOLATION) {
is_l1_vmexit = handle_l2_ept_violation(vcpu);
}
if (is_l1_vmexit) {
sanitize_l2_vpid(&cur_vvmcs->vmcs12);
/*
* Clear VMCS02 because: ISDM: Before modifying the shadow-VMCS indicator,
* software should execute VMCLEAR for the VMCS to ensure that it is not active.
*/
clear_va_vmcs(cur_vvmcs->vmcs02);
set_vmcs02_shadow_indicator(cur_vvmcs);
/* Switch to VMCS01, and VMCS02 is referenced as a shadow VMCS */
load_va_vmcs(vcpu->arch.vmcs);
/* Load host state from VMCS12 host area to Guest state of VMCS01 */
set_vmcs01_guest_state(vcpu);
/* vCPU is NOT in guest mode from this point */
vcpu->arch.nested.in_l2_guest = false;
}
/*
* For VM exits that are reflected to the L1 hypervisor, ACRN can't advance the
* guest RIP; that decision is up to the L1 hypervisor.
*
* The only case that doesn't need to be reflected is an EPT violation that can be
* completely handled by ACRN, which requires the L2 VM to re-execute the instruction
* after the shadow EPT has been properly set up.
* In either case, vcpu->arch.inst_len needs to be set to zero.
*/
vcpu_retain_rip(vcpu);
return 0;
}
/*
* @pre vcpu != NULL
* @pre VMCS01 is current and VMCS02 is referenced by VMCS Link Pointer
*/
static void nested_vmentry(struct acrn_vcpu *vcpu, bool is_launch)
{
struct acrn_vvmcs *cur_vvmcs = vcpu->arch.nested.current_vvmcs;
struct acrn_vmcs12 *vmcs12 = &cur_vvmcs->vmcs12;
if ((cur_vvmcs == NULL) || (cur_vvmcs->vmcs12_gpa == INVALID_GPA)) {
nested_vmx_result(VMfailInvalid, 0);
} else if (is_launch && (vmcs12->launch_state != VMCS12_LAUNCH_STATE_CLEAR)) {
nested_vmx_result(VMfailValid, VMXERR_VMLAUNCH_NONCLEAR_VMCS);
} else if (!is_launch && (vmcs12->launch_state != VMCS12_LAUNCH_STATE_LAUNCHED)) {
nested_vmx_result(VMfailValid, VMXERR_VMRESUME_NONLAUNCHED_VMCS);
} else {
/*
* TODO: Need to do VM-Entry checks before L2 VM entry.
* Refer to ISDM Vol3 VMX Instructions reference.
*/
/*
* Convert the shadow VMCS to an ordinary VMCS.
* ISDM: Software should not modify the shadow-VMCS indicator in
* the VMCS region of a VMCS that is active
*/
clear_va_vmcs(cur_vvmcs->vmcs02);
clear_vmcs02_shadow_indicator(cur_vvmcs);
/* as an ordinary VMCS, VMCS02 is active and current when the L2 guest is running */
load_va_vmcs(cur_vvmcs->vmcs02);
if (cur_vvmcs->control_fields_dirty) {
cur_vvmcs->control_fields_dirty = false;
merge_and_sync_control_fields(vcpu, vmcs12);
}
/* vCPU is in guest mode from this point */
vcpu->arch.nested.in_l2_guest = true;
if (is_launch) {
vmcs12->launch_state = VMCS12_LAUNCH_STATE_LAUNCHED;
}
sanitize_l2_vpid(vmcs12);
/*
* set vcpu->launched to false because the launch state of VMCS02 is
* clear at this moment, even for VMRESUME
*/
vcpu->launched = false;
}
}
/*
* @pre vcpu != NULL
*/
int32_t vmresume_vmexit_handler(struct acrn_vcpu *vcpu)
{
if (check_vmx_permission(vcpu)) {
nested_vmentry(vcpu, false);
}
return 0;
}
/*
* @pre vcpu != NULL
*/
int32_t vmlaunch_vmexit_handler(struct acrn_vcpu *vcpu)
{
if (check_vmx_permission(vcpu)) {
nested_vmentry(vcpu, true);
}
return 0;
}
/*
* @pre vcpu != NULL
* @pre desc != NULL
*/
int64_t get_invvpid_ept_operands(struct acrn_vcpu *vcpu, void *desc, size_t size)
{
const uint32_t info = exec_vmread(VMX_INSTR_INFO);
uint64_t gpa;
gpa = get_vmx_memory_operand(vcpu, info);
(void)copy_from_gpa(vcpu->vm, desc, gpa, size);
return vcpu_get_gpreg(vcpu, VMX_II_REG2(info));
}
/*
* @pre vcpu != NULL
*/
static bool validate_canonical_addr(struct acrn_vcpu *vcpu, uint64_t va)
{
uint32_t addr_width = 48U; /* linear address width */
uint64_t msb_mask;
if (vcpu_get_cr4(vcpu) & CR4_LA57) {
addr_width = 57U;
}
/*
* In 64-bit mode, an address is considered to be in canonical form if address
* bits 63 through to the most-significant implemented bit by the microarchitecture
* are set to either all ones or all zeros.
*/
msb_mask = ~((1UL << addr_width) - 1UL);
return ((msb_mask & va) == 0UL) || ((msb_mask & va) == msb_mask);
}
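/*
* Examples with a 48-bit linear address width (msb_mask = 0xFFFF000000000000):
* 0x00007F0000001000 and 0xFFFF800000001000 are accepted (upper bits all zeros
* or all ones respectively), while 0x0010000000000000 is rejected because its
* upper bits are neither all zeros nor all ones.
*/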
/*
* @pre vcpu != NULL
*/
int32_t invvpid_vmexit_handler(struct acrn_vcpu *vcpu)
{
uint32_t supported_types = (vcpu_get_guest_msr(vcpu, MSR_IA32_VMX_EPT_VPID_CAP) >> 40U) & 0xfU;
struct invvpid_operand desc;
uint64_t type;
if (check_vmx_permission(vcpu)) {
type = get_invvpid_ept_operands(vcpu, (void *)&desc, sizeof(desc));
if ((type > VMX_VPID_TYPE_SINGLE_NON_GLOBAL) || ((supported_types & (1U << type)) == 0)) {
nested_vmx_result(VMfailValid, VMXERR_INVEPT_INVVPID_INVALID_OPERAND);
} else if ((desc.rsvd1 != 0U) || (desc.rsvd2 != 0U)) {
nested_vmx_result(VMfailValid, VMXERR_INVEPT_INVVPID_INVALID_OPERAND);
} else if ((type != VMX_VPID_TYPE_ALL_CONTEXT) && (desc.vpid == 0U)) {
/* check VPID for type 0, 1, 3 */
nested_vmx_result(VMfailValid, VMXERR_INVEPT_INVVPID_INVALID_OPERAND);
} else if ((type == VMX_VPID_TYPE_INDIVIDUAL_ADDR) && !validate_canonical_addr(vcpu, desc.gva)) {
nested_vmx_result(VMfailValid, VMXERR_INVEPT_INVVPID_INVALID_OPERAND);
} else {
/*
* VPIDs are pass-thru. Values programmed by L1 are used by L0.
* The INVVPID type, VPID and GLA operands of the INVVPID instruction are
* passed as-is to the pCPU.
*/
asm_invvpid(desc, type);
nested_vmx_result(VMsucceed, 0);
}
}
return 0;
}
void init_nested_vmx(__unused struct acrn_vm *vm)
{
static bool initialized = false;
if (!initialized) {
initialized = true;
/* Cache the value of physical MSR_IA32_VMX_BASIC */
vmx_basic = (uint32_t)msr_read(MSR_IA32_VMX_BASIC);
setup_vmcs_shadowing_bitmap();
}
}