/*
 * Copyright (C) 2021 Intel Corporation.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include <types.h>
#include <logmsg.h>
#include <asm/mmu.h>
#include <asm/guest/virq.h>
#include <asm/guest/ept.h>
#include <asm/guest/vcpu.h>
#include <asm/guest/vm.h>
#include <asm/guest/vmcs.h>
#include <asm/guest/nested.h>
#include <asm/guest/vept.h>

/* Cache the content of MSR_IA32_VMX_BASIC */
static uint32_t vmx_basic;

static void disable_vmcs_shadowing(void);
static void clear_vvmcs(struct acrn_vcpu *vcpu, struct acrn_vvmcs *vvmcs);

/* The only purpose of this array is to serve the is_vmx_msr() function */
static const uint32_t vmx_msrs[NUM_VMX_MSRS] = {
	LIST_OF_VMX_MSRS
};

bool is_vmx_msr(uint32_t msr)
{
	bool found = false;
	uint32_t i;

	for (i = 0U; i < NUM_VMX_MSRS; i++) {
		if (msr == vmx_msrs[i]) {
			found = true;
			break;
		}
	}

	return found;
}

static uint64_t adjust_vmx_ctrls(uint32_t msr, uint64_t request_bits)
{
	union value_64 val64, msr_val;

	/*
	 * ISDM Appendix A.3, A.4, A.5:
	 * - Bits 31:0 indicate the allowed 0-settings of these controls.
	 *   Bit X of the corresponding VM-execution controls field is allowed to be 0
	 *   if bit X in the MSR is cleared to 0.
	 * - Bits 63:32 indicate the allowed 1-settings of these controls.
	 *   VM entry allows control X to be 1 if bit 32+X in the MSR is set to 1.
	 */
	msr_val.full = msr_read(msr);

	/*
	 * The reserved bits in VMCS Control fields could be 0 or 1, determined by the
	 * corresponding capability MSR, so they need to be read from the physical MSR.
	 *
	 * We consider the bits that are set in the allowed 0-settings group as the
	 * minimal set of bits that need to be set from the physical processor's perspective.
	 * Since we shadow this control field, we passthru the allowed 0-settings bits.
	 */
	val64.u.lo_32 = msr_val.u.lo_32;

	/* allowed 1-settings include those bits that are NOT allowed to be 0 */
	val64.u.hi_32 = msr_val.u.lo_32;

	/* make sure the requested features are supported by hardware */
	val64.u.hi_32 |= (msr_val.u.hi_32 & request_bits);

	return val64.full;
}
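
/*
 * Illustrative sketch (hypothetical values, not part of the emulation logic):
 * suppose the physical MSR_IA32_VMX_PINBASED_CTLS reads 0x0000007F00000016.
 *   - allowed 0-settings (bits 31:0)  = 0x00000016: these bits must be 1, so
 *     both halves of the emulated MSR report them as set.
 *   - allowed 1-settings (bits 63:32) = 0x0000007F: of these, only the bits
 *     that are also requested are exposed to L1.
 * With request_bits = 0x49, adjust_vmx_ctrls() would return
 *   lo_32 = 0x16, hi_32 = 0x16 | (0x7F & 0x49) = 0x5F,
 * i.e. the emulated MSR value 0x0000005F00000016.
 */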

/*
 * @pre vcpu != NULL
 */
void init_vmx_msrs(struct acrn_vcpu *vcpu)
{
	union value_64 val64;
	uint64_t request_bits, msr_value;

	if (is_nvmx_configured(vcpu->vm)) {
		/* MSR_IA32_VMX_BASIC */
		val64.full = VMCS12_REVISION_ID	/* Bits 30:0 - VMCS revision ID */
			| (4096UL << 32U)	/* Bits 44:32 - size of VMXON region and VMCS region */
			| (6UL << 50U)		/* Bits 53:50 - memory type for VMCS etc. (6: Write Back) */
			| (1UL << 54U)		/* Bit 54: VM-exit instruction-information for INS and OUTS */
			| (1UL << 55U);		/* Bit 55: VMX controls that default to 1 may be cleared to 0 */
		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_BASIC, val64.full);

		/* MSR_IA32_VMX_MISC */

		/*
		 * Some bits need to be read from the physical MSR. For example, bits 4:0 report the
		 * relationship between the rate of the VMX-preemption timer and that of the timestamp
		 * counter (TSC).
		 */
		val64.full = msr_read(MSR_IA32_VMX_MISC);
		val64.u.hi_32 = 0U;

		/* Don't support Intel® Processor Trace (Intel PT) in VMX operation */
		val64.u.lo_32 &= ~(1U << 14U);

		/* Don't support SMM in VMX operation */
		val64.u.lo_32 &= ~((1U << 15U) | (1U << 28U));

		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_MISC, val64.full);

		/*
		 * TODO: These emulated VMX Control MSRs work for Tiger Lake and Kaby Lake;
		 * they may have problems if run on other platforms.
		 *
		 * We haven't yet put in the effort to enable as many features as possible.
		 */

		/* MSR_IA32_VMX_PINBASED_CTLS */
		request_bits = VMX_PINBASED_CTLS_IRQ_EXIT
			| VMX_PINBASED_CTLS_NMI_EXIT
			| VMX_PINBASED_CTLS_ENABLE_PTMR;
		msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_PINBASED_CTLS, request_bits);
		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_PINBASED_CTLS, msr_value);
		msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_TRUE_PINBASED_CTLS, request_bits);
		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_TRUE_PINBASED_CTLS, msr_value);

		/* MSR_IA32_VMX_PROCBASED_CTLS */
		request_bits = VMX_PROCBASED_CTLS_IRQ_WIN | VMX_PROCBASED_CTLS_TSC_OFF
			| VMX_PROCBASED_CTLS_HLT | VMX_PROCBASED_CTLS_INVLPG
			| VMX_PROCBASED_CTLS_MWAIT | VMX_PROCBASED_CTLS_RDPMC
			| VMX_PROCBASED_CTLS_RDTSC | VMX_PROCBASED_CTLS_CR3_LOAD
			| VMX_PROCBASED_CTLS_CR3_STORE | VMX_PROCBASED_CTLS_CR8_LOAD
			| VMX_PROCBASED_CTLS_CR8_STORE | VMX_PROCBASED_CTLS_NMI_WINEXIT
			| VMX_PROCBASED_CTLS_MOV_DR | VMX_PROCBASED_CTLS_UNCOND_IO
			| VMX_PROCBASED_CTLS_MSR_BITMAP | VMX_PROCBASED_CTLS_MONITOR
			| VMX_PROCBASED_CTLS_PAUSE | VMX_PROCBASED_CTLS_SECONDARY;
		msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_PROCBASED_CTLS, request_bits);
		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_PROCBASED_CTLS, msr_value);
		msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_TRUE_PROCBASED_CTLS, request_bits);
		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_TRUE_PROCBASED_CTLS, msr_value);

		/* MSR_IA32_VMX_PROCBASED_CTLS2 */
		request_bits = VMX_PROCBASED_CTLS2_EPT | VMX_PROCBASED_CTLS2_RDTSCP
			| VMX_PROCBASED_CTLS2_VPID | VMX_PROCBASED_CTLS2_WBINVD
			| VMX_PROCBASED_CTLS2_UNRESTRICT | VMX_PROCBASED_CTLS2_PAUSE_LOOP
			| VMX_PROCBASED_CTLS2_RDRAND | VMX_PROCBASED_CTLS2_INVPCID
			| VMX_PROCBASED_CTLS2_RDSEED | VMX_PROCBASED_CTLS2_XSVE_XRSTR
			| VMX_PROCBASED_CTLS2_TSC_SCALING;
		msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_PROCBASED_CTLS2, request_bits);
		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_PROCBASED_CTLS2, msr_value);

		/* MSR_IA32_VMX_EXIT_CTLS */
		request_bits = VMX_EXIT_CTLS_SAVE_DBG | VMX_EXIT_CTLS_HOST_ADDR64
			| VMX_EXIT_CTLS_ACK_IRQ | VMX_EXIT_CTLS_LOAD_PAT
			| VMX_EXIT_CTLS_LOAD_EFER;
		msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_EXIT_CTLS, request_bits);
		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_EXIT_CTLS, msr_value);
		msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_TRUE_EXIT_CTLS, request_bits);
		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_TRUE_EXIT_CTLS, msr_value);

		/* MSR_IA32_VMX_ENTRY_CTLS */
		request_bits = VMX_ENTRY_CTLS_LOAD_DBG | VMX_ENTRY_CTLS_IA32E_MODE
			| VMX_ENTRY_CTLS_LOAD_PERF | VMX_ENTRY_CTLS_LOAD_PAT
			| VMX_ENTRY_CTLS_LOAD_EFER;
		msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_ENTRY_CTLS, request_bits);
		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_ENTRY_CTLS, msr_value);
		msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_TRUE_ENTRY_CTLS, request_bits);
		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_TRUE_ENTRY_CTLS, msr_value);

		msr_value = msr_read(MSR_IA32_VMX_EPT_VPID_CAP);
		/*
		 * Hide 5 level EPT capability
		 * Hide accessed and dirty flags for EPT
		 */
		msr_value &= ~(VMX_EPT_PAGE_WALK_5 | VMX_EPT_AD | VMX_EPT_2MB_PAGE | VMX_EPT_1GB_PAGE);
		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_EPT_VPID_CAP, msr_value);

		/* For now passthru the value from physical MSR to L1 guest */
		msr_value = msr_read(MSR_IA32_VMX_CR0_FIXED0);
		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_CR0_FIXED0, msr_value);

		msr_value = msr_read(MSR_IA32_VMX_CR0_FIXED1);
		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_CR0_FIXED1, msr_value);

		msr_value = msr_read(MSR_IA32_VMX_CR4_FIXED0);
		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_CR4_FIXED0, msr_value);

		msr_value = msr_read(MSR_IA32_VMX_CR4_FIXED1);
		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_CR4_FIXED1, msr_value);

		msr_value = msr_read(MSR_IA32_VMX_VMCS_ENUM);
		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_VMCS_ENUM, msr_value);
	}
}

/*
 * @pre vcpu != NULL
 */
int32_t read_vmx_msr(struct acrn_vcpu *vcpu, uint32_t msr, uint64_t *val)
{
	uint64_t v = 0UL;
	int32_t err = 0;

	if (is_nvmx_configured(vcpu->vm)) {
		switch (msr) {
		case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
		case MSR_IA32_VMX_PINBASED_CTLS:
		case MSR_IA32_VMX_PROCBASED_CTLS:
		case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
		case MSR_IA32_VMX_PROCBASED_CTLS2:
		case MSR_IA32_VMX_EXIT_CTLS:
		case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		case MSR_IA32_VMX_ENTRY_CTLS:
		case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		case MSR_IA32_VMX_BASIC:
		case MSR_IA32_VMX_MISC:
		case MSR_IA32_VMX_EPT_VPID_CAP:
		case MSR_IA32_VMX_CR0_FIXED0:
		case MSR_IA32_VMX_CR0_FIXED1:
		case MSR_IA32_VMX_CR4_FIXED0:
		case MSR_IA32_VMX_CR4_FIXED1:
		case MSR_IA32_VMX_VMCS_ENUM:
		{
			v = vcpu_get_guest_msr(vcpu, msr);
			break;
		}
		/* Don't support these MSRs yet */
		case MSR_IA32_SMBASE:
		case MSR_IA32_VMX_PROCBASED_CTLS3:
		case MSR_IA32_VMX_VMFUNC:
		default:
			err = -EACCES;
			break;
		}
	} else {
		err = -EACCES;
	}

	*val = v;
	return err;
}

#define MAX_SHADOW_VMCS_FIELDS 113U
/*
 * VMCS fields included in the dual-purpose VMCS: as shadow for L1 and
 * as hardware VMCS for the nested guest (L2).
 *
 * TODO: This list is for TGL and CFL machines and the fields
 * for advanced APICv features such as Posted Interrupt and Virtual
 * Interrupt Delivery are not included, as these are not available
 * on those platforms.
 *
 * Certain fields, e.g. VMX_TSC_MULTIPLIER_FULL, are available only if
 * "use TSC scaling" is supported. Thus a static array may not work
 * for all platforms.
 */
static const uint32_t vmcs_shadowing_fields[MAX_SHADOW_VMCS_FIELDS] = {
	/* 16-bits */
	VMX_GUEST_ES_SEL,
	VMX_GUEST_CS_SEL,
	VMX_GUEST_SS_SEL,
	VMX_GUEST_DS_SEL,
	VMX_GUEST_FS_SEL,
	VMX_GUEST_GS_SEL,
	VMX_GUEST_LDTR_SEL,
	VMX_GUEST_TR_SEL,
	VMX_GUEST_PML_INDEX,

	/* 64-bits */
	VMX_IO_BITMAP_A_FULL,
	VMX_IO_BITMAP_B_FULL,
	VMX_EXIT_MSR_STORE_ADDR_FULL,
	VMX_EXIT_MSR_LOAD_ADDR_FULL,
	VMX_ENTRY_MSR_LOAD_ADDR_FULL,
	VMX_EXECUTIVE_VMCS_PTR_FULL,
	VMX_TSC_OFFSET_FULL,
	VMX_VIRTUAL_APIC_PAGE_ADDR_FULL,
	VMX_APIC_ACCESS_ADDR_FULL,
	VMX_VMREAD_BITMAP_FULL,
	VMX_VMWRITE_BITMAP_FULL,
	VMX_XSS_EXITING_BITMAP_FULL,
	VMX_TSC_MULTIPLIER_FULL,
	VMX_GUEST_PHYSICAL_ADDR_FULL,
	VMX_VMS_LINK_PTR_FULL,
	VMX_GUEST_IA32_DEBUGCTL_FULL,
	VMX_GUEST_IA32_PAT_FULL,
	VMX_GUEST_IA32_EFER_FULL,
	VMX_GUEST_IA32_PERF_CTL_FULL,
	VMX_GUEST_PDPTE0_FULL,
	VMX_GUEST_PDPTE1_FULL,
	VMX_GUEST_PDPTE2_FULL,
	VMX_GUEST_PDPTE3_FULL,

	/* 32-bits */
	VMX_PIN_VM_EXEC_CONTROLS,
	VMX_PROC_VM_EXEC_CONTROLS,
	VMX_EXCEPTION_BITMAP,
	VMX_PF_ERROR_CODE_MASK,
	VMX_PF_ERROR_CODE_MATCH,
	VMX_CR3_TARGET_COUNT,
	VMX_EXIT_MSR_STORE_COUNT,
	VMX_EXIT_MSR_LOAD_COUNT,
	VMX_ENTRY_MSR_LOAD_COUNT,
	VMX_ENTRY_INT_INFO_FIELD,
	VMX_ENTRY_EXCEPTION_ERROR_CODE,
	VMX_ENTRY_INSTR_LENGTH,
	VMX_TPR_THRESHOLD,
	VMX_PROC_VM_EXEC_CONTROLS2,
	VMX_PLE_GAP,
	VMX_PLE_WINDOW,
	VMX_INSTR_ERROR,
	VMX_EXIT_REASON,
	VMX_EXIT_INT_INFO,
	VMX_EXIT_INT_ERROR_CODE,
	VMX_IDT_VEC_INFO_FIELD,
	VMX_IDT_VEC_ERROR_CODE,
	VMX_EXIT_INSTR_LEN,
	VMX_INSTR_INFO,
	VMX_GUEST_ES_LIMIT,
	VMX_GUEST_CS_LIMIT,
	VMX_GUEST_SS_LIMIT,
	VMX_GUEST_DS_LIMIT,
	VMX_GUEST_FS_LIMIT,
	VMX_GUEST_GS_LIMIT,
	VMX_GUEST_LDTR_LIMIT,
	VMX_GUEST_TR_LIMIT,
	VMX_GUEST_GDTR_LIMIT,
	VMX_GUEST_IDTR_LIMIT,
	VMX_GUEST_ES_ATTR,
	VMX_GUEST_CS_ATTR,
	VMX_GUEST_SS_ATTR,
	VMX_GUEST_DS_ATTR,
	VMX_GUEST_FS_ATTR,
	VMX_GUEST_GS_ATTR,
	VMX_GUEST_LDTR_ATTR,
	VMX_GUEST_TR_ATTR,
	VMX_GUEST_INTERRUPTIBILITY_INFO,
	VMX_GUEST_ACTIVITY_STATE,
	VMX_GUEST_SMBASE,
	VMX_GUEST_IA32_SYSENTER_CS,
	VMX_GUEST_TIMER,
	VMX_CR0_GUEST_HOST_MASK,
	VMX_CR4_GUEST_HOST_MASK,
	VMX_CR0_READ_SHADOW,
	VMX_CR4_READ_SHADOW,
	VMX_CR3_TARGET_0,
	VMX_CR3_TARGET_1,
	VMX_CR3_TARGET_2,
	VMX_CR3_TARGET_3,
	VMX_EXIT_QUALIFICATION,
	VMX_IO_RCX,
	VMX_IO_RSI,
	VMX_IO_RDI,
	VMX_IO_RIP,
	VMX_GUEST_LINEAR_ADDR,
	VMX_GUEST_CR0,
	VMX_GUEST_CR3,
	VMX_GUEST_CR4,
	VMX_GUEST_ES_BASE,
	VMX_GUEST_CS_BASE,
	VMX_GUEST_SS_BASE,
	VMX_GUEST_DS_BASE,
	VMX_GUEST_FS_BASE,
	VMX_GUEST_GS_BASE,
	VMX_GUEST_LDTR_BASE,
	VMX_GUEST_TR_BASE,
	VMX_GUEST_GDTR_BASE,
	VMX_GUEST_IDTR_BASE,
	VMX_GUEST_DR7,
	VMX_GUEST_RSP,
	VMX_GUEST_RIP,
	VMX_GUEST_RFLAGS,
	VMX_GUEST_PENDING_DEBUG_EXCEPT,
	VMX_GUEST_IA32_SYSENTER_ESP,
	VMX_GUEST_IA32_SYSENTER_EIP
};

/* to be shared by all vCPUs for all nested guests */
static uint64_t vmcs_shadowing_bitmap[PAGE_SIZE / sizeof(uint64_t)] __aligned(PAGE_SIZE);

static void setup_vmcs_shadowing_bitmap(void)
{
	uint16_t field_index;
	uint32_t array_index;
	uint16_t bit_pos;

	/*
	 * Set all the bits to 1s first, then clear the bits for the
	 * fields that ACRN lets its guest access in the shadow VMCS.
	 */
	memset((void *)vmcs_shadowing_bitmap, 0xFFU, PAGE_SIZE);

	/*
	 * Refer to ISDM Section 24.6.15 VMCS Shadowing Bitmap Addresses
	 * and Section 30.3 VMX Instructions - VMWRITE/VMREAD
	 */
	for (field_index = 0U; field_index < MAX_SHADOW_VMCS_FIELDS; field_index++) {
		bit_pos = vmcs_shadowing_fields[field_index] % 64U;
		array_index = vmcs_shadowing_fields[field_index] / 64U;
		bitmap_clear_nolock(bit_pos, &vmcs_shadowing_bitmap[array_index]);
	}
}
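
/*
 * Illustrative sketch of the bitmap indexing above (an assumed example, not
 * required by the code): for a field encoding such as 0x681E (VMX_GUEST_RIP),
 *   bit_pos     = 0x681E % 64 = 30
 *   array_index = 0x681E / 64 = 0x1A0
 * so clearing bit 30 of vmcs_shadowing_bitmap[0x1A0] lets L1 VMREAD/VMWRITE
 * that field directly in the shadow VMCS without causing a VM exit.
 */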

/*
 * This is an array of offsets into a structure of type "struct acrn_vmcs12":
 * 16 offsets for a total of 16 GROUPs, 4 "field widths" by 4 "field types".
 * "Field type" is either Control, Read-Only Data, Guest State or Host State.
 * Refer to the definition of "struct acrn_vmcs12" on how the fields are
 * grouped together for these offsets to work in tandem.
 * Refer to Intel SDM Appendix B Field Encoding in VMCS for info on how
 * fields are grouped and indexed within a group.
 */
static const uint16_t vmcs12_group_offset_table[16] = {
	offsetof(struct acrn_vmcs12, vpid),			/* 16-bit Control Fields */
	offsetof(struct acrn_vmcs12, padding),			/* 16-bit Read-Only Fields */
	offsetof(struct acrn_vmcs12, guest_es),			/* 16-bit Guest-State Fields */
	offsetof(struct acrn_vmcs12, host_es),			/* 16-bit Host-State Fields */
	offsetof(struct acrn_vmcs12, io_bitmap_a),		/* 64-bit Control Fields */
	offsetof(struct acrn_vmcs12, guest_phys_addr),		/* 64-bit Read-Only Data Fields */
	offsetof(struct acrn_vmcs12, vmcs_link_ptr),		/* 64-bit Guest-State Fields */
	offsetof(struct acrn_vmcs12, host_ia32_pat),		/* 64-bit Host-State Fields */
	offsetof(struct acrn_vmcs12, pin_based_exec_ctrl),	/* 32-bit Control Fields */
	offsetof(struct acrn_vmcs12, vm_instr_error),		/* 32-bit Read-Only Data Fields */
	offsetof(struct acrn_vmcs12, guest_es_limit),		/* 32-bit Guest-State Fields */
	offsetof(struct acrn_vmcs12, host_ia32_sysenter_cs),	/* 32-bit Host-State Fields */
	offsetof(struct acrn_vmcs12, cr0_guest_host_mask),	/* Natural-width Control Fields */
	offsetof(struct acrn_vmcs12, exit_qual),		/* Natural-width Read-Only Data Fields */
	offsetof(struct acrn_vmcs12, guest_cr0),		/* Natural-width Guest-State Fields */
	offsetof(struct acrn_vmcs12, host_cr0),			/* Natural-width Host-State Fields */
};

/*
 * field_idx is the index of the field within the group.
 *
 * Access-type is 0 for all widths except for 64-bit.
 * For 64-bit, if Access-type is 1, the offset is moved to the
 * high 4 bytes of the field.
 */
#define OFFSET_INTO_VMCS12(group_idx, field_idx, width_in_bytes, access_type) \
	(vmcs12_group_offset_table[group_idx] + \
	field_idx * width_in_bytes + \
	access_type * sizeof(uint32_t))

/* Given a vmcs field, this API returns the offset into "struct acrn_vmcs12" */
static uint16_t vmcs_field_to_vmcs12_offset(uint32_t vmcs_field)
{
	/*
	 * Refer to Appendix B Field Encoding in VMCS in SDM.
	 * A value of group index 0001b is not valid because there are no 16-bit
	 * Read-Only fields.
	 *
	 * TODO: check invalid VMCS field
	 */
	uint16_t group_idx = (VMX_VMCS_FIELD_WIDTH(vmcs_field) << 2U) | VMX_VMCS_FIELD_TYPE(vmcs_field);
	uint8_t field_width = VMX_VMCS_FIELD_WIDTH(vmcs_field);
	uint8_t width_in_bytes;

	if (field_width == VMX_VMCS_FIELD_WIDTH_16) {
		width_in_bytes = 2U;
	} else if (field_width == VMX_VMCS_FIELD_WIDTH_32) {
		width_in_bytes = 4U;
	} else {
		/*
		 * Natural-width or 64-bit
		 */
		width_in_bytes = 8U;
	}

	return OFFSET_INTO_VMCS12(group_idx,
		VMX_VMCS_FIELD_INDEX(vmcs_field), width_in_bytes, /* field index within the group */
		VMX_VMCS_FIELD_ACCESS_HIGH(vmcs_field));
}
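
/*
 * Illustrative sketch of the lookup, assuming the ISDM Appendix B layout:
 * for encoding 0x440C (VM-exit instruction length), width = 2 (32-bit),
 * type = 1 (read-only data), index = 6 and access-type = 0, so
 *   group_idx = (2 << 2) | 1 = 9  ->  offsetof(struct acrn_vmcs12, vm_instr_error)
 *   offset    = group offset + 6 * 4 + 0
 * which lands on the seventh 32-bit read-only data field cached in acrn_vmcs12.
 */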

/*
 * Given a vmcs field and the pointer to the vmcs12, this API returns the
 * corresponding value from the VMCS
 */
static uint64_t vmcs12_read_field(void *vmcs_hva, uint32_t field)
{
	uint64_t *ptr = (uint64_t *)(vmcs_hva + vmcs_field_to_vmcs12_offset(field));
	uint64_t val64 = 0UL;

	switch (VMX_VMCS_FIELD_WIDTH(field)) {
	case VMX_VMCS_FIELD_WIDTH_16:
		val64 = *(uint16_t *)ptr;
		break;
	case VMX_VMCS_FIELD_WIDTH_32:
		val64 = *(uint32_t *)ptr;
		break;
	case VMX_VMCS_FIELD_WIDTH_64:
		if (!!VMX_VMCS_FIELD_ACCESS_HIGH(field)) {
			val64 = *(uint32_t *)ptr;
		} else {
			val64 = *ptr;
		}
		break;
	case VMX_VMCS_FIELD_WIDTH_NATURAL:
	default:
		val64 = *ptr;
		break;
	}

	return val64;
}

/*
 * Write the given VMCS field to the given vmcs12 data structure.
 */
static void vmcs12_write_field(void *vmcs_hva, uint32_t field, uint64_t val64)
{
	uint64_t *ptr = (uint64_t *)(vmcs_hva + vmcs_field_to_vmcs12_offset(field));

	switch (VMX_VMCS_FIELD_WIDTH(field)) {
	case VMX_VMCS_FIELD_WIDTH_16:
		*(uint16_t *)ptr = (uint16_t)val64;
		break;
	case VMX_VMCS_FIELD_WIDTH_32:
		*(uint32_t *)ptr = (uint32_t)val64;
		break;
	case VMX_VMCS_FIELD_WIDTH_64:
		if (!!VMX_VMCS_FIELD_ACCESS_HIGH(field)) {
			*(uint32_t *)ptr = (uint32_t)val64;
		} else {
			*ptr = val64;
		}
		break;
	case VMX_VMCS_FIELD_WIDTH_NATURAL:
	default:
		*ptr = val64;
		break;
	}
}

void nested_vmx_result(enum VMXResult result, int error_number)
{
	uint64_t rflags = exec_vmread(VMX_GUEST_RFLAGS);

	/* ISDM: section 30.2 CONVENTIONS */
	rflags &= ~(RFLAGS_C | RFLAGS_P | RFLAGS_A | RFLAGS_Z | RFLAGS_S | RFLAGS_O);

	if (result == VMfailValid) {
		rflags |= RFLAGS_Z;
		exec_vmwrite(VMX_INSTR_ERROR, error_number);
	} else if (result == VMfailInvalid) {
		rflags |= RFLAGS_C;
	} else {
		/* VMsucceed, do nothing */
	}

	if (result != VMsucceed) {
		pr_err("VMX failed: %d/%d", result, error_number);
	}

	exec_vmwrite(VMX_GUEST_RFLAGS, rflags);
}

/**
 * @brief get the memory-address operand of a vmx instruction
 *
 * @pre vcpu != NULL
 */
static uint64_t get_vmx_memory_operand(struct acrn_vcpu *vcpu, uint32_t instr_info)
{
	uint64_t gva, gpa, seg_base = 0UL;
	uint32_t seg, err_code = 0U;
	uint64_t offset;

	/*
	 * According to ISDM 3B: Basic VM-Exit Information: For INVEPT, INVPCID, INVVPID, LGDT,
	 * LIDT, LLDT, LTR, SGDT, SIDT, SLDT, STR, VMCLEAR, VMPTRLD, VMPTRST, VMREAD, VMWRITE,
	 * VMXON, XRSTORS, and XSAVES, the exit qualification receives the value of the instruction's
	 * displacement field, which is sign-extended to 64 bits.
	 */
	offset = vcpu->arch.exit_qualification;

	/* TODO: should we consider the cases where the address size (bits 9:7 in instr_info) is 16 or 32? */

	/*
	 * Refer to ISDM Vol.1-3-24 Operand addressing on how to calculate an effective address:
	 * offset = base + [index * scale] + displacement
	 * address = segment_base + offset
	 */
	if (VMX_II_BASE_REG_VALID(instr_info)) {
		offset += vcpu_get_gpreg(vcpu, VMX_II_BASE_REG(instr_info));
	}

	if (VMX_II_IDX_REG_VALID(instr_info)) {
		uint64_t val64 = vcpu_get_gpreg(vcpu, VMX_II_IDX_REG(instr_info));
		offset += (val64 << VMX_II_SCALING(instr_info));
	}

	/*
	 * In 64-bit mode, the processor treats the segment base of CS, DS, ES, SS as zero,
	 * creating a linear address that is equal to the effective address.
	 * The exceptions are the FS and GS segments, whose segment registers can be used as
	 * additional base registers in some linear address calculations.
	 */
	seg = VMX_II_SEG_REG(instr_info);
	if (seg == 4U) {
		seg_base = exec_vmread(VMX_GUEST_FS_BASE);
	}

	if (seg == 5U) {
		seg_base = exec_vmread(VMX_GUEST_GS_BASE);
	}

	gva = seg_base + offset;
	(void)gva2gpa(vcpu, gva, &gpa, &err_code);

	return gpa;
}
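
/*
 * Illustrative sketch with hypothetical register values: for "VMPTRLD [rax + rbx*4]",
 * the VM-exit instruction-information field reports segment = DS, base = RAX (valid),
 * index = RBX (valid) and scaling = 2, while the exit qualification holds the
 * displacement (0 here). With RAX = 0x1000 and RBX = 0x20 the GVA is
 *   0 (DS base in 64-bit mode) + 0x1000 + (0x20 << 2) + 0 = 0x1080,
 * which gva2gpa() then translates through the guest page tables into the GPA
 * the caller reads the VMX pointer from.
 */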

/*
 * @pre vcpu != NULL
 */
static uint64_t get_vmptr_gpa(struct acrn_vcpu *vcpu)
{
	uint64_t gpa, vmptr;

	/* get VMX pointer, which points to the VMCS or VMXON region GPA */
	gpa = get_vmx_memory_operand(vcpu, exec_vmread(VMX_INSTR_INFO));

	/* get the address (GPA) of the VMCS for VMPTRLD/VMCLEAR, or VMXON region for VMXON */
	(void)copy_from_gpa(vcpu->vm, (void *)&vmptr, gpa, sizeof(uint64_t));

	return vmptr;
}

static bool validate_vmptr_gpa(uint64_t vmptr_gpa)
{
	/* We don't emulate CPUID.80000008H for guests, so check with physical address width */
	struct cpuinfo_x86 *cpu_info = get_pcpu_info();

	return (mem_aligned_check(vmptr_gpa, PAGE_SIZE) && ((vmptr_gpa >> cpu_info->phys_bits) == 0UL));
}

/**
 * @pre vm != NULL
 */
static bool validate_vmcs_revision_id(struct acrn_vcpu *vcpu, uint64_t vmptr_gpa)
{
	uint32_t revision_id;

	(void)copy_from_gpa(vcpu->vm, (void *)&revision_id, vmptr_gpa, sizeof(uint32_t));

	/*
	 * The VMCS revision ID must equal what is reported by the emulated IA32_VMX_BASIC MSR.
	 * VMCS12_REVISION_ID never has bit 31 set, so the following comparison implicitly
	 * validates revision_id[31] as well.
	 */
	return (revision_id == VMCS12_REVISION_ID);
}

int32_t get_guest_cpl(void)
{
	/*
	 * We get CPL from SS.DPL because:
	 *
	 * CS.DPL may not equal the CPL for conforming code segments. ISDM 5.5 PRIVILEGE LEVELS:
	 * Conforming code segments can be accessed from any privilege level that is equal to or
	 * numerically greater (less privileged) than the DPL of the conforming code segment.
	 *
	 * ISDM 24.4.1 Guest Register State: The value of the DPL field for SS is always
	 * equal to the logical processor's current privilege level (CPL).
	 */
	uint32_t ar = exec_vmread32(VMX_GUEST_SS_ATTR);
	return ((ar >> 5) & 3);
}
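
/*
 * Illustrative sketch (hypothetical access-rights value): with a flat 64-bit
 * ring-0 stack segment the SS access-rights field might read 0xC093; bits 6:5
 * hold the DPL, so ((0xC093 >> 5) & 3) == 0, i.e. the guest is running at CPL 0.
 */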

static bool validate_nvmx_cr0_cr4(uint64_t cr0_4, uint64_t fixed0, uint64_t fixed1)
{
	bool valid = true;

	/* If bit X is 1 in IA32_VMX_CR0/4_FIXED0, then that bit of CR0/4 is fixed to 1 in VMX operation */
	if ((cr0_4 & fixed0) != fixed0) {
		valid = false;
	}

	/* if bit X is 0 in IA32_VMX_CR0/4_FIXED1, then that bit of CR0/4 is fixed to 0 in VMX operation */
	/* Bits 63:32 of CR0 and CR4 are reserved and must be written with zeros */
	if ((uint32_t)(~cr0_4 & ~fixed1) != (uint32_t)~fixed1) {
		valid = false;
	}

	return valid;
}
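
/*
 * Illustrative sketch with hypothetical mask values: suppose fixed0 = 0x21
 * (bits 0 and 5 must be 1) and fixed1 = 0x7FF (bits above 10 must be 0).
 *   - cr0_4 = 0x31   -> valid:   0x31 & 0x21 == 0x21 and no bit above 10 is set.
 *   - cr0_4 = 0x30   -> invalid: bit 0 is clear, failing the FIXED0 check.
 *   - cr0_4 = 0x1021 -> invalid: bit 12 is set although FIXED1 says it must be 0.
 */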

/*
 * @pre vcpu != NULL
 */
static bool validate_nvmx_cr0(struct acrn_vcpu *vcpu)
{
	return validate_nvmx_cr0_cr4(vcpu_get_cr0(vcpu), msr_read(MSR_IA32_VMX_CR0_FIXED0),
		msr_read(MSR_IA32_VMX_CR0_FIXED1));
}

/*
 * @pre vcpu != NULL
 */
static bool validate_nvmx_cr4(struct acrn_vcpu *vcpu)
{
	return validate_nvmx_cr0_cr4(vcpu_get_cr4(vcpu), msr_read(MSR_IA32_VMX_CR4_FIXED0),
		msr_read(MSR_IA32_VMX_CR4_FIXED1));
}

/*
 * @pre vcpu != NULL
 */
static void reset_vvmcs(struct acrn_vcpu *vcpu)
{
	struct acrn_vvmcs *vvmcs;
	uint32_t idx;

	vcpu->arch.nested.current_vvmcs = NULL;

	for (idx = 0U; idx < MAX_ACTIVE_VVMCS_NUM; idx++) {
		vvmcs = &vcpu->arch.nested.vvmcs[idx];
		vvmcs->host_state_dirty = false;
		vvmcs->control_fields_dirty = false;
		vvmcs->vmcs12_gpa = INVALID_GPA;
		vvmcs->ref_cnt = 0;

		(void)memset(vvmcs->vmcs02, 0U, PAGE_SIZE);
		(void)memset(&vvmcs->vmcs12, 0U, sizeof(struct acrn_vmcs12));
	}
}

/*
 * @pre vcpu != NULL
 */
int32_t vmxon_vmexit_handler(struct acrn_vcpu *vcpu)
{
	const uint64_t features = MSR_IA32_FEATURE_CONTROL_LOCK | MSR_IA32_FEATURE_CONTROL_VMX_NO_SMX;
	uint32_t ar = exec_vmread32(VMX_GUEST_CS_ATTR);

	if (is_nvmx_configured(vcpu->vm)) {
		if (((vcpu_get_cr0(vcpu) & CR0_PE) == 0UL)
			|| ((vcpu_get_cr4(vcpu) & CR4_VMXE) == 0UL)
			|| ((vcpu_get_rflags(vcpu) & RFLAGS_VM) != 0U)) {
			vcpu_inject_ud(vcpu);
		} else if (((vcpu_get_efer(vcpu) & MSR_IA32_EFER_LMA_BIT) == 0U)
			|| ((ar & (1U << 13U)) == 0U)) {
			/* Current ACRN doesn't support a 32-bit L1 hypervisor */
			vcpu_inject_ud(vcpu);
		} else if ((get_guest_cpl() != 0)
			|| !validate_nvmx_cr0(vcpu)
			|| !validate_nvmx_cr4(vcpu)
			|| ((vcpu_get_guest_msr(vcpu, MSR_IA32_FEATURE_CONTROL) & features) != features)) {
			vcpu_inject_gp(vcpu, 0U);
		} else if (vcpu->arch.nested.vmxon == true) {
			nested_vmx_result(VMfailValid, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
		} else {
			uint64_t vmptr_gpa = get_vmptr_gpa(vcpu);

			if (!validate_vmptr_gpa(vmptr_gpa)) {
				nested_vmx_result(VMfailInvalid, 0);
			} else if (!validate_vmcs_revision_id(vcpu, vmptr_gpa)) {
				nested_vmx_result(VMfailInvalid, 0);
			} else {
				vcpu->arch.nested.vmxon = true;
				vcpu->arch.nested.in_l2_guest = false;
				vcpu->arch.nested.vmxon_ptr = vmptr_gpa;

				reset_vvmcs(vcpu);
				nested_vmx_result(VMsucceed, 0);
			}
		}
	} else {
		vcpu_inject_ud(vcpu);
	}

	return 0;
}

/*
 * @pre vcpu != NULL
 */
bool check_vmx_permission(struct acrn_vcpu *vcpu)
{
	bool permit = true;

	/* If this VM is not nVMX enabled, it implies that 'vmxon == false' */
	if ((vcpu->arch.nested.vmxon == false)
		|| ((vcpu_get_cr0(vcpu) & CR0_PE) == 0UL)
		|| ((vcpu_get_rflags(vcpu) & RFLAGS_VM) != 0U)) {
		/* We rely on hardware to check "IA32_EFER.LMA = 1 and CS.L = 0" */
		vcpu_inject_ud(vcpu);
		permit = false;
	} else if (get_guest_cpl() != 0) {
		vcpu_inject_gp(vcpu, 0U);
		permit = false;
	}

	return permit;
}

/*
 * @pre vcpu != NULL
 * @pre vcpu->vm != NULL
 */
int32_t vmxoff_vmexit_handler(struct acrn_vcpu *vcpu)
{
	if (check_vmx_permission(vcpu)) {
		disable_vmcs_shadowing();

		vcpu->arch.nested.vmxon = false;
		vcpu->arch.nested.in_l2_guest = false;

		reset_vvmcs(vcpu);
		nested_vmx_result(VMsucceed, 0);
	}

	return 0;
}

/*
 * Only VMCS fields of width 64-bit, 32-bit, and natural-width can be
 * read-only. A value of 1 in bits [11:10] of these field encodings
 * indicates a read-only field. ISDM Appendix B.
 */
static inline bool is_ro_vmcs_field(uint32_t field)
{
	const uint8_t w = VMX_VMCS_FIELD_WIDTH(field);
	return (VMX_VMCS_FIELD_WIDTH_16 != w) && (VMX_VMCS_FIELD_TYPE(field) == 1U);
}
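
/*
 * Illustrative sketch: the exit qualification field is encoded as 0x6400,
 * i.e. natural width (bits 14:13 = 3) with type bits 11:10 = 1 (read-only
 * data), so is_ro_vmcs_field(0x6400) is true. A VMWRITE from L1 to such a
 * field is rejected in vmwrite_vmexit_handler() unless bit 29 of the emulated
 * IA32_VMX_MISC MSR advertises support for writing VM-exit information fields.
 */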

/*
 * @pre vcpu != NULL
 */
static struct acrn_vvmcs *lookup_vvmcs(struct acrn_vcpu *vcpu, uint64_t vmcs12_gpa)
{
	struct acrn_vvmcs *vvmcs = NULL;
	uint32_t idx;

	for (idx = 0U; idx < MAX_ACTIVE_VVMCS_NUM; idx++) {
		if (vcpu->arch.nested.vvmcs[idx].vmcs12_gpa == vmcs12_gpa) {
			vvmcs = &vcpu->arch.nested.vvmcs[idx];
			break;
		}
	}

	return vvmcs;
}

/*
 * @pre vcpu != NULL
 */
static struct acrn_vvmcs *get_or_replace_vvmcs_entry(struct acrn_vcpu *vcpu)
{
	struct acrn_nested *nested = &vcpu->arch.nested;
	struct acrn_vvmcs *vvmcs = NULL;
	uint32_t idx, min_cnt = ~0U;

	/* look for an inactive entry first */
	for (idx = 0U; idx < MAX_ACTIVE_VVMCS_NUM; idx++) {
		if (nested->vvmcs[idx].vmcs12_gpa == INVALID_GPA) {
			/* found an inactive vvmcs[] entry. */
			vvmcs = &nested->vvmcs[idx];
			break;
		}
	}

	/* In case we have to release an active entry to make room for the new VMCS12 */
	if (vvmcs == NULL) {
		for (idx = 0U; idx < MAX_ACTIVE_VVMCS_NUM; idx++) {
			/* look for the entry with the least reference count */
			if (nested->vvmcs[idx].ref_cnt < min_cnt) {
				min_cnt = nested->vvmcs[idx].ref_cnt;
				vvmcs = &nested->vvmcs[idx];
			}
		}

		clear_vvmcs(vcpu, vvmcs);
	}

	/* reset ref_cnt for all entries */
	for (idx = 0U; idx < MAX_ACTIVE_VVMCS_NUM; idx++) {
		nested->vvmcs[idx].ref_cnt = 0U;
	}

	return vvmcs;
}

/*
 * @brief emulate VMREAD instruction from L1
 * @pre vcpu != NULL
 */
int32_t vmread_vmexit_handler(struct acrn_vcpu *vcpu)
{
	struct acrn_vvmcs *cur_vvmcs = vcpu->arch.nested.current_vvmcs;
	const uint32_t info = exec_vmread(VMX_INSTR_INFO);
	uint64_t vmcs_value, gpa;
	uint32_t vmcs_field;

	if (check_vmx_permission(vcpu)) {
		if ((cur_vvmcs == NULL) || (cur_vvmcs->vmcs12_gpa == INVALID_GPA)) {
			nested_vmx_result(VMfailInvalid, 0);
		} else {
			/* TODO: VMfailValid for invalid VMCS fields */
			vmcs_field = (uint32_t)vcpu_get_gpreg(vcpu, VMX_II_REG2(info));
			vmcs_value = vmcs12_read_field(&cur_vvmcs->vmcs12, vmcs_field);

			/* Currently ACRN doesn't support a 32-bit L1 hypervisor, so operands are assumed to be 64 bits */
			if (VMX_II_IS_REG(info)) {
				vcpu_set_gpreg(vcpu, VMX_II_REG1(info), vmcs_value);
			} else {
				gpa = get_vmx_memory_operand(vcpu, info);
				(void)copy_to_gpa(vcpu->vm, &vmcs_value, gpa, 8U);
			}

			pr_dbg("vmcs_field: %x vmcs_value: %llx", vmcs_field, vmcs_value);
			nested_vmx_result(VMsucceed, 0);
		}
	}

	return 0;
}

/*
 * @brief emulate VMWRITE instruction from L1
 * @pre vcpu != NULL
 */
int32_t vmwrite_vmexit_handler(struct acrn_vcpu *vcpu)
{
	struct acrn_vvmcs *cur_vvmcs = vcpu->arch.nested.current_vvmcs;
	const uint32_t info = exec_vmread(VMX_INSTR_INFO);
	uint64_t vmcs_value, gpa;
	uint32_t vmcs_field;

	if (check_vmx_permission(vcpu)) {
		if ((cur_vvmcs == NULL) || (cur_vvmcs->vmcs12_gpa == INVALID_GPA)) {
			nested_vmx_result(VMfailInvalid, 0);
		} else {
			/* TODO: VMfailValid for invalid VMCS fields */
			vmcs_field = (uint32_t)vcpu_get_gpreg(vcpu, VMX_II_REG2(info));

			if (is_ro_vmcs_field(vmcs_field) &&
				((vcpu_get_guest_msr(vcpu, MSR_IA32_VMX_MISC) & (1UL << 29U)) == 0UL)) {
				nested_vmx_result(VMfailValid, VMXERR_VMWRITE_RO_COMPONENT);
			} else {
				/* A 32-bit L1 hypervisor is currently not supported, so operands are assumed to be 64 bits */
				if (VMX_II_IS_REG(info)) {
					vmcs_value = vcpu_get_gpreg(vcpu, VMX_II_REG1(info));
				} else {
					gpa = get_vmx_memory_operand(vcpu, info);
					(void)copy_from_gpa(vcpu->vm, &vmcs_value, gpa, 8U);
				}

				if (VMX_VMCS_FIELD_TYPE(vmcs_field) == VMX_VMCS_FIELD_TYPE_HOST) {
					cur_vvmcs->host_state_dirty = true;
				}

				if ((vmcs_field == VMX_MSR_BITMAP_FULL)
					|| (vmcs_field == VMX_EPT_POINTER_FULL)
					|| (vmcs_field == VMX_VPID)
					|| (vmcs_field == VMX_ENTRY_CONTROLS)
					|| (vmcs_field == VMX_EXIT_CONTROLS)) {
					cur_vvmcs->control_fields_dirty = true;

					if (vmcs_field == VMX_EPT_POINTER_FULL) {
						if (cur_vvmcs->vmcs12.ept_pointer != vmcs_value) {
							put_vept_desc(cur_vvmcs->vmcs12.ept_pointer);
							get_vept_desc(vmcs_value);
						}
					}
				}

				pr_dbg("vmcs_field: %x vmcs_value: %llx", vmcs_field, vmcs_value);
				vmcs12_write_field(&cur_vvmcs->vmcs12, vmcs_field, vmcs_value);
				nested_vmx_result(VMsucceed, 0);
			}
		}
	}

	return 0;
}

/**
 * @brief Sync shadow fields from vmcs02 to cache VMCS12
 *
 * @pre vcpu != NULL
 * @pre vmcs02 is current
 */
static void sync_vmcs02_to_vmcs12(struct acrn_vmcs12 *vmcs12)
{
	uint64_t val64;
	uint32_t idx;

	for (idx = 0; idx < MAX_SHADOW_VMCS_FIELDS; idx++) {
		val64 = exec_vmread(vmcs_shadowing_fields[idx]);
		vmcs12_write_field(vmcs12, vmcs_shadowing_fields[idx], val64);
	}
}

/*
 * @pre vcpu != NULL
 * @pre VMCS02 (as an ordinary VMCS) is current
 */
static void merge_and_sync_control_fields(struct acrn_vcpu *vcpu, struct acrn_vmcs12 *vmcs12)
{
	uint64_t value64;

	/* Sync VMCS fields that are not shadowing. These fields don't need to be synced back to VMCS12. */

	exec_vmwrite(VMX_MSR_BITMAP_FULL, gpa2hpa(vcpu->vm, vmcs12->msr_bitmap));
	exec_vmwrite(VMX_EPT_POINTER_FULL, get_shadow_eptp(vmcs12->ept_pointer));

	/* For VM-execution, entry and exit controls */
	value64 = vmcs12->vm_entry_controls;
	if ((value64 & VMX_ENTRY_CTLS_LOAD_EFER) != VMX_ENTRY_CTLS_LOAD_EFER) {
		/*
		 * The L1 hypervisor wishes to use its IA32_EFER for the L2 guest, so we turn on
		 * VMX_ENTRY_CTLS_LOAD_EFER in VMCS02.
		 */
		value64 |= VMX_ENTRY_CTLS_LOAD_EFER;
		exec_vmwrite(VMX_GUEST_IA32_EFER_FULL, vcpu_get_efer(vcpu));
	}

	exec_vmwrite(VMX_ENTRY_CONTROLS, value64);

	/* Host is always running in 64-bit mode */
	value64 = vmcs12->vm_exit_controls | VMX_EXIT_CTLS_HOST_ADDR64;
	exec_vmwrite(VMX_EXIT_CONTROLS, value64);

	exec_vmwrite(VMX_VPID, vmcs12->vpid);
}

/**
 * @brief Sync shadow fields from vmcs12 to vmcs02
 *
 * @pre vcpu != NULL
 * @pre vmcs02 is current
 */
static void sync_vmcs12_to_vmcs02(struct acrn_vcpu *vcpu, struct acrn_vmcs12 *vmcs12)
{
	uint64_t val64;
	uint32_t idx;

	for (idx = 0; idx < MAX_SHADOW_VMCS_FIELDS; idx++) {
		val64 = vmcs12_read_field(vmcs12, vmcs_shadowing_fields[idx]);
		exec_vmwrite(vmcs_shadowing_fields[idx], val64);
	}

	merge_and_sync_control_fields(vcpu, vmcs12);
}

/*
 * @pre vcpu != NULL
 */
static void set_vmcs02_shadow_indicator(struct acrn_vvmcs *vvmcs)
{
	/* vmcs02 is shadowing */
	*((uint32_t*)vvmcs->vmcs02) |= VMCS_SHADOW_BIT_INDICATOR;
}

/*
 * @pre vcpu != NULL
 * @pre vmcs01 is current
 */
static void clear_vmcs02_shadow_indicator(struct acrn_vvmcs *vvmcs)
{
	/* vmcs02 is an ordinary VMCS */
	*((uint32_t*)vvmcs->vmcs02) &= ~VMCS_SHADOW_BIT_INDICATOR;
}

/*
 * @pre vcpu != NULL
 * @pre vmcs01 is current
 */
static void enable_vmcs_shadowing(struct acrn_vvmcs *vvmcs)
{
	uint32_t val32;

	/*
	 * This method of using the same bitmap for VMREAD and VMWRITE is not typical.
	 * Here we assume the L1 hypervisor will not erroneously write to Read-Only fields.
	 * TODO: may use a different bitmap to exclude read-only fields from the VMWRITE bitmap.
	 */
	exec_vmwrite(VMX_VMREAD_BITMAP_FULL, hva2hpa(vmcs_shadowing_bitmap));
	exec_vmwrite(VMX_VMWRITE_BITMAP_FULL, hva2hpa(vmcs_shadowing_bitmap));

	/* Set VMCS shadowing bit in Secondary Proc Exec Controls */
	val32 = exec_vmread(VMX_PROC_VM_EXEC_CONTROLS2);
	val32 |= VMX_PROCBASED_CTLS2_VMCS_SHADW;
	exec_vmwrite32(VMX_PROC_VM_EXEC_CONTROLS2, val32);

	/* Set VMCS Link pointer */
	exec_vmwrite(VMX_VMS_LINK_PTR_FULL, hva2hpa(vvmcs->vmcs02));
}

/*
 * @pre vcpu != NULL
 * @pre vmcs01 is current
 */
static void disable_vmcs_shadowing(void)
{
	uint32_t val32;

	/* clear VMCS shadowing bit in Secondary Proc Exec Controls */
	val32 = exec_vmread(VMX_PROC_VM_EXEC_CONTROLS2);
	val32 &= ~VMX_PROCBASED_CTLS2_VMCS_SHADW;
	exec_vmwrite32(VMX_PROC_VM_EXEC_CONTROLS2, val32);

	exec_vmwrite(VMX_VMS_LINK_PTR_FULL, ~0UL);
}

/*
 * @pre vcpu != NULL
 * @pre vmcs01 is current
 */
static void clear_vvmcs(struct acrn_vcpu *vcpu, struct acrn_vvmcs *vvmcs)
{
	/*
	 * Now VMCS02 is active and being used as a shadow VMCS.
	 * Disable VMCS shadowing so that VMCS02 is not loaded by VMPTRLD and
	 * referenced by VMCS01 as a shadow VMCS at the same time.
	 */
	disable_vmcs_shadowing();

	/* Flush shadow VMCS to memory */
	clear_va_vmcs(vvmcs->vmcs02);

	/* VMPTRLD the shadow VMCS so that we are able to sync it to VMCS12 */
	load_va_vmcs(vvmcs->vmcs02);

	sync_vmcs02_to_vmcs12(&vvmcs->vmcs12);

	/* flush cached VMCS12 back to L1 guest */
	(void)copy_to_gpa(vcpu->vm, (void *)&vvmcs->vmcs12, vvmcs->vmcs12_gpa, sizeof(struct acrn_vmcs12));

	/*
	 * The current VMCS12 has been flushed out, so the active VMCS02
	 * needs to be VMCLEARed as well.
	 */
	clear_va_vmcs(vvmcs->vmcs02);

	/* This VMCS can no longer refer to any shadow EPT */
	put_vept_desc(vvmcs->vmcs12.ept_pointer);

	/* This vvmcs[] entry doesn't cache a VMCS12 any more */
	vvmcs->vmcs12_gpa = INVALID_GPA;

	/* Cleanup per VVMCS dirty flags */
	vvmcs->host_state_dirty = false;
	vvmcs->control_fields_dirty = false;
}

/*
 * @pre vcpu != NULL
 */
int32_t vmptrld_vmexit_handler(struct acrn_vcpu *vcpu)
{
	struct acrn_nested *nested = &vcpu->arch.nested;
	struct acrn_vvmcs *vvmcs;
	uint64_t vmcs12_gpa;

	if (check_vmx_permission(vcpu)) {
		vmcs12_gpa = get_vmptr_gpa(vcpu);

		if (!validate_vmptr_gpa(vmcs12_gpa)) {
			nested_vmx_result(VMfailValid, VMXERR_VMPTRLD_INVALID_ADDRESS);
		} else if (vmcs12_gpa == nested->vmxon_ptr) {
			nested_vmx_result(VMfailValid, VMXERR_VMPTRLD_VMXON_POINTER);
		} else if (!validate_vmcs_revision_id(vcpu, vmcs12_gpa)) {
			nested_vmx_result(VMfailValid, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
		} else if ((nested->current_vvmcs != NULL) && (nested->current_vvmcs->vmcs12_gpa == vmcs12_gpa)) {
			/* VMPTRLD current VMCS12, do nothing */
			nested_vmx_result(VMsucceed, 0);
		} else {
			vvmcs = lookup_vvmcs(vcpu, vmcs12_gpa);
			if (vvmcs == NULL) {
				vvmcs = get_or_replace_vvmcs_entry(vcpu);

				/* Create the VMCS02 based on this new VMCS12 */

				/*
				 * initialize VMCS02
				 * The VMCS revision ID must equal what is reported by the IA32_VMX_BASIC MSR.
				 */
				(void)memcpy_s(vvmcs->vmcs02, 4U, (void *)&vmx_basic, 4U);

				/* VMPTRLD VMCS02 so that we can VMWRITE to it */
				load_va_vmcs(vvmcs->vmcs02);
				init_host_state();

				/* Load VMCS12 from L1 guest memory */
				(void)copy_from_gpa(vcpu->vm, (void *)&vvmcs->vmcs12, vmcs12_gpa,
					sizeof(struct acrn_vmcs12));

				/* if needed, create nept_desc and allocate shadow root for the EPTP */
				get_vept_desc(vvmcs->vmcs12.ept_pointer);

				/* Need to load shadow fields from this new VMCS12 to VMCS02 */
				sync_vmcs12_to_vmcs02(vcpu, &vvmcs->vmcs12);
			} else {
				vvmcs->ref_cnt += 1U;
			}

			/* Before VMCS02 is used as a shadow VMCS, VMCLEAR it */
			clear_va_vmcs(vvmcs->vmcs02);

			/*
			 * Now VMCS02 is not active, set the shadow-VMCS indicator.
			 * At L1 VM entry, VMCS02 will be referenced as a shadow VMCS.
			 */
			set_vmcs02_shadow_indicator(vvmcs);

			/* Switch back to vmcs01 */
			load_va_vmcs(vcpu->arch.vmcs);

			/* VMCS02 is referenced by VMCS01 Link Pointer */
			enable_vmcs_shadowing(vvmcs);

			vvmcs->vmcs12_gpa = vmcs12_gpa;
			nested->current_vvmcs = vvmcs;
			nested_vmx_result(VMsucceed, 0);
		}
	}

	return 0;
}

/*
 * @pre vcpu != NULL
 */
int32_t vmclear_vmexit_handler(struct acrn_vcpu *vcpu)
{
	struct acrn_nested *nested = &vcpu->arch.nested;
	struct acrn_vvmcs *vvmcs;
	uint64_t vmcs12_gpa;

	if (check_vmx_permission(vcpu)) {
		vmcs12_gpa = get_vmptr_gpa(vcpu);

		if (!validate_vmptr_gpa(vmcs12_gpa)) {
			nested_vmx_result(VMfailValid, VMXERR_VMPTRLD_INVALID_ADDRESS);
		} else if (vmcs12_gpa == nested->vmxon_ptr) {
			nested_vmx_result(VMfailValid, VMXERR_VMCLEAR_VMXON_POINTER);
		} else {
			vvmcs = lookup_vvmcs(vcpu, vmcs12_gpa);
			if (vvmcs != NULL) {
				uint64_t current_vmcs12_gpa = INVALID_GPA;

				/* Save for comparison */
				if (nested->current_vvmcs) {
					current_vmcs12_gpa = nested->current_vvmcs->vmcs12_gpa;
				}

				/* VMCLEAR an active VMCS12, may or may not be current */
				vvmcs->vmcs12.launch_state = VMCS12_LAUNCH_STATE_CLEAR;
				clear_vvmcs(vcpu, vvmcs);

				/* Switch back to vmcs01 (no VMCS shadowing) */
				load_va_vmcs(vcpu->arch.vmcs);

				if (current_vmcs12_gpa != INVALID_GPA) {
					if (current_vmcs12_gpa == vmcs12_gpa) {
						/* VMCLEAR current VMCS12 */
						nested->current_vvmcs = NULL;
					} else {
						/*
						 * VMCLEAR an active but not current VMCS12.
						 * VMCS shadowing was cleared earlier in clear_vvmcs()
						 */
						enable_vmcs_shadowing(nested->current_vvmcs);
					}
				} else {
					/* do nothing if there is no current VMCS12 */
				}
			} else {
				/*
				 * We need to update the VMCS12 launch state in L1 memory in these two cases:
				 * - the L1 hypervisor VMCLEARs a VMCS12 that was already flushed by ACRN to L1 guest memory
				 * - the L1 hypervisor VMCLEARs a VMCS12 that was never VMPTRLDed.
				 */
				uint32_t launch_state = VMCS12_LAUNCH_STATE_CLEAR;
				(void)copy_to_gpa(vcpu->vm, &launch_state, vmcs12_gpa +
					offsetof(struct acrn_vmcs12, launch_state), sizeof(launch_state));
			}

			nested_vmx_result(VMsucceed, 0);
		}
	}

	return 0;
}

/*
 * @pre vcpu != NULL
 */
bool is_vcpu_in_l2_guest(struct acrn_vcpu *vcpu)
{
	return vcpu->arch.nested.in_l2_guest;
}

/*
 * @pre seg != NULL
 */
static void set_segment(struct segment_sel *seg, uint16_t sel, uint64_t b, uint32_t l, uint32_t a)
{
	seg->selector = sel;
	seg->base = b;
	seg->limit = l;
	seg->attr = a;
}

/*
 * @pre vcpu != NULL
 * @pre vmcs01 is current
 */
static void set_vmcs01_guest_state(struct acrn_vcpu *vcpu)
{
	/*
	 * None of the host fields are shadowed, so all VMWRITEs to these fields
	 * are saved in vmcs12.
	 *
	 * Load host state from vmcs12 to vmcs01 guest state before entering
	 * L1 to emulate a VM exit from L2 to L1.
	 *
	 * We assume the host only changes these host-state fields at run time.
	 *
	 * Section 27.5 Loading Host State
	 * 1. Load Control Registers, Debug Registers, MSRs
	 * 2. Load RSP/RIP/RFLAGS
	 * 3. Load Segmentation State
	 * 4. Non-Register state
	 */
	struct acrn_vmcs12 *vmcs12 = &vcpu->arch.nested.current_vvmcs->vmcs12;
	struct segment_sel seg;

	if (vcpu->arch.nested.current_vvmcs->host_state_dirty == true) {
		vcpu->arch.nested.current_vvmcs->host_state_dirty = false;

		/*
		 * We want vcpu_get_cr0/4() to return the up-to-date values, but we don't
		 * want to call vcpu_set_cr0/4() to handle the CR0/4 write.
		 */
		exec_vmwrite(VMX_GUEST_CR0, vmcs12->host_cr0);
		exec_vmwrite(VMX_GUEST_CR4, vmcs12->host_cr4);
		bitmap_clear_nolock(CPU_REG_CR0, &vcpu->reg_cached);
		bitmap_clear_nolock(CPU_REG_CR4, &vcpu->reg_cached);

		exec_vmwrite(VMX_GUEST_CR3, vmcs12->host_cr3);
		exec_vmwrite(VMX_GUEST_DR7, DR7_INIT_VALUE);
		exec_vmwrite64(VMX_GUEST_IA32_DEBUGCTL_FULL, 0UL);
		exec_vmwrite32(VMX_GUEST_IA32_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
		exec_vmwrite(VMX_GUEST_IA32_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
		exec_vmwrite(VMX_GUEST_IA32_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);

		exec_vmwrite(VMX_GUEST_IA32_EFER_FULL, vmcs12->host_ia32_efer);

		/*
		 * type: 11 (Execute/Read, accessed)
		 * l: 64-bit mode active
		 */
		set_segment(&seg, vmcs12->host_cs, 0UL, 0xFFFFFFFFU, 0xa09bU);
		load_segment(seg, VMX_GUEST_CS);

		/*
		 * type: 3 (Read/Write, accessed)
		 * D/B: 1 (32-bit segment)
		 */
		set_segment(&seg, vmcs12->host_ds, 0UL, 0xFFFFFFFFU, 0xc093);
		load_segment(seg, VMX_GUEST_DS);

		seg.selector = vmcs12->host_ss;
		load_segment(seg, VMX_GUEST_SS);

		seg.selector = vmcs12->host_es;
		load_segment(seg, VMX_GUEST_ES);

		seg.selector = vmcs12->host_fs;
		seg.base = vmcs12->host_fs_base;
		load_segment(seg, VMX_GUEST_FS);

		seg.selector = vmcs12->host_gs;
		seg.base = vmcs12->host_gs_base;
		load_segment(seg, VMX_GUEST_GS);

		/*
		 * ISDM 27.5.2: segment limit for TR is set to 67H.
		 * Type set to 11 and S set to 0 (busy 32-bit task-state segment).
		 */
		set_segment(&seg, vmcs12->host_tr, vmcs12->host_tr_base, 0x67U, TR_AR);
		load_segment(seg, VMX_GUEST_TR);

		/*
		 * ISDM 27.5.2: LDTR is established as follows on all VM exits:
		 * the selector is cleared to 0000H, the segment is marked unusable
		 * and is otherwise undefined (although the base address is always canonical).
		 */
		exec_vmwrite16(VMX_GUEST_LDTR_SEL, 0U);
		exec_vmwrite32(VMX_GUEST_LDTR_ATTR, 0x10000U);
	}

	/*
	 * For those registers that are managed by the vcpu->reg_updated flag,
	 * we need to write them with vcpu_set_xxx() so that vcpu_get_xxx() can
	 * return the correct values.
	 */
	vcpu_set_rip(vcpu, vmcs12->host_rip);
	vcpu_set_rsp(vcpu, vmcs12->host_rsp);
	vcpu_set_rflags(vcpu, 0x2U);
}

/**
 * @pre vcpu != NULL
 */
static void sanitize_l2_vpid(struct acrn_vmcs12 *vmcs12)
{
	/* Flush VPID if the L2 VPID could conflict with any L1 VPIDs */
	if (vmcs12->vpid >= ALLOCATED_MIN_L1_VPID) {
		flush_vpid_single(vmcs12->vpid);
	}
}

/**
 * @brief handler for all VMEXITs from nested guests
 *
 * @pre vcpu != NULL
 * @pre VMCS02 (as an ordinary VMCS) is current
 */
int32_t nested_vmexit_handler(struct acrn_vcpu *vcpu)
{
	struct acrn_vvmcs *cur_vvmcs = vcpu->arch.nested.current_vvmcs;
	bool is_l1_vmexit = true;

	if ((vcpu->arch.exit_reason & 0xFFFFU) == VMX_EXIT_REASON_EPT_VIOLATION) {
		is_l1_vmexit = handle_l2_ept_violation(vcpu);
	}

	if (is_l1_vmexit) {
		sanitize_l2_vpid(&cur_vvmcs->vmcs12);

		/*
		 * Clear VMCS02 because: ISDM: Before modifying the shadow-VMCS indicator,
		 * software should execute VMCLEAR for the VMCS to ensure that it is not active.
		 */
		clear_va_vmcs(cur_vvmcs->vmcs02);
		set_vmcs02_shadow_indicator(cur_vvmcs);

		/* Switch to VMCS01, and VMCS02 is referenced as a shadow VMCS */
		load_va_vmcs(vcpu->arch.vmcs);

		/* Load host state from VMCS12 host area to Guest state of VMCS01 */
		set_vmcs01_guest_state(vcpu);

		/* vCPU is NOT in guest mode from this point */
		vcpu->arch.nested.in_l2_guest = false;
	}

	/*
	 * For VM exits that reflect to the L1 hypervisor, ACRN can't advance to the next
	 * guest RIP; that decision is up to the L1 hypervisor.
	 *
	 * The only case that doesn't need to be reflected is an EPT violation that can be
	 * completely handled by ACRN, which requires the L2 VM to re-execute the instruction
	 * after the shadow EPT has been properly set up.
	 *
	 * In either case, vcpu->arch.inst_len needs to be set to zero.
	 */
	vcpu_retain_rip(vcpu);
	return 0;
}

/*
 * @pre vcpu != NULL
 * @pre VMCS01 is current and VMCS02 is referenced by VMCS Link Pointer
 */
static void nested_vmentry(struct acrn_vcpu *vcpu, bool is_launch)
{
	struct acrn_vvmcs *cur_vvmcs = vcpu->arch.nested.current_vvmcs;
	struct acrn_vmcs12 *vmcs12 = &cur_vvmcs->vmcs12;

	if ((cur_vvmcs == NULL) || (cur_vvmcs->vmcs12_gpa == INVALID_GPA)) {
		nested_vmx_result(VMfailInvalid, 0);
	} else if (is_launch && (vmcs12->launch_state != VMCS12_LAUNCH_STATE_CLEAR)) {
		nested_vmx_result(VMfailValid, VMXERR_VMLAUNCH_NONCLEAR_VMCS);
	} else if (!is_launch && (vmcs12->launch_state != VMCS12_LAUNCH_STATE_LAUNCHED)) {
		nested_vmx_result(VMfailValid, VMXERR_VMRESUME_NONLAUNCHED_VMCS);
	} else {
		/*
		 * TODO: Need to do VM-Entry checks before L2 VM entry.
		 * Refer to ISDM Vol3 VMX Instructions reference.
		 */

		/*
		 * Convert the shadow VMCS to an ordinary VMCS.
		 * ISDM: Software should not modify the shadow-VMCS indicator in
		 * the VMCS region of a VMCS that is active.
		 */
		clear_va_vmcs(cur_vvmcs->vmcs02);
		clear_vmcs02_shadow_indicator(cur_vvmcs);

		/* as an ordinary VMCS, VMCS02 is active and current when the L2 guest is running */
		load_va_vmcs(cur_vvmcs->vmcs02);

		if (cur_vvmcs->control_fields_dirty) {
			cur_vvmcs->control_fields_dirty = false;
			merge_and_sync_control_fields(vcpu, vmcs12);
		}

		/* vCPU is in guest mode from this point */
		vcpu->arch.nested.in_l2_guest = true;

		if (is_launch) {
			vmcs12->launch_state = VMCS12_LAUNCH_STATE_LAUNCHED;
		}

		sanitize_l2_vpid(vmcs12);

		/*
		 * set vcpu->launched to false because the launch state of VMCS02 is
		 * clear at this moment, even for VMRESUME
		 */
		vcpu->launched = false;
	}
}

/*
 * @pre vcpu != NULL
 */
int32_t vmresume_vmexit_handler(struct acrn_vcpu *vcpu)
{
	if (check_vmx_permission(vcpu)) {
		nested_vmentry(vcpu, false);
	}

	return 0;
}

/*
 * @pre vcpu != NULL
 */
int32_t vmlaunch_vmexit_handler(struct acrn_vcpu *vcpu)
{
	if (check_vmx_permission(vcpu)) {
		nested_vmentry(vcpu, true);
	}

	return 0;
}

/*
 * @pre vcpu != NULL
 * @pre desc != NULL
 */
int64_t get_invvpid_ept_operands(struct acrn_vcpu *vcpu, void *desc, size_t size)
{
	const uint32_t info = exec_vmread(VMX_INSTR_INFO);
	uint64_t gpa;

	gpa = get_vmx_memory_operand(vcpu, info);
	(void)copy_from_gpa(vcpu->vm, desc, gpa, size);

	return vcpu_get_gpreg(vcpu, VMX_II_REG2(info));
}

/*
 * @pre vcpu != NULL
 */
static bool validate_canonical_addr(struct acrn_vcpu *vcpu, uint64_t va)
{
	uint32_t addr_width = 48U; /* linear address width */
	uint64_t msb_mask;

	if (vcpu_get_cr4(vcpu) & CR4_LA57) {
		addr_width = 57U;
	}

	/*
	 * In 64-bit mode, an address is considered to be in canonical form if address
	 * bits 63 down to the most-significant bit implemented by the microarchitecture
	 * are set to either all ones or all zeros.
	 */

	msb_mask = ~((1UL << addr_width) - 1UL);
	return ((msb_mask & va) == 0UL) || ((msb_mask & va) == msb_mask);
}
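
/*
 * Illustrative sketch with 48-bit linear addresses (CR4.LA57 clear):
 * msb_mask = ~((1UL << 48) - 1) = 0xFFFF000000000000.
 *   - 0x00007F1234567890 -> masked bits are all zeros -> accepted.
 *   - 0xFFFF8000DEADBEEF -> masked bits are all ones  -> accepted.
 *   - 0x0001000000000000 -> masked bits are mixed     -> rejected.
 */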

/*
 * @pre vcpu != NULL
 */
int32_t invvpid_vmexit_handler(struct acrn_vcpu *vcpu)
{
	uint32_t supported_types = (vcpu_get_guest_msr(vcpu, MSR_IA32_VMX_EPT_VPID_CAP) >> 40U) & 0xfU;
	struct invvpid_operand desc;
	uint64_t type;

	if (check_vmx_permission(vcpu)) {
		type = get_invvpid_ept_operands(vcpu, (void *)&desc, sizeof(desc));

		if ((type > VMX_VPID_TYPE_SINGLE_NON_GLOBAL) || ((supported_types & (1U << type)) == 0)) {
			nested_vmx_result(VMfailValid, VMXERR_INVEPT_INVVPID_INVALID_OPERAND);
		} else if ((desc.rsvd1 != 0U) || (desc.rsvd2 != 0U)) {
			nested_vmx_result(VMfailValid, VMXERR_INVEPT_INVVPID_INVALID_OPERAND);
		} else if ((type != VMX_VPID_TYPE_ALL_CONTEXT) && (desc.vpid == 0U)) {
			/* check VPID for type 0, 1, 3 */
			nested_vmx_result(VMfailValid, VMXERR_INVEPT_INVVPID_INVALID_OPERAND);
		} else if ((type == VMX_VPID_TYPE_INDIVIDUAL_ADDR) && !validate_canonical_addr(vcpu, desc.gva)) {
			nested_vmx_result(VMfailValid, VMXERR_INVEPT_INVVPID_INVALID_OPERAND);
		} else {
			/*
			 * VPIDs are pass-thru. Values programmed by L1 are used by L0.
			 * INVVPID type, VPID and GLA, operands of the INVVPID instruction, are
			 * passed as is to the pCPU.
			 */
			asm_invvpid(desc, type);
			nested_vmx_result(VMsucceed, 0);
		}
	}

	return 0;
}

void init_nested_vmx(__unused struct acrn_vm *vm)
{
	static bool initialized = false;

	if (!initialized) {
		initialized = true;

		/* Cache the value of physical MSR_IA32_VMX_BASIC */
		vmx_basic = (uint32_t)msr_read(MSR_IA32_VMX_BASIC);
		setup_vmcs_shadowing_bitmap();
	}
}