From 2c5c8754dec57bb3866467c0e982c59aa3a0ba1f Mon Sep 17 00:00:00 2001
From: "Liu,Junming" <junming.liu@intel.com>
Date: Thu, 12 Aug 2021 09:30:05 +0000
Subject: [PATCH] hv:enable GVT-d for pre-launched linux guest in logical
 partition mode

When passing a GPU through to a pre-launched Linux guest,
the GPU OpRegion must also be passed to the guest.
Here are the detailed steps:
1. Reserve a memory region in the ve820 table for the GPU OpRegion.
2. Build an EPT mapping for the GPU OpRegion to pass it through to the guest.
3. Emulate the PCI config register for the OpRegion.
Details of the third step:
The address of the OpRegion is located at PCI config space offset 0xFC.
A normal Linux guest won't write this register,
so we can treat this register as read-only.
When the guest reads this register, return the emulated value.
When the guest writes this register, ignore the operation.

Tracked-On: #6387

Signed-off-by: Liu,Junming <junming.liu@intel.com>
---
 hypervisor/arch/x86/guest/ve820.c | 14 ++++++++++----
 hypervisor/dm/vpci/pci_pt.c       | 27 +++++++++++++++++++++++++++
 hypervisor/dm/vpci/vpci.c         |  9 +++++++--
 hypervisor/include/common/ptdev.h |  5 +++++
 4 files changed, 49 insertions(+), 6 deletions(-)

diff --git a/hypervisor/arch/x86/guest/ve820.c b/hypervisor/arch/x86/guest/ve820.c
index 8de17d519..4e8dc5581 100644
--- a/hypervisor/arch/x86/guest/ve820.c
+++ b/hypervisor/arch/x86/guest/ve820.c
@@ -12,10 +12,11 @@
 #include <vacpi.h>
 #include <logmsg.h>
 #include <asm/rtcm.h>
+#include <ptdev.h>
 
 #define ENTRY_HPA1_LOW_PART1	2U
-#define ENTRY_HPA1_LOW_PART2	4U
-#define ENTRY_HPA1_HI		8U
+#define ENTRY_HPA1_LOW_PART2	5U
+#define ENTRY_HPA1_HI		9U
 
 static struct e820_entry sos_vm_e820[E820_MAX_ENTRIES];
 static struct e820_entry pre_vm_e820[PRE_VM_NUM][E820_MAX_ENTRIES];
@@ -197,9 +198,14 @@ static const struct e820_entry pre_ve820_template[E820_MAX_ENTRIES] = {
 		.length   = PRE_RTVM_SW_SRAM_MAX_SIZE,
 		.type     = E820_TYPE_RESERVED
 	},
+	{	/* GPU OpRegion for pre-launched VM */
+		.baseaddr = GPU_OPREGION_GPA,
+		.length   = GPU_OPREGION_SIZE,
+		.type     = E820_TYPE_RESERVED
+	},
 	{	/* part2 of lowmem of hpa1*/
-		.baseaddr = PRE_RTVM_SW_SRAM_BASE_GPA + PRE_RTVM_SW_SRAM_MAX_SIZE,
-		.length   = VIRT_ACPI_DATA_ADDR - (PRE_RTVM_SW_SRAM_BASE_GPA + PRE_RTVM_SW_SRAM_MAX_SIZE),
+		.baseaddr = GPU_OPREGION_GPA + GPU_OPREGION_SIZE,
+		.length   = VIRT_ACPI_DATA_ADDR - (GPU_OPREGION_GPA + GPU_OPREGION_SIZE),
 		.type     = E820_TYPE_RAM
 	},
 	{	/* ACPI Reclaim */
diff --git a/hypervisor/dm/vpci/pci_pt.c b/hypervisor/dm/vpci/pci_pt.c
index 01b9d2fb4..ab3c9ed68 100644
--- a/hypervisor/dm/vpci/pci_pt.c
+++ b/hypervisor/dm/vpci/pci_pt.c
@@ -35,6 +35,7 @@
 #include <asm/mmu.h>
 #include <asm/io.h>
 #include <logmsg.h>
+#include <config.h>
 #include "vpci_priv.h"
 
 /**
@@ -487,6 +488,28 @@ void vdev_pt_hide_sriov_cap(struct pci_vdev *vdev)
 
 	pr_acrnlog("Hide sriov cap for %02x:%02x.%x", vdev->pdev->bdf.bits.b, vdev->pdev->bdf.bits.d, vdev->pdev->bdf.bits.f);
 }
+
+/* Pass the host GPU OpRegion through to the guest that owns vdev: map its pages
+ * read-only and uncached at GPU_OPREGION_GPA, then store that GPA, combined with the
+ * low 12 bits of the physical ASLS value, in the virtual ASLS config register.
+ * The OpRegion is not 4KB aligned and can take up to 16KB on some platforms, so it
+ * may overlap 5 pages; hence GPU_OPREGION_SIZE is set to 0x5000U (20KB).
+ * TODO: pass-through has a potential security issue; replace it with the copy +
+ * emulation solution to expose the host OpRegion to the guest later.
+ */
+void passthru_gpu_opregion(struct pci_vdev *vdev)
+{
+	uint32_t gpu_opregion_hpa, gpu_opregion_gpa, gpu_asls_phys;
+
+	gpu_opregion_gpa = GPU_OPREGION_GPA;
+	gpu_asls_phys = pci_pdev_read_cfg(vdev->pdev->bdf, PCIR_ASLS_CTL, 4U); /* ASLS value of the physical device */
+	gpu_opregion_hpa = gpu_asls_phys & PCIM_ASLS_OPREGION_MASK; /* page-aligned host base used for the mapping */
+	ept_add_mr(vpci2vm(vdev->vpci), vpci2vm(vdev->vpci)->arch_vm.nworld_eptp,
+			gpu_opregion_hpa, gpu_opregion_gpa,
+			GPU_OPREGION_SIZE, EPT_RD | EPT_UNCACHED);
+	pci_vdev_write_vcfg(vdev, PCIR_ASLS_CTL, 4U, gpu_opregion_gpa | (gpu_asls_phys & ~PCIM_ASLS_OPREGION_MASK));
+}
+
 /*
  * @brief Initialize a specified passthrough vdev structure.
  *
@@ -523,6 +546,10 @@ void init_vdev_pt(struct pci_vdev *vdev, bool is_pf_vdev)
 			/* Disable INTX */
 			pci_command |= 0x400U;
 			pci_pdev_write_cfg(vdev->pdev->bdf, PCIR_COMMAND, 2U, pci_command);
+
+			if (vdev->pdev->bdf.value == CONFIG_GPU_SBDF) {
+				passthru_gpu_opregion(vdev);
+			}
 		}
 	} else {
 		if (vdev->phyfun->vpci != vdev->vpci) {
diff --git a/hypervisor/dm/vpci/vpci.c b/hypervisor/dm/vpci/vpci.c
index 73b7fcb9e..c1f813895 100644
--- a/hypervisor/dm/vpci/vpci.c
+++ b/hypervisor/dm/vpci/vpci.c
@@ -516,8 +516,10 @@ static int32_t write_pt_dev_cfg(struct pci_vdev *vdev, uint32_t offset,
 	} else {
 		if (offset != vdev->pdev->sriov.pre_pos) {
 			if (!is_quirk_ptdev(vdev)) {
-				/* passthru to physical device */
-				pci_pdev_write_cfg(vdev->pdev->bdf, offset, bytes, val);
+				if ((vdev->pdev->bdf.value != CONFIG_GPU_SBDF) || (offset != PCIR_ASLS_CTL)) {
+					/* passthru to physical device */
+					pci_pdev_write_cfg(vdev->pdev->bdf, offset, bytes, val);
+				}
 			} else {
 				ret = -ENODEV;
 			}
@@ -544,6 +546,9 @@ static int32_t read_pt_dev_cfg(const struct pci_vdev *vdev, uint32_t offset,
 		} else if (!is_quirk_ptdev(vdev)) {
 			/* passthru to physical device */
 			*val = pci_pdev_read_cfg(vdev->pdev->bdf, offset, bytes);
+			if ((vdev->pdev->bdf.value == CONFIG_GPU_SBDF) && (offset == PCIR_ASLS_CTL)) {
+				*val = pci_vdev_read_vcfg(vdev, offset, bytes);
+			}
 		} else {
 			ret = -ENODEV;
 		}
diff --git a/hypervisor/include/common/ptdev.h b/hypervisor/include/common/ptdev.h
index 2bc8d1146..f586e20bd 100644
--- a/hypervisor/include/common/ptdev.h
+++ b/hypervisor/include/common/ptdev.h
@@ -19,6 +19,11 @@ enum intx_ctlr {
 #define PTDEV_INTR_MSI		(1U << 0U)
 #define PTDEV_INTR_INTX		(1U << 1U)
 
+#define GPU_OPREGION_SIZE	0x5000U /* size of the OpRegion window: 5 pages (20KB) */
+#define GPU_OPREGION_GPA	0x40880000U /* guest physical address of the OpRegion window */
+#define PCIR_ASLS_CTL		0xfcU /* register offset in PCI configuration space for the OpRegion base address */
+#define PCIM_ASLS_OPREGION_MASK	0xfffff000U /* selects the 4KB-aligned page base of the OpRegion address */
+
 #define INVALID_PTDEV_ENTRY_ID 0xffffU
 
 #define DEFINE_MSI_SID(name, a, b)	\