From 6804537f2c0e9017461b03fee68932d6d2b6773d Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 10 Oct 2023 14:01:16 +0000 Subject: [PATCH 01/26] XMM Signed-off-by: Nicolas Saenz Julienne --- arch/x86/include/asm/hyperv-tlfs.h | 5 +--- arch/x86/kvm/hyperv.c | 44 ++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 4 +++ 3 files changed, 49 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 2ff26f53cd6244..fe6a94acfbb906 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -49,7 +49,7 @@ /* Support for physical CPU dynamic partitioning events is available*/ #define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE BIT(3) /* - * Support for passing hypercall input parameter block via XMM + * Support for passing hypercall input and output parameter block via XMM * registers is available */ #define HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE BIT(4) @@ -387,9 +387,6 @@ struct hv_tsc_emulation_status { #define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001 #define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12 -/* Number of XMM registers used in hypercall input/output */ -#define HV_HYPERCALL_MAX_XMM_REGISTERS 6 - struct hv_nested_enlightenments_control { struct { __u32 directhypercall:1; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 238afd7335e46d..7222a88d229dce 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1815,6 +1815,7 @@ struct kvm_hv_hcall { u16 rep_idx; bool fast; bool rep; + bool xmm_dirty; sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS]; /* @@ -2323,6 +2324,18 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) } } +static void kvm_hv_hypercall_set_xmm_regs(struct kvm_vcpu *vcpu) +{ + u64 *xmm = vcpu->run->hyperv.u.hcall.xmm; + int reg; + + kvm_fpu_get(); + for (reg = 0; reg < HV_HYPERCALL_MAX_XMM_REGISTERS; reg++) + //TODO: This is not great :( + _kvm_write_sse_reg(reg, &(const sse128_t){sse128(xmm[reg * 2], xmm[(reg * 2) + 1])}); + kvm_fpu_put(); +} + static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result) { u32 tlb_lock_count = 0; @@ -2348,6 +2361,13 @@ static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result) static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) { + u16 call = vcpu->run->hyperv.u.hcall.input & 0xffff; + bool fast = !!(vcpu->run->hyperv.u.hcall.input & HV_HYPERCALL_FAST_BIT); + + //TODO: Not in love with this approach + if (call == HVCALL_GET_VP_REGISTERS && fast) + kvm_hv_hypercall_set_xmm_regs(vcpu); + return kvm_hv_hypercall_complete(vcpu, vcpu->run->hyperv.u.hcall.result); } @@ -2414,6 +2434,20 @@ static void kvm_hv_hypercall_read_xmm(struct kvm_hv_hcall *hc) for (reg = 0; reg < HV_HYPERCALL_MAX_XMM_REGISTERS; reg++) _kvm_read_sse_reg(reg, &hc->xmm[reg]); kvm_fpu_put(); + + /* It's not dirty because we've replaced any possible changes */ + hc->xmm_dirty = false; +} + +static void kvm_hv_hypercall_write_xmm(struct kvm_hv_hcall *hc) +{ + int reg; + + kvm_fpu_get(); + for (reg = 0; reg < HV_HYPERCALL_MAX_XMM_REGISTERS; reg++) + _kvm_write_sse_reg(reg, &hc->xmm[reg]); + kvm_fpu_put(); + hc->xmm_dirty = false; } static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) @@ -2623,6 +2657,9 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) break; } + if ((ret & HV_HYPERCALL_RESULT_MASK) == HV_STATUS_SUCCESS && hc.xmm_dirty) + kvm_hv_hypercall_write_xmm(&hc); + hypercall_complete: return kvm_hv_hypercall_complete(vcpu, ret); @@ -2632,6 +2669,12 
@@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) vcpu->run->hyperv.u.hcall.input = hc.param; vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa; vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa; + if (hc.fast) { + for (i = 0; i < HV_HYPERCALL_MAX_XMM_REGISTERS; i++) { + vcpu->run->hyperv.u.hcall.xmm[i * 2] = sse128_lo(hc.xmm[i]); + vcpu->run->hyperv.u.hcall.xmm[(i * 2) + 1] = sse128_hi(hc.xmm[i]); + } + } vcpu->arch.complete_userspace_io = kvm_hv_hypercall_complete_userspace; return 0; } @@ -2780,6 +2823,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->ebx |= HV_ENABLE_EXTENDED_HYPERCALLS; ent->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE; + ent->edx |= HV_X64_HYPERCALL_XMM_OUTPUT_AVAILABLE; ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 5b5820d19e7191..c5c3c1b6970dd6 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -210,6 +210,10 @@ struct kvm_hyperv_exit { __u64 input; __u64 result; __u64 params[2]; + //TODO: Maybe export sse128_t? + /* Number of XMM registers used in hypercall input/output */ + #define HV_HYPERCALL_MAX_XMM_REGISTERS 6 + __u64 xmm[HV_HYPERCALL_MAX_XMM_REGISTERS * 2]; } hcall; struct { __u32 msr; From 38b99a32fd64b48eeff3ca5b2e0c02387f0018f7 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 10 Oct 2023 14:07:37 +0000 Subject: [PATCH 02/26] Hypercall page Signed-off-by: Nicolas Saenz Julienne --- arch/x86/include/asm/kvm_host.h | 3 + arch/x86/include/uapi/asm/kvm.h | 6 + arch/x86/kvm/hyperv.c | 198 +++++++++++++++++++++++++----- include/asm-generic/hyperv-tlfs.h | 15 +++ include/uapi/linux/kvm.h | 8 ++ 5 files changed, 200 insertions(+), 30 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dff10051e9b63c..f9be5a6f74d864 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1105,6 +1105,9 @@ struct kvm_hv { u64 hv_tsc_emulation_status; u64 hv_invtsc_control; + union hv_register_vsm_code_page_offsets vsm_code_page_offsets32; + union hv_register_vsm_code_page_offsets vsm_code_page_offsets64; + /* How many vCPUs have VP index != vCPU index */ atomic_t num_mismatched_vp_indexes; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index a448d0964fc06e..eb0182f76b4a05 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -565,4 +565,10 @@ struct kvm_pmu_event_filter { #define KVM_X86_DEFAULT_VM 0 #define KVM_X86_SW_PROTECTED_VM 1 +/* Partition-wide VSM state; for KVM_HV_GET/SET_VSM_STATE */ +struct kvm_hv_vsm_state { + __u64 vsm_code_page_offsets64; + __u64 vsm_code_page_offsets32; +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 7222a88d229dce..bbe4d6190ed170 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -256,6 +256,163 @@ static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr) kvm_make_request(KVM_REQ_HV_EXIT, vcpu); } +static int patch_hypercall_page(struct kvm_vcpu *vcpu, u64 data) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_hv *hv = to_kvm_hv(kvm); + u8 instructions[0x30]; + int i = 0; + u64 addr; + + /* + * If Xen and Hyper-V hypercalls are both enabled, disambiguate + * the same way Xen itself does, by setting the bit 31 of EAX + * which is RsvdZ in the 32-bit Hyper-V hypercall ABI and just + * going to be clobbered on 64-bit. 
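+ * + * With VSM enabled the page additionally carries the VTL call/return stubs emitted below; their offsets within the page are recorded in vsm_code_page_offsets{32,64} so they can be reported to the guest through the HvRegisterVsmCodePageOffsets register.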
+ */ + if (kvm_xen_hypercall_enabled(kvm)) { + /* orl $0x80000000, %eax */ + instructions[i++] = 0x0d; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0x80; + } + + /* vmcall/vmmcall */ + static_call(kvm_x86_patch_hypercall)(vcpu, instructions + i); + i += 3; + + /* ret */ + ((unsigned char *)instructions)[i++] = 0xc3; + + /* VTL call/return entries */ + if (!kvm_xen_hypercall_enabled(kvm) && kvm->arch.hyperv.hv_enable_vsm) { + /* + * VTL call 32-bit entry prologue: + * mov %eax, %ecx + * mov $0x11, %eax + * jmp 0: + */ + hv->vsm_code_page_offsets32.vtl_call_offset = i; + instructions[i++] = 0x89; + instructions[i++] = 0xc1; + instructions[i++] = 0xb8; + instructions[i++] = 0x11; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0xeb; + instructions[i++] = 0xf3; + /* + * VTL return 32-bit entry prologue: + * mov %eax, %ecx + * mov $0x12, %eax + * jmp 0: + */ + hv->vsm_code_page_offsets32.vtl_return_offset = i; + instructions[i++] = 0x89; + instructions[i++] = 0xc1; + instructions[i++] = 0xb8; + instructions[i++] = 0x12; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0xeb; + instructions[i++] = 0xea; + +#ifdef CONFIG_X86_64 + /* + * VTL call 64-bit entry prologue: + * mov %rcx, %rax + * mov $0x11, %ecx + * jmp 0: + */ + hv->vsm_code_page_offsets64.vtl_call_offset = i; + instructions[i++] = 0x48; + instructions[i++] = 0x89; + instructions[i++] = 0xc8; + instructions[i++] = 0xb9; + instructions[i++] = 0x11; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0xeb; + instructions[i++] = 0xe0; + /* + * VTL return 64-bit entry prologue: + * mov %rcx, %rax + * mov $0x12, %ecx + * jmp 0: + */ + hv->vsm_code_page_offsets64.vtl_return_offset = i; + instructions[i++] = 0x48; + instructions[i++] = 0x89; + instructions[i++] = 0xc8; + instructions[i++] = 0xb9; + instructions[i++] = 0x12; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0xeb; + instructions[i++] = 0xd6; +#endif + } + addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK; + if (kvm_vcpu_write_guest(vcpu, addr, instructions, i)) + return 1; + + return 0; +} + +static int kvm_hv_overlay_completion(struct kvm_vcpu *vcpu) +{ + struct kvm_hyperv_exit *exit = &vcpu->run->hyperv; + u64 data = exit->u.overlay.gpa; + int r = exit->u.overlay.error; + + if (r) + goto out; + + switch (exit->u.overlay.msr) { + case HV_X64_MSR_GUEST_OS_ID: + break; + case HV_X64_MSR_HYPERCALL: + r = patch_hypercall_page(vcpu, data); + break; + default: + r = 1; + pr_err("%s: unknown overlay MSR, %x\n", __func__, + exit->u.overlay.msr); + } + +out: + if (r) { + if (exit->u.overlay.is_hypercall) + kvm_queue_exception(vcpu, UD_VECTOR); + else + kvm_inject_gp(vcpu, 0); + } + return 1; +} + +static int overlay_exit(struct kvm_vcpu *vcpu, u32 msr, u64 gpa, bool is_hypercall) +{ + struct kvm_hyperv_exit *exit = &to_hv_vcpu(vcpu)->exit; + + pr_info("%s, msr %x, gpa %llx\n", __func__, msr, gpa); + vcpu->run->exit_reason = KVM_EXIT_HYPERV; + exit->type = KVM_EXIT_HYPERV_OVERLAY; + exit->u.overlay.msr = msr; + exit->u.overlay.gpa = gpa; + exit->u.overlay.error = 0; + exit->u.overlay.is_hypercall = is_hypercall; + vcpu->arch.complete_userspace_io = kvm_hv_overlay_completion; + + kvm_make_request(KVM_REQ_HV_EXIT, vcpu); + return 0; +} + static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, 
u64 data, bool host) { @@ -1335,14 +1492,13 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, case HV_X64_MSR_GUEST_OS_ID: hv->hv_guest_os_id = data; /* setting guest os id to zero disables hypercall page */ - if (!hv->hv_guest_os_id) + if (!data) { hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; + if (kvm->arch.hyperv.hv_enable_vsm && !host) + return overlay_exit(vcpu, HV_X64_MSR_GUEST_OS_ID, data, false); + } break; - case HV_X64_MSR_HYPERCALL: { - u8 instructions[9]; - int i = 0; - u64 addr; - + case HV_X64_MSR_HYPERCALL: /* if guest os id is not set hypercall should remain disabled */ if (!hv->hv_guest_os_id) break; @@ -1351,34 +1507,16 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, break; } - /* - * If Xen and Hyper-V hypercalls are both enabled, disambiguate - * the same way Xen itself does, by setting the bit 31 of EAX - * which is RsvdZ in the 32-bit Hyper-V hypercall ABI and just - * going to be clobbered on 64-bit. - */ - if (kvm_xen_hypercall_enabled(kvm)) { - /* orl $0x80000000, %eax */ - instructions[i++] = 0x0d; - instructions[i++] = 0x00; - instructions[i++] = 0x00; - instructions[i++] = 0x00; - instructions[i++] = 0x80; + if (kvm->arch.hyperv.hv_enable_vsm) { + hv->hv_hypercall = data; + if (!host) + return overlay_exit(vcpu, HV_X64_MSR_HYPERCALL, data, false); + break; } - - /* vmcall/vmmcall */ - static_call(kvm_x86_patch_hypercall)(vcpu, instructions + i); - i += 3; - - /* ret */ - ((unsigned char *)instructions)[i++] = 0xc3; - - addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK; - if (kvm_vcpu_write_guest(vcpu, addr, instructions, i)) + if (patch_hypercall_page(vcpu, data)) return 1; hv->hv_hypercall = data; break; - } case HV_X64_MSR_REFERENCE_TSC: hv->hv_tsc_page = data; if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) { diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index fdac4a1714ec09..0344293d1f2d41 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -823,4 +823,19 @@ struct hv_mmio_write_input { u8 data[HV_HYPERCALL_MMIO_MAX_DATA_LENGTH]; } __packed; +#define HV_NUM_VTLS 2 +#define HV_INVALID_VTL ((u8) -1) +#define HV_ALL_VTLS ((u8) 0xF) + +/* + * VTL call/return hypercall page offsets register + */ +union hv_register_vsm_code_page_offsets { + u64 as_u64; + struct { + u64 vtl_call_offset:12; + u64 vtl_return_offset:12; + u64 reserved:40; + } __packed; +}; #endif diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index c5c3c1b6970dd6..1a8e29a079e148 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -196,6 +196,7 @@ struct kvm_hyperv_exit { #define KVM_EXIT_HYPERV_SYNIC 1 #define KVM_EXIT_HYPERV_HCALL 2 #define KVM_EXIT_HYPERV_SYNDBG 3 +#define KVM_EXIT_HYPERV_OVERLAY 4 __u32 type; __u32 pad1; union { @@ -224,6 +225,13 @@ struct kvm_hyperv_exit { __u64 recv_page; __u64 pending_page; } syndbg; + struct { + __u32 msr; /* kernel -> user */ + __u8 error; /* user -> kernel */ + __u8 is_hypercall; /* kernel -> user */ + __u8 pad; + __u64 gpa; /* kernel -> user */ + } overlay; } u; }; From 142db2efe9a6fac30dab7ee24e7b04e700de7aff Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 10 Oct 2023 14:09:44 +0000 Subject: [PATCH 03/26] vp assist page Signed-off-by: Nicolas Saenz Julienne --- arch/x86/include/asm/hyperv-tlfs.h | 33 +++++++++++-- arch/x86/kvm/hyperv.c | 75 ++++++++++++++++++------------ 2 files changed, 74 insertions(+), 34 deletions(-) diff --git 
a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index fe6a94acfbb906..602f219c46bad6 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -398,14 +398,39 @@ struct hv_nested_enlightenments_control { } hypercallControls; } __packed; +struct hv_vp_vtl_control { + __u32 vtl_entry_reason; + + union { + __u8 as_u8; + struct { + __u8 vina_asserted:1; + __u8 reserved0:7; + }; + }; + + __u8 reserved1[3]; + + union { + struct { + __u64 vtl_ret_x64rax; + __u64 vtl_ret_x64rcx; + }; + + struct { + __u32 vtl_return_x86_eax; + __u32 vtl_return_x86_ecx; + __u32 vtl_return_x86_edx; + __u32 reserved2; + }; + }; +}; + /* Define virtual processor assist page structure. */ struct hv_vp_assist_page { __u32 apic_assist; __u32 reserved1; - __u32 vtl_entry_reason; - __u32 vtl_reserved; - __u64 vtl_ret_x64rax; - __u64 vtl_ret_x64rcx; + struct hv_vp_vtl_control vtl_control; struct hv_nested_enlightenments_control nested_control; __u8 enlighten_vmentry; __u8 reserved2[7]; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index bbe4d6190ed170..f20ea9b1058ae0 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -365,6 +365,8 @@ static int patch_hypercall_page(struct kvm_vcpu *vcpu, u64 data) return 0; } +static int set_vp_assist_page(struct kvm_vcpu *vcpu, u64 data); + static int kvm_hv_overlay_completion(struct kvm_vcpu *vcpu) { struct kvm_hyperv_exit *exit = &vcpu->run->hyperv; @@ -380,6 +382,9 @@ static int kvm_hv_overlay_completion(struct kvm_vcpu *vcpu) case HV_X64_MSR_HYPERCALL: r = patch_hypercall_page(vcpu, data); break; + case HV_X64_MSR_VP_ASSIST_PAGE: + r = set_vp_assist_page(vcpu, data); + break; default: r = 1; pr_err("%s: unknown overlay MSR, %x\n", __func__, @@ -1065,6 +1070,42 @@ void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu) vcpu->arch.hyperv = NULL; } +/* Write to VP assist page register */ +static int set_vp_assist_page(struct kvm_vcpu *vcpu, u64 data) +{ + u64 gfn; + unsigned long addr; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + trace_printk("vcpu_id %d, gpa %llx\n", vcpu->vcpu_id, data); + if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) { + hv_vcpu->hv_vapic = data; + if (kvm_lapic_set_pv_eoi(vcpu, 0, 0)) + return 1; + return 0; + } + + gfn = data >> HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT; + addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); + if (kvm_is_error_hva(addr)) + return 1; + + /* + * Clear apic_assist portion of struct hv_vp_assist_page + * only, there can be valuable data in the rest which needs + * to be preserved e.g. on migration.
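+ * With VSM the preserved data now includes the vtl_control block, which carries the VTL entry reason and the VTL-return RAX/RCX values across VTL switches.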
+ */ + if (__put_user(0, (u32 __user *)addr)) + return 1; + hv_vcpu->hv_vapic = data; + kvm_vcpu_mark_page_dirty(vcpu, gfn); + if (kvm_lapic_set_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED, + sizeof(struct hv_vp_assist_page))) + return 1; + return 0; + +} + bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); @@ -1635,36 +1676,10 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) hv_vcpu->vp_index = new_vp_index; break; } - case HV_X64_MSR_VP_ASSIST_PAGE: { - u64 gfn; - unsigned long addr; - - if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) { - hv_vcpu->hv_vapic = data; - if (kvm_lapic_set_pv_eoi(vcpu, 0, 0)) - return 1; - break; - } - gfn = data >> HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT; - addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); - if (kvm_is_error_hva(addr)) - return 1; - - /* - * Clear apic_assist portion of struct hv_vp_assist_page - * only, there can be valuable data in the rest which needs - * to be preserved e.g. on migration. - */ - if (__put_user(0, (u32 __user *)addr)) - return 1; - hv_vcpu->hv_vapic = data; - kvm_vcpu_mark_page_dirty(vcpu, gfn); - if (kvm_lapic_set_pv_eoi(vcpu, - gfn_to_gpa(gfn) | KVM_MSR_ENABLED, - sizeof(struct hv_vp_assist_page))) - return 1; - break; - } + case HV_X64_MSR_VP_ASSIST_PAGE: + if (vcpu->kvm->arch.hyperv.hv_enable_vsm && !host) + return overlay_exit(vcpu, HV_X64_MSR_VP_ASSIST_PAGE, data, false); + return set_vp_assist_page(vcpu, data); case HV_X64_MSR_EOI: return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); case HV_X64_MSR_ICR: From 3c0e57ea613ca45482b1b3d29244313d4f8fa0a3 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 10 Oct 2023 14:13:18 +0000 Subject: [PATCH 04/26] VSM CAP Signed-off-by: Nicolas Saenz Julienne --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/hyperv.h | 5 +++++ arch/x86/kvm/x86.c | 5 +++++ include/uapi/linux/kvm.h | 1 + 4 files changed, 14 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f9be5a6f74d864..3137e02a7f6319 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1119,6 +1119,9 @@ struct kvm_hv { struct hv_partition_assist_pg *hv_pa_pg; struct kvm_hv_syndbg hv_syndbg; + + /* status of KVM_CAP_HYPERV_VSM */ + bool hv_enable_vsm; }; struct msr_bitmap_range { diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index f83b8db72b118c..d7c2bbaf0df5a0 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -238,4 +238,9 @@ static inline int kvm_hv_verify_vp_assist(struct kvm_vcpu *vcpu) int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu); +static inline bool kvm_hv_vsm_enabled(struct kvm *kvm) +{ + return !!(kvm->arch.hyperv.hv_enable_vsm); +} + #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e3eb608b6692c7..a58d2964242555 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4485,6 +4485,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_CPUID: case KVM_CAP_HYPERV_ENFORCE_CPUID: case KVM_CAP_SYS_HYPERV_CPUID: + case KVM_CAP_HYPERV_VSM: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: @@ -6518,6 +6519,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, } mutex_unlock(&kvm->lock); break; + case KVM_CAP_HYPERV_VSM: + kvm->arch.hyperv.hv_enable_vsm = true; + r = 0; + break; default: r = -EINVAL; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 1a8e29a079e148..4459c17ea51566 100644 --- 
a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1231,6 +1231,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_MEMORY_ATTRIBUTES 232 #define KVM_CAP_GUEST_MEMFD 233 #define KVM_CAP_VM_TYPES 234 +#define KVM_CAP_HYPERV_VSM 235 #ifdef KVM_CAP_IRQ_ROUTING From 9cecc7ae6ab1188e5f4acbd7272147218bfe5f54 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Mon, 30 Oct 2023 18:39:35 +0000 Subject: [PATCH 05/26] KVM: x86: Don't use hv_timer if CAP_HYPERV_VSM enabled VSM's VTLs are modeled by using a distinct vCPU per VTL. While one VTL is running, the rest of the vCPUs that track other VTLs are left idle. This doesn't play well with the approach of tracking emulated timer expiration by using the VMX preemption timer, since inactive VTLs' timers are still meant to run and interrupt execution of lower VTLs when relevant. Signed-off-by: Nicolas Saenz Julienne --- arch/x86/kvm/lapic.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3e977dbbf9933d..d34ab093e8b938 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -152,9 +152,10 @@ static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu) bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu) { - return kvm_x86_ops.set_hv_timer - && !(kvm_mwait_in_guest(vcpu->kvm) || - kvm_can_post_timer_interrupt(vcpu)); + return kvm_x86_ops.set_hv_timer && + !(kvm_mwait_in_guest(vcpu->kvm) || + kvm_can_post_timer_interrupt(vcpu)) && + !(to_kvm_hv(vcpu->kvm)->hv_enable_vsm); } static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu) From 36ee1a37e6a69d0885b2aa97a64039d6f5c9ce50 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 10 Oct 2023 14:16:32 +0000 Subject: [PATCH 06/26] IPIs Signed-off-by: Nicolas Saenz Julienne --- arch/x86/kvm/hyperv.c | 27 +++++++++++++++++++-------- arch/x86/kvm/trace.h | 20 ++++++++++++-------- include/asm-generic/hyperv-tlfs.h | 6 ++++-- 3 files changed, 35 insertions(+), 18 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index f20ea9b1058ae0..64f64e5819cb1f 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2297,8 +2297,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) ((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET); } +#define VTL_MASK 0x0 static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector, - u64 *sparse_banks, u64 valid_bank_mask) + u64 *sparse_banks, u64 valid_bank_mask, int vtl) { struct kvm_lapic_irq irq = { .delivery_mode = APIC_DM_FIXED, @@ -2309,10 +2310,13 @@ static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector, kvm_for_each_vcpu(i, vcpu, kvm) { if (sparse_banks && - !hv_is_vp_in_sparse_set(kvm_hv_get_vpindex(vcpu), + !hv_is_vp_in_sparse_set(kvm_hv_get_vpindex(vcpu) & VTL_MASK, valid_bank_mask, sparse_banks)) continue; + if (get_active_vtl(vcpu) != vtl) + continue; + /* We fail only when APIC is disabled */ kvm_apic_set_irq(vcpu, &irq, NULL); } @@ -2325,13 +2329,19 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) struct kvm *kvm = vcpu->kvm; struct hv_send_ipi_ex send_ipi_ex; struct hv_send_ipi send_ipi; + union hv_input_vtl *in_vtl; u64 valid_bank_mask; u32 vector; bool all_cpus; + u8 vtl; + + /* VTL is at the same offset on both IPI types */ + in_vtl = &send_ipi.in_vtl; + vtl = in_vtl->use_target_vtl ?
in_vtl->target_vtl : get_active_vtl(vcpu); if (hc->code == HVCALL_SEND_IPI) { if (!hc->fast) { - if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi, + if (unlikely(kvm_vcpu_read_guest(vcpu, hc->ingpa, &send_ipi, sizeof(send_ipi)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; sparse_banks[0] = send_ipi.cpu_mask; @@ -2346,10 +2356,10 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) all_cpus = false; valid_bank_mask = BIT_ULL(0); - trace_kvm_hv_send_ipi(vector, sparse_banks[0]); + trace_kvm_hv_send_ipi(vector, sparse_banks[0], vtl); } else { if (!hc->fast) { - if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi_ex, + if (unlikely(kvm_vcpu_read_guest(vcpu, hc->ingpa, &send_ipi_ex, sizeof(send_ipi_ex)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; } else { @@ -2360,7 +2370,8 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector, send_ipi_ex.vp_set.format, - send_ipi_ex.vp_set.valid_bank_mask); + send_ipi_ex.vp_set.valid_bank_mask, + vtl); vector = send_ipi_ex.vector; valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask; @@ -2390,9 +2401,9 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) return HV_STATUS_INVALID_HYPERCALL_INPUT; if (all_cpus) - kvm_hv_send_ipi_to_many(kvm, vector, NULL, 0); + kvm_hv_send_ipi_to_many(kvm, vector, NULL, 0, vtl); else - kvm_hv_send_ipi_to_many(kvm, vector, sparse_banks, valid_bank_mask); + kvm_hv_send_ipi_to_many(kvm, vector, sparse_banks, valid_bank_mask, vtl); ret_success: return HV_STATUS_SUCCESS; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 83843379813ee3..ab8839c47bc763 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1606,42 +1606,46 @@ TRACE_EVENT(kvm_hv_flush_tlb_ex, * Tracepoints for kvm_hv_send_ipi. 
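+ * Both variants now record the VTL targeted by the IPI, so traces can tell cross-VTL interrupts apart from same-VTL ones.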
*/ TRACE_EVENT(kvm_hv_send_ipi, - TP_PROTO(u32 vector, u64 processor_mask), - TP_ARGS(vector, processor_mask), + TP_PROTO(u32 vector, u64 processor_mask, u8 vtl), + TP_ARGS(vector, processor_mask, vtl), TP_STRUCT__entry( __field(u32, vector) __field(u64, processor_mask) + __field(u8, vtl) ), TP_fast_assign( __entry->vector = vector; __entry->processor_mask = processor_mask; + __entry->vtl = vtl; ), - TP_printk("vector %x processor_mask 0x%llx", - __entry->vector, __entry->processor_mask) + TP_printk("vector %x processor_mask 0x%llx vtl %d", + __entry->vector, __entry->processor_mask, __entry->vtl) ); TRACE_EVENT(kvm_hv_send_ipi_ex, - TP_PROTO(u32 vector, u64 format, u64 valid_bank_mask), - TP_ARGS(vector, format, valid_bank_mask), + TP_PROTO(u32 vector, u64 format, u64 valid_bank_mask, u8 vtl), + TP_ARGS(vector, format, valid_bank_mask, vtl), TP_STRUCT__entry( __field(u32, vector) __field(u64, format) __field(u64, valid_bank_mask) + __field(u8, vtl) ), TP_fast_assign( __entry->vector = vector; __entry->format = format; __entry->valid_bank_mask = valid_bank_mask; + __entry->vtl = vtl; ), - TP_printk("vector %x format %llx valid_bank_mask 0x%llx", + TP_printk("vector %x format %llx valid_bank_mask 0x%llx vtl %d", __entry->vector, __entry->format, - __entry->valid_bank_mask) + __entry->valid_bank_mask, __entry->vtl) ); TRACE_EVENT(kvm_pv_tlb_flush, diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index 0344293d1f2d41..ff0795aa64580b 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -424,14 +424,16 @@ struct hv_vpset { /* HvCallSendSyntheticClusterIpi hypercall */ struct hv_send_ipi { u32 vector; - u32 reserved; + union hv_input_vtl in_vtl; + u8 reserved[3]; u64 cpu_mask; } __packed; /* HvCallSendSyntheticClusterIpiEx hypercall */ struct hv_send_ipi_ex { u32 vector; - u32 reserved; + union hv_input_vtl in_vtl; + u8 reserved[3]; struct hv_vpset vp_set; } __packed; From 7ac86be873e5512df4ef248eb332cc766db422c2 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 10 Oct 2023 14:19:16 +0000 Subject: [PATCH 07/26] VP registers Signed-off-by: Nicolas Saenz Julienne --- arch/x86/kvm/hyperv.c | 49 +++++++++++++++++++++++++++++++ include/asm-generic/hyperv-tlfs.h | 1 + 2 files changed, 50 insertions(+) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 64f64e5819cb1f..619b2dc60eb0e0 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2584,6 +2584,8 @@ static bool is_xmm_fast_hypercall(struct kvm_hv_hcall *hc) case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: case HVCALL_SEND_IPI_EX: + case HVCALL_GET_VP_REGISTERS: + case HVCALL_SET_VP_REGISTERS: return true; } @@ -2664,11 +2666,51 @@ static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) return true; } +static bool is_hyperv_feature_advertised(struct kvm_vcpu *vcpu, enum kvm_reg reg, u64 feature_mask) +{ + struct kvm_cpuid_entry2 *entry; + u64 regval; + + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES); + if (!entry) + return false; + + switch (reg) { + case VCPU_REGS_RAX: regval = entry->eax; break; + case VCPU_REGS_RBX: regval = entry->ebx; break; + case VCPU_REGS_RDX: regval = entry->edx; break; + default: return false; + }; + + return (regval & feature_mask) == feature_mask; +} + +static bool is_hypercall_advertised(struct kvm_vcpu *vcpu, u16 code) +{ + u64 feature_mask; + enum kvm_reg reg; + + /* Some hypercalls are advertised by default, the others are not */ + 
switch (code) { + case HVCALL_GET_VP_REGISTERS: + case HVCALL_SET_VP_REGISTERS: + feature_mask = HV_ACCESS_VP_REGISTERS; + reg = VCPU_REGS_RBX; + break; + default: + /* everything else is advertised by default */ + return true; + } + + return is_hyperv_feature_advertised(vcpu, reg, feature_mask); +} + int kvm_hv_hypercall(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_hv_hcall hc; u64 ret = HV_STATUS_SUCCESS; + int i; /* * hypercall generates UD from non zero cpl and real mode @@ -2726,6 +2768,9 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) kvm_hv_hypercall_read_xmm(&hc); } + if (unlikely(!is_hypercall_advertised(vcpu, hc.code))) + return kvm_hv_hypercall_complete(vcpu, HV_STATUS_INVALID_HYPERCALL_CODE); + switch (hc.code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: if (unlikely(hc.rep || hc.var_cnt)) { @@ -2816,6 +2861,9 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) break; } goto hypercall_userspace_exit; + case HVCALL_GET_VP_REGISTERS: + case HVCALL_SET_VP_REGISTERS: + goto hypercall_userspace_exit; default: ret = HV_STATUS_INVALID_HYPERCALL_CODE; break; @@ -2985,6 +3033,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->ebx |= HV_POST_MESSAGES; ent->ebx |= HV_SIGNAL_EVENTS; ent->ebx |= HV_ENABLE_EXTENDED_HYPERCALLS; + ent->ebx |= HV_ACCESS_VP_REGISTERS; ent->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE; ent->edx |= HV_X64_HYPERCALL_XMM_OUTPUT_AVAILABLE; diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index ff0795aa64580b..4e290d448052ea 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -89,6 +89,7 @@ #define HV_ACCESS_STATS BIT(8) #define HV_DEBUGGING BIT(11) #define HV_CPU_MANAGEMENT BIT(12) +#define HV_ACCESS_VP_REGISTERS BIT(17) #define HV_ENABLE_EXTENDED_HYPERCALLS BIT(20) #define HV_ISOLATION BIT(22) From 287ac0aad9a35cf5c6c2694a238014367b772b73 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 10 Oct 2023 14:20:25 +0000 Subject: [PATCH 08/26] GET VSM STATE Signed-off-by: Nicolas Saenz Julienne --- arch/x86/kvm/hyperv.c | 18 ++++++++++++++++++ arch/x86/kvm/hyperv.h | 3 +++ arch/x86/kvm/x86.c | 41 ++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 4 ++++ 4 files changed, 66 insertions(+) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 619b2dc60eb0e0..543d388034ced3 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -3121,3 +3121,21 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, return 0; } + +int kvm_vm_ioctl_get_hv_vsm_state(struct kvm *kvm, struct kvm_hv_vsm_state *state) +{ + struct kvm_hv* hv = &kvm->arch.hyperv; + + state->vsm_code_page_offsets64 = hv->vsm_code_page_offsets64.as_u64; + state->vsm_code_page_offsets32 = hv->vsm_code_page_offsets32.as_u64; + return 0; +} + +int kvm_vm_ioctl_set_hv_vsm_state(struct kvm *kvm, struct kvm_hv_vsm_state *state) +{ + struct kvm_hv* hv = &kvm->arch.hyperv; + + hv->vsm_code_page_offsets64.as_u64 = state->vsm_code_page_offsets64; + hv->vsm_code_page_offsets32.as_u64 = state->vsm_code_page_offsets32; + return 0; +} diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index d7c2bbaf0df5a0..e431276dbd1439 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -243,4 +243,7 @@ static inline bool kvm_hv_vsm_enabled(struct kvm *kvm) return !!(kvm->arch.hyperv.hv_enable_vsm); } +int kvm_vm_ioctl_get_hv_vsm_state(struct kvm *kvm, struct kvm_hv_vsm_state *state); +int kvm_vm_ioctl_set_hv_vsm_state(struct kvm 
*kvm, struct kvm_hv_vsm_state *state); + #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a58d2964242555..7a989dc803fe05 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7117,6 +7117,47 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) r = kvm_vm_ioctl_set_msr_filter(kvm, &filter); break; } + case KVM_HV_GET_VSM_STATE: { + struct kvm_hv_vsm_state *vsm_state; + + r = -EINVAL; + if (!kvm->arch.hyperv.hv_enable_vsm) + goto out; + + r = -ENOMEM; + vsm_state = kzalloc(sizeof(*vsm_state), GFP_USER | __GFP_NOWARN); + if (!vsm_state) + goto out; + + r = kvm_vm_ioctl_get_hv_vsm_state(kvm, vsm_state); + if (r) + goto out_get_vsm_state; + + r = -EFAULT; + if (copy_to_user(argp, vsm_state, sizeof(*vsm_state))) + goto out_get_vsm_state; + + r = 0; +out_get_vsm_state: + kfree(vsm_state); + break; + } + case KVM_HV_SET_VSM_STATE: { + struct kvm_hv_vsm_state *vsm_state; + + r = -EINVAL; + if (!kvm->arch.hyperv.hv_enable_vsm) + goto out; + + vsm_state = memdup_user(argp, sizeof(*vsm_state)); + if (IS_ERR(vsm_state)) { + r = PTR_ERR(vsm_state); + goto out; + } + r = kvm_vm_ioctl_set_hv_vsm_state(kvm, vsm_state); + kfree(vsm_state); + break; + } default: r = -ENOTTY; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 4459c17ea51566..51de34bbe922b9 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -2320,4 +2320,8 @@ struct kvm_create_guest_memfd { #define KVM_GUEST_MEMFD_ALLOW_HUGEPAGE (1ULL << 0) +/* Get/Set Hyper-V VSM state. Available with KVM_CAP_HYPERV_VSM */ +#define KVM_HV_GET_VSM_STATE _IOR(KVMIO, 0xd5, struct kvm_hv_vsm_state) +#define KVM_HV_SET_VSM_STATE _IOW(KVMIO, 0xd6, struct kvm_hv_vsm_state) + #endif /* __LINUX_KVM_H */ From d8c2493986840f4e407e093df03e9e729e61ad3d Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 10 Oct 2023 14:22:00 +0000 Subject: [PATCH 09/26] VSM_ACCESS HCALLS Signed-off-by: Nicolas Saenz Julienne --- arch/x86/kvm/hyperv.c | 19 +++++++++++++++++++ include/asm-generic/hyperv-tlfs.h | 7 ++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 543d388034ced3..4cfd2303e4bbdb 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2532,6 +2532,10 @@ static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) if (call == HVCALL_GET_VP_REGISTERS && fast) kvm_hv_hypercall_set_xmm_regs(vcpu); + //TODO move this to qemu + if (call == HVCALL_VTL_CALL || call == HVCALL_VTL_RETURN) + return kvm_skip_emulated_instruction(vcpu); + return kvm_hv_hypercall_complete(vcpu, vcpu->run->hyperv.u.hcall.result); } @@ -2586,6 +2590,7 @@ static bool is_xmm_fast_hypercall(struct kvm_hv_hcall *hc) case HVCALL_SEND_IPI_EX: case HVCALL_GET_VP_REGISTERS: case HVCALL_SET_VP_REGISTERS: + case HVCALL_MODIFY_VTL_PROTECTION_MASK: return true; } @@ -2697,6 +2702,14 @@ static bool is_hypercall_advertised(struct kvm_vcpu *vcpu, u16 code) feature_mask = HV_ACCESS_VP_REGISTERS; reg = VCPU_REGS_RBX; break; + case HVCALL_ENABLE_PARTITION_VTL: + case HVCALL_ENABLE_VP_VTL: + case HVCALL_MODIFY_VTL_PROTECTION_MASK: + case HVCALL_VTL_CALL: + case HVCALL_VTL_RETURN: + feature_mask = HV_ACCESS_VSM; + reg = VCPU_REGS_RBX; + break; default: /* everything else is advertised by default */ return true; @@ -2863,6 +2876,11 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) goto hypercall_userspace_exit; case HVCALL_GET_VP_REGISTERS: case HVCALL_SET_VP_REGISTERS: + case HVCALL_MODIFY_VTL_PROTECTION_MASK: + case 
HVCALL_ENABLE_PARTITION_VTL: + case HVCALL_ENABLE_VP_VTL: + case HVCALL_VTL_CALL: + case HVCALL_VTL_RETURN: goto hypercall_userspace_exit; default: ret = HV_STATUS_INVALID_HYPERCALL_CODE; @@ -3034,6 +3052,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->ebx |= HV_SIGNAL_EVENTS; ent->ebx |= HV_ENABLE_EXTENDED_HYPERCALLS; ent->ebx |= HV_ACCESS_VP_REGISTERS; + ent->ebx |= HV_ACCESS_VSM; ent->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE; ent->edx |= HV_X64_HYPERCALL_XMM_OUTPUT_AVAILABLE; diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index 4e290d448052ea..a11dc9175e5095 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -89,6 +89,7 @@ #define HV_ACCESS_STATS BIT(8) #define HV_DEBUGGING BIT(11) #define HV_CPU_MANAGEMENT BIT(12) +#define HV_ACCESS_VSM BIT(16) #define HV_ACCESS_VP_REGISTERS BIT(17) #define HV_ENABLE_EXTENDED_HYPERCALLS BIT(20) #define HV_ISOLATION BIT(22) @@ -147,9 +148,13 @@ union hv_reference_tsc_msr { /* Declare the various hypercall operations. */ #define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002 #define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003 -#define HVCALL_ENABLE_VP_VTL 0x000f #define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 #define HVCALL_SEND_IPI 0x000b +#define HVCALL_MODIFY_VTL_PROTECTION_MASK 0x000c +#define HVCALL_ENABLE_PARTITION_VTL 0x000d +#define HVCALL_ENABLE_VP_VTL 0x000f +#define HVCALL_VTL_CALL 0x0011 +#define HVCALL_VTL_RETURN 0x0012 #define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 #define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 #define HVCALL_SEND_IPI_EX 0x0015 From 2cd4b41a6ee5719ed105a9d133f32509613f61eb Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 10 Oct 2023 14:22:55 +0000 Subject: [PATCH 10/26] HV EXT_CAPS Signed-off-by: Nicolas Saenz Julienne --- arch/x86/kvm/hyperv.c | 30 +++++++++++++++++++++++++++++- arch/x86/kvm/trace.h | 15 +++++++++++++++ include/asm-generic/hyperv-tlfs.h | 1 + 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 4cfd2303e4bbdb..3486c38c8c72c9 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2621,6 +2621,21 @@ static void kvm_hv_hypercall_write_xmm(struct kvm_hv_hcall *hc) hc->xmm_dirty = false; } +static u64 kvm_hv_ext_query_capabilities(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) +{ + u64 caps = 0; /* No caps */ + + if (!hc->fast) { + if (unlikely(kvm_write_guest(vcpu->kvm, hc->outgpa, &caps, sizeof(caps)) != 0)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } else { + kvm_rdx_write(vcpu, caps); + } + + trace_kvm_hv_ext_query_capabilities(caps); + return HV_STATUS_SUCCESS; +} + static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) { if (!hv_vcpu->enforce_cpuid) @@ -2710,6 +2725,10 @@ static bool is_hypercall_advertised(struct kvm_vcpu *vcpu, u16 code) feature_mask = HV_ACCESS_VSM; reg = VCPU_REGS_RBX; break; + case HV_EXT_CALL_QUERY_CAPABILITIES: + feature_mask = HV_ENABLE_EXTENDED_HYPERCALLS; + reg = VCPU_REGS_RBX; + break; default: /* everything else is advertised by default */ return true; @@ -2868,7 +2887,16 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) } goto hypercall_userspace_exit; } - case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX: + case HV_EXT_CALL_QUERY_CAPABILITIES: { + if (unlikely(hc.rep_cnt)) { + ret = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + } + + ret = kvm_hv_ext_query_capabilities(vcpu, &hc); + break; + } + case HV_EXT_CALL_GET_BOOT_ZEROED_MEMORY ... 
HV_EXT_CALL_MAX: if (unlikely(hc.fast)) { ret = HV_STATUS_INVALID_PARAMETER; break; } diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index ab8839c47bc763..f0b6bb674e8001 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1648,6 +1648,21 @@ TRACE_EVENT(kvm_hv_send_ipi_ex, __entry->valid_bank_mask, __entry->vtl) ); +TRACE_EVENT(kvm_hv_ext_query_capabilities, + TP_PROTO(u64 caps), + TP_ARGS(caps), + + TP_STRUCT__entry( + __field(u64, caps) + ), + + TP_fast_assign( + __entry->caps = caps; + ), + + TP_printk("reported capabilities 0x%llx", __entry->caps) +); + TRACE_EVENT(kvm_pv_tlb_flush, TP_PROTO(unsigned int vcpu_id, bool need_flush_tlb), TP_ARGS(vcpu_id, need_flush_tlb), diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index a11dc9175e5095..0f142f3e494fde 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -182,6 +182,7 @@ union hv_reference_tsc_msr { /* Extended hypercalls */ #define HV_EXT_CALL_QUERY_CAPABILITIES 0x8001 +#define HV_EXT_CALL_GET_BOOT_ZEROED_MEMORY 0x8002 #define HV_EXT_CALL_MEMORY_HEAT_HINT 0x8003 #define HV_FLUSH_ALL_PROCESSORS BIT(0) From 6b11286e7c36d9a9a237992287bd439dc52a556e Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 10 Oct 2023 14:32:57 +0000 Subject: [PATCH 11/26] KVM: Allow polling vCPUs for events This is a follow-up RFC to David Woodhouse's proposal[1] to allow user-space exits on halt/mwait. A number of use cases have surfaced where it'd be beneficial to have a vCPU stop its execution in user-space, as opposed to having it sleep in-kernel, be it in order to make better use of the pCPU's time while the vCPU is halted, or to implement security features like Hyper-V's VSM. A problem with this approach is that user-space has no way of knowing whether the vCPU has pending events (interrupts, timers, etc.), so a new interface is needed to query whether any are pending. poll() turned out to be a very good fit. vCPUs being polled are now switched into a new mode, POLLING_FOR_EVENTS. This mode behaves similarly to OUTSIDE_GUEST_MODE, except in kvm_vcpu_kick(), which now wakes up the polling vCPU thread to signal attention is needed. On wakeup the polling thread checks whether the pending requests are relevant for the vCPU (the vCPU might be halted, or it might be a quiesced VTL vCPU, with different wakeup needs), and if so, exits back to user-space. This vCPU mode switch also serves as a synchronization point against asynchronous sources of interruption, which is an advantage over other approaches to this problem (e.g. using ioeventfd) that require extra synchronization to be viable. Ultimately, it's up to the code triggering the user-space exit to set the poll request mask. This allows different exit reasons to be woken up by different types of events. The request mask is reset upon re-entering KVM_RUN. This was tested alongside a Hyper-V VSM PoC that implements Virtual Trust Level (VTL) handling in user-space by using a distinct vCPU per VTL. Hence the out-of-tree code in 'hyperv.c'. Note that our approach requires HVCALL_VTL_RETURN to quiesce the vCPU in user-space, until a HVCALL_VTL_CALL is performed from a lower VTL, or an interrupt is targeted at that VTL.
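For illustration, a minimal user-space sketch of the intended flow (not part of this patch: vtl_vcpu_wait() is a hypothetical helper, the vCPU fd comes from the usual KVM_CREATE_VCPU path, and error handling is elided):

    #include <poll.h>

    /* Park a quiesced VTL vCPU until KVM signals it needs attention. */
    static void vtl_vcpu_wait(int vcpu_fd)
    {
            struct pollfd pfd = {
                    .fd = vcpu_fd,
                    .events = POLLIN,
            };

            /*
             * Blocks until kvm_vcpu_kick() marks the vCPU as kicked,
             * e.g. because a lower VTL issued HVCALL_VTL_CALL or an
             * interrupt targets this VTL. The kicked flag is cleared
             * again when the vCPU re-enters KVM_RUN.
             */
            poll(&pfd, 1, -1);
    }

Once poll() returns, the VMM simply re-enters KVM_RUN on that vCPU.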
[1] https://lore.kernel.org/lkml/1b52b557beb6606007f7ec5672eab0adf1606a34.camel@infradead.org/ Signed-off-by: Nicolas Saenz Julienne --- --- arch/x86/kvm/x86.c | 1 + include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 26 ++++++++++++++++++++++++++ 3 files changed, 29 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7a989dc803fe05..70d0b7118f6413 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10759,6 +10759,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* Store vcpu->apicv_active before vcpu->mode. */ smp_store_release(&vcpu->mode, IN_GUEST_MODE); + WRITE_ONCE(vcpu->kicked, false); kvm_vcpu_srcu_read_unlock(vcpu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 687589ce9f6302..71e1e8cf893600 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -336,6 +336,7 @@ struct kvm_vcpu { #endif int mode; u64 requests; + bool kicked; unsigned long guest_debug; struct mutex mutex; @@ -395,6 +396,7 @@ struct kvm_vcpu { */ struct kvm_memory_slot *last_used_slot; u64 last_used_slot_gen; + wait_queue_head_t wqh; }; /* diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ad9aab898a0c22..2d28381ffe4cf0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -497,12 +497,14 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) kvm_vcpu_set_dy_eligible(vcpu, false); vcpu->preempted = false; vcpu->ready = false; + vcpu->kicked = false; preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); vcpu->last_used_slot = NULL; /* Fill the stats id string for the vcpu */ snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d", task_pid_nr(current), id); + init_waitqueue_head(&vcpu->wqh); } static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) @@ -3969,7 +3971,12 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) cpu = READ_ONCE(vcpu->cpu); if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) smp_send_reschedule(cpu); + goto out; } + + if (!cmpxchg(&vcpu->kicked, false, true)) + wake_up_interruptible(&vcpu->wqh); + out: put_cpu(); } @@ -4174,6 +4181,24 @@ static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) return 0; } +static __poll_t kvm_vcpu_poll(struct file *file, poll_table *wait) +{ + struct kvm_vcpu *vcpu = file->private_data; + + poll_wait(file, &vcpu->wqh, wait); + + /* + * Make sure we read vcpu->kicked after adding the vcpu into + * the waitqueue list. 
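+ * Pairs with the cmpxchg() in kvm_vcpu_kick(): either the kicker finds this vCPU already on the waitqueue and wakes it, or the read below observes kicked == true, so a concurrent kick cannot be lost.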
+ */ + smp_mb(); + if (READ_ONCE(vcpu->kicked)) { + return EPOLLIN; + } + + return 0; +} + static int kvm_vcpu_release(struct inode *inode, struct file *filp) { struct kvm_vcpu *vcpu = filp->private_data; @@ -4186,6 +4211,7 @@ static const struct file_operations kvm_vcpu_fops = { .release = kvm_vcpu_release, .unlocked_ioctl = kvm_vcpu_ioctl, .mmap = kvm_vcpu_mmap, + .poll = kvm_vcpu_poll, .llseek = noop_llseek, KVM_COMPAT(kvm_vcpu_compat_ioctl), }; From 6d07f9fa9d7b65b3813386d1fff21ec22aea1b92 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Mon, 2 Oct 2023 09:16:57 +0000 Subject: [PATCH 12/26] VTL MMU role Signed-off-by: Nicolas Saenz Julienne --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/hyperv.c | 5 +++++ arch/x86/kvm/hyperv.h | 10 ++++++++++ arch/x86/kvm/mmu.h | 2 ++ arch/x86/kvm/mmu/mmu.c | 1 + 5 files changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3137e02a7f6319..d4ffb71b374f4f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -338,7 +338,8 @@ union kvm_mmu_page_role { unsigned ad_disabled:1; unsigned guest_mode:1; unsigned passthrough:1; - unsigned :5; + unsigned vtl:4; + unsigned :1; /* * This is left at the top of the word so that diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 3486c38c8c72c9..a93cd9f8b67c23 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -62,6 +62,11 @@ */ #define HV_EXT_CALL_MAX (HV_EXT_CALL_QUERY_CAPABILITIES + 64) +void kvm_tdp_mmu_role_set_hv_bits(struct kvm_vcpu *vcpu, union kvm_mmu_page_role *role) +{ + role->vtl = to_kvm_hv(vcpu->kvm)->hv_enable_vsm ? get_active_vtl(vcpu) : 0; +} + static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, bool vcpu_kick); diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index e431276dbd1439..e5dac7466acf98 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -64,6 +64,16 @@ static inline struct kvm_vcpu_hv *to_hv_vcpu(struct kvm_vcpu *vcpu) return vcpu->arch.hyperv; } +static inline u8 get_active_vtl(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + if (!hv_vcpu) + return 0; + + return hv_vcpu->vp_index; +} + static inline struct kvm_vcpu_hv_synic *to_hv_synic(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 253fb2093d5dad..39d28334c5e1d0 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -304,4 +304,6 @@ static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu, return gpa; return translate_nested_gpa(vcpu, gpa, access, exception); } + +void kvm_tdp_mmu_role_set_hv_bits(struct kvm_vcpu *vcpu, union kvm_mmu_page_role *role); #endif diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index baeba8fc1c38ea..e2d370ceb6a6a1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5271,6 +5271,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, role.level = kvm_mmu_get_tdp_level(vcpu); role.direct = true; role.has_4_byte_gpte = false; + kvm_tdp_mmu_role_set_hv_bits(vcpu, &role); return role; } From 7585ce4c02609b6c813305b2fdd80315daca4748 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 10 Oct 2023 15:48:14 +0000 Subject: [PATCH 13/26] Introduce non-executable faults Signed-off-by: Nicolas Saenz Julienne --- arch/x86/kvm/mmu/mmu.c | 6 +++++- arch/x86/kvm/mmu/mmu_internal.h | 3 +++ arch/x86/kvm/mmu/tdp_mmu.c | 8 ++++++-- 3 files changed, 14 insertions(+), 3 
deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e2d370ceb6a6a1..37525b13793575 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3244,6 +3244,7 @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) struct kvm_mmu_page *sp; int ret; gfn_t base_gfn = fault->gfn; + unsigned access = ACC_ALL; kvm_mmu_hugepage_adjust(vcpu, fault); @@ -3273,7 +3274,10 @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (WARN_ON_ONCE(it.level != fault->goal_level)) return -EFAULT; - ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL, + if (!fault->map_executable) + access &= ~ACC_EXEC_MASK; + + ret = mmu_set_spte(vcpu, fault->slot, it.sptep, access, base_gfn, fault->pfn, fault); if (ret == RET_PF_SPURIOUS) return ret; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index b66a7d47e0e4ef..de5e5596fb48c7 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -239,6 +239,7 @@ struct kvm_page_fault { kvm_pfn_t pfn; hva_t hva; bool map_writable; + bool map_executable; /* * Indicates the guest is trying to write a gfn that contains one or @@ -298,6 +299,8 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, .req_level = PG_LEVEL_4K, .goal_level = PG_LEVEL_4K, .is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT), + .map_writable = true, + .map_executable = true, }; int r; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 6cd4dd631a2fac..46f3e72ab770e9 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -957,14 +957,18 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, u64 new_spte; int ret = RET_PF_FIXED; bool wrprot = false; + unsigned access = ACC_ALL; if (WARN_ON_ONCE(sp->role.level != fault->goal_level)) return RET_PF_RETRY; + if (!fault->map_executable) + access &= ~ACC_EXEC_MASK; + if (unlikely(!fault->slot)) - new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); + new_spte = make_mmio_spte(vcpu, iter->gfn, access); else - wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn, + wrprot = make_spte(vcpu, sp, fault->slot, access, iter->gfn, fault->pfn, iter->old_spte, fault->prefetch, true, fault->map_writable, &new_spte); From 51366a7cc6742b15e57e9be4d876ae5ab3918366 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Mon, 2 Oct 2023 18:30:44 +0000 Subject: [PATCH 14/26] KVM: Add KVM_EXIT_MEMORY_FAULT exit Signed-off-by: Nicolas Saenz Julienne --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu/mmu.c | 4 ++-- arch/x86/kvm/mmu/mmu_internal.h | 4 ++++ arch/x86/kvm/mmu/mmutrace.h | 1 + include/linux/kvm_host.h | 8 +++++++- include/trace/events/kvm.h | 3 ++- include/uapi/linux/kvm.h | 6 ++++++ 7 files changed, 23 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d4ffb71b374f4f..ab4b77db56f1d6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1502,6 +1502,7 @@ struct kvm_vcpu_stat { u64 pf_fast; u64 pf_mmio_spte_created; u64 pf_guest; + u64 pf_user; u64 tlb_flush; u64 invlpg; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 37525b13793575..7eb3c5009c437d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4299,8 +4299,8 @@ static inline u8 kvm_max_level_for_order(int order) static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - 
kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT, - PAGE_SIZE, fault->write, fault->exec, + kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT, PAGE_SIZE, + fault->write, fault->exec, fault->user, fault->is_private); } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index de5e5596fb48c7..55fe901e448671 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -261,6 +261,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. * RET_PF_FIXED: The faulting entry has been fixed. * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. + * RET_PF_USER: need to exit to userspace to handle this fault. * * Any names added to this enum should be exported to userspace for use in * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h @@ -277,6 +278,7 @@ enum { RET_PF_INVALID, RET_PF_FIXED, RET_PF_SPURIOUS, + RET_PF_USER, }; static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, @@ -338,6 +340,8 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, vcpu->stat.pf_emulate++; else if (r == RET_PF_SPURIOUS) vcpu->stat.pf_spurious++; + else if (r == RET_PF_USER) + vcpu->stat.pf_user++; return r; } diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index ae86820cef697a..4a74b74861dff1 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -60,6 +60,7 @@ TRACE_DEFINE_ENUM(RET_PF_EMULATE); TRACE_DEFINE_ENUM(RET_PF_INVALID); TRACE_DEFINE_ENUM(RET_PF_FIXED); TRACE_DEFINE_ENUM(RET_PF_SPURIOUS); +TRACE_DEFINE_ENUM(RET_PF_USER); /* * A pagetable walk has started diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 71e1e8cf893600..918828bca8d1e2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2367,7 +2367,7 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr) static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, gpa_t gpa, gpa_t size, bool is_write, bool is_exec, - bool is_private) + bool is_read, bool is_private) { vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT; vcpu->run->memory_fault.gpa = gpa; @@ -2375,6 +2375,12 @@ static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, - /* RWX flags are not (yet) defined or communicated to userspace. */ vcpu->run->memory_fault.flags = 0; + if (is_read) + vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_READ; + if (is_write) + vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_WRITE; + if (is_exec) + vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_EXECUTE; if (is_private) vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE; } diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 3bd31ea23fee9e..aa34fdb16c90bb 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -17,7 +17,8 @@ ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL), \ ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH), ERSN(EPR),\ ERSN(SYSTEM_EVENT), ERSN(S390_STSI), ERSN(IOAPIC_EOI), \ - ERSN(HYPERV), ERSN(ARM_NISV), ERSN(X86_RDMSR), ERSN(X86_WRMSR) + ERSN(HYPERV), ERSN(ARM_NISV), ERSN(X86_RDMSR), ERSN(X86_WRMSR), \ + ERSN(MEMORY_FAULT) TRACE_EVENT(kvm_userspace_exit, TP_PROTO(__u32 reason, int errno), diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 51de34bbe922b9..62c847b195fabc 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -539,7 +539,13 @@ struct kvm_run { } notify; /* KVM_EXIT_MEMORY_FAULT */ struct { +#define KVM_MEMORY_EXIT_FLAG_READ (1ULL << 0) +#define KVM_MEMORY_EXIT_FLAG_WRITE (1ULL << 1) +#define KVM_MEMORY_EXIT_FLAG_EXECUTE (1ULL << 2) #define KVM_MEMORY_EXIT_FLAG_PRIVATE (1ULL << 3) +#define KVM_MEMORY_EXIT_NO_ACCESS \ (KVM_MEMORY_EXIT_FLAG_READ | KVM_MEMORY_EXIT_FLAG_WRITE | \ KVM_MEMORY_EXIT_FLAG_EXECUTE) __u64 flags; __u64 gpa; __u64 size; From 798985b4a67215f85ca2a3f03d23815133d6cc22 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 31 Oct 2023 18:32:08 +0000 Subject: [PATCH 15/26] KVM: Allow kvm_ioctl_set_mem_attributes() to select attrs array Signed-off-by: Nicolas Saenz Julienne --- arch/x86/kvm/mmu/mmu.c | 6 ++++-- include/linux/kvm_host.h | 3 +++ virt/kvm/kvm_main.c | 28 ++++++++++++++++++++-------- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7eb3c5009c437d..710de4fe047e85 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -7263,7 +7263,8 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, * Zapping SPTEs in this case ensures KVM will reassess whether or not * a hugepage can be used for affected ranges. */ - if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) + if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm) && + !kvm_hv_vsm_enabled(kvm))) return false; return kvm_unmap_gfn_range(kvm, range); } @@ -7320,7 +7321,8 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, * a range that has PRIVATE GFNs, and conversely converting a range to * SHARED may now allow hugepages.
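+ * The same reasoning applies when VSM is enabled: changing a range's RWX attributes on behalf of a VTL can likewise change which hugepage sizes are usable.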
*/ - if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) + if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm) && + !kvm_hv_vsm_enabled(kvm))) return false; /* diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 918828bca8d1e2..37a0bb6f99f0b0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2391,6 +2391,9 @@ static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn return xa_to_value(xa_load(&kvm->mem_attr_array, gfn)); } +int kvm_ioctl_set_mem_attributes(struct kvm *kvm, struct xarray *mem_attr_array, + u64 supported_attrs, + struct kvm_memory_attributes *attrs); bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end, unsigned long attrs); bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2d28381ffe4cf0..78cc4489af3f49 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2554,8 +2554,9 @@ static bool kvm_pre_set_memory_attributes(struct kvm *kvm, } /* Set @attributes for the gfn range [@start, @end). */ -static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, - unsigned long attributes) +static int kvm_set_mem_attributes(struct kvm *kvm, + struct xarray *mem_attr_array, gfn_t start, + gfn_t end, unsigned long attributes) { struct kvm_mmu_notifier_range pre_set_range = { .start = start, @@ -2590,7 +2591,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, * partway through setting the new attributes. */ for (i = start; i < end; i++) { - r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT); + r = xa_reserve(mem_attr_array, i, GFP_KERNEL_ACCOUNT); if (r) goto out_unlock; } @@ -2598,7 +2599,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, kvm_handle_gfn_range(kvm, &pre_set_range); for (i = start; i < end; i++) { - r = xa_err(xa_store(&kvm->mem_attr_array, i, entry, + r = xa_err(xa_store(mem_attr_array, i, entry, GFP_KERNEL_ACCOUNT)); KVM_BUG_ON(r, kvm); } @@ -2610,15 +2611,17 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, return r; } -static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm, - struct kvm_memory_attributes *attrs) + +int kvm_ioctl_set_mem_attributes(struct kvm *kvm, struct xarray *mem_attr_array, + u64 supported_attrs, + struct kvm_memory_attributes *attrs) { gfn_t start, end; /* flags is currently not used. 
*/ if (attrs->flags) return -EINVAL; - if (attrs->attributes & ~kvm_supported_mem_attributes(kvm)) + if (attrs->attributes & ~supported_attrs) return -EINVAL; if (attrs->size == 0 || attrs->address + attrs->size < attrs->address) return -EINVAL; @@ -2635,7 +2638,16 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm, */ BUILD_BUG_ON(sizeof(attrs->attributes) != sizeof(unsigned long)); - return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes); + return kvm_set_mem_attributes(kvm, mem_attr_array, start, end, + attrs->attributes); +} + +static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm, + struct kvm_memory_attributes *attrs) +{ + return kvm_ioctl_set_mem_attributes(kvm, &kvm->mem_attr_array, + kvm_supported_mem_attributes(kvm), + attrs); } #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ From c9cac30f8a09b6651ba7aec61cd73863e53fe0b9 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Mon, 2 Oct 2023 15:19:16 +0000 Subject: [PATCH 16/26] Introduce VSM KVM Device Signed-off-by: Nicolas Saenz Julienne --- arch/x86/kvm/Kconfig | 6 +++ arch/x86/kvm/Makefile | 5 ++- arch/x86/kvm/hyperv-vsm.c | 81 +++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/hyperv.h | 3 +- arch/x86/kvm/x86.c | 3 ++ include/uapi/linux/kvm.h | 5 +++ 6 files changed, 100 insertions(+), 3 deletions(-) create mode 100644 arch/x86/kvm/hyperv-vsm.c diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 8452ed0228cb6b..5d1d3eba8dfd40 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -166,4 +166,10 @@ config KVM_PROVE_MMU config KVM_EXTERNAL_WRITE_TRACKING bool +config KVM_HYPERV_VSM + bool "KVM Hyper-V Virtual Secure Mode (VSM) support" + help + Enables the KVM VSM device and all dependencies necessary to + emulate Hyper-V's VSM.
+ endif # VIRTUALIZATION diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 80e3fe184d17e6..e8cec84b0d4e67 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -11,8 +11,8 @@ include $(srctree)/virt/kvm/Makefile.kvm kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ - hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \ - mmu/spte.o + hyperv.o debugfs.o \ + mmu/mmu.o mmu/page_track.o mmu/spte.o ifdef CONFIG_HYPERV kvm-y += kvm_onhyperv.o @@ -21,6 +21,7 @@ endif kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o kvm-$(CONFIG_KVM_XEN) += xen.o kvm-$(CONFIG_KVM_SMM) += smm.o +kvm-$(CONFIG_KVM_HYPERV_VSM) += hyperv-vsm.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ vmx/hyperv.o vmx/nested.o vmx/posted_intr.o diff --git a/arch/x86/kvm/hyperv-vsm.c b/arch/x86/kvm/hyperv-vsm.c new file mode 100644 index 00000000000000..f3a15a4670936b --- /dev/null +++ b/arch/x86/kvm/hyperv-vsm.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM Microsoft Hyper-V VSM emulation + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "hyperv.h" + +#include + +struct kvm_hv_vtl_dev { + int vtl; + struct xarray mem_attrs; +}; + +static int kvm_hv_vtl_get_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + struct kvm_hv_vtl_dev *vtl_dev = dev->private; + + switch (attr->group) { + case KVM_DEV_HV_VTL_GROUP: + switch (attr->attr){ + case KVM_DEV_HV_VTL_GROUP_VTLNUM: + return put_user(vtl_dev->vtl, (u32 __user *)attr->addr); + } + } + + return -EINVAL; +} + +static void kvm_hv_vtl_release(struct kvm_device *dev) +{ + struct kvm_hv_vtl_dev *vtl_dev = dev->private; + + xa_destroy(&vtl_dev->mem_attrs); + kfree(vtl_dev); + kfree(dev); /* alloc by kvm_ioctl_create_device, free by .release */ +} + +static int kvm_hv_vtl_create(struct kvm_device *dev, u32 type); + +static struct kvm_device_ops kvm_hv_vtl_ops = { + .name = "kvm-hv-vtl", + .create = kvm_hv_vtl_create, + .release = kvm_hv_vtl_release, + .get_attr = kvm_hv_vtl_get_attr, +}; + +static int kvm_hv_vtl_create(struct kvm_device *dev, u32 type) +{ + struct kvm_hv_vtl_dev *vtl_dev; + struct kvm_device *tmp; + int vtl = 0; + + vtl_dev = kzalloc(sizeof(*vtl_dev), GFP_KERNEL_ACCOUNT); + if (!vtl_dev) + return -ENOMEM; + + /* Device creation is protected by kvm->lock */ + list_for_each_entry(tmp, &dev->kvm->devices, vm_node) + if (tmp->ops == &kvm_hv_vtl_ops) + vtl++; + + vtl_dev->vtl = vtl; + xa_init(&vtl_dev->mem_attrs); + dev->private = vtl_dev; + + return 0; +} + +int kvm_hv_vtl_dev_register(void) +{ + return kvm_register_device_ops(&kvm_hv_vtl_ops, KVM_DEV_TYPE_HV_VSM_VTL); +} + +void kvm_hv_vtl_dev_unregister(void) +{ + kvm_unregister_device_ops(KVM_DEV_TYPE_HV_VSM_VTL); +} diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index e5dac7466acf98..e1518594fc7573 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -255,5 +255,6 @@ static inline bool kvm_hv_vsm_enabled(struct kvm *kvm) int kvm_vm_ioctl_get_hv_vsm_state(struct kvm *kvm, struct kvm_hv_vsm_state *state); int kvm_vm_ioctl_set_hv_vsm_state(struct kvm *kvm, struct kvm_hv_vsm_state *state); - +int kvm_hv_vtl_dev_register(void); +void kvm_hv_vtl_dev_unregister(void); #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 70d0b7118f6413..2ca8f3795b7ee9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6520,6 +6520,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, mutex_unlock(&kvm->lock); break; case KVM_CAP_HYPERV_VSM: + 
kvm_hv_vtl_dev_register(); kvm->arch.hyperv.hv_enable_vsm = true; r = 0; break; @@ -9683,6 +9684,8 @@ void kvm_x86_vendor_exit(void) mutex_lock(&vendor_module_lock); kvm_x86_ops.hardware_enable = NULL; mutex_unlock(&vendor_module_lock); + + kvm_hv_vtl_dev_unregister(); } EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 62c847b195fabc..729a0ef91da9a9 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1476,6 +1476,9 @@ struct kvm_device_attr { #define KVM_DEV_VFIO_GROUP_DEL KVM_DEV_VFIO_FILE_DEL #define KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE 3 +#define KVM_DEV_HV_VTL_GROUP 1 +#define KVM_DEV_HV_VTL_GROUP_VTLNUM 1 + enum kvm_device_type { KVM_DEV_TYPE_FSL_MPIC_20 = 1, #define KVM_DEV_TYPE_FSL_MPIC_20 KVM_DEV_TYPE_FSL_MPIC_20 @@ -1499,6 +1502,8 @@ enum kvm_device_type { #define KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_RISCV_AIA, #define KVM_DEV_TYPE_RISCV_AIA KVM_DEV_TYPE_RISCV_AIA + KVM_DEV_TYPE_HV_VSM_VTL, +#define KVM_DEV_TYPE_HV_VSM_VTL KVM_DEV_TYPE_HV_VSM_VTL KVM_DEV_TYPE_MAX, }; From 517194af799ef1240733124d6231f317639b99ee Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Mon, 2 Oct 2023 18:25:28 +0000 Subject: [PATCH 17/26] hyperv-vsm: introduce attrs ioctl Signed-off-by: Nicolas Saenz Julienne --- arch/x86/kvm/hyperv-vsm.c | 30 +++++++++++++++++++++++++++++- include/uapi/linux/kvm.h | 4 ++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/hyperv-vsm.c b/arch/x86/kvm/hyperv-vsm.c index f3a15a4670936b..58e90ae5acaed3 100644 --- a/arch/x86/kvm/hyperv-vsm.c +++ b/arch/x86/kvm/hyperv-vsm.c @@ -9,6 +9,10 @@ #include +#define KVM_HV_VTL_ATTRS \ + (KVM_MEMORY_ATTRIBUTE_READ | KVM_MEMORY_ATTRIBUTE_WRITE | \ + KVM_MEMORY_ATTRIBUTE_EXECUTE | KVM_MEMORY_ATTRIBUTE_NO_ACCESS) + struct kvm_hv_vtl_dev { int vtl; struct xarray mem_attrs; @@ -39,12 +43,36 @@ static void kvm_hv_vtl_release(struct kvm_device *dev) kfree(dev); /* alloc by kvm_ioctl_create_device, free by .release */ } -static int kvm_hv_vtl_create(struct kvm_device *dev, u32 type); +static long kvm_hv_vtl_ioctl(struct kvm_device *dev, unsigned int ioctl, + unsigned long arg) +{ + switch (ioctl) { + case KVM_SET_MEMORY_ATTRIBUTES: { + struct kvm_hv_vtl_dev *vtl_dev = dev->private; + struct kvm_memory_attributes attrs; + int r; + + if (copy_from_user(&attrs, (void __user *)arg, sizeof(attrs))) + return -EFAULT; + + r = kvm_ioctl_set_mem_attributes(dev->kvm, &vtl_dev->mem_attrs, + KVM_HV_VTL_ATTRS, &attrs); + if (r) + return r; + break; + } + default: + return -ENOTTY; + } + + return 0; +} static struct kvm_device_ops kvm_hv_vtl_ops = { .name = "kvm-hv-vtl", .create = kvm_hv_vtl_create, .release = kvm_hv_vtl_release, + .ioctl = kvm_hv_vtl_ioctl, .get_attr = kvm_hv_vtl_get_attr, }; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 729a0ef91da9a9..07aafb046e4473 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -2319,7 +2319,11 @@ struct kvm_memory_attributes { __u64 flags; }; +#define KVM_MEMORY_ATTRIBUTE_READ (1ULL << 0) +#define KVM_MEMORY_ATTRIBUTE_WRITE (1ULL << 1) +#define KVM_MEMORY_ATTRIBUTE_EXECUTE (1ULL << 2) #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3) +#define KVM_MEMORY_ATTRIBUTE_NO_ACCESS (1ULL << 4) #define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd) From 4e388934b9f59f97598a680b43e505c321a5b2f8 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 10 Oct 2023 18:16:38 +0000 Subject: [PATCH 18/26] faultin 
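Enforce VTL memory protections when faulting in guest pages. kvm_hv_faultin_pfn() looks up the faulting gfn in the active VTL's mem_attrs xarray and, if the access violates the protections installed there, prepares a KVM_EXIT_MEMORY_FAULT exit and returns RET_PF_USER instead of mapping the page.

For illustration only (not part of this patch): a VMM could install the protections this path enforces along these lines, where vm_fd and gpa are placeholders and error handling is omitted:

	struct kvm_create_device cd = { .type = KVM_DEV_TYPE_HV_VSM_VTL };
	struct kvm_memory_attributes attrs = {
		.address = gpa,		/* page-aligned guest physical address */
		.size = 0x1000,		/* one page */
		.attributes = KVM_MEMORY_ATTRIBUTE_READ,	/* read-only: no W/X */
	};

	ioctl(vm_fd, KVM_CREATE_DEVICE, &cd);			/* one device per VTL */
	ioctl(cd.fd, KVM_SET_MEMORY_ATTRIBUTES, &attrs);	/* kvm_hv_vtl_ioctl() */

With that in place, a write or instruction fetch to the page by a vCPU running in the matching VTL reaches kvm_hv_faultin_pfn() and is forwarded to userspace.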
Signed-off-by: Nicolas Saenz Julienne --- arch/x86/kvm/Kconfig | 1 + arch/x86/kvm/hyperv-vsm.c | 71 +++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/hyperv.h | 2 ++ arch/x86/kvm/mmu/mmu.c | 12 +++++++ 4 files changed, 86 insertions(+) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 5d1d3eba8dfd40..41e11bbc1e6eee 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -168,6 +168,7 @@ config KVM_EXTERNAL_WRITE_TRACKING config KVM_HYPERV_VSM bool "KVM Hyper-V Virtual Secure Mode (VSM) support" + select KVM_GENERIC_MEMORY_ATTRIBUTES help Enables the KVM VSM device and all dependencies necessary to emulate Hyper-V's VSM. diff --git a/arch/x86/kvm/hyperv-vsm.c b/arch/x86/kvm/hyperv-vsm.c index 58e90ae5acaed3..a6facbe83f6b5f 100644 --- a/arch/x86/kvm/hyperv-vsm.c +++ b/arch/x86/kvm/hyperv-vsm.c @@ -5,6 +5,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include "mmu/mmu_internal.h" #include "hyperv.h" #include @@ -18,6 +19,59 @@ struct kvm_hv_vtl_dev { struct xarray mem_attrs; }; +static struct xarray *kvm_hv_vsm_get_memprots(struct kvm_vcpu *vcpu); + + +bool kvm_hv_vsm_access_valid(struct kvm_page_fault *fault, unsigned long attrs) +{ + if (attrs == KVM_MEMORY_ATTRIBUTE_NO_ACCESS) + return false; + + if (fault->write && !(attrs & KVM_MEMORY_ATTRIBUTE_WRITE)) + return false; + + if (fault->exec && !(attrs & KVM_MEMORY_ATTRIBUTE_EXECUTE)) + return false; + + return true; +} + +static unsigned long kvm_hv_vsm_get_memory_attributes(struct kvm_vcpu *vcpu, + gfn_t gfn) +{ + struct xarray *prots = kvm_hv_vsm_get_memprots(vcpu); + + if (!prots) + return 0; + + return xa_to_value(xa_load(prots, gfn)); +} + +/* + * Check the faulting gfn against the active VTL's memory protections. Returns + * RET_PF_CONTINUE if the access is allowed, or RET_PF_USER after preparing a + * KVM_EXIT_MEMORY_FAULT userspace exit if it isn't. + */ +int kvm_hv_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +{ + unsigned long attrs; + + attrs = kvm_hv_vsm_get_memory_attributes(vcpu, fault->gfn); + if (!attrs) + return RET_PF_CONTINUE; + + if (kvm_hv_vsm_access_valid(fault, attrs)) { + fault->map_executable = attrs & KVM_MEMORY_ATTRIBUTE_EXECUTE; + fault->map_writable = attrs & KVM_MEMORY_ATTRIBUTE_WRITE; + return RET_PF_CONTINUE; + } + + kvm_prepare_memory_fault_exit(vcpu, fault->addr, PAGE_SIZE, + fault->write, fault->exec, fault->user, + fault->is_private); + return RET_PF_USER; +} + static int kvm_hv_vtl_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { @@ -68,6 +117,8 @@ static long kvm_hv_vtl_ioctl(struct kvm_device *dev, unsigned int ioctl, return 0; } +static int kvm_hv_vtl_create(struct kvm_device *dev, u32 type); + static struct kvm_device_ops kvm_hv_vtl_ops = { .name = "kvm-hv-vtl", .create = kvm_hv_vtl_create, @@ -76,6 +127,21 @@ static struct kvm_device_ops kvm_hv_vtl_ops = { .get_attr = kvm_hv_vtl_get_attr, }; +static struct xarray *kvm_hv_vsm_get_memprots(struct kvm_vcpu *vcpu) +{ + struct kvm_hv_vtl_dev *vtl_dev; + struct kvm_device *tmp; + + list_for_each_entry(tmp, &vcpu->kvm->devices, vm_node) + if (tmp->ops == &kvm_hv_vtl_ops) { + vtl_dev = tmp->private; + if (vtl_dev->vtl == get_active_vtl(vcpu)) + return &vtl_dev->mem_attrs; + } + + return NULL; +} + static int kvm_hv_vtl_create(struct kvm_device *dev, u32 type) { struct kvm_hv_vtl_dev *vtl_dev; diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index e1518594fc7573..67fdbb8289b47b 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -257,4 +257,6 @@ int kvm_vm_ioctl_get_hv_vsm_state(struct kvm *kvm, struct kvm_hv_vsm_state *stat int kvm_vm_ioctl_set_hv_vsm_state(struct kvm *kvm, struct kvm_hv_vsm_state *state); int kvm_hv_vtl_dev_register(void); void kvm_hv_vtl_dev_unregister(void); + +int
kvm_hv_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); #endif diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 710de4fe047e85..95fceea04cc015 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4368,6 +4368,15 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (fault->is_private) return kvm_faultin_pfn_private(vcpu, fault); +#ifdef CONFIG_KVM_HYPERV_VSM + if (kvm_hv_vsm_enabled(vcpu->kvm)) { + int r = kvm_hv_faultin_pfn(vcpu, fault); + + if (r != RET_PF_CONTINUE) + return r; + } +#endif + async = false; fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async, fault->write, &fault->map_writable, @@ -5814,6 +5823,9 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err return -EIO; } + if (r == RET_PF_USER) + return 0; + if (r < 0) return r; if (r != RET_PF_EMULATE) From 1d2edc4a6550bf10c18192acad22e6920e7d5b86 Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 10 Oct 2023 14:57:18 +0000 Subject: [PATCH 19/26] intercepts Signed-off-by: Nicolas Saenz Julienne --- arch/x86/include/asm/hyperv-tlfs.h | 76 +++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 12 +++ arch/x86/kvm/hyperv-vsm.c | 22 ++++++ arch/x86/kvm/hyperv.c | 114 +++++++++++++++++++++++++++++ arch/x86/kvm/hyperv.h | 7 ++ arch/x86/kvm/vmx/vmx.c | 1 + arch/x86/kvm/x86.c | 3 + 7 files changed, 235 insertions(+) diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 602f219c46bad6..fbb7f05aa517a8 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -821,6 +821,82 @@ struct hv_get_vp_from_apic_id_in { u32 apic_ids[]; } __packed; + +/* struct hv_intercept_header::access_type_mask */ +#define HV_INTERCEPT_ACCESS_MASK_NONE 0 +#define HV_INTERCEPT_ACCESS_MASK_READ 1 +#define HV_INTERCEPT_ACCESS_MASK_WRITE 2 +#define HV_INTERCEPT_ACCESS_MASK_EXECUTE 4 + +/* struct hv_memory_intercept_message::cache_type */ +#define HV_X64_CACHE_TYPE_UNCACHED 0 +#define HV_X64_CACHE_TYPE_WRITECOMBINING 1 +#define HV_X64_CACHE_TYPE_WRITETHROUGH 4 +#define HV_X64_CACHE_TYPE_WRITEPROTECTED 5 +#define HV_X64_CACHE_TYPE_WRITEBACK 6 + +/* Intercept message header */ +struct hv_intercept_header { + __u32 vp_index; + __u8 instruction_length; +#define HV_INTERCEPT_ACCESS_READ 0 +#define HV_INTERCEPT_ACCESS_WRITE 1 +#define HV_INTERCEPT_ACCESS_EXECUTE 2 + __u8 access_type_mask; + union { + __u16 as_u16; + struct { + __u16 cpl:2; + __u16 cr0_pe:1; + __u16 cr0_am:1; + __u16 efer_lma:1; + __u16 debug_active:1; + __u16 interruption_pending:1; + __u16 reserved:9; + }; + } exec_state; + struct hv_x64_segment_register cs; + __u64 rip; + __u64 rflags; +} __packed; + +union hv_x64_memory_access_info { + __u8 as_u8; + struct { + __u8 gva_valid:1; + __u8 _reserved:7; + }; +}; + +struct hv_memory_intercept_message { + struct hv_intercept_header header; + __u32 cache_type; + __u8 instruction_byte_count; + union hv_x64_memory_access_info memory_access_info; + __u16 _reserved; + __u64 gva; + __u64 gpa; + __u8 instruction_bytes[16]; + struct hv_x64_segment_register ds; + struct hv_x64_segment_register ss; + __u64 rax; + __u64 rcx; + __u64 rdx; + __u64 rbx; + __u64 rsp; + __u64 rbp; + __u64 rsi; + __u64 rdi; + __u64 r8; + __u64 r9; +
__u64 r10; + __u64 r11; + __u64 r12; + __u64 r13; + __u64 r14; + __u64 r15; +} __packed; + #include <asm-generic/hyperv-tlfs.h> #endif diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ab4b77db56f1d6..29e136539b4b4c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -113,6 +113,7 @@ KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HV_TLB_FLUSH \ KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_HV_INJECT_INTERCEPT KVM_ARCH_REQ(33) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -639,6 +640,13 @@ struct kvm_vcpu_hv_tlb_flush_fifo { DECLARE_KFIFO(entries, u64, KVM_HV_TLB_FLUSH_FIFO_SIZE); }; +struct kvm_vcpu_hv_intercept_info { + struct kvm_vcpu *vcpu; + int type; + u64 gpa; + u8 access; +}; + /* Hyper-V per vcpu emulation context */ struct kvm_vcpu_hv { struct kvm_vcpu *vcpu; @@ -673,6 +681,8 @@ struct kvm_vcpu_hv { u64 vm_id; u32 vp_id; } nested; + + struct kvm_vcpu_hv_intercept_info intercept_info; }; struct kvm_hypervisor_cpuid { @@ -967,6 +977,8 @@ struct kvm_vcpu_arch { /* set at EPT violation at this point */ unsigned long exit_qualification; + u32 exit_instruction_len; + /* pv related host specific info */ struct { bool pv_unhalted; diff --git a/arch/x86/kvm/hyperv-vsm.c b/arch/x86/kvm/hyperv-vsm.c index a6facbe83f6b5f..5e3f92c4d0d99b 100644 --- a/arch/x86/kvm/hyperv-vsm.c +++ b/arch/x86/kvm/hyperv-vsm.c @@ -21,6 +21,27 @@ struct kvm_hv_vtl_dev { static struct xarray *kvm_hv_vsm_get_memprots(struct kvm_vcpu *vcpu); +static void kvm_hv_inject_gpa_intercept(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +{ + struct kvm_vcpu *target_vcpu = + kvm_hv_get_vtl_vcpu(vcpu, get_active_vtl(vcpu) + 1); + struct kvm_vcpu_hv_intercept_info *intercept = + &target_vcpu->arch.hyperv->intercept_info; + + WARN_ON_ONCE(!to_kvm_hv(vcpu->kvm)->hv_enable_vsm); + + intercept->type = HVMSG_GPA_INTERCEPT; + intercept->gpa = fault->addr; + intercept->access = + (fault->user ? HV_INTERCEPT_ACCESS_READ : 0) | + (fault->write ? HV_INTERCEPT_ACCESS_WRITE : 0) | + (fault->exec ?
HV_INTERCEPT_ACCESS_EXECUTE : 0); + intercept->vcpu = vcpu; + + kvm_make_request(KVM_REQ_HV_INJECT_INTERCEPT, target_vcpu); + kvm_vcpu_kick(target_vcpu); +} + bool kvm_hv_vsm_access_valid(struct kvm_page_fault *fault, unsigned long attrs) { @@ -61,6 +82,7 @@ int kvm_hv_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return RET_PF_CONTINUE; } + kvm_hv_inject_gpa_intercept(vcpu, fault); kvm_prepare_memory_fault_exit(vcpu, fault->addr, PAGE_SIZE, fault->write, fault->exec, fault->user, fault->is_private); diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index a93cd9f8b67c23..ed0b1494b6174e 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2942,6 +2942,120 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) return 0; } +static void store_kvm_segment(const struct kvm_segment *kvmseg, + struct hv_x64_segment_register *reg) +{ + reg->base = kvmseg->base; + reg->limit = kvmseg->limit; + reg->selector = kvmseg->selector; + reg->segment_type = kvmseg->type; + reg->present = kvmseg->present; + reg->descriptor_privilege_level = kvmseg->dpl; + reg->_default = kvmseg->db; + reg->non_system_segment = kvmseg->s; + reg->_long = kvmseg->l; + reg->granularity = kvmseg->g; + reg->available = kvmseg->avl; +} + +static void deliver_gpa_intercept(struct kvm_vcpu *target_vcpu, + struct kvm_vcpu *intercepted_vcpu, u64 gpa, + u64 gva, u8 access_type_mask) +{ + ulong cr0; + struct hv_message msg = { 0 }; + struct hv_memory_intercept_message *intercept = (struct hv_memory_intercept_message *)msg.u.payload; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(target_vcpu); + struct x86_exception e; + struct kvm_segment kvmseg; + + msg.header.message_type = HVMSG_GPA_INTERCEPT; + msg.header.payload_size = sizeof(*intercept); + + intercept->header.vp_index = to_hv_vcpu(intercepted_vcpu)->vp_index; + intercept->header.instruction_length = intercepted_vcpu->arch.exit_instruction_len; + intercept->header.access_type_mask = access_type_mask; + kvm_x86_ops.get_segment(intercepted_vcpu, &kvmseg, VCPU_SREG_CS); + store_kvm_segment(&kvmseg, &intercept->header.cs); + + cr0 = kvm_read_cr0(intercepted_vcpu); + intercept->header.exec_state.cr0_pe = (cr0 & X86_CR0_PE); + intercept->header.exec_state.cr0_am = (cr0 & X86_CR0_AM); + intercept->header.exec_state.cpl = kvm_x86_ops.get_cpl(intercepted_vcpu); + intercept->header.exec_state.efer_lma = is_long_mode(intercepted_vcpu); + intercept->header.exec_state.debug_active = 0; + intercept->header.exec_state.interruption_pending = 0; + intercept->header.rip = kvm_rip_read(intercepted_vcpu); + intercept->header.rflags = kvm_get_rflags(intercepted_vcpu); + + /* + * For exec violations we have no way to decode the instruction that issued the fetch + * to a non-executable page, because the CPU points both RIP and the GPA at the fetch + * destination in the faulted page. The instruction length, however, is that of the + * fetch source. Hyper-V appears to be aware of this and does not try to access those fields.
+ */ + if (access_type_mask == HV_INTERCEPT_ACCESS_EXECUTE) { + intercept->instruction_byte_count = 0; + } else { + intercept->instruction_byte_count = intercepted_vcpu->arch.exit_instruction_len; + if (intercept->instruction_byte_count > sizeof(intercept->instruction_bytes)) + intercept->instruction_byte_count = sizeof(intercept->instruction_bytes); + if (kvm_read_guest_virt(intercepted_vcpu, + kvm_rip_read(intercepted_vcpu), + intercept->instruction_bytes, + intercept->instruction_byte_count, &e)) + goto inject_ud; + } + + intercept->memory_access_info.gva_valid = (gva != 0); + intercept->gva = gva; + intercept->gpa = gpa; + intercept->cache_type = HV_X64_CACHE_TYPE_WRITEBACK; + kvm_x86_ops.get_segment(intercepted_vcpu, &kvmseg, VCPU_SREG_DS); + store_kvm_segment(&kvmseg, &intercept->ds); + kvm_x86_ops.get_segment(intercepted_vcpu, &kvmseg, VCPU_SREG_SS); + store_kvm_segment(&kvmseg, &intercept->ss); + intercept->rax = kvm_rax_read(intercepted_vcpu); + intercept->rcx = kvm_rcx_read(intercepted_vcpu); + intercept->rdx = kvm_rdx_read(intercepted_vcpu); + intercept->rbx = kvm_rbx_read(intercepted_vcpu); + intercept->rsp = kvm_rsp_read(intercepted_vcpu); + intercept->rbp = kvm_rbp_read(intercepted_vcpu); + intercept->rsi = kvm_rsi_read(intercepted_vcpu); + intercept->rdi = kvm_rdi_read(intercepted_vcpu); + intercept->r8 = kvm_r8_read(intercepted_vcpu); + intercept->r9 = kvm_r9_read(intercepted_vcpu); + intercept->r10 = kvm_r10_read(intercepted_vcpu); + intercept->r11 = kvm_r11_read(intercepted_vcpu); + intercept->r12 = kvm_r12_read(intercepted_vcpu); + intercept->r13 = kvm_r13_read(intercepted_vcpu); + intercept->r14 = kvm_r14_read(intercepted_vcpu); + intercept->r15 = kvm_r15_read(intercepted_vcpu); + + if (synic_deliver_msg(&hv_vcpu->synic, 0, &msg, true)) + goto inject_ud; + + return; + +inject_ud: + kvm_queue_exception(target_vcpu, UD_VECTOR); +} + +void kvm_hv_deliver_intercept(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv_intercept_info *info = &to_hv_vcpu(vcpu)->intercept_info; + + switch (info->type) { + case HVMSG_GPA_INTERCEPT: + deliver_gpa_intercept(vcpu, info->vcpu, info->gpa, 0, + info->access); + break; + default: + pr_warn("Unknown intercept type %d\n", info->type); + } +} +EXPORT_SYMBOL_GPL(kvm_hv_deliver_intercept); + void kvm_hv_init_vm(struct kvm *kvm) { struct kvm_hv *hv = to_kvm_hv(kvm); diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 67fdbb8289b47b..7c3c2c04cefb58 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -259,4 +259,11 @@ int kvm_hv_vtl_dev_register(void); void kvm_hv_vtl_dev_unregister(void); int kvm_hv_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); + +static inline struct kvm_vcpu *kvm_hv_get_vtl_vcpu(struct kvm_vcpu *vcpu, int vtl) +{ + return kvm_get_vcpu_by_id(vcpu->kvm, vtl); +} + +void kvm_hv_deliver_intercept(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6e502ba931416d..9c83ee3a293de7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5773,6 +5773,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; vcpu->arch.exit_qualification = exit_qualification; + vcpu->arch.exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); /* * Check that the GPA doesn't exceed physical memory limits, as that is diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2ca8f3795b7ee9..e2ccf721bd3b7f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10715,6 +10715,9 @@ static int vcpu_enter_guest(struct
kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu)) static_call(kvm_x86_update_cpu_dirty_logging)(vcpu); + + if (kvm_check_request(KVM_REQ_HV_INJECT_INTERCEPT, vcpu)) + kvm_hv_deliver_intercept(vcpu); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win || From 926fe47b2f05b4fcfedab04f18fe8223b468c9fe Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Wed, 25 Oct 2023 15:22:40 +0000 Subject: [PATCH 20/26] Implement HVCALL_TRANSLATE_VIRTUAL_ADDRESS Signed-off-by: Nicolas Saenz Julienne --- arch/x86/kvm/hyperv.c | 102 ++++++++++++++++++++++++++++++ arch/x86/kvm/trace.h | 23 +++++++ include/asm-generic/hyperv-tlfs.h | 28 ++++++++ 3 files changed, 153 insertions(+) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index ed0b1494b6174e..14b144ff35235a 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2596,6 +2596,7 @@ static bool is_xmm_fast_hypercall(struct kvm_hv_hcall *hc) case HVCALL_GET_VP_REGISTERS: case HVCALL_SET_VP_REGISTERS: case HVCALL_MODIFY_VTL_PROTECTION_MASK: + case HVCALL_TRANSLATE_VIRTUAL_ADDRESS: return true; } @@ -2641,6 +2642,99 @@ static u64 kvm_hv_ext_query_capabilities(struct kvm_vcpu *vcpu, struct kvm_hv_hc return HV_STATUS_SUCCESS; } +static bool kvm_hv_xlate_va_validate_input(struct kvm_vcpu* vcpu, + struct hv_xlate_va_input *in, + u8 *vtl, u8 *flags) +{ + struct kvm_vcpu_hv *hv = vcpu->arch.hyperv; + union hv_input_vtl in_vtl; + + if (in->partition_id != HV_PARTITION_ID_SELF) + return false; + + if (in->vp_index != HV_VP_INDEX_SELF && in->vp_index != hv->vp_index) + return false; + + in_vtl.as_uint8 = in->control_flags >> 56; + *flags = in->control_flags & HV_XLATE_GVA_FLAGS_MASK; + if (*flags > (HV_XLATE_GVA_VAL_READ | + HV_XLATE_GVA_VAL_WRITE | + HV_XLATE_GVA_VAL_EXECUTE)) + pr_info_ratelimited("Translate VA control flags unsupported and will be ignored: 0x%llx\n", + in->control_flags); + + *vtl = in_vtl.use_target_vtl ? 
in_vtl.target_vtl : get_active_vtl(vcpu); + + if (*vtl >= HV_NUM_VTLS || *vtl > get_active_vtl(vcpu)) + return false; + + return true; +} + +static u64 kvm_hv_xlate_va_walk(struct kvm_vcpu* vcpu, u64 gva, u8 flags) +{ + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + u32 access = 0; + + if (flags & HV_XLATE_GVA_VAL_WRITE) + access |= PFERR_WRITE_MASK; + if (flags & HV_XLATE_GVA_VAL_EXECUTE) + access |= PFERR_FETCH_MASK; + + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, mmu, gva, access, NULL); +} + +static u64 kvm_hv_translate_virtual_address(struct kvm_vcpu* vcpu, + struct kvm_hv_hcall *hc) +{ + struct hv_xlate_va_output output = {}; + struct hv_xlate_va_input input; + struct kvm_vcpu *target_vcpu; + u8 flags, target_vtl; + + if (hc->fast) { + input.partition_id = hc->ingpa; + input.vp_index = hc->outgpa & 0xFFFFFFFF; + input.control_flags = sse128_lo(hc->xmm[0]); + input.gva = sse128_hi(hc->xmm[0]); + } else { + if (unlikely(kvm_vcpu_read_guest(vcpu, hc->ingpa, &input, sizeof(input)) != 0)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } + + trace_kvm_hv_translate_virtual_address(input.partition_id, input.vp_index, input.control_flags, input.gva); + + if (!kvm_hv_xlate_va_validate_input(vcpu, &input, &target_vtl, &flags)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + + target_vcpu = kvm_hv_get_vtl_vcpu(vcpu, target_vtl); + output.gpa = kvm_hv_xlate_va_walk(target_vcpu, input.gva << PAGE_SHIFT, flags); + if (output.gpa == INVALID_GPA) { + output.result_code = HV_XLATE_GVA_UNMAPPED; + } else { + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); + u64 hcall_page = hv->hv_hypercall & + HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK; + + if (output.gpa == hcall_page) + output.overlay_page = 1; + + output.gpa >>= PAGE_SHIFT; + output.result_code = HV_XLATE_GVA_SUCCESS; + output.cache_type = HV_CACHE_TYPE_X64_WB; + } + + if (hc->fast) { + memcpy(&hc->xmm[1], &output, sizeof(output)); + hc->xmm_dirty = true; + } else { + if (unlikely(kvm_vcpu_write_guest(vcpu, hc->outgpa, &output, sizeof(output)) != 0)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } + + return HV_STATUS_SUCCESS; +} + static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) { if (!hv_vcpu->enforce_cpuid) @@ -2915,6 +3009,14 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) case HVCALL_VTL_CALL: case HVCALL_VTL_RETURN: goto hypercall_userspace_exit; + case HVCALL_TRANSLATE_VIRTUAL_ADDRESS: + if (unlikely(hc.rep_cnt)) { + ret = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + } + + ret = kvm_hv_translate_virtual_address(vcpu, &hc); + break; default: ret = HV_STATUS_INVALID_HYPERCALL_CODE; break; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index f0b6bb674e8001..86f9e890824ae0 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1372,6 +1372,29 @@ TRACE_EVENT(kvm_hv_stimer_cleanup, __entry->vcpu_id, __entry->timer_index) ); +TRACE_EVENT(kvm_hv_translate_virtual_address, + TP_PROTO(u64 partition_id, u32 vp_index, u64 control_flags, u64 gva), + TP_ARGS(partition_id, vp_index, control_flags, gva), + + TP_STRUCT__entry( + __field(u64, partition_id) + __field(u32, vp_index) + __field(u64, control_flags) + __field(u64, gva) + ), + + TP_fast_assign( + __entry->partition_id = partition_id; + __entry->vp_index = vp_index; + __entry->control_flags = control_flags; + __entry->gva = gva; + ), + + TP_printk("partition id 0x%llx, vp index 0x%x, control flags 0x%llx, gva 0x%llx", + __entry->partition_id, __entry->vp_index, + __entry->control_flags, __entry->gva) +); + TRACE_EVENT(kvm_apicv_inhibit_changed, TP_PROTO(int reason, 
bool set, unsigned long inhibits), TP_ARGS(reason, set, inhibits), diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index 0f142f3e494fde..369e1a004d016e 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -163,6 +163,7 @@ union hv_reference_tsc_msr { #define HVCALL_CREATE_VP 0x004e #define HVCALL_GET_VP_REGISTERS 0x0050 #define HVCALL_SET_VP_REGISTERS 0x0051 +#define HVCALL_TRANSLATE_VIRTUAL_ADDRESS 0x0052 #define HVCALL_POST_MESSAGE 0x005c #define HVCALL_SIGNAL_EVENT 0x005d #define HVCALL_POST_DEBUG_DATA 0x0069 @@ -847,4 +848,31 @@ union hv_register_vsm_code_page_offsets { u64 reserved:40; } __packed; }; + +#define HV_XLATE_GVA_SUCCESS 0 +#define HV_XLATE_GVA_UNMAPPED 1 +#define HV_XLATE_GPA_UNMAPPED 4 +#define HV_CACHE_TYPE_X64_WB 6 + +#define HV_XLATE_GVA_VAL_READ 1 +#define HV_XLATE_GVA_VAL_WRITE 2 +#define HV_XLATE_GVA_VAL_EXECUTE 4 +#define HV_XLATE_GVA_FLAGS_MASK 0x3F + +struct hv_xlate_va_input { + u64 partition_id; + u32 vp_index; + u32 reserved; + u64 control_flags; + u64 gva; +}; + +struct hv_xlate_va_output { + u32 result_code; + u32 cache_type:8; + u32 overlay_page:1; + u32 reserved:23; + u64 gpa; +}; + #endif From 8c2289b02e70b81d7e0e3bdaf976710f07bb1c7a Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 10 Oct 2023 16:01:38 +0000 Subject: [PATCH 21/26] tracing Signed-off-by: Nicolas Saenz Julienne --- arch/x86/kvm/hyperv-vsm.c | 4 +++ arch/x86/kvm/trace.h | 50 ++++++++++++++++++++++++++++++++++++++ include/trace/events/kvm.h | 23 ++++++++++++++++++ 3 files changed, 77 insertions(+) diff --git a/arch/x86/kvm/hyperv-vsm.c b/arch/x86/kvm/hyperv-vsm.c index 5e3f92c4d0d99b..3f906f7d00e29b 100644 --- a/arch/x86/kvm/hyperv-vsm.c +++ b/arch/x86/kvm/hyperv-vsm.c @@ -7,6 +7,7 @@ #include "mmu/mmu_internal.h" #include "hyperv.h" +#include "trace.h" #include @@ -76,6 +77,9 @@ int kvm_hv_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (!attrs) return RET_PF_CONTINUE; + trace_kvm_hv_faultin_pfn(vcpu->vcpu_id, fault->gfn, fault->write, + fault->exec, fault->user, attrs); + if (kvm_hv_vsm_access_valid(fault, attrs)) { fault->map_executable = attrs & KVM_MEMORY_ATTRIBUTE_EXECUTE; fault->map_writable = attrs & KVM_MEMORY_ATTRIBUTE_WRITE; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 86f9e890824ae0..195e5839a3ac91 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1671,6 +1671,56 @@ TRACE_EVENT(kvm_hv_send_ipi_ex, __entry->valid_bank_mask, __entry->vtl) ); +TRACE_EVENT(kvm_hv_faultin_pfn, + TP_PROTO(u32 vcpu_id, u64 gfn, bool write, bool exec, bool user, u64 prots), + TP_ARGS(vcpu_id, gfn, write, exec, user, prots), + + TP_STRUCT__entry( + __field(u32, vcpu_id) + __field(u64, gfn) + __field(bool, write) + __field(bool, exec) + __field(bool, user) + __field(u64, prots) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->gfn = gfn; + __entry->write = write; + __entry->exec = exec; + __entry->user = user; + __entry->prots = prots; + ), + + TP_printk("vcpu%d gfn %llx write %d exec %d user %d prots %llx", + __entry->vcpu_id, __entry->gfn, __entry->write, + __entry->exec, __entry->user, __entry->prots) +); + +TRACE_EVENT(kvm_hv_modify_vtl_protection_mask, + TP_PROTO(u64 target_partition_id, u32 map_flags, u8 target_vtl, u16 count), + TP_ARGS(target_partition_id, map_flags, target_vtl, count), + + TP_STRUCT__entry( + __field(u64, target_partition_id) + __field(u32, map_flags) + __field(u8, target_vtl) + __field(u16, count) + ), + + TP_fast_assign( + 
__entry->target_partition_id = target_partition_id; + __entry->map_flags = map_flags; + __entry->target_vtl = target_vtl; + __entry->count = count; + ), + + TP_printk("target partition id 0x%llx, map flags 0x%x, target VTL %d, count %d", + __entry->target_partition_id, __entry->map_flags, + __entry->target_vtl, __entry->count) +); + TRACE_EVENT(kvm_hv_ext_query_capabilities, TP_PROTO(u64 caps), TP_ARGS(caps), diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index aa34fdb16c90bb..ca28e3bd5fc4ae 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -505,6 +505,29 @@ TRACE_EVENT(kvm_test_age_hva, TP_printk("mmu notifier test age hva: %#016lx", __entry->hva) ); +TRACE_EVENT(kvm_set_mem_attributes, + TP_PROTO(u64 start, u64 size, u64 attributes, u64 flags), + TP_ARGS(start, size, attributes, flags), + + TP_STRUCT__entry( + __field( u64, start ) + __field( u64, size ) + __field( u64, attributes ) + __field( u64, flags ) + ), + + TP_fast_assign( + __entry->start = start; + __entry->size = size; + __entry->attributes = attributes; + __entry->flags = flags; + ), + + TP_printk("start 0x%llx, size 0x%llx, attributes 0x%llx, flags 0x%llx", + __entry->start, __entry->size, __entry->attributes, + __entry->flags) +); + #endif /* _TRACE_KVM_MAIN_H */ /* This part must be outside protection */ From df730dc5acdda12c51ce2af978cff7100e63f13b Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Tue, 26 Sep 2023 10:30:20 +0000 Subject: [PATCH 22/26] Debug traces Signed-off-by: Nicolas Saenz Julienne --- arch/x86/include/asm/kvm_host.h | 2 + arch/x86/kvm/hyperv.c | 85 ++++++++++++- arch/x86/kvm/hyperv.h | 3 +- arch/x86/kvm/vmx/vmx.c | 209 ++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 208 +++++++++++++++++++++++++++++-- arch/x86/kvm/x86.h | 8 ++ include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 8 +- 8 files changed, 513 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 29e136539b4b4c..a7a74d8f43846a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2001,6 +2001,8 @@ void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); +void dump_ftrace_vmcs(struct kvm_vcpu *vcpu); +void dump_ftrace_vcpu_state(struct kvm_vcpu * vcpu); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 14b144ff35235a..6ee6f7ef9617fa 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -3006,8 +3006,16 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) case HVCALL_MODIFY_VTL_PROTECTION_MASK: case HVCALL_ENABLE_PARTITION_VTL: case HVCALL_ENABLE_VP_VTL: + goto hypercall_userspace_exit; case HVCALL_VTL_CALL: case HVCALL_VTL_RETURN: + vcpu->dump_state_on_run = true; + trace_printk("-------------------------------------------VCPU%d---------------------------------\n", vcpu->vcpu_id); + trace_printk("Exiting to user-space with code 0x%x\n", hc.code); + dump_ftrace_vmcs(vcpu); + dump_ftrace_vcpu_state(vcpu); + trace_printk("---------------------------------------------------------------------------\n"); + kvm_get_vcpu_by_id(vcpu->kvm, 0)->dump_state_on_run = true; goto hypercall_userspace_exit; case HVCALL_TRANSLATE_VIRTUAL_ADDRESS: if (unlikely(hc.rep_cnt)) { @@ -3070,7 +3078,7 
@@ static void deliver_gpa_intercept(struct kvm_vcpu *target_vcpu, struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(target_vcpu); struct x86_exception e; struct kvm_segment kvmseg; - + int i; msg.header.message_type = HVMSG_GPA_INTERCEPT; msg.header.payload_size = sizeof(*intercept); @@ -3134,6 +3142,60 @@ static void deliver_gpa_intercept(struct kvm_vcpu *target_vcpu, intercept->r14 = kvm_r14_read(intercepted_vcpu); intercept->r15 = kvm_r15_read(intercepted_vcpu); + trace_printk(" header.vp_index: %x\n", intercept->header.vp_index); + trace_printk(" header.instruction_length: %x\n", intercept->header.instruction_length); + trace_printk(" header.access_type_mask: %x\n", intercept->header.access_type_mask); + trace_printk(" header.exec_state.cpl: %x\n", intercept->header.exec_state.cpl); + trace_printk(" header.exec_state.cr0_pe: %x\n", intercept->header.exec_state.cr0_pe); + trace_printk(" header.exec_state.cr0_am: %x\n", intercept->header.exec_state.cr0_am); + trace_printk(" header.exec_state.efer_lma: %x\n", intercept->header.exec_state.efer_lma); + trace_printk(" header.exec_state.debug_active: %x\n", + intercept->header.exec_state.debug_active); + trace_printk(" header.exec_state.interruption_pending: %x\n", + intercept->header.exec_state.interruption_pending); + trace_printk(" header.cs: (values for cs segment register)\n"); + trace_printk(" base: %llx\n", (unsigned long long)intercept->header.cs.base); + trace_printk(" limit: %x\n", intercept->header.cs.limit); + trace_printk(" selector: %x\n", intercept->header.cs.selector); + trace_printk(" header.rip: %llx\n", (unsigned long long)intercept->header.rip); + trace_printk(" header.rflags: %llx\n", (unsigned long long)intercept->header.rflags); + trace_printk(" cache_type: %x\n", intercept->cache_type); + trace_printk(" instruction_byte_count: %x\n", intercept->instruction_byte_count); + trace_printk(" memory_access_info.gva_valid: %x\n", + intercept->memory_access_info.gva_valid); + trace_printk(" _reserved: %x\n", intercept->_reserved); + trace_printk(" gva: %llx\n", (unsigned long long)intercept->gva); + trace_printk(" gpa: %llx\n", (unsigned long long)intercept->gpa); + trace_printk(" instruction_bytes: "); + for (i = 0; i < 16; i++) { + trace_printk("%02x ", intercept->instruction_bytes[i]); + } + trace_printk("\n"); + trace_printk(" ds: (values for ds segment register)\n"); + trace_printk(" base: %llx\n", (unsigned long long)intercept->ds.base); + trace_printk(" limit: %x\n", intercept->ds.limit); + trace_printk(" selector: %x\n", intercept->ds.selector); + trace_printk(" ss: (values for ss segment register)\n"); + trace_printk(" base: %llx\n", (unsigned long long)intercept->ss.base); + trace_printk(" limit: %x\n", intercept->ss.limit); + trace_printk(" selector: %x\n", intercept->ss.selector); + trace_printk(" rax: %llx\n", (unsigned long long)intercept->rax); + trace_printk(" rcx: %llx\n", (unsigned long long)intercept->rcx); + trace_printk(" rdx: %llx\n", (unsigned long long)intercept->rdx); + trace_printk(" rbx: %llx\n", (unsigned long long)intercept->rbx); + trace_printk(" rsp: %llx\n", (unsigned long long)intercept->rsp); + trace_printk(" rbp: %llx\n", (unsigned long long)intercept->rbp); + trace_printk(" rsi: %llx\n", (unsigned long long)intercept->rsi); + trace_printk(" rdi: %llx\n", (unsigned long long)intercept->rdi); + trace_printk(" r8: %llx\n", (unsigned long long)intercept->r8); + trace_printk(" r9: %llx\n", (unsigned long long)intercept->r9); + trace_printk(" r10: %llx\n", (unsigned long long)intercept->r10); + 
trace_printk(" r11: %llx\n", (unsigned long long)intercept->r11); + trace_printk(" r12: %llx\n", (unsigned long long)intercept->r12); + trace_printk(" r13: %llx\n", (unsigned long long)intercept->r13); + trace_printk(" r14: %llx\n", (unsigned long long)intercept->r14); + trace_printk(" r15: %llx\n", (unsigned long long)intercept->r15); + if (synic_deliver_msg(&hv_vcpu->synic, 0, &msg, true)) goto inject_ud; @@ -3407,3 +3469,24 @@ int kvm_vm_ioctl_set_hv_vsm_state(struct kvm *kvm, struct kvm_hv_vsm_state *stat hv->vsm_code_page_offsets32.as_u64 = state->vsm_code_page_offsets32; return 0; } + +static bool hv_read_vtl_control(struct kvm_vcpu *vcpu, struct hv_vp_vtl_control *vtl_control) +{ + /* VTL control is a part of VP assist page, which is accessed through pv_eoi */ + if (!vcpu->arch.pv_eoi.data.len) + return 0; + + return !kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, vtl_control, + offsetof(struct hv_vp_assist_page, vtl_control), sizeof(*vtl_control)); +} + +void dump_ftrace_vcpu_hyperv(struct kvm_vcpu *vcpu) +{ + struct hv_vp_vtl_control vtl_control; + + trace_printk("*** HyperV VTL state ***\n"); + if (get_active_vtl(vcpu) && hv_read_vtl_control(vcpu, &vtl_control)) + trace_printk("entry_reason 0x%x, vina %d, rax %llx, rcx %llx\n", + vtl_control.vtl_entry_reason, vtl_control.vina_asserted, + vtl_control.vtl_ret_x64rax, vtl_control.vtl_ret_x64rcx); +} diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 7c3c2c04cefb58..e6d4ceebf955f0 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -264,6 +264,7 @@ static inline struct kvm_vcpu *kvm_hv_get_vtl_vcpu(struct kvm_vcpu *vcpu, int vt { return kvm_get_vcpu_by_id(vcpu->kvm, vtl); } - void kvm_hv_deliver_intercept(struct kvm_vcpu *vcpu); + +void dump_ftrace_vcpu_hyperv(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9c83ee3a293de7..70212486577a19 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5339,6 +5339,11 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu) { ++vcpu->stat.irq_exits; + trace_printk("-------------------------------------------VCPU%d---------------------------------\n", vcpu->vcpu_id); + trace_printk("External interrupt\n"); + dump_ftrace_vmcs(vcpu); + dump_ftrace_vcpu_state(vcpu); + trace_printk("---------------------------------------------------------------------------\n"); return 1; } @@ -6397,6 +6402,202 @@ void dump_vmcs(struct kvm_vcpu *vcpu) vmcs_read16(VIRTUAL_PROCESSOR_ID)); } +static void vmx_ftrace_dump_sel(char *name, uint32_t sel) +{ + trace_printk("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", + name, vmcs_read16(sel), + vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), + vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), + vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); +} + +static void vmx_ftrace_dump_dtsel(char *name, uint32_t limit) +{ + trace_printk("%s limit=0x%08x, base=0x%016lx\n", + name, vmcs_read32(limit), + vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); +} + +static void vmx_ftrace_dump_msrs(char *name, struct vmx_msrs *m) +{ + unsigned int i; + struct vmx_msr_entry *e; + + trace_printk("MSR %s:\n", name); + for (i = 0, e = m->val; i < m->nr; ++i, ++e) + trace_printk(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value); +} + +void dump_ftrace_vmcs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 vmentry_ctl, vmexit_ctl; + u32 
cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; + u64 tertiary_exec_control; + unsigned long cr4; + int efer_slot; + + vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); + vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); + cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); + cr4 = vmcs_readl(GUEST_CR4); + + if (cpu_has_secondary_exec_ctrls()) + secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + else + secondary_exec_control = 0; + + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL); + else + tertiary_exec_control = 0; + + trace_printk("VMCS %p, last attempted VM-entry on CPU %d\n", + vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu); + trace_printk("*** Guest State ***\n"); + trace_printk("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", + vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), + vmcs_readl(CR0_GUEST_HOST_MASK)); + trace_printk("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", + cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); + trace_printk("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); + if (cpu_has_vmx_ept()) { + trace_printk("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", + vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); + trace_printk("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", + vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); + } + trace_printk("RSP = 0x%016lx RIP = 0x%016lx\n", + vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); + trace_printk("RFLAGS=0x%08lx DR7 = 0x%016lx\n", + vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); + trace_printk("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", + vmcs_readl(GUEST_SYSENTER_ESP), + vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); + vmx_ftrace_dump_sel("CS: ", GUEST_CS_SELECTOR); + vmx_ftrace_dump_sel("DS: ", GUEST_DS_SELECTOR); + vmx_ftrace_dump_sel("SS: ", GUEST_SS_SELECTOR); + vmx_ftrace_dump_sel("ES: ", GUEST_ES_SELECTOR); + vmx_ftrace_dump_sel("FS: ", GUEST_FS_SELECTOR); + vmx_ftrace_dump_sel("GS: ", GUEST_GS_SELECTOR); + vmx_ftrace_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); + vmx_ftrace_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); + vmx_ftrace_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); + vmx_ftrace_dump_sel("TR: ", GUEST_TR_SELECTOR); + efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER); + if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER) + trace_printk("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER)); + else if (efer_slot >= 0) + trace_printk("EFER= 0x%016llx (autoload)\n", + vmx->msr_autoload.guest.val[efer_slot].value); + else if (vmentry_ctl & VM_ENTRY_IA32E_MODE) + trace_printk("EFER= 0x%016llx (effective)\n", + vcpu->arch.efer | (EFER_LMA | EFER_LME)); + else + trace_printk("EFER= 0x%016llx (effective)\n", + vcpu->arch.efer & ~(EFER_LMA | EFER_LME)); + if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT) + trace_printk("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT)); + trace_printk("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", + vmcs_read64(GUEST_IA32_DEBUGCTL), + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); + if (cpu_has_load_perf_global_ctrl() && + vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) + trace_printk("PerfGlobCtl = 0x%016llx\n", + vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); + if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) + trace_printk("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); + trace_printk("Interruptibility = %08x ActivityState = %08x\n", + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), + 
vmcs_read32(GUEST_ACTIVITY_STATE)); + if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) + trace_printk("InterruptStatus = %04x\n", + vmcs_read16(GUEST_INTR_STATUS)); + if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0) + vmx_ftrace_dump_msrs("guest autoload", &vmx->msr_autoload.guest); + if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) + vmx_ftrace_dump_msrs("guest autostore", &vmx->msr_autostore.guest); + + trace_printk("*** Host State ***\n"); + trace_printk("RIP = 0x%016lx RSP = 0x%016lx\n", + vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); + trace_printk("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", + vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), + vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), + vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), + vmcs_read16(HOST_TR_SELECTOR)); + trace_printk("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", + vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), + vmcs_readl(HOST_TR_BASE)); + trace_printk("GDTBase=%016lx IDTBase=%016lx\n", + vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); + trace_printk("CR0=%016lx CR3=%016lx CR4=%016lx\n", + vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), + vmcs_readl(HOST_CR4)); + trace_printk("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", + vmcs_readl(HOST_IA32_SYSENTER_ESP), + vmcs_read32(HOST_IA32_SYSENTER_CS), + vmcs_readl(HOST_IA32_SYSENTER_EIP)); + if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER) + trace_printk("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER)); + if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT) + trace_printk("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT)); + if (cpu_has_load_perf_global_ctrl() && + vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) + trace_printk("PerfGlobCtl = 0x%016llx\n", + vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); + if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0) + vmx_ftrace_dump_msrs("host autoload", &vmx->msr_autoload.host); + + trace_printk("*** Control State ***\n"); + trace_printk("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", + cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control); + trace_printk("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n", + pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl); + trace_printk("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", + vmcs_read32(EXCEPTION_BITMAP), + vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), + vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); + trace_printk("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", + vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), + vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), + vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); + trace_printk("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", + vmcs_read32(VM_EXIT_INTR_INFO), + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); + trace_printk(" reason=%08x qualification=%016lx\n", + vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); + trace_printk("IDTVectoring: info=%08x errcode=%08x\n", + vmcs_read32(IDT_VECTORING_INFO_FIELD), + vmcs_read32(IDT_VECTORING_ERROR_CODE)); + trace_printk("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); + if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) + trace_printk("TSC Multiplier = 0x%016llx\n", + vmcs_read64(TSC_MULTIPLIER)); + if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) { + if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { + u16 status = vmcs_read16(GUEST_INTR_STATUS); + trace_printk("SVI|RVI = %02x|%02x \n", status >> 8, status & 0xff); + } + trace_printk("TPR Threshold = 
0x%02x\n", vmcs_read32(TPR_THRESHOLD)); + if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) + trace_printk("APIC-access addr = 0x%016llx \n", vmcs_read64(APIC_ACCESS_ADDR)); + trace_printk("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR)); + } + if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) + trace_printk("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); + if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) + trace_printk("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); + if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) + trace_printk("PLE Gap=%08x Window=%08x\n", + vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); + if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) + trace_printk("Virtual processor ID = 0x%04x\n", + vmcs_read16(VIRTUAL_PROCESSOR_ID)); +} + /* * The guest has exited. See if we can fix it or if we need userspace * assistance. @@ -7359,6 +7560,14 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_update_hv_timer(vcpu); kvm_wait_lapic_expire(vcpu); + if (vcpu->dump_state_on_run) { + trace_printk("-------------------------------------------VCPU%d---------------------------------\n", vcpu->vcpu_id); + trace_printk("Entering guest\n"); + dump_ftrace_vmcs(vcpu); + dump_ftrace_vcpu_state(vcpu); + trace_printk("---------------------------------------------------------------------------\n"); + vcpu->dump_state_on_run = false; + } /* The actual VMENTER/EXIT is in the .noinstr.text section. */ vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx)); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e2ccf721bd3b7f..51fbb35417ae04 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5153,8 +5153,8 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, return 0; } -static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events) +void kvm_vcpu_x86_get_vcpu_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) { struct kvm_queued_exception *ex; @@ -5246,8 +5246,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, } } -static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events) +int kvm_vcpu_x86_set_vcpu_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) { if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR @@ -5822,7 +5822,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_GET_VCPU_EVENTS: { struct kvm_vcpu_events events; - kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events); + kvm_vcpu_x86_get_vcpu_events(vcpu, &events); r = -EFAULT; if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events))) @@ -5837,7 +5837,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events))) break; - r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); + r = kvm_vcpu_x86_set_vcpu_events(vcpu, &events); break; } case KVM_GET_DEBUGREGS: { @@ -11412,6 +11412,11 @@ static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) (unsigned long *)sregs->interrupt_bitmap); } +void kvm_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + __get_sregs(vcpu, sregs); +} + static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) { int i; @@ -11513,6 +11518,188 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return ret; } +static void dump_ftrace_vcpu_state_events(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_events events; + + 
+	kvm_vcpu_x86_get_vcpu_events(vcpu, &events);
+	trace_printk("*** vCPU Events ***\n");
+	trace_printk("exception: inj=%u nr=%u has_err_code=%u pending=%u err_code=%u\n",
+		     events.exception.injected, events.exception.nr,
+		     events.exception.has_error_code, events.exception.pending,
+		     events.exception.error_code);
+	trace_printk("interrupt: inj=%u nr=%u soft=%u shadow=%u\n",
+		     events.interrupt.injected, events.interrupt.nr,
+		     events.interrupt.soft, events.interrupt.shadow);
+	trace_printk("nmi: inj=%u pending=%u masked=%u pad=%u\n",
+		     events.nmi.injected, events.nmi.pending, events.nmi.masked,
+		     events.nmi.pad);
+	trace_printk("sipi_vector: %u\n", events.sipi_vector);
+	trace_printk("flags: 0x%x\n", events.flags);
+	trace_printk("smi: smm=%u pending=%u smm_in_nmi=%u latched_init=%u\n",
+		     events.smi.smm, events.smi.pending,
+		     events.smi.smm_inside_nmi, events.smi.latched_init);
+	trace_printk("triple_fault: pending=%u\n", events.triple_fault.pending);
+	trace_printk("exc_payload: has_payload=%u payload=0x%llx\n",
+		     events.exception_has_payload, events.exception_payload);
+}
+
+static void dump_ftrace_vcpu_mp_state(struct kvm_vcpu *vcpu)
+{
+	trace_printk("*** vCPU MP state ***\n");
+	trace_printk("mp_state=0x%x\n", vcpu->arch.mp_state);
+}
+
+static void dump_ftrace_vcpu_regs(struct kvm_vcpu *vcpu)
+{
+	struct kvm_regs regs;
+
+	__get_regs(vcpu, &regs);
+	trace_printk("*** vCPU Regs ***\n");
+	trace_printk("rax=0x%llx rbx=0x%llx rcx=0x%llx rdx=0x%llx\n", regs.rax,
+		     regs.rbx, regs.rcx, regs.rdx);
+
+	trace_printk("rsi=0x%llx rdi=0x%llx rsp=0x%llx rbp=0x%llx\n", regs.rsi,
+		     regs.rdi, regs.rsp, regs.rbp);
+
+	trace_printk("r8=0x%llx r9=0x%llx r10=0x%llx r11=0x%llx\n", regs.r8, regs.r9,
+		     regs.r10, regs.r11);
+
+	trace_printk("r12=0x%llx r13=0x%llx r14=0x%llx r15=0x%llx\n", regs.r12,
+		     regs.r13, regs.r14, regs.r15);
+
+	trace_printk("rip=0x%llx rflags=0x%llx\n", regs.rip, regs.rflags);
+}
+
+static void print_segment(const char *name, const struct kvm_segment seg)
+{
+	trace_printk("%s: base=0x%llx limit=0x%x selector=0x%x type=0x%x present=0x%x dpl=0x%x db=0x%x s=0x%x l=0x%x g=0x%x avl=0x%x unusable=0x%x\n",
+		     name, seg.base, seg.limit, seg.selector, seg.type, seg.present,
+		     seg.dpl, seg.db, seg.s, seg.l, seg.g, seg.avl, seg.unusable);
+}
+
+static void print_dtable(const char *name, const struct kvm_dtable dtable)
+{
+	trace_printk("%s: base=0x%llx limit=0x%x\n", name, dtable.base, dtable.limit);
+}
+
+static void dump_ftrace_vcpu_sregs2(struct kvm_vcpu *vcpu)
+{
+	struct kvm_sregs2 sregs;
+
+	__get_sregs2(vcpu, &sregs);
+
+	trace_printk("*** vCPU Sregs ***\n");
+	print_segment("cs", sregs.cs);
+	print_segment("ds", sregs.ds);
+	print_segment("es", sregs.es);
+	print_segment("fs", sregs.fs);
+	print_segment("gs", sregs.gs);
+	print_segment("ss", sregs.ss);
+
+	print_segment("tr", sregs.tr);
+	print_segment("ldt", sregs.ldt);
+
+	print_dtable("gdt", sregs.gdt);
+	print_dtable("idt", sregs.idt);
+
+	trace_printk("cr0=0x%llx cr2=0x%llx cr3=0x%llx cr4=0x%llx cr8=0x%llx\n",
+		     sregs.cr0, sregs.cr2, sregs.cr3, sregs.cr4, sregs.cr8);
+
+	trace_printk("efer=0x%llx apic_base=0x%llx flags=0x%llx\n", sregs.efer,
+		     sregs.apic_base, sregs.flags);
+
+	trace_printk("pdptrs: 0x%llx 0x%llx 0x%llx 0x%llx\n", sregs.pdptrs[0],
+		     sregs.pdptrs[1], sregs.pdptrs[2], sregs.pdptrs[3]);
+}
+
+static void dump_ftrace_vcpu_kvm_lapic_state(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic_state lapic_state;
+
+	kvm_apic_get_state(vcpu, &lapic_state);
+
+	trace_printk("*** vCPU apic state ***\n");
+	trace_printk("APIC_ID: 0x%x\n", (unsigned char)lapic_state.regs[0x20]);
+	trace_printk("APIC_LVR: 0x%x\n", (unsigned char)lapic_state.regs[0x30]);
+	trace_printk("APIC_TASKPRI: 0x%x\n", (unsigned char)lapic_state.regs[0x80]);
+	trace_printk("APIC_ARBPRI: 0x%x\n", (unsigned char)lapic_state.regs[0x90]);
+	trace_printk("APIC_PROCPRI: 0x%x\n", (unsigned char)lapic_state.regs[0xA0]);
+	trace_printk("APIC_EOI: 0x%x\n", (unsigned char)lapic_state.regs[0xB0]);
+	trace_printk("APIC_RRR: 0x%x\n", (unsigned char)lapic_state.regs[0xC0]);
+	trace_printk("APIC_LDR: 0x%x\n", (unsigned char)lapic_state.regs[0xD0]);
+	trace_printk("APIC_DFR: 0x%x\n", (unsigned char)lapic_state.regs[0xE0]);
+	trace_printk("APIC_SPIV: 0x%x\n", (unsigned char)lapic_state.regs[0xF0]);
+	trace_printk("APIC_ISR: 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
+		     (unsigned char)lapic_state.regs[0x100],
+		     (unsigned char)lapic_state.regs[0x101],
+		     (unsigned char)lapic_state.regs[0x102],
+		     (unsigned char)lapic_state.regs[0x103],
+		     (unsigned char)lapic_state.regs[0x104],
+		     (unsigned char)lapic_state.regs[0x105],
+		     (unsigned char)lapic_state.regs[0x106],
+		     (unsigned char)lapic_state.regs[0x107]);
+
+	trace_printk("APIC_TMR: 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
+		     (unsigned char)lapic_state.regs[0x180],
+		     (unsigned char)lapic_state.regs[0x181],
+		     (unsigned char)lapic_state.regs[0x182],
+		     (unsigned char)lapic_state.regs[0x183],
+		     (unsigned char)lapic_state.regs[0x184],
+		     (unsigned char)lapic_state.regs[0x185],
+		     (unsigned char)lapic_state.regs[0x186],
+		     (unsigned char)lapic_state.regs[0x187]);
+
+	trace_printk("APIC_IRR: 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
+		     (unsigned char)lapic_state.regs[0x200],
+		     (unsigned char)lapic_state.regs[0x201],
+		     (unsigned char)lapic_state.regs[0x202],
+		     (unsigned char)lapic_state.regs[0x203],
+		     (unsigned char)lapic_state.regs[0x204],
+		     (unsigned char)lapic_state.regs[0x205],
+		     (unsigned char)lapic_state.regs[0x206],
+		     (unsigned char)lapic_state.regs[0x207]);
+	trace_printk("APIC_ESR: 0x%x\n", (unsigned char)lapic_state.regs[0x280]);
+	trace_printk("APIC_ICR: 0x%x\n", (unsigned char)lapic_state.regs[0x300]);
+	trace_printk("APIC_ICR2: 0x%x\n", (unsigned char)lapic_state.regs[0x310]);
+	trace_printk("APIC_LVTT: 0x%x\n", (unsigned char)lapic_state.regs[0x320]);
+	trace_printk("APIC_LVTTHMR: 0x%x\n", (unsigned char)lapic_state.regs[0x330]);
+	trace_printk("APIC_LVTPC: 0x%x\n", (unsigned char)lapic_state.regs[0x340]);
+	trace_printk("APIC_LVT0: 0x%x\n", (unsigned char)lapic_state.regs[0x350]);
+	trace_printk("APIC_LVT1: 0x%x\n", (unsigned char)lapic_state.regs[0x360]);
+	trace_printk("APIC_LVTERR: 0x%x\n", (unsigned char)lapic_state.regs[0x370]);
+	trace_printk("APIC_TMICT: 0x%x\n", (unsigned char)lapic_state.regs[0x380]);
+	trace_printk("APIC_TMCCT: 0x%x\n", (unsigned char)lapic_state.regs[0x390]);
+	trace_printk("APIC_TDCR: 0x%x\n", (unsigned char)lapic_state.regs[0x3E0]);
+	trace_printk("APIC_SELF_IPI: 0x%x\n", (unsigned char)lapic_state.regs[0x3F0]);
+}
+
+static void dump_ftrace_vcpu_debugregs(struct kvm_vcpu *vcpu)
+{
+	struct kvm_debugregs debugregs;
+
+	kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &debugregs);
+	trace_printk("*** vCPU Debug Regs ***\n");
+	trace_printk("db[0]=0x%llx db[1]=0x%llx db[2]=0x%llx db[3]=0x%llx\n",
+		     debugregs.db[0], debugregs.db[1], debugregs.db[2],
+		     debugregs.db[3]);
+
+	trace_printk("dr6=0x%llx dr7=0x%llx flags=0x%llx\n", debugregs.dr6,
+		     debugregs.dr7, debugregs.flags);
+}
+
+void dump_ftrace_vcpu_state(struct kvm_vcpu *vcpu)
+{
+	dump_ftrace_vcpu_state_events(vcpu);
+	dump_ftrace_vcpu_mp_state(vcpu);
+	dump_ftrace_vcpu_regs(vcpu);
+	dump_ftrace_vcpu_sregs2(vcpu);
+	dump_ftrace_vcpu_kvm_lapic_state(vcpu);
+	dump_ftrace_vcpu_debugregs(vcpu);
+	dump_ftrace_vcpu_hyperv(vcpu);
+}
+
 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
 		    int reason, bool has_error_code, u32 error_code)
 {
@@ -11690,6 +11877,11 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
 	return 0;
 }
 
+int kvm_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	return __set_sregs(vcpu, sregs);
+}
+
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 {
@@ -11865,7 +12057,7 @@ static void store_regs(struct kvm_vcpu *vcpu)
 		__get_sregs(vcpu, &vcpu->run->s.regs.sregs);
 
 	if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS)
-		kvm_vcpu_ioctl_x86_get_vcpu_events(
+		kvm_vcpu_x86_get_vcpu_events(
 				vcpu, &vcpu->run->s.regs.events);
 }
 
@@ -11888,7 +12080,7 @@ static int sync_regs(struct kvm_vcpu *vcpu)
 	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
 		struct kvm_vcpu_events events = vcpu->run->s.regs.events;
 
-		if (kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events))
+		if (kvm_vcpu_x86_set_vcpu_events(vcpu, &events))
 			return -EINVAL;
 
 		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 1e7be1f6ab299d..86c12b202c46b5 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -499,6 +499,14 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
 int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva);
 bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type);
 
+void kvm_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
+int kvm_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
+
+void kvm_vcpu_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
+				  struct kvm_vcpu_events *events);
+int kvm_vcpu_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
+				 struct kvm_vcpu_events *events);
+
 /*
  * Internal error codes that are used to indicate that MSR emulation encountered
  * an error that should result in #GP in the guest, unless userspace
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 37a0bb6f99f0b0..1e50b13db5d551 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -397,6 +397,7 @@ struct kvm_vcpu {
 	struct kvm_memory_slot *last_used_slot;
 	u64 last_used_slot_gen;
 	wait_queue_head_t wqh;
+	bool dump_state_on_run;
 };
 
 /*
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 78cc4489af3f49..482218b73554ee 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -505,6 +505,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
 		 task_pid_nr(current), id);
 	init_waitqueue_head(&vcpu->wqh);
+	vcpu->dump_state_on_run = true;
 }
 
 static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -3986,8 +3987,12 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 		goto out;
 	}
 
-	if (!cmpxchg(&vcpu->kicked, false, true))
+	if (!cmpxchg(&vcpu->kicked, false, true)) {
 		wake_up_interruptible(&vcpu->wqh);
+		trace_printk("vCPU%d\n", vcpu->vcpu_id);
+		trace_dump_stack(0);
+		kvm_get_vcpu_by_id(vcpu->kvm, 0)->dump_state_on_run = true;
+	}
 
 out:
 	put_cpu();
@@ -4205,6 +4210,7 @@ static __poll_t kvm_vcpu_poll(struct file *file, poll_table *wait)
 	 */
 	smp_mb();
 	if (READ_ONCE(vcpu->kicked)) {
+		trace_printk("up!\n");
 		return EPOLLIN;
 	}
 
From cc2377baa038bbe3b1030b6d9ad22cf20b0e6408 Mon Sep 17 00:00:00 2001
From: Anel Orazgaliyeva
Date: Tue, 24 Oct 2023 14:38:27 +0000
Subject: [PATCH 23/26] KVM: x86: Introduce KVM_CAP_APIC_ID_MASK
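
Allow userspace to tell KVM that only the low bits of a vCPU id carry
the guest-visible APIC ID, leaving the high bits free for extra
routing information (the VTL, in the following patches). The mask
width is capped at 32; KVM stores it as a shift (32 - width) and
applies it wherever the xAPIC/x2APIC ID is derived from vcpu_id.

A minimal userspace sketch of the intended flow (the width of 8 is
only an illustrative choice; the cap and ioctl are the ones added
below):

	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static int set_apic_id_mask(int vm_fd)
	{
		/* Reserve the top 8 bits of every vCPU id. */
		struct kvm_apic_id_mask mask = { .width = 8 };

		if (!ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_APIC_ID_MASK))
			return -1;

		/* vCPU id (1 << 24) | 3 then yields APIC ID 3, mask value 1. */
		return ioctl(vm_fd, KVM_SET_APIC_ID_MASK, &mask);
	}
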
Signed-off-by: Anel Orazgaliyeva
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/include/uapi/asm/kvm.h |  4 ++++
 arch/x86/kvm/hyperv-vsm.c       |  1 +
 arch/x86/kvm/lapic.c            | 12 ++++++------
 arch/x86/kvm/lapic.h            | 10 ++++++++++
 arch/x86/kvm/x86.c              | 24 ++++++++++++++++++++++++
 include/uapi/linux/kvm.h        |  2 ++
 7 files changed, 49 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a7a74d8f43846a..b837b1f8ffd70f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1317,6 +1317,8 @@ struct kvm_arch {
 	struct rw_semaphore apicv_update_lock;
 	unsigned long apicv_inhibit_reasons;
 
+	uint64_t apic_id_mask_shift;
+
 	gpa_t wall_clock;
 
 	bool mwait_in_guest;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index eb0182f76b4a05..574152d5aae54d 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -571,4 +571,8 @@ struct kvm_hv_vsm_state {
 	__u64 vsm_code_page_offsets32;
 };
 
+struct kvm_apic_id_mask {
+	__u64 width; /* mask width in bits, max 32 */
+};
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/hyperv-vsm.c b/arch/x86/kvm/hyperv-vsm.c
index 3f906f7d00e29b..5ac269afa91e17 100644
--- a/arch/x86/kvm/hyperv-vsm.c
+++ b/arch/x86/kvm/hyperv-vsm.c
@@ -7,6 +7,7 @@
 #include "mmu/mmu_internal.h"
 
 #include "hyperv.h"
+#include "lapic.h"
 #include "trace.h"
 
 #include
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d34ab093e8b938..bec6a3c04421fb 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -141,7 +141,7 @@ static inline int apic_enabled(struct kvm_lapic *apic)
 
 static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
 {
-	return apic->vcpu->vcpu_id;
+	return kvm_apic_id_masked(apic->vcpu->kvm, apic->vcpu->vcpu_id);
 }
 
 static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
@@ -526,7 +526,7 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
 {
 	u32 ldr = kvm_apic_calc_x2apic_ldr(id);
 
-	WARN_ON_ONCE(id != apic->vcpu->vcpu_id);
+	WARN_ON_ONCE(id != kvm_apic_id_masked(apic->vcpu->kvm, apic->vcpu->vcpu_id));
 
 	kvm_lapic_set_reg(apic, APIC_ID, id);
 	kvm_lapic_set_reg(apic, APIC_LDR, ldr);
@@ -2542,7 +2542,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 	/* update jump label if enable bit changes */
 	if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
 		if (value & MSR_IA32_APICBASE_ENABLE) {
-			kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+			kvm_apic_set_xapic_id(apic, kvm_apic_id_masked(vcpu->kvm, vcpu->vcpu_id));
 			static_branch_slow_dec_deferred(&apic_hw_disabled);
 			/* Check if there are APF page ready requests pending */
 			kvm_make_request(KVM_REQ_APF_READY, vcpu);
@@ -2554,9 +2554,9 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 
 	if ((old_value ^ value) & X2APIC_ENABLE) {
 		if (value & X2APIC_ENABLE)
-			kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
+			kvm_apic_set_x2apic_id(apic, kvm_apic_id_masked(vcpu->kvm, vcpu->vcpu_id));
 		else if (value & MSR_IA32_APICBASE_ENABLE)
-			kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+			kvm_apic_set_xapic_id(apic, kvm_apic_id_masked(vcpu->kvm, vcpu->vcpu_id));
 	}
 
 	if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) {
@@ -2686,7 +2686,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
 
 	/* The xAPIC ID is set at RESET even if the APIC was already enabled. */
 	if (!init_event)
-		kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+		kvm_apic_set_xapic_id(apic, kvm_apic_id_masked(vcpu->kvm, vcpu->vcpu_id));
 	kvm_apic_set_version(apic->vcpu);
 
 	for (i = 0; i < apic->nr_lvt_entries; i++)
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 0a0ea4b5dd8ce7..751bca845d6aa8 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -278,4 +278,14 @@ static inline u8 kvm_xapic_id(struct kvm_lapic *apic)
 	return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
 }
 
+static inline u32 kvm_apic_id_masked(struct kvm *kvm, u32 apic_id)
+{
+	return apic_id & ~(0xFFFFFFFF << kvm->arch.apic_id_mask_shift);
+}
+
+static inline u32 kvm_apic_id_mask_value(struct kvm *kvm, u32 apic_id)
+{
+	return apic_id >> kvm->arch.apic_id_mask_shift;
+}
+
 #endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 51fbb35417ae04..ec10383a10b57c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4527,6 +4527,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
 	case KVM_CAP_IRQFD_RESAMPLE:
 	case KVM_CAP_MEMORY_FAULT_INFO:
+	case KVM_CAP_APIC_ID_MASK:
 		r = 1;
 		break;
 	case KVM_CAP_EXIT_HYPERCALL:
@@ -6783,6 +6784,15 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
 	return 0;
 }
 
+static int kvm_vm_ioctl_set_apic_id_mask(struct kvm *kvm, struct kvm_apic_id_mask *mask)
+{
+	if (mask->width > 32)
+		return -EINVAL;
+
+	kvm->arch.apic_id_mask_shift = 32 - mask->width;
+	return 0;
+}
+
 int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 {
 	struct kvm *kvm = filp->private_data;
@@ -7159,6 +7169,20 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 		kfree(vsm_state);
 		break;
 	}
+	case KVM_SET_APIC_ID_MASK: {
+		struct kvm_apic_id_mask *mask;
+
+		r = -EINVAL;
+
+		mask = memdup_user(argp, sizeof(*mask));
+		if (IS_ERR(mask)) {
+			r = PTR_ERR(mask);
+			goto out;
+		}
+		r = kvm_vm_ioctl_set_apic_id_mask(kvm, mask);
+		kfree(mask);
+		break;
+	}
 	default:
 		r = -ENOTTY;
 	}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 07aafb046e4473..d87cca1c715c06 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1238,6 +1238,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_GUEST_MEMFD 233
 #define KVM_CAP_VM_TYPES 234
 #define KVM_CAP_HYPERV_VSM 235
+#define KVM_CAP_APIC_ID_MASK 236
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -2339,4 +2340,5 @@ struct kvm_create_guest_memfd {
 #define KVM_HV_GET_VSM_STATE	_IOR(KVMIO, 0xd5, struct kvm_hv_vsm_state)
 #define KVM_HV_SET_VSM_STATE	_IOW(KVMIO, 0xd6, struct kvm_hv_vsm_state)
 
+#define KVM_SET_APIC_ID_MASK	_IOW(KVMIO, 0xd7, struct kvm_apic_id_mask)
 #endif /* __LINUX_KVM_H */
From 7effa7b6aae0fdc19806e3d7e9a7c09e2e9445ba Mon Sep 17 00:00:00 2001
From: Anel Orazgaliyeva
Date: Fri, 27 Oct 2023 12:50:26 +0000
Subject: [PATCH 24/26] KVM: x86: hyperv: use APIC id mask to calculate VTL
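
Instead of tracking the active VTL in kvm_vcpu_hv, derive it from the
vCPU id using the APIC ID mask introduced in the previous patch: the
low bits of vcpu_id are the per-VTL APIC ID, and the bits above
apic_id_mask_shift are the VTL itself.

As a worked example (assuming the illustrative 8-bit mask width, i.e.
a shift of 24), the helpers below decompose as:

	vcpu_id = (vtl << 24) | apic_id;

	kvm_hv_get_active_vtl(vcpu)	/* vcpu_id >> 24, the VTL       */
	kvm_apic_id_masked(kvm, id)	/* id & 0x00ffffff, the APIC ID */
	kvm_hv_get_vtl_vcpu(vcpu, 1)	/* same APIC ID, but at VTL1    */
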
Signed-off-by: Anel Orazgaliyeva
---
 arch/x86/kvm/hyperv-vsm.c |  4 ++--
 arch/x86/kvm/hyperv.c     | 12 ++++++------
 arch/x86/kvm/hyperv.h     | 14 --------------
 arch/x86/kvm/lapic.h      | 11 +++++++++++
 4 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/arch/x86/kvm/hyperv-vsm.c b/arch/x86/kvm/hyperv-vsm.c
index 5ac269afa91e17..1515cf4d0a1544 100644
--- a/arch/x86/kvm/hyperv-vsm.c
+++ b/arch/x86/kvm/hyperv-vsm.c
@@ -26,7 +26,7 @@ static struct xarray *kvm_hv_vsm_get_memprots(struct kvm_vcpu *vcpu);
 static void kvm_hv_inject_gpa_intercept(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 {
 	struct kvm_vcpu *target_vcpu =
-		kvm_hv_get_vtl_vcpu(vcpu, get_active_vtl(vcpu) + 1);
+		kvm_hv_get_vtl_vcpu(vcpu, kvm_hv_get_active_vtl(vcpu) + 1);
 	struct kvm_vcpu_hv_intercept_info *intercept =
 		&target_vcpu->arch.hyperv->intercept_info;
 
@@ -162,7 +162,7 @@ static struct xarray *kvm_hv_vsm_get_memprots(struct kvm_vcpu *vcpu)
 	list_for_each_entry(tmp, &vcpu->kvm->devices, vm_node)
 		if (tmp->ops == &kvm_hv_vtl_ops) {
 			vtl_dev = tmp->private;
-			if (vtl_dev->vtl == get_active_vtl(vcpu))
+			if (vtl_dev->vtl == kvm_hv_get_active_vtl(vcpu))
 				return &vtl_dev->mem_attrs;
 		}
 
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 6ee6f7ef9617fa..ad2e1520752e39 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -64,7 +64,7 @@
 void kvm_tdp_mmu_role_set_hv_bits(struct kvm_vcpu *vcpu,
 				  union kvm_mmu_page_role *role)
 {
-	role->vtl = to_kvm_hv(vcpu->kvm)->hv_enable_vsm ? get_active_vtl(vcpu) : 0;
+	role->vtl = to_kvm_hv(vcpu->kvm)->hv_enable_vsm ? kvm_hv_get_active_vtl(vcpu) : 0;
 }
 
 static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
@@ -2319,7 +2319,7 @@ static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector,
 					    valid_bank_mask, sparse_banks))
 			continue;
 
-		if (get_active_vtl(vcpu) != vtl)
+		if (kvm_hv_get_active_vtl(vcpu) != vtl)
 			continue;
 
 		/* We fail only when APIC is disabled */
@@ -2342,7 +2342,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
 
 	/* VTL is at the same offset on both IPI types */
 	in_vtl = &send_ipi.in_vtl;
-	vtl = in_vtl->use_target_vtl ? in_vtl->target_vtl : get_active_vtl(vcpu);
+	vtl = in_vtl->use_target_vtl ? in_vtl->target_vtl : kvm_hv_get_active_vtl(vcpu);
 
 	if (hc->code == HVCALL_SEND_IPI) {
 		if (!hc->fast) {
@@ -2663,9 +2663,9 @@ static bool kvm_hv_xlate_va_validate_input(struct kvm_vcpu* vcpu,
 		pr_info_ratelimited("Translate VA control flags unsupported and will be ignored: 0x%llx\n",
 				    in->control_flags);
 
-	*vtl = in_vtl.use_target_vtl ? in_vtl.target_vtl : get_active_vtl(vcpu);
+	*vtl = in_vtl.use_target_vtl ? in_vtl.target_vtl : kvm_hv_get_active_vtl(vcpu);
 
-	if (*vtl >= HV_NUM_VTLS || *vtl > get_active_vtl(vcpu))
+	if (*vtl >= HV_NUM_VTLS || *vtl > kvm_hv_get_active_vtl(vcpu))
 		return false;
 
 	return true;
@@ -3485,7 +3485,7 @@ void dump_ftrace_vcpu_hyperv(struct kvm_vcpu *vcpu)
 	struct hv_vp_vtl_control vtl_control;
 
 	trace_printk("*** HyperV VTL state ***\n");
-	if (get_active_vtl(vcpu) && hv_read_vtl_control(vcpu, &vtl_control))
+	if (kvm_hv_get_active_vtl(vcpu) && hv_read_vtl_control(vcpu, &vtl_control))
 		trace_printk("entry_reason 0x%x, vina %d, rax %llx, rcx %llx\n",
 			     vtl_control.vtl_entry_reason, vtl_control.vina_asserted,
 			     vtl_control.vtl_ret_x64rax, vtl_control.vtl_ret_x64rcx);
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index e6d4ceebf955f0..8bdc74902c9972 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -64,16 +64,6 @@ static inline struct kvm_vcpu_hv *to_hv_vcpu(struct kvm_vcpu *vcpu)
 	return vcpu->arch.hyperv;
 }
 
-static inline u8 get_active_vtl(struct kvm_vcpu *vcpu)
-{
-	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
-
-	if (!hv_vcpu)
-		return 0;
-
-	return hv_vcpu->vp_index;
-}
-
 static inline struct kvm_vcpu_hv_synic *to_hv_synic(struct kvm_vcpu *vcpu)
 {
 	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
@@ -260,10 +250,6 @@ void kvm_hv_vtl_dev_unregister(void);
 
 int kvm_hv_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
 
-static inline struct kvm_vcpu *kvm_hv_get_vtl_vcpu(struct kvm_vcpu *vcpu, int vtl)
-{
-	return kvm_get_vcpu_by_id(vcpu->kvm, vtl);
-}
 void kvm_hv_deliver_intercept(struct kvm_vcpu *vcpu);
 
 void dump_ftrace_vcpu_hyperv(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 751bca845d6aa8..3207ad787f48a2 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -288,4 +288,15 @@ static inline u32 kvm_apic_id_mask_value(struct kvm *kvm, u32 apic_id)
 	return apic_id >> kvm->arch.apic_id_mask_shift;
 }
 
+static inline struct kvm_vcpu *kvm_hv_get_vtl_vcpu(struct kvm_vcpu *vcpu, int vtl)
+{
+	return kvm_get_vcpu_by_id(vcpu->kvm,
+		kvm_apic_id_masked(vcpu->kvm, vcpu->vcpu_id) | (vtl << vcpu->kvm->arch.apic_id_mask_shift));
+}
+
+static inline u8 kvm_hv_get_active_vtl(struct kvm_vcpu *vcpu)
+{
+	return (u8)(kvm_apic_id_mask_value(vcpu->kvm, vcpu->vcpu_id));
+}
+
 #endif
From 23de295bed5a77e8fe63a6d746b2de7d4c43bba6 Mon Sep 17 00:00:00 2001
From: Anel Orazgaliyeva
Date: Wed, 1 Nov 2023 16:08:51 +0000
Subject: [PATCH 25/26] KVM: x86: hyperv: Set correct VTL for self IPIs
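
kvm_apic_send_ipi() builds the destination from the ICR, which only
carries the guest-visible (masked) APIC ID. OR in the sender's VTL
bits so a self- or cross-IPI is delivered to the vCPU instance at the
sender's own VTL rather than falling through to VTL0.

For illustration, with the 8-bit mask width used in earlier examples,
a self-IPI from APIC ID 3 at VTL1 (vcpu_id 0x1000003) used to resolve
to dest_id 3, i.e. the VTL0 sibling; with this change:

	irq.dest_id = 3 | (1 << 24);	/* back to the VTL1 vCPU */
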
Signed-off-by: Anel Orazgaliyeva
---
 arch/x86/kvm/lapic.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index bec6a3c04421fb..a0969ac24d79af 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1519,6 +1519,7 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high)
 	else
 		irq.dest_id = GET_XAPIC_DEST_FIELD(icr_high);
 
+	irq.dest_id |= kvm_hv_get_active_vtl(apic->vcpu) << apic->vcpu->kvm->arch.apic_id_mask_shift;
 	trace_kvm_apic_ipi(icr_low, irq.dest_id);
 
 	kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL);
From b8c2bf163340ea0d8262bd0de757bc564002d32a Mon Sep 17 00:00:00 2001
From: Anel Orazgaliyeva
Date: Mon, 30 Oct 2023 08:20:40 +0000
Subject: [PATCH 26/26] temp hack: Ignore MAX_VCPU_ID limit for APIC_ID

Signed-off-by: Anel Orazgaliyeva
---
 arch/x86/kvm/x86.c  | 2 +-
 virt/kvm/kvm_main.c | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ec10383a10b57c..3c44ec76fa927f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12122,7 +12122,7 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
 	if (!kvm->arch.max_vcpu_ids)
 		kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS;
 
-	if (id >= kvm->arch.max_vcpu_ids)
+	if (kvm_apic_id_masked(kvm, id) >= kvm->arch.max_vcpu_ids)
 		return -EINVAL;
 
 	return static_call(kvm_x86_vcpu_precreate)(kvm);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 482218b73554ee..c2b663bdee2cbe 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4285,8 +4285,9 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 	struct kvm_vcpu *vcpu;
 	struct page *page;
 
-	if (id >= KVM_MAX_VCPU_IDS)
-		return -EINVAL;
+	/* TODO: fix this */
+//	if (id >= KVM_MAX_VCPU_IDS)
+//		return -EINVAL;
 
 	mutex_lock(&kvm->lock);
 	if (kvm->created_vcpus >= kvm->max_vcpus) {