diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 2ff26f53cd6244..fbb7f05aa517a8 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -49,7 +49,7 @@ /* Support for physical CPU dynamic partitioning events is available*/ #define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE BIT(3) /* - * Support for passing hypercall input parameter block via XMM + * Support for passing hypercall input and output parameter block via XMM * registers is available */ #define HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE BIT(4) @@ -387,9 +387,6 @@ struct hv_tsc_emulation_status { #define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001 #define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12 -/* Number of XMM registers used in hypercall input/output */ -#define HV_HYPERCALL_MAX_XMM_REGISTERS 6 - struct hv_nested_enlightenments_control { struct { __u32 directhypercall:1; @@ -401,14 +398,39 @@ struct hv_nested_enlightenments_control { } hypercallControls; } __packed; +struct hv_vp_vtl_control { + __u32 vtl_entry_reason; + + union { + __u8 as_u8; + struct { + __u8 vina_asserted:1; + __u8 reserved0:7; + }; + }; + + __u8 reserved1[3]; + + union { + struct { + __u64 vtl_ret_x64rax; + __u64 vtl_ret_x64rcx; + }; + + struct { + __u32 vtl_return_x86_eax; + __u32 vtl_return_x86_ecx; + __u32 vtl_return_x86_edx; + __u32 reserved2; + }; + }; +}; + /* Define virtual processor assist page structure. */ struct hv_vp_assist_page { __u32 apic_assist; __u32 reserved1; - __u32 vtl_entry_reason; - __u32 vtl_reserved; - __u64 vtl_ret_x64rax; - __u64 vtl_ret_x64rcx; + struct hv_vp_vtl_control vtl_control; struct hv_nested_enlightenments_control nested_control; __u8 enlighten_vmentry; __u8 reserved2[7]; @@ -799,6 +821,82 @@ struct hv_get_vp_from_apic_id_in { u32 apic_ids[]; } __packed; + +/* struct hv_intercept_header::access_type_mask */ +#define HV_INTERCEPT_ACCESS_MASK_NONE 0 +#define HV_INTERCEPT_ACCESS_MASK_READ 1 +#define HV_INTERCEPT_ACCESS_MASK_WRITE 2 +#define HV_INTERCEPT_ACCESS_MASK_EXECUTE 4 + +/* struct hv_intercept_exception::cache_type */ +#define HV_X64_CACHE_TYPE_UNCACHED 0 +#define HV_X64_CACHE_TYPE_WRITECOMBINING 1 +#define HV_X64_CACHE_TYPE_WRITETHROUGH 4 +#define HV_X64_CACHE_TYPE_WRITEPROTECTED 5 +#define HV_X64_CACHE_TYPE_WRITEBACK 6 + +/* Intecept message header */ +struct hv_intercept_header { + __u32 vp_index; + __u8 instruction_length; +#define HV_INTERCEPT_ACCESS_READ 0 +#define HV_INTERCEPT_ACCESS_WRITE 1 +#define HV_INTERCEPT_ACCESS_EXECUTE 2 + __u8 access_type_mask; + union { + __u16 as_u16; + struct { + __u16 cpl:2; + __u16 cr0_pe:1; + __u16 cr0_am:1; + __u16 efer_lma:1; + __u16 debug_active:1; + __u16 interruption_pending:1; + __u16 reserved:9; + }; + } exec_state; + struct hv_x64_segment_register cs; + __u64 rip; + __u64 rflags; +} __packed; + +union hv_x64_memory_access_info { + __u8 as_u8; + struct { + __u8 gva_valid:1; + __u8 _reserved:7; + }; +}; + +struct hv_memory_intercept_message { + struct hv_intercept_header header; + __u32 cache_type; + __u8 instruction_byte_count; + union hv_x64_memory_access_info memory_access_info; + __u16 _reserved; + __u64 gva; + __u64 gpa; + __u8 instruction_bytes[16]; + struct hv_x64_segment_register ds; + struct hv_x64_segment_register ss; + __u64 rax; + __u64 rcx; + __u64 rdx; + __u64 rbx; + __u64 rsp; + __u64 rbp; + __u64 rsi; + __u64 rdi; + __u64 r8; + __u64 r9; + __u64 r10; + __u64 r11; + __u64 r12; + __u64 r13; + __u64 r14; + __u64 r15; +} __packed; + #include #endif diff --git 
a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dff10051e9b63c..b837b1f8ffd70f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -113,6 +113,7 @@ KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HV_TLB_FLUSH \ KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_HV_INJECT_INTERCEPT KVM_ARCH_REQ(33) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -338,7 +339,8 @@ union kvm_mmu_page_role { unsigned ad_disabled:1; unsigned guest_mode:1; unsigned passthrough:1; - unsigned :5; + unsigned vtl:4; + unsigned :1; /* * This is left at the top of the word so that @@ -638,6 +640,13 @@ struct kvm_vcpu_hv_tlb_flush_fifo { DECLARE_KFIFO(entries, u64, KVM_HV_TLB_FLUSH_FIFO_SIZE); }; +struct kvm_vcpu_hv_intercept_info { + struct kvm_vcpu *vcpu; + int type; + u64 gpa; + u8 access; +}; + /* Hyper-V per vcpu emulation context */ struct kvm_vcpu_hv { struct kvm_vcpu *vcpu; @@ -672,6 +681,8 @@ struct kvm_vcpu_hv { u64 vm_id; u32 vp_id; } nested; + + struct kvm_vcpu_hv_intercept_info intercept_info; }; struct kvm_hypervisor_cpuid { @@ -966,6 +977,8 @@ struct kvm_vcpu_arch { /* set at EPT violation at this point */ unsigned long exit_qualification; + u32 exit_instruction_len; + /* pv related host specific info */ struct { bool pv_unhalted; @@ -1105,6 +1118,9 @@ struct kvm_hv { u64 hv_tsc_emulation_status; u64 hv_invtsc_control; + union hv_register_vsm_code_page_offsets vsm_code_page_offsets32; + union hv_register_vsm_code_page_offsets vsm_code_page_offsets64; + /* How many vCPUs have VP index != vCPU index */ atomic_t num_mismatched_vp_indexes; @@ -1116,6 +1132,9 @@ struct kvm_hv { struct hv_partition_assist_pg *hv_pa_pg; struct kvm_hv_syndbg hv_syndbg; + + /* status of KVM_CAP_HYPERV_VSM */ + bool hv_enable_vsm; }; struct msr_bitmap_range { @@ -1298,6 +1317,8 @@ struct kvm_arch { struct rw_semaphore apicv_update_lock; unsigned long apicv_inhibit_reasons; + uint64_t apic_id_mask_shift; + gpa_t wall_clock; bool mwait_in_guest; @@ -1495,6 +1516,7 @@ struct kvm_vcpu_stat { u64 pf_fast; u64 pf_mmio_spte_created; u64 pf_guest; + u64 pf_user; u64 tlb_flush; u64 invlpg; @@ -1981,6 +2003,8 @@ void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); +void dump_ftrace_vmcs(struct kvm_vcpu *vcpu); +void dump_ftrace_vcpu_state(struct kvm_vcpu * vcpu); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index a448d0964fc06e..574152d5aae54d 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -565,4 +565,14 @@ struct kvm_pmu_event_filter { #define KVM_X86_DEFAULT_VM 0 #define KVM_X86_SW_PROTECTED_VM 1 +/* Partition-wide VSM state; for KVM_HV_GET/SET_VSM_STATE */ +struct kvm_hv_vsm_state { + __u64 vsm_code_page_offsets64; + __u64 vsm_code_page_offsets32; +}; + +struct kvm_apic_id_mask { + __u64 width; /* mask width in bits, max 32 */ +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 8452ed0228cb6b..41e11bbc1e6eee 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -166,4 +166,11 @@ config KVM_PROVE_MMU config KVM_EXTERNAL_WRITE_TRACKING bool 
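A minimal, userspace-style sketch of the APIC-ID/VTL encoding that the new apic_id_mask_shift field and struct kvm_apic_id_mask imply, and that kvm_apic_id_masked()/kvm_hv_get_active_vtl() later in the series decode. The helper names below are illustrative only and not part of the patch:

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: low 'width' bits carry the VP index, the bits above carry the VTL. */
static uint32_t vtl_vcpu_id(uint32_t vp_index, uint8_t vtl, unsigned int width)
{
	uint32_t vp_mask = (uint32_t)~(~0ull << width);

	return (vp_index & vp_mask) | ((uint32_t)vtl << width);
}

static uint8_t active_vtl(uint32_t vcpu_id, unsigned int width)
{
	return (uint8_t)(vcpu_id >> width);
}

int main(void)
{
	/* With a mask width of 8, VP 3 running in VTL1 becomes vcpu_id 0x103. */
	uint32_t id = vtl_vcpu_id(3, 1, 8);

	printf("vcpu_id=0x%x vtl=%u\n", id, (unsigned)active_vtl(id, 8));
	return 0;
}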
+config KVM_HYPERV_VSM + bool "KVM Hyper-V Virtual Secure Mode (VSM) support" + select KVM_GENERIC_MEMORY_ATTRIBUTES + help + Enables the KVM VSM device and all dependencies necessary to + emulate Hyper-V's VSM. + endif # VIRTUALIZATION diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 80e3fe184d17e6..e8cec84b0d4e67 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -11,8 +11,8 @@ include $(srctree)/virt/kvm/Makefile.kvm kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ - hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \ - mmu/spte.o + hyperv.o debugfs.o \ + mmu/mmu.o mmu/page_track.o mmu/spte.o ifdef CONFIG_HYPERV kvm-y += kvm_onhyperv.o @@ -21,6 +21,7 @@ endif kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o kvm-$(CONFIG_KVM_XEN) += xen.o kvm-$(CONFIG_KVM_SMM) += smm.o +kvm-$(CONFIG_KVM_HYPERV_VSM) += hyperv-vsm.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ vmx/hyperv.o vmx/nested.o vmx/posted_intr.o diff --git a/arch/x86/kvm/hyperv-vsm.c b/arch/x86/kvm/hyperv-vsm.c new file mode 100644 index 00000000000000..1515cf4d0a1544 --- /dev/null +++ b/arch/x86/kvm/hyperv-vsm.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM Microsoft Hyper-V VSM emulation + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "mmu/mmu_internal.h" +#include "hyperv.h" +#include "lapic.h" +#include "trace.h" + +#include + +#define KVM_HV_VTL_ATTRS \ + (KVM_MEMORY_ATTRIBUTE_READ | KVM_MEMORY_ATTRIBUTE_WRITE | \ + KVM_MEMORY_ATTRIBUTE_EXECUTE | KVM_MEMORY_ATTRIBUTE_NO_ACCESS) + +struct kvm_hv_vtl_dev { + int vtl; + struct xarray mem_attrs; +}; + +static struct xarray *kvm_hv_vsm_get_memprots(struct kvm_vcpu *vcpu); + +static void kvm_hv_inject_gpa_intercept(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +{ + struct kvm_vcpu *target_vcpu = + kvm_hv_get_vtl_vcpu(vcpu, kvm_hv_get_active_vtl(vcpu) + 1); + struct kvm_vcpu_hv_intercept_info *intercept = + &target_vcpu->arch.hyperv->intercept_info; + + WARN_ON_ONCE(!to_kvm_hv(vcpu->kvm)->hv_enable_vsm); + + intercept->type = HVMSG_GPA_INTERCEPT; + intercept->gpa = fault->addr; + intercept->access = + (fault->user ? HV_INTERCEPT_ACCESS_READ : 0) | + (fault->write ? HV_INTERCEPT_ACCESS_WRITE : 0) | + (fault->exec ?
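/*
 * Not part of the patch, summarizing the flow here: the faulting access is
 * encoded with the HV_INTERCEPT_ACCESS_* values (fault->user corresponds to a
 * read access, since the EPT violation read bit is reported through
 * PFERR_USER_MASK), and the intercept is queued on the vCPU backing the same
 * VP at the next higher VTL via KVM_REQ_HV_INJECT_INTERCEPT plus a kick.
 */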
HV_INTERCEPT_ACCESS_EXECUTE : 0); + intercept->vcpu = vcpu; + + kvm_make_request(KVM_REQ_HV_INJECT_INTERCEPT, target_vcpu); + kvm_vcpu_kick(target_vcpu); +} + + +bool kvm_hv_vsm_access_valid(struct kvm_page_fault *fault, unsigned long attrs) +{ + if (attrs == KVM_MEMORY_ATTRIBUTE_NO_ACCESS) + return false; + + if (fault->write && !(attrs & KVM_MEMORY_ATTRIBUTE_WRITE)) + return false; + + if (fault->exec && !(attrs & KVM_MEMORY_ATTRIBUTE_EXECUTE)) + return false; + + return true; +} + +static unsigned long kvm_hv_vsm_get_memory_attributes(struct kvm_vcpu *vcpu, + gfn_t gfn) +{ + struct xarray *prots = kvm_hv_vsm_get_memprots(vcpu); + + if (!prots) + return 0; + + return xa_to_value(xa_load(prots, gfn)); +} + +int kvm_hv_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +{ + unsigned long attrs; + + attrs = kvm_hv_vsm_get_memory_attributes(vcpu, fault->gfn); + if (!attrs) + return RET_PF_CONTINUE; + + trace_kvm_hv_faultin_pfn(vcpu->vcpu_id, fault->gfn, fault->write, + fault->exec, fault->user, attrs); + + if (kvm_hv_vsm_access_valid(fault, attrs)) { + fault->map_executable = attrs & KVM_MEMORY_ATTRIBUTE_EXECUTE; + fault->map_writable = attrs & KVM_MEMORY_ATTRIBUTE_WRITE; + return RET_PF_CONTINUE; + } + + kvm_hv_inject_gpa_intercept(vcpu, fault); + kvm_prepare_memory_fault_exit(vcpu, fault->addr, PAGE_SIZE, + fault->write, fault->exec, fault->user, + fault->is_private); + return RET_PF_USER; +} + +static int kvm_hv_vtl_get_attr(struct kvm_device *dev, + struct kvm_device_attr *attr) +{ + struct kvm_hv_vtl_dev *vtl_dev = dev->private; + + switch (attr->group) { + case KVM_DEV_HV_VTL_GROUP: + switch (attr->attr){ + case KVM_DEV_HV_VTL_GROUP_VTLNUM: + return put_user(vtl_dev->vtl, (u32 __user *)attr->addr); + } + } + + return -EINVAL; +} + +static void kvm_hv_vtl_release(struct kvm_device *dev) +{ + struct kvm_hv_vtl_dev *vtl_dev = dev->private; + + xa_destroy(&vtl_dev->mem_attrs); + kfree(vtl_dev); + kfree(dev); /* alloc by kvm_ioctl_create_device, free by .release */ +} + +static long kvm_hv_vtl_ioctl(struct kvm_device *dev, unsigned int ioctl, + unsigned long arg) +{ + switch (ioctl) { + case KVM_SET_MEMORY_ATTRIBUTES: { + struct kvm_hv_vtl_dev *vtl_dev = dev->private; + struct kvm_memory_attributes attrs; + int r; + + if (copy_from_user(&attrs, (void __user *)arg, sizeof(attrs))) + return -EFAULT; + + r = kvm_ioctl_set_mem_attributes(dev->kvm, &vtl_dev->mem_attrs, + KVM_HV_VTL_ATTRS, &attrs); + if (r) + return r; + break; + } + default: + return -ENOTTY; + } + + return 0; +} + +static int kvm_hv_vtl_create(struct kvm_device *dev, u32 type); + +static struct kvm_device_ops kvm_hv_vtl_ops = { + .name = "kvm-hv-vtl", + .create = kvm_hv_vtl_create, + .release = kvm_hv_vtl_release, + .ioctl = kvm_hv_vtl_ioctl, + .get_attr = kvm_hv_vtl_get_attr, +}; + +static struct xarray *kvm_hv_vsm_get_memprots(struct kvm_vcpu *vcpu) +{ + struct kvm_hv_vtl_dev *vtl_dev; + struct kvm_device *tmp; + + list_for_each_entry(tmp, &vcpu->kvm->devices, vm_node) + if (tmp->ops == &kvm_hv_vtl_ops) { + vtl_dev = tmp->private; + if (vtl_dev->vtl == kvm_hv_get_active_vtl(vcpu)) + return &vtl_dev->mem_attrs; + } + + return NULL; +} + +static int kvm_hv_vtl_create(struct kvm_device *dev, u32 type) +{ + struct kvm_hv_vtl_dev *vtl_dev; + struct kvm_device *tmp; + int vtl = 0; + + vtl_dev = kzalloc(sizeof(*vtl_dev), GFP_KERNEL_ACCOUNT); + if (!vtl_dev) + return -ENOMEM; + + /* Device creation is protected by kvm->lock */ + list_for_each_entry(tmp, &dev->kvm->devices, vm_node) + if (tmp->ops == &kvm_hv_vtl_ops) + 
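/*
 * Not part of the patch, a note on the device model above: each kvm-hv-vtl
 * device gets its VTL number from creation order and owns an xarray of
 * per-gfn memory attributes, filled through KVM_SET_MEMORY_ATTRIBUTES on the
 * device fd. kvm_hv_faultin_pfn() looks up the xarray matching the faulting
 * vCPU's active VTL: if the access is allowed it only narrows
 * map_writable/map_executable, otherwise it injects a GPA intercept into
 * VTL+1 and returns RET_PF_USER so the fault also reaches userspace as a
 * memory-fault exit.
 */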
vtl++; + + vtl_dev->vtl = vtl; + xa_init(&vtl_dev->mem_attrs); + dev->private = vtl_dev; + + return 0; +} + +int kvm_hv_vtl_dev_register(void) +{ + return kvm_register_device_ops(&kvm_hv_vtl_ops, KVM_DEV_TYPE_HV_VSM_VTL); +} + +void kvm_hv_vtl_dev_unregister(void) +{ + kvm_unregister_device_ops(KVM_DEV_TYPE_HV_VSM_VTL); +} diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 238afd7335e46d..ad2e1520752e39 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -62,6 +62,11 @@ */ #define HV_EXT_CALL_MAX (HV_EXT_CALL_QUERY_CAPABILITIES + 64) +void kvm_tdp_mmu_role_set_hv_bits(struct kvm_vcpu *vcpu, union kvm_mmu_page_role *role) +{ + role->vtl = to_kvm_hv(vcpu->kvm)->hv_enable_vsm ? kvm_hv_get_active_vtl(vcpu) : 0; +} + static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, bool vcpu_kick); @@ -256,6 +261,168 @@ static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr) kvm_make_request(KVM_REQ_HV_EXIT, vcpu); } +static int patch_hypercall_page(struct kvm_vcpu *vcpu, u64 data) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_hv *hv = to_kvm_hv(kvm); + u8 instructions[0x30]; + int i = 0; + u64 addr; + + /* + * If Xen and Hyper-V hypercalls are both enabled, disambiguate + * the same way Xen itself does, by setting the bit 31 of EAX + * which is RsvdZ in the 32-bit Hyper-V hypercall ABI and just + * going to be clobbered on 64-bit. + */ + if (kvm_xen_hypercall_enabled(kvm)) { + /* orl $0x80000000, %eax */ + instructions[i++] = 0x0d; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0x80; + } + + /* vmcall/vmmcall */ + static_call(kvm_x86_patch_hypercall)(vcpu, instructions + i); + i += 3; + + /* ret */ + ((unsigned char *)instructions)[i++] = 0xc3; + + /* VTL call/return entries */ + if (!kvm_xen_hypercall_enabled(kvm) && kvm->arch.hyperv.hv_enable_vsm) { + /* + * VTL call 32-bit entry prologue: + * mov %eax, %ecx + * mov $0x11, %eax + * jmp 0: + */ + hv->vsm_code_page_offsets32.vtl_call_offset = i; + instructions[i++] = 0x89; + instructions[i++] = 0xc1; + instructions[i++] = 0xb8; + instructions[i++] = 0x11; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0xeb; + instructions[i++] = 0xf3; + /* + * VTL return 32-bit entry prologue: + * mov %eax, %ecx + * mov $0x12, %eax + * jmp 0: + */ + hv->vsm_code_page_offsets32.vtl_return_offset = i; + instructions[i++] = 0x89; + instructions[i++] = 0xc1; + instructions[i++] = 0xb8; + instructions[i++] = 0x12; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0xeb; + instructions[i++] = 0xea; + +#ifdef CONFIG_X86_64 + /* + * VTL call 64-bit entry prologue: + * mov %rcx, %rax + * mov $0x11, %ecx + * jmp 0: + */ + hv->vsm_code_page_offsets64.vtl_call_offset = i; + instructions[i++] = 0x48; + instructions[i++] = 0x89; + instructions[i++] = 0xc8; + instructions[i++] = 0xb9; + instructions[i++] = 0x11; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0xeb; + instructions[i++] = 0xe0; + /* + * VTL return 64-bit entry prologue: + * mov %rcx, %rax + * mov $0x12, %ecx + * jmp 0: + */ + hv->vsm_code_page_offsets64.vtl_return_offset = i; + instructions[i++] = 0x48; + instructions[i++] = 0x89; + instructions[i++] = 0xc8; + instructions[i++] = 0xb9; + instructions[i++] = 0x12; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0x00; + instructions[i++] = 0xeb; + instructions[i++] = 
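/*
 * Not part of the patch, decoding the byte sequences above: 0x89 0xc1 is
 * "mov %eax, %ecx", 0x48 0x89 0xc8 is "mov %rcx, %rax", 0xb8/0xb9 imm32 load
 * the hypercall code (0x11 = HVCALL_VTL_CALL, 0x12 = HVCALL_VTL_RETURN), and
 * 0xeb rel8 is a short jump. The displacements (-13, -22, -32 and -42 for the
 * four prologues) all land on offset 0 of the hypercall page, i.e. on the
 * vmcall/vmmcall emitted first, which is why these entries are only generated
 * when the Xen prefix is absent.
 */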
0xd6; +#endif + } + addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK; + if (kvm_vcpu_write_guest(vcpu, addr, instructions, i)) + return 1; + + return 0; +} + +static int set_vp_assist_page(struct kvm_vcpu *vcpu, u64 data); + +static int kvm_hv_overlay_completion(struct kvm_vcpu *vcpu) +{ + struct kvm_hyperv_exit *exit = &vcpu->run->hyperv; + u64 data = exit->u.overlay.gpa; + int r = exit->u.overlay.error; + + if (r) + goto out; + + switch (exit->u.overlay.msr) { + case HV_X64_MSR_GUEST_OS_ID: + break; + case HV_X64_MSR_HYPERCALL: + r = patch_hypercall_page(vcpu, data); + break; + case HV_X64_MSR_VP_ASSIST_PAGE: + r = set_vp_assist_page(vcpu, data); + break; + default: + r = 1; + pr_err("%s: unknown overlay MSR, %x\n", __func__, + exit->u.overlay.msr); + } + +out: + if (r) { + if (exit->u.overlay.is_hypercall) + kvm_queue_exception(vcpu, UD_VECTOR); + else + kvm_inject_gp(vcpu, 0); + } + return 1; +} + +static int overlay_exit(struct kvm_vcpu *vcpu, u32 msr, u64 gpa, bool is_hypercall) +{ + struct kvm_hyperv_exit *exit = &to_hv_vcpu(vcpu)->exit; + + pr_info("%s, msr %x, gpa %llx\n", __func__, msr, gpa); + vcpu->run->exit_reason = KVM_EXIT_HYPERV; + exit->type = KVM_EXIT_HYPERV_OVERLAY; + exit->u.overlay.msr = msr; + exit->u.overlay.gpa = gpa; + exit->u.overlay.error = 0; + exit->u.overlay.is_hypercall = is_hypercall; + vcpu->arch.complete_userspace_io = kvm_hv_overlay_completion; + + kvm_make_request(KVM_REQ_HV_EXIT, vcpu); + return 0; +} + static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 data, bool host) { @@ -908,6 +1075,42 @@ void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu) vcpu->arch.hyperv = NULL; } +/* Write to VP assist page register */ +static int set_vp_assist_page(struct kvm_vcpu *vcpu, u64 data) +{ + u64 gfn; + unsigned long addr; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + trace_printk("vpu_id %d, gpa %llx\n", vcpu->vcpu_id, data); + if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) { + hv_vcpu->hv_vapic = data; + if (kvm_lapic_set_pv_eoi(vcpu, 0, 0)) + return 1; + return 0; + } + + gfn = data >> HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT; + addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); + if (kvm_is_error_hva(addr)) + return 1; + + /* + * Clear apic_assist portion of struct hv_vp_assist_page + * only, there can be valuable data in the rest which needs + * to be preserved e.g. on migration. 
+ */ + if (__put_user(0, (u32 __user *)addr)) + return 1; + hv_vcpu->hv_vapic = data; + kvm_vcpu_mark_page_dirty(vcpu, gfn); + if (kvm_lapic_set_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED, + sizeof(struct hv_vp_assist_page))) + return 1; + return 0; + +} + bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); @@ -1335,14 +1538,13 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, case HV_X64_MSR_GUEST_OS_ID: hv->hv_guest_os_id = data; /* setting guest os id to zero disables hypercall page */ - if (!hv->hv_guest_os_id) + if (!data) { hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; + if (kvm->arch.hyperv.hv_enable_vsm && !host) + return overlay_exit(vcpu, HV_X64_MSR_GUEST_OS_ID, data, false); + } break; - case HV_X64_MSR_HYPERCALL: { - u8 instructions[9]; - int i = 0; - u64 addr; - + case HV_X64_MSR_HYPERCALL: /* if guest os id is not set hypercall should remain disabled */ if (!hv->hv_guest_os_id) break; @@ -1351,34 +1553,16 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, break; } - /* - * If Xen and Hyper-V hypercalls are both enabled, disambiguate - * the same way Xen itself does, by setting the bit 31 of EAX - * which is RsvdZ in the 32-bit Hyper-V hypercall ABI and just - * going to be clobbered on 64-bit. - */ - if (kvm_xen_hypercall_enabled(kvm)) { - /* orl $0x80000000, %eax */ - instructions[i++] = 0x0d; - instructions[i++] = 0x00; - instructions[i++] = 0x00; - instructions[i++] = 0x00; - instructions[i++] = 0x80; + if (kvm->arch.hyperv.hv_enable_vsm) { + hv->hv_hypercall = data; + if (!host) + return overlay_exit(vcpu, HV_X64_MSR_HYPERCALL, data, false); + break; } - - /* vmcall/vmmcall */ - static_call(kvm_x86_patch_hypercall)(vcpu, instructions + i); - i += 3; - - /* ret */ - ((unsigned char *)instructions)[i++] = 0xc3; - - addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK; - if (kvm_vcpu_write_guest(vcpu, addr, instructions, i)) + if (patch_hypercall_page(vcpu, data)) return 1; hv->hv_hypercall = data; break; - } case HV_X64_MSR_REFERENCE_TSC: hv->hv_tsc_page = data; if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) { @@ -1497,36 +1681,10 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) hv_vcpu->vp_index = new_vp_index; break; } - case HV_X64_MSR_VP_ASSIST_PAGE: { - u64 gfn; - unsigned long addr; - - if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) { - hv_vcpu->hv_vapic = data; - if (kvm_lapic_set_pv_eoi(vcpu, 0, 0)) - return 1; - break; - } - gfn = data >> HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT; - addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); - if (kvm_is_error_hva(addr)) - return 1; - - /* - * Clear apic_assist portion of struct hv_vp_assist_page - * only, there can be valuable data in the rest which needs - * to be preserved e.g. on migration. 
- */ - if (__put_user(0, (u32 __user *)addr)) - return 1; - hv_vcpu->hv_vapic = data; - kvm_vcpu_mark_page_dirty(vcpu, gfn); - if (kvm_lapic_set_pv_eoi(vcpu, - gfn_to_gpa(gfn) | KVM_MSR_ENABLED, - sizeof(struct hv_vp_assist_page))) - return 1; - break; - } + case HV_X64_MSR_VP_ASSIST_PAGE: + if (vcpu->kvm->arch.hyperv.hv_enable_vsm && !host) + return overlay_exit(vcpu, HV_X64_MSR_VP_ASSIST_PAGE, data, false); + return set_vp_assist_page(vcpu, data); case HV_X64_MSR_EOI: return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data); case HV_X64_MSR_ICR: @@ -1815,6 +1973,7 @@ struct kvm_hv_hcall { u16 rep_idx; bool fast; bool rep; + bool xmm_dirty; sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS]; /* @@ -2143,8 +2302,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) ((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET); } +#define VTL_MASK 0x0 static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector, - u64 *sparse_banks, u64 valid_bank_mask) + u64 *sparse_banks, u64 valid_bank_mask, int vtl) { struct kvm_lapic_irq irq = { .delivery_mode = APIC_DM_FIXED, @@ -2155,10 +2315,13 @@ static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector, kvm_for_each_vcpu(i, vcpu, kvm) { if (sparse_banks && - !hv_is_vp_in_sparse_set(kvm_hv_get_vpindex(vcpu), + !hv_is_vp_in_sparse_set(kvm_hv_get_vpindex(vcpu) & VTL_MASK, valid_bank_mask, sparse_banks)) continue; + if (kvm_hv_get_active_vtl(vcpu) != vtl) + continue; + /* We fail only when APIC is disabled */ kvm_apic_set_irq(vcpu, &irq, NULL); } @@ -2171,13 +2334,19 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) struct kvm *kvm = vcpu->kvm; struct hv_send_ipi_ex send_ipi_ex; struct hv_send_ipi send_ipi; + union hv_input_vtl *in_vtl; u64 valid_bank_mask; u32 vector; bool all_cpus; + u8 vtl; + + /* VTL is at the same offset on both IPI types */ + in_vtl = &send_ipi.in_vtl; + vtl = in_vtl->use_target_vtl ? 
in_vtl->target_vtl : kvm_hv_get_active_vtl(vcpu); if (hc->code == HVCALL_SEND_IPI) { if (!hc->fast) { - if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi, + if (unlikely(kvm_vcpu_read_guest(vcpu, hc->ingpa, &send_ipi, sizeof(send_ipi)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; sparse_banks[0] = send_ipi.cpu_mask; @@ -2192,10 +2361,10 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) all_cpus = false; valid_bank_mask = BIT_ULL(0); - trace_kvm_hv_send_ipi(vector, sparse_banks[0]); + trace_kvm_hv_send_ipi(vector, sparse_banks[0], vtl); } else { if (!hc->fast) { - if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi_ex, + if (unlikely(kvm_vcpu_read_guest(vcpu, hc->ingpa, &send_ipi_ex, sizeof(send_ipi_ex)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; } else { @@ -2206,7 +2375,8 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector, send_ipi_ex.vp_set.format, - send_ipi_ex.vp_set.valid_bank_mask); + send_ipi_ex.vp_set.valid_bank_mask, + vtl); vector = send_ipi_ex.vector; valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask; @@ -2236,9 +2406,9 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) return HV_STATUS_INVALID_HYPERCALL_INPUT; if (all_cpus) - kvm_hv_send_ipi_to_many(kvm, vector, NULL, 0); + kvm_hv_send_ipi_to_many(kvm, vector, NULL, 0, vtl); else - kvm_hv_send_ipi_to_many(kvm, vector, sparse_banks, valid_bank_mask); + kvm_hv_send_ipi_to_many(kvm, vector, sparse_banks, valid_bank_mask, vtl); ret_success: return HV_STATUS_SUCCESS; @@ -2323,6 +2493,18 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) } } +static void kvm_hv_hypercall_set_xmm_regs(struct kvm_vcpu *vcpu) +{ + u64 *xmm = vcpu->run->hyperv.u.hcall.xmm; + int reg; + + kvm_fpu_get(); + for (reg = 0; reg < HV_HYPERCALL_MAX_XMM_REGISTERS; reg++) + //TODO: This is not great :( + _kvm_write_sse_reg(reg, &(const sse128_t){sse128(xmm[reg * 2], xmm[(reg * 2) + 1])}); + kvm_fpu_put(); +} + static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result) { u32 tlb_lock_count = 0; @@ -2348,6 +2530,17 @@ static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result) static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) { + u16 call = vcpu->run->hyperv.u.hcall.input & 0xffff; + bool fast = !!(vcpu->run->hyperv.u.hcall.input & HV_HYPERCALL_FAST_BIT); + + //TODO: Not in love with this approach + if (call == HVCALL_GET_VP_REGISTERS && fast) + kvm_hv_hypercall_set_xmm_regs(vcpu); + + //TODO move this to qemu + if (call == HVCALL_VTL_CALL || call == HVCALL_VTL_RETURN) + return kvm_skip_emulated_instruction(vcpu); + return kvm_hv_hypercall_complete(vcpu, vcpu->run->hyperv.u.hcall.result); } @@ -2400,6 +2593,10 @@ static bool is_xmm_fast_hypercall(struct kvm_hv_hcall *hc) case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX: case HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX: case HVCALL_SEND_IPI_EX: + case HVCALL_GET_VP_REGISTERS: + case HVCALL_SET_VP_REGISTERS: + case HVCALL_MODIFY_VTL_PROTECTION_MASK: + case HVCALL_TRANSLATE_VIRTUAL_ADDRESS: return true; } @@ -2414,6 +2611,128 @@ static void kvm_hv_hypercall_read_xmm(struct kvm_hv_hcall *hc) for (reg = 0; reg < HV_HYPERCALL_MAX_XMM_REGISTERS; reg++) _kvm_read_sse_reg(reg, &hc->xmm[reg]); kvm_fpu_put(); + + /* It's not dirty because we've replaced any possible changes */ + hc->xmm_dirty = false; +} + +static void kvm_hv_hypercall_write_xmm(struct kvm_hv_hcall *hc) +{ + int reg; + + kvm_fpu_get(); + for (reg = 0; reg < 
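/*
 * Not part of the patch, a note on the XMM handling: for fast hypercalls the
 * input is snapshotted from XMM0..XMM5 into hc->xmm by
 * kvm_hv_hypercall_read_xmm(); hc->xmm_dirty marks output that an in-kernel
 * handler placed in that shadow copy, and kvm_hv_hypercall_write_xmm()
 * flushes it back to the real registers only when the hypercall succeeded.
 * Calls forwarded to userspace instead copy the shadow into
 * run->hyperv.u.hcall.xmm as two u64 halves per register, matching the
 * HV_X64_HYPERCALL_XMM_OUTPUT_AVAILABLE feature bit now advertised.
 */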
HV_HYPERCALL_MAX_XMM_REGISTERS; reg++) + _kvm_write_sse_reg(reg, &hc->xmm[reg]); + kvm_fpu_put(); + hc->xmm_dirty = false; +} + +static u64 kvm_hv_ext_query_capabilities(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) +{ + u64 caps = 0; /* No caps */ + + if (!hc->fast) { + if (unlikely(kvm_write_guest(vcpu->kvm, hc->outgpa, &caps, sizeof(caps)) != 0)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } else { + kvm_rdx_write(vcpu, caps); + } + + trace_kvm_hv_ext_query_capabilities(caps); + return HV_STATUS_SUCCESS; +} + +static bool kvm_hv_xlate_va_validate_input(struct kvm_vcpu* vcpu, + struct hv_xlate_va_input *in, + u8 *vtl, u8 *flags) +{ + struct kvm_vcpu_hv *hv = vcpu->arch.hyperv; + union hv_input_vtl in_vtl; + + if (in->partition_id != HV_PARTITION_ID_SELF) + return false; + + if (in->vp_index != HV_VP_INDEX_SELF && in->vp_index != hv->vp_index) + return false; + + in_vtl.as_uint8 = in->control_flags >> 56; + *flags = in->control_flags & HV_XLATE_GVA_FLAGS_MASK; + if (*flags > (HV_XLATE_GVA_VAL_READ | + HV_XLATE_GVA_VAL_WRITE | + HV_XLATE_GVA_VAL_EXECUTE)) + pr_info_ratelimited("Translate VA control flags unsupported and will be ignored: 0x%llx\n", + in->control_flags); + + *vtl = in_vtl.use_target_vtl ? in_vtl.target_vtl : kvm_hv_get_active_vtl(vcpu); + + if (*vtl >= HV_NUM_VTLS || *vtl > kvm_hv_get_active_vtl(vcpu)) + return false; + + return true; +} + +static u64 kvm_hv_xlate_va_walk(struct kvm_vcpu* vcpu, u64 gva, u8 flags) +{ + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + u32 access = 0; + + if (flags & HV_XLATE_GVA_VAL_WRITE) + access |= PFERR_WRITE_MASK; + if (flags & HV_XLATE_GVA_VAL_EXECUTE) + access |= PFERR_FETCH_MASK; + + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, mmu, gva, access, NULL); +} + +static u64 kvm_hv_translate_virtual_address(struct kvm_vcpu* vcpu, + struct kvm_hv_hcall *hc) +{ + struct hv_xlate_va_output output = {}; + struct hv_xlate_va_input input; + struct kvm_vcpu *target_vcpu; + u8 flags, target_vtl; + + if (hc->fast) { + input.partition_id = hc->ingpa; + input.vp_index = hc->outgpa & 0xFFFFFFFF; + input.control_flags = sse128_lo(hc->xmm[0]); + input.gva = sse128_hi(hc->xmm[0]); + } else { + if (unlikely(kvm_vcpu_read_guest(vcpu, hc->ingpa, &input, sizeof(input)) != 0)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } + + trace_kvm_hv_translate_virtual_address(input.partition_id, input.vp_index, input.control_flags, input.gva); + + if (!kvm_hv_xlate_va_validate_input(vcpu, &input, &target_vtl, &flags)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + + target_vcpu = kvm_hv_get_vtl_vcpu(vcpu, target_vtl); + output.gpa = kvm_hv_xlate_va_walk(target_vcpu, input.gva << PAGE_SHIFT, flags); + if (output.gpa == INVALID_GPA) { + output.result_code = HV_XLATE_GVA_UNMAPPED; + } else { + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); + u64 hcall_page = hv->hv_hypercall & + HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK; + + if (output.gpa == hcall_page) + output.overlay_page = 1; + + output.gpa >>= PAGE_SHIFT; + output.result_code = HV_XLATE_GVA_SUCCESS; + output.cache_type = HV_CACHE_TYPE_X64_WB; + } + + if (hc->fast) { + memcpy(&hc->xmm[1], &output, sizeof(output)); + hc->xmm_dirty = true; + } else { + if (unlikely(kvm_vcpu_write_guest(vcpu, hc->outgpa, &output, sizeof(output)) != 0)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } + + return HV_STATUS_SUCCESS; } static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) @@ -2466,11 +2785,63 @@ static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) return true; } +static bool 
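/*
 * Not part of the patch, a note on HVCALL_TRANSLATE_VIRTUAL_ADDRESS above:
 * for the fast variant the partition id travels in the register normally used
 * for the input GPA, the VP index in the low 32 bits of the output GPA
 * register, and XMM0 carries the control flags (target VTL in the top byte)
 * plus the GVA page number. The walk is done with the MMU context of the vCPU
 * backing the requested VTL, and the result (GPA page number, result code,
 * cache type, overlay-page flag) is returned either through XMM1 or by
 * writing the output structure to the caller-supplied output GPA.
 */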
is_hyperv_feature_advertised(struct kvm_vcpu *vcpu, enum kvm_reg reg, u64 feature_mask) +{ + struct kvm_cpuid_entry2 *entry; + u64 regval; + + entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES); + if (!entry) + return false; + + switch (reg) { + case VCPU_REGS_RAX: regval = entry->eax; break; + case VCPU_REGS_RBX: regval = entry->ebx; break; + case VCPU_REGS_RDX: regval = entry->edx; break; + default: return false; + }; + + return (regval & feature_mask) == feature_mask; +} + +static bool is_hypercall_advertised(struct kvm_vcpu *vcpu, u16 code) +{ + u64 feature_mask; + enum kvm_reg reg; + + /* Some hypercalls are advertised by default, the others are not */ + switch (code) { + case HVCALL_GET_VP_REGISTERS: + case HVCALL_SET_VP_REGISTERS: + feature_mask = HV_ACCESS_VP_REGISTERS; + reg = VCPU_REGS_RBX; + break; + case HVCALL_ENABLE_PARTITION_VTL: + case HVCALL_ENABLE_VP_VTL: + case HVCALL_MODIFY_VTL_PROTECTION_MASK: + case HVCALL_VTL_CALL: + case HVCALL_VTL_RETURN: + feature_mask = HV_ACCESS_VSM; + reg = VCPU_REGS_RBX; + break; + case HV_EXT_CALL_QUERY_CAPABILITIES: + feature_mask = HV_ENABLE_EXTENDED_HYPERCALLS; + reg = VCPU_REGS_RBX; + break; + default: + /* everything else is advertised by default */ + return true; + } + + return is_hyperv_feature_advertised(vcpu, reg, feature_mask); +} + int kvm_hv_hypercall(struct kvm_vcpu *vcpu) { struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_hv_hcall hc; u64 ret = HV_STATUS_SUCCESS; + int i; /* * hypercall generates UD from non zero cpl and real mode @@ -2528,6 +2899,9 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) kvm_hv_hypercall_read_xmm(&hc); } + if (unlikely(!is_hypercall_advertised(vcpu, hc.code))) + return kvm_hv_hypercall_complete(vcpu, HV_STATUS_INVALID_HYPERCALL_CODE); + switch (hc.code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: if (unlikely(hc.rep || hc.var_cnt)) { @@ -2612,17 +2986,53 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) } goto hypercall_userspace_exit; } - case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX: + case HV_EXT_CALL_QUERY_CAPABILITIES: { + if (unlikely(hc.rep_cnt)) { + ret = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + } + + ret = kvm_hv_ext_query_capabilities(vcpu, &hc); + break; + } + case HV_EXT_CALL_GET_BOOT_ZEROED_MEMORY ... 
HV_EXT_CALL_MAX: if (unlikely(hc.fast)) { ret = HV_STATUS_INVALID_PARAMETER; break; } goto hypercall_userspace_exit; + case HVCALL_GET_VP_REGISTERS: + case HVCALL_SET_VP_REGISTERS: + case HVCALL_MODIFY_VTL_PROTECTION_MASK: + case HVCALL_ENABLE_PARTITION_VTL: + case HVCALL_ENABLE_VP_VTL: + goto hypercall_userspace_exit; + case HVCALL_VTL_CALL: + case HVCALL_VTL_RETURN: + vcpu->dump_state_on_run = true; + trace_printk("-------------------------------------------VCPU%d---------------------------------\n", vcpu->vcpu_id); + trace_printk("Exiting to user-space with code 0x%x\n", hc.code); + dump_ftrace_vmcs(vcpu); + dump_ftrace_vcpu_state(vcpu); + trace_printk("---------------------------------------------------------------------------\n"); + kvm_get_vcpu_by_id(vcpu->kvm, 0)->dump_state_on_run = true; + goto hypercall_userspace_exit; + case HVCALL_TRANSLATE_VIRTUAL_ADDRESS: + if (unlikely(hc.rep_cnt)) { + ret = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + } + + ret = kvm_hv_translate_virtual_address(vcpu, &hc); + break; default: ret = HV_STATUS_INVALID_HYPERCALL_CODE; break; } + if ((ret & HV_HYPERCALL_RESULT_MASK) == HV_STATUS_SUCCESS && hc.xmm_dirty) + kvm_hv_hypercall_write_xmm(&hc); + hypercall_complete: return kvm_hv_hypercall_complete(vcpu, ret); @@ -2632,10 +3042,184 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) vcpu->run->hyperv.u.hcall.input = hc.param; vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa; vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa; + if (hc.fast) { + for (i = 0; i < HV_HYPERCALL_MAX_XMM_REGISTERS; i++) { + vcpu->run->hyperv.u.hcall.xmm[i * 2] = sse128_lo(hc.xmm[i]); + vcpu->run->hyperv.u.hcall.xmm[(i * 2) + 1] = sse128_hi(hc.xmm[i]); + } + } vcpu->arch.complete_userspace_io = kvm_hv_hypercall_complete_userspace; return 0; } +static void store_kvm_segment(const struct kvm_segment *kvmseg, + struct hv_x64_segment_register *reg) +{ + reg->base = kvmseg->base; + reg->limit = kvmseg->limit; + reg->selector = kvmseg->selector; + reg->segment_type = kvmseg->type; + reg->present = kvmseg->present; + reg->descriptor_privilege_level = kvmseg->dpl; + reg->_default = kvmseg->db; + reg->non_system_segment = kvmseg->s; + reg->_long = kvmseg->l; + reg->granularity = kvmseg->g; + reg->available = kvmseg->avl; +} + +static void deliver_gpa_intercept(struct kvm_vcpu *target_vcpu, + struct kvm_vcpu *intercepted_vcpu, u64 gpa, + u64 gva, u8 access_type_mask) +{ + ulong cr0; + struct hv_message msg = { 0 }; + struct hv_memory_intercept_message *intercept = (struct hv_memory_intercept_message *)msg.u.payload; + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(target_vcpu); + struct x86_exception e; + struct kvm_segment kvmseg; + int i; + msg.header.message_type = HVMSG_GPA_INTERCEPT; + msg.header.payload_size = sizeof(*intercept); + + intercept->header.vp_index = to_hv_vcpu(intercepted_vcpu)->vp_index; + intercept->header.instruction_length = intercepted_vcpu->arch.exit_instruction_len; + intercept->header.access_type_mask = access_type_mask; + kvm_x86_ops.get_segment(intercepted_vcpu, &kvmseg, VCPU_SREG_CS); + store_kvm_segment(&kvmseg, &intercept->header.cs); + + cr0 = kvm_read_cr0(intercepted_vcpu); + intercept->header.exec_state.cr0_pe = (cr0 & X86_CR0_PE); + intercept->header.exec_state.cr0_am = (cr0 & X86_CR0_AM); + intercept->header.exec_state.cpl = kvm_x86_ops.get_cpl(intercepted_vcpu); + intercept->header.exec_state.efer_lma = is_long_mode(intercepted_vcpu); + intercept->header.exec_state.debug_active = 0; + intercept->header.exec_state.interruption_pending = 0; + intercept->header.rip = 
kvm_rip_read(intercepted_vcpu); + intercept->header.rflags = kvm_get_rflags(intercepted_vcpu); + + /* + * For exec violations we don't have a way to decode an instruction that issued a fetch + * to a non-X page because CPU points RIP and GPA to the fetch destination in the faulted page. + * Instruction length though is the length of the fetch source. + * Seems like Hyper-V is aware of that and is not trying to access those fields. + */ + if (access_type_mask == HV_INTERCEPT_ACCESS_EXECUTE) { + intercept->instruction_byte_count = 0; + } else { + intercept->instruction_byte_count = intercepted_vcpu->arch.exit_instruction_len; + if (intercept->instruction_byte_count > sizeof(intercept->instruction_bytes)) + intercept->instruction_byte_count = sizeof(intercept->instruction_bytes); + if (kvm_read_guest_virt(intercepted_vcpu, + kvm_rip_read(intercepted_vcpu), + intercept->instruction_bytes, + intercept->instruction_byte_count, &e)) + goto inject_ud; + } + + intercept->memory_access_info.gva_valid = (gva != 0); + intercept->gva = gva; + intercept->gpa = gpa; + intercept->cache_type = HV_X64_CACHE_TYPE_WRITEBACK; + kvm_x86_ops.get_segment(intercepted_vcpu, &kvmseg, VCPU_SREG_DS); + store_kvm_segment(&kvmseg, &intercept->ds); + kvm_x86_ops.get_segment(intercepted_vcpu, &kvmseg, VCPU_SREG_SS); + store_kvm_segment(&kvmseg, &intercept->ss); + intercept->rax = kvm_rax_read(intercepted_vcpu); + intercept->rcx = kvm_rcx_read(intercepted_vcpu); + intercept->rdx = kvm_rdx_read(intercepted_vcpu); + intercept->rbx = kvm_rbx_read(intercepted_vcpu); + intercept->rsp = kvm_rsp_read(intercepted_vcpu); + intercept->rbp = kvm_rbp_read(intercepted_vcpu); + intercept->rsi = kvm_rsi_read(intercepted_vcpu); + intercept->rdi = kvm_rdi_read(intercepted_vcpu); + intercept->r8 = kvm_r8_read(intercepted_vcpu); + intercept->r9 = kvm_r9_read(intercepted_vcpu); + intercept->r10 = kvm_r10_read(intercepted_vcpu); + intercept->r11 = kvm_r11_read(intercepted_vcpu); + intercept->r12 = kvm_r12_read(intercepted_vcpu); + intercept->r13 = kvm_r13_read(intercepted_vcpu); + intercept->r14 = kvm_r14_read(intercepted_vcpu); + intercept->r15 = kvm_r15_read(intercepted_vcpu); + + trace_printk(" header.vp_index: %x\n", intercept->header.vp_index); + trace_printk(" header.instruction_length: %x\n", intercept->header.instruction_length); + trace_printk(" header.access_type_mask: %x\n", intercept->header.access_type_mask); + trace_printk(" header.exec_state.cpl: %x\n", intercept->header.exec_state.cpl); + trace_printk(" header.exec_state.cr0_pe: %x\n", intercept->header.exec_state.cr0_pe); + trace_printk(" header.exec_state.cr0_am: %x\n", intercept->header.exec_state.cr0_am); + trace_printk(" header.exec_state.efer_lma: %x\n", intercept->header.exec_state.efer_lma); + trace_printk(" header.exec_state.debug_active: %x\n", + intercept->header.exec_state.debug_active); + trace_printk(" header.exec_state.interruption_pending: %x\n", + intercept->header.exec_state.interruption_pending); + trace_printk(" header.cs: (values for cs segment register)\n"); + trace_printk(" base: %llx\n", (unsigned long long)intercept->header.cs.base); + trace_printk(" limit: %x\n", intercept->header.cs.limit); + trace_printk(" selector: %x\n", intercept->header.cs.selector); + trace_printk(" header.rip: %llx\n", (unsigned long long)intercept->header.rip); + trace_printk(" header.rflags: %llx\n", (unsigned long long)intercept->header.rflags); + trace_printk(" cache_type: %x\n", intercept->cache_type); + trace_printk(" instruction_byte_count: %x\n", 
intercept->instruction_byte_count); + trace_printk(" memory_access_info.gva_valid: %x\n", + intercept->memory_access_info.gva_valid); + trace_printk(" _reserved: %x\n", intercept->_reserved); + trace_printk(" gva: %llx\n", (unsigned long long)intercept->gva); + trace_printk(" gpa: %llx\n", (unsigned long long)intercept->gpa); + trace_printk(" instruction_bytes: "); + for (i = 0; i < 16; i++) { + trace_printk("%02x ", intercept->instruction_bytes[i]); + } + trace_printk("\n"); + trace_printk(" ds: (values for ds segment register)\n"); + trace_printk(" base: %llx\n", (unsigned long long)intercept->ds.base); + trace_printk(" limit: %x\n", intercept->ds.limit); + trace_printk(" selector: %x\n", intercept->ds.selector); + trace_printk(" ss: (values for ss segment register)\n"); + trace_printk(" base: %llx\n", (unsigned long long)intercept->ss.base); + trace_printk(" limit: %x\n", intercept->ss.limit); + trace_printk(" selector: %x\n", intercept->ss.selector); + trace_printk(" rax: %llx\n", (unsigned long long)intercept->rax); + trace_printk(" rcx: %llx\n", (unsigned long long)intercept->rcx); + trace_printk(" rdx: %llx\n", (unsigned long long)intercept->rdx); + trace_printk(" rbx: %llx\n", (unsigned long long)intercept->rbx); + trace_printk(" rsp: %llx\n", (unsigned long long)intercept->rsp); + trace_printk(" rbp: %llx\n", (unsigned long long)intercept->rbp); + trace_printk(" rsi: %llx\n", (unsigned long long)intercept->rsi); + trace_printk(" rdi: %llx\n", (unsigned long long)intercept->rdi); + trace_printk(" r8: %llx\n", (unsigned long long)intercept->r8); + trace_printk(" r9: %llx\n", (unsigned long long)intercept->r9); + trace_printk(" r10: %llx\n", (unsigned long long)intercept->r10); + trace_printk(" r11: %llx\n", (unsigned long long)intercept->r11); + trace_printk(" r12: %llx\n", (unsigned long long)intercept->r12); + trace_printk(" r13: %llx\n", (unsigned long long)intercept->r13); + trace_printk(" r14: %llx\n", (unsigned long long)intercept->r14); + trace_printk(" r15: %llx\n", (unsigned long long)intercept->r15); + + if (synic_deliver_msg(&hv_vcpu->synic, 0, &msg, true)) + goto inject_ud; + + return; + +inject_ud: + kvm_queue_exception(target_vcpu, UD_VECTOR); +} + +void kvm_hv_deliver_intercept(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv_intercept_info *info = &to_hv_vcpu(vcpu)->intercept_info; + + switch (info->type) { + case HVMSG_GPA_INTERCEPT: + deliver_gpa_intercept(vcpu, info->vcpu, info->gpa, 0, + info->access); + break; + default: + pr_warn("Unknown exception\n"); + } +} +EXPORT_SYMBOL_GPL(kvm_hv_deliver_intercept); + void kvm_hv_init_vm(struct kvm *kvm) { struct kvm_hv *hv = to_kvm_hv(kvm); @@ -2778,8 +3362,11 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->ebx |= HV_POST_MESSAGES; ent->ebx |= HV_SIGNAL_EVENTS; ent->ebx |= HV_ENABLE_EXTENDED_HYPERCALLS; + ent->ebx |= HV_ACCESS_VP_REGISTERS; + ent->ebx |= HV_ACCESS_VSM; ent->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE; + ent->edx |= HV_X64_HYPERCALL_XMM_OUTPUT_AVAILABLE; ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; @@ -2864,3 +3451,42 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, return 0; } + +int kvm_vm_ioctl_get_hv_vsm_state(struct kvm *kvm, struct kvm_hv_vsm_state *state) +{ + struct kvm_hv* hv = &kvm->arch.hyperv; + + state->vsm_code_page_offsets64 = hv->vsm_code_page_offsets64.as_u64; + state->vsm_code_page_offsets32 = hv->vsm_code_page_offsets32.as_u64; + return 0; +} + +int 
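/*
 * Not part of the patch: the get/set helpers below simply round-trip the
 * packed 32-bit and 64-bit VTL call/return offsets recorded by
 * patch_hypercall_page(), presumably so userspace can expose them (e.g. when
 * emulating the HvRegisterVsmCodePageOffsets VP register) or restore them on
 * migration. A hypothetical usage from a VMM, assuming the KVM_HV_GET_VSM_STATE
 * ioctl named in the uapi header comment:
 *
 *	struct kvm_hv_vsm_state state;
 *
 *	if (!ioctl(vm_fd, KVM_HV_GET_VSM_STATE, &state))
 *		report_code_page_offsets(state.vsm_code_page_offsets64);
 *
 * report_code_page_offsets() is an illustrative placeholder.
 */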
kvm_vm_ioctl_set_hv_vsm_state(struct kvm *kvm, struct kvm_hv_vsm_state *state) +{ + struct kvm_hv* hv = &kvm->arch.hyperv; + + hv->vsm_code_page_offsets64.as_u64 = state->vsm_code_page_offsets64; + hv->vsm_code_page_offsets32.as_u64 = state->vsm_code_page_offsets32; + return 0; +} + +static bool hv_read_vtl_control(struct kvm_vcpu *vcpu, struct hv_vp_vtl_control *vtl_control) +{ + /* VTL control is a part of VP assist page, which is accessed through pv_eoi */ + if (!vcpu->arch.pv_eoi.data.len) + return 0; + + return !kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, vtl_control, + offsetof(struct hv_vp_assist_page, vtl_control), sizeof(*vtl_control)); +} + +void dump_ftrace_vcpu_hyperv(struct kvm_vcpu *vcpu) +{ + struct hv_vp_vtl_control vtl_control; + + trace_printk("*** HyperV VTL state ***\n"); + if (kvm_hv_get_active_vtl(vcpu) && hv_read_vtl_control(vcpu, &vtl_control)) + trace_printk("entry_reason 0x%x, vina %d, rax %llx, rcx %llx\n", + vtl_control.vtl_entry_reason, vtl_control.vina_asserted, + vtl_control.vtl_ret_x64rax, vtl_control.vtl_ret_x64rcx); +} diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index f83b8db72b118c..8bdc74902c9972 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -238,4 +238,19 @@ static inline int kvm_hv_verify_vp_assist(struct kvm_vcpu *vcpu) int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu); +static inline bool kvm_hv_vsm_enabled(struct kvm *kvm) +{ + return !!(kvm->arch.hyperv.hv_enable_vsm); +} + +int kvm_vm_ioctl_get_hv_vsm_state(struct kvm *kvm, struct kvm_hv_vsm_state *state); +int kvm_vm_ioctl_set_hv_vsm_state(struct kvm *kvm, struct kvm_hv_vsm_state *state); +int kvm_hv_vtl_dev_register(void); +void kvm_hv_vtl_dev_unregister(void); + +int kvm_hv_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); + +void kvm_hv_deliver_intercept(struct kvm_vcpu *vcpu); + +void dump_ftrace_vcpu_hyperv(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3e977dbbf9933d..a0969ac24d79af 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -141,7 +141,7 @@ static inline int apic_enabled(struct kvm_lapic *apic) static inline u32 kvm_x2apic_id(struct kvm_lapic *apic) { - return apic->vcpu->vcpu_id; + return kvm_apic_id_masked(apic->vcpu->kvm, apic->vcpu->vcpu_id); } static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu) @@ -152,9 +152,10 @@ static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu) bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu) { - return kvm_x86_ops.set_hv_timer - && !(kvm_mwait_in_guest(vcpu->kvm) || - kvm_can_post_timer_interrupt(vcpu)); + return kvm_x86_ops.set_hv_timer && + !(kvm_mwait_in_guest(vcpu->kvm) || + kvm_can_post_timer_interrupt(vcpu)) && + !(to_kvm_hv(vcpu->kvm)->hv_enable_vsm); } static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu) @@ -525,7 +526,7 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id) { u32 ldr = kvm_apic_calc_x2apic_ldr(id); - WARN_ON_ONCE(id != apic->vcpu->vcpu_id); + WARN_ON_ONCE(id != kvm_apic_id_masked(apic->vcpu->kvm, apic->vcpu->vcpu_id)); kvm_lapic_set_reg(apic, APIC_ID, id); kvm_lapic_set_reg(apic, APIC_LDR, ldr); @@ -1518,6 +1519,7 @@ void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) else irq.dest_id = GET_XAPIC_DEST_FIELD(icr_high); + irq.dest_id |= kvm_hv_get_active_vtl(apic->vcpu) << apic->vcpu->kvm->arch.apic_id_mask_shift; trace_kvm_apic_ipi(icr_low, irq.dest_id); kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL); @@ 
-2541,7 +2543,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) /* update jump label if enable bit changes */ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) { if (value & MSR_IA32_APICBASE_ENABLE) { - kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); + kvm_apic_set_xapic_id(apic, kvm_apic_id_masked(vcpu->kvm, vcpu->vcpu_id)); static_branch_slow_dec_deferred(&apic_hw_disabled); /* Check if there are APF page ready requests pending */ kvm_make_request(KVM_REQ_APF_READY, vcpu); @@ -2553,9 +2555,9 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if ((old_value ^ value) & X2APIC_ENABLE) { if (value & X2APIC_ENABLE) - kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); + kvm_apic_set_x2apic_id(apic, kvm_apic_id_masked(vcpu->kvm, vcpu->vcpu_id)); else if (value & MSR_IA32_APICBASE_ENABLE) - kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); + kvm_apic_set_xapic_id(apic, kvm_apic_id_masked(vcpu->kvm, vcpu->vcpu_id)); } if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) { @@ -2685,7 +2687,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) /* The xAPIC ID is set at RESET even if the APIC was already enabled. */ if (!init_event) - kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); + kvm_apic_set_xapic_id(apic, kvm_apic_id_masked(vcpu->kvm, vcpu->vcpu_id)); kvm_apic_set_version(apic->vcpu); for (i = 0; i < apic->nr_lvt_entries; i++) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 0a0ea4b5dd8ce7..3207ad787f48a2 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -278,4 +278,25 @@ static inline u8 kvm_xapic_id(struct kvm_lapic *apic) return kvm_lapic_get_reg(apic, APIC_ID) >> 24; } +static inline u32 kvm_apic_id_masked(struct kvm *kvm, u32 apic_id) +{ + return apic_id & ~(0xFFFFFFFF << kvm->arch.apic_id_mask_shift); +} + +static inline u32 kvm_apic_id_mask_value(struct kvm *kvm, u32 apic_id) +{ + return apic_id >> kvm->arch.apic_id_mask_shift; +} + +static inline struct kvm_vcpu *kvm_hv_get_vtl_vcpu(struct kvm_vcpu *vcpu, int vtl) +{ + return kvm_get_vcpu_by_id(vcpu->kvm, + kvm_apic_id_masked(vcpu->kvm, vcpu->vcpu_id) | (vtl << vcpu->kvm->arch.apic_id_mask_shift)); +} + +static inline u8 kvm_hv_get_active_vtl(struct kvm_vcpu *vcpu) +{ + return (u8)(kvm_apic_id_mask_value(vcpu->kvm, vcpu->vcpu_id)); +} + #endif diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 253fb2093d5dad..39d28334c5e1d0 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -304,4 +304,6 @@ static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu, return gpa; return translate_nested_gpa(vcpu, gpa, access, exception); } + +void kvm_tdp_mmu_role_set_hv_bits(struct kvm_vcpu *vcpu, union kvm_mmu_page_role *role); #endif diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index baeba8fc1c38ea..95fceea04cc015 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3244,6 +3244,7 @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) struct kvm_mmu_page *sp; int ret; gfn_t base_gfn = fault->gfn; + unsigned access = ACC_ALL; kvm_mmu_hugepage_adjust(vcpu, fault); @@ -3273,7 +3274,10 @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (WARN_ON_ONCE(it.level != fault->goal_level)) return -EFAULT; - ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL, + if (!fault->map_executable) + access &= ~ACC_EXEC_MASK; + + ret = mmu_set_spte(vcpu, fault->slot, it.sptep, access, base_gfn, fault->pfn, fault); if (ret == RET_PF_SPURIOUS) return ret; @@ -4295,8 +4299,8 @@ static inline u8 
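/*
 * Not part of the patch, a note on the lapic/mmu changes around here: with
 * apic_id_mask_shift set, every vCPU's xAPIC/x2APIC ID is the masked VP index,
 * IPIs get the sender's active VTL folded into the destination ID so they only
 * reach vCPUs of the same VTL, and the new role.vtl bits give each VTL its own
 * TDP root, so the same GPA can be mapped with different permissions per VTL.
 */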
kvm_max_level_for_order(int order) static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT, - PAGE_SIZE, fault->write, fault->exec, + kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT, PAGE_SIZE, + fault->write, fault->exec, fault->user, fault->is_private); } @@ -4328,6 +4332,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault { struct kvm_memory_slot *slot = fault->slot; bool async; + int r; /* * Retry the page fault if the gfn hit a memslot that is being deleted @@ -4364,6 +4369,14 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (fault->is_private) return kvm_faultin_pfn_private(vcpu, fault); +#ifdef CONFIG_KVM_HYPERV_VSM + if (kvm_hv_vsm_enabled(vcpu->kvm)) { + r = kvm_hv_faultin_pfn(vcpu, fault); + if (r != RET_PF_CONTINUE) + return r; + } +#endif + async = false; fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async, fault->write, &fault->map_writable, @@ -5271,6 +5284,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, role.level = kvm_mmu_get_tdp_level(vcpu); role.direct = true; role.has_4_byte_gpte = false; + kvm_tdp_mmu_role_set_hv_bits(vcpu, &role); return role; } @@ -5809,6 +5823,9 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err return -EIO; } + if (r == RET_PF_USER) + return 0; + if (r < 0) return r; if (r != RET_PF_EMULATE) @@ -7258,7 +7275,8 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, * Zapping SPTEs in this case ensures KVM will reassess whether or not * a hugepage can be used for affected ranges. */ - if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) + if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm) && + !kvm_hv_vsm_enabled(kvm))) return false; return kvm_unmap_gfn_range(kvm, range); @@ -7315,7 +7333,8 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, * a range that has PRIVATE GFNs, and conversely converting a range to * SHARED may now allow hugepages. */ - if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) + if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm) && + !kvm_hv_vsm_enabled(kvm))) return false; /* diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index b66a7d47e0e4ef..55fe901e448671 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -239,6 +239,7 @@ struct kvm_page_fault { kvm_pfn_t pfn; hva_t hva; bool map_writable; + bool map_executable; /* * Indicates the guest is trying to write a gfn that contains one or @@ -260,6 +261,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. * RET_PF_FIXED: The faulting entry has been fixed. * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. + * RET_PF_USER: need to exit to userspace to handle this fault. 
* * Any names added to this enum should be exported to userspace for use in * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h @@ -276,6 +278,7 @@ enum { RET_PF_INVALID, RET_PF_FIXED, RET_PF_SPURIOUS, + RET_PF_USER, }; static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, @@ -298,6 +301,8 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, .req_level = PG_LEVEL_4K, .goal_level = PG_LEVEL_4K, .is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT), + .map_writable = true, + .map_executable = true, }; int r; @@ -335,6 +340,8 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, vcpu->stat.pf_emulate++; else if (r == RET_PF_SPURIOUS) vcpu->stat.pf_spurious++; + else if (r == RET_PF_USER) + vcpu->stat.pf_user++; return r; } diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index ae86820cef697a..4a74b74861dff1 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -60,6 +60,7 @@ TRACE_DEFINE_ENUM(RET_PF_EMULATE); TRACE_DEFINE_ENUM(RET_PF_INVALID); TRACE_DEFINE_ENUM(RET_PF_FIXED); TRACE_DEFINE_ENUM(RET_PF_SPURIOUS); +TRACE_DEFINE_ENUM(RET_PF_USER); /* * A pagetable walk has started diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 6cd4dd631a2fac..46f3e72ab770e9 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -957,14 +957,18 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, u64 new_spte; int ret = RET_PF_FIXED; bool wrprot = false; + unsigned access = ACC_ALL; if (WARN_ON_ONCE(sp->role.level != fault->goal_level)) return RET_PF_RETRY; + if (!fault->map_executable) + access &= ~ACC_EXEC_MASK; + if (unlikely(!fault->slot)) - new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); + new_spte = make_mmio_spte(vcpu, iter->gfn, access); else - wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn, + wrprot = make_spte(vcpu, sp, fault->slot, access, iter->gfn, fault->pfn, iter->old_spte, fault->prefetch, true, fault->map_writable, &new_spte); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 83843379813ee3..195e5839a3ac91 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1372,6 +1372,29 @@ TRACE_EVENT(kvm_hv_stimer_cleanup, __entry->vcpu_id, __entry->timer_index) ); +TRACE_EVENT(kvm_hv_translate_virtual_address, + TP_PROTO(u64 partition_id, u32 vp_index, u64 control_flags, u64 gva), + TP_ARGS(partition_id, vp_index, control_flags, gva), + + TP_STRUCT__entry( + __field(u64, partition_id) + __field(u32, vp_index) + __field(u64, control_flags) + __field(u64, gva) + ), + + TP_fast_assign( + __entry->partition_id = partition_id; + __entry->vp_index = vp_index; + __entry->control_flags = control_flags; + __entry->gva = gva; + ), + + TP_printk("partition id 0x%llx, vp index 0x%x, control flags 0x%llx, gva 0x%llx", + __entry->partition_id, __entry->vp_index, + __entry->control_flags, __entry->gva) +); + TRACE_EVENT(kvm_apicv_inhibit_changed, TP_PROTO(int reason, bool set, unsigned long inhibits), TP_ARGS(reason, set, inhibits), @@ -1606,42 +1629,111 @@ TRACE_EVENT(kvm_hv_flush_tlb_ex, * Tracepoints for kvm_hv_send_ipi. 
*/ TRACE_EVENT(kvm_hv_send_ipi, - TP_PROTO(u32 vector, u64 processor_mask), - TP_ARGS(vector, processor_mask), + TP_PROTO(u32 vector, u64 processor_mask, u8 vtl), + TP_ARGS(vector, processor_mask, vtl), TP_STRUCT__entry( __field(u32, vector) __field(u64, processor_mask) + __field(u8, vtl) ), TP_fast_assign( __entry->vector = vector; __entry->processor_mask = processor_mask; + __entry->vtl = vtl; ), - TP_printk("vector %x processor_mask 0x%llx", - __entry->vector, __entry->processor_mask) + TP_printk("vector %x processor_mask 0x%llx vtl %d", + __entry->vector, __entry->processor_mask, __entry->vtl) ); TRACE_EVENT(kvm_hv_send_ipi_ex, - TP_PROTO(u32 vector, u64 format, u64 valid_bank_mask), - TP_ARGS(vector, format, valid_bank_mask), + TP_PROTO(u32 vector, u64 format, u64 valid_bank_mask, u8 vtl), + TP_ARGS(vector, format, valid_bank_mask, vtl), TP_STRUCT__entry( __field(u32, vector) __field(u64, format) __field(u64, valid_bank_mask) + __field(u8, vtl) ), TP_fast_assign( __entry->vector = vector; __entry->format = format; __entry->valid_bank_mask = valid_bank_mask; + __entry->vtl = vtl; ), - TP_printk("vector %x format %llx valid_bank_mask 0x%llx", + TP_printk("vector %x format %llx valid_bank_mask 0x%llx vtl %d", __entry->vector, __entry->format, - __entry->valid_bank_mask) + __entry->valid_bank_mask, __entry->vtl) +); + +TRACE_EVENT(kvm_hv_faultin_pfn, + TP_PROTO(u32 vcpu_id, u64 gfn, bool write, bool exec, bool user, u64 prots), + TP_ARGS(vcpu_id, gfn, write, exec, user, prots), + + TP_STRUCT__entry( + __field(u32, vcpu_id) + __field(u64, gfn) + __field(bool, write) + __field(bool, exec) + __field(bool, user) + __field(u64, prots) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->gfn = gfn; + __entry->write = write; + __entry->exec = exec; + __entry->user = user; + __entry->prots = prots; + ), + + TP_printk("vcpu%d gfn %llx write %d exec %d user %d prots %llx", + __entry->vcpu_id, __entry->gfn, __entry->write, + __entry->exec, __entry->user, __entry->prots) +); + +TRACE_EVENT(kvm_hv_modify_vtl_protection_mask, + TP_PROTO(u64 target_partition_id, u32 map_flags, u8 target_vtl, u16 count), + TP_ARGS(target_partition_id, map_flags, target_vtl, count), + + TP_STRUCT__entry( + __field(u64, target_partition_id) + __field(u32, map_flags) + __field(u8, target_vtl) + __field(u16, count) + ), + + TP_fast_assign( + __entry->target_partition_id = target_partition_id; + __entry->map_flags = map_flags; + __entry->target_vtl = target_vtl; + __entry->count = count; + ), + + TP_printk("target partition id 0x%llx, map flags 0x%x, target VTL %d, count %d", + __entry->target_partition_id, __entry->map_flags, + __entry->target_vtl, __entry->count) +); + +TRACE_EVENT(kvm_hv_ext_query_capabilities, + TP_PROTO(u64 caps), + TP_ARGS(caps), + + TP_STRUCT__entry( + __field(u64, caps) + ), + + TP_fast_assign( + __entry->caps = caps; + ), + + TP_printk("reported capabilities 0x%llx", __entry->caps) ); TRACE_EVENT(kvm_pv_tlb_flush, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6e502ba931416d..70212486577a19 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5339,6 +5339,11 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu) { ++vcpu->stat.irq_exits; + trace_printk("-------------------------------------------VCPU%d---------------------------------\n", vcpu->vcpu_id); + trace_printk("External interrupt\n"); + dump_ftrace_vmcs(vcpu); + dump_ftrace_vcpu_state(vcpu); + 
trace_printk("---------------------------------------------------------------------------\n"); return 1; } @@ -5773,6 +5778,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; vcpu->arch.exit_qualification = exit_qualification; + vcpu->arch.exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); /* * Check that the GPA doesn't exceed physical memory limits, as that is @@ -6396,6 +6402,202 @@ void dump_vmcs(struct kvm_vcpu *vcpu) vmcs_read16(VIRTUAL_PROCESSOR_ID)); } +static void vmx_ftrace_dump_sel(char *name, uint32_t sel) +{ + trace_printk("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", + name, vmcs_read16(sel), + vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), + vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), + vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); +} + +static void vmx_ftrace_dump_dtsel(char *name, uint32_t limit) +{ + trace_printk("%s limit=0x%08x, base=0x%016lx\n", + name, vmcs_read32(limit), + vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); +} + +static void vmx_ftrace_dump_msrs(char *name, struct vmx_msrs *m) +{ + unsigned int i; + struct vmx_msr_entry *e; + + trace_printk("MSR %s:\n", name); + for (i = 0, e = m->val; i < m->nr; ++i, ++e) + trace_printk(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value); +} + +void dump_ftrace_vmcs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 vmentry_ctl, vmexit_ctl; + u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; + u64 tertiary_exec_control; + unsigned long cr4; + int efer_slot; + + vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); + vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); + cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); + cr4 = vmcs_readl(GUEST_CR4); + + if (cpu_has_secondary_exec_ctrls()) + secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + else + secondary_exec_control = 0; + + if (cpu_has_tertiary_exec_ctrls()) + tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL); + else + tertiary_exec_control = 0; + + trace_printk("VMCS %p, last attempted VM-entry on CPU %d\n", + vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu); + trace_printk("*** Guest State ***\n"); + trace_printk("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", + vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), + vmcs_readl(CR0_GUEST_HOST_MASK)); + trace_printk("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", + cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); + trace_printk("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); + if (cpu_has_vmx_ept()) { + trace_printk("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", + vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); + trace_printk("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", + vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); + } + trace_printk("RSP = 0x%016lx RIP = 0x%016lx\n", + vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); + trace_printk("RFLAGS=0x%08lx DR7 = 0x%016lx\n", + vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); + trace_printk("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", + vmcs_readl(GUEST_SYSENTER_ESP), + vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); + vmx_ftrace_dump_sel("CS: ", GUEST_CS_SELECTOR); + vmx_ftrace_dump_sel("DS: ", GUEST_DS_SELECTOR); + vmx_ftrace_dump_sel("SS: ", GUEST_SS_SELECTOR); + vmx_ftrace_dump_sel("ES: ", GUEST_ES_SELECTOR); + vmx_ftrace_dump_sel("FS: ", 
GUEST_FS_SELECTOR); + vmx_ftrace_dump_sel("GS: ", GUEST_GS_SELECTOR); + vmx_ftrace_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); + vmx_ftrace_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); + vmx_ftrace_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); + vmx_ftrace_dump_sel("TR: ", GUEST_TR_SELECTOR); + efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER); + if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER) + trace_printk("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER)); + else if (efer_slot >= 0) + trace_printk("EFER= 0x%016llx (autoload)\n", + vmx->msr_autoload.guest.val[efer_slot].value); + else if (vmentry_ctl & VM_ENTRY_IA32E_MODE) + trace_printk("EFER= 0x%016llx (effective)\n", + vcpu->arch.efer | (EFER_LMA | EFER_LME)); + else + trace_printk("EFER= 0x%016llx (effective)\n", + vcpu->arch.efer & ~(EFER_LMA | EFER_LME)); + if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT) + trace_printk("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT)); + trace_printk("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", + vmcs_read64(GUEST_IA32_DEBUGCTL), + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); + if (cpu_has_load_perf_global_ctrl() && + vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) + trace_printk("PerfGlobCtl = 0x%016llx\n", + vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); + if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) + trace_printk("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); + trace_printk("Interruptibility = %08x ActivityState = %08x\n", + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), + vmcs_read32(GUEST_ACTIVITY_STATE)); + if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) + trace_printk("InterruptStatus = %04x\n", + vmcs_read16(GUEST_INTR_STATUS)); + if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0) + vmx_ftrace_dump_msrs("guest autoload", &vmx->msr_autoload.guest); + if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) + vmx_ftrace_dump_msrs("guest autostore", &vmx->msr_autostore.guest); + + trace_printk("*** Host State ***\n"); + trace_printk("RIP = 0x%016lx RSP = 0x%016lx\n", + vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); + trace_printk("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", + vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), + vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), + vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), + vmcs_read16(HOST_TR_SELECTOR)); + trace_printk("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", + vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), + vmcs_readl(HOST_TR_BASE)); + trace_printk("GDTBase=%016lx IDTBase=%016lx\n", + vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); + trace_printk("CR0=%016lx CR3=%016lx CR4=%016lx\n", + vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), + vmcs_readl(HOST_CR4)); + trace_printk("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", + vmcs_readl(HOST_IA32_SYSENTER_ESP), + vmcs_read32(HOST_IA32_SYSENTER_CS), + vmcs_readl(HOST_IA32_SYSENTER_EIP)); + if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER) + trace_printk("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER)); + if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT) + trace_printk("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT)); + if (cpu_has_load_perf_global_ctrl() && + vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) + trace_printk("PerfGlobCtl = 0x%016llx\n", + vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); + if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0) + vmx_ftrace_dump_msrs("host autoload", &vmx->msr_autoload.host); + + trace_printk("*** Control State ***\n"); + trace_printk("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", + 
cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control); + trace_printk("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n", + pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl); + trace_printk("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", + vmcs_read32(EXCEPTION_BITMAP), + vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), + vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); + trace_printk("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", + vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), + vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), + vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); + trace_printk("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", + vmcs_read32(VM_EXIT_INTR_INFO), + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); + trace_printk(" reason=%08x qualification=%016lx\n", + vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); + trace_printk("IDTVectoring: info=%08x errcode=%08x\n", + vmcs_read32(IDT_VECTORING_INFO_FIELD), + vmcs_read32(IDT_VECTORING_ERROR_CODE)); + trace_printk("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); + if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) + trace_printk("TSC Multiplier = 0x%016llx\n", + vmcs_read64(TSC_MULTIPLIER)); + if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) { + if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { + u16 status = vmcs_read16(GUEST_INTR_STATUS); + trace_printk("SVI|RVI = %02x|%02x \n", status >> 8, status & 0xff); + } + trace_printk("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); + if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) + trace_printk("APIC-access addr = 0x%016llx \n", vmcs_read64(APIC_ACCESS_ADDR)); + trace_printk("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR)); + } + if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) + trace_printk("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); + if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) + trace_printk("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); + if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) + trace_printk("PLE Gap=%08x Window=%08x\n", + vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); + if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) + trace_printk("Virtual processor ID = 0x%04x\n", + vmcs_read16(VIRTUAL_PROCESSOR_ID)); +} + /* * The guest has exited. See if we can fix it or if we need userspace * assistance. @@ -7358,6 +7560,14 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_update_hv_timer(vcpu); kvm_wait_lapic_expire(vcpu); + if (vcpu->dump_state_on_run) { + trace_printk("-------------------------------------------VCPU%d---------------------------------\n", vcpu->vcpu_id); + trace_printk("Entering guest\n"); + dump_ftrace_vmcs(vcpu); + dump_ftrace_vcpu_state(vcpu); + trace_printk("---------------------------------------------------------------------------\n"); + vcpu->dump_state_on_run = false; + } /* The actual VMENTER/EXIT is in the .noinstr.text section. 
*/ vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx)); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e3eb608b6692c7..3c44ec76fa927f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4485,6 +4485,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_CPUID: case KVM_CAP_HYPERV_ENFORCE_CPUID: case KVM_CAP_SYS_HYPERV_CPUID: + case KVM_CAP_HYPERV_VSM: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: @@ -4526,6 +4527,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: case KVM_CAP_IRQFD_RESAMPLE: case KVM_CAP_MEMORY_FAULT_INFO: + case KVM_CAP_APIC_ID_MASK: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: @@ -5152,8 +5154,8 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, return 0; } -static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events) +void kvm_vcpu_x86_get_vcpu_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) { struct kvm_queued_exception *ex; @@ -5245,8 +5247,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, } } -static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events) +int kvm_vcpu_x86_set_vcpu_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) { if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR @@ -5821,7 +5823,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_GET_VCPU_EVENTS: { struct kvm_vcpu_events events; - kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events); + kvm_vcpu_x86_get_vcpu_events(vcpu, &events); r = -EFAULT; if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events))) @@ -5836,7 +5838,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events))) break; - r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); + r = kvm_vcpu_x86_set_vcpu_events(vcpu, &events); break; } case KVM_GET_DEBUGREGS: { @@ -6518,6 +6520,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, } mutex_unlock(&kvm->lock); break; + case KVM_CAP_HYPERV_VSM: + kvm_hv_vtl_dev_register(); + kvm->arch.hyperv.hv_enable_vsm = true; + r = 0; + break; default: r = -EINVAL; break; @@ -6777,6 +6784,15 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) return 0; } +static int kvm_vm_ioctl_set_apic_id_mask(struct kvm *kvm, struct kvm_apic_id_mask *mask) +{ + if (mask->width > 32) + return -EINVAL; + + kvm->arch.apic_id_mask_shift = 32 - mask->width; + return 0; +} + int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; @@ -7112,6 +7128,61 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) r = kvm_vm_ioctl_set_msr_filter(kvm, &filter); break; } + case KVM_HV_GET_VSM_STATE: { + struct kvm_hv_vsm_state *vsm_state; + + r = -EINVAL; + if (!kvm->arch.hyperv.hv_enable_vsm) + goto out; + + r = -ENOMEM; + vsm_state = kzalloc(sizeof(*vsm_state), GFP_USER | __GFP_NOWARN); + if (!vsm_state) + goto out; + + r = kvm_vm_ioctl_get_hv_vsm_state(kvm, vsm_state); + if (r) + goto out_get_vsm_state; + + r = -EFAULT; + if (copy_to_user(argp, vsm_state, sizeof(*vsm_state))) + goto out_get_vsm_state; + + r = 0; +out_get_vsm_state: + kfree(vsm_state); + break; + } + case KVM_HV_SET_VSM_STATE: { + struct kvm_hv_vsm_state *vsm_state; + + r = -EINVAL; + if (!kvm->arch.hyperv.hv_enable_vsm) + goto out; + + vsm_state = memdup_user(argp, 
sizeof(*vsm_state)); + if (IS_ERR(vsm_state)) { + r = PTR_ERR(vsm_state); + goto out; + } + r = kvm_vm_ioctl_set_hv_vsm_state(kvm, vsm_state); + kfree(vsm_state); + break; + } + case KVM_SET_APIC_ID_MASK: { + struct kvm_apic_id_mask *mask; + + r = -EINVAL; + + mask = memdup_user(argp, sizeof(*mask)); + if (IS_ERR(mask)) { + r = PTR_ERR(mask); + goto out; + } + r = kvm_vm_ioctl_set_apic_id_mask(kvm, mask); + kfree(mask); + break; + } default: r = -ENOTTY; } @@ -9637,6 +9708,8 @@ void kvm_x86_vendor_exit(void) mutex_lock(&vendor_module_lock); kvm_x86_ops.hardware_enable = NULL; mutex_unlock(&vendor_module_lock); + + kvm_hv_vtl_dev_unregister(); } EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit); @@ -10666,6 +10739,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu)) static_call(kvm_x86_update_cpu_dirty_logging)(vcpu); + + if (kvm_check_request(KVM_REQ_HV_INJECT_INTERCEPT, vcpu)) + kvm_hv_deliver_intercept(vcpu); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win || @@ -10713,6 +10789,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* Store vcpu->apicv_active before vcpu->mode. */ smp_store_release(&vcpu->mode, IN_GUEST_MODE); + WRITE_ONCE(vcpu->kicked, false); kvm_vcpu_srcu_read_unlock(vcpu); @@ -11359,6 +11436,11 @@ static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) (unsigned long *)sregs->interrupt_bitmap); } +void kvm_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + __get_sregs(vcpu, sregs); +} + static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) { int i; @@ -11460,6 +11542,188 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return ret; } +static void dump_ftrace_vcpu_state_events(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_events events; + + kvm_vcpu_x86_get_vcpu_events(vcpu, &events); + trace_printk("*** vCPU Events ***\n"); + trace_printk("exception: inj=%u nr=%u err_code=%u pending=%u err_code=%u\n", + events.exception.injected, events.exception.nr, + events.exception.has_error_code, events.exception.pending, + events.exception.error_code); + trace_printk("interrupt: inj=%u nr=%u soft=%u shadow=%u\n", + events.interrupt.injected, events.interrupt.nr, + events.interrupt.soft, events.interrupt.shadow); + trace_printk("nmi: inj=%u pending=%u masked=%u pad=%u\n", + events.nmi.injected, events.nmi.pending, events.nmi.masked, + events.nmi.pad); + trace_printk("sipi_vector: %u\n", events.sipi_vector); + trace_printk("flags: 0x%x\n", events.flags); + trace_printk("smi: smm=%u pending=%u smm_in_nmi=%u latched_init=%u\n", + events.smi.smm, events.smi.pending, + events.smi.smm_inside_nmi, events.smi.latched_init); + trace_printk("triple_fault: pending=%u\n", events.triple_fault.pending); + trace_printk("exc_payload: has_payload=%u payload=0x%llx\n", + events.exception_has_payload, events.exception_payload); +} + +static void dump_ftrace_vcpu_mp_state(struct kvm_vcpu *vcpu) +{ + trace_printk("*** vCPU MP state ***\n"); + trace_printk("mp_state=0x%x\n", vcpu->arch.mp_state); +} + +static void dump_ftrace_vcpu_regs(struct kvm_vcpu *vcpu) +{ + struct kvm_regs regs; + + __get_regs(vcpu, ®s); + trace_printk("*** vCPU Regs ***\n"); + trace_printk("rax=0x%llx rbx=0x%llx rcx=0x%llx rdx=0x%llx\n", regs.rax, + regs.rbx, regs.rcx, regs.rdx); + + trace_printk("rsi=0x%llx rdi=0x%llx rsp=0x%llx rbp=0x%llx\n", regs.rsi, + regs.rdi, regs.rsp, regs.rbp); + + trace_printk("r8=0x%llx r9=0x%llx r10=0x%llx r11=0x%llx\n", regs.r8, regs.r9, + regs.r10, regs.r11); + + 
trace_printk("r12=0x%llx r13=0x%llx r14=0x%llx r15=0x%llx\n", regs.r12, + regs.r13, regs.r14, regs.r15); + + trace_printk("rip=0x%llx rflags=0x%llx\n", regs.rip, regs.rflags); +} + +static void print_segment(const char *name, const struct kvm_segment seg) +{ + trace_printk("%s: base=0x%llx limit=0x%x selector=0x%x type=0x%x present=0x%x dpl=0x%x db=0x%x s=0x%x l=0x%x g=0x%x avl=0x%x unusable=0x%x\n", + name, seg.base, seg.limit, seg.selector, seg.type, seg.present, + seg.dpl, seg.db, seg.s, seg.l, seg.g, seg.avl, seg.unusable); +} + +static void print_dtable(const char *name, const struct kvm_dtable dtable) +{ + trace_printk("%s: base=0x%llx limit=0x%x\n", name, dtable.base, dtable.limit); +} + +static void dump_ftrace_vcpu_sregs2(struct kvm_vcpu *vcpu) +{ + struct kvm_sregs2 sregs; + + __get_sregs2(vcpu, &sregs); + + trace_printk("*** vCPU Sregs ***\n"); + print_segment("cs", sregs.cs); + print_segment("ds", sregs.ds); + print_segment("es", sregs.es); + print_segment("fs", sregs.fs); + print_segment("gs", sregs.gs); + print_segment("ss", sregs.ss); + + print_segment("tr", sregs.tr); + print_segment("ldt", sregs.ldt); + + print_dtable("gdt", sregs.gdt); + print_dtable("idt", sregs.idt); + + trace_printk("cr0=0x%llx cr2=0x%llx cr3=0x%llx cr4=0x%llx cr8=0x%llx\n", + sregs.cr0, sregs.cr2, sregs.cr3, sregs.cr4, sregs.cr8); + + trace_printk("efer=0x%llx apic_base=0x%llx flags=0x%llx\n", sregs.efer, + sregs.apic_base, sregs.flags); + + trace_printk("pdptrs: 0x%llx 0x%llx 0x%llx 0x%llx\n", sregs.pdptrs[0], + sregs.pdptrs[1], sregs.pdptrs[2], sregs.pdptrs[3]); +} + +static void dump_ftrace_vcpu_kvm_lapic_state(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic_state lapic_state; + + kvm_apic_get_state(vcpu, &lapic_state); + + trace_printk("*** vCPU apic state ***\n"); + trace_printk("APIC_ID: 0x%x\n", (unsigned char)lapic_state.regs[0x20]); + trace_printk("APIC_LVR: 0x%x\n", (unsigned char)lapic_state.regs[0x30]); + trace_printk("APIC_TASKPRI: 0x%x\n", (unsigned char)lapic_state.regs[0x80]); + trace_printk("APIC_ARBPRI: 0x%x\n", (unsigned char)lapic_state.regs[0x90]); + trace_printk("APIC_PROCPRI: 0x%x\n", (unsigned char)lapic_state.regs[0xA0]); + trace_printk("APIC_EOI: 0x%x\n", (unsigned char)lapic_state.regs[0xB0]); + trace_printk("APIC_RRR: 0x%x\n", (unsigned char)lapic_state.regs[0xC0]); + trace_printk("APIC_LDR: 0x%x\n", (unsigned char)lapic_state.regs[0xD0]); + trace_printk("APIC_DFR: 0x%x\n", (unsigned char)lapic_state.regs[0xE0]); + trace_printk("APIC_SPIV: 0x%x\n", (unsigned char)lapic_state.regs[0xF0]); + trace_printk("APIC_ISR: 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n", + (unsigned char)lapic_state.regs[0x100], + (unsigned char)lapic_state.regs[0x101], + (unsigned char)lapic_state.regs[0x102], + (unsigned char)lapic_state.regs[0x103], + (unsigned char)lapic_state.regs[0x104], + (unsigned char)lapic_state.regs[0x105], + (unsigned char)lapic_state.regs[0x106], + (unsigned char)lapic_state.regs[0x107]); + + trace_printk("APIC_TMR: 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n", + (unsigned char)lapic_state.regs[0x180], + (unsigned char)lapic_state.regs[0x181], + (unsigned char)lapic_state.regs[0x182], + (unsigned char)lapic_state.regs[0x183], + (unsigned char)lapic_state.regs[0x184], + (unsigned char)lapic_state.regs[0x185], + (unsigned char)lapic_state.regs[0x186], + (unsigned char)lapic_state.regs[0x187]); + + trace_printk("APIC_IRR: 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n", + (unsigned char)lapic_state.regs[0x200], + (unsigned char)lapic_state.regs[0x201], + (unsigned 
char)lapic_state.regs[0x202], + (unsigned char)lapic_state.regs[0x203], + (unsigned char)lapic_state.regs[0x204], + (unsigned char)lapic_state.regs[0x205], + (unsigned char)lapic_state.regs[0x206], + (unsigned char)lapic_state.regs[0x207]); + trace_printk("APIC_ESR: 0x%x\n", (unsigned char)lapic_state.regs[0x280]); + trace_printk("APIC_ICR: 0x%x\n", (unsigned char)lapic_state.regs[0x300]); + trace_printk("APIC_ICR2: 0x%x\n", (unsigned char)lapic_state.regs[0x310]); + trace_printk("APIC_LVTT: 0x%x\n", (unsigned char)lapic_state.regs[0x320]); + trace_printk("APIC_LVTTHMR: 0x%x\n", (unsigned char)lapic_state.regs[0x330]); + trace_printk("APIC_LVTPC: 0x%x\n", (unsigned char)lapic_state.regs[0x340]); + trace_printk("APIC_LVT0: 0x%x\n", (unsigned char)lapic_state.regs[0x350]); + trace_printk("APIC_LVT1: 0x%x\n", (unsigned char)lapic_state.regs[0x360]); + trace_printk("APIC_LVTERR: 0x%x\n", (unsigned char)lapic_state.regs[0x370]); + trace_printk("APIC_TMICT: 0x%x\n", (unsigned char)lapic_state.regs[0x380]); + trace_printk("APIC_TMCCT: 0x%x\n", (unsigned char)lapic_state.regs[0x390]); + trace_printk("APIC_TDCR: 0x%x\n", (unsigned char)lapic_state.regs[0x3E0]); + trace_printk("APIC_SELF_IPI: 0x%x\n", (unsigned char)lapic_state.regs[0x3F0]); +} + +static void dump_ftrace_vcpu_debugregs(struct kvm_vcpu *vcpu) +{ + struct kvm_debugregs debugregs; + + kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &debugregs); + trace_printk("*** vCPU Debug Regs ***\n"); + trace_printk("db[0]=0x%llx db[1]=0x%llx db[2]=0x%llx db[3]=0x%llx\n", + debugregs.db[0], debugregs.db[1], debugregs.db[2], + debugregs.db[3]); + + trace_printk("dr6=0x%llx dr7=0x%llx flags=0x%llx\n", debugregs.dr6, + debugregs.dr7, debugregs.flags); + +} + +void dump_ftrace_vcpu_state(struct kvm_vcpu *vcpu) +{ + dump_ftrace_vcpu_state_events(vcpu); + dump_ftrace_vcpu_mp_state(vcpu); + dump_ftrace_vcpu_regs(vcpu); + dump_ftrace_vcpu_sregs2(vcpu); + dump_ftrace_vcpu_kvm_lapic_state(vcpu); + dump_ftrace_vcpu_debugregs(vcpu); + dump_ftrace_vcpu_hyperv(vcpu); +} + int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code) { @@ -11637,6 +11901,11 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) return 0; } +int kvm_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + return __set_sregs(vcpu, sregs); +} + int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { @@ -11812,7 +12081,7 @@ static void store_regs(struct kvm_vcpu *vcpu) __get_sregs(vcpu, &vcpu->run->s.regs.sregs); if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS) - kvm_vcpu_ioctl_x86_get_vcpu_events( + kvm_vcpu_x86_get_vcpu_events( vcpu, &vcpu->run->s.regs.events); } @@ -11835,7 +12104,7 @@ static int sync_regs(struct kvm_vcpu *vcpu) if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) { struct kvm_vcpu_events events = vcpu->run->s.regs.events; - if (kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events)) + if (kvm_vcpu_x86_set_vcpu_events(vcpu, &events)) return -EINVAL; vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS; @@ -11853,7 +12122,7 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) if (!kvm->arch.max_vcpu_ids) kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS; - if (id >= kvm->arch.max_vcpu_ids) + if (kvm_apic_id_masked(kvm, id) >= kvm->arch.max_vcpu_ids) return -EINVAL; return static_call(kvm_x86_vcpu_precreate)(kvm); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 1e7be1f6ab299d..86c12b202c46b5 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h 
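The kvm_arch_vcpu_precreate() hunk above now range-checks kvm_apic_id_masked(kvm, id) instead of the raw vCPU id, and KVM_SET_APIC_ID_MASK stores arch.apic_id_mask_shift = 32 - width. The helper itself is not shown in this excerpt; under the assumption that it simply keeps the low `width` bits of the APIC ID (so tag bits above the mask, e.g. a per-VTL encoding, do not trip the max_vcpu_ids check), a minimal sketch of the implied semantics might read as follows. The helper name and the width == 0 handling here are assumptions, not the series' actual code.

	/* Hypothetical sketch of the masking semantics, not the series' helper. */
	static inline u32 kvm_apic_id_masked_sketch(struct kvm *kvm, u32 id)
	{
		u32 width = 32 - kvm->arch.apic_id_mask_shift;	/* shift defaults to 0 */

		/* width == 32 (no KVM_SET_APIC_ID_MASK issued) leaves the id untouched */
		return width ? id & ((1ULL << width) - 1) : 0;
	}
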
@@ -499,6 +499,14 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva); bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type); +void kvm_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +int kvm_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); + +void kvm_vcpu_x86_get_vcpu_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events); +int kvm_vcpu_x86_set_vcpu_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events); + /* * Internal error codes that are used to indicate that MSR emulation encountered * an error that should result in #GP in the guest, unless userspace diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index fdac4a1714ec09..369e1a004d016e 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -89,6 +89,8 @@ #define HV_ACCESS_STATS BIT(8) #define HV_DEBUGGING BIT(11) #define HV_CPU_MANAGEMENT BIT(12) +#define HV_ACCESS_VSM BIT(16) +#define HV_ACCESS_VP_REGISTERS BIT(17) #define HV_ENABLE_EXTENDED_HYPERCALLS BIT(20) #define HV_ISOLATION BIT(22) @@ -146,9 +148,13 @@ union hv_reference_tsc_msr { /* Declare the various hypercall operations. */ #define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002 #define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003 -#define HVCALL_ENABLE_VP_VTL 0x000f #define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 #define HVCALL_SEND_IPI 0x000b +#define HVCALL_MODIFY_VTL_PROTECTION_MASK 0x000c +#define HVCALL_ENABLE_PARTITION_VTL 0x000d +#define HVCALL_ENABLE_VP_VTL 0x000f +#define HVCALL_VTL_CALL 0x0011 +#define HVCALL_VTL_RETURN 0x0012 #define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 #define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 #define HVCALL_SEND_IPI_EX 0x0015 @@ -157,6 +163,7 @@ union hv_reference_tsc_msr { #define HVCALL_CREATE_VP 0x004e #define HVCALL_GET_VP_REGISTERS 0x0050 #define HVCALL_SET_VP_REGISTERS 0x0051 +#define HVCALL_TRANSLATE_VIRTUAL_ADDRESS 0x0052 #define HVCALL_POST_MESSAGE 0x005c #define HVCALL_SIGNAL_EVENT 0x005d #define HVCALL_POST_DEBUG_DATA 0x0069 @@ -176,6 +183,7 @@ union hv_reference_tsc_msr { /* Extended hypercalls */ #define HV_EXT_CALL_QUERY_CAPABILITIES 0x8001 +#define HV_EXT_CALL_GET_BOOT_ZEROED_MEMORY 0x8002 #define HV_EXT_CALL_MEMORY_HEAT_HINT 0x8003 #define HV_FLUSH_ALL_PROCESSORS BIT(0) @@ -424,14 +432,16 @@ struct hv_vpset { /* HvCallSendSyntheticClusterIpi hypercall */ struct hv_send_ipi { u32 vector; - u32 reserved; + union hv_input_vtl in_vtl; + u8 reserved[3]; u64 cpu_mask; } __packed; /* HvCallSendSyntheticClusterIpiEx hypercall */ struct hv_send_ipi_ex { u32 vector; - u32 reserved; + union hv_input_vtl in_vtl; + u8 reserved[3]; struct hv_vpset vp_set; } __packed; @@ -823,4 +833,46 @@ struct hv_mmio_write_input { u8 data[HV_HYPERCALL_MMIO_MAX_DATA_LENGTH]; } __packed; +#define HV_NUM_VTLS 2 +#define HV_INVALID_VTL ((u8) -1) +#define HV_ALL_VTLS ((u8) 0xF) + +/* + * VTL call/return hypercall page offsets register + */ +union hv_register_vsm_code_page_offsets { + u64 as_u64; + struct { + u64 vtl_call_offset:12; + u64 vtl_return_offset:12; + u64 reserved:40; + } __packed; +}; + +#define HV_XLATE_GVA_SUCCESS 0 +#define HV_XLATE_GVA_UNMAPPED 1 +#define HV_XLATE_GPA_UNMAPPED 4 +#define HV_CACHE_TYPE_X64_WB 6 + +#define HV_XLATE_GVA_VAL_READ 1 +#define HV_XLATE_GVA_VAL_WRITE 2 +#define HV_XLATE_GVA_VAL_EXECUTE 4 +#define HV_XLATE_GVA_FLAGS_MASK 0x3F + +struct hv_xlate_va_input { + u64 partition_id; + u32 vp_index; + u32 reserved; + u64 
control_flags; + u64 gva; +}; + +struct hv_xlate_va_output { + u32 result_code; + u32 cache_type:8; + u32 overlay_page:1; + u32 reserved:23; + u64 gpa; +}; + #endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 687589ce9f6302..1e50b13db5d551 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -336,6 +336,7 @@ struct kvm_vcpu { #endif int mode; u64 requests; + bool kicked; unsigned long guest_debug; struct mutex mutex; @@ -395,6 +396,8 @@ struct kvm_vcpu { */ struct kvm_memory_slot *last_used_slot; u64 last_used_slot_gen; + wait_queue_head_t wqh; + bool dump_state_on_run; }; /* @@ -2365,7 +2368,7 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr) static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, gpa_t gpa, gpa_t size, bool is_write, bool is_exec, - bool is_private) + bool is_read, bool is_private) { vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT; vcpu->run->memory_fault.gpa = gpa; @@ -2373,6 +2376,12 @@ static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, /* RWX flags are not (yet) defined or communicated to userspace. */ vcpu->run->memory_fault.flags = 0; + if (is_read) + vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_READ; + if (is_write) + vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_WRITE; + if (is_exec) + vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_EXECUTE; if (is_private) vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE; } @@ -2383,6 +2392,9 @@ static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn return xa_to_value(xa_load(&kvm->mem_attr_array, gfn)); } +int kvm_ioctl_set_mem_attributes(struct kvm *kvm, struct xarray *mem_attr_array, + u64 supported_attrs, + struct kvm_memory_attributes *attrs); bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end, unsigned long attrs); bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 3bd31ea23fee9e..ca28e3bd5fc4ae 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -17,7 +17,8 @@ ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL), \ ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH), ERSN(EPR),\ ERSN(SYSTEM_EVENT), ERSN(S390_STSI), ERSN(IOAPIC_EOI), \ - ERSN(HYPERV), ERSN(ARM_NISV), ERSN(X86_RDMSR), ERSN(X86_WRMSR) + ERSN(HYPERV), ERSN(ARM_NISV), ERSN(X86_RDMSR), ERSN(X86_WRMSR), \ + ERSN(MEMORY_FAULT) TRACE_EVENT(kvm_userspace_exit, TP_PROTO(__u32 reason, int errno), @@ -504,6 +505,29 @@ TRACE_EVENT(kvm_test_age_hva, TP_printk("mmu notifier test age hva: %#016lx", __entry->hva) ); +TRACE_EVENT(kvm_set_mem_attributes, + TP_PROTO(u64 start, u64 size, u64 attributes, u64 flags), + TP_ARGS(start, size, attributes, flags), + + TP_STRUCT__entry( + __field( u64, start ) + __field( u64, size ) + __field( u64, attributes ) + __field( u64, flags ) + ), + + TP_fast_assign( + __entry->start = start; + __entry->size = size; + __entry->attributes = attributes; + __entry->flags = flags; + ), + + TP_printk("start 0x%llx, size 0x%llx, attributes 0x%llx, flags 0x%llx", + __entry->start, __entry->size, __entry->attributes, + __entry->flags) +); + #endif /* _TRACE_KVM_MAIN_H */ /* This part must be outside protection */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 5b5820d19e7191..d87cca1c715c06 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -196,6 +196,7 @@ struct kvm_hyperv_exit { #define KVM_EXIT_HYPERV_SYNIC 
1 #define KVM_EXIT_HYPERV_HCALL 2 #define KVM_EXIT_HYPERV_SYNDBG 3 +#define KVM_EXIT_HYPERV_OVERLAY 4 __u32 type; __u32 pad1; union { @@ -210,6 +211,10 @@ struct kvm_hyperv_exit { __u64 input; __u64 result; __u64 params[2]; + //TODO: Maybe export sse128_t? + /* Number of XMM registers used in hypercall input/output */ + #define HV_HYPERCALL_MAX_XMM_REGISTERS 6 + __u64 xmm[HV_HYPERCALL_MAX_XMM_REGISTERS * 2]; } hcall; struct { __u32 msr; @@ -220,6 +225,13 @@ struct kvm_hyperv_exit { __u64 recv_page; __u64 pending_page; } syndbg; + struct { + __u32 msr; /* kernel -> user */ + __u8 error; /* user -> kernel */ + __u8 is_hypercall; /* kernel -> user */ + __u8 pad; + __u64 gpa; /* kernel -> user */ + } overlay; } u; }; @@ -527,7 +539,13 @@ struct kvm_run { } notify; /* KVM_EXIT_MEMORY_FAULT */ struct { +#define KVM_MEMORY_EXIT_FLAG_READ (1ULL << 0) +#define KVM_MEMORY_EXIT_FLAG_WRITE (1ULL << 1) +#define KVM_MEMORY_EXIT_FLAG_EXECUTE (1ULL << 2) #define KVM_MEMORY_EXIT_FLAG_PRIVATE (1ULL << 3) +#define KVM_MEMORY_EXIT_NO_ACCESS \ + (KVM_MEMORY_EXIT_FLAG_NR | KVM_MEMORY_EXIT_FLAG_NW | \ + KVM_MEMORY_EXIT_FLAG_NX) __u64 flags; __u64 gpa; __u64 size; @@ -1219,6 +1237,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_MEMORY_ATTRIBUTES 232 #define KVM_CAP_GUEST_MEMFD 233 #define KVM_CAP_VM_TYPES 234 +#define KVM_CAP_HYPERV_VSM 235 +#define KVM_CAP_APIC_ID_MASK 236 #ifdef KVM_CAP_IRQ_ROUTING @@ -1457,6 +1477,9 @@ struct kvm_device_attr { #define KVM_DEV_VFIO_GROUP_DEL KVM_DEV_VFIO_FILE_DEL #define KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE 3 +#define KVM_DEV_HV_VTL_GROUP 1 +#define KVM_DEV_HV_VTL_GROUP_VTLNUM 1 + enum kvm_device_type { KVM_DEV_TYPE_FSL_MPIC_20 = 1, #define KVM_DEV_TYPE_FSL_MPIC_20 KVM_DEV_TYPE_FSL_MPIC_20 @@ -1480,6 +1503,8 @@ enum kvm_device_type { #define KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_RISCV_AIA, #define KVM_DEV_TYPE_RISCV_AIA KVM_DEV_TYPE_RISCV_AIA + KVM_DEV_TYPE_HV_VSM_VTL, +#define KVM_DEV_TYPE_HV_VSM_VTL KVM_DEV_TYPE_HV_VSM_VTL KVM_DEV_TYPE_MAX, }; @@ -2295,7 +2320,11 @@ struct kvm_memory_attributes { __u64 flags; }; +#define KVM_MEMORY_ATTRIBUTE_READ (1ULL << 0) +#define KVM_MEMORY_ATTRIBUTE_WRITE (1ULL << 1) +#define KVM_MEMORY_ATTRIBUTE_EXECUTE (1ULL << 2) #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3) +#define KVM_MEMORY_ATTRIBUTE_NO_ACCESS (1ULL << 4) #define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd) @@ -2307,4 +2336,9 @@ struct kvm_create_guest_memfd { #define KVM_GUEST_MEMFD_ALLOW_HUGEPAGE (1ULL << 0) +/* Get/Set Hyper-V VSM state. 
Available with KVM_CAP_HYPERV_VSM */ +#define KVM_HV_GET_VSM_STATE _IOR(KVMIO, 0xd5, struct kvm_hv_vsm_state) +#define KVM_HV_SET_VSM_STATE _IOW(KVMIO, 0xd6, struct kvm_hv_vsm_state) + +#define KVM_SET_APIC_ID_MASK _IOW(KVMIO, 0xd7, struct kvm_apic_id_mask) #endif /* __LINUX_KVM_H */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ad9aab898a0c22..c2b663bdee2cbe 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -497,12 +497,15 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) kvm_vcpu_set_dy_eligible(vcpu, false); vcpu->preempted = false; vcpu->ready = false; + vcpu->kicked = false; preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); vcpu->last_used_slot = NULL; /* Fill the stats id string for the vcpu */ snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d", task_pid_nr(current), id); + init_waitqueue_head(&vcpu->wqh); + vcpu->dump_state_on_run = true; } static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) @@ -2552,8 +2555,9 @@ static bool kvm_pre_set_memory_attributes(struct kvm *kvm, } /* Set @attributes for the gfn range [@start, @end). */ -static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, - unsigned long attributes) +static int kvm_set_mem_attributes(struct kvm *kvm, + struct xarray *mem_attr_array, gfn_t start, + gfn_t end, unsigned long attributes) { struct kvm_mmu_notifier_range pre_set_range = { .start = start, @@ -2588,7 +2592,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, * partway through setting the new attributes. */ for (i = start; i < end; i++) { - r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT); + r = xa_reserve(mem_attr_array, i, GFP_KERNEL_ACCOUNT); if (r) goto out_unlock; } @@ -2596,7 +2600,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, kvm_handle_gfn_range(kvm, &pre_set_range); for (i = start; i < end; i++) { - r = xa_err(xa_store(&kvm->mem_attr_array, i, entry, + r = xa_err(xa_store(mem_attr_array, i, entry, GFP_KERNEL_ACCOUNT)); KVM_BUG_ON(r, kvm); } @@ -2608,15 +2612,17 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, return r; } -static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm, - struct kvm_memory_attributes *attrs) + +int kvm_ioctl_set_mem_attributes(struct kvm *kvm, struct xarray *mem_attr_array, + u64 supported_attrs, + struct kvm_memory_attributes *attrs) { gfn_t start, end; /* flags is currently not used. 
*/ if (attrs->flags) return -EINVAL; - if (attrs->attributes & ~kvm_supported_mem_attributes(kvm)) + if (attrs->attributes & ~supported_attrs) return -EINVAL; if (attrs->size == 0 || attrs->address + attrs->size < attrs->address) return -EINVAL; @@ -2633,7 +2639,16 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm, */ BUILD_BUG_ON(sizeof(attrs->attributes) != sizeof(unsigned long)); - return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes); + return kvm_set_mem_attributes(kvm, mem_attr_array, start, end, + attrs->attributes); +} + +static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm, + struct kvm_memory_attributes *attrs) +{ + return kvm_ioctl_set_mem_attributes(kvm, &kvm->mem_attr_array, + kvm_supported_mem_attributes(kvm), + attrs); } #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ @@ -3969,7 +3984,16 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) cpu = READ_ONCE(vcpu->cpu); if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) smp_send_reschedule(cpu); + goto out; } + + if (!cmpxchg(&vcpu->kicked, false, true)) { + wake_up_interruptible(&vcpu->wqh); + trace_printk("vCPU%d\n", vcpu->vcpu_id); + trace_dump_stack(0); + kvm_get_vcpu_by_id(vcpu->kvm, 0)->dump_state_on_run = true; + } + out: put_cpu(); } @@ -4174,6 +4198,25 @@ static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) return 0; } +static __poll_t kvm_vcpu_poll(struct file *file, poll_table *wait) +{ + struct kvm_vcpu *vcpu = file->private_data; + + poll_wait(file, &vcpu->wqh, wait); + + /* + * Make sure we read vcpu->kicked after adding the vcpu into + * the waitqueue list. + */ + smp_mb(); + if (READ_ONCE(vcpu->kicked)) { + trace_printk("up!\n"); + return EPOLLIN; + } + + return 0; +} + static int kvm_vcpu_release(struct inode *inode, struct file *filp) { struct kvm_vcpu *vcpu = filp->private_data; @@ -4186,6 +4229,7 @@ static const struct file_operations kvm_vcpu_fops = { .release = kvm_vcpu_release, .unlocked_ioctl = kvm_vcpu_ioctl, .mmap = kvm_vcpu_mmap, + .poll = kvm_vcpu_poll, .llseek = noop_llseek, KVM_COMPAT(kvm_vcpu_compat_ioctl), }; @@ -4241,8 +4285,9 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) struct kvm_vcpu *vcpu; struct page *page; - if (id >= KVM_MAX_VCPU_IDS) - return -EINVAL; + /* TODO: fix this */ +// if (id >= KVM_MAX_VCPU_IDS) +// return -EINVAL; mutex_lock(&kvm->lock); if (kvm->created_vcpus >= kvm->max_vcpus) {
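
The new VM-level ioctls and the vCPU poll hook are only shown kernel-side in this series; as a usage illustration, a minimal userspace sequence could look like the sketch below. It assumes a <linux/kvm.h> that already carries the KVM_CAP_HYPERV_VSM, KVM_HV_{GET,SET}_VSM_STATE and KVM_SET_APIC_ID_MASK definitions added above, and error handling is omitted for brevity.

	/*
	 * Minimal userspace sketch exercising the VM-level uAPI added in this
	 * series. Illustrative only; assumes updated uapi headers, no error
	 * handling.
	 */
	#include <linux/kvm.h>
	#include <sys/ioctl.h>
	#include <poll.h>

	static void setup_vsm(int vm_fd, __u64 offsets64, __u64 offsets32,
			      __u64 apic_id_width)
	{
		struct kvm_enable_cap cap = { .cap = KVM_CAP_HYPERV_VSM };
		struct kvm_hv_vsm_state vsm = {
			.vsm_code_page_offsets64 = offsets64,
			.vsm_code_page_offsets32 = offsets32,
		};
		struct kvm_apic_id_mask mask = { .width = apic_id_width };

		/* Registers the VTL device class and sets hv_enable_vsm */
		ioctl(vm_fd, KVM_ENABLE_CAP, &cap);

		/* VTL call/return hypercall-page offsets; -EINVAL unless VSM is enabled */
		ioctl(vm_fd, KVM_HV_SET_VSM_STATE, &vsm);

		/* width > 32 is rejected with -EINVAL */
		ioctl(vm_fd, KVM_SET_APIC_ID_MASK, &mask);
	}

	/*
	 * Block until the vCPU is kicked; the new kvm_vcpu_poll() reports
	 * EPOLLIN once vcpu->kicked is observed.
	 */
	static int wait_for_vcpu_kick(int vcpu_fd)
	{
		struct pollfd pfd = { .fd = vcpu_fd, .events = POLLIN };

		return poll(&pfd, 1, -1);
	}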