From 0a04094c8b7e292fcb7bdf8528d70baddbfff379 Mon Sep 17 00:00:00 2001
From: Patrick Roy <[email protected]>
Date: Fri, 18 Jul 2025 15:59:39 +0100
Subject: [PATCH 01/15] KVM: x86: use uhva for kvm-clock if kvm_gpc_refresh()
 fails

kvm-clock uses a gfn_to_pfn_cache to avoid repeated gpa->pfn
computations, relying on MMU notifiers to determine when the
translation needs to be redone.
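
The usage pattern is roughly the following (a simplified sketch of
kvm_setup_guest_pvclock(), not the exact code):

	read_lock_irqsave(&gpc->lock, flags);
	while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) {
		read_unlock_irqrestore(&gpc->lock, flags);

		/* redo the gpa->pfn translation */
		if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock)))
			return;

		read_lock_irqsave(&gpc->lock, flags);
	}
	/* ... update the pvclock area through gpc->khva ... */
	read_unlock_irqrestore(&gpc->lock, flags);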

If the guest places the kvm-clock structure for some vcpu into memory
that is backed by a KVM_MEMSLOT_GMEM_ONLY memslot, and the guest_memfd
instance has GUEST_MEMFD_FLAG_NO_DIRECT_MAP set, this does not work:
gfn_to_pfn_cache internally uses GUP to resolve uhva->pfn, which
returns -EFAULT for direct map removed memory. But even if this pfn
computation were to work, the subsequent attempts to access guest
memory through the direct map would obviously fail.
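
For illustration, such a configuration would be set up by userspace
roughly as follows (a sketch only; GUEST_MEMFD_FLAG_NO_DIRECT_MAP and
KVM_MEMSLOT_GMEM_ONLY are the flags introduced by this series, and
their exact uAPI placement here is illustrative):

	struct kvm_create_guest_memfd gmem = {
		.size  = mem_size,
		.flags = GUEST_MEMFD_FLAG_NO_DIRECT_MAP,
	};
	int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &gmem);
	/* userspace mapping that kvm can fall back to */
	void *uaddr = mmap(NULL, mem_size, PROT_READ | PROT_WRITE,
			   MAP_SHARED, gmem_fd, 0);

	struct kvm_userspace_memory_region2 region = {
		.slot            = 0,
		.flags           = KVM_MEM_GUEST_MEMFD | KVM_MEMSLOT_GMEM_ONLY,
		.guest_phys_addr = 0,
		.memory_size     = mem_size,
		.userspace_addr  = (__u64)uaddr,
		.guest_memfd     = gmem_fd,
	};
	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);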

In this scenario, all other parts of KVM fall back to accessing guest
memory through the userspace mapping of guest_memfd, which is stored in
the memslot's userspace_addr. Have kvm-clock do the same by handling
failures of kvm_gpc_refresh() with a fallback to a pvclock update
routine that operates on userspace mappings. This loses the
gfn_to_pfn_cache optimization for these VMs, but on modern hardware
kvm-clock update requests should be rare enough for this not to matter
(and guest_memfd is not supported for Xen VMs, where the speed of
pvclock accesses is more relevant).
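
Internally, kvm_read_guest()/kvm_write_guest() resolve the gpa through
the memslot and then use plain user accesses, roughly like this (a
simplified sketch of the existing helpers, not new code):

	gfn_t gfn = gpa >> PAGE_SHIFT;
	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
	/* uhva is derived from slot->userspace_addr */
	unsigned long uhva = gfn_to_hva_memslot(slot, gfn);

	if (kvm_is_error_hva(uhva))
		return -EFAULT;
	if (copy_from_user(data, (void __user *)(uhva + offset), len))
		return -EFAULT;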

Alternatively, it would be possible to teach gfn_to_pfn_cache about
(direct map removed) guest_memfd. However, the combination of on-demand
direct map reinsertion (and the refcounting it requires) and hooking
gfn_to_pfn_caches up to gmem invalidations has proven significantly
more complex [1], and hence simply falling back to userspace mappings
was suggested by Sean at one of the guest_memfd upstream calls.

[1]: https://lore.kernel.org/kvm/[email protected]/
     https://lore.kernel.org/kvm/[email protected]/

Signed-off-by: Patrick Roy <[email protected]>
---
 arch/x86/kvm/x86.c | 45 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 33fba801b205..c8fd35c1bbda 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3149,6 +3149,47 @@ u64 get_kvmclock_ns(struct kvm *kvm)
 	return data.clock;
 }
 
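+/*
+ * Slow path: update the guest's pvclock area via the memslot's userspace
+ * mapping (kvm_read_guest()/kvm_write_guest()) instead of a gfn_to_pfn_cache.
+ * Used when kvm_gpc_refresh() fails, e.g. for direct map removed guest_memfd.
+ */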
+static void kvm_setup_guest_pvclock_slow(struct pvclock_vcpu_time_info *ref_hv_clock,
+					 struct kvm_vcpu *vcpu,
+					 gpa_t gpa)
+{
+	struct pvclock_vcpu_time_info guest_hv_clock;
+	struct pvclock_vcpu_time_info hv_clock;
+
+	memcpy(&hv_clock, ref_hv_clock, sizeof(hv_clock));
+
+	if (kvm_read_guest(vcpu->kvm, gpa, &guest_hv_clock, sizeof(guest_hv_clock)))
+		return;
+
+	/*
+	 * This VCPU is paused, but it's legal for a guest to read another
+	 * VCPU's kvmclock, so we really have to follow the specification where
+	 * it says that version is odd if data is being modified, and even after
+	 * it is consistent.
+	 */
+
+	guest_hv_clock.version = hv_clock.version = (guest_hv_clock.version + 1) | 1;
+	smp_wmb();
+
+	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+	hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
+
+	kvm_write_guest(vcpu->kvm, gpa, &hv_clock, sizeof(hv_clock));
+
+	smp_wmb();
+
+	++hv_clock.version;
+	kvm_write_guest(vcpu->kvm, gpa + offsetof(struct pvclock_vcpu_time_info, version),
+			&hv_clock.version, sizeof(hv_clock.version));
+
+	trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock);
+}
+
 static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock,
 				    struct kvm_vcpu *vcpu,
 				    struct gfn_to_pfn_cache *gpc,
@@ -3164,8 +3205,10 @@ static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock,
 	while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) {
 		read_unlock_irqrestore(&gpc->lock, flags);
 
-		if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock)))
+		if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock))) {
+			kvm_setup_guest_pvclock_slow(ref_hv_clock, vcpu, gpc->gpa + offset);
 			return;
+		}
 
 		read_lock_irqsave(&gpc->lock, flags);
 	}
-- 
2.51.0