Skip to content

Commit f3553a2

Browse files
ashkalraintel-lab-lkp
authored andcommitted
x86/sev: Fix host kdump support for SNP
With active SNP VMs, SNP_SHUTDOWN_EX invoked during panic notifiers causes crashkernel boot failure with the following signature: [ 563.497112] sysrq: Trigger a crash [ 563.508415] Kernel panic - not syncing: sysrq triggered crash [ 563.522002] CPU: 10 UID: 0 PID: 4661 Comm: bash Kdump: loaded Not tainted 6.11.0-rc3-next-20240813-snp-host-f2a41ff576cc-dirty torvalds#61 [ 563.549762] Hardware name: AMD Corporation ETHANOL_X/ETHANOL_X, BIOS RXM100AB 10/17/2022 [ 563.566266] Call Trace: [ 563.576430] <TASK> [ 563.585932] dump_stack_lvl+0x2b/0x90 [ 563.597244] dump_stack+0x14/0x20 [ 563.608141] panic+0x3b9/0x400 [ 563.618801] ? srso_alias_return_thunk+0x5/0xfbef5 [ 563.631271] sysrq_handle_crash+0x19/0x20 [ 563.642696] __handle_sysrq+0xf9/0x290 [ 563.653691] ? srso_alias_return_thunk+0x5/0xfbef5 [ 563.666126] write_sysrq_trigger+0x60/0x80 ... ... [ 564.186804] in panic [ 564.194287] in panic_other_cpus_shutdown [ 564.203674] kexec: in crash_smp_send_stop [ 564.213205] kexec: in kdump_nmi_shootdown_cpus [ 564.224338] Kernel Offset: 0x35a00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 564.282209] in snp_shutdown_on_panic after decommision, wbinvd + df_flush required [ 564.462217] ccp 0000:23:00.1: SEV-SNP DF_FLUSH failed with error 14 [ 564.676920] kexec: in native_machine_crash_shutdown early console in extract_kernel input_data: 0x000000007410d2cc input_len: 0x0000000000ce98b2 output: 0x0000000071600000 output_len: 0x000000000379eb8c kernel_total_size: 0x0000000002c30000 needed_size: 0x0000000003800000 trampoline_32bit: 0x0000000000000000 Invalid physical address chosen! Physical KASLR disabled: no suitable memory region! Virtual KASLR using RDRAND RDTSC... Decompressing Linux... Parsing ELF... Performing relocations... done. Booting the kernel (entry_offset: 0x0000000000000bda). 
[ 0.000000] Linux version 6.11.0-rc3-next-20240813-snp-host-f2a41ff576cc-dirty (amd@ethanolx7e2ehost) (gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0, GNU ld (GNU Binutils) 2.40) torvalds#61 SMP Mon Aug 19 19:59:02 UTC 2024 [ 0.000000] Command line: BOOT_IMAGE=/vmlinuz-6.11.0-rc3-next-20240813-snp-host-f2a41ff576cc-dirty root=UUID=4b87a03b-0e78-42ca-a8ad-997e63bba4e0 ro console=tty0 console=ttyS0,115200n8 earlyprintk=ttyS0,115200n8 amd_iommu_dump=1 reset_devices systemd.unit=kdump-tools-dump.service nr_cpus=1 irqpoll nousb elfcorehdr=1916276K [ 0.000000] KERNEL supported cpus: ... ... [ 1.671804] AMD-Vi: Using global IVHD EFR:0x841f77e022094ace, EFR2:0x0 [ 1.679835] AMD-Vi: Translation is already enabled - trying to copy translation structures [ 1.689363] AMD-Vi: Copied DEV table from previous kernel. [ 1.864369] AMD-Vi: Completion-Wait loop timed out [ 2.038289] AMD-Vi: Completion-Wait loop timed out [ 2.212215] AMD-Vi: Completion-Wait loop timed out [ 2.386141] AMD-Vi: Completion-Wait loop timed out [ 2.560068] AMD-Vi: Completion-Wait loop timed out [ 2.733997] AMD-Vi: Completion-Wait loop timed out [ 2.907927] AMD-Vi: Completion-Wait loop timed out [ 3.081855] AMD-Vi: Completion-Wait loop timed out [ 3.225500] AMD-Vi: Completion-Wait loop timed out [ 3.231083] ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1 d out [ 3.579592] AMD-Vi: Completion-Wait loop timed out [ 3.753164] AMD-Vi: Completion-Wait loop timed out [ 3.815762] Kernel panic - not syncing: timer doesn't work through Interrupt-remapped IO-APIC [ 3.825347] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.11.0-rc3-next-20240813-snp-host-f2a41ff576cc-dirty torvalds#61 [ 3.837188] Hardware name: AMD Corporation ETHANOL_X/ETHANOL_X, BIOS RXM100AB 10/17/2022 [ 3.846215] Call Trace: [ 3.848939] <TASK> [ 3.851277] dump_stack_lvl+0x2b/0x90 [ 3.855354] dump_stack+0x14/0x20 [ 3.859050] panic+0x3b9/0x400 [ 3.862454] panic_if_irq_remap+0x21/0x30 [ 3.866925] setup_IO_APIC+0x8aa/0xa50 [ 3.871106] ? 
__pfx_amd_iommu_enable_faulting+0x10/0x10 [ 3.877032] ? __cpuhp_setup_state+0x5e/0xd0 [ 3.881793] apic_intr_mode_init+0x6a/0xf0 [ 3.886360] x86_late_time_init+0x28/0x40 [ 3.890832] start_kernel+0x6a8/0xb50 [ 3.894914] x86_64_start_reservations+0x1c/0x30 [ 3.900064] x86_64_start_kernel+0xbf/0x110 [ 3.904729] ? setup_ghcb+0x12/0x130 [ 3.908716] common_startup_64+0x13e/0x141 [ 3.913283] </TASK> [ 3.915715] in panic [ 3.918149] in panic_other_cpus_shutdown [ 3.922523] ---[ end Kernel panic - not syncing: timer doesn't work through Interrupt-remapped IO-APIC ]--- This happens because SNP_SHUTDOWN_EX fails when SNP VMs are active: the firmware checks every encryption-capable ASID to verify that it is not in use by a guest and that a DF_FLUSH is not required. If a DF_FLUSH is required, the firmware returns DFFLUSH_REQUIRED. To fix this, support was added to do SNP_DECOMMISSION of all active SNP VMs in the panic notifier before doing SNP_SHUTDOWN_EX, but SNP_DECOMMISSION tags every CPU on which the guest has been activated as needing a WBINVD. This causes the SNP_DF_FLUSH command to fail with the following flow: SNP_DECOMMISSION -> SNP_SHUTDOWN_EX -> SNP_DF_FLUSH -> failure with WBINVD_REQUIRED. When the panic notifier is invoked, all other CPUs have already been shut down, so it is not possible to do a wbinvd_on_all_cpus() after SNP_DECOMMISSION has been executed. This eventually causes SNP_SHUTDOWN_EX to fail after SNP_DECOMMISSION. Fix this by doing SNP_DECOMMISSION and the subsequent WBINVD on all CPUs during the NMI shutdown of CPUs, as part of disabling virtualization on all CPUs via cpu_emergency_disable_virtualization() -> svm_emergency_disable(). This fixes and enables crashkernel/kdump on an SNP host. Fixes: c3b86e6 ("x86/cpufeatures: Enable/unmask SEV-SNP CPU feature") Signed-off-by: Ashish Kalra <[email protected]>
1 parent 4d8fdc5 commit f3553a2

File tree

3 files changed

+132
-1
lines changed

3 files changed

+132
-1
lines changed

arch/x86/kvm/svm/sev.c

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ static unsigned int nr_asids;
8989
static unsigned long *sev_asid_bitmap;
9090
static unsigned long *sev_reclaim_asid_bitmap;
9191

92+
/*
 * Table indexed by SEV ASID that records the SNP guest context page of the
 * guest currently launched on that ASID (NULL when none is recorded).
 * Allocated with min_sev_asid entries during SEV hardware setup; the pointer
 * itself stays NULL if that allocation fails, so every user checks it first.
 */
static void **snp_asid_to_gctx_pages_map;
9293
static int snp_decommission_context(struct kvm *kvm);
9394

9495
struct enc_region {
@@ -2248,6 +2249,9 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
22482249
goto e_free_context;
22492250
}
22502251

2252+
if (snp_asid_to_gctx_pages_map)
2253+
snp_asid_to_gctx_pages_map[sev_get_asid(kvm)] = sev->snp_context;
2254+
22512255
return 0;
22522256

22532257
e_free_context:
@@ -2884,9 +2888,35 @@ static int snp_decommission_context(struct kvm *kvm)
28842888
snp_free_firmware_page(sev->snp_context);
28852889
sev->snp_context = NULL;
28862890

2891+
if (snp_asid_to_gctx_pages_map)
2892+
snp_asid_to_gctx_pages_map[sev_get_asid(kvm)] = NULL;
2893+
28872894
return 0;
28882895
}
28892896

2897+
/*
2898+
* NOTE: called in NMI context from sev_emergency_disable().
2899+
*/
2900+
void snp_decommision_all(void)
2901+
{
2902+
struct sev_data_snp_addr data = {};
2903+
int ret, asid;
2904+
2905+
if (!snp_asid_to_gctx_pages_map)
2906+
return;
2907+
2908+
for (asid = 1; asid < min_sev_asid; asid++) {
2909+
if (snp_asid_to_gctx_pages_map[asid]) {
2910+
data.address = __sme_pa(snp_asid_to_gctx_pages_map[asid]);
2911+
ret = sev_do_cmd(SEV_CMD_SNP_DECOMMISSION, &data, NULL);
2912+
if (!ret) {
2913+
snp_free_firmware_page(snp_asid_to_gctx_pages_map[asid]);
2914+
snp_asid_to_gctx_pages_map[asid] = NULL;
2915+
}
2916+
}
2917+
}
2918+
}
2919+
28902920
void sev_vm_destroy(struct kvm *kvm)
28912921
{
28922922
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@ -3052,6 +3082,13 @@ void __init sev_hardware_setup(void)
30523082
sev_es_supported = true;
30533083
sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP);
30543084

3085+
if (sev_snp_supported) {
3086+
snp_asid_to_gctx_pages_map = kmalloc_array(min_sev_asid,
3087+
sizeof(void *),
3088+
GFP_KERNEL | __GFP_ZERO);
3089+
if (!snp_asid_to_gctx_pages_map)
3090+
pr_warn("Could not allocate SNP asid to guest context map\n");
3091+
}
30553092
out:
30563093
if (boot_cpu_has(X86_FEATURE_SEV))
30573094
pr_info("SEV %s (ASIDs %u - %u)\n",
@@ -3094,6 +3131,8 @@ void sev_hardware_unsetup(void)
30943131

30953132
misc_cg_set_capacity(MISC_CG_RES_SEV, 0);
30963133
misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0);
3134+
3135+
kfree(snp_asid_to_gctx_pages_map);
30973136
}
30983137

30993138
int sev_cpu_init(struct svm_cpu_data *sd)

arch/x86/kvm/svm/svm.c

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include <linux/highmem.h>
1818
#include <linux/amd-iommu.h>
1919
#include <linux/sched.h>
20+
#include <linux/delay.h>
2021
#include <linux/trace_events.h>
2122
#include <linux/slab.h>
2223
#include <linux/hashtable.h>
@@ -248,6 +249,8 @@ static unsigned long iopm_base;
248249

249250
DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
250251

252+
/*
 * Serializes the SNP decommission rendezvous in svm_emergency_disable():
 * the "initiated"/"handled" flags and the call to snp_decommision_all() are
 * only touched while this lock is held.
 */
static DEFINE_SPINLOCK(snp_decommision_lock);
253+
251254
/*
252255
* Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via
253256
* the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
@@ -594,9 +597,97 @@ static inline void kvm_cpu_svm_disable(void)
594597

595598
static void svm_emergency_disable(void)
596599
{
600+
static atomic_t waiting_for_cpus_synchronized;
601+
static bool synchronize_cpus_initiated;
602+
static bool snp_decommision_handled;
603+
static atomic_t cpus_synchronized;
604+
597605
kvm_rebooting = true;
598606

599607
kvm_cpu_svm_disable();
608+
609+
if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP))
610+
return;
611+
612+
/*
613+
* SNP_SHUTDOWN_EX fails when SNP VMs are active as the firmware checks
614+
* every encryption-capable ASID to verify that it is not in use by a
615+
* guest and a DF_FLUSH is not required. If a DF_FLUSH is required,
616+
* the firmware returns DFFLUSH_REQUIRED. To address this, SNP_DECOMMISION
617+
* is required to shutdown all active SNP VMs, but SNP_DECOMMISION tags all
618+
* CPUs that guest was activated on to do a WBINVD. When panic notifier
619+
* is invoked all other CPUs have already been shutdown, so it is not
620+
* possible to do a wbinvd_on_all_cpus() after SNP_DECOMMISION has been
621+
* executed. This eventually causes SNP_SHUTDOWN_EX to fail after
622+
* SNP_DECOMMISION. To fix this, do SNP_DECOMMISION and subsequent WBINVD
623+
* on all CPUs during NMI shutdown of CPUs as part of disabling
624+
* virtualization on all CPUs via cpu_emergency_disable_virtualization().
625+
*/
626+
627+
spin_lock(&snp_decommision_lock);
628+
629+
/*
630+
* exit early for call from native_machine_crash_shutdown()
631+
* as SNP_DECOMMISSION has already been done as part of
632+
* NMI shutdown of the CPUs.
633+
*/
634+
if (snp_decommision_handled) {
635+
spin_unlock(&snp_decommision_lock);
636+
return;
637+
}
638+
639+
/*
640+
* Synchronize all CPUs handling NMI before issuing
641+
* SNP_DECOMMISSION.
642+
*/
643+
if (!synchronize_cpus_initiated) {
644+
/*
645+
* one CPU handling panic, the other CPU is initiator for
646+
* CPU synchronization.
647+
*/
648+
atomic_set(&waiting_for_cpus_synchronized, num_online_cpus() - 2);
649+
synchronize_cpus_initiated = true;
650+
/*
651+
* Ensure CPU synchronization parameters are setup before dropping
652+
* the lock to let other CPUs continue to reach synchronization.
653+
*/
654+
wmb();
655+
656+
spin_unlock(&snp_decommision_lock);
657+
658+
/*
659+
* This will not cause system to hang forever as the CPU
660+
* handling panic waits for maximum one second for
661+
* other CPUs to stop in nmi_shootdown_cpus().
662+
*/
663+
while (atomic_read(&waiting_for_cpus_synchronized) > 0)
664+
mdelay(1);
665+
666+
/* Reacquire the lock once CPUs are synchronized */
667+
spin_lock(&snp_decommision_lock);
668+
669+
atomic_set(&cpus_synchronized, 1);
670+
} else {
671+
atomic_dec(&waiting_for_cpus_synchronized);
672+
/*
673+
* drop the lock to let other CPUs contiune to reach
674+
* synchronization.
675+
*/
676+
spin_unlock(&snp_decommision_lock);
677+
678+
while (atomic_read(&cpus_synchronized) == 0)
679+
mdelay(1);
680+
681+
/* Try to re-acquire lock after CPUs are synchronized */
682+
spin_lock(&snp_decommision_lock);
683+
}
684+
685+
if (!snp_decommision_handled) {
686+
snp_decommision_all();
687+
snp_decommision_handled = true;
688+
}
689+
spin_unlock(&snp_decommision_lock);
690+
wbinvd();
600691
}
601692

602693
static void svm_hardware_disable(void)

arch/x86/kvm/svm/svm.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -749,6 +749,7 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu);
749749
int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
750750
void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
751751
int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn);
752+
void snp_decommision_all(void);
752753
#else
753754
static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
754755
{
@@ -779,7 +780,7 @@ static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
779780
{
780781
return 0;
781782
}
782-
783+
/* No SEV support built in: nothing to decommission, so this is a no-op. */
static inline void snp_decommision_all(void) {}
783784
#endif
784785

785786
/* vmenter.S */

0 commit comments

Comments
 (0)