author     Lee Jones <joneslee@google.com>  2024-04-11 18:16:04 +0100
committer  Treehugger Robot <android-test-infra-autosubmit@system.gserviceaccount.com>  2024-04-23 19:17:23 +0000
commit     04c42ae514afa3c1e07796f1198bc6a90cab1904 (patch)
tree       574a2657b9c7ddbf5c39c6487c3ef8e290a9c26d
parent     a0436d5a158ed314a36e2c39b522d17816d8de69 (diff)
parent     508f34f2381eb84b2335abb970b940aefef50a19 (diff)
Merge 508f34f2381e ("Merge tag 'm68k-for-v6.9-tag1' of git://git.kernel.org/pub/scm/linux/kernel/git/geert/linux-m68k") into android-mainline
Steps on the way to v6.9-rc1

Signed-off-by: Lee Jones <joneslee@google.com>
Change-Id: I947378404e1ff2daca861d32e2a43f19c1a1b382
-rw-r--r--  Documentation/ABI/testing/sysfs-devices-system-cpu | 1
-rw-r--r--  Documentation/admin-guide/RAS/address-translation.rst | 24
-rw-r--r--  Documentation/admin-guide/RAS/error-decoding.rst (renamed from Documentation/RAS/ras.rst) | 11
-rw-r--r--  Documentation/admin-guide/RAS/index.rst | 7
-rw-r--r--  Documentation/admin-guide/RAS/main.rst (renamed from Documentation/admin-guide/ras.rst) | 10
-rw-r--r--  Documentation/admin-guide/hw-vuln/index.rst | 1
-rw-r--r--  Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst | 104
-rw-r--r--  Documentation/admin-guide/hw-vuln/spectre.rst | 8
-rw-r--r--  Documentation/admin-guide/index.rst | 2
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt | 30
-rw-r--r--  Documentation/arch/x86/amd-memory-encryption.rst | 16
-rw-r--r--  Documentation/arch/x86/boot.rst | 3
-rw-r--r--  Documentation/arch/x86/pti.rst | 6
-rw-r--r--  Documentation/index.rst | 1
-rw-r--r--  Documentation/process/maintainer-tip.rst | 34
-rw-r--r--  Documentation/virt/coco/sev-guest.rst | 52
-rw-r--r--  MAINTAINERS | 15
-rw-r--r--  arch/alpha/kernel/smp.c | 5
-rw-r--r--  arch/arc/kernel/smp.c | 5
-rw-r--r--  arch/csky/kernel/smp.c | 4
-rw-r--r--  arch/hexagon/kernel/smp.c | 4
-rw-r--r--  arch/m68k/configs/amiga_defconfig | 3
-rw-r--r--  arch/m68k/configs/apollo_defconfig | 3
-rw-r--r--  arch/m68k/configs/atari_defconfig | 3
-rw-r--r--  arch/m68k/configs/bvme6000_defconfig | 3
-rw-r--r--  arch/m68k/configs/hp300_defconfig | 3
-rw-r--r--  arch/m68k/configs/mac_defconfig | 3
-rw-r--r--  arch/m68k/configs/multi_defconfig | 3
-rw-r--r--  arch/m68k/configs/mvme147_defconfig | 3
-rw-r--r--  arch/m68k/configs/mvme16x_defconfig | 3
-rw-r--r--  arch/m68k/configs/q40_defconfig | 3
-rw-r--r--  arch/m68k/configs/sun3_defconfig | 3
-rw-r--r--  arch/m68k/configs/sun3x_defconfig | 3
-rw-r--r--  arch/openrisc/kernel/smp.c | 4
-rw-r--r--  arch/powerpc/kernel/smp.c | 6
-rw-r--r--  arch/riscv/kernel/smpboot.c | 4
-rw-r--r--  arch/s390/Kconfig | 18
-rw-r--r--  arch/s390/Makefile | 10
-rw-r--r--  arch/s390/boot/.gitignore | 1
-rw-r--r--  arch/s390/boot/Makefile | 25
-rw-r--r--  arch/s390/boot/boot.h | 6
-rw-r--r--  arch/s390/boot/startup.c | 75
-rw-r--r--  arch/s390/boot/vmlinux.lds.S | 48
-rw-r--r--  arch/s390/configs/debug_defconfig | 2
-rw-r--r--  arch/s390/crypto/chacha-glue.c | 4
-rw-r--r--  arch/s390/crypto/chacha-s390.S | 2
-rw-r--r--  arch/s390/crypto/crc32-vx.c | 11
-rw-r--r--  arch/s390/crypto/crc32-vx.h | 12
-rw-r--r--  arch/s390/crypto/crc32be-vx.c (renamed from arch/s390/crypto/crc32be-vx.S) | 177
-rw-r--r--  arch/s390/crypto/crc32le-vx.c (renamed from arch/s390/crypto/crc32le-vx.S) | 249
-rw-r--r--  arch/s390/crypto/paes_s390.c | 16
-rw-r--r--  arch/s390/hypfs/hypfs_diag0c.c | 3
-rw-r--r--  arch/s390/hypfs/hypfs_sprp.c | 4
-rw-r--r--  arch/s390/include/asm/access-regs.h | 38
-rw-r--r--  arch/s390/include/asm/appldata.h | 4
-rw-r--r--  arch/s390/include/asm/asm-prototypes.h | 2
-rw-r--r--  arch/s390/include/asm/bug.h | 4
-rw-r--r--  arch/s390/include/asm/checksum.h | 29
-rw-r--r--  arch/s390/include/asm/diag.h | 15
-rw-r--r--  arch/s390/include/asm/entry-common.h | 5
-rw-r--r--  arch/s390/include/asm/fpu-insn-asm.h (renamed from arch/s390/include/asm/vx-insn-asm.h) | 71
-rw-r--r--  arch/s390/include/asm/fpu-insn.h | 486
-rw-r--r--  arch/s390/include/asm/fpu-types.h | 51
-rw-r--r--  arch/s390/include/asm/fpu.h | 295
-rw-r--r--  arch/s390/include/asm/fpu/api.h | 126
-rw-r--r--  arch/s390/include/asm/fpu/internal.h | 67
-rw-r--r--  arch/s390/include/asm/fpu/types.h | 38
-rw-r--r--  arch/s390/include/asm/kvm_host.h | 5
-rw-r--r--  arch/s390/include/asm/lowcore.h | 2
-rw-r--r--  arch/s390/include/asm/pai.h | 3
-rw-r--r--  arch/s390/include/asm/pci.h | 3
-rw-r--r--  arch/s390/include/asm/physmem_info.h | 1
-rw-r--r--  arch/s390/include/asm/processor.h | 11
-rw-r--r--  arch/s390/include/asm/ptrace.h | 4
-rw-r--r--  arch/s390/include/asm/stacktrace.h | 1
-rw-r--r--  arch/s390/include/asm/switch_to.h | 49
-rw-r--r--  arch/s390/include/asm/vx-insn.h | 19
-rw-r--r--  arch/s390/kernel/cache.c | 1
-rw-r--r--  arch/s390/kernel/compat_signal.c | 22
-rw-r--r--  arch/s390/kernel/crash_dump.c | 2
-rw-r--r--  arch/s390/kernel/diag.c | 31
-rw-r--r--  arch/s390/kernel/early.c | 3
-rw-r--r--  arch/s390/kernel/entry.S | 19
-rw-r--r--  arch/s390/kernel/entry.h | 1
-rw-r--r--  arch/s390/kernel/fpu.c | 372
-rw-r--r--  arch/s390/kernel/ipl.c | 3
-rw-r--r--  arch/s390/kernel/machine_kexec.c | 3
-rw-r--r--  arch/s390/kernel/nmi.c | 168
-rw-r--r--  arch/s390/kernel/os_info.c | 6
-rw-r--r--  arch/s390/kernel/perf_pai_crypto.c | 83
-rw-r--r--  arch/s390/kernel/perf_pai_ext.c | 52
-rw-r--r--  arch/s390/kernel/perf_regs.c | 10
-rw-r--r--  arch/s390/kernel/process.c | 31
-rw-r--r--  arch/s390/kernel/ptrace.c | 101
-rw-r--r--  arch/s390/kernel/setup.c | 12
-rw-r--r--  arch/s390/kernel/signal.c | 20
-rw-r--r--  arch/s390/kernel/smp.c | 3
-rw-r--r--  arch/s390/kernel/sysinfo.c | 27
-rw-r--r--  arch/s390/kernel/text_amode31.S | 2
-rw-r--r--  arch/s390/kernel/time.c | 6
-rw-r--r--  arch/s390/kernel/traps.c | 12
-rw-r--r--  arch/s390/kernel/uprobes.c | 1
-rw-r--r--  arch/s390/kernel/vdso32/Makefile | 2
-rw-r--r--  arch/s390/kernel/vdso32/vdso32.lds.S | 1
-rw-r--r--  arch/s390/kernel/vdso64/Makefile | 3
-rw-r--r--  arch/s390/kernel/vdso64/vdso64.lds.S | 1
-rw-r--r--  arch/s390/kernel/vmlinux.lds.S | 54
-rw-r--r--  arch/s390/kvm/gaccess.c | 5
-rw-r--r--  arch/s390/kvm/interrupt.c | 6
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 33
-rw-r--r--  arch/s390/kvm/kvm-s390.h | 18
-rw-r--r--  arch/s390/kvm/vsie.c | 3
-rw-r--r--  arch/s390/lib/Makefile | 1
-rw-r--r--  arch/s390/lib/csum-partial.c | 91
-rw-r--r--  arch/s390/mm/extmem.c | 4
-rw-r--r--  arch/s390/mm/mmap.c | 19
-rw-r--r--  arch/s390/pci/pci.c | 20
-rw-r--r--  arch/s390/pci/pci_debug.c | 10
-rw-r--r--  arch/s390/pci/pci_event.c | 15
-rw-r--r--  arch/s390/pci/pci_sysfs.c | 70
-rw-r--r--  arch/s390/tools/.gitignore | 1
-rw-r--r--  arch/s390/tools/Makefile | 5
-rw-r--r--  arch/s390/tools/relocs.c | 387
-rw-r--r--  arch/sparc/kernel/smp_64.c | 4
-rw-r--r--  arch/x86/Kbuild | 2
-rw-r--r--  arch/x86/Kconfig | 79
-rw-r--r--  arch/x86/Makefile | 14
-rw-r--r--  arch/x86/boot/compressed/acpi.c | 2
-rw-r--r--  arch/x86/boot/compressed/cmdline.c | 2
-rw-r--r--  arch/x86/boot/compressed/efi.c | 2
-rw-r--r--  arch/x86/boot/compressed/efi.h | 9
-rw-r--r--  arch/x86/boot/compressed/ident_map_64.c | 6
-rw-r--r--  arch/x86/boot/compressed/misc.c | 55
-rw-r--r--  arch/x86/boot/compressed/misc.h | 3
-rw-r--r--  arch/x86/boot/compressed/pgtable_64.c | 1
-rw-r--r--  arch/x86/boot/compressed/sev.c | 10
-rw-r--r--  arch/x86/boot/header.S | 4
-rw-r--r--  arch/x86/coco/core.c | 7
-rw-r--r--  arch/x86/configs/i386_defconfig | 2
-rw-r--r--  arch/x86/entry/calling.h | 100
-rw-r--r--  arch/x86/entry/entry.S | 3
-rw-r--r--  arch/x86/entry/entry_32.S | 2
-rw-r--r--  arch/x86/entry/entry_64.S | 15
-rw-r--r--  arch/x86/entry/thunk_32.S | 34
-rw-r--r--  arch/x86/entry/thunk_64.S | 33
-rw-r--r--  arch/x86/entry/vdso/Makefile | 57
-rw-r--r--  arch/x86/entry/vdso/vma.c | 57
-rw-r--r--  arch/x86/events/amd/uncore.c | 2
-rw-r--r--  arch/x86/events/intel/core.c | 1
-rw-r--r--  arch/x86/events/intel/ds.c | 1
-rw-r--r--  arch/x86/include/asm/asm.h | 14
-rw-r--r--  arch/x86/include/asm/coco.h | 9
-rw-r--r--  arch/x86/include/asm/cpufeatures.h | 2
-rw-r--r--  arch/x86/include/asm/current.h | 9
-rw-r--r--  arch/x86/include/asm/debugreg.h | 24
-rw-r--r--  arch/x86/include/asm/desc.h | 1
-rw-r--r--  arch/x86/include/asm/disabled-features.h | 18
-rw-r--r--  arch/x86/include/asm/efi.h | 14
-rw-r--r--  arch/x86/include/asm/elf.h | 1
-rw-r--r--  arch/x86/include/asm/fpu/sched.h | 10
-rw-r--r--  arch/x86/include/asm/fsgsbase.h | 2
-rw-r--r--  arch/x86/include/asm/io.h | 2
-rw-r--r--  arch/x86/include/asm/iommu.h | 1
-rw-r--r--  arch/x86/include/asm/kexec.h | 1
-rw-r--r--  arch/x86/include/asm/kvm-x86-ops.h | 1
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 1
-rw-r--r--  arch/x86/include/asm/linkage.h | 16
-rw-r--r--  arch/x86/include/asm/local.h | 16
-rw-r--r--  arch/x86/include/asm/mem_encrypt.h | 23
-rw-r--r--  arch/x86/include/asm/msr-index.h | 74
-rw-r--r--  arch/x86/include/asm/msr.h | 26
-rw-r--r--  arch/x86/include/asm/nmi.h | 3
-rw-r--r--  arch/x86/include/asm/nospec-branch.h | 57
-rw-r--r--  arch/x86/include/asm/page.h | 6
-rw-r--r--  arch/x86/include/asm/pci.h | 13
-rw-r--r--  arch/x86/include/asm/percpu.h | 189
-rw-r--r--  arch/x86/include/asm/pgalloc.h | 2
-rw-r--r--  arch/x86/include/asm/pgtable-3level.h | 2
-rw-r--r--  arch/x86/include/asm/pgtable.h | 18
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 3
-rw-r--r--  arch/x86/include/asm/pgtable_64_types.h | 2
-rw-r--r--  arch/x86/include/asm/preempt.h | 2
-rw-r--r--  arch/x86/include/asm/processor-flags.h | 2
-rw-r--r--  arch/x86/include/asm/processor.h | 33
-rw-r--r--  arch/x86/include/asm/pti.h | 2
-rw-r--r--  arch/x86/include/asm/resctrl.h | 90
-rw-r--r--  arch/x86/include/asm/setup.h | 2
-rw-r--r--  arch/x86/include/asm/setup_data.h | 32
-rw-r--r--  arch/x86/include/asm/sev.h | 53
-rw-r--r--  arch/x86/include/asm/smp.h | 5
-rw-r--r--  arch/x86/include/asm/spec-ctrl.h | 2
-rw-r--r--  arch/x86/include/asm/special_insns.h | 13
-rw-r--r--  arch/x86/include/asm/static_call.h | 2
-rw-r--r--  arch/x86/include/asm/text-patching.h | 2
-rw-r--r--  arch/x86/include/asm/topology.h | 2
-rw-r--r--  arch/x86/include/asm/trap_pf.h | 20
-rw-r--r--  arch/x86/include/asm/tsc.h | 3
-rw-r--r--  arch/x86/include/asm/uaccess_64.h | 18
-rw-r--r--  arch/x86/include/asm/x86_init.h | 2
-rw-r--r--  arch/x86/include/uapi/asm/bootparam.h | 73
-rw-r--r--  arch/x86/include/uapi/asm/setup_data.h | 83
-rw-r--r--  arch/x86/kernel/Makefile | 1
-rw-r--r--  arch/x86/kernel/acpi/wakeup_64.S | 24
-rw-r--r--  arch/x86/kernel/alternative.c | 23
-rw-r--r--  arch/x86/kernel/apm_32.c | 29
-rw-r--r--  arch/x86/kernel/asm-offsets.c | 2
-rw-r--r--  arch/x86/kernel/callthunks.c | 36
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 37
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 142
-rw-r--r--  arch/x86/kernel/cpu/cacheinfo.c | 7
-rw-r--r--  arch/x86/kernel/cpu/common.c | 53
-rw-r--r--  arch/x86/kernel/cpu/intel_pconfig.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mce/core.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 3
-rw-r--r--  arch/x86/kernel/cpu/rdrand.c | 1
-rw-r--r--  arch/x86/kernel/cpu/resctrl/core.c | 111
-rw-r--r--  arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 48
-rw-r--r--  arch/x86/kernel/cpu/resctrl/internal.h | 75
-rw-r--r--  arch/x86/kernel/cpu/resctrl/monitor.c | 501
-rw-r--r--  arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 15
-rw-r--r--  arch/x86/kernel/cpu/resctrl/rdtgroup.c | 388
-rw-r--r--  arch/x86/kernel/crash.c | 4
-rw-r--r--  arch/x86/kernel/dumpstack.c | 2
-rw-r--r--  arch/x86/kernel/fpu/bugs.c | 2
-rw-r--r--  arch/x86/kernel/ftrace.c | 3
-rw-r--r--  arch/x86/kernel/head64.c | 179
-rw-r--r--  arch/x86/kernel/head_32.S | 4
-rw-r--r--  arch/x86/kernel/head_64.S | 148
-rw-r--r--  arch/x86/kernel/kexec-bzimage64.c | 5
-rw-r--r--  arch/x86/kernel/kprobes/opt.c | 2
-rw-r--r--  arch/x86/kernel/ldt.c | 8
-rw-r--r--  arch/x86/kernel/nmi.c | 10
-rw-r--r--  arch/x86/kernel/process.c | 99
-rw-r--r--  arch/x86/kernel/process_32.c | 7
-rw-r--r--  arch/x86/kernel/process_64.c | 7
-rw-r--r--  arch/x86/kernel/setup.c | 10
-rw-r--r--  arch/x86/kernel/sev-shared.c | 139
-rw-r--r--  arch/x86/kernel/sev.c | 68
-rw-r--r--  arch/x86/kernel/sev_verify_cbit.S | 2
-rw-r--r--  arch/x86/kernel/smp.c | 10
-rw-r--r--  arch/x86/kernel/smpboot.c | 9
-rw-r--r--  arch/x86/kernel/static_call.c | 2
-rw-r--r--  arch/x86/kernel/step.c | 2
-rw-r--r--  arch/x86/kernel/sys_x86_64.c | 7
-rw-r--r--  arch/x86/kernel/traps.c | 4
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 11
-rw-r--r--  arch/x86/kvm/lapic.c | 5
-rw-r--r--  arch/x86/kvm/mmu/mmu.c | 5
-rw-r--r--  arch/x86/kvm/mmu/mmu_internal.h | 2
-rw-r--r--  arch/x86/kvm/svm/nested.c | 2
-rw-r--r--  arch/x86/kvm/svm/sev.c | 37
-rw-r--r--  arch/x86/kvm/svm/svm.c | 19
-rw-r--r--  arch/x86/kvm/svm/svm.h | 1
-rw-r--r--  arch/x86/kvm/svm/vmenter.S | 4
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 2
-rw-r--r--  arch/x86/kvm/x86.c | 5
-rw-r--r--  arch/x86/lib/Makefile | 15
-rw-r--r--  arch/x86/lib/cmpxchg16b_emu.S | 12
-rw-r--r--  arch/x86/lib/cmpxchg8b_emu.S | 30
-rw-r--r--  arch/x86/lib/insn-eval.c | 6
-rw-r--r--  arch/x86/lib/insn.c | 58
-rw-r--r--  arch/x86/lib/msr-smp.c | 12
-rw-r--r--  arch/x86/lib/msr.c | 6
-rw-r--r--  arch/x86/lib/retpoline.S | 41
-rw-r--r--  arch/x86/mm/Makefile | 3
-rw-r--r--  arch/x86/mm/debug_pagetables.c | 4
-rw-r--r--  arch/x86/mm/dump_pagetables.c | 4
-rw-r--r--  arch/x86/mm/fault.c | 32
-rw-r--r--  arch/x86/mm/mem_encrypt.c | 55
-rw-r--r--  arch/x86/mm/mem_encrypt_identity.c | 108
-rw-r--r--  arch/x86/mm/pat/memtype.c | 9
-rw-r--r--  arch/x86/mm/pat/set_memory.c | 19
-rw-r--r--  arch/x86/mm/pgtable.c | 4
-rw-r--r--  arch/x86/mm/tlb.c | 10
-rw-r--r--  arch/x86/net/bpf_jit_comp.c | 4
-rw-r--r--  arch/x86/net/bpf_jit_comp32.c | 2
-rw-r--r--  arch/x86/platform/efi/efi.c | 5
-rw-r--r--  arch/x86/platform/pvh/enlighten.c | 1
-rw-r--r--  arch/x86/purgatory/Makefile | 2
-rw-r--r--  arch/x86/realmode/rm/trampoline_64.S | 33
-rw-r--r--  arch/x86/virt/svm/Makefile | 3
-rw-r--r--  arch/x86/virt/svm/sev.c | 560
-rw-r--r--  arch/x86/xen/enlighten_pvh.c | 1
-rw-r--r--  arch/x86/xen/vga.c | 1
-rw-r--r--  arch/x86/xen/xen-asm.S | 10
-rw-r--r--  block/blk-settings.c | 2
-rw-r--r--  drivers/base/cpu.c | 3
-rw-r--r--  drivers/crypto/ccp/Kconfig | 2
-rw-r--r--  drivers/crypto/ccp/sev-dev.c | 1150
-rw-r--r--  drivers/crypto/ccp/sev-dev.h | 5
-rw-r--r--  drivers/edac/Kconfig | 1
-rw-r--r--  drivers/edac/amd64_edac.c | 286
-rw-r--r--  drivers/edac/i10nm_base.c | 1
-rw-r--r--  drivers/edac/igen6_edac.c | 2
-rw-r--r--  drivers/edac/synopsys_edac.c | 4
-rw-r--r--  drivers/edac/versal_edac.c | 199
-rw-r--r--  drivers/firmware/efi/libstub/efi-stub-helper.c | 8
-rw-r--r--  drivers/firmware/efi/libstub/efistub.h | 2
-rw-r--r--  drivers/firmware/efi/libstub/x86-stub.c | 3
-rw-r--r--  drivers/iommu/amd/amd_iommu.h | 1
-rw-r--r--  drivers/iommu/amd/init.c | 125
-rw-r--r--  drivers/md/dm-table.c | 27
-rw-r--r--  drivers/pci/hotplug/s390_pci_hpc.c | 65
-rw-r--r--  drivers/ras/Kconfig | 13
-rw-r--r--  drivers/ras/Makefile | 3
-rw-r--r--  drivers/ras/amd/atl/Kconfig | 21
-rw-r--r--  drivers/ras/amd/atl/Makefile | 18
-rw-r--r--  drivers/ras/amd/atl/access.c | 133
-rw-r--r--  drivers/ras/amd/atl/core.c | 225
-rw-r--r--  drivers/ras/amd/atl/dehash.c | 500
-rw-r--r--  drivers/ras/amd/atl/denormalize.c | 718
-rw-r--r--  drivers/ras/amd/atl/internal.h | 306
-rw-r--r--  drivers/ras/amd/atl/map.c | 682
-rw-r--r--  drivers/ras/amd/atl/reg_fields.h | 606
-rw-r--r--  drivers/ras/amd/atl/system.c | 288
-rw-r--r--  drivers/ras/amd/atl/umc.c | 341
-rw-r--r--  drivers/ras/amd/fmpm.c | 1013
-rw-r--r--  drivers/ras/cec.c | 10
-rw-r--r--  drivers/ras/debugfs.c | 8
-rw-r--r--  drivers/ras/debugfs.h | 2
-rw-r--r--  drivers/ras/ras.c | 31
-rw-r--r--  drivers/s390/char/vmur.c | 4
-rw-r--r--  drivers/s390/char/zcore.c | 1
-rw-r--r--  drivers/s390/cio/ccwgroup.c | 4
-rw-r--r--  drivers/s390/cio/chsc.c | 4
-rw-r--r--  drivers/s390/cio/chsc_sch.c | 20
-rw-r--r--  drivers/s390/cio/cmf.c | 6
-rw-r--r--  drivers/s390/cio/css.c | 4
-rw-r--r--  drivers/s390/cio/device.c | 4
-rw-r--r--  drivers/s390/cio/scm.c | 4
-rw-r--r--  drivers/s390/crypto/ap_bus.c | 257
-rw-r--r--  drivers/s390/crypto/ap_bus.h | 8
-rw-r--r--  drivers/s390/crypto/ap_debug.h | 4
-rw-r--r--  drivers/s390/crypto/ap_queue.c | 31
-rw-r--r--  drivers/s390/crypto/pkey_api.c | 226
-rw-r--r--  drivers/s390/crypto/vfio_ap_drv.c | 2
-rw-r--r--  drivers/s390/crypto/vfio_ap_ops.c | 35
-rw-r--r--  drivers/s390/crypto/zcrypt_api.c | 228
-rw-r--r--  drivers/s390/crypto/zcrypt_api.h | 9
-rw-r--r--  drivers/s390/crypto/zcrypt_ccamisc.c | 214
-rw-r--r--  drivers/s390/crypto/zcrypt_debug.h | 4
-rw-r--r--  drivers/s390/crypto/zcrypt_ep11misc.c | 127
-rw-r--r--  drivers/s390/crypto/zcrypt_error.h | 5
-rw-r--r--  drivers/s390/crypto/zcrypt_msgtype50.c | 14
-rw-r--r--  drivers/s390/crypto/zcrypt_msgtype6.c | 45
-rw-r--r--  drivers/zorro/zorro-driver.c | 2
-rw-r--r--  drivers/zorro/zorro.h | 2
-rw-r--r--  include/linux/amd-iommu.h | 6
-rw-r--r--  include/linux/atomic/atomic-arch-fallback.h | 46
-rw-r--r--  include/linux/atomic/atomic-instrumented.h | 68
-rw-r--r--  include/linux/atomic/atomic-long.h | 24
-rw-r--r--  include/linux/compiler-gcc.h | 2
-rw-r--r--  include/linux/compiler.h | 2
-rw-r--r--  include/linux/compiler_attributes.h | 12
-rw-r--r--  include/linux/cpu.h | 4
-rw-r--r--  include/linux/indirect_call_wrapper.h | 2
-rw-r--r--  include/linux/module.h | 2
-rw-r--r--  include/linux/mutex.h | 8
-rw-r--r--  include/linux/objtool.h | 2
-rw-r--r--  include/linux/psp-sev.h | 321
-rw-r--r--  include/linux/pti.h | 2
-rw-r--r--  include/linux/ras.h | 18
-rw-r--r--  include/linux/resctrl.h | 48
-rw-r--r--  include/linux/sched/sd_flags.h | 4
-rw-r--r--  include/linux/sched/topology.h | 6
-rw-r--r--  include/linux/smp.h | 13
-rw-r--r--  include/linux/tick.h | 12
-rw-r--r--  include/net/netfilter/nf_tables_core.h | 2
-rw-r--r--  include/net/tc_wrapper.h | 2
-rw-r--r--  include/uapi/linux/psp-sev.h | 59
-rw-r--r--  init/main.c | 4
-rw-r--r--  kernel/locking/percpu-rwsem.c | 11
-rw-r--r--  kernel/locking/qspinlock_paravirt.h | 2
-rw-r--r--  kernel/locking/rtmutex.c | 9
-rw-r--r--  kernel/locking/rwsem.c | 6
-rw-r--r--  kernel/sched/core.c | 4
-rw-r--r--  kernel/sched/fair.c | 110
-rw-r--r--  kernel/sched/idle.c | 21
-rw-r--r--  kernel/sched/sched.h | 2
-rw-r--r--  kernel/sched/topology.c | 35
-rw-r--r--  kernel/time/Kconfig | 5
-rw-r--r--  kernel/trace/ring_buffer.c | 2
-rw-r--r--  lib/Kconfig.debug | 4
-rw-r--r--  lib/raid6/s390vx.uc | 62
-rw-r--r--  net/netfilter/Makefile | 2
-rw-r--r--  net/netfilter/nf_tables_core.c | 6
-rw-r--r--  net/netfilter/nft_ct.c | 4
-rw-r--r--  net/netfilter/nft_lookup.c | 2
-rw-r--r--  net/sched/sch_api.c | 2
-rw-r--r--  scripts/Makefile.lib | 8
-rw-r--r--  scripts/Makefile.vmlinux_o | 2
-rw-r--r--  scripts/atomic/kerneldoc/add_unless | 1
-rw-r--r--  scripts/atomic/kerneldoc/cmpxchg | 1
-rw-r--r--  scripts/atomic/kerneldoc/dec_if_positive | 1
-rw-r--r--  scripts/atomic/kerneldoc/dec_unless_positive | 1
-rw-r--r--  scripts/atomic/kerneldoc/inc_not_zero | 1
-rw-r--r--  scripts/atomic/kerneldoc/inc_unless_negative | 1
-rw-r--r--  scripts/atomic/kerneldoc/try_cmpxchg | 3
-rw-r--r--  scripts/generate_rust_target.rs | 2
-rw-r--r--  scripts/mod/modpost.c | 2
-rw-r--r--  tools/arch/x86/include/asm/cpufeatures.h | 1
-rw-r--r--  tools/arch/x86/include/asm/disabled-features.h | 10
-rw-r--r--  tools/arch/x86/lib/insn.c | 58
-rw-r--r--  tools/objtool/arch/x86/special.c | 2
-rw-r--r--  tools/objtool/check.c | 4
-rw-r--r--  tools/testing/selftests/kvm/s390x/memop.c | 31
406 files changed, 13836 insertions(+), 4328 deletions(-)
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index a1db6db47505..710d47be11e0 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -516,6 +516,7 @@ What: /sys/devices/system/cpu/vulnerabilities
/sys/devices/system/cpu/vulnerabilities/mds
/sys/devices/system/cpu/vulnerabilities/meltdown
/sys/devices/system/cpu/vulnerabilities/mmio_stale_data
+ /sys/devices/system/cpu/vulnerabilities/reg_file_data_sampling
/sys/devices/system/cpu/vulnerabilities/retbleed
/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
/sys/devices/system/cpu/vulnerabilities/spectre_v1
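
As an aside (not part of the patch), the new entry reads like any other file
in this directory. A minimal C sketch, for illustration only::

    /* Illustrative only: print the RFDS vulnerability status. */
    #include <stdio.h>

    int main(void)
    {
            char line[256];
            FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/reg_file_data_sampling", "r");

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            if (fgets(line, sizeof(line), f))
                    printf("RFDS status: %s", line);
            fclose(f);
            return 0;
    }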
diff --git a/Documentation/admin-guide/RAS/address-translation.rst b/Documentation/admin-guide/RAS/address-translation.rst
new file mode 100644
index 000000000000..f0ca17b43cd3
--- /dev/null
+++ b/Documentation/admin-guide/RAS/address-translation.rst
@@ -0,0 +1,24 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Address translation
+===================
+
+x86 AMD
+-------
+
+Zen-based AMD systems include a Data Fabric that manages the layout of
+physical memory. Devices attached to the Fabric, like memory controllers,
+I/O, etc., may not have a complete view of the system physical memory map.
+These devices may provide a "normalized", i.e. device physical, address
+when reporting memory errors. Normalized addresses must be translated to
+a system physical address for the kernel to action on the memory.
+
+AMD Address Translation Library (CONFIG_AMD_ATL) provides translation for
+this case.
+
+Glossary of acronyms used in address translation for Zen-based systems
+
+* CCM = Cache Coherent Moderator
+* COD = Cluster-on-Die
+* COH_ST = Coherent Station
+* DF = Data Fabric
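
To make the "normalized address" idea above concrete, here is a toy C sketch
(illustrative only, not the AMD ATL algorithm; the real Data Fabric interleave
layouts are exactly what CONFIG_AMD_ATL decodes). It assumes a hypothetical
2-channel interleave on address bit 8::

    #include <stdint.h>
    #include <stdio.h>

    /*
     * A memory controller that owns every other 256-byte block sees a
     * "normalized" address with the interleave bit removed. Denormalization
     * re-inserts that bit, plus the channel number, to recover the system
     * physical address.
     */
    static uint64_t denormalize(uint64_t norm_addr, unsigned int channel)
    {
            uint64_t low  = norm_addr & 0xff;   /* bits below the interleave bit */
            uint64_t high = norm_addr >> 8;     /* bits above it */

            return (high << 9) | ((uint64_t)channel << 8) | low;
    }

    int main(void)
    {
            /* A normalized error address reported by channel 1. */
            printf("sys addr = 0x%llx\n",
                   (unsigned long long)denormalize(0x1234, 1));
            return 0;
    }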
diff --git a/Documentation/RAS/ras.rst b/Documentation/admin-guide/RAS/error-decoding.rst
index 2556b397cd27..26a72f3fe5de 100644
--- a/Documentation/RAS/ras.rst
+++ b/Documentation/admin-guide/RAS/error-decoding.rst
@@ -1,15 +1,10 @@
.. SPDX-License-Identifier: GPL-2.0
-Reliability, Availability and Serviceability features
-=====================================================
-
-This documents different aspects of the RAS functionality present in the
-kernel.
-
Error decoding
----------------
+==============
-* x86
+x86
+---
Error decoding on AMD systems should be done using the rasdaemon tool:
https://github.com/mchehab/rasdaemon/
diff --git a/Documentation/admin-guide/RAS/index.rst b/Documentation/admin-guide/RAS/index.rst
new file mode 100644
index 000000000000..f4087040a7c0
--- /dev/null
+++ b/Documentation/admin-guide/RAS/index.rst
@@ -0,0 +1,7 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. toctree::
+ :maxdepth: 2
+
+ main
+ error-decoding
+ address-translation
diff --git a/Documentation/admin-guide/ras.rst b/Documentation/admin-guide/RAS/main.rst
index 8e03751d126d..7ac1d4ccc509 100644
--- a/Documentation/admin-guide/ras.rst
+++ b/Documentation/admin-guide/RAS/main.rst
@@ -1,8 +1,12 @@
+.. SPDX-License-Identifier: GPL-2.0
.. include:: <isonum.txt>
-============================================
-Reliability, Availability and Serviceability
-============================================
+==================================================
+Reliability, Availability and Serviceability (RAS)
+==================================================
+
+This documents different aspects of the RAS functionality present in the
+kernel.
RAS concepts
************
diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst
index de99caabf65a..ff0b440ef2dc 100644
--- a/Documentation/admin-guide/hw-vuln/index.rst
+++ b/Documentation/admin-guide/hw-vuln/index.rst
@@ -21,3 +21,4 @@ are configurable at compile, boot or run time.
cross-thread-rsb
srso
gather_data_sampling
+ reg-file-data-sampling
diff --git a/Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst b/Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst
new file mode 100644
index 000000000000..0585d02b9a6c
--- /dev/null
+++ b/Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst
@@ -0,0 +1,104 @@
+==================================
+Register File Data Sampling (RFDS)
+==================================
+
+Register File Data Sampling (RFDS) is a microarchitectural vulnerability that
+only affects Intel Atom parts (also branded as E-cores). RFDS may allow
+a malicious actor to infer data values previously used in floating point
+registers, vector registers, or integer registers. RFDS does not provide the
+ability to choose which data is inferred. CVE-2023-28746 is assigned to RFDS.
+
+Affected Processors
+===================
+Below is the list of affected Intel processors [#f1]_:
+
+ =================== ============
+ Common name Family_Model
+ =================== ============
+ ATOM_GOLDMONT 06_5CH
+ ATOM_GOLDMONT_D 06_5FH
+ ATOM_GOLDMONT_PLUS 06_7AH
+ ATOM_TREMONT_D 06_86H
+ ATOM_TREMONT 06_96H
+ ALDERLAKE 06_97H
+ ALDERLAKE_L 06_9AH
+ ATOM_TREMONT_L 06_9CH
+ RAPTORLAKE 06_B7H
+ RAPTORLAKE_P 06_BAH
+ ATOM_GRACEMONT 06_BEH
+ RAPTORLAKE_S 06_BFH
+ =================== ============
+
+As an exception to this table, Intel Xeon E family parts ALDERLAKE (06_97H) and
+RAPTORLAKE (06_B7H), codenamed Catlow, are not affected. They are reported as
+vulnerable in Linux because they share the same family/model with an affected
+part. Unlike their affected counterparts, they do not enumerate RFDS_CLEAR or
+CPUID.HYBRID. This information could be used to distinguish between the
+affected and unaffected parts, but it is deemed not worth adding complexity as
+the reporting is fixed automatically when these parts enumerate RFDS_NO.
+
+Mitigation
+==========
+Intel released a microcode update that enables software to clear sensitive
+information using the VERW instruction. Like MDS, RFDS deploys the same
+mitigation strategy to force the CPU to clear the affected buffers before an
+attacker can extract the secrets. This is achieved by using the otherwise
+unused and obsolete VERW instruction in combination with a microcode update.
+The microcode clears the affected CPU buffers when the VERW instruction is
+executed.
+
+Mitigation points
+-----------------
+VERW is executed by the kernel before returning to user space, and by KVM
+before VMentry. None of the affected cores support SMT, so VERW is not required
+at C-state transitions.
+
+New bits in IA32_ARCH_CAPABILITIES
+----------------------------------
+Newer processors, and microcode updates on existing affected processors, add new
+bits to IA32_ARCH_CAPABILITIES MSR. These bits can be used to enumerate
+vulnerability and mitigation capability:
+
+- Bit 27 - RFDS_NO - When set, processor is not affected by RFDS.
+- Bit 28 - RFDS_CLEAR - When set, processor is affected by RFDS, and has the
+ microcode that clears the affected buffers on VERW execution.
+
+Mitigation control on the kernel command line
+---------------------------------------------
+The kernel command line allows controlling the RFDS mitigation at boot time with the
+parameter "reg_file_data_sampling=". The valid arguments are:
+
+ ========== =================================================================
+ on         If the CPU is vulnerable, enable mitigation; CPU buffer clearing
+            on exit to userspace and before entering a VM.
+ off        Disables mitigation.
+ ========== =================================================================
+
+Mitigation default is selected by CONFIG_MITIGATION_RFDS.
+
+Mitigation status information
+-----------------------------
+The Linux kernel provides a sysfs interface to enumerate the current
+vulnerability status of the system: whether the system is vulnerable, and
+which mitigations are active. The relevant sysfs file is:
+
+ /sys/devices/system/cpu/vulnerabilities/reg_file_data_sampling
+
+The possible values in this file are:
+
+ .. list-table::
+
+ * - 'Not affected'
+ - The processor is not vulnerable
+ * - 'Vulnerable'
+ - The processor is vulnerable, but no mitigation enabled
+ * - 'Vulnerable: No microcode'
+ - The processor is vulnerable but microcode is not updated.
+ * - 'Mitigation: Clear Register File'
+ - The processor is vulnerable and the CPU buffer clearing mitigation is
+ enabled.
+
+References
+----------
+.. [#f1] Affected Processors
+ https://www.intel.com/content/www/us/en/developer/topic-technology/software-security-guidance/processors-affected-consolidated-product-cpu-model.html
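
As an aside (not part of the patch), the RFDS_NO and RFDS_CLEAR bits described
above can be inspected from userspace through the msr driver (CONFIG_X86_MSR,
root required); IA32_ARCH_CAPABILITIES is MSR 0x10a::

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            uint64_t caps;
            int fd = open("/dev/cpu/0/msr", O_RDONLY);

            /* The MSR address doubles as the file offset. */
            if (fd < 0 || pread(fd, &caps, sizeof(caps), 0x10a) != sizeof(caps)) {
                    perror("IA32_ARCH_CAPABILITIES");
                    return 1;
            }
            printf("RFDS_NO=%d RFDS_CLEAR=%d\n",
                   (int)((caps >> 27) & 1), (int)((caps >> 28) & 1));
            close(fd);
            return 0;
    }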
diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst
index 32a8893e5617..cce768afec6b 100644
--- a/Documentation/admin-guide/hw-vuln/spectre.rst
+++ b/Documentation/admin-guide/hw-vuln/spectre.rst
@@ -473,8 +473,8 @@ Spectre variant 2
-mindirect-branch=thunk-extern -mindirect-branch-register options.
If the kernel is compiled with a Clang compiler, the compiler needs
to support -mretpoline-external-thunk option. The kernel config
- CONFIG_RETPOLINE needs to be turned on, and the CPU needs to run with
- the latest updated microcode.
+ CONFIG_MITIGATION_RETPOLINE needs to be turned on, and the CPU needs
+ to run with the latest updated microcode.
On Intel Skylake-era systems the mitigation covers most, but not all,
cases. See :ref:`[3] <spec_ref3>` for more details.
@@ -609,8 +609,8 @@ kernel command line.
Selecting 'on' will, and 'auto' may, choose a
mitigation method at run time according to the
CPU, the available microcode, the setting of the
- CONFIG_RETPOLINE configuration option, and the
- compiler with which the kernel was built.
+ CONFIG_MITIGATION_RETPOLINE configuration option,
+ and the compiler with which the kernel was built.
Selecting 'on' will also enable the mitigation
against user space to user space task attacks.
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
index fb40a1f6f79e..dfc06fab9432 100644
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -122,7 +122,7 @@ configure specific aspects of kernel behavior to your liking.
pmf
pnp
rapidio
- ras
+ RAS/index
rtc
serial-console
svga
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 3d635962d3ca..85c061926293 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1136,6 +1136,26 @@
The filter can be disabled or changed to another
driver later using sysfs.
+ reg_file_data_sampling=
+ [X86] Controls mitigation for Register File Data
+ Sampling (RFDS) vulnerability. RFDS is a CPU
+ vulnerability which may allow userspace to infer
+ kernel data values previously stored in floating point
+ registers, vector registers, or integer registers.
+ RFDS only affects Intel Atom processors.
+
+ on: Turns ON the mitigation.
+ off: Turns OFF the mitigation.
+
+ This parameter overrides the compile time default set
+ by CONFIG_MITIGATION_RFDS. Mitigation cannot be
+ disabled when other VERW based mitigations (like MDS)
+ are enabled. In order to disable RFDS mitigation all
+ VERW based mitigations need to be disabled.
+
+ For details see:
+ Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst
+
driver_async_probe= [KNL]
List of driver names to be probed asynchronously. *
matches with all driver names. If * is specified, the
@@ -3322,9 +3342,7 @@
mem_encrypt= [X86-64] AMD Secure Memory Encryption (SME) control
Valid arguments: on, off
- Default (depends on kernel configuration option):
- on (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y)
- off (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=n)
+ Default: off
mem_encrypt=on: Activate SME
mem_encrypt=off: Do not activate SME
@@ -3400,7 +3418,9 @@
nospectre_bhb [ARM64]
nospectre_v1 [X86,PPC]
nospectre_v2 [X86,PPC,S390,ARM64]
+ reg_file_data_sampling=off [X86]
retbleed=off [X86]
+ spec_rstack_overflow=off [X86]
spec_store_bypass_disable=off [X86,PPC]
spectre_v2_user=off [X86]
srbds=off [X86,INTEL]
@@ -6041,8 +6061,8 @@
Selecting 'on' will, and 'auto' may, choose a
mitigation method at run time according to the
CPU, the available microcode, the setting of the
- CONFIG_RETPOLINE configuration option, and the
- compiler with which the kernel was built.
+ CONFIG_MITIGATION_RETPOLINE configuration option,
+ and the compiler with which the kernel was built.
Selecting 'on' will also enable the mitigation
against user space to user space task attacks.
diff --git a/Documentation/arch/x86/amd-memory-encryption.rst b/Documentation/arch/x86/amd-memory-encryption.rst
index 07caa8fff852..414bc7402ae7 100644
--- a/Documentation/arch/x86/amd-memory-encryption.rst
+++ b/Documentation/arch/x86/amd-memory-encryption.rst
@@ -87,14 +87,14 @@ The state of SME in the Linux kernel can be documented as follows:
kernel is non-zero).
SME can also be enabled and activated in the BIOS. If SME is enabled and
-activated in the BIOS, then all memory accesses will be encrypted and it will
-not be necessary to activate the Linux memory encryption support. If the BIOS
-merely enables SME (sets bit 23 of the MSR_AMD64_SYSCFG), then Linux can activate
-memory encryption by default (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y) or
-by supplying mem_encrypt=on on the kernel command line. However, if BIOS does
-not enable SME, then Linux will not be able to activate memory encryption, even
-if configured to do so by default or the mem_encrypt=on command line parameter
-is specified.
+activated in the BIOS, then all memory accesses will be encrypted and it
+will not be necessary to activate the Linux memory encryption support.
+
+If the BIOS merely enables SME (sets bit 23 of the MSR_AMD64_SYSCFG),
+then memory encryption can be enabled by supplying mem_encrypt=on on the
+kernel command line. However, if BIOS does not enable SME, then Linux
+will not be able to activate memory encryption, even if configured to do
+so by default or the mem_encrypt=on command line parameter is specified.
Secure Nested Paging (SNP)
==========================
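
As an aside (not part of the patch), the SYSCFG bit 23 mentioned in the SME
text above can be inspected the same way through the msr driver; this sketch
assumes MSR_AMD64_SYSCFG is MSR 0xc0010010 and requires root::

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            uint64_t syscfg;
            int fd = open("/dev/cpu/0/msr", O_RDONLY);

            if (fd < 0 || pread(fd, &syscfg, sizeof(syscfg), 0xc0010010) != sizeof(syscfg)) {
                    perror("MSR_AMD64_SYSCFG");
                    return 1;
            }
            /* Bit 23: SME enabled by BIOS (MemEncryptionModEn). */
            printf("SME enabled by BIOS: %s\n", (syscfg >> 23) & 1 ? "yes" : "no");
            close(fd);
            return 0;
    }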
diff --git a/Documentation/arch/x86/boot.rst b/Documentation/arch/x86/boot.rst
index c513855a54bb..4fd492cb4970 100644
--- a/Documentation/arch/x86/boot.rst
+++ b/Documentation/arch/x86/boot.rst
@@ -878,7 +878,8 @@ Protocol: 2.10+
address if possible.
A non-relocatable kernel will unconditionally move itself and run
- at this address.
+ at this address. A relocatable kernel will move itself to this address if it
+ is loaded below this address.
============ =======
Field name: init_size
diff --git a/Documentation/arch/x86/pti.rst b/Documentation/arch/x86/pti.rst
index e08d35177bc0..57e8392f61d3 100644
--- a/Documentation/arch/x86/pti.rst
+++ b/Documentation/arch/x86/pti.rst
@@ -26,9 +26,9 @@ comments in pti.c).
This approach helps to ensure that side-channel attacks leveraging
the paging structures do not function when PTI is enabled. It can be
-enabled by setting CONFIG_PAGE_TABLE_ISOLATION=y at compile time.
-Once enabled at compile-time, it can be disabled at boot with the
-'nopti' or 'pti=' kernel parameters (see kernel-parameters.txt).
+enabled by setting CONFIG_MITIGATION_PAGE_TABLE_ISOLATION=y at compile
+time. Once enabled at compile-time, it can be disabled at boot with
+the 'nopti' or 'pti=' kernel parameters (see kernel-parameters.txt).
Page Table Management
=====================
diff --git a/Documentation/index.rst b/Documentation/index.rst
index 36e61783437c..9dfdc826618c 100644
--- a/Documentation/index.rst
+++ b/Documentation/index.rst
@@ -113,7 +113,6 @@ to ReStructured Text format, or are simply too old.
:maxdepth: 1
staging/index
- RAS/ras
Translations
diff --git a/Documentation/process/maintainer-tip.rst b/Documentation/process/maintainer-tip.rst
index 08dd0f804410..497bb39727c8 100644
--- a/Documentation/process/maintainer-tip.rst
+++ b/Documentation/process/maintainer-tip.rst
@@ -304,13 +304,15 @@ following tag ordering scheme:
- Reported-by: ``Reporter <reporter@mail>``
+ - Closes: ``URL or Message-ID of the bug report this is fixing``
+
- Originally-by: ``Original author <original-author@mail>``
- Suggested-by: ``Suggester <suggester@mail>``
- Co-developed-by: ``Co-author <co-author@mail>``
- Signed-off: ``Co-author <co-author@mail>``
+ Signed-off-by: ``Co-author <co-author@mail>``
Note, that Co-developed-by and Signed-off-by of the co-author(s) must
come in pairs.
@@ -478,7 +480,7 @@ Multi-line comments::
* Larger multi-line comments should be split into paragraphs.
*/
-No tail comments:
+No tail comments (see below):
Please refrain from using tail comments. Tail comments disturb the
reading flow in almost all contexts, but especially in code::
@@ -499,6 +501,34 @@ No tail comments:
/* This magic initialization needs a comment. Maybe not? */
seed = MAGIC_CONSTANT;
+ Use C++-style tail comments when documenting structs in headers to
+ achieve a more compact layout and better readability::
+
+ // eax
+ u32 x2apic_shift : 5, // Number of bits to shift APIC ID right
+ // for the topology ID at the next level
+ : 27; // Reserved
+ // ebx
+ u32 num_processors : 16, // Number of processors at current level
+ : 16; // Reserved
+
+ versus::
+
+ /* eax */
+ /*
+ * Number of bits to shift APIC ID right for the topology ID
+ * at the next level
+ */
+ u32 x2apic_shift : 5,
+ /* Reserved */
+ : 27;
+
+ /* ebx */
+ /* Number of processors at current level */
+ u32 num_processors : 16,
+ /* Reserved */
+ : 16;
+
Comment the important things:
Comments should be added where the operation is not obvious. Documenting
diff --git a/Documentation/virt/coco/sev-guest.rst b/Documentation/virt/coco/sev-guest.rst
index 68b0d2363af8..e1eaf6a830ce 100644
--- a/Documentation/virt/coco/sev-guest.rst
+++ b/Documentation/virt/coco/sev-guest.rst
@@ -67,6 +67,23 @@ counter (e.g. counter overflow), then -EIO will be returned.
};
};
+The host ioctls are issued to a file descriptor of the /dev/sev device.
+The ioctl accepts the command ID/input structure documented below.
+
+::
+
+ struct sev_issue_cmd {
+ /* Command ID */
+ __u32 cmd;
+
+ /* Command request structure */
+ __u64 data;
+
+ /* Firmware error code on failure (see psp-sev.h) */
+ __u32 error;
+ };
+
+
2.1 SNP_GET_REPORT
------------------
@@ -124,6 +141,41 @@ be updated with the expected value.
See GHCB specification for further detail on how to parse the certificate blob.
+2.4 SNP_PLATFORM_STATUS
+-----------------------
+:Technology: sev-snp
+:Type: hypervisor ioctl cmd
+:Parameters (out): struct sev_user_data_snp_status
+:Returns (out): 0 on success, -negative on error
+
+The SNP_PLATFORM_STATUS command is used to query the SNP platform status. The
+status includes API major, minor version and more. See the SEV-SNP
+specification for further details.
+
+2.5 SNP_COMMIT
+--------------
+:Technology: sev-snp
+:Type: hypervisor ioctl cmd
+:Returns (out): 0 on success, -negative on error
+
+SNP_COMMIT is used to commit the currently installed firmware using the
+SEV-SNP firmware SNP_COMMIT command. This prevents roll-back to a previously
+committed firmware version. This will also update the reported TCB to match
+that of the currently installed firmware.
+
+2.6 SNP_SET_CONFIG
+------------------
+:Technology: sev-snp
+:Type: hypervisor ioctl cmd
+:Parameters (in): struct sev_user_data_snp_config
+:Returns (out): 0 on success, -negative on error
+
+SNP_SET_CONFIG is used to set the system-wide configuration such as
+reported TCB version in the attestation report. The command is similar
+to the SNP_CONFIG command defined in the SEV-SNP spec. The current values of
+the firmware parameters affected by this command can be queried via
+SNP_PLATFORM_STATUS.
+
3. SEV-SNP CPUID Enforcement
============================
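
As an aside (not part of the patch), a hedged sketch of driving the interface
described above: it issues SNP_PLATFORM_STATUS on /dev/sev, assuming the
SEV_ISSUE_CMD ioctl and the structures from <linux/psp-sev.h> as extended by
this series::

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/psp-sev.h>

    int main(void)
    {
            struct sev_user_data_snp_status status;
            struct sev_issue_cmd cmd;
            int fd = open("/dev/sev", O_RDWR);

            if (fd < 0) {
                    perror("/dev/sev");
                    return 1;
            }
            memset(&status, 0, sizeof(status));
            memset(&cmd, 0, sizeof(cmd));
            cmd.cmd = SNP_PLATFORM_STATUS;
            cmd.data = (unsigned long)&status;  /* command request structure */
            if (ioctl(fd, SEV_ISSUE_CMD, &cmd) < 0) {
                    fprintf(stderr, "SNP_PLATFORM_STATUS: fw error 0x%x\n", cmd.error);
                    close(fd);
                    return 1;
            }
            printf("SNP API %d.%d\n", status.api_major, status.api_minor);
            close(fd);
            return 0;
    }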
diff --git a/MAINTAINERS b/MAINTAINERS
index 5574ceaa1a37..8423007e15f3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -897,6 +897,12 @@ Q: https://patchwork.kernel.org/project/linux-rdma/list/
F: drivers/infiniband/hw/efa/
F: include/uapi/rdma/efa-abi.h
+AMD ADDRESS TRANSLATION LIBRARY (ATL)
+M: Yazen Ghannam <Yazen.Ghannam@amd.com>
+L: linux-edac@vger.kernel.org
+S: Supported
+F: drivers/ras/amd/atl/*
+
AMD AXI W1 DRIVER
M: Kris Chaplin <kris.chaplin@amd.com>
R: Thomas Delev <thomas.delev@amd.com>
@@ -7583,7 +7589,6 @@ R: Robert Richter <rric@kernel.org>
L: linux-edac@vger.kernel.org
S: Supported
T: git git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git edac-for-next
-F: Documentation/admin-guide/ras.rst
F: Documentation/driver-api/edac.rst
F: drivers/edac/
F: include/linux/edac.h
@@ -18386,11 +18391,17 @@ M: Tony Luck <tony.luck@intel.com>
M: Borislav Petkov <bp@alien8.de>
L: linux-edac@vger.kernel.org
S: Maintained
-F: Documentation/admin-guide/ras.rst
+F: Documentation/admin-guide/RAS
F: drivers/ras/
F: include/linux/ras.h
F: include/ras/ras_event.h
+RAS FRU MEMORY POISON MANAGER (FMPM)
+M: Yazen Ghannam <Yazen.Ghannam@amd.com>
+L: linux-edac@vger.kernel.org
+S: Maintained
+F: drivers/ras/amd/fmpm.c
+
RC-CORE / LIRC FRAMEWORK
M: Sean Young <sean@mess.org>
L: linux-media@vger.kernel.org
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index 7439b2377df5..8e9dd63b220c 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -467,11 +467,6 @@ smp_prepare_cpus(unsigned int max_cpus)
smp_num_cpus = smp_num_probed;
}
-void
-smp_prepare_boot_cpu(void)
-{
-}
-
int
__cpu_up(unsigned int cpu, struct task_struct *tidle)
{
diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c
index 8d9b188caa27..b2f2c59279a6 100644
--- a/arch/arc/kernel/smp.c
+++ b/arch/arc/kernel/smp.c
@@ -39,11 +39,6 @@ struct plat_smp_ops __weak plat_smp_ops;
/* XXX: per cpu ? Only needed once in early secondary boot */
struct task_struct *secondary_idle_tsk;
-/* Called from start_kernel */
-void __init smp_prepare_boot_cpu(void)
-{
-}
-
static int __init arc_get_cpu_map(const char *name, struct cpumask *cpumask)
{
unsigned long dt_root = of_get_flat_dt_root();
diff --git a/arch/csky/kernel/smp.c b/arch/csky/kernel/smp.c
index 8e42352cbf12..92dbbf3e0205 100644
--- a/arch/csky/kernel/smp.c
+++ b/arch/csky/kernel/smp.c
@@ -152,10 +152,6 @@ void arch_irq_work_raise(void)
}
#endif
-void __init smp_prepare_boot_cpu(void)
-{
-}
-
void __init smp_prepare_cpus(unsigned int max_cpus)
{
}
diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c
index 608884bc3396..65e1fdf9fdb2 100644
--- a/arch/hexagon/kernel/smp.c
+++ b/arch/hexagon/kernel/smp.c
@@ -114,10 +114,6 @@ void send_ipi(const struct cpumask *cpumask, enum ipi_message_type msg)
local_irq_restore(flags);
}
-void __init smp_prepare_boot_cpu(void)
-{
-}
-
/*
* interrupts should already be disabled from the VM
* SP should already be correct; need to set THREADINFO_REG
diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig
index b4d71fea558f..a6f6607efe79 100644
--- a/arch/m68k/configs/amiga_defconfig
+++ b/arch/m68k/configs/amiga_defconfig
@@ -336,7 +336,6 @@ CONFIG_ATA=y
CONFIG_PATA_GAYLE=y
CONFIG_PATA_BUDDHA=y
CONFIG_MD=y
-CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -579,12 +578,10 @@ CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_ADIANTUM=m
CONFIG_CRYPTO_ARC4=m
-CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_CTS=m
CONFIG_CRYPTO_HCTR2=m
CONFIG_CRYPTO_KEYWRAP=m
CONFIG_CRYPTO_LRW=m
-CONFIG_CRYPTO_OFB=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig
index 682d8cd3dd3c..0ca1f9f930bc 100644
--- a/arch/m68k/configs/apollo_defconfig
+++ b/arch/m68k/configs/apollo_defconfig
@@ -316,7 +316,6 @@ CONFIG_SCSI_SAS_ATTRS=m
CONFIG_ISCSI_TCP=m
CONFIG_ISCSI_BOOT_SYSFS=m
CONFIG_MD=y
-CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -536,12 +535,10 @@ CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_ADIANTUM=m
CONFIG_CRYPTO_ARC4=m
-CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_CTS=m
CONFIG_CRYPTO_HCTR2=m
CONFIG_CRYPTO_KEYWRAP=m
CONFIG_CRYPTO_LRW=m
-CONFIG_CRYPTO_OFB=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig
index 15259ced8463..be030659d8d7 100644
--- a/arch/m68k/configs/atari_defconfig
+++ b/arch/m68k/configs/atari_defconfig
@@ -331,7 +331,6 @@ CONFIG_ATA=y
# CONFIG_ATA_BMDMA is not set
CONFIG_PATA_FALCON=y
CONFIG_MD=y
-CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -556,12 +555,10 @@ CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_ADIANTUM=m
CONFIG_CRYPTO_ARC4=m
-CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_CTS=m
CONFIG_CRYPTO_HCTR2=m
CONFIG_CRYPTO_KEYWRAP=m
CONFIG_CRYPTO_LRW=m
-CONFIG_CRYPTO_OFB=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig
index 7395c12caef6..ad8f81fbb630 100644
--- a/arch/m68k/configs/bvme6000_defconfig
+++ b/arch/m68k/configs/bvme6000_defconfig
@@ -314,7 +314,6 @@ CONFIG_ISCSI_TCP=m
CONFIG_ISCSI_BOOT_SYSFS=m
CONFIG_BVME6000_SCSI=y
CONFIG_MD=y
-CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -528,12 +527,10 @@ CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_ADIANTUM=m
CONFIG_CRYPTO_ARC4=m
-CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_CTS=m
CONFIG_CRYPTO_HCTR2=m
CONFIG_CRYPTO_KEYWRAP=m
CONFIG_CRYPTO_LRW=m
-CONFIG_CRYPTO_OFB=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig
index 92506bc7f78d..ff253b6accec 100644
--- a/arch/m68k/configs/hp300_defconfig
+++ b/arch/m68k/configs/hp300_defconfig
@@ -315,7 +315,6 @@ CONFIG_SCSI_SAS_ATTRS=m
CONFIG_ISCSI_TCP=m
CONFIG_ISCSI_BOOT_SYSFS=m
CONFIG_MD=y
-CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -538,12 +537,10 @@ CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_ADIANTUM=m
CONFIG_CRYPTO_ARC4=m
-CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_CTS=m
CONFIG_CRYPTO_HCTR2=m
CONFIG_CRYPTO_KEYWRAP=m
CONFIG_CRYPTO_LRW=m
-CONFIG_CRYPTO_OFB=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index 144bc8c0d8b5..f92b866620a7 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -320,7 +320,6 @@ CONFIG_ATA=y
# CONFIG_ATA_BMDMA is not set
CONFIG_PATA_PLATFORM=y
CONFIG_MD=y
-CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -555,12 +554,10 @@ CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_ADIANTUM=m
CONFIG_CRYPTO_ARC4=m
-CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_CTS=m
CONFIG_CRYPTO_HCTR2=m
CONFIG_CRYPTO_KEYWRAP=m
CONFIG_CRYPTO_LRW=m
-CONFIG_CRYPTO_OFB=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig
index 07594c729497..bd813beaa8a7 100644
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@@ -363,7 +363,6 @@ CONFIG_PATA_GAYLE=y
CONFIG_PATA_BUDDHA=y
CONFIG_PATA_PLATFORM=y
CONFIG_MD=y
-CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -641,12 +640,10 @@ CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_ADIANTUM=m
CONFIG_CRYPTO_ARC4=m
-CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_CTS=m
CONFIG_CRYPTO_HCTR2=m
CONFIG_CRYPTO_KEYWRAP=m
CONFIG_CRYPTO_LRW=m
-CONFIG_CRYPTO_OFB=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig
index c34de6c1de20..2237ee0fe433 100644
--- a/arch/m68k/configs/mvme147_defconfig
+++ b/arch/m68k/configs/mvme147_defconfig
@@ -313,7 +313,6 @@ CONFIG_ISCSI_TCP=m
CONFIG_ISCSI_BOOT_SYSFS=m
CONFIG_MVME147_SCSI=y
CONFIG_MD=y
-CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -527,12 +526,10 @@ CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_ADIANTUM=m
CONFIG_CRYPTO_ARC4=m
-CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_CTS=m
CONFIG_CRYPTO_HCTR2=m
CONFIG_CRYPTO_KEYWRAP=m
CONFIG_CRYPTO_LRW=m
-CONFIG_CRYPTO_OFB=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig
index 83bc029d1f33..afb5aa9c5012 100644
--- a/arch/m68k/configs/mvme16x_defconfig
+++ b/arch/m68k/configs/mvme16x_defconfig
@@ -314,7 +314,6 @@ CONFIG_ISCSI_TCP=m
CONFIG_ISCSI_BOOT_SYSFS=m
CONFIG_MVME16x_SCSI=y
CONFIG_MD=y
-CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -528,12 +527,10 @@ CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_ADIANTUM=m
CONFIG_CRYPTO_ARC4=m
-CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_CTS=m
CONFIG_CRYPTO_HCTR2=m
CONFIG_CRYPTO_KEYWRAP=m
CONFIG_CRYPTO_LRW=m
-CONFIG_CRYPTO_OFB=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig
index 4f551dac2ed7..e40f7a308966 100644
--- a/arch/m68k/configs/q40_defconfig
+++ b/arch/m68k/configs/q40_defconfig
@@ -320,7 +320,6 @@ CONFIG_ATA=y
# CONFIG_ATA_BMDMA is not set
CONFIG_PATA_FALCON=y
CONFIG_MD=y
-CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -545,12 +544,10 @@ CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_ADIANTUM=m
CONFIG_CRYPTO_ARC4=m
-CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_CTS=m
CONFIG_CRYPTO_HCTR2=m
CONFIG_CRYPTO_KEYWRAP=m
CONFIG_CRYPTO_LRW=m
-CONFIG_CRYPTO_OFB=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig
index b1bf01182bd3..4df397c0395f 100644
--- a/arch/m68k/configs/sun3_defconfig
+++ b/arch/m68k/configs/sun3_defconfig
@@ -310,7 +310,6 @@ CONFIG_ISCSI_TCP=m
CONFIG_ISCSI_BOOT_SYSFS=m
CONFIG_SUN3_SCSI=y
CONFIG_MD=y
-CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -526,12 +525,10 @@ CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_ADIANTUM=m
CONFIG_CRYPTO_ARC4=m
-CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_CTS=m
CONFIG_CRYPTO_HCTR2=m
CONFIG_CRYPTO_KEYWRAP=m
CONFIG_CRYPTO_LRW=m
-CONFIG_CRYPTO_OFB=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig
index 5c9a3f71f036..aa7719b3947f 100644
--- a/arch/m68k/configs/sun3x_defconfig
+++ b/arch/m68k/configs/sun3x_defconfig
@@ -311,7 +311,6 @@ CONFIG_ISCSI_TCP=m
CONFIG_ISCSI_BOOT_SYSFS=m
CONFIG_SUN3X_ESP=y
CONFIG_MD=y
-CONFIG_MD_LINEAR=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_UNSTRIPED=m
CONFIG_DM_CRYPT=m
@@ -526,12 +525,10 @@ CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_ADIANTUM=m
CONFIG_CRYPTO_ARC4=m
-CONFIG_CRYPTO_CFB=m
CONFIG_CRYPTO_CTS=m
CONFIG_CRYPTO_HCTR2=m
CONFIG_CRYPTO_KEYWRAP=m
CONFIG_CRYPTO_LRW=m
-CONFIG_CRYPTO_OFB=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
CONFIG_CRYPTO_AEGIS128=m
diff --git a/arch/openrisc/kernel/smp.c b/arch/openrisc/kernel/smp.c
index 1c5a2d71d675..86da4bc5ee0b 100644
--- a/arch/openrisc/kernel/smp.c
+++ b/arch/openrisc/kernel/smp.c
@@ -57,10 +57,6 @@ static void boot_secondary(unsigned int cpu, struct task_struct *idle)
spin_unlock(&boot_lock);
}
-void __init smp_prepare_boot_cpu(void)
-{
-}
-
void __init smp_init_cpus(void)
{
struct device_node *cpu;
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 693334c20d07..a60e4139214b 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -984,7 +984,7 @@ static bool shared_caches __ro_after_init;
/* cpumask of CPUs with asymmetric SMT dependency */
static int powerpc_smt_flags(void)
{
- int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
+ int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_LLC;
if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
@@ -1010,9 +1010,9 @@ static __ro_after_init DEFINE_STATIC_KEY_FALSE(splpar_asym_pack);
static int powerpc_shared_cache_flags(void)
{
if (static_branch_unlikely(&splpar_asym_pack))
- return SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING;
+ return SD_SHARE_LLC | SD_ASYM_PACKING;
- return SD_SHARE_PKG_RESOURCES;
+ return SD_SHARE_LLC;
}
static int powerpc_shared_proc_flags(void)
diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c
index 519b6bd946e5..c4ed7d977f57 100644
--- a/arch/riscv/kernel/smpboot.c
+++ b/arch/riscv/kernel/smpboot.c
@@ -42,10 +42,6 @@
static DECLARE_COMPLETION(cpu_running);
-void __init smp_prepare_boot_cpu(void)
-{
-}
-
void __init smp_prepare_cpus(unsigned int max_cpus)
{
int cpuid;
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index fe565f3a3a91..2dc722fb31b4 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -127,6 +127,7 @@ config S390
select ARCH_WANT_DEFAULT_BPF_JIT
select ARCH_WANT_IPC_PARSE_VERSION
select ARCH_WANT_KERNEL_PMD_MKWRITE
+ select ARCH_WANT_LD_ORPHAN_WARN
select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
select BUILDTIME_TABLE_SORT
select CLONE_BACKWARDS2
@@ -448,7 +449,7 @@ config COMPAT
select COMPAT_OLD_SIGACTION
select HAVE_UID16
depends on MULTIUSER
- depends on !CC_IS_CLANG
+ depends on !CC_IS_CLANG && !LD_IS_LLD
help
Select this option if you want to enable your system kernel to
handle system-calls from ELF binaries for 31 bit ESA. This option
@@ -582,14 +583,23 @@ config RELOCATABLE
help
This builds a kernel image that retains relocation information
so it can be loaded at an arbitrary address.
- The kernel is linked as a position-independent executable (PIE)
- and contains dynamic relocations which are processed early in the
- bootup process.
The relocations make the kernel image about 15% larger (compressed
10%), but are discarded at runtime.
Note: this option exists only for documentation purposes, please do
not remove it.
+config PIE_BUILD
+ def_bool CC_IS_CLANG && !$(cc-option,-munaligned-symbols)
+ help
+ If the compiler is unable to generate code that can manage unaligned
+ symbols, the kernel is linked as a position-independent executable
+ (PIE) and includes dynamic relocations that are processed early
+ during bootup.
+
+ For kpatch functionality, it is recommended to build the kernel
+ without the PIE_BUILD option. PIE_BUILD is only enabled when the
+ compiler lacks proper support for handling unaligned symbols.
+
config RANDOMIZE_BASE
bool "Randomize the address of the kernel image (KASLR)"
default y
diff --git a/arch/s390/Makefile b/arch/s390/Makefile
index 73873e451686..2a58e1864931 100644
--- a/arch/s390/Makefile
+++ b/arch/s390/Makefile
@@ -14,8 +14,14 @@ KBUILD_AFLAGS_MODULE += -fPIC
KBUILD_CFLAGS_MODULE += -fPIC
KBUILD_AFLAGS += -m64
KBUILD_CFLAGS += -m64
+ifdef CONFIG_PIE_BUILD
KBUILD_CFLAGS += -fPIE
-LDFLAGS_vmlinux := -pie
+LDFLAGS_vmlinux := -pie -z notext
+else
+KBUILD_CFLAGS += $(call cc-option,-munaligned-symbols,)
+LDFLAGS_vmlinux := --emit-relocs --discard-none
+extra_tools := relocs
+endif
aflags_dwarf := -Wa,-gdwarf-2
KBUILD_AFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -D__ASSEMBLY__
ifndef CONFIG_AS_IS_LLVM
@@ -143,7 +149,7 @@ archheaders:
archprepare:
$(Q)$(MAKE) $(build)=$(syscalls) kapi
- $(Q)$(MAKE) $(build)=$(tools) kapi
+ $(Q)$(MAKE) $(build)=$(tools) kapi $(extra_tools)
ifeq ($(KBUILD_EXTMOD),)
# We need to generate vdso-offsets.h before compiling certain files in kernel/.
# In order to do that, we should use the archprepare target, but we can't since
diff --git a/arch/s390/boot/.gitignore b/arch/s390/boot/.gitignore
index f56591bc0897..f5ef099e2fd3 100644
--- a/arch/s390/boot/.gitignore
+++ b/arch/s390/boot/.gitignore
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
image
bzImage
+relocs.S
section_cmp.*
vmlinux
vmlinux.lds
diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile
index c7c81e5f9218..294f08a8811a 100644
--- a/arch/s390/boot/Makefile
+++ b/arch/s390/boot/Makefile
@@ -37,7 +37,8 @@ CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char
obj-y := head.o als.o startup.o physmem_info.o ipl_parm.o ipl_report.o vmem.o
obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o
-obj-y += version.o pgm_check_info.o ctype.o ipl_data.o machine_kexec_reloc.o
+obj-y += version.o pgm_check_info.o ctype.o ipl_data.o
+obj-y += $(if $(CONFIG_PIE_BUILD),machine_kexec_reloc.o,relocs.o)
obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o
obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
obj-y += $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) info.o
@@ -48,6 +49,9 @@ targets := bzImage section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y
targets += vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2
targets += vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4
targets += vmlinux.bin.zst info.bin syms.bin vmlinux.syms $(obj-all)
+ifndef CONFIG_PIE_BUILD
+targets += relocs.S
+endif
OBJECTS := $(addprefix $(obj)/,$(obj-y))
OBJECTS_ALL := $(addprefix $(obj)/,$(obj-all))
@@ -56,9 +60,9 @@ clean-files += vmlinux.map
quiet_cmd_section_cmp = SECTCMP $*
define cmd_section_cmp
- s1=`$(OBJDUMP) -t -j "$*" "$<" | sort | \
+ s1=`$(OBJDUMP) -t "$<" | grep "\s$*\s\+" | sort | \
sed -n "/0000000000000000/! s/.*\s$*\s\+//p" | sha256sum`; \
- s2=`$(OBJDUMP) -t -j "$*" "$(word 2,$^)" | sort | \
+ s2=`$(OBJDUMP) -t "$(word 2,$^)" | grep "\s$*\s\+" | sort | \
sed -n "/0000000000000000/! s/.*\s$*\s\+//p" | sha256sum`; \
if [ "$$s1" != "$$s2" ]; then \
echo "error: section $* differs between $< and $(word 2,$^)" >&2; \
@@ -73,11 +77,12 @@ $(obj)/bzImage: $(obj)/vmlinux $(obj)/section_cmp.boot.data $(obj)/section_cmp.b
$(obj)/section_cmp%: vmlinux $(obj)/vmlinux FORCE
$(call if_changed,section_cmp)
-LDFLAGS_vmlinux := --oformat $(LD_BFD) -e startup $(if $(CONFIG_VMLINUX_MAP),-Map=$(obj)/vmlinux.map) --build-id=sha1 -T
+LDFLAGS_vmlinux-$(CONFIG_LD_ORPHAN_WARN) := --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL)
+LDFLAGS_vmlinux := $(LDFLAGS_vmlinux-y) --oformat $(LD_BFD) -e startup $(if $(CONFIG_VMLINUX_MAP),-Map=$(obj)/vmlinux.map) --build-id=sha1 -T
$(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS_ALL) FORCE
$(call if_changed,ld)
-LDFLAGS_vmlinux.syms := --oformat $(LD_BFD) -e startup -T
+LDFLAGS_vmlinux.syms := $(LDFLAGS_vmlinux-y) --oformat $(LD_BFD) -e startup -T
$(obj)/vmlinux.syms: $(obj)/vmlinux.lds $(OBJECTS) FORCE
$(call if_changed,ld)
@@ -93,7 +98,7 @@ OBJCOPYFLAGS_syms.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .
$(obj)/syms.o: $(obj)/syms.bin FORCE
$(call if_changed,objcopy)
-OBJCOPYFLAGS_info.bin := -O binary --only-section=.vmlinux.info --set-section-flags .vmlinux.info=load
+OBJCOPYFLAGS_info.bin := -O binary --only-section=.vmlinux.info --set-section-flags .vmlinux.info=alloc,load
$(obj)/info.bin: vmlinux FORCE
$(call if_changed,objcopy)
@@ -105,6 +110,14 @@ OBJCOPYFLAGS_vmlinux.bin := -O binary --remove-section=.comment --remove-section
$(obj)/vmlinux.bin: vmlinux FORCE
$(call if_changed,objcopy)
+ifndef CONFIG_PIE_BUILD
+CMD_RELOCS=arch/s390/tools/relocs
+quiet_cmd_relocs = RELOCS $@
+ cmd_relocs = $(CMD_RELOCS) $< > $@
+$(obj)/relocs.S: vmlinux FORCE
+ $(call if_changed,relocs)
+endif
+
suffix-$(CONFIG_KERNEL_GZIP) := .gz
suffix-$(CONFIG_KERNEL_BZIP2) := .bz2
suffix-$(CONFIG_KERNEL_LZ4) := .lz4
diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h
index 222c6886acf6..567d60f78bbc 100644
--- a/arch/s390/boot/boot.h
+++ b/arch/s390/boot/boot.h
@@ -25,9 +25,14 @@ struct vmlinux_info {
unsigned long bootdata_size;
unsigned long bootdata_preserved_off;
unsigned long bootdata_preserved_size;
+#ifdef CONFIG_PIE_BUILD
unsigned long dynsym_start;
unsigned long rela_dyn_start;
unsigned long rela_dyn_end;
+#else
+ unsigned long got_start;
+ unsigned long got_end;
+#endif
unsigned long amode31_size;
unsigned long init_mm_off;
unsigned long swapper_pg_dir_off;
@@ -83,6 +88,7 @@ extern unsigned long vmalloc_size;
extern int vmalloc_size_set;
extern char __boot_data_start[], __boot_data_end[];
extern char __boot_data_preserved_start[], __boot_data_preserved_end[];
+extern char __vmlinux_relocs_64_start[], __vmlinux_relocs_64_end[];
extern char _decompressor_syms_start[], _decompressor_syms_end[];
extern char _stack_start[], _stack_end[];
extern char _end[], _decompressor_end[];
diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c
index 9cc76e631759..6cf89314209a 100644
--- a/arch/s390/boot/startup.c
+++ b/arch/s390/boot/startup.c
@@ -141,7 +141,8 @@ static void copy_bootdata(void)
memcpy((void *)vmlinux.bootdata_preserved_off, __boot_data_preserved_start, vmlinux.bootdata_preserved_size);
}
-static void handle_relocs(unsigned long offset)
+#ifdef CONFIG_PIE_BUILD
+static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, unsigned long offset)
{
Elf64_Rela *rela_start, *rela_end, *rela;
int r_type, r_sym, rc;
@@ -172,6 +173,54 @@ static void handle_relocs(unsigned long offset)
}
}
+static void kaslr_adjust_got(unsigned long offset) {}
+static void rescue_relocs(void) {}
+static void free_relocs(void) {}
+#else
+static int *vmlinux_relocs_64_start;
+static int *vmlinux_relocs_64_end;
+
+static void rescue_relocs(void)
+{
+ unsigned long size = __vmlinux_relocs_64_end - __vmlinux_relocs_64_start;
+
+ vmlinux_relocs_64_start = (void *)physmem_alloc_top_down(RR_RELOC, size, 0);
+ vmlinux_relocs_64_end = (void *)vmlinux_relocs_64_start + size;
+ memmove(vmlinux_relocs_64_start, __vmlinux_relocs_64_start, size);
+}
+
+static void free_relocs(void)
+{
+ physmem_free(RR_RELOC);
+}
+
+static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, unsigned long offset)
+{
+ int *reloc;
+ long loc;
+
+ /* Adjust R_390_64 relocations */
+ for (reloc = vmlinux_relocs_64_start; reloc < vmlinux_relocs_64_end; reloc++) {
+ loc = (long)*reloc + offset;
+ if (loc < min_addr || loc > max_addr)
+ error("64-bit relocation outside of kernel!\n");
+ *(u64 *)loc += offset;
+ }
+}
+
+static void kaslr_adjust_got(unsigned long offset)
+{
+ u64 *entry;
+
+ /*
+ * Even without -fPIE, Clang still uses a global offset table for some
+ * reason. Adjust the GOT entries.
+ */
+ for (entry = (u64 *)vmlinux.got_start; entry < (u64 *)vmlinux.got_end; entry++)
+ *entry += offset;
+}
+#endif
+
/*
* Merge information from several sources into a single ident_map_size value.
* "ident_map_size" represents the upper limit of physical memory we may ever
@@ -299,14 +348,19 @@ static void setup_vmalloc_size(void)
vmalloc_size = max(size, vmalloc_size);
}
-static void offset_vmlinux_info(unsigned long offset)
+static void kaslr_adjust_vmlinux_info(unsigned long offset)
{
*(unsigned long *)(&vmlinux.entry) += offset;
vmlinux.bootdata_off += offset;
vmlinux.bootdata_preserved_off += offset;
+#ifdef CONFIG_PIE_BUILD
vmlinux.rela_dyn_start += offset;
vmlinux.rela_dyn_end += offset;
vmlinux.dynsym_start += offset;
+#else
+ vmlinux.got_start += offset;
+ vmlinux.got_end += offset;
+#endif
vmlinux.init_mm_off += offset;
vmlinux.swapper_pg_dir_off += offset;
vmlinux.invalid_pg_dir_off += offset;
@@ -361,6 +415,7 @@ void startup_kernel(void)
detect_physmem_online_ranges(max_physmem_end);
save_ipl_cert_comp_list();
rescue_initrd(safe_addr, ident_map_size);
+ rescue_relocs();
if (kaslr_enabled()) {
vmlinux_lma = randomize_within_range(vmlinux.image_size + vmlinux.bss_size,
@@ -368,7 +423,7 @@ void startup_kernel(void)
ident_map_size);
if (vmlinux_lma) {
__kaslr_offset = vmlinux_lma - vmlinux.default_lma;
- offset_vmlinux_info(__kaslr_offset);
+ kaslr_adjust_vmlinux_info(__kaslr_offset);
}
}
vmlinux_lma = vmlinux_lma ?: vmlinux.default_lma;
@@ -393,18 +448,20 @@ void startup_kernel(void)
/*
* The order of the following operations is important:
*
- * - handle_relocs() must follow clear_bss_section() to establish static
- * memory references to data in .bss to be used by setup_vmem()
+ * - kaslr_adjust_relocs() must follow clear_bss_section() to establish
+ * static memory references to data in .bss to be used by setup_vmem()
* (i.e init_mm.pgd)
*
- * - setup_vmem() must follow handle_relocs() to be able using
+ * - setup_vmem() must follow kaslr_adjust_relocs() to be able to use
* static memory references to data in .bss (i.e init_mm.pgd)
*
- * - copy_bootdata() must follow setup_vmem() to propagate changes to
- * bootdata made by setup_vmem()
+ * - copy_bootdata() must follow setup_vmem() to propagate changes
+ * to bootdata made by setup_vmem()
*/
clear_bss_section(vmlinux_lma);
- handle_relocs(__kaslr_offset);
+ kaslr_adjust_relocs(vmlinux_lma, vmlinux_lma + vmlinux.image_size, __kaslr_offset);
+ kaslr_adjust_got(__kaslr_offset);
+ free_relocs();
setup_vmem(asce_limit);
copy_bootdata();
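
kaslr_adjust_relocs() above is plain pointer patching: each 32-bit entry names the link-time address of a 64-bit slot, and both the slot's location and the pointer value stored in it move by the KASLR offset. A self-contained user-space illustration of that arithmetic, with hypothetical addresses:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint8_t image[64];			/* stand-in for the moved kernel image */
	uint64_t link_addr = 0x100000;		/* vmlinux.default_lma */
	uint64_t offset = 0x200000;		/* __kaslr_offset */
	uint32_t reloc = 0x100010;		/* one .vmlinux.relocs_64 entry */
	uint64_t *slot;

	memset(image, 0, sizeof(image));
	/* the slot holds a link-time pointer, e.g. into .bss */
	slot = (uint64_t *)&image[reloc - link_addr];
	*slot = 0x100040;
	/* kaslr_adjust_relocs(): loc = *reloc + offset; *(u64 *)loc += offset */
	*slot += offset;
	printf("patched pointer: 0x%llx\n", (unsigned long long)*slot);	/* 0x300040 */
	return 0;
}
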
diff --git a/arch/s390/boot/vmlinux.lds.S b/arch/s390/boot/vmlinux.lds.S
index 389df0e0d9e5..3d7ea585ab99 100644
--- a/arch/s390/boot/vmlinux.lds.S
+++ b/arch/s390/boot/vmlinux.lds.S
@@ -31,6 +31,7 @@ SECTIONS
_text = .; /* Text */
*(.text)
*(.text.*)
+ INIT_TEXT
_etext = . ;
}
.rodata : {
@@ -39,6 +40,9 @@ SECTIONS
*(.rodata.*)
_erodata = . ;
}
+ .got : {
+ *(.got)
+ }
NOTES
.data : {
_data = . ;
@@ -106,6 +110,24 @@ SECTIONS
_compressed_end = .;
}
+#ifndef CONFIG_PIE_BUILD
+ /*
+ * When the kernel is built with CONFIG_KERNEL_UNCOMPRESSED, the entire
+ * uncompressed vmlinux.bin is positioned in the bzImage decompressor
+ * image at the default kernel LMA of 0x100000, enabling it to be
+ * executed in-place. However, the size of .vmlinux.relocs could be
+ * large enough to cause an overlap with the uncompressed kernel at the
+ * address 0x100000. To address this issue, .vmlinux.relocs is
+ * positioned after the .rodata.compressed section.
+ */
+ . = ALIGN(4);
+ .vmlinux.relocs : {
+ __vmlinux_relocs_64_start = .;
+ *(.vmlinux.relocs_64)
+ __vmlinux_relocs_64_end = .;
+ }
+#endif
+
#define SB_TRAILER_SIZE 32
/* Trailer needed for Secure Boot */
. += SB_TRAILER_SIZE; /* make sure .sb.trailer does not overwrite the previous section */
@@ -118,8 +140,34 @@ SECTIONS
}
_end = .;
+ DWARF_DEBUG
+ ELF_DETAILS
+
+ /*
+ * Make sure that the .got.plt is either completely empty or it
+ * contains only the three reserved double words.
+ */
+ .got.plt : {
+ *(.got.plt)
+ }
+ ASSERT(SIZEOF(.got.plt) == 0 || SIZEOF(.got.plt) == 0x18, "Unexpected GOT/PLT entries detected!")
+
+ /*
+ * Sections that should stay zero sized, which is safer to
+ * explicitly check instead of blindly discarding.
+ */
+ .plt : {
+ *(.plt) *(.plt.*) *(.iplt) *(.igot .igot.plt)
+ }
+ ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!")
+ .rela.dyn : {
+ *(.rela.*) *(.rela_*)
+ }
+ ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!")
+
/* Sections to be discarded */
/DISCARD/ : {
+ COMMON_DISCARDS
*(.eh_frame)
*(__ex_table)
*(*__ksymtab*)
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index 06756bad5e30..4032e6e136ac 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -824,6 +824,8 @@ CONFIG_TEST_LOCKUP=m
CONFIG_DEBUG_PREEMPT=y
CONFIG_PROVE_LOCKING=y
CONFIG_LOCK_STAT=y
+CONFIG_LOCKDEP_BITS=16
+CONFIG_LOCKDEP_CHAINS_BITS=17
CONFIG_DEBUG_ATOMIC_SLEEP=y
CONFIG_DEBUG_LOCKING_API_SELFTESTS=y
CONFIG_DEBUG_IRQFLAGS=y
diff --git a/arch/s390/crypto/chacha-glue.c b/arch/s390/crypto/chacha-glue.c
index ed9959e6f714..f8b0c52e77a4 100644
--- a/arch/s390/crypto/chacha-glue.c
+++ b/arch/s390/crypto/chacha-glue.c
@@ -15,14 +15,14 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sizes.h>
-#include <asm/fpu/api.h>
+#include <asm/fpu.h>
#include "chacha-s390.h"
static void chacha20_crypt_s390(u32 *state, u8 *dst, const u8 *src,
unsigned int nbytes, const u32 *key,
u32 *counter)
{
- struct kernel_fpu vxstate;
+ DECLARE_KERNEL_FPU_ONSTACK32(vxstate);
kernel_fpu_begin(&vxstate, KERNEL_VXR);
chacha20_vx(dst, src, nbytes, key, counter);
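
The pattern above generalizes: DECLARE_KERNEL_FPU_ONSTACK32() sizes the on-stack save area for the full KERNEL_VXR range. A caller that only touches the lower half of the vector registers can get away with the 16-entry variant; a minimal sketch, with an illustrative function that is not part of this patch:

#include <asm/fpu.h>

/* Sketch: XOR one 16-byte block using V0/V1 only, so a 16-entry save
 * area and KERNEL_VXR_LOW are sufficient.
 */
static void xor_block_vx(u8 *dst, const u8 *src)
{
	DECLARE_KERNEL_FPU_ONSTACK16(vxstate);

	kernel_fpu_begin(&vxstate, KERNEL_VXR_LOW);
	fpu_vl(0, src);
	fpu_vl(1, dst);
	fpu_vx(1, 0, 1);	/* V1 ^= V0 */
	fpu_vst(1, dst);
	kernel_fpu_end(&vxstate, KERNEL_VXR_LOW);
}
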
diff --git a/arch/s390/crypto/chacha-s390.S b/arch/s390/crypto/chacha-s390.S
index 37cb63f25b17..63f3102678c0 100644
--- a/arch/s390/crypto/chacha-s390.S
+++ b/arch/s390/crypto/chacha-s390.S
@@ -8,7 +8,7 @@
#include <linux/linkage.h>
#include <asm/nospec-insn.h>
-#include <asm/vx-insn.h>
+#include <asm/fpu-insn.h>
#define SP %r15
#define FRAME (16 * 8 + 4 * 8)
diff --git a/arch/s390/crypto/crc32-vx.c b/arch/s390/crypto/crc32-vx.c
index 017143e9cef7..74f17c905d12 100644
--- a/arch/s390/crypto/crc32-vx.c
+++ b/arch/s390/crypto/crc32-vx.c
@@ -13,8 +13,8 @@
#include <linux/cpufeature.h>
#include <linux/crc32.h>
#include <crypto/internal/hash.h>
-#include <asm/fpu/api.h>
-
+#include <asm/fpu.h>
+#include "crc32-vx.h"
#define CRC32_BLOCK_SIZE 1
#define CRC32_DIGEST_SIZE 4
@@ -31,11 +31,6 @@ struct crc_desc_ctx {
u32 crc;
};
-/* Prototypes for functions in assembly files */
-u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
-u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
-u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
-
/*
* DEFINE_CRC32_VX() - Define a CRC-32 function using the vector extension
*
@@ -49,8 +44,8 @@ u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
static u32 __pure ___fname(u32 crc, \
unsigned char const *data, size_t datalen) \
{ \
- struct kernel_fpu vxstate; \
unsigned long prealign, aligned, remaining; \
+ DECLARE_KERNEL_FPU_ONSTACK16(vxstate); \
\
if (datalen < VX_MIN_LEN + VX_ALIGN_MASK) \
return ___crc32_sw(crc, data, datalen); \
diff --git a/arch/s390/crypto/crc32-vx.h b/arch/s390/crypto/crc32-vx.h
new file mode 100644
index 000000000000..652c96e1a822
--- /dev/null
+++ b/arch/s390/crypto/crc32-vx.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _CRC32_VX_S390_H
+#define _CRC32_VX_S390_H
+
+#include <linux/types.h>
+
+u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
+u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
+u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
+
+#endif /* _CRC32_VX_S390_H */
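
These primitives are wrapped by DEFINE_CRC32_VX() in crc32-vx.c, which also handles prealignment. A simplified sketch of such a caller, assuming the generic crc32_le() from <linux/crc32.h> as fallback and ignoring the alignment handling the real macro performs:

#include <linux/crc32.h>
#include <asm/fpu.h>
#include "crc32-vx.h"

/* Sketch: use the vector primitive for full 64-byte multiples and let
 * the generic implementation handle short buffers and the tail.
 */
static u32 crc32_le_vx(u32 crc, const u8 *data, size_t len)
{
	size_t aligned = len & ~(size_t)63;
	DECLARE_KERNEL_FPU_ONSTACK16(vxstate);

	if (len < 64)
		return crc32_le(crc, data, len);
	kernel_fpu_begin(&vxstate, KERNEL_VXR_LOW);
	crc = crc32_le_vgfm_16(crc, data, aligned);
	kernel_fpu_end(&vxstate, KERNEL_VXR_LOW);
	return crc32_le(crc, data + aligned, len & 63);
}
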
diff --git a/arch/s390/crypto/crc32be-vx.S b/arch/s390/crypto/crc32be-vx.c
index 34ee47926891..fed7c9c70d05 100644
--- a/arch/s390/crypto/crc32be-vx.S
+++ b/arch/s390/crypto/crc32be-vx.c
@@ -12,20 +12,17 @@
* Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
*/
-#include <linux/linkage.h>
-#include <asm/nospec-insn.h>
-#include <asm/vx-insn.h>
+#include <linux/types.h>
+#include <asm/fpu.h>
+#include "crc32-vx.h"
/* Vector register range containing CRC-32 constants */
-#define CONST_R1R2 %v9
-#define CONST_R3R4 %v10
-#define CONST_R5 %v11
-#define CONST_R6 %v12
-#define CONST_RU_POLY %v13
-#define CONST_CRC_POLY %v14
-
- .data
- .balign 8
+#define CONST_R1R2 9
+#define CONST_R3R4 10
+#define CONST_R5 11
+#define CONST_R6 12
+#define CONST_RU_POLY 13
+#define CONST_CRC_POLY 14
/*
* The CRC-32 constant block contains reduction constants to fold and
@@ -58,105 +55,74 @@
* P'(x) = 0xEDB88320
*/
-SYM_DATA_START_LOCAL(constants_CRC_32_BE)
- .quad 0x08833794c, 0x0e6228b11 # R1, R2
- .quad 0x0c5b9cd4c, 0x0e8a45605 # R3, R4
- .quad 0x0f200aa66, 1 << 32 # R5, x32
- .quad 0x0490d678d, 1 # R6, 1
- .quad 0x104d101df, 0 # u
- .quad 0x104C11DB7, 0 # P(x)
-SYM_DATA_END(constants_CRC_32_BE)
-
- .previous
-
- GEN_BR_THUNK %r14
-
- .text
-/*
- * The CRC-32 function(s) use these calling conventions:
- *
- * Parameters:
- *
- * %r2: Initial CRC value, typically ~0; and final CRC (return) value.
- * %r3: Input buffer pointer, performance might be improved if the
- * buffer is on a doubleword boundary.
- * %r4: Length of the buffer, must be 64 bytes or greater.
+static unsigned long constants_CRC_32_BE[] = {
+ 0x08833794c, 0x0e6228b11, /* R1, R2 */
+ 0x0c5b9cd4c, 0x0e8a45605, /* R3, R4 */
+ 0x0f200aa66, 1UL << 32, /* R5, x32 */
+ 0x0490d678d, 1, /* R6, 1 */
+ 0x104d101df, 0, /* u */
+ 0x104C11DB7, 0, /* P(x) */
+};
+
+/**
+ * crc32_be_vgfm_16 - Compute CRC-32 (BE variant) with vector registers
+ * @crc: Initial CRC value, typically ~0.
+ * @buf: Input buffer pointer, performance might be improved if the
+ * buffer is on a doubleword boundary.
+ * @size: Size of the buffer, must be 64 bytes or greater.
*
* Register usage:
- *
- * %r5: CRC-32 constant pool base pointer.
* V0: Initial CRC value and intermediate constants and results.
* V1..V4: Data for CRC computation.
* V5..V8: Next data chunks that are fetched from the input buffer.
- *
* V9..V14: CRC-32 constants.
*/
-SYM_FUNC_START(crc32_be_vgfm_16)
+u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size)
+{
/* Load CRC-32 constants */
- larl %r5,constants_CRC_32_BE
- VLM CONST_R1R2,CONST_CRC_POLY,0,%r5
+ fpu_vlm(CONST_R1R2, CONST_CRC_POLY, &constants_CRC_32_BE);
+ fpu_vzero(0);
/* Load the initial CRC value into the leftmost word of V0. */
- VZERO %v0
- VLVGF %v0,%r2,0
+ fpu_vlvgf(0, crc, 0);
/* Load a 64-byte data chunk and XOR with CRC */
- VLM %v1,%v4,0,%r3 /* 64-bytes into V1..V4 */
- VX %v1,%v0,%v1 /* V1 ^= CRC */
- aghi %r3,64 /* BUF = BUF + 64 */
- aghi %r4,-64 /* LEN = LEN - 64 */
-
- /* Check remaining buffer size and jump to proper folding method */
- cghi %r4,64
- jl .Lless_than_64bytes
-
-.Lfold_64bytes_loop:
- /* Load the next 64-byte data chunk into V5 to V8 */
- VLM %v5,%v8,0,%r3
+ fpu_vlm(1, 4, buf);
+ fpu_vx(1, 0, 1);
+ buf += 64;
+ size -= 64;
+
+ while (size >= 64) {
+ /* Load the next 64-byte data chunk into V5 to V8 */
+ fpu_vlm(5, 8, buf);
+
+ /*
+ * Perform a GF(2) multiplication of the doublewords in V1 with
+ * the reduction constants in V0. The intermediate result is
+ * then folded (accumulated) with the next data chunk in V5 and
+ * stored in V1. Repeat this step for the register contents
+ * in V2, V3, and V4 respectively.
+ */
+ fpu_vgfmag(1, CONST_R1R2, 1, 5);
+ fpu_vgfmag(2, CONST_R1R2, 2, 6);
+ fpu_vgfmag(3, CONST_R1R2, 3, 7);
+ fpu_vgfmag(4, CONST_R1R2, 4, 8);
+ buf += 64;
+ size -= 64;
+ }
- /*
- * Perform a GF(2) multiplication of the doublewords in V1 with
- * the reduction constants in V0. The intermediate result is
- * then folded (accumulated) with the next data chunk in V5 and
- * stored in V1. Repeat this step for the register contents
- * in V2, V3, and V4 respectively.
- */
- VGFMAG %v1,CONST_R1R2,%v1,%v5
- VGFMAG %v2,CONST_R1R2,%v2,%v6
- VGFMAG %v3,CONST_R1R2,%v3,%v7
- VGFMAG %v4,CONST_R1R2,%v4,%v8
-
- /* Adjust buffer pointer and length for next loop */
- aghi %r3,64 /* BUF = BUF + 64 */
- aghi %r4,-64 /* LEN = LEN - 64 */
-
- cghi %r4,64
- jnl .Lfold_64bytes_loop
-
-.Lless_than_64bytes:
/* Fold V1 to V4 into a single 128-bit value in V1 */
- VGFMAG %v1,CONST_R3R4,%v1,%v2
- VGFMAG %v1,CONST_R3R4,%v1,%v3
- VGFMAG %v1,CONST_R3R4,%v1,%v4
-
- /* Check whether to continue with 64-bit folding */
- cghi %r4,16
- jl .Lfinal_fold
+ fpu_vgfmag(1, CONST_R3R4, 1, 2);
+ fpu_vgfmag(1, CONST_R3R4, 1, 3);
+ fpu_vgfmag(1, CONST_R3R4, 1, 4);
-.Lfold_16bytes_loop:
+ while (size >= 16) {
+ fpu_vl(2, buf);
+ fpu_vgfmag(1, CONST_R3R4, 1, 2);
+ buf += 16;
+ size -= 16;
+ }
- VL %v2,0,,%r3 /* Load next data chunk */
- VGFMAG %v1,CONST_R3R4,%v1,%v2 /* Fold next data chunk */
-
- /* Adjust buffer pointer and size for folding next data chunk */
- aghi %r3,16
- aghi %r4,-16
-
- /* Process remaining data chunks */
- cghi %r4,16
- jnl .Lfold_16bytes_loop
-
-.Lfinal_fold:
/*
 * The R5 constant is used to fold a 128-bit value into a 96-bit value
* that is XORed with the next 96-bit input data chunk. To use a single
@@ -164,7 +130,7 @@ SYM_FUNC_START(crc32_be_vgfm_16)
* form an intermediate 96-bit value (with appended zeros) which is then
* XORed with the intermediate reduction result.
*/
- VGFMG %v1,CONST_R5,%v1
+ fpu_vgfmg(1, CONST_R5, 1);
/*
* Further reduce the remaining 96-bit value to a 64-bit value using a
@@ -173,7 +139,7 @@ SYM_FUNC_START(crc32_be_vgfm_16)
* doubleword with R6. The result is a 64-bit value and is subject to
* the Barret reduction.
*/
- VGFMG %v1,CONST_R6,%v1
+ fpu_vgfmg(1, CONST_R6, 1);
/*
* The input values to the Barret reduction are the degree-63 polynomial
@@ -194,20 +160,15 @@ SYM_FUNC_START(crc32_be_vgfm_16)
*/
/* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
- VUPLLF %v2,%v1
- VGFMG %v2,CONST_RU_POLY,%v2
+ fpu_vupllf(2, 1);
+ fpu_vgfmg(2, CONST_RU_POLY, 2);
/*
 * Compute the GF(2) product of the CRC polynomial in V0 with T1(x) in
* V2 and XOR the intermediate result, T2(x), with the value in V1.
* The final result is in the rightmost word of V2.
*/
- VUPLLF %v2,%v2
- VGFMAG %v2,CONST_CRC_POLY,%v2,%v1
-
-.Ldone:
- VLGVF %r2,%v2,3
- BR_EX %r14
-SYM_FUNC_END(crc32_be_vgfm_16)
-
-.previous
+ fpu_vupllf(2, 2);
+ fpu_vgfmag(2, CONST_CRC_POLY, 2, 1);
+ return fpu_vlgvf(2, 3);
+}
diff --git a/arch/s390/crypto/crc32le-vx.S b/arch/s390/crypto/crc32le-vx.c
index 5a819ae09a0b..2f629f394df7 100644
--- a/arch/s390/crypto/crc32le-vx.S
+++ b/arch/s390/crypto/crc32le-vx.c
@@ -13,20 +13,17 @@
* Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
*/
-#include <linux/linkage.h>
-#include <asm/nospec-insn.h>
-#include <asm/vx-insn.h>
+#include <linux/types.h>
+#include <asm/fpu.h>
+#include "crc32-vx.h"
/* Vector register range containing CRC-32 constants */
-#define CONST_PERM_LE2BE %v9
-#define CONST_R2R1 %v10
-#define CONST_R4R3 %v11
-#define CONST_R5 %v12
-#define CONST_RU_POLY %v13
-#define CONST_CRC_POLY %v14
-
- .data
- .balign 8
+#define CONST_PERM_LE2BE 9
+#define CONST_R2R1 10
+#define CONST_R4R3 11
+#define CONST_R5 12
+#define CONST_RU_POLY 13
+#define CONST_CRC_POLY 14
/*
* The CRC-32 constant block contains reduction constants to fold and
@@ -59,64 +56,43 @@
* P'(x) = 0x82F63B78
*/
-SYM_DATA_START_LOCAL(constants_CRC_32_LE)
- .octa 0x0F0E0D0C0B0A09080706050403020100 # BE->LE mask
- .quad 0x1c6e41596, 0x154442bd4 # R2, R1
- .quad 0x0ccaa009e, 0x1751997d0 # R4, R3
- .octa 0x163cd6124 # R5
- .octa 0x1F7011641 # u'
- .octa 0x1DB710641 # P'(x) << 1
-SYM_DATA_END(constants_CRC_32_LE)
-
-SYM_DATA_START_LOCAL(constants_CRC_32C_LE)
- .octa 0x0F0E0D0C0B0A09080706050403020100 # BE->LE mask
- .quad 0x09e4addf8, 0x740eef02 # R2, R1
- .quad 0x14cd00bd6, 0xf20c0dfe # R4, R3
- .octa 0x0dd45aab8 # R5
- .octa 0x0dea713f1 # u'
- .octa 0x105ec76f0 # P'(x) << 1
-SYM_DATA_END(constants_CRC_32C_LE)
-
- .previous
-
- GEN_BR_THUNK %r14
-
- .text
-
-/*
- * The CRC-32 functions use these calling conventions:
- *
- * Parameters:
- *
- * %r2: Initial CRC value, typically ~0; and final CRC (return) value.
- * %r3: Input buffer pointer, performance might be improved if the
- * buffer is on a doubleword boundary.
- * %r4: Length of the buffer, must be 64 bytes or greater.
+static unsigned long constants_CRC_32_LE[] = {
+ 0x0f0e0d0c0b0a0908, 0x0706050403020100, /* BE->LE mask */
+ 0x1c6e41596, 0x154442bd4, /* R2, R1 */
+ 0x0ccaa009e, 0x1751997d0, /* R4, R3 */
+ 0x0, 0x163cd6124, /* R5 */
+ 0x0, 0x1f7011641, /* u' */
+ 0x0, 0x1db710641 /* P'(x) << 1 */
+};
+
+static unsigned long constants_CRC_32C_LE[] = {
+ 0x0f0e0d0c0b0a0908, 0x0706050403020100, /* BE->LE mask */
+ 0x09e4addf8, 0x740eef02, /* R2, R1 */
+ 0x14cd00bd6, 0xf20c0dfe, /* R4, R3 */
+ 0x0, 0x0dd45aab8, /* R5 */
+ 0x0, 0x0dea713f1, /* u' */
+ 0x0, 0x105ec76f0 /* P'(x) << 1 */
+};
+
+/**
+ * crc32_le_vgfm_generic - Compute CRC-32 (LE variant) with vector registers
+ * @crc: Initial CRC value, typically ~0.
+ * @buf: Input buffer pointer, performance might be improved if the
+ * buffer is on a doubleword boundary.
+ * @size: Size of the buffer, must be 64 bytes or greater.
+ * @constants: CRC-32 constant pool base pointer.
*
* Register usage:
- *
- * %r5: CRC-32 constant pool base pointer.
- * V0: Initial CRC value and intermediate constants and results.
- * V1..V4: Data for CRC computation.
- * V5..V8: Next data chunks that are fetched from the input buffer.
- * V9: Constant for BE->LE conversion and shift operations
- *
+ * V0: Initial CRC value and intermediate constants and results.
+ * V1..V4: Data for CRC computation.
+ * V5..V8: Next data chunks that are fetched from the input buffer.
+ * V9: Constant for BE->LE conversion and shift operations
* V10..V14: CRC-32 constants.
*/
-
-SYM_FUNC_START(crc32_le_vgfm_16)
- larl %r5,constants_CRC_32_LE
- j crc32_le_vgfm_generic
-SYM_FUNC_END(crc32_le_vgfm_16)
-
-SYM_FUNC_START(crc32c_le_vgfm_16)
- larl %r5,constants_CRC_32C_LE
- j crc32_le_vgfm_generic
-SYM_FUNC_END(crc32c_le_vgfm_16)
-
-SYM_FUNC_START(crc32_le_vgfm_generic)
+static u32 crc32_le_vgfm_generic(u32 crc, unsigned char const *buf, size_t size, unsigned long *constants)
+{
/* Load CRC-32 constants */
- VLM CONST_PERM_LE2BE,CONST_CRC_POLY,0,%r5
+ fpu_vlm(CONST_PERM_LE2BE, CONST_CRC_POLY, constants);
/*
* Load the initial CRC value.
@@ -125,90 +101,73 @@ SYM_FUNC_START(crc32_le_vgfm_generic)
* vector register and is later XORed with the LSB portion
* of the loaded input data.
*/
- VZERO %v0 /* Clear V0 */
- VLVGF %v0,%r2,3 /* Load CRC into rightmost word */
+ fpu_vzero(0); /* Clear V0 */
+ fpu_vlvgf(0, crc, 3); /* Load CRC into rightmost word */
/* Load a 64-byte data chunk and XOR with CRC */
- VLM %v1,%v4,0,%r3 /* 64-bytes into V1..V4 */
- VPERM %v1,%v1,%v1,CONST_PERM_LE2BE
- VPERM %v2,%v2,%v2,CONST_PERM_LE2BE
- VPERM %v3,%v3,%v3,CONST_PERM_LE2BE
- VPERM %v4,%v4,%v4,CONST_PERM_LE2BE
+ fpu_vlm(1, 4, buf);
+ fpu_vperm(1, 1, 1, CONST_PERM_LE2BE);
+ fpu_vperm(2, 2, 2, CONST_PERM_LE2BE);
+ fpu_vperm(3, 3, 3, CONST_PERM_LE2BE);
+ fpu_vperm(4, 4, 4, CONST_PERM_LE2BE);
+
+ fpu_vx(1, 0, 1); /* V1 ^= CRC */
+ buf += 64;
+ size -= 64;
+
+ while (size >= 64) {
+ fpu_vlm(5, 8, buf);
+ fpu_vperm(5, 5, 5, CONST_PERM_LE2BE);
+ fpu_vperm(6, 6, 6, CONST_PERM_LE2BE);
+ fpu_vperm(7, 7, 7, CONST_PERM_LE2BE);
+ fpu_vperm(8, 8, 8, CONST_PERM_LE2BE);
+ /*
+ * Perform a GF(2) multiplication of the doublewords in V1 with
+ * the R1 and R2 reduction constants in V0. The intermediate
+ * result is then folded (accumulated) with the next data chunk
+ * in V5 and stored in V1. Repeat this step for the register
+ * contents in V2, V3, and V4 respectively.
+ */
+ fpu_vgfmag(1, CONST_R2R1, 1, 5);
+ fpu_vgfmag(2, CONST_R2R1, 2, 6);
+ fpu_vgfmag(3, CONST_R2R1, 3, 7);
+ fpu_vgfmag(4, CONST_R2R1, 4, 8);
+ buf += 64;
+ size -= 64;
+ }
- VX %v1,%v0,%v1 /* V1 ^= CRC */
- aghi %r3,64 /* BUF = BUF + 64 */
- aghi %r4,-64 /* LEN = LEN - 64 */
-
- cghi %r4,64
- jl .Lless_than_64bytes
-
-.Lfold_64bytes_loop:
- /* Load the next 64-byte data chunk into V5 to V8 */
- VLM %v5,%v8,0,%r3
- VPERM %v5,%v5,%v5,CONST_PERM_LE2BE
- VPERM %v6,%v6,%v6,CONST_PERM_LE2BE
- VPERM %v7,%v7,%v7,CONST_PERM_LE2BE
- VPERM %v8,%v8,%v8,CONST_PERM_LE2BE
-
- /*
- * Perform a GF(2) multiplication of the doublewords in V1 with
- * the R1 and R2 reduction constants in V0. The intermediate result
- * is then folded (accumulated) with the next data chunk in V5 and
- * stored in V1. Repeat this step for the register contents
- * in V2, V3, and V4 respectively.
- */
- VGFMAG %v1,CONST_R2R1,%v1,%v5
- VGFMAG %v2,CONST_R2R1,%v2,%v6
- VGFMAG %v3,CONST_R2R1,%v3,%v7
- VGFMAG %v4,CONST_R2R1,%v4,%v8
-
- aghi %r3,64 /* BUF = BUF + 64 */
- aghi %r4,-64 /* LEN = LEN - 64 */
-
- cghi %r4,64
- jnl .Lfold_64bytes_loop
-
-.Lless_than_64bytes:
/*
* Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3
 * and R4 and accumulate the next 128-bit chunk until a single 128-bit
* value remains.
*/
- VGFMAG %v1,CONST_R4R3,%v1,%v2
- VGFMAG %v1,CONST_R4R3,%v1,%v3
- VGFMAG %v1,CONST_R4R3,%v1,%v4
-
- cghi %r4,16
- jl .Lfinal_fold
-
-.Lfold_16bytes_loop:
-
- VL %v2,0,,%r3 /* Load next data chunk */
- VPERM %v2,%v2,%v2,CONST_PERM_LE2BE
- VGFMAG %v1,CONST_R4R3,%v1,%v2 /* Fold next data chunk */
+ fpu_vgfmag(1, CONST_R4R3, 1, 2);
+ fpu_vgfmag(1, CONST_R4R3, 1, 3);
+ fpu_vgfmag(1, CONST_R4R3, 1, 4);
+
+ while (size >= 16) {
+ fpu_vl(2, buf);
+ fpu_vperm(2, 2, 2, CONST_PERM_LE2BE);
+ fpu_vgfmag(1, CONST_R4R3, 1, 2);
+ buf += 16;
+ size -= 16;
+ }
- aghi %r3,16
- aghi %r4,-16
-
- cghi %r4,16
- jnl .Lfold_16bytes_loop
-
-.Lfinal_fold:
/*
* Set up a vector register for byte shifts. The shift value must
* be loaded in bits 1-4 in byte element 7 of a vector register.
* Shift by 8 bytes: 0x40
* Shift by 4 bytes: 0x20
*/
- VLEIB %v9,0x40,7
+ fpu_vleib(9, 0x40, 7);
/*
* Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
* to move R4 into the rightmost doubleword and set the leftmost
* doubleword to 0x1.
*/
- VSRLB %v0,CONST_R4R3,%v9
- VLEIG %v0,1,0
+ fpu_vsrlb(0, CONST_R4R3, 9);
+ fpu_vleig(0, 1, 0);
/*
* Compute GF(2) product of V1 and V0. The rightmost doubleword
@@ -216,7 +175,7 @@ SYM_FUNC_START(crc32_le_vgfm_generic)
* multiplied by 0x1 and is then XORed with rightmost product.
* Implicitly, the intermediate leftmost product becomes padded
*/
- VGFMG %v1,%v0,%v1
+ fpu_vgfmg(1, 0, 1);
/*
* Now do the final 32-bit fold by multiplying the rightmost word
@@ -231,10 +190,10 @@ SYM_FUNC_START(crc32_le_vgfm_generic)
* rightmost doubleword and the leftmost doubleword is zero to ignore
* the leftmost product of V1.
*/
- VLEIB %v9,0x20,7 /* Shift by words */
- VSRLB %v2,%v1,%v9 /* Store remaining bits in V2 */
- VUPLLF %v1,%v1 /* Split rightmost doubleword */
- VGFMAG %v1,CONST_R5,%v1,%v2 /* V1 = (V1 * R5) XOR V2 */
+ fpu_vleib(9, 0x20, 7); /* Shift by words */
+ fpu_vsrlb(2, 1, 9); /* Store remaining bits in V2 */
+ fpu_vupllf(1, 1); /* Split rightmost doubleword */
+ fpu_vgfmag(1, CONST_R5, 1, 2); /* V1 = (V1 * R5) XOR V2 */
/*
* Apply a Barret reduction to compute the final 32-bit CRC value.
@@ -256,20 +215,26 @@ SYM_FUNC_START(crc32_le_vgfm_generic)
*/
/* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
- VUPLLF %v2,%v1
- VGFMG %v2,CONST_RU_POLY,%v2
+ fpu_vupllf(2, 1);
+ fpu_vgfmg(2, CONST_RU_POLY, 2);
/*
* Compute the GF(2) product of the CRC polynomial with T1(x) in
* V2 and XOR the intermediate result, T2(x), with the value in V1.
* The final result is stored in word element 2 of V2.
*/
- VUPLLF %v2,%v2
- VGFMAG %v2,CONST_CRC_POLY,%v2,%v1
+ fpu_vupllf(2, 2);
+ fpu_vgfmag(2, CONST_CRC_POLY, 2, 1);
+
+ return fpu_vlgvf(2, 2);
+}
-.Ldone:
- VLGVF %r2,%v2,2
- BR_EX %r14
-SYM_FUNC_END(crc32_le_vgfm_generic)
+u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size)
+{
+ return crc32_le_vgfm_generic(crc, buf, size, &constants_CRC_32_LE[0]);
+}
-.previous
+u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size)
+{
+ return crc32_le_vgfm_generic(crc, buf, size, &constants_CRC_32C_LE[0]);
+}
diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c
index 55ee5567a5ea..99f7e1f2b70a 100644
--- a/arch/s390/crypto/paes_s390.c
+++ b/arch/s390/crypto/paes_s390.c
@@ -125,20 +125,8 @@ struct s390_pxts_ctx {
static inline int __paes_keyblob2pkey(struct key_blob *kb,
struct pkey_protkey *pk)
{
- int i, ret;
-
- /* try three times in case of failure */
- for (i = 0; i < 3; i++) {
- if (i > 0 && ret == -EAGAIN && in_task())
- if (msleep_interruptible(1000))
- return -EINTR;
- ret = pkey_keyblob2pkey(kb->key, kb->keylen,
- pk->protkey, &pk->len, &pk->type);
- if (ret == 0)
- break;
- }
-
- return ret;
+ return pkey_keyblob2pkey(kb->key, kb->keylen,
+ pk->protkey, &pk->len, &pk->type);
}
static inline int __paes_convert_key(struct s390_paes_ctx *ctx)
diff --git a/arch/s390/hypfs/hypfs_diag0c.c b/arch/s390/hypfs/hypfs_diag0c.c
index 9a2786079e3a..4131f0daa5ea 100644
--- a/arch/s390/hypfs/hypfs_diag0c.c
+++ b/arch/s390/hypfs/hypfs_diag0c.c
@@ -20,8 +20,7 @@
*/
static void diag0c_fn(void *data)
{
- diag_stat_inc(DIAG_STAT_X00C);
- diag_amode31_ops.diag0c(((void **)data)[smp_processor_id()]);
+ diag0c(((void **)data)[smp_processor_id()]);
}
/*
diff --git a/arch/s390/hypfs/hypfs_sprp.c b/arch/s390/hypfs/hypfs_sprp.c
index f5f7e78ddc0c..9fc3f0dae8f0 100644
--- a/arch/s390/hypfs/hypfs_sprp.c
+++ b/arch/s390/hypfs/hypfs_sprp.c
@@ -25,7 +25,7 @@
static inline unsigned long __hypfs_sprp_diag304(void *data, unsigned long cmd)
{
- union register_pair r1 = { .even = (unsigned long)data, };
+ union register_pair r1 = { .even = virt_to_phys(data), };
asm volatile("diag %[r1],%[r3],0x304\n"
: [r1] "+&d" (r1.pair)
@@ -74,7 +74,7 @@ static int __hypfs_sprp_ioctl(void __user *user_area)
int rc;
rc = -ENOMEM;
- data = (void *) get_zeroed_page(GFP_KERNEL | GFP_DMA);
+ data = (void *)get_zeroed_page(GFP_KERNEL);
diag304 = kzalloc(sizeof(*diag304), GFP_KERNEL);
if (!data || !diag304)
goto out;
diff --git a/arch/s390/include/asm/access-regs.h b/arch/s390/include/asm/access-regs.h
new file mode 100644
index 000000000000..1a6412d9f5ad
--- /dev/null
+++ b/arch/s390/include/asm/access-regs.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright IBM Corp. 1999, 2024
+ */
+
+#ifndef __ASM_S390_ACCESS_REGS_H
+#define __ASM_S390_ACCESS_REGS_H
+
+#include <linux/instrumented.h>
+#include <asm/sigcontext.h>
+
+struct access_regs {
+ unsigned int regs[NUM_ACRS];
+};
+
+static inline void save_access_regs(unsigned int *acrs)
+{
+ struct access_regs *regs = (struct access_regs *)acrs;
+
+ instrument_write(regs, sizeof(*regs));
+ asm volatile("stamy 0,15,%[regs]"
+ : [regs] "=QS" (*regs)
+ :
+ : "memory");
+}
+
+static inline void restore_access_regs(unsigned int *acrs)
+{
+ struct access_regs *regs = (struct access_regs *)acrs;
+
+ instrument_read(regs, sizeof(*regs));
+ asm volatile("lamy 0,15,%[regs]"
+ :
+ : [regs] "QS" (*regs)
+ : "memory");
+}
+
+#endif /* __ASM_S390_ACCESS_REGS_H */
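
Both helpers operate on a 16-word area and are now instrumented. A minimal usage sketch, with illustrative buffer names:

#include <asm/access-regs.h>

/* Sketch: swap the CPU's access registers against a saved set, as a
 * context switch would. "prev" and "next" are illustrative save areas
 * of NUM_ACRS words each.
 */
static void switch_access_regs(unsigned int *prev, unsigned int *next)
{
	save_access_regs(prev);		/* stamy 0,15 into prev */
	restore_access_regs(next);	/* lamy 0,15 from next */
}
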
diff --git a/arch/s390/include/asm/appldata.h b/arch/s390/include/asm/appldata.h
index f2240392c708..a92ebbc7aa7a 100644
--- a/arch/s390/include/asm/appldata.h
+++ b/arch/s390/include/asm/appldata.h
@@ -54,13 +54,13 @@ static inline int appldata_asm(struct appldata_parameter_list *parm_list,
parm_list->function = fn;
parm_list->parlist_length = sizeof(*parm_list);
parm_list->buffer_length = length;
- parm_list->product_id_addr = (unsigned long) id;
+ parm_list->product_id_addr = virt_to_phys(id);
parm_list->buffer_addr = virt_to_phys(buffer);
diag_stat_inc(DIAG_STAT_X0DC);
asm volatile(
" diag %1,%0,0xdc"
: "=d" (ry)
- : "d" (parm_list), "m" (*parm_list), "m" (*id)
+ : "d" (virt_to_phys(parm_list)), "m" (*parm_list), "m" (*id)
: "cc");
return ry;
}
diff --git a/arch/s390/include/asm/asm-prototypes.h b/arch/s390/include/asm/asm-prototypes.h
index a873e873e1ee..56096ae26f29 100644
--- a/arch/s390/include/asm/asm-prototypes.h
+++ b/arch/s390/include/asm/asm-prototypes.h
@@ -3,7 +3,7 @@
#include <linux/kvm_host.h>
#include <linux/ftrace.h>
-#include <asm/fpu/api.h>
+#include <asm/fpu.h>
#include <asm-generic/asm-prototypes.h>
__int128_t __ashlti3(__int128_t a, int b);
diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h
index aebe1e22c7be..c500d45fb465 100644
--- a/arch/s390/include/asm/bug.h
+++ b/arch/s390/include/asm/bug.h
@@ -14,7 +14,7 @@
".section .rodata.str,\"aMS\",@progbits,1\n" \
"1: .asciz \""__FILE__"\"\n" \
".previous\n" \
- ".section __bug_table,\"awM\",@progbits,%2\n" \
+ ".section __bug_table,\"aw\"\n" \
"2: .long 0b-.\n" \
" .long 1b-.\n" \
" .short %0,%1\n" \
@@ -30,7 +30,7 @@
#define __EMIT_BUG(x) do { \
asm_inline volatile( \
"0: mc 0,0\n" \
- ".section __bug_table,\"awM\",@progbits,%1\n" \
+ ".section __bug_table,\"aw\"\n" \
"1: .long 0b-.\n" \
" .short %0\n" \
" .org 1b+%1\n" \
diff --git a/arch/s390/include/asm/checksum.h b/arch/s390/include/asm/checksum.h
index 69837eec2ff5..b89159591ca0 100644
--- a/arch/s390/include/asm/checksum.h
+++ b/arch/s390/include/asm/checksum.h
@@ -12,36 +12,29 @@
#ifndef _S390_CHECKSUM_H
#define _S390_CHECKSUM_H
-#include <linux/kasan-checks.h>
+#include <linux/instrumented.h>
#include <linux/in6.h>
-/*
- * Computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit).
- *
- * Returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic.
- *
- * This function must be called with even lengths, except
- * for the last fragment, which may be odd.
- *
- * It's best to have buff aligned on a 32-bit boundary.
- */
-static inline __wsum csum_partial(const void *buff, int len, __wsum sum)
+static inline __wsum cksm(const void *buff, int len, __wsum sum)
{
union register_pair rp = {
- .even = (unsigned long) buff,
- .odd = (unsigned long) len,
+ .even = (unsigned long)buff,
+ .odd = (unsigned long)len,
};
- kasan_check_read(buff, len);
- asm volatile(
+ instrument_read(buff, len);
+ asm volatile("\n"
"0: cksm %[sum],%[rp]\n"
" jo 0b\n"
: [sum] "+&d" (sum), [rp] "+&d" (rp.pair) : : "cc", "memory");
return sum;
}
+__wsum csum_partial(const void *buff, int len, __wsum sum);
+
+#define _HAVE_ARCH_CSUM_AND_COPY
+__wsum csum_partial_copy_nocheck(const void *src, void *dst, int len);
+
/*
* Fold a partial checksum without adding pseudo headers.
*/
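
With this change csum_partial() becomes an out-of-line function built on the cksm() inline above. A short usage sketch, relying on csum_fold(), which is declared right below the quoted comment in the full header:

#include <asm/checksum.h>

/* Sketch: checksum a buffer with the CKSM-based csum_partial() and
 * fold the 32-bit partial sum into the final 16-bit checksum.
 */
static __sum16 checksum_buf(const void *buf, int len)
{
	__wsum sum = csum_partial(buf, len, 0);

	return csum_fold(sum);
}
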
diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h
index bed804137537..20b94220113b 100644
--- a/arch/s390/include/asm/diag.h
+++ b/arch/s390/include/asm/diag.h
@@ -44,6 +44,13 @@ enum diag_stat_enum {
void diag_stat_inc(enum diag_stat_enum nr);
void diag_stat_inc_norecursion(enum diag_stat_enum nr);
+struct hypfs_diag0c_entry;
+
+/*
+ * Diagnose 0c: Pseudo Timer
+ */
+void diag0c(struct hypfs_diag0c_entry *data);
+
/*
* Diagnose 10: Release page range
*/
@@ -331,10 +338,10 @@ struct hypfs_diag0c_entry;
*/
struct diag_ops {
int (*diag210)(struct diag210 *addr);
- int (*diag26c)(void *req, void *resp, enum diag26c_sc subcode);
+ int (*diag26c)(unsigned long rx, unsigned long rx1, enum diag26c_sc subcode);
int (*diag14)(unsigned long rx, unsigned long ry1, unsigned long subcode);
int (*diag8c)(struct diag8c *addr, struct ccw_dev_id *devno, size_t len);
- void (*diag0c)(struct hypfs_diag0c_entry *entry);
+ void (*diag0c)(unsigned long rx);
void (*diag308_reset)(void);
};
@@ -342,9 +349,9 @@ extern struct diag_ops diag_amode31_ops;
extern struct diag210 *__diag210_tmp_amode31;
int _diag210_amode31(struct diag210 *addr);
-int _diag26c_amode31(void *req, void *resp, enum diag26c_sc subcode);
+int _diag26c_amode31(unsigned long rx, unsigned long rx1, enum diag26c_sc subcode);
int _diag14_amode31(unsigned long rx, unsigned long ry1, unsigned long subcode);
-void _diag0c_amode31(struct hypfs_diag0c_entry *entry);
+void _diag0c_amode31(unsigned long rx);
void _diag308_reset_amode31(void);
int _diag8c_amode31(struct diag8c *addr, struct ccw_dev_id *devno, size_t len);
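
The new diag0c() declaration pairs with the hypfs_diag0c.c hunk above: the statistics bump and the amode31 trampoline, which now takes a physical address like the other updated diag_ops callbacks, move behind one wrapper. A plausible sketch of that wrapper; its actual body is outside this diff:

/* Hypothetical sketch of the diag0c() wrapper declared above. It folds
 * in the diag_stat accounting and the virt_to_phys() conversion that
 * callers such as diag0c_fn() no longer perform themselves.
 */
void diag0c(struct hypfs_diag0c_entry *data)
{
	diag_stat_inc(DIAG_STAT_X00C);
	diag_amode31_ops.diag0c(virt_to_phys(data));
}
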
diff --git a/arch/s390/include/asm/entry-common.h b/arch/s390/include/asm/entry-common.h
index fdd319a622b0..7f5004065e8a 100644
--- a/arch/s390/include/asm/entry-common.h
+++ b/arch/s390/include/asm/entry-common.h
@@ -8,7 +8,7 @@
#include <linux/processor.h>
#include <linux/uaccess.h>
#include <asm/timex.h>
-#include <asm/fpu/api.h>
+#include <asm/fpu.h>
#include <asm/pai.h>
#define ARCH_EXIT_TO_USER_MODE_WORK (_TIF_GUARDED_STORAGE | _TIF_PER_TRAP)
@@ -41,8 +41,7 @@ static __always_inline void arch_exit_to_user_mode_work(struct pt_regs *regs,
static __always_inline void arch_exit_to_user_mode(void)
{
- if (test_cpu_flag(CIF_FPU))
- __load_fpu_regs();
+ load_user_fpu_regs();
if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
debug_user_asce(1);
diff --git a/arch/s390/include/asm/vx-insn-asm.h b/arch/s390/include/asm/fpu-insn-asm.h
index 360f8b36d962..02ccfe46050a 100644
--- a/arch/s390/include/asm/vx-insn-asm.h
+++ b/arch/s390/include/asm/fpu-insn-asm.h
@@ -9,11 +9,11 @@
* Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
*/
-#ifndef __ASM_S390_VX_INSN_INTERNAL_H
-#define __ASM_S390_VX_INSN_INTERNAL_H
+#ifndef __ASM_S390_FPU_INSN_ASM_H
+#define __ASM_S390_FPU_INSN_ASM_H
-#ifndef __ASM_S390_VX_INSN_H
-#error only <asm/vx-insn.h> can be included directly
+#ifndef __ASM_S390_FPU_INSN_H
+#error only <asm/fpu-insn.h> can be included directly
#endif
#ifdef __ASSEMBLY__
@@ -195,10 +195,26 @@
/* RXB - Compute most significant bit used vector registers
*
* @rxb: Operand to store computed RXB value
- * @v1: First vector register designated operand
- * @v2: Second vector register designated operand
- * @v3: Third vector register designated operand
- * @v4: Fourth vector register designated operand
+ * @v1: Vector register designated operand whose MSB is stored in
+ * RXB bit 0 (instruction bit 36) and whose remaining bits
+ * are stored in instruction bits 8-11.
+ * @v2: Vector register designated operand whose MSB is stored in
+ * RXB bit 1 (instruction bit 37) and whose remaining bits
+ * are stored in instruction bits 12-15.
+ * @v3: Vector register designated operand whose MSB is stored in
+ * RXB bit 2 (instruction bit 38) and whose remaining bits
+ * are stored in instruction bits 16-19.
+ * @v4: Vector register designated operand whose MSB is stored in
+ * RXB bit 3 (instruction bit 39) and whose remaining bits
+ * are stored in instruction bits 32-35.
+ *
+ * Note: In most vector instruction formats [1] V1, V2, V3, and V4 directly
+ * correspond to @v1, @v2, @v3, and @v4. But there are exceptions, such as but
+ * not limited to the vector instruction formats VRR-g, VRR-h, VRS-a, VRS-d,
+ * and VSI.
+ *
+ * [1] IBM z/Architecture Principles of Operation, chapter "Program
+ * Execution", section "Instructions", subsection "Instruction Formats".
*/
.macro RXB rxb v1 v2=0 v3=0 v4=0
\rxb = 0
@@ -223,6 +239,9 @@
* @v2: Second vector register designated operand (for RXB)
* @v3: Third vector register designated operand (for RXB)
* @v4: Fourth vector register designated operand (for RXB)
+ *
+ * Note: For @v1, @v2, @v3, and @v4 also refer to the RXB macro
+ * description for further details.
*/
.macro MRXB m v1 v2=0 v3=0 v4=0
rxb = 0
@@ -238,6 +257,9 @@
* @v2: Second vector register designated operand (for RXB)
* @v3: Third vector register designated operand (for RXB)
* @v4: Fourth vector register designated operand (for RXB)
+ *
+ * Note: For @v1, @v2, @v3, and @v4 also refer to the RXB macro
+ * description for further details.
*/
.macro MRXBOPC m opc v1 v2=0 v3=0 v4=0
MRXB \m, \v1, \v2, \v3, \v4
@@ -350,7 +372,7 @@
VX_NUM v3, \vr
.word 0xE700 | (r1 << 4) | (v3&15)
.word (b2 << 12) | (\disp)
- MRXBOPC \m, 0x21, v3
+ MRXBOPC \m, 0x21, 0, v3
.endm
.macro VLGVB gr, vr, disp, base="%r0"
VLGV \gr, \vr, \disp, \base, 0
@@ -499,6 +521,25 @@
VMRL \vr1, \vr2, \vr3, 3
.endm
+/* VECTOR LOAD WITH LENGTH */
+.macro VLL v, gr, disp, base
+ VX_NUM v1, \v
+ GR_NUM b2, \base
+ GR_NUM r3, \gr
+ .word 0xE700 | ((v1&15) << 4) | r3
+ .word (b2 << 12) | (\disp)
+ MRXBOPC 0, 0x37, v1
+.endm
+
+/* VECTOR STORE WITH LENGTH */
+.macro VSTL v, gr, disp, base
+ VX_NUM v1, \v
+ GR_NUM b2, \base
+ GR_NUM r3, \gr
+ .word 0xE700 | ((v1&15) << 4) | r3
+ .word (b2 << 12) | (\disp)
+ MRXBOPC 0, 0x3f, v1
+.endm
/* Vector integer instructions */
@@ -512,6 +553,16 @@
MRXBOPC 0, 0x68, v1, v2, v3
.endm
+/* VECTOR CHECKSUM */
+.macro VCKSM vr1, vr2, vr3
+ VX_NUM v1, \vr1
+ VX_NUM v2, \vr2
+ VX_NUM v3, \vr3
+ .word 0xE700 | ((v1&15) << 4) | (v2&15)
+ .word ((v3&15) << 12)
+ MRXBOPC 0, 0x66, v1, v2, v3
+.endm
+
/* VECTOR EXCLUSIVE OR */
.macro VX vr1, vr2, vr3
VX_NUM v1, \vr1
@@ -678,4 +729,4 @@
.endm
#endif /* __ASSEMBLY__ */
-#endif /* __ASM_S390_VX_INSN_INTERNAL_H */
+#endif /* __ASM_S390_FPU_INSN_ASM_H */
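
The expanded RXB documentation can be made concrete: each vector operand is a 5-bit register number whose most significant bit lands in one RXB bit, while the low four bits are encoded in the instruction proper. For illustration, the same computation the assembler macro performs, sketched in C:

#include <stdint.h>

/* Sketch: compute the 4-bit RXB field from up to four vector register
 * numbers (0-31). RXB bit 0 (instruction bit 36) is the MSB of v1,
 * bit 1 that of v2, bit 2 that of v3, bit 3 that of v4.
 */
static uint8_t rxb(uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4)
{
	return (uint8_t)(((v1 >> 4) & 1) << 3 |
			 ((v2 >> 4) & 1) << 2 |
			 ((v3 >> 4) & 1) << 1 |
			 ((v4 >> 4) & 1));
}
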
diff --git a/arch/s390/include/asm/fpu-insn.h b/arch/s390/include/asm/fpu-insn.h
new file mode 100644
index 000000000000..c1e2e521d9af
--- /dev/null
+++ b/arch/s390/include/asm/fpu-insn.h
@@ -0,0 +1,486 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Support for Floating Point and Vector Instructions
+ *
+ */
+
+#ifndef __ASM_S390_FPU_INSN_H
+#define __ASM_S390_FPU_INSN_H
+
+#include <asm/fpu-insn-asm.h>
+
+#ifndef __ASSEMBLY__
+
+#include <linux/instrumented.h>
+#include <asm/asm-extable.h>
+
+asm(".include \"asm/fpu-insn-asm.h\"\n");
+
+/*
+ * Various small helper functions, which can and should be used within
+ * kernel fpu code sections. Each function represents only one floating
+ * point or vector instruction (except for helper functions which require
+ * exception handling).
+ *
+ * This allows floating point and vector instructions to be used like C
+ * functions, which has the advantage that all supporting code, such as
+ * loops, can be written in easy-to-read C code.
+ *
+ * Each of the helper functions provides support for code instrumentation,
+ * e.g. KASAN. Instrumentation is therefore covered automatically when
+ * using these functions.
+ *
+ * In order to ensure that code generated with the helper functions stays
+ * within kernel fpu sections, which are guarded with kernel_fpu_begin()
+ * and kernel_fpu_end() calls, each function has a mandatory "memory"
+ * barrier.
+ */
+
+static __always_inline void fpu_cefbr(u8 f1, s32 val)
+{
+ asm volatile("cefbr %[f1],%[val]\n"
+ :
+ : [f1] "I" (f1), [val] "d" (val)
+ : "memory");
+}
+
+static __always_inline unsigned long fpu_cgebr(u8 f2, u8 mode)
+{
+ unsigned long val;
+
+ asm volatile("cgebr %[val],%[mode],%[f2]\n"
+ : [val] "=d" (val)
+ : [f2] "I" (f2), [mode] "I" (mode)
+ : "memory");
+ return val;
+}
+
+static __always_inline void fpu_debr(u8 f1, u8 f2)
+{
+ asm volatile("debr %[f1],%[f2]\n"
+ :
+ : [f1] "I" (f1), [f2] "I" (f2)
+ : "memory");
+}
+
+static __always_inline void fpu_ld(unsigned short fpr, freg_t *reg)
+{
+ instrument_read(reg, sizeof(*reg));
+ asm volatile("ld %[fpr],%[reg]\n"
+ :
+ : [fpr] "I" (fpr), [reg] "Q" (reg->ui)
+ : "memory");
+}
+
+static __always_inline void fpu_ldgr(u8 f1, u32 val)
+{
+ asm volatile("ldgr %[f1],%[val]\n"
+ :
+ : [f1] "I" (f1), [val] "d" (val)
+ : "memory");
+}
+
+static __always_inline void fpu_lfpc(unsigned int *fpc)
+{
+ instrument_read(fpc, sizeof(*fpc));
+ asm volatile("lfpc %[fpc]"
+ :
+ : [fpc] "Q" (*fpc)
+ : "memory");
+}
+
+/**
+ * fpu_lfpc_safe - Load floating point control register safely.
+ * @fpc: new value for floating point control register
+ *
+ * Load floating point control register. This may lead to an exception,
+ * since a saved value may have been modified by user space (ptrace,
+ * signal return, kvm registers) to an invalid value. In such a case
+ * set the floating point control register to zero.
+ */
+static inline void fpu_lfpc_safe(unsigned int *fpc)
+{
+ u32 tmp;
+
+ instrument_read(fpc, sizeof(*fpc));
+ asm volatile("\n"
+ "0: lfpc %[fpc]\n"
+ "1: nopr %%r7\n"
+ ".pushsection .fixup, \"ax\"\n"
+ "2: lghi %[tmp],0\n"
+ " sfpc %[tmp]\n"
+ " jg 1b\n"
+ ".popsection\n"
+ EX_TABLE(1b, 2b)
+ : [tmp] "=d" (tmp)
+ : [fpc] "Q" (*fpc)
+ : "memory");
+}
+
+static __always_inline void fpu_std(unsigned short fpr, freg_t *reg)
+{
+ instrument_write(reg, sizeof(*reg));
+ asm volatile("std %[fpr],%[reg]\n"
+ : [reg] "=Q" (reg->ui)
+ : [fpr] "I" (fpr)
+ : "memory");
+}
+
+static __always_inline void fpu_sfpc(unsigned int fpc)
+{
+ asm volatile("sfpc %[fpc]"
+ :
+ : [fpc] "d" (fpc)
+ : "memory");
+}
+
+static __always_inline void fpu_stfpc(unsigned int *fpc)
+{
+ instrument_write(fpc, sizeof(*fpc));
+ asm volatile("stfpc %[fpc]"
+ : [fpc] "=Q" (*fpc)
+ :
+ : "memory");
+}
+
+static __always_inline void fpu_vab(u8 v1, u8 v2, u8 v3)
+{
+ asm volatile("VAB %[v1],%[v2],%[v3]"
+ :
+ : [v1] "I" (v1), [v2] "I" (v2), [v3] "I" (v3)
+ : "memory");
+}
+
+static __always_inline void fpu_vcksm(u8 v1, u8 v2, u8 v3)
+{
+ asm volatile("VCKSM %[v1],%[v2],%[v3]"
+ :
+ : [v1] "I" (v1), [v2] "I" (v2), [v3] "I" (v3)
+ : "memory");
+}
+
+static __always_inline void fpu_vesravb(u8 v1, u8 v2, u8 v3)
+{
+ asm volatile("VESRAVB %[v1],%[v2],%[v3]"
+ :
+ : [v1] "I" (v1), [v2] "I" (v2), [v3] "I" (v3)
+ : "memory");
+}
+
+static __always_inline void fpu_vgfmag(u8 v1, u8 v2, u8 v3, u8 v4)
+{
+ asm volatile("VGFMAG %[v1],%[v2],%[v3],%[v4]"
+ :
+ : [v1] "I" (v1), [v2] "I" (v2), [v3] "I" (v3), [v4] "I" (v4)
+ : "memory");
+}
+
+static __always_inline void fpu_vgfmg(u8 v1, u8 v2, u8 v3)
+{
+ asm volatile("VGFMG %[v1],%[v2],%[v3]"
+ :
+ : [v1] "I" (v1), [v2] "I" (v2), [v3] "I" (v3)
+ : "memory");
+}
+
+#ifdef CONFIG_CC_IS_CLANG
+
+static __always_inline void fpu_vl(u8 v1, const void *vxr)
+{
+ instrument_read(vxr, sizeof(__vector128));
+ asm volatile("\n"
+ " la 1,%[vxr]\n"
+ " VL %[v1],0,,1\n"
+ :
+ : [vxr] "R" (*(__vector128 *)vxr),
+ [v1] "I" (v1)
+ : "memory", "1");
+}
+
+#else /* CONFIG_CC_IS_CLANG */
+
+static __always_inline void fpu_vl(u8 v1, const void *vxr)
+{
+ instrument_read(vxr, sizeof(__vector128));
+ asm volatile("VL %[v1],%O[vxr],,%R[vxr]\n"
+ :
+ : [vxr] "Q" (*(__vector128 *)vxr),
+ [v1] "I" (v1)
+ : "memory");
+}
+
+#endif /* CONFIG_CC_IS_CLANG */
+
+static __always_inline void fpu_vleib(u8 v, s16 val, u8 index)
+{
+ asm volatile("VLEIB %[v],%[val],%[index]"
+ :
+ : [v] "I" (v), [val] "K" (val), [index] "I" (index)
+ : "memory");
+}
+
+static __always_inline void fpu_vleig(u8 v, s16 val, u8 index)
+{
+ asm volatile("VLEIG %[v],%[val],%[index]"
+ :
+ : [v] "I" (v), [val] "K" (val), [index] "I" (index)
+ : "memory");
+}
+
+static __always_inline u64 fpu_vlgvf(u8 v, u16 index)
+{
+ u64 val;
+
+ asm volatile("VLGVF %[val],%[v],%[index]"
+ : [val] "=d" (val)
+ : [v] "I" (v), [index] "L" (index)
+ : "memory");
+ return val;
+}
+
+#ifdef CONFIG_CC_IS_CLANG
+
+static __always_inline void fpu_vll(u8 v1, u32 index, const void *vxr)
+{
+ unsigned int size;
+
+ size = min(index + 1, sizeof(__vector128));
+ instrument_read(vxr, size);
+ asm volatile("\n"
+ " la 1,%[vxr]\n"
+ " VLL %[v1],%[index],0,1\n"
+ :
+ : [vxr] "R" (*(u8 *)vxr),
+ [index] "d" (index),
+ [v1] "I" (v1)
+ : "memory", "1");
+}
+
+#else /* CONFIG_CC_IS_CLANG */
+
+static __always_inline void fpu_vll(u8 v1, u32 index, const void *vxr)
+{
+ unsigned int size;
+
+ size = min(index + 1, sizeof(__vector128));
+ instrument_read(vxr, size);
+ asm volatile("VLL %[v1],%[index],%O[vxr],%R[vxr]\n"
+ :
+ : [vxr] "Q" (*(u8 *)vxr),
+ [index] "d" (index),
+ [v1] "I" (v1)
+ : "memory");
+}
+
+#endif /* CONFIG_CC_IS_CLANG */
+
+#ifdef CONFIG_CC_IS_CLANG
+
+#define fpu_vlm(_v1, _v3, _vxrs) \
+({ \
+ unsigned int size = ((_v3) - (_v1) + 1) * sizeof(__vector128); \
+ struct { \
+ __vector128 _v[(_v3) - (_v1) + 1]; \
+ } *_v = (void *)(_vxrs); \
+ \
+ instrument_read(_v, size); \
+ asm volatile("\n" \
+ " la 1,%[vxrs]\n" \
+ " VLM %[v1],%[v3],0,1\n" \
+ : \
+ : [vxrs] "R" (*_v), \
+ [v1] "I" (_v1), [v3] "I" (_v3) \
+ : "memory", "1"); \
+ (_v3) - (_v1) + 1; \
+})
+
+#else /* CONFIG_CC_IS_CLANG */
+
+#define fpu_vlm(_v1, _v3, _vxrs) \
+({ \
+ unsigned int size = ((_v3) - (_v1) + 1) * sizeof(__vector128); \
+ struct { \
+ __vector128 _v[(_v3) - (_v1) + 1]; \
+ } *_v = (void *)(_vxrs); \
+ \
+ instrument_read(_v, size); \
+ asm volatile("VLM %[v1],%[v3],%O[vxrs],%R[vxrs]\n" \
+ : \
+ : [vxrs] "Q" (*_v), \
+ [v1] "I" (_v1), [v3] "I" (_v3) \
+ : "memory"); \
+ (_v3) - (_v1) + 1; \
+})
+
+#endif /* CONFIG_CC_IS_CLANG */
+
+static __always_inline void fpu_vlr(u8 v1, u8 v2)
+{
+ asm volatile("VLR %[v1],%[v2]"
+ :
+ : [v1] "I" (v1), [v2] "I" (v2)
+ : "memory");
+}
+
+static __always_inline void fpu_vlvgf(u8 v, u32 val, u16 index)
+{
+ asm volatile("VLVGF %[v],%[val],%[index]"
+ :
+ : [v] "I" (v), [val] "d" (val), [index] "L" (index)
+ : "memory");
+}
+
+static __always_inline void fpu_vn(u8 v1, u8 v2, u8 v3)
+{
+ asm volatile("VN %[v1],%[v2],%[v3]"
+ :
+ : [v1] "I" (v1), [v2] "I" (v2), [v3] "I" (v3)
+ : "memory");
+}
+
+static __always_inline void fpu_vperm(u8 v1, u8 v2, u8 v3, u8 v4)
+{
+ asm volatile("VPERM %[v1],%[v2],%[v3],%[v4]"
+ :
+ : [v1] "I" (v1), [v2] "I" (v2), [v3] "I" (v3), [v4] "I" (v4)
+ : "memory");
+}
+
+static __always_inline void fpu_vrepib(u8 v1, s16 i2)
+{
+ asm volatile("VREPIB %[v1],%[i2]"
+ :
+ : [v1] "I" (v1), [i2] "K" (i2)
+ : "memory");
+}
+
+static __always_inline void fpu_vsrlb(u8 v1, u8 v2, u8 v3)
+{
+ asm volatile("VSRLB %[v1],%[v2],%[v3]"
+ :
+ : [v1] "I" (v1), [v2] "I" (v2), [v3] "I" (v3)
+ : "memory");
+}
+
+#ifdef CONFIG_CC_IS_CLANG
+
+static __always_inline void fpu_vst(u8 v1, const void *vxr)
+{
+ instrument_write(vxr, sizeof(__vector128));
+ asm volatile("\n"
+ " la 1,%[vxr]\n"
+ " VST %[v1],0,,1\n"
+ : [vxr] "=R" (*(__vector128 *)vxr)
+ : [v1] "I" (v1)
+ : "memory", "1");
+}
+
+#else /* CONFIG_CC_IS_CLANG */
+
+static __always_inline void fpu_vst(u8 v1, const void *vxr)
+{
+ instrument_write(vxr, sizeof(__vector128));
+ asm volatile("VST %[v1],%O[vxr],,%R[vxr]\n"
+ : [vxr] "=Q" (*(__vector128 *)vxr)
+ : [v1] "I" (v1)
+ : "memory");
+}
+
+#endif /* CONFIG_CC_IS_CLANG */
+
+#ifdef CONFIG_CC_IS_CLANG
+
+static __always_inline void fpu_vstl(u8 v1, u32 index, const void *vxr)
+{
+ unsigned int size;
+
+ size = min(index + 1, sizeof(__vector128));
+ instrument_write(vxr, size);
+ asm volatile("\n"
+ " la 1,%[vxr]\n"
+ " VSTL %[v1],%[index],0,1\n"
+ : [vxr] "=R" (*(u8 *)vxr)
+ : [index] "d" (index), [v1] "I" (v1)
+ : "memory", "1");
+}
+
+#else /* CONFIG_CC_IS_CLANG */
+
+static __always_inline void fpu_vstl(u8 v1, u32 index, const void *vxr)
+{
+ unsigned int size;
+
+ size = min(index + 1, sizeof(__vector128));
+ instrument_write(vxr, size);
+ asm volatile("VSTL %[v1],%[index],%O[vxr],%R[vxr]\n"
+ : [vxr] "=Q" (*(u8 *)vxr)
+ : [index] "d" (index), [v1] "I" (v1)
+ : "memory");
+}
+
+#endif /* CONFIG_CC_IS_CLANG */
+
+#ifdef CONFIG_CC_IS_CLANG
+
+#define fpu_vstm(_v1, _v3, _vxrs) \
+({ \
+ unsigned int size = ((_v3) - (_v1) + 1) * sizeof(__vector128); \
+ struct { \
+ __vector128 _v[(_v3) - (_v1) + 1]; \
+ } *_v = (void *)(_vxrs); \
+ \
+ instrument_write(_v, size); \
+ asm volatile("\n" \
+ " la 1,%[vxrs]\n" \
+ " VSTM %[v1],%[v3],0,1\n" \
+ : [vxrs] "=R" (*_v) \
+ : [v1] "I" (_v1), [v3] "I" (_v3) \
+ : "memory", "1"); \
+ (_v3) - (_v1) + 1; \
+})
+
+#else /* CONFIG_CC_IS_CLANG */
+
+#define fpu_vstm(_v1, _v3, _vxrs) \
+({ \
+ unsigned int size = ((_v3) - (_v1) + 1) * sizeof(__vector128); \
+ struct { \
+ __vector128 _v[(_v3) - (_v1) + 1]; \
+ } *_v = (void *)(_vxrs); \
+ \
+ instrument_write(_v, size); \
+ asm volatile("VSTM %[v1],%[v3],%O[vxrs],%R[vxrs]\n" \
+ : [vxrs] "=Q" (*_v) \
+ : [v1] "I" (_v1), [v3] "I" (_v3) \
+ : "memory"); \
+ (_v3) - (_v1) + 1; \
+})
+
+#endif /* CONFIG_CC_IS_CLANG */
+
+static __always_inline void fpu_vupllf(u8 v1, u8 v2)
+{
+ asm volatile("VUPLLF %[v1],%[v2]"
+ :
+ : [v1] "I" (v1), [v2] "I" (v2)
+ : "memory");
+}
+
+static __always_inline void fpu_vx(u8 v1, u8 v2, u8 v3)
+{
+ asm volatile("VX %[v1],%[v2],%[v3]"
+ :
+ : [v1] "I" (v1), [v2] "I" (v2), [v3] "I" (v3)
+ : "memory");
+}
+
+static __always_inline void fpu_vzero(u8 v)
+{
+ asm volatile("VZERO %[v]"
+ :
+ : [v] "I" (v)
+ : "memory");
+}
+
+#endif /* __ASSEMBLY__ */
+#endif /* __ASM_S390_FPU_INSN_H */
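
Because the register numbers are immediate operands, loops over these helpers stay plain C, as the header comment promises. A hedged sketch that accumulates a checksum over 16-byte chunks with fpu_vcksm(), assuming VCKSM accumulates into word element 1 and that the code runs inside a kernel FPU section covering V0-V7:

#include <asm/fpu.h>

/* Sketch: running checksum in word element 1 of V0; each VCKSM adds
 * the word sum of V1 to it. Caller must hold kernel_fpu_begin(...,
 * KERNEL_VXR_V0V7) or a superset.
 */
static u32 cksm_chunks_vx(const u8 *buf, size_t chunks)
{
	size_t i;

	fpu_vzero(0);
	for (i = 0; i < chunks; i++) {
		fpu_vl(1, buf + i * 16);
		fpu_vcksm(0, 1, 0);	/* V0.w[1] += cksm(V1) */
	}
	return (u32)fpu_vlgvf(0, 1);
}
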
diff --git a/arch/s390/include/asm/fpu-types.h b/arch/s390/include/asm/fpu-types.h
new file mode 100644
index 000000000000..8d58d5a95399
--- /dev/null
+++ b/arch/s390/include/asm/fpu-types.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * FPU data structures
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ */
+
+#ifndef _ASM_S390_FPU_TYPES_H
+#define _ASM_S390_FPU_TYPES_H
+
+#include <asm/sigcontext.h>
+
+struct fpu {
+ u32 fpc;
+ __vector128 vxrs[__NUM_VXRS] __aligned(8);
+};
+
+struct kernel_fpu_hdr {
+ int mask;
+ u32 fpc;
+};
+
+struct kernel_fpu {
+ struct kernel_fpu_hdr hdr;
+ __vector128 vxrs[] __aligned(8);
+};
+
+#define KERNEL_FPU_STRUCT(vxr_size) \
+struct kernel_fpu_##vxr_size { \
+ struct kernel_fpu_hdr hdr; \
+ __vector128 vxrs[vxr_size] __aligned(8); \
+}
+
+KERNEL_FPU_STRUCT(8);
+KERNEL_FPU_STRUCT(16);
+KERNEL_FPU_STRUCT(32);
+
+#define DECLARE_KERNEL_FPU_ONSTACK(vxr_size, name) \
+ struct kernel_fpu_##vxr_size name __uninitialized
+
+#define DECLARE_KERNEL_FPU_ONSTACK8(name) \
+ DECLARE_KERNEL_FPU_ONSTACK(8, name)
+
+#define DECLARE_KERNEL_FPU_ONSTACK16(name) \
+ DECLARE_KERNEL_FPU_ONSTACK(16, name)
+
+#define DECLARE_KERNEL_FPU_ONSTACK32(name) \
+ DECLARE_KERNEL_FPU_ONSTACK(32, name)
+
+#endif /* _ASM_S390_FPU_TYPES_H */
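
The sized kernel_fpu_N variants exist so the on-stack save area matches the register range a caller requests; kernel_fpu_check_size() in asm/fpu.h (added later in this patch) turns a mismatch into a link error via __kernel_fpu_invalid_size(). Roughly, for illustration:

/* Roughly what DECLARE_KERNEL_FPU_ONSTACK16(vxstate) declares, modulo
 * the __uninitialized annotation: a header plus a 16-entry vector save
 * area, enough for any two 8-register ranges (e.g. KERNEL_VXR_LOW).
 */
struct kernel_fpu_16 vxstate;	/* { struct kernel_fpu_hdr hdr; __vector128 vxrs[16]; } */
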
diff --git a/arch/s390/include/asm/fpu.h b/arch/s390/include/asm/fpu.h
new file mode 100644
index 000000000000..c84cb33913e2
--- /dev/null
+++ b/arch/s390/include/asm/fpu.h
@@ -0,0 +1,295 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * In-kernel FPU support functions
+ *
+ *
+ * Consider these guidelines before using in-kernel FPU functions:
+ *
+ * 1. Use kernel_fpu_begin() and kernel_fpu_end() to enclose all in-kernel
+ * use of floating-point or vector registers and instructions.
+ *
+ * 2. For kernel_fpu_begin(), specify the vector register range you want to
+ * use with the KERNEL_VXR_* constants. Consider these usage guidelines:
+ *
+ * a) If your function typically runs in process-context, use the lower
+ * half of the vector registers, for example, specify KERNEL_VXR_LOW.
+ * b) If your function typically runs in soft-irq or hard-irq context,
+ * prefer using the upper half of the vector registers, for example,
+ * specify KERNEL_VXR_HIGH.
+ *
+ * If you adhere to these guidelines, an interrupted process context
+ * does not need to save and restore vector registers, because the
+ * register ranges are disjoint.
+ *
+ * Also note that the __kernel_fpu_begin()/__kernel_fpu_end() functions
+ * include logic to save and restore up to 16 vector registers at once.
+ *
+ * 3. You can nest kernel_fpu_begin()/kernel_fpu_end() by using different
+ * struct kernel_fpu states. Vector registers that are in use by outer
+ * levels are saved and restored. You can minimize the save and restore
+ * effort by choosing disjoint vector register ranges.
+ *
+ * 4. To use vector floating-point instructions, specify the KERNEL_FPC
+ * flag to save and restore floating-point controls in addition to any
+ * vector register range.
+ *
+ * 5. To use floating-point registers and instructions only, specify the
+ * KERNEL_FPR flag. This flag triggers a save and restore of vector
+ * registers V0 to V15 and floating-point controls.
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ */
+
+#ifndef _ASM_S390_FPU_H
+#define _ASM_S390_FPU_H
+
+#include <linux/processor.h>
+#include <linux/preempt.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <asm/sigcontext.h>
+#include <asm/fpu-types.h>
+#include <asm/fpu-insn.h>
+#include <asm/facility.h>
+
+static inline bool cpu_has_vx(void)
+{
+ return likely(test_facility(129));
+}
+
+enum {
+ KERNEL_FPC_BIT = 0,
+ KERNEL_VXR_V0V7_BIT,
+ KERNEL_VXR_V8V15_BIT,
+ KERNEL_VXR_V16V23_BIT,
+ KERNEL_VXR_V24V31_BIT,
+};
+
+#define KERNEL_FPC BIT(KERNEL_FPC_BIT)
+#define KERNEL_VXR_V0V7 BIT(KERNEL_VXR_V0V7_BIT)
+#define KERNEL_VXR_V8V15 BIT(KERNEL_VXR_V8V15_BIT)
+#define KERNEL_VXR_V16V23 BIT(KERNEL_VXR_V16V23_BIT)
+#define KERNEL_VXR_V24V31 BIT(KERNEL_VXR_V24V31_BIT)
+
+#define KERNEL_VXR_LOW (KERNEL_VXR_V0V7 | KERNEL_VXR_V8V15)
+#define KERNEL_VXR_MID (KERNEL_VXR_V8V15 | KERNEL_VXR_V16V23)
+#define KERNEL_VXR_HIGH (KERNEL_VXR_V16V23 | KERNEL_VXR_V24V31)
+
+#define KERNEL_VXR (KERNEL_VXR_LOW | KERNEL_VXR_HIGH)
+#define KERNEL_FPR (KERNEL_FPC | KERNEL_VXR_LOW)
+
+void load_fpu_state(struct fpu *state, int flags);
+void save_fpu_state(struct fpu *state, int flags);
+void __kernel_fpu_begin(struct kernel_fpu *state, int flags);
+void __kernel_fpu_end(struct kernel_fpu *state, int flags);
+
+static __always_inline void save_vx_regs(__vector128 *vxrs)
+{
+ fpu_vstm(0, 15, &vxrs[0]);
+ fpu_vstm(16, 31, &vxrs[16]);
+}
+
+static __always_inline void load_vx_regs(__vector128 *vxrs)
+{
+ fpu_vlm(0, 15, &vxrs[0]);
+ fpu_vlm(16, 31, &vxrs[16]);
+}
+
+static __always_inline void __save_fp_regs(freg_t *fprs, unsigned int offset)
+{
+ fpu_std(0, &fprs[0 * offset]);
+ fpu_std(1, &fprs[1 * offset]);
+ fpu_std(2, &fprs[2 * offset]);
+ fpu_std(3, &fprs[3 * offset]);
+ fpu_std(4, &fprs[4 * offset]);
+ fpu_std(5, &fprs[5 * offset]);
+ fpu_std(6, &fprs[6 * offset]);
+ fpu_std(7, &fprs[7 * offset]);
+ fpu_std(8, &fprs[8 * offset]);
+ fpu_std(9, &fprs[9 * offset]);
+ fpu_std(10, &fprs[10 * offset]);
+ fpu_std(11, &fprs[11 * offset]);
+ fpu_std(12, &fprs[12 * offset]);
+ fpu_std(13, &fprs[13 * offset]);
+ fpu_std(14, &fprs[14 * offset]);
+ fpu_std(15, &fprs[15 * offset]);
+}
+
+static __always_inline void __load_fp_regs(freg_t *fprs, unsigned int offset)
+{
+ fpu_ld(0, &fprs[0 * offset]);
+ fpu_ld(1, &fprs[1 * offset]);
+ fpu_ld(2, &fprs[2 * offset]);
+ fpu_ld(3, &fprs[3 * offset]);
+ fpu_ld(4, &fprs[4 * offset]);
+ fpu_ld(5, &fprs[5 * offset]);
+ fpu_ld(6, &fprs[6 * offset]);
+ fpu_ld(7, &fprs[7 * offset]);
+ fpu_ld(8, &fprs[8 * offset]);
+ fpu_ld(9, &fprs[9 * offset]);
+ fpu_ld(10, &fprs[10 * offset]);
+ fpu_ld(11, &fprs[11 * offset]);
+ fpu_ld(12, &fprs[12 * offset]);
+ fpu_ld(13, &fprs[13 * offset]);
+ fpu_ld(14, &fprs[14 * offset]);
+ fpu_ld(15, &fprs[15 * offset]);
+}
+
+static __always_inline void save_fp_regs(freg_t *fprs)
+{
+ __save_fp_regs(fprs, sizeof(freg_t) / sizeof(freg_t));
+}
+
+static __always_inline void load_fp_regs(freg_t *fprs)
+{
+ __load_fp_regs(fprs, sizeof(freg_t) / sizeof(freg_t));
+}
+
+static __always_inline void save_fp_regs_vx(__vector128 *vxrs)
+{
+ freg_t *fprs = (freg_t *)&vxrs[0].high;
+
+ __save_fp_regs(fprs, sizeof(__vector128) / sizeof(freg_t));
+}
+
+static __always_inline void load_fp_regs_vx(__vector128 *vxrs)
+{
+ freg_t *fprs = (freg_t *)&vxrs[0].high;
+
+ __load_fp_regs(fprs, sizeof(__vector128) / sizeof(freg_t));
+}
+
+static inline void load_user_fpu_regs(void)
+{
+ struct thread_struct *thread = &current->thread;
+
+ if (!thread->ufpu_flags)
+ return;
+ load_fpu_state(&thread->ufpu, thread->ufpu_flags);
+ thread->ufpu_flags = 0;
+}
+
+static __always_inline void __save_user_fpu_regs(struct thread_struct *thread, int flags)
+{
+ save_fpu_state(&thread->ufpu, flags);
+ __atomic_or(flags, &thread->ufpu_flags);
+}
+
+static inline void save_user_fpu_regs(void)
+{
+ struct thread_struct *thread = &current->thread;
+ int mask, flags;
+
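+ /*
+ * Temporarily flag all registers as in use by the kernel so that a
+ * preempting kernel FPU section saves and restores them, save the user
+ * registers that have not been saved yet, and then restore the previous
+ * kernel-use mask.
+ */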
+ mask = __atomic_or(KERNEL_FPC | KERNEL_VXR, &thread->kfpu_flags);
+ flags = ~READ_ONCE(thread->ufpu_flags) & (KERNEL_FPC | KERNEL_VXR);
+ if (flags)
+ __save_user_fpu_regs(thread, flags);
+ barrier();
+ WRITE_ONCE(thread->kfpu_flags, mask);
+}
+
+static __always_inline void _kernel_fpu_begin(struct kernel_fpu *state, int flags)
+{
+ struct thread_struct *thread = &current->thread;
+ int mask, uflags;
+
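+ /*
+ * Record which registers the enclosing context has in use, make sure the
+ * user state of the requested registers has been saved, and spill
+ * registers that are still in use by an outer kernel FPU section.
+ */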
+ mask = __atomic_or(flags, &thread->kfpu_flags);
+ state->hdr.mask = mask;
+ uflags = READ_ONCE(thread->ufpu_flags);
+ if ((uflags & flags) != flags)
+ __save_user_fpu_regs(thread, ~uflags & flags);
+ if (mask & flags)
+ __kernel_fpu_begin(state, flags);
+}
+
+static __always_inline void _kernel_fpu_end(struct kernel_fpu *state, int flags)
+{
+ int mask = state->hdr.mask;
+
+ if (mask & flags)
+ __kernel_fpu_end(state, flags);
+ barrier();
+ WRITE_ONCE(current->thread.kfpu_flags, mask);
+}
+
+void __kernel_fpu_invalid_size(void);
+
+static __always_inline void kernel_fpu_check_size(int flags, unsigned int size)
+{
+ unsigned int cnt = 0;
+
+ if (flags & KERNEL_VXR_V0V7)
+ cnt += 8;
+ if (flags & KERNEL_VXR_V8V15)
+ cnt += 8;
+ if (flags & KERNEL_VXR_V16V23)
+ cnt += 8;
+ if (flags & KERNEL_VXR_V24V31)
+ cnt += 8;
+ if (cnt != size)
+ __kernel_fpu_invalid_size();
+}
+
+#define kernel_fpu_begin(state, flags) \
+{ \
+ typeof(state) s = (state); \
+ int _flags = (flags); \
+ \
+ kernel_fpu_check_size(_flags, ARRAY_SIZE(s->vxrs)); \
+ _kernel_fpu_begin((struct kernel_fpu *)s, _flags); \
+}
+
+#define kernel_fpu_end(state, flags) \
+{ \
+ typeof(state) s = (state); \
+ int _flags = (flags); \
+ \
+ kernel_fpu_check_size(_flags, ARRAY_SIZE(s->vxrs)); \
+ _kernel_fpu_end((struct kernel_fpu *)s, _flags); \
+}
+
+static inline void save_kernel_fpu_regs(struct thread_struct *thread)
+{
+ if (!thread->kfpu_flags)
+ return;
+ save_fpu_state(&thread->kfpu, thread->kfpu_flags);
+}
+
+static inline void restore_kernel_fpu_regs(struct thread_struct *thread)
+{
+ if (!thread->kfpu_flags)
+ return;
+ load_fpu_state(&thread->kfpu, thread->kfpu_flags);
+}
+
+static inline void convert_vx_to_fp(freg_t *fprs, __vector128 *vxrs)
+{
+ int i;
+
+ for (i = 0; i < __NUM_FPRS; i++)
+ fprs[i].ui = vxrs[i].high;
+}
+
+static inline void convert_fp_to_vx(__vector128 *vxrs, freg_t *fprs)
+{
+ int i;
+
+ for (i = 0; i < __NUM_FPRS; i++)
+ vxrs[i].high = fprs[i].ui;
+}
+
+static inline void fpregs_store(_s390_fp_regs *fpregs, struct fpu *fpu)
+{
+ fpregs->pad = 0;
+ fpregs->fpc = fpu->fpc;
+ convert_vx_to_fp((freg_t *)&fpregs->fprs, fpu->vxrs);
+}
+
+static inline void fpregs_load(_s390_fp_regs *fpregs, struct fpu *fpu)
+{
+ fpu->fpc = fpregs->fpc;
+ convert_fp_to_vx(fpu->vxrs, (freg_t *)&fpregs->fprs);
+}
+
+#endif /* _ASM_S390_FPU_H */
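
A minimal usage sketch of the reworked interface. The on-stack helper
DECLARE_KERNEL_FPU_ONSTACK16() is assumed to come from fpu-types.h, which
is not part of the hunks shown here; the function body is illustrative
only:

	#include <asm/fpu.h>

	static void vx_example(void)
	{
		DECLARE_KERNEL_FPU_ONSTACK16(vxstate);

		kernel_fpu_begin(&vxstate, KERNEL_VXR_LOW);
		/* V0..V15 and the fpu_*() vector helpers may be used here */
		kernel_fpu_end(&vxstate, KERNEL_VXR_LOW);
	}

The kernel_fpu_check_size() test in the macros ties the flags to the size
of the save area: KERNEL_VXR_LOW covers 16 vector registers, matching the
16 vxrs slots declared above.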
diff --git a/arch/s390/include/asm/fpu/api.h b/arch/s390/include/asm/fpu/api.h
deleted file mode 100644
index d6ca8bc6ca68..000000000000
--- a/arch/s390/include/asm/fpu/api.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * In-kernel FPU support functions
- *
- *
- * Consider these guidelines before using in-kernel FPU functions:
- *
- * 1. Use kernel_fpu_begin() and kernel_fpu_end() to enclose all in-kernel
- * use of floating-point or vector registers and instructions.
- *
- * 2. For kernel_fpu_begin(), specify the vector register range you want to
- * use with the KERNEL_VXR_* constants. Consider these usage guidelines:
- *
- * a) If your function typically runs in process-context, use the lower
- * half of the vector registers, for example, specify KERNEL_VXR_LOW.
- * b) If your function typically runs in soft-irq or hard-irq context,
- * prefer using the upper half of the vector registers, for example,
- * specify KERNEL_VXR_HIGH.
- *
- * If you adhere to these guidelines, an interrupted process context
- * does not require to save and restore vector registers because of
- * disjoint register ranges.
- *
- * Also note that the __kernel_fpu_begin()/__kernel_fpu_end() functions
- * includes logic to save and restore up to 16 vector registers at once.
- *
- * 3. You can nest kernel_fpu_begin()/kernel_fpu_end() by using different
- * struct kernel_fpu states. Vector registers that are in use by outer
- * levels are saved and restored. You can minimize the save and restore
- * effort by choosing disjoint vector register ranges.
- *
- * 5. To use vector floating-point instructions, specify the KERNEL_FPC
- * flag to save and restore floating-point controls in addition to any
- * vector register range.
- *
- * 6. To use floating-point registers and instructions only, specify the
- * KERNEL_FPR flag. This flag triggers a save and restore of vector
- * registers V0 to V15 and floating-point controls.
- *
- * Copyright IBM Corp. 2015
- * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
- */
-
-#ifndef _ASM_S390_FPU_API_H
-#define _ASM_S390_FPU_API_H
-
-#include <linux/preempt.h>
-#include <asm/asm-extable.h>
-#include <asm/fpu/internal.h>
-
-void save_fpu_regs(void);
-void load_fpu_regs(void);
-void __load_fpu_regs(void);
-
-/**
- * sfpc_safe - Set floating point control register safely.
- * @fpc: new value for floating point control register
- *
- * Set floating point control register. This may lead to an exception,
- * since a saved value may have been modified by user space (ptrace,
- * signal return, kvm registers) to an invalid value. In such a case
- * set the floating point control register to zero.
- */
-static inline void sfpc_safe(u32 fpc)
-{
- asm volatile("\n"
- "0: sfpc %[fpc]\n"
- "1: nopr %%r7\n"
- ".pushsection .fixup, \"ax\"\n"
- "2: lghi %[fpc],0\n"
- " jg 0b\n"
- ".popsection\n"
- EX_TABLE(1b, 2b)
- : [fpc] "+d" (fpc)
- : : "memory");
-}
-
-#define KERNEL_FPC 1
-#define KERNEL_VXR_V0V7 2
-#define KERNEL_VXR_V8V15 4
-#define KERNEL_VXR_V16V23 8
-#define KERNEL_VXR_V24V31 16
-
-#define KERNEL_VXR_LOW (KERNEL_VXR_V0V7|KERNEL_VXR_V8V15)
-#define KERNEL_VXR_MID (KERNEL_VXR_V8V15|KERNEL_VXR_V16V23)
-#define KERNEL_VXR_HIGH (KERNEL_VXR_V16V23|KERNEL_VXR_V24V31)
-
-#define KERNEL_VXR (KERNEL_VXR_LOW|KERNEL_VXR_HIGH)
-#define KERNEL_FPR (KERNEL_FPC|KERNEL_VXR_LOW)
-
-struct kernel_fpu;
-
-/*
- * Note the functions below must be called with preemption disabled.
- * Do not enable preemption before calling __kernel_fpu_end() to prevent
- * an corruption of an existing kernel FPU state.
- *
- * Prefer using the kernel_fpu_begin()/kernel_fpu_end() pair of functions.
- */
-void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags);
-void __kernel_fpu_end(struct kernel_fpu *state, u32 flags);
-
-
-static inline void kernel_fpu_begin(struct kernel_fpu *state, u32 flags)
-{
- preempt_disable();
- state->mask = S390_lowcore.fpu_flags;
- if (!test_cpu_flag(CIF_FPU))
- /* Save user space FPU state and register contents */
- save_fpu_regs();
- else if (state->mask & flags)
- /* Save FPU/vector register in-use by the kernel */
- __kernel_fpu_begin(state, flags);
- S390_lowcore.fpu_flags |= flags;
-}
-
-static inline void kernel_fpu_end(struct kernel_fpu *state, u32 flags)
-{
- S390_lowcore.fpu_flags = state->mask;
- if (state->mask & flags)
- /* Restore FPU/vector register in-use by the kernel */
- __kernel_fpu_end(state, flags);
- preempt_enable();
-}
-
-#endif /* _ASM_S390_FPU_API_H */
diff --git a/arch/s390/include/asm/fpu/internal.h b/arch/s390/include/asm/fpu/internal.h
deleted file mode 100644
index d511c4cf5afb..000000000000
--- a/arch/s390/include/asm/fpu/internal.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * FPU state and register content conversion primitives
- *
- * Copyright IBM Corp. 2015
- * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
- */
-
-#ifndef _ASM_S390_FPU_INTERNAL_H
-#define _ASM_S390_FPU_INTERNAL_H
-
-#include <linux/string.h>
-#include <asm/facility.h>
-#include <asm/fpu/types.h>
-
-static inline bool cpu_has_vx(void)
-{
- return likely(test_facility(129));
-}
-
-static inline void save_vx_regs(__vector128 *vxrs)
-{
- asm volatile(
- " la 1,%0\n"
- " .word 0xe70f,0x1000,0x003e\n" /* vstm 0,15,0(1) */
- " .word 0xe70f,0x1100,0x0c3e\n" /* vstm 16,31,256(1) */
- : "=Q" (*(struct vx_array *) vxrs) : : "1");
-}
-
-static inline void convert_vx_to_fp(freg_t *fprs, __vector128 *vxrs)
-{
- int i;
-
- for (i = 0; i < __NUM_FPRS; i++)
- fprs[i].ui = vxrs[i].high;
-}
-
-static inline void convert_fp_to_vx(__vector128 *vxrs, freg_t *fprs)
-{
- int i;
-
- for (i = 0; i < __NUM_FPRS; i++)
- vxrs[i].high = fprs[i].ui;
-}
-
-static inline void fpregs_store(_s390_fp_regs *fpregs, struct fpu *fpu)
-{
- fpregs->pad = 0;
- fpregs->fpc = fpu->fpc;
- if (cpu_has_vx())
- convert_vx_to_fp((freg_t *)&fpregs->fprs, fpu->vxrs);
- else
- memcpy((freg_t *)&fpregs->fprs, fpu->fprs,
- sizeof(fpregs->fprs));
-}
-
-static inline void fpregs_load(_s390_fp_regs *fpregs, struct fpu *fpu)
-{
- fpu->fpc = fpregs->fpc;
- if (cpu_has_vx())
- convert_fp_to_vx(fpu->vxrs, (freg_t *)&fpregs->fprs);
- else
- memcpy(fpu->fprs, (freg_t *)&fpregs->fprs,
- sizeof(fpregs->fprs));
-}
-
-#endif /* _ASM_S390_FPU_INTERNAL_H */
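
The conversion helpers removed here (and re-added in fpu.h above) rely on
the architected register overlay: floating-point register i occupies the
high doubleword of vector register i. A small illustration with a made-up
value:

	__vector128 vr = { .high = 0x3ff0000000000000ULL, .low = 0 };
	freg_t fpr;

	fpr.ui = vr.high;	/* fpr now holds the double value 1.0 */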
diff --git a/arch/s390/include/asm/fpu/types.h b/arch/s390/include/asm/fpu/types.h
deleted file mode 100644
index d889e9436865..000000000000
--- a/arch/s390/include/asm/fpu/types.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * FPU data structures
- *
- * Copyright IBM Corp. 2015
- * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
- */
-
-#ifndef _ASM_S390_FPU_TYPES_H
-#define _ASM_S390_FPU_TYPES_H
-
-#include <asm/sigcontext.h>
-
-struct fpu {
- __u32 fpc; /* Floating-point control */
- void *regs; /* Pointer to the current save area */
- union {
- /* Floating-point register save area */
- freg_t fprs[__NUM_FPRS];
- /* Vector register save area */
- __vector128 vxrs[__NUM_VXRS];
- };
-};
-
-/* VX array structure for address operand constraints in inline assemblies */
-struct vx_array { __vector128 _[__NUM_VXRS]; };
-
-/* In-kernel FPU state structure */
-struct kernel_fpu {
- u32 mask;
- u32 fpc;
- union {
- freg_t fprs[__NUM_FPRS];
- __vector128 vxrs[__NUM_VXRS];
- };
-};
-
-#endif /* _ASM_S390_FPU_TYPES_H */
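
The replacement types live in fpu-types.h, which is not part of the hunks
shown here. Judging from the state->hdr.mask and state->hdr.fpc accesses in
the new code, the fixed-size union gives way to a header plus a sized
vector save area, roughly (a sketch, not the verbatim header):

	struct kernel_fpu_hdr {
		int		mask;
		u32		fpc;
	};

	struct kernel_fpu_16 {		/* one of several sized variants */
		struct kernel_fpu_hdr	hdr;
		__vector128		vxrs[16];
	};

Any sized variant can be passed to the kernel_fpu_begin()/kernel_fpu_end()
macros, which cast it to struct kernel_fpu after checking that
ARRAY_SIZE(s->vxrs) matches the requested register flags.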
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 52664105a473..95990461888f 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -23,7 +23,7 @@
#include <linux/mmu_notifier.h>
#include <asm/debug.h>
#include <asm/cpu.h>
-#include <asm/fpu/api.h>
+#include <asm/fpu.h>
#include <asm/isc.h>
#include <asm/guarded_storage.h>
@@ -743,7 +743,6 @@ struct kvm_vcpu_arch {
struct kvm_s390_sie_block *vsie_block;
unsigned int host_acrs[NUM_ACRS];
struct gs_cb *host_gscb;
- struct fpu host_fpregs;
struct kvm_s390_local_interrupt local_int;
struct hrtimer ckc_timer;
struct kvm_s390_pgm_info pgm;
@@ -765,6 +764,8 @@ struct kvm_vcpu_arch {
__u64 cputm_start;
bool gs_enabled;
bool skey_enabled;
+ /* Indicates whether the access registers have been loaded from the guest */
+ bool acrs_loaded;
struct kvm_s390_pv_vcpu pv;
union diag318_info diag318_info;
};
diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index 5dc1b6345006..8c5f16857539 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -157,7 +157,7 @@ struct lowcore {
__s32 preempt_count; /* 0x03a8 */
__u32 spinlock_lockval; /* 0x03ac */
__u32 spinlock_index; /* 0x03b0 */
- __u32 fpu_flags; /* 0x03b4 */
+ __u8 pad_0x03b4[0x03b8-0x03b4]; /* 0x03b4 */
__u64 percpu_offset; /* 0x03b8 */
__u8 pad_0x03c0[0x03c8-0x03c0]; /* 0x03c0 */
__u64 machine_flags; /* 0x03c8 */
diff --git a/arch/s390/include/asm/pai.h b/arch/s390/include/asm/pai.h
index 7d1888e3dee6..3f609565734b 100644
--- a/arch/s390/include/asm/pai.h
+++ b/arch/s390/include/asm/pai.h
@@ -16,7 +16,7 @@ struct qpaci_info_block {
u64 header;
struct {
u64 : 8;
- u64 num_cc : 8; /* # of supported crypto counters */
+ u64 num_cc : 8; /* # of supported crypto counters */
u64 : 9;
u64 num_nnpa : 7; /* # of supported NNPA counters */
u64 : 32;
@@ -81,4 +81,5 @@ enum paievt_mode {
PAI_MODE_COUNTING,
};
+#define PAI_SAVE_AREA(x) ((x)->hw.event_base)
#endif
diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index e91cd6bbc330..30820a649e6e 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -122,6 +122,7 @@ struct zpci_dev {
struct rcu_head rcu;
struct hotplug_slot hotplug_slot;
+ struct mutex state_lock; /* protect state changes */
enum zpci_state state;
u32 fid; /* function ID, used by sclp */
u32 fh; /* function handle, used by insn's */
@@ -142,7 +143,6 @@ struct zpci_dev {
u8 reserved : 2;
unsigned int devfn; /* DEVFN part of the RID*/
- struct mutex lock;
u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */
u32 uid; /* user defined id */
u8 util_str[CLP_UTIL_STR_LEN]; /* utility string */
@@ -170,6 +170,7 @@ struct zpci_dev {
u64 dma_mask; /* DMA address space mask */
/* Function measurement block */
+ struct mutex fmb_lock;
struct zpci_fmb *fmb;
u16 fmb_update; /* update interval */
u16 fmb_length;
diff --git a/arch/s390/include/asm/physmem_info.h b/arch/s390/include/asm/physmem_info.h
index 9e41a74fce9a..e747b067f8db 100644
--- a/arch/s390/include/asm/physmem_info.h
+++ b/arch/s390/include/asm/physmem_info.h
@@ -22,6 +22,7 @@ enum reserved_range_type {
RR_DECOMPRESSOR,
RR_INITRD,
RR_VMLINUX,
+ RR_RELOC,
RR_AMODE31,
RR_IPLREPORT,
RR_CERT_COMP_LIST,
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index c0b6e74d899a..7cf00cf8fb0b 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -15,13 +15,11 @@
#include <linux/bits.h>
#define CIF_NOHZ_DELAY 2 /* delay HZ disable for a tick */
-#define CIF_FPU 3 /* restore FPU registers */
#define CIF_ENABLED_WAIT 5 /* in enabled wait state */
#define CIF_MCCK_GUEST 6 /* machine check happening in guest */
#define CIF_DEDICATED_CPU 7 /* this CPU is dedicated */
#define _CIF_NOHZ_DELAY BIT(CIF_NOHZ_DELAY)
-#define _CIF_FPU BIT(CIF_FPU)
#define _CIF_ENABLED_WAIT BIT(CIF_ENABLED_WAIT)
#define _CIF_MCCK_GUEST BIT(CIF_MCCK_GUEST)
#define _CIF_DEDICATED_CPU BIT(CIF_DEDICATED_CPU)
@@ -33,13 +31,12 @@
#include <linux/cpumask.h>
#include <linux/linkage.h>
#include <linux/irqflags.h>
+#include <asm/fpu-types.h>
#include <asm/cpu.h>
#include <asm/page.h>
#include <asm/ptrace.h>
#include <asm/setup.h>
#include <asm/runtime_instr.h>
-#include <asm/fpu/types.h>
-#include <asm/fpu/internal.h>
#include <asm/irqflags.h>
typedef long (*sys_call_ptr_t)(struct pt_regs *regs);
@@ -169,6 +166,8 @@ struct thread_struct {
unsigned int gmap_write_flag; /* gmap fault write indication */
unsigned int gmap_int_code; /* int code of last gmap fault */
unsigned int gmap_pfault; /* signal of a pending guest pfault */
+ int ufpu_flags; /* user fpu flags */
+ int kfpu_flags; /* kernel fpu flags */
/* Per-thread information related to debugging */
struct per_regs per_user; /* User specified PER registers */
@@ -184,7 +183,8 @@ struct thread_struct {
struct gs_cb *gs_cb; /* Current guarded storage cb */
struct gs_cb *gs_bc_cb; /* Broadcast guarded storage cb */
struct pgm_tdb trap_tdb; /* Transaction abort diagnose block */
- struct fpu fpu; /* FP and VX register save area */
+ struct fpu ufpu; /* User FP and VX register save area */
+ struct fpu kfpu; /* Kernel FP and VX register save area */
};
/* Flag to disable transactions. */
@@ -203,7 +203,6 @@ typedef struct thread_struct thread_struct;
#define INIT_THREAD { \
.ksp = sizeof(init_stack) + (unsigned long) &init_stack, \
- .fpu.regs = (void *) init_task.thread.fpu.fprs, \
.last_break = 1, \
}
diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h
index d28bf8fb2799..788bc4467445 100644
--- a/arch/s390/include/asm/ptrace.h
+++ b/arch/s390/include/asm/ptrace.h
@@ -203,6 +203,10 @@ static inline int test_and_clear_pt_regs_flag(struct pt_regs *regs, int flag)
return ret;
}
+struct task_struct;
+
+void update_cr_regs(struct task_struct *task);
+
/*
* These are defined as per linux/ptrace.h, which see.
*/
diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h
index 31ec4f545e03..433fde85b14e 100644
--- a/arch/s390/include/asm/stacktrace.h
+++ b/arch/s390/include/asm/stacktrace.h
@@ -4,7 +4,6 @@
#include <linux/uaccess.h>
#include <linux/ptrace.h>
-#include <asm/switch_to.h>
struct stack_frame_user {
unsigned long back_chain;
diff --git a/arch/s390/include/asm/switch_to.h b/arch/s390/include/asm/switch_to.h
deleted file mode 100644
index c61b2cc1a8a8..000000000000
--- a/arch/s390/include/asm/switch_to.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright IBM Corp. 1999, 2009
- *
- * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
- */
-
-#ifndef __ASM_SWITCH_TO_H
-#define __ASM_SWITCH_TO_H
-
-#include <linux/thread_info.h>
-#include <asm/fpu/api.h>
-#include <asm/ptrace.h>
-#include <asm/guarded_storage.h>
-
-extern struct task_struct *__switch_to(void *, void *);
-extern void update_cr_regs(struct task_struct *task);
-
-static inline void save_access_regs(unsigned int *acrs)
-{
- typedef struct { int _[NUM_ACRS]; } acrstype;
-
- asm volatile("stam 0,15,%0" : "=Q" (*(acrstype *)acrs));
-}
-
-static inline void restore_access_regs(unsigned int *acrs)
-{
- typedef struct { int _[NUM_ACRS]; } acrstype;
-
- asm volatile("lam 0,15,%0" : : "Q" (*(acrstype *)acrs));
-}
-
-#define switch_to(prev, next, last) do { \
- /* save_fpu_regs() sets the CIF_FPU flag, which enforces \
- * a restore of the floating point / vector registers as \
- * soon as the next task returns to user space \
- */ \
- save_fpu_regs(); \
- save_access_regs(&prev->thread.acrs[0]); \
- save_ri_cb(prev->thread.ri_cb); \
- save_gs_cb(prev->thread.gs_cb); \
- update_cr_regs(next); \
- restore_access_regs(&next->thread.acrs[0]); \
- restore_ri_cb(next->thread.ri_cb, prev->thread.ri_cb); \
- restore_gs_cb(next->thread.gs_cb); \
- prev = __switch_to(prev, next); \
-} while (0)
-
-#endif /* __ASM_SWITCH_TO_H */
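
The work done by the removed switch_to() macro moves into a C function
wrapping the renamed assembler routine; the real implementation lives in
arch/s390/kernel/process.c, which is not part of the hunks shown here. A
rough sketch, inferred from the helpers introduced in this series:

	struct task_struct *__switch_to(struct task_struct *prev,
					struct task_struct *next)
	{
		save_user_fpu_regs();
		save_kernel_fpu_regs(&prev->thread);
		save_access_regs(&prev->thread.acrs[0]);
		save_ri_cb(prev->thread.ri_cb);
		save_gs_cb(prev->thread.gs_cb);
		update_cr_regs(next);
		restore_kernel_fpu_regs(&next->thread);
		restore_access_regs(&next->thread.acrs[0]);
		restore_ri_cb(next->thread.ri_cb, prev->thread.ri_cb);
		restore_gs_cb(next->thread.gs_cb);
		return __switch_to_asm(prev, next);
	}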
diff --git a/arch/s390/include/asm/vx-insn.h b/arch/s390/include/asm/vx-insn.h
deleted file mode 100644
index 8c188f1c6d27..000000000000
--- a/arch/s390/include/asm/vx-insn.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Support for Vector Instructions
- *
- * This wrapper header file allows to use the vector instruction macros in
- * both assembler files as well as in inline assemblies in C files.
- */
-
-#ifndef __ASM_S390_VX_INSN_H
-#define __ASM_S390_VX_INSN_H
-
-#include <asm/vx-insn-asm.h>
-
-#ifndef __ASSEMBLY__
-
-asm(".include \"asm/vx-insn-asm.h\"\n");
-
-#endif /* __ASSEMBLY__ */
-#endif /* __ASM_S390_VX_INSN_H */
diff --git a/arch/s390/kernel/cache.c b/arch/s390/kernel/cache.c
index 56254fa06f99..4f2669030220 100644
--- a/arch/s390/kernel/cache.c
+++ b/arch/s390/kernel/cache.c
@@ -166,5 +166,6 @@ int populate_cache_leaves(unsigned int cpu)
ci_leaf_init(this_leaf++, pvt, ctype, level, cpu);
}
}
+ this_cpu_ci->cpu_map_populated = true;
return 0;
}
diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index f8fc6c25d051..1942e2a9f8db 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -24,12 +24,12 @@
#include <linux/tty.h>
#include <linux/personality.h>
#include <linux/binfmts.h>
+#include <asm/access-regs.h>
#include <asm/ucontext.h>
#include <linux/uaccess.h>
#include <asm/lowcore.h>
-#include <asm/switch_to.h>
#include <asm/vdso.h>
-#include <asm/fpu/api.h>
+#include <asm/fpu.h>
#include "compat_linux.h"
#include "compat_ptrace.h"
#include "entry.h"
@@ -56,7 +56,7 @@ typedef struct
static void store_sigregs(void)
{
save_access_regs(current->thread.acrs);
- save_fpu_regs();
+ save_user_fpu_regs();
}
/* Load registers after signal return */
@@ -79,7 +79,7 @@ static int save_sigregs32(struct pt_regs *regs, _sigregs32 __user *sregs)
user_sregs.regs.gprs[i] = (__u32) regs->gprs[i];
memcpy(&user_sregs.regs.acrs, current->thread.acrs,
sizeof(user_sregs.regs.acrs));
- fpregs_store((_s390_fp_regs *) &user_sregs.fpregs, &current->thread.fpu);
+ fpregs_store((_s390_fp_regs *) &user_sregs.fpregs, &current->thread.ufpu);
if (__copy_to_user(sregs, &user_sregs, sizeof(_sigregs32)))
return -EFAULT;
return 0;
@@ -113,7 +113,7 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs)
regs->gprs[i] = (__u64) user_sregs.regs.gprs[i];
memcpy(&current->thread.acrs, &user_sregs.regs.acrs,
sizeof(current->thread.acrs));
- fpregs_load((_s390_fp_regs *) &user_sregs.fpregs, &current->thread.fpu);
+ fpregs_load((_s390_fp_regs *)&user_sregs.fpregs, &current->thread.ufpu);
clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */
return 0;
@@ -136,11 +136,11 @@ static int save_sigregs_ext32(struct pt_regs *regs,
/* Save vector registers to signal stack */
if (cpu_has_vx()) {
for (i = 0; i < __NUM_VXRS_LOW; i++)
- vxrs[i] = current->thread.fpu.vxrs[i].low;
+ vxrs[i] = current->thread.ufpu.vxrs[i].low;
if (__copy_to_user(&sregs_ext->vxrs_low, vxrs,
sizeof(sregs_ext->vxrs_low)) ||
__copy_to_user(&sregs_ext->vxrs_high,
- current->thread.fpu.vxrs + __NUM_VXRS_LOW,
+ current->thread.ufpu.vxrs + __NUM_VXRS_LOW,
sizeof(sregs_ext->vxrs_high)))
return -EFAULT;
}
@@ -165,12 +165,12 @@ static int restore_sigregs_ext32(struct pt_regs *regs,
if (cpu_has_vx()) {
if (__copy_from_user(vxrs, &sregs_ext->vxrs_low,
sizeof(sregs_ext->vxrs_low)) ||
- __copy_from_user(current->thread.fpu.vxrs + __NUM_VXRS_LOW,
+ __copy_from_user(current->thread.ufpu.vxrs + __NUM_VXRS_LOW,
&sregs_ext->vxrs_high,
sizeof(sregs_ext->vxrs_high)))
return -EFAULT;
for (i = 0; i < __NUM_VXRS_LOW; i++)
- current->thread.fpu.vxrs[i].low = vxrs[i];
+ current->thread.ufpu.vxrs[i].low = vxrs[i];
}
return 0;
}
@@ -184,7 +184,7 @@ COMPAT_SYSCALL_DEFINE0(sigreturn)
if (get_compat_sigset(&set, (compat_sigset_t __user *)frame->sc.oldmask))
goto badframe;
set_current_blocked(&set);
- save_fpu_regs();
+ save_user_fpu_regs();
if (restore_sigregs32(regs, &frame->sregs))
goto badframe;
if (restore_sigregs_ext32(regs, &frame->sregs_ext))
@@ -207,7 +207,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn)
set_current_blocked(&set);
if (compat_restore_altstack(&frame->uc.uc_stack))
goto badframe;
- save_fpu_regs();
+ save_user_fpu_regs();
if (restore_sigregs32(regs, &frame->uc.uc_mcontext))
goto badframe;
if (restore_sigregs_ext32(regs, &frame->uc.uc_mcontext_ext))
diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index 5c46c2659305..d09ebb6f5262 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -22,7 +22,7 @@
#include <asm/ipl.h>
#include <asm/sclp.h>
#include <asm/maccess.h>
-#include <asm/fpu/api.h>
+#include <asm/fpu.h>
#define PTR_ADD(x, y) (((char *) (x)) + ((unsigned long) (y)))
#define PTR_SUB(x, y) (((char *) (x)) - ((unsigned long) (y)))
diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c
index 92fdc35f028c..8dee9aa0ec95 100644
--- a/arch/s390/kernel/diag.c
+++ b/arch/s390/kernel/diag.c
@@ -147,11 +147,40 @@ void notrace diag_stat_inc_norecursion(enum diag_stat_enum nr)
EXPORT_SYMBOL(diag_stat_inc_norecursion);
/*
+ * Diagnose 0c: Pseudo Timer
+ */
+void diag0c(struct hypfs_diag0c_entry *data)
+{
+ diag_stat_inc(DIAG_STAT_X00C);
+ diag_amode31_ops.diag0c(virt_to_phys(data));
+}
+
+/*
* Diagnose 14: Input spool file manipulation
+ *
+ * The subcode parameter determines the type of the first parameter rx.
+ * Currently, the following three subcommands are used:
+ * 0x0: Read the Next Spool File Buffer (Data Record)
+ * 0x28: Position a Spool File to the Designated Record
+ * 0xfff: Retrieve Next File Descriptor
+ *
+ * For subcommands 0x0 and 0xfff, the value of the first parameter is
+ * the virtual address of a memory buffer and requires virtual-to-physical
+ * address translation. For all other subcommands, the rx parameter is
+ * not a virtual address.
*/
int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode)
{
diag_stat_inc(DIAG_STAT_X014);
+ switch (subcode) {
+ case 0x0:
+ case 0xfff:
+ rx = virt_to_phys((void *)rx);
+ break;
+ default:
+ /* Do nothing */
+ break;
+ }
return diag_amode31_ops.diag14(rx, ry1, subcode);
}
EXPORT_SYMBOL(diag14);
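
A hypothetical caller no longer needs to translate the buffer address
itself for these subcodes (buffer and spool_id are made-up names):

	rc = diag14((unsigned long)buffer, spool_id, 0x0); /* read next record */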
@@ -265,6 +294,6 @@ EXPORT_SYMBOL(diag224);
int diag26c(void *req, void *resp, enum diag26c_sc subcode)
{
diag_stat_inc(DIAG_STAT_X26C);
- return diag_amode31_ops.diag26c(req, resp, subcode);
+ return diag_amode31_ops.diag26c(virt_to_phys(req), virt_to_phys(resp), subcode);
}
EXPORT_SYMBOL(diag26c);
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index 2345ea332b97..c666271433fb 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -19,8 +19,10 @@
#include <linux/kernel.h>
#include <asm/asm-extable.h>
#include <linux/memblock.h>
+#include <asm/access-regs.h>
#include <asm/diag.h>
#include <asm/ebcdic.h>
+#include <asm/fpu.h>
#include <asm/ipl.h>
#include <asm/lowcore.h>
#include <asm/processor.h>
@@ -31,7 +33,6 @@
#include <asm/sclp.h>
#include <asm/facility.h>
#include <asm/boot_data.h>
-#include <asm/switch_to.h>
#include "entry.h"
#define decompressor_handled_param(param) \
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 49a11f6dd7ae..fc5277eab554 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -24,7 +24,7 @@
#include <asm/page.h>
#include <asm/sigp.h>
#include <asm/irq.h>
-#include <asm/vx-insn.h>
+#include <asm/fpu-insn.h>
#include <asm/setup.h>
#include <asm/nmi.h>
#include <asm/nospec-insn.h>
@@ -171,13 +171,13 @@ _LPP_OFFSET = __LC_LPP
nop 0
/*
- * Scheduler resume function, called by switch_to
- * gpr2 = (task_struct *) prev
- * gpr3 = (task_struct *) next
+ * Scheduler resume function, called by __switch_to
+ * gpr2 = (task_struct *)prev
+ * gpr3 = (task_struct *)next
* Returns:
* gpr2 = prev
*/
-SYM_FUNC_START(__switch_to)
+SYM_FUNC_START(__switch_to_asm)
stmg %r6,%r15,__SF_GPRS(%r15) # store gprs of prev task
lghi %r4,__TASK_stack
lghi %r1,__TASK_thread
@@ -193,7 +193,7 @@ SYM_FUNC_START(__switch_to)
lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task
ALTERNATIVE "nop", "lpp _LPP_OFFSET", 40
BR_EX %r14
-SYM_FUNC_END(__switch_to)
+SYM_FUNC_END(__switch_to_asm)
#if IS_ENABLED(CONFIG_KVM)
/*
@@ -220,8 +220,6 @@ SYM_FUNC_START(__sie64a)
oi __SIE_PROG0C+3(%r14),1 # we are going into SIE now
tm __SIE_PROG20+3(%r14),3 # last exit...
jnz .Lsie_skip
- TSTMSK __LC_CPU_FLAGS,_CIF_FPU
- jo .Lsie_skip # exit if fp/vx regs changed
lg %r14,__SF_SIE_CONTROL_PHYS(%r15) # get sie block phys addr
BPEXIT __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST
.Lsie_entry:
@@ -489,16 +487,11 @@ SYM_FUNC_END(psw_idle)
*/
SYM_CODE_START(mcck_int_handler)
BPOFF
- la %r1,4095 # validate r1
- spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # validate cpu timer
- LBEAR __LC_LAST_BREAK_SAVE_AREA-4095(%r1) # validate bear
- lmg %r0,%r15,__LC_GPREGS_SAVE_AREA # validate gprs
lmg %r8,%r9,__LC_MCK_OLD_PSW
TSTMSK __LC_MCCK_CODE,MCCK_CODE_SYSTEM_DAMAGE
jo .Lmcck_panic # yes -> rest of mcck code invalid
TSTMSK __LC_MCCK_CODE,MCCK_CODE_CR_VALID
jno .Lmcck_panic # control registers invalid -> panic
- lctlg %c0,%c15,__LC_CREGS_SAVE_AREA # validate ctl regs
ptlb
lghi %r14,__LC_CPU_TIMER_SAVE_AREA
mvc __LC_MCCK_ENTER_TIMER(8),0(%r14)
diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h
index 9f41853f36b9..21969520f947 100644
--- a/arch/s390/kernel/entry.h
+++ b/arch/s390/kernel/entry.h
@@ -19,6 +19,7 @@ void mcck_int_handler(void);
void restart_int_handler(void);
void early_pgm_check_handler(void);
+struct task_struct *__switch_to_asm(struct task_struct *prev, struct task_struct *next);
void __ret_from_fork(struct task_struct *prev, struct pt_regs *regs);
void __do_pgm_check(struct pt_regs *regs);
void __do_syscall(struct pt_regs *regs, int per_trap);
diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c
index a4f3449cc814..fa90bbdc5ef9 100644
--- a/arch/s390/kernel/fpu.c
+++ b/arch/s390/kernel/fpu.c
@@ -8,256 +8,186 @@
#include <linux/kernel.h>
#include <linux/cpu.h>
#include <linux/sched.h>
-#include <asm/fpu/types.h>
-#include <asm/fpu/api.h>
-#include <asm/vx-insn.h>
+#include <asm/fpu.h>
-void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags)
+void __kernel_fpu_begin(struct kernel_fpu *state, int flags)
{
+ __vector128 *vxrs = state->vxrs;
+ int mask;
+
/*
* Limit the save to the FPU/vector registers already
- * in use by the previous context
+ * in use by the previous context.
*/
- flags &= state->mask;
-
+ flags &= state->hdr.mask;
if (flags & KERNEL_FPC)
- /* Save floating point control */
- asm volatile("stfpc %0" : "=Q" (state->fpc));
-
+ fpu_stfpc(&state->hdr.fpc);
if (!cpu_has_vx()) {
- if (flags & KERNEL_VXR_V0V7) {
- /* Save floating-point registers */
- asm volatile("std 0,%0" : "=Q" (state->fprs[0]));
- asm volatile("std 1,%0" : "=Q" (state->fprs[1]));
- asm volatile("std 2,%0" : "=Q" (state->fprs[2]));
- asm volatile("std 3,%0" : "=Q" (state->fprs[3]));
- asm volatile("std 4,%0" : "=Q" (state->fprs[4]));
- asm volatile("std 5,%0" : "=Q" (state->fprs[5]));
- asm volatile("std 6,%0" : "=Q" (state->fprs[6]));
- asm volatile("std 7,%0" : "=Q" (state->fprs[7]));
- asm volatile("std 8,%0" : "=Q" (state->fprs[8]));
- asm volatile("std 9,%0" : "=Q" (state->fprs[9]));
- asm volatile("std 10,%0" : "=Q" (state->fprs[10]));
- asm volatile("std 11,%0" : "=Q" (state->fprs[11]));
- asm volatile("std 12,%0" : "=Q" (state->fprs[12]));
- asm volatile("std 13,%0" : "=Q" (state->fprs[13]));
- asm volatile("std 14,%0" : "=Q" (state->fprs[14]));
- asm volatile("std 15,%0" : "=Q" (state->fprs[15]));
- }
+ if (flags & KERNEL_VXR_LOW)
+ save_fp_regs_vx(vxrs);
return;
}
-
- /* Test and save vector registers */
- asm volatile (
- /*
- * Test if any vector register must be saved and, if so,
- * test if all register can be saved.
- */
- " la 1,%[vxrs]\n" /* load save area */
- " tmll %[m],30\n" /* KERNEL_VXR */
- " jz 7f\n" /* no work -> done */
- " jo 5f\n" /* -> save V0..V31 */
- /*
- * Test for special case KERNEL_FPU_MID only. In this
- * case a vstm V8..V23 is the best instruction
- */
- " chi %[m],12\n" /* KERNEL_VXR_MID */
- " jne 0f\n" /* -> save V8..V23 */
- " VSTM 8,23,128,1\n" /* vstm %v8,%v23,128(%r1) */
- " j 7f\n"
- /* Test and save the first half of 16 vector registers */
- "0: tmll %[m],6\n" /* KERNEL_VXR_LOW */
- " jz 3f\n" /* -> KERNEL_VXR_HIGH */
- " jo 2f\n" /* 11 -> save V0..V15 */
- " brc 2,1f\n" /* 10 -> save V8..V15 */
- " VSTM 0,7,0,1\n" /* vstm %v0,%v7,0(%r1) */
- " j 3f\n"
- "1: VSTM 8,15,128,1\n" /* vstm %v8,%v15,128(%r1) */
- " j 3f\n"
- "2: VSTM 0,15,0,1\n" /* vstm %v0,%v15,0(%r1) */
- /* Test and save the second half of 16 vector registers */
- "3: tmll %[m],24\n" /* KERNEL_VXR_HIGH */
- " jz 7f\n"
- " jo 6f\n" /* 11 -> save V16..V31 */
- " brc 2,4f\n" /* 10 -> save V24..V31 */
- " VSTM 16,23,256,1\n" /* vstm %v16,%v23,256(%r1) */
- " j 7f\n"
- "4: VSTM 24,31,384,1\n" /* vstm %v24,%v31,384(%r1) */
- " j 7f\n"
- "5: VSTM 0,15,0,1\n" /* vstm %v0,%v15,0(%r1) */
- "6: VSTM 16,31,256,1\n" /* vstm %v16,%v31,256(%r1) */
- "7:"
- : [vxrs] "=Q" (*(struct vx_array *) &state->vxrs)
- : [m] "d" (flags)
- : "1", "cc");
+ mask = flags & KERNEL_VXR;
+ if (mask == KERNEL_VXR) {
+ vxrs += fpu_vstm(0, 15, vxrs);
+ vxrs += fpu_vstm(16, 31, vxrs);
+ return;
+ }
+ if (mask == KERNEL_VXR_MID) {
+ vxrs += fpu_vstm(8, 23, vxrs);
+ return;
+ }
+ mask = flags & KERNEL_VXR_LOW;
+ if (mask) {
+ if (mask == KERNEL_VXR_LOW)
+ vxrs += fpu_vstm(0, 15, vxrs);
+ else if (mask == KERNEL_VXR_V0V7)
+ vxrs += fpu_vstm(0, 7, vxrs);
+ else
+ vxrs += fpu_vstm(8, 15, vxrs);
+ }
+ mask = flags & KERNEL_VXR_HIGH;
+ if (mask) {
+ if (mask == KERNEL_VXR_HIGH)
+ vxrs += fpu_vstm(16, 31, vxrs);
+ else if (mask == KERNEL_VXR_V16V23)
+ vxrs += fpu_vstm(16, 23, vxrs);
+ else
+ vxrs += fpu_vstm(24, 31, vxrs);
+ }
}
EXPORT_SYMBOL(__kernel_fpu_begin);
-void __kernel_fpu_end(struct kernel_fpu *state, u32 flags)
+void __kernel_fpu_end(struct kernel_fpu *state, int flags)
{
+ __vector128 *vxrs = state->vxrs;
+ int mask;
+
/*
* Limit the restore to the FPU/vector registers of the
- * previous context that have been overwritte by the
- * current context
+ * previous context that have been overwritten by the
+ * current context.
*/
- flags &= state->mask;
-
+ flags &= state->hdr.mask;
if (flags & KERNEL_FPC)
- /* Restore floating-point controls */
- asm volatile("lfpc %0" : : "Q" (state->fpc));
-
+ fpu_lfpc(&state->hdr.fpc);
if (!cpu_has_vx()) {
- if (flags & KERNEL_VXR_V0V7) {
- /* Restore floating-point registers */
- asm volatile("ld 0,%0" : : "Q" (state->fprs[0]));
- asm volatile("ld 1,%0" : : "Q" (state->fprs[1]));
- asm volatile("ld 2,%0" : : "Q" (state->fprs[2]));
- asm volatile("ld 3,%0" : : "Q" (state->fprs[3]));
- asm volatile("ld 4,%0" : : "Q" (state->fprs[4]));
- asm volatile("ld 5,%0" : : "Q" (state->fprs[5]));
- asm volatile("ld 6,%0" : : "Q" (state->fprs[6]));
- asm volatile("ld 7,%0" : : "Q" (state->fprs[7]));
- asm volatile("ld 8,%0" : : "Q" (state->fprs[8]));
- asm volatile("ld 9,%0" : : "Q" (state->fprs[9]));
- asm volatile("ld 10,%0" : : "Q" (state->fprs[10]));
- asm volatile("ld 11,%0" : : "Q" (state->fprs[11]));
- asm volatile("ld 12,%0" : : "Q" (state->fprs[12]));
- asm volatile("ld 13,%0" : : "Q" (state->fprs[13]));
- asm volatile("ld 14,%0" : : "Q" (state->fprs[14]));
- asm volatile("ld 15,%0" : : "Q" (state->fprs[15]));
- }
+ if (flags & KERNEL_VXR_LOW)
+ load_fp_regs_vx(vxrs);
return;
}
-
- /* Test and restore (load) vector registers */
- asm volatile (
- /*
- * Test if any vector register must be loaded and, if so,
- * test if all registers can be loaded at once.
- */
- " la 1,%[vxrs]\n" /* load restore area */
- " tmll %[m],30\n" /* KERNEL_VXR */
- " jz 7f\n" /* no work -> done */
- " jo 5f\n" /* -> restore V0..V31 */
- /*
- * Test for special case KERNEL_FPU_MID only. In this
- * case a vlm V8..V23 is the best instruction
- */
- " chi %[m],12\n" /* KERNEL_VXR_MID */
- " jne 0f\n" /* -> restore V8..V23 */
- " VLM 8,23,128,1\n" /* vlm %v8,%v23,128(%r1) */
- " j 7f\n"
- /* Test and restore the first half of 16 vector registers */
- "0: tmll %[m],6\n" /* KERNEL_VXR_LOW */
- " jz 3f\n" /* -> KERNEL_VXR_HIGH */
- " jo 2f\n" /* 11 -> restore V0..V15 */
- " brc 2,1f\n" /* 10 -> restore V8..V15 */
- " VLM 0,7,0,1\n" /* vlm %v0,%v7,0(%r1) */
- " j 3f\n"
- "1: VLM 8,15,128,1\n" /* vlm %v8,%v15,128(%r1) */
- " j 3f\n"
- "2: VLM 0,15,0,1\n" /* vlm %v0,%v15,0(%r1) */
- /* Test and restore the second half of 16 vector registers */
- "3: tmll %[m],24\n" /* KERNEL_VXR_HIGH */
- " jz 7f\n"
- " jo 6f\n" /* 11 -> restore V16..V31 */
- " brc 2,4f\n" /* 10 -> restore V24..V31 */
- " VLM 16,23,256,1\n" /* vlm %v16,%v23,256(%r1) */
- " j 7f\n"
- "4: VLM 24,31,384,1\n" /* vlm %v24,%v31,384(%r1) */
- " j 7f\n"
- "5: VLM 0,15,0,1\n" /* vlm %v0,%v15,0(%r1) */
- "6: VLM 16,31,256,1\n" /* vlm %v16,%v31,256(%r1) */
- "7:"
- : [vxrs] "=Q" (*(struct vx_array *) &state->vxrs)
- : [m] "d" (flags)
- : "1", "cc");
+ mask = flags & KERNEL_VXR;
+ if (mask == KERNEL_VXR) {
+ vxrs += fpu_vlm(0, 15, vxrs);
+ vxrs += fpu_vlm(16, 31, vxrs);
+ return;
+ }
+ if (mask == KERNEL_VXR_MID) {
+ vxrs += fpu_vlm(8, 23, vxrs);
+ return;
+ }
+ mask = flags & KERNEL_VXR_LOW;
+ if (mask) {
+ if (mask == KERNEL_VXR_LOW)
+ vxrs += fpu_vlm(0, 15, vxrs);
+ else if (mask == KERNEL_VXR_V0V7)
+ vxrs += fpu_vlm(0, 7, vxrs);
+ else
+ vxrs += fpu_vlm(8, 15, vxrs);
+ }
+ mask = flags & KERNEL_VXR_HIGH;
+ if (mask) {
+ if (mask == KERNEL_VXR_HIGH)
+ vxrs += fpu_vlm(16, 31, vxrs);
+ else if (mask == KERNEL_VXR_V16V23)
+ vxrs += fpu_vlm(16, 23, vxrs);
+ else
+ vxrs += fpu_vlm(24, 31, vxrs);
+ }
}
EXPORT_SYMBOL(__kernel_fpu_end);
-void __load_fpu_regs(void)
+void load_fpu_state(struct fpu *state, int flags)
{
- unsigned long *regs = current->thread.fpu.regs;
- struct fpu *state = &current->thread.fpu;
+ __vector128 *vxrs = &state->vxrs[0];
+ int mask;
- sfpc_safe(state->fpc);
- if (likely(cpu_has_vx())) {
- asm volatile("lgr 1,%0\n"
- "VLM 0,15,0,1\n"
- "VLM 16,31,256,1\n"
- :
- : "d" (regs)
- : "1", "cc", "memory");
- } else {
- asm volatile("ld 0,%0" : : "Q" (regs[0]));
- asm volatile("ld 1,%0" : : "Q" (regs[1]));
- asm volatile("ld 2,%0" : : "Q" (regs[2]));
- asm volatile("ld 3,%0" : : "Q" (regs[3]));
- asm volatile("ld 4,%0" : : "Q" (regs[4]));
- asm volatile("ld 5,%0" : : "Q" (regs[5]));
- asm volatile("ld 6,%0" : : "Q" (regs[6]));
- asm volatile("ld 7,%0" : : "Q" (regs[7]));
- asm volatile("ld 8,%0" : : "Q" (regs[8]));
- asm volatile("ld 9,%0" : : "Q" (regs[9]));
- asm volatile("ld 10,%0" : : "Q" (regs[10]));
- asm volatile("ld 11,%0" : : "Q" (regs[11]));
- asm volatile("ld 12,%0" : : "Q" (regs[12]));
- asm volatile("ld 13,%0" : : "Q" (regs[13]));
- asm volatile("ld 14,%0" : : "Q" (regs[14]));
- asm volatile("ld 15,%0" : : "Q" (regs[15]));
+ if (flags & KERNEL_FPC)
+ fpu_lfpc(&state->fpc);
+ if (!cpu_has_vx()) {
+ if (flags & KERNEL_VXR_V0V7)
+ load_fp_regs_vx(state->vxrs);
+ return;
+ }
+ mask = flags & KERNEL_VXR;
+ if (mask == KERNEL_VXR) {
+ fpu_vlm(0, 15, &vxrs[0]);
+ fpu_vlm(16, 31, &vxrs[16]);
+ return;
+ }
+ if (mask == KERNEL_VXR_MID) {
+ fpu_vlm(8, 23, &vxrs[8]);
+ return;
+ }
+ mask = flags & KERNEL_VXR_LOW;
+ if (mask) {
+ if (mask == KERNEL_VXR_LOW)
+ fpu_vlm(0, 15, &vxrs[0]);
+ else if (mask == KERNEL_VXR_V0V7)
+ fpu_vlm(0, 7, &vxrs[0]);
+ else
+ fpu_vlm(8, 15, &vxrs[8]);
+ }
+ mask = flags & KERNEL_VXR_HIGH;
+ if (mask) {
+ if (mask == KERNEL_VXR_HIGH)
+ fpu_vlm(16, 31, &vxrs[16]);
+ else if (mask == KERNEL_VXR_V16V23)
+ fpu_vlm(16, 23, &vxrs[16]);
+ else
+ fpu_vlm(24, 31, &vxrs[24]);
}
- clear_cpu_flag(CIF_FPU);
-}
-
-void load_fpu_regs(void)
-{
- raw_local_irq_disable();
- __load_fpu_regs();
- raw_local_irq_enable();
}
-EXPORT_SYMBOL(load_fpu_regs);
-void save_fpu_regs(void)
+void save_fpu_state(struct fpu *state, int flags)
{
- unsigned long flags, *regs;
- struct fpu *state;
-
- local_irq_save(flags);
+ __vector128 *vxrs = &state->vxrs[0];
+ int mask;
- if (test_cpu_flag(CIF_FPU))
- goto out;
-
- state = &current->thread.fpu;
- regs = current->thread.fpu.regs;
-
- asm volatile("stfpc %0" : "=Q" (state->fpc));
- if (likely(cpu_has_vx())) {
- asm volatile("lgr 1,%0\n"
- "VSTM 0,15,0,1\n"
- "VSTM 16,31,256,1\n"
- :
- : "d" (regs)
- : "1", "cc", "memory");
- } else {
- asm volatile("std 0,%0" : "=Q" (regs[0]));
- asm volatile("std 1,%0" : "=Q" (regs[1]));
- asm volatile("std 2,%0" : "=Q" (regs[2]));
- asm volatile("std 3,%0" : "=Q" (regs[3]));
- asm volatile("std 4,%0" : "=Q" (regs[4]));
- asm volatile("std 5,%0" : "=Q" (regs[5]));
- asm volatile("std 6,%0" : "=Q" (regs[6]));
- asm volatile("std 7,%0" : "=Q" (regs[7]));
- asm volatile("std 8,%0" : "=Q" (regs[8]));
- asm volatile("std 9,%0" : "=Q" (regs[9]));
- asm volatile("std 10,%0" : "=Q" (regs[10]));
- asm volatile("std 11,%0" : "=Q" (regs[11]));
- asm volatile("std 12,%0" : "=Q" (regs[12]));
- asm volatile("std 13,%0" : "=Q" (regs[13]));
- asm volatile("std 14,%0" : "=Q" (regs[14]));
- asm volatile("std 15,%0" : "=Q" (regs[15]));
+ if (flags & KERNEL_FPC)
+ fpu_stfpc(&state->fpc);
+ if (!cpu_has_vx()) {
+ if (flags & KERNEL_VXR_LOW)
+ save_fp_regs_vx(state->vxrs);
+ return;
+ }
+ mask = flags & KERNEL_VXR;
+ if (mask == KERNEL_VXR) {
+ fpu_vstm(0, 15, &vxrs[0]);
+ fpu_vstm(16, 31, &vxrs[16]);
+ return;
+ }
+ if (mask == KERNEL_VXR_MID) {
+ fpu_vstm(8, 23, &vxrs[8]);
+ return;
+ }
+ mask = flags & KERNEL_VXR_LOW;
+ if (mask) {
+ if (mask == KERNEL_VXR_LOW)
+ fpu_vstm(0, 15, &vxrs[0]);
+ else if (mask == KERNEL_VXR_V0V7)
+ fpu_vstm(0, 7, &vxrs[0]);
+ else
+ fpu_vstm(8, 15, &vxrs[8]);
+ }
+ mask = flags & KERNEL_VXR_HIGH;
+ if (mask) {
+ if (mask == KERNEL_VXR_HIGH)
+ fpu_vstm(16, 31, &vxrs[16]);
+ else if (mask == KERNEL_VXR_V16V23)
+ fpu_vstm(16, 23, &vxrs[16]);
+ else
+ fpu_vstm(24, 31, &vxrs[24]);
}
- set_cpu_flag(CIF_FPU);
-out:
- local_irq_restore(flags);
}
-EXPORT_SYMBOL(save_fpu_regs);
+EXPORT_SYMBOL(save_fpu_state);
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index ba75f6bee774..1486350a4177 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -1941,8 +1941,7 @@ static void dump_reipl_run(struct shutdown_trigger *trigger)
reipl_type == IPL_TYPE_UNKNOWN)
os_info_flags |= OS_INFO_FLAG_REIPL_CLEAR;
os_info_entry_add(OS_INFO_FLAGS_ENTRY, &os_info_flags, sizeof(os_info_flags));
- csum = (__force unsigned int)
- csum_partial(reipl_block_actual, reipl_block_actual->hdr.len, 0);
+ csum = (__force unsigned int)cksm(reipl_block_actual, reipl_block_actual->hdr.len, 0);
abs_lc = get_abs_lowcore();
abs_lc->ipib = __pa(reipl_block_actual);
abs_lc->ipib_checksum = csum;
diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
index aa22ffc16bcd..c5d0c1cf984b 100644
--- a/arch/s390/kernel/machine_kexec.c
+++ b/arch/s390/kernel/machine_kexec.c
@@ -13,8 +13,10 @@
#include <linux/reboot.h>
#include <linux/ftrace.h>
#include <linux/debug_locks.h>
+#include <asm/guarded_storage.h>
#include <asm/pfault.h>
#include <asm/cio.h>
+#include <asm/fpu.h>
#include <asm/setup.h>
#include <asm/smp.h>
#include <asm/ipl.h>
@@ -26,7 +28,6 @@
#include <asm/os_info.h>
#include <asm/set_memory.h>
#include <asm/stacktrace.h>
-#include <asm/switch_to.h>
#include <asm/nmi.h>
#include <asm/sclp.h>
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index 9ad44c26d1a2..c77382a67325 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -23,16 +23,14 @@
#include <linux/export.h>
#include <asm/lowcore.h>
#include <asm/ctlreg.h>
+#include <asm/fpu.h>
#include <asm/smp.h>
#include <asm/stp.h>
#include <asm/cputime.h>
#include <asm/nmi.h>
#include <asm/crw.h>
-#include <asm/switch_to.h>
#include <asm/asm-offsets.h>
#include <asm/pai.h>
-#include <asm/vx-insn.h>
-#include <asm/fpu/api.h>
struct mcck_struct {
unsigned int kill_task : 1;
@@ -204,133 +202,63 @@ void s390_handle_mcck(void)
}
}
-/*
- * returns 0 if register contents could be validated
- * returns 1 otherwise
+/**
+ * nmi_registers_valid - check whether the required registers are valid
+ * @mci: machine check interruption code
+ *
+ * Inspect a machine check interruption code and check whether all required
+ * registers are valid. For some registers the corresponding validity bit is
+ * ignored and the registers are set to the expected value instead.
+ * Returns true if all registers are valid, otherwise false.
*/
-static int notrace s390_validate_registers(union mci mci)
+static bool notrace nmi_registers_valid(union mci mci)
{
- struct mcesa *mcesa;
- void *fpt_save_area;
union ctlreg2 cr2;
- int kill_task;
- u64 zero;
-
- kill_task = 0;
- zero = 0;
-
- if (!mci.gr || !mci.fp)
- kill_task = 1;
- fpt_save_area = &S390_lowcore.floating_pt_save_area;
- if (!mci.fc) {
- kill_task = 1;
- asm volatile(
- " lfpc %0\n"
- :
- : "Q" (zero));
- } else {
- asm volatile(
- " lfpc %0\n"
- :
- : "Q" (S390_lowcore.fpt_creg_save_area));
- }
- mcesa = __va(S390_lowcore.mcesad & MCESA_ORIGIN_MASK);
- if (!cpu_has_vx()) {
- /* Validate floating point registers */
- asm volatile(
- " ld 0,0(%0)\n"
- " ld 1,8(%0)\n"
- " ld 2,16(%0)\n"
- " ld 3,24(%0)\n"
- " ld 4,32(%0)\n"
- " ld 5,40(%0)\n"
- " ld 6,48(%0)\n"
- " ld 7,56(%0)\n"
- " ld 8,64(%0)\n"
- " ld 9,72(%0)\n"
- " ld 10,80(%0)\n"
- " ld 11,88(%0)\n"
- " ld 12,96(%0)\n"
- " ld 13,104(%0)\n"
- " ld 14,112(%0)\n"
- " ld 15,120(%0)\n"
- :
- : "a" (fpt_save_area)
- : "memory");
- } else {
- /* Validate vector registers */
- union ctlreg0 cr0;
-
- /*
- * The vector validity must only be checked if not running a
- * KVM guest. For KVM guests the machine check is forwarded by
- * KVM and it is the responsibility of the guest to take
- * appropriate actions. The host vector or FPU values have been
- * saved by KVM and will be restored by KVM.
- */
- if (!mci.vr && !test_cpu_flag(CIF_MCCK_GUEST))
- kill_task = 1;
- cr0.reg = S390_lowcore.cregs_save_area[0];
- cr0.afp = cr0.vx = 1;
- local_ctl_load(0, &cr0.reg);
- asm volatile(
- " la 1,%0\n"
- " VLM 0,15,0,1\n"
- " VLM 16,31,256,1\n"
- :
- : "Q" (*(struct vx_array *)mcesa->vector_save_area)
- : "1");
- local_ctl_load(0, &S390_lowcore.cregs_save_area[0]);
- }
- /* Validate access registers */
- asm volatile(
- " lam 0,15,0(%0)\n"
- :
- : "a" (&S390_lowcore.access_regs_save_area)
- : "memory");
- if (!mci.ar)
- kill_task = 1;
- /* Validate guarded storage registers */
- cr2.reg = S390_lowcore.cregs_save_area[2];
- if (cr2.gse) {
- if (!mci.gs) {
- /*
- * 2 cases:
- * - machine check in kernel or userspace
- * - machine check while running SIE (KVM guest)
- * For kernel or userspace the userspace values of
- * guarded storage control can not be recreated, the
- * process must be terminated.
- * For SIE the guest values of guarded storage can not
- * be recreated. This is either due to a bug or due to
- * GS being disabled in the guest. The guest will be
- * notified by KVM code and the guests machine check
- * handling must take care of this. The host values
- * are saved by KVM and are not affected.
- */
- if (!test_cpu_flag(CIF_MCCK_GUEST))
- kill_task = 1;
- } else {
- load_gs_cb((struct gs_cb *)mcesa->guarded_storage_save_area);
- }
- }
/*
- * The getcpu vdso syscall reads CPU number from the programmable
+ * The getcpu vdso syscall reads the CPU number from the programmable
* field of the TOD clock. Disregard the TOD programmable register
- * validity bit and load the CPU number into the TOD programmable
- * field unconditionally.
+ * validity bit and load the CPU number into the TOD programmable field
+ * unconditionally.
*/
set_tod_programmable_field(raw_smp_processor_id());
- /* Validate clock comparator register */
+ /*
+ * Set the clock comparator register to the next expected value.
+ */
set_clock_comparator(S390_lowcore.clock_comparator);
-
+ if (!mci.gr || !mci.fp || !mci.fc)
+ return false;
+ /*
+ * The vector validity must only be checked if not running a
+ * KVM guest. For KVM guests the machine check is forwarded by
+ * KVM and it is the responsibility of the guest to take
+ * appropriate actions. The host vector or FPU values have been
+ * saved by KVM and will be restored by KVM.
+ */
+ if (!mci.vr && !test_cpu_flag(CIF_MCCK_GUEST))
+ return false;
+ if (!mci.ar)
+ return false;
+ /*
+ * Two cases for guarded storage registers:
+ * - machine check in kernel or userspace
+ * - machine check while running SIE (KVM guest)
+ * For kernel or userspace the userspace values of guarded storage
+ * control can not be recreated, the process must be terminated.
+ * For SIE the guest values of guarded storage can not be recreated.
+ * This is either due to a bug or due to GS being disabled in the
+ * guest. The guest will be notified by KVM code and the guest's
+ * machine check handling must take care of this. The host values are
+ * saved by KVM and are not affected.
+ */
+ cr2.reg = S390_lowcore.cregs_save_area[2];
+ if (cr2.gse && !mci.gs && !test_cpu_flag(CIF_MCCK_GUEST))
+ return false;
if (!mci.ms || !mci.pm || !mci.ia)
- kill_task = 1;
-
- return kill_task;
+ return false;
+ return true;
}
-NOKPROBE_SYMBOL(s390_validate_registers);
+NOKPROBE_SYMBOL(nmi_registers_valid);
/*
* Backup the guest's machine check info to its description block
@@ -428,7 +356,7 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
s390_handle_damage();
}
}
- if (s390_validate_registers(mci)) {
+ if (!nmi_registers_valid(mci)) {
if (!user_mode(regs))
s390_handle_damage();
/*
diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c
index 6e1824141b29..a801e6bd5341 100644
--- a/arch/s390/kernel/os_info.c
+++ b/arch/s390/kernel/os_info.c
@@ -29,7 +29,7 @@ static struct os_info os_info __page_aligned_data;
u32 os_info_csum(struct os_info *os_info)
{
int size = sizeof(*os_info) - offsetof(struct os_info, version_major);
- return (__force u32)csum_partial(&os_info->version_major, size, 0);
+ return (__force u32)cksm(&os_info->version_major, size, 0);
}
/*
@@ -49,7 +49,7 @@ void os_info_entry_add(int nr, void *ptr, u64 size)
{
os_info.entry[nr].addr = __pa(ptr);
os_info.entry[nr].size = size;
- os_info.entry[nr].csum = (__force u32)csum_partial(ptr, size, 0);
+ os_info.entry[nr].csum = (__force u32)cksm(ptr, size, 0);
os_info.csum = os_info_csum(&os_info);
}
@@ -98,7 +98,7 @@ static void os_info_old_alloc(int nr, int align)
msg = "copy failed";
goto fail_free;
}
- csum = (__force u32)csum_partial(buf_align, size, 0);
+ csum = (__force u32)cksm(buf_align, size, 0);
if (csum != os_info_old->entry[nr].csum) {
msg = "checksum failed";
goto fail_free;
diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c
index bf8a672b15a4..823d652e3917 100644
--- a/arch/s390/kernel/perf_pai_crypto.c
+++ b/arch/s390/kernel/perf_pai_crypto.c
@@ -98,6 +98,7 @@ static void paicrypt_event_destroy(struct perf_event *event)
event->attr.config, event->cpu,
cpump->active_events, cpump->mode,
refcount_read(&cpump->refcnt));
+ free_page(PAI_SAVE_AREA(event));
if (refcount_dec_and_test(&cpump->refcnt)) {
debug_sprintf_event(cfm_dbg, 4, "%s page %#lx save %p\n",
__func__, (unsigned long)cpump->page,
@@ -260,6 +261,7 @@ static int paicrypt_event_init(struct perf_event *event)
{
struct perf_event_attr *a = &event->attr;
struct paicrypt_map *cpump;
+ int rc = 0;
/* PAI crypto PMU registered as PERF_TYPE_RAW, check event type */
if (a->type != PERF_TYPE_RAW && event->pmu->type != a->type)
@@ -274,10 +276,21 @@ static int paicrypt_event_init(struct perf_event *event)
/* Allow only CRYPTO_ALL for sampling. */
if (a->sample_period && a->config != PAI_CRYPTO_BASE)
return -EINVAL;
+ /* Get a page to store last counter values for sampling */
+ if (a->sample_period) {
+ PAI_SAVE_AREA(event) = get_zeroed_page(GFP_KERNEL);
+ if (!PAI_SAVE_AREA(event)) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ }
cpump = paicrypt_busy(event);
- if (IS_ERR(cpump))
- return PTR_ERR(cpump);
+ if (IS_ERR(cpump)) {
+ free_page(PAI_SAVE_AREA(event));
+ rc = PTR_ERR(cpump);
+ goto out;
+ }
event->destroy = paicrypt_event_destroy;
@@ -293,7 +306,8 @@ static int paicrypt_event_init(struct perf_event *event)
}
static_branch_inc(&pai_key);
- return 0;
+out:
+ return rc;
}
static void paicrypt_read(struct perf_event *event)
@@ -310,20 +324,15 @@ static void paicrypt_read(struct perf_event *event)
static void paicrypt_start(struct perf_event *event, int flags)
{
+ struct paicrypt_mapptr *mp = this_cpu_ptr(paicrypt_root.mapptr);
+ struct paicrypt_map *cpump = mp->mapptr;
u64 sum;
- /* Event initialization sets last_tag to 0. When later on the events
- * are deleted and re-added, do not reset the event count value to zero.
- * Events are added, deleted and re-added when 2 or more events
- * are active at the same time.
- */
if (!event->attr.sample_period) { /* Counting */
- if (!event->hw.last_tag) {
- event->hw.last_tag = 1;
- sum = paicrypt_getall(event); /* Get current value */
- local64_set(&event->hw.prev_count, sum);
- }
+ sum = paicrypt_getall(event); /* Get current value */
+ local64_set(&event->hw.prev_count, sum);
} else { /* Sampling */
+ cpump->event = event;
perf_sched_cb_inc(event->pmu);
}
}
@@ -339,7 +348,6 @@ static int paicrypt_add(struct perf_event *event, int flags)
WRITE_ONCE(S390_lowcore.ccd, ccd);
local_ctl_set_bit(0, CR0_CRYPTOGRAPHY_COUNTER_BIT);
}
- cpump->event = event;
if (flags & PERF_EF_START)
paicrypt_start(event, PERF_EF_RELOAD);
event->hw.state = 0;
@@ -367,23 +375,34 @@ static void paicrypt_del(struct perf_event *event, int flags)
}
}
-/* Create raw data and save it in buffer. Returns number of bytes copied.
- * Saves only positive counter entries of the form
+/* Create raw data and save it in buffer. Calculate the delta for each
+ * counter between this and the previous invocation.
+ * Returns number of bytes copied.
+ * Saves only entries with a positive counter difference, of the form
* 2 bytes: Number of counter
* 8 bytes: Value of counter
*/
static size_t paicrypt_copy(struct pai_userdata *userdata, unsigned long *page,
- bool exclude_user, bool exclude_kernel)
+ unsigned long *page_old, bool exclude_user,
+ bool exclude_kernel)
{
int i, outidx = 0;
for (i = 1; i <= paicrypt_cnt; i++) {
- u64 val = 0;
+ u64 val = 0, val_old = 0;
- if (!exclude_kernel)
+ if (!exclude_kernel) {
val += paicrypt_getctr(page, i, true);
- if (!exclude_user)
+ val_old += paicrypt_getctr(page_old, i, true);
+ }
+ if (!exclude_user) {
val += paicrypt_getctr(page, i, false);
+ val_old += paicrypt_getctr(page_old, i, false);
+ }
+ if (val >= val_old)
+ val -= val_old;
+ else
+ val = (~0ULL - val_old) + val + 1;
if (val) {
userdata[outidx].num = i;
userdata[outidx].value = val;
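
The wraparound branch computes the counter delta modulo 2^64, which equals
val - val_old in unsigned arithmetic. A worked example with made-up values:
for val_old = 0xfffffffffffffffe and val = 2, (~0ULL - val_old) is 1, so
the delta is 1 + 2 + 1 = 4, matching the four counter increments across
the overflow.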
@@ -426,8 +445,8 @@ static int paicrypt_push_sample(size_t rawsize, struct paicrypt_map *cpump,
overflow = perf_event_overflow(event, &data, &regs);
perf_event_update_userpage(event);
- /* Clear lowcore page after read */
- memset(cpump->page, 0, PAGE_SIZE);
+ /* Save crypto counter lowcore page after reading event data. */
+ memcpy((void *)PAI_SAVE_AREA(event), cpump->page, PAGE_SIZE);
return overflow;
}
@@ -443,6 +462,7 @@ static int paicrypt_have_sample(void)
if (!event) /* No event active */
return 0;
rawsize = paicrypt_copy(cpump->save, cpump->page,
+ (unsigned long *)PAI_SAVE_AREA(event),
cpump->event->attr.exclude_user,
cpump->event->attr.exclude_kernel);
if (rawsize) /* No incremented counters */
@@ -694,6 +714,12 @@ static int __init attr_event_init_one(struct attribute **attrs, int num)
{
struct perf_pmu_events_attr *pa;
+ /* Index at or beyond the end of the name array, no counter name available */
+ if (num >= ARRAY_SIZE(paicrypt_ctrnames)) {
+ attrs[num] = NULL;
+ return 0;
+ }
+
pa = kzalloc(sizeof(*pa), GFP_KERNEL);
if (!pa)
return -ENOMEM;
@@ -714,14 +740,13 @@ static int __init attr_event_init(void)
struct attribute **attrs;
int ret, i;
- attrs = kmalloc_array(ARRAY_SIZE(paicrypt_ctrnames) + 1, sizeof(*attrs),
- GFP_KERNEL);
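+ /* Room for counters 0..paicrypt_cnt plus a terminating NULL entry */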
+ attrs = kmalloc_array(paicrypt_cnt + 2, sizeof(*attrs), GFP_KERNEL);
if (!attrs)
return -ENOMEM;
- for (i = 0; i < ARRAY_SIZE(paicrypt_ctrnames); i++) {
+ for (i = 0; i <= paicrypt_cnt; i++) {
ret = attr_event_init_one(attrs, i);
if (ret) {
- attr_event_free(attrs, i - 1);
+ attr_event_free(attrs, i);
return ret;
}
}
@@ -742,8 +767,10 @@ static int __init paicrypt_init(void)
paicrypt_cnt = ib.num_cc;
if (paicrypt_cnt == 0)
return 0;
- if (paicrypt_cnt >= PAI_CRYPTO_MAXCTR)
- paicrypt_cnt = PAI_CRYPTO_MAXCTR - 1;
+ if (paicrypt_cnt >= PAI_CRYPTO_MAXCTR) {
+ pr_err("Too many PMU pai_crypto counters %d\n", paicrypt_cnt);
+ return -E2BIG;
+ }
rc = attr_event_init(); /* Export known PAI crypto events */
if (rc) {
diff --git a/arch/s390/kernel/perf_pai_ext.c b/arch/s390/kernel/perf_pai_ext.c
index af7f2b538c8f..616a25606cd6 100644
--- a/arch/s390/kernel/perf_pai_ext.c
+++ b/arch/s390/kernel/perf_pai_ext.c
@@ -120,6 +120,7 @@ static void paiext_event_destroy(struct perf_event *event)
struct paiext_mapptr *mp = per_cpu_ptr(paiext_root.mapptr, event->cpu);
struct paiext_map *cpump = mp->mapptr;
+ free_page(PAI_SAVE_AREA(event));
mutex_lock(&paiext_reserve_mutex);
cpump->event = NULL;
if (refcount_dec_and_test(&cpump->refcnt)) /* Last reference gone */
@@ -202,7 +203,6 @@ static int paiext_alloc(struct perf_event_attr *a, struct perf_event *event)
}
rc = 0;
- cpump->event = event;
undo:
if (rc) {
@@ -256,10 +256,18 @@ static int paiext_event_init(struct perf_event *event)
/* Prohibit exclude_user event selection */
if (a->exclude_user)
return -EINVAL;
+ /* Get a page to store last counter values for sampling */
+ if (a->sample_period) {
+ PAI_SAVE_AREA(event) = get_zeroed_page(GFP_KERNEL);
+ if (!PAI_SAVE_AREA(event))
+ return -ENOMEM;
+ }
rc = paiext_alloc(a, event);
- if (rc)
+ if (rc) {
+ free_page(PAI_SAVE_AREA(event));
return rc;
+ }
event->destroy = paiext_event_destroy;
if (a->sample_period) {
@@ -319,15 +327,15 @@ static void paiext_read(struct perf_event *event)
static void paiext_start(struct perf_event *event, int flags)
{
+ struct paiext_mapptr *mp = this_cpu_ptr(paiext_root.mapptr);
+ struct paiext_map *cpump = mp->mapptr;
u64 sum;
if (!event->attr.sample_period) { /* Counting */
- if (!event->hw.last_tag) {
- event->hw.last_tag = 1;
- sum = paiext_getall(event); /* Get current value */
- local64_set(&event->hw.prev_count, sum);
- }
+ sum = paiext_getall(event); /* Get current value */
+ local64_set(&event->hw.prev_count, sum);
} else { /* Sampling */
+ cpump->event = event;
perf_sched_cb_inc(event->pmu);
}
}
@@ -346,7 +354,6 @@ static int paiext_add(struct perf_event *event, int flags)
debug_sprintf_event(paiext_dbg, 4, "%s 1508 %llx acc %llx\n",
__func__, S390_lowcore.aicd, pcb->acc);
}
- cpump->event = event;
if (flags & PERF_EF_START)
paiext_start(event, PERF_EF_RELOAD);
event->hw.state = 0;
@@ -384,13 +391,19 @@ static void paiext_del(struct perf_event *event, int flags)
* 2 bytes: Number of counter
* 8 bytes: Value of counter
*/
-static size_t paiext_copy(struct pai_userdata *userdata, unsigned long *area)
+static size_t paiext_copy(struct pai_userdata *userdata, unsigned long *area,
+ unsigned long *area_old)
{
int i, outidx = 0;
for (i = 1; i <= paiext_cnt; i++) {
u64 val = paiext_getctr(area, i);
+ u64 val_old = paiext_getctr(area_old, i);
+ if (val >= val_old)
+ val -= val_old;
+ else
+ val = (~0ULL - val_old) + val + 1;
if (val) {
userdata[outidx].num = i;
userdata[outidx].value = val;
@@ -446,8 +459,9 @@ static int paiext_push_sample(size_t rawsize, struct paiext_map *cpump,
overflow = perf_event_overflow(event, &data, &regs);
perf_event_update_userpage(event);
- /* Clear lowcore area after read */
- memset(cpump->area, 0, PAIE1_CTRBLOCK_SZ);
+ /* Save NNPA lowcore area after read in event */
+ memcpy((void *)PAI_SAVE_AREA(event), cpump->area,
+ PAIE1_CTRBLOCK_SZ);
return overflow;
}
@@ -462,7 +476,8 @@ static int paiext_have_sample(void)
if (!event)
return 0;
- rawsize = paiext_copy(cpump->save, cpump->area);
+ rawsize = paiext_copy(cpump->save, cpump->area,
+ (unsigned long *)PAI_SAVE_AREA(event));
if (rawsize) /* Incremented counters */
rc = paiext_push_sample(rawsize, cpump, event);
return rc;
@@ -584,6 +599,12 @@ static int __init attr_event_init_one(struct attribute **attrs, int num)
{
struct perf_pmu_events_attr *pa;
+ /* Index larger than array_size, no counter name available */
+ if (num >= ARRAY_SIZE(paiext_ctrnames)) {
+ attrs[num] = NULL;
+ return 0;
+ }
+
pa = kzalloc(sizeof(*pa), GFP_KERNEL);
if (!pa)
return -ENOMEM;
@@ -604,14 +625,13 @@ static int __init attr_event_init(void)
struct attribute **attrs;
int ret, i;
- attrs = kmalloc_array(ARRAY_SIZE(paiext_ctrnames) + 1, sizeof(*attrs),
- GFP_KERNEL);
+ attrs = kmalloc_array(paiext_cnt + 2, sizeof(*attrs), GFP_KERNEL);
if (!attrs)
return -ENOMEM;
- for (i = 0; i < ARRAY_SIZE(paiext_ctrnames); i++) {
+ for (i = 0; i <= paiext_cnt; i++) {
ret = attr_event_init_one(attrs, i);
if (ret) {
- attr_event_free(attrs, i - 1);
+ attr_event_free(attrs, i);
return ret;
}
}
diff --git a/arch/s390/kernel/perf_regs.c b/arch/s390/kernel/perf_regs.c
index 3d93656bd948..a6b058ee4a36 100644
--- a/arch/s390/kernel/perf_regs.c
+++ b/arch/s390/kernel/perf_regs.c
@@ -5,8 +5,7 @@
#include <linux/errno.h>
#include <linux/bug.h>
#include <asm/ptrace.h>
-#include <asm/fpu/api.h>
-#include <asm/fpu/types.h>
+#include <asm/fpu.h>
u64 perf_reg_value(struct pt_regs *regs, int idx)
{
@@ -20,10 +19,7 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
return 0;
idx -= PERF_REG_S390_FP0;
- if (cpu_has_vx())
- fp = *(freg_t *)(current->thread.fpu.vxrs + idx);
- else
- fp = current->thread.fpu.fprs[idx];
+ fp = *(freg_t *)(current->thread.ufpu.vxrs + idx);
return fp.ui;
}
@@ -65,6 +61,6 @@ void perf_get_regs_user(struct perf_regs *regs_user,
*/
regs_user->regs = task_pt_regs(current);
if (user_mode(regs_user->regs))
- save_fpu_regs();
+ save_user_fpu_regs();
regs_user->abi = perf_reg_abi(current);
}
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 4e3b366589fb..dd456b475861 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -31,15 +31,19 @@
#include <linux/init_task.h>
#include <linux/entry-common.h>
#include <linux/io.h>
+#include <asm/guarded_storage.h>
+#include <asm/access-regs.h>
+#include <asm/switch_to.h>
#include <asm/cpu_mf.h>
#include <asm/processor.h>
+#include <asm/ptrace.h>
#include <asm/vtimer.h>
#include <asm/exec.h>
+#include <asm/fpu.h>
#include <asm/irq.h>
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/stacktrace.h>
-#include <asm/switch_to.h>
#include <asm/runtime_instr.h>
#include <asm/unwind.h>
#include "entry.h"
@@ -84,13 +88,13 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
/*
* Save the floating-point or vector register state of the current
- * task and set the CIF_FPU flag to lazy restore the FPU register
+ * task and set the TIF_FPU flag to lazy restore the FPU register
* state when returning to user space.
*/
- save_fpu_regs();
+ save_user_fpu_regs();
*dst = *src;
- dst->thread.fpu.regs = dst->thread.fpu.fprs;
+ dst->thread.kfpu_flags = 0;
/*
* Don't transfer over the runtime instrumentation or the guarded
@@ -186,8 +190,23 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
void execve_tail(void)
{
- current->thread.fpu.fpc = 0;
- asm volatile("sfpc %0" : : "d" (0));
+ current->thread.ufpu.fpc = 0;
+ fpu_sfpc(0);
+}
+
+struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next)
+{
+ save_user_fpu_regs();
+ save_kernel_fpu_regs(&prev->thread);
+ save_access_regs(&prev->thread.acrs[0]);
+ save_ri_cb(prev->thread.ri_cb);
+ save_gs_cb(prev->thread.gs_cb);
+ update_cr_regs(next);
+ restore_kernel_fpu_regs(&next->thread);
+ restore_access_regs(&next->thread.acrs[0]);
+ restore_ri_cb(next->thread.ri_cb, prev->thread.ri_cb);
+ restore_gs_cb(next->thread.gs_cb);
+ return __switch_to_asm(prev, next);
}
unsigned long __get_wchan(struct task_struct *p)
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index f1897a8bb221..1cfed8b710b8 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -24,13 +24,14 @@
#include <linux/seccomp.h>
#include <linux/compat.h>
#include <trace/syscall.h>
+#include <asm/guarded_storage.h>
+#include <asm/access-regs.h>
#include <asm/page.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
-#include <asm/switch_to.h>
#include <asm/runtime_instr.h>
#include <asm/facility.h>
-#include <asm/fpu/api.h>
+#include <asm/fpu.h>
#include "entry.h"
@@ -246,22 +247,15 @@ static unsigned long __peek_user(struct task_struct *child, addr_t addr)
/*
* floating point control reg. is in the thread structure
*/
- tmp = child->thread.fpu.fpc;
+ tmp = child->thread.ufpu.fpc;
tmp <<= BITS_PER_LONG - 32;
} else if (addr < offsetof(struct user, regs.fp_regs) + sizeof(s390_fp_regs)) {
/*
- * floating point regs. are either in child->thread.fpu
- * or the child->thread.fpu.vxrs array
+ * floating point regs. are in the child->thread.ufpu.vxrs array
*/
offset = addr - offsetof(struct user, regs.fp_regs.fprs);
- if (cpu_has_vx())
- tmp = *(addr_t *)
- ((addr_t) child->thread.fpu.vxrs + 2*offset);
- else
- tmp = *(addr_t *)
- ((addr_t) child->thread.fpu.fprs + offset);
-
+ tmp = *(addr_t *)((addr_t)child->thread.ufpu.vxrs + 2 * offset);
} else if (addr < offsetof(struct user, regs.per_info) + sizeof(per_struct)) {
/*
* Handle access to the per_info structure.
@@ -395,21 +389,14 @@ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data)
*/
if ((unsigned int)data != 0)
return -EINVAL;
- child->thread.fpu.fpc = data >> (BITS_PER_LONG - 32);
+ child->thread.ufpu.fpc = data >> (BITS_PER_LONG - 32);
} else if (addr < offsetof(struct user, regs.fp_regs) + sizeof(s390_fp_regs)) {
/*
- * floating point regs. are either in child->thread.fpu
- * or the child->thread.fpu.vxrs array
+ * floating point regs. are in the child->thread.ufpu.vxrs array
*/
offset = addr - offsetof(struct user, regs.fp_regs.fprs);
- if (cpu_has_vx())
- *(addr_t *)((addr_t)
- child->thread.fpu.vxrs + 2*offset) = data;
- else
- *(addr_t *)((addr_t)
- child->thread.fpu.fprs + offset) = data;
-
+ *(addr_t *)((addr_t)child->thread.ufpu.vxrs + 2 * offset) = data;
} else if (addr < offsetof(struct user, regs.per_info) + sizeof(per_struct)) {
/*
* Handle access to the per_info structure.
@@ -622,21 +609,14 @@ static u32 __peek_user_compat(struct task_struct *child, addr_t addr)
/*
* floating point control reg. is in the thread structure
*/
- tmp = child->thread.fpu.fpc;
+ tmp = child->thread.ufpu.fpc;
} else if (addr < offsetof(struct compat_user, regs.fp_regs) + sizeof(s390_fp_regs)) {
/*
- * floating point regs. are either in child->thread.fpu
- * or the child->thread.fpu.vxrs array
+ * floating point regs. are in the child->thread.ufpu.vxrs array
*/
offset = addr - offsetof(struct compat_user, regs.fp_regs.fprs);
- if (cpu_has_vx())
- tmp = *(__u32 *)
- ((addr_t) child->thread.fpu.vxrs + 2*offset);
- else
- tmp = *(__u32 *)
- ((addr_t) child->thread.fpu.fprs + offset);
-
+ tmp = *(__u32 *)((addr_t)child->thread.ufpu.vxrs + 2 * offset);
} else if (addr < offsetof(struct compat_user, regs.per_info) + sizeof(struct compat_per_struct_kernel)) {
/*
* Handle access to the per_info structure.
@@ -748,21 +728,14 @@ static int __poke_user_compat(struct task_struct *child,
/*
* floating point control reg. is in the thread structure
*/
- child->thread.fpu.fpc = data;
+ child->thread.ufpu.fpc = data;
} else if (addr < offsetof(struct compat_user, regs.fp_regs) + sizeof(s390_fp_regs)) {
/*
- * floating point regs. are either in child->thread.fpu
- * or the child->thread.fpu.vxrs array
+ * floating point regs. are in the child->thread.ufpu.vxrs array
*/
offset = addr - offsetof(struct compat_user, regs.fp_regs.fprs);
- if (cpu_has_vx())
- *(__u32 *)((addr_t)
- child->thread.fpu.vxrs + 2*offset) = tmp;
- else
- *(__u32 *)((addr_t)
- child->thread.fpu.fprs + offset) = tmp;
-
+ *(__u32 *)((addr_t)child->thread.ufpu.vxrs + 2 * offset) = tmp;
} else if (addr < offsetof(struct compat_user, regs.per_info) + sizeof(struct compat_per_struct_kernel)) {
/*
* Handle access to the per_info structure.
@@ -893,10 +866,10 @@ static int s390_fpregs_get(struct task_struct *target,
_s390_fp_regs fp_regs;
if (target == current)
- save_fpu_regs();
+ save_user_fpu_regs();
- fp_regs.fpc = target->thread.fpu.fpc;
- fpregs_store(&fp_regs, &target->thread.fpu);
+ fp_regs.fpc = target->thread.ufpu.fpc;
+ fpregs_store(&fp_regs, &target->thread.ufpu);
return membuf_write(&to, &fp_regs, sizeof(fp_regs));
}
@@ -910,22 +883,17 @@ static int s390_fpregs_set(struct task_struct *target,
freg_t fprs[__NUM_FPRS];
if (target == current)
- save_fpu_regs();
-
- if (cpu_has_vx())
- convert_vx_to_fp(fprs, target->thread.fpu.vxrs);
- else
- memcpy(&fprs, target->thread.fpu.fprs, sizeof(fprs));
-
+ save_user_fpu_regs();
+ convert_vx_to_fp(fprs, target->thread.ufpu.vxrs);
if (count > 0 && pos < offsetof(s390_fp_regs, fprs)) {
- u32 ufpc[2] = { target->thread.fpu.fpc, 0 };
+ u32 ufpc[2] = { target->thread.ufpu.fpc, 0 };
rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ufpc,
0, offsetof(s390_fp_regs, fprs));
if (rc)
return rc;
if (ufpc[1] != 0)
return -EINVAL;
- target->thread.fpu.fpc = ufpc[0];
+ target->thread.ufpu.fpc = ufpc[0];
}
if (rc == 0 && count > 0)
@@ -933,12 +901,7 @@ static int s390_fpregs_set(struct task_struct *target,
fprs, offsetof(s390_fp_regs, fprs), -1);
if (rc)
return rc;
-
- if (cpu_has_vx())
- convert_fp_to_vx(target->thread.fpu.vxrs, fprs);
- else
- memcpy(target->thread.fpu.fprs, &fprs, sizeof(fprs));
-
+ convert_fp_to_vx(target->thread.ufpu.vxrs, fprs);
return rc;
}
@@ -988,9 +951,9 @@ static int s390_vxrs_low_get(struct task_struct *target,
if (!cpu_has_vx())
return -ENODEV;
if (target == current)
- save_fpu_regs();
+ save_user_fpu_regs();
for (i = 0; i < __NUM_VXRS_LOW; i++)
- vxrs[i] = target->thread.fpu.vxrs[i].low;
+ vxrs[i] = target->thread.ufpu.vxrs[i].low;
return membuf_write(&to, vxrs, sizeof(vxrs));
}
@@ -1005,15 +968,15 @@ static int s390_vxrs_low_set(struct task_struct *target,
if (!cpu_has_vx())
return -ENODEV;
if (target == current)
- save_fpu_regs();
+ save_user_fpu_regs();
for (i = 0; i < __NUM_VXRS_LOW; i++)
- vxrs[i] = target->thread.fpu.vxrs[i].low;
+ vxrs[i] = target->thread.ufpu.vxrs[i].low;
rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1);
if (rc == 0)
for (i = 0; i < __NUM_VXRS_LOW; i++)
- target->thread.fpu.vxrs[i].low = vxrs[i];
+ target->thread.ufpu.vxrs[i].low = vxrs[i];
return rc;
}
@@ -1025,8 +988,8 @@ static int s390_vxrs_high_get(struct task_struct *target,
if (!cpu_has_vx())
return -ENODEV;
if (target == current)
- save_fpu_regs();
- return membuf_write(&to, target->thread.fpu.vxrs + __NUM_VXRS_LOW,
+ save_user_fpu_regs();
+ return membuf_write(&to, target->thread.ufpu.vxrs + __NUM_VXRS_LOW,
__NUM_VXRS_HIGH * sizeof(__vector128));
}
@@ -1040,10 +1003,10 @@ static int s390_vxrs_high_set(struct task_struct *target,
if (!cpu_has_vx())
return -ENODEV;
if (target == current)
- save_fpu_regs();
+ save_user_fpu_regs();
rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- target->thread.fpu.vxrs + __NUM_VXRS_LOW, 0, -1);
+ target->thread.ufpu.vxrs + __NUM_VXRS_LOW, 0, -1);
return rc;
}
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index d1f3b56e7afc..24ed33f044ec 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -504,12 +504,12 @@ static void __init setup_resources(void)
int j;
u64 i;
- code_resource.start = (unsigned long) _text;
- code_resource.end = (unsigned long) _etext - 1;
- data_resource.start = (unsigned long) _etext;
- data_resource.end = (unsigned long) _edata - 1;
- bss_resource.start = (unsigned long) __bss_start;
- bss_resource.end = (unsigned long) __bss_stop - 1;
+ code_resource.start = __pa_symbol(_text);
+ code_resource.end = __pa_symbol(_etext) - 1;
+ data_resource.start = __pa_symbol(_etext);
+ data_resource.end = __pa_symbol(_edata) - 1;
+ bss_resource.start = __pa_symbol(__bss_start);
+ bss_resource.end = __pa_symbol(__bss_stop) - 1;
for_each_mem_range(i, &start, &end) {
res = memblock_alloc(sizeof(*res), 8);
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index 43e9661cd715..6c2cb345402f 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -30,8 +30,8 @@
#include <linux/compat.h>
#include <asm/ucontext.h>
#include <linux/uaccess.h>
+#include <asm/access-regs.h>
#include <asm/lowcore.h>
-#include <asm/switch_to.h>
#include <asm/vdso.h>
#include "entry.h"
@@ -109,7 +109,7 @@ struct rt_sigframe
static void store_sigregs(void)
{
save_access_regs(current->thread.acrs);
- save_fpu_regs();
+ save_user_fpu_regs();
}
/* Load registers after signal return */
@@ -131,7 +131,7 @@ static int save_sigregs(struct pt_regs *regs, _sigregs __user *sregs)
memcpy(&user_sregs.regs.gprs, &regs->gprs, sizeof(sregs->regs.gprs));
memcpy(&user_sregs.regs.acrs, current->thread.acrs,
sizeof(user_sregs.regs.acrs));
- fpregs_store(&user_sregs.fpregs, &current->thread.fpu);
+ fpregs_store(&user_sregs.fpregs, &current->thread.ufpu);
if (__copy_to_user(sregs, &user_sregs, sizeof(_sigregs)))
return -EFAULT;
return 0;
@@ -165,7 +165,7 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs)
memcpy(&current->thread.acrs, &user_sregs.regs.acrs,
sizeof(current->thread.acrs));
- fpregs_load(&user_sregs.fpregs, &current->thread.fpu);
+ fpregs_load(&user_sregs.fpregs, &current->thread.ufpu);
clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */
return 0;
@@ -181,11 +181,11 @@ static int save_sigregs_ext(struct pt_regs *regs,
/* Save vector registers to signal stack */
if (cpu_has_vx()) {
for (i = 0; i < __NUM_VXRS_LOW; i++)
- vxrs[i] = current->thread.fpu.vxrs[i].low;
+ vxrs[i] = current->thread.ufpu.vxrs[i].low;
if (__copy_to_user(&sregs_ext->vxrs_low, vxrs,
sizeof(sregs_ext->vxrs_low)) ||
__copy_to_user(&sregs_ext->vxrs_high,
- current->thread.fpu.vxrs + __NUM_VXRS_LOW,
+ current->thread.ufpu.vxrs + __NUM_VXRS_LOW,
sizeof(sregs_ext->vxrs_high)))
return -EFAULT;
}
@@ -202,12 +202,12 @@ static int restore_sigregs_ext(struct pt_regs *regs,
if (cpu_has_vx()) {
if (__copy_from_user(vxrs, &sregs_ext->vxrs_low,
sizeof(sregs_ext->vxrs_low)) ||
- __copy_from_user(current->thread.fpu.vxrs + __NUM_VXRS_LOW,
+ __copy_from_user(current->thread.ufpu.vxrs + __NUM_VXRS_LOW,
&sregs_ext->vxrs_high,
sizeof(sregs_ext->vxrs_high)))
return -EFAULT;
for (i = 0; i < __NUM_VXRS_LOW; i++)
- current->thread.fpu.vxrs[i].low = vxrs[i];
+ current->thread.ufpu.vxrs[i].low = vxrs[i];
}
return 0;
}
@@ -222,7 +222,7 @@ SYSCALL_DEFINE0(sigreturn)
if (__copy_from_user(&set.sig, &frame->sc.oldmask, _SIGMASK_COPY_SIZE))
goto badframe;
set_current_blocked(&set);
- save_fpu_regs();
+ save_user_fpu_regs();
if (restore_sigregs(regs, &frame->sregs))
goto badframe;
if (restore_sigregs_ext(regs, &frame->sregs_ext))
@@ -246,7 +246,7 @@ SYSCALL_DEFINE0(rt_sigreturn)
set_current_blocked(&set);
if (restore_altstack(&frame->uc.uc_stack))
goto badframe;
- save_fpu_regs();
+ save_user_fpu_regs();
if (restore_sigregs(regs, &frame->uc.uc_mcontext))
goto badframe;
if (restore_sigregs_ext(regs, &frame->uc.uc_mcontext_ext))
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index c39d9f0d4b1c..0324649aae0a 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -36,12 +36,13 @@
#include <linux/sched/task_stack.h>
#include <linux/crash_dump.h>
#include <linux/kprobes.h>
+#include <asm/access-regs.h>
#include <asm/asm-offsets.h>
#include <asm/ctlreg.h>
#include <asm/pfault.h>
#include <asm/diag.h>
-#include <asm/switch_to.h>
#include <asm/facility.h>
+#include <asm/fpu.h>
#include <asm/ipl.h>
#include <asm/setup.h>
#include <asm/irq.h>
diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c
index f6f8f498c9be..1b1be3110cfc 100644
--- a/arch/s390/kernel/sysinfo.c
+++ b/arch/s390/kernel/sysinfo.c
@@ -20,7 +20,7 @@
#include <asm/sysinfo.h>
#include <asm/cpcmd.h>
#include <asm/topology.h>
-#include <asm/fpu/api.h>
+#include <asm/fpu.h>
int topology_max_mnest;
@@ -426,9 +426,9 @@ subsys_initcall(create_proc_service_level);
*/
void s390_adjust_jiffies(void)
{
+ DECLARE_KERNEL_FPU_ONSTACK16(fpu);
struct sysinfo_1_2_2 *info;
unsigned long capability;
- struct kernel_fpu fpu;
info = (void *) get_zeroed_page(GFP_KERNEL);
if (!info)
@@ -447,21 +447,14 @@ void s390_adjust_jiffies(void)
* point division ..
*/
kernel_fpu_begin(&fpu, KERNEL_FPR);
- asm volatile(
- " sfpc %3\n"
- " l %0,%1\n"
- " tmlh %0,0xff80\n"
- " jnz 0f\n"
- " cefbr %%f2,%0\n"
- " j 1f\n"
- "0: le %%f2,%1\n"
- "1: cefbr %%f0,%2\n"
- " debr %%f0,%%f2\n"
- " cgebr %0,5,%%f0\n"
- : "=&d" (capability)
- : "Q" (info->capability), "d" (10000000), "d" (0)
- : "cc"
- );
+ fpu_sfpc(0);
+ if (info->capability & 0xff800000)
+ fpu_ldgr(2, info->capability);
+ else
+ fpu_cefbr(2, info->capability);
+ fpu_cefbr(0, 10000000);
+ fpu_debr(0, 2);
+ capability = fpu_cgebr(0, 5);
kernel_fpu_end(&fpu, KERNEL_FPR);
} else
/*
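
The converted sequence divides 10,000,000 by the machine capability using the new typed FPU helpers instead of inline assembly. Assuming the encoding the code tests for (a raw short-float bit pattern when bits 0xff80 are set in the high halfword, a plain integer otherwise), a rough userspace model is:

	#include <stdint.h>
	#include <string.h>

	/* Rough model only; the kernel version uses fpu_sfpc/fpu_ldgr/
	 * fpu_cefbr/fpu_debr/fpu_cgebr, and the final rounding follows
	 * the cgebr rounding-mode mask, not C float-to-int truncation.
	 */
	static unsigned long capability_scale(uint32_t capability)
	{
		float cap;

		if (capability & 0xff800000)
			memcpy(&cap, &capability, sizeof(cap)); /* float bits */
		else
			cap = (float)capability;	/* integer value */
		return (unsigned long)(10000000.0f / cap);
	}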
diff --git a/arch/s390/kernel/text_amode31.S b/arch/s390/kernel/text_amode31.S
index 14c6d25c035f..c0a70efa2426 100644
--- a/arch/s390/kernel/text_amode31.S
+++ b/arch/s390/kernel/text_amode31.S
@@ -90,7 +90,7 @@ SYM_FUNC_START(_diag26c_amode31)
SYM_FUNC_END(_diag26c_amode31)
/*
- * void _diag0c_amode31(struct hypfs_diag0c_entry *entry)
+ * void _diag0c_amode31(unsigned long rx)
*/
SYM_FUNC_START(_diag0c_amode31)
sam31
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 14abad953c02..fb9f31f36628 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -251,8 +251,8 @@ static struct clocksource clocksource_tod = {
.rating = 400,
.read = read_tod_clock,
.mask = CLOCKSOURCE_MASK(64),
- .mult = 1000,
- .shift = 12,
+ .mult = 4096000,
+ .shift = 24,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
.vdso_clock_mode = VDSO_CLOCKMODE_TOD,
};
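
The mult/shift update preserves the ratio while adding precision: clocksource arithmetic is ns = (cycles * mult) >> shift, and both the old pair (1000, 12) and the new pair (4096000, 24) encode 1000/4096 ns per TOD unit, the new one with 12 extra fractional bits (presumably so frequency steering can make finer mult adjustments). A quick check of the identity:

	#include <assert.h>
	#include <stdint.h>

	/* Both encodings yield the same ns value; sample chosen as a
	 * power of two so both shifts are exact.
	 */
	static void check_tod_ratio(void)
	{
		uint64_t cycles = 1 << 20;

		assert((cycles * 1000ULL) >> 12 ==
		       (cycles * 4096000ULL) >> 24);
	}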
@@ -716,7 +716,7 @@ out_unlock:
/*
* STP subsys sysfs interface functions
*/
-static struct bus_type stp_subsys = {
+static const struct bus_type stp_subsys = {
.name = "stp",
.dev_name = "stp",
};
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index 46dac4540ca8..52578b5cecbd 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -28,8 +28,8 @@
#include <linux/cpu.h>
#include <linux/entry-common.h>
#include <asm/asm-extable.h>
-#include <asm/fpu/api.h>
#include <asm/vtime.h>
+#include <asm/fpu.h>
#include "entry.h"
static inline void __user *get_trap_ip(struct pt_regs *regs)
@@ -201,8 +201,8 @@ static void vector_exception(struct pt_regs *regs)
}
/* get vector interrupt code from fpc */
- save_fpu_regs();
- vic = (current->thread.fpu.fpc & 0xf00) >> 8;
+ save_user_fpu_regs();
+ vic = (current->thread.ufpu.fpc & 0xf00) >> 8;
switch (vic) {
case 1: /* invalid vector operation */
si_code = FPE_FLTINV;
@@ -227,9 +227,9 @@ static void vector_exception(struct pt_regs *regs)
static void data_exception(struct pt_regs *regs)
{
- save_fpu_regs();
- if (current->thread.fpu.fpc & FPC_DXC_MASK)
- do_fp_trap(regs, current->thread.fpu.fpc);
+ save_user_fpu_regs();
+ if (current->thread.ufpu.fpc & FPC_DXC_MASK)
+ do_fp_trap(regs, current->thread.ufpu.fpc);
else
do_trap(regs, SIGILL, ILL_ILLOPN, "data exception");
}
diff --git a/arch/s390/kernel/uprobes.c b/arch/s390/kernel/uprobes.c
index b88345ef8bd9..5b0633ea8d93 100644
--- a/arch/s390/kernel/uprobes.c
+++ b/arch/s390/kernel/uprobes.c
@@ -12,7 +12,6 @@
#include <linux/kdebug.h>
#include <linux/sched/task_stack.h>
-#include <asm/switch_to.h>
#include <asm/facility.h>
#include <asm/kprobes.h>
#include <asm/dis.h>
diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile
index caec7db6f966..b12a274cbb47 100644
--- a/arch/s390/kernel/vdso32/Makefile
+++ b/arch/s390/kernel/vdso32/Makefile
@@ -22,7 +22,7 @@ KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
KBUILD_CFLAGS_32 := $(filter-out -mno-pic-data-is-text-relative,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 += -m31 -fPIC -shared -fno-common -fno-builtin
-LDFLAGS_vdso32.so.dbg += -fPIC -shared -soname=linux-vdso32.so.1 \
+LDFLAGS_vdso32.so.dbg += -shared -soname=linux-vdso32.so.1 \
--hash-style=both --build-id=sha1 -melf_s390 -T
$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S
index edf5ff1debe1..65b9513a5a0e 100644
--- a/arch/s390/kernel/vdso32/vdso32.lds.S
+++ b/arch/s390/kernel/vdso32/vdso32.lds.S
@@ -9,7 +9,6 @@
OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390")
OUTPUT_ARCH(s390:31-bit)
-ENTRY(_start)
SECTIONS
{
diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile
index e3c9085f8fa7..ef9832726097 100644
--- a/arch/s390/kernel/vdso64/Makefile
+++ b/arch/s390/kernel/vdso64/Makefile
@@ -25,8 +25,9 @@ KBUILD_AFLAGS_64 += -m64
KBUILD_CFLAGS_64 := $(filter-out -m64,$(KBUILD_CFLAGS))
KBUILD_CFLAGS_64 := $(filter-out -mno-pic-data-is-text-relative,$(KBUILD_CFLAGS_64))
+KBUILD_CFLAGS_64 := $(filter-out -munaligned-symbols,$(KBUILD_CFLAGS_64))
KBUILD_CFLAGS_64 += -m64 -fPIC -fno-common -fno-builtin
-ldflags-y := -fPIC -shared -soname=linux-vdso64.so.1 \
+ldflags-y := -shared -soname=linux-vdso64.so.1 \
--hash-style=both --build-id=sha1 -T
$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_64)
diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S
index 4461ea151e49..37e2a505e81d 100644
--- a/arch/s390/kernel/vdso64/vdso64.lds.S
+++ b/arch/s390/kernel/vdso64/vdso64.lds.S
@@ -9,7 +9,6 @@
OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390")
OUTPUT_ARCH(s390:64-bit)
-ENTRY(_start)
SECTIONS
{
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index e32ef446f451..48de296e8905 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -59,6 +59,14 @@ SECTIONS
} :text = 0x0700
RO_DATA(PAGE_SIZE)
+ .data.rel.ro : {
+ *(.data.rel.ro .data.rel.ro.*)
+ }
+ .got : {
+ __got_start = .;
+ *(.got)
+ __got_end = .;
+ }
. = ALIGN(PAGE_SIZE);
_sdata = .; /* Start of data section */
@@ -73,6 +81,9 @@ SECTIONS
__end_ro_after_init = .;
RW_DATA(0x100, PAGE_SIZE, THREAD_SIZE)
+ .data.rel : {
+ *(.data.rel*)
+ }
BOOT_DATA_PRESERVED
. = ALIGN(8);
@@ -181,6 +192,7 @@ SECTIONS
PERCPU_SECTION(0x100)
+#ifdef CONFIG_PIE_BUILD
.dynsym ALIGN(8) : {
__dynsym_start = .;
*(.dynsym)
@@ -191,6 +203,19 @@ SECTIONS
*(.rela*)
__rela_dyn_end = .;
}
+ .dynamic ALIGN(8) : {
+ *(.dynamic)
+ }
+ .dynstr ALIGN(8) : {
+ *(.dynstr)
+ }
+#endif
+ .hash ALIGN(8) : {
+ *(.hash)
+ }
+ .gnu.hash ALIGN(8) : {
+ *(.gnu.hash)
+ }
. = ALIGN(PAGE_SIZE);
__init_end = .; /* freed after init ends here */
@@ -214,9 +239,14 @@ SECTIONS
QUAD(__boot_data_preserved_start) /* bootdata_preserved_off */
QUAD(__boot_data_preserved_end -
__boot_data_preserved_start) /* bootdata_preserved_size */
+#ifdef CONFIG_PIE_BUILD
QUAD(__dynsym_start) /* dynsym_start */
QUAD(__rela_dyn_start) /* rela_dyn_start */
QUAD(__rela_dyn_end) /* rela_dyn_end */
+#else
+ QUAD(__got_start) /* got_start */
+ QUAD(__got_end) /* got_end */
+#endif
QUAD(_eamode31 - _samode31) /* amode31_size */
QUAD(init_mm)
QUAD(swapper_pg_dir)
@@ -235,6 +265,30 @@ SECTIONS
DWARF_DEBUG
ELF_DETAILS
+ /*
+ * Make sure that the .got.plt is either completely empty or it
+ * contains only the three reserved double words.
+ */
+ .got.plt : {
+ *(.got.plt)
+ }
+ ASSERT(SIZEOF(.got.plt) == 0 || SIZEOF(.got.plt) == 0x18, "Unexpected GOT/PLT entries detected!")
+
+ /*
+ * Sections that should stay zero sized, which is safer to
+ * explicitly check instead of blindly discarding.
+ */
+ .plt : {
+ *(.plt) *(.plt.*) *(.iplt) *(.igot .igot.plt)
+ }
+ ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!")
+#ifndef CONFIG_PIE_BUILD
+ .rela.dyn : {
+ *(.rela.*) *(.rela_*)
+ }
+ ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!")
+#endif
+
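+ For reference on the 0x18 constant in the ASSERT above: the s390 ABI
+ reserves the first three doublewords of .got.plt, and 3 * 8 bytes is
+ 0x18, so a kernel link with no real PLT entries yields either an empty
+ .got.plt or exactly those 24 reserved bytes. As a trivial illustration:
+
+	/* Three reserved 8-byte GOT/PLT slots are 0x18 bytes. */
+	_Static_assert(3 * 8 == 0x18, "three reserved doublewords");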
/* Sections to be discarded */
DISCARDS
/DISCARD/ : {
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 5bfcc50c1a68..ee863566910b 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -11,11 +11,11 @@
#include <linux/err.h>
#include <linux/pgtable.h>
#include <linux/bitfield.h>
+#include <asm/access-regs.h>
#include <asm/fault.h>
#include <asm/gmap.h>
#include "kvm-s390.h"
#include "gaccess.h"
-#include <asm/switch_to.h>
union asce {
unsigned long val;
@@ -391,7 +391,8 @@ static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar,
if (ar >= NUM_ACRS)
return -EINVAL;
- save_access_regs(vcpu->run->s.regs.acrs);
+ if (vcpu->arch.acrs_loaded)
+ save_access_regs(vcpu->run->s.regs.acrs);
alet.val = vcpu->run->s.regs.acrs[ar];
if (ar == 0 || alet.val == 0) {
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index fc4007cc067a..dc721d50a942 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -19,13 +19,13 @@
#include <linux/slab.h>
#include <linux/bitmap.h>
#include <linux/vmalloc.h>
+#include <asm/access-regs.h>
#include <asm/asm-offsets.h>
#include <asm/dis.h>
#include <linux/uaccess.h>
#include <asm/sclp.h>
#include <asm/isc.h>
#include <asm/gmap.h>
-#include <asm/switch_to.h>
#include <asm/nmi.h>
#include <asm/airq.h>
#include <asm/tpi.h>
@@ -584,7 +584,7 @@ static int __write_machine_check(struct kvm_vcpu *vcpu,
mci.val = mchk->mcic;
/* take care of lazy register loading */
- save_fpu_regs();
+ kvm_s390_fpu_store(vcpu->run);
save_access_regs(vcpu->run->s.regs.acrs);
if (MACHINE_HAS_GS && vcpu->arch.gs_enabled)
save_gs_cb(current->thread.gs_cb);
@@ -648,7 +648,7 @@ static int __write_machine_check(struct kvm_vcpu *vcpu,
}
rc |= write_guest_lc(vcpu, __LC_GPREGS_SAVE_AREA,
vcpu->run->s.regs.gprs, 128);
- rc |= put_guest_lc(vcpu, current->thread.fpu.fpc,
+ rc |= put_guest_lc(vcpu, vcpu->run->s.regs.fpc,
(u32 __user *) __LC_FP_CREG_SAVE_AREA);
rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->todpr,
(u32 __user *) __LC_TOD_PROGREG_SAVE_AREA);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ea63ac769889..b11bb8e780a1 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -33,19 +33,19 @@
#include <linux/pgtable.h>
#include <linux/mmu_notifier.h>
+#include <asm/access-regs.h>
#include <asm/asm-offsets.h>
#include <asm/lowcore.h>
#include <asm/stp.h>
#include <asm/gmap.h>
#include <asm/nmi.h>
-#include <asm/switch_to.h>
#include <asm/isc.h>
#include <asm/sclp.h>
#include <asm/cpacf.h>
#include <asm/timex.h>
+#include <asm/fpu.h>
#include <asm/ap.h>
#include <asm/uv.h>
-#include <asm/fpu/api.h>
#include "kvm-s390.h"
#include "gaccess.h"
#include "pci.h"
@@ -3951,6 +3951,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
KVM_SYNC_ARCH0 |
KVM_SYNC_PFAULT |
KVM_SYNC_DIAG318;
+ vcpu->arch.acrs_loaded = false;
kvm_s390_set_prefix(vcpu, 0);
if (test_kvm_facility(vcpu->kvm, 64))
vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB;
@@ -4829,8 +4830,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
vcpu->run->s.regs.gprs,
sizeof(sie_page->pv_grregs));
}
- if (test_cpu_flag(CIF_FPU))
- load_fpu_regs();
exit_reason = sie64a(vcpu->arch.sie_block,
vcpu->run->s.regs.gprs);
if (kvm_s390_pv_cpu_is_protected(vcpu)) {
@@ -4951,16 +4950,8 @@ static void sync_regs(struct kvm_vcpu *vcpu)
}
save_access_regs(vcpu->arch.host_acrs);
restore_access_regs(vcpu->run->s.regs.acrs);
- /* save host (userspace) fprs/vrs */
- save_fpu_regs();
- vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc;
- vcpu->arch.host_fpregs.regs = current->thread.fpu.regs;
- if (cpu_has_vx())
- current->thread.fpu.regs = vcpu->run->s.regs.vrs;
- else
- current->thread.fpu.regs = vcpu->run->s.regs.fprs;
- current->thread.fpu.fpc = vcpu->run->s.regs.fpc;
-
+ vcpu->arch.acrs_loaded = true;
+ kvm_s390_fpu_load(vcpu->run);
/* Sync fmt2 only data */
if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) {
sync_regs_fmt2(vcpu);
@@ -5021,12 +5012,8 @@ static void store_regs(struct kvm_vcpu *vcpu)
kvm_run->s.regs.pfc = vcpu->arch.pfault_compare;
save_access_regs(vcpu->run->s.regs.acrs);
restore_access_regs(vcpu->arch.host_acrs);
- /* Save guest register state */
- save_fpu_regs();
- vcpu->run->s.regs.fpc = current->thread.fpu.fpc;
- /* Restore will be done lazily at return */
- current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc;
- current->thread.fpu.regs = vcpu->arch.host_fpregs.regs;
+ vcpu->arch.acrs_loaded = false;
+ kvm_s390_fpu_store(vcpu->run);
if (likely(!kvm_s390_pv_cpu_is_protected(vcpu)))
store_regs_fmt2(vcpu);
}
@@ -5034,6 +5021,7 @@ static void store_regs(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
+ DECLARE_KERNEL_FPU_ONSTACK32(fpu);
int rc;
/*
@@ -5075,6 +5063,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
goto out;
}
+ kernel_fpu_begin(&fpu, KERNEL_FPC | KERNEL_VXR);
sync_regs(vcpu);
enable_cpu_timer_accounting(vcpu);
@@ -5098,6 +5087,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
disable_cpu_timer_accounting(vcpu);
store_regs(vcpu);
+ kernel_fpu_end(&fpu, KERNEL_FPC | KERNEL_VXR);
kvm_sigset_deactivate(vcpu);
@@ -5172,8 +5162,7 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
* switch in the run ioctl. Let's update our copies before we save
* it into the save area
*/
- save_fpu_regs();
- vcpu->run->s.regs.fpc = current->thread.fpu.fpc;
+ kvm_s390_fpu_store(vcpu->run);
save_access_regs(vcpu->run->s.regs.acrs);
return kvm_s390_store_status_unloaded(vcpu, addr);
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index a7ea80cfa445..111eb5c74784 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -20,6 +20,24 @@
#include <asm/processor.h>
#include <asm/sclp.h>
+static inline void kvm_s390_fpu_store(struct kvm_run *run)
+{
+ fpu_stfpc(&run->s.regs.fpc);
+ if (cpu_has_vx())
+ save_vx_regs((__vector128 *)&run->s.regs.vrs);
+ else
+ save_fp_regs((freg_t *)&run->s.regs.fprs);
+}
+
+static inline void kvm_s390_fpu_load(struct kvm_run *run)
+{
+ fpu_lfpc_safe(&run->s.regs.fpc);
+ if (cpu_has_vx())
+ load_vx_regs((__vector128 *)&run->s.regs.vrs);
+ else
+ load_fp_regs((freg_t *)&run->s.regs.fprs);
+}
+
/* Transactional Memory Execution related macros */
#define IS_TE_ENABLED(vcpu) ((vcpu->arch.sie_block->ecb & ECB_TE))
#define TDB_FORMAT1 1
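
These two helpers move guest FP/vector state directly between the hardware registers and the kvm_run sync-regs area, replacing the old lazy CIF_FPU scheme; callers bracket guest execution with kernel_fpu_begin()/kernel_fpu_end() so host state is preserved, as kvm_arch_vcpu_ioctl_run() now does. A simplified sketch of the calling pattern introduced by this series (run loop and error handling omitted; assumes this header's context):

	static void run_guest_fpu_pattern(struct kvm_vcpu *vcpu)
	{
		DECLARE_KERNEL_FPU_ONSTACK32(fpu);

		/* Save host FPC + vector regs. */
		kernel_fpu_begin(&fpu, KERNEL_FPC | KERNEL_VXR);
		/* Install guest fpc and vrs/fprs from the sync regs. */
		kvm_s390_fpu_load(vcpu->run);
		/* ... sie64a() guest execution ... */
		/* Capture guest fpc and vrs/fprs back into sync regs. */
		kvm_s390_fpu_store(vcpu->run);
		/* Restore host FPC + vector regs. */
		kernel_fpu_end(&fpu, KERNEL_FPC | KERNEL_VXR);
	}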
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 3af3bd20ac7b..b2c9f010f0fe 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -18,7 +18,6 @@
#include <asm/sclp.h>
#include <asm/nmi.h>
#include <asm/dis.h>
-#include <asm/fpu/api.h>
#include <asm/facility.h>
#include "kvm-s390.h"
#include "gaccess.h"
@@ -1149,8 +1148,6 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
*/
vcpu->arch.sie_block->prog0c |= PROG_IN_SIE;
barrier();
- if (test_cpu_flag(CIF_FPU))
- load_fpu_regs();
if (!kvm_s390_vcpu_sie_inhibited(vcpu))
rc = sie64a(scb_s, vcpu->run->s.regs.gprs);
barrier();
diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile
index 7c50eca85ca4..90eac15ea62a 100644
--- a/arch/s390/lib/Makefile
+++ b/arch/s390/lib/Makefile
@@ -4,6 +4,7 @@
#
lib-y += delay.o string.o uaccess.o find.o spinlock.o tishift.o
+lib-y += csum-partial.o
obj-y += mem.o xor.o
lib-$(CONFIG_KPROBES) += probes.o
lib-$(CONFIG_UPROBES) += probes.o
diff --git a/arch/s390/lib/csum-partial.c b/arch/s390/lib/csum-partial.c
new file mode 100644
index 000000000000..458abd9bac70
--- /dev/null
+++ b/arch/s390/lib/csum-partial.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/export.h>
+#include <asm/checksum.h>
+#include <asm/fpu.h>
+
+/*
+ * Computes the checksum of a memory block at src, length len,
+ * and adds in "sum" (32-bit). If copy is true, also copies to dst.
+ *
+ * Returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic.
+ *
+ * This function must be called with even lengths, except
+ * for the last fragment, which may be odd.
+ *
+ * It's best to have src and dst aligned on a 64-bit boundary.
+ */
+static __always_inline __wsum csum_copy(void *dst, const void *src, int len, __wsum sum, bool copy)
+{
+ DECLARE_KERNEL_FPU_ONSTACK8(vxstate);
+
+ if (!cpu_has_vx()) {
+ if (copy)
+ memcpy(dst, src, len);
+ return cksm(dst, len, sum);
+ }
+ kernel_fpu_begin(&vxstate, KERNEL_VXR_V16V23);
+ fpu_vlvgf(16, (__force u32)sum, 1);
+ fpu_vzero(17);
+ fpu_vzero(18);
+ fpu_vzero(19);
+ while (len >= 64) {
+ fpu_vlm(20, 23, src);
+ if (copy) {
+ fpu_vstm(20, 23, dst);
+ dst += 64;
+ }
+ fpu_vcksm(16, 20, 16);
+ fpu_vcksm(17, 21, 17);
+ fpu_vcksm(18, 22, 18);
+ fpu_vcksm(19, 23, 19);
+ src += 64;
+ len -= 64;
+ }
+ while (len >= 32) {
+ fpu_vlm(20, 21, src);
+ if (copy) {
+ fpu_vstm(20, 21, dst);
+ dst += 32;
+ }
+ fpu_vcksm(16, 20, 16);
+ fpu_vcksm(17, 21, 17);
+ src += 32;
+ len -= 32;
+ }
+ while (len >= 16) {
+ fpu_vl(20, src);
+ if (copy) {
+ fpu_vst(20, dst);
+ dst += 16;
+ }
+ fpu_vcksm(16, 20, 16);
+ src += 16;
+ len -= 16;
+ }
+ if (len) {
+ fpu_vll(20, len - 1, src);
+ if (copy)
+ fpu_vstl(20, len - 1, dst);
+ fpu_vcksm(16, 20, 16);
+ }
+ fpu_vcksm(18, 19, 18);
+ fpu_vcksm(16, 17, 16);
+ fpu_vcksm(16, 18, 16);
+ sum = (__force __wsum)fpu_vlgvf(16, 1);
+ kernel_fpu_end(&vxstate, KERNEL_VXR_V16V23);
+ return sum;
+}
+
+__wsum csum_partial(const void *buff, int len, __wsum sum)
+{
+ return csum_copy(NULL, buff, len, sum, false);
+}
+EXPORT_SYMBOL(csum_partial);
+
+__wsum csum_partial_copy_nocheck(const void *src, void *dst, int len)
+{
+ return csum_copy(dst, src, len, 0, true);
+}
+EXPORT_SYMBOL(csum_partial_copy_nocheck);
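
The vector loop folds four 128-bit partial sums with VCKSM, but semantically the function still returns the classic 32-bit end-around-carry sum that csum_partial has always produced. A plain C reference model of that contract (illustrative only, not the s390 implementation; byte order matters, this mirrors the big-endian s390 view):

	#include <stdint.h>
	#include <stddef.h>
	#include <string.h>

	/* 32-bit end-around-carry sum over the buffer, the value the
	 * CKSM/VCKSM based code computes.
	 */
	static uint32_t csum_partial_ref(const void *buff, size_t len,
					 uint32_t sum)
	{
		uint64_t acc = sum;
		uint32_t w;

		while (len >= 4) {
			memcpy(&w, buff, 4);
			acc += w;
			buff = (const uint8_t *)buff + 4;
			len -= 4;
		}
		if (len) {
			w = 0;
			memcpy(&w, buff, len);	/* tail, zero padded */
			acc += w;
		}
		while (acc > 0xffffffffULL)	/* fold carries back in */
			acc = (acc & 0xffffffffULL) + (acc >> 32);
		return (uint32_t)acc;
	}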
diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c
index e41869f5cc95..282fefe107a2 100644
--- a/arch/s390/mm/extmem.c
+++ b/arch/s390/mm/extmem.c
@@ -136,7 +136,7 @@ dcss_diag(int *func, void *parameter,
unsigned long rx, ry;
int rc;
- rx = (unsigned long) parameter;
+ rx = virt_to_phys(parameter);
ry = (unsigned long) *func;
diag_stat_inc(DIAG_STAT_X064);
@@ -178,7 +178,7 @@ query_segment_type (struct dcss_segment *seg)
/* initialize diag input parameters */
qin->qopcode = DCSS_FINDSEGA;
- qin->qoutptr = (unsigned long) qout;
+ qin->qoutptr = virt_to_phys(qout);
qin->qoutlen = sizeof(struct qout64);
memcpy (qin->qname, seg->dcss_name, 8);
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index fc9a7dc26c5e..b14fc0887654 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -71,6 +71,15 @@ static inline unsigned long mmap_base(unsigned long rnd,
return PAGE_ALIGN(STACK_TOP - gap - rnd);
}
+static int get_align_mask(struct file *filp, unsigned long flags)
+{
+ if (!(current->flags & PF_RANDOMIZE))
+ return 0;
+ if (filp || (flags & MAP_SHARED))
+ return MMAP_ALIGN_MASK << PAGE_SHIFT;
+ return 0;
+}
+
unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
unsigned long flags)
@@ -97,10 +106,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
info.length = len;
info.low_limit = mm->mmap_base;
info.high_limit = TASK_SIZE;
- if (filp || (flags & MAP_SHARED))
- info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT;
- else
- info.align_mask = 0;
+ info.align_mask = get_align_mask(filp, flags);
info.align_offset = pgoff << PAGE_SHIFT;
addr = vm_unmapped_area(&info);
if (offset_in_page(addr))
@@ -138,10 +144,7 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long ad
info.length = len;
info.low_limit = PAGE_SIZE;
info.high_limit = mm->mmap_base;
- if (filp || (flags & MAP_SHARED))
- info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT;
- else
- info.align_mask = 0;
+ info.align_mask = get_align_mask(filp, flags);
info.align_offset = pgoff << PAGE_SHIFT;
addr = vm_unmapped_area(&info);
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index 52a44e353796..26afde0d1ed3 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -28,6 +28,7 @@
#include <linux/jump_label.h>
#include <linux/pci.h>
#include <linux/printk.h>
+#include <linux/lockdep.h>
#include <asm/isc.h>
#include <asm/airq.h>
@@ -730,12 +731,12 @@ EXPORT_SYMBOL_GPL(zpci_disable_device);
* equivalent to its state during boot when first probing a driver.
* Consequently after reset the PCI function requires re-initialization via the
* common PCI code including re-enabling IRQs via pci_alloc_irq_vectors()
- * and enabling the function via e.g.pci_enablde_device_flags().The caller
+ * and enabling the function via e.g. pci_enable_device_flags(). The caller
* must guard against concurrent reset attempts.
*
* In most cases this function should not be called directly but through
* pci_reset_function() or pci_reset_bus() which handle the save/restore and
- * locking.
+ * locking - asserted by lockdep.
*
* Return: 0 on success and an error value otherwise
*/
@@ -744,6 +745,7 @@ int zpci_hot_reset_device(struct zpci_dev *zdev)
u8 status;
int rc;
+ lockdep_assert_held(&zdev->state_lock);
zpci_dbg(3, "rst fid:%x, fh:%x\n", zdev->fid, zdev->fh);
if (zdev_enabled(zdev)) {
/* Disables device access, DMAs and IRQs (reset state) */
@@ -806,7 +808,8 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state)
zdev->state = state;
kref_init(&zdev->kref);
- mutex_init(&zdev->lock);
+ mutex_init(&zdev->state_lock);
+ mutex_init(&zdev->fmb_lock);
mutex_init(&zdev->kzdev_lock);
rc = zpci_init_iommu(zdev);
@@ -870,6 +873,10 @@ int zpci_deconfigure_device(struct zpci_dev *zdev)
{
int rc;
+ lockdep_assert_held(&zdev->state_lock);
+ if (zdev->state != ZPCI_FN_STATE_CONFIGURED)
+ return 0;
+
if (zdev->zbus->bus)
zpci_bus_remove_device(zdev, false);
@@ -889,7 +896,7 @@ int zpci_deconfigure_device(struct zpci_dev *zdev)
}
/**
- * zpci_device_reserved() - Mark device as resverved
+ * zpci_device_reserved() - Mark device as reserved
* @zdev: the zpci_dev that was reserved
*
* Handle the case that a given zPCI function was reserved by another system.
@@ -899,8 +906,6 @@ int zpci_deconfigure_device(struct zpci_dev *zdev)
*/
void zpci_device_reserved(struct zpci_dev *zdev)
{
- if (zdev->has_hp_slot)
- zpci_exit_slot(zdev);
/*
* Remove device from zpci_list as it is going away. This also
* makes sure we ignore subsequent zPCI events for this device.
@@ -918,6 +923,9 @@ void zpci_release_device(struct kref *kref)
struct zpci_dev *zdev = container_of(kref, struct zpci_dev, kref);
int ret;
+ if (zdev->has_hp_slot)
+ zpci_exit_slot(zdev);
+
if (zdev->zbus->bus)
zpci_bus_remove_device(zdev, false);
diff --git a/arch/s390/pci/pci_debug.c b/arch/s390/pci/pci_debug.c
index 6dde2263c79d..2cb5043a997d 100644
--- a/arch/s390/pci/pci_debug.c
+++ b/arch/s390/pci/pci_debug.c
@@ -91,9 +91,9 @@ static int pci_perf_show(struct seq_file *m, void *v)
if (!zdev)
return 0;
- mutex_lock(&zdev->lock);
+ mutex_lock(&zdev->fmb_lock);
if (!zdev->fmb) {
- mutex_unlock(&zdev->lock);
+ mutex_unlock(&zdev->fmb_lock);
seq_puts(m, "FMB statistics disabled\n");
return 0;
}
@@ -130,7 +130,7 @@ static int pci_perf_show(struct seq_file *m, void *v)
}
pci_sw_counter_show(m);
- mutex_unlock(&zdev->lock);
+ mutex_unlock(&zdev->fmb_lock);
return 0;
}
@@ -148,7 +148,7 @@ static ssize_t pci_perf_seq_write(struct file *file, const char __user *ubuf,
if (rc)
return rc;
- mutex_lock(&zdev->lock);
+ mutex_lock(&zdev->fmb_lock);
switch (val) {
case 0:
rc = zpci_fmb_disable_device(zdev);
@@ -157,7 +157,7 @@ static ssize_t pci_perf_seq_write(struct file *file, const char __user *ubuf,
rc = zpci_fmb_enable_device(zdev);
break;
}
- mutex_unlock(&zdev->lock);
+ mutex_unlock(&zdev->fmb_lock);
return rc ? rc : count;
}
diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c
index 4d9773ef9e0a..dbe95ec5917e 100644
--- a/arch/s390/pci/pci_event.c
+++ b/arch/s390/pci/pci_event.c
@@ -267,6 +267,7 @@ static void __zpci_event_error(struct zpci_ccdf_err *ccdf)
zpci_err_hex(ccdf, sizeof(*ccdf));
if (zdev) {
+ mutex_lock(&zdev->state_lock);
zpci_update_fh(zdev, ccdf->fh);
if (zdev->zbus->bus)
pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn);
@@ -294,6 +295,8 @@ static void __zpci_event_error(struct zpci_ccdf_err *ccdf)
}
pci_dev_put(pdev);
no_pdev:
+ if (zdev)
+ mutex_unlock(&zdev->state_lock);
zpci_zdev_put(zdev);
}
@@ -326,6 +329,10 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
zpci_dbg(3, "avl fid:%x, fh:%x, pec:%x\n",
ccdf->fid, ccdf->fh, ccdf->pec);
+
+ if (existing_zdev)
+ mutex_lock(&zdev->state_lock);
+
switch (ccdf->pec) {
case 0x0301: /* Reserved|Standby -> Configured */
if (!zdev) {
@@ -348,7 +355,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
break;
case 0x0303: /* Deconfiguration requested */
if (zdev) {
- /* The event may have been queued before we confirgured
+ /* The event may have been queued before we configured
+ /* The event may have been queued before we configured
* the device.
*/
if (zdev->state != ZPCI_FN_STATE_CONFIGURED)
@@ -359,7 +366,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
break;
case 0x0304: /* Configured -> Standby|Reserved */
if (zdev) {
- /* The event may have been queued before we confirgured
+ /* The event may have been queued before we configured
+ /* The event may have been queued before we configured
* the device.:
*/
if (zdev->state == ZPCI_FN_STATE_CONFIGURED)
@@ -383,8 +390,10 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
default:
break;
}
- if (existing_zdev)
+ if (existing_zdev) {
+ mutex_unlock(&zdev->state_lock);
zpci_zdev_put(zdev);
+ }
}
void zpci_event_availability(void *data)
diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c
index 8a7abac51816..a0b872b74fe3 100644
--- a/arch/s390/pci/pci_sysfs.c
+++ b/arch/s390/pci/pci_sysfs.c
@@ -49,6 +49,39 @@ static ssize_t mio_enabled_show(struct device *dev,
}
static DEVICE_ATTR_RO(mio_enabled);
+static int _do_recover(struct pci_dev *pdev, struct zpci_dev *zdev)
+{
+ u8 status;
+ int ret;
+
+ pci_stop_and_remove_bus_device(pdev);
+ if (zdev_enabled(zdev)) {
+ ret = zpci_disable_device(zdev);
+ /*
+ * Due to a z/VM vs LPAR inconsistency in the error
+ * state the FH may indicate an enabled device but
+ * disable says the device is already disabled; don't
+ * treat it as an error here.
+ */
+ if (ret == -EINVAL)
+ ret = 0;
+ if (ret)
+ return ret;
+ }
+
+ ret = zpci_enable_device(zdev);
+ if (ret)
+ return ret;
+
+ if (zdev->dma_table) {
+ ret = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
+ virt_to_phys(zdev->dma_table), &status);
+ if (ret)
+ zpci_disable_device(zdev);
+ }
+ return ret;
+}
+
static ssize_t recover_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
@@ -56,7 +89,6 @@ static ssize_t recover_store(struct device *dev, struct device_attribute *attr,
struct pci_dev *pdev = to_pci_dev(dev);
struct zpci_dev *zdev = to_zpci(pdev);
int ret = 0;
- u8 status;
/* Can't use device_remove_self() here as that would lead us to lock
* the pci_rescan_remove_lock while holding the device' kernfs lock.
@@ -70,6 +102,12 @@ static ssize_t recover_store(struct device *dev, struct device_attribute *attr,
*/
kn = sysfs_break_active_protection(&dev->kobj, &attr->attr);
WARN_ON_ONCE(!kn);
+
+ /* Device needs to be configured and state must not change */
+ mutex_lock(&zdev->state_lock);
+ if (zdev->state != ZPCI_FN_STATE_CONFIGURED)
+ goto out;
+
/* device_remove_file() serializes concurrent calls ignoring all but
* the first
*/
@@ -82,35 +120,13 @@ static ssize_t recover_store(struct device *dev, struct device_attribute *attr,
*/
pci_lock_rescan_remove();
if (pci_dev_is_added(pdev)) {
- pci_stop_and_remove_bus_device(pdev);
- if (zdev_enabled(zdev)) {
- ret = zpci_disable_device(zdev);
- /*
- * Due to a z/VM vs LPAR inconsistency in the error
- * state the FH may indicate an enabled device but
- * disable says the device is already disabled don't
- * treat it as an error here.
- */
- if (ret == -EINVAL)
- ret = 0;
- if (ret)
- goto out;
- }
-
- ret = zpci_enable_device(zdev);
- if (ret)
- goto out;
-
- if (zdev->dma_table) {
- ret = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
- virt_to_phys(zdev->dma_table), &status);
- if (ret)
- zpci_disable_device(zdev);
- }
+ ret = _do_recover(pdev, zdev);
}
-out:
pci_rescan_bus(zdev->zbus->bus);
pci_unlock_rescan_remove();
+
+out:
+ mutex_unlock(&zdev->state_lock);
if (kn)
sysfs_unbreak_active_protection(kn);
return ret ? ret : count;
diff --git a/arch/s390/tools/.gitignore b/arch/s390/tools/.gitignore
index ea62f37b79ef..e6af51d9d183 100644
--- a/arch/s390/tools/.gitignore
+++ b/arch/s390/tools/.gitignore
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
gen_facilities
gen_opcode_table
+relocs
diff --git a/arch/s390/tools/Makefile b/arch/s390/tools/Makefile
index f9dd47ff9ac4..f2862364fb42 100644
--- a/arch/s390/tools/Makefile
+++ b/arch/s390/tools/Makefile
@@ -25,3 +25,8 @@ $(kapi)/facility-defs.h: $(obj)/gen_facilities FORCE
$(kapi)/dis-defs.h: $(obj)/gen_opcode_table FORCE
$(call filechk,dis-defs.h)
+
+hostprogs += relocs
+PHONY += relocs
+relocs: $(obj)/relocs
+ @:
diff --git a/arch/s390/tools/relocs.c b/arch/s390/tools/relocs.c
new file mode 100644
index 000000000000..30a732c808f3
--- /dev/null
+++ b/arch/s390/tools/relocs.c
@@ -0,0 +1,387 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <elf.h>
+#include <byteswap.h>
+#define USE_BSD
+#include <endian.h>
+
+#define ELF_BITS 64
+
+#define ELF_MACHINE EM_S390
+#define ELF_MACHINE_NAME "IBM S/390"
+#define SHT_REL_TYPE SHT_RELA
+#define Elf_Rel Elf64_Rela
+
+#define ELF_CLASS ELFCLASS64
+#define ELF_ENDIAN ELFDATA2MSB
+#define ELF_R_SYM(val) ELF64_R_SYM(val)
+#define ELF_R_TYPE(val) ELF64_R_TYPE(val)
+#define ELF_ST_TYPE(o) ELF64_ST_TYPE(o)
+#define ELF_ST_BIND(o) ELF64_ST_BIND(o)
+#define ELF_ST_VISIBILITY(o) ELF64_ST_VISIBILITY(o)
+
+#define ElfW(type) _ElfW(ELF_BITS, type)
+#define _ElfW(bits, type) __ElfW(bits, type)
+#define __ElfW(bits, type) Elf##bits##_##type
+
+#define Elf_Addr ElfW(Addr)
+#define Elf_Ehdr ElfW(Ehdr)
+#define Elf_Phdr ElfW(Phdr)
+#define Elf_Shdr ElfW(Shdr)
+#define Elf_Sym ElfW(Sym)
+
+static Elf_Ehdr ehdr;
+static unsigned long shnum;
+static unsigned int shstrndx;
+
+struct relocs {
+ uint32_t *offset;
+ unsigned long count;
+ unsigned long size;
+};
+
+static struct relocs relocs64;
+#define FMT PRIu64
+
+struct section {
+ Elf_Shdr shdr;
+ struct section *link;
+ Elf_Rel *reltab;
+};
+
+static struct section *secs;
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define le16_to_cpu(val) (val)
+#define le32_to_cpu(val) (val)
+#define le64_to_cpu(val) (val)
+#define be16_to_cpu(val) bswap_16(val)
+#define be32_to_cpu(val) bswap_32(val)
+#define be64_to_cpu(val) bswap_64(val)
+#endif
+
+#if BYTE_ORDER == BIG_ENDIAN
+#define le16_to_cpu(val) bswap_16(val)
+#define le32_to_cpu(val) bswap_32(val)
+#define le64_to_cpu(val) bswap_64(val)
+#define be16_to_cpu(val) (val)
+#define be32_to_cpu(val) (val)
+#define be64_to_cpu(val) (val)
+#endif
+
+static uint16_t elf16_to_cpu(uint16_t val)
+{
+ if (ehdr.e_ident[EI_DATA] == ELFDATA2LSB)
+ return le16_to_cpu(val);
+ else
+ return be16_to_cpu(val);
+}
+
+static uint32_t elf32_to_cpu(uint32_t val)
+{
+ if (ehdr.e_ident[EI_DATA] == ELFDATA2LSB)
+ return le32_to_cpu(val);
+ else
+ return be32_to_cpu(val);
+}
+
+#define elf_half_to_cpu(x) elf16_to_cpu(x)
+#define elf_word_to_cpu(x) elf32_to_cpu(x)
+
+static uint64_t elf64_to_cpu(uint64_t val)
+{
+ return be64_to_cpu(val);
+}
+
+#define elf_addr_to_cpu(x) elf64_to_cpu(x)
+#define elf_off_to_cpu(x) elf64_to_cpu(x)
+#define elf_xword_to_cpu(x) elf64_to_cpu(x)
+
+static void die(char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vfprintf(stderr, fmt, ap);
+ va_end(ap);
+ exit(1);
+}
+
+static void read_ehdr(FILE *fp)
+{
+ if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1)
+ die("Cannot read ELF header: %s\n", strerror(errno));
+ if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0)
+ die("No ELF magic\n");
+ if (ehdr.e_ident[EI_CLASS] != ELF_CLASS)
+ die("Not a %d bit executable\n", ELF_BITS);
+ if (ehdr.e_ident[EI_DATA] != ELF_ENDIAN)
+ die("ELF endian mismatch\n");
+ if (ehdr.e_ident[EI_VERSION] != EV_CURRENT)
+ die("Unknown ELF version\n");
+
+ /* Convert the fields to native endian */
+ ehdr.e_type = elf_half_to_cpu(ehdr.e_type);
+ ehdr.e_machine = elf_half_to_cpu(ehdr.e_machine);
+ ehdr.e_version = elf_word_to_cpu(ehdr.e_version);
+ ehdr.e_entry = elf_addr_to_cpu(ehdr.e_entry);
+ ehdr.e_phoff = elf_off_to_cpu(ehdr.e_phoff);
+ ehdr.e_shoff = elf_off_to_cpu(ehdr.e_shoff);
+ ehdr.e_flags = elf_word_to_cpu(ehdr.e_flags);
+ ehdr.e_ehsize = elf_half_to_cpu(ehdr.e_ehsize);
+ ehdr.e_phentsize = elf_half_to_cpu(ehdr.e_phentsize);
+ ehdr.e_phnum = elf_half_to_cpu(ehdr.e_phnum);
+ ehdr.e_shentsize = elf_half_to_cpu(ehdr.e_shentsize);
+ ehdr.e_shnum = elf_half_to_cpu(ehdr.e_shnum);
+ ehdr.e_shstrndx = elf_half_to_cpu(ehdr.e_shstrndx);
+
+ shnum = ehdr.e_shnum;
+ shstrndx = ehdr.e_shstrndx;
+
+ if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN))
+ die("Unsupported ELF header type\n");
+ if (ehdr.e_machine != ELF_MACHINE)
+ die("Not for %s\n", ELF_MACHINE_NAME);
+ if (ehdr.e_version != EV_CURRENT)
+ die("Unknown ELF version\n");
+ if (ehdr.e_ehsize != sizeof(Elf_Ehdr))
+ die("Bad Elf header size\n");
+ if (ehdr.e_phentsize != sizeof(Elf_Phdr))
+ die("Bad program header entry\n");
+ if (ehdr.e_shentsize != sizeof(Elf_Shdr))
+ die("Bad section header entry\n");
+
+ if (shnum == SHN_UNDEF || shstrndx == SHN_XINDEX) {
+ Elf_Shdr shdr;
+
+ if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0)
+ die("Seek to %" FMT " failed: %s\n", ehdr.e_shoff, strerror(errno));
+
+ if (fread(&shdr, sizeof(shdr), 1, fp) != 1)
+ die("Cannot read initial ELF section header: %s\n", strerror(errno));
+
+ if (shnum == SHN_UNDEF)
+ shnum = elf_xword_to_cpu(shdr.sh_size);
+
+ if (shstrndx == SHN_XINDEX)
+ shstrndx = elf_word_to_cpu(shdr.sh_link);
+ }
+
+ if (shstrndx >= shnum)
+ die("String table index out of bounds\n");
+}
+
+static void read_shdrs(FILE *fp)
+{
+ Elf_Shdr shdr;
+ int i;
+
+ secs = calloc(shnum, sizeof(struct section));
+ if (!secs)
+ die("Unable to allocate %ld section headers\n", shnum);
+
+ if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0)
+ die("Seek to %" FMT " failed: %s\n", ehdr.e_shoff, strerror(errno));
+
+ for (i = 0; i < shnum; i++) {
+ struct section *sec = &secs[i];
+
+ if (fread(&shdr, sizeof(shdr), 1, fp) != 1) {
+ die("Cannot read ELF section headers %d/%ld: %s\n",
+ i, shnum, strerror(errno));
+ }
+
+ sec->shdr.sh_name = elf_word_to_cpu(shdr.sh_name);
+ sec->shdr.sh_type = elf_word_to_cpu(shdr.sh_type);
+ sec->shdr.sh_flags = elf_xword_to_cpu(shdr.sh_flags);
+ sec->shdr.sh_addr = elf_addr_to_cpu(shdr.sh_addr);
+ sec->shdr.sh_offset = elf_off_to_cpu(shdr.sh_offset);
+ sec->shdr.sh_size = elf_xword_to_cpu(shdr.sh_size);
+ sec->shdr.sh_link = elf_word_to_cpu(shdr.sh_link);
+ sec->shdr.sh_info = elf_word_to_cpu(shdr.sh_info);
+ sec->shdr.sh_addralign = elf_xword_to_cpu(shdr.sh_addralign);
+ sec->shdr.sh_entsize = elf_xword_to_cpu(shdr.sh_entsize);
+
+ if (sec->shdr.sh_link < shnum)
+ sec->link = &secs[sec->shdr.sh_link];
+ }
+
+}
+
+static void read_relocs(FILE *fp)
+{
+ int i, j;
+
+ for (i = 0; i < shnum; i++) {
+ struct section *sec = &secs[i];
+
+ if (sec->shdr.sh_type != SHT_REL_TYPE)
+ continue;
+
+ sec->reltab = malloc(sec->shdr.sh_size);
+ if (!sec->reltab)
+ die("malloc of %" FMT " bytes for relocs failed\n", sec->shdr.sh_size);
+
+ if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0)
+ die("Seek to %" FMT " failed: %s\n", sec->shdr.sh_offset, strerror(errno));
+
+ if (fread(sec->reltab, 1, sec->shdr.sh_size, fp) != sec->shdr.sh_size)
+ die("Cannot read symbol table: %s\n", strerror(errno));
+
+ for (j = 0; j < sec->shdr.sh_size / sizeof(Elf_Rel); j++) {
+ Elf_Rel *rel = &sec->reltab[j];
+
+ rel->r_offset = elf_addr_to_cpu(rel->r_offset);
+ rel->r_info = elf_xword_to_cpu(rel->r_info);
+#if (SHT_REL_TYPE == SHT_RELA)
+ rel->r_addend = elf_xword_to_cpu(rel->r_addend);
+#endif
+ }
+ }
+}
+
+static void add_reloc(struct relocs *r, uint32_t offset)
+{
+ if (r->count == r->size) {
+ unsigned long newsize = r->size + 50000;
+ void *mem = realloc(r->offset, newsize * sizeof(r->offset[0]));
+
+ if (!mem)
+ die("realloc of %ld entries for relocs failed\n", newsize);
+
+ r->offset = mem;
+ r->size = newsize;
+ }
+ r->offset[r->count++] = offset;
+}
+
+static int do_reloc(struct section *sec, Elf_Rel *rel)
+{
+ unsigned int r_type = ELF64_R_TYPE(rel->r_info);
+ ElfW(Addr) offset = rel->r_offset;
+
+ switch (r_type) {
+ case R_390_NONE:
+ case R_390_PC32:
+ case R_390_PC64:
+ case R_390_PC16DBL:
+ case R_390_PC32DBL:
+ case R_390_PLT32DBL:
+ case R_390_GOTENT:
+ case R_390_GOTPCDBL:
+ case R_390_GOTOFF64:
+ break;
+ case R_390_64:
+ add_reloc(&relocs64, offset);
+ break;
+ default:
+ die("Unsupported relocation type: %d\n", r_type);
+ break;
+ }
+
+ return 0;
+}
+
+static void walk_relocs(void)
+{
+ int i;
+
+ /* Walk through the relocations */
+ for (i = 0; i < shnum; i++) {
+ struct section *sec_applies;
+ int j;
+ struct section *sec = &secs[i];
+
+ if (sec->shdr.sh_type != SHT_REL_TYPE)
+ continue;
+
+ sec_applies = &secs[sec->shdr.sh_info];
+ if (!(sec_applies->shdr.sh_flags & SHF_ALLOC))
+ continue;
+
+ for (j = 0; j < sec->shdr.sh_size / sizeof(Elf_Rel); j++) {
+ Elf_Rel *rel = &sec->reltab[j];
+
+ do_reloc(sec, rel);
+ }
+ }
+}
+
+static int cmp_relocs(const void *va, const void *vb)
+{
+ const uint32_t *a, *b;
+
+ a = va; b = vb;
+ return (*a == *b) ? 0 : (*a > *b) ? 1 : -1;
+}
+
+static void sort_relocs(struct relocs *r)
+{
+ qsort(r->offset, r->count, sizeof(r->offset[0]), cmp_relocs);
+}
+
+static int print_reloc(uint32_t v)
+{
+ return fprintf(stdout, "\t.long 0x%08"PRIx32"\n", v) > 0 ? 0 : -1;
+}
+
+static void emit_relocs(void)
+{
+ int i;
+
+ walk_relocs();
+ sort_relocs(&relocs64);
+
+ printf(".section \".vmlinux.relocs_64\",\"a\"\n");
+ for (i = 0; i < relocs64.count; i++)
+ print_reloc(relocs64.offset[i]);
+}
+
+static void process(FILE *fp)
+{
+ read_ehdr(fp);
+ read_shdrs(fp);
+ read_relocs(fp);
+ emit_relocs();
+}
+
+static void usage(void)
+{
+ die("relocs vmlinux\n");
+}
+
+int main(int argc, char **argv)
+{
+ unsigned char e_ident[EI_NIDENT];
+ const char *fname;
+ FILE *fp;
+
+ fname = NULL;
+
+ if (argc != 2)
+ usage();
+
+ fname = argv[1];
+
+ fp = fopen(fname, "r");
+ if (!fp)
+ die("Cannot open %s: %s\n", fname, strerror(errno));
+
+ if (fread(&e_ident, 1, EI_NIDENT, fp) != EI_NIDENT)
+ die("Cannot read %s: %s", fname, strerror(errno));
+
+ rewind(fp);
+
+ process(fp);
+
+ fclose(fp);
+ return 0;
+}
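
Given emit_relocs() and print_reloc() above, the tool's entire output is a single assembly section listing the sorted R_390_64 offsets, each truncated to 32 bits by add_reloc(). Sample output (offsets hypothetical):

	.section ".vmlinux.relocs_64","a"
		.long 0x0001a2b0
		.long 0x0001a2b8
		.long 0x0001a2c0

The decompressor can then walk this table to apply the relocations itself, which is what lets the kernel be linked without CONFIG_PIE_BUILD's .rela.dyn machinery (see the vmlinux.lds.S changes earlier in this merge).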
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index f3969a3600db..a0cc9bb41a92 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1206,10 +1206,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
{
}
-void smp_prepare_boot_cpu(void)
-{
-}
-
void __init smp_setup_processor_id(void)
{
if (tlb_type == spitfire)
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index 5a83da703e87..6a1f36df6a18 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -28,5 +28,7 @@ obj-y += net/
obj-$(CONFIG_KEXEC_FILE) += purgatory/
+obj-y += virt/svm/
+
# for cleaning
subdir- += boot tools
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 854ab38a359a..e88f6f7b6b41 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -147,6 +147,7 @@ config X86
select EDAC_ATOMIC_SCRUB
select EDAC_SUPPORT
select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
+ select GENERIC_CLOCKEVENTS_BROADCAST_IDLE if GENERIC_CLOCKEVENTS_BROADCAST
select GENERIC_CLOCKEVENTS_MIN_ADJUST
select GENERIC_CMOS_UPDATE
select GENERIC_CPU_AUTOPROBE
@@ -1548,19 +1549,6 @@ config AMD_MEM_ENCRYPT
This requires an AMD processor that supports Secure Memory
Encryption (SME).
-config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT
- bool "Activate AMD Secure Memory Encryption (SME) by default"
- depends on AMD_MEM_ENCRYPT
- help
- Say yes to have system memory encrypted by default if running on
- an AMD processor that supports Secure Memory Encryption (SME).
-
- If set to Y, then the encryption of system memory can be
- deactivated with the mem_encrypt=off command line option.
-
- If set to N, then the encryption of system memory can be
- activated with the mem_encrypt=on command line option.
-
# Common NUMA Features
config NUMA
bool "NUMA Memory Allocation and Scheduler Support"
@@ -2123,11 +2111,11 @@ config PHYSICAL_START
help
This gives the physical address where the kernel is loaded.
- If kernel is a not relocatable (CONFIG_RELOCATABLE=n) then
- bzImage will decompress itself to above physical address and
- run from there. Otherwise, bzImage will run from the address where
- it has been loaded by the boot loader and will ignore above physical
- address.
+ If the kernel is not relocatable (CONFIG_RELOCATABLE=n) then bzImage
+ will decompress itself to the above physical address and run from there.
+ Otherwise, bzImage will run from the address where it has been loaded
+ by the boot loader. The only exception is if it is loaded below the
+ above physical address, in which case it will relocate itself there.
In normal kdump cases one does not have to set/change this option
as now bzImage can be compiled as a completely relocatable image
@@ -2443,6 +2431,18 @@ source "kernel/livepatch/Kconfig"
endmenu
+config CC_HAS_NAMED_AS
+ def_bool CC_IS_GCC && GCC_VERSION >= 120100
+
+config USE_X86_SEG_SUPPORT
+ def_bool y
+ depends on CC_HAS_NAMED_AS
+ #
+ # -fsanitize=kernel-address (KASAN) is at the moment incompatible
+ # with named address spaces - see GCC PR sanitizer/111736.
+ #
+ depends on !KASAN
+
config CC_HAS_SLS
def_bool $(cc-option,-mharden-sls=all)
@@ -2474,12 +2474,12 @@ config CALL_PADDING
config FINEIBT
def_bool y
- depends on X86_KERNEL_IBT && CFI_CLANG && RETPOLINE
+ depends on X86_KERNEL_IBT && CFI_CLANG && MITIGATION_RETPOLINE
select CALL_PADDING
config HAVE_CALL_THUNKS
def_bool y
- depends on CC_HAS_ENTRY_PADDING && RETHUNK && OBJTOOL
+ depends on CC_HAS_ENTRY_PADDING && MITIGATION_RETHUNK && OBJTOOL
config CALL_THUNKS
def_bool n
@@ -2501,7 +2501,7 @@ menuconfig SPECULATION_MITIGATIONS
if SPECULATION_MITIGATIONS
-config PAGE_TABLE_ISOLATION
+config MITIGATION_PAGE_TABLE_ISOLATION
bool "Remove the kernel mapping in user mode"
default y
depends on (X86_64 || X86_PAE)
@@ -2512,7 +2512,7 @@ config PAGE_TABLE_ISOLATION
See Documentation/arch/x86/pti.rst for more details.
-config RETPOLINE
+config MITIGATION_RETPOLINE
bool "Avoid speculative indirect branches in kernel"
select OBJTOOL if HAVE_OBJTOOL
default y
@@ -2522,9 +2522,9 @@ config RETPOLINE
branches. Requires a compiler with -mindirect-branch=thunk-extern
support for full protection. The kernel may run slower.
-config RETHUNK
+config MITIGATION_RETHUNK
bool "Enable return-thunks"
- depends on RETPOLINE && CC_HAS_RETURN_THUNK
+ depends on MITIGATION_RETPOLINE && CC_HAS_RETURN_THUNK
select OBJTOOL if HAVE_OBJTOOL
default y if X86_64
help
@@ -2533,14 +2533,14 @@ config RETHUNK
Requires a compiler with -mfunction-return=thunk-extern
support for full protection. The kernel may run slower.
-config CPU_UNRET_ENTRY
+config MITIGATION_UNRET_ENTRY
bool "Enable UNRET on kernel entry"
- depends on CPU_SUP_AMD && RETHUNK && X86_64
+ depends on CPU_SUP_AMD && MITIGATION_RETHUNK && X86_64
default y
help
Compile the kernel with support for the retbleed=unret mitigation.
-config CALL_DEPTH_TRACKING
+config MITIGATION_CALL_DEPTH_TRACKING
bool "Mitigate RSB underflow with call depth tracking"
depends on CPU_SUP_INTEL && HAVE_CALL_THUNKS
select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
@@ -2560,7 +2560,7 @@ config CALL_DEPTH_TRACKING
config CALL_THUNKS_DEBUG
bool "Enable call thunks and call depth tracking debugging"
- depends on CALL_DEPTH_TRACKING
+ depends on MITIGATION_CALL_DEPTH_TRACKING
select FUNCTION_ALIGNMENT_32B
default n
help
@@ -2571,14 +2571,14 @@ config CALL_THUNKS_DEBUG
Only enable this when you are debugging call thunks as this
creates a noticeable runtime overhead. If unsure say N.
-config CPU_IBPB_ENTRY
+config MITIGATION_IBPB_ENTRY
bool "Enable IBPB on kernel entry"
depends on CPU_SUP_AMD && X86_64
default y
help
Compile the kernel with support for the retbleed=ibpb mitigation.
-config CPU_IBRS_ENTRY
+config MITIGATION_IBRS_ENTRY
bool "Enable IBRS on kernel entry"
depends on CPU_SUP_INTEL && X86_64
default y
@@ -2587,14 +2587,14 @@ config CPU_IBRS_ENTRY
This mitigates both spectre_v2 and retbleed at great cost to
performance.
-config CPU_SRSO
+config MITIGATION_SRSO
bool "Mitigate speculative RAS overflow on AMD"
- depends on CPU_SUP_AMD && X86_64 && RETHUNK
+ depends on CPU_SUP_AMD && X86_64 && MITIGATION_RETHUNK
default y
help
Enable the SRSO mitigation needed on AMD Zen1-4 machines.
-config SLS
+config MITIGATION_SLS
bool "Mitigate Straight-Line-Speculation"
depends on CC_HAS_SLS && X86_64
select OBJTOOL if HAVE_OBJTOOL
@@ -2604,7 +2604,7 @@ config SLS
against straight line speculation. The kernel image might be slightly
larger.
-config GDS_FORCE_MITIGATION
+config MITIGATION_GDS_FORCE
bool "Force GDS Mitigation"
depends on CPU_SUP_INTEL
default n
@@ -2623,6 +2623,17 @@ config GDS_FORCE_MITIGATION
If in doubt, say N.
+config MITIGATION_RFDS
+ bool "RFDS Mitigation"
+ depends on CPU_SUP_INTEL
+ default y
+ help
+ Enable mitigation for Register File Data Sampling (RFDS) by default.
+ RFDS is a hardware vulnerability which affects Intel Atom CPUs. It
+ allows unprivileged speculative access to stale data previously
+ stored in floating point, vector and integer registers.
+ See also <file:Documentation/admin-guide/hw-vuln/reg-file-data-sampling.rst>
+
endif
config ARCH_HAS_ADD_PAGES
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b221dd87c034..ba6c59c18f5e 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -22,7 +22,7 @@ RETPOLINE_VDSO_CFLAGS := -mretpoline
endif
RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch-cs-prefix)
-ifdef CONFIG_RETHUNK
+ifdef CONFIG_MITIGATION_RETHUNK
RETHUNK_CFLAGS := -mfunction-return=thunk-extern
RETPOLINE_CFLAGS += $(RETHUNK_CFLAGS)
endif
@@ -53,6 +53,9 @@ REALMODE_CFLAGS += -fno-stack-protector
REALMODE_CFLAGS += -Wno-address-of-packed-member
REALMODE_CFLAGS += $(cc_stack_align4)
REALMODE_CFLAGS += $(CLANG_FLAGS)
+ifdef CONFIG_CC_IS_CLANG
+REALMODE_CFLAGS += -Wno-gnu
+endif
export REALMODE_CFLAGS
# BITS is used as extension for files which are available in a 32 bit
@@ -192,7 +195,7 @@ KBUILD_CFLAGS += -Wno-sign-compare
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
# Avoid indirect branches in kernel to deal with Spectre
-ifdef CONFIG_RETPOLINE
+ifdef CONFIG_MITIGATION_RETPOLINE
KBUILD_CFLAGS += $(RETPOLINE_CFLAGS)
# Additionally, avoid generating expensive indirect jumps which
# are subject to retpolines for small number of switch cases.
@@ -205,7 +208,7 @@ ifdef CONFIG_RETPOLINE
endif
endif
-ifdef CONFIG_SLS
+ifdef CONFIG_MITIGATION_SLS
KBUILD_CFLAGS += -mharden-sls=all
endif
@@ -299,12 +302,11 @@ install:
vdso-install-$(CONFIG_X86_64) += arch/x86/entry/vdso/vdso64.so.dbg
vdso-install-$(CONFIG_X86_X32_ABI) += arch/x86/entry/vdso/vdsox32.so.dbg
-vdso-install-$(CONFIG_X86_32) += arch/x86/entry/vdso/vdso32.so.dbg
-vdso-install-$(CONFIG_IA32_EMULATION) += arch/x86/entry/vdso/vdso32.so.dbg
+vdso-install-$(CONFIG_COMPAT_32) += arch/x86/entry/vdso/vdso32.so.dbg
archprepare: checkbin
checkbin:
-ifdef CONFIG_RETPOLINE
+ifdef CONFIG_MITIGATION_RETPOLINE
ifeq ($(RETPOLINE_CFLAGS),)
@echo "You are building kernel with non-retpoline compiler." >&2
@echo "Please update your compiler." >&2
diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
index 18d15d1ce87d..f196b1d1ddf8 100644
--- a/arch/x86/boot/compressed/acpi.c
+++ b/arch/x86/boot/compressed/acpi.c
@@ -5,6 +5,8 @@
#include "../string.h"
#include "efi.h"
+#include <asm/bootparam.h>
+
#include <linux/numa.h>
/*
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index c1bb180973ea..e162d7f59cc5 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -1,6 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
#include "misc.h"
+#include <asm/bootparam.h>
+
static unsigned long fs;
static inline void set_fs(unsigned long seg)
{
diff --git a/arch/x86/boot/compressed/efi.c b/arch/x86/boot/compressed/efi.c
index 6edd034b0b30..f2e50f9758e6 100644
--- a/arch/x86/boot/compressed/efi.c
+++ b/arch/x86/boot/compressed/efi.c
@@ -7,6 +7,8 @@
#include "misc.h"
+#include <asm/bootparam.h>
+
/**
* efi_get_type - Given a pointer to boot_params, determine the type of EFI environment.
*
diff --git a/arch/x86/boot/compressed/efi.h b/arch/x86/boot/compressed/efi.h
index 866c0af8b5b9..b22300970f97 100644
--- a/arch/x86/boot/compressed/efi.h
+++ b/arch/x86/boot/compressed/efi.h
@@ -97,15 +97,6 @@ typedef struct {
u32 tables;
} efi_system_table_32_t;
-/* kexec external ABI */
-struct efi_setup_data {
- u64 fw_vendor;
- u64 __unused;
- u64 tables;
- u64 smbios;
- u64 reserved[8];
-};
-
struct efi_unaccepted_memory {
u32 version;
u32 unit_size;
diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c
index d040080d7edb..909f2a35b60c 100644
--- a/arch/x86/boot/compressed/ident_map_64.c
+++ b/arch/x86/boot/compressed/ident_map_64.c
@@ -8,8 +8,8 @@
* Copyright (C) 2016 Kees Cook
*/
-/* No PAGE_TABLE_ISOLATION support needed either: */
-#undef CONFIG_PAGE_TABLE_ISOLATION
+/* No MITIGATION_PAGE_TABLE_ISOLATION support needed either: */
+#undef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
#include "error.h"
#include "misc.h"
@@ -389,5 +389,5 @@ void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code)
void do_boot_nmi_trap(struct pt_regs *regs, unsigned long error_code)
{
- /* Empty handler to ignore NMI during early boot */
+ spurious_nmi_count++;
}
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index b99e08e6815b..408507e305be 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -52,6 +52,7 @@ struct port_io_ops pio_ops;
memptr free_mem_ptr;
memptr free_mem_end_ptr;
+int spurious_nmi_count;
static char *vidmem;
static int vidport;
@@ -164,21 +165,34 @@ void __putstr(const char *s)
outb(0xff & (pos >> 1), vidport+1);
}
-void __puthex(unsigned long value)
+static noinline void __putnum(unsigned long value, unsigned int base,
+ int mindig)
{
- char alpha[2] = "0";
- int bits;
+ char buf[8*sizeof(value)+1];
+ char *p;
- for (bits = sizeof(value) * 8 - 4; bits >= 0; bits -= 4) {
- unsigned long digit = (value >> bits) & 0xf;
+ p = buf + sizeof(buf);
+ *--p = '\0';
- if (digit < 0xA)
- alpha[0] = '0' + digit;
- else
- alpha[0] = 'a' + (digit - 0xA);
+ while (mindig-- > 0 || value) {
+ unsigned char digit = value % base;
+ digit += (digit >= 10) ? ('a'-10) : '0';
+ *--p = digit;
- __putstr(alpha);
+ value /= base;
}
+
+ __putstr(p);
+}
+
+void __puthex(unsigned long value)
+{
+ __putnum(value, 16, sizeof(value)*2);
+}
+
+void __putdec(unsigned long value)
+{
+ __putnum(value, 10, 1);
}
#ifdef CONFIG_X86_NEED_RELOCS
@@ -358,6 +372,19 @@ unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr,
}
/*
+ * Set the memory encryption xloadflag based on the mem_encrypt= command line
+ * parameter, if provided.
+ */
+static void parse_mem_encrypt(struct setup_header *hdr)
+{
+ int on = cmdline_find_option_bool("mem_encrypt=on");
+ int off = cmdline_find_option_bool("mem_encrypt=off");
+
+ if (on > off)
+ hdr->xloadflags |= XLF_MEM_ENCRYPTION;
+}
+
+/*
* The compressed kernel image (ZO), has been moved so that its position
* is against the end of the buffer used to hold the uncompressed kernel
* image (VO) and the execution environment (.bss, .brk), which makes sure
@@ -387,6 +414,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output)
/* Clear flags intended for solely in-kernel use. */
boot_params_ptr->hdr.loadflags &= ~KASLR_FLAG;
+ parse_mem_encrypt(&boot_params_ptr->hdr);
+
sanitize_boot_params(boot_params_ptr);
if (boot_params_ptr->screen_info.orig_video_mode == 7) {
@@ -493,6 +522,12 @@ asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output)
/* Disable exception handling before booting the kernel */
cleanup_exception_handling();
+ if (spurious_nmi_count) {
+ error_putstr("Spurious early NMIs ignored: ");
+ error_putdec(spurious_nmi_count);
+ error_putstr("\n");
+ }
+
return output + entry_offset;
}
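
The __putnum() rework above replaces the hex-only nibble loop with one base converter shared by __puthex() and __putdec(): digits are produced least-significant first into the tail of a stack buffer, so the finished string needs no reversal, and mindig supplies zero-padding. A standalone sketch of the same loop, with illustrative names:

#include <stdio.h>

/* Digits go into the end of 'buf' backwards; 'p' ends at the string start. */
static void putnum(unsigned long value, unsigned int base, int mindig)
{
	char buf[8 * sizeof(value) + 1];
	char *p = buf + sizeof(buf);

	*--p = '\0';
	while (mindig-- > 0 || value) {
		unsigned char digit = value % base;

		*--p = digit + (digit >= 10 ? 'a' - 10 : '0');
		value /= base;
	}
	fputs(p, stdout);
}

int main(void)
{
	putnum(0xdeadbeef, 16, 2 * sizeof(unsigned long));	/* like __puthex() */
	putc('\n', stdout);
	putnum(42, 10, 1);					/* like __putdec() */
	putc('\n', stdout);
	return 0;
}
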
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index bc2f0f17fb90..b353a7be380c 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -59,12 +59,15 @@ extern char _head[], _end[];
/* misc.c */
extern memptr free_mem_ptr;
extern memptr free_mem_end_ptr;
+extern int spurious_nmi_count;
void *malloc(int size);
void free(void *where);
void __putstr(const char *s);
void __puthex(unsigned long value);
+void __putdec(unsigned long value);
#define error_putstr(__x) __putstr(__x)
#define error_puthex(__x) __puthex(__x)
+#define error_putdec(__x) __putdec(__x)
#ifdef CONFIG_X86_VERBOSE_BOOTUP
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index 51f957b24ba7..c882e1f67af0 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include "misc.h"
+#include <asm/bootparam.h>
#include <asm/e820/types.h>
#include <asm/processor.h>
#include "pgtable.h"
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 454acd7a2daf..ec71846d28c9 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -12,6 +12,7 @@
*/
#include "misc.h"
+#include <asm/bootparam.h>
#include <asm/pgtable_types.h>
#include <asm/sev.h>
#include <asm/trapnr.h>
@@ -116,6 +117,9 @@ static bool fault_in_kernel_space(unsigned long address)
#undef __init
#define __init
+#undef __head
+#define __head
+
#define __BOOT_COMPRESSED
/* Basic instruction decoding support needed */
@@ -304,6 +308,10 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code)
if (result != ES_OK)
goto finish;
+ result = vc_check_opcode_bytes(&ctxt, exit_code);
+ if (result != ES_OK)
+ goto finish;
+
switch (exit_code) {
case SVM_EXIT_RDTSC:
case SVM_EXIT_RDTSCP:
@@ -365,7 +373,7 @@ static void enforce_vmpl0(void)
MSR_AMD64_SNP_VMPL_SSS | \
MSR_AMD64_SNP_SECURE_TSC | \
MSR_AMD64_SNP_VMGEXIT_PARAM | \
- MSR_AMD64_SNP_VMSA_REG_PROTECTION | \
+ MSR_AMD64_SNP_VMSA_REG_PROT | \
MSR_AMD64_SNP_RESERVED_BIT13 | \
MSR_AMD64_SNP_RESERVED_BIT15 | \
MSR_AMD64_SNP_RESERVED_MASK)
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index a1bbedd989e4..b5c79f43359b 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -111,11 +111,7 @@ extra_header_fields:
.long salign # SizeOfHeaders
.long 0 # CheckSum
.word IMAGE_SUBSYSTEM_EFI_APPLICATION # Subsystem (EFI application)
-#ifdef CONFIG_EFI_DXE_MEM_ATTRIBUTES
.word IMAGE_DLL_CHARACTERISTICS_NX_COMPAT # DllCharacteristics
-#else
- .word 0 # DllCharacteristics
-#endif
#ifdef CONFIG_X86_32
.long 0 # SizeOfStackReserve
.long 0 # SizeOfStackCommit
diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
index eeec9986570e..d07be9d05cd0 100644
--- a/arch/x86/coco/core.c
+++ b/arch/x86/coco/core.c
@@ -14,7 +14,7 @@
#include <asm/processor.h>
enum cc_vendor cc_vendor __ro_after_init = CC_VENDOR_NONE;
-static u64 cc_mask __ro_after_init;
+u64 cc_mask __ro_after_init;
static bool noinstr intel_cc_platform_has(enum cc_attr attr)
{
@@ -148,8 +148,3 @@ u64 cc_mkdec(u64 val)
}
}
EXPORT_SYMBOL_GPL(cc_mkdec);
-
-__init void cc_set_mask(u64 mask)
-{
- cc_mask = mask;
-}
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 73abbbdd26f8..91801138b10b 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -42,7 +42,7 @@ CONFIG_EFI_STUB=y
CONFIG_HZ_1000=y
CONFIG_KEXEC=y
CONFIG_CRASH_DUMP=y
-# CONFIG_RETHUNK is not set
+# CONFIG_MITIGATION_RETHUNK is not set
CONFIG_HIBERNATION=y
CONFIG_PM_DEBUG=y
CONFIG_PM_TRACE_RTC=y
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 3ff925b17b7e..ea81770629ee 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -147,10 +147,10 @@ For 32-bit we have the following conventions - kernel is built with
.endif
.endm
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/*
- * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
+ * MITIGATION_PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
* halves:
*/
#define PTI_USER_PGTABLE_BIT PAGE_SHIFT
@@ -165,7 +165,7 @@ For 32-bit we have the following conventions - kernel is built with
.macro ADJUST_KERNEL_CR3 reg:req
ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
- /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
+ /* Clear PCID and "MITIGATION_PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
andq $(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg
.endm
@@ -178,7 +178,7 @@ For 32-bit we have the following conventions - kernel is built with
.endm
#define THIS_CPU_user_pcid_flush_mask \
- PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
+ PER_CPU_VAR(cpu_tlbstate + TLB_STATE_user_pcid_flush_mask)
.macro SWITCH_TO_USER_CR3 scratch_reg:req scratch_reg2:req
mov %cr3, \scratch_reg
@@ -244,17 +244,19 @@ For 32-bit we have the following conventions - kernel is built with
.Ldone_\@:
.endm
-.macro RESTORE_CR3 scratch_reg:req save_reg:req
+/* Restore CR3 from a kernel context. May restore a user CR3 value. */
+.macro PARANOID_RESTORE_CR3 scratch_reg:req save_reg:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
- ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
-
/*
- * KERNEL pages can always resume with NOFLUSH as we do
- * explicit flushes.
+ * If CR3 contained the kernel page tables at the paranoid exception
+ * entry, then there is nothing to restore as CR3 is not modified while
+ * handling the exception.
*/
bt $PTI_USER_PGTABLE_BIT, \save_reg
- jnc .Lnoflush_\@
+ jnc .Lend_\@
+
+ ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
/*
* Check if there's a pending flush for the user ASID we're
@@ -262,25 +264,17 @@ For 32-bit we have the following conventions - kernel is built with
*/
movq \save_reg, \scratch_reg
andq $(0x7FF), \scratch_reg
- bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
- jnc .Lnoflush_\@
-
btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
- jmp .Lwrcr3_\@
+ jc .Lwrcr3_\@
-.Lnoflush_\@:
SET_NOFLUSH_BIT \save_reg
.Lwrcr3_\@:
- /*
- * The CR3 write could be avoided when not changing its value,
- * but would require a CR3 read *and* a scratch register.
- */
movq \save_reg, %cr3
.Lend_\@:
.endm
-#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
+#else /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION=n: */
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
.endm
@@ -290,7 +284,7 @@ For 32-bit we have the following conventions - kernel is built with
.endm
.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
.endm
-.macro RESTORE_CR3 scratch_reg:req save_reg:req
+.macro PARANOID_RESTORE_CR3 scratch_reg:req save_reg:req
.endm
#endif
@@ -308,7 +302,7 @@ For 32-bit we have the following conventions - kernel is built with
* Assumes x86_spec_ctrl_{base,current} to have SPEC_CTRL_IBRS set.
*/
.macro IBRS_ENTER save_reg
-#ifdef CONFIG_CPU_IBRS_ENTRY
+#ifdef CONFIG_MITIGATION_IBRS_ENTRY
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS
movl $MSR_IA32_SPEC_CTRL, %ecx
@@ -337,7 +331,7 @@ For 32-bit we have the following conventions - kernel is built with
* regs. Must be called after the last RET.
*/
.macro IBRS_EXIT save_reg
-#ifdef CONFIG_CPU_IBRS_ENTRY
+#ifdef CONFIG_MITIGATION_IBRS_ENTRY
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS
movl $MSR_IA32_SPEC_CTRL, %ecx
@@ -431,3 +425,63 @@ For 32-bit we have the following conventions - kernel is built with
.endm
#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_X86_64
+
+/* rdi: arg1 ... normal C conventions. rax is saved/restored. */
+.macro THUNK name, func
+SYM_FUNC_START(\name)
+ pushq %rbp
+ movq %rsp, %rbp
+
+ pushq %rdi
+ pushq %rsi
+ pushq %rdx
+ pushq %rcx
+ pushq %rax
+ pushq %r8
+ pushq %r9
+ pushq %r10
+ pushq %r11
+
+ call \func
+
+ popq %r11
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rax
+ popq %rcx
+ popq %rdx
+ popq %rsi
+ popq %rdi
+ popq %rbp
+ RET
+SYM_FUNC_END(\name)
+ _ASM_NOKPROBE(\name)
+.endm
+
+#else /* CONFIG_X86_32 */
+
+/* put return address in eax (arg1) */
+.macro THUNK name, func, put_ret_addr_in_eax=0
+SYM_CODE_START_NOALIGN(\name)
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+
+ .if \put_ret_addr_in_eax
+ /* Place EIP in arg1 */
+ movl 3*4(%esp), %eax
+ .endif
+
+ call \func
+ popl %edx
+ popl %ecx
+ popl %eax
+ RET
+ _ASM_NOKPROBE(\name)
+SYM_CODE_END(\name)
+ .endm
+
+#endif
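
RESTORE_CR3 is renamed to PARANOID_RESTORE_CR3 above and its tests are reordered: the PTI bit stashed in the saved CR3 is checked first, so an exception that arrived on the kernel CR3 skips the write entirely (CR3 was never switched), and only a user CR3 consults, and clears, the per-ASID pending-flush bit to decide whether the PCID NOFLUSH bit may be set. A C-level sketch of that decision; the flush-mask helper is illustrative, not a kernel API, and the real per-CPU mask covers 2048 ASIDs rather than the 64 modelled here:

#include <stdbool.h>

#define PTI_USER_PGTABLE_BIT	12	/* bit 12 selects the user PGD half */
#define CR3_NOFLUSH_BIT		63	/* PCID: do not flush this ASID */

/* Toy stand-in for the per-CPU user_pcid_flush_mask (64 ASIDs only). */
static unsigned long user_pcid_flush_mask;

static bool test_and_clear_pending_flush(unsigned long asid)
{
	unsigned long bit = 1UL << (asid & 63);
	bool pending = user_pcid_flush_mask & bit;

	user_pcid_flush_mask &= ~bit;
	return pending;
}

/* Value to load into CR3, mirroring the PARANOID_RESTORE_CR3 logic. */
static unsigned long paranoid_restore_cr3(unsigned long saved_cr3)
{
	/* Kernel CR3 at entry: the asm skips the CR3 write altogether. */
	if (!(saved_cr3 & (1UL << PTI_USER_PGTABLE_BIT)))
		return saved_cr3;

	/* User CR3 with a pending flush: write it as-is, flushing. */
	if (test_and_clear_pending_flush(saved_cr3 & 0x7ff))
		return saved_cr3;

	/* No pending flush: the CR3 write may preserve the TLB. */
	return saved_cr3 | (1UL << CR3_NOFLUSH_BIT);
}
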
diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S
index 003379049924..d9feadffa972 100644
--- a/arch/x86/entry/entry.S
+++ b/arch/x86/entry/entry.S
@@ -10,6 +10,8 @@
#include <asm/segment.h>
#include <asm/cache.h>
+#include "calling.h"
+
.pushsection .noinstr.text, "ax"
SYM_FUNC_START(entry_ibpb)
@@ -43,3 +45,4 @@ EXPORT_SYMBOL_GPL(mds_verw_sel);
.popsection
+THUNK warn_thunk_thunk, __warn_thunk
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index b8be0164385c..d3a814efbff6 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -305,7 +305,7 @@
.macro CHECK_AND_APPLY_ESPFIX
#ifdef CONFIG_X86_ESPFIX32
#define GDT_ESPFIX_OFFSET (GDT_ENTRY_ESPFIX_SS * 8)
-#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + GDT_ESPFIX_OFFSET
+#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page + GDT_ESPFIX_OFFSET)
ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 23c90813e877..8af2a26b24f6 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -191,7 +191,7 @@ SYM_FUNC_START(__switch_to_asm)
#ifdef CONFIG_STACKPROTECTOR
movq TASK_stack_canary(%rsi), %rbx
- movq %rbx, PER_CPU_VAR(fixed_percpu_data) + FIXED_stack_canary
+ movq %rbx, PER_CPU_VAR(fixed_percpu_data + FIXED_stack_canary)
#endif
/*
@@ -561,7 +561,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
#ifdef CONFIG_XEN_PV
ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
#endif
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
ALTERNATIVE "", "jmp .Lpti_restore_regs_and_return_to_usermode", X86_FEATURE_PTI
#endif
@@ -578,7 +578,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
jnz .Lnative_iret
ud2
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
.Lpti_restore_regs_and_return_to_usermode:
POP_REGS pop_rdi=0
@@ -970,14 +970,14 @@ SYM_CODE_START_LOCAL(paranoid_exit)
IBRS_EXIT save_reg=%r15
/*
- * The order of operations is important. RESTORE_CR3 requires
+ * The order of operations is important. PARANOID_RESTORE_CR3 requires
* kernel GSBASE.
*
* NB to anyone to try to optimize this code: this code does
* not execute at all for exceptions from user mode. Those
* exceptions go through error_return instead.
*/
- RESTORE_CR3 scratch_reg=%rax save_reg=%r14
+ PARANOID_RESTORE_CR3 scratch_reg=%rax save_reg=%r14
/* Handle the three GSBASE cases */
ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE
@@ -1098,7 +1098,7 @@ SYM_CODE_END(error_return)
*
* Registers:
* %r14: Used to save/restore the CR3 of the interrupted context
- * when PAGE_TABLE_ISOLATION is in use. Do not clobber.
+ * when MITIGATION_PAGE_TABLE_ISOLATION is in use. Do not clobber.
*/
SYM_CODE_START(asm_exc_nmi)
UNWIND_HINT_IRET_ENTRY
@@ -1406,8 +1406,7 @@ end_repeat_nmi:
/* Always restore stashed SPEC_CTRL value (see paranoid_entry) */
IBRS_EXIT save_reg=%r15
- /* Always restore stashed CR3 value (see paranoid_entry) */
- RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
+ PARANOID_RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
/*
* The above invocation of paranoid_entry stored the GSBASE
diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S
index 0103e103a657..da37f42f4549 100644
--- a/arch/x86/entry/thunk_32.S
+++ b/arch/x86/entry/thunk_32.S
@@ -4,33 +4,15 @@
* Copyright 2008 by Steven Rostedt, Red Hat, Inc
* (inspired by Andi Kleen's thunk_64.S)
*/
- #include <linux/export.h>
- #include <linux/linkage.h>
- #include <asm/asm.h>
- /* put return address in eax (arg1) */
- .macro THUNK name, func, put_ret_addr_in_eax=0
-SYM_CODE_START_NOALIGN(\name)
- pushl %eax
- pushl %ecx
- pushl %edx
+#include <linux/export.h>
+#include <linux/linkage.h>
+#include <asm/asm.h>
- .if \put_ret_addr_in_eax
- /* Place EIP in the arg1 */
- movl 3*4(%esp), %eax
- .endif
+#include "calling.h"
- call \func
- popl %edx
- popl %ecx
- popl %eax
- RET
- _ASM_NOKPROBE(\name)
-SYM_CODE_END(\name)
- .endm
-
- THUNK preempt_schedule_thunk, preempt_schedule
- THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace
- EXPORT_SYMBOL(preempt_schedule_thunk)
- EXPORT_SYMBOL(preempt_schedule_notrace_thunk)
+THUNK preempt_schedule_thunk, preempt_schedule
+THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace
+EXPORT_SYMBOL(preempt_schedule_thunk)
+EXPORT_SYMBOL(preempt_schedule_notrace_thunk)
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
index 416b400f39db..119ebdc3d362 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -9,39 +9,6 @@
#include "calling.h"
#include <asm/asm.h>
- /* rdi: arg1 ... normal C conventions. rax is saved/restored. */
- .macro THUNK name, func
-SYM_FUNC_START(\name)
- pushq %rbp
- movq %rsp, %rbp
-
- pushq %rdi
- pushq %rsi
- pushq %rdx
- pushq %rcx
- pushq %rax
- pushq %r8
- pushq %r9
- pushq %r10
- pushq %r11
-
- call \func
-
- popq %r11
- popq %r10
- popq %r9
- popq %r8
- popq %rax
- popq %rcx
- popq %rdx
- popq %rsi
- popq %rdi
- popq %rbp
- RET
-SYM_FUNC_END(\name)
- _ASM_NOKPROBE(\name)
- .endm
-
THUNK preempt_schedule_thunk, preempt_schedule
THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace
EXPORT_SYMBOL(preempt_schedule_thunk)
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index b1b8dd1608f7..620f6257bbe9 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -3,7 +3,7 @@
# Building vDSO images for x86.
#
-# Include the generic Makefile to check the built vdso.
+# Include the generic Makefile to check the built vDSO:
include $(srctree)/lib/vdso/Makefile
# Sanitizer runtimes are unavailable and cannot be linked here.
@@ -18,48 +18,39 @@ OBJECT_FILES_NON_STANDARD := y
# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
KCOV_INSTRUMENT := n
-VDSO64-$(CONFIG_X86_64) := y
-VDSOX32-$(CONFIG_X86_X32_ABI) := y
-VDSO32-$(CONFIG_X86_32) := y
-VDSO32-$(CONFIG_IA32_EMULATION) := y
-
-# files to link into the vdso
+# Files to link into the vDSO:
vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
vobjs32-y += vdso32/vclock_gettime.o vdso32/vgetcpu.o
vobjs-$(CONFIG_X86_SGX) += vsgx.o
-# files to link into kernel
-obj-y += vma.o extable.o
-KASAN_SANITIZE_vma.o := y
-UBSAN_SANITIZE_vma.o := y
-KCSAN_SANITIZE_vma.o := y
-OBJECT_FILES_NON_STANDARD_vma.o := n
-OBJECT_FILES_NON_STANDARD_extable.o := n
+# Files to link into the kernel:
+obj-y += vma.o extable.o
+KASAN_SANITIZE_vma.o := y
+UBSAN_SANITIZE_vma.o := y
+KCSAN_SANITIZE_vma.o := y
+
+OBJECT_FILES_NON_STANDARD_vma.o := n
+OBJECT_FILES_NON_STANDARD_extable.o := n
-# vDSO images to build
-vdso_img-$(VDSO64-y) += 64
-vdso_img-$(VDSOX32-y) += x32
-vdso_img-$(VDSO32-y) += 32
+# vDSO images to build:
+obj-$(CONFIG_X86_64) += vdso-image-64.o
+obj-$(CONFIG_X86_X32_ABI) += vdso-image-x32.o
+obj-$(CONFIG_COMPAT_32) += vdso-image-32.o vdso32-setup.o
-obj-$(VDSO32-y) += vdso32-setup.o
-OBJECT_FILES_NON_STANDARD_vdso32-setup.o := n
+OBJECT_FILES_NON_STANDARD_vdso-image-32.o := n
+OBJECT_FILES_NON_STANDARD_vdso-image-64.o := n
+OBJECT_FILES_NON_STANDARD_vdso32-setup.o := n
-vobjs := $(foreach F,$(vobjs-y),$(obj)/$F)
-vobjs32 := $(foreach F,$(vobjs32-y),$(obj)/$F)
+vobjs := $(addprefix $(obj)/, $(vobjs-y))
+vobjs32 := $(addprefix $(obj)/, $(vobjs32-y))
$(obj)/vdso.o: $(obj)/vdso.so
targets += vdso.lds $(vobjs-y)
targets += vdso32/vdso32.lds $(vobjs32-y)
-# Build the vDSO image C files and link them in.
-vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o)
-vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c)
-vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg)
-obj-y += $(vdso_img_objs)
-targets += $(vdso_img_cfiles)
-targets += $(vdso_img_sodbg) $(vdso_img-y:%=vdso%.so)
+targets += $(foreach x, 64 x32 32, vdso-image-$(x).c vdso$(x).so vdso$(x).so.dbg)
CPPFLAGS_vdso.lds += -P -C
@@ -87,7 +78,7 @@ CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
-fno-omit-frame-pointer -foptimize-sibling-calls \
-DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
-ifdef CONFIG_RETPOLINE
+ifdef CONFIG_MITIGATION_RETPOLINE
ifneq ($(RETPOLINE_VDSO_CFLAGS),)
CFL += $(RETPOLINE_VDSO_CFLAGS)
endif
@@ -123,7 +114,7 @@ VDSO_LDFLAGS_vdsox32.lds = -m elf32_x86_64 -soname linux-vdso.so.1 \
vobjx32s-y := $(vobjs-y:.o=-x32.o)
# same thing, but in the output directory
-vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F)
+vobjx32s := $(addprefix $(obj)/, $(vobjx32s-y))
# Convert 64bit object file to x32 for x32 vDSO.
quiet_cmd_x32 = X32 $@
@@ -164,7 +155,7 @@ KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
KBUILD_CFLAGS_32 += -fno-omit-frame-pointer
KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING
-ifdef CONFIG_RETPOLINE
+ifdef CONFIG_MITIGATION_RETPOLINE
ifneq ($(RETPOLINE_VDSO_CFLAGS),)
KBUILD_CFLAGS_32 += $(RETPOLINE_VDSO_CFLAGS)
endif
@@ -190,5 +181,3 @@ GCOV_PROFILE := n
quiet_cmd_vdso_and_check = VDSO $@
cmd_vdso_and_check = $(cmd_vdso); $(cmd_vdso_check)
-
-clean-files := vdso32.so vdso32.so.dbg vdso64* vdso-image-*.c vdsox32.so*
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 7645730dc228..6d83ceb7f1ba 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -274,59 +274,6 @@ up_fail:
return ret;
}
-#ifdef CONFIG_X86_64
-/*
- * Put the vdso above the (randomized) stack with another randomized
- * offset. This way there is no hole in the middle of address space.
- * To save memory make sure it is still in the same PTE as the stack
- * top. This doesn't give that many random bits.
- *
- * Note that this algorithm is imperfect: the distribution of the vdso
- * start address within a PMD is biased toward the end.
- *
- * Only used for the 64-bit and x32 vdsos.
- */
-static unsigned long vdso_addr(unsigned long start, unsigned len)
-{
- unsigned long addr, end;
- unsigned offset;
-
- /*
- * Round up the start address. It can start out unaligned as a result
- * of stack start randomization.
- */
- start = PAGE_ALIGN(start);
-
- /* Round the lowest possible end address up to a PMD boundary. */
- end = (start + len + PMD_SIZE - 1) & PMD_MASK;
- if (end >= DEFAULT_MAP_WINDOW)
- end = DEFAULT_MAP_WINDOW;
- end -= len;
-
- if (end > start) {
- offset = get_random_u32_below(((end - start) >> PAGE_SHIFT) + 1);
- addr = start + (offset << PAGE_SHIFT);
- } else {
- addr = start;
- }
-
- /*
- * Forcibly align the final address in case we have a hardware
- * issue that requires alignment for performance reasons.
- */
- addr = align_vdso_addr(addr);
-
- return addr;
-}
-
-static int map_vdso_randomized(const struct vdso_image *image)
-{
- unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start);
-
- return map_vdso(image, addr);
-}
-#endif
-
int map_vdso_once(const struct vdso_image *image, unsigned long addr)
{
struct mm_struct *mm = current->mm;
@@ -369,7 +316,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
if (!vdso64_enabled)
return 0;
- return map_vdso_randomized(&vdso_image_64);
+ return map_vdso(&vdso_image_64, 0);
}
#ifdef CONFIG_COMPAT
@@ -380,7 +327,7 @@ int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
if (x32) {
if (!vdso64_enabled)
return 0;
- return map_vdso_randomized(&vdso_image_x32);
+ return map_vdso(&vdso_image_x32, 0);
}
#endif
#ifdef CONFIG_IA32_EMULATION
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index 5bf03c575812..4ccb8fa483e6 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -71,7 +71,7 @@ union amd_uncore_info {
};
struct amd_uncore {
- union amd_uncore_info * __percpu info;
+ union amd_uncore_info __percpu *info;
struct amd_uncore_pmu *pmus;
unsigned int num_pmus;
bool init_done;
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 3804f21ab049..768d1414897f 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -17,6 +17,7 @@
#include <linux/kvm_host.h>
#include <asm/cpufeature.h>
+#include <asm/debugreg.h>
#include <asm/hardirq.h>
#include <asm/intel-family.h>
#include <asm/intel_pt.h>
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index d49d661ec0a7..2641ba620f12 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -5,6 +5,7 @@
#include <linux/sched/clock.h>
#include <asm/cpu_entry_area.h>
+#include <asm/debugreg.h>
#include <asm/perf_event.h>
#include <asm/tlbflush.h>
#include <asm/insn.h>
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index fbcfec4dc4cc..ca8eed1d496a 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -113,6 +113,20 @@
#endif
+#ifndef __ASSEMBLY__
+#ifndef __pic__
+static __always_inline __pure void *rip_rel_ptr(void *p)
+{
+ asm("leaq %c1(%%rip), %0" : "=r"(p) : "i"(p));
+
+ return p;
+}
+#define RIP_REL_REF(var) (*(typeof(&(var)))rip_rel_ptr(&(var)))
+#else
+#define RIP_REL_REF(var) (var)
+#endif
+#endif
+
/*
* Macros to generate condition code outputs from inline assembly,
* The output operand must be type "bool".
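
rip_rel_ptr()/RIP_REL_REF() above exist for code that runs before the kernel is mapped at its link-time address (early SEV/SME setup): an absolute reference to a global would hit the wrong memory, whereas a RIP-relative LEA is position-independent. A hedged, standalone illustration of the same idiom, assuming GCC on x86-64 built without PIC (e.g. -fno-pie):

#include <stdio.h>

static unsigned long my_mask = 0x1234;

static __attribute__((always_inline)) inline void *rip_rel_ptr(void *p)
{
	/* Force a RIP-relative LEA instead of an absolute address. */
	asm("leaq %c1(%%rip), %0" : "=r"(p) : "i"(p));

	return p;
}

#define RIP_REL_REF(var)	(*(typeof(&(var)))rip_rel_ptr(&(var)))

int main(void)
{
	RIP_REL_REF(my_mask) = 0x5678;	/* becomes a RIP-relative store */
	printf("%#lx\n", RIP_REL_REF(my_mask));

	return 0;
}

The "i" constraint only works because the function is always_inline and called with the address of a global, which is a link-time constant; that is the same trick the kernel header relies on.
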
diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h
index 76c310b19b11..fb7388bbc212 100644
--- a/arch/x86/include/asm/coco.h
+++ b/arch/x86/include/asm/coco.h
@@ -2,6 +2,7 @@
#ifndef _ASM_X86_COCO_H
#define _ASM_X86_COCO_H
+#include <asm/asm.h>
#include <asm/types.h>
enum cc_vendor {
@@ -12,7 +13,13 @@ enum cc_vendor {
#ifdef CONFIG_ARCH_HAS_CC_PLATFORM
extern enum cc_vendor cc_vendor;
-void cc_set_mask(u64 mask);
+extern u64 cc_mask;
+
+static inline void cc_set_mask(u64 mask)
+{
+ RIP_REL_REF(cc_mask) = mask;
+}
+
u64 cc_mkenc(u64 val);
u64 cc_mkdec(u64 val);
#else
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 8e371c6cc5f9..f0337f7bcf16 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -442,6 +442,7 @@
#define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */
#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */
#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */
+#define X86_FEATURE_SEV_SNP (19*32+ 4) /* AMD Secure Encrypted Virtualization - Secure Nested Paging */
#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */
#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */
#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* AMD SEV-ES full debug state swap support */
@@ -505,4 +506,5 @@
/* BUG word 2 */
#define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */
#define X86_BUG_DIV0 X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */
+#define X86_BUG_RFDS X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index dd4b67101bb7..bf5953883ec3 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -18,7 +18,7 @@ struct pcpu_hot {
struct task_struct *current_task;
int preempt_count;
int cpu_number;
-#ifdef CONFIG_CALL_DEPTH_TRACKING
+#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
u64 call_depth;
#endif
unsigned long top_of_stack;
@@ -37,8 +37,15 @@ static_assert(sizeof(struct pcpu_hot) == 64);
DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot);
+/* const-qualified alias to pcpu_hot, aliased by linker. */
+DECLARE_PER_CPU_ALIGNED(const struct pcpu_hot __percpu_seg_override,
+ const_pcpu_hot);
+
static __always_inline struct task_struct *get_current(void)
{
+ if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
+ return this_cpu_read_const(const_pcpu_hot.current_task);
+
return this_cpu_read_stable(pcpu_hot.current_task);
}
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 0cec92c430cc..fdbbbfec745a 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -5,7 +5,9 @@
#include <linux/bug.h>
#include <linux/percpu.h>
#include <uapi/asm/debugreg.h>
+
#include <asm/cpufeature.h>
+#include <asm/msr.h>
DECLARE_PER_CPU(unsigned long, cpu_dr7);
@@ -159,4 +161,26 @@ static inline unsigned long amd_get_dr_addr_mask(unsigned int dr)
}
#endif
+static inline unsigned long get_debugctlmsr(void)
+{
+ unsigned long debugctlmsr = 0;
+
+#ifndef CONFIG_X86_DEBUGCTLMSR
+ if (boot_cpu_data.x86 < 6)
+ return 0;
+#endif
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
+
+ return debugctlmsr;
+}
+
+static inline void update_debugctlmsr(unsigned long debugctlmsr)
+{
+#ifndef CONFIG_X86_DEBUGCTLMSR
+ if (boot_cpu_data.x86 < 6)
+ return;
+#endif
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
+}
+
#endif /* _ASM_X86_DEBUGREG_H */
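
The get_debugctlmsr()/update_debugctlmsr() helpers added above bundle the DEBUGCTL MSR access with the check for pre-family-6 CPUs that lack the MSR. A hedged usage sketch, not code from this patch, of the read-modify-write pattern they enable; DEBUGCTLMSR_BTF is the existing "trap on branches" bit from <asm/msr-index.h>:

#include <asm/debugreg.h>
#include <asm/msr-index.h>

static void enable_branch_trap_flag(void)
{
	unsigned long debugctl = get_debugctlmsr();

	debugctl |= DEBUGCTLMSR_BTF;	/* #DB on branches, not every insn */
	update_debugctlmsr(debugctl);
}
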
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index ec95fe44fa3a..62dc9f59ea76 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -46,6 +46,7 @@ struct gdt_page {
} __attribute__((aligned(PAGE_SIZE)));
DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
+DECLARE_INIT_PER_CPU(gdt_page);
/* Provide the original GDT */
static inline struct desc_struct *get_cpu_gdt_rw(unsigned int cpu)
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index f40b29d3abad..da4054fbf533 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -44,32 +44,32 @@
# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31))
#endif
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
# define DISABLE_PTI 0
#else
# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
#endif
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
# define DISABLE_RETPOLINE 0
#else
# define DISABLE_RETPOLINE ((1 << (X86_FEATURE_RETPOLINE & 31)) | \
(1 << (X86_FEATURE_RETPOLINE_LFENCE & 31)))
#endif
-#ifdef CONFIG_RETHUNK
+#ifdef CONFIG_MITIGATION_RETHUNK
# define DISABLE_RETHUNK 0
#else
# define DISABLE_RETHUNK (1 << (X86_FEATURE_RETHUNK & 31))
#endif
-#ifdef CONFIG_CPU_UNRET_ENTRY
+#ifdef CONFIG_MITIGATION_UNRET_ENTRY
# define DISABLE_UNRET 0
#else
# define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31))
#endif
-#ifdef CONFIG_CALL_DEPTH_TRACKING
+#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
# define DISABLE_CALL_DEPTH_TRACKING 0
#else
# define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31))
@@ -123,6 +123,12 @@
# define DISABLE_FRED (1 << (X86_FEATURE_FRED & 31))
#endif
+#ifdef CONFIG_KVM_AMD_SEV
+#define DISABLE_SEV_SNP 0
+#else
+#define DISABLE_SEV_SNP (1 << (X86_FEATURE_SEV_SNP & 31))
+#endif
+
/*
* Make sure to add features to the correct mask
*/
@@ -147,7 +153,7 @@
DISABLE_ENQCMD)
#define DISABLED_MASK17 0
#define DISABLED_MASK18 (DISABLE_IBT)
-#define DISABLED_MASK19 0
+#define DISABLED_MASK19 (DISABLE_SEV_SNP)
#define DISABLED_MASK20 0
#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21)
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index c4555b269a1b..1dc600fa3ba5 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -143,15 +143,6 @@ extern void efi_free_boot_services(void);
void arch_efi_call_virt_setup(void);
void arch_efi_call_virt_teardown(void);
-/* kexec external ABI */
-struct efi_setup_data {
- u64 fw_vendor;
- u64 __unused;
- u64 tables;
- u64 smbios;
- u64 reserved[8];
-};
-
extern u64 efi_setup;
#ifdef CONFIG_EFI
@@ -418,8 +409,9 @@ extern int __init efi_memmap_split_count(efi_memory_desc_t *md,
extern void __init efi_memmap_insert(struct efi_memory_map *old_memmap,
void *buf, struct efi_mem_range *mem);
-#define arch_ima_efi_boot_mode \
- ({ extern struct boot_params boot_params; boot_params.secure_boot; })
+extern enum efi_secureboot_mode __x86_ima_efi_boot_mode(void);
+
+#define arch_ima_efi_boot_mode __x86_ima_efi_boot_mode()
#ifdef CONFIG_EFI_RUNTIME_MAP
int efi_get_runtime_map_size(void);
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 1e16bd5ac781..1fb83d47711f 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -392,5 +392,4 @@ struct va_alignment {
} ____cacheline_aligned;
extern struct va_alignment va_align;
-extern unsigned long align_vdso_addr(unsigned long);
#endif /* _ASM_X86_ELF_H */
diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h
index ca6e5e5f16b2..c485f1944c5f 100644
--- a/arch/x86/include/asm/fpu/sched.h
+++ b/arch/x86/include/asm/fpu/sched.h
@@ -37,10 +37,12 @@ extern void fpu_flush_thread(void);
* The FPU context is only stored/restored for a user task and
* PF_KTHREAD is used to distinguish between kernel and user threads.
*/
-static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu)
+static inline void switch_fpu_prepare(struct task_struct *old, int cpu)
{
if (cpu_feature_enabled(X86_FEATURE_FPU) &&
- !(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
+ !(old->flags & (PF_KTHREAD | PF_USER_WORKER))) {
+ struct fpu *old_fpu = &old->thread.fpu;
+
save_fpregs_to_fpstate(old_fpu);
/*
* The save operation preserved register state, so the
@@ -60,10 +62,10 @@ static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu)
* Delay loading of the complete FPU state until the return to userland.
* PKRU is handled separately.
*/
-static inline void switch_fpu_finish(void)
+static inline void switch_fpu_finish(struct task_struct *new)
{
if (cpu_feature_enabled(X86_FEATURE_FPU))
- set_thread_flag(TIF_NEED_FPU_LOAD);
+ set_tsk_thread_flag(new, TIF_NEED_FPU_LOAD);
}
#endif /* _ASM_X86_FPU_SCHED_H */
diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h
index 35cff5f2becf..9e7e8ca8e299 100644
--- a/arch/x86/include/asm/fsgsbase.h
+++ b/arch/x86/include/asm/fsgsbase.h
@@ -6,7 +6,7 @@
#ifdef CONFIG_X86_64
-#include <asm/msr-index.h>
+#include <asm/msr.h>
/*
* Read/write a task's FSBASE or GSBASE. This returns the value that
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 3814a9263d64..294cd2a40818 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -379,7 +379,7 @@ static inline void iosubmit_cmds512(void __iomem *dst, const void *src,
const u8 *end = from + count * 64;
while (from < end) {
- movdir64b(dst, from);
+ movdir64b_io(dst, from);
from += 64;
}
}
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index 2fd52b65deac..3be2451e7bc8 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -10,6 +10,7 @@ extern int force_iommu, no_iommu;
extern int iommu_detected;
extern int iommu_merge;
extern int panic_on_overflow;
+extern bool amd_iommu_snp_en;
#ifdef CONFIG_SWIOTLB
extern bool x86_swiotlb_enable;
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index c9f6a6c5de3c..91ca9a9ee3a2 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -25,7 +25,6 @@
#include <asm/page.h>
#include <asm/ptrace.h>
-#include <asm/bootparam.h>
struct kimage;
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 378ed944b849..ab24ce207988 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -138,6 +138,7 @@ KVM_X86_OP(complete_emulated_msr)
KVM_X86_OP(vcpu_deliver_sipi_vector)
KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
KVM_X86_OP_OPTIONAL(get_untagged_addr)
+KVM_X86_OP_OPTIONAL(alloc_apic_backing_page)
#undef KVM_X86_OP
#undef KVM_X86_OP_OPTIONAL
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d271ba20a0b2..18cbde14cf81 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1796,6 +1796,7 @@ struct kvm_x86_ops {
unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu);
gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags);
+ void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu);
};
struct kvm_x86_nested_ops {
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 571fe4d2d232..dc31b13b87a0 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -40,27 +40,27 @@
#ifdef __ASSEMBLY__
-#if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO)
+#if defined(CONFIG_MITIGATION_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO)
#define RET jmp __x86_return_thunk
-#else /* CONFIG_RETPOLINE */
-#ifdef CONFIG_SLS
+#else /* CONFIG_MITIGATION_RETPOLINE */
+#ifdef CONFIG_MITIGATION_SLS
#define RET ret; int3
#else
#define RET ret
#endif
-#endif /* CONFIG_RETPOLINE */
+#endif /* CONFIG_MITIGATION_RETPOLINE */
#else /* __ASSEMBLY__ */
-#if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO)
+#if defined(CONFIG_MITIGATION_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO)
#define ASM_RET "jmp __x86_return_thunk\n\t"
-#else /* CONFIG_RETPOLINE */
-#ifdef CONFIG_SLS
+#else /* CONFIG_MITIGATION_RETPOLINE */
+#ifdef CONFIG_MITIGATION_SLS
#define ASM_RET "ret; int3\n\t"
#else
#define ASM_RET "ret\n\t"
#endif
-#endif /* CONFIG_RETPOLINE */
+#endif /* CONFIG_MITIGATION_RETPOLINE */
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
index 73dba8b94443..59aa966dc212 100644
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -131,8 +131,20 @@ static inline bool local_try_cmpxchg(local_t *l, long *old, long new)
(typeof(l->a.counter) *) old, new);
}
-/* Always has a lock prefix */
-#define local_xchg(l, n) (xchg(&((l)->a.counter), (n)))
+/*
+ * Implement local_xchg using CMPXCHG instruction without the LOCK prefix.
+ * XCHG is expensive due to the implied LOCK prefix. The processor
+ * cannot prefetch cachelines if XCHG is used.
+ */
+static __always_inline long
+local_xchg(local_t *l, long n)
+{
+ long c = local_read(l);
+
+ do { } while (!local_try_cmpxchg(l, &c, n));
+
+ return c;
+}
/**
* local_add_unless - add unless the number is already a given value
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 359ada486fa9..f922b682b9b4 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -15,7 +15,8 @@
#include <linux/init.h>
#include <linux/cc_platform.h>
-#include <asm/bootparam.h>
+#include <asm/asm.h>
+struct boot_params;
#ifdef CONFIG_X86_MEM_ENCRYPT
void __init mem_encrypt_init(void);
@@ -46,8 +47,8 @@ void __init sme_unmap_bootdata(char *real_mode_data);
void __init sme_early_init(void);
-void __init sme_encrypt_kernel(struct boot_params *bp);
-void __init sme_enable(struct boot_params *bp);
+void sme_encrypt_kernel(struct boot_params *bp);
+void sme_enable(struct boot_params *bp);
int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size);
int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size);
@@ -58,6 +59,11 @@ void __init mem_encrypt_free_decrypted_mem(void);
void __init sev_es_init_vc_handling(void);
+static inline u64 sme_get_me_mask(void)
+{
+ return RIP_REL_REF(sme_me_mask);
+}
+
#define __bss_decrypted __section(".bss..decrypted")
#else /* !CONFIG_AMD_MEM_ENCRYPT */
@@ -75,8 +81,8 @@ static inline void __init sme_unmap_bootdata(char *real_mode_data) { }
static inline void __init sme_early_init(void) { }
-static inline void __init sme_encrypt_kernel(struct boot_params *bp) { }
-static inline void __init sme_enable(struct boot_params *bp) { }
+static inline void sme_encrypt_kernel(struct boot_params *bp) { }
+static inline void sme_enable(struct boot_params *bp) { }
static inline void sev_es_init_vc_handling(void) { }
@@ -89,6 +95,8 @@ early_set_mem_enc_dec_hypercall(unsigned long vaddr, unsigned long size, bool en
static inline void mem_encrypt_free_decrypted_mem(void) { }
+static inline u64 sme_get_me_mask(void) { return 0; }
+
#define __bss_decrypted
#endif /* CONFIG_AMD_MEM_ENCRYPT */
@@ -106,11 +114,6 @@ void add_encrypt_protection_map(void);
extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[];
-static inline u64 sme_get_me_mask(void)
-{
- return sme_me_mask;
-}
-
#endif /* __ASSEMBLY__ */
#endif /* __X86_MEM_ENCRYPT_H__ */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 1f9dc9bd13eb..05956bd8bacf 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -176,6 +176,14 @@
* CPU is not vulnerable to Gather
* Data Sampling (GDS).
*/
+#define ARCH_CAP_RFDS_NO BIT(27) /*
+ * Not susceptible to Register
+ * File Data Sampling.
+ */
+#define ARCH_CAP_RFDS_CLEAR BIT(28) /*
+ * VERW clears CPU Register
+ * File.
+ */
#define ARCH_CAP_XAPIC_DISABLE BIT(21) /*
* IA32_XAPIC_DISABLE_STATUS MSR
@@ -605,34 +613,47 @@
#define MSR_AMD64_SEV_ES_GHCB 0xc0010130
#define MSR_AMD64_SEV 0xc0010131
#define MSR_AMD64_SEV_ENABLED_BIT 0
-#define MSR_AMD64_SEV_ES_ENABLED_BIT 1
-#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2
#define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT)
+#define MSR_AMD64_SEV_ES_ENABLED_BIT 1
#define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT)
+#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2
#define MSR_AMD64_SEV_SNP_ENABLED BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT)
-
-/* SNP feature bits enabled by the hypervisor */
-#define MSR_AMD64_SNP_VTOM BIT_ULL(3)
-#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(4)
-#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(5)
-#define MSR_AMD64_SNP_ALT_INJ BIT_ULL(6)
-#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(7)
-#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(8)
-#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(9)
-#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(10)
-#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(11)
-#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(12)
-#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(14)
-#define MSR_AMD64_SNP_VMSA_REG_PROTECTION BIT_ULL(16)
-#define MSR_AMD64_SNP_SMT_PROTECTION BIT_ULL(17)
-
-/* SNP feature bits reserved for future use. */
-#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13)
-#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15)
-#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, 18)
+#define MSR_AMD64_SNP_VTOM_BIT 3
+#define MSR_AMD64_SNP_VTOM BIT_ULL(MSR_AMD64_SNP_VTOM_BIT)
+#define MSR_AMD64_SNP_REFLECT_VC_BIT 4
+#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(MSR_AMD64_SNP_REFLECT_VC_BIT)
+#define MSR_AMD64_SNP_RESTRICTED_INJ_BIT 5
+#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(MSR_AMD64_SNP_RESTRICTED_INJ_BIT)
+#define MSR_AMD64_SNP_ALT_INJ_BIT 6
+#define MSR_AMD64_SNP_ALT_INJ BIT_ULL(MSR_AMD64_SNP_ALT_INJ_BIT)
+#define MSR_AMD64_SNP_DEBUG_SWAP_BIT 7
+#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(MSR_AMD64_SNP_DEBUG_SWAP_BIT)
+#define MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT 8
+#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT)
+#define MSR_AMD64_SNP_BTB_ISOLATION_BIT 9
+#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(MSR_AMD64_SNP_BTB_ISOLATION_BIT)
+#define MSR_AMD64_SNP_VMPL_SSS_BIT 10
+#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(MSR_AMD64_SNP_VMPL_SSS_BIT)
+#define MSR_AMD64_SNP_SECURE_TSC_BIT 11
+#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(MSR_AMD64_SNP_SECURE_TSC_BIT)
+#define MSR_AMD64_SNP_VMGEXIT_PARAM_BIT 12
+#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(MSR_AMD64_SNP_VMGEXIT_PARAM_BIT)
+#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13)
+#define MSR_AMD64_SNP_IBS_VIRT_BIT 14
+#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(MSR_AMD64_SNP_IBS_VIRT_BIT)
+#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15)
+#define MSR_AMD64_SNP_VMSA_REG_PROT_BIT 16
+#define MSR_AMD64_SNP_VMSA_REG_PROT BIT_ULL(MSR_AMD64_SNP_VMSA_REG_PROT_BIT)
+#define MSR_AMD64_SNP_SMT_PROT_BIT 17
+#define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT)
+#define MSR_AMD64_SNP_RESV_BIT 18
+#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT)
#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f
+#define MSR_AMD64_RMP_BASE 0xc0010132
+#define MSR_AMD64_RMP_END 0xc0010133
+
/* AMD Collaborative Processor Performance Control MSRs */
#define MSR_AMD_CPPC_CAP1 0xc00102b0
#define MSR_AMD_CPPC_ENABLE 0xc00102b1
@@ -719,8 +740,15 @@
#define MSR_K8_TOP_MEM1 0xc001001a
#define MSR_K8_TOP_MEM2 0xc001001d
#define MSR_AMD64_SYSCFG 0xc0010010
-#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23
+#define MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT 23
#define MSR_AMD64_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_AMD64_SYSCFG_MEM_ENCRYPT_BIT)
+#define MSR_AMD64_SYSCFG_SNP_EN_BIT 24
+#define MSR_AMD64_SYSCFG_SNP_EN BIT_ULL(MSR_AMD64_SYSCFG_SNP_EN_BIT)
+#define MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT 25
+#define MSR_AMD64_SYSCFG_SNP_VMPL_EN BIT_ULL(MSR_AMD64_SYSCFG_SNP_VMPL_EN_BIT)
+#define MSR_AMD64_SYSCFG_MFDM_BIT 19
+#define MSR_AMD64_SYSCFG_MFDM BIT_ULL(MSR_AMD64_SYSCFG_MFDM_BIT)
+
#define MSR_K8_INT_PENDING_MSG 0xc0010055
/* C1E active bits in int pending message */
#define K8_INTP_C1E_ACTIVE_MASK 0x18000000
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index c284ff9ebe67..d642037f9ed5 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -12,11 +12,13 @@
#include <uapi/asm/msr.h>
#include <asm/shared/msr.h>
+#include <linux/percpu.h>
+
struct msr_info {
- u32 msr_no;
- struct msr reg;
- struct msr *msrs;
- int err;
+ u32 msr_no;
+ struct msr reg;
+ struct msr __percpu *msrs;
+ int err;
};
struct msr_regs_info {
@@ -323,8 +325,8 @@ static inline int wrmsrl_safe(u32 msr, u64 val)
return wrmsr_safe(msr, (u32)val, (u32)(val >> 32));
}
-struct msr *msrs_alloc(void);
-void msrs_free(struct msr *msrs);
+struct msr __percpu *msrs_alloc(void);
+void msrs_free(struct msr __percpu *msrs);
int msr_set_bit(u32 msr, u8 bit);
int msr_clear_bit(u32 msr, u8 bit);
@@ -333,8 +335,8 @@ int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q);
int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q);
-void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs);
-void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs);
+void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs);
+void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs);
int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q);
@@ -363,14 +365,14 @@ static inline int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
return 0;
}
static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no,
- struct msr *msrs)
+ struct msr __percpu *msrs)
{
- rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h));
+ rdmsr_on_cpu(0, msr_no, raw_cpu_ptr(&msrs->l), raw_cpu_ptr(&msrs->h));
}
static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no,
- struct msr *msrs)
+ struct msr __percpu *msrs)
{
- wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h);
+ wrmsr_on_cpu(0, msr_no, raw_cpu_read(msrs->l), raw_cpu_read(msrs->h));
}
static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no,
u32 *l, u32 *h)
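
With msrs_alloc() now returning a true __percpu pointer above, callers index the result with per_cpu_ptr() rather than plain array arithmetic, and sparse can flag address-space mix-ups. A hedged sketch of the resulting calling convention (error handling trimmed):

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/printk.h>
#include <asm/msr.h>

static int read_msr_on_all_cpus(u32 msr_no)
{
	struct msr __percpu *msrs = msrs_alloc();
	int cpu;

	if (!msrs)
		return -ENOMEM;

	rdmsr_on_cpus(cpu_online_mask, msr_no, msrs);

	for_each_online_cpu(cpu) {
		struct msr *m = per_cpu_ptr(msrs, cpu);

		pr_info("CPU%d: %#llx\n", cpu, m->q);
	}

	msrs_free(msrs);
	return 0;
}
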
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 5c5f1e56c404..41a0ebb699ec 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -14,9 +14,6 @@ extern void release_perfctr_nmi(unsigned int);
extern int reserve_evntsel_nmi(unsigned int);
extern void release_evntsel_nmi(unsigned int);
-struct ctl_table;
-extern int proc_nmi_enabled(struct ctl_table *, int ,
- void __user *, size_t *, loff_t *);
extern int unknown_nmi_panic;
#endif /* CONFIG_X86_LOCAL_APIC */
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index ab19c7f1167b..fc3a8a3c7ffe 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -59,13 +59,13 @@
#ifdef CONFIG_CALL_THUNKS_DEBUG
# define CALL_THUNKS_DEBUG_INC_CALLS \
- incq %gs:__x86_call_count;
+ incq PER_CPU_VAR(__x86_call_count);
# define CALL_THUNKS_DEBUG_INC_RETS \
- incq %gs:__x86_ret_count;
+ incq PER_CPU_VAR(__x86_ret_count);
# define CALL_THUNKS_DEBUG_INC_STUFFS \
- incq %gs:__x86_stuffs_count;
+ incq PER_CPU_VAR(__x86_stuffs_count);
# define CALL_THUNKS_DEBUG_INC_CTXSW \
- incq %gs:__x86_ctxsw_count;
+ incq PER_CPU_VAR(__x86_ctxsw_count);
#else
# define CALL_THUNKS_DEBUG_INC_CALLS
# define CALL_THUNKS_DEBUG_INC_RETS
@@ -73,16 +73,13 @@
# define CALL_THUNKS_DEBUG_INC_CTXSW
#endif
-#if defined(CONFIG_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS)
+#if defined(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS)
#include <asm/asm-offsets.h>
#define CREDIT_CALL_DEPTH \
movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth);
-#define ASM_CREDIT_CALL_DEPTH \
- movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth);
-
#define RESET_CALL_DEPTH \
xor %eax, %eax; \
bts $63, %rax; \
@@ -95,20 +92,14 @@
CALL_THUNKS_DEBUG_INC_CALLS
#define INCREMENT_CALL_DEPTH \
- sarq $5, %gs:pcpu_hot + X86_call_depth; \
- CALL_THUNKS_DEBUG_INC_CALLS
-
-#define ASM_INCREMENT_CALL_DEPTH \
sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); \
CALL_THUNKS_DEBUG_INC_CALLS
#else
#define CREDIT_CALL_DEPTH
-#define ASM_CREDIT_CALL_DEPTH
#define RESET_CALL_DEPTH
-#define INCREMENT_CALL_DEPTH
-#define ASM_INCREMENT_CALL_DEPTH
#define RESET_CALL_DEPTH_FROM_CALL
+#define INCREMENT_CALL_DEPTH
#endif
/*
@@ -158,7 +149,7 @@
jnz 771b; \
/* barrier for jnz misprediction */ \
lfence; \
- ASM_CREDIT_CALL_DEPTH \
+ CREDIT_CALL_DEPTH \
CALL_THUNKS_DEBUG_INC_CTXSW
#else
/*
@@ -212,7 +203,7 @@
*/
.macro VALIDATE_UNRET_END
#if defined(CONFIG_NOINSTR_VALIDATION) && \
- (defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO))
+ (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO))
ANNOTATE_RETPOLINE_SAFE
nop
#endif
@@ -241,7 +232,7 @@
* instruction irrespective of kCFI.
*/
.macro JMP_NOSPEC reg:req
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
__CS_PREFIX \reg
jmp __x86_indirect_thunk_\reg
#else
@@ -251,7 +242,7 @@
.endm
.macro CALL_NOSPEC reg:req
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
__CS_PREFIX \reg
call __x86_indirect_thunk_\reg
#else
@@ -271,7 +262,7 @@
.Lskip_rsb_\@:
.endm
-#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)
+#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)
#define CALL_UNTRAIN_RET "call entry_untrain_ret"
#else
#define CALL_UNTRAIN_RET ""
@@ -289,7 +280,7 @@
* where we have a stack but before any RET instruction.
*/
.macro __UNTRAIN_RET ibpb_feature, call_depth_insns
-#if defined(CONFIG_RETHUNK) || defined(CONFIG_CPU_IBPB_ENTRY)
+#if defined(CONFIG_MITIGATION_RETHUNK) || defined(CONFIG_MITIGATION_IBPB_ENTRY)
VALIDATE_UNRET_END
ALTERNATIVE_3 "", \
CALL_UNTRAIN_RET, X86_FEATURE_UNRET, \
@@ -309,9 +300,9 @@
.macro CALL_DEPTH_ACCOUNT
-#ifdef CONFIG_CALL_DEPTH_TRACKING
+#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
ALTERNATIVE "", \
- __stringify(ASM_INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
+ __stringify(INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
#endif
.endm
@@ -323,7 +314,7 @@
* Note: Only the memory operand variant of VERW clears the CPU buffers.
*/
.macro CLEAR_CPU_BUFFERS
- ALTERNATIVE "", __stringify(verw mds_verw_sel), X86_FEATURE_CLEAR_CPU_BUF
+ ALTERNATIVE "", __stringify(verw _ASM_RIP(mds_verw_sel)), X86_FEATURE_CLEAR_CPU_BUF
.endm
#else /* __ASSEMBLY__ */
@@ -339,19 +330,19 @@ extern retpoline_thunk_t __x86_indirect_thunk_array[];
extern retpoline_thunk_t __x86_indirect_call_thunk_array[];
extern retpoline_thunk_t __x86_indirect_jump_thunk_array[];
-#ifdef CONFIG_RETHUNK
+#ifdef CONFIG_MITIGATION_RETHUNK
extern void __x86_return_thunk(void);
#else
static inline void __x86_return_thunk(void) {}
#endif
-#ifdef CONFIG_CPU_UNRET_ENTRY
+#ifdef CONFIG_MITIGATION_UNRET_ENTRY
extern void retbleed_return_thunk(void);
#else
static inline void retbleed_return_thunk(void) {}
#endif
-#ifdef CONFIG_CPU_SRSO
+#ifdef CONFIG_MITIGATION_SRSO
extern void srso_return_thunk(void);
extern void srso_alias_return_thunk(void);
#else
@@ -368,7 +359,9 @@ extern void entry_ibpb(void);
extern void (*x86_return_thunk)(void);
-#ifdef CONFIG_CALL_DEPTH_TRACKING
+extern void __warn_thunk(void);
+
+#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
extern void call_depth_return_thunk(void);
#define CALL_DEPTH_ACCOUNT \
@@ -382,14 +375,14 @@ DECLARE_PER_CPU(u64, __x86_ret_count);
DECLARE_PER_CPU(u64, __x86_stuffs_count);
DECLARE_PER_CPU(u64, __x86_ctxsw_count);
#endif
-#else /* !CONFIG_CALL_DEPTH_TRACKING */
+#else /* !CONFIG_MITIGATION_CALL_DEPTH_TRACKING */
static inline void call_depth_return_thunk(void) {}
#define CALL_DEPTH_ACCOUNT ""
-#endif /* CONFIG_CALL_DEPTH_TRACKING */
+#endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
#define GEN(reg) \
extern retpoline_thunk_t __x86_indirect_thunk_ ## reg;
@@ -410,7 +403,7 @@ static inline void call_depth_return_thunk(void) {}
/*
* Inline asm uses the %V modifier which is only in newer GCC
- * which is ensured when CONFIG_RETPOLINE is defined.
+ * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined.
*/
# define CALL_NOSPEC \
ALTERNATIVE_2( \
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index d18e5c332cb9..1b93ff80b43b 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -66,10 +66,14 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
* virt_addr_valid(kaddr) returns true.
*/
#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
-#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
extern bool __virt_addr_valid(unsigned long kaddr);
#define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr))
+static __always_inline void *pfn_to_kaddr(unsigned long pfn)
+{
+ return __va(pfn << PAGE_SHIFT);
+}
+
static __always_inline u64 __canonical_address(u64 vaddr, u8 vaddr_bits)
{
return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits);
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index b40c462b4af3..b3ab80a03365 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -10,7 +10,6 @@
#include <linux/numa.h>
#include <asm/io.h>
#include <asm/memtype.h>
-#include <asm/x86_init.h>
struct pci_sysdata {
int domain; /* PCI domain */
@@ -124,16 +123,4 @@ cpumask_of_pcibus(const struct pci_bus *bus)
}
#endif
-struct pci_setup_rom {
- struct setup_data data;
- uint16_t vendor;
- uint16_t devid;
- uint64_t pcilen;
- unsigned long segment;
- unsigned long bus;
- unsigned long device;
- unsigned long function;
- uint8_t romdata[];
-};
-
#endif /* _ASM_X86_PCI_H */
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 5e01883eb51e..44958ebaf626 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -4,17 +4,21 @@
#ifdef CONFIG_X86_64
#define __percpu_seg gs
+#define __percpu_rel (%rip)
#else
#define __percpu_seg fs
+#define __percpu_rel
#endif
#ifdef __ASSEMBLY__
#ifdef CONFIG_SMP
-#define PER_CPU_VAR(var) %__percpu_seg:var
-#else /* ! SMP */
-#define PER_CPU_VAR(var) var
-#endif /* SMP */
+#define __percpu %__percpu_seg:
+#else
+#define __percpu
+#endif
+
+#define PER_CPU_VAR(var) __percpu(var)__percpu_rel
#ifdef CONFIG_X86_64_SMP
#define INIT_PER_CPU_VAR(var) init_per_cpu__##var
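
With this definition, PER_CPU_VAR() emits a segment-prefixed, RIP-relative operand. A worked expansion, assuming x86-64:

/* PER_CPU_VAR(__x86_call_count) expands to:
 *
 *   SMP:  %gs:(__x86_call_count)(%rip)
 *   UP:        (__x86_call_count)(%rip)
 *
 * The (%rip) suffix keeps the reference position-independent.
 */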
@@ -24,30 +28,84 @@
#else /* ...!ASSEMBLY */
+#include <linux/build_bug.h>
#include <linux/stringify.h>
#include <asm/asm.h>
#ifdef CONFIG_SMP
+
+#ifdef CONFIG_CC_HAS_NAMED_AS
+
+#ifdef __CHECKER__
+#define __seg_gs __attribute__((address_space(__seg_gs)))
+#define __seg_fs __attribute__((address_space(__seg_fs)))
+#endif
+
+#ifdef CONFIG_X86_64
+#define __percpu_seg_override __seg_gs
+#else
+#define __percpu_seg_override __seg_fs
+#endif
+
+#define __percpu_prefix ""
+
+#else /* CONFIG_CC_HAS_NAMED_AS */
+
+#define __percpu_seg_override
#define __percpu_prefix "%%"__stringify(__percpu_seg)":"
+
+#endif /* CONFIG_CC_HAS_NAMED_AS */
+
+#define __force_percpu_prefix "%%"__stringify(__percpu_seg)":"
#define __my_cpu_offset this_cpu_read(this_cpu_off)
+#ifdef CONFIG_USE_X86_SEG_SUPPORT
+/*
+ * Efficient implementation for cases in which the compiler supports
+ * named address spaces. This allows the compiler to perform additional
+ * optimizations that can save more instructions.
+ */
+#define arch_raw_cpu_ptr(ptr) \
+({ \
+ unsigned long tcp_ptr__; \
+ tcp_ptr__ = __raw_cpu_read(, this_cpu_off); \
+ \
+ tcp_ptr__ += (unsigned long)(ptr); \
+ (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \
+})
+#else /* CONFIG_USE_X86_SEG_SUPPORT */
/*
* Compared to the generic __my_cpu_offset version, the following
* saves one instruction and avoids clobbering a temp register.
*/
-#define arch_raw_cpu_ptr(ptr) \
-({ \
- unsigned long tcp_ptr__; \
- asm ("add " __percpu_arg(1) ", %0" \
- : "=r" (tcp_ptr__) \
- : "m" (this_cpu_off), "0" (ptr)); \
- (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \
+#define arch_raw_cpu_ptr(ptr) \
+({ \
+ unsigned long tcp_ptr__; \
+ asm ("mov " __percpu_arg(1) ", %0" \
+ : "=r" (tcp_ptr__) \
+ : "m" (__my_cpu_var(this_cpu_off))); \
+ \
+ tcp_ptr__ += (unsigned long)(ptr); \
+ (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \
})
-#else
+#endif /* CONFIG_USE_X86_SEG_SUPPORT */
+
+#define PER_CPU_VAR(var) %__percpu_seg:(var)__percpu_rel
+
+#else /* CONFIG_SMP */
+#define __percpu_seg_override
#define __percpu_prefix ""
-#endif
+#define __force_percpu_prefix ""
+
+#define PER_CPU_VAR(var) (var)__percpu_rel
+#endif /* CONFIG_SMP */
+
+#define __my_cpu_type(var) typeof(var) __percpu_seg_override
+#define __my_cpu_ptr(ptr) (__my_cpu_type(*ptr) *)(uintptr_t)(ptr)
+#define __my_cpu_var(var) (*__my_cpu_ptr(&var))
#define __percpu_arg(x) __percpu_prefix "%" #x
+#define __force_percpu_arg(x) __force_percpu_prefix "%" #x
/*
* Initialized pointers to per-cpu variables needed for the boot
@@ -107,14 +165,14 @@ do { \
(void)pto_tmp__; \
} \
asm qual(__pcpu_op2_##size(op, "%[val]", __percpu_arg([var])) \
- : [var] "+m" (_var) \
+ : [var] "+m" (__my_cpu_var(_var)) \
: [val] __pcpu_reg_imm_##size(pto_val__)); \
} while (0)
#define percpu_unary_op(size, qual, op, _var) \
({ \
asm qual (__pcpu_op1_##size(op, __percpu_arg([var])) \
- : [var] "+m" (_var)); \
+ : [var] "+m" (__my_cpu_var(_var))); \
})
/*
@@ -144,16 +202,16 @@ do { \
__pcpu_type_##size pfo_val__; \
asm qual (__pcpu_op2_##size(op, __percpu_arg([var]), "%[val]") \
: [val] __pcpu_reg_##size("=", pfo_val__) \
- : [var] "m" (_var)); \
+ : [var] "m" (__my_cpu_var(_var))); \
(typeof(_var))(unsigned long) pfo_val__; \
})
#define percpu_stable_op(size, op, _var) \
({ \
__pcpu_type_##size pfo_val__; \
- asm(__pcpu_op2_##size(op, __percpu_arg(P[var]), "%[val]") \
+ asm(__pcpu_op2_##size(op, __force_percpu_arg(a[var]), "%[val]") \
: [val] __pcpu_reg_##size("=", pfo_val__) \
- : [var] "p" (&(_var))); \
+ : [var] "i" (&(_var))); \
(typeof(_var))(unsigned long) pfo_val__; \
})
@@ -166,7 +224,7 @@ do { \
asm qual (__pcpu_op2_##size("xadd", "%[tmp]", \
__percpu_arg([var])) \
: [tmp] __pcpu_reg_##size("+", paro_tmp__), \
- [var] "+m" (_var) \
+ [var] "+m" (__my_cpu_var(_var)) \
: : "memory"); \
(typeof(_var))(unsigned long) (paro_tmp__ + _val); \
})
@@ -187,7 +245,7 @@ do { \
__percpu_arg([var])) \
"\n\tjnz 1b" \
: [oval] "=&a" (pxo_old__), \
- [var] "+m" (_var) \
+ [var] "+m" (__my_cpu_var(_var)) \
: [nval] __pcpu_reg_##size(, pxo_new__) \
: "memory"); \
(typeof(_var))(unsigned long) pxo_old__; \
@@ -204,7 +262,7 @@ do { \
asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \
__percpu_arg([var])) \
: [oval] "+a" (pco_old__), \
- [var] "+m" (_var) \
+ [var] "+m" (__my_cpu_var(_var)) \
: [nval] __pcpu_reg_##size(, pco_new__) \
: "memory"); \
(typeof(_var))(unsigned long) pco_old__; \
@@ -221,7 +279,7 @@ do { \
CC_SET(z) \
: CC_OUT(z) (success), \
[oval] "+a" (pco_old__), \
- [var] "+m" (_var) \
+ [var] "+m" (__my_cpu_var(_var)) \
: [nval] __pcpu_reg_##size(, pco_new__) \
: "memory"); \
if (unlikely(!success)) \
@@ -244,7 +302,7 @@ do { \
\
asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \
"cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
- : [var] "+m" (_var), \
+ : [var] "+m" (__my_cpu_var(_var)), \
"+a" (old__.low), \
"+d" (old__.high) \
: "b" (new__.low), \
@@ -276,7 +334,7 @@ do { \
"cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
CC_SET(z) \
: CC_OUT(z) (success), \
- [var] "+m" (_var), \
+ [var] "+m" (__my_cpu_var(_var)), \
"+a" (old__.low), \
"+d" (old__.high) \
: "b" (new__.low), \
@@ -313,7 +371,7 @@ do { \
\
asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \
"cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
- : [var] "+m" (_var), \
+ : [var] "+m" (__my_cpu_var(_var)), \
"+a" (old__.low), \
"+d" (old__.high) \
: "b" (new__.low), \
@@ -345,7 +403,7 @@ do { \
"cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
CC_SET(z) \
: CC_OUT(z) (success), \
- [var] "+m" (_var), \
+ [var] "+m" (__my_cpu_var(_var)), \
"+a" (old__.low), \
"+d" (old__.high) \
: "b" (new__.low), \
@@ -366,9 +424,9 @@ do { \
* accessed while this_cpu_read_stable() allows the value to be cached.
* this_cpu_read_stable() is more efficient and can be used if its value
* is guaranteed to be valid across cpus. The current users include
- * get_current() and get_thread_info() both of which are actually
- * per-thread variables implemented as per-cpu variables and thus
- * stable for the duration of the respective task.
+ * pcpu_hot.current_task and pcpu_hot.top_of_stack, both of which are
+ * actually per-thread variables implemented as per-CPU variables and
+ * thus stable for the duration of the respective task.
*/
#define this_cpu_read_stable_1(pcp) percpu_stable_op(1, "mov", pcp)
#define this_cpu_read_stable_2(pcp) percpu_stable_op(2, "mov", pcp)
@@ -376,13 +434,72 @@ do { \
#define this_cpu_read_stable_8(pcp) percpu_stable_op(8, "mov", pcp)
#define this_cpu_read_stable(pcp) __pcpu_size_call_return(this_cpu_read_stable_, pcp)
+#ifdef CONFIG_USE_X86_SEG_SUPPORT
+
+#define __raw_cpu_read(qual, pcp) \
+({ \
+ *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)); \
+})
+
+#define __raw_cpu_write(qual, pcp, val) \
+do { \
+ *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)) = (val); \
+} while (0)
+
+#define raw_cpu_read_1(pcp) __raw_cpu_read(, pcp)
+#define raw_cpu_read_2(pcp) __raw_cpu_read(, pcp)
+#define raw_cpu_read_4(pcp) __raw_cpu_read(, pcp)
+#define raw_cpu_write_1(pcp, val) __raw_cpu_write(, pcp, val)
+#define raw_cpu_write_2(pcp, val) __raw_cpu_write(, pcp, val)
+#define raw_cpu_write_4(pcp, val) __raw_cpu_write(, pcp, val)
+
+#define this_cpu_read_1(pcp) __raw_cpu_read(volatile, pcp)
+#define this_cpu_read_2(pcp) __raw_cpu_read(volatile, pcp)
+#define this_cpu_read_4(pcp) __raw_cpu_read(volatile, pcp)
+#define this_cpu_write_1(pcp, val) __raw_cpu_write(volatile, pcp, val)
+#define this_cpu_write_2(pcp, val) __raw_cpu_write(volatile, pcp, val)
+#define this_cpu_write_4(pcp, val) __raw_cpu_write(volatile, pcp, val)
+
+#ifdef CONFIG_X86_64
+#define raw_cpu_read_8(pcp) __raw_cpu_read(, pcp)
+#define raw_cpu_write_8(pcp, val) __raw_cpu_write(, pcp, val)
+
+#define this_cpu_read_8(pcp) __raw_cpu_read(volatile, pcp)
+#define this_cpu_write_8(pcp, val) __raw_cpu_write(volatile, pcp, val)
+#endif
+
+#define this_cpu_read_const(pcp) __raw_cpu_read(, pcp)
+#else /* CONFIG_USE_X86_SEG_SUPPORT */
+
#define raw_cpu_read_1(pcp) percpu_from_op(1, , "mov", pcp)
#define raw_cpu_read_2(pcp) percpu_from_op(2, , "mov", pcp)
#define raw_cpu_read_4(pcp) percpu_from_op(4, , "mov", pcp)
-
#define raw_cpu_write_1(pcp, val) percpu_to_op(1, , "mov", (pcp), val)
#define raw_cpu_write_2(pcp, val) percpu_to_op(2, , "mov", (pcp), val)
#define raw_cpu_write_4(pcp, val) percpu_to_op(4, , "mov", (pcp), val)
+
+#define this_cpu_read_1(pcp) percpu_from_op(1, volatile, "mov", pcp)
+#define this_cpu_read_2(pcp) percpu_from_op(2, volatile, "mov", pcp)
+#define this_cpu_read_4(pcp) percpu_from_op(4, volatile, "mov", pcp)
+#define this_cpu_write_1(pcp, val) percpu_to_op(1, volatile, "mov", (pcp), val)
+#define this_cpu_write_2(pcp, val) percpu_to_op(2, volatile, "mov", (pcp), val)
+#define this_cpu_write_4(pcp, val) percpu_to_op(4, volatile, "mov", (pcp), val)
+
+#ifdef CONFIG_X86_64
+#define raw_cpu_read_8(pcp) percpu_from_op(8, , "mov", pcp)
+#define raw_cpu_write_8(pcp, val) percpu_to_op(8, , "mov", (pcp), val)
+
+#define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp)
+#define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val)
+#endif
+
+/*
+ * The generic per-cpu infrastructure is not suitable for
+ * reading const-qualified variables.
+ */
+#define this_cpu_read_const(pcp) ({ BUILD_BUG(); (typeof(pcp))0; })
+#endif /* CONFIG_USE_X86_SEG_SUPPORT */
+
#define raw_cpu_add_1(pcp, val) percpu_add_op(1, , (pcp), val)
#define raw_cpu_add_2(pcp, val) percpu_add_op(2, , (pcp), val)
#define raw_cpu_add_4(pcp, val) percpu_add_op(4, , (pcp), val)
@@ -408,12 +525,6 @@ do { \
#define raw_cpu_xchg_2(pcp, val) raw_percpu_xchg_op(pcp, val)
#define raw_cpu_xchg_4(pcp, val) raw_percpu_xchg_op(pcp, val)
-#define this_cpu_read_1(pcp) percpu_from_op(1, volatile, "mov", pcp)
-#define this_cpu_read_2(pcp) percpu_from_op(2, volatile, "mov", pcp)
-#define this_cpu_read_4(pcp) percpu_from_op(4, volatile, "mov", pcp)
-#define this_cpu_write_1(pcp, val) percpu_to_op(1, volatile, "mov", (pcp), val)
-#define this_cpu_write_2(pcp, val) percpu_to_op(2, volatile, "mov", (pcp), val)
-#define this_cpu_write_4(pcp, val) percpu_to_op(4, volatile, "mov", (pcp), val)
#define this_cpu_add_1(pcp, val) percpu_add_op(1, volatile, (pcp), val)
#define this_cpu_add_2(pcp, val) percpu_add_op(2, volatile, (pcp), val)
#define this_cpu_add_4(pcp, val) percpu_add_op(4, volatile, (pcp), val)
@@ -452,8 +563,6 @@ do { \
* 32 bit must fall back to generic operations.
*/
#ifdef CONFIG_X86_64
-#define raw_cpu_read_8(pcp) percpu_from_op(8, , "mov", pcp)
-#define raw_cpu_write_8(pcp, val) percpu_to_op(8, , "mov", (pcp), val)
#define raw_cpu_add_8(pcp, val) percpu_add_op(8, , (pcp), val)
#define raw_cpu_and_8(pcp, val) percpu_to_op(8, , "and", (pcp), val)
#define raw_cpu_or_8(pcp, val) percpu_to_op(8, , "or", (pcp), val)
@@ -462,8 +571,6 @@ do { \
#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval)
#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval)
-#define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp)
-#define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val)
#define this_cpu_add_8(pcp, val) percpu_add_op(8, volatile, (pcp), val)
#define this_cpu_and_8(pcp, val) percpu_to_op(8, volatile, "and", (pcp), val)
#define this_cpu_or_8(pcp, val) percpu_to_op(8, volatile, "or", (pcp), val)
@@ -494,7 +601,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr,
asm volatile("btl "__percpu_arg(2)",%1"
CC_SET(c)
: CC_OUT(c) (oldbit)
- : "m" (*(unsigned long __percpu *)addr), "Ir" (nr));
+ : "m" (*__my_cpu_ptr((unsigned long __percpu *)(addr))), "Ir" (nr));
return oldbit;
}
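
Callers are unchanged; a minimal sketch of a per-CPU read that, under CONFIG_USE_X86_SEG_SUPPORT, the compiler can emit as a single %gs-relative move and optimize like an ordinary memory access (the variable is hypothetical):

DEFINE_PER_CPU(unsigned int, demo_counter);

static unsigned int read_demo(void)
{
	return this_cpu_read(demo_counter);
}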
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index c7ec5bb88334..dcd836b59beb 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -34,7 +34,7 @@ static inline void paravirt_release_p4d(unsigned long pfn) {}
*/
extern gfp_t __userpte_alloc_gfp;
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/*
* Instead of one PGD, we acquire two PGDs. Being order-1, it is
* both 8k in size and 8k-aligned. That lets us just flip bit 12
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 9e7c0b719c3c..dabafba957ea 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -52,7 +52,7 @@ static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
static inline void native_set_pud(pud_t *pudp, pud_t pud)
{
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
pud.p4d.pgd = pti_set_user_pgtbl(&pudp->p4d.pgd, pud.p4d.pgd);
#endif
pxx_xchg64(pud, pudp, native_pud_val(pud));
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 9d077bca6a10..df0f7d4a96f3 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -909,7 +909,7 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
pmd_t *populate_extra_pmd(unsigned long vaddr);
pte_t *populate_extra_pte(unsigned long vaddr);
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
pgd_t __pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd);
/*
@@ -923,12 +923,12 @@ static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
return pgd;
return __pti_set_user_pgtbl(pgdp, pgd);
}
-#else /* CONFIG_PAGE_TABLE_ISOLATION */
+#else /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */
static inline pgd_t pti_set_user_pgtbl(pgd_t *pgdp, pgd_t pgd)
{
return pgd;
}
-#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */
#endif /* __ASSEMBLY__ */
@@ -1131,7 +1131,7 @@ static inline int p4d_bad(p4d_t p4d)
{
unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;
- if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+ if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION))
ignore_flags |= _PAGE_NX;
return (p4d_flags(p4d) & ~ignore_flags) != 0;
@@ -1177,7 +1177,7 @@ static inline int pgd_bad(pgd_t pgd)
if (!pgtable_l5_enabled())
return 0;
- if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+ if (IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION))
ignore_flags |= _PAGE_NX;
return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
@@ -1422,9 +1422,9 @@ static inline bool pgdp_maps_userspace(void *__ptr)
#define pgd_leaf pgd_large
static inline int pgd_large(pgd_t pgd) { return 0; }
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/*
- * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
+ * All top-level MITIGATION_PAGE_TABLE_ISOLATION page tables are order-1 pages
* (8k-aligned and 8k in size). The kernel one is at the beginning 4k and
* the user one is in the last 4k. To switch between them, you
* just need to flip the 12th bit in their addresses.
@@ -1469,7 +1469,7 @@ static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
{
return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
}
-#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */
/*
* clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
@@ -1484,7 +1484,7 @@ static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
memcpy(dst, src, count * sizeof(pgd_t));
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
if (!static_cpu_has(X86_FEATURE_PTI))
return;
/* Clone the user space pgd as well */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 24af25b1551a..7e9db77231ac 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -143,7 +143,8 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
{
pgd_t pgd;
- if (pgtable_l5_enabled() || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) {
+ if (pgtable_l5_enabled() ||
+ !IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION)) {
WRITE_ONCE(*p4dp, p4d);
return;
}
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 38b54b992f32..9053dfe9fa03 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -21,9 +21,9 @@ typedef unsigned long pgprotval_t;
typedef struct { pteval_t pte; } pte_t;
typedef struct { pmdval_t pmd; } pmd_t;
-#ifdef CONFIG_X86_5LEVEL
extern unsigned int __pgtable_l5_enabled;
+#ifdef CONFIG_X86_5LEVEL
#ifdef USE_EARLY_PGTABLE_L5
/*
* cpu_feature_enabled() is not available in early boot code.
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index af77235fded6..919909d8cb77 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -91,7 +91,7 @@ static __always_inline void __preempt_count_sub(int val)
*/
static __always_inline bool __preempt_count_dec_and_test(void)
{
- return GEN_UNARY_RMWcc("decl", pcpu_hot.preempt_count, e,
+ return GEN_UNARY_RMWcc("decl", __my_cpu_var(pcpu_hot.preempt_count), e,
__percpu_arg([var]));
}
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index d8cccadc83a6..e5f204b9b33d 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -51,7 +51,7 @@
#define CR3_NOFLUSH 0
#endif
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
# define X86_CR3_PTI_PCID_USER_BIT 11
#endif
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 89cf39dbd306..811548f131f4 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -20,7 +20,6 @@ struct vm86;
#include <asm/page.h>
#include <asm/pgtable_types.h>
#include <asm/percpu.h>
-#include <asm/msr.h>
#include <asm/desc_defs.h>
#include <asm/nops.h>
#include <asm/special_insns.h>
@@ -185,13 +184,8 @@ extern struct cpuinfo_x86 new_cpu_data;
extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS];
-#ifdef CONFIG_SMP
DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
#define cpu_data(cpu) per_cpu(cpu_info, cpu)
-#else
-#define cpu_info boot_cpu_data
-#define cpu_data(cpu) boot_cpu_data
-#endif
extern const struct seq_operations cpuinfo_op;
@@ -532,6 +526,9 @@ static __always_inline unsigned long current_top_of_stack(void)
* and around vm86 mode and sp0 on x86_64 is special because of the
* entry trampoline.
*/
+ if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
+ return this_cpu_read_const(const_pcpu_hot.top_of_stack);
+
return this_cpu_read_stable(pcpu_hot.top_of_stack);
}
@@ -554,7 +551,7 @@ static inline void load_sp0(unsigned long sp0)
unsigned long __get_wchan(struct task_struct *p);
-extern void select_idle_routine(const struct cpuinfo_x86 *c);
+extern void select_idle_routine(void);
extern void amd_e400_c1e_apic_setup(void);
extern unsigned long boot_option_idle_override;
@@ -575,28 +572,6 @@ extern void cpu_init(void);
extern void cpu_init_exception_handling(void);
extern void cr4_init(void);
-static inline unsigned long get_debugctlmsr(void)
-{
- unsigned long debugctlmsr = 0;
-
-#ifndef CONFIG_X86_DEBUGCTLMSR
- if (boot_cpu_data.x86 < 6)
- return 0;
-#endif
- rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
-
- return debugctlmsr;
-}
-
-static inline void update_debugctlmsr(unsigned long debugctlmsr)
-{
-#ifndef CONFIG_X86_DEBUGCTLMSR
- if (boot_cpu_data.x86 < 6)
- return;
-#endif
- wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
-}
-
extern void set_task_blockstep(struct task_struct *task, bool on);
/* Boot loader type from the setup header: */
diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h
index 07375b476c4f..ab167c96b9ab 100644
--- a/arch/x86/include/asm/pti.h
+++ b/arch/x86/include/asm/pti.h
@@ -3,7 +3,7 @@
#define _ASM_X86_PTI_H
#ifndef __ASSEMBLY__
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
extern void pti_init(void);
extern void pti_check_boottime_disable(void);
extern void pti_finalize(void);
diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
index 255a78d9d906..12dbd2588ca7 100644
--- a/arch/x86/include/asm/resctrl.h
+++ b/arch/x86/include/asm/resctrl.h
@@ -7,6 +7,13 @@
#include <linux/sched.h>
#include <linux/jump_label.h>
+/*
+ * This value can never be a valid CLOSID, and is used when mapping a
+ * (closid, rmid) pair to an index and back. On x86 only the RMID is
+ * needed. The index is a software-defined value.
+ */
+#define X86_RESCTRL_EMPTY_CLOSID ((u32)~0)
+
/**
* struct resctrl_pqr_state - State cache for the PQR MSR
* @cur_rmid: The cached Resource Monitoring ID
@@ -31,10 +38,47 @@ struct resctrl_pqr_state {
DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state);
+extern bool rdt_alloc_capable;
+extern bool rdt_mon_capable;
+
DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
+static inline bool resctrl_arch_alloc_capable(void)
+{
+ return rdt_alloc_capable;
+}
+
+static inline void resctrl_arch_enable_alloc(void)
+{
+ static_branch_enable_cpuslocked(&rdt_alloc_enable_key);
+ static_branch_inc_cpuslocked(&rdt_enable_key);
+}
+
+static inline void resctrl_arch_disable_alloc(void)
+{
+ static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
+ static_branch_dec_cpuslocked(&rdt_enable_key);
+}
+
+static inline bool resctrl_arch_mon_capable(void)
+{
+ return rdt_mon_capable;
+}
+
+static inline void resctrl_arch_enable_mon(void)
+{
+ static_branch_enable_cpuslocked(&rdt_mon_enable_key);
+ static_branch_inc_cpuslocked(&rdt_enable_key);
+}
+
+static inline void resctrl_arch_disable_mon(void)
+{
+ static_branch_disable_cpuslocked(&rdt_mon_enable_key);
+ static_branch_dec_cpuslocked(&rdt_enable_key);
+}
+
/*
* __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR
*
@@ -88,12 +132,58 @@ static inline unsigned int resctrl_arch_round_mon_val(unsigned int val)
return val * scale;
}
+static inline void resctrl_arch_set_closid_rmid(struct task_struct *tsk,
+ u32 closid, u32 rmid)
+{
+ WRITE_ONCE(tsk->closid, closid);
+ WRITE_ONCE(tsk->rmid, rmid);
+}
+
+static inline bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid)
+{
+ return READ_ONCE(tsk->closid) == closid;
+}
+
+static inline bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 ignored,
+ u32 rmid)
+{
+ return READ_ONCE(tsk->rmid) == rmid;
+}
+
static inline void resctrl_sched_in(struct task_struct *tsk)
{
if (static_branch_likely(&rdt_enable_key))
__resctrl_sched_in(tsk);
}
+static inline u32 resctrl_arch_system_num_rmid_idx(void)
+{
+ /* RMIDs are independent numbers on x86: num_rmid_idx == num_rmid */
+ return boot_cpu_data.x86_cache_max_rmid + 1;
+}
+
+static inline void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid)
+{
+ *rmid = idx;
+ *closid = X86_RESCTRL_EMPTY_CLOSID;
+}
+
+static inline u32 resctrl_arch_rmid_idx_encode(u32 ignored, u32 rmid)
+{
+ return rmid;
+}
+
+/* x86 can always read an RMID; nothing needs allocating */
+struct rdt_resource;
+static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, int evtid)
+{
+ might_sleep();
+ return NULL;
+};
+
+static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, int evtid,
+ void *ctx) { };
+
void resctrl_cpu_detect(struct cpuinfo_x86 *c);
#else
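
On x86 the index/(closid, rmid) mapping is the identity on the RMID, with the CLOSID discarded. A round trip using the helpers above:

u32 closid, rmid;
u32 idx = resctrl_arch_rmid_idx_encode(X86_RESCTRL_EMPTY_CLOSID, 42);

resctrl_arch_rmid_idx_decode(idx, &closid, &rmid);
/* idx == 42, rmid == 42, closid == X86_RESCTRL_EMPTY_CLOSID */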
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 5c83729c8e71..e61e68d71cba 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -48,7 +48,7 @@ extern unsigned long saved_video_mode;
extern void reserve_standard_io_resources(void);
extern void i386_reserve_resources(void);
extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp);
-extern void startup_64_setup_env(unsigned long physbase);
+extern void startup_64_setup_gdt_idt(void);
extern void early_setup_idt(void);
extern void __init do_early_exception(struct pt_regs *regs, int trapnr);
diff --git a/arch/x86/include/asm/setup_data.h b/arch/x86/include/asm/setup_data.h
new file mode 100644
index 000000000000..77c51111a893
--- /dev/null
+++ b/arch/x86/include/asm/setup_data.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SETUP_DATA_H
+#define _ASM_X86_SETUP_DATA_H
+
+#include <uapi/asm/setup_data.h>
+
+#ifndef __ASSEMBLY__
+
+struct pci_setup_rom {
+ struct setup_data data;
+ uint16_t vendor;
+ uint16_t devid;
+ uint64_t pcilen;
+ unsigned long segment;
+ unsigned long bus;
+ unsigned long device;
+ unsigned long function;
+ uint8_t romdata[];
+};
+
+/* kexec external ABI */
+struct efi_setup_data {
+ u64 fw_vendor;
+ u64 __unused;
+ u64 tables;
+ u64 smbios;
+ u64 reserved[8];
+};
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_SETUP_DATA_H */
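
A hedged sketch of walking the setup_data chain these structures describe; the function is hypothetical and uses __va() as a simplification where real code must map each physical node (e.g. with early_memremap()):

static void walk_setup_data(struct boot_params *bp)
{
	u64 pa_data = bp->hdr.setup_data;

	while (pa_data) {
		struct setup_data *sd = __va(pa_data);	/* simplification */

		if (sd->type == SETUP_PCI) {
			struct pci_setup_rom *rom = (struct pci_setup_rom *)sd;

			pr_info("PCI ROM %04x:%04x, %llu bytes\n",
				rom->vendor, rom->devid, rom->pcilen);
		}
		pa_data = sd->next;
	}
}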
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 5b4a1ce3d368..9477b4053bce 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -13,7 +13,6 @@
#include <asm/insn.h>
#include <asm/sev-common.h>
-#include <asm/bootparam.h>
#include <asm/coco.h>
#define GHCB_PROTOCOL_MIN 1ULL
@@ -22,6 +21,8 @@
#define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); }
+struct boot_params;
+
enum es_result {
ES_OK, /* All good */
ES_UNSUPPORTED, /* Requested operation not supported */
@@ -87,9 +88,23 @@ extern bool handle_vc_boot_ghcb(struct pt_regs *regs);
/* Software defined (when rFlags.CF = 1) */
#define PVALIDATE_FAIL_NOUPDATE 255
+/* RMPUPDATE detected 4K page and 2MB page overlap. */
+#define RMPUPDATE_FAIL_OVERLAP 4
+
/* RMP page size */
#define RMP_PG_SIZE_4K 0
#define RMP_PG_SIZE_2M 1
+#define RMP_TO_PG_LEVEL(level) (((level) == RMP_PG_SIZE_4K) ? PG_LEVEL_4K : PG_LEVEL_2M)
+#define PG_LEVEL_TO_RMP(level) (((level) == PG_LEVEL_4K) ? RMP_PG_SIZE_4K : RMP_PG_SIZE_2M)
+
+struct rmp_state {
+ u64 gpa;
+ u8 assigned;
+ u8 pagesize;
+ u8 immutable;
+ u8 rsvd;
+ u32 asid;
+} __packed;
#define RMPADJUST_VMSA_PAGE_BIT BIT(16)
@@ -199,20 +214,22 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate)
struct snp_guest_request_ioctl;
void setup_ghcb(void);
-void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
- unsigned long npages);
-void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
- unsigned long npages);
+void early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
+ unsigned long npages);
+void early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
+ unsigned long npages);
void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op);
void snp_set_memory_shared(unsigned long vaddr, unsigned long npages);
void snp_set_memory_private(unsigned long vaddr, unsigned long npages);
void snp_set_wakeup_secondary_cpu(void);
bool snp_init(struct boot_params *bp);
-void __init __noreturn snp_abort(void);
+void __noreturn snp_abort(void);
int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio);
void snp_accept_memory(phys_addr_t start, phys_addr_t end);
u64 snp_get_unsupported_features(u64 status);
u64 sev_get_status(void);
+void kdump_sev_callback(void);
+void sev_show_status(void);
#else
static inline void sev_es_ist_enter(struct pt_regs *regs) { }
static inline void sev_es_ist_exit(void) { }
@@ -241,6 +258,30 @@ static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *in
static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }
static inline u64 snp_get_unsupported_features(u64 status) { return 0; }
static inline u64 sev_get_status(void) { return 0; }
+static inline void kdump_sev_callback(void) { }
+static inline void sev_show_status(void) { }
+#endif
+
+#ifdef CONFIG_KVM_AMD_SEV
+bool snp_probe_rmptable_info(void);
+int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level);
+void snp_dump_hva_rmpentry(unsigned long address);
+int psmash(u64 pfn);
+int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable);
+int rmp_make_shared(u64 pfn, enum pg_level level);
+void snp_leak_pages(u64 pfn, unsigned int npages);
+#else
+static inline bool snp_probe_rmptable_info(void) { return false; }
+static inline int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level) { return -ENODEV; }
+static inline void snp_dump_hva_rmpentry(unsigned long address) {}
+static inline int psmash(u64 pfn) { return -ENODEV; }
+static inline int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid,
+ bool immutable)
+{
+ return -ENODEV;
+}
+static inline int rmp_make_shared(u64 pfn, enum pg_level level) { return -ENODEV; }
+static inline void snp_leak_pages(u64 pfn, unsigned int npages) {}
#endif
#endif
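
A hedged sketch of the host-side flow these helpers enable; the function is hypothetical, and KVM's real SEV-SNP paths add far more validation:

static int make_page_guest_owned(u64 pfn, u64 gpa, u32 asid)
{
	int ret;

	/* Transition the page to guest-owned in the RMP. */
	ret = rmp_make_private(pfn, gpa, PG_LEVEL_4K, asid, false);
	if (ret)
		return ret;

	/* Later, on teardown, return it to the hypervisor: */
	ret = rmp_make_shared(pfn, PG_LEVEL_4K);
	if (ret)
		snp_leak_pages(pfn, 1);	/* cannot be safely reused; quarantine */

	return ret;
}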
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 54d6d71e0eca..a35936b512fe 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -56,11 +56,6 @@ static inline void stop_other_cpus(void)
smp_ops.stop_other_cpus(1);
}
-static inline void smp_prepare_boot_cpu(void)
-{
- smp_ops.smp_prepare_boot_cpu();
-}
-
static inline void smp_prepare_cpus(unsigned int max_cpus)
{
smp_ops.smp_prepare_cpus(max_cpus);
diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h
index c648502e4535..658b690b2ccb 100644
--- a/arch/x86/include/asm/spec-ctrl.h
+++ b/arch/x86/include/asm/spec-ctrl.h
@@ -96,4 +96,6 @@ static inline void speculative_store_bypass_ht_init(void) { }
extern void speculation_ctrl_update(unsigned long tif);
extern void speculation_ctrl_update_current(void);
+extern bool itlb_multihit_kvm_mitigation;
+
#endif
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 48f8dd47cf68..2e9fc5c400cd 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -2,11 +2,11 @@
#ifndef _ASM_X86_SPECIAL_INSNS_H
#define _ASM_X86_SPECIAL_INSNS_H
-
#ifdef __KERNEL__
-
#include <asm/nops.h>
#include <asm/processor-flags.h>
+
+#include <linux/errno.h>
#include <linux/irqflags.h>
#include <linux/jump_label.h>
@@ -224,10 +224,10 @@ static inline void serialize(void)
}
/* The dst parameter must be 64-bytes aligned */
-static inline void movdir64b(void __iomem *dst, const void *src)
+static inline void movdir64b(void *dst, const void *src)
{
const struct { char _[64]; } *__src = src;
- struct { char _[64]; } __iomem *__dst = dst;
+ struct { char _[64]; } *__dst = dst;
/*
* MOVDIR64B %(rdx), rax.
@@ -245,6 +245,11 @@ static inline void movdir64b(void __iomem *dst, const void *src)
: "m" (*__src), "a" (__dst), "d" (__src));
}
+static inline void movdir64b_io(void __iomem *dst, const void *src)
+{
+ movdir64b((void __force *)dst, src);
+}
+
/**
* enqcmds - Enqueue a command in supervisor (CPL0) mode
* @dst: destination, in MMIO space (must be 512-bit aligned)
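
The split keeps sparse's address-space checking honest: movdir64b() now takes a plain kernel pointer, and MMIO users go through the __iomem-typed wrapper. A minimal sketch (the caller is hypothetical):

static void post_descriptor(void __iomem *portal, const void *desc)
{
	/* The destination portal must be 64-byte aligned per MOVDIR64B. */
	movdir64b_io(portal, desc);
}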
diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h
index 343b722ccaf2..125c407e2abe 100644
--- a/arch/x86/include/asm/static_call.h
+++ b/arch/x86/include/asm/static_call.h
@@ -46,7 +46,7 @@
#define ARCH_DEFINE_STATIC_CALL_TRAMP(name, func) \
__ARCH_DEFINE_STATIC_CALL_TRAMP(name, ".byte 0xe9; .long " #func " - (. + 4)")
-#ifdef CONFIG_RETHUNK
+#ifdef CONFIG_MITIGATION_RETHUNK
#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \
__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "jmp __x86_return_thunk")
#else
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index 0b70653a98c1..345aafbc1964 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -15,6 +15,8 @@
extern void text_poke_early(void *addr, const void *opcode, size_t len);
+extern void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len);
+
/*
* Clear and restore the kernel write-protection flag on the local CPU.
* Allows the kernel to edit read-only pages.
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 76b1d87f1531..abe3a8f22cbd 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -224,7 +224,7 @@ static inline bool topology_is_primary_thread(unsigned int cpu)
static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; }
static inline int topology_max_smt_threads(void) { return 1; }
static inline bool topology_is_primary_thread(unsigned int cpu) { return true; }
-static inline unsigned int topology_amd_nodes_per_pkg(void) { return 0; };
+static inline unsigned int topology_amd_nodes_per_pkg(void) { return 1; }
#endif /* !CONFIG_SMP */
static inline void arch_fix_phys_package_id(int num, u32 slot)
diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h
index afa524325e55..a23a7b707b64 100644
--- a/arch/x86/include/asm/trap_pf.h
+++ b/arch/x86/include/asm/trap_pf.h
@@ -2,6 +2,8 @@
#ifndef _ASM_X86_TRAP_PF_H
#define _ASM_X86_TRAP_PF_H
+#include <linux/bits.h>
+
/*
* Page fault error code bits:
*
@@ -13,16 +15,18 @@
* bit 5 == 1: protection keys block access
* bit 6 == 1: shadow stack access fault
* bit 15 == 1: SGX MMU page-fault
+ * bit 31 == 1: fault was due to RMP violation
*/
enum x86_pf_error_code {
- X86_PF_PROT = 1 << 0,
- X86_PF_WRITE = 1 << 1,
- X86_PF_USER = 1 << 2,
- X86_PF_RSVD = 1 << 3,
- X86_PF_INSTR = 1 << 4,
- X86_PF_PK = 1 << 5,
- X86_PF_SHSTK = 1 << 6,
- X86_PF_SGX = 1 << 15,
+ X86_PF_PROT = BIT(0),
+ X86_PF_WRITE = BIT(1),
+ X86_PF_USER = BIT(2),
+ X86_PF_RSVD = BIT(3),
+ X86_PF_INSTR = BIT(4),
+ X86_PF_PK = BIT(5),
+ X86_PF_SHSTK = BIT(6),
+ X86_PF_SGX = BIT(15),
+ X86_PF_RMP = BIT(31),
};
#endif /* _ASM_X86_TRAP_PF_H */
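
Decoding is unchanged, only spelled with BIT(); for example, a fault path can test the new RMP bit alongside the existing ones (the helper is hypothetical):

static void describe_fault(unsigned long error_code)
{
	if (error_code & X86_PF_RMP)
		pr_warn("RMP violation\n");
	if (!(error_code & X86_PF_PROT))
		pr_warn("page not present\n");
	if (error_code & X86_PF_WRITE)
		pr_warn("write access\n");
}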
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 594fce0ca744..405efb3e4996 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -5,8 +5,9 @@
#ifndef _ASM_X86_TSC_H
#define _ASM_X86_TSC_H
-#include <asm/processor.h>
#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
/*
* Standard way to access the cycle counter.
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index f2c02e4469cc..04789f45ab2b 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -11,6 +11,7 @@
#include <asm/alternative.h>
#include <asm/cpufeatures.h>
#include <asm/page.h>
+#include <asm/percpu.h>
#ifdef CONFIG_ADDRESS_MASKING
/*
@@ -18,14 +19,10 @@
*/
static inline unsigned long __untagged_addr(unsigned long addr)
{
- /*
- * Refer tlbstate_untag_mask directly to avoid RIP-relative relocation
- * in alternative instructions. The relocation gets wrong when gets
- * copied to the target place.
- */
asm (ALTERNATIVE("",
- "and %%gs:tlbstate_untag_mask, %[addr]\n\t", X86_FEATURE_LAM)
- : [addr] "+r" (addr) : "m" (tlbstate_untag_mask));
+ "and " __percpu_arg([mask]) ", %[addr]", X86_FEATURE_LAM)
+ : [addr] "+r" (addr)
+ : [mask] "m" (__my_cpu_var(tlbstate_untag_mask)));
return addr;
}
@@ -54,7 +51,7 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm,
* half and a user half. When cast to a signed type, user pointers
* are positive and kernel pointers are negative.
*/
-#define valid_user_address(x) ((long)(x) >= 0)
+#define valid_user_address(x) ((__force long)(x) >= 0)
/*
* User pointers can have tag bits on x86-64. This scheme tolerates
@@ -87,8 +84,9 @@ static inline bool __access_ok(const void __user *ptr, unsigned long size)
if (__builtin_constant_p(size <= PAGE_SIZE) && size <= PAGE_SIZE) {
return valid_user_address(ptr);
} else {
- unsigned long sum = size + (unsigned long)ptr;
- return valid_user_address(sum) && sum >= (unsigned long)ptr;
+ unsigned long sum = size + (__force unsigned long)ptr;
+
+ return valid_user_address(sum) && sum >= (__force unsigned long)ptr;
}
}
#define __access_ok __access_ok
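
The second branch guards against wraparound: a kernel pointer cannot be smuggled past the sign check by a size that overflows the sum. Worked values, assuming x86-64:

/* User-half pointer, small constant size: fast path, accepted. */
__access_ok((void __user *)0x00007f0000000000, 4096);

/* Kernel pointer with a size that wraps the sum to 0: sum = 0 passes
 * valid_user_address(), but sum < ptr, so the wrap check rejects it. */
__access_ok((void __user *)0xffffffffffff0000, 0x10000);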
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 9ca624749176..b89b40f250e6 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -2,8 +2,6 @@
#ifndef _ASM_X86_PLATFORM_H
#define _ASM_X86_PLATFORM_H
-#include <asm/bootparam.h>
-
struct ghcb;
struct mpc_bus;
struct mpc_cpu;
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index 01d19fc22346..9b82eebd7add 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -2,21 +2,7 @@
#ifndef _ASM_X86_BOOTPARAM_H
#define _ASM_X86_BOOTPARAM_H
-/* setup_data/setup_indirect types */
-#define SETUP_NONE 0
-#define SETUP_E820_EXT 1
-#define SETUP_DTB 2
-#define SETUP_PCI 3
-#define SETUP_EFI 4
-#define SETUP_APPLE_PROPERTIES 5
-#define SETUP_JAILHOUSE 6
-#define SETUP_CC_BLOB 7
-#define SETUP_IMA 8
-#define SETUP_RNG_SEED 9
-#define SETUP_ENUM_MAX SETUP_RNG_SEED
-
-#define SETUP_INDIRECT (1<<31)
-#define SETUP_TYPE_MAX (SETUP_ENUM_MAX | SETUP_INDIRECT)
+#include <asm/setup_data.h>
/* ram_size flags */
#define RAMDISK_IMAGE_START_MASK 0x07FF
@@ -38,6 +24,7 @@
#define XLF_EFI_KEXEC (1<<4)
#define XLF_5LEVEL (1<<5)
#define XLF_5LEVEL_ENABLED (1<<6)
+#define XLF_MEM_ENCRYPTION (1<<7)
#ifndef __ASSEMBLY__
@@ -48,22 +35,6 @@
#include <asm/ist.h>
#include <video/edid.h>
-/* extensible setup data list node */
-struct setup_data {
- __u64 next;
- __u32 type;
- __u32 len;
- __u8 data[];
-};
-
-/* extensible setup indirect data node */
-struct setup_indirect {
- __u32 type;
- __u32 reserved; /* Reserved, must be set to zero. */
- __u64 len;
- __u64 addr;
-};
-
struct setup_header {
__u8 setup_sects;
__u16 root_flags;
@@ -137,50 +108,10 @@ struct efi_info {
#define E820_MAX_ENTRIES_ZEROPAGE 128
/*
- * The E820 memory region entry of the boot protocol ABI:
- */
-struct boot_e820_entry {
- __u64 addr;
- __u64 size;
- __u32 type;
-} __attribute__((packed));
-
-/*
* Smallest compatible version of jailhouse_setup_data required by this kernel.
*/
#define JAILHOUSE_SETUP_REQUIRED_VERSION 1
-/*
- * The boot loader is passing platform information via this Jailhouse-specific
- * setup data structure.
- */
-struct jailhouse_setup_data {
- struct {
- __u16 version;
- __u16 compatible_version;
- } __attribute__((packed)) hdr;
- struct {
- __u16 pm_timer_address;
- __u16 num_cpus;
- __u64 pci_mmconfig_base;
- __u32 tsc_khz;
- __u32 apic_khz;
- __u8 standard_ioapic;
- __u8 cpu_ids[255];
- } __attribute__((packed)) v1;
- struct {
- __u32 flags;
- } __attribute__((packed)) v2;
-} __attribute__((packed));
-
-/*
- * IMA buffer setup data information from the previous kernel during kexec
- */
-struct ima_setup_data {
- __u64 addr;
- __u64 size;
-} __attribute__((packed));
-
/* The so-called "zeropage" */
struct boot_params {
struct screen_info screen_info; /* 0x000 */
diff --git a/arch/x86/include/uapi/asm/setup_data.h b/arch/x86/include/uapi/asm/setup_data.h
new file mode 100644
index 000000000000..b111b0c18544
--- /dev/null
+++ b/arch/x86/include/uapi/asm/setup_data.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_ASM_X86_SETUP_DATA_H
+#define _UAPI_ASM_X86_SETUP_DATA_H
+
+/* setup_data/setup_indirect types */
+#define SETUP_NONE 0
+#define SETUP_E820_EXT 1
+#define SETUP_DTB 2
+#define SETUP_PCI 3
+#define SETUP_EFI 4
+#define SETUP_APPLE_PROPERTIES 5
+#define SETUP_JAILHOUSE 6
+#define SETUP_CC_BLOB 7
+#define SETUP_IMA 8
+#define SETUP_RNG_SEED 9
+#define SETUP_ENUM_MAX SETUP_RNG_SEED
+
+#define SETUP_INDIRECT (1<<31)
+#define SETUP_TYPE_MAX (SETUP_ENUM_MAX | SETUP_INDIRECT)
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+
+/* extensible setup data list node */
+struct setup_data {
+ __u64 next;
+ __u32 type;
+ __u32 len;
+ __u8 data[];
+};
+
+/* extensible setup indirect data node */
+struct setup_indirect {
+ __u32 type;
+ __u32 reserved; /* Reserved, must be set to zero. */
+ __u64 len;
+ __u64 addr;
+};
+
+/*
+ * The E820 memory region entry of the boot protocol ABI:
+ */
+struct boot_e820_entry {
+ __u64 addr;
+ __u64 size;
+ __u32 type;
+} __attribute__((packed));
+
+/*
+ * The boot loader is passing platform information via this Jailhouse-specific
+ * setup data structure.
+ */
+struct jailhouse_setup_data {
+ struct {
+ __u16 version;
+ __u16 compatible_version;
+ } __attribute__((packed)) hdr;
+ struct {
+ __u16 pm_timer_address;
+ __u16 num_cpus;
+ __u64 pci_mmconfig_base;
+ __u32 tsc_khz;
+ __u32 apic_khz;
+ __u8 standard_ioapic;
+ __u8 cpu_ids[255];
+ } __attribute__((packed)) v1;
+ struct {
+ __u32 flags;
+ } __attribute__((packed)) v2;
+} __attribute__((packed));
+
+/*
+ * IMA buffer setup data information from the previous kernel during kexec
+ */
+struct ima_setup_data {
+ __u64 addr;
+ __u64 size;
+} __attribute__((packed));
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _UAPI_ASM_X86_SETUP_DATA_H */
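
SETUP_INDIRECT is a flag rather than another enum value: an indirect node's payload is a struct setup_indirect whose type field carries the flag OR'd with the real type. A hedged helper sketch (the name and flag handling are assumptions drawn from the definitions above):

static u32 setup_data_effective_type(const struct setup_data *sd)
{
	if (sd->type == SETUP_INDIRECT) {
		const struct setup_indirect *si = (const void *)sd->data;

		return si->type & ~SETUP_INDIRECT;
	}
	return sd->type;
}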
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0dcbfc1a4c41..d0c744cb2a0e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -33,6 +33,7 @@ KASAN_SANITIZE_sev.o := n
KCSAN_SANITIZE := n
KMSAN_SANITIZE_head$(BITS).o := n
KMSAN_SANITIZE_nmi.o := n
+KMSAN_SANITIZE_sev.o := n
# If instrumentation of the following files is enabled, boot hangs during
# first second.
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index d5d8a352eafa..94ff83f3d3fe 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -17,7 +17,7 @@
* Hooray, we are in Long 64-bit mode (but still running in low memory)
*/
SYM_FUNC_START(wakeup_long64)
- movq saved_magic, %rax
+ movq saved_magic(%rip), %rax
movq $0x123456789abcdef0, %rdx
cmpq %rdx, %rax
je 2f
@@ -33,14 +33,14 @@ SYM_FUNC_START(wakeup_long64)
movw %ax, %es
movw %ax, %fs
movw %ax, %gs
- movq saved_rsp, %rsp
+ movq saved_rsp(%rip), %rsp
- movq saved_rbx, %rbx
- movq saved_rdi, %rdi
- movq saved_rsi, %rsi
- movq saved_rbp, %rbp
+ movq saved_rbx(%rip), %rbx
+ movq saved_rdi(%rip), %rdi
+ movq saved_rsi(%rip), %rsi
+ movq saved_rbp(%rip), %rbp
- movq saved_rip, %rax
+ movq saved_rip(%rip), %rax
ANNOTATE_RETPOLINE_SAFE
jmp *%rax
SYM_FUNC_END(wakeup_long64)
@@ -72,11 +72,11 @@ SYM_FUNC_START(do_suspend_lowlevel)
movq $.Lresume_point, saved_rip(%rip)
- movq %rsp, saved_rsp
- movq %rbp, saved_rbp
- movq %rbx, saved_rbx
- movq %rdi, saved_rdi
- movq %rsi, saved_rsi
+ movq %rsp, saved_rsp(%rip)
+ movq %rbp, saved_rbp(%rip)
+ movq %rbx, saved_rbx(%rip)
+ movq %rdi, saved_rdi(%rip)
+ movq %rsi, saved_rsi(%rip)
addq $8, %rsp
movl $3, %edi
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 1d85cb7071cb..ff6e32ec8259 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -45,7 +45,7 @@ EXPORT_SYMBOL_GPL(alternatives_patched);
#define DA_ENDBR 0x08
#define DA_SMP 0x10
-static unsigned int __initdata_or_module debug_alternative;
+static unsigned int debug_alternative;
static int __init debug_alt(char *str)
{
@@ -133,7 +133,7 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
* each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and
* *jump* over instead of executing long and daft NOPs.
*/
-static void __init_or_module add_nop(u8 *instr, unsigned int len)
+static void add_nop(u8 *instr, unsigned int len)
{
u8 *target = instr + len;
@@ -206,7 +206,7 @@ static int skip_nops(u8 *instr, int offset, int len)
* Optimize a sequence of NOPs, possibly preceded by an unconditional jump
* to the end of the NOP sequence into a single NOP.
*/
-static bool __init_or_module
+static bool
__optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target)
{
int i = *next - insn->length;
@@ -335,8 +335,7 @@ bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
return (target < src || target > src + src_len);
}
-static void __init_or_module noinline
-apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
+void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
{
int prev, target = 0;
@@ -545,7 +544,7 @@ static inline bool is_jcc32(struct insn *insn)
return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
}
-#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL)
+#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL)
/*
* CALL/JMP *%\reg
@@ -709,8 +708,8 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
/*
* The compiler is supposed to EMIT an INT3 after every unconditional
* JMP instruction due to AMD BTC. However, if the compiler is too old
- * or SLS isn't enabled, we still need an INT3 after indirect JMPs
- * even on Intel.
+ * or MITIGATION_SLS isn't enabled, we still need an INT3 after
+ * indirect JMPs even on Intel.
*/
if (op == JMP32_INSN_OPCODE && i < insn->length)
bytes[i++] = INT3_INSN_OPCODE;
@@ -770,7 +769,7 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
}
}
-#ifdef CONFIG_RETHUNK
+#ifdef CONFIG_MITIGATION_RETHUNK
/*
* Rewrite the compiler generated return thunk tail-calls.
@@ -843,14 +842,14 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end)
}
#else
void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
-#endif /* CONFIG_RETHUNK */
+#endif /* CONFIG_MITIGATION_RETHUNK */
-#else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */
+#else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
-#endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */
+#endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */
#ifdef CONFIG_X86_KERNEL_IBT
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 76a5ced278c2..b37ab1095707 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1055,35 +1055,6 @@ static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
return APM_SUCCESS;
}
-#if 0
-static int apm_get_battery_status(u_short which, u_short *status,
- u_short *bat, u_short *life, u_short *nbat)
-{
- u32 eax;
- u32 ebx;
- u32 ecx;
- u32 edx;
- u32 esi;
-
- if (apm_info.connection_version < 0x0102) {
- /* pretend we only have one battery. */
- if (which != 1)
- return APM_BAD_DEVICE;
- *nbat = 1;
- return apm_get_power_status(status, bat, life);
- }
-
- if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax,
- &ebx, &ecx, &edx, &esi))
- return (eax >> 8) & 0xff;
- *status = ebx;
- *bat = ecx;
- *life = edx;
- *nbat = esi;
- return APM_SUCCESS;
-}
-#endif
-
/**
* apm_engage_power_management - enable PM on a device
* @device: identity of device
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 6913b372ccf7..a98020bf31bb 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -109,7 +109,7 @@ static void __used common(void)
OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack);
OFFSET(X86_current_task, pcpu_hot, current_task);
-#ifdef CONFIG_CALL_DEPTH_TRACKING
+#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
OFFSET(X86_call_depth, pcpu_hot, call_depth);
#endif
#if IS_ENABLED(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64)
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
index 64ad2ddea121..30335182b6b0 100644
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -24,6 +24,8 @@
static int __initdata_or_module debug_callthunks;
+#define MAX_PATCH_LEN (255-1)
+
#define prdbg(fmt, args...) \
do { \
if (debug_callthunks) \
@@ -42,8 +44,8 @@ DEFINE_PER_CPU(u64, __x86_call_count);
DEFINE_PER_CPU(u64, __x86_ret_count);
DEFINE_PER_CPU(u64, __x86_stuffs_count);
DEFINE_PER_CPU(u64, __x86_ctxsw_count);
-EXPORT_SYMBOL_GPL(__x86_ctxsw_count);
-EXPORT_SYMBOL_GPL(__x86_call_count);
+EXPORT_PER_CPU_SYMBOL_GPL(__x86_ctxsw_count);
+EXPORT_PER_CPU_SYMBOL_GPL(__x86_call_count);
#endif
extern s32 __call_sites[], __call_sites_end[];
@@ -179,10 +181,15 @@ static const u8 nops[] = {
static void *patch_dest(void *dest, bool direct)
{
unsigned int tsize = SKL_TMPL_SIZE;
+ u8 insn_buff[MAX_PATCH_LEN];
u8 *pad = dest - tsize;
+ memcpy(insn_buff, skl_call_thunk_template, tsize);
+ apply_relocation(insn_buff, tsize, pad,
+ skl_call_thunk_template, tsize);
+
/* Already patched? */
- if (!bcmp(pad, skl_call_thunk_template, tsize))
+ if (!bcmp(pad, insn_buff, tsize))
return pad;
/* Ensure there are nops */
@@ -192,9 +199,9 @@ static void *patch_dest(void *dest, bool direct)
}
if (direct)
- memcpy(pad, skl_call_thunk_template, tsize);
+ memcpy(pad, insn_buff, tsize);
else
- text_poke_copy_locked(pad, skl_call_thunk_template, tsize, true);
+ text_poke_copy_locked(pad, insn_buff, tsize, true);
return pad;
}
@@ -290,20 +297,27 @@ void *callthunks_translate_call_dest(void *dest)
static bool is_callthunk(void *addr)
{
unsigned int tmpl_size = SKL_TMPL_SIZE;
- void *tmpl = skl_call_thunk_template;
+ u8 insn_buff[MAX_PATCH_LEN];
unsigned long dest;
+ u8 *pad;
dest = roundup((unsigned long)addr, CONFIG_FUNCTION_ALIGNMENT);
if (!thunks_initialized || skip_addr((void *)dest))
return false;
- return !bcmp((void *)(dest - tmpl_size), tmpl, tmpl_size);
+ pad = (void *)(dest - tmpl_size);
+
+ memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
+ apply_relocation(insn_buff, tmpl_size, pad,
+ skl_call_thunk_template, tmpl_size);
+
+ return !bcmp(pad, insn_buff, tmpl_size);
}
int x86_call_depth_emit_accounting(u8 **pprog, void *func)
{
unsigned int tmpl_size = SKL_TMPL_SIZE;
- void *tmpl = skl_call_thunk_template;
+ u8 insn_buff[MAX_PATCH_LEN];
if (!thunks_initialized)
return 0;
@@ -312,7 +326,11 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func)
if (func && is_callthunk(func))
return 0;
- memcpy(*pprog, tmpl, tmpl_size);
+ memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
+ apply_relocation(insn_buff, tmpl_size, *pprog,
+ skl_call_thunk_template, tmpl_size);
+
+ memcpy(*pprog, insn_buff, tmpl_size);
*pprog += tmpl_size;
return tmpl_size;
}
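
The callthunks hunks above change the patching flow: the Skylake call-depth template is first copied into a local insn_buff and relocated for the exact pad address, and only then compared against (or written over) the existing text, since the template contains position-dependent displacements that a raw bcmp() against the pristine template would no longer match. A minimal user-space sketch of that copy/relocate/compare pattern follows; the one-instruction template and the apply_reloc() helper are illustrative stand-ins, not the kernel's implementation.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define TMPL_SIZE 5

/* A fake "call rel32" template: opcode plus a 4-byte displacement of 0. */
static const uint8_t tmpl[TMPL_SIZE] = { 0xe8, 0, 0, 0, 0 };

/* Rewrite the rel32 so the call targets @target when placed at @site. */
static void apply_reloc(uint8_t *buf, uintptr_t site, uintptr_t target)
{
	int32_t rel = (int32_t)(target - (site + TMPL_SIZE));

	memcpy(buf + 1, &rel, sizeof(rel));
}

int main(void)
{
	uint8_t pad[TMPL_SIZE];
	uint8_t insn_buff[TMPL_SIZE];
	uintptr_t site = 0x1000, target = 0x2000;

	/* Build the expected bytes for this specific site... */
	memcpy(insn_buff, tmpl, TMPL_SIZE);
	apply_reloc(insn_buff, site, target);

	/* ...patch the pad once... */
	memcpy(pad, insn_buff, TMPL_SIZE);

	/* ...and "already patched?" now compares relocated bytes. */
	printf("already patched: %d\n", !memcmp(pad, insn_buff, TMPL_SIZE));
	return 0;
}
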
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index c82069e5f341..3282a747b645 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -20,6 +20,7 @@
#include <asm/delay.h>
#include <asm/debugreg.h>
#include <asm/resctrl.h>
+#include <asm/sev.h>
#ifdef CONFIG_X86_64
# include <asm/mmconfig.h>
@@ -451,6 +452,21 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
break;
}
+ if (cpu_has(c, X86_FEATURE_SEV_SNP)) {
+ /*
+ * The RMP table entry format is not architectural; it can vary by processor
+ * and is defined by the per-processor PPR. Restrict SNP support to the
+ * known CPU models and families for which the RMP table entry format is
+ * currently defined.
+ */
+ if (!boot_cpu_has(X86_FEATURE_ZEN3) &&
+ !boot_cpu_has(X86_FEATURE_ZEN4) &&
+ !boot_cpu_has(X86_FEATURE_ZEN5))
+ setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
+ else if (!snp_probe_rmptable_info())
+ setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
+ }
+
return;
warn:
@@ -469,8 +485,8 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
* SME feature (set in scattered.c).
* If the kernel has not enabled SME via any means then
* don't advertise the SME feature.
- * For SEV: If BIOS has not enabled SEV then don't advertise the
- * SEV and SEV_ES feature (set in scattered.c).
+ * For SEV: If BIOS has not enabled SEV then don't advertise SEV and
+ * any additional functionality based on it.
*
* In all cases, since support for SME and SEV requires long mode,
* don't advertise the feature under CONFIG_X86_32.
@@ -505,6 +521,7 @@ clear_all:
clear_sev:
setup_clear_cpu_cap(X86_FEATURE_SEV);
setup_clear_cpu_cap(X86_FEATURE_SEV_ES);
+ setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
}
}
@@ -800,7 +817,7 @@ static void fix_erratum_1386(struct cpuinfo_x86 *c)
void init_spectral_chicken(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_CPU_UNRET_ENTRY
+#ifdef CONFIG_MITIGATION_UNRET_ENTRY
u64 value;
/*
@@ -828,7 +845,6 @@ static void init_amd_zen_common(void)
static void init_amd_zen1(struct cpuinfo_x86 *c)
{
- init_amd_zen_common();
fix_erratum_1386(c);
/* Fix up CPUID bits, but only if not virtualised. */
@@ -882,7 +898,6 @@ static void zen2_zenbleed_check(struct cpuinfo_x86 *c)
static void init_amd_zen2(struct cpuinfo_x86 *c)
{
- init_amd_zen_common();
init_spectral_chicken(c);
fix_erratum_1386(c);
zen2_zenbleed_check(c);
@@ -890,8 +905,6 @@ static void init_amd_zen2(struct cpuinfo_x86 *c)
static void init_amd_zen3(struct cpuinfo_x86 *c)
{
- init_amd_zen_common();
-
if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
/*
* Zen3 (Fam19 model < 0x10) parts are not susceptible to
@@ -905,15 +918,12 @@ static void init_amd_zen3(struct cpuinfo_x86 *c)
static void init_amd_zen4(struct cpuinfo_x86 *c)
{
- init_amd_zen_common();
-
if (!cpu_has(c, X86_FEATURE_HYPERVISOR))
msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
}
static void init_amd_zen5(struct cpuinfo_x86 *c)
{
- init_amd_zen_common();
}
static void init_amd(struct cpuinfo_x86 *c)
@@ -950,6 +960,13 @@ static void init_amd(struct cpuinfo_x86 *c)
case 0x16: init_amd_jg(c); break;
}
+ /*
+ * Save some future enablement work by doing the common Zen
+ * settings here.
+ */
+ if (c->x86 >= 0x17)
+ init_amd_zen_common();
+
if (boot_cpu_has(X86_FEATURE_ZEN1))
init_amd_zen1(c);
else if (boot_cpu_has(X86_FEATURE_ZEN2))
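
The SEV-SNP gating added to bsp_init_amd() above boils down to two checks: the CPU must be a Zen generation whose RMP table entry format is documented, and the RMP table itself must be probeable. A condensed restatement as a pure function, purely illustrative and not kernel code:

#include <stdbool.h>
#include <stdio.h>

static bool snp_supported(bool zen3, bool zen4, bool zen5, bool rmp_probe_ok)
{
	if (!zen3 && !zen4 && !zen5)
		return false;	/* RMP entry format undefined here */
	return rmp_probe_ok;	/* clear SEV_SNP if probing failed */
}

int main(void)
{
	printf("%d\n", snp_supported(true, false, false, true));  /* 1 */
	printf("%d\n", snp_supported(false, false, false, true)); /* 0 */
	return 0;
}
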
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 48d049cd74e7..e7ba936d798b 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -56,7 +56,7 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
/* The current value of the SPEC_CTRL MSR with task-specific bits set */
DEFINE_PER_CPU(u64, x86_spec_ctrl_current);
-EXPORT_SYMBOL_GPL(x86_spec_ctrl_current);
+EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current);
u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB;
EXPORT_SYMBOL_GPL(x86_pred_cmd);
@@ -422,6 +422,13 @@ static void __init mmio_select_mitigation(void)
if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) &&
boot_cpu_has(X86_FEATURE_RTM)))
setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+
+ /*
+ * X86_FEATURE_CLEAR_CPU_BUF may already be enabled by other VERW-based
+ * mitigations; disable the KVM-only mitigation in that case.
+ */
+ if (boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF))
+ static_branch_disable(&mmio_stale_data_clear);
else
static_branch_enable(&mmio_stale_data_clear);
@@ -474,6 +481,57 @@ static int __init mmio_stale_data_parse_cmdline(char *str)
early_param("mmio_stale_data", mmio_stale_data_parse_cmdline);
#undef pr_fmt
+#define pr_fmt(fmt) "Register File Data Sampling: " fmt
+
+enum rfds_mitigations {
+ RFDS_MITIGATION_OFF,
+ RFDS_MITIGATION_VERW,
+ RFDS_MITIGATION_UCODE_NEEDED,
+};
+
+/* Default mitigation for Register File Data Sampling */
+static enum rfds_mitigations rfds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_VERW : RFDS_MITIGATION_OFF;
+
+static const char * const rfds_strings[] = {
+ [RFDS_MITIGATION_OFF] = "Vulnerable",
+ [RFDS_MITIGATION_VERW] = "Mitigation: Clear Register File",
+ [RFDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+};
+
+static void __init rfds_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) {
+ rfds_mitigation = RFDS_MITIGATION_OFF;
+ return;
+ }
+ if (rfds_mitigation == RFDS_MITIGATION_OFF)
+ return;
+
+ if (x86_read_arch_cap_msr() & ARCH_CAP_RFDS_CLEAR)
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ else
+ rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED;
+}
+
+static __init int rfds_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!boot_cpu_has_bug(X86_BUG_RFDS))
+ return 0;
+
+ if (!strcmp(str, "off"))
+ rfds_mitigation = RFDS_MITIGATION_OFF;
+ else if (!strcmp(str, "on"))
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+
+ return 0;
+}
+early_param("reg_file_data_sampling", rfds_parse_cmdline);
+
+#undef pr_fmt
#define pr_fmt(fmt) "" fmt
static void __init md_clear_update_mitigation(void)
@@ -498,11 +556,19 @@ static void __init md_clear_update_mitigation(void)
taa_mitigation = TAA_MITIGATION_VERW;
taa_select_mitigation();
}
- if (mmio_mitigation == MMIO_MITIGATION_OFF &&
- boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) {
+ /*
+ * MMIO_MITIGATION_OFF is not checked here so that mmio_stale_data_clear
+ * gets updated correctly as per X86_FEATURE_CLEAR_CPU_BUF state.
+ */
+ if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) {
mmio_mitigation = MMIO_MITIGATION_VERW;
mmio_select_mitigation();
}
+ if (rfds_mitigation == RFDS_MITIGATION_OFF &&
+ boot_cpu_has_bug(X86_BUG_RFDS)) {
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+ rfds_select_mitigation();
+ }
out:
if (boot_cpu_has_bug(X86_BUG_MDS))
pr_info("MDS: %s\n", mds_strings[mds_mitigation]);
@@ -512,6 +578,8 @@ out:
pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]);
else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN))
pr_info("MMIO Stale Data: Unknown: No mitigations\n");
+ if (boot_cpu_has_bug(X86_BUG_RFDS))
+ pr_info("Register File Data Sampling: %s\n", rfds_strings[rfds_mitigation]);
}
static void __init md_clear_select_mitigation(void)
@@ -519,11 +587,12 @@ static void __init md_clear_select_mitigation(void)
mds_select_mitigation();
taa_select_mitigation();
mmio_select_mitigation();
+ rfds_select_mitigation();
/*
- * As MDS, TAA and MMIO Stale Data mitigations are inter-related, update
- * and print their mitigation after MDS, TAA and MMIO Stale Data
- * mitigation selection is done.
+ * As these mitigations are inter-related and rely on VERW instruction
+ * to clear the microarchitural buffers, update and print their status
+ * after mitigation selection is done for each of these vulnerabilities.
*/
md_clear_update_mitigation();
}
@@ -668,7 +737,7 @@ enum gds_mitigations {
GDS_MITIGATION_HYPERVISOR,
};
-#if IS_ENABLED(CONFIG_GDS_FORCE_MITIGATION)
+#if IS_ENABLED(CONFIG_MITIGATION_GDS_FORCE)
static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE;
#else
static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL;
@@ -979,10 +1048,10 @@ static void __init retbleed_select_mitigation(void)
return;
case RETBLEED_CMD_UNRET:
- if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) {
+ if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) {
retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
} else {
- pr_err("WARNING: kernel not compiled with CPU_UNRET_ENTRY.\n");
+ pr_err("WARNING: kernel not compiled with MITIGATION_UNRET_ENTRY.\n");
goto do_cmd_auto;
}
break;
@@ -991,24 +1060,24 @@ static void __init retbleed_select_mitigation(void)
if (!boot_cpu_has(X86_FEATURE_IBPB)) {
pr_err("WARNING: CPU does not support IBPB.\n");
goto do_cmd_auto;
- } else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) {
+ } else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
} else {
- pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n");
+ pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
goto do_cmd_auto;
}
break;
case RETBLEED_CMD_STUFF:
- if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING) &&
+ if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) &&
spectre_v2_enabled == SPECTRE_V2_RETPOLINE) {
retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
} else {
- if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING))
+ if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING))
pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n");
else
- pr_err("WARNING: kernel not compiled with CALL_DEPTH_TRACKING.\n");
+ pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n");
goto do_cmd_auto;
}
@@ -1018,9 +1087,10 @@ do_cmd_auto:
case RETBLEED_CMD_AUTO:
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
- if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY))
+ if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY))
retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
- else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY) && boot_cpu_has(X86_FEATURE_IBPB))
+ else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) &&
+ boot_cpu_has(X86_FEATURE_IBPB))
retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
}
@@ -1099,7 +1169,7 @@ static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init =
static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init =
SPECTRE_V2_USER_NONE;
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
static bool spectre_v2_bad_module;
bool retpoline_module_ok(bool has_retpoline)
@@ -1412,7 +1482,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC ||
cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
- !IS_ENABLED(CONFIG_RETPOLINE)) {
+ !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) {
pr_err("%s selected but not compiled in. Switching to AUTO select\n",
mitigation_options[i].option);
return SPECTRE_V2_CMD_AUTO;
@@ -1435,7 +1505,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
return SPECTRE_V2_CMD_AUTO;
}
- if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_CPU_IBRS_ENTRY)) {
+ if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) {
pr_err("%s selected but not compiled in. Switching to AUTO select\n",
mitigation_options[i].option);
return SPECTRE_V2_CMD_AUTO;
@@ -1466,7 +1536,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void)
{
- if (!IS_ENABLED(CONFIG_RETPOLINE)) {
+ if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) {
pr_err("Kernel not compiled with retpoline; no mitigation available!");
return SPECTRE_V2_NONE;
}
@@ -1561,7 +1631,7 @@ static void __init spectre_v2_select_mitigation(void)
break;
}
- if (IS_ENABLED(CONFIG_CPU_IBRS_ENTRY) &&
+ if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) &&
boot_cpu_has_bug(X86_BUG_RETBLEED) &&
retbleed_cmd != RETBLEED_CMD_OFF &&
retbleed_cmd != RETBLEED_CMD_STUFF &&
@@ -2454,7 +2524,7 @@ static void __init srso_select_mitigation(void)
break;
case SRSO_CMD_SAFE_RET:
- if (IS_ENABLED(CONFIG_CPU_SRSO)) {
+ if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) {
/*
* Enable the return thunk for generated code
* like ftrace, static_call, etc.
@@ -2474,29 +2544,29 @@ static void __init srso_select_mitigation(void)
else
srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED;
} else {
- pr_err("WARNING: kernel not compiled with CPU_SRSO.\n");
+ pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n");
}
break;
case SRSO_CMD_IBPB:
- if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) {
+ if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
if (has_microcode) {
setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
srso_mitigation = SRSO_MITIGATION_IBPB;
}
} else {
- pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n");
+ pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
}
break;
case SRSO_CMD_IBPB_ON_VMEXIT:
- if (IS_ENABLED(CONFIG_CPU_SRSO)) {
+ if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) {
if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) {
setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
}
} else {
- pr_err("WARNING: kernel not compiled with CPU_SRSO.\n");
+ pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n");
}
break;
}
@@ -2612,6 +2682,11 @@ static ssize_t mmio_stale_data_show_state(char *buf)
sched_smt_active() ? "vulnerable" : "disabled");
}
+static ssize_t rfds_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]);
+}
+
static char *stibp_state(void)
{
if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
@@ -2771,6 +2846,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_GDS:
return gds_show_state(buf);
+ case X86_BUG_RFDS:
+ return rfds_show_state(buf);
+
default:
break;
}
@@ -2845,4 +2923,14 @@ ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *bu
{
return cpu_show_common(dev, attr, buf, X86_BUG_GDS);
}
+
+ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_RFDS);
+}
#endif
+
+void __warn_thunk(void)
+{
+ WARN_ONCE(1, "Unpatched return thunk in use. This should not happen!\n");
+}
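
The RFDS code added to bugs.c above selects a mitigation in a fixed order: the bug flag and global mitigations=off can force it off, the cmdline can override either way, and otherwise VERW is used when the microcode advertises ARCH_CAP_RFDS_CLEAR, falling back to the "microcode needed" state. A stand-alone sketch modelling that decision as a pure function over the relevant inputs (names mirror the patch, but this is not the kernel code):

#include <stdbool.h>
#include <stdio.h>

enum rfds_mitigations {
	RFDS_MITIGATION_OFF,
	RFDS_MITIGATION_VERW,
	RFDS_MITIGATION_UCODE_NEEDED,
};

static enum rfds_mitigations
rfds_select(bool bug_rfds, bool mitigations_off, bool cmdline_off,
	    bool ucode_has_rfds_clear)
{
	if (!bug_rfds || mitigations_off)
		return RFDS_MITIGATION_OFF;
	if (cmdline_off)
		return RFDS_MITIGATION_OFF;
	/* Default with CONFIG_MITIGATION_RFDS=y is VERW... */
	return ucode_has_rfds_clear ? RFDS_MITIGATION_VERW
				    : RFDS_MITIGATION_UCODE_NEEDED;
}

int main(void)
{
	/* Affected CPU, mitigations enabled, but old microcode: */
	printf("%d\n", rfds_select(true, false, false, false));
	/* Prints 2 == RFDS_MITIGATION_UCODE_NEEDED. */
	return 0;
}
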
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index f2241e7e96fd..392d09c936d6 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -1118,15 +1118,16 @@ static void cache_cpu_init(void)
unsigned long flags;
local_irq_save(flags);
- cache_disable();
- if (memory_caching_control & CACHE_MTRR)
+ if (memory_caching_control & CACHE_MTRR) {
+ cache_disable();
mtrr_generic_set_state();
+ cache_enable();
+ }
if (memory_caching_control & CACHE_PAT)
pat_cpu_init();
- cache_enable();
local_irq_restore(flags);
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c44e6f0c8972..ba8cf5e9ce56 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -71,6 +71,9 @@
#include "cpu.h"
+DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
+EXPORT_PER_CPU_SYMBOL(cpu_info);
+
u32 elf_hwcap2 __read_mostly;
/* Number of siblings per CPU package */
@@ -1221,6 +1224,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define SRSO BIT(5)
/* CPU is affected by GDS */
#define GDS BIT(6)
+/* CPU is affected by Register File Data Sampling */
+#define RFDS BIT(7)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
@@ -1248,9 +1253,18 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS),
VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS),
+ VULNBL_INTEL_STEPPINGS(ALDERLAKE, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(ALDERLAKE_L, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(RAPTORLAKE, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(RAPTORLAKE_P, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(RAPTORLAKE_S, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_GRACEMONT, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO | RFDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT_D, X86_STEPPING_ANY, RFDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT_PLUS, X86_STEPPING_ANY, RFDS),
VULNBL_AMD(0x15, RETBLEED),
VULNBL_AMD(0x16, RETBLEED),
@@ -1284,6 +1298,24 @@ static bool arch_cap_mmio_immune(u64 ia32_cap)
ia32_cap & ARCH_CAP_SBDR_SSDP_NO);
}
+static bool __init vulnerable_to_rfds(u64 ia32_cap)
+{
+ /* The "immunity" bit trumps everything else: */
+ if (ia32_cap & ARCH_CAP_RFDS_NO)
+ return false;
+
+ /*
+ * VMMs set ARCH_CAP_RFDS_CLEAR for processors not in the blacklist to
+ * indicate that mitigation is needed because the guest is running on
+ * vulnerable hardware or may migrate to such hardware:
+ */
+ if (ia32_cap & ARCH_CAP_RFDS_CLEAR)
+ return true;
+
+ /* Only consult the blacklist when there is no enumeration: */
+ return cpu_matches(cpu_vuln_blacklist, RFDS);
+}
+
static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
{
u64 ia32_cap = x86_read_arch_cap_msr();
@@ -1309,8 +1341,13 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
/*
* AMD's AutoIBRS is equivalent to Intel's eIBRS - use the Intel feature
* flag and protect from vendor-specific bugs via the whitelist.
+ *
+ * Don't use AutoIBRS when SNP is enabled because it degrades host
+ * userspace indirect branch performance.
*/
- if ((ia32_cap & ARCH_CAP_IBRS_ALL) || cpu_has(c, X86_FEATURE_AUTOIBRS)) {
+ if ((ia32_cap & ARCH_CAP_IBRS_ALL) ||
+ (cpu_has(c, X86_FEATURE_AUTOIBRS) &&
+ !cpu_feature_enabled(X86_FEATURE_SEV_SNP))) {
setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) &&
!(ia32_cap & ARCH_CAP_PBRSB_NO))
@@ -1395,6 +1432,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
boot_cpu_has(X86_FEATURE_AVX))
setup_force_cpu_bug(X86_BUG_GDS);
+ if (vulnerable_to_rfds(ia32_cap))
+ setup_force_cpu_bug(X86_BUG_RFDS);
+
if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
@@ -1848,8 +1888,6 @@ static void identify_cpu(struct cpuinfo_x86 *c)
/* Init Machine Check Exception if available. */
mcheck_cpu_init(c);
- select_idle_routine(c);
-
#ifdef CONFIG_NUMA
numa_add_cpu(smp_processor_id());
#endif
@@ -1959,6 +1997,7 @@ DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = {
.top_of_stack = TOP_OF_INIT_STACK,
};
EXPORT_PER_CPU_SYMBOL(pcpu_hot);
+EXPORT_PER_CPU_SYMBOL(const_pcpu_hot);
#ifdef CONFIG_X86_64
DEFINE_PER_CPU_FIRST(struct fixed_percpu_data,
@@ -2270,6 +2309,8 @@ void __init arch_cpu_finalize_init(void)
{
identify_boot_cpu();
+ select_idle_routine();
+
/*
* identify_boot_cpu() initialized SMT support information, let the
* core code know.
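
The precedence in vulnerable_to_rfds() above is strict: the ARCH_CAP_RFDS_NO immunity bit wins outright, a VMM-set ARCH_CAP_RFDS_CLEAR then forces the bug bit so guests that may migrate to affected hosts mitigate, and the model blacklist is consulted only when neither bit is enumerated. A sketch of that ordering; the bit positions used here are placeholders, not taken from the patch:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative bit positions, not authoritative MSR layout. */
#define ARCH_CAP_RFDS_NO	(1ULL << 27)
#define ARCH_CAP_RFDS_CLEAR	(1ULL << 28)

static bool on_blacklist;	/* stand-in for cpu_matches(..., RFDS) */

static bool vulnerable_to_rfds(uint64_t ia32_cap)
{
	/* The "immunity" bit trumps everything else. */
	if (ia32_cap & ARCH_CAP_RFDS_NO)
		return false;

	/* A VMM setting RFDS_CLEAR means "mitigate": the guest may run
	 * on, or migrate to, affected hardware. */
	if (ia32_cap & ARCH_CAP_RFDS_CLEAR)
		return true;

	/* Only consult the model blacklist with no enumeration at all. */
	return on_blacklist;
}

int main(void)
{
	/* Both bits set: immunity still wins, prints 0. */
	printf("%d\n", vulnerable_to_rfds(ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR));
	return 0;
}
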
diff --git a/arch/x86/kernel/cpu/intel_pconfig.c b/arch/x86/kernel/cpu/intel_pconfig.c
index 0771a905b286..5be2b1790282 100644
--- a/arch/x86/kernel/cpu/intel_pconfig.c
+++ b/arch/x86/kernel/cpu/intel_pconfig.c
@@ -7,6 +7,8 @@
* Author:
* Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
*/
+#include <linux/bug.h>
+#include <linux/limits.h>
#include <asm/cpufeature.h>
#include <asm/intel_pconfig.h>
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 04acdc3534c8..b5cc557cfc37 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -2457,7 +2457,7 @@ static void mce_enable_ce(void *all)
__mcheck_cpu_init_timer();
}
-static struct bus_type mce_subsys = {
+static const struct bus_type mce_subsys = {
.name = "machinecheck",
.dev_name = "machinecheck",
};
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index d3524778a545..422a4ddc2ab7 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -108,6 +108,9 @@ static inline void k8_check_syscfg_dram_mod_en(void)
(boot_cpu_data.x86 >= 0x0f)))
return;
+ if (cpu_feature_enabled(X86_FEATURE_SEV_SNP))
+ return;
+
rdmsr(MSR_AMD64_SYSCFG, lo, hi);
if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index 26a427fa84ea..eeac00d20926 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -6,6 +6,7 @@
* Authors: Fenghua Yu <fenghua.yu@intel.com>,
* H. Peter Anvin <hpa@linux.intel.com>
*/
+#include <linux/printk.h>
#include <asm/processor.h>
#include <asm/archrandom.h>
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 19e0681f0435..83e40341583e 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -16,6 +16,7 @@
#define pr_fmt(fmt) "resctrl: " fmt
+#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/cacheinfo.h>
@@ -25,8 +26,15 @@
#include <asm/resctrl.h>
#include "internal.h"
-/* Mutex to protect rdtgroup access. */
-DEFINE_MUTEX(rdtgroup_mutex);
+/*
+ * rdt_domain structures are kfree()d when their last CPU goes offline,
+ * and allocated when the first CPU in a new domain comes online.
+ * The rdt_resource's domain list is updated when this happens. Readers of
+ * the domain list must either take cpus_read_lock(), or rely on an RCU
+ * read-side critical section, to avoid observing concurrent modification.
+ * All writers take this mutex:
+ */
+static DEFINE_MUTEX(domain_list_lock);
/*
* The cached resctrl_pqr_state is strictly per CPU and can never be
@@ -136,15 +144,15 @@ static inline void cache_alloc_hsw_probe(void)
{
struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_L3];
struct rdt_resource *r = &hw_res->r_resctrl;
- u32 l, h, max_cbm = BIT_MASK(20) - 1;
+ u64 max_cbm = BIT_ULL_MASK(20) - 1, l3_cbm_0;
- if (wrmsr_safe(MSR_IA32_L3_CBM_BASE, max_cbm, 0))
+ if (wrmsrl_safe(MSR_IA32_L3_CBM_BASE, max_cbm))
return;
- rdmsr(MSR_IA32_L3_CBM_BASE, l, h);
+ rdmsrl(MSR_IA32_L3_CBM_BASE, l3_cbm_0);
/* If all the bits were set in MSR, return success */
- if (l != max_cbm)
+ if (l3_cbm_0 != max_cbm)
return;
hw_res->num_closid = 4;
@@ -231,9 +239,7 @@ static bool __get_mem_config_intel(struct rdt_resource *r)
static bool __rdt_get_mem_config_amd(struct rdt_resource *r)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- union cpuid_0x10_3_eax eax;
- union cpuid_0x10_x_edx edx;
- u32 ebx, ecx, subleaf;
+ u32 eax, ebx, ecx, edx, subleaf;
/*
* Query CPUID_Fn80000020_EDX_x01 for MBA and
@@ -241,9 +247,9 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r)
*/
subleaf = (r->rid == RDT_RESOURCE_SMBA) ? 2 : 1;
- cpuid_count(0x80000020, subleaf, &eax.full, &ebx, &ecx, &edx.full);
- hw_res->num_closid = edx.split.cos_max + 1;
- r->default_ctrl = MAX_MBA_BW_AMD;
+ cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx);
+ hw_res->num_closid = edx + 1;
+ r->default_ctrl = 1 << eax;
/* AMD does not use delay */
r->membw.delay_linear = false;
@@ -512,6 +518,8 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
struct rdt_domain *d;
int err;
+ lockdep_assert_held(&domain_list_lock);
+
d = rdt_find_domain(r, id, &add_pos);
if (IS_ERR(d)) {
pr_warn("Couldn't find cache id for CPU %d\n", cpu);
@@ -545,11 +553,12 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
return;
}
- list_add_tail(&d->list, add_pos);
+ list_add_tail_rcu(&d->list, add_pos);
err = resctrl_online_domain(r, d);
if (err) {
- list_del(&d->list);
+ list_del_rcu(&d->list);
+ synchronize_rcu();
domain_free(hw_dom);
}
}
@@ -560,6 +569,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
struct rdt_hw_domain *hw_dom;
struct rdt_domain *d;
+ lockdep_assert_held(&domain_list_lock);
+
d = rdt_find_domain(r, id, NULL);
if (IS_ERR_OR_NULL(d)) {
pr_warn("Couldn't find cache id for CPU %d\n", cpu);
@@ -570,7 +581,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
cpumask_clear_cpu(cpu, &d->cpu_mask);
if (cpumask_empty(&d->cpu_mask)) {
resctrl_offline_domain(r, d);
- list_del(&d->list);
+ list_del_rcu(&d->list);
+ synchronize_rcu();
/*
* rdt_domain "d" is going to be freed below, so clear
@@ -582,73 +594,47 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
return;
}
-
- if (r == &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl) {
- if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
- cancel_delayed_work(&d->mbm_over);
- mbm_setup_overflow_handler(d, 0);
- }
- if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu &&
- has_busy_rmid(r, d)) {
- cancel_delayed_work(&d->cqm_limbo);
- cqm_setup_limbo_handler(d, 0);
- }
- }
}
static void clear_closid_rmid(int cpu)
{
struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);
- state->default_closid = 0;
- state->default_rmid = 0;
- state->cur_closid = 0;
- state->cur_rmid = 0;
- wrmsr(MSR_IA32_PQR_ASSOC, 0, 0);
+ state->default_closid = RESCTRL_RESERVED_CLOSID;
+ state->default_rmid = RESCTRL_RESERVED_RMID;
+ state->cur_closid = RESCTRL_RESERVED_CLOSID;
+ state->cur_rmid = RESCTRL_RESERVED_RMID;
+ wrmsr(MSR_IA32_PQR_ASSOC, RESCTRL_RESERVED_RMID,
+ RESCTRL_RESERVED_CLOSID);
}
-static int resctrl_online_cpu(unsigned int cpu)
+static int resctrl_arch_online_cpu(unsigned int cpu)
{
struct rdt_resource *r;
- mutex_lock(&rdtgroup_mutex);
+ mutex_lock(&domain_list_lock);
for_each_capable_rdt_resource(r)
domain_add_cpu(cpu, r);
- /* The cpu is set in default rdtgroup after online. */
- cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
+ mutex_unlock(&domain_list_lock);
+
clear_closid_rmid(cpu);
- mutex_unlock(&rdtgroup_mutex);
+ resctrl_online_cpu(cpu);
return 0;
}
-static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
-{
- struct rdtgroup *cr;
-
- list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
- if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) {
- break;
- }
- }
-}
-
-static int resctrl_offline_cpu(unsigned int cpu)
+static int resctrl_arch_offline_cpu(unsigned int cpu)
{
- struct rdtgroup *rdtgrp;
struct rdt_resource *r;
- mutex_lock(&rdtgroup_mutex);
+ resctrl_offline_cpu(cpu);
+
+ mutex_lock(&domain_list_lock);
for_each_capable_rdt_resource(r)
domain_remove_cpu(cpu, r);
- list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
- if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
- clear_childcpus(rdtgrp, cpu);
- break;
- }
- }
+ mutex_unlock(&domain_list_lock);
+
clear_closid_rmid(cpu);
- mutex_unlock(&rdtgroup_mutex);
return 0;
}
@@ -968,7 +954,8 @@ static int __init resctrl_late_init(void)
state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
"x86/resctrl/cat:online:",
- resctrl_online_cpu, resctrl_offline_cpu);
+ resctrl_arch_online_cpu,
+ resctrl_arch_offline_cpu);
if (state < 0)
return state;
@@ -992,8 +979,14 @@ late_initcall(resctrl_late_init);
static void __exit resctrl_exit(void)
{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+
cpuhp_remove_state(rdt_online);
+
rdtgroup_exit();
+
+ if (r->mon_capable)
+ rdt_put_mon_l3_config();
}
__exitcall(resctrl_exit);
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index beccb0e87ba7..7997b47743a2 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -19,6 +19,8 @@
#include <linux/kernfs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
+#include <linux/tick.h>
+
#include "internal.h"
/*
@@ -210,6 +212,9 @@ static int parse_line(char *line, struct resctrl_schema *s,
struct rdt_domain *d;
unsigned long dom_id;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
(r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) {
rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
@@ -314,6 +319,9 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
struct rdt_domain *d;
u32 idx;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
return -ENOMEM;
@@ -379,11 +387,9 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
return -EINVAL;
buf[nbytes - 1] = '\0';
- cpus_read_lock();
rdtgrp = rdtgroup_kn_lock_live(of->kn);
if (!rdtgrp) {
rdtgroup_kn_unlock(of->kn);
- cpus_read_unlock();
return -ENOENT;
}
rdt_last_cmd_clear();
@@ -445,7 +451,6 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
out:
rdt_staged_configs_clear();
rdtgroup_kn_unlock(of->kn);
- cpus_read_unlock();
return ret ?: nbytes;
}
@@ -465,6 +470,9 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo
bool sep = false;
u32 ctrl_val;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
seq_printf(s, "%*s:", max_name_width, schema->name);
list_for_each_entry(dom, &r->domains, list) {
if (sep)
@@ -522,12 +530,24 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
return ret;
}
+static int smp_mon_event_count(void *arg)
+{
+ mon_event_count(arg);
+
+ return 0;
+}
+
void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
struct rdt_domain *d, struct rdtgroup *rdtgrp,
int evtid, int first)
{
+ int cpu;
+
+ /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
/*
- * setup the parameters to send to the IPI to read the data.
+ * Set up the parameters to pass to mon_event_count() to read the data.
*/
rr->rgrp = rdtgrp;
rr->evtid = evtid;
@@ -535,8 +555,26 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
rr->d = d;
rr->val = 0;
rr->first = first;
+ rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid);
+ if (IS_ERR(rr->arch_mon_ctx)) {
+ rr->err = -EINVAL;
+ return;
+ }
+
+ cpu = cpumask_any_housekeeping(&d->cpu_mask, RESCTRL_PICK_ANY_CPU);
+
+ /*
+ * cpumask_any_housekeeping() prefers housekeeping CPUs, but if
+ * every CPU is nohz_full, one of them must be picked and IPIed.
+ * MPAM's resctrl_arch_rmid_read() is unable to read the
+ * counters on some platforms if it is called in IRQ context.
+ */
+ if (tick_nohz_full_cpu(cpu))
+ smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1);
+ else
+ smp_call_on_cpu(cpu, smp_mon_event_count, rr, false);
- smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1);
+ resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx);
}
int rdtgroup_mondata_show(struct seq_file *m, void *arg)
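
The dispatch choice added to mon_event_read() above prefers smp_call_on_cpu(), which runs the read in a kworker where a counter read that may sleep (as the comment notes for MPAM) is safe, and falls back to the IPI-based smp_call_function_any() only when the picked CPU is nohz_full. A toy model of that decision, with stub functions standing in for the kernel primitives:

#include <stdbool.h>
#include <stdio.h>

static void read_counters(void *arg) { (void)arg; /* mon_event_count() */ }

static void ipi_call(void (*fn)(void *), void *arg)
{
	puts("IPI: fn runs in IRQ context");
	fn(arg);
}

static void worker_call(void (*fn)(void *), void *arg)
{
	puts("worker: fn may sleep");
	fn(arg);
}

int main(void)
{
	bool picked_cpu_is_nohz_full = false;	/* a housekeeping CPU was found */

	if (picked_cpu_is_nohz_full)
		ipi_call(read_counters, NULL);
	else
		worker_call(read_counters, NULL);
	return 0;
}
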
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index a4f1aa15f0a2..c99f26ebe7a6 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -7,6 +7,9 @@
#include <linux/kernfs.h>
#include <linux/fs_context.h>
#include <linux/jump_label.h>
+#include <linux/tick.h>
+
+#include <asm/resctrl.h>
#define L3_QOS_CDP_ENABLE 0x01ULL
@@ -18,7 +21,6 @@
#define MBM_OVERFLOW_INTERVAL 1000
#define MAX_MBA_BW 100u
#define MBA_IS_LINEAR 0x4
-#define MAX_MBA_BW_AMD 0x800
#define MBM_CNTR_WIDTH_OFFSET_AMD 20
#define RMID_VAL_ERROR BIT_ULL(63)
@@ -54,6 +56,46 @@
/* Max event bits supported */
#define MAX_EVT_CONFIG_BITS GENMASK(6, 0)
+/**
+ * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that
+ * aren't marked nohz_full
+ * @mask: The mask to pick a CPU from.
+ * @exclude_cpu: The CPU to avoid picking.
+ *
+ * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping
+ * CPUs that don't use nohz_full, these are preferred. Pass
+ * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs.
+ *
+ * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available.
+ */
+static inline unsigned int
+cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu)
+{
+ unsigned int cpu, hk_cpu;
+
+ if (exclude_cpu == RESCTRL_PICK_ANY_CPU)
+ cpu = cpumask_any(mask);
+ else
+ cpu = cpumask_any_but(mask, exclude_cpu);
+
+ if (!IS_ENABLED(CONFIG_NO_HZ_FULL))
+ return cpu;
+
+ /* If the CPU picked isn't marked nohz_full, nothing more needs doing. */
+ if (cpu < nr_cpu_ids && !tick_nohz_full_cpu(cpu))
+ return cpu;
+
+ /* Try to find a CPU that isn't nohz_full to use in preference */
+ hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask);
+ if (hk_cpu == exclude_cpu)
+ hk_cpu = cpumask_nth_andnot(1, mask, tick_nohz_full_mask);
+
+ if (hk_cpu < nr_cpu_ids)
+ cpu = hk_cpu;
+
+ return cpu;
+}
+
struct rdt_fs_context {
struct kernfs_fs_context kfc;
bool enable_cdpl2;
@@ -69,9 +111,6 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
return container_of(kfc, struct rdt_fs_context, kfc);
}
-DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
-DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
-
/**
* struct mon_evt - Entry in the event list of a resource
* @evtid: event id
@@ -112,12 +151,12 @@ struct rmid_read {
bool first;
int err;
u64 val;
+ void *arch_mon_ctx;
};
-extern bool rdt_alloc_capable;
-extern bool rdt_mon_capable;
extern unsigned int rdt_mon_features;
extern struct list_head resctrl_schema_all;
+extern bool resctrl_mounted;
enum rdt_group_type {
RDTCTRL_GROUP = 0,
@@ -296,14 +335,10 @@ struct rftype {
* struct mbm_state - status for each MBM counter in each domain
* @prev_bw_bytes: Previous bytes value read for bandwidth calculation
* @prev_bw: The most recent bandwidth in MBps
- * @delta_bw: Difference between the current and previous bandwidth
- * @delta_comp: Indicates whether to compute the delta_bw
*/
struct mbm_state {
u64 prev_bw_bytes;
u32 prev_bw;
- u32 delta_bw;
- bool delta_comp;
};
/**
@@ -395,6 +430,8 @@ struct rdt_parse_data {
* @msr_update: Function pointer to update QOS MSRs
* @mon_scale: cqm counter * mon_scale = occupancy in bytes
* @mbm_width: Monitor width, to detect and correct for overflow.
+ * @mbm_cfg_mask: Bandwidth sources that can be tracked when Bandwidth
+ * Monitoring Event Configuration (BMEC) is supported.
* @cdp_enabled: CDP state of this resource
*
* Members of this structure are either private to the architecture
@@ -409,6 +446,7 @@ struct rdt_hw_resource {
struct rdt_resource *r);
unsigned int mon_scale;
unsigned int mbm_width;
+ unsigned int mbm_cfg_mask;
bool cdp_enabled;
};
@@ -426,8 +464,6 @@ extern struct mutex rdtgroup_mutex;
extern struct rdt_hw_resource rdt_resources_all[];
extern struct rdtgroup rdtgroup_default;
-DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
-
extern struct dentry *debugfs_resctrl;
enum resctrl_res_level {
@@ -543,9 +579,10 @@ void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r);
int closids_supported(void);
void closid_free(int closid);
-int alloc_rmid(void);
-void free_rmid(u32 rmid);
+int alloc_rmid(u32 closid);
+void free_rmid(u32 closid, u32 rmid);
int rdt_get_mon_l3_config(struct rdt_resource *r);
+void __exit rdt_put_mon_l3_config(void);
bool __init rdt_cpu_has(int flag);
void mon_event_count(void *info);
int rdtgroup_mondata_show(struct seq_file *m, void *arg);
@@ -553,17 +590,21 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
struct rdt_domain *d, struct rdtgroup *rdtgrp,
int evtid, int first);
void mbm_setup_overflow_handler(struct rdt_domain *dom,
- unsigned long delay_ms);
+ unsigned long delay_ms,
+ int exclude_cpu);
void mbm_handle_overflow(struct work_struct *work);
void __init intel_rdt_mbm_apply_quirk(void);
bool is_mba_sc(struct rdt_resource *r);
-void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
+void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms,
+ int exclude_cpu);
void cqm_handle_limbo(struct work_struct *work);
-bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
+bool has_busy_rmid(struct rdt_domain *d);
void __check_limbo(struct rdt_domain *d, bool force_free);
void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
void __init thread_throttle_mode_init(void);
void __init mbm_config_rftype_init(const char *config);
void rdt_staged_configs_clear(void);
+bool closid_allocated(unsigned int closid);
+int resctrl_find_cleanest_closid(void);
#endif /* _ASM_X86_RESCTRL_INTERNAL_H */
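
cpumask_any_housekeeping() above prefers CPUs outside tick_nohz_full_mask and can legitimately return a value >= nr_cpu_ids when @exclude_cpu removes the last candidate, which is why the limbo/overflow schedulers later guard schedule_delayed_work_on() with a cpu < nr_cpu_ids check. A user-space model with cpumasks reduced to plain bitmasks, approximating the kernel helpers rather than reproducing them:

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS		8
#define PICK_ANY	(-1)

static int pick(uint32_t mask, uint32_t nohz_full, int exclude_cpu)
{
	int cpu, first = NR_CPUS;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!(mask & (1u << cpu)) || cpu == exclude_cpu)
			continue;
		if (first == NR_CPUS)
			first = cpu;		/* any eligible CPU */
		if (!(nohz_full & (1u << cpu)))
			return cpu;		/* housekeeping preferred */
	}
	return first;	/* may be >= NR_CPUS: caller must check */
}

int main(void)
{
	/* CPUs 0-3 in the domain, 0-2 are nohz_full, exclude CPU 3. */
	int cpu = pick(0x0f, 0x07, 3);

	if (cpu < NR_CPUS)
		printf("schedule work on CPU %d\n", cpu);
	else
		printf("no eligible CPU\n");
	return 0;
}
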
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index f136ac046851..c34a35ec0f03 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -15,6 +15,7 @@
* Software Developer Manual June 2016, volume 3, section 17.17.
*/
+#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/sizes.h>
#include <linux/slab.h>
@@ -24,7 +25,20 @@
#include "internal.h"
+/**
+ * struct rmid_entry - dirty tracking for all RMIDs.
+ * @closid: The CLOSID for this entry.
+ * @rmid: The RMID for this entry.
+ * @busy: The number of domains with cached data using this RMID.
+ * @list: Member of the rmid_free_lru list when busy == 0.
+ *
+ * Depending on the architecture the correct monitor is accessed using
+ * both @closid and @rmid, or @rmid only.
+ *
+ * Take the rdtgroup_mutex when accessing.
+ */
struct rmid_entry {
+ u32 closid;
u32 rmid;
int busy;
struct list_head list;
@@ -38,6 +52,13 @@ struct rmid_entry {
static LIST_HEAD(rmid_free_lru);
/*
+ * @closid_num_dirty_rmid - The number of dirty RMIDs each CLOSID has.
+ * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined.
+ * Indexed by CLOSID. Protected by rdtgroup_mutex.
+ */
+static u32 *closid_num_dirty_rmid;
+
+/*
* @rmid_limbo_count - count of currently unused but (potentially)
* dirty RMIDs.
* This counts RMIDs that no one is currently using but that
@@ -136,12 +157,29 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
return val;
}
-static inline struct rmid_entry *__rmid_entry(u32 rmid)
+/*
+ * x86 and arm64 differ in their handling of monitoring.
+ * x86's RMID are independent numbers, there is only one source of traffic
+ * with an RMID value of '1'.
+ * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of
+ * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID
+ * value is no longer unique.
+ * To account for this, resctrl uses an index. On x86 this is just the RMID,
+ * on arm64 it encodes the CLOSID and RMID. This gives a unique number.
+ *
+ * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code
+ * must accept an attempt to read every index.
+ */
+static inline struct rmid_entry *__rmid_entry(u32 idx)
{
struct rmid_entry *entry;
+ u32 closid, rmid;
+
+ entry = &rmid_ptrs[idx];
+ resctrl_arch_rmid_idx_decode(idx, &closid, &rmid);
- entry = &rmid_ptrs[rmid];
- WARN_ON(entry->rmid != rmid);
+ WARN_ON_ONCE(entry->closid != closid);
+ WARN_ON_ONCE(entry->rmid != rmid);
return entry;
}
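
The comment above is the key to most of the monitor.c churn: every RMID-indexed array becomes index-addressed, where x86's encode is the identity on the RMID and arm64's packs CLOSID and PMG together. A sketch of such a packed encode/decode pair; the 16/16 bit split is an arbitrary assumption, not MPAM's real PARTID/PMG widths:

#include <stdint.h>
#include <stdio.h>

static inline uint32_t idx_encode(uint32_t closid, uint32_t rmid)
{
	return (closid << 16) | (rmid & 0xffff);	/* arm64-style */
}

static inline void idx_decode(uint32_t idx, uint32_t *closid, uint32_t *rmid)
{
	*closid = idx >> 16;
	*rmid = idx & 0xffff;
}

int main(void)
{
	uint32_t closid, rmid;

	/* Two sources with RMID/PMG == 1 get distinct indexes: */
	idx_decode(idx_encode(3, 1), &closid, &rmid);
	printf("closid=%u rmid=%u\n", closid, rmid);	/* 3 1 */
	/* On x86 the encode would simply return rmid. */
	return 0;
}
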
@@ -190,7 +228,8 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom,
}
void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
- u32 rmid, enum resctrl_event_id eventid)
+ u32 unused, u32 rmid,
+ enum resctrl_event_id eventid)
{
struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
struct arch_mbm_state *am;
@@ -230,7 +269,8 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
}
int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
- u32 rmid, enum resctrl_event_id eventid, u64 *val)
+ u32 unused, u32 rmid, enum resctrl_event_id eventid,
+ u64 *val, void *ignored)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
@@ -238,6 +278,8 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
u64 msr_val, chunks;
int ret;
+ resctrl_arch_rmid_read_context_check();
+
if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
return -EINVAL;
@@ -260,6 +302,17 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
return 0;
}
+static void limbo_release_entry(struct rmid_entry *entry)
+{
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ rmid_limbo_count--;
+ list_add_tail(&entry->list, &rmid_free_lru);
+
+ if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
+ closid_num_dirty_rmid[entry->closid]--;
+}
+
/*
* Check the RMIDs that are marked as busy for this domain. If the
* reported LLC occupancy is below the threshold clear the busy bit and
@@ -269,11 +322,20 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
void __check_limbo(struct rdt_domain *d, bool force_free)
{
struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ u32 idx_limit = resctrl_arch_system_num_rmid_idx();
struct rmid_entry *entry;
- u32 crmid = 1, nrmid;
+ u32 idx, cur_idx = 1;
+ void *arch_mon_ctx;
bool rmid_dirty;
u64 val = 0;
+ arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID);
+ if (IS_ERR(arch_mon_ctx)) {
+ pr_warn_ratelimited("Failed to allocate monitor context: %ld",
+ PTR_ERR(arch_mon_ctx));
+ return;
+ }
+
/*
* Skip RMID 0 and start from RMID 1 and check all the RMIDs that
* are marked as busy for occupancy < threshold. If the occupancy
@@ -281,53 +343,125 @@ void __check_limbo(struct rdt_domain *d, bool force_free)
* RMID and move it to the free list when the counter reaches 0.
*/
for (;;) {
- nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid);
- if (nrmid >= r->num_rmid)
+ idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx);
+ if (idx >= idx_limit)
break;
- entry = __rmid_entry(nrmid);
-
- if (resctrl_arch_rmid_read(r, d, entry->rmid,
- QOS_L3_OCCUP_EVENT_ID, &val)) {
+ entry = __rmid_entry(idx);
+ if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid,
+ QOS_L3_OCCUP_EVENT_ID, &val,
+ arch_mon_ctx)) {
rmid_dirty = true;
} else {
rmid_dirty = (val >= resctrl_rmid_realloc_threshold);
}
if (force_free || !rmid_dirty) {
- clear_bit(entry->rmid, d->rmid_busy_llc);
- if (!--entry->busy) {
- rmid_limbo_count--;
- list_add_tail(&entry->list, &rmid_free_lru);
- }
+ clear_bit(idx, d->rmid_busy_llc);
+ if (!--entry->busy)
+ limbo_release_entry(entry);
}
- crmid = nrmid + 1;
+ cur_idx = idx + 1;
}
+
+ resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx);
}
-bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d)
+bool has_busy_rmid(struct rdt_domain *d)
{
- return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid;
+ u32 idx_limit = resctrl_arch_system_num_rmid_idx();
+
+ return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit;
+}
+
+static struct rmid_entry *resctrl_find_free_rmid(u32 closid)
+{
+ struct rmid_entry *itr;
+ u32 itr_idx, cmp_idx;
+
+ if (list_empty(&rmid_free_lru))
+ return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC);
+
+ list_for_each_entry(itr, &rmid_free_lru, list) {
+ /*
+ * Get the index of this free RMID, and the index it would need
+ * to be if it were used with this CLOSID.
+ * If the CLOSID is irrelevant on this architecture, the two
+ * index values are always the same on every entry and thus the
+ * very first entry will be returned.
+ */
+ itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid);
+ cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid);
+
+ if (itr_idx == cmp_idx)
+ return itr;
+ }
+
+ return ERR_PTR(-ENOSPC);
+}
+
+/**
+ * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated
+ * RMID are clean, or the CLOSID that has
+ * the most clean RMID.
+ *
+ * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID
+ * may not be able to allocate clean RMID. To avoid this the allocator will
+ * choose the CLOSID with the most clean RMID.
+ *
+ * When the CLOSID and RMID are independent numbers, the first free CLOSID will
+ * be returned.
+ */
+int resctrl_find_cleanest_closid(void)
+{
+ u32 cleanest_closid = ~0;
+ int i = 0;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
+ return -EIO;
+
+ for (i = 0; i < closids_supported(); i++) {
+ int num_dirty;
+
+ if (closid_allocated(i))
+ continue;
+
+ num_dirty = closid_num_dirty_rmid[i];
+ if (num_dirty == 0)
+ return i;
+
+ if (cleanest_closid == ~0)
+ cleanest_closid = i;
+
+ if (num_dirty < closid_num_dirty_rmid[cleanest_closid])
+ cleanest_closid = i;
+ }
+
+ if (cleanest_closid == ~0)
+ return -ENOSPC;
+
+ return cleanest_closid;
}
/*
- * As of now the RMIDs allocation is global.
- * However we keep track of which packages the RMIDs
- * are used to optimize the limbo list management.
+ * For MPAM the RMID value is not unique, and has to be considered with
+ * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which
+ * allows all domains to be managed by a single free list.
+ * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler.
*/
-int alloc_rmid(void)
+int alloc_rmid(u32 closid)
{
struct rmid_entry *entry;
lockdep_assert_held(&rdtgroup_mutex);
- if (list_empty(&rmid_free_lru))
- return rmid_limbo_count ? -EBUSY : -ENOSPC;
+ entry = resctrl_find_free_rmid(closid);
+ if (IS_ERR(entry))
+ return PTR_ERR(entry);
- entry = list_first_entry(&rmid_free_lru,
- struct rmid_entry, list);
list_del(&entry->list);
-
return entry->rmid;
}
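
resctrl_find_cleanest_closid() above returns the first unallocated CLOSID whose RMIDs are all clean, otherwise the unallocated one with the fewest dirty RMIDs. The scan, restated over plain arrays as a hypothetical stand-alone function:

#include <stdio.h>

static int find_cleanest(const unsigned int *num_dirty,
			 const int *allocated, int num_closid)
{
	int i, cleanest = -1;

	for (i = 0; i < num_closid; i++) {
		if (allocated[i])
			continue;
		if (num_dirty[i] == 0)
			return i;	/* fully clean: take it */
		if (cleanest < 0 || num_dirty[i] < num_dirty[cleanest])
			cleanest = i;
	}
	return cleanest;	/* -1 if nothing is free */
}

int main(void)
{
	unsigned int dirty[4] = { 0, 7, 2, 5 };
	int allocated[4] = { 1, 0, 0, 0 };	/* CLOSID 0 is in use */

	printf("%d\n", find_cleanest(dirty, allocated, 4));	/* 2 */
	return 0;
}
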
@@ -335,47 +469,50 @@ static void add_rmid_to_limbo(struct rmid_entry *entry)
{
struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
struct rdt_domain *d;
- int cpu, err;
- u64 val = 0;
+ u32 idx;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid);
entry->busy = 0;
- cpu = get_cpu();
list_for_each_entry(d, &r->domains, list) {
- if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
- err = resctrl_arch_rmid_read(r, d, entry->rmid,
- QOS_L3_OCCUP_EVENT_ID,
- &val);
- if (err || val <= resctrl_rmid_realloc_threshold)
- continue;
- }
-
/*
* For the first limbo RMID in the domain,
* setup up the limbo worker.
*/
- if (!has_busy_rmid(r, d))
- cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL);
- set_bit(entry->rmid, d->rmid_busy_llc);
+ if (!has_busy_rmid(d))
+ cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL,
+ RESCTRL_PICK_ANY_CPU);
+ set_bit(idx, d->rmid_busy_llc);
entry->busy++;
}
- put_cpu();
- if (entry->busy)
- rmid_limbo_count++;
- else
- list_add_tail(&entry->list, &rmid_free_lru);
+ rmid_limbo_count++;
+ if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
+ closid_num_dirty_rmid[entry->closid]++;
}
-void free_rmid(u32 rmid)
+void free_rmid(u32 closid, u32 rmid)
{
+ u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
struct rmid_entry *entry;
- if (!rmid)
- return;
-
lockdep_assert_held(&rdtgroup_mutex);
- entry = __rmid_entry(rmid);
+ /*
+ * Do not allow the default RMID to be freed. Comparing by index
+ * allows architectures that ignore the closid parameter to avoid an
+ * unnecessary check.
+ */
+ if (idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
+ RESCTRL_RESERVED_RMID))
+ return;
+
+ entry = __rmid_entry(idx);
if (is_llc_occupancy_enabled())
add_rmid_to_limbo(entry);
@@ -383,33 +520,36 @@ void free_rmid(u32 rmid)
list_add_tail(&entry->list, &rmid_free_lru);
}
-static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 rmid,
- enum resctrl_event_id evtid)
+static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 closid,
+ u32 rmid, enum resctrl_event_id evtid)
{
+ u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
+
switch (evtid) {
case QOS_L3_MBM_TOTAL_EVENT_ID:
- return &d->mbm_total[rmid];
+ return &d->mbm_total[idx];
case QOS_L3_MBM_LOCAL_EVENT_ID:
- return &d->mbm_local[rmid];
+ return &d->mbm_local[idx];
default:
return NULL;
}
}
-static int __mon_event_count(u32 rmid, struct rmid_read *rr)
+static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
{
struct mbm_state *m;
u64 tval = 0;
if (rr->first) {
- resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid);
- m = get_mbm_state(rr->d, rmid, rr->evtid);
+ resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid);
+ m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
if (m)
memset(m, 0, sizeof(struct mbm_state));
return 0;
}
- rr->err = resctrl_arch_rmid_read(rr->r, rr->d, rmid, rr->evtid, &tval);
+ rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, rr->evtid,
+ &tval, rr->arch_mon_ctx);
if (rr->err)
return rr->err;
@@ -421,6 +561,7 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
/*
* mbm_bw_count() - Update bw count from values previously read by
* __mon_event_count().
+ * @closid: The closid used to identify the cached mbm_state.
* @rmid: The rmid used to identify the cached mbm_state.
* @rr: The struct rmid_read populated by __mon_event_count().
*
@@ -429,9 +570,10 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
* __mon_event_count() is compared with the chunks value from the previous
* invocation. This must be called once per second to maintain values in MBps.
*/
-static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
+static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
{
- struct mbm_state *m = &rr->d->mbm_local[rmid];
+ u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
+ struct mbm_state *m = &rr->d->mbm_local[idx];
u64 cur_bw, bytes, cur_bytes;
cur_bytes = rr->val;
@@ -440,14 +582,11 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
cur_bw = bytes / SZ_1M;
- if (m->delta_comp)
- m->delta_bw = abs(cur_bw - m->prev_bw);
- m->delta_comp = false;
m->prev_bw = cur_bw;
}
/*
- * This is called via IPI to read the CQM/MBM counters
+ * This is scheduled by mon_event_read() to read the CQM/MBM counters
* on a domain.
*/
void mon_event_count(void *info)
@@ -459,7 +598,7 @@ void mon_event_count(void *info)
rdtgrp = rr->rgrp;
- ret = __mon_event_count(rdtgrp->mon.rmid, rr);
+ ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr);
/*
* For Ctrl groups read data from child monitor groups and
@@ -470,7 +609,8 @@ void mon_event_count(void *info)
if (rdtgrp->type == RDTCTRL_GROUP) {
list_for_each_entry(entry, head, mon.crdtgrp_list) {
- if (__mon_event_count(entry->mon.rmid, rr) == 0)
+ if (__mon_event_count(entry->closid, entry->mon.rmid,
+ rr) == 0)
ret = 0;
}
}
@@ -520,9 +660,9 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
{
u32 closid, rmid, cur_msr_val, new_msr_val;
struct mbm_state *pmbm_data, *cmbm_data;
- u32 cur_bw, delta_bw, user_bw;
struct rdt_resource *r_mba;
struct rdt_domain *dom_mba;
+ u32 cur_bw, user_bw, idx;
struct list_head *head;
struct rdtgroup *entry;
@@ -533,7 +673,8 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
closid = rgrp->closid;
rmid = rgrp->mon.rmid;
- pmbm_data = &dom_mbm->mbm_local[rmid];
+ idx = resctrl_arch_rmid_idx_encode(closid, rmid);
+ pmbm_data = &dom_mbm->mbm_local[idx];
dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
if (!dom_mba) {
@@ -543,7 +684,6 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
cur_bw = pmbm_data->prev_bw;
user_bw = dom_mba->mbps_val[closid];
- delta_bw = pmbm_data->delta_bw;
/* MBA resource doesn't support CDP */
cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);
@@ -555,52 +695,35 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
list_for_each_entry(entry, head, mon.crdtgrp_list) {
cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
cur_bw += cmbm_data->prev_bw;
- delta_bw += cmbm_data->delta_bw;
}
/*
* Scale up/down the bandwidth linearly for the ctrl group. The
* bandwidth step is the bandwidth granularity specified by the
* hardware.
- *
- * The delta_bw is used when increasing the bandwidth so that we
- * dont alternately increase and decrease the control values
- * continuously.
- *
- * For ex: consider cur_bw = 90MBps, user_bw = 100MBps and if
- * bandwidth step is 20MBps(> user_bw - cur_bw), we would keep
- * switching between 90 and 110 continuously if we only check
- * cur_bw < user_bw.
+ * Always increase throttling if the current bandwidth is above the
+ * target set by the user.
+ * But avoid thrashing up and down on every poll by checking
+ * whether a decrease in throttling is likely to push the group
+ * back over the target. E.g., if currently throttling to 30% of bandwidth
+ * on a system with 10% granularity steps, check whether moving to
+ * 40% would go past the limit by multiplying current bandwidth by
+ * "(30 + 10) / 30".
*/
if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
} else if (cur_msr_val < MAX_MBA_BW &&
- (user_bw > (cur_bw + delta_bw))) {
+ (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) {
new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
} else {
return;
}
resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
-
- /*
- * Delta values are updated dynamically package wise for each
- * rdtgrp every time the throttle MSR changes value.
- *
- * This is because (1)the increase in bandwidth is not perfectly
- * linear and only "approximately" linear even when the hardware
- * says it is linear.(2)Also since MBA is a core specific
- * mechanism, the delta values vary based on number of cores used
- * by the rdtgrp.
- */
- pmbm_data->delta_comp = true;
- list_for_each_entry(entry, head, mon.crdtgrp_list) {
- cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
- cmbm_data->delta_comp = true;
- }
}
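
The replacement for delta_bw above projects the bandwidth a minimum-step relaxation would allow, cur_bw * (cur_msr_val + min_bw) / cur_msr_val, and relaxes throttling only if the user target still exceeds that projection. The comment's 30%-plus-10%-steps example, worked through:

#include <stdio.h>

int main(void)
{
	unsigned int cur_msr_val = 30;	/* percent currently allowed */
	unsigned int min_bw = 10;	/* percent, smallest step */
	unsigned int cur_bw = 90;	/* MBps measured now */
	unsigned int user_bw = 100;	/* MBps requested */

	/* Project bandwidth after one step up: 90 * (30+10)/30 = 120. */
	unsigned int projected = cur_bw * (cur_msr_val + min_bw) / cur_msr_val;

	/* 100 > 120 is false, so throttling is left alone rather than
	 * bouncing between 90 and 120 MBps on every poll. */
	printf("projected=%u relax=%d\n", projected, user_bw > projected);
	return 0;
}
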
-static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
+static void mbm_update(struct rdt_resource *r, struct rdt_domain *d,
+ u32 closid, u32 rmid)
{
struct rmid_read rr;
@@ -615,12 +738,28 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
if (is_mbm_total_enabled()) {
rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
rr.val = 0;
- __mon_event_count(rmid, &rr);
+ rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
+ if (IS_ERR(rr.arch_mon_ctx)) {
+ pr_warn_ratelimited("Failed to allocate monitor context: %ld",
+ PTR_ERR(rr.arch_mon_ctx));
+ return;
+ }
+
+ __mon_event_count(closid, rmid, &rr);
+
+ resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
}
if (is_mbm_local_enabled()) {
rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
rr.val = 0;
- __mon_event_count(rmid, &rr);
+ rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
+ if (IS_ERR(rr.arch_mon_ctx)) {
+ pr_warn_ratelimited("Failed to allocate monitor context: %ld",
+ PTR_ERR(rr.arch_mon_ctx));
+ return;
+ }
+
+ __mon_event_count(closid, rmid, &rr);
/*
* Call the MBA software controller only for the
@@ -628,7 +767,9 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
* the software controller explicitly.
*/
if (is_mba_sc(NULL))
- mbm_bw_count(rmid, &rr);
+ mbm_bw_count(closid, rmid, &rr);
+
+ resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
}
}
@@ -639,106 +780,193 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
void cqm_handle_limbo(struct work_struct *work)
{
unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
- int cpu = smp_processor_id();
- struct rdt_resource *r;
struct rdt_domain *d;
+ cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
- r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
d = container_of(work, struct rdt_domain, cqm_limbo.work);
__check_limbo(d, false);
- if (has_busy_rmid(r, d))
- schedule_delayed_work_on(cpu, &d->cqm_limbo, delay);
+ if (has_busy_rmid(d)) {
+ d->cqm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask,
+ RESCTRL_PICK_ANY_CPU);
+ schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo,
+ delay);
+ }
mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
}
-void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms)
+/**
+ * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this
+ * domain.
+ * @dom: The domain the limbo handler should run for.
+ * @delay_ms: How far in the future the handler should run.
+ * @exclude_cpu: Which CPU the handler should not run on,
+ * RESCTRL_PICK_ANY_CPU to pick any CPU.
+ */
+void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms,
+ int exclude_cpu)
{
unsigned long delay = msecs_to_jiffies(delay_ms);
int cpu;
- cpu = cpumask_any(&dom->cpu_mask);
+ cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu);
dom->cqm_work_cpu = cpu;
- schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
+ if (cpu < nr_cpu_ids)
+ schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
}
void mbm_handle_overflow(struct work_struct *work)
{
unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
struct rdtgroup *prgrp, *crgrp;
- int cpu = smp_processor_id();
struct list_head *head;
struct rdt_resource *r;
struct rdt_domain *d;
+ cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
- if (!static_branch_likely(&rdt_mon_enable_key))
+ /*
+	 * If the filesystem has been unmounted, this work no longer needs
+	 * to run.
+ */
+ if (!resctrl_mounted || !resctrl_arch_mon_capable())
goto out_unlock;
r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
d = container_of(work, struct rdt_domain, mbm_over.work);
list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
- mbm_update(r, d, prgrp->mon.rmid);
+ mbm_update(r, d, prgrp->closid, prgrp->mon.rmid);
head = &prgrp->mon.crdtgrp_list;
list_for_each_entry(crgrp, head, mon.crdtgrp_list)
- mbm_update(r, d, crgrp->mon.rmid);
+ mbm_update(r, d, crgrp->closid, crgrp->mon.rmid);
if (is_mba_sc(NULL))
update_mba_bw(prgrp, d);
}
- schedule_delayed_work_on(cpu, &d->mbm_over, delay);
+ /*
+ * Re-check for housekeeping CPUs. This allows the overflow handler to
+ * move off a nohz_full CPU quickly.
+ */
+ d->mbm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask,
+ RESCTRL_PICK_ANY_CPU);
+ schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay);
out_unlock:
mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
}
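
The handler now re-selects its CPU on every run so that it can migrate off a
nohz_full CPU. cpumask_any_housekeeping() is introduced elsewhere in this
series; the sketch below is a userspace analogue of its assumed semantics
(prefer a housekeeping CPU from the mask, fall back to any CPU, never return
exclude_cpu). The masks and CPU count are hypothetical:

#include <stdio.h>

#define NR_CPUS		8
#define PICK_ANY	-1

static int pick_work_cpu(unsigned int domain_mask, unsigned int hk_mask,
			 int exclude_cpu)
{
	int cpu, fallback = NR_CPUS;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (cpu == exclude_cpu || !(domain_mask & (1U << cpu)))
			continue;
		if (hk_mask & (1U << cpu))
			return cpu;		/* housekeeping CPU wins */
		if (fallback == NR_CPUS)
			fallback = cpu;		/* remember a nohz_full CPU */
	}
	return fallback;			/* NR_CPUS: nothing suitable */
}

int main(void)
{
	/* CPUs 2 and 3 are in the domain; CPU 3 is nohz_full. */
	printf("%d\n", pick_work_cpu(0x0c, 0xf7, PICK_ANY));	/* 2 */
	printf("%d\n", pick_work_cpu(0x0c, 0xf7, 2));		/* 3 */
	return 0;
}
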
-void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms)
+/**
+ * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this
+ * domain.
+ * @dom: The domain the overflow handler should run for.
+ * @delay_ms: How far in the future the handler should run.
+ * @exclude_cpu: Which CPU the handler should not run on,
+ * RESCTRL_PICK_ANY_CPU to pick any CPU.
+ */
+void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms,
+ int exclude_cpu)
{
unsigned long delay = msecs_to_jiffies(delay_ms);
int cpu;
- if (!static_branch_likely(&rdt_mon_enable_key))
+ /*
+	 * When a domain comes online, there is no guarantee the filesystem is
+ * mounted. If not, there is no need to catch counter overflow.
+ */
+ if (!resctrl_mounted || !resctrl_arch_mon_capable())
return;
- cpu = cpumask_any(&dom->cpu_mask);
+ cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu);
dom->mbm_work_cpu = cpu;
- schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
+
+ if (cpu < nr_cpu_ids)
+ schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
}
static int dom_data_init(struct rdt_resource *r)
{
+ u32 idx_limit = resctrl_arch_system_num_rmid_idx();
+ u32 num_closid = resctrl_arch_get_num_closid(r);
struct rmid_entry *entry = NULL;
- int i, nr_rmids;
+ int err = 0, i;
+ u32 idx;
+
+ mutex_lock(&rdtgroup_mutex);
+ if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+ u32 *tmp;
+
+ /*
+ * If the architecture hasn't provided a sanitised value here,
+ * this may result in larger arrays than necessary. Resctrl will
+		 * use a smaller system-wide value based on the resources in
+ * use.
+ */
+ tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL);
+ if (!tmp) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
- nr_rmids = r->num_rmid;
- rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL);
- if (!rmid_ptrs)
- return -ENOMEM;
+ closid_num_dirty_rmid = tmp;
+ }
+
+ rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL);
+ if (!rmid_ptrs) {
+ if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+ kfree(closid_num_dirty_rmid);
+ closid_num_dirty_rmid = NULL;
+ }
+ err = -ENOMEM;
+ goto out_unlock;
+ }
- for (i = 0; i < nr_rmids; i++) {
+ for (i = 0; i < idx_limit; i++) {
entry = &rmid_ptrs[i];
INIT_LIST_HEAD(&entry->list);
- entry->rmid = i;
+ resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid);
list_add_tail(&entry->list, &rmid_free_lru);
}
/*
- * RMID 0 is special and is always allocated. It's used for all
- * tasks that are not monitored.
+ * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and
+ * are always allocated. These are used for the rdtgroup_default
+	 * control group, which will be set up later in rdtgroup_init().
*/
- entry = __rmid_entry(0);
+ idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
+ RESCTRL_RESERVED_RMID);
+ entry = __rmid_entry(idx);
list_del(&entry->list);
- return 0;
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return err;
+}
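
rmid_ptrs[] is now sized and indexed by an architecture-defined index rather
than the raw RMID. The encode/decode helpers are opaque here; on x86 the index
is presumably just the RMID, while a CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID
arrangement could fold the CLOSID in. The sketch below shows one hypothetical
encoding, purely for illustration:

#include <stdio.h>

#define NUM_RMID	8	/* hypothetical per-CLOSID RMID space */

/* Hypothetical MPAM-like encoding: one RMID range per CLOSID. */
static unsigned int rmid_idx_encode(unsigned int closid, unsigned int rmid)
{
	return closid * NUM_RMID + rmid;
}

static void rmid_idx_decode(unsigned int idx, unsigned int *closid,
			    unsigned int *rmid)
{
	*closid = idx / NUM_RMID;
	*rmid = idx % NUM_RMID;
}

int main(void)
{
	unsigned int closid, rmid;

	rmid_idx_decode(rmid_idx_encode(3, 5), &closid, &rmid);
	printf("idx %u -> closid=%u rmid=%u\n",
	       rmid_idx_encode(3, 5), closid, rmid);	/* idx 29 -> 3, 5 */
	return 0;
}
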
+
+static void __exit dom_data_exit(void)
+{
+ mutex_lock(&rdtgroup_mutex);
+
+ if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+ kfree(closid_num_dirty_rmid);
+ closid_num_dirty_rmid = NULL;
+ }
+
+ kfree(rmid_ptrs);
+ rmid_ptrs = NULL;
+
+ mutex_unlock(&rdtgroup_mutex);
}
static struct mon_evt llc_occupancy_event = {
@@ -813,6 +1041,12 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
return ret;
if (rdt_cpu_has(X86_FEATURE_BMEC)) {
+ u32 eax, ebx, ecx, edx;
+
+ /* Detect list of bandwidth sources that can be tracked */
+ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
+ hw_res->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
+
if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
mbm_total_event.configurable = true;
mbm_config_rftype_init("mbm_total_bytes_config");
@@ -830,6 +1064,11 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
return 0;
}
+void __exit rdt_put_mon_l3_config(void)
+{
+ dom_data_exit();
+}
+
void __init intel_rdt_mbm_apply_quirk(void)
{
int cf_index;
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 8f559eeae08e..884b88e25141 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -581,7 +581,7 @@ static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp)
if (ret)
goto err_cpus;
- if (rdt_mon_capable) {
+ if (resctrl_arch_mon_capable()) {
ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups");
if (ret)
goto err_cpus_list;
@@ -628,7 +628,7 @@ static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp)
if (ret)
goto err_cpus;
- if (rdt_mon_capable) {
+ if (resctrl_arch_mon_capable()) {
ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777);
if (ret)
goto err_cpus_list;
@@ -752,7 +752,7 @@ int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
* anymore when this group would be used for pseudo-locking. This
* is safe to call on platforms not capable of monitoring.
*/
- free_rmid(rdtgrp->mon.rmid);
+ free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
ret = 0;
goto out;
@@ -776,8 +776,8 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
{
int ret;
- if (rdt_mon_capable) {
- ret = alloc_rmid();
+ if (resctrl_arch_mon_capable()) {
+ ret = alloc_rmid(rdtgrp->closid);
if (ret < 0) {
rdt_last_cmd_puts("Out of RMIDs\n");
return ret;
@@ -787,7 +787,7 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
ret = rdtgroup_locksetup_user_restore(rdtgrp);
if (ret) {
- free_rmid(rdtgrp->mon.rmid);
+ free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
return ret;
}
@@ -844,6 +844,9 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
struct rdt_domain *d_i;
bool ret = false;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL))
return true;
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 69a1de92384a..011e17efb1a6 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -35,6 +35,10 @@
DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
+
+/* Mutex to protect rdtgroup access. */
+DEFINE_MUTEX(rdtgroup_mutex);
+
static struct kernfs_root *rdt_root;
struct rdtgroup rdtgroup_default;
LIST_HEAD(rdt_all_groups);
@@ -42,6 +46,9 @@ LIST_HEAD(rdt_all_groups);
/* list of entries for the schemata file */
LIST_HEAD(resctrl_schema_all);
+/* The filesystem can only be mounted once. */
+bool resctrl_mounted;
+
/* Kernel fs node for "info" directory under root */
static struct kernfs_node *kn_info;
@@ -102,7 +109,7 @@ void rdt_staged_configs_clear(void)
*
* Using a global CLOSID across all resources has some advantages and
* some drawbacks:
- * + We can simply set "current->closid" to assign a task to a resource
+ * + We can simply set current's closid to assign a task to a resource
* group.
* + Context switch code can avoid extra memory references deciding which
* CLOSID to load into the PQR_ASSOC MSR
@@ -111,7 +118,7 @@ void rdt_staged_configs_clear(void)
* - Our choices on how to configure each resource become progressively more
* limited as the number of resources grows.
*/
-static int closid_free_map;
+static unsigned long closid_free_map;
static int closid_free_map_len;
int closids_supported(void)
@@ -130,26 +137,39 @@ static void closid_init(void)
closid_free_map = BIT_MASK(rdt_min_closid) - 1;
- /* CLOSID 0 is always reserved for the default group */
- closid_free_map &= ~1;
+ /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */
+ __clear_bit(RESCTRL_RESERVED_CLOSID, &closid_free_map);
closid_free_map_len = rdt_min_closid;
}
static int closid_alloc(void)
{
- u32 closid = ffs(closid_free_map);
+ int cleanest_closid;
+ u32 closid;
- if (closid == 0)
- return -ENOSPC;
- closid--;
- closid_free_map &= ~(1 << closid);
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+ cleanest_closid = resctrl_find_cleanest_closid();
+ if (cleanest_closid < 0)
+ return cleanest_closid;
+ closid = cleanest_closid;
+ } else {
+ closid = ffs(closid_free_map);
+ if (closid == 0)
+ return -ENOSPC;
+ closid--;
+ }
+ __clear_bit(closid, &closid_free_map);
return closid;
}
void closid_free(int closid)
{
- closid_free_map |= 1 << closid;
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ __set_bit(closid, &closid_free_map);
}
/**
@@ -159,9 +179,11 @@ void closid_free(int closid)
* Return: true if @closid is currently associated with a resource group,
* false if @closid is free
*/
-static bool closid_allocated(unsigned int closid)
+bool closid_allocated(unsigned int closid)
{
- return (closid_free_map & (1 << closid)) == 0;
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ return !test_bit(closid, &closid_free_map);
}
/**
@@ -559,14 +581,26 @@ static void update_task_closid_rmid(struct task_struct *t)
_update_task_closid_rmid(t);
}
+static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp)
+{
+ u32 closid, rmid = rdtgrp->mon.rmid;
+
+ if (rdtgrp->type == RDTCTRL_GROUP)
+ closid = rdtgrp->closid;
+ else if (rdtgrp->type == RDTMON_GROUP)
+ closid = rdtgrp->mon.parent->closid;
+ else
+ return false;
+
+ return resctrl_arch_match_closid(tsk, closid) &&
+ resctrl_arch_match_rmid(tsk, closid, rmid);
+}
+
static int __rdtgroup_move_task(struct task_struct *tsk,
struct rdtgroup *rdtgrp)
{
/* If the task is already in rdtgrp, no need to move the task. */
- if ((rdtgrp->type == RDTCTRL_GROUP && tsk->closid == rdtgrp->closid &&
- tsk->rmid == rdtgrp->mon.rmid) ||
- (rdtgrp->type == RDTMON_GROUP && tsk->rmid == rdtgrp->mon.rmid &&
- tsk->closid == rdtgrp->mon.parent->closid))
+ if (task_in_rdtgroup(tsk, rdtgrp))
return 0;
/*
@@ -577,19 +611,19 @@ static int __rdtgroup_move_task(struct task_struct *tsk,
* For monitor groups, can move the tasks only from
* their parent CTRL group.
*/
-
- if (rdtgrp->type == RDTCTRL_GROUP) {
- WRITE_ONCE(tsk->closid, rdtgrp->closid);
- WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid);
- } else if (rdtgrp->type == RDTMON_GROUP) {
- if (rdtgrp->mon.parent->closid == tsk->closid) {
- WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid);
- } else {
- rdt_last_cmd_puts("Can't move task to different control group\n");
- return -EINVAL;
- }
+ if (rdtgrp->type == RDTMON_GROUP &&
+ !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) {
+ rdt_last_cmd_puts("Can't move task to different control group\n");
+ return -EINVAL;
}
+ if (rdtgrp->type == RDTMON_GROUP)
+ resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid,
+ rdtgrp->mon.rmid);
+ else
+ resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid,
+ rdtgrp->mon.rmid);
+
/*
* Ensure the task's closid and rmid are written before determining if
* the task is current that will decide if it will be interrupted.
@@ -611,14 +645,15 @@ static int __rdtgroup_move_task(struct task_struct *tsk,
static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
{
- return (rdt_alloc_capable &&
- (r->type == RDTCTRL_GROUP) && (t->closid == r->closid));
+ return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) &&
+ resctrl_arch_match_closid(t, r->closid));
}
static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
{
- return (rdt_mon_capable &&
- (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid));
+ return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) &&
+ resctrl_arch_match_rmid(t, r->mon.parent->closid,
+ r->mon.rmid));
}
/**
@@ -853,7 +888,7 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
mutex_lock(&rdtgroup_mutex);
/* Return empty if resctrl has not been mounted. */
- if (!static_branch_unlikely(&rdt_enable_key)) {
+ if (!resctrl_mounted) {
seq_puts(s, "res:\nmon:\n");
goto unlock;
}
@@ -869,7 +904,7 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
rdtg->mode != RDT_MODE_EXCLUSIVE)
continue;
- if (rdtg->closid != tsk->closid)
+ if (!resctrl_arch_match_closid(tsk, rdtg->closid))
continue;
seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "",
@@ -877,7 +912,8 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
seq_puts(s, "mon:");
list_for_each_entry(crg, &rdtg->mon.crdtgrp_list,
mon.crdtgrp_list) {
- if (tsk->rmid != crg->mon.rmid)
+ if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid,
+ crg->mon.rmid))
continue;
seq_printf(s, "%s", crg->kn->name);
break;
@@ -982,6 +1018,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of,
bool sep = false;
u32 ctrl_val;
+ cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
hw_shareable = r->cache.shareable_bits;
list_for_each_entry(dom, &r->domains, list) {
@@ -1042,6 +1079,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of,
}
seq_putc(seq, '\n');
mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
return 0;
}
@@ -1297,6 +1335,9 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
struct rdt_domain *d;
u32 ctrl;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
list_for_each_entry(s, &resctrl_schema_all, list) {
r = s->res;
if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)
@@ -1561,6 +1602,7 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid
struct rdt_domain *dom;
bool sep = false;
+ cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
list_for_each_entry(dom, &r->domains, list) {
@@ -1577,6 +1619,7 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid
seq_puts(s, "\n");
mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
return 0;
}
@@ -1614,17 +1657,10 @@ static void mon_event_config_write(void *info)
wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0);
}
-static int mbm_config_write_domain(struct rdt_resource *r,
- struct rdt_domain *d, u32 evtid, u32 val)
+static void mbm_config_write_domain(struct rdt_resource *r,
+ struct rdt_domain *d, u32 evtid, u32 val)
{
struct mon_config_info mon_info = {0};
- int ret = 0;
-
- /* mon_config cannot be more than the supported set of events */
- if (val > MAX_EVT_CONFIG_BITS) {
- rdt_last_cmd_puts("Invalid event configuration\n");
- return -EINVAL;
- }
/*
* Read the current config value first. If both are the same then
@@ -1633,7 +1669,7 @@ static int mbm_config_write_domain(struct rdt_resource *r,
mon_info.evtid = evtid;
mondata_config_read(d, &mon_info);
if (mon_info.mon_config == val)
- goto out;
+ return;
mon_info.mon_config = val;
@@ -1656,17 +1692,17 @@ static int mbm_config_write_domain(struct rdt_resource *r,
* mbm_local and mbm_total counts for all the RMIDs.
*/
resctrl_arch_reset_rmid_all(r, d);
-
-out:
- return ret;
}
static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
char *dom_str = NULL, *id_str;
unsigned long dom_id, val;
struct rdt_domain *d;
- int ret = 0;
+
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
next:
if (!tok || tok[0] == '\0')
@@ -1686,11 +1722,16 @@ next:
return -EINVAL;
}
+	/* The value from user space cannot exceed the supported set of events */
+ if ((val & hw_res->mbm_cfg_mask) != val) {
+ rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n",
+ hw_res->mbm_cfg_mask);
+ return -EINVAL;
+ }
+
list_for_each_entry(d, &r->domains, list) {
if (d->id == dom_id) {
- ret = mbm_config_write_domain(r, d, evtid, val);
- if (ret)
- return -EINVAL;
+ mbm_config_write_domain(r, d, evtid, val);
goto next;
}
}
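
The new check is a subset test: every bit in the user's value must also be set
in hw_res->mbm_cfg_mask. A small sketch with a hypothetical mask:

#include <stdio.h>

/* A configuration is valid iff every set bit is also set in cfg_mask. */
static int evt_config_valid(unsigned long val, unsigned long cfg_mask)
{
	return (val & cfg_mask) == val;
}

int main(void)
{
	unsigned long cfg_mask = 0x3f;	/* hypothetical supported sources */

	printf("%d %d\n", evt_config_valid(0x33, cfg_mask),	/* 1: subset */
	       evt_config_valid(0x40, cfg_mask));		/* 0: bit 6 unknown */
	return 0;
}
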
@@ -1709,6 +1750,7 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
if (nbytes == 0 || buf[nbytes - 1] != '\n')
return -EINVAL;
+ cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
rdt_last_cmd_clear();
@@ -1718,6 +1760,7 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID);
mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
return ret ?: nbytes;
}
@@ -1733,6 +1776,7 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
if (nbytes == 0 || buf[nbytes - 1] != '\n')
return -EINVAL;
+ cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
rdt_last_cmd_clear();
@@ -1742,6 +1786,7 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID);
mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
return ret ?: nbytes;
}
@@ -2218,6 +2263,9 @@ static int set_cache_qos_cfg(int level, bool enable)
struct rdt_domain *d;
int cpu;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
if (level == RDT_RESOURCE_L3)
update = l3_qos_cfg_update;
else if (level == RDT_RESOURCE_L2)
@@ -2417,6 +2465,7 @@ struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
rdtgroup_kn_get(rdtgrp, kn);
+ cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
/* Was this group deleted while we waited? */
@@ -2434,6 +2483,8 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn)
return;
mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+
rdtgroup_kn_put(rdtgrp, kn);
}
@@ -2584,7 +2635,7 @@ static int rdt_get_tree(struct fs_context *fc)
/*
* resctrl file system can only be mounted once.
*/
- if (static_branch_unlikely(&rdt_enable_key)) {
+ if (resctrl_mounted) {
ret = -EBUSY;
goto out;
}
@@ -2605,7 +2656,7 @@ static int rdt_get_tree(struct fs_context *fc)
closid_init();
- if (rdt_mon_capable)
+ if (resctrl_arch_mon_capable())
flags |= RFTYPE_MON;
ret = rdtgroup_add_files(rdtgroup_default.kn, flags);
@@ -2618,7 +2669,7 @@ static int rdt_get_tree(struct fs_context *fc)
if (ret < 0)
goto out_schemata_free;
- if (rdt_mon_capable) {
+ if (resctrl_arch_mon_capable()) {
ret = mongroup_create_dir(rdtgroup_default.kn,
&rdtgroup_default, "mon_groups",
&kn_mongrp);
@@ -2640,18 +2691,19 @@ static int rdt_get_tree(struct fs_context *fc)
if (ret < 0)
goto out_psl;
- if (rdt_alloc_capable)
- static_branch_enable_cpuslocked(&rdt_alloc_enable_key);
- if (rdt_mon_capable)
- static_branch_enable_cpuslocked(&rdt_mon_enable_key);
+ if (resctrl_arch_alloc_capable())
+ resctrl_arch_enable_alloc();
+ if (resctrl_arch_mon_capable())
+ resctrl_arch_enable_mon();
- if (rdt_alloc_capable || rdt_mon_capable)
- static_branch_enable_cpuslocked(&rdt_enable_key);
+ if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable())
+ resctrl_mounted = true;
if (is_mbm_enabled()) {
r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
list_for_each_entry(dom, &r->domains, list)
- mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL);
+ mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL,
+ RESCTRL_PICK_ANY_CPU);
}
goto out;
@@ -2659,10 +2711,10 @@ static int rdt_get_tree(struct fs_context *fc)
out_psl:
rdt_pseudo_lock_release();
out_mondata:
- if (rdt_mon_capable)
+ if (resctrl_arch_mon_capable())
kernfs_remove(kn_mondata);
out_mongrp:
- if (rdt_mon_capable)
+ if (resctrl_arch_mon_capable())
kernfs_remove(kn_mongrp);
out_info:
kernfs_remove(kn_info);
@@ -2765,6 +2817,9 @@ static int reset_all_ctrls(struct rdt_resource *r)
struct rdt_domain *d;
int i;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
return -ENOMEM;
@@ -2810,8 +2865,8 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
for_each_process_thread(p, t) {
if (!from || is_closid_match(t, from) ||
is_rmid_match(t, from)) {
- WRITE_ONCE(t->closid, to->closid);
- WRITE_ONCE(t->rmid, to->mon.rmid);
+ resctrl_arch_set_closid_rmid(t, to->closid,
+ to->mon.rmid);
/*
* Order the closid/rmid stores above before the loads
@@ -2842,7 +2897,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
head = &rdtgrp->mon.crdtgrp_list;
list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
- free_rmid(sentry->mon.rmid);
+ free_rmid(sentry->closid, sentry->mon.rmid);
list_del(&sentry->mon.crdtgrp_list);
if (atomic_read(&sentry->waitcount) != 0)
@@ -2882,7 +2937,7 @@ static void rmdir_all_sub(void)
cpumask_or(&rdtgroup_default.cpu_mask,
&rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
- free_rmid(rdtgrp->mon.rmid);
+ free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
kernfs_remove(rdtgrp->kn);
list_del(&rdtgrp->rdtgroup_list);
@@ -2917,9 +2972,11 @@ static void rdt_kill_sb(struct super_block *sb)
rdtgroup_default.mode = RDT_MODE_SHAREABLE;
schemata_list_destroy();
rdtgroup_destroy_root();
- static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
- static_branch_disable_cpuslocked(&rdt_mon_enable_key);
- static_branch_disable_cpuslocked(&rdt_enable_key);
+ if (resctrl_arch_alloc_capable())
+ resctrl_arch_disable_alloc();
+ if (resctrl_arch_mon_capable())
+ resctrl_arch_disable_mon();
+ resctrl_mounted = false;
kernfs_kill_sb(sb);
mutex_unlock(&rdtgroup_mutex);
cpus_read_unlock();
@@ -3047,6 +3104,9 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
struct rdt_domain *dom;
int ret;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
list_for_each_entry(dom, &r->domains, list) {
ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
if (ret)
@@ -3293,6 +3353,36 @@ out:
return ret;
}
+static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp)
+{
+ int ret;
+
+ if (!resctrl_arch_mon_capable())
+ return 0;
+
+ ret = alloc_rmid(rdtgrp->closid);
+ if (ret < 0) {
+ rdt_last_cmd_puts("Out of RMIDs\n");
+ return ret;
+ }
+ rdtgrp->mon.rmid = ret;
+
+ ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
+ if (ret) {
+ rdt_last_cmd_puts("kernfs subdir error\n");
+ free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp)
+{
+ if (resctrl_arch_mon_capable())
+ free_rmid(rgrp->closid, rgrp->mon.rmid);
+}
+
static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
const char *name, umode_t mode,
enum rdt_group_type rtype, struct rdtgroup **r)
@@ -3353,7 +3443,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
if (rtype == RDTCTRL_GROUP) {
files = RFTYPE_BASE | RFTYPE_CTRL;
- if (rdt_mon_capable)
+ if (resctrl_arch_mon_capable())
files |= RFTYPE_MON;
} else {
files = RFTYPE_BASE | RFTYPE_MON;
@@ -3365,29 +3455,11 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
goto out_destroy;
}
- if (rdt_mon_capable) {
- ret = alloc_rmid();
- if (ret < 0) {
- rdt_last_cmd_puts("Out of RMIDs\n");
- goto out_destroy;
- }
- rdtgrp->mon.rmid = ret;
-
- ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
- if (ret) {
- rdt_last_cmd_puts("kernfs subdir error\n");
- goto out_idfree;
- }
- }
- kernfs_activate(kn);
-
/*
* The caller unlocks the parent_kn upon success.
*/
return 0;
-out_idfree:
- free_rmid(rdtgrp->mon.rmid);
out_destroy:
kernfs_put(rdtgrp->kn);
kernfs_remove(rdtgrp->kn);
@@ -3401,7 +3473,6 @@ out_unlock:
static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
{
kernfs_remove(rgrp->kn);
- free_rmid(rgrp->mon.rmid);
rdtgroup_remove(rgrp);
}
@@ -3423,12 +3494,21 @@ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
prgrp = rdtgrp->mon.parent;
rdtgrp->closid = prgrp->closid;
+ ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
+ if (ret) {
+ mkdir_rdt_prepare_clean(rdtgrp);
+ goto out_unlock;
+ }
+
+ kernfs_activate(rdtgrp->kn);
+
/*
* Add the rdtgrp to the list of rdtgrps the parent
* ctrl_mon group has to track.
*/
list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
+out_unlock:
rdtgroup_kn_unlock(parent_kn);
return ret;
}
@@ -3459,13 +3539,20 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
ret = 0;
rdtgrp->closid = closid;
+
+ ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
+ if (ret)
+ goto out_closid_free;
+
+ kernfs_activate(rdtgrp->kn);
+
ret = rdtgroup_init_alloc(rdtgrp);
if (ret < 0)
- goto out_id_free;
+ goto out_rmid_free;
list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
- if (rdt_mon_capable) {
+ if (resctrl_arch_mon_capable()) {
/*
* Create an empty mon_groups directory to hold the subset
* of tasks and cpus to monitor.
@@ -3481,7 +3568,9 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
out_del_list:
list_del(&rdtgrp->rdtgroup_list);
-out_id_free:
+out_rmid_free:
+ mkdir_rdt_prepare_rmid_free(rdtgrp);
+out_closid_free:
closid_free(closid);
out_common_fail:
mkdir_rdt_prepare_clean(rdtgrp);
@@ -3518,14 +3607,14 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
* allocation is supported, add a control and monitoring
* subdirectory
*/
- if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn)
+ if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn)
return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode);
/*
* If RDT monitoring is supported and the parent directory is a valid
* "mon_groups" directory, add a monitoring subdirectory.
*/
- if (rdt_mon_capable && is_mon_groups(parent_kn, name))
+ if (resctrl_arch_mon_capable() && is_mon_groups(parent_kn, name))
return rdtgroup_mkdir_mon(parent_kn, name, mode);
return -EPERM;
@@ -3550,7 +3639,7 @@ static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
update_closid_rmid(tmpmask, NULL);
rdtgrp->flags = RDT_DELETED;
- free_rmid(rdtgrp->mon.rmid);
+ free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
/*
* Remove the rdtgrp from the parent ctrl_mon group's list
@@ -3596,8 +3685,8 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
update_closid_rmid(tmpmask, NULL);
+ free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
closid_free(rdtgrp->closid);
- free_rmid(rdtgrp->mon.rmid);
rdtgroup_ctrl_remove(rdtgrp);
@@ -3829,8 +3918,8 @@ static void __init rdtgroup_setup_default(void)
{
mutex_lock(&rdtgroup_mutex);
- rdtgroup_default.closid = 0;
- rdtgroup_default.mon.rmid = 0;
+ rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID;
+ rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID;
rdtgroup_default.type = RDTCTRL_GROUP;
INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
@@ -3848,24 +3937,24 @@ static void domain_destroy_mon_state(struct rdt_domain *d)
void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d)
{
- lockdep_assert_held(&rdtgroup_mutex);
+ mutex_lock(&rdtgroup_mutex);
if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
mba_sc_domain_destroy(r, d);
if (!r->mon_capable)
- return;
+ goto out_unlock;
/*
* If resctrl is mounted, remove all the
* per domain monitor data directories.
*/
- if (static_branch_unlikely(&rdt_mon_enable_key))
+ if (resctrl_mounted && resctrl_arch_mon_capable())
rmdir_mondata_subdir_allrdtgrp(r, d->id);
if (is_mbm_enabled())
cancel_delayed_work(&d->mbm_over);
- if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) {
+ if (is_llc_occupancy_enabled() && has_busy_rmid(d)) {
/*
* When a package is going down, forcefully
* decrement rmid->ebusy. There is no way to know
@@ -3879,20 +3968,24 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d)
}
domain_destroy_mon_state(d);
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
}
static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
{
+ u32 idx_limit = resctrl_arch_system_num_rmid_idx();
size_t tsize;
if (is_llc_occupancy_enabled()) {
- d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL);
+ d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL);
if (!d->rmid_busy_llc)
return -ENOMEM;
}
if (is_mbm_total_enabled()) {
tsize = sizeof(*d->mbm_total);
- d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
+ d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL);
if (!d->mbm_total) {
bitmap_free(d->rmid_busy_llc);
return -ENOMEM;
@@ -3900,7 +3993,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
}
if (is_mbm_local_enabled()) {
tsize = sizeof(*d->mbm_local);
- d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
+ d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL);
if (!d->mbm_local) {
bitmap_free(d->rmid_busy_llc);
kfree(d->mbm_total);
@@ -3913,34 +4006,97 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d)
{
- int err;
+ int err = 0;
- lockdep_assert_held(&rdtgroup_mutex);
+ mutex_lock(&rdtgroup_mutex);
- if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
+ if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) {
/* RDT_RESOURCE_MBA is never mon_capable */
- return mba_sc_domain_allocate(r, d);
+ err = mba_sc_domain_allocate(r, d);
+ goto out_unlock;
+ }
if (!r->mon_capable)
- return 0;
+ goto out_unlock;
err = domain_setup_mon_state(r, d);
if (err)
- return err;
+ goto out_unlock;
if (is_mbm_enabled()) {
INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
- mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL);
+ mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL,
+ RESCTRL_PICK_ANY_CPU);
}
if (is_llc_occupancy_enabled())
INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
- /* If resctrl is mounted, add per domain monitor data directories. */
- if (static_branch_unlikely(&rdt_mon_enable_key))
+ /*
+	 * If the filesystem is not mounted, only the default resource group
+ * exists. Creation of its directories is deferred until mount time
+ * by rdt_get_tree() calling mkdir_mondata_all().
+ * If resctrl is mounted, add per domain monitor data directories.
+ */
+ if (resctrl_mounted && resctrl_arch_mon_capable())
mkdir_mondata_subdir_allrdtgrp(r, d);
- return 0;
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return err;
+}
+
+void resctrl_online_cpu(unsigned int cpu)
+{
+ mutex_lock(&rdtgroup_mutex);
+	/* The CPU is set in the default rdtgroup after online. */
+ cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
+ mutex_unlock(&rdtgroup_mutex);
+}
+
+static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
+{
+ struct rdtgroup *cr;
+
+ list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
+ if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask))
+ break;
+ }
+}
+
+void resctrl_offline_cpu(unsigned int cpu)
+{
+ struct rdt_resource *l3 = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ struct rdtgroup *rdtgrp;
+ struct rdt_domain *d;
+
+ mutex_lock(&rdtgroup_mutex);
+ list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
+ if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
+ clear_childcpus(rdtgrp, cpu);
+ break;
+ }
+ }
+
+ if (!l3->mon_capable)
+ goto out_unlock;
+
+ d = get_domain_from_cpu(cpu, l3);
+ if (d) {
+ if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
+ cancel_delayed_work(&d->mbm_over);
+ mbm_setup_overflow_handler(d, 0, cpu);
+ }
+ if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu &&
+ has_busy_rmid(d)) {
+ cancel_delayed_work(&d->cqm_limbo);
+ cqm_setup_limbo_handler(d, 0, cpu);
+ }
+ }
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
}
/*
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index b6b044356f1b..e74d0c4286c1 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -26,6 +26,7 @@
#include <linux/vmalloc.h>
#include <linux/memblock.h>
+#include <asm/bootparam.h>
#include <asm/processor.h>
#include <asm/hardirq.h>
#include <asm/nmi.h>
@@ -40,6 +41,7 @@
#include <asm/intel_pt.h>
#include <asm/crash.h>
#include <asm/cmdline.h>
+#include <asm/sev.h>
/* Used while preparing memory map entries for second kernel */
struct crash_memmap_data {
@@ -59,6 +61,8 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
*/
cpu_emergency_stop_pt();
+ kdump_sev_callback();
+
disable_local_APIC();
}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index f18ca44c904b..44a91ef5a23b 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -410,7 +410,7 @@ static void __die_header(const char *str, struct pt_regs *regs, long err)
IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
- IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
+ IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION) ?
(boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
}
NOKPROBE_SYMBOL(__die_header);
diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c
index a06b876bbf2d..edbafc5940e3 100644
--- a/arch/x86/kernel/fpu/bugs.c
+++ b/arch/x86/kernel/fpu/bugs.c
@@ -2,6 +2,8 @@
/*
* x86 FPU bug checks:
*/
+#include <linux/printk.h>
+
#include <asm/cpufeature.h>
#include <asm/fpu/api.h>
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 12df54ff0e81..70139d9d2e01 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -307,7 +307,8 @@ union ftrace_op_code_union {
} __attribute__((packed));
};
-#define RET_SIZE (IS_ENABLED(CONFIG_RETPOLINE) ? 5 : 1 + IS_ENABLED(CONFIG_SLS))
+#define RET_SIZE \
+ (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) ? 5 : 1 + IS_ENABLED(CONFIG_MITIGATION_SLS))
static unsigned long
create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index dc0956067944..212e8e06aeba 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -22,6 +22,8 @@
#include <linux/cc_platform.h>
#include <linux/pgtable.h>
+#include <asm/asm.h>
+#include <asm/page_64.h>
#include <asm/processor.h>
#include <asm/proto.h>
#include <asm/smp.h>
@@ -67,42 +69,11 @@ unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4;
EXPORT_SYMBOL(vmemmap_base);
#endif
-/*
- * GDT used on the boot CPU before switching to virtual addresses.
- */
-static struct desc_struct startup_gdt[GDT_ENTRIES] __initdata = {
- [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64, 0, 0xfffff),
-};
-
-/*
- * Address needs to be set at runtime because it references the startup_gdt
- * while the kernel still uses a direct mapping.
- */
-static struct desc_ptr startup_gdt_descr __initdata = {
- .size = sizeof(startup_gdt)-1,
- .address = 0,
-};
-
-static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
-{
- return ptr - (void *)_text + (void *)physaddr;
-}
-
-static unsigned long __head *fixup_long(void *ptr, unsigned long physaddr)
-{
- return fixup_pointer(ptr, physaddr);
-}
-
-#ifdef CONFIG_X86_5LEVEL
-static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr)
+static inline bool check_la57_support(void)
{
- return fixup_pointer(ptr, physaddr);
-}
+ if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ return false;
-static bool __head check_la57_support(unsigned long physaddr)
-{
/*
* 5-level paging is detected and enabled at kernel decompression
* stage. Only check if it has been enabled there.
@@ -110,21 +81,8 @@ static bool __head check_la57_support(unsigned long physaddr)
if (!(native_read_cr4() & X86_CR4_LA57))
return false;
- *fixup_int(&__pgtable_l5_enabled, physaddr) = 1;
- *fixup_int(&pgdir_shift, physaddr) = 48;
- *fixup_int(&ptrs_per_p4d, physaddr) = 512;
- *fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5;
- *fixup_long(&vmalloc_base, physaddr) = __VMALLOC_BASE_L5;
- *fixup_long(&vmemmap_base, physaddr) = __VMEMMAP_BASE_L5;
-
return true;
}
-#else
-static bool __head check_la57_support(unsigned long physaddr)
-{
- return false;
-}
-#endif
static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd)
{
@@ -173,23 +131,22 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv
* doesn't have to generate PC-relative relocations when accessing globals from
* that function. Clang actually does not generate them, which leads to
* boot-time crashes. To work around this problem, every global pointer must
- * be adjusted using fixup_pointer().
+ * be accessed using RIP_REL_REF().
*/
unsigned long __head __startup_64(unsigned long physaddr,
struct boot_params *bp)
{
- unsigned long load_delta, *p;
+ pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts);
unsigned long pgtable_flags;
+ unsigned long load_delta;
pgdval_t *pgd;
p4dval_t *p4d;
pudval_t *pud;
pmdval_t *pmd, pmd_entry;
- pteval_t *mask_ptr;
bool la57;
int i;
- unsigned int *next_pgt_ptr;
- la57 = check_la57_support(physaddr);
+ la57 = check_la57_support();
/* Is the address too large? */
if (physaddr >> MAX_PHYSMEM_BITS)
@@ -200,6 +157,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
* and the address I am actually running at.
*/
load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
+ RIP_REL_REF(phys_base) = load_delta;
/* Is the address not 2M aligned? */
if (load_delta & ~PMD_MASK)
@@ -210,26 +168,21 @@ unsigned long __head __startup_64(unsigned long physaddr,
/* Fixup the physical addresses in the page table */
- pgd = fixup_pointer(early_top_pgt, physaddr);
- p = pgd + pgd_index(__START_KERNEL_map);
- if (la57)
- *p = (unsigned long)level4_kernel_pgt;
- else
- *p = (unsigned long)level3_kernel_pgt;
- *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
+ pgd = &RIP_REL_REF(early_top_pgt)->pgd;
+ pgd[pgd_index(__START_KERNEL_map)] += load_delta;
if (la57) {
- p4d = fixup_pointer(level4_kernel_pgt, physaddr);
- p4d[511] += load_delta;
+ p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt);
+ p4d[MAX_PTRS_PER_P4D - 1] += load_delta;
+
+ pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE_NOENC;
}
- pud = fixup_pointer(level3_kernel_pgt, physaddr);
- pud[510] += load_delta;
- pud[511] += load_delta;
+ RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 2].pud += load_delta;
+ RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 1].pud += load_delta;
- pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
- pmd[i] += load_delta;
+ RIP_REL_REF(level2_fixmap_pgt)[i].pmd += load_delta;
/*
* Set up the identity mapping for the switchover. These
@@ -238,15 +191,14 @@ unsigned long __head __startup_64(unsigned long physaddr,
* it avoids problems around wraparound.
*/
- next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr);
- pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
- pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
+ pud = &early_pgts[0]->pmd;
+ pmd = &early_pgts[1]->pmd;
+ RIP_REL_REF(next_early_pgt) = 2;
pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
if (la57) {
- p4d = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++],
- physaddr);
+ p4d = &early_pgts[RIP_REL_REF(next_early_pgt)++]->pmd;
i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
@@ -267,8 +219,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
/* Filter out unsupported __PAGE_KERNEL_* bits: */
- mask_ptr = fixup_pointer(&__supported_pte_mask, physaddr);
- pmd_entry &= *mask_ptr;
+ pmd_entry &= RIP_REL_REF(__supported_pte_mask);
pmd_entry += sme_get_me_mask();
pmd_entry += physaddr;
@@ -294,7 +245,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
* error, causing the BIOS to halt the system.
*/
- pmd = fixup_pointer(level2_kernel_pgt, physaddr);
+ pmd = &RIP_REL_REF(level2_kernel_pgt)->pmd;
/* invalidate pages before the kernel image */
for (i = 0; i < pmd_index((unsigned long)_text); i++)
@@ -309,12 +260,6 @@ unsigned long __head __startup_64(unsigned long physaddr,
for (; i < PTRS_PER_PMD; i++)
pmd[i] &= ~_PAGE_PRESENT;
- /*
- * Fixup phys_base - remove the memory encryption mask to obtain
- * the true physical address.
- */
- *fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask();
-
return sme_postprocess_startup(bp, pmd);
}
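
For reference, the accessor used throughout __startup_64() is defined along
the lines below (a paraphrase of this era's asm/asm.h, not the authoritative
text). The "i" constraint is only valid because forced inlining turns &var
into a link-time constant, and the construct requires a non-PIC build
(e.g. gcc -O2 -fno-pie):

/* Force the address of a global to be taken with a RIP-relative LEA. */
static inline __attribute__((always_inline)) void *rip_rel_ptr(void *p)
{
	asm("leaq %c1(%%rip), %0" : "=r"(p) : "i"(p));
	return p;
}

#define RIP_REL_REF(var)	(*(typeof(&(var)))rip_rel_ptr(&(var)))

static long demo_var;

int main(void)
{
	RIP_REL_REF(demo_var) = 42;	/* RIP-relative access to demo_var */
	return RIP_REL_REF(demo_var) == 42 ? 0 : 1;
}
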
@@ -486,6 +431,15 @@ asmlinkage __visible void __init __noreturn x86_64_start_kernel(char * real_mode
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
+ if (check_la57_support()) {
+ __pgtable_l5_enabled = 1;
+ pgdir_shift = 48;
+ ptrs_per_p4d = 512;
+ page_offset_base = __PAGE_OFFSET_BASE_L5;
+ vmalloc_base = __VMALLOC_BASE_L5;
+ vmemmap_base = __VMEMMAP_BASE_L5;
+ }
+
cr4_init_shadow();
/* Kill off the identity-map trampoline */
@@ -569,62 +523,52 @@ void __init __noreturn x86_64_start_reservations(char *real_mode_data)
*/
static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data;
-static struct desc_ptr bringup_idt_descr = {
- .size = (NUM_EXCEPTION_VECTORS * sizeof(gate_desc)) - 1,
- .address = 0, /* Set at runtime */
-};
-
-static void set_bringup_idt_handler(gate_desc *idt, int n, void *handler)
+/* This may run while still in the direct mapping */
+static void __head startup_64_load_idt(void *vc_handler)
{
-#ifdef CONFIG_AMD_MEM_ENCRYPT
+ struct desc_ptr desc = {
+ .address = (unsigned long)&RIP_REL_REF(bringup_idt_table),
+ .size = sizeof(bringup_idt_table) - 1,
+ };
struct idt_data data;
- gate_desc desc;
-
- init_idt_data(&data, n, handler);
- idt_init_desc(&desc, &data);
- native_write_idt_entry(idt, n, &desc);
-#endif
-}
+ gate_desc idt_desc;
-/* This runs while still in the direct mapping */
-static void __head startup_64_load_idt(unsigned long physbase)
-{
- struct desc_ptr *desc = fixup_pointer(&bringup_idt_descr, physbase);
- gate_desc *idt = fixup_pointer(bringup_idt_table, physbase);
-
-
- if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
- void *handler;
-
- /* VMM Communication Exception */
- handler = fixup_pointer(vc_no_ghcb, physbase);
- set_bringup_idt_handler(idt, X86_TRAP_VC, handler);
+ /* @vc_handler is set only for a VMM Communication Exception */
+ if (vc_handler) {
+ init_idt_data(&data, X86_TRAP_VC, vc_handler);
+ idt_init_desc(&idt_desc, &data);
+ native_write_idt_entry((gate_desc *)desc.address, X86_TRAP_VC, &idt_desc);
}
- desc->address = (unsigned long)idt;
- native_load_idt(desc);
+ native_load_idt(&desc);
}
/* This is used when running on kernel addresses */
void early_setup_idt(void)
{
- /* VMM Communication Exception */
+ void *handler = NULL;
+
if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
setup_ghcb();
- set_bringup_idt_handler(bringup_idt_table, X86_TRAP_VC, vc_boot_ghcb);
+ handler = vc_boot_ghcb;
}
- bringup_idt_descr.address = (unsigned long)bringup_idt_table;
- native_load_idt(&bringup_idt_descr);
+ startup_64_load_idt(handler);
}
/*
* Setup boot CPU state needed before kernel switches to virtual addresses.
*/
-void __head startup_64_setup_env(unsigned long physbase)
+void __head startup_64_setup_gdt_idt(void)
{
+ void *handler = NULL;
+
+ struct desc_ptr startup_gdt_descr = {
+ .address = (unsigned long)&RIP_REL_REF(init_per_cpu_var(gdt_page.gdt)),
+ .size = GDT_SIZE - 1,
+ };
+
/* Load GDT */
- startup_gdt_descr.address = (unsigned long)fixup_pointer(startup_gdt, physbase);
native_load_gdt(&startup_gdt_descr);
/* New GDT is live - reload data segment registers */
@@ -632,5 +576,8 @@ void __head startup_64_setup_env(unsigned long physbase)
"movl %%eax, %%ss\n"
"movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory");
- startup_64_load_idt(physbase);
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
+ handler = &RIP_REL_REF(vc_no_ghcb);
+
+ startup_64_load_idt(handler);
}
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 487ac57e2c81..b50f3641c4d6 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -414,7 +414,7 @@ __REFDATA
.align 4
SYM_DATA(initial_code, .long i386_start_kernel)
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
#define PGD_ALIGN (2 * PAGE_SIZE)
#define PTI_USER_PGD_FILL 1024
#else
@@ -474,7 +474,7 @@ SYM_DATA_START(initial_page_table)
# endif
.align PAGE_SIZE /* needs to be page-sized too */
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/*
* PTI needs another page so sync_initial_pagetable() works correctly
* and does not scribble over the data which is placed behind the
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index c38e43589046..d8198fbd70e5 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -40,7 +40,6 @@ L4_START_KERNEL = l4_index(__START_KERNEL_map)
L3_START_KERNEL = pud_index(__START_KERNEL_map)
- .text
__HEAD
.code64
SYM_CODE_START_NOALIGN(startup_64)
@@ -69,8 +68,6 @@ SYM_CODE_START_NOALIGN(startup_64)
/* Set up the stack for verify_cpu() */
leaq (__end_init_task - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE)(%rip), %rsp
- leaq _text(%rip), %rdi
-
/* Setup GSBASE to allow stack canary access for C code */
movl $MSR_GS_BASE, %ecx
leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx
@@ -78,7 +75,7 @@ SYM_CODE_START_NOALIGN(startup_64)
shrq $32, %rdx
wrmsr
- call startup_64_setup_env
+ call startup_64_setup_gdt_idt
/* Now switch to __KERNEL_CS so IRET works reliably */
pushq $__KERNEL_CS
@@ -114,13 +111,11 @@ SYM_CODE_START_NOALIGN(startup_64)
call __startup_64
/* Form the CR3 value being sure to include the CR3 modifier */
- addq $(early_top_pgt - __START_KERNEL_map), %rax
+ leaq early_top_pgt(%rip), %rcx
+ addq %rcx, %rax
#ifdef CONFIG_AMD_MEM_ENCRYPT
mov %rax, %rdi
- mov %rax, %r14
-
- addq phys_base(%rip), %rdi
/*
* For SEV guests: Verify that the C-bit is correct. A malicious
@@ -129,17 +124,23 @@ SYM_CODE_START_NOALIGN(startup_64)
* the next RET instruction.
*/
call sev_verify_cbit
+#endif
/*
- * Restore CR3 value without the phys_base which will be added
- * below, before writing %cr3.
+ * Switch to early_top_pgt which still has the identity mappings
+ * present.
*/
- mov %r14, %rax
-#endif
+ movq %rax, %cr3
- jmp 1f
+ /* Branch to the common startup code at its kernel virtual address */
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *0f(%rip)
SYM_CODE_END(startup_64)
+ __INITRODATA
+0: .quad common_startup_64
+
+ .text
SYM_CODE_START(secondary_startup_64)
UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR
@@ -172,22 +173,39 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
/* Clear %R15 which holds the boot_params pointer on the boot CPU */
- xorq %r15, %r15
+ xorl %r15d, %r15d
+
+ /* Derive the runtime physical address of init_top_pgt[] */
+ movq phys_base(%rip), %rax
+ addq $(init_top_pgt - __START_KERNEL_map), %rax
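
The two instructions above compute phys(init_top_pgt) = phys_base +
(init_top_pgt - __START_KERNEL_map). The same arithmetic in C, with a
hypothetical 16 MiB load address:

#include <stdio.h>

#define START_KERNEL_MAP	0xffffffff80000000UL	/* __START_KERNEL_map */

/* phys = phys_base + (va - __START_KERNEL_map) for kernel-image symbols */
static unsigned long kernel_va_to_pa(unsigned long va, unsigned long phys_base)
{
	return phys_base + (va - START_KERNEL_MAP);
}

int main(void)
{
	/* hypothetical: init_top_pgt at map base + 0x2000, kernel at 16 MiB */
	printf("%#lx\n", kernel_va_to_pa(START_KERNEL_MAP + 0x2000, 0x1000000));
	return 0;	/* prints 0x1002000 */
}
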
/*
* Retrieve the modifier (SME encryption mask if SME is active) to be
* added to the initial pgdir entry that will be programmed into CR3.
*/
#ifdef CONFIG_AMD_MEM_ENCRYPT
- movq sme_me_mask, %rax
-#else
- xorq %rax, %rax
+ addq sme_me_mask(%rip), %rax
#endif
+ /*
+ * Switch to the init_top_pgt here, away from the trampoline_pgd and
+ * unmap the identity mapped ranges.
+ */
+ movq %rax, %cr3
- /* Form the CR3 value being sure to include the CR3 modifier */
- addq $(init_top_pgt - __START_KERNEL_map), %rax
-1:
+SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL)
+ UNWIND_HINT_END_OF_STACK
+ ANNOTATE_NOENDBR
+ /*
+ * Create a mask of CR4 bits to preserve. Omit PGE in order to flush
+ * global 1:1 translations from the TLBs.
+ *
+ * From the SDM:
+ * "If CR4.PGE is changing from 0 to 1, there were no global TLB
+ * entries before the execution; if CR4.PGE is changing from 1 to 0,
+ * there will be no global TLB entries after the execution."
+ */
+ movl $(X86_CR4_PAE | X86_CR4_LA57), %edx
#ifdef CONFIG_X86_MCE
/*
* Preserve CR4.MCE if the kernel will enable #MC support.
@@ -196,52 +214,20 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
* configured will crash the system regardless of the CR4.MCE value set
* here.
*/
- movq %cr4, %rcx
- andl $X86_CR4_MCE, %ecx
-#else
- movl $0, %ecx
+ orl $X86_CR4_MCE, %edx
#endif
+ movq %cr4, %rcx
+ andl %edx, %ecx
- /* Enable PAE mode, PSE, PGE and LA57 */
- orl $(X86_CR4_PAE | X86_CR4_PSE | X86_CR4_PGE), %ecx
-#ifdef CONFIG_X86_5LEVEL
- testb $1, __pgtable_l5_enabled(%rip)
- jz 1f
- orl $X86_CR4_LA57, %ecx
-1:
-#endif
+ /* Even if ignored in long mode, set PSE uniformly on all logical CPUs. */
+ btsl $X86_CR4_PSE_BIT, %ecx
movq %rcx, %cr4
- /* Setup early boot stage 4-/5-level pagetables. */
- addq phys_base(%rip), %rax
-
/*
- * Switch to new page-table
- *
- * For the boot CPU this switches to early_top_pgt which still has the
- * identity mappings present. The secondary CPUs will switch to the
- * init_top_pgt here, away from the trampoline_pgd and unmap the
- * identity mapped ranges.
+ * Set CR4.PGE to re-enable global translations.
*/
- movq %rax, %cr3
-
- /*
- * Do a global TLB flush after the CR3 switch to make sure the TLB
- * entries from the identity mapping are flushed.
- */
- movq %cr4, %rcx
- movq %rcx, %rax
- xorq $X86_CR4_PGE, %rcx
+ btsl $X86_CR4_PGE_BIT, %ecx
movq %rcx, %cr4
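
The mask-then-set sequence can be sanity-checked in C. The bit positions below
are the architectural CR4 ones; the point is that PGE is absent from the
preserve mask, so the first CR4 write flushes global TLB entries and the
second re-enables them:

#include <stdio.h>

#define X86_CR4_PSE	(1UL << 4)
#define X86_CR4_PAE	(1UL << 5)
#define X86_CR4_MCE	(1UL << 6)
#define X86_CR4_PGE	(1UL << 7)
#define X86_CR4_LA57	(1UL << 12)

int main(void)
{
	unsigned long cr4 = X86_CR4_PAE | X86_CR4_PGE | X86_CR4_LA57 | X86_CR4_MCE;
	unsigned long keep = X86_CR4_PAE | X86_CR4_LA57 | X86_CR4_MCE;

	cr4 &= keep;		/* PGE clear: global TLB entries are flushed */
	cr4 |= X86_CR4_PSE;	/* set PSE uniformly, as the asm does */
	printf("after mask: %#lx (PGE clear)\n", cr4);

	cr4 |= X86_CR4_PGE;	/* second CR4 write: globals re-enabled */
	printf("after set:  %#lx (PGE set)\n", cr4);
	return 0;
}
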
- movq %rax, %cr4
-
- /* Ensure I am executing from virtual addresses */
- movq $1f, %rax
- ANNOTATE_RETPOLINE_SAFE
- jmp *%rax
-1:
- UNWIND_HINT_END_OF_STACK
- ANNOTATE_NOENDBR // above
#ifdef CONFIG_SMP
/*
@@ -298,7 +284,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
.Llookup_AP:
/* EAX contains the APIC ID of the current CPU */
- xorq %rcx, %rcx
+ xorl %ecx, %ecx
leaq cpuid_to_apicid(%rip), %rbx
.Lfind_cpunr:
@@ -429,39 +415,10 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
movq %r15, %rdi
.Ljump_to_C_code:
- /*
- * Jump to run C code and to be on a real kernel address.
- * Since we are running on identity-mapped space we have to jump
- * to the full 64bit address, this is only possible as indirect
- * jump. In addition we need to ensure %cs is set so we make this
- * a far return.
- *
- * Note: do not change to far jump indirect with 64bit offset.
- *
- * AMD does not support far jump indirect with 64bit offset.
- * AMD64 Architecture Programmer's Manual, Volume 3: states only
- * JMP FAR mem16:16 FF /5 Far jump indirect,
- * with the target specified by a far pointer in memory.
- * JMP FAR mem16:32 FF /5 Far jump indirect,
- * with the target specified by a far pointer in memory.
- *
- * Intel64 does support 64bit offset.
- * Software Developer Manual Vol 2: states:
- * FF /5 JMP m16:16 Jump far, absolute indirect,
- * address given in m16:16
- * FF /5 JMP m16:32 Jump far, absolute indirect,
- * address given in m16:32.
- * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
- * address given in m16:64.
- */
- pushq $.Lafter_lret # put return address on stack for unwinder
xorl %ebp, %ebp # clear frame pointer
- movq initial_code(%rip), %rax
- pushq $__KERNEL_CS # set correct cs
- pushq %rax # target address in negative space
- lretq
-.Lafter_lret:
- ANNOTATE_NOENDBR
+ ANNOTATE_RETPOLINE_SAFE
+ callq *initial_code(%rip)
+ ud2
SYM_CODE_END(secondary_startup_64)
#include "verify_cpu.S"
@@ -478,7 +435,7 @@ SYM_CODE_START(soft_restart_cpu)
UNWIND_HINT_END_OF_STACK
/* Find the idle task stack */
- movq PER_CPU_VAR(pcpu_hot) + X86_current_task, %rcx
+ movq PER_CPU_VAR(pcpu_hot + X86_current_task), %rcx
movq TASK_threadsp(%rcx), %rsp
jmp .Ljump_to_C_code
@@ -623,7 +580,7 @@ SYM_CODE_END(vc_no_ghcb)
#define SYM_DATA_START_PAGE_ALIGNED(name) \
SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE)
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/*
* Each PGD needs to be 8k long and 8k aligned. We do not
* ever go out to userspace with these, so we do not
@@ -656,7 +613,8 @@ SYM_CODE_END(vc_no_ghcb)
.balign 4
SYM_DATA_START_PTI_ALIGNED(early_top_pgt)
- .fill 512,8,0
+ .fill 511,8,0
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
.fill PTI_USER_PGD_FILL,8,0
SYM_DATA_END(early_top_pgt)
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 2a422e00ed4b..cde167b0ea92 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -503,7 +503,10 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
kbuf.bufsz = kernel_len - kern16_size;
kbuf.memsz = PAGE_ALIGN(header->init_size);
kbuf.buf_align = header->kernel_alignment;
- kbuf.buf_min = MIN_KERNEL_LOAD_ADDR;
+ if (header->pref_address < MIN_KERNEL_LOAD_ADDR)
+ kbuf.buf_min = MIN_KERNEL_LOAD_ADDR;
+ else
+ kbuf.buf_min = header->pref_address;
kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
ret = kexec_add_buffer(&kbuf);
if (ret)
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 517821b48391..36d6809c6c9e 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -324,7 +324,7 @@ static int can_optimize(unsigned long paddr)
* However, the kernel built with retpolines or IBT has jump
* tables disabled so the check can be skipped altogether.
*/
- if (!IS_ENABLED(CONFIG_RETPOLINE) &&
+ if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) &&
!IS_ENABLED(CONFIG_X86_KERNEL_IBT) &&
insn_is_indirect_jump(&insn))
return 0;
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 7a814b41402d..0f19ef355f5f 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -184,7 +184,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
return new_ldt;
}
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
static void do_sanity_check(struct mm_struct *mm,
bool had_kernel_mapping,
@@ -377,7 +377,7 @@ static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt)
flush_tlb_mm_range(mm, va, va + nr_pages * PAGE_SIZE, PAGE_SHIFT, false);
}
-#else /* !CONFIG_PAGE_TABLE_ISOLATION */
+#else /* !CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */
static int
map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
@@ -388,11 +388,11 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt)
{
}
-#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */
static void free_ldt_pgtables(struct mm_struct *mm)
{
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
struct mmu_gather tlb;
unsigned long start = LDT_BASE_ADDR;
unsigned long end = LDT_END_ADDR;
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 397ef9fcf5e9..9a5b372c706f 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -304,13 +304,13 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
__this_cpu_add(nmi_stats.unknown, 1);
- pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
- reason, smp_processor_id());
+ pr_emerg_ratelimited("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
if (unknown_nmi_panic || panic_on_unrecovered_nmi)
nmi_panic(regs, "NMI: Not continuing");
- pr_emerg("Dazed and confused, but trying to continue\n");
+ pr_emerg_ratelimited("Dazed and confused, but trying to continue\n");
}
NOKPROBE_SYMBOL(unknown_nmi_error);
@@ -503,7 +503,7 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU))
raw_atomic_long_inc(&nsp->idt_calls);
- if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) {
+ if (arch_cpu_is_offline(smp_processor_id())) {
if (microcode_nmi_handler_enabled())
microcode_offline_nmi_handler();
return;
@@ -637,7 +637,7 @@ void nmi_backtrace_stall_check(const struct cpumask *btp)
msgp = nmi_check_stall_msg[idx];
if (nsp->idt_ignored_snap != READ_ONCE(nsp->idt_ignored) && (idx & 0x1))
modp = ", but OK because ignore_nmis was set";
- if (nmi_seq & ~0x1)
+ if (nmi_seq & 0x1)
msghp = " (CPU currently in NMI handler function)";
else if (nsp->idt_nmi_seq_snap + 1 == nmi_seq)
msghp = " (CPU exited one NMI handler function)";
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 6121c2b42ecf..b8441147eb5e 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -846,31 +846,6 @@ void __noreturn stop_this_cpu(void *dummy)
}
/*
- * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power
- * states (local apic timer and TSC stop).
- *
- * XXX this function is completely buggered vs RCU and tracing.
- */
-static void amd_e400_idle(void)
-{
- /*
- * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E
- * gets set after static_cpu_has() places have been converted via
- * alternatives.
- */
- if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
- default_idle();
- return;
- }
-
- tick_broadcast_enter();
-
- default_idle();
-
- tick_broadcast_exit();
-}
-
-/*
* Prefer MWAIT over HALT if MWAIT is supported, MWAIT_CPUID leaf
* exists and whenever MONITOR/MWAIT extensions are present there is at
* least one C1 substate.
@@ -878,21 +853,22 @@ static void amd_e400_idle(void)
* Do not prefer MWAIT if MONITOR instruction has a bug or idle=nomwait
* is passed to kernel commandline parameter.
*/
-static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
+static __init bool prefer_mwait_c1_over_halt(void)
{
+ const struct cpuinfo_x86 *c = &boot_cpu_data;
u32 eax, ebx, ecx, edx;
- /* User has disallowed the use of MWAIT. Fallback to HALT */
- if (boot_option_idle_override == IDLE_NOMWAIT)
- return 0;
+ /* If override is enforced on the command line, fall back to HALT. */
+ if (boot_option_idle_override != IDLE_NO_OVERRIDE)
+ return false;
/* MWAIT is not supported on this platform. Fallback to HALT */
if (!cpu_has(c, X86_FEATURE_MWAIT))
- return 0;
+ return false;
- /* Monitor has a bug. Fallback to HALT */
- if (boot_cpu_has_bug(X86_BUG_MONITOR))
- return 0;
+ /* Monitor has a bug or APIC stops in C1E. Fallback to HALT */
+ if (boot_cpu_has_bug(X86_BUG_MONITOR) || boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E))
+ return false;
cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
@@ -901,13 +877,13 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
* with EAX=0, ECX=0.
*/
if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED))
- return 1;
+ return true;
/*
* If MWAIT extensions are available, there should be at least one
* MWAIT C1 substate present.
*/
- return (edx & MWAIT_C1_SUBSTATE_MASK);
+ return !!(edx & MWAIT_C1_SUBSTATE_MASK);
}
/*
@@ -933,26 +909,27 @@ static __cpuidle void mwait_idle(void)
__current_clr_polling();
}
-void select_idle_routine(const struct cpuinfo_x86 *c)
+void __init select_idle_routine(void)
{
-#ifdef CONFIG_SMP
- if (boot_option_idle_override == IDLE_POLL && __max_threads_per_core > 1)
- pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
-#endif
- if (x86_idle_set() || boot_option_idle_override == IDLE_POLL)
+ if (boot_option_idle_override == IDLE_POLL) {
+ if (IS_ENABLED(CONFIG_SMP) && __max_threads_per_core > 1)
+ pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
+ return;
+ }
+
+ /* Required to guard against xen_set_default_idle() */
+ if (x86_idle_set())
return;
- if (boot_cpu_has_bug(X86_BUG_AMD_E400)) {
- pr_info("using AMD E400 aware idle routine\n");
- static_call_update(x86_idle, amd_e400_idle);
- } else if (prefer_mwait_c1_over_halt(c)) {
+ if (prefer_mwait_c1_over_halt()) {
pr_info("using mwait in idle threads\n");
static_call_update(x86_idle, mwait_idle);
} else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
pr_info("using TDX aware idle routine\n");
static_call_update(x86_idle, tdx_safe_halt);
- } else
+ } else {
static_call_update(x86_idle, default_idle);
+ }
}
void amd_e400_c1e_apic_setup(void)
@@ -985,7 +962,10 @@ void __init arch_post_acpi_subsys_init(void)
if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
mark_tsc_unstable("TSC halt in AMD C1E");
- pr_info("System has AMD C1E enabled\n");
+
+ if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE))
+ static_branch_enable(&arch_needs_tick_broadcast);
+ pr_info("System has AMD C1E erratum E400. Workaround enabled.\n");
}
static int __init idle_setup(char *str)
@@ -998,24 +978,14 @@ static int __init idle_setup(char *str)
boot_option_idle_override = IDLE_POLL;
cpu_idle_poll_ctrl(true);
} else if (!strcmp(str, "halt")) {
- /*
- * When the boot option of idle=halt is added, halt is
- * forced to be used for CPU idle. In such case CPU C2/C3
- * won't be used again.
- * To continue to load the CPU idle driver, don't touch
- * the boot_option_idle_override.
- */
- static_call_update(x86_idle, default_idle);
+ /* 'idle=halt': use HALT for idle; C-states are disabled. */
boot_option_idle_override = IDLE_HALT;
} else if (!strcmp(str, "nomwait")) {
- /*
- * If the boot option of "idle=nomwait" is added,
- * it means that mwait will be disabled for CPU C1/C2/C3
- * states.
- */
+ /* 'idle=nomwait' disables MWAIT for idle */
boot_option_idle_override = IDLE_NOMWAIT;
- } else
- return -1;
+ } else {
+ return -EINVAL;
+ }
return 0;
}
@@ -1030,7 +1000,10 @@ unsigned long arch_align_stack(unsigned long sp)
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
- return randomize_page(mm->brk, 0x02000000);
+ if (mmap_is_ia32())
+ return randomize_page(mm->brk, SZ_32M);
+
+ return randomize_page(mm->brk, SZ_1G);
}
/*
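For scale: with 4 KiB pages, the arch_randomize_brk() hunk widens the brk gap from 2^13 possible page positions (SZ_32M, the same 0x02000000 as before) to 2^18 (SZ_1G) for native 64-bit tasks, while ia32 tasks keep the historical window. A sketch of the selection, assuming randomize_page() draws a page-aligned offset in [start, start + range):

	unsigned long range = mmap_is_ia32() ? SZ_32M : SZ_1G;
	return randomize_page(mm->brk, range);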
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 708c87b88cc1..0917c7f25720 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -156,13 +156,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
- struct fpu *prev_fpu = &prev->fpu;
int cpu = smp_processor_id();
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
- if (!test_thread_flag(TIF_NEED_FPU_LOAD))
- switch_fpu_prepare(prev_fpu, cpu);
+ if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD))
+ switch_fpu_prepare(prev_p, cpu);
/*
* Save away %gs. No need to save %fs, as it was saved on the
@@ -209,7 +208,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
raw_cpu_write(pcpu_hot.current_task, next_p);
- switch_fpu_finish();
+ switch_fpu_finish(next_p);
/* Load the Intel cache allocation PQR MSR. */
resctrl_sched_in(next_p);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c075591b7b46..7062b84dd467 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -611,14 +611,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread;
struct thread_struct *next = &next_p->thread;
- struct fpu *prev_fpu = &prev->fpu;
int cpu = smp_processor_id();
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
this_cpu_read(pcpu_hot.hardirq_stack_inuse));
- if (!test_thread_flag(TIF_NEED_FPU_LOAD))
- switch_fpu_prepare(prev_fpu, cpu);
+ if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD))
+ switch_fpu_prepare(prev_p, cpu);
/* We must save %fs and %gs before load_TLS() because
* %fs and %gs may be cleared by load_TLS().
@@ -672,7 +671,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
raw_cpu_write(pcpu_hot.current_task, next_p);
raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p));
- switch_fpu_finish();
+ switch_fpu_finish(next_p);
/* Reload sp0. */
update_task_stack(next_p);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4e320d4d3898..46d5a8c520ad 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1206,6 +1206,16 @@ void __init i386_reserve_resources(void)
#endif /* CONFIG_X86_32 */
+#ifndef CONFIG_SMP
+void __init smp_prepare_boot_cpu(void)
+{
+ struct cpuinfo_x86 *c = &cpu_data(0);
+
+ *c = boot_cpu_data;
+ c->initialized = true;
+}
+#endif
+
static struct notifier_block kernel_offset_notifier = {
.notifier_call = dump_kernel_offset
};
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index 1d24ec679915..8b04958da5e7 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -9,12 +9,18 @@
* and is included directly into both code-bases.
*/
+#include <asm/setup_data.h>
+
#ifndef __BOOT_COMPRESSED
-#define error(v) pr_err(v)
-#define has_cpuflag(f) boot_cpu_has(f)
+#define error(v) pr_err(v)
+#define has_cpuflag(f) boot_cpu_has(f)
+#define sev_printk(fmt, ...) printk(fmt, ##__VA_ARGS__)
+#define sev_printk_rtl(fmt, ...) printk_ratelimited(fmt, ##__VA_ARGS__)
#else
#undef WARN
#define WARN(condition, format...) (!!(condition))
+#define sev_printk(fmt, ...)
+#define sev_printk_rtl(fmt, ...)
#endif
/* I/O parameters for CPUID-related helpers */
@@ -89,7 +95,8 @@ static bool __init sev_es_check_cpu_features(void)
return true;
}
-static void __noreturn sev_es_terminate(unsigned int set, unsigned int reason)
+static void __head __noreturn
+sev_es_terminate(unsigned int set, unsigned int reason)
{
u64 val = GHCB_MSR_TERM_REQ;
@@ -326,13 +333,7 @@ static int sev_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid
*/
static const struct snp_cpuid_table *snp_cpuid_get_table(void)
{
- void *ptr;
-
- asm ("lea cpuid_table_copy(%%rip), %0"
- : "=r" (ptr)
- : "p" (&cpuid_table_copy));
-
- return ptr;
+ return &RIP_REL_REF(cpuid_table_copy);
}
/*
@@ -391,7 +392,7 @@ static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
return xsave_size;
}
-static bool
+static bool __head
snp_cpuid_get_validated_func(struct cpuid_leaf *leaf)
{
const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
@@ -528,7 +529,8 @@ static int snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
* Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value
* should be treated as fatal by caller.
*/
-static int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
+static int __head
+snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
{
const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
@@ -556,9 +558,9 @@ static int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_le
leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0;
/* Skip post-processing for out-of-range zero leafs. */
- if (!(leaf->fn <= cpuid_std_range_max ||
- (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) ||
- (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max)))
+ if (!(leaf->fn <= RIP_REL_REF(cpuid_std_range_max) ||
+ (leaf->fn >= 0x40000000 && leaf->fn <= RIP_REL_REF(cpuid_hyp_range_max)) ||
+ (leaf->fn >= 0x80000000 && leaf->fn <= RIP_REL_REF(cpuid_ext_range_max))))
return 0;
}
@@ -570,10 +572,11 @@ static int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_le
* page yet, so it only supports the MSR based communication with the
* hypervisor and only the CPUID exit-code.
*/
-void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
+void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
{
unsigned int subfn = lower_bits(regs->cx, 32);
unsigned int fn = lower_bits(regs->ax, 32);
+ u16 opcode = *(unsigned short *)regs->ip;
struct cpuid_leaf leaf;
int ret;
@@ -581,6 +584,10 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
if (exit_code != SVM_EXIT_CPUID)
goto fail;
+ /* Is it really a CPUID insn? */
+ if (opcode != 0xa20f)
+ goto fail;
+
leaf.fn = fn;
leaf.subfn = subfn;
@@ -1016,7 +1023,8 @@ struct cc_setup_data {
* Search for a Confidential Computing blob passed in as a setup_data entry
* via the Linux Boot Protocol.
*/
-static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp)
+static __head
+struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp)
{
struct cc_setup_data *sd = NULL;
struct setup_data *hdr;
@@ -1043,7 +1051,7 @@ static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp)
* mapping needs to be updated in sync with all the changes to virtual memory
* layout and related mapping facilities throughout the boot process.
*/
-static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
+static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
{
const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table;
int i;
@@ -1063,11 +1071,11 @@ static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
const struct snp_cpuid_fn *fn = &cpuid_table->fn[i];
if (fn->eax_in == 0x0)
- cpuid_std_range_max = fn->eax;
+ RIP_REL_REF(cpuid_std_range_max) = fn->eax;
else if (fn->eax_in == 0x40000000)
- cpuid_hyp_range_max = fn->eax;
+ RIP_REL_REF(cpuid_hyp_range_max) = fn->eax;
else if (fn->eax_in == 0x80000000)
- cpuid_ext_range_max = fn->eax;
+ RIP_REL_REF(cpuid_ext_range_max) = fn->eax;
}
}
@@ -1170,3 +1178,92 @@ static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc)
out:
return ret;
}
+
+static enum es_result vc_check_opcode_bytes(struct es_em_ctxt *ctxt,
+ unsigned long exit_code)
+{
+ unsigned int opcode = (unsigned int)ctxt->insn.opcode.value;
+ u8 modrm = ctxt->insn.modrm.value;
+
+ switch (exit_code) {
+
+ case SVM_EXIT_IOIO:
+ case SVM_EXIT_NPF:
+ /* handled separately */
+ return ES_OK;
+
+ case SVM_EXIT_CPUID:
+ if (opcode == 0xa20f)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_INVD:
+ if (opcode == 0x080f)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_MONITOR:
+ if (opcode == 0x010f && modrm == 0xc8)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_MWAIT:
+ if (opcode == 0x010f && modrm == 0xc9)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_MSR:
+ /* RDMSR */
+ if (opcode == 0x320f ||
+ /* WRMSR */
+ opcode == 0x300f)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_RDPMC:
+ if (opcode == 0x330f)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_RDTSC:
+ if (opcode == 0x310f)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_RDTSCP:
+ if (opcode == 0x010f && modrm == 0xf9)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_READ_DR7:
+ if (opcode == 0x210f &&
+ X86_MODRM_REG(ctxt->insn.modrm.value) == 7)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_VMMCALL:
+ if (opcode == 0x010f && modrm == 0xd9)
+ return ES_OK;
+
+ break;
+
+ case SVM_EXIT_WRITE_DR7:
+ if (opcode == 0x230f &&
+ X86_MODRM_REG(ctxt->insn.modrm.value) == 7)
+ return ES_OK;
+ break;
+
+ case SVM_EXIT_WBINVD:
+ if (opcode == 0x90f)
+ return ES_OK;
+ break;
+
+ default:
+ break;
+ }
+
+ sev_printk(KERN_ERR "Wrong/unhandled opcode bytes: 0x%x, exit_code: 0x%lx, rIP: 0x%lx\n",
+ opcode, exit_code, ctxt->regs->ip);
+
+ return ES_UNSUPPORTED;
+}
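The opcode constants above read byte-swapped because two raw instruction bytes are loaded through a u16 on a little-endian machine. CPUID, for instance, is encoded as 0F A2 in memory:

	u8 raw[2]  = { 0x0f, 0xa2 };	/* CPUID as it sits at rIP */
	u16 opcode = *(u16 *)raw;	/* == 0xa20f on x86 */

The same convention yields 0x320f for RDMSR (0F 32), 0x310f for RDTSC (0F 31), and 0x90f for WBINVD (0F 09, leading zero dropped).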
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index c67285824e82..b59b09c2f284 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -25,6 +25,7 @@
#include <linux/psp-sev.h>
#include <uapi/linux/sev-guest.h>
+#include <asm/init.h>
#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>
#include <asm/sev.h>
@@ -59,6 +60,25 @@
#define AP_INIT_CR0_DEFAULT 0x60000010
#define AP_INIT_MXCSR_DEFAULT 0x1f80
+static const char * const sev_status_feat_names[] = {
+ [MSR_AMD64_SEV_ENABLED_BIT] = "SEV",
+ [MSR_AMD64_SEV_ES_ENABLED_BIT] = "SEV-ES",
+ [MSR_AMD64_SEV_SNP_ENABLED_BIT] = "SEV-SNP",
+ [MSR_AMD64_SNP_VTOM_BIT] = "vTom",
+ [MSR_AMD64_SNP_REFLECT_VC_BIT] = "ReflectVC",
+ [MSR_AMD64_SNP_RESTRICTED_INJ_BIT] = "RI",
+ [MSR_AMD64_SNP_ALT_INJ_BIT] = "AI",
+ [MSR_AMD64_SNP_DEBUG_SWAP_BIT] = "DebugSwap",
+ [MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT] = "NoHostIBS",
+ [MSR_AMD64_SNP_BTB_ISOLATION_BIT] = "BTBIsol",
+ [MSR_AMD64_SNP_VMPL_SSS_BIT] = "VmplSSS",
+ [MSR_AMD64_SNP_SECURE_TSC_BIT] = "SecureTSC",
+ [MSR_AMD64_SNP_VMGEXIT_PARAM_BIT] = "VMGExitParam",
+ [MSR_AMD64_SNP_IBS_VIRT_BIT] = "IBSVirt",
+ [MSR_AMD64_SNP_VMSA_REG_PROT_BIT] = "VMSARegProt",
+ [MSR_AMD64_SNP_SMT_PROT_BIT] = "SMTProt",
+};
+
/* For early boot hypervisor communication in SEV-ES enabled guests */
static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
@@ -682,8 +702,9 @@ static u64 __init get_jump_table_addr(void)
return ret;
}
-static void early_set_pages_state(unsigned long vaddr, unsigned long paddr,
- unsigned long npages, enum psc_op op)
+static void __head
+early_set_pages_state(unsigned long vaddr, unsigned long paddr,
+ unsigned long npages, enum psc_op op)
{
unsigned long paddr_end;
u64 val;
@@ -739,7 +760,7 @@ e_term:
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
}
-void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
+void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
unsigned long npages)
{
/*
@@ -748,7 +769,7 @@ void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long padd
* This eliminates worries about jump tables or checking boot_cpu_data
* in the cc_platform_has() function.
*/
- if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+ if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED))
return;
/*
@@ -767,7 +788,7 @@ void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr
* This eliminates worries about jump tables or checking boot_cpu_data
* in the cc_platform_has() function.
*/
- if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+ if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED))
return;
/* Ask hypervisor to mark the memory pages shared in the RMP table. */
@@ -1752,7 +1773,10 @@ static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
struct ghcb *ghcb,
unsigned long exit_code)
{
- enum es_result result;
+ enum es_result result = vc_check_opcode_bytes(ctxt, exit_code);
+
+ if (result != ES_OK)
+ return result;
switch (exit_code) {
case SVM_EXIT_READ_DR7:
@@ -2059,7 +2083,7 @@ fail:
*
* Scan for the blob in that order.
*/
-static __init struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
+static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
{
struct cc_blob_sev_info *cc_info;
@@ -2085,7 +2109,7 @@ found_cc_info:
return cc_info;
}
-bool __init snp_init(struct boot_params *bp)
+bool __head snp_init(struct boot_params *bp)
{
struct cc_blob_sev_info *cc_info;
@@ -2107,7 +2131,7 @@ bool __init snp_init(struct boot_params *bp)
return true;
}
-void __init __noreturn snp_abort(void)
+void __head __noreturn snp_abort(void)
{
sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
}
@@ -2262,3 +2286,29 @@ static int __init snp_init_platform_device(void)
return 0;
}
device_initcall(snp_init_platform_device);
+
+void kdump_sev_callback(void)
+{
+ /*
+ * Do wbinvd() on remote CPUs when SNP is enabled in order to
+ * safely do SNP_SHUTDOWN on the local CPU.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_SEV_SNP))
+ wbinvd();
+}
+
+void sev_show_status(void)
+{
+ int i;
+
+ pr_info("Status: ");
+ for (i = 0; i < MSR_AMD64_SNP_RESV_BIT; i++) {
+ if (sev_status & BIT_ULL(i)) {
+ if (!sev_status_feat_names[i])
+ continue;
+
+ pr_cont("%s ", sev_status_feat_names[i]);
+ }
+ }
+ pr_cont("\n");
+}
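On a guest with the first three features active, the helper would emit a single line resembling the following (the exact prefix depends on the file's pr_fmt; this is an illustration, not captured output):

	SEV: Status: SEV SEV-ES SEV-SNP

Unnamed bits are skipped via the NULL check, so the feature-name table can stay sparse as reserved bits gain meanings.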
diff --git a/arch/x86/kernel/sev_verify_cbit.S b/arch/x86/kernel/sev_verify_cbit.S
index 3355e27c69eb..1ab65f6c6ae7 100644
--- a/arch/x86/kernel/sev_verify_cbit.S
+++ b/arch/x86/kernel/sev_verify_cbit.S
@@ -77,7 +77,7 @@ SYM_FUNC_START(sev_verify_cbit)
* The check failed, prevent any forward progress to prevent ROP
* attacks, invalidate the stack and go into a hlt loop.
*/
- xorq %rsp, %rsp
+ xorl %esp, %esp
subq $0x1000, %rsp
2: hlt
jmp 2b
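The xorq -> xorl substitution is purely an encoding win: in 64-bit mode a 32-bit register write zero-extends into the full 64-bit register, so both forms clear %rsp, but the 32-bit form drops the REX.W prefix byte:

	31 e4		xorl %esp, %esp	/* 2 bytes */
	48 31 e4	xorq %rsp, %rsp	/* 3 bytes, same effect */

Either way, the subq that follows leaves %rsp pointing at an intentionally invalid address.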
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 96a771f9f930..2908e063d7d8 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -148,14 +148,16 @@ static int register_stop_handler(void)
static void native_stop_other_cpus(int wait)
{
- unsigned int cpu = smp_processor_id();
+ unsigned int old_cpu, this_cpu;
unsigned long flags, timeout;
if (reboot_force)
return;
/* Only proceed if this is the first CPU to reach this code */
- if (atomic_cmpxchg(&stopping_cpu, -1, cpu) != -1)
+ old_cpu = -1;
+ this_cpu = smp_processor_id();
+ if (!atomic_try_cmpxchg(&stopping_cpu, &old_cpu, this_cpu))
return;
/* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */
@@ -186,7 +188,7 @@ static void native_stop_other_cpus(int wait)
* NMIs.
*/
cpumask_copy(&cpus_stop_mask, cpu_online_mask);
- cpumask_clear_cpu(cpu, &cpus_stop_mask);
+ cpumask_clear_cpu(this_cpu, &cpus_stop_mask);
if (!cpumask_empty(&cpus_stop_mask)) {
apic_send_IPI_allbutself(REBOOT_VECTOR);
@@ -210,6 +212,8 @@ static void native_stop_other_cpus(int wait)
* CPUs to stop.
*/
if (!smp_no_nmi_ipi && !register_stop_handler()) {
+ unsigned int cpu;
+
pr_emerg("Shutting down cpus with NMI\n");
for_each_cpu(cpu, &cpus_stop_mask)
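atomic_try_cmpxchg() returns a success boolean and writes the value it actually observed back through its 'old' argument, which lets the compiler test the CMPXCHG flags directly instead of re-comparing a returned value. The race-winner idiom used above, sketched:

	int expected = -1;		/* nobody is stopping yet */
	if (!atomic_try_cmpxchg(&stopping_cpu, &expected, this_cpu))
		return;			/* lost the race; 'expected' now
					 * holds the winner's CPU id */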
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9c1e1219c28f..fe355c89f6c1 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -101,10 +101,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_core_map);
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
EXPORT_PER_CPU_SYMBOL(cpu_die_map);
-/* Per CPU bogomips and other parameters */
-DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
-EXPORT_PER_CPU_SYMBOL(cpu_info);
-
/* CPUs which are the primary SMT threads */
struct cpumask __cpu_primary_thread_mask __read_mostly;
@@ -1078,6 +1074,11 @@ void __init smp_prepare_cpus_common(void)
set_cpu_sibling_map(0);
}
+void __init smp_prepare_boot_cpu(void)
+{
+ smp_ops.smp_prepare_boot_cpu();
+}
+
#ifdef CONFIG_X86_64
/* Establish whether parallel bringup can be supported. */
bool __init arch_cpuhp_init_parallel_bringup(void)
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index 77a9316da435..4eefaac64c6c 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -172,7 +172,7 @@ void arch_static_call_transform(void *site, void *tramp, void *func, bool tail)
}
EXPORT_SYMBOL_GPL(arch_static_call_transform);
-#ifdef CONFIG_RETHUNK
+#ifdef CONFIG_MITIGATION_RETHUNK
/*
* This is called by apply_returns() to fix up static call trampolines,
* specifically ARCH_DEFINE_STATIC_CALL_NULL_TRAMP which is recorded as
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 8e2b2552b5ee..3e2952679b88 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -6,7 +6,9 @@
#include <linux/sched/task_stack.h>
#include <linux/mm.h>
#include <linux/ptrace.h>
+
#include <asm/desc.h>
+#include <asm/debugreg.h>
#include <asm/mmu_context.h>
unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs)
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index c783aeb37dce..cb9fa1d5c66f 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -52,13 +52,6 @@ static unsigned long get_align_bits(void)
return va_align.bits & get_align_mask();
}
-unsigned long align_vdso_addr(unsigned long addr)
-{
- unsigned long align_mask = get_align_mask();
- addr = (addr + align_mask) & ~align_mask;
- return addr | get_align_bits();
-}
-
static int __init control_va_addr_alignment(char *str)
{
/* guard against enabling this on other CPU families */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 6cb31df3d5ff..4fa0b17e5043 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -774,7 +774,7 @@ DEFINE_IDTENTRY_RAW(exc_int3)
*/
asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs)
{
- struct pt_regs *regs = (struct pt_regs *)this_cpu_read(pcpu_hot.top_of_stack) - 1;
+ struct pt_regs *regs = (struct pt_regs *)current_top_of_stack() - 1;
if (regs != eregs)
*regs = *eregs;
return regs;
@@ -792,7 +792,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r
* trust it and switch to the current kernel stack
*/
if (ip_within_syscall_gap(regs)) {
- sp = this_cpu_read(pcpu_hot.top_of_stack);
+ sp = current_top_of_stack();
goto sync;
}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index a349dbfc6d5a..56451fd2099e 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -46,6 +46,7 @@ ENTRY(phys_startup_64)
#endif
jiffies = jiffies_64;
+const_pcpu_hot = pcpu_hot;
#if defined(CONFIG_X86_64)
/*
@@ -132,7 +133,7 @@ SECTIONS
LOCK_TEXT
KPROBES_TEXT
SOFTIRQENTRY_TEXT
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
*(.text..__x86.indirect_thunk)
*(.text..__x86.return_thunk)
#endif
@@ -142,7 +143,7 @@ SECTIONS
*(.text..__x86.rethunk_untrain)
ENTRY_TEXT
-#ifdef CONFIG_CPU_SRSO
+#ifdef CONFIG_MITIGATION_SRSO
/*
* See the comment above srso_alias_untrain_ret()'s
* definition.
@@ -267,7 +268,7 @@ SECTIONS
}
#endif
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
/*
* List of instructions that call/jmp/jcc to retpoline thunks
* __x86_indirect_thunk_*(). These instructions can be patched along
@@ -504,11 +505,11 @@ INIT_PER_CPU(irq_stack_backing_store);
"fixed_percpu_data is not at start of per-cpu area");
#endif
-#ifdef CONFIG_CPU_UNRET_ENTRY
+#ifdef CONFIG_MITIGATION_UNRET_ENTRY
. = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned");
#endif
-#ifdef CONFIG_CPU_SRSO
+#ifdef CONFIG_MITIGATION_SRSO
. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned");
/*
* GNU ld cannot do XOR until 2.41.
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 3242f3da2457..1edf93ee3395 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2815,7 +2815,10 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
vcpu->arch.apic = apic;
- apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (kvm_x86_ops.alloc_apic_backing_page)
+ apic->regs = static_call(kvm_x86_alloc_apic_backing_page)(vcpu);
+ else
+ apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!apic->regs) {
printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
vcpu->vcpu_id);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 0544700ca50b..9a905f10e10c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -53,12 +53,11 @@
#include <asm/cmpxchg.h>
#include <asm/io.h>
#include <asm/set_memory.h>
+#include <asm/spec-ctrl.h>
#include <asm/vmx.h>
#include "trace.h"
-extern bool itlb_multihit_kvm_mitigation;
-
static bool nx_hugepage_mitigation_hard_disabled;
int __read_mostly nx_huge_pages = -1;
@@ -263,7 +262,7 @@ static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu)
static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
struct kvm_mmu *mmu)
{
- if (IS_ENABLED(CONFIG_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
+ if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
return kvm_read_cr3(vcpu);
return mmu->get_guest_pgd(vcpu);
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 0669a8a668ca..5390a591a571 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -315,7 +315,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
if (!prefetch)
vcpu->stat.pf_taken++;
- if (IS_ENABLED(CONFIG_RETPOLINE) && fault.is_tdp)
+ if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && fault.is_tdp)
r = kvm_tdp_page_fault(vcpu, &fault);
else
r = vcpu->arch.mmu->page_fault(vcpu, &fault);
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index dee62362a360..55b9a6d96bcf 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -1181,7 +1181,7 @@ int svm_allocate_nested(struct vcpu_svm *svm)
if (svm->nested.initialized)
return 0;
- vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ vmcb02_page = snp_safe_alloc_page(&svm->vcpu);
if (!vmcb02_page)
return -ENOMEM;
svm->nested.vmcb02.ptr = page_address(vmcb02_page);
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index a8ce5226b3b5..ae0ac12382b9 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -246,6 +246,7 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_platform_init_args init_args = {0};
int asid, ret;
if (kvm->created_vcpus)
@@ -262,7 +263,8 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
goto e_no_asid;
sev->asid = asid;
- ret = sev_platform_init(&argp->error);
+ init_args.probe = false;
+ ret = sev_platform_init(&init_args);
if (ret)
goto e_free;
@@ -274,6 +276,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
return 0;
e_free:
+ argp->error = init_args.error;
sev_asid_free(sev);
sev->asid = 0;
e_no_asid:
@@ -3165,3 +3168,35 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
}
+
+struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu)
+{
+ unsigned long pfn;
+ struct page *p;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
+ return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+
+ /*
+ * Allocate an SNP-safe page to work around the SNP erratum where
+ * the CPU will incorrectly signal an RMP violation #PF if a
+ * hugepage (2MB or 1GB) collides with the RMP entry of a
+ * 2MB-aligned VMCB, VMSA, or AVIC backing page.
+ *
+ * Allocate one extra page, choose a page which is not
+ * 2MB-aligned, and free the other.
+ */
+ p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1);
+ if (!p)
+ return NULL;
+
+ split_page(p, 1);
+
+ pfn = page_to_pfn(p);
+ if (IS_ALIGNED(pfn, PTRS_PER_PMD))
+ __free_page(p++);
+ else
+ __free_page(p + 1);
+
+ return p;
+}
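The trick works because the buddy allocator returns naturally aligned blocks: an order-1 allocation covers pfns N and N+1 with N even, so only N can ever be 2 MiB aligned (a multiple of PTRS_PER_PMD == 512 pfns with 4 KiB pages). Whichever branch is taken, the page handed back is guaranteed unaligned:

	N aligned:	free N,   return N+1	/* odd pfn, never aligned */
	N unaligned:	free N+1, return N	/* checked unaligned */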
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e90b429c84f1..272d5ed37ce7 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -703,7 +703,7 @@ static int svm_cpu_init(int cpu)
int ret = -ENOMEM;
memset(sd, 0, sizeof(struct svm_cpu_data));
- sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ sd->save_area = snp_safe_alloc_page(NULL);
if (!sd->save_area)
return ret;
@@ -1421,7 +1421,7 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
svm = to_svm(vcpu);
err = -ENOMEM;
- vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ vmcb01_page = snp_safe_alloc_page(vcpu);
if (!vmcb01_page)
goto out;
@@ -1430,7 +1430,7 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
* SEV-ES guests require a separate VMSA page used to contain
* the encrypted register state of the guest.
*/
- vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ vmsa_page = snp_safe_alloc_page(vcpu);
if (!vmsa_page)
goto error_free_vmcb_page;
@@ -3455,7 +3455,7 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
if (!svm_check_exit_valid(exit_code))
return svm_handle_invalid_exit(vcpu, exit_code);
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
if (exit_code == SVM_EXIT_MSR)
return msr_interception(vcpu);
else if (exit_code == SVM_EXIT_VINTR)
@@ -4900,6 +4900,16 @@ static int svm_vm_init(struct kvm *kvm)
return 0;
}
+static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu)
+{
+ struct page *page = snp_safe_alloc_page(vcpu);
+
+ if (!page)
+ return NULL;
+
+ return page_address(page);
+}
+
static struct kvm_x86_ops svm_x86_ops __initdata = {
.name = KBUILD_MODNAME,
@@ -5031,6 +5041,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
.vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
+ .alloc_apic_backing_page = svm_alloc_apic_backing_page,
};
/*
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 8ef95139cd24..7f1fbd874c45 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -694,6 +694,7 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm);
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa);
void sev_es_unmap_ghcb(struct vcpu_svm *svm);
+struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu);
/* vmenter.S */
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 9499f9c6b077..187018c424bf 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -207,7 +207,7 @@ SYM_FUNC_START(__svm_vcpu_run)
7: vmload %_ASM_AX
8:
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
#endif
@@ -344,7 +344,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
/* Pop @svm to RDI, guest registers have been saved already. */
pop %_ASM_DI
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
#endif
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 5b8fae9c1f82..305237dcba88 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6553,7 +6553,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
goto unexpected_vmexit;
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
return kvm_emulate_wrmsr(vcpu);
else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e02cc710f56d..ffe580169c93 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1623,7 +1623,8 @@ static bool kvm_is_immutable_feature_msr(u32 msr)
ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
- ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO)
+ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \
+ ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR)
static u64 kvm_get_arch_capabilities(void)
{
@@ -1655,6 +1656,8 @@ static u64 kvm_get_arch_capabilities(void)
data |= ARCH_CAP_SSB_NO;
if (!boot_cpu_has_bug(X86_BUG_MDS))
data |= ARCH_CAP_MDS_NO;
+ if (!boot_cpu_has_bug(X86_BUG_RFDS))
+ data |= ARCH_CAP_RFDS_NO;
if (!boot_cpu_has(X86_FEATURE_RTM)) {
/*
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index ea3a28e7b613..6da73513f026 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -14,19 +14,6 @@ ifdef CONFIG_KCSAN
CFLAGS_REMOVE_delay.o = $(CC_FLAGS_FTRACE)
endif
-# Early boot use of cmdline; don't instrument it
-ifdef CONFIG_AMD_MEM_ENCRYPT
-KCOV_INSTRUMENT_cmdline.o := n
-KASAN_SANITIZE_cmdline.o := n
-KCSAN_SANITIZE_cmdline.o := n
-
-ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_cmdline.o = -pg
-endif
-
-CFLAGS_cmdline.o := -fno-stack-protector -fno-jump-tables
-endif
-
inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
quiet_cmd_inat_tables = GEN $@
@@ -49,7 +36,7 @@ lib-$(CONFIG_ARCH_HAS_COPY_MC) += copy_mc.o copy_mc_64.o
lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o
lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
-lib-$(CONFIG_RETPOLINE) += retpoline.o
+lib-$(CONFIG_MITIGATION_RETPOLINE) += retpoline.o
obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
obj-y += iomem.o
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
index 6962df315793..4fb44894ad87 100644
--- a/arch/x86/lib/cmpxchg16b_emu.S
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -23,14 +23,14 @@ SYM_FUNC_START(this_cpu_cmpxchg16b_emu)
cli
/* if (*ptr == old) */
- cmpq PER_CPU_VAR(0(%rsi)), %rax
+ cmpq __percpu (%rsi), %rax
jne .Lnot_same
- cmpq PER_CPU_VAR(8(%rsi)), %rdx
+ cmpq __percpu 8(%rsi), %rdx
jne .Lnot_same
/* *ptr = new */
- movq %rbx, PER_CPU_VAR(0(%rsi))
- movq %rcx, PER_CPU_VAR(8(%rsi))
+ movq %rbx, __percpu (%rsi)
+ movq %rcx, __percpu 8(%rsi)
/* set ZF in EFLAGS to indicate success */
orl $X86_EFLAGS_ZF, (%rsp)
@@ -42,8 +42,8 @@ SYM_FUNC_START(this_cpu_cmpxchg16b_emu)
/* *ptr != old */
/* old = *ptr */
- movq PER_CPU_VAR(0(%rsi)), %rax
- movq PER_CPU_VAR(8(%rsi)), %rdx
+ movq __percpu (%rsi), %rax
+ movq __percpu 8(%rsi), %rdx
/* clear ZF in EFLAGS to indicate failure */
andl $(~X86_EFLAGS_ZF), (%rsp)
diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S
index 873e4ef23e49..1c96be769adc 100644
--- a/arch/x86/lib/cmpxchg8b_emu.S
+++ b/arch/x86/lib/cmpxchg8b_emu.S
@@ -24,12 +24,12 @@ SYM_FUNC_START(cmpxchg8b_emu)
pushfl
cli
- cmpl 0(%esi), %eax
+ cmpl (%esi), %eax
jne .Lnot_same
cmpl 4(%esi), %edx
jne .Lnot_same
- movl %ebx, 0(%esi)
+ movl %ebx, (%esi)
movl %ecx, 4(%esi)
orl $X86_EFLAGS_ZF, (%esp)
@@ -38,7 +38,7 @@ SYM_FUNC_START(cmpxchg8b_emu)
RET
.Lnot_same:
- movl 0(%esi), %eax
+ movl (%esi), %eax
movl 4(%esi), %edx
andl $(~X86_EFLAGS_ZF), (%esp)
@@ -53,18 +53,30 @@ EXPORT_SYMBOL(cmpxchg8b_emu)
#ifndef CONFIG_UML
+/*
+ * Emulate 'cmpxchg8b %fs:(%rsi)'
+ *
+ * Inputs:
+ * %esi : memory location to compare
+ * %eax : low 32 bits of old value
+ * %edx : high 32 bits of old value
+ * %ebx : low 32 bits of new value
+ * %ecx : high 32 bits of new value
+ *
+ * Notably this is not LOCK prefixed and is not safe against NMIs
+ */
SYM_FUNC_START(this_cpu_cmpxchg8b_emu)
pushfl
cli
- cmpl PER_CPU_VAR(0(%esi)), %eax
+ cmpl __percpu (%esi), %eax
jne .Lnot_same2
- cmpl PER_CPU_VAR(4(%esi)), %edx
+ cmpl __percpu 4(%esi), %edx
jne .Lnot_same2
- movl %ebx, PER_CPU_VAR(0(%esi))
- movl %ecx, PER_CPU_VAR(4(%esi))
+ movl %ebx, __percpu (%esi)
+ movl %ecx, __percpu 4(%esi)
orl $X86_EFLAGS_ZF, (%esp)
@@ -72,8 +84,8 @@ SYM_FUNC_START(this_cpu_cmpxchg8b_emu)
RET
.Lnot_same2:
- movl PER_CPU_VAR(0(%esi)), %eax
- movl PER_CPU_VAR(4(%esi)), %edx
+ movl __percpu (%esi), %eax
+ movl __percpu 4(%esi), %edx
andl $(~X86_EFLAGS_ZF), (%esp)
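A C model of what both emulation paths implement; as the new comment notes, interrupts are disabled around it but there is no LOCK, so this is safe against preemption, not against NMIs:

	u64 old = ((u64)edx << 32) | eax;
	u64 new = ((u64)ecx << 32) | ebx;
	if (*ptr == old) {
		*ptr = new;
		ZF = 1;			/* success */
	} else {
		edx = *ptr >> 32;	/* report the value found */
		eax = (u32)*ptr;
		ZF = 0;
	}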
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index 558a605929db..98631c0e7a11 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -1129,15 +1129,15 @@ static int get_eff_addr_modrm_16(struct insn *insn, struct pt_regs *regs,
* get_eff_addr_sib() - Obtain referenced effective address via SIB
* @insn: Instruction. Must be valid.
* @regs: Register values as seen when entering kernel mode
- * @regoff: Obtained operand offset, in pt_regs, associated with segment
+ * @base_offset: Obtained operand offset, in pt_regs, associated with segment
* @eff_addr: Obtained effective address
*
* Obtain the effective address referenced by the SIB byte of @insn. After
* identifying the registers involved in the indexed, register-indirect memory
* reference, its value is obtained from the operands in @regs. The computed
* address is stored @eff_addr. Also, the register operand that indicates the
- * associated segment is stored in @regoff, this parameter can later be used to
- * determine such segment.
+ * associated segment is stored in @base_offset; this parameter can later be
+ * used to determine such segment.
*
* Returns:
*
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 55e371cc69fd..1bb155a0955b 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -71,7 +71,7 @@ void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64)
insn->kaddr = kaddr;
insn->end_kaddr = kaddr + buf_len;
insn->next_byte = kaddr;
- insn->x86_64 = x86_64 ? 1 : 0;
+ insn->x86_64 = x86_64;
insn->opnd_bytes = 4;
if (x86_64)
insn->addr_bytes = 8;
@@ -268,11 +268,9 @@ int insn_get_opcode(struct insn *insn)
if (opcode->got)
return 0;
- if (!insn->prefixes.got) {
- ret = insn_get_prefixes(insn);
- if (ret)
- return ret;
- }
+ ret = insn_get_prefixes(insn);
+ if (ret)
+ return ret;
/* Get first opcode */
op = get_next(insn_byte_t, insn);
@@ -339,11 +337,9 @@ int insn_get_modrm(struct insn *insn)
if (modrm->got)
return 0;
- if (!insn->opcode.got) {
- ret = insn_get_opcode(insn);
- if (ret)
- return ret;
- }
+ ret = insn_get_opcode(insn);
+ if (ret)
+ return ret;
if (inat_has_modrm(insn->attr)) {
mod = get_next(insn_byte_t, insn);
@@ -386,11 +382,9 @@ int insn_rip_relative(struct insn *insn)
if (!insn->x86_64)
return 0;
- if (!modrm->got) {
- ret = insn_get_modrm(insn);
- if (ret)
- return 0;
- }
+ ret = insn_get_modrm(insn);
+ if (ret)
+ return 0;
/*
* For rip-relative instructions, the mod field (top 2 bits)
* is zero and the r/m field (bottom 3 bits) is 0x5.
@@ -417,11 +411,9 @@ int insn_get_sib(struct insn *insn)
if (insn->sib.got)
return 0;
- if (!insn->modrm.got) {
- ret = insn_get_modrm(insn);
- if (ret)
- return ret;
- }
+ ret = insn_get_modrm(insn);
+ if (ret)
+ return ret;
if (insn->modrm.nbytes) {
modrm = insn->modrm.bytes[0];
@@ -460,11 +452,9 @@ int insn_get_displacement(struct insn *insn)
if (insn->displacement.got)
return 0;
- if (!insn->sib.got) {
- ret = insn_get_sib(insn);
- if (ret)
- return ret;
- }
+ ret = insn_get_sib(insn);
+ if (ret)
+ return ret;
if (insn->modrm.nbytes) {
/*
@@ -628,11 +618,9 @@ int insn_get_immediate(struct insn *insn)
if (insn->immediate.got)
return 0;
- if (!insn->displacement.got) {
- ret = insn_get_displacement(insn);
- if (ret)
- return ret;
- }
+ ret = insn_get_displacement(insn);
+ if (ret)
+ return ret;
if (inat_has_moffset(insn->attr)) {
if (!__get_moffset(insn))
@@ -703,11 +691,9 @@ int insn_get_length(struct insn *insn)
if (insn->length)
return 0;
- if (!insn->immediate.got) {
- ret = insn_get_immediate(insn);
- if (ret)
- return ret;
- }
+ ret = insn_get_immediate(insn);
+ if (ret)
+ return ret;
insn->length = (unsigned char)((unsigned long)insn->next_byte
- (unsigned long)insn->kaddr);
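Every hunk in this file is the same memoization cleanup: each insn_get_*() stage already bails out early on its own 'got' flag, so callers can invoke their prerequisite unconditionally instead of guarding it themselves. The shape every stage now follows (generic sketch, not any one function):

	int insn_get_stage(struct insn *insn)
	{
		if (insn->stage.got)
			return 0;		/* already decoded */
		ret = insn_get_prev_stage(insn);	/* cheap if memoized */
		if (ret)
			return ret;
		/* ... decode this stage, set insn->stage.got ... */
	}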
diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c
index 40bbe56bde32..acd463d887e1 100644
--- a/arch/x86/lib/msr-smp.c
+++ b/arch/x86/lib/msr-smp.c
@@ -9,10 +9,9 @@ static void __rdmsr_on_cpu(void *info)
{
struct msr_info *rv = info;
struct msr *reg;
- int this_cpu = raw_smp_processor_id();
if (rv->msrs)
- reg = per_cpu_ptr(rv->msrs, this_cpu);
+ reg = this_cpu_ptr(rv->msrs);
else
reg = &rv->reg;
@@ -23,10 +22,9 @@ static void __wrmsr_on_cpu(void *info)
{
struct msr_info *rv = info;
struct msr *reg;
- int this_cpu = raw_smp_processor_id();
if (rv->msrs)
- reg = per_cpu_ptr(rv->msrs, this_cpu);
+ reg = this_cpu_ptr(rv->msrs);
else
reg = &rv->reg;
@@ -97,7 +95,7 @@ int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
EXPORT_SYMBOL(wrmsrl_on_cpu);
static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no,
- struct msr *msrs,
+ struct msr __percpu *msrs,
void (*msr_func) (void *info))
{
struct msr_info rv;
@@ -124,7 +122,7 @@ static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no,
* @msrs: array of MSR values
*
*/
-void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
+void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs)
{
__rwmsr_on_cpus(mask, msr_no, msrs, __rdmsr_on_cpu);
}
@@ -138,7 +136,7 @@ EXPORT_SYMBOL(rdmsr_on_cpus);
* @msrs: array of MSR values
*
*/
-void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
+void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs)
{
__rwmsr_on_cpus(mask, msr_no, msrs, __wrmsr_on_cpu);
}
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 47fd9bd6b91d..4bf4fad5b148 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -6,9 +6,9 @@
#define CREATE_TRACE_POINTS
#include <asm/msr-trace.h>
-struct msr *msrs_alloc(void)
+struct msr __percpu *msrs_alloc(void)
{
- struct msr *msrs = NULL;
+ struct msr __percpu *msrs = NULL;
msrs = alloc_percpu(struct msr);
if (!msrs) {
@@ -20,7 +20,7 @@ struct msr *msrs_alloc(void)
}
EXPORT_SYMBOL(msrs_alloc);
-void msrs_free(struct msr *msrs)
+void msrs_free(struct msr __percpu *msrs)
{
free_percpu(msrs);
}
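The __percpu annotation compiles away but lets sparse flag code that mixes per-cpu cookies with plain pointers. A hedged usage sketch (MSR_IA32_TSC chosen only as a familiar constant):

	struct msr __percpu *msrs = msrs_alloc();
	if (msrs) {
		rdmsr_on_cpus(cpu_online_mask, MSR_IA32_TSC, msrs);
		struct msr *mine = this_cpu_ptr(msrs);	/* plain again */
		pr_info("tsc: lo=%#x hi=%#x\n", mine->l, mine->h);
		msrs_free(msrs);
	}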
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index 7b2589877d06..721b528da9ac 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -71,7 +71,7 @@ SYM_CODE_END(__x86_indirect_thunk_array)
#include <asm/GEN-for-each-reg.h>
#undef GEN
-#ifdef CONFIG_CALL_DEPTH_TRACKING
+#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
.macro CALL_THUNK reg
.align RETPOLINE_THUNK_SIZE
@@ -127,7 +127,7 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array)
#undef GEN
#endif
-#ifdef CONFIG_RETHUNK
+#ifdef CONFIG_MITIGATION_RETHUNK
/*
* Be careful here: that label cannot really be removed because in
@@ -138,7 +138,7 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array)
*/
.section .text..__x86.return_thunk
-#ifdef CONFIG_CPU_SRSO
+#ifdef CONFIG_MITIGATION_SRSO
/*
* srso_alias_untrain_ret() and srso_alias_safe_ret() are placed at
@@ -225,12 +225,12 @@ SYM_CODE_END(srso_return_thunk)
#define JMP_SRSO_UNTRAIN_RET "jmp srso_untrain_ret"
#define JMP_SRSO_ALIAS_UNTRAIN_RET "jmp srso_alias_untrain_ret"
-#else /* !CONFIG_CPU_SRSO */
+#else /* !CONFIG_MITIGATION_SRSO */
#define JMP_SRSO_UNTRAIN_RET "ud2"
#define JMP_SRSO_ALIAS_UNTRAIN_RET "ud2"
-#endif /* CONFIG_CPU_SRSO */
+#endif /* CONFIG_MITIGATION_SRSO */
-#ifdef CONFIG_CPU_UNRET_ENTRY
+#ifdef CONFIG_MITIGATION_UNRET_ENTRY
/*
* Some generic notes on the untraining sequences:
@@ -312,11 +312,11 @@ SYM_CODE_END(retbleed_return_thunk)
SYM_FUNC_END(retbleed_untrain_ret)
#define JMP_RETBLEED_UNTRAIN_RET "jmp retbleed_untrain_ret"
-#else /* !CONFIG_CPU_UNRET_ENTRY */
+#else /* !CONFIG_MITIGATION_UNRET_ENTRY */
#define JMP_RETBLEED_UNTRAIN_RET "ud2"
-#endif /* CONFIG_CPU_UNRET_ENTRY */
+#endif /* CONFIG_MITIGATION_UNRET_ENTRY */
-#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)
+#if defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)
SYM_FUNC_START(entry_untrain_ret)
ALTERNATIVE_2 JMP_RETBLEED_UNTRAIN_RET, \
@@ -325,9 +325,9 @@ SYM_FUNC_START(entry_untrain_ret)
SYM_FUNC_END(entry_untrain_ret)
__EXPORT_THUNK(entry_untrain_ret)
-#endif /* CONFIG_CPU_UNRET_ENTRY || CONFIG_CPU_SRSO */
+#endif /* CONFIG_MITIGATION_UNRET_ENTRY || CONFIG_MITIGATION_SRSO */
-#ifdef CONFIG_CALL_DEPTH_TRACKING
+#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
.align 64
SYM_FUNC_START(call_depth_return_thunk)
@@ -359,7 +359,7 @@ SYM_FUNC_START(call_depth_return_thunk)
int3
SYM_FUNC_END(call_depth_return_thunk)
-#endif /* CONFIG_CALL_DEPTH_TRACKING */
+#endif /* CONFIG_MITIGATION_CALL_DEPTH_TRACKING */
/*
* This function name is magical and is used by -mfunction-return=thunk-extern
@@ -369,21 +369,18 @@ SYM_FUNC_END(call_depth_return_thunk)
* 'JMP __x86_return_thunk' sites are changed to something else by
* apply_returns().
*
- * This should be converted eventually to call a warning function which
- * should scream loudly when the default return thunk is called after
- * alternatives have been applied.
- *
- * That warning function cannot BUG() because the bug splat cannot be
- * displayed in all possible configurations, leading to users not really
- * knowing why the machine froze.
+ * The ALTERNATIVE below adds a really loud warning to catch the case
+ * where the insufficient default return thunk ends up getting used for
+ * whatever reason like miscompilation or failure of
+ * objtool/alternatives/etc to patch all the return sites.
*/
SYM_CODE_START(__x86_return_thunk)
UNWIND_HINT_FUNC
ANNOTATE_NOENDBR
- ANNOTATE_UNRET_SAFE
- ret
+ ALTERNATIVE __stringify(ANNOTATE_UNRET_SAFE; ret), \
+ "jmp warn_thunk_thunk", X86_FEATURE_ALWAYS
int3
SYM_CODE_END(__x86_return_thunk)
EXPORT_SYMBOL(__x86_return_thunk)
-#endif /* CONFIG_RETHUNK */
+#endif /* CONFIG_MITIGATION_RETHUNK */
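A note on the ALTERNATIVE in __x86_return_thunk: X86_FEATURE_ALWAYS is set on every CPU, so once alternatives are applied the body is patched to 'jmp warn_thunk_thunk'; the ANNOTATE_UNRET_SAFE; ret sequence only ever executes before patching. That delivers exactly what the replaced comment asked for: a loud runtime warning, rather than a silent insufficient thunk, if miscompilation or incomplete objtool patching leaves a return site pointing here.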
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index c80febc44cd2..428048e73bd2 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -16,6 +16,7 @@ KASAN_SANITIZE_pgprot.o := n
KCSAN_SANITIZE := n
# Avoid recursion by not calling KMSAN hooks for CEA code.
KMSAN_SANITIZE_cpu_entry_area.o := n
+KMSAN_SANITIZE_mem_encrypt_identity.o := n
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_mem_encrypt.o = -pg
@@ -60,7 +61,7 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
-obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
+obj-$(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION) += pti.o
obj-$(CONFIG_X86_MEM_ENCRYPT) += mem_encrypt.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_amd.o
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
index b43301cb2a80..ae5c213a1cb0 100644
--- a/arch/x86/mm/debug_pagetables.c
+++ b/arch/x86/mm/debug_pagetables.c
@@ -22,7 +22,7 @@ static int ptdump_curknl_show(struct seq_file *m, void *v)
DEFINE_SHOW_ATTRIBUTE(ptdump_curknl);
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
static int ptdump_curusr_show(struct seq_file *m, void *v)
{
if (current->mm->pgd)
@@ -54,7 +54,7 @@ static int __init pt_dump_debug_init(void)
debugfs_create_file("current_kernel", 0400, dir, NULL,
&ptdump_curknl_fops);
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
debugfs_create_file("current_user", 0400, dir, NULL,
&ptdump_curusr_fops);
#endif
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index e1b599ecbbc2..b7b88c1d91ec 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -408,7 +408,7 @@ void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
bool user)
{
pgd_t *pgd = mm->pgd;
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
if (user && boot_cpu_has(X86_FEATURE_PTI))
pgd = kernel_to_user_pgdp(pgd);
#endif
@@ -418,7 +418,7 @@ EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
void ptdump_walk_user_pgd_level_checkwx(void)
{
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
pgd_t *pgd = INIT_PGD;
if (!(__supported_pte_mask & _PAGE_NX) ||
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index e1ac86396446..402e08f6b7ec 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -35,6 +35,7 @@
#include <asm/vdso.h> /* fixup_vdso_exception() */
#include <asm/irq_stack.h>
#include <asm/fred.h>
+#include <asm/sev.h> /* snp_dump_hva_rmpentry() */
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@@ -548,6 +549,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
!(error_code & X86_PF_PROT) ? "not-present page" :
(error_code & X86_PF_RSVD) ? "reserved bit violation" :
(error_code & X86_PF_PK) ? "protection keys violation" :
+ (error_code & X86_PF_RMP) ? "RMP violation" :
"permissions violation");
if (!(error_code & X86_PF_USER) && user_mode(regs)) {
@@ -580,6 +582,9 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad
}
dump_pagetable(address);
+
+ if (error_code & X86_PF_RMP)
+ snp_dump_hva_rmpentry(address);
}
static noinline void
@@ -1294,21 +1299,14 @@ void do_user_addr_fault(struct pt_regs *regs,
return;
}
- /*
- * It's safe to allow irq's after cr2 has been saved and the
- * vmalloc fault has been handled.
- *
- * User-mode registers count as a user access even for any
- * potential system fault or CPU buglet:
- */
- if (user_mode(regs)) {
- local_irq_enable();
- flags |= FAULT_FLAG_USER;
- } else {
- if (regs->flags & X86_EFLAGS_IF)
- local_irq_enable();
+ /* Legacy check - remove this after verifying that it doesn't trigger */
+ if (WARN_ON_ONCE(!(regs->flags & X86_EFLAGS_IF))) {
+ bad_area_nosemaphore(regs, error_code, address);
+ return;
}
+ local_irq_enable();
+
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
/*
@@ -1324,6 +1322,14 @@ void do_user_addr_fault(struct pt_regs *regs,
if (error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
+ /*
+ * We set FAULT_FLAG_USER based on the register state, not
+ * based on X86_PF_USER. User space accesses that cause
+ * system page faults are still user accesses.
+ */
+ if (user_mode(regs))
+ flags |= FAULT_FLAG_USER;
+
#ifdef CONFIG_X86_64
/*
* Faults in the vsyscall page might need emulation. The
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index c290c55b632b..6f3b3e028718 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -14,6 +14,8 @@
#include <linux/mem_encrypt.h>
#include <linux/virtio_anchor.h>
+#include <asm/sev.h>
+
/* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
bool force_dma_unencrypted(struct device *dev)
{
@@ -42,38 +44,45 @@ bool force_dma_unencrypted(struct device *dev)
static void print_mem_encrypt_feature_info(void)
{
- pr_info("Memory Encryption Features active:");
-
- if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
- pr_cont(" Intel TDX\n");
- return;
- }
+ pr_info("Memory Encryption Features active: ");
- pr_cont(" AMD");
+ switch (cc_vendor) {
+ case CC_VENDOR_INTEL:
+ pr_cont("Intel TDX\n");
+ break;
+ case CC_VENDOR_AMD:
+ pr_cont("AMD");
- /* Secure Memory Encryption */
- if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) {
+ /* Secure Memory Encryption */
+ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) {
/*
* SME is mutually exclusive with any of the SEV
* features below.
- */
- pr_cont(" SME\n");
- return;
- }
+ */
+ pr_cont(" SME\n");
+ return;
+ }
- /* Secure Encrypted Virtualization */
- if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
- pr_cont(" SEV");
+ /* Secure Encrypted Virtualization */
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ pr_cont(" SEV");
+
+ /* Encrypted Register State */
+ if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
+ pr_cont(" SEV-ES");
- /* Encrypted Register State */
- if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
- pr_cont(" SEV-ES");
+ /* Secure Nested Paging */
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
+ pr_cont(" SEV-SNP");
- /* Secure Nested Paging */
- if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
- pr_cont(" SEV-SNP");
+ pr_cont("\n");
- pr_cont("\n");
+ sev_show_status();
+
+ break;
+ default:
+ pr_cont("Unknown\n");
+ }
}
/* Architecture __weak replacement functions */
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index d73aeb16417f..64b5005d49e5 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -41,9 +41,9 @@
#include <linux/mem_encrypt.h>
#include <linux/cc_platform.h>
+#include <asm/init.h>
#include <asm/setup.h>
#include <asm/sections.h>
-#include <asm/cmdline.h>
#include <asm/coco.h>
#include <asm/sev.h>
@@ -95,11 +95,7 @@ struct sme_populate_pgd_data {
*/
static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch");
-static char sme_cmdline_arg[] __initdata = "mem_encrypt";
-static char sme_cmdline_on[] __initdata = "on";
-static char sme_cmdline_off[] __initdata = "off";
-
-static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
+static void __head sme_clear_pgd(struct sme_populate_pgd_data *ppd)
{
unsigned long pgd_start, pgd_end, pgd_size;
pgd_t *pgd_p;
@@ -114,7 +110,7 @@ static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd)
memset(pgd_p, 0, pgd_size);
}
-static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
+static pud_t __head *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -151,7 +147,7 @@ static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd)
return pud;
}
-static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
+static void __head sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
{
pud_t *pud;
pmd_t *pmd;
@@ -167,7 +163,7 @@ static void __init sme_populate_pgd_large(struct sme_populate_pgd_data *ppd)
set_pmd(pmd, __pmd(ppd->paddr | ppd->pmd_flags));
}
-static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
+static void __head sme_populate_pgd(struct sme_populate_pgd_data *ppd)
{
pud_t *pud;
pmd_t *pmd;
@@ -193,7 +189,7 @@ static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
set_pte(pte, __pte(ppd->paddr | ppd->pte_flags));
}
-static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
+static void __head __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
{
while (ppd->vaddr < ppd->vaddr_end) {
sme_populate_pgd_large(ppd);
@@ -203,7 +199,7 @@ static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
}
}
-static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
+static void __head __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
{
while (ppd->vaddr < ppd->vaddr_end) {
sme_populate_pgd(ppd);
@@ -213,7 +209,7 @@ static void __init __sme_map_range_pte(struct sme_populate_pgd_data *ppd)
}
}
-static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
+static void __head __sme_map_range(struct sme_populate_pgd_data *ppd,
pmdval_t pmd_flags, pteval_t pte_flags)
{
unsigned long vaddr_end;
@@ -237,22 +233,22 @@ static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
__sme_map_range_pte(ppd);
}
-static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
+static void __head sme_map_range_encrypted(struct sme_populate_pgd_data *ppd)
{
__sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC);
}
-static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
+static void __head sme_map_range_decrypted(struct sme_populate_pgd_data *ppd)
{
__sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC);
}
-static void __init sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
+static void __head sme_map_range_decrypted_wp(struct sme_populate_pgd_data *ppd)
{
__sme_map_range(ppd, PMD_FLAGS_DEC_WP, PTE_FLAGS_DEC_WP);
}
-static unsigned long __init sme_pgtable_calc(unsigned long len)
+static unsigned long __head sme_pgtable_calc(unsigned long len)
{
unsigned long entries = 0, tables = 0;
@@ -289,7 +285,7 @@ static unsigned long __init sme_pgtable_calc(unsigned long len)
return entries + tables;
}
-void __init sme_encrypt_kernel(struct boot_params *bp)
+void __head sme_encrypt_kernel(struct boot_params *bp)
{
unsigned long workarea_start, workarea_end, workarea_len;
unsigned long execute_start, execute_end, execute_len;
@@ -305,7 +301,8 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
* instrumentation or checking boot_cpu_data in the cc_platform_has()
* function.
*/
- if (!sme_get_me_mask() || sev_status & MSR_AMD64_SEV_ENABLED)
+ if (!sme_get_me_mask() ||
+ RIP_REL_REF(sev_status) & MSR_AMD64_SEV_ENABLED)
return;
/*
@@ -323,9 +320,8 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
* memory from being cached.
*/
- /* Physical addresses gives us the identity mapped virtual addresses */
- kernel_start = __pa_symbol(_text);
- kernel_end = ALIGN(__pa_symbol(_end), PMD_SIZE);
+ kernel_start = (unsigned long)RIP_REL_REF(_text);
+ kernel_end = ALIGN((unsigned long)RIP_REL_REF(_end), PMD_SIZE);
kernel_len = kernel_end - kernel_start;
initrd_start = 0;
@@ -343,14 +339,6 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
#endif
/*
- * We're running identity mapped, so we must obtain the address to the
- * SME encryption workarea using rip-relative addressing.
- */
- asm ("lea sme_workarea(%%rip), %0"
- : "=r" (workarea_start)
- : "p" (sme_workarea));
-
- /*
* Calculate required number of workarea bytes needed:
* executable encryption area size:
* stack page (PAGE_SIZE)
@@ -359,7 +347,7 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
* pagetable structures for the encryption of the kernel
* pagetable structures for workarea (in case not currently mapped)
*/
- execute_start = workarea_start;
+ execute_start = workarea_start = (unsigned long)RIP_REL_REF(sme_workarea);
execute_end = execute_start + (PAGE_SIZE * 2) + PMD_SIZE;
execute_len = execute_end - execute_start;
@@ -502,14 +490,11 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
native_write_cr3(__native_read_cr3());
}
-void __init sme_enable(struct boot_params *bp)
+void __head sme_enable(struct boot_params *bp)
{
- const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
unsigned int eax, ebx, ecx, edx;
unsigned long feature_mask;
- bool active_by_default;
unsigned long me_mask;
- char buffer[16];
bool snp;
u64 msr;
@@ -543,15 +528,18 @@ void __init sme_enable(struct boot_params *bp)
me_mask = 1UL << (ebx & 0x3f);
/* Check the SEV MSR whether SEV or SME is enabled */
- sev_status = __rdmsr(MSR_AMD64_SEV);
- feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
+ RIP_REL_REF(sev_status) = msr = __rdmsr(MSR_AMD64_SEV);
+ feature_mask = (msr & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
/* The SEV-SNP CC blob should never be present unless SEV-SNP is enabled. */
- if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+ if (snp && !(msr & MSR_AMD64_SEV_SNP_ENABLED))
snp_abort();
/* Check if memory encryption is enabled */
if (feature_mask == AMD_SME_BIT) {
+ if (!(bp->hdr.xloadflags & XLF_MEM_ENCRYPTION))
+ return;
+
/*
* No SME if Hypervisor bit is set. This check is here to
* prevent a guest from trying to enable SME. For running as a
@@ -571,48 +559,10 @@ void __init sme_enable(struct boot_params *bp)
msr = __rdmsr(MSR_AMD64_SYSCFG);
if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
return;
- } else {
- /* SEV state cannot be controlled by a command line option */
- sme_me_mask = me_mask;
- goto out;
}
- /*
- * Fixups have not been applied to phys_base yet and we're running
- * identity mapped, so we must obtain the address to the SME command
- * line argument data using rip-relative addressing.
- */
- asm ("lea sme_cmdline_arg(%%rip), %0"
- : "=r" (cmdline_arg)
- : "p" (sme_cmdline_arg));
- asm ("lea sme_cmdline_on(%%rip), %0"
- : "=r" (cmdline_on)
- : "p" (sme_cmdline_on));
- asm ("lea sme_cmdline_off(%%rip), %0"
- : "=r" (cmdline_off)
- : "p" (sme_cmdline_off));
-
- if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT))
- active_by_default = true;
- else
- active_by_default = false;
-
- cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
- ((u64)bp->ext_cmd_line_ptr << 32));
-
- if (cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)) < 0)
- return;
-
- if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
- sme_me_mask = me_mask;
- else if (!strncmp(buffer, cmdline_off, sizeof(buffer)))
- sme_me_mask = 0;
- else
- sme_me_mask = active_by_default ? me_mask : 0;
-out:
- if (sme_me_mask) {
- physical_mask &= ~sme_me_mask;
- cc_vendor = CC_VENDOR_AMD;
- cc_set_mask(sme_me_mask);
- }
+ RIP_REL_REF(sme_me_mask) = me_mask;
+ physical_mask &= ~me_mask;
+ cc_vendor = CC_VENDOR_AMD;
+ cc_set_mask(me_mask);
}
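The RIP_REL_REF() accesses introduced above replace the removed open-coded "lea <sym>(%rip)" asm: while this code runs identity-mapped, before relocation fixups, a symbol must be addressed relative to the executing instruction rather than through its link-time virtual address. A sketch of how such a macro can be built (this may differ in detail from the kernel's actual definition):

/* Force a RIP-relative LEA so the address is computed from the
 * currently executing code, which is correct even before the
 * kernel's page-table fixups have been applied. */
static __always_inline void *rip_rel_ptr(void *p)
{
	asm("leaq %c1(%%rip), %0" : "=r" (p) : "i" (p));
	return p;
}

#define RIP_REL_REF(var)	(*(typeof(&(var)))rip_rel_ptr(&(var)))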
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index 0904d7e8e126..0d72183b5dd0 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -240,6 +240,8 @@ void pat_cpu_init(void)
}
wrmsrl(MSR_IA32_CR_PAT, pat_msr_val);
+
+ __flush_tlb_all();
}
/**
@@ -296,13 +298,8 @@ void __init pat_bp_init(void)
/*
* Xen PV doesn't allow to set PAT MSR, but all cache modes are
* supported.
- * When running as TDX guest setting the PAT MSR won't work either
- * due to the requirement to set CR0.CD when doing so. Rely on
- * firmware to have set the PAT MSR correctly.
*/
- if (pat_disabled ||
- cpu_feature_enabled(X86_FEATURE_XENPV) ||
- cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
+ if (pat_disabled || cpu_feature_enabled(X86_FEATURE_XENPV)) {
init_cache_modes(pat_msr_val);
return;
}
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 102880404046..e5b2985a7c51 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -2157,7 +2157,7 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
/* Notify hypervisor that we are about to set/clr encryption attribute. */
if (!x86_platform.guest.enc_status_change_prepare(addr, numpages, enc))
- return -EIO;
+ goto vmm_fail;
ret = __change_page_attr_set_clr(&cpa, 1);
@@ -2170,13 +2170,20 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
*/
cpa_flush(&cpa, 0);
+ if (ret)
+ return ret;
+
/* Notify hypervisor that we have successfully set/clr encryption attribute. */
- if (!ret) {
- if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc))
- ret = -EIO;
- }
+ if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc))
+ goto vmm_fail;
- return ret;
+ return 0;
+
+vmm_fail:
+ WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s.\n",
+ (void *)addr, numpages, enc ? "private" : "shared");
+
+ return -EIO;
}
static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
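The rework above funnels both hypervisor-notification failures into a single warning path. For readers tracing the flow, a condensed sketch of the resulting three-step contract (page-table manipulation elided, names as in the hunk):

/* Condensed sketch of __set_memory_enc_pgtable() after this change. */
static int enc_convert_sketch(unsigned long addr, int numpages, bool enc)
{
	/* 1: tell the hypervisor a private<->shared transition is coming */
	if (!x86_platform.guest.enc_status_change_prepare(addr, numpages, enc))
		goto vmm_fail;

	/* 2: flip the C-bit in the page tables, then flush TLB/caches */

	/* 3: tell the hypervisor the transition completed */
	if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc))
		goto vmm_fail;

	return 0;

vmm_fail:
	WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s.\n",
		  (void *)addr, numpages, enc ? "private" : "shared");
	return -EIO;
}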
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 0cbc1b8e8e3d..cceb779d882d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -293,7 +293,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
for (i = 0; i < PREALLOCATED_PMDS; i++)
mop_up_one_pmd(mm, &pgdp[i]);
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
if (!boot_cpu_has(X86_FEATURE_PTI))
return;
@@ -325,7 +325,7 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
}
}
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
pgd_t *k_pgd, pmd_t *pmds[])
{
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5768d386efab..4af930947380 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -89,10 +89,10 @@
#define CR3_HW_ASID_BITS 12
/*
- * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
+ * When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for
* user/kernel switches
*/
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
# define PTI_CONSUMED_PCID_BITS 1
#else
# define PTI_CONSUMED_PCID_BITS 0
@@ -114,7 +114,7 @@ static inline u16 kern_pcid(u16 asid)
{
VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/*
* Make sure that the dynamic ASID space does not conflict with the
* bit we are using to switch between user and kernel ASIDs.
@@ -149,7 +149,7 @@ static inline u16 kern_pcid(u16 asid)
static inline u16 user_pcid(u16 asid)
{
u16 ret = kern_pcid(asid);
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
#endif
return ret;
@@ -262,7 +262,7 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
static inline void invalidate_user_asid(u16 asid)
{
/* There is no user ASID if address space separation is off */
- if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+ if (!IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION))
return;
/*
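As the comments in this file describe, PTI consumes one of the 12 hardware PCID bits to keep user and kernel translations apart. A standalone demo of the resulting numbering, assuming (as in current kernels) that kern_pcid() maps ASID n to PCID n + 1 so PCID 0 stays reserved, and that the user bit is bit 11:

#include <stdio.h>

#define DEMO_PTI_PCID_USER_BIT	11

/* Mirrors kern_pcid()/user_pcid(): one stolen bit splits the PCID space. */
static unsigned int demo_kern_pcid(unsigned int asid) { return asid + 1; }
static unsigned int demo_user_pcid(unsigned int asid)
{
	return demo_kern_pcid(asid) | (1u << DEMO_PTI_PCID_USER_BIT);
}

int main(void)
{
	for (unsigned int asid = 0; asid < 3; asid++)
		printf("asid %u -> kernel PCID %u, user PCID %u\n",
		       asid, demo_kern_pcid(asid), demo_user_pcid(asid));
	return 0;
}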
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 919f647c740f..f3b4716317c1 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -553,7 +553,7 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip)
emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip);
} else {
EMIT2(0xFF, 0xE0 + reg); /* jmp *%\reg */
- if (IS_ENABLED(CONFIG_RETPOLINE) || IS_ENABLED(CONFIG_SLS))
+ if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) || IS_ENABLED(CONFIG_MITIGATION_SLS))
EMIT1(0xCC); /* int3 */
}
@@ -568,7 +568,7 @@ static void emit_return(u8 **pprog, u8 *ip)
emit_jump(&prog, x86_return_thunk, ip);
} else {
EMIT1(0xC3); /* ret */
- if (IS_ENABLED(CONFIG_SLS))
+ if (IS_ENABLED(CONFIG_MITIGATION_SLS))
EMIT1(0xCC); /* int3 */
}
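The int3 bytes emitted above implement the straight-line-speculation (SLS) mitigation: a trapping instruction directly follows each emitted control transfer so the CPU cannot usefully speculate past it. Illustrative byte sequences for reg = RAX (0xE0 = 0xE0 + 0), with the trailing 0xCC being the int3 added under the renamed mitigation options:

/* 'jmp *%rax' (FF E0) and 'ret' (C3), each padded with int3 (CC). */
static const unsigned char jmp_rax_with_sls[] = { 0xFF, 0xE0, 0xCC };
static const unsigned char ret_with_sls[]     = { 0xC3, 0xCC };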
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index b18ce19981ec..c10083a8e68e 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -1273,7 +1273,7 @@ static int emit_jmp_edx(u8 **pprog, u8 *ip)
u8 *prog = *pprog;
int cnt = 0;
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
EMIT1_off32(0xE9, (u8 *)__x86_indirect_thunk_edx - (ip + 5));
#else
EMIT2(0xFF, 0xE2);
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index e9f99c56f3ce..f090ec972d7b 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -950,3 +950,8 @@ umode_t efi_attr_is_visible(struct kobject *kobj, struct attribute *attr, int n)
}
return attr->mode;
}
+
+enum efi_secureboot_mode __x86_ima_efi_boot_mode(void)
+{
+ return boot_params.secure_boot;
+}
diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c
index 00a92cb2c814..944e0290f2c0 100644
--- a/arch/x86/platform/pvh/enlighten.c
+++ b/arch/x86/platform/pvh/enlighten.c
@@ -3,6 +3,7 @@
#include <xen/hvc-console.h>
+#include <asm/bootparam.h>
#include <asm/io_apic.h>
#include <asm/hypervisor.h>
#include <asm/e820/api.h>
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 08aa0f25f12a..bc31863c5ee6 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -61,7 +61,7 @@ ifdef CONFIG_STACKPROTECTOR_STRONG
PURGATORY_CFLAGS_REMOVE += -fstack-protector-strong
endif
-ifdef CONFIG_RETPOLINE
+ifdef CONFIG_MITIGATION_RETPOLINE
PURGATORY_CFLAGS_REMOVE += $(RETPOLINE_CFLAGS)
endif
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index c9f76fae902e..14d9c7daf90f 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -37,13 +37,15 @@
.text
.code16
-.macro LOCK_AND_LOAD_REALMODE_ESP lock_pa=0
+.macro LOCK_AND_LOAD_REALMODE_ESP lock_pa=0 lock_rip=0
/*
* Make sure only one CPU fiddles with the realmode stack
*/
.Llock_rm\@:
.if \lock_pa
lock btsl $0, pa_tr_lock
+ .elseif \lock_rip
+ lock btsl $0, tr_lock(%rip)
.else
lock btsl $0, tr_lock
.endif
@@ -220,6 +222,35 @@ SYM_CODE_START(trampoline_start64)
lidt tr_idt(%rip)
lgdt tr_gdt64(%rip)
+ /* Check if paging mode has to be changed */
+ movq %cr4, %rax
+ xorl tr_cr4(%rip), %eax
+ testl $X86_CR4_LA57, %eax
+ jnz .L_switch_paging
+
+ /* Paging mode is correct, proceed in 64-bit mode */
+
+ LOCK_AND_LOAD_REALMODE_ESP lock_rip=1
+
+ movw $__KERNEL_DS, %dx
+ movl %edx, %ss
+ addl $pa_real_mode_base, %esp
+ movl %edx, %ds
+ movl %edx, %es
+ movl %edx, %fs
+ movl %edx, %gs
+
+ movl $pa_trampoline_pgd, %eax
+ movq %rax, %cr3
+
+ pushq $__KERNEL_CS
+ pushq tr_start(%rip)
+ lretq
+.L_switch_paging:
+ /*
+ * To switch between 4- and 5-level paging modes, it is necessary
+ * to disable paging. This must be done in the compatibility mode.
+ */
ljmpl *tr_compat(%rip)
SYM_CODE_END(trampoline_start64)
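The "lock btsl" spin in LOCK_AND_LOAD_REALMODE_ESP serializes CPUs on the shared realmode stack, now addressable either via physical address or RIP-relative. A C analogue of the same test-and-set idiom, offered only as an illustration (the trampoline itself must remain assembly):

#include <stdatomic.h>

static atomic_uint tr_lock_demo;

/* Atomically test-and-set bit 0: only the caller that flips it 0 -> 1
 * proceeds; everyone else spins, just like 'lock btsl' plus a retry jump. */
static void lock_realmode_stack_demo(void)
{
	while (atomic_fetch_or(&tr_lock_demo, 1u) & 1u)
		;
}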
diff --git a/arch/x86/virt/svm/Makefile b/arch/x86/virt/svm/Makefile
new file mode 100644
index 000000000000..ef2a31bdcc70
--- /dev/null
+++ b/arch/x86/virt/svm/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_KVM_AMD_SEV) += sev.o
diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
new file mode 100644
index 000000000000..cffe1157a90a
--- /dev/null
+++ b/arch/x86/virt/svm/sev.c
@@ -0,0 +1,560 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD SVM-SEV Host Support.
+ *
+ * Copyright (C) 2023 Advanced Micro Devices, Inc.
+ *
+ * Author: Ashish Kalra <ashish.kalra@amd.com>
+ *
+ */
+
+#include <linux/cc_platform.h>
+#include <linux/printk.h>
+#include <linux/mm_types.h>
+#include <linux/set_memory.h>
+#include <linux/memblock.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/cpumask.h>
+#include <linux/iommu.h>
+#include <linux/amd-iommu.h>
+
+#include <asm/sev.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/svm.h>
+#include <asm/smp.h>
+#include <asm/cpu.h>
+#include <asm/apic.h>
+#include <asm/cpuid.h>
+#include <asm/cmdline.h>
+#include <asm/iommu.h>
+
+/*
+ * The RMP entry format is not architectural. The format is defined in the
+ * PPR for Family 19h Model 01h, Rev B1 processors.
+ */
+struct rmpentry {
+ union {
+ struct {
+ u64 assigned : 1,
+ pagesize : 1,
+ immutable : 1,
+ rsvd1 : 9,
+ gpa : 39,
+ asid : 10,
+ vmsa : 1,
+ validated : 1,
+ rsvd2 : 1;
+ };
+ u64 lo;
+ };
+ u64 hi;
+} __packed;
+
+/*
+ * The first 16KB from RMP_BASE is used by the processor for bookkeeping;
+ * this offset must be added to the base during RMP entry lookup.
+ */
+#define RMPTABLE_CPU_BOOKKEEPING_SZ 0x4000
+
+/* Mask to apply to a PFN to get the first PFN of a 2MB page */
+#define PFN_PMD_MASK GENMASK_ULL(63, PMD_SHIFT - PAGE_SHIFT)
+
+static u64 probed_rmp_base, probed_rmp_size;
+static struct rmpentry *rmptable __ro_after_init;
+static u64 rmptable_max_pfn __ro_after_init;
+
+static LIST_HEAD(snp_leaked_pages_list);
+static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);
+
+static unsigned long snp_nr_leaked_pages;
+
+#undef pr_fmt
+#define pr_fmt(fmt) "SEV-SNP: " fmt
+
+static int __mfd_enable(unsigned int cpu)
+{
+ u64 val;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
+ return 0;
+
+ rdmsrl(MSR_AMD64_SYSCFG, val);
+
+ val |= MSR_AMD64_SYSCFG_MFDM;
+
+ wrmsrl(MSR_AMD64_SYSCFG, val);
+
+ return 0;
+}
+
+static __init void mfd_enable(void *arg)
+{
+ __mfd_enable(smp_processor_id());
+}
+
+static int __snp_enable(unsigned int cpu)
+{
+ u64 val;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
+ return 0;
+
+ rdmsrl(MSR_AMD64_SYSCFG, val);
+
+ val |= MSR_AMD64_SYSCFG_SNP_EN;
+ val |= MSR_AMD64_SYSCFG_SNP_VMPL_EN;
+
+ wrmsrl(MSR_AMD64_SYSCFG, val);
+
+ return 0;
+}
+
+static __init void snp_enable(void *arg)
+{
+ __snp_enable(smp_processor_id());
+}
+
+#define RMP_ADDR_MASK GENMASK_ULL(51, 13)
+
+bool snp_probe_rmptable_info(void)
+{
+ u64 max_rmp_pfn, calc_rmp_sz, rmp_sz, rmp_base, rmp_end;
+
+ rdmsrl(MSR_AMD64_RMP_BASE, rmp_base);
+ rdmsrl(MSR_AMD64_RMP_END, rmp_end);
+
+ if (!(rmp_base & RMP_ADDR_MASK) || !(rmp_end & RMP_ADDR_MASK)) {
+ pr_err("Memory for the RMP table has not been reserved by BIOS\n");
+ return false;
+ }
+
+ if (rmp_base > rmp_end) {
+ pr_err("RMP configuration not valid: base=%#llx, end=%#llx\n", rmp_base, rmp_end);
+ return false;
+ }
+
+ rmp_sz = rmp_end - rmp_base + 1;
+
+ /*
+ * Calculate the amount of memory that must be reserved by the BIOS to
+ * cover the whole RAM, including the bookkeeping area. The RMP itself
+ * must also be covered.
+ */
+ max_rmp_pfn = max_pfn;
+ if (PHYS_PFN(rmp_end) > max_pfn)
+ max_rmp_pfn = PHYS_PFN(rmp_end);
+
+ calc_rmp_sz = (max_rmp_pfn << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ;
+
+ if (calc_rmp_sz > rmp_sz) {
+ pr_err("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n",
+ calc_rmp_sz, rmp_sz);
+ return false;
+ }
+
+ probed_rmp_base = rmp_base;
+ probed_rmp_size = rmp_sz;
+
+ pr_info("RMP table physical range [0x%016llx - 0x%016llx]\n",
+ probed_rmp_base, probed_rmp_base + probed_rmp_size - 1);
+
+ return true;
+}
+
+/*
+ * Do the necessary preparations which are verified by the firmware as
+ * described in the SNP_INIT_EX firmware command description in the SNP
+ * firmware ABI spec.
+ */
+static int __init snp_rmptable_init(void)
+{
+ void *rmptable_start;
+ u64 rmptable_size;
+ u64 val;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
+ return 0;
+
+ if (!amd_iommu_snp_en)
+ return 0;
+
+ if (!probed_rmp_size)
+ goto nosnp;
+
+ rmptable_start = memremap(probed_rmp_base, probed_rmp_size, MEMREMAP_WB);
+ if (!rmptable_start) {
+ pr_err("Failed to map RMP table\n");
+ return 1;
+ }
+
+ /*
+ * Check if SEV-SNP is already enabled; this can happen in the case
+ * of a kexec boot.
+ */
+ rdmsrl(MSR_AMD64_SYSCFG, val);
+ if (val & MSR_AMD64_SYSCFG_SNP_EN)
+ goto skip_enable;
+
+ memset(rmptable_start, 0, probed_rmp_size);
+
+ /* Flush the caches to ensure that data is written before SNP is enabled. */
+ wbinvd_on_all_cpus();
+
+ /* MtrrFixDramModEn must be enabled on all the CPUs prior to enabling SNP. */
+ on_each_cpu(mfd_enable, NULL, 1);
+
+ on_each_cpu(snp_enable, NULL, 1);
+
+skip_enable:
+ rmptable_start += RMPTABLE_CPU_BOOKKEEPING_SZ;
+ rmptable_size = probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ;
+
+ rmptable = (struct rmpentry *)rmptable_start;
+ rmptable_max_pfn = rmptable_size / sizeof(struct rmpentry) - 1;
+
+ cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rmptable_init:online", __snp_enable, NULL);
+
+ /*
+ * Set crash_kexec_post_notifiers to 'true' to ensure that the SNP panic
+ * notifier is invoked to do SNP IOMMU shutdown before kdump.
+ */
+ crash_kexec_post_notifiers = true;
+
+ return 0;
+
+nosnp:
+ setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
+ return -ENOSYS;
+}
+
+/*
+ * This must be called after the IOMMU has been initialized.
+ */
+device_initcall(snp_rmptable_init);
+
+static struct rmpentry *get_rmpentry(u64 pfn)
+{
+ if (WARN_ON_ONCE(pfn > rmptable_max_pfn))
+ return ERR_PTR(-EFAULT);
+
+ return &rmptable[pfn];
+}
+
+static struct rmpentry *__snp_lookup_rmpentry(u64 pfn, int *level)
+{
+ struct rmpentry *large_entry, *entry;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
+ return ERR_PTR(-ENODEV);
+
+ entry = get_rmpentry(pfn);
+ if (IS_ERR(entry))
+ return entry;
+
+ /*
+ * Find the authoritative RMP entry for a PFN. This can be either a 4K
+ * RMP entry or a special large RMP entry that is authoritative for a
+ * whole 2M area.
+ */
+ large_entry = get_rmpentry(pfn & PFN_PMD_MASK);
+ if (IS_ERR(large_entry))
+ return large_entry;
+
+ *level = RMP_TO_PG_LEVEL(large_entry->pagesize);
+
+ return entry;
+}
+
+int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level)
+{
+ struct rmpentry *e;
+
+ e = __snp_lookup_rmpentry(pfn, level);
+ if (IS_ERR(e))
+ return PTR_ERR(e);
+
+ *assigned = !!e->assigned;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(snp_lookup_rmpentry);
+
+/*
+ * Dump the raw RMP entry for a particular PFN. These bits are documented in the
+ * PPR for a particular CPU model and provide useful information about how a
+ * particular PFN is being utilized by the kernel/firmware at the time certain
+ * unexpected events occur, such as RMP faults.
+ */
+static void dump_rmpentry(u64 pfn)
+{
+ u64 pfn_i, pfn_end;
+ struct rmpentry *e;
+ int level;
+
+ e = __snp_lookup_rmpentry(pfn, &level);
+ if (IS_ERR(e)) {
+ pr_err("Failed to read RMP entry for PFN 0x%llx, error %ld\n",
+ pfn, PTR_ERR(e));
+ return;
+ }
+
+ if (e->assigned) {
+ pr_info("PFN 0x%llx, RMP entry: [0x%016llx - 0x%016llx]\n",
+ pfn, e->lo, e->hi);
+ return;
+ }
+
+ /*
+ * If the RMP entry for a particular PFN is not in an assigned state,
+ * then it is sometimes useful to get an idea of whether or not any RMP
+ * entries for other PFNs within the same 2MB region are assigned, since
+ * those too can affect the ability to access a particular PFN in
+ * certain situations, such as when the PFN is being accessed via a 2MB
+ * mapping in the host page table.
+ */
+ pfn_i = ALIGN_DOWN(pfn, PTRS_PER_PMD);
+ pfn_end = pfn_i + PTRS_PER_PMD;
+
+ pr_info("PFN 0x%llx unassigned, dumping non-zero entries in 2M PFN region: [0x%llx - 0x%llx]\n",
+ pfn, pfn_i, pfn_end);
+
+ while (pfn_i < pfn_end) {
+ e = __snp_lookup_rmpentry(pfn_i, &level);
+ if (IS_ERR(e)) {
+ pr_err("Error %ld reading RMP entry for PFN 0x%llx\n",
+ PTR_ERR(e), pfn_i);
+ pfn_i++;
+ continue;
+ }
+
+ if (e->lo || e->hi)
+ pr_info("PFN: 0x%llx, [0x%016llx - 0x%016llx]\n", pfn_i, e->lo, e->hi);
+ pfn_i++;
+ }
+}
+
+void snp_dump_hva_rmpentry(unsigned long hva)
+{
+ unsigned long paddr;
+ unsigned int level;
+ pgd_t *pgd;
+ pte_t *pte;
+
+ pgd = __va(read_cr3_pa());
+ pgd += pgd_index(hva);
+ pte = lookup_address_in_pgd(pgd, hva, &level);
+
+ if (!pte) {
+ pr_err("Can't dump RMP entry for HVA %lx: no PTE/PFN found\n", hva);
+ return;
+ }
+
+ paddr = PFN_PHYS(pte_pfn(*pte)) | (hva & ~page_level_mask(level));
+ dump_rmpentry(PHYS_PFN(paddr));
+}
+
+/*
+ * PSMASH a 2MB aligned page into 4K pages in the RMP table while preserving the
+ * Validated bit.
+ */
+int psmash(u64 pfn)
+{
+ unsigned long paddr = pfn << PAGE_SHIFT;
+ int ret;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
+ return -ENODEV;
+
+ if (!pfn_valid(pfn))
+ return -EINVAL;
+
+ /* Binutils version 2.36 supports the PSMASH mnemonic. */
+ asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF"
+ : "=a" (ret)
+ : "a" (paddr)
+ : "memory", "cc");
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(psmash);
+
+/*
+ * If the kernel uses a 2MB or larger directmap mapping to write to an address,
+ * and that mapping contains any 4KB pages that are set to private in the RMP
+ * table, an RMP #PF will trigger and cause a host crash. Hypervisor code that
+ * owns the PFNs being transitioned will never attempt such a write, but other
+ * kernel tasks writing to other PFNs in the range may trigger these checks
+ * inadvertently due to a large directmap mapping that happens to overlap such a
+ * PFN.
+ *
+ * Prevent this by splitting any 2MB+ mappings that might end up containing a
+ * mix of private/shared PFNs as a result of a subsequent RMPUPDATE for the
+ * PFN/rmp_level passed in.
+ *
+ * Note that there is no attempt here to scan all the RMP entries for the 2MB
+ * physical range, since it would only be worthwhile in determining if a
+ * subsequent RMPUPDATE for a 4KB PFN would result in all the entries being of
+ * the same shared/private state, thus avoiding the need to split the mapping.
+ * But that would mean the entries are currently in a mixed state, and so the
+ * mapping would have already been split as a result of prior transitions.
+ * And since the 4K split is only done if the mapping is 2MB+, and there isn't
+ * currently a mechanism in place to restore 2MB+ mappings, such a check would
+ * not provide any usable benefit.
+ *
+ * More specifics on how these checks are carried out can be found in APM
+ * Volume 2, "RMP and VMPL Access Checks".
+ */
+static int adjust_direct_map(u64 pfn, int rmp_level)
+{
+ unsigned long vaddr;
+ unsigned int level;
+ int npages, ret;
+ pte_t *pte;
+
+ /*
+ * pfn_to_kaddr() will return a vaddr only within the direct
+ * map range.
+ */
+ vaddr = (unsigned long)pfn_to_kaddr(pfn);
+
+ /* Only 4KB/2MB RMP entries are supported by current hardware. */
+ if (WARN_ON_ONCE(rmp_level > PG_LEVEL_2M))
+ return -EINVAL;
+
+ if (!pfn_valid(pfn))
+ return -EINVAL;
+
+ if (rmp_level == PG_LEVEL_2M &&
+ (!IS_ALIGNED(pfn, PTRS_PER_PMD) || !pfn_valid(pfn + PTRS_PER_PMD - 1)))
+ return -EINVAL;
+
+ /*
+ * If an entire 2MB physical range is being transitioned, then there is
+ * no risk of RMP #PFs due to write accesses from overlapping mappings,
+ * since even accesses from 1GB mappings will be treated as 2MB accesses
+ * as far as RMP table checks are concerned.
+ */
+ if (rmp_level == PG_LEVEL_2M)
+ return 0;
+
+ pte = lookup_address(vaddr, &level);
+ if (!pte || pte_none(*pte))
+ return 0;
+
+ if (level == PG_LEVEL_4K)
+ return 0;
+
+ npages = page_level_size(rmp_level) / PAGE_SIZE;
+ ret = set_memory_4k(vaddr, npages);
+ if (ret)
+ pr_warn("Failed to split direct map for PFN 0x%llx, ret: %d\n",
+ pfn, ret);
+
+ return ret;
+}
+
+/*
+ * It is expected that these operations are infrequent enough that no mutual
+ * exclusion of updaters is needed; thus the overlap error condition below
+ * should happen very rarely and should get resolved relatively quickly by
+ * the firmware.
+ *
+ * If not, one could consider introducing a mutex or so here to sync concurrent
+ * RMP updates and thus diminish the amount of cases where firmware needs to
+ * lock 2M ranges to protect against concurrent updates.
+ *
+ * The optimal solution would be range locking to avoid locking disjoint
+ * regions unnecessarily, but there's no support for that yet.
+ */
+static int rmpupdate(u64 pfn, struct rmp_state *state)
+{
+ unsigned long paddr = pfn << PAGE_SHIFT;
+ int ret, level;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
+ return -ENODEV;
+
+ level = RMP_TO_PG_LEVEL(state->pagesize);
+
+ if (adjust_direct_map(pfn, level))
+ return -EFAULT;
+
+ do {
+ /* Binutils version 2.36 supports the RMPUPDATE mnemonic. */
+ asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE"
+ : "=a" (ret)
+ : "a" (paddr), "c" ((unsigned long)state)
+ : "memory", "cc");
+ } while (ret == RMPUPDATE_FAIL_OVERLAP);
+
+ if (ret) {
+ pr_err("RMPUPDATE failed for PFN %llx, pg_level: %d, ret: %d\n",
+ pfn, level, ret);
+ dump_rmpentry(pfn);
+ dump_stack();
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/* Transition a page to guest-owned/private state in the RMP table. */
+int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable)
+{
+ struct rmp_state state;
+
+ memset(&state, 0, sizeof(state));
+ state.assigned = 1;
+ state.asid = asid;
+ state.immutable = immutable;
+ state.gpa = gpa;
+ state.pagesize = PG_LEVEL_TO_RMP(level);
+
+ return rmpupdate(pfn, &state);
+}
+EXPORT_SYMBOL_GPL(rmp_make_private);
+
+/* Transition a page to hypervisor-owned/shared state in the RMP table. */
+int rmp_make_shared(u64 pfn, enum pg_level level)
+{
+ struct rmp_state state;
+
+ memset(&state, 0, sizeof(state));
+ state.pagesize = PG_LEVEL_TO_RMP(level);
+
+ return rmpupdate(pfn, &state);
+}
+EXPORT_SYMBOL_GPL(rmp_make_shared);
+
+void snp_leak_pages(u64 pfn, unsigned int npages)
+{
+ struct page *page = pfn_to_page(pfn);
+
+ pr_warn("Leaking PFN range 0x%llx-0x%llx\n", pfn, pfn + npages);
+
+ spin_lock(&snp_leaked_pages_list_lock);
+ while (npages--) {
+
+ /*
+ * Reuse the page's buddy list for chaining into the leaked
+ * pages list. This page should not be on a free list currently
+ * and is also unsafe to be added to a free list.
+ */
+ if (likely(!PageCompound(page)) ||
+
+ /*
+ * Skip inserting tail pages of compound page as
+ * page->buddy_list of tail pages is not usable.
+ */
+ (PageHead(page) && compound_nr(page) <= npages))
+ list_add_tail(&page->buddy_list, &snp_leaked_pages_list);
+
+ dump_rmpentry(pfn);
+ snp_nr_leaked_pages++;
+ pfn++;
+ page++;
+ }
+ spin_unlock(&snp_leaked_pages_list_lock);
+}
+EXPORT_SYMBOL_GPL(snp_leak_pages);
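The lookup helpers above index a flat array of 16-byte RMP entries that begins 16KB past RMP_BASE, and additionally consult the entry for the containing 2MB page via PFN_PMD_MASK. A standalone sketch of that arithmetic, with a made-up base address purely for illustration:

#include <stdio.h>
#include <stdint.h>

#define DEMO_RMPENTRY_SIZE	16ull		/* sizeof(struct rmpentry) */
#define DEMO_BOOKKEEPING_SZ	0x4000ull	/* 16KB processor bookkeeping */
#define DEMO_PFNS_PER_2M	512ull		/* 4K PFNs per 2MB page */

/* Physical address of the RMP entry covering a given 4K PFN. */
static uint64_t demo_rmpentry_paddr(uint64_t rmp_base, uint64_t pfn)
{
	return rmp_base + DEMO_BOOKKEEPING_SZ + pfn * DEMO_RMPENTRY_SIZE;
}

int main(void)
{
	uint64_t rmp_base = 0x80000000ull;	/* made-up MSR_AMD64_RMP_BASE */
	uint64_t pfn = 0x12345;
	uint64_t pfn_2m = pfn & ~(DEMO_PFNS_PER_2M - 1);	/* PFN_PMD_MASK effect */

	printf("4K entry at %#llx, 2M entry at %#llx\n",
	       (unsigned long long)demo_rmpentry_paddr(rmp_base, pfn),
	       (unsigned long long)demo_rmpentry_paddr(rmp_base, pfn_2m));
	return 0;
}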
diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
index ada3868c02c2..9e9db601bd52 100644
--- a/arch/x86/xen/enlighten_pvh.c
+++ b/arch/x86/xen/enlighten_pvh.c
@@ -4,6 +4,7 @@
#include <xen/hvc-console.h>
+#include <asm/bootparam.h>
#include <asm/io_apic.h>
#include <asm/hypervisor.h>
#include <asm/e820/api.h>
diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c
index d97adab8420f..f7547807b0bd 100644
--- a/arch/x86/xen/vga.c
+++ b/arch/x86/xen/vga.c
@@ -2,7 +2,6 @@
#include <linux/screen_info.h>
#include <linux/init.h>
-#include <asm/bootparam.h>
#include <asm/setup.h>
#include <xen/interface/xen.h>
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 1a9cd18dfbd3..83189cf5cdce 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -28,7 +28,7 @@
* non-zero.
*/
SYM_FUNC_START(xen_irq_disable_direct)
- movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ movb $1, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask)
RET
SYM_FUNC_END(xen_irq_disable_direct)
@@ -69,7 +69,7 @@ SYM_FUNC_END(check_events)
SYM_FUNC_START(xen_irq_enable_direct)
FRAME_BEGIN
/* Unmask events */
- movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ movb $0, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask)
/*
* Preempt here doesn't matter because that will deal with any
@@ -78,7 +78,7 @@ SYM_FUNC_START(xen_irq_enable_direct)
*/
/* Test for pending */
- testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_pending)
jz 1f
call check_events
@@ -97,7 +97,7 @@ SYM_FUNC_END(xen_irq_enable_direct)
* x86 use opposite senses (mask vs enable).
*/
SYM_FUNC_START(xen_save_fl_direct)
- testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask)
setz %ah
addb %ah, %ah
RET
@@ -113,7 +113,7 @@ SYM_FUNC_END(xen_read_cr2);
SYM_FUNC_START(xen_read_cr2_direct)
FRAME_BEGIN
- _ASM_MOV PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_arch_cr2, %_ASM_AX
+ _ASM_MOV PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_arch_cr2), %_ASM_AX
FRAME_END
RET
SYM_FUNC_END(xen_read_cr2_direct);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 3c7d8d638ab5..e160d56e8eda 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -267,7 +267,7 @@ int queue_limits_commit_update(struct request_queue *q,
EXPORT_SYMBOL_GPL(queue_limits_commit_update);
/**
- * queue_limits_commit_set - apply queue limits to queue
+ * queue_limits_set - apply queue limits to queue
* @q: queue to update
* @lim: limits to apply
*
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 47de0f140ba6..0b33e81f9c9b 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -588,6 +588,7 @@ CPU_SHOW_VULN_FALLBACK(mmio_stale_data);
CPU_SHOW_VULN_FALLBACK(retbleed);
CPU_SHOW_VULN_FALLBACK(spec_rstack_overflow);
CPU_SHOW_VULN_FALLBACK(gds);
+CPU_SHOW_VULN_FALLBACK(reg_file_data_sampling);
static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
@@ -602,6 +603,7 @@ static DEVICE_ATTR(mmio_stale_data, 0444, cpu_show_mmio_stale_data, NULL);
static DEVICE_ATTR(retbleed, 0444, cpu_show_retbleed, NULL);
static DEVICE_ATTR(spec_rstack_overflow, 0444, cpu_show_spec_rstack_overflow, NULL);
static DEVICE_ATTR(gather_data_sampling, 0444, cpu_show_gds, NULL);
+static DEVICE_ATTR(reg_file_data_sampling, 0444, cpu_show_reg_file_data_sampling, NULL);
static struct attribute *cpu_root_vulnerabilities_attrs[] = {
&dev_attr_meltdown.attr,
@@ -617,6 +619,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = {
&dev_attr_retbleed.attr,
&dev_attr_spec_rstack_overflow.attr,
&dev_attr_gather_data_sampling.attr,
+ &dev_attr_reg_file_data_sampling.attr,
NULL
};
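Adding the attribute to cpu_root_vulnerabilities_attrs exposes it as a read-only file under /sys/devices/system/cpu/vulnerabilities/. A hypothetical userspace reader of the new entry (the reported string depends on the arch-specific handler):

#include <stdio.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/reg_file_data_sampling", "r");

	if (f && fgets(line, sizeof(line), f))
		printf("RFDS: %s", line);
	if (f)
		fclose(f);
	return 0;
}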
diff --git a/drivers/crypto/ccp/Kconfig b/drivers/crypto/ccp/Kconfig
index 32268e239bf1..f394e45e11ab 100644
--- a/drivers/crypto/ccp/Kconfig
+++ b/drivers/crypto/ccp/Kconfig
@@ -38,7 +38,7 @@ config CRYPTO_DEV_CCP_CRYPTO
config CRYPTO_DEV_SP_PSP
bool "Platform Security Processor (PSP) device"
default y
- depends on CRYPTO_DEV_CCP_DD && X86_64
+ depends on CRYPTO_DEV_CCP_DD && X86_64 && AMD_IOMMU
help
Provide support for the AMD Platform Security Processor (PSP).
The PSP is a dedicated processor that provides support for key
diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index b04bc1d3d627..f44efbb89c34 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -21,14 +21,18 @@
#include <linux/hw_random.h>
#include <linux/ccp.h>
#include <linux/firmware.h>
+#include <linux/panic_notifier.h>
#include <linux/gfp.h>
#include <linux/cpufeature.h>
#include <linux/fs.h>
#include <linux/fs_struct.h>
#include <linux/psp.h>
+#include <linux/amd-iommu.h>
#include <asm/smp.h>
#include <asm/cacheflush.h>
+#include <asm/e820/types.h>
+#include <asm/sev.h>
#include "psp-dev.h"
#include "sev-dev.h"
@@ -37,6 +41,19 @@
#define SEV_FW_FILE "amd/sev.fw"
#define SEV_FW_NAME_SIZE 64
+/* Minimum firmware version required for the SEV-SNP support */
+#define SNP_MIN_API_MAJOR 1
+#define SNP_MIN_API_MINOR 51
+
+/*
+ * Maximum number of firmware-writable buffers that might be specified
+ * in the parameters of a legacy SEV command buffer.
+ */
+#define CMD_BUF_FW_WRITABLE_MAX 2
+
+/* Leave room in the descriptor array for an end-of-list indicator. */
+#define CMD_BUF_DESC_MAX (CMD_BUF_FW_WRITABLE_MAX + 1)
+
static DEFINE_MUTEX(sev_cmd_mutex);
static struct sev_misc_dev *misc_dev;
@@ -68,9 +85,14 @@ static int psp_timeout;
* The TMR is a 1MB area that must be 1MB aligned. Use the page allocator
* to allocate the memory, which will return aligned memory for the specified
* allocation order.
+ *
+ * When SEV-SNP is enabled, the TMR needs to be 2MB aligned and 2MB sized.
*/
-#define SEV_ES_TMR_SIZE (1024 * 1024)
+#define SEV_TMR_SIZE (1024 * 1024)
+#define SNP_TMR_SIZE (2 * 1024 * 1024)
+
static void *sev_es_tmr;
+static size_t sev_es_tmr_size = SEV_TMR_SIZE;
/* INIT_EX NV Storage:
* The NV Storage is a 32Kb area and must be 4Kb page aligned. Use the page
@@ -80,6 +102,13 @@ static void *sev_es_tmr;
#define NV_LENGTH (32 * 1024)
static void *sev_init_ex_buffer;
+/*
+ * SEV_DATA_RANGE_LIST:
+ * Array containing range of pages that firmware transitions to HV-fixed
+ * page state.
+ */
+static struct sev_data_range_list *snp_range_list;
+
static inline bool sev_version_greater_or_equal(u8 maj, u8 min)
{
struct sev_device *sev = psp_master->sev_data;
@@ -115,6 +144,25 @@ static int sev_wait_cmd_ioc(struct sev_device *sev,
{
int ret;
+ /*
+ * If invoked during panic handling, local interrupts are disabled,
+ * so the PSP command completion interrupt can't be used. Poll for
+ * PSP command completion instead.
+ */
+ if (irqs_disabled()) {
+ unsigned long timeout_usecs = (timeout * USEC_PER_SEC) / 10;
+
+ /* Poll for SEV command completion: */
+ while (timeout_usecs--) {
+ *reg = ioread32(sev->io_regs + sev->vdata->cmdresp_reg);
+ if (*reg & PSP_CMDRESP_RESP)
+ return 0;
+
+ udelay(10);
+ }
+ return -ETIMEDOUT;
+ }
+
ret = wait_event_timeout(sev->int_queue,
sev->int_rcvd, timeout * HZ);
if (!ret)
@@ -130,6 +178,8 @@ static int sev_cmd_buffer_len(int cmd)
switch (cmd) {
case SEV_CMD_INIT: return sizeof(struct sev_data_init);
case SEV_CMD_INIT_EX: return sizeof(struct sev_data_init_ex);
+ case SEV_CMD_SNP_SHUTDOWN_EX: return sizeof(struct sev_data_snp_shutdown_ex);
+ case SEV_CMD_SNP_INIT_EX: return sizeof(struct sev_data_snp_init_ex);
case SEV_CMD_PLATFORM_STATUS: return sizeof(struct sev_user_data_status);
case SEV_CMD_PEK_CSR: return sizeof(struct sev_data_pek_csr);
case SEV_CMD_PEK_CERT_IMPORT: return sizeof(struct sev_data_pek_cert_import);
@@ -158,23 +208,27 @@ static int sev_cmd_buffer_len(int cmd)
case SEV_CMD_GET_ID: return sizeof(struct sev_data_get_id);
case SEV_CMD_ATTESTATION_REPORT: return sizeof(struct sev_data_attestation_report);
case SEV_CMD_SEND_CANCEL: return sizeof(struct sev_data_send_cancel);
+ case SEV_CMD_SNP_GCTX_CREATE: return sizeof(struct sev_data_snp_addr);
+ case SEV_CMD_SNP_LAUNCH_START: return sizeof(struct sev_data_snp_launch_start);
+ case SEV_CMD_SNP_LAUNCH_UPDATE: return sizeof(struct sev_data_snp_launch_update);
+ case SEV_CMD_SNP_ACTIVATE: return sizeof(struct sev_data_snp_activate);
+ case SEV_CMD_SNP_DECOMMISSION: return sizeof(struct sev_data_snp_addr);
+ case SEV_CMD_SNP_PAGE_RECLAIM: return sizeof(struct sev_data_snp_page_reclaim);
+ case SEV_CMD_SNP_GUEST_STATUS: return sizeof(struct sev_data_snp_guest_status);
+ case SEV_CMD_SNP_LAUNCH_FINISH: return sizeof(struct sev_data_snp_launch_finish);
+ case SEV_CMD_SNP_DBG_DECRYPT: return sizeof(struct sev_data_snp_dbg);
+ case SEV_CMD_SNP_DBG_ENCRYPT: return sizeof(struct sev_data_snp_dbg);
+ case SEV_CMD_SNP_PAGE_UNSMASH: return sizeof(struct sev_data_snp_page_unsmash);
+ case SEV_CMD_SNP_PLATFORM_STATUS: return sizeof(struct sev_data_snp_addr);
+ case SEV_CMD_SNP_GUEST_REQUEST: return sizeof(struct sev_data_snp_guest_request);
+ case SEV_CMD_SNP_CONFIG: return sizeof(struct sev_user_data_snp_config);
+ case SEV_CMD_SNP_COMMIT: return sizeof(struct sev_data_snp_commit);
default: return 0;
}
return 0;
}
-static void *sev_fw_alloc(unsigned long len)
-{
- struct page *page;
-
- page = alloc_pages(GFP_KERNEL, get_order(len));
- if (!page)
- return NULL;
-
- return page_address(page);
-}
-
static struct file *open_file_as_root(const char *filename, int flags, umode_t mode)
{
struct file *fp;
@@ -305,13 +359,485 @@ static int sev_write_init_ex_file_if_required(int cmd_id)
return sev_write_init_ex_file();
}
+/*
+ * snp_reclaim_pages() needs __sev_do_cmd_locked(), and __sev_do_cmd_locked()
+ * needs snp_reclaim_pages(), so a forward declaration is needed.
+ */
+static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret);
+
+static int snp_reclaim_pages(unsigned long paddr, unsigned int npages, bool locked)
+{
+ int ret, err, i;
+
+ paddr = __sme_clr(ALIGN_DOWN(paddr, PAGE_SIZE));
+
+ for (i = 0; i < npages; i++, paddr += PAGE_SIZE) {
+ struct sev_data_snp_page_reclaim data = {0};
+
+ data.paddr = paddr;
+
+ if (locked)
+ ret = __sev_do_cmd_locked(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err);
+ else
+ ret = sev_do_cmd(SEV_CMD_SNP_PAGE_RECLAIM, &data, &err);
+
+ if (ret)
+ goto cleanup;
+
+ ret = rmp_make_shared(__phys_to_pfn(paddr), PG_LEVEL_4K);
+ if (ret)
+ goto cleanup;
+ }
+
+ return 0;
+
+cleanup:
+ /*
+ * If there was a failure reclaiming the page then it is no longer safe
+ * to release it back to the system; leak it instead.
+ */
+ snp_leak_pages(__phys_to_pfn(paddr), npages - i);
+ return ret;
+}
+
+static int rmp_mark_pages_firmware(unsigned long paddr, unsigned int npages, bool locked)
+{
+ unsigned long pfn = __sme_clr(paddr) >> PAGE_SHIFT;
+ int rc, i;
+
+ for (i = 0; i < npages; i++, pfn++) {
+ rc = rmp_make_private(pfn, 0, PG_LEVEL_4K, 0, true);
+ if (rc)
+ goto cleanup;
+ }
+
+ return 0;
+
+cleanup:
+ /*
+ * Try unrolling the firmware state changes by
+ * reclaiming the pages which were already changed to the
+ * firmware state.
+ */
+ snp_reclaim_pages(paddr, i, locked);
+
+ return rc;
+}
+
+static struct page *__snp_alloc_firmware_pages(gfp_t gfp_mask, int order)
+{
+ unsigned long npages = 1ul << order, paddr;
+ struct sev_device *sev;
+ struct page *page;
+
+ if (!psp_master || !psp_master->sev_data)
+ return NULL;
+
+ page = alloc_pages(gfp_mask, order);
+ if (!page)
+ return NULL;
+
+ /* If SEV-SNP is initialized then add the page in RMP table. */
+ sev = psp_master->sev_data;
+ if (!sev->snp_initialized)
+ return page;
+
+ paddr = __pa((unsigned long)page_address(page));
+ if (rmp_mark_pages_firmware(paddr, npages, false))
+ return NULL;
+
+ return page;
+}
+
+void *snp_alloc_firmware_page(gfp_t gfp_mask)
+{
+ struct page *page;
+
+ page = __snp_alloc_firmware_pages(gfp_mask, 0);
+
+ return page ? page_address(page) : NULL;
+}
+EXPORT_SYMBOL_GPL(snp_alloc_firmware_page);
+
+static void __snp_free_firmware_pages(struct page *page, int order, bool locked)
+{
+ struct sev_device *sev = psp_master->sev_data;
+ unsigned long paddr, npages = 1ul << order;
+
+ if (!page)
+ return;
+
+ paddr = __pa((unsigned long)page_address(page));
+ if (sev->snp_initialized &&
+ snp_reclaim_pages(paddr, npages, locked))
+ return;
+
+ __free_pages(page, order);
+}
+
+void snp_free_firmware_page(void *addr)
+{
+ if (!addr)
+ return;
+
+ __snp_free_firmware_pages(virt_to_page(addr), 0, false);
+}
+EXPORT_SYMBOL_GPL(snp_free_firmware_page);
+
+static void *sev_fw_alloc(unsigned long len)
+{
+ struct page *page;
+
+ page = __snp_alloc_firmware_pages(GFP_KERNEL, get_order(len));
+ if (!page)
+ return NULL;
+
+ return page_address(page);
+}
+
+/**
+ * struct cmd_buf_desc - descriptors for managing legacy SEV command address
+ * parameters corresponding to buffers that may be written to by firmware.
+ *
+ * @paddr_ptr: pointer to the address parameter in the command buffer which may
+ * need to be saved/restored depending on whether a bounce buffer
+ * is used. In the case of a bounce buffer, the command buffer
+ * needs to be updated with the address of the new bounce buffer
+ * snp_map_cmd_buf_desc() has allocated specifically for it. Must
+ * be NULL if this descriptor is only an end-of-list indicator.
+ *
+ * @paddr_orig: storage for the original address parameter, which can be used to
+ * restore the original value in @paddr_ptr in cases where it is
+ * replaced with the address of a bounce buffer.
+ *
+ * @len: length of buffer located at the address originally stored at @paddr_ptr
+ *
+ * @guest_owned: true if the address corresponds to guest-owned pages, in which
+ * case bounce buffers are not needed.
+ */
+struct cmd_buf_desc {
+ u64 *paddr_ptr;
+ u64 paddr_orig;
+ u32 len;
+ bool guest_owned;
+};
+
+/*
+ * If a legacy SEV command parameter is a memory address, those pages in
+ * turn need to be transitioned to/from firmware-owned before/after
+ * executing the firmware command.
+ *
+ * Additionally, in cases where those pages are not guest-owned, a bounce
+ * buffer is needed in place of the original memory address parameter.
+ *
+ * A set of descriptors are used to keep track of this handling, and
+ * initialized here based on the specific commands being executed.
+ */
+static void snp_populate_cmd_buf_desc_list(int cmd, void *cmd_buf,
+ struct cmd_buf_desc *desc_list)
+{
+ switch (cmd) {
+ case SEV_CMD_PDH_CERT_EXPORT: {
+ struct sev_data_pdh_cert_export *data = cmd_buf;
+
+ desc_list[0].paddr_ptr = &data->pdh_cert_address;
+ desc_list[0].len = data->pdh_cert_len;
+ desc_list[1].paddr_ptr = &data->cert_chain_address;
+ desc_list[1].len = data->cert_chain_len;
+ break;
+ }
+ case SEV_CMD_GET_ID: {
+ struct sev_data_get_id *data = cmd_buf;
+
+ desc_list[0].paddr_ptr = &data->address;
+ desc_list[0].len = data->len;
+ break;
+ }
+ case SEV_CMD_PEK_CSR: {
+ struct sev_data_pek_csr *data = cmd_buf;
+
+ desc_list[0].paddr_ptr = &data->address;
+ desc_list[0].len = data->len;
+ break;
+ }
+ case SEV_CMD_LAUNCH_UPDATE_DATA: {
+ struct sev_data_launch_update_data *data = cmd_buf;
+
+ desc_list[0].paddr_ptr = &data->address;
+ desc_list[0].len = data->len;
+ desc_list[0].guest_owned = true;
+ break;
+ }
+ case SEV_CMD_LAUNCH_UPDATE_VMSA: {
+ struct sev_data_launch_update_vmsa *data = cmd_buf;
+
+ desc_list[0].paddr_ptr = &data->address;
+ desc_list[0].len = data->len;
+ desc_list[0].guest_owned = true;
+ break;
+ }
+ case SEV_CMD_LAUNCH_MEASURE: {
+ struct sev_data_launch_measure *data = cmd_buf;
+
+ desc_list[0].paddr_ptr = &data->address;
+ desc_list[0].len = data->len;
+ break;
+ }
+ case SEV_CMD_LAUNCH_UPDATE_SECRET: {
+ struct sev_data_launch_secret *data = cmd_buf;
+
+ desc_list[0].paddr_ptr = &data->guest_address;
+ desc_list[0].len = data->guest_len;
+ desc_list[0].guest_owned = true;
+ break;
+ }
+ case SEV_CMD_DBG_DECRYPT: {
+ struct sev_data_dbg *data = cmd_buf;
+
+ desc_list[0].paddr_ptr = &data->dst_addr;
+ desc_list[0].len = data->len;
+ desc_list[0].guest_owned = true;
+ break;
+ }
+ case SEV_CMD_DBG_ENCRYPT: {
+ struct sev_data_dbg *data = cmd_buf;
+
+ desc_list[0].paddr_ptr = &data->dst_addr;
+ desc_list[0].len = data->len;
+ desc_list[0].guest_owned = true;
+ break;
+ }
+ case SEV_CMD_ATTESTATION_REPORT: {
+ struct sev_data_attestation_report *data = cmd_buf;
+
+ desc_list[0].paddr_ptr = &data->address;
+ desc_list[0].len = data->len;
+ break;
+ }
+ case SEV_CMD_SEND_START: {
+ struct sev_data_send_start *data = cmd_buf;
+
+ desc_list[0].paddr_ptr = &data->session_address;
+ desc_list[0].len = data->session_len;
+ break;
+ }
+ case SEV_CMD_SEND_UPDATE_DATA: {
+ struct sev_data_send_update_data *data = cmd_buf;
+
+ desc_list[0].paddr_ptr = &data->hdr_address;
+ desc_list[0].len = data->hdr_len;
+ desc_list[1].paddr_ptr = &data->trans_address;
+ desc_list[1].len = data->trans_len;
+ break;
+ }
+ case SEV_CMD_SEND_UPDATE_VMSA: {
+ struct sev_data_send_update_vmsa *data = cmd_buf;
+
+ desc_list[0].paddr_ptr = &data->hdr_address;
+ desc_list[0].len = data->hdr_len;
+ desc_list[1].paddr_ptr = &data->trans_address;
+ desc_list[1].len = data->trans_len;
+ break;
+ }
+ case SEV_CMD_RECEIVE_UPDATE_DATA: {
+ struct sev_data_receive_update_data *data = cmd_buf;
+
+ desc_list[0].paddr_ptr = &data->guest_address;
+ desc_list[0].len = data->guest_len;
+ desc_list[0].guest_owned = true;
+ break;
+ }
+ case SEV_CMD_RECEIVE_UPDATE_VMSA: {
+ struct sev_data_receive_update_vmsa *data = cmd_buf;
+
+ desc_list[0].paddr_ptr = &data->guest_address;
+ desc_list[0].len = data->guest_len;
+ desc_list[0].guest_owned = true;
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+static int snp_map_cmd_buf_desc(struct cmd_buf_desc *desc)
+{
+ unsigned int npages;
+
+ if (!desc->len)
+ return 0;
+
+ /* Allocate a bounce buffer if this isn't a guest owned page. */
+ if (!desc->guest_owned) {
+ struct page *page;
+
+ page = alloc_pages(GFP_KERNEL_ACCOUNT, get_order(desc->len));
+ if (!page) {
+ pr_warn("Failed to allocate bounce buffer for SEV legacy command.\n");
+ return -ENOMEM;
+ }
+
+ desc->paddr_orig = *desc->paddr_ptr;
+ *desc->paddr_ptr = __psp_pa(page_to_virt(page));
+ }
+
+ npages = PAGE_ALIGN(desc->len) >> PAGE_SHIFT;
+
+ /* Transition the buffer to firmware-owned. */
+ if (rmp_mark_pages_firmware(*desc->paddr_ptr, npages, true)) {
+ pr_warn("Error moving pages to firmware-owned state for SEV legacy command.\n");
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int snp_unmap_cmd_buf_desc(struct cmd_buf_desc *desc)
+{
+ unsigned int npages;
+
+ if (!desc->len)
+ return 0;
+
+ npages = PAGE_ALIGN(desc->len) >> PAGE_SHIFT;
+
+ /* Transition the buffers back to hypervisor-owned. */
+ if (snp_reclaim_pages(*desc->paddr_ptr, npages, true)) {
+ pr_warn("Failed to reclaim firmware-owned pages while issuing SEV legacy command.\n");
+ return -EFAULT;
+ }
+
+ /* Copy data from bounce buffer and then free it. */
+ if (!desc->guest_owned) {
+ void *bounce_buf = __va(__sme_clr(*desc->paddr_ptr));
+ void *dst_buf = __va(__sme_clr(desc->paddr_orig));
+
+ memcpy(dst_buf, bounce_buf, desc->len);
+ __free_pages(virt_to_page(bounce_buf), get_order(desc->len));
+
+ /* Restore the original address in the command buffer. */
+ *desc->paddr_ptr = desc->paddr_orig;
+ }
+
+ return 0;
+}
+
+static int snp_map_cmd_buf_desc_list(int cmd, void *cmd_buf, struct cmd_buf_desc *desc_list)
+{
+ int i;
+
+ snp_populate_cmd_buf_desc_list(cmd, cmd_buf, desc_list);
+
+ for (i = 0; i < CMD_BUF_DESC_MAX; i++) {
+ struct cmd_buf_desc *desc = &desc_list[i];
+
+ if (!desc->paddr_ptr)
+ break;
+
+ if (snp_map_cmd_buf_desc(desc))
+ goto err_unmap;
+ }
+
+ return 0;
+
+err_unmap:
+ for (i--; i >= 0; i--)
+ snp_unmap_cmd_buf_desc(&desc_list[i]);
+
+ return -EFAULT;
+}
+
+static int snp_unmap_cmd_buf_desc_list(struct cmd_buf_desc *desc_list)
+{
+ int i, ret = 0;
+
+ for (i = 0; i < CMD_BUF_DESC_MAX; i++) {
+ struct cmd_buf_desc *desc = &desc_list[i];
+
+ if (!desc->paddr_ptr)
+ break;
+
+ if (snp_unmap_cmd_buf_desc(&desc_list[i]))
+ ret = -EFAULT;
+ }
+
+ return ret;
+}
+
+static bool sev_cmd_buf_writable(int cmd)
+{
+ switch (cmd) {
+ case SEV_CMD_PLATFORM_STATUS:
+ case SEV_CMD_GUEST_STATUS:
+ case SEV_CMD_LAUNCH_START:
+ case SEV_CMD_RECEIVE_START:
+ case SEV_CMD_LAUNCH_MEASURE:
+ case SEV_CMD_SEND_START:
+ case SEV_CMD_SEND_UPDATE_DATA:
+ case SEV_CMD_SEND_UPDATE_VMSA:
+ case SEV_CMD_PEK_CSR:
+ case SEV_CMD_PDH_CERT_EXPORT:
+ case SEV_CMD_GET_ID:
+ case SEV_CMD_ATTESTATION_REPORT:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/* After SNP is INIT'ed, the behavior of legacy SEV commands is changed. */
+static bool snp_legacy_handling_needed(int cmd)
+{
+ struct sev_device *sev = psp_master->sev_data;
+
+ return cmd < SEV_CMD_SNP_INIT && sev->snp_initialized;
+}
+
+static int snp_prep_cmd_buf(int cmd, void *cmd_buf, struct cmd_buf_desc *desc_list)
+{
+ if (!snp_legacy_handling_needed(cmd))
+ return 0;
+
+ if (snp_map_cmd_buf_desc_list(cmd, cmd_buf, desc_list))
+ return -EFAULT;
+
+ /*
+ * Before command execution, the command buffer needs to be put into
+ * the firmware-owned state.
+ */
+ if (sev_cmd_buf_writable(cmd)) {
+ if (rmp_mark_pages_firmware(__pa(cmd_buf), 1, true))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int snp_reclaim_cmd_buf(int cmd, void *cmd_buf)
+{
+ if (!snp_legacy_handling_needed(cmd))
+ return 0;
+
+ /*
+ * After command completion, the command buffer needs to be put back
+ * into the hypervisor-owned state.
+ */
+ if (sev_cmd_buf_writable(cmd))
+ if (snp_reclaim_pages(__pa(cmd_buf), 1, true))
+ return -EFAULT;
+
+ return 0;
+}
+
static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
{
+ struct cmd_buf_desc desc_list[CMD_BUF_DESC_MAX] = {0};
struct psp_device *psp = psp_master;
struct sev_device *sev;
unsigned int cmdbuff_hi, cmdbuff_lo;
unsigned int phys_lsb, phys_msb;
unsigned int reg, ret = 0;
+ void *cmd_buf;
int buf_len;
if (!psp || !psp->sev_data)
@@ -331,12 +857,47 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
* work for some memory, e.g. vmalloc'd addresses, and @data may not be
* physically contiguous.
*/
- if (data)
- memcpy(sev->cmd_buf, data, buf_len);
+ if (data) {
+ /*
+ * Commands are generally issued one at a time and require the
+ * sev_cmd_mutex, but there could be recursive firmware requests
+ * due to SEV_CMD_SNP_PAGE_RECLAIM needing to be issued while
+ * preparing buffers for another command. This is the only known
+ * case of nesting in the current code, so exactly one
+ * additional command buffer is available for that purpose.
+ */
+ if (!sev->cmd_buf_active) {
+ cmd_buf = sev->cmd_buf;
+ sev->cmd_buf_active = true;
+ } else if (!sev->cmd_buf_backup_active) {
+ cmd_buf = sev->cmd_buf_backup;
+ sev->cmd_buf_backup_active = true;
+ } else {
+ dev_err(sev->dev,
+ "SEV: too many firmware commands in progress, no command buffers available.\n");
+ return -EBUSY;
+ }
+
+ memcpy(cmd_buf, data, buf_len);
+
+ /*
+ * The behavior of the SEV-legacy commands is altered when the
+ * SNP firmware is in the INIT state.
+ */
+ ret = snp_prep_cmd_buf(cmd, cmd_buf, desc_list);
+ if (ret) {
+ dev_err(sev->dev,
+ "SEV: failed to prepare buffer for legacy command 0x%x. Error: %d\n",
+ cmd, ret);
+ return ret;
+ }
+ } else {
+ cmd_buf = sev->cmd_buf;
+ }
/* Get the physical address of the command buffer */
- phys_lsb = data ? lower_32_bits(__psp_pa(sev->cmd_buf)) : 0;
- phys_msb = data ? upper_32_bits(__psp_pa(sev->cmd_buf)) : 0;
+ phys_lsb = data ? lower_32_bits(__psp_pa(cmd_buf)) : 0;
+ phys_msb = data ? upper_32_bits(__psp_pa(cmd_buf)) : 0;
dev_dbg(sev->dev, "sev command id %#x buffer 0x%08x%08x timeout %us\n",
cmd, phys_msb, phys_lsb, psp_timeout);
@@ -390,20 +951,41 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret)
ret = sev_write_init_ex_file_if_required(cmd);
}
- print_hex_dump_debug("(out): ", DUMP_PREFIX_OFFSET, 16, 2, data,
- buf_len, false);
-
/*
* Copy potential output from the PSP back to data. Do this even on
* failure in case the caller wants to glean something from the error.
*/
- if (data)
- memcpy(data, sev->cmd_buf, buf_len);
+ if (data) {
+ int ret_reclaim;
+ /*
+ * Restore the page state after the command completes.
+ */
+ ret_reclaim = snp_reclaim_cmd_buf(cmd, cmd_buf);
+ if (ret_reclaim) {
+ dev_err(sev->dev,
+ "SEV: failed to reclaim buffer for legacy command %#x. Error: %d\n",
+ cmd, ret_reclaim);
+ return ret_reclaim;
+ }
+
+ memcpy(data, cmd_buf, buf_len);
+
+ if (sev->cmd_buf_backup_active)
+ sev->cmd_buf_backup_active = false;
+ else
+ sev->cmd_buf_active = false;
+
+ if (snp_unmap_cmd_buf_desc_list(desc_list))
+ return -EFAULT;
+ }
+
+ print_hex_dump_debug("(out): ", DUMP_PREFIX_OFFSET, 16, 2, data,
+ buf_len, false);
return ret;
}
-static int sev_do_cmd(int cmd, void *data, int *psp_ret)
+int sev_do_cmd(int cmd, void *data, int *psp_ret)
{
int rc;
@@ -413,6 +995,7 @@ static int sev_do_cmd(int cmd, void *data, int *psp_ret)
return rc;
}
+EXPORT_SYMBOL_GPL(sev_do_cmd);
static int __sev_init_locked(int *error)
{
@@ -427,7 +1010,7 @@ static int __sev_init_locked(int *error)
data.tmr_address = __pa(sev_es_tmr);
data.flags |= SEV_INIT_FLAGS_SEV_ES;
- data.tmr_len = SEV_ES_TMR_SIZE;
+ data.tmr_len = sev_es_tmr_size;
}
return __sev_do_cmd_locked(SEV_CMD_INIT, &data, error);
@@ -450,7 +1033,7 @@ static int __sev_init_ex_locked(int *error)
data.tmr_address = __pa(sev_es_tmr);
data.flags |= SEV_INIT_FLAGS_SEV_ES;
- data.tmr_len = SEV_ES_TMR_SIZE;
+ data.tmr_len = sev_es_tmr_size;
}
return __sev_do_cmd_locked(SEV_CMD_INIT_EX, &data, error);
@@ -464,26 +1047,218 @@ static inline int __sev_do_init_locked(int *psp_ret)
return __sev_init_locked(psp_ret);
}
-static int __sev_platform_init_locked(int *error)
+static void snp_set_hsave_pa(void *arg)
+{
+ wrmsrl(MSR_VM_HSAVE_PA, 0);
+}
+
+static int snp_filter_reserved_mem_regions(struct resource *rs, void *arg)
+{
+ struct sev_data_range_list *range_list = arg;
+ struct sev_data_range *range = &range_list->ranges[range_list->num_elements];
+ size_t size;
+
+ /*
+ * Ensure the list of HV_FIXED pages that will be passed to firmware
+ * do not exceed the page-sized argument buffer.
+ */
+ if ((range_list->num_elements * sizeof(struct sev_data_range) +
+ sizeof(struct sev_data_range_list)) > PAGE_SIZE)
+ return -E2BIG;
+
+ switch (rs->desc) {
+ case E820_TYPE_RESERVED:
+ case E820_TYPE_PMEM:
+ case E820_TYPE_ACPI:
+ range->base = rs->start & PAGE_MASK;
+ size = PAGE_ALIGN((rs->end + 1) - rs->start);
+ range->page_count = size >> PAGE_SHIFT;
+ range_list->num_elements++;
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static int __sev_snp_init_locked(int *error)
{
- int rc = 0, psp_ret = SEV_RET_NO_FW_CALL;
struct psp_device *psp = psp_master;
+ struct sev_data_snp_init_ex data;
struct sev_device *sev;
+ void *arg = &data;
+ int cmd, rc = 0;
- if (!psp || !psp->sev_data)
+ if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
return -ENODEV;
sev = psp->sev_data;
- if (sev->state == SEV_STATE_INIT)
+ if (sev->snp_initialized)
return 0;
- if (sev_init_ex_buffer) {
- rc = sev_read_init_ex_file();
- if (rc)
+ if (!sev_version_greater_or_equal(SNP_MIN_API_MAJOR, SNP_MIN_API_MINOR)) {
+ dev_dbg(sev->dev, "SEV-SNP support requires firmware version >= %d:%d\n",
+ SNP_MIN_API_MAJOR, SNP_MIN_API_MINOR);
+ return 0;
+ }
+
+ /* SNP_INIT requires MSR_VM_HSAVE_PA to be cleared on all CPUs. */
+ on_each_cpu(snp_set_hsave_pa, NULL, 1);
+
+ /*
+ * Starting in SNP firmware v1.52, the SNP_INIT_EX command takes a list
+ * of system physical address ranges to convert into HV-fixed page
+ * states during the RMP initialization. For instance, the memory that
+ * UEFI reserves should be included in that list. This allows system
+ * components that occasionally write to memory (e.g. logging to UEFI
+ * reserved regions) to not fail due to RMP initialization and SNP
+ * enablement.
+ *
+ */
+ if (sev_version_greater_or_equal(SNP_MIN_API_MAJOR, 52)) {
+ /*
+ * Firmware checks that the pages containing the ranges enumerated
+ * in the RANGES structure are either in the default page state or in the
+ * firmware page state.
+ */
+ snp_range_list = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!snp_range_list) {
+ dev_err(sev->dev,
+ "SEV: SNP_INIT_EX range list memory allocation failed\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * Retrieve all reserved memory regions from the e820 memory map
+ * to be setup as HV-fixed pages.
+ */
+ rc = walk_iomem_res_desc(IORES_DESC_NONE, IORESOURCE_MEM, 0, ~0,
+ snp_range_list, snp_filter_reserved_mem_regions);
+ if (rc) {
+ dev_err(sev->dev,
+ "SEV: SNP_INIT_EX walk_iomem_res_desc failed rc = %d\n", rc);
return rc;
+ }
+
+ memset(&data, 0, sizeof(data));
+ data.init_rmp = 1;
+ data.list_paddr_en = 1;
+ data.list_paddr = __psp_pa(snp_range_list);
+ cmd = SEV_CMD_SNP_INIT_EX;
+ } else {
+ cmd = SEV_CMD_SNP_INIT;
+ arg = NULL;
+ }
+
+ /*
+ * The following sequence must be issued before launching the first SNP
+ * guest to ensure all dirty cache lines are flushed, including from
+ * updates to the RMP table itself via the RMPUPDATE instruction:
+ *
+ * - WBINVD on all running CPUs
+ * - SEV_CMD_SNP_INIT[_EX] firmware command
+ * - WBINVD on all running CPUs
+ * - SEV_CMD_SNP_DF_FLUSH firmware command
+ */
+ wbinvd_on_all_cpus();
+
+ rc = __sev_do_cmd_locked(cmd, arg, error);
+ if (rc)
+ return rc;
+
+ /* Prepare for first SNP guest launch after INIT. */
+ wbinvd_on_all_cpus();
+ rc = __sev_do_cmd_locked(SEV_CMD_SNP_DF_FLUSH, NULL, error);
+ if (rc)
+ return rc;
+
+ sev->snp_initialized = true;
+ dev_dbg(sev->dev, "SEV-SNP firmware initialized\n");
+
+ sev_es_tmr_size = SNP_TMR_SIZE;
+
+ return rc;
+}
+
+static void __sev_platform_init_handle_tmr(struct sev_device *sev)
+{
+ if (sev_es_tmr)
+ return;
+
+ /* Obtain the TMR memory area for SEV-ES use */
+ sev_es_tmr = sev_fw_alloc(sev_es_tmr_size);
+ if (sev_es_tmr) {
+ /* Must flush the cache before giving it to the firmware */
+ if (!sev->snp_initialized)
+ clflush_cache_range(sev_es_tmr, sev_es_tmr_size);
+ } else {
+ dev_warn(sev->dev, "SEV: TMR allocation failed, SEV-ES support unavailable\n");
+ }
+}
+
+/*
+ * If an init_ex_path is provided, allocate a buffer for the file and
+ * read in the contents. Additionally, if SNP is initialized, convert
+ * the buffer pages to firmware pages.
+ */
+static int __sev_platform_init_handle_init_ex_path(struct sev_device *sev)
+{
+ struct page *page;
+ int rc;
+
+ if (!init_ex_path)
+ return 0;
+
+ if (sev_init_ex_buffer)
+ return 0;
+
+ page = alloc_pages(GFP_KERNEL, get_order(NV_LENGTH));
+ if (!page) {
+ dev_err(sev->dev, "SEV: INIT_EX NV memory allocation failed\n");
+ return -ENOMEM;
+ }
+
+ sev_init_ex_buffer = page_address(page);
+
+ rc = sev_read_init_ex_file();
+ if (rc)
+ return rc;
+
+ /* If SEV-SNP is initialized, transition to firmware page. */
+ if (sev->snp_initialized) {
+ unsigned long npages;
+
+ npages = 1UL << get_order(NV_LENGTH);
+ if (rmp_mark_pages_firmware(__pa(sev_init_ex_buffer), npages, false)) {
+ dev_err(sev->dev, "SEV: INIT_EX NV memory page state change failed.\n");
+ return -ENOMEM;
+ }
}
+ return 0;
+}
+
+static int __sev_platform_init_locked(int *error)
+{
+ int rc, psp_ret = SEV_RET_NO_FW_CALL;
+ struct sev_device *sev;
+
+ if (!psp_master || !psp_master->sev_data)
+ return -ENODEV;
+
+ sev = psp_master->sev_data;
+
+ if (sev->state == SEV_STATE_INIT)
+ return 0;
+
+ __sev_platform_init_handle_tmr(sev);
+
+ rc = __sev_platform_init_handle_init_ex_path(sev);
+ if (rc)
+ return rc;
+
rc = __sev_do_init_locked(&psp_ret);
if (rc && psp_ret == SEV_RET_SECURE_DATA_INVALID) {
/*
@@ -520,12 +1295,46 @@ static int __sev_platform_init_locked(int *error)
return 0;
}
-int sev_platform_init(int *error)
+static int _sev_platform_init_locked(struct sev_platform_init_args *args)
+{
+ struct sev_device *sev;
+ int rc;
+
+ if (!psp_master || !psp_master->sev_data)
+ return -ENODEV;
+
+ sev = psp_master->sev_data;
+
+ if (sev->state == SEV_STATE_INIT)
+ return 0;
+
+ /*
+ * Legacy guests cannot be running while SNP_INIT(_EX) is executing,
+ * so perform SEV-SNP initialization at probe time.
+ */
+ rc = __sev_snp_init_locked(&args->error);
+ if (rc && rc != -ENODEV) {
+ /*
+ * Don't abort the probe if SNP INIT failed,
+ * continue to initialize the legacy SEV firmware.
+ */
+ dev_err(sev->dev, "SEV-SNP: failed to INIT rc %d, error %#x\n",
+ rc, args->error);
+ }
+
+ /* Defer legacy SEV/SEV-ES support if allowed by caller/module. */
+ if (args->probe && !psp_init_on_probe)
+ return 0;
+
+ return __sev_platform_init_locked(&args->error);
+}
+
+int sev_platform_init(struct sev_platform_init_args *args)
{
int rc;
mutex_lock(&sev_cmd_mutex);
- rc = __sev_platform_init_locked(error);
+ rc = _sev_platform_init_locked(args);
mutex_unlock(&sev_cmd_mutex);
return rc;
@@ -556,17 +1365,6 @@ static int __sev_platform_shutdown_locked(int *error)
return ret;
}
-static int sev_platform_shutdown(int *error)
-{
- int rc;
-
- mutex_lock(&sev_cmd_mutex);
- rc = __sev_platform_shutdown_locked(NULL);
- mutex_unlock(&sev_cmd_mutex);
-
- return rc;
-}
-
static int sev_get_platform_state(int *state, int *error)
{
struct sev_user_data_status data;
@@ -842,6 +1640,72 @@ fw_err:
return ret;
}
+static int __sev_snp_shutdown_locked(int *error, bool panic)
+{
+ struct sev_device *sev = psp_master->sev_data;
+ struct sev_data_snp_shutdown_ex data;
+ int ret;
+
+ if (!sev->snp_initialized)
+ return 0;
+
+ memset(&data, 0, sizeof(data));
+ data.len = sizeof(data);
+ data.iommu_snp_shutdown = 1;
+
+ /*
+ * If invoked during panic handling, local interrupts are disabled
+ * and all CPUs are stopped, so wbinvd_on_all_cpus() can't be called.
+ * In that case, a wbinvd() is done on remote CPUs via the NMI
+ * callback, so only a local wbinvd() is needed here.
+ */
+ if (!panic)
+ wbinvd_on_all_cpus();
+ else
+ wbinvd();
+
+ ret = __sev_do_cmd_locked(SEV_CMD_SNP_SHUTDOWN_EX, &data, error);
+ /* SHUTDOWN may require DF_FLUSH */
+ if (*error == SEV_RET_DFFLUSH_REQUIRED) {
+ ret = __sev_do_cmd_locked(SEV_CMD_SNP_DF_FLUSH, NULL, NULL);
+ if (ret) {
+ dev_err(sev->dev, "SEV-SNP DF_FLUSH failed\n");
+ return ret;
+ }
+ /* reissue the shutdown command */
+ ret = __sev_do_cmd_locked(SEV_CMD_SNP_SHUTDOWN_EX, &data,
+ error);
+ }
+ if (ret) {
+ dev_err(sev->dev, "SEV-SNP firmware shutdown failed\n");
+ return ret;
+ }
+
+ /*
+ * SNP_SHUTDOWN_EX with IOMMU_SNP_SHUTDOWN set to 1 disables SNP
+ * enforcement by the IOMMU and also transitions all pages
+ * associated with the IOMMU to the Reclaim state.
+	 * Firmware versions before 1.53 transitioned the IOMMU pages to
+	 * Hypervisor state, but accounted for the number of assigned 4kB
+	 * pages in a 2M page incorrectly by not transitioning them to the
+	 * Reclaim state. This resulted in an RMP #PF when the 2M page
+	 * containing those pages was later accessed during kexec boot.
+	 * Hence, the firmware now transitions these pages to the Reclaim
+	 * state and the hypervisor needs to transition them to the Shared
+	 * state. SNP firmware version 1.53 or above is needed for kexec boot.
+ */
+ ret = amd_iommu_snp_disable();
+ if (ret) {
+ dev_err(sev->dev, "SNP IOMMU shutdown failed\n");
+ return ret;
+ }
+
+ sev->snp_initialized = false;
+ dev_dbg(sev->dev, "SEV-SNP firmware shutdown\n");
+
+ return ret;
+}
+
static int sev_ioctl_do_pek_import(struct sev_issue_cmd *argp, bool writable)
{
struct sev_device *sev = psp_master->sev_data;
@@ -1084,6 +1948,85 @@ e_free_pdh:
return ret;
}
+static int sev_ioctl_do_snp_platform_status(struct sev_issue_cmd *argp)
+{
+ struct sev_device *sev = psp_master->sev_data;
+ struct sev_data_snp_addr buf;
+ struct page *status_page;
+ void *data;
+ int ret;
+
+ if (!sev->snp_initialized || !argp->data)
+ return -EINVAL;
+
+ status_page = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!status_page)
+ return -ENOMEM;
+
+ data = page_address(status_page);
+
+ /*
+ * Firmware expects status page to be in firmware-owned state, otherwise
+ * it will report firmware error code INVALID_PAGE_STATE (0x1A).
+ */
+ if (rmp_mark_pages_firmware(__pa(data), 1, true)) {
+ ret = -EFAULT;
+ goto cleanup;
+ }
+
+ buf.address = __psp_pa(data);
+ ret = __sev_do_cmd_locked(SEV_CMD_SNP_PLATFORM_STATUS, &buf, &argp->error);
+
+ /*
+ * Status page will be transitioned to Reclaim state upon success, or
+	 * left in Firmware state on failure. Use snp_reclaim_pages() to
+ * transition either case back to Hypervisor-owned state.
+ */
+ if (snp_reclaim_pages(__pa(data), 1, true))
+ return -EFAULT;
+
+ if (ret)
+ goto cleanup;
+
+ if (copy_to_user((void __user *)argp->data, data,
+ sizeof(struct sev_user_data_snp_status)))
+ ret = -EFAULT;
+
+cleanup:
+ __free_pages(status_page, 0);
+ return ret;
+}
+
+static int sev_ioctl_do_snp_commit(struct sev_issue_cmd *argp)
+{
+ struct sev_device *sev = psp_master->sev_data;
+ struct sev_data_snp_commit buf;
+
+ if (!sev->snp_initialized)
+ return -EINVAL;
+
+ buf.len = sizeof(buf);
+
+ return __sev_do_cmd_locked(SEV_CMD_SNP_COMMIT, &buf, &argp->error);
+}
+
+static int sev_ioctl_do_snp_set_config(struct sev_issue_cmd *argp, bool writable)
+{
+ struct sev_device *sev = psp_master->sev_data;
+ struct sev_user_data_snp_config config;
+
+ if (!sev->snp_initialized || !argp->data)
+ return -EINVAL;
+
+ if (!writable)
+ return -EPERM;
+
+ if (copy_from_user(&config, (void __user *)argp->data, sizeof(config)))
+ return -EFAULT;
+
+ return __sev_do_cmd_locked(SEV_CMD_SNP_CONFIG, &config, &argp->error);
+}
+
static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg)
{
void __user *argp = (void __user *)arg;
@@ -1135,6 +2078,15 @@ static long sev_ioctl(struct file *file, unsigned int ioctl, unsigned long arg)
case SEV_GET_ID2:
ret = sev_ioctl_do_get_id2(&input);
break;
+ case SNP_PLATFORM_STATUS:
+ ret = sev_ioctl_do_snp_platform_status(&input);
+ break;
+ case SNP_COMMIT:
+ ret = sev_ioctl_do_snp_commit(&input);
+ break;
+ case SNP_SET_CONFIG:
+ ret = sev_ioctl_do_snp_set_config(&input, writable);
+ break;
default:
ret = -EINVAL;
goto out;
@@ -1245,10 +2197,12 @@ int sev_dev_init(struct psp_device *psp)
if (!sev)
goto e_err;
- sev->cmd_buf = (void *)devm_get_free_pages(dev, GFP_KERNEL, 0);
+ sev->cmd_buf = (void *)devm_get_free_pages(dev, GFP_KERNEL, 1);
if (!sev->cmd_buf)
goto e_sev;
+ sev->cmd_buf_backup = (uint8_t *)sev->cmd_buf + PAGE_SIZE;
+
psp->sev_data = sev;
sev->dev = dev;
@@ -1287,24 +2241,51 @@ e_err:
return ret;
}
-static void sev_firmware_shutdown(struct sev_device *sev)
+static void __sev_firmware_shutdown(struct sev_device *sev, bool panic)
{
- sev_platform_shutdown(NULL);
+ int error;
+
+ __sev_platform_shutdown_locked(NULL);
if (sev_es_tmr) {
- /* The TMR area was encrypted, flush it from the cache */
- wbinvd_on_all_cpus();
+ /*
+ * The TMR area was encrypted, flush it from the cache.
+ *
+ * If invoked during panic handling, local interrupts are
+ * disabled and all CPUs are stopped, so wbinvd_on_all_cpus()
+ * can't be used. In that case, wbinvd() is done on remote CPUs
+ * via the NMI callback, and done for this CPU later during
+ * SNP shutdown, so wbinvd_on_all_cpus() can be skipped.
+ */
+ if (!panic)
+ wbinvd_on_all_cpus();
- free_pages((unsigned long)sev_es_tmr,
- get_order(SEV_ES_TMR_SIZE));
+ __snp_free_firmware_pages(virt_to_page(sev_es_tmr),
+ get_order(sev_es_tmr_size),
+ true);
sev_es_tmr = NULL;
}
if (sev_init_ex_buffer) {
- free_pages((unsigned long)sev_init_ex_buffer,
- get_order(NV_LENGTH));
+ __snp_free_firmware_pages(virt_to_page(sev_init_ex_buffer),
+ get_order(NV_LENGTH),
+ true);
sev_init_ex_buffer = NULL;
}
+
+ if (snp_range_list) {
+ kfree(snp_range_list);
+ snp_range_list = NULL;
+ }
+
+ __sev_snp_shutdown_locked(&error, panic);
+}
+
+static void sev_firmware_shutdown(struct sev_device *sev)
+{
+ mutex_lock(&sev_cmd_mutex);
+ __sev_firmware_shutdown(sev, false);
+ mutex_unlock(&sev_cmd_mutex);
}
void sev_dev_destroy(struct psp_device *psp)
@@ -1322,6 +2303,29 @@ void sev_dev_destroy(struct psp_device *psp)
psp_clear_sev_irq_handler(psp);
}
+static int snp_shutdown_on_panic(struct notifier_block *nb,
+ unsigned long reason, void *arg)
+{
+ struct sev_device *sev = psp_master->sev_data;
+
+ /*
+ * If sev_cmd_mutex is already acquired, then it's likely
+ * another PSP command is in flight and issuing a shutdown
+ * would fail in unexpected ways. Rather than create even
+ * more confusion during a panic, just bail out here.
+ */
+ if (mutex_is_locked(&sev_cmd_mutex))
+ return NOTIFY_DONE;
+
+ __sev_firmware_shutdown(sev, true);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block snp_panic_notifier = {
+ .notifier_call = snp_shutdown_on_panic,
+};
+
int sev_issue_cmd_external_user(struct file *filep, unsigned int cmd,
void *data, int *error)
{
@@ -1335,7 +2339,8 @@ EXPORT_SYMBOL_GPL(sev_issue_cmd_external_user);
void sev_pci_init(void)
{
struct sev_device *sev = psp_master->sev_data;
- int error, rc;
+ struct sev_platform_init_args args = {0};
+ int rc;
if (!sev)
return;
@@ -1348,36 +2353,18 @@ void sev_pci_init(void)
if (sev_update_firmware(sev->dev) == 0)
sev_get_api_version();
- /* If an init_ex_path is provided rely on INIT_EX for PSP initialization
- * instead of INIT.
- */
- if (init_ex_path) {
- sev_init_ex_buffer = sev_fw_alloc(NV_LENGTH);
- if (!sev_init_ex_buffer) {
- dev_err(sev->dev,
- "SEV: INIT_EX NV memory allocation failed\n");
- goto err;
- }
- }
-
- /* Obtain the TMR memory area for SEV-ES use */
- sev_es_tmr = sev_fw_alloc(SEV_ES_TMR_SIZE);
- if (sev_es_tmr)
- /* Must flush the cache before giving it to the firmware */
- clflush_cache_range(sev_es_tmr, SEV_ES_TMR_SIZE);
- else
- dev_warn(sev->dev,
- "SEV: TMR allocation failed, SEV-ES support unavailable\n");
-
- if (!psp_init_on_probe)
- return;
-
/* Initialize the platform */
- rc = sev_platform_init(&error);
+ args.probe = true;
+ rc = sev_platform_init(&args);
if (rc)
dev_err(sev->dev, "SEV: failed to INIT error %#x, rc %d\n",
- error, rc);
+ args.error, rc);
+ dev_info(sev->dev, "SEV%s API:%d.%d build:%d\n", sev->snp_initialized ?
+ "-SNP" : "", sev->api_major, sev->api_minor, sev->build);
+
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &snp_panic_notifier);
return;
err:
@@ -1392,4 +2379,7 @@ void sev_pci_exit(void)
return;
sev_firmware_shutdown(sev);
+
+ atomic_notifier_chain_unregister(&panic_notifier_list,
+ &snp_panic_notifier);
}
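
The new SNP_PLATFORM_STATUS, SNP_COMMIT and SNP_SET_CONFIG ioctls above are reachable from userspace through /dev/sev. A rough sketch of querying the SNP platform status, assuming the updated uapi <linux/psp-sev.h> from this series is installed and /dev/sev is accessible (illustrative only, not part of the patch):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/psp-sev.h>

    int main(void)
    {
            struct sev_user_data_snp_status status = { 0 };
            struct sev_issue_cmd cmd = {
                    .cmd  = SNP_PLATFORM_STATUS,
                    .data = (unsigned long)&status,
            };
            int fd = open("/dev/sev", O_RDWR);

            if (fd < 0)
                    return 1;
            if (ioctl(fd, SEV_ISSUE_CMD, &cmd) < 0)
                    fprintf(stderr, "SNP_PLATFORM_STATUS failed, fw error %#x\n",
                            cmd.error);
            else
                    printf("SNP API %u.%u, state %u\n",
                           status.api_major, status.api_minor, status.state);
            close(fd);
            return 0;
    }
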
diff --git a/drivers/crypto/ccp/sev-dev.h b/drivers/crypto/ccp/sev-dev.h
index 778c95155e74..3e4e5574e88a 100644
--- a/drivers/crypto/ccp/sev-dev.h
+++ b/drivers/crypto/ccp/sev-dev.h
@@ -52,6 +52,11 @@ struct sev_device {
u8 build;
void *cmd_buf;
+ void *cmd_buf_backup;
+ bool cmd_buf_active;
+ bool cmd_buf_backup_active;
+
+ bool snp_initialized;
};
int sev_dev_init(struct psp_device *psp);
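
The cmd_buf_backup field added here pairs with the sev_dev_init() change above, which bumps the devm allocation from order 0 to order 1 so the command buffer and its backup occupy one contiguous pair of pages. A minimal sketch of that split; example_alloc_cmd_bufs() is an illustrative name, not part of the patch:

    static int example_alloc_cmd_bufs(struct device *dev, struct sev_device *sev)
    {
            /* order 1 == two contiguous pages */
            sev->cmd_buf = (void *)devm_get_free_pages(dev, GFP_KERNEL, 1);
            if (!sev->cmd_buf)
                    return -ENOMEM;

            /* the second page serves as the backup buffer */
            sev->cmd_buf_backup = (u8 *)sev->cmd_buf + PAGE_SIZE;
            return 0;
    }
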
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 5a7f3fabee22..16c8de5050e5 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -78,6 +78,7 @@ config EDAC_GHES
config EDAC_AMD64
tristate "AMD64 (Opteron, Athlon64)"
depends on AMD_NB && EDAC_DECODE_MCE
+ imply AMD_ATL
help
Support for error detection and correction of DRAM ECC errors on
the AMD64 families (>= K8) of memory controllers.
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 2b8c20bb926a..1f3520d76861 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/ras.h>
#include "amd64_edac.h"
#include <asm/amd_nb.h>
@@ -1051,281 +1052,6 @@ static int fixup_node_id(int node_id, struct mce *m)
return nid - gpu_node_map.base_node_id + 1;
}
-/* Protect the PCI config register pairs used for DF indirect access. */
-static DEFINE_MUTEX(df_indirect_mutex);
-
-/*
- * Data Fabric Indirect Access uses FICAA/FICAD.
- *
- * Fabric Indirect Configuration Access Address (FICAA): Constructed based
- * on the device's Instance Id and the PCI function and register offset of
- * the desired register.
- *
- * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO
- * and FICAD HI registers but so far we only need the LO register.
- *
- * Use Instance Id 0xFF to indicate a broadcast read.
- */
-#define DF_BROADCAST 0xFF
-static int __df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo)
-{
- struct pci_dev *F4;
- u32 ficaa;
- int err = -ENODEV;
-
- if (node >= amd_nb_num())
- goto out;
-
- F4 = node_to_amd_nb(node)->link;
- if (!F4)
- goto out;
-
- ficaa = (instance_id == DF_BROADCAST) ? 0 : 1;
- ficaa |= reg & 0x3FC;
- ficaa |= (func & 0x7) << 11;
- ficaa |= instance_id << 16;
-
- mutex_lock(&df_indirect_mutex);
-
- err = pci_write_config_dword(F4, 0x5C, ficaa);
- if (err) {
- pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa);
- goto out_unlock;
- }
-
- err = pci_read_config_dword(F4, 0x98, lo);
- if (err)
- pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa);
-
-out_unlock:
- mutex_unlock(&df_indirect_mutex);
-
-out:
- return err;
-}
-
-static int df_indirect_read_instance(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo)
-{
- return __df_indirect_read(node, func, reg, instance_id, lo);
-}
-
-static int df_indirect_read_broadcast(u16 node, u8 func, u16 reg, u32 *lo)
-{
- return __df_indirect_read(node, func, reg, DF_BROADCAST, lo);
-}
-
-struct addr_ctx {
- u64 ret_addr;
- u32 tmp;
- u16 nid;
- u8 inst_id;
-};
-
-static int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
-{
- u64 dram_base_addr, dram_limit_addr, dram_hole_base;
-
- u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask;
- u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets;
- u8 intlv_addr_sel, intlv_addr_bit;
- u8 num_intlv_bits, hashed_bit;
- u8 lgcy_mmio_hole_en, base = 0;
- u8 cs_mask, cs_id = 0;
- bool hash_enabled = false;
-
- struct addr_ctx ctx;
-
- memset(&ctx, 0, sizeof(ctx));
-
- /* Start from the normalized address */
- ctx.ret_addr = norm_addr;
-
- ctx.nid = nid;
- ctx.inst_id = umc;
-
- /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */
- if (df_indirect_read_instance(nid, 0, 0x1B4, umc, &ctx.tmp))
- goto out_err;
-
- /* Remove HiAddrOffset from normalized address, if enabled: */
- if (ctx.tmp & BIT(0)) {
- u64 hi_addr_offset = (ctx.tmp & GENMASK_ULL(31, 20)) << 8;
-
- if (norm_addr >= hi_addr_offset) {
- ctx.ret_addr -= hi_addr_offset;
- base = 1;
- }
- }
-
- /* Read D18F0x110 (DramBaseAddress). */
- if (df_indirect_read_instance(nid, 0, 0x110 + (8 * base), umc, &ctx.tmp))
- goto out_err;
-
- /* Check if address range is valid. */
- if (!(ctx.tmp & BIT(0))) {
- pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n",
- __func__, ctx.tmp);
- goto out_err;
- }
-
- lgcy_mmio_hole_en = ctx.tmp & BIT(1);
- intlv_num_chan = (ctx.tmp >> 4) & 0xF;
- intlv_addr_sel = (ctx.tmp >> 8) & 0x7;
- dram_base_addr = (ctx.tmp & GENMASK_ULL(31, 12)) << 16;
-
- /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */
- if (intlv_addr_sel > 3) {
- pr_err("%s: Invalid interleave address select %d.\n",
- __func__, intlv_addr_sel);
- goto out_err;
- }
-
- /* Read D18F0x114 (DramLimitAddress). */
- if (df_indirect_read_instance(nid, 0, 0x114 + (8 * base), umc, &ctx.tmp))
- goto out_err;
-
- intlv_num_sockets = (ctx.tmp >> 8) & 0x1;
- intlv_num_dies = (ctx.tmp >> 10) & 0x3;
- dram_limit_addr = ((ctx.tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0);
-
- intlv_addr_bit = intlv_addr_sel + 8;
-
- /* Re-use intlv_num_chan by setting it equal to log2(#channels) */
- switch (intlv_num_chan) {
- case 0: intlv_num_chan = 0; break;
- case 1: intlv_num_chan = 1; break;
- case 3: intlv_num_chan = 2; break;
- case 5: intlv_num_chan = 3; break;
- case 7: intlv_num_chan = 4; break;
-
- case 8: intlv_num_chan = 1;
- hash_enabled = true;
- break;
- default:
- pr_err("%s: Invalid number of interleaved channels %d.\n",
- __func__, intlv_num_chan);
- goto out_err;
- }
-
- num_intlv_bits = intlv_num_chan;
-
- if (intlv_num_dies > 2) {
- pr_err("%s: Invalid number of interleaved nodes/dies %d.\n",
- __func__, intlv_num_dies);
- goto out_err;
- }
-
- num_intlv_bits += intlv_num_dies;
-
- /* Add a bit if sockets are interleaved. */
- num_intlv_bits += intlv_num_sockets;
-
- /* Assert num_intlv_bits <= 4 */
- if (num_intlv_bits > 4) {
- pr_err("%s: Invalid interleave bits %d.\n",
- __func__, num_intlv_bits);
- goto out_err;
- }
-
- if (num_intlv_bits > 0) {
- u64 temp_addr_x, temp_addr_i, temp_addr_y;
- u8 die_id_bit, sock_id_bit, cs_fabric_id;
-
- /*
- * Read FabricBlockInstanceInformation3_CS[BlockFabricID].
- * This is the fabric id for this coherent slave. Use
- * umc/channel# as instance id of the coherent slave
- * for FICAA.
- */
- if (df_indirect_read_instance(nid, 0, 0x50, umc, &ctx.tmp))
- goto out_err;
-
- cs_fabric_id = (ctx.tmp >> 8) & 0xFF;
- die_id_bit = 0;
-
- /* If interleaved over more than 1 channel: */
- if (intlv_num_chan) {
- die_id_bit = intlv_num_chan;
- cs_mask = (1 << die_id_bit) - 1;
- cs_id = cs_fabric_id & cs_mask;
- }
-
- sock_id_bit = die_id_bit;
-
- /* Read D18F1x208 (SystemFabricIdMask). */
- if (intlv_num_dies || intlv_num_sockets)
- if (df_indirect_read_broadcast(nid, 1, 0x208, &ctx.tmp))
- goto out_err;
-
- /* If interleaved over more than 1 die. */
- if (intlv_num_dies) {
- sock_id_bit = die_id_bit + intlv_num_dies;
- die_id_shift = (ctx.tmp >> 24) & 0xF;
- die_id_mask = (ctx.tmp >> 8) & 0xFF;
-
- cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit;
- }
-
- /* If interleaved over more than 1 socket. */
- if (intlv_num_sockets) {
- socket_id_shift = (ctx.tmp >> 28) & 0xF;
- socket_id_mask = (ctx.tmp >> 16) & 0xFF;
-
- cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit;
- }
-
- /*
- * The pre-interleaved address consists of XXXXXXIIIYYYYY
- * where III is the ID for this CS, and XXXXXXYYYYY are the
- * address bits from the post-interleaved address.
- * "num_intlv_bits" has been calculated to tell us how many "I"
- * bits there are. "intlv_addr_bit" tells us how many "Y" bits
- * there are (where "I" starts).
- */
- temp_addr_y = ctx.ret_addr & GENMASK_ULL(intlv_addr_bit - 1, 0);
- temp_addr_i = (cs_id << intlv_addr_bit);
- temp_addr_x = (ctx.ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits;
- ctx.ret_addr = temp_addr_x | temp_addr_i | temp_addr_y;
- }
-
- /* Add dram base address */
- ctx.ret_addr += dram_base_addr;
-
- /* If legacy MMIO hole enabled */
- if (lgcy_mmio_hole_en) {
- if (df_indirect_read_broadcast(nid, 0, 0x104, &ctx.tmp))
- goto out_err;
-
- dram_hole_base = ctx.tmp & GENMASK(31, 24);
- if (ctx.ret_addr >= dram_hole_base)
- ctx.ret_addr += (BIT_ULL(32) - dram_hole_base);
- }
-
- if (hash_enabled) {
- /* Save some parentheses and grab ls-bit at the end. */
- hashed_bit = (ctx.ret_addr >> 12) ^
- (ctx.ret_addr >> 18) ^
- (ctx.ret_addr >> 21) ^
- (ctx.ret_addr >> 30) ^
- cs_id;
-
- hashed_bit &= BIT(0);
-
- if (hashed_bit != ((ctx.ret_addr >> intlv_addr_bit) & BIT(0)))
- ctx.ret_addr ^= BIT(intlv_addr_bit);
- }
-
- /* Is calculated system address is above DRAM limit address? */
- if (ctx.ret_addr > dram_limit_addr)
- goto out_err;
-
- *sys_addr = ctx.ret_addr;
- return 0;
-
-out_err:
- return -EINVAL;
-}
-
static int get_channel_from_ecc_syndrome(struct mem_ctl_info *, u16);
/*
@@ -3073,9 +2799,10 @@ static void decode_umc_error(int node_id, struct mce *m)
{
u8 ecc_type = (m->status >> 45) & 0x3;
struct mem_ctl_info *mci;
+ unsigned long sys_addr;
struct amd64_pvt *pvt;
+ struct atl_err a_err;
struct err_info err;
- u64 sys_addr;
node_id = fixup_node_id(node_id, m);
@@ -3106,7 +2833,12 @@ static void decode_umc_error(int node_id, struct mce *m)
pvt->ops->get_err_info(m, &err);
- if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, err.channel, &sys_addr)) {
+ a_err.addr = m->addr;
+ a_err.ipid = m->ipid;
+ a_err.cpu = m->extcpu;
+
+ sys_addr = amd_convert_umc_mca_addr_to_sys_addr(&a_err);
+ if (IS_ERR_VALUE(sys_addr)) {
err.err_code = ERR_NORM_ADDR;
goto log_error;
}
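
decode_umc_error() now receives the translated address as an unsigned long with the error code folded into the top of the address space, tested with IS_ERR_VALUE(). A hedged sketch of that calling pattern; example_translate() is illustrative, while amd_convert_umc_mca_addr_to_sys_addr() and struct atl_err come from <linux/ras.h> as used above:

    #include <linux/err.h>
    #include <linux/ras.h>

    static int example_translate(struct atl_err *a_err)
    {
            unsigned long sys_addr = amd_convert_umc_mca_addr_to_sys_addr(a_err);

            if (IS_ERR_VALUE(sys_addr))
                    return -EINVAL;	/* translation failed or library absent */

            pr_info("normalized address maps to system address 0x%lx\n", sys_addr);
            return 0;
    }
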
diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index 2b83d6de9352..3fd22a1eb1a9 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -951,6 +951,7 @@ static const struct x86_cpu_id i10nm_cpuids[] = {
X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(EMERALDRAPIDS_X, X86_STEPPINGS(0x0, 0xf), &spr_cfg),
X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(GRANITERAPIDS_X, X86_STEPPINGS(0x0, 0xf), &gnr_cfg),
X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ATOM_CRESTMONT_X, X86_STEPPINGS(0x0, 0xf), &gnr_cfg),
+ X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(ATOM_CRESTMONT, X86_STEPPINGS(0x0, 0xf), &gnr_cfg),
{}
};
MODULE_DEVICE_TABLE(x86cpu, i10nm_cpuids);
diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c
index 2b0ecdeba5cd..cdd8480e7368 100644
--- a/drivers/edac/igen6_edac.c
+++ b/drivers/edac/igen6_edac.c
@@ -238,6 +238,7 @@ static struct work_struct ecclog_work;
#define DID_ADL_N_SKU9 0x4678
#define DID_ADL_N_SKU10 0x4679
#define DID_ADL_N_SKU11 0x467c
+#define DID_ADL_N_SKU12 0x4632
/* Compute die IDs for Raptor Lake-P with IBECC */
#define DID_RPL_P_SKU1 0xa706
@@ -583,6 +584,7 @@ static const struct pci_device_id igen6_pci_tbl[] = {
{ PCI_VDEVICE(INTEL, DID_ADL_N_SKU9), (kernel_ulong_t)&adl_n_cfg },
{ PCI_VDEVICE(INTEL, DID_ADL_N_SKU10), (kernel_ulong_t)&adl_n_cfg },
{ PCI_VDEVICE(INTEL, DID_ADL_N_SKU11), (kernel_ulong_t)&adl_n_cfg },
+ { PCI_VDEVICE(INTEL, DID_ADL_N_SKU12), (kernel_ulong_t)&adl_n_cfg },
{ PCI_VDEVICE(INTEL, DID_RPL_P_SKU1), (kernel_ulong_t)&rpl_p_cfg },
{ PCI_VDEVICE(INTEL, DID_RPL_P_SKU2), (kernel_ulong_t)&rpl_p_cfg },
{ PCI_VDEVICE(INTEL, DID_RPL_P_SKU3), (kernel_ulong_t)&rpl_p_cfg },
diff --git a/drivers/edac/synopsys_edac.c b/drivers/edac/synopsys_edac.c
index 709babce43ba..5527055b0964 100644
--- a/drivers/edac/synopsys_edac.c
+++ b/drivers/edac/synopsys_edac.c
@@ -1324,11 +1324,9 @@ static int mc_probe(struct platform_device *pdev)
struct synps_edac_priv *priv;
struct mem_ctl_info *mci;
void __iomem *baseaddr;
- struct resource *res;
int rc;
- res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
- baseaddr = devm_ioremap_resource(&pdev->dev, res);
+ baseaddr = devm_platform_ioremap_resource(pdev, 0);
if (IS_ERR(baseaddr))
return PTR_ERR(baseaddr);
diff --git a/drivers/edac/versal_edac.c b/drivers/edac/versal_edac.c
index 62caf454b567..1688a5050f63 100644
--- a/drivers/edac/versal_edac.c
+++ b/drivers/edac/versal_edac.c
@@ -42,8 +42,11 @@
#define ECCW0_FLIP_CTRL 0x109C
#define ECCW0_FLIP0_OFFSET 0x10A0
+#define ECCW0_FLIP0_BITS 31
+#define ECCW0_FLIP1_OFFSET 0x10A4
#define ECCW1_FLIP_CTRL 0x10AC
#define ECCW1_FLIP0_OFFSET 0x10B0
+#define ECCW1_FLIP1_OFFSET 0x10B4
#define ECCR0_CERR_STAT_OFFSET 0x10BC
#define ECCR0_CE_ADDR_LO_OFFSET 0x10C0
#define ECCR0_CE_ADDR_HI_OFFSET 0x10C4
@@ -116,9 +119,6 @@
#define XDDR_BUS_WIDTH_32 1
#define XDDR_BUS_WIDTH_16 2
-#define ECC_CEPOISON_MASK 0x1
-#define ECC_UEPOISON_MASK 0x3
-
#define XDDR_MAX_ROW_CNT 18
#define XDDR_MAX_COL_CNT 10
#define XDDR_MAX_RANK_CNT 2
@@ -133,6 +133,7 @@
* https://docs.xilinx.com/r/en-US/am012-versal-register-reference/PCSR_LOCK-XRAM_SLCR-Register
*/
#define PCSR_UNLOCK_VAL 0xF9E8D7C6
+#define PCSR_LOCK_VAL 1
#define XDDR_ERR_TYPE_CE 0
#define XDDR_ERR_TYPE_UE 1
@@ -142,6 +143,7 @@
#define XILINX_DRAM_SIZE_12G 3
#define XILINX_DRAM_SIZE_16G 4
#define XILINX_DRAM_SIZE_32G 5
+#define NUM_UE_BITPOS 2
/**
* struct ecc_error_info - ECC error log information.
@@ -479,7 +481,7 @@ static void err_callback(const u32 *payload, void *data)
writel(regval, priv->ddrmc_baseaddr + XDDR_ISR_OFFSET);
/* Lock the PCSR registers */
- writel(1, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
+ writel(PCSR_LOCK_VAL, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
edac_dbg(3, "Total error count CE %d UE %d\n",
priv->ce_cnt, priv->ue_cnt);
}
@@ -650,7 +652,7 @@ static void enable_intr(struct edac_priv *priv)
writel(XDDR_IRQ_UE_MASK,
priv->ddrmc_baseaddr + XDDR_IRQ1_EN_OFFSET);
/* Lock the PCSR registers */
- writel(1, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
+ writel(PCSR_LOCK_VAL, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
}
static void disable_intr(struct edac_priv *priv)
@@ -663,7 +665,7 @@ static void disable_intr(struct edac_priv *priv)
priv->ddrmc_baseaddr + XDDR_IRQ_DIS_OFFSET);
/* Lock the PCSR registers */
- writel(1, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
+ writel(PCSR_LOCK_VAL, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
}
#define to_mci(k) container_of(k, struct mem_ctl_info, dev)
@@ -734,38 +736,63 @@ static void poison_setup(struct edac_priv *priv)
writel(regval, priv->ddrmc_noc_baseaddr + XDDR_NOC_REG_ADEC15_OFFSET);
}
-static ssize_t xddr_inject_data_poison_store(struct mem_ctl_info *mci,
- const char __user *data)
+static void xddr_inject_data_ce_store(struct mem_ctl_info *mci, u8 ce_bitpos)
{
+ u32 ecc0_flip0, ecc1_flip0, ecc0_flip1, ecc1_flip1;
struct edac_priv *priv = mci->pvt_info;
- writel(0, priv->ddrmc_baseaddr + ECCW0_FLIP0_OFFSET);
- writel(0, priv->ddrmc_baseaddr + ECCW1_FLIP0_OFFSET);
-
- if (strncmp(data, "CE", 2) == 0) {
- writel(ECC_CEPOISON_MASK, priv->ddrmc_baseaddr +
- ECCW0_FLIP0_OFFSET);
- writel(ECC_CEPOISON_MASK, priv->ddrmc_baseaddr +
- ECCW1_FLIP0_OFFSET);
+ if (ce_bitpos < ECCW0_FLIP0_BITS) {
+ ecc0_flip0 = BIT(ce_bitpos);
+ ecc1_flip0 = BIT(ce_bitpos);
+ ecc0_flip1 = 0;
+ ecc1_flip1 = 0;
} else {
- writel(ECC_UEPOISON_MASK, priv->ddrmc_baseaddr +
- ECCW0_FLIP0_OFFSET);
- writel(ECC_UEPOISON_MASK, priv->ddrmc_baseaddr +
- ECCW1_FLIP0_OFFSET);
+ ce_bitpos = ce_bitpos - ECCW0_FLIP0_BITS;
+ ecc0_flip1 = BIT(ce_bitpos);
+ ecc1_flip1 = BIT(ce_bitpos);
+ ecc0_flip0 = 0;
+ ecc1_flip0 = 0;
}
- /* Lock the PCSR registers */
- writel(1, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
-
- return 0;
+ writel(ecc0_flip0, priv->ddrmc_baseaddr + ECCW0_FLIP0_OFFSET);
+ writel(ecc1_flip0, priv->ddrmc_baseaddr + ECCW1_FLIP0_OFFSET);
+ writel(ecc0_flip1, priv->ddrmc_baseaddr + ECCW0_FLIP1_OFFSET);
+ writel(ecc1_flip1, priv->ddrmc_baseaddr + ECCW1_FLIP1_OFFSET);
}
-static ssize_t inject_data_poison_store(struct file *file, const char __user *data,
- size_t count, loff_t *ppos)
+/*
+ * To inject a correctable error, the following steps are needed:
+ *
+ * - Write the correctable error bit position value:
+ * echo <bit_pos val> > /sys/kernel/debug/edac/<controller instance>/inject_ce
+ *
+ * poison_setup() derives the row, column, bank, group and rank and
+ * writes to the ADEC registers based on the address given by the user.
+ *
+ * The ADEC12 and ADEC13 are mask registers; write 0 to make sure default
+ * configuration is there and no addresses are masked.
+ *
+ * The row, column, bank, group and rank registers are written to the
+ * match ADEC bit to generate errors at the particular address. ADEC14
+ * and ADEC15 have the match bits.
+ *
+ * xddr_inject_data_ce_store() updates the ECC FLIP registers with the
+ * bits to be corrupted based on the bit position given by the user.
+ *
+ * Upon doing a read to the address the errors are injected.
+ */
+static ssize_t inject_data_ce_store(struct file *file, const char __user *data,
+ size_t count, loff_t *ppos)
{
struct device *dev = file->private_data;
struct mem_ctl_info *mci = to_mci(dev);
struct edac_priv *priv = mci->pvt_info;
+ u8 ce_bitpos;
+ int ret;
+
+ ret = kstrtou8_from_user(data, count, 0, &ce_bitpos);
+ if (ret)
+ return ret;
/* Unlock the PCSR registers */
writel(PCSR_UNLOCK_VAL, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
@@ -773,17 +800,110 @@ static ssize_t inject_data_poison_store(struct file *file, const char __user *da
poison_setup(priv);
+ xddr_inject_data_ce_store(mci, ce_bitpos);
+ ret = count;
+
/* Lock the PCSR registers */
- writel(1, priv->ddrmc_noc_baseaddr + XDDR_PCSR_OFFSET);
+ writel(PCSR_LOCK_VAL, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
+ writel(PCSR_LOCK_VAL, priv->ddrmc_noc_baseaddr + XDDR_PCSR_OFFSET);
+
+ return ret;
+}
+
+static const struct file_operations xddr_inject_ce_fops = {
+ .open = simple_open,
+ .write = inject_data_ce_store,
+ .llseek = generic_file_llseek,
+};
+
+static void xddr_inject_data_ue_store(struct mem_ctl_info *mci, u32 val0, u32 val1)
+{
+ struct edac_priv *priv = mci->pvt_info;
+
+ writel(val0, priv->ddrmc_baseaddr + ECCW0_FLIP0_OFFSET);
+	writel(val0, priv->ddrmc_baseaddr + ECCW1_FLIP0_OFFSET);
+	writel(val1, priv->ddrmc_baseaddr + ECCW0_FLIP1_OFFSET);
+ writel(val1, priv->ddrmc_baseaddr + ECCW1_FLIP1_OFFSET);
+}
+
+/*
+ * To inject an uncorrectable error, the following steps are needed:
+ * echo <bit_pos val>,<bit_pos val> > /sys/kernel/debug/edac/<controller instance>/inject_ue
+ *
+ * poison_setup() derives the row, column, bank, group and rank and
+ * writes to the ADEC registers based on the address given by the user.
+ *
+ * The ADEC12 and ADEC13 are mask registers; write 0 so that none of the
+ * addresses are masked. The row, column, bank, group and rank values
+ * are written to the match bits to generate errors at the particular
+ * address. ADEC14 and ADEC15 hold the match bits.
+ *
+ * xddr_inject_data_ue_store() updates the ECC FLIP registers with the
+ * bits to be corrupted based on the bit positions given by the user.
+ * For uncorrectable errors, two bit errors are injected.
+ *
+ * Upon doing a read to the address the errors are injected.
+ */
+static ssize_t inject_data_ue_store(struct file *file, const char __user *data,
+ size_t count, loff_t *ppos)
+{
+ struct device *dev = file->private_data;
+ struct mem_ctl_info *mci = to_mci(dev);
+ struct edac_priv *priv = mci->pvt_info;
+ char buf[6], *pbuf, *token[2];
+ u32 val0 = 0, val1 = 0;
+ u8 len, ue0, ue1;
+ int i, ret;
+
+	len = min_t(size_t, count, sizeof(buf) - 1);
+ if (copy_from_user(buf, data, len))
+ return -EFAULT;
+
+ buf[len] = '\0';
+ pbuf = &buf[0];
+	for (i = 0; i < NUM_UE_BITPOS; i++) {
+		token[i] = strsep(&pbuf, ",");
+		if (!token[i])
+			return -EINVAL;
+	}
+
+ ret = kstrtou8(token[0], 0, &ue0);
+ if (ret)
+ return ret;
+
+ ret = kstrtou8(token[1], 0, &ue1);
+ if (ret)
+ return ret;
+
+ if (ue0 < ECCW0_FLIP0_BITS) {
+ val0 = BIT(ue0);
+ } else {
+ ue0 = ue0 - ECCW0_FLIP0_BITS;
+ val1 = BIT(ue0);
+ }
+
+ if (ue1 < ECCW0_FLIP0_BITS) {
+ val0 |= BIT(ue1);
+ } else {
+ ue1 = ue1 - ECCW0_FLIP0_BITS;
+ val1 |= BIT(ue1);
+ }
- xddr_inject_data_poison_store(mci, data);
+ /* Unlock the PCSR registers */
+ writel(PCSR_UNLOCK_VAL, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
+ writel(PCSR_UNLOCK_VAL, priv->ddrmc_noc_baseaddr + XDDR_PCSR_OFFSET);
+ poison_setup(priv);
+
+ xddr_inject_data_ue_store(mci, val0, val1);
+
+ /* Lock the PCSR registers */
+ writel(PCSR_LOCK_VAL, priv->ddrmc_noc_baseaddr + XDDR_PCSR_OFFSET);
+ writel(PCSR_LOCK_VAL, priv->ddrmc_baseaddr + XDDR_PCSR_OFFSET);
return count;
}
-static const struct file_operations xddr_inject_enable_fops = {
+static const struct file_operations xddr_inject_ue_fops = {
.open = simple_open,
- .write = inject_data_poison_store,
+ .write = inject_data_ue_store,
.llseek = generic_file_llseek,
};
@@ -795,8 +915,17 @@ static void create_debugfs_attributes(struct mem_ctl_info *mci)
if (!priv->debugfs)
return;
- edac_debugfs_create_file("inject_error", 0200, priv->debugfs,
- &mci->dev, &xddr_inject_enable_fops);
+ if (!edac_debugfs_create_file("inject_ce", 0200, priv->debugfs,
+ &mci->dev, &xddr_inject_ce_fops)) {
+ debugfs_remove_recursive(priv->debugfs);
+ return;
+ }
+
+ if (!edac_debugfs_create_file("inject_ue", 0200, priv->debugfs,
+ &mci->dev, &xddr_inject_ue_fops)) {
+ debugfs_remove_recursive(priv->debugfs);
+ return;
+ }
debugfs_create_x64("address", 0600, priv->debugfs,
&priv->err_inject_addr);
mci->debugfs = priv->debugfs;
@@ -1031,7 +1160,7 @@ free_edac_mc:
return rc;
}
-static int mc_remove(struct platform_device *pdev)
+static void mc_remove(struct platform_device *pdev)
{
struct mem_ctl_info *mci = platform_get_drvdata(pdev);
struct edac_priv *priv = mci->pvt_info;
@@ -1049,8 +1178,6 @@ static int mc_remove(struct platform_device *pdev)
XPM_EVENT_ERROR_MASK_DDRMC_NCR, err_callback, mci);
edac_mc_del_mc(&pdev->dev);
edac_mc_free(mci);
-
- return 0;
}
static struct platform_driver xilinx_ddr_edac_mc_driver = {
@@ -1059,7 +1186,7 @@ static struct platform_driver xilinx_ddr_edac_mc_driver = {
.of_match_table = xlnx_edac_match,
},
.probe = mc_probe,
- .remove = mc_remove,
+ .remove_new = mc_remove,
};
module_platform_driver(xilinx_ddr_edac_mc_driver);
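
The old inject_error debugfs file is split into inject_ce (one ECC bit position) and inject_ue (two comma-separated bit positions). A hedged userspace sketch exercising both; the "mc0" controller instance in the paths is an assumption, substitute the actual instance:

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/sys/kernel/debug/edac/mc0/inject_ce", "w");

            if (!f)
                    return 1;
            fprintf(f, "5\n");      /* flip ECC bit 5: correctable error */
            fclose(f);

            f = fopen("/sys/kernel/debug/edac/mc0/inject_ue", "w");
            if (!f)
                    return 1;
            fprintf(f, "5,40\n");   /* flip bits 5 and 40: uncorrectable */
            fclose(f);
            return 0;
    }
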
diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c
index bfa30625f5d0..3dc2f9aaf08d 100644
--- a/drivers/firmware/efi/libstub/efi-stub-helper.c
+++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
@@ -24,6 +24,8 @@ static bool efi_noinitrd;
static bool efi_nosoftreserve;
static bool efi_disable_pci_dma = IS_ENABLED(CONFIG_EFI_DISABLE_PCI_DMA);
+int efi_mem_encrypt;
+
bool __pure __efi_soft_reserve_enabled(void)
{
return !efi_nosoftreserve;
@@ -75,6 +77,12 @@ efi_status_t efi_parse_options(char const *cmdline)
efi_noinitrd = true;
} else if (IS_ENABLED(CONFIG_X86_64) && !strcmp(param, "no5lvl")) {
efi_no5lvl = true;
+ } else if (IS_ENABLED(CONFIG_ARCH_HAS_MEM_ENCRYPT) &&
+ !strcmp(param, "mem_encrypt") && val) {
+ if (parse_option_str(val, "on"))
+ efi_mem_encrypt = 1;
+ else if (parse_option_str(val, "off"))
+ efi_mem_encrypt = -1;
} else if (!strcmp(param, "efi") && val) {
efi_nochunk = parse_option_str(val, "nochunk");
efi_novamap |= parse_option_str(val, "novamap");
diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h
index c04b82ea40f2..fc18fd649ed7 100644
--- a/drivers/firmware/efi/libstub/efistub.h
+++ b/drivers/firmware/efi/libstub/efistub.h
@@ -37,8 +37,8 @@ extern bool efi_no5lvl;
extern bool efi_nochunk;
extern bool efi_nokaslr;
extern int efi_loglevel;
+extern int efi_mem_encrypt;
extern bool efi_novamap;
-
extern const efi_system_table_t *efi_system_table;
typedef union efi_dxe_services_table efi_dxe_services_table_t;
diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c
index 99429bc4b0c7..0336ed175e67 100644
--- a/drivers/firmware/efi/libstub/x86-stub.c
+++ b/drivers/firmware/efi/libstub/x86-stub.c
@@ -884,6 +884,9 @@ void __noreturn efi_stub_entry(efi_handle_t handle,
}
}
+ if (efi_mem_encrypt > 0)
+ hdr->xloadflags |= XLF_MEM_ENCRYPTION;
+
status = efi_decompress_kernel(&kernel_entry);
if (status != EFI_SUCCESS) {
efi_err("Failed to decompress kernel\n");
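
efi_mem_encrypt is a tri-state: 0 when mem_encrypt= is absent, 1 for "on" (which sets XLF_MEM_ENCRYPTION in xloadflags above), -1 for "off". A small userspace model of that parsing, with parse_option_str() approximated by strcmp purely for illustration:

    #include <stdio.h>
    #include <string.h>

    static int parse_mem_encrypt(const char *val)
    {
            if (val && !strcmp(val, "on"))
                    return 1;
            if (val && !strcmp(val, "off"))
                    return -1;
            return 0;       /* keep the platform default */
    }

    int main(void)
    {
            printf("%d %d %d\n", parse_mem_encrypt("on"),
                   parse_mem_encrypt("off"), parse_mem_encrypt(NULL));
            return 0;
    }
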
diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index 8b3601f285fd..c970eae2313d 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -164,5 +164,4 @@ void amd_iommu_domain_set_pgtable(struct protection_domain *domain,
u64 *root, int mode);
struct dev_table_entry *get_dev_table(struct amd_iommu *iommu);
-extern bool amd_iommu_snp_en;
#endif
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index c83bd0c2a1c9..480e7681f4f3 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -30,6 +30,7 @@
#include <asm/io_apic.h>
#include <asm/irq_remapping.h>
#include <asm/set_memory.h>
+#include <asm/sev.h>
#include <linux/crash_dump.h>
@@ -3221,6 +3222,36 @@ out:
return true;
}
+static void iommu_snp_enable(void)
+{
+#ifdef CONFIG_KVM_AMD_SEV
+ if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
+ return;
+ /*
+	 * SNP support requires that the IOMMU is enabled and not
+	 * configured in passthrough mode.
+ */
+ if (no_iommu || iommu_default_passthrough()) {
+ pr_err("SNP: IOMMU disabled or configured in passthrough mode, SNP cannot be supported.\n");
+ return;
+ }
+
+ amd_iommu_snp_en = check_feature(FEATURE_SNP);
+ if (!amd_iommu_snp_en) {
+ pr_err("SNP: IOMMU SNP feature not enabled, SNP cannot be supported.\n");
+ return;
+ }
+
+ pr_info("IOMMU SNP support enabled.\n");
+
+ /* Enforce IOMMU v1 pagetable when SNP is enabled. */
+ if (amd_iommu_pgtable != AMD_IOMMU_V1) {
+ pr_warn("Forcing use of AMD IOMMU v1 page table due to SNP.\n");
+ amd_iommu_pgtable = AMD_IOMMU_V1;
+ }
+#endif
+}
+
/****************************************************************************
*
* AMD IOMMU Initialization State Machine
@@ -3256,6 +3287,7 @@ static int __init state_next(void)
break;
case IOMMU_ENABLED:
register_syscore_ops(&amd_iommu_syscore_ops);
+ iommu_snp_enable();
ret = amd_iommu_init_pci();
init_state = ret ? IOMMU_INIT_ERROR : IOMMU_PCI_INIT;
break;
@@ -3767,40 +3799,85 @@ int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, u64
return iommu_pc_get_set_reg(iommu, bank, cntr, fxn, value, true);
}
-#ifdef CONFIG_AMD_MEM_ENCRYPT
-int amd_iommu_snp_enable(void)
+#ifdef CONFIG_KVM_AMD_SEV
+static int iommu_page_make_shared(void *page)
{
- /*
- * The SNP support requires that IOMMU must be enabled, and is
- * not configured in the passthrough mode.
- */
- if (no_iommu || iommu_default_passthrough()) {
- pr_err("SNP: IOMMU is disabled or configured in passthrough mode, SNP cannot be supported");
- return -EINVAL;
+ unsigned long paddr, pfn;
+
+ paddr = iommu_virt_to_phys(page);
+	/* The C-bit may be set in the paddr */
+ pfn = __sme_clr(paddr) >> PAGE_SHIFT;
+
+ if (!(pfn % PTRS_PER_PMD)) {
+ int ret, level;
+ bool assigned;
+
+ ret = snp_lookup_rmpentry(pfn, &assigned, &level);
+ if (ret) {
+ pr_warn("IOMMU PFN %lx RMP lookup failed, ret %d\n", pfn, ret);
+ return ret;
+ }
+
+ if (!assigned) {
+ pr_warn("IOMMU PFN %lx not assigned in RMP table\n", pfn);
+ return -EINVAL;
+ }
+
+ if (level > PG_LEVEL_4K) {
+ ret = psmash(pfn);
+ if (!ret)
+ goto done;
+
+ pr_warn("PSMASH failed for IOMMU PFN %lx huge RMP entry, ret: %d, level: %d\n",
+ pfn, ret, level);
+ return ret;
+ }
}
- /*
- * Prevent enabling SNP after IOMMU_ENABLED state because this process
- * affect how IOMMU driver sets up data structures and configures
- * IOMMU hardware.
- */
- if (init_state > IOMMU_ENABLED) {
- pr_err("SNP: Too late to enable SNP for IOMMU.\n");
- return -EINVAL;
+done:
+ return rmp_make_shared(pfn, PG_LEVEL_4K);
+}
+
+static int iommu_make_shared(void *va, size_t size)
+{
+ void *page;
+ int ret;
+
+ if (!va)
+ return 0;
+
+ for (page = va; page < (va + size); page += PAGE_SIZE) {
+ ret = iommu_page_make_shared(page);
+ if (ret)
+ return ret;
}
- amd_iommu_snp_en = check_feature(FEATURE_SNP);
+ return 0;
+}
+
+int amd_iommu_snp_disable(void)
+{
+ struct amd_iommu *iommu;
+ int ret;
+
if (!amd_iommu_snp_en)
- return -EINVAL;
+ return 0;
+
+ for_each_iommu(iommu) {
+ ret = iommu_make_shared(iommu->evt_buf, EVT_BUFFER_SIZE);
+ if (ret)
+ return ret;
- pr_info("SNP enabled\n");
+ ret = iommu_make_shared(iommu->ppr_log, PPR_LOG_SIZE);
+ if (ret)
+ return ret;
- /* Enforce IOMMU v1 pagetable when SNP is enabled. */
- if (amd_iommu_pgtable != AMD_IOMMU_V1) {
- pr_warn("Force to using AMD IOMMU v1 page table due to SNP\n");
- amd_iommu_pgtable = AMD_IOMMU_V1;
+ ret = iommu_make_shared((void *)iommu->cmd_sem, PAGE_SIZE);
+ if (ret)
+ return ret;
}
return 0;
}
+EXPORT_SYMBOL_GPL(amd_iommu_snp_disable);
#endif
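
amd_iommu_snp_disable() reclaims each firmware-owned IOMMU buffer page by page, psmash()-ing any huge RMP entry first. The walk itself is a simple fixed-stride loop; a hedged sketch of the same shape as iommu_make_shared() above, with illustrative names:

    static int example_for_each_page(void *va, size_t size,
                                     int (*make_shared)(void *page))
    {
            void *page;
            int ret;

            if (!va)
                    return 0;

            /* buffers are page-aligned, so step one page at a time */
            for (page = va; page < (va + size); page += PAGE_SIZE) {
                    ret = make_shared(page);
                    if (ret)
                            return ret;
            }
            return 0;
    }
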
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 95077c06ec4a..9836745f5fde 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -2027,27 +2027,26 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
bool wc = false, fua = false;
int r;
+ /*
+ * Copy table's limits to the DM device's request_queue
+ */
+ q->limits = *limits;
+
if (dm_table_supports_nowait(t))
blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q);
else
blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q);
if (!dm_table_supports_discards(t)) {
- limits->max_hw_discard_sectors = 0;
- limits->discard_granularity = 0;
- limits->discard_alignment = 0;
- limits->discard_misaligned = 0;
+ q->limits.max_discard_sectors = 0;
+ q->limits.max_hw_discard_sectors = 0;
+ q->limits.discard_granularity = 0;
+ q->limits.discard_alignment = 0;
+ q->limits.discard_misaligned = 0;
}
- if (!dm_table_supports_write_zeroes(t))
- limits->max_write_zeroes_sectors = 0;
-
if (!dm_table_supports_secure_erase(t))
- limits->max_secure_erase_sectors = 0;
-
- r = queue_limits_set(q, limits);
- if (r)
- return r;
+ q->limits.max_secure_erase_sectors = 0;
if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) {
wc = true;
@@ -2072,6 +2071,9 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
else
blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
+ if (!dm_table_supports_write_zeroes(t))
+ q->limits.max_write_zeroes_sectors = 0;
+
dm_table_verify_integrity(t);
/*
@@ -2109,6 +2111,7 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
}
dm_update_crypto_profile(q, t);
+ disk_update_readahead(t->md->disk);
/*
* Check for request-based device is left to
diff --git a/drivers/pci/hotplug/s390_pci_hpc.c b/drivers/pci/hotplug/s390_pci_hpc.c
index a89b7de72dcf..7333b305f2a5 100644
--- a/drivers/pci/hotplug/s390_pci_hpc.c
+++ b/drivers/pci/hotplug/s390_pci_hpc.c
@@ -26,58 +26,79 @@ static int enable_slot(struct hotplug_slot *hotplug_slot)
hotplug_slot);
int rc;
- if (zdev->state != ZPCI_FN_STATE_STANDBY)
- return -EIO;
+ mutex_lock(&zdev->state_lock);
+ if (zdev->state != ZPCI_FN_STATE_STANDBY) {
+ rc = -EIO;
+ goto out;
+ }
rc = sclp_pci_configure(zdev->fid);
zpci_dbg(3, "conf fid:%x, rc:%d\n", zdev->fid, rc);
if (rc)
- return rc;
+ goto out;
zdev->state = ZPCI_FN_STATE_CONFIGURED;
- return zpci_scan_configured_device(zdev, zdev->fh);
+ rc = zpci_scan_configured_device(zdev, zdev->fh);
+out:
+ mutex_unlock(&zdev->state_lock);
+ return rc;
}
static int disable_slot(struct hotplug_slot *hotplug_slot)
{
struct zpci_dev *zdev = container_of(hotplug_slot, struct zpci_dev,
hotplug_slot);
- struct pci_dev *pdev;
+ struct pci_dev *pdev = NULL;
+ int rc;
- if (zdev->state != ZPCI_FN_STATE_CONFIGURED)
- return -EIO;
+ mutex_lock(&zdev->state_lock);
+ if (zdev->state != ZPCI_FN_STATE_CONFIGURED) {
+ rc = -EIO;
+ goto out;
+ }
pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn);
if (pdev && pci_num_vf(pdev)) {
pci_dev_put(pdev);
- return -EBUSY;
+ rc = -EBUSY;
+ goto out;
}
- pci_dev_put(pdev);
- return zpci_deconfigure_device(zdev);
+ rc = zpci_deconfigure_device(zdev);
+out:
+ mutex_unlock(&zdev->state_lock);
+ if (pdev)
+ pci_dev_put(pdev);
+ return rc;
}
static int reset_slot(struct hotplug_slot *hotplug_slot, bool probe)
{
struct zpci_dev *zdev = container_of(hotplug_slot, struct zpci_dev,
hotplug_slot);
+ int rc = -EIO;
- if (zdev->state != ZPCI_FN_STATE_CONFIGURED)
- return -EIO;
/*
- * We can't take the zdev->lock as reset_slot may be called during
- * probing and/or device removal which already happens under the
- * zdev->lock. Instead the user should use the higher level
- * pci_reset_function() or pci_bus_reset() which hold the PCI device
- * lock preventing concurrent removal. If not using these functions
- * holding the PCI device lock is required.
+ * If we can't get the zdev->state_lock the device state is
+ * currently undergoing a transition and we bail out - just
+ * the same as if the device's state is not configured at all.
*/
+ if (!mutex_trylock(&zdev->state_lock))
+ return rc;
- /* As long as the function is configured we can reset */
- if (probe)
- return 0;
+ /* We can reset only if the function is configured */
+ if (zdev->state != ZPCI_FN_STATE_CONFIGURED)
+ goto out;
+
+ if (probe) {
+ rc = 0;
+ goto out;
+ }
- return zpci_hot_reset_device(zdev);
+ rc = zpci_hot_reset_device(zdev);
+out:
+ mutex_unlock(&zdev->state_lock);
+ return rc;
}
static int get_power_status(struct hotplug_slot *hotplug_slot, u8 *value)
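
reset_slot() now takes zdev->state_lock with mutex_trylock() and bails out if it is contended, treating an in-flight state transition the same as an unconfigured function. A condensed sketch of that pattern, illustrative only:

    static int example_try_reset(struct zpci_dev *zdev)
    {
            int rc = -EIO;

            if (!mutex_trylock(&zdev->state_lock))
                    return rc;      /* transition in flight: report as busy */

            if (zdev->state == ZPCI_FN_STATE_CONFIGURED)
                    rc = zpci_hot_reset_device(zdev);

            mutex_unlock(&zdev->state_lock);
            return rc;
    }
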
diff --git a/drivers/ras/Kconfig b/drivers/ras/Kconfig
index c2a236f2e846..fc4f4bb94a4c 100644
--- a/drivers/ras/Kconfig
+++ b/drivers/ras/Kconfig
@@ -32,5 +32,18 @@ menuconfig RAS
if RAS
source "arch/x86/ras/Kconfig"
+source "drivers/ras/amd/atl/Kconfig"
+
+config RAS_FMPM
+ tristate "FRU Memory Poison Manager"
+ default m
+ depends on AMD_ATL && ACPI_APEI
+ help
+ Support saving and restoring memory error information across reboot
+ using ACPI ERST as persistent storage. Error information is saved with
+ the UEFI CPER "FRU Memory Poison" section format.
+
+ Memory will be retired during boot time and run time depending on
+ platform-specific policies.
endif
diff --git a/drivers/ras/Makefile b/drivers/ras/Makefile
index 6f0404f50107..11f95d59d397 100644
--- a/drivers/ras/Makefile
+++ b/drivers/ras/Makefile
@@ -2,3 +2,6 @@
obj-$(CONFIG_RAS) += ras.o
obj-$(CONFIG_DEBUG_FS) += debugfs.o
obj-$(CONFIG_RAS_CEC) += cec.o
+
+obj-$(CONFIG_RAS_FMPM) += amd/fmpm.o
+obj-y += amd/atl/
diff --git a/drivers/ras/amd/atl/Kconfig b/drivers/ras/amd/atl/Kconfig
new file mode 100644
index 000000000000..df49c23e7f62
--- /dev/null
+++ b/drivers/ras/amd/atl/Kconfig
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# AMD Address Translation Library Kconfig
+#
+# Copyright (c) 2023, Advanced Micro Devices, Inc.
+# All Rights Reserved.
+#
+# Author: Yazen Ghannam <Yazen.Ghannam@amd.com>
+
+config AMD_ATL
+ tristate "AMD Address Translation Library"
+ depends on AMD_NB && X86_64 && RAS
+ depends on MEMORY_FAILURE
+	default n
+ help
+ This library includes support for implementation-specific
+ address translation procedures needed for various error
+ handling cases.
+
+ Enable this option if using DRAM ECC on Zen-based systems
+ and OS-based error handling.
diff --git a/drivers/ras/amd/atl/Makefile b/drivers/ras/amd/atl/Makefile
new file mode 100644
index 000000000000..4acd5f05bd9c
--- /dev/null
+++ b/drivers/ras/amd/atl/Makefile
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# AMD Address Translation Library Makefile
+#
+# Copyright (c) 2023, Advanced Micro Devices, Inc.
+# All Rights Reserved.
+#
+# Author: Yazen Ghannam <Yazen.Ghannam@amd.com>
+
+amd_atl-y := access.o
+amd_atl-y += core.o
+amd_atl-y += dehash.o
+amd_atl-y += denormalize.o
+amd_atl-y += map.o
+amd_atl-y += system.o
+amd_atl-y += umc.o
+
+obj-$(CONFIG_AMD_ATL) += amd_atl.o
diff --git a/drivers/ras/amd/atl/access.c b/drivers/ras/amd/atl/access.c
new file mode 100644
index 000000000000..ee4661ed28ba
--- /dev/null
+++ b/drivers/ras/amd/atl/access.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * AMD Address Translation Library
+ *
+ * access.c : DF Indirect Access functions
+ *
+ * Copyright (c) 2023, Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Author: Yazen Ghannam <Yazen.Ghannam@amd.com>
+ */
+
+#include "internal.h"
+
+/* Protect the PCI config register pairs used for DF indirect access. */
+static DEFINE_MUTEX(df_indirect_mutex);
+
+/*
+ * Data Fabric Indirect Access uses FICAA/FICAD.
+ *
+ * Fabric Indirect Configuration Access Address (FICAA): constructed based
+ * on the device's Instance Id and the PCI function and register offset of
+ * the desired register.
+ *
+ * Fabric Indirect Configuration Access Data (FICAD): there are FICAD
+ * low and high registers but so far only the low register is needed.
+ *
+ * Use Instance Id 0xFF to indicate a broadcast read.
+ */
+#define DF_BROADCAST 0xFF
+
+#define DF_FICAA_INST_EN BIT(0)
+#define DF_FICAA_REG_NUM GENMASK(10, 1)
+#define DF_FICAA_FUNC_NUM GENMASK(13, 11)
+#define DF_FICAA_INST_ID GENMASK(23, 16)
+
+#define DF_FICAA_REG_NUM_LEGACY GENMASK(10, 2)
+
+static u16 get_accessible_node(u16 node)
+{
+ /*
+ * On heterogeneous systems, not all AMD Nodes are accessible
+ * through software-visible registers. The Node ID needs to be
+ * adjusted for register accesses. But its value should not be
+ * changed for the translation methods.
+ */
+ if (df_cfg.flags.heterogeneous) {
+ /* Only Node 0 is accessible on DF3.5 systems. */
+ if (df_cfg.rev == DF3p5)
+ node = 0;
+
+ /*
+ * Only the first Node in each Socket is accessible on
+ * DF4.5 systems, and this is visible to software as one
+ * Fabric per Socket. The Socket ID can be derived from
+ * the Node ID and global shift values.
+ */
+ if (df_cfg.rev == DF4p5)
+ node >>= df_cfg.socket_id_shift - df_cfg.node_id_shift;
+ }
+
+ return node;
+}
+
+static int __df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo)
+{
+ u32 ficaa_addr = 0x8C, ficad_addr = 0xB8;
+ struct pci_dev *F4;
+ int err = -ENODEV;
+ u32 ficaa = 0;
+
+ node = get_accessible_node(node);
+ if (node >= amd_nb_num())
+ goto out;
+
+ F4 = node_to_amd_nb(node)->link;
+ if (!F4)
+ goto out;
+
+ /* Enable instance-specific access. */
+ if (instance_id != DF_BROADCAST) {
+ ficaa |= FIELD_PREP(DF_FICAA_INST_EN, 1);
+ ficaa |= FIELD_PREP(DF_FICAA_INST_ID, instance_id);
+ }
+
+ /*
+	 * The two least-significant bits are masked when inputting the
+ * register offset to FICAA.
+ */
+ reg >>= 2;
+
+ if (df_cfg.flags.legacy_ficaa) {
+ ficaa_addr = 0x5C;
+ ficad_addr = 0x98;
+
+ ficaa |= FIELD_PREP(DF_FICAA_REG_NUM_LEGACY, reg);
+ } else {
+ ficaa |= FIELD_PREP(DF_FICAA_REG_NUM, reg);
+ }
+
+ ficaa |= FIELD_PREP(DF_FICAA_FUNC_NUM, func);
+
+ mutex_lock(&df_indirect_mutex);
+
+ err = pci_write_config_dword(F4, ficaa_addr, ficaa);
+ if (err) {
+ pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa);
+ goto out_unlock;
+ }
+
+ err = pci_read_config_dword(F4, ficad_addr, lo);
+ if (err)
+ pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa);
+
+ pr_debug("node=%u inst=0x%x func=0x%x reg=0x%x val=0x%x",
+ node, instance_id, func, reg << 2, *lo);
+
+out_unlock:
+ mutex_unlock(&df_indirect_mutex);
+
+out:
+ return err;
+}
+
+int df_indirect_read_instance(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo)
+{
+ return __df_indirect_read(node, func, reg, instance_id, lo);
+}
+
+int df_indirect_read_broadcast(u16 node, u8 func, u16 reg, u32 *lo)
+{
+ return __df_indirect_read(node, func, reg, DF_BROADCAST, lo);
+}
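
The FICAA value packs the instance enable, register number, function and instance ID into one dword using the GENMASK/FIELD_PREP layout defined above. A self-contained sketch of the non-legacy encoding; the EX_FICAA_* macros are local copies of the driver's masks, for illustration only:

    #include <linux/bitfield.h>
    #include <linux/bits.h>

    #define EX_FICAA_INST_EN    BIT(0)
    #define EX_FICAA_REG_NUM    GENMASK(10, 1)
    #define EX_FICAA_FUNC_NUM   GENMASK(13, 11)
    #define EX_FICAA_INST_ID    GENMASK(23, 16)

    static u32 example_build_ficaa(u8 func, u16 reg, u8 instance_id)
    {
            u32 ficaa = 0;

            if (instance_id != 0xFF) {      /* 0xFF means broadcast */
                    ficaa |= FIELD_PREP(EX_FICAA_INST_EN, 1);
                    ficaa |= FIELD_PREP(EX_FICAA_INST_ID, instance_id);
            }

            /* register offsets are dword-aligned: drop the low two bits */
            ficaa |= FIELD_PREP(EX_FICAA_REG_NUM, reg >> 2);
            ficaa |= FIELD_PREP(EX_FICAA_FUNC_NUM, func);

            return ficaa;
    }
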
diff --git a/drivers/ras/amd/atl/core.c b/drivers/ras/amd/atl/core.c
new file mode 100644
index 000000000000..6dc4e06305f7
--- /dev/null
+++ b/drivers/ras/amd/atl/core.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * AMD Address Translation Library
+ *
+ * core.c : Module init and base translation functions
+ *
+ * Copyright (c) 2023, Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Author: Yazen Ghannam <Yazen.Ghannam@amd.com>
+ */
+
+#include <linux/module.h>
+#include <asm/cpu_device_id.h>
+
+#include "internal.h"
+
+struct df_config df_cfg __read_mostly;
+
+static int addr_over_limit(struct addr_ctx *ctx)
+{
+ u64 dram_limit_addr;
+
+ if (df_cfg.rev >= DF4)
+ dram_limit_addr = FIELD_GET(DF4_DRAM_LIMIT_ADDR, ctx->map.limit);
+ else
+ dram_limit_addr = FIELD_GET(DF2_DRAM_LIMIT_ADDR, ctx->map.limit);
+
+ dram_limit_addr <<= DF_DRAM_BASE_LIMIT_LSB;
+ dram_limit_addr |= GENMASK(DF_DRAM_BASE_LIMIT_LSB - 1, 0);
+
+	/* Is the calculated system address above the DRAM limit address? */
+ if (ctx->ret_addr > dram_limit_addr) {
+ atl_debug(ctx, "Calculated address (0x%016llx) > DRAM limit (0x%016llx)",
+ ctx->ret_addr, dram_limit_addr);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static bool legacy_hole_en(struct addr_ctx *ctx)
+{
+ u32 reg = ctx->map.base;
+
+ if (df_cfg.rev >= DF4)
+ reg = ctx->map.ctl;
+
+ return FIELD_GET(DF_LEGACY_MMIO_HOLE_EN, reg);
+}
+
+static int add_legacy_hole(struct addr_ctx *ctx)
+{
+ u32 dram_hole_base;
+ u8 func = 0;
+
+ if (!legacy_hole_en(ctx))
+ return 0;
+
+ if (df_cfg.rev >= DF4)
+ func = 7;
+
+ if (df_indirect_read_broadcast(ctx->node_id, func, 0x104, &dram_hole_base))
+ return -EINVAL;
+
+ dram_hole_base &= DF_DRAM_HOLE_BASE_MASK;
+
+ if (ctx->ret_addr >= dram_hole_base)
+ ctx->ret_addr += (BIT_ULL(32) - dram_hole_base);
+
+ return 0;
+}
+
+static u64 get_base_addr(struct addr_ctx *ctx)
+{
+ u64 base_addr;
+
+ if (df_cfg.rev >= DF4)
+ base_addr = FIELD_GET(DF4_BASE_ADDR, ctx->map.base);
+ else
+ base_addr = FIELD_GET(DF2_BASE_ADDR, ctx->map.base);
+
+ return base_addr << DF_DRAM_BASE_LIMIT_LSB;
+}
+
+static int add_base_and_hole(struct addr_ctx *ctx)
+{
+ ctx->ret_addr += get_base_addr(ctx);
+
+ if (add_legacy_hole(ctx))
+ return -EINVAL;
+
+ return 0;
+}
+
+static bool late_hole_remove(struct addr_ctx *ctx)
+{
+ if (df_cfg.rev == DF3p5)
+ return true;
+
+ if (df_cfg.rev == DF4)
+ return true;
+
+ if (ctx->map.intlv_mode == DF3_6CHAN)
+ return true;
+
+ return false;
+}
+
+unsigned long norm_to_sys_addr(u8 socket_id, u8 die_id, u8 coh_st_inst_id, unsigned long addr)
+{
+ struct addr_ctx ctx;
+
+ if (df_cfg.rev == UNKNOWN)
+ return -EINVAL;
+
+ memset(&ctx, 0, sizeof(ctx));
+
+ /* Start from the normalized address */
+ ctx.ret_addr = addr;
+ ctx.inst_id = coh_st_inst_id;
+
+ ctx.inputs.norm_addr = addr;
+ ctx.inputs.socket_id = socket_id;
+ ctx.inputs.die_id = die_id;
+ ctx.inputs.coh_st_inst_id = coh_st_inst_id;
+
+ if (determine_node_id(&ctx, socket_id, die_id))
+ return -EINVAL;
+
+ if (get_address_map(&ctx))
+ return -EINVAL;
+
+ if (denormalize_address(&ctx))
+ return -EINVAL;
+
+ if (!late_hole_remove(&ctx) && add_base_and_hole(&ctx))
+ return -EINVAL;
+
+ if (dehash_address(&ctx))
+ return -EINVAL;
+
+ if (late_hole_remove(&ctx) && add_base_and_hole(&ctx))
+ return -EINVAL;
+
+ if (addr_over_limit(&ctx))
+ return -EINVAL;
+
+ return ctx.ret_addr;
+}
+
+static void check_for_legacy_df_access(void)
+{
+ /*
+ * All Zen-based systems before Family 19h use the legacy
+ * DF Indirect Access (FICAA/FICAD) offsets.
+ */
+ if (boot_cpu_data.x86 < 0x19) {
+ df_cfg.flags.legacy_ficaa = true;
+ return;
+ }
+
+ /* All systems after Family 19h use the current offsets. */
+ if (boot_cpu_data.x86 > 0x19)
+ return;
+
+ /* Some Family 19h systems use the legacy offsets. */
+ switch (boot_cpu_data.x86_model) {
+ case 0x00 ... 0x0f:
+ case 0x20 ... 0x5f:
+ df_cfg.flags.legacy_ficaa = true;
+ }
+}
+
+/*
+ * This library provides functionality for AMD-based systems with a Data Fabric.
+ * The set of systems with a Data Fabric is equivalent to the set of Zen-based systems
+ * and the set of systems with the Scalable MCA feature at this time. However, these
+ * are technically independent things.
+ *
+ * It's possible to match on the PCI IDs of the Data Fabric devices, but this
+ * would be an ever-expanding list. Instead, match on the SMCA and Zen features
+ * to cover all relevant systems.
+ */
+static const struct x86_cpu_id amd_atl_cpuids[] = {
+ X86_MATCH_FEATURE(X86_FEATURE_SMCA, NULL),
+ X86_MATCH_FEATURE(X86_FEATURE_ZEN, NULL),
+ { }
+};
+MODULE_DEVICE_TABLE(x86cpu, amd_atl_cpuids);
+
+static int __init amd_atl_init(void)
+{
+ if (!x86_match_cpu(amd_atl_cpuids))
+ return -ENODEV;
+
+ if (!amd_nb_num())
+ return -ENODEV;
+
+ check_for_legacy_df_access();
+
+ if (get_df_system_info())
+ return -ENODEV;
+
+ /* Increment this module's refcount so that it can't be easily unloaded. */
+ __module_get(THIS_MODULE);
+ amd_atl_register_decoder(convert_umc_mca_addr_to_sys_addr);
+
+ pr_info("AMD Address Translation Library initialized");
+ return 0;
+}
+
+/*
+ * The exit function is only needed for testing and debug. Module unload must
+ * be forced to override the refcount check.
+ */
+static void __exit amd_atl_exit(void)
+{
+ amd_atl_unregister_decoder();
+}
+
+module_init(amd_atl_init);
+module_exit(amd_atl_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/ras/amd/atl/dehash.c b/drivers/ras/amd/atl/dehash.c
new file mode 100644
index 000000000000..4ea46262c4f5
--- /dev/null
+++ b/drivers/ras/amd/atl/dehash.c
@@ -0,0 +1,500 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * AMD Address Translation Library
+ *
+ * dehash.c : Functions to account for hashing bits
+ *
+ * Copyright (c) 2023, Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Author: Yazen Ghannam <Yazen.Ghannam@amd.com>
+ */
+
+#include "internal.h"
+
+/*
+ * Verify the interleave bits are correct in the different interleaving
+ * settings.
+ *
+ * If @num_intlv_dies and/or @num_intlv_sockets are 1, it means the
+ * respective interleaving is disabled.
+ */
+static inline bool map_bits_valid(struct addr_ctx *ctx, u8 bit1, u8 bit2,
+ u8 num_intlv_dies, u8 num_intlv_sockets)
+{
+ if (!(ctx->map.intlv_bit_pos == bit1 || ctx->map.intlv_bit_pos == bit2)) {
+ pr_debug("Invalid interleave bit: %u", ctx->map.intlv_bit_pos);
+ return false;
+ }
+
+ if (ctx->map.num_intlv_dies > num_intlv_dies) {
+ pr_debug("Invalid number of interleave dies: %u", ctx->map.num_intlv_dies);
+ return false;
+ }
+
+ if (ctx->map.num_intlv_sockets > num_intlv_sockets) {
+ pr_debug("Invalid number of interleave sockets: %u", ctx->map.num_intlv_sockets);
+ return false;
+ }
+
+ return true;
+}
+
+static int df2_dehash_addr(struct addr_ctx *ctx)
+{
+ u8 hashed_bit, intlv_bit, intlv_bit_pos;
+
+ if (!map_bits_valid(ctx, 8, 9, 1, 1))
+ return -EINVAL;
+
+ intlv_bit_pos = ctx->map.intlv_bit_pos;
+ intlv_bit = !!(BIT_ULL(intlv_bit_pos) & ctx->ret_addr);
+
+ hashed_bit = intlv_bit;
+ hashed_bit ^= FIELD_GET(BIT_ULL(12), ctx->ret_addr);
+ hashed_bit ^= FIELD_GET(BIT_ULL(18), ctx->ret_addr);
+ hashed_bit ^= FIELD_GET(BIT_ULL(21), ctx->ret_addr);
+ hashed_bit ^= FIELD_GET(BIT_ULL(30), ctx->ret_addr);
+
+ if (hashed_bit != intlv_bit)
+ ctx->ret_addr ^= BIT_ULL(intlv_bit_pos);
+
+ return 0;
+}
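+
+/*
+ * Worked example for df2_dehash_addr(), with hypothetical values: take
+ * intlv_bit_pos = 8 and ret_addr = 0x1000 (only bit 12 set). Then
+ * intlv_bit = 0 and hashed_bit = 0 ^ 1 ^ 0 ^ 0 ^ 0 = 1. The two differ,
+ * so bit 8 is flipped and the denormalized address becomes 0x1100.
+ */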
+
+static int df3_dehash_addr(struct addr_ctx *ctx)
+{
+ bool hash_ctl_64k, hash_ctl_2M, hash_ctl_1G;
+ u8 hashed_bit, intlv_bit, intlv_bit_pos;
+
+ if (!map_bits_valid(ctx, 8, 9, 1, 1))
+ return -EINVAL;
+
+ hash_ctl_64k = FIELD_GET(DF3_HASH_CTL_64K, ctx->map.ctl);
+ hash_ctl_2M = FIELD_GET(DF3_HASH_CTL_2M, ctx->map.ctl);
+ hash_ctl_1G = FIELD_GET(DF3_HASH_CTL_1G, ctx->map.ctl);
+
+ intlv_bit_pos = ctx->map.intlv_bit_pos;
+ intlv_bit = !!(BIT_ULL(intlv_bit_pos) & ctx->ret_addr);
+
+ hashed_bit = intlv_bit;
+ hashed_bit ^= FIELD_GET(BIT_ULL(14), ctx->ret_addr);
+ hashed_bit ^= FIELD_GET(BIT_ULL(18), ctx->ret_addr) & hash_ctl_64k;
+ hashed_bit ^= FIELD_GET(BIT_ULL(23), ctx->ret_addr) & hash_ctl_2M;
+ hashed_bit ^= FIELD_GET(BIT_ULL(32), ctx->ret_addr) & hash_ctl_1G;
+
+ if (hashed_bit != intlv_bit)
+ ctx->ret_addr ^= BIT_ULL(intlv_bit_pos);
+
+ /* Calculation complete for 2 channels. Continue for 4 and 8 channels. */
+ if (ctx->map.intlv_mode == DF3_COD4_2CHAN_HASH)
+ return 0;
+
+ intlv_bit = FIELD_GET(BIT_ULL(12), ctx->ret_addr);
+
+ hashed_bit = intlv_bit;
+ hashed_bit ^= FIELD_GET(BIT_ULL(16), ctx->ret_addr) & hash_ctl_64k;
+ hashed_bit ^= FIELD_GET(BIT_ULL(21), ctx->ret_addr) & hash_ctl_2M;
+ hashed_bit ^= FIELD_GET(BIT_ULL(30), ctx->ret_addr) & hash_ctl_1G;
+
+ if (hashed_bit != intlv_bit)
+ ctx->ret_addr ^= BIT_ULL(12);
+
+ /* Calculation complete for 4 channels. Continue for 8 channels. */
+ if (ctx->map.intlv_mode == DF3_COD2_4CHAN_HASH)
+ return 0;
+
+ intlv_bit = FIELD_GET(BIT_ULL(13), ctx->ret_addr);
+
+ hashed_bit = intlv_bit;
+ hashed_bit ^= FIELD_GET(BIT_ULL(17), ctx->ret_addr) & hash_ctl_64k;
+ hashed_bit ^= FIELD_GET(BIT_ULL(22), ctx->ret_addr) & hash_ctl_2M;
+ hashed_bit ^= FIELD_GET(BIT_ULL(31), ctx->ret_addr) & hash_ctl_1G;
+
+ if (hashed_bit != intlv_bit)
+ ctx->ret_addr ^= BIT_ULL(13);
+
+ return 0;
+}
+
+static int df3_6chan_dehash_addr(struct addr_ctx *ctx)
+{
+ u8 intlv_bit_pos = ctx->map.intlv_bit_pos;
+ u8 hashed_bit, intlv_bit, num_intlv_bits;
+ bool hash_ctl_2M, hash_ctl_1G;
+
+ if (ctx->map.intlv_mode != DF3_6CHAN) {
+ atl_debug_on_bad_intlv_mode(ctx);
+ return -EINVAL;
+ }
+
+ num_intlv_bits = ilog2(ctx->map.num_intlv_chan) + 1;
+
+ hash_ctl_2M = FIELD_GET(DF3_HASH_CTL_2M, ctx->map.ctl);
+ hash_ctl_1G = FIELD_GET(DF3_HASH_CTL_1G, ctx->map.ctl);
+
+ intlv_bit = !!(BIT_ULL(intlv_bit_pos) & ctx->ret_addr);
+
+ hashed_bit = intlv_bit;
+ hashed_bit ^= !!(BIT_ULL(intlv_bit_pos + num_intlv_bits) & ctx->ret_addr);
+ hashed_bit ^= FIELD_GET(BIT_ULL(23), ctx->ret_addr) & hash_ctl_2M;
+ hashed_bit ^= FIELD_GET(BIT_ULL(32), ctx->ret_addr) & hash_ctl_1G;
+
+ if (hashed_bit != intlv_bit)
+ ctx->ret_addr ^= BIT_ULL(intlv_bit_pos);
+
+ intlv_bit_pos++;
+ intlv_bit = !!(BIT_ULL(intlv_bit_pos) & ctx->ret_addr);
+
+ hashed_bit = intlv_bit;
+ hashed_bit ^= FIELD_GET(BIT_ULL(21), ctx->ret_addr) & hash_ctl_2M;
+ hashed_bit ^= FIELD_GET(BIT_ULL(30), ctx->ret_addr) & hash_ctl_1G;
+
+ if (hashed_bit != intlv_bit)
+ ctx->ret_addr ^= BIT_ULL(intlv_bit_pos);
+
+ intlv_bit_pos++;
+ intlv_bit = !!(BIT_ULL(intlv_bit_pos) & ctx->ret_addr);
+
+ hashed_bit = intlv_bit;
+ hashed_bit ^= FIELD_GET(BIT_ULL(22), ctx->ret_addr) & hash_ctl_2M;
+ hashed_bit ^= FIELD_GET(BIT_ULL(31), ctx->ret_addr) & hash_ctl_1G;
+
+ if (hashed_bit != intlv_bit)
+ ctx->ret_addr ^= BIT_ULL(intlv_bit_pos);
+
+ return 0;
+}
+
+static int df4_dehash_addr(struct addr_ctx *ctx)
+{
+ bool hash_ctl_64k, hash_ctl_2M, hash_ctl_1G;
+ u8 hashed_bit, intlv_bit;
+
+ if (!map_bits_valid(ctx, 8, 8, 1, 2))
+ return -EINVAL;
+
+ hash_ctl_64k = FIELD_GET(DF4_HASH_CTL_64K, ctx->map.ctl);
+ hash_ctl_2M = FIELD_GET(DF4_HASH_CTL_2M, ctx->map.ctl);
+ hash_ctl_1G = FIELD_GET(DF4_HASH_CTL_1G, ctx->map.ctl);
+
+ intlv_bit = FIELD_GET(BIT_ULL(8), ctx->ret_addr);
+
+ hashed_bit = intlv_bit;
+ hashed_bit ^= FIELD_GET(BIT_ULL(16), ctx->ret_addr) & hash_ctl_64k;
+ hashed_bit ^= FIELD_GET(BIT_ULL(21), ctx->ret_addr) & hash_ctl_2M;
+ hashed_bit ^= FIELD_GET(BIT_ULL(30), ctx->ret_addr) & hash_ctl_1G;
+
+ if (ctx->map.num_intlv_sockets == 1)
+ hashed_bit ^= FIELD_GET(BIT_ULL(14), ctx->ret_addr);
+
+ if (hashed_bit != intlv_bit)
+ ctx->ret_addr ^= BIT_ULL(8);
+
+ /*
+ * Hashing is possible with socket interleaving, so check the total number
+ * of channels in the system rather than DRAM map interleaving mode.
+ *
+ * Calculation complete for 2 channels. Continue for 4, 8, and 16 channels.
+ */
+ if (ctx->map.total_intlv_chan <= 2)
+ return 0;
+
+ intlv_bit = FIELD_GET(BIT_ULL(12), ctx->ret_addr);
+
+ hashed_bit = intlv_bit;
+ hashed_bit ^= FIELD_GET(BIT_ULL(17), ctx->ret_addr) & hash_ctl_64k;
+ hashed_bit ^= FIELD_GET(BIT_ULL(22), ctx->ret_addr) & hash_ctl_2M;
+ hashed_bit ^= FIELD_GET(BIT_ULL(31), ctx->ret_addr) & hash_ctl_1G;
+
+ if (hashed_bit != intlv_bit)
+ ctx->ret_addr ^= BIT_ULL(12);
+
+ /* Calculation complete for 4 channels. Continue for 8 and 16 channels. */
+ if (ctx->map.total_intlv_chan <= 4)
+ return 0;
+
+ intlv_bit = FIELD_GET(BIT_ULL(13), ctx->ret_addr);
+
+ hashed_bit = intlv_bit;
+ hashed_bit ^= FIELD_GET(BIT_ULL(18), ctx->ret_addr) & hash_ctl_64k;
+ hashed_bit ^= FIELD_GET(BIT_ULL(23), ctx->ret_addr) & hash_ctl_2M;
+ hashed_bit ^= FIELD_GET(BIT_ULL(32), ctx->ret_addr) & hash_ctl_1G;
+
+ if (hashed_bit != intlv_bit)
+ ctx->ret_addr ^= BIT_ULL(13);
+
+ /* Calculation complete for 8 channels. Continue for 16 channels. */
+ if (ctx->map.total_intlv_chan <= 8)
+ return 0;
+
+ intlv_bit = FIELD_GET(BIT_ULL(14), ctx->ret_addr);
+
+ hashed_bit = intlv_bit;
+ hashed_bit ^= FIELD_GET(BIT_ULL(19), ctx->ret_addr) & hash_ctl_64k;
+ hashed_bit ^= FIELD_GET(BIT_ULL(24), ctx->ret_addr) & hash_ctl_2M;
+ hashed_bit ^= FIELD_GET(BIT_ULL(33), ctx->ret_addr) & hash_ctl_1G;
+
+ if (hashed_bit != intlv_bit)
+ ctx->ret_addr ^= BIT_ULL(14);
+
+ return 0;
+}
+
+static int df4p5_dehash_addr(struct addr_ctx *ctx)
+{
+ bool hash_ctl_64k, hash_ctl_2M, hash_ctl_1G, hash_ctl_1T;
+ u8 hashed_bit, intlv_bit;
+ u64 rehash_vector;
+
+ if (!map_bits_valid(ctx, 8, 8, 1, 2))
+ return -EINVAL;
+
+ hash_ctl_64k = FIELD_GET(DF4_HASH_CTL_64K, ctx->map.ctl);
+ hash_ctl_2M = FIELD_GET(DF4_HASH_CTL_2M, ctx->map.ctl);
+ hash_ctl_1G = FIELD_GET(DF4_HASH_CTL_1G, ctx->map.ctl);
+ hash_ctl_1T = FIELD_GET(DF4p5_HASH_CTL_1T, ctx->map.ctl);
+
+ /*
+ * Generate a unique address to determine which bits
+ * need to be dehashed.
+ *
+ * Start with a contiguous bitmask for the total
+ * number of channels starting at bit 8.
+ *
+ * Then make a gap in the proper place based on
+ * interleave mode.
+ */
+ rehash_vector = ctx->map.total_intlv_chan - 1;
+ rehash_vector <<= 8;
+
+ if (ctx->map.intlv_mode == DF4p5_NPS2_4CHAN_1K_HASH ||
+ ctx->map.intlv_mode == DF4p5_NPS1_8CHAN_1K_HASH ||
+ ctx->map.intlv_mode == DF4p5_NPS1_16CHAN_1K_HASH)
+ rehash_vector = expand_bits(10, 2, rehash_vector);
+ else
+ rehash_vector = expand_bits(9, 3, rehash_vector);
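+
+ /*
+ * Worked example, assuming a hypothetical 2K-hash map with 8 total
+ * channels: rehash_vector starts as 0x7 << 8 = 0x700 (bits [10:8]),
+ * and expand_bits(9, 3, 0x700) keeps bit 8 in place while moving
+ * bits [10:9] up to [13:12], giving 0x3100. So physical address
+ * bits 8, 12, and 13 are dehashed below.
+ */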
+
+ if (rehash_vector & BIT_ULL(8)) {
+ intlv_bit = FIELD_GET(BIT_ULL(8), ctx->ret_addr);
+
+ hashed_bit = intlv_bit;
+ hashed_bit ^= FIELD_GET(BIT_ULL(16), ctx->ret_addr) & hash_ctl_64k;
+ hashed_bit ^= FIELD_GET(BIT_ULL(21), ctx->ret_addr) & hash_ctl_2M;
+ hashed_bit ^= FIELD_GET(BIT_ULL(30), ctx->ret_addr) & hash_ctl_1G;
+ hashed_bit ^= FIELD_GET(BIT_ULL(40), ctx->ret_addr) & hash_ctl_1T;
+
+ if (hashed_bit != intlv_bit)
+ ctx->ret_addr ^= BIT_ULL(8);
+ }
+
+ if (rehash_vector & BIT_ULL(9)) {
+ intlv_bit = FIELD_GET(BIT_ULL(9), ctx->ret_addr);
+
+ hashed_bit = intlv_bit;
+ hashed_bit ^= FIELD_GET(BIT_ULL(17), ctx->ret_addr) & hash_ctl_64k;
+ hashed_bit ^= FIELD_GET(BIT_ULL(22), ctx->ret_addr) & hash_ctl_2M;
+ hashed_bit ^= FIELD_GET(BIT_ULL(31), ctx->ret_addr) & hash_ctl_1G;
+ hashed_bit ^= FIELD_GET(BIT_ULL(41), ctx->ret_addr) & hash_ctl_1T;
+
+ if (hashed_bit != intlv_bit)
+ ctx->ret_addr ^= BIT_ULL(9);
+ }
+
+ if (rehash_vector & BIT_ULL(12)) {
+ intlv_bit = FIELD_GET(BIT_ULL(12), ctx->ret_addr);
+
+ hashed_bit = intlv_bit;
+ hashed_bit ^= FIELD_GET(BIT_ULL(18), ctx->ret_addr) & hash_ctl_64k;
+ hashed_bit ^= FIELD_GET(BIT_ULL(23), ctx->ret_addr) & hash_ctl_2M;
+ hashed_bit ^= FIELD_GET(BIT_ULL(32), ctx->ret_addr) & hash_ctl_1G;
+ hashed_bit ^= FIELD_GET(BIT_ULL(42), ctx->ret_addr) & hash_ctl_1T;
+
+ if (hashed_bit != intlv_bit)
+ ctx->ret_addr ^= BIT_ULL(12);
+ }
+
+ if (rehash_vector & BIT_ULL(13)) {
+ intlv_bit = FIELD_GET(BIT_ULL(13), ctx->ret_addr);
+
+ hashed_bit = intlv_bit;
+ hashed_bit ^= FIELD_GET(BIT_ULL(19), ctx->ret_addr) & hash_ctl_64k;
+ hashed_bit ^= FIELD_GET(BIT_ULL(24), ctx->ret_addr) & hash_ctl_2M;
+ hashed_bit ^= FIELD_GET(BIT_ULL(33), ctx->ret_addr) & hash_ctl_1G;
+ hashed_bit ^= FIELD_GET(BIT_ULL(43), ctx->ret_addr) & hash_ctl_1T;
+
+ if (hashed_bit != intlv_bit)
+ ctx->ret_addr ^= BIT_ULL(13);
+ }
+
+ if (rehash_vector & BIT_ULL(14)) {
+ intlv_bit = FIELD_GET(BIT_ULL(14), ctx->ret_addr);
+
+ hashed_bit = intlv_bit;
+ hashed_bit ^= FIELD_GET(BIT_ULL(20), ctx->ret_addr) & hash_ctl_64k;
+ hashed_bit ^= FIELD_GET(BIT_ULL(25), ctx->ret_addr) & hash_ctl_2M;
+ hashed_bit ^= FIELD_GET(BIT_ULL(34), ctx->ret_addr) & hash_ctl_1G;
+ hashed_bit ^= FIELD_GET(BIT_ULL(44), ctx->ret_addr) & hash_ctl_1T;
+
+ if (hashed_bit != intlv_bit)
+ ctx->ret_addr ^= BIT_ULL(14);
+ }
+
+ return 0;
+}
+
+/*
+ * MI300 hash bits
+ *                                4K  64K 2M  1G  1T  1T
+ * COH_ST_Select[0] = XOR of addr{8,  12, 15, 22, 29, 36, 43}
+ * COH_ST_Select[1] = XOR of addr{9,  13, 16, 23, 30, 37, 44}
+ * COH_ST_Select[2] = XOR of addr{10, 14, 17, 24, 31, 38, 45}
+ * COH_ST_Select[3] = XOR of addr{11,     18, 25, 32, 39, 46}
+ * COH_ST_Select[4] = XOR of addr{14,     19, 26, 33, 40, 47} aka Stack
+ * DieID[0]         = XOR of addr{12,     20, 27, 34, 41    }
+ * DieID[1]         = XOR of addr{13,     21, 28, 35, 42    }
+ */
+static int mi300_dehash_addr(struct addr_ctx *ctx)
+{
+ bool hash_ctl_4k, hash_ctl_64k, hash_ctl_2M, hash_ctl_1G, hash_ctl_1T;
+ bool hashed_bit, intlv_bit, test_bit;
+ u8 num_intlv_bits, base_bit, i;
+
+ if (!map_bits_valid(ctx, 8, 8, 4, 1))
+ return -EINVAL;
+
+ hash_ctl_4k = FIELD_GET(DF4p5_HASH_CTL_4K, ctx->map.ctl);
+ hash_ctl_64k = FIELD_GET(DF4_HASH_CTL_64K, ctx->map.ctl);
+ hash_ctl_2M = FIELD_GET(DF4_HASH_CTL_2M, ctx->map.ctl);
+ hash_ctl_1G = FIELD_GET(DF4_HASH_CTL_1G, ctx->map.ctl);
+ hash_ctl_1T = FIELD_GET(DF4p5_HASH_CTL_1T, ctx->map.ctl);
+
+ /* Channel bits */
+ num_intlv_bits = ilog2(ctx->map.num_intlv_chan);
+
+ for (i = 0; i < num_intlv_bits; i++) {
+ base_bit = 8 + i;
+
+ /* COH_ST_Select[4] jumps to a base bit of 14. */
+ if (i == 4)
+ base_bit = 14;
+
+ intlv_bit = BIT_ULL(base_bit) & ctx->ret_addr;
+
+ hashed_bit = intlv_bit;
+
+ /* 4k hash bit only applies to the first 3 bits. */
+ if (i <= 2) {
+ test_bit = BIT_ULL(12 + i) & ctx->ret_addr;
+ hashed_bit ^= test_bit & hash_ctl_4k;
+ }
+
+ /* Use temporary 'test_bit' value to avoid Sparse warnings. */
+ test_bit = BIT_ULL(15 + i) & ctx->ret_addr;
+ hashed_bit ^= test_bit & hash_ctl_64k;
+ test_bit = BIT_ULL(22 + i) & ctx->ret_addr;
+ hashed_bit ^= test_bit & hash_ctl_2M;
+ test_bit = BIT_ULL(29 + i) & ctx->ret_addr;
+ hashed_bit ^= test_bit & hash_ctl_1G;
+ test_bit = BIT_ULL(36 + i) & ctx->ret_addr;
+ hashed_bit ^= test_bit & hash_ctl_1T;
+ test_bit = BIT_ULL(43 + i) & ctx->ret_addr;
+ hashed_bit ^= test_bit & hash_ctl_1T;
+
+ if (hashed_bit != intlv_bit)
+ ctx->ret_addr ^= BIT_ULL(base_bit);
+ }
+
+ /* Die bits */
+ num_intlv_bits = ilog2(ctx->map.num_intlv_dies);
+
+ for (i = 0; i < num_intlv_bits; i++) {
+ base_bit = 12 + i;
+
+ intlv_bit = BIT_ULL(base_bit) & ctx->ret_addr;
+
+ hashed_bit = intlv_bit;
+
+ test_bit = BIT_ULL(20 + i) & ctx->ret_addr;
+ hashed_bit ^= test_bit & hash_ctl_64k;
+ test_bit = BIT_ULL(27 + i) & ctx->ret_addr;
+ hashed_bit ^= test_bit & hash_ctl_2M;
+ test_bit = BIT_ULL(34 + i) & ctx->ret_addr;
+ hashed_bit ^= test_bit & hash_ctl_1G;
+ test_bit = BIT_ULL(41 + i) & ctx->ret_addr;
+ hashed_bit ^= test_bit & hash_ctl_1T;
+
+ if (hashed_bit != intlv_bit)
+ ctx->ret_addr ^= BIT_ULL(base_bit);
+ }
+
+ return 0;
+}
+
+int dehash_address(struct addr_ctx *ctx)
+{
+ switch (ctx->map.intlv_mode) {
+ /* No hashing cases. */
+ case NONE:
+ case NOHASH_2CHAN:
+ case NOHASH_4CHAN:
+ case NOHASH_8CHAN:
+ case NOHASH_16CHAN:
+ case NOHASH_32CHAN:
+ /* Hashing bits handled earlier during COH_ST ID calculation. */
+ case DF4_NPS4_3CHAN_HASH:
+ case DF4_NPS2_5CHAN_HASH:
+ case DF4_NPS2_6CHAN_HASH:
+ case DF4_NPS1_10CHAN_HASH:
+ case DF4_NPS1_12CHAN_HASH:
+ case DF4p5_NPS2_6CHAN_1K_HASH:
+ case DF4p5_NPS2_6CHAN_2K_HASH:
+ case DF4p5_NPS1_10CHAN_1K_HASH:
+ case DF4p5_NPS1_10CHAN_2K_HASH:
+ case DF4p5_NPS1_12CHAN_1K_HASH:
+ case DF4p5_NPS1_12CHAN_2K_HASH:
+ case DF4p5_NPS0_24CHAN_1K_HASH:
+ case DF4p5_NPS0_24CHAN_2K_HASH:
+ /* No hash physical address bits, so nothing to do. */
+ case DF4p5_NPS4_3CHAN_1K_HASH:
+ case DF4p5_NPS4_3CHAN_2K_HASH:
+ case DF4p5_NPS2_5CHAN_1K_HASH:
+ case DF4p5_NPS2_5CHAN_2K_HASH:
+ return 0;
+
+ case DF2_2CHAN_HASH:
+ return df2_dehash_addr(ctx);
+
+ case DF3_COD4_2CHAN_HASH:
+ case DF3_COD2_4CHAN_HASH:
+ case DF3_COD1_8CHAN_HASH:
+ return df3_dehash_addr(ctx);
+
+ case DF3_6CHAN:
+ return df3_6chan_dehash_addr(ctx);
+
+ case DF4_NPS4_2CHAN_HASH:
+ case DF4_NPS2_4CHAN_HASH:
+ case DF4_NPS1_8CHAN_HASH:
+ return df4_dehash_addr(ctx);
+
+ case DF4p5_NPS4_2CHAN_1K_HASH:
+ case DF4p5_NPS4_2CHAN_2K_HASH:
+ case DF4p5_NPS2_4CHAN_2K_HASH:
+ case DF4p5_NPS2_4CHAN_1K_HASH:
+ case DF4p5_NPS1_8CHAN_1K_HASH:
+ case DF4p5_NPS1_8CHAN_2K_HASH:
+ case DF4p5_NPS1_16CHAN_1K_HASH:
+ case DF4p5_NPS1_16CHAN_2K_HASH:
+ return df4p5_dehash_addr(ctx);
+
+ case MI3_HASH_8CHAN:
+ case MI3_HASH_16CHAN:
+ case MI3_HASH_32CHAN:
+ return mi300_dehash_addr(ctx);
+
+ default:
+ atl_debug_on_bad_intlv_mode(ctx);
+ return -EINVAL;
+ }
+}
diff --git a/drivers/ras/amd/atl/denormalize.c b/drivers/ras/amd/atl/denormalize.c
new file mode 100644
index 000000000000..e279224288d6
--- /dev/null
+++ b/drivers/ras/amd/atl/denormalize.c
@@ -0,0 +1,718 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * AMD Address Translation Library
+ *
+ * denormalize.c : Functions to account for interleaving bits
+ *
+ * Copyright (c) 2023, Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Author: Yazen Ghannam <Yazen.Ghannam@amd.com>
+ */
+
+#include "internal.h"
+
+/*
+ * Returns the Destination Fabric ID. This is the first (lowest)
+ * COH_ST Fabric ID used within a DRAM Address map.
+ */
+static u16 get_dst_fabric_id(struct addr_ctx *ctx)
+{
+ switch (df_cfg.rev) {
+ case DF2: return FIELD_GET(DF2_DST_FABRIC_ID, ctx->map.limit);
+ case DF3: return FIELD_GET(DF3_DST_FABRIC_ID, ctx->map.limit);
+ case DF3p5: return FIELD_GET(DF3p5_DST_FABRIC_ID, ctx->map.limit);
+ case DF4: return FIELD_GET(DF4_DST_FABRIC_ID, ctx->map.ctl);
+ case DF4p5: return FIELD_GET(DF4p5_DST_FABRIC_ID, ctx->map.ctl);
+ default:
+ atl_debug_on_bad_df_rev();
+ return 0;
+ }
+}
+
+/*
+ * Make a contiguous gap in address for N bits starting at bit P.
+ *
+ * Example:
+ * address bits: [20:0]
+ * # of interleave bits (n): 3
+ * starting interleave bit (p): 8
+ *
+ * expanded address bits: [20+n : n+p][n+p-1 : p][p-1 : 0]
+ *                        [23    : 11][10    : 8][7   : 0]
+ */
+static u64 make_space_for_coh_st_id_at_intlv_bit(struct addr_ctx *ctx)
+{
+ return expand_bits(ctx->map.intlv_bit_pos,
+ ctx->map.total_intlv_bits,
+ ctx->ret_addr);
+}
+
+/*
+ * Make two gaps in address for N bits.
+ * First gap is a single bit at bit P.
+ * Second gap is the remaining N-1 bits at bit 12.
+ *
+ * Example:
+ * address bits: [20:0]
+ * # of interleave bits (n): 3
+ * starting interleave bit (p): 8
+ *
+ * First gap
+ * expanded address bits: [20+1 : p+1][p][p-1 : 0]
+ *                        [21     : 9][8][7   : 0]
+ *
+ * Second gap uses result from first.
+ * r = n - 1; remaining interleave bits
+ * expanded address bits: [21+r : 12+r][12+r-1: 12][11 : 0]
+ *                        [23     : 14][13    : 12][11 : 0]
+ */
+static u64 make_space_for_coh_st_id_split_2_1(struct addr_ctx *ctx)
+{
+ /* Make a single space at the interleave bit. */
+ u64 denorm_addr = expand_bits(ctx->map.intlv_bit_pos, 1, ctx->ret_addr);
+
+ /* Done if there's only a single interleave bit. */
+ if (ctx->map.total_intlv_bits <= 1)
+ return denorm_addr;
+
+ /* Make spaces for the remaining interleave bits starting at bit 12. */
+ return expand_bits(12, ctx->map.total_intlv_bits - 1, denorm_addr);
+}
+
+/*
+ * Make space for COH_ST ID at bits [14:8] as follows:
+ *
+ * 8 channels -> bits [10:8]
+ * 16 channels -> bits [11:8]
+ * 32 channels -> bits [14,11:8]
+ *
+ * 1 die -> N/A
+ * 2 dies -> bit [12]
+ * 4 dies -> bits [13:12]
+ */
+static u64 make_space_for_coh_st_id_mi300(struct addr_ctx *ctx)
+{
+ u8 num_intlv_bits = ilog2(ctx->map.num_intlv_chan);
+ u64 denorm_addr;
+
+ if (ctx->map.intlv_bit_pos != 8) {
+ pr_debug("Invalid interleave bit: %u", ctx->map.intlv_bit_pos);
+ return ~0ULL;
+ }
+
+ /* Channel bits. Covers up to 4 bits at [11:8]. */
+ denorm_addr = expand_bits(8, min(num_intlv_bits, 4), ctx->ret_addr);
+
+ /* Die bits. Always starts at [12]. */
+ denorm_addr = expand_bits(12, ilog2(ctx->map.num_intlv_dies), denorm_addr);
+
+ /* Additional channel bit at [14]. */
+ if (num_intlv_bits > 4)
+ denorm_addr = expand_bits(14, 1, denorm_addr);
+
+ return denorm_addr;
+}
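+
+/*
+ * Worked example for make_space_for_coh_st_id_mi300(), assuming a map
+ * with 16 interleaved channels and 2 interleaved dies: the channel gap
+ * covers bits [11:8], the die gap adds bit [12], and the extra channel
+ * bit at [14] is not needed since it only applies to 32-channel maps.
+ */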
+
+/*
+ * Take the current calculated address and shift enough bits in the middle
+ * to make a gap where the interleave bits will be inserted.
+ */
+static u64 make_space_for_coh_st_id(struct addr_ctx *ctx)
+{
+ switch (ctx->map.intlv_mode) {
+ case NOHASH_2CHAN:
+ case NOHASH_4CHAN:
+ case NOHASH_8CHAN:
+ case NOHASH_16CHAN:
+ case NOHASH_32CHAN:
+ case DF2_2CHAN_HASH:
+ return make_space_for_coh_st_id_at_intlv_bit(ctx);
+
+ case DF3_COD4_2CHAN_HASH:
+ case DF3_COD2_4CHAN_HASH:
+ case DF3_COD1_8CHAN_HASH:
+ case DF4_NPS4_2CHAN_HASH:
+ case DF4_NPS2_4CHAN_HASH:
+ case DF4_NPS1_8CHAN_HASH:
+ case DF4p5_NPS4_2CHAN_1K_HASH:
+ case DF4p5_NPS4_2CHAN_2K_HASH:
+ case DF4p5_NPS2_4CHAN_2K_HASH:
+ case DF4p5_NPS1_8CHAN_2K_HASH:
+ case DF4p5_NPS1_16CHAN_2K_HASH:
+ return make_space_for_coh_st_id_split_2_1(ctx);
+
+ case MI3_HASH_8CHAN:
+ case MI3_HASH_16CHAN:
+ case MI3_HASH_32CHAN:
+ return make_space_for_coh_st_id_mi300(ctx);
+
+ default:
+ atl_debug_on_bad_intlv_mode(ctx);
+ return ~0ULL;
+ }
+}
+
+static u16 get_coh_st_id_df2(struct addr_ctx *ctx)
+{
+ u8 num_socket_intlv_bits = ilog2(ctx->map.num_intlv_sockets);
+ u8 num_die_intlv_bits = ilog2(ctx->map.num_intlv_dies);
+ u8 num_intlv_bits;
+ u16 coh_st_id, mask;
+
+ coh_st_id = ctx->coh_st_fabric_id - get_dst_fabric_id(ctx);
+
+ /* Channel interleave bits */
+ num_intlv_bits = order_base_2(ctx->map.num_intlv_chan);
+ mask = GENMASK(num_intlv_bits - 1, 0);
+ coh_st_id &= mask;
+
+ /* Die interleave bits */
+ if (num_die_intlv_bits) {
+ u16 die_bits;
+
+ mask = GENMASK(num_die_intlv_bits - 1, 0);
+ die_bits = ctx->coh_st_fabric_id & df_cfg.die_id_mask;
+ die_bits >>= df_cfg.die_id_shift;
+
+ coh_st_id |= (die_bits & mask) << num_intlv_bits;
+ num_intlv_bits += num_die_intlv_bits;
+ }
+
+ /* Socket interleave bits */
+ if (num_socket_intlv_bits) {
+ u16 socket_bits;
+
+ mask = GENMASK(num_socket_intlv_bits - 1, 0);
+ socket_bits = ctx->coh_st_fabric_id & df_cfg.socket_id_mask;
+ socket_bits >>= df_cfg.socket_id_shift;
+
+ coh_st_id |= (socket_bits & mask) << num_intlv_bits;
+ }
+
+ return coh_st_id;
+}
+
+static u16 get_coh_st_id_df4(struct addr_ctx *ctx)
+{
+ /*
+ * Start with the original component mask and the number of interleave
+ * bits for the channels in this map.
+ */
+ u8 num_intlv_bits = ilog2(ctx->map.num_intlv_chan);
+ u16 mask = df_cfg.component_id_mask;
+
+ u16 socket_bits;
+
+ /* Set the derived Coherent Station ID to the input Coherent Station Fabric ID. */
+ u16 coh_st_id = ctx->coh_st_fabric_id & mask;
+
+ /*
+ * Subtract the "base" Destination Fabric ID.
+ * This accounts for systems with disabled Coherent Stations.
+ */
+ coh_st_id -= get_dst_fabric_id(ctx) & mask;
+
+ /*
+ * Generate and use a new mask based on the number of bits
+ * needed for channel interleaving in this map.
+ */
+ mask = GENMASK(num_intlv_bits - 1, 0);
+ coh_st_id &= mask;
+
+ /* Done if socket interleaving is not enabled. */
+ if (ctx->map.num_intlv_sockets <= 1)
+ return coh_st_id;
+
+ /*
+ * Figure out how many bits are needed for the number of
+ * interleaved sockets. And shift the derived Coherent Station ID to account
+ * for these.
+ */
+ num_intlv_bits = ilog2(ctx->map.num_intlv_sockets);
+ coh_st_id <<= num_intlv_bits;
+
+ /* Generate a new mask for the socket interleaving bits. */
+ mask = GENMASK(num_intlv_bits - 1, 0);
+
+ /* Get the socket interleave bits from the original Coherent Station Fabric ID. */
+ socket_bits = (ctx->coh_st_fabric_id & df_cfg.socket_id_mask) >> df_cfg.socket_id_shift;
+
+ /* Apply the appropriate socket bits to the derived Coherent Station ID. */
+ coh_st_id |= socket_bits & mask;
+
+ return coh_st_id;
+}
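+
+/*
+ * Worked example for get_coh_st_id_df4(), with hypothetical IDs and a
+ * 4-channel map without socket interleaving: if the Coherent Station
+ * Fabric ID masks down to 0x9 and the Destination Fabric ID masks down
+ * to 0x4, then coh_st_id = (0x9 - 0x4) & GENMASK(1, 0) = 0x1.
+ */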
+
+/*
+ * MI300 hash has:
+ * (C)hannel[3:0] = coh_st_id[3:0]
+ * (S)tack[0] = coh_st_id[4]
+ * (D)ie[1:0] = coh_st_id[6:5]
+ *
+ * Hashed coh_st_id is swizzled so that Stack bit is at the end.
+ * coh_st_id = SDDCCCC
+ */
+static u16 get_coh_st_id_mi300(struct addr_ctx *ctx)
+{
+ u8 channel_bits, die_bits, stack_bit;
+ u16 die_id;
+
+ /* Subtract the "base" Destination Fabric ID. */
+ ctx->coh_st_fabric_id -= get_dst_fabric_id(ctx);
+
+ die_id = (ctx->coh_st_fabric_id & df_cfg.die_id_mask) >> df_cfg.die_id_shift;
+
+ channel_bits = FIELD_GET(GENMASK(3, 0), ctx->coh_st_fabric_id);
+ stack_bit = FIELD_GET(BIT(4), ctx->coh_st_fabric_id) << 6;
+ die_bits = die_id << 4;
+
+ return stack_bit | die_bits | channel_bits;
+}
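+
+/*
+ * Worked example for get_coh_st_id_mi300(), with hypothetical values:
+ * if the adjusted Coherent Station Fabric ID is 0x13 (0b10011) and its
+ * Die ID bits extract to 0, then channel_bits = 0x3, stack_bit = 0x40,
+ * and the swizzled result is 0x43 (S=1, DD=00, CCCC=0011).
+ */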
+
+/*
+ * Derive the correct Coherent Station ID that represents the interleave bits
+ * used within the system physical address. This accounts for the
+ * interleave mode, number of interleaved channels/dies/sockets, and
+ * other system/mode-specific bit swizzling.
+ *
+ * Returns: Coherent Station ID on success.
+ * All bits set on error.
+ */
+static u16 calculate_coh_st_id(struct addr_ctx *ctx)
+{
+ switch (ctx->map.intlv_mode) {
+ case NOHASH_2CHAN:
+ case NOHASH_4CHAN:
+ case NOHASH_8CHAN:
+ case NOHASH_16CHAN:
+ case NOHASH_32CHAN:
+ case DF3_COD4_2CHAN_HASH:
+ case DF3_COD2_4CHAN_HASH:
+ case DF3_COD1_8CHAN_HASH:
+ case DF2_2CHAN_HASH:
+ return get_coh_st_id_df2(ctx);
+
+ case DF4_NPS4_2CHAN_HASH:
+ case DF4_NPS2_4CHAN_HASH:
+ case DF4_NPS1_8CHAN_HASH:
+ case DF4p5_NPS4_2CHAN_1K_HASH:
+ case DF4p5_NPS4_2CHAN_2K_HASH:
+ case DF4p5_NPS2_4CHAN_2K_HASH:
+ case DF4p5_NPS1_8CHAN_2K_HASH:
+ case DF4p5_NPS1_16CHAN_2K_HASH:
+ return get_coh_st_id_df4(ctx);
+
+ case MI3_HASH_8CHAN:
+ case MI3_HASH_16CHAN:
+ case MI3_HASH_32CHAN:
+ return get_coh_st_id_mi300(ctx);
+
+ /* COH_ST ID is simply the COH_ST Fabric ID adjusted by the Destination Fabric ID. */
+ case DF4p5_NPS2_4CHAN_1K_HASH:
+ case DF4p5_NPS1_8CHAN_1K_HASH:
+ case DF4p5_NPS1_16CHAN_1K_HASH:
+ return ctx->coh_st_fabric_id - get_dst_fabric_id(ctx);
+
+ default:
+ atl_debug_on_bad_intlv_mode(ctx);
+ return ~0;
+ }
+}
+
+static u64 insert_coh_st_id_at_intlv_bit(struct addr_ctx *ctx, u64 denorm_addr, u16 coh_st_id)
+{
+ return denorm_addr | (coh_st_id << ctx->map.intlv_bit_pos);
+}
+
+static u64 insert_coh_st_id_split_2_1(struct addr_ctx *ctx, u64 denorm_addr, u16 coh_st_id)
+{
+ /* Insert coh_st_id[0] at the interleave bit. */
+ denorm_addr |= (coh_st_id & BIT(0)) << ctx->map.intlv_bit_pos;
+
+ /* Insert coh_st_id[2:1] at bit 12. */
+ denorm_addr |= (coh_st_id & GENMASK(2, 1)) << 11;
+
+ return denorm_addr;
+}
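+
+/*
+ * Worked example for insert_coh_st_id_split_2_1(), with hypothetical
+ * values: for coh_st_id = 0x5 (0b101) and intlv_bit_pos = 8, bit 0 of
+ * the ID lands at address bit 8 (0x100), and bits [2:1] land at address
+ * bits [13:12] (0b10 -> 0x2000), so the address gains bits 8 and 13.
+ */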
+
+static u64 insert_coh_st_id_split_2_2(struct addr_ctx *ctx, u64 denorm_addr, u16 coh_st_id)
+{
+ /* Insert coh_st_id[1:0] at bit 8. */
+ denorm_addr |= (coh_st_id & GENMASK(1, 0)) << 8;
+
+ /*
+ * Insert coh_st_id[n:2] at bit 12. 'n' could be 2 or 3.
+ * Grab both because bit 3 will be clear if unused.
+ */
+ denorm_addr |= (coh_st_id & GENMASK(3, 2)) << 10;
+
+ return denorm_addr;
+}
+
+static u64 insert_coh_st_id(struct addr_ctx *ctx, u64 denorm_addr, u16 coh_st_id)
+{
+ switch (ctx->map.intlv_mode) {
+ case NOHASH_2CHAN:
+ case NOHASH_4CHAN:
+ case NOHASH_8CHAN:
+ case NOHASH_16CHAN:
+ case NOHASH_32CHAN:
+ case MI3_HASH_8CHAN:
+ case MI3_HASH_16CHAN:
+ case MI3_HASH_32CHAN:
+ case DF2_2CHAN_HASH:
+ return insert_coh_st_id_at_intlv_bit(ctx, denorm_addr, coh_st_id);
+
+ case DF3_COD4_2CHAN_HASH:
+ case DF3_COD2_4CHAN_HASH:
+ case DF3_COD1_8CHAN_HASH:
+ case DF4_NPS4_2CHAN_HASH:
+ case DF4_NPS2_4CHAN_HASH:
+ case DF4_NPS1_8CHAN_HASH:
+ case DF4p5_NPS4_2CHAN_1K_HASH:
+ case DF4p5_NPS4_2CHAN_2K_HASH:
+ case DF4p5_NPS2_4CHAN_2K_HASH:
+ case DF4p5_NPS1_8CHAN_2K_HASH:
+ case DF4p5_NPS1_16CHAN_2K_HASH:
+ return insert_coh_st_id_split_2_1(ctx, denorm_addr, coh_st_id);
+
+ case DF4p5_NPS2_4CHAN_1K_HASH:
+ case DF4p5_NPS1_8CHAN_1K_HASH:
+ case DF4p5_NPS1_16CHAN_1K_HASH:
+ return insert_coh_st_id_split_2_2(ctx, denorm_addr, coh_st_id);
+
+ default:
+ atl_debug_on_bad_intlv_mode(ctx);
+ return ~0ULL;
+ }
+}
+
+/*
+ * MI300 systems have a fixed, hardware-defined physical-to-logical
+ * Coherent Station mapping. The Remap registers are not used.
+ */
+static const u16 phy_to_log_coh_st_map_mi300[] = {
+ 12, 13, 14, 15,
+ 8, 9, 10, 11,
+ 4, 5, 6, 7,
+ 0, 1, 2, 3,
+ 28, 29, 30, 31,
+ 24, 25, 26, 27,
+ 20, 21, 22, 23,
+ 16, 17, 18, 19,
+};
+
+static u16 get_logical_coh_st_fabric_id_mi300(struct addr_ctx *ctx)
+{
+ if (ctx->inst_id >= ARRAY_SIZE(phy_to_log_coh_st_map_mi300)) {
+ atl_debug(ctx, "Instance ID out of range");
+ return ~0;
+ }
+
+ return phy_to_log_coh_st_map_mi300[ctx->inst_id] | (ctx->node_id << df_cfg.node_id_shift);
+}
+
+static u16 get_logical_coh_st_fabric_id(struct addr_ctx *ctx)
+{
+ u16 component_id, log_fabric_id;
+
+ /* Start with the physical COH_ST Fabric ID. */
+ u16 phys_fabric_id = ctx->coh_st_fabric_id;
+
+ if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
+ return get_logical_coh_st_fabric_id_mi300(ctx);
+
+ /* Skip logical ID lookup if remapping is disabled. */
+ if (!FIELD_GET(DF4_REMAP_EN, ctx->map.ctl) &&
+ ctx->map.intlv_mode != DF3_6CHAN)
+ return phys_fabric_id;
+
+ /* Mask off the Node ID bits to get the "local" Component ID. */
+ component_id = phys_fabric_id & df_cfg.component_id_mask;
+
+ /*
+ * Search the list of logical Component IDs for the one that
+ * matches this physical Component ID.
+ */
+ for (log_fabric_id = 0; log_fabric_id < MAX_COH_ST_CHANNELS; log_fabric_id++) {
+ if (ctx->map.remap_array[log_fabric_id] == component_id)
+ break;
+ }
+
+ if (log_fabric_id == MAX_COH_ST_CHANNELS)
+ atl_debug(ctx, "COH_ST remap entry not found for 0x%x",
+ component_id);
+
+ /* Get the Node ID bits from the physical and apply to the logical. */
+ return (phys_fabric_id & df_cfg.node_id_mask) | log_fabric_id;
+}
+
+static int denorm_addr_common(struct addr_ctx *ctx)
+{
+ u64 denorm_addr;
+ u16 coh_st_id;
+
+ /*
+ * Convert the original physical COH_ST Fabric ID to a logical value.
+ * This is required for non-power-of-two and other interleaving modes.
+ */
+ ctx->coh_st_fabric_id = get_logical_coh_st_fabric_id(ctx);
+
+ denorm_addr = make_space_for_coh_st_id(ctx);
+ coh_st_id = calculate_coh_st_id(ctx);
+ ctx->ret_addr = insert_coh_st_id(ctx, denorm_addr, coh_st_id);
+ return 0;
+}
+
+static int denorm_addr_df3_6chan(struct addr_ctx *ctx)
+{
+ u16 coh_st_id = ctx->coh_st_fabric_id & df_cfg.component_id_mask;
+ u8 total_intlv_bits = ctx->map.total_intlv_bits;
+ u8 low_bit, intlv_bit = ctx->map.intlv_bit_pos;
+ u64 msb_intlv_bits, temp_addr_a, temp_addr_b;
+ u8 np2_bits = ctx->map.np2_bits;
+
+ if (ctx->map.intlv_mode != DF3_6CHAN)
+ return -EINVAL;
+
+ /*
+ * 'np2_bits' holds the number of bits needed to cover the
+ * amount of memory (rounded up) in this map using 64K chunks.
+ *
+ * Example:
+ * Total memory in map: 6GB
+ * Rounded up to next power-of-2: 8GB
+ * Number of 64K chunks: 0x20000
+ * np2_bits = log2(# of chunks): 17
+ *
+ * Get the two most-significant interleave bits from the
+ * input address based on the following:
+ *
+ * [15 + np2_bits - total_intlv_bits : 14 + np2_bits - total_intlv_bits]
+ */
+ low_bit = 14 + np2_bits - total_intlv_bits;
+ msb_intlv_bits = ctx->ret_addr >> low_bit;
+ msb_intlv_bits &= 0x3;
+
+ /*
+ * If MSB are 11b, then logical COH_ST ID is 6 or 7.
+ * Need to adjust based on the mod3 result.
+ */
+ if (msb_intlv_bits == 3) {
+ u8 addr_mod, phys_addr_msb, msb_coh_st_id;
+
+ /* Get the remaining interleave bits from the input address. */
+ temp_addr_b = GENMASK_ULL(low_bit - 1, intlv_bit) & ctx->ret_addr;
+ temp_addr_b >>= intlv_bit;
+
+ /* Calculate the logical COH_ST offset based on mod3. */
+ addr_mod = temp_addr_b % 3;
+
+ /* Get COH_ST ID bits [2:1]. */
+ msb_coh_st_id = (coh_st_id >> 1) & 0x3;
+
+ /* Get the bit that starts the physical address bits. */
+ phys_addr_msb = (intlv_bit + np2_bits + 1);
+ phys_addr_msb &= BIT(0);
+ phys_addr_msb++;
+ phys_addr_msb *= 3 - addr_mod + msb_coh_st_id;
+ phys_addr_msb %= 3;
+
+ /* Move the physical address MSB to the correct place. */
+ temp_addr_b |= phys_addr_msb << (low_bit - total_intlv_bits - intlv_bit);
+
+ /* Generate a new COH_ST ID as follows: coh_st_id = [1, 1, coh_st_id[0]] */
+ coh_st_id &= BIT(0);
+ coh_st_id |= GENMASK(2, 1);
+ } else {
+ temp_addr_b = GENMASK_ULL(63, intlv_bit) & ctx->ret_addr;
+ temp_addr_b >>= intlv_bit;
+ }
+
+ temp_addr_a = GENMASK_ULL(intlv_bit - 1, 0) & ctx->ret_addr;
+ temp_addr_b <<= intlv_bit + total_intlv_bits;
+
+ ctx->ret_addr = temp_addr_a | temp_addr_b;
+ ctx->ret_addr |= coh_st_id << intlv_bit;
+ return 0;
+}
+
+static int denorm_addr_df4_np2(struct addr_ctx *ctx)
+{
+ bool hash_ctl_64k, hash_ctl_2M, hash_ctl_1G;
+ u16 group, group_offset, log_coh_st_offset;
+ unsigned int mod_value, shift_value;
+ u16 mask = df_cfg.component_id_mask;
+ u64 temp_addr_a, temp_addr_b;
+ bool hash_pa8, hashed_bit;
+
+ switch (ctx->map.intlv_mode) {
+ case DF4_NPS4_3CHAN_HASH:
+ mod_value = 3;
+ shift_value = 13;
+ break;
+ case DF4_NPS2_6CHAN_HASH:
+ mod_value = 3;
+ shift_value = 12;
+ break;
+ case DF4_NPS1_12CHAN_HASH:
+ mod_value = 3;
+ shift_value = 11;
+ break;
+ case DF4_NPS2_5CHAN_HASH:
+ mod_value = 5;
+ shift_value = 13;
+ break;
+ case DF4_NPS1_10CHAN_HASH:
+ mod_value = 5;
+ shift_value = 12;
+ break;
+ default:
+ atl_debug_on_bad_intlv_mode(ctx);
+ return -EINVAL;
+ }
+
+ if (ctx->map.num_intlv_sockets == 1) {
+ hash_pa8 = BIT_ULL(shift_value) & ctx->ret_addr;
+ temp_addr_a = remove_bits(shift_value, shift_value, ctx->ret_addr);
+ } else {
+ hash_pa8 = ctx->coh_st_fabric_id & df_cfg.socket_id_mask;
+ temp_addr_a = ctx->ret_addr;
+ }
+
+ /* Make a gap for the real bit [8]. */
+ temp_addr_a = expand_bits(8, 1, temp_addr_a);
+
+ /* Make an additional gap for bits [13:12], as appropriate. */
+ if (ctx->map.intlv_mode == DF4_NPS2_6CHAN_HASH ||
+ ctx->map.intlv_mode == DF4_NPS1_10CHAN_HASH) {
+ temp_addr_a = expand_bits(13, 1, temp_addr_a);
+ } else if (ctx->map.intlv_mode == DF4_NPS1_12CHAN_HASH) {
+ temp_addr_a = expand_bits(12, 2, temp_addr_a);
+ }
+
+ /* Keep bits [13:0]. */
+ temp_addr_a &= GENMASK_ULL(13, 0);
+
+ /* Get the appropriate high bits. */
+ shift_value += 1 - ilog2(ctx->map.num_intlv_sockets);
+ temp_addr_b = GENMASK_ULL(63, shift_value) & ctx->ret_addr;
+ temp_addr_b >>= shift_value;
+ temp_addr_b *= mod_value;
+
+ /*
+ * Coherent Stations are divided into groups.
+ *
+ * Multiples of 3 (mod3) are divided into quadrants.
+ * e.g. NP4_3CHAN -> [0, 1, 2] [6, 7, 8]
+ * [3, 4, 5] [9, 10, 11]
+ *
+ * Multiples of 5 (mod5) are divided into sides.
+ * e.g. NP2_5CHAN -> [0, 1, 2, 3, 4] [5, 6, 7, 8, 9]
+ */
+
+ /*
+ * Calculate the logical offset for the COH_ST within its DRAM Address map.
+ * e.g. if map includes [5, 6, 7, 8, 9] and target instance is '8', then
+ * log_coh_st_offset = 8 - 5 = 3
+ */
+ log_coh_st_offset = (ctx->coh_st_fabric_id & mask) - (get_dst_fabric_id(ctx) & mask);
+
+ /*
+ * Figure out the group number.
+ *
+ * Following above example,
+ * log_coh_st_offset = 3
+ * mod_value = 5
+ * group = 3 / 5 = 0
+ */
+ group = log_coh_st_offset / mod_value;
+
+ /*
+ * Figure out the offset within the group.
+ *
+ * Following above example,
+ * log_coh_st_offset = 3
+ * mod_value = 5
+ * group_offset = 3 % 5 = 3
+ */
+ group_offset = log_coh_st_offset % mod_value;
+
+ /* Adjust group_offset if the hashed bit [8] is set. */
+ if (hash_pa8) {
+ if (!group_offset)
+ group_offset = mod_value - 1;
+ else
+ group_offset--;
+ }
+
+ /* Add in the group offset to the high bits. */
+ temp_addr_b += group_offset;
+
+ /* Shift the high bits to the proper starting position. */
+ temp_addr_b <<= 14;
+
+ /* Combine the high and low bits together. */
+ ctx->ret_addr = temp_addr_a | temp_addr_b;
+
+ /* Account for hashing here instead of in dehash_address(). */
+ hash_ctl_64k = FIELD_GET(DF4_HASH_CTL_64K, ctx->map.ctl);
+ hash_ctl_2M = FIELD_GET(DF4_HASH_CTL_2M, ctx->map.ctl);
+ hash_ctl_1G = FIELD_GET(DF4_HASH_CTL_1G, ctx->map.ctl);
+
+ hashed_bit = !!hash_pa8;
+ hashed_bit ^= FIELD_GET(BIT_ULL(14), ctx->ret_addr);
+ hashed_bit ^= FIELD_GET(BIT_ULL(16), ctx->ret_addr) & hash_ctl_64k;
+ hashed_bit ^= FIELD_GET(BIT_ULL(21), ctx->ret_addr) & hash_ctl_2M;
+ hashed_bit ^= FIELD_GET(BIT_ULL(30), ctx->ret_addr) & hash_ctl_1G;
+
+ ctx->ret_addr |= hashed_bit << 8;
+
+ /* Done for 3 and 5 channel. */
+ if (ctx->map.intlv_mode == DF4_NPS4_3CHAN_HASH ||
+ ctx->map.intlv_mode == DF4_NPS2_5CHAN_HASH)
+ return 0;
+
+ /* Select the proper 'group' bit to use for Bit 13. */
+ if (ctx->map.intlv_mode == DF4_NPS1_12CHAN_HASH)
+ hashed_bit = !!(group & BIT(1));
+ else
+ hashed_bit = group & BIT(0);
+
+ hashed_bit ^= FIELD_GET(BIT_ULL(18), ctx->ret_addr) & hash_ctl_64k;
+ hashed_bit ^= FIELD_GET(BIT_ULL(23), ctx->ret_addr) & hash_ctl_2M;
+ hashed_bit ^= FIELD_GET(BIT_ULL(32), ctx->ret_addr) & hash_ctl_1G;
+
+ ctx->ret_addr |= hashed_bit << 13;
+
+ /* Done for 6 and 10 channel. */
+ if (ctx->map.intlv_mode != DF4_NPS1_12CHAN_HASH)
+ return 0;
+
+ hashed_bit = group & BIT(0);
+ hashed_bit ^= FIELD_GET(BIT_ULL(17), ctx->ret_addr) & hash_ctl_64k;
+ hashed_bit ^= FIELD_GET(BIT_ULL(22), ctx->ret_addr) & hash_ctl_2M;
+ hashed_bit ^= FIELD_GET(BIT_ULL(31), ctx->ret_addr) & hash_ctl_1G;
+
+ ctx->ret_addr |= hashed_bit << 12;
+ return 0;
+}
+
+int denormalize_address(struct addr_ctx *ctx)
+{
+ switch (ctx->map.intlv_mode) {
+ case NONE:
+ return 0;
+ case DF4_NPS4_3CHAN_HASH:
+ case DF4_NPS2_6CHAN_HASH:
+ case DF4_NPS1_12CHAN_HASH:
+ case DF4_NPS2_5CHAN_HASH:
+ case DF4_NPS1_10CHAN_HASH:
+ return denorm_addr_df4_np2(ctx);
+ case DF3_6CHAN:
+ return denorm_addr_df3_6chan(ctx);
+ default:
+ return denorm_addr_common(ctx);
+ }
+}
diff --git a/drivers/ras/amd/atl/internal.h b/drivers/ras/amd/atl/internal.h
new file mode 100644
index 000000000000..5de69e0bb0f9
--- /dev/null
+++ b/drivers/ras/amd/atl/internal.h
@@ -0,0 +1,306 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * AMD Address Translation Library
+ *
+ * internal.h : Helper functions and common defines
+ *
+ * Copyright (c) 2023, Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Author: Yazen Ghannam <Yazen.Ghannam@amd.com>
+ */
+
+#ifndef __AMD_ATL_INTERNAL_H__
+#define __AMD_ATL_INTERNAL_H__
+
+#include <linux/bitfield.h>
+#include <linux/bitops.h>
+#include <linux/ras.h>
+
+#include <asm/amd_nb.h>
+
+#include "reg_fields.h"
+
+/* Maximum possible number of Coherent Stations within a single Data Fabric. */
+#define MAX_COH_ST_CHANNELS 32
+
+/* PCI ID for Zen4 Server DF Function 0. */
+#define DF_FUNC0_ID_ZEN4_SERVER 0x14AD1022
+
+/* PCI ID for MI300 DF Function 0. */
+#define DF_FUNC0_ID_MI300 0x15281022
+
+/* Shift needed for adjusting register values to true values. */
+#define DF_DRAM_BASE_LIMIT_LSB 28
+#define MI300_DRAM_LIMIT_LSB 20
+
+enum df_revisions {
+ UNKNOWN,
+ DF2,
+ DF3,
+ DF3p5,
+ DF4,
+ DF4p5,
+};
+
+/* These are mapped 1:1 to the hardware values. Special cases are set at > 0x20. */
+enum intlv_modes {
+ NONE = 0x00,
+ NOHASH_2CHAN = 0x01,
+ NOHASH_4CHAN = 0x03,
+ NOHASH_8CHAN = 0x05,
+ DF3_6CHAN = 0x06,
+ NOHASH_16CHAN = 0x07,
+ NOHASH_32CHAN = 0x08,
+ DF3_COD4_2CHAN_HASH = 0x0C,
+ DF3_COD2_4CHAN_HASH = 0x0D,
+ DF3_COD1_8CHAN_HASH = 0x0E,
+ DF4_NPS4_2CHAN_HASH = 0x10,
+ DF4_NPS2_4CHAN_HASH = 0x11,
+ DF4_NPS1_8CHAN_HASH = 0x12,
+ DF4_NPS4_3CHAN_HASH = 0x13,
+ DF4_NPS2_6CHAN_HASH = 0x14,
+ DF4_NPS1_12CHAN_HASH = 0x15,
+ DF4_NPS2_5CHAN_HASH = 0x16,
+ DF4_NPS1_10CHAN_HASH = 0x17,
+ MI3_HASH_8CHAN = 0x18,
+ MI3_HASH_16CHAN = 0x19,
+ MI3_HASH_32CHAN = 0x1A,
+ DF2_2CHAN_HASH = 0x21,
+ /* DF4.5 modes are all IntLvNumChan + 0x20 */
+ DF4p5_NPS1_16CHAN_1K_HASH = 0x2C,
+ DF4p5_NPS0_24CHAN_1K_HASH = 0x2E,
+ DF4p5_NPS4_2CHAN_1K_HASH = 0x30,
+ DF4p5_NPS2_4CHAN_1K_HASH = 0x31,
+ DF4p5_NPS1_8CHAN_1K_HASH = 0x32,
+ DF4p5_NPS4_3CHAN_1K_HASH = 0x33,
+ DF4p5_NPS2_6CHAN_1K_HASH = 0x34,
+ DF4p5_NPS1_12CHAN_1K_HASH = 0x35,
+ DF4p5_NPS2_5CHAN_1K_HASH = 0x36,
+ DF4p5_NPS1_10CHAN_1K_HASH = 0x37,
+ DF4p5_NPS4_2CHAN_2K_HASH = 0x40,
+ DF4p5_NPS2_4CHAN_2K_HASH = 0x41,
+ DF4p5_NPS1_8CHAN_2K_HASH = 0x42,
+ DF4p5_NPS1_16CHAN_2K_HASH = 0x43,
+ DF4p5_NPS4_3CHAN_2K_HASH = 0x44,
+ DF4p5_NPS2_6CHAN_2K_HASH = 0x45,
+ DF4p5_NPS1_12CHAN_2K_HASH = 0x46,
+ DF4p5_NPS0_24CHAN_2K_HASH = 0x47,
+ DF4p5_NPS2_5CHAN_2K_HASH = 0x48,
+ DF4p5_NPS1_10CHAN_2K_HASH = 0x49,
+};
+
+struct df_flags {
+ __u8 legacy_ficaa : 1,
+ socket_id_shift_quirk : 1,
+ heterogeneous : 1,
+ __reserved_0 : 5;
+};
+
+struct df_config {
+ enum df_revisions rev;
+
+ /*
+ * These masks operate on the 16-bit Coherent Station IDs,
+ * e.g. Instance, Fabric, Destination, etc.
+ */
+ u16 component_id_mask;
+ u16 die_id_mask;
+ u16 node_id_mask;
+ u16 socket_id_mask;
+
+ /*
+ * Least-significant bit of Node ID portion of the
+ * system-wide Coherent Station Fabric ID.
+ */
+ u8 node_id_shift;
+
+ /*
+ * Least-significant bit of Die portion of the Node ID.
+ * Adjusted to include the Node ID shift in order to apply
+ * to the Coherent Station Fabric ID.
+ */
+ u8 die_id_shift;
+
+ /*
+ * Least-significant bit of Socket portion of the Node ID.
+ * Adjusted to include the Node ID shift in order to apply
+ * to the Coherent Station Fabric ID.
+ */
+ u8 socket_id_shift;
+
+ /* Number of DRAM Address maps visible in a Coherent Station. */
+ u8 num_coh_st_maps;
+
+ /* Global flags to handle special cases. */
+ struct df_flags flags;
+};
+
+extern struct df_config df_cfg;
+
+struct dram_addr_map {
+ /*
+ * Each DRAM Address Map can operate independently
+ * in different interleaving modes.
+ */
+ enum intlv_modes intlv_mode;
+
+ /* System-wide number for this address map. */
+ u8 num;
+
+ /* Raw register values */
+ u32 base;
+ u32 limit;
+ u32 ctl;
+ u32 intlv;
+
+ /*
+ * Logical to Physical Coherent Station Remapping array
+ *
+ * Index: Logical Coherent Station Instance ID
+ * Value: Physical Coherent Station Instance ID
+ *
+ * phys_coh_st_inst_id = remap_array[log_coh_st_inst_id]
+ */
+ u8 remap_array[MAX_COH_ST_CHANNELS];
+
+ /*
+ * Number of bits covering DRAM Address map 0
+ * when interleaving is non-power-of-2.
+ *
+ * Used only for DF3_6CHAN.
+ */
+ u8 np2_bits;
+
+ /* Position of the 'interleave bit'. */
+ u8 intlv_bit_pos;
+ /* Number of channels interleaved in this map. */
+ u8 num_intlv_chan;
+ /* Number of dies interleaved in this map. */
+ u8 num_intlv_dies;
+ /* Number of sockets interleaved in this map. */
+ u8 num_intlv_sockets;
+ /*
+ * Total number of channels interleaved accounting
+ * for die and socket interleaving.
+ */
+ u8 total_intlv_chan;
+ /* Total bits needed to cover 'total_intlv_chan'. */
+ u8 total_intlv_bits;
+};
+
+/* Original input values cached for debug printing. */
+struct addr_ctx_inputs {
+ u64 norm_addr;
+ u8 socket_id;
+ u8 die_id;
+ u8 coh_st_inst_id;
+};
+
+struct addr_ctx {
+ u64 ret_addr;
+
+ struct addr_ctx_inputs inputs;
+ struct dram_addr_map map;
+
+ /* AMD Node ID calculated from Socket and Die IDs. */
+ u8 node_id;
+
+ /*
+ * Coherent Station Instance ID
+ * Local ID used within a 'node'.
+ */
+ u16 inst_id;
+
+ /*
+ * Coherent Station Fabric ID
+ * System-wide ID that includes 'node' bits.
+ */
+ u16 coh_st_fabric_id;
+};
+
+int df_indirect_read_instance(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo);
+int df_indirect_read_broadcast(u16 node, u8 func, u16 reg, u32 *lo);
+
+int get_df_system_info(void);
+int determine_node_id(struct addr_ctx *ctx, u8 socket_num, u8 die_num);
+int get_addr_hash_mi300(void);
+
+int get_address_map(struct addr_ctx *ctx);
+
+int denormalize_address(struct addr_ctx *ctx);
+int dehash_address(struct addr_ctx *ctx);
+
+unsigned long norm_to_sys_addr(u8 socket_id, u8 die_id, u8 coh_st_inst_id, unsigned long addr);
+unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err);
+
+/*
+ * Make a gap in @data that is @num_bits long starting at @bit_num.
+ * e.g. data     = 11111111'b
+ *      bit_num  = 3
+ *      num_bits = 2
+ *      result   = 1111100111'b
+ */
+static inline u64 expand_bits(u8 bit_num, u8 num_bits, u64 data)
+{
+ u64 temp1, temp2;
+
+ if (!num_bits)
+ return data;
+
+ if (!bit_num) {
+ WARN_ON_ONCE(num_bits >= BITS_PER_LONG);
+ return data << num_bits;
+ }
+
+ WARN_ON_ONCE(bit_num >= BITS_PER_LONG);
+
+ temp1 = data & GENMASK_ULL(bit_num - 1, 0);
+
+ temp2 = data & GENMASK_ULL(63, bit_num);
+ temp2 <<= num_bits;
+
+ return temp1 | temp2;
+}
+
+/*
+ * Remove bits in @data between @low_bit and @high_bit inclusive.
+ * e.g. data     = XXXYYZZZ'b
+ *      low_bit  = 3
+ *      high_bit = 4
+ *      result   = XXXZZZ'b
+ */
+static inline u64 remove_bits(u8 low_bit, u8 high_bit, u64 data)
+{
+ u64 temp1, temp2;
+
+ WARN_ON_ONCE(high_bit >= BITS_PER_LONG);
+ WARN_ON_ONCE(low_bit >= BITS_PER_LONG);
+ WARN_ON_ONCE(low_bit > high_bit);
+
+ if (!low_bit)
+ return data >> (high_bit + 1);
+
+ temp1 = GENMASK_ULL(low_bit - 1, 0) & data;
+ temp2 = GENMASK_ULL(63, high_bit + 1) & data;
+ temp2 >>= high_bit - low_bit + 1;
+
+ return temp1 | temp2;
+}
+
+#define atl_debug(ctx, fmt, arg...) \
+ pr_debug("socket_id=%u die_id=%u coh_st_inst_id=%u norm_addr=0x%016llx: " fmt,\
+ (ctx)->inputs.socket_id, (ctx)->inputs.die_id,\
+ (ctx)->inputs.coh_st_inst_id, (ctx)->inputs.norm_addr, ##arg)
+
+static inline void atl_debug_on_bad_df_rev(void)
+{
+ pr_debug("Unrecognized DF rev: %u", df_cfg.rev);
+}
+
+static inline void atl_debug_on_bad_intlv_mode(struct addr_ctx *ctx)
+{
+ atl_debug(ctx, "Unrecognized interleave mode: %u", ctx->map.intlv_mode);
+}
+
+#endif /* __AMD_ATL_INTERNAL_H__ */
diff --git a/drivers/ras/amd/atl/map.c b/drivers/ras/amd/atl/map.c
new file mode 100644
index 000000000000..8b908e8d7495
--- /dev/null
+++ b/drivers/ras/amd/atl/map.c
@@ -0,0 +1,682 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * AMD Address Translation Library
+ *
+ * map.c : Functions to read and decode DRAM address maps
+ *
+ * Copyright (c) 2023, Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Author: Yazen Ghannam <Yazen.Ghannam@amd.com>
+ */
+
+#include "internal.h"
+
+static int df2_get_intlv_mode(struct addr_ctx *ctx)
+{
+ ctx->map.intlv_mode = FIELD_GET(DF2_INTLV_NUM_CHAN, ctx->map.base);
+
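+ /* A raw DF2 register value of 8 denotes the 2-channel hashed mode. */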
+ if (ctx->map.intlv_mode == 8)
+ ctx->map.intlv_mode = DF2_2CHAN_HASH;
+
+ if (ctx->map.intlv_mode != NONE &&
+ ctx->map.intlv_mode != NOHASH_2CHAN &&
+ ctx->map.intlv_mode != DF2_2CHAN_HASH)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int df3_get_intlv_mode(struct addr_ctx *ctx)
+{
+ ctx->map.intlv_mode = FIELD_GET(DF3_INTLV_NUM_CHAN, ctx->map.base);
+ return 0;
+}
+
+static int df3p5_get_intlv_mode(struct addr_ctx *ctx)
+{
+ ctx->map.intlv_mode = FIELD_GET(DF3p5_INTLV_NUM_CHAN, ctx->map.base);
+
+ if (ctx->map.intlv_mode == DF3_6CHAN)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int df4_get_intlv_mode(struct addr_ctx *ctx)
+{
+ ctx->map.intlv_mode = FIELD_GET(DF4_INTLV_NUM_CHAN, ctx->map.intlv);
+
+ if (ctx->map.intlv_mode == DF3_COD4_2CHAN_HASH ||
+ ctx->map.intlv_mode == DF3_COD2_4CHAN_HASH ||
+ ctx->map.intlv_mode == DF3_COD1_8CHAN_HASH ||
+ ctx->map.intlv_mode == DF3_6CHAN)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int df4p5_get_intlv_mode(struct addr_ctx *ctx)
+{
+ ctx->map.intlv_mode = FIELD_GET(DF4p5_INTLV_NUM_CHAN, ctx->map.intlv);
+
+ if (ctx->map.intlv_mode <= NOHASH_32CHAN)
+ return 0;
+
+ if (ctx->map.intlv_mode >= MI3_HASH_8CHAN &&
+ ctx->map.intlv_mode <= MI3_HASH_32CHAN)
+ return 0;
+
+ /*
+ * Modes matching the ranges above are returned as-is.
+ *
+ * All other modes are "fixed up" by adding 20h to make a unique value.
+ */
+ ctx->map.intlv_mode += 0x20;
+
+ return 0;
+}
+
+static int get_intlv_mode(struct addr_ctx *ctx)
+{
+ int ret;
+
+ switch (df_cfg.rev) {
+ case DF2:
+ ret = df2_get_intlv_mode(ctx);
+ break;
+ case DF3:
+ ret = df3_get_intlv_mode(ctx);
+ break;
+ case DF3p5:
+ ret = df3p5_get_intlv_mode(ctx);
+ break;
+ case DF4:
+ ret = df4_get_intlv_mode(ctx);
+ break;
+ case DF4p5:
+ ret = df4p5_get_intlv_mode(ctx);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ if (ret)
+ atl_debug_on_bad_df_rev();
+
+ return ret;
+}
+
+static u64 get_hi_addr_offset(u32 reg_dram_offset)
+{
+ u8 shift = DF_DRAM_BASE_LIMIT_LSB;
+ u64 hi_addr_offset;
+
+ switch (df_cfg.rev) {
+ case DF2:
+ hi_addr_offset = FIELD_GET(DF2_HI_ADDR_OFFSET, reg_dram_offset);
+ break;
+ case DF3:
+ case DF3p5:
+ hi_addr_offset = FIELD_GET(DF3_HI_ADDR_OFFSET, reg_dram_offset);
+ break;
+ case DF4:
+ case DF4p5:
+ hi_addr_offset = FIELD_GET(DF4_HI_ADDR_OFFSET, reg_dram_offset);
+ break;
+ default:
+ hi_addr_offset = 0;
+ atl_debug_on_bad_df_rev();
+ }
+
+ if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
+ shift = MI300_DRAM_LIMIT_LSB;
+
+ return hi_addr_offset << shift;
+}
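+
+/*
+ * Worked example for get_hi_addr_offset(), with a hypothetical register
+ * value: on a DF4 system, an offset field of 0x2 yields 0x2 << 28 =
+ * 0x20000000 (512 MB). On MI300 (DF4.5 heterogeneous), the smaller
+ * 20-bit shift applies instead, giving 0x200000 (2 MB).
+ */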
+
+/*
+ * Returns: 0 if offset is disabled.
+ * 1 if offset is enabled.
+ * -EINVAL on error.
+ */
+static int get_dram_offset(struct addr_ctx *ctx, u64 *norm_offset)
+{
+ u32 reg_dram_offset;
+ u8 map_num;
+
+ /* Should not be called for map 0. */
+ if (!ctx->map.num) {
+ atl_debug(ctx, "Trying to find DRAM offset for map 0");
+ return -EINVAL;
+ }
+
+ /*
+ * DramOffset registers don't exist for map 0, so the base register
+ * actually refers to map 1.
+ * Adjust the map_num for the register offsets.
+ */
+ map_num = ctx->map.num - 1;
+
+ if (df_cfg.rev >= DF4) {
+ /* Read D18F7x140 (DramOffset) */
+ if (df_indirect_read_instance(ctx->node_id, 7, 0x140 + (4 * map_num),
+ ctx->inst_id, &reg_dram_offset))
+ return -EINVAL;
+
+ } else {
+ /* Read D18F0x1B4 (DramOffset) */
+ if (df_indirect_read_instance(ctx->node_id, 0, 0x1B4 + (4 * map_num),
+ ctx->inst_id, &reg_dram_offset))
+ return -EINVAL;
+ }
+
+ if (!FIELD_GET(DF_HI_ADDR_OFFSET_EN, reg_dram_offset))
+ return 0;
+
+ *norm_offset = get_hi_addr_offset(reg_dram_offset);
+
+ return 1;
+}
+
+static int df3_6ch_get_dram_addr_map(struct addr_ctx *ctx)
+{
+ u16 dst_fabric_id = FIELD_GET(DF3_DST_FABRIC_ID, ctx->map.limit);
+ u8 i, j, shift = 4, mask = 0xF;
+ u32 reg, offset = 0x60;
+ u16 dst_node_id;
+
+ /* Get Socket 1 register. */
+ if (dst_fabric_id & df_cfg.socket_id_mask)
+ offset = 0x68;
+
+ /* Read D18F0x06{0,8} (DF::Skt0CsTargetRemap0)/(DF::Skt0CsTargetRemap1) */
+ if (df_indirect_read_broadcast(ctx->node_id, 0, offset, &reg))
+ return -EINVAL;
+
+ /* Save 8 remap entries. */
+ for (i = 0, j = 0; i < 8; i++, j++)
+ ctx->map.remap_array[i] = (reg >> (j * shift)) & mask;
+
+ dst_node_id = dst_fabric_id & df_cfg.node_id_mask;
+ dst_node_id >>= df_cfg.node_id_shift;
+
+ /* Read D18F2x090 (DF::Np2ChannelConfig) */
+ if (df_indirect_read_broadcast(dst_node_id, 2, 0x90, &reg))
+ return -EINVAL;
+
+ ctx->map.np2_bits = FIELD_GET(DF_LOG2_ADDR_64K_SPACE0, reg);
+ return 0;
+}
+
+static int df2_get_dram_addr_map(struct addr_ctx *ctx)
+{
+ /* Read D18F0x110 (DramBaseAddress). */
+ if (df_indirect_read_instance(ctx->node_id, 0, 0x110 + (8 * ctx->map.num),
+ ctx->inst_id, &ctx->map.base))
+ return -EINVAL;
+
+ /* Read D18F0x114 (DramLimitAddress). */
+ if (df_indirect_read_instance(ctx->node_id, 0, 0x114 + (8 * ctx->map.num),
+ ctx->inst_id, &ctx->map.limit))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int df3_get_dram_addr_map(struct addr_ctx *ctx)
+{
+ if (df2_get_dram_addr_map(ctx))
+ return -EINVAL;
+
+ /* Read D18F0x3F8 (DfGlobalCtl). */
+ if (df_indirect_read_instance(ctx->node_id, 0, 0x3F8,
+ ctx->inst_id, &ctx->map.ctl))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int df4_get_dram_addr_map(struct addr_ctx *ctx)
+{
+ u8 remap_sel, i, j, shift = 4, mask = 0xF;
+ u32 remap_reg;
+
+ /* Read D18F7xE00 (DramBaseAddress). */
+ if (df_indirect_read_instance(ctx->node_id, 7, 0xE00 + (16 * ctx->map.num),
+ ctx->inst_id, &ctx->map.base))
+ return -EINVAL;
+
+ /* Read D18F7xE04 (DramLimitAddress). */
+ if (df_indirect_read_instance(ctx->node_id, 7, 0xE04 + (16 * ctx->map.num),
+ ctx->inst_id, &ctx->map.limit))
+ return -EINVAL;
+
+ /* Read D18F7xE08 (DramAddressCtl). */
+ if (df_indirect_read_instance(ctx->node_id, 7, 0xE08 + (16 * ctx->map.num),
+ ctx->inst_id, &ctx->map.ctl))
+ return -EINVAL;
+
+ /* Read D18F7xE0C (DramAddressIntlv). */
+ if (df_indirect_read_instance(ctx->node_id, 7, 0xE0C + (16 * ctx->map.num),
+ ctx->inst_id, &ctx->map.intlv))
+ return -EINVAL;
+
+ /* Check if Remap Enable bit is valid. */
+ if (!FIELD_GET(DF4_REMAP_EN, ctx->map.ctl))
+ return 0;
+
+ /* Fill with bogus values, because '0' is a valid value. */
+ memset(&ctx->map.remap_array, 0xFF, sizeof(ctx->map.remap_array));
+
+ /* Get Remap registers. */
+ remap_sel = FIELD_GET(DF4_REMAP_SEL, ctx->map.ctl);
+
+ /* Read D18F7x180 (CsTargetRemap0A). */
+ if (df_indirect_read_instance(ctx->node_id, 7, 0x180 + (8 * remap_sel),
+ ctx->inst_id, &remap_reg))
+ return -EINVAL;
+
+ /* Save first 8 remap entries. */
+ for (i = 0, j = 0; i < 8; i++, j++)
+ ctx->map.remap_array[i] = (remap_reg >> (j * shift)) & mask;
+
+ /* Read D18F7x184 (CsTargetRemap0B). */
+ if (df_indirect_read_instance(ctx->node_id, 7, 0x184 + (8 * remap_sel),
+ ctx->inst_id, &remap_reg))
+ return -EINVAL;
+
+ /* Save next 8 remap entries. */
+ for (i = 8, j = 0; i < 16; i++, j++)
+ ctx->map.remap_array[i] = (remap_reg >> (j * shift)) & mask;
+
+ return 0;
+}
+
+static int df4p5_get_dram_addr_map(struct addr_ctx *ctx)
+{
+ u8 remap_sel, i, j, shift = 5, mask = 0x1F;
+ u32 remap_reg;
+
+ /* Read D18F7x200 (DramBaseAddress). */
+ if (df_indirect_read_instance(ctx->node_id, 7, 0x200 + (16 * ctx->map.num),
+ ctx->inst_id, &ctx->map.base))
+ return -EINVAL;
+
+ /* Read D18F7x204 (DramLimitAddress). */
+ if (df_indirect_read_instance(ctx->node_id, 7, 0x204 + (16 * ctx->map.num),
+ ctx->inst_id, &ctx->map.limit))
+ return -EINVAL;
+
+ /* Read D18F7x208 (DramAddressCtl). */
+ if (df_indirect_read_instance(ctx->node_id, 7, 0x208 + (16 * ctx->map.num),
+ ctx->inst_id, &ctx->map.ctl))
+ return -EINVAL;
+
+ /* Read D18F7x20C (DramAddressIntlv). */
+ if (df_indirect_read_instance(ctx->node_id, 7, 0x20C + (16 * ctx->map.num),
+ ctx->inst_id, &ctx->map.intlv))
+ return -EINVAL;
+
+ /* Check if Remap Enable bit is valid. */
+ if (!FIELD_GET(DF4_REMAP_EN, ctx->map.ctl))
+ return 0;
+
+ /* Fill with bogus values, because '0' is a valid value. */
+ memset(&ctx->map.remap_array, 0xFF, sizeof(ctx->map.remap_array));
+
+ /* Get Remap registers. */
+ remap_sel = FIELD_GET(DF4p5_REMAP_SEL, ctx->map.ctl);
+
+ /* Read D18F7x180 (CsTargetRemap0A). */
+ if (df_indirect_read_instance(ctx->node_id, 7, 0x180 + (24 * remap_sel),
+ ctx->inst_id, &remap_reg))
+ return -EINVAL;
+
+ /* Save first 6 remap entries. */
+ for (i = 0, j = 0; i < 6; i++, j++)
+ ctx->map.remap_array[i] = (remap_reg >> (j * shift)) & mask;
+
+ /* Read D18F7x184 (CsTargetRemap0B). */
+ if (df_indirect_read_instance(ctx->node_id, 7, 0x184 + (24 * remap_sel),
+ ctx->inst_id, &remap_reg))
+ return -EINVAL;
+
+ /* Save next 6 remap entries. */
+ for (i = 6, j = 0; i < 12; i++, j++)
+ ctx->map.remap_array[i] = (remap_reg >> (j * shift)) & mask;
+
+ /* Read D18F7x188 (CsTargetRemap0C). */
+ if (df_indirect_read_instance(ctx->node_id, 7, 0x188 + (24 * remap_sel),
+ ctx->inst_id, &remap_reg))
+ return -EINVAL;
+
+ /* Save next 6 remap entries. */
+ for (i = 12, j = 0; i < 18; i++, j++)
+ ctx->map.remap_array[i] = (remap_reg >> (j * shift)) & mask;
+
+ return 0;
+}
+
+static int get_dram_addr_map(struct addr_ctx *ctx)
+{
+ switch (df_cfg.rev) {
+ case DF2: return df2_get_dram_addr_map(ctx);
+ case DF3:
+ case DF3p5: return df3_get_dram_addr_map(ctx);
+ case DF4: return df4_get_dram_addr_map(ctx);
+ case DF4p5: return df4p5_get_dram_addr_map(ctx);
+ default:
+ atl_debug_on_bad_df_rev();
+ return -EINVAL;
+ }
+}
+
+static int get_coh_st_fabric_id(struct addr_ctx *ctx)
+{
+ u32 reg;
+
+ /*
+ * On MI300 systems, the Coherent Station Fabric ID is derived
+ * later. And it does not depend on the register value.
+ */
+ if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
+ return 0;
+
+ /* Read D18F0x50 (FabricBlockInstanceInformation3). */
+ if (df_indirect_read_instance(ctx->node_id, 0, 0x50, ctx->inst_id, &reg))
+ return -EINVAL;
+
+ if (df_cfg.rev < DF4p5)
+ ctx->coh_st_fabric_id = FIELD_GET(DF2_COH_ST_FABRIC_ID, reg);
+ else
+ ctx->coh_st_fabric_id = FIELD_GET(DF4p5_COH_ST_FABRIC_ID, reg);
+
+ return 0;
+}
+
+static int find_normalized_offset(struct addr_ctx *ctx, u64 *norm_offset)
+{
+ u64 last_offset = 0;
+ int ret;
+
+ for (ctx->map.num = 1; ctx->map.num < df_cfg.num_coh_st_maps; ctx->map.num++) {
+ ret = get_dram_offset(ctx, norm_offset);
+ if (ret < 0)
+ return ret;
+
+ /* Continue search if this map's offset is not enabled. */
+ if (!ret)
+ continue;
+
+ /* Enabled offsets should never be 0. */
+ if (*norm_offset == 0) {
+ atl_debug(ctx, "Enabled map %u offset is 0", ctx->map.num);
+ return -EINVAL;
+ }
+
+ /* Offsets should always increase from one map to the next. */
+ if (*norm_offset <= last_offset) {
+ atl_debug(ctx, "Map %u offset (0x%016llx) <= previous (0x%016llx)",
+ ctx->map.num, *norm_offset, last_offset);
+ return -EINVAL;
+ }
+
+ /* Match if this map's offset is less than the current calculated address. */
+ if (ctx->ret_addr >= *norm_offset)
+ break;
+
+ last_offset = *norm_offset;
+ }
+
+ /*
+ * Finished search without finding a match.
+ * Reset to map 0 and no offset.
+ */
+ if (ctx->map.num >= df_cfg.num_coh_st_maps) {
+ ctx->map.num = 0;
+ *norm_offset = 0;
+ }
+
+ return 0;
+}
+
+static bool valid_map(struct addr_ctx *ctx)
+{
+ if (df_cfg.rev >= DF4)
+ return FIELD_GET(DF_ADDR_RANGE_VAL, ctx->map.ctl);
+ else
+ return FIELD_GET(DF_ADDR_RANGE_VAL, ctx->map.base);
+}
+
+static int get_address_map_common(struct addr_ctx *ctx)
+{
+ u64 norm_offset = 0;
+
+ if (get_coh_st_fabric_id(ctx))
+ return -EINVAL;
+
+ if (find_normalized_offset(ctx, &norm_offset))
+ return -EINVAL;
+
+ if (get_dram_addr_map(ctx))
+ return -EINVAL;
+
+ if (!valid_map(ctx))
+ return -EINVAL;
+
+ ctx->ret_addr -= norm_offset;
+
+ return 0;
+}
+
+static u8 get_num_intlv_chan(struct addr_ctx *ctx)
+{
+ switch (ctx->map.intlv_mode) {
+ case NONE:
+ return 1;
+ case NOHASH_2CHAN:
+ case DF2_2CHAN_HASH:
+ case DF3_COD4_2CHAN_HASH:
+ case DF4_NPS4_2CHAN_HASH:
+ case DF4p5_NPS4_2CHAN_1K_HASH:
+ case DF4p5_NPS4_2CHAN_2K_HASH:
+ return 2;
+ case DF4_NPS4_3CHAN_HASH:
+ case DF4p5_NPS4_3CHAN_1K_HASH:
+ case DF4p5_NPS4_3CHAN_2K_HASH:
+ return 3;
+ case NOHASH_4CHAN:
+ case DF3_COD2_4CHAN_HASH:
+ case DF4_NPS2_4CHAN_HASH:
+ case DF4p5_NPS2_4CHAN_1K_HASH:
+ case DF4p5_NPS2_4CHAN_2K_HASH:
+ return 4;
+ case DF4_NPS2_5CHAN_HASH:
+ case DF4p5_NPS2_5CHAN_1K_HASH:
+ case DF4p5_NPS2_5CHAN_2K_HASH:
+ return 5;
+ case DF3_6CHAN:
+ case DF4_NPS2_6CHAN_HASH:
+ case DF4p5_NPS2_6CHAN_1K_HASH:
+ case DF4p5_NPS2_6CHAN_2K_HASH:
+ return 6;
+ case NOHASH_8CHAN:
+ case DF3_COD1_8CHAN_HASH:
+ case DF4_NPS1_8CHAN_HASH:
+ case MI3_HASH_8CHAN:
+ case DF4p5_NPS1_8CHAN_1K_HASH:
+ case DF4p5_NPS1_8CHAN_2K_HASH:
+ return 8;
+ case DF4_NPS1_10CHAN_HASH:
+ case DF4p5_NPS1_10CHAN_1K_HASH:
+ case DF4p5_NPS1_10CHAN_2K_HASH:
+ return 10;
+ case DF4_NPS1_12CHAN_HASH:
+ case DF4p5_NPS1_12CHAN_1K_HASH:
+ case DF4p5_NPS1_12CHAN_2K_HASH:
+ return 12;
+ case NOHASH_16CHAN:
+ case MI3_HASH_16CHAN:
+ case DF4p5_NPS1_16CHAN_1K_HASH:
+ case DF4p5_NPS1_16CHAN_2K_HASH:
+ return 16;
+ case DF4p5_NPS0_24CHAN_1K_HASH:
+ case DF4p5_NPS0_24CHAN_2K_HASH:
+ return 24;
+ case NOHASH_32CHAN:
+ case MI3_HASH_32CHAN:
+ return 32;
+ default:
+ atl_debug_on_bad_intlv_mode(ctx);
+ return 0;
+ }
+}
+
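+/*
+ * Worked example (values chosen for illustration): a 3-channel map spread
+ * across 2 dies and 2 sockets interleaves over 3 * 2 * 2 = 12 targets, and
+ * order_base_2(12) = 4 address bits are needed to cover them.
+ */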
+static void calculate_intlv_bits(struct addr_ctx *ctx)
+{
+ ctx->map.num_intlv_chan = get_num_intlv_chan(ctx);
+
+ ctx->map.total_intlv_chan = ctx->map.num_intlv_chan;
+ ctx->map.total_intlv_chan *= ctx->map.num_intlv_dies;
+ ctx->map.total_intlv_chan *= ctx->map.num_intlv_sockets;
+
+ /*
+ * Get the number of bits needed to cover this many channels.
+ * order_base_2() rounds up automatically.
+ */
+ ctx->map.total_intlv_bits = order_base_2(ctx->map.total_intlv_chan);
+}
+
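+/*
+ * For example, an IntLvAddrSel of 0 means interleaving starts at address
+ * bit 8 (256B granularity), a value of 1 starts at bit 9 (512B), and so on.
+ */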
+static u8 get_intlv_bit_pos(struct addr_ctx *ctx)
+{
+ u8 addr_sel = 0;
+
+ switch (df_cfg.rev) {
+ case DF2:
+ addr_sel = FIELD_GET(DF2_INTLV_ADDR_SEL, ctx->map.base);
+ break;
+ case DF3:
+ case DF3p5:
+ addr_sel = FIELD_GET(DF3_INTLV_ADDR_SEL, ctx->map.base);
+ break;
+ case DF4:
+ case DF4p5:
+ addr_sel = FIELD_GET(DF4_INTLV_ADDR_SEL, ctx->map.intlv);
+ break;
+ default:
+ atl_debug_on_bad_df_rev();
+ break;
+ }
+
+ /* Add '8' to get the 'interleave bit position'. */
+ return addr_sel + 8;
+}
+
+static u8 get_num_intlv_dies(struct addr_ctx *ctx)
+{
+ u8 dies = 0;
+
+ switch (df_cfg.rev) {
+ case DF2:
+ dies = FIELD_GET(DF2_INTLV_NUM_DIES, ctx->map.limit);
+ break;
+ case DF3:
+ dies = FIELD_GET(DF3_INTLV_NUM_DIES, ctx->map.base);
+ break;
+ case DF3p5:
+ dies = FIELD_GET(DF3p5_INTLV_NUM_DIES, ctx->map.base);
+ break;
+ case DF4:
+ case DF4p5:
+ dies = FIELD_GET(DF4_INTLV_NUM_DIES, ctx->map.intlv);
+ break;
+ default:
+ atl_debug_on_bad_df_rev();
+ break;
+ }
+
+ /* Register value is log2, e.g. 0 -> 1 die, 1 -> 2 dies, etc. */
+ return 1 << dies;
+}
+
+static u8 get_num_intlv_sockets(struct addr_ctx *ctx)
+{
+ u8 sockets = 0;
+
+ switch (df_cfg.rev) {
+ case DF2:
+ sockets = FIELD_GET(DF2_INTLV_NUM_SOCKETS, ctx->map.limit);
+ break;
+ case DF3:
+ case DF3p5:
+ sockets = FIELD_GET(DF2_INTLV_NUM_SOCKETS, ctx->map.base);
+ break;
+ case DF4:
+ case DF4p5:
+ sockets = FIELD_GET(DF4_INTLV_NUM_SOCKETS, ctx->map.intlv);
+ break;
+ default:
+ atl_debug_on_bad_df_rev();
+ break;
+ }
+
+	/* Register value is log2, e.g. 0 -> 1 socket, 1 -> 2 sockets, etc. */
+ return 1 << sockets;
+}
+
+static int get_global_map_data(struct addr_ctx *ctx)
+{
+ if (get_intlv_mode(ctx))
+ return -EINVAL;
+
+ if (ctx->map.intlv_mode == DF3_6CHAN &&
+ df3_6ch_get_dram_addr_map(ctx))
+ return -EINVAL;
+
+ ctx->map.intlv_bit_pos = get_intlv_bit_pos(ctx);
+ ctx->map.num_intlv_dies = get_num_intlv_dies(ctx);
+ ctx->map.num_intlv_sockets = get_num_intlv_sockets(ctx);
+ calculate_intlv_bits(ctx);
+
+ return 0;
+}
+
+static void dump_address_map(struct dram_addr_map *map)
+{
+ u8 i;
+
+ pr_debug("intlv_mode=0x%x", map->intlv_mode);
+ pr_debug("num=0x%x", map->num);
+ pr_debug("base=0x%x", map->base);
+ pr_debug("limit=0x%x", map->limit);
+ pr_debug("ctl=0x%x", map->ctl);
+ pr_debug("intlv=0x%x", map->intlv);
+
+ for (i = 0; i < MAX_COH_ST_CHANNELS; i++)
+ pr_debug("remap_array[%u]=0x%x", i, map->remap_array[i]);
+
+ pr_debug("intlv_bit_pos=%u", map->intlv_bit_pos);
+ pr_debug("num_intlv_chan=%u", map->num_intlv_chan);
+ pr_debug("num_intlv_dies=%u", map->num_intlv_dies);
+ pr_debug("num_intlv_sockets=%u", map->num_intlv_sockets);
+ pr_debug("total_intlv_chan=%u", map->total_intlv_chan);
+ pr_debug("total_intlv_bits=%u", map->total_intlv_bits);
+}
+
+int get_address_map(struct addr_ctx *ctx)
+{
+ int ret;
+
+ ret = get_address_map_common(ctx);
+ if (ret)
+ return ret;
+
+ ret = get_global_map_data(ctx);
+ if (ret)
+ return ret;
+
+ dump_address_map(&ctx->map);
+
+ return ret;
+}
diff --git a/drivers/ras/amd/atl/reg_fields.h b/drivers/ras/amd/atl/reg_fields.h
new file mode 100644
index 000000000000..9dcdf6e4a856
--- /dev/null
+++ b/drivers/ras/amd/atl/reg_fields.h
@@ -0,0 +1,606 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * AMD Address Translation Library
+ *
+ * reg_fields.h : Register field definitions
+ *
+ * Copyright (c) 2023, Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Author: Yazen Ghannam <Yazen.Ghannam@amd.com>
+ */
+
+/*
+ * Notes on naming:
+ * 1) Use "DF_" prefix for fields that are the same for all revisions.
+ * 2) Use "DFx_" prefix for fields that differ between revisions.
+ * a) "x" is the first major revision where the new field appears.
+ * b) E.g., if DF2 and DF3 have the same field, then call it DF2.
+ * c) E.g., if DF3p5 and DF4 have the same field, then call it DF4.
+ */
+
+/*
+ * Coherent Station Fabric ID
+ *
+ * Access type: Instance
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * D18F0x50 [Fabric Block Instance Information 3]
+ * DF2 BlockFabricId [19:8]
+ * DF3 BlockFabricId [19:8]
+ * DF3p5 BlockFabricId [19:8]
+ * DF4 BlockFabricId [19:8]
+ * DF4p5 BlockFabricId [15:8]
+ */
+#define DF2_COH_ST_FABRIC_ID GENMASK(19, 8)
+#define DF4p5_COH_ST_FABRIC_ID GENMASK(15, 8)
+
+/*
+ * Component ID Mask
+ *
+ * Access type: Broadcast
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * DF2 N/A
+ *
+ * D18F1x208 [System Fabric ID Mask 0]
+ * DF3 ComponentIdMask [9:0]
+ *
+ * D18F1x150 [System Fabric ID Mask 0]
+ * DF3p5 ComponentIdMask [15:0]
+ *
+ * D18F4x1B0 [System Fabric ID Mask 0]
+ * DF4 ComponentIdMask [15:0]
+ * DF4p5 ComponentIdMask [15:0]
+ */
+#define DF3_COMPONENT_ID_MASK GENMASK(9, 0)
+#define DF4_COMPONENT_ID_MASK GENMASK(15, 0)
+
+/*
+ * Destination Fabric ID
+ *
+ * Access type: Instance
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * D18F0x114 [DRAM Limit Address]
+ * DF2 DstFabricID [7:0]
+ * DF3 DstFabricID [9:0]
+ *	DF3p5	DstFabricID	[11:0]
+ *
+ * D18F7xE08 [DRAM Address Control]
+ * DF4 DstFabricID [27:16]
+ *
+ * D18F7x208 [DRAM Address Control]
+ * DF4p5 DstFabricID [23:16]
+ */
+#define DF2_DST_FABRIC_ID GENMASK(7, 0)
+#define DF3_DST_FABRIC_ID GENMASK(9, 0)
+#define DF3p5_DST_FABRIC_ID GENMASK(11, 0)
+#define DF4_DST_FABRIC_ID GENMASK(27, 16)
+#define DF4p5_DST_FABRIC_ID GENMASK(23, 16)
+
+/*
+ * Die ID Mask
+ *
+ * Access type: Broadcast
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * D18F1x208 [System Fabric ID Mask]
+ * DF2 DieIdMask [15:8]
+ *
+ * D18F1x20C [System Fabric ID Mask 1]
+ * DF3 DieIdMask [18:16]
+ *
+ * D18F1x158 [System Fabric ID Mask 2]
+ * DF3p5 DieIdMask [15:0]
+ *
+ * D18F4x1B8 [System Fabric ID Mask 2]
+ * DF4 DieIdMask [15:0]
+ * DF4p5 DieIdMask [15:0]
+ */
+#define DF2_DIE_ID_MASK GENMASK(15, 8)
+#define DF3_DIE_ID_MASK GENMASK(18, 16)
+#define DF4_DIE_ID_MASK GENMASK(15, 0)
+
+/*
+ * Die ID Shift
+ *
+ * Access type: Broadcast
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * D18F1x208 [System Fabric ID Mask]
+ * DF2 DieIdShift [27:24]
+ *
+ * DF3 N/A
+ * DF3p5 N/A
+ * DF4 N/A
+ * DF4p5 N/A
+ */
+#define DF2_DIE_ID_SHIFT GENMASK(27, 24)
+
+/*
+ * DRAM Address Range Valid
+ *
+ * Access type: Instance
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * D18F0x110 [DRAM Base Address]
+ * DF2 AddrRngVal [0]
+ * DF3 AddrRngVal [0]
+ * DF3p5 AddrRngVal [0]
+ *
+ * D18F7xE08 [DRAM Address Control]
+ * DF4 AddrRngVal [0]
+ *
+ * D18F7x208 [DRAM Address Control]
+ * DF4p5 AddrRngVal [0]
+ */
+#define DF_ADDR_RANGE_VAL BIT(0)
+
+/*
+ * DRAM Base Address
+ *
+ * Access type: Instance
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * D18F0x110 [DRAM Base Address]
+ * DF2 DramBaseAddr [31:12]
+ * DF3 DramBaseAddr [31:12]
+ * DF3p5 DramBaseAddr [31:12]
+ *
+ * D18F7xE00 [DRAM Base Address]
+ * DF4 DramBaseAddr [27:0]
+ *
+ * D18F7x200 [DRAM Base Address]
+ * DF4p5 DramBaseAddr [27:0]
+ */
+#define DF2_BASE_ADDR GENMASK(31, 12)
+#define DF4_BASE_ADDR GENMASK(27, 0)
+
+/*
+ * DRAM Hole Base
+ *
+ * Access type: Broadcast
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * D18F0x104 [DRAM Hole Control]
+ * DF2 DramHoleBase [31:24]
+ * DF3 DramHoleBase [31:24]
+ * DF3p5 DramHoleBase [31:24]
+ *
+ * D18F7x104 [DRAM Hole Control]
+ * DF4 DramHoleBase [31:24]
+ * DF4p5 DramHoleBase [31:24]
+ */
+#define DF_DRAM_HOLE_BASE_MASK GENMASK(31, 24)
+
+/*
+ * DRAM Limit Address
+ *
+ * Access type: Instance
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * D18F0x114 [DRAM Limit Address]
+ * DF2 DramLimitAddr [31:12]
+ * DF3 DramLimitAddr [31:12]
+ * DF3p5 DramLimitAddr [31:12]
+ *
+ * D18F7xE04 [DRAM Limit Address]
+ * DF4 DramLimitAddr [27:0]
+ *
+ * D18F7x204 [DRAM Limit Address]
+ * DF4p5 DramLimitAddr [27:0]
+ */
+#define DF2_DRAM_LIMIT_ADDR GENMASK(31, 12)
+#define DF4_DRAM_LIMIT_ADDR GENMASK(27, 0)
+
+/*
+ * Hash Interleave Controls
+ *
+ * Access type: Instance
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * DF2 N/A
+ *
+ * D18F0x3F8 [DF Global Control]
+ * DF3 GlbHashIntlvCtl64K [20]
+ * GlbHashIntlvCtl2M [21]
+ * GlbHashIntlvCtl1G [22]
+ *
+ * DF3p5 GlbHashIntlvCtl64K [20]
+ * GlbHashIntlvCtl2M [21]
+ * GlbHashIntlvCtl1G [22]
+ *
+ * D18F7xE08 [DRAM Address Control]
+ * DF4 HashIntlvCtl64K [8]
+ * HashIntlvCtl2M [9]
+ * HashIntlvCtl1G [10]
+ *
+ * D18F7x208 [DRAM Address Control]
+ * DF4p5 HashIntlvCtl4K [7]
+ * HashIntlvCtl64K [8]
+ * HashIntlvCtl2M [9]
+ * HashIntlvCtl1G [10]
+ * HashIntlvCtl1T [15]
+ */
+#define DF3_HASH_CTL_64K BIT(20)
+#define DF3_HASH_CTL_2M BIT(21)
+#define DF3_HASH_CTL_1G BIT(22)
+#define DF4_HASH_CTL_64K BIT(8)
+#define DF4_HASH_CTL_2M BIT(9)
+#define DF4_HASH_CTL_1G BIT(10)
+#define DF4p5_HASH_CTL_4K BIT(7)
+#define DF4p5_HASH_CTL_1T BIT(15)
+
+/*
+ * High Address Offset
+ *
+ * Access type: Instance
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * D18F0x1B4 [DRAM Offset]
+ * DF2 HiAddrOffset [31:20]
+ * DF3 HiAddrOffset [31:12]
+ * DF3p5 HiAddrOffset [31:12]
+ *
+ * D18F7x140 [DRAM Offset]
+ * DF4 HiAddrOffset [24:1]
+ * DF4p5 HiAddrOffset [24:1]
+ * MI300 HiAddrOffset [31:1]
+ */
+#define DF2_HI_ADDR_OFFSET GENMASK(31, 20)
+#define DF3_HI_ADDR_OFFSET GENMASK(31, 12)
+
+/* Follow reference code by including reserved bits for simplicity. */
+#define DF4_HI_ADDR_OFFSET GENMASK(31, 1)
+
+/*
+ * High Address Offset Enable
+ *
+ * Access type: Instance
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * D18F0x1B4 [DRAM Offset]
+ * DF2 HiAddrOffsetEn [0]
+ * DF3 HiAddrOffsetEn [0]
+ * DF3p5 HiAddrOffsetEn [0]
+ *
+ * D18F7x140 [DRAM Offset]
+ * DF4 HiAddrOffsetEn [0]
+ * DF4p5 HiAddrOffsetEn [0]
+ */
+#define DF_HI_ADDR_OFFSET_EN BIT(0)
+
+/*
+ * Interleave Address Select
+ *
+ * Access type: Instance
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * D18F0x110 [DRAM Base Address]
+ * DF2 IntLvAddrSel [10:8]
+ * DF3 IntLvAddrSel [11:9]
+ * DF3p5 IntLvAddrSel [11:9]
+ *
+ * D18F7xE0C [DRAM Address Interleave]
+ * DF4 IntLvAddrSel [2:0]
+ *
+ * D18F7x20C [DRAM Address Interleave]
+ * DF4p5 IntLvAddrSel [2:0]
+ */
+#define DF2_INTLV_ADDR_SEL GENMASK(10, 8)
+#define DF3_INTLV_ADDR_SEL GENMASK(11, 9)
+#define DF4_INTLV_ADDR_SEL GENMASK(2, 0)
+
+/*
+ * Interleave Number of Channels
+ *
+ * Access type: Instance
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * D18F0x110 [DRAM Base Address]
+ * DF2 IntLvNumChan [7:4]
+ * DF3 IntLvNumChan [5:2]
+ * DF3p5 IntLvNumChan [6:2]
+ *
+ * D18F7xE0C [DRAM Address Interleave]
+ * DF4 IntLvNumChan [8:4]
+ *
+ * D18F7x20C [DRAM Address Interleave]
+ * DF4p5 IntLvNumChan [9:4]
+ */
+#define DF2_INTLV_NUM_CHAN GENMASK(7, 4)
+#define DF3_INTLV_NUM_CHAN GENMASK(5, 2)
+#define DF3p5_INTLV_NUM_CHAN GENMASK(6, 2)
+#define DF4_INTLV_NUM_CHAN GENMASK(8, 4)
+#define DF4p5_INTLV_NUM_CHAN GENMASK(9, 4)
+
+/*
+ * Interleave Number of Dies
+ *
+ * Access type: Instance
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * D18F0x114 [DRAM Limit Address]
+ * DF2 IntLvNumDies [11:10]
+ *
+ * D18F0x110 [DRAM Base Address]
+ * DF3 IntLvNumDies [7:6]
+ * DF3p5 IntLvNumDies [7]
+ *
+ * D18F7xE0C [DRAM Address Interleave]
+ * DF4 IntLvNumDies [13:12]
+ *
+ * D18F7x20C [DRAM Address Interleave]
+ * DF4p5 IntLvNumDies [13:12]
+ */
+#define DF2_INTLV_NUM_DIES GENMASK(11, 10)
+#define DF3_INTLV_NUM_DIES GENMASK(7, 6)
+#define DF3p5_INTLV_NUM_DIES BIT(7)
+#define DF4_INTLV_NUM_DIES GENMASK(13, 12)
+
+/*
+ * Interleave Number of Sockets
+ *
+ * Access type: Instance
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * D18F0x114 [DRAM Limit Address]
+ * DF2 IntLvNumSockets [8]
+ *
+ * D18F0x110 [DRAM Base Address]
+ * DF3 IntLvNumSockets [8]
+ * DF3p5 IntLvNumSockets [8]
+ *
+ * D18F7xE0C [DRAM Address Interleave]
+ * DF4 IntLvNumSockets [18]
+ *
+ * D18F7x20C [DRAM Address Interleave]
+ * DF4p5 IntLvNumSockets [18]
+ */
+#define DF2_INTLV_NUM_SOCKETS BIT(8)
+#define DF4_INTLV_NUM_SOCKETS BIT(18)
+
+/*
+ * Legacy MMIO Hole Enable
+ *
+ * Access type: Instance
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * D18F0x110 [DRAM Base Address]
+ * DF2 LgcyMmioHoleEn [1]
+ * DF3 LgcyMmioHoleEn [1]
+ * DF3p5 LgcyMmioHoleEn [1]
+ *
+ * D18F7xE08 [DRAM Address Control]
+ * DF4 LgcyMmioHoleEn [1]
+ *
+ * D18F7x208 [DRAM Address Control]
+ * DF4p5 LgcyMmioHoleEn [1]
+ */
+#define DF_LEGACY_MMIO_HOLE_EN BIT(1)
+
+/*
+ * Log2 Address 64K Space 0
+ *
+ * Access type: Instance
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * DF2 N/A
+ *
+ * D18F2x90 [Non-power-of-2 channel Configuration Register for COH_ST DRAM Address Maps]
+ * DF3 Log2Addr64KSpace0 [5:0]
+ *
+ * DF3p5 N/A
+ * DF4 N/A
+ * DF4p5 N/A
+ */
+#define DF_LOG2_ADDR_64K_SPACE0 GENMASK(5, 0)
+
+/*
+ * Major Revision
+ *
+ * Access type: Broadcast
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * DF2 N/A
+ * DF3 N/A
+ * DF3p5 N/A
+ *
+ * D18F0x040 [Fabric Block Instance Count]
+ * DF4 MajorRevision [27:24]
+ * DF4p5 MajorRevision [27:24]
+ */
+#define DF_MAJOR_REVISION GENMASK(27, 24)
+
+/*
+ * Minor Revision
+ *
+ * Access type: Broadcast
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * DF2 N/A
+ * DF3 N/A
+ * DF3p5 N/A
+ *
+ * D18F0x040 [Fabric Block Instance Count]
+ * DF4 MinorRevision [23:16]
+ * DF4p5 MinorRevision [23:16]
+ */
+#define DF_MINOR_REVISION GENMASK(23, 16)
+
+/*
+ * Node ID Mask
+ *
+ * Access type: Broadcast
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * DF2 N/A
+ *
+ * D18F1x208 [System Fabric ID Mask 0]
+ * DF3 NodeIdMask [25:16]
+ *
+ * D18F1x150 [System Fabric ID Mask 0]
+ * DF3p5 NodeIdMask [31:16]
+ *
+ * D18F4x1B0 [System Fabric ID Mask 0]
+ * DF4 NodeIdMask [31:16]
+ * DF4p5 NodeIdMask [31:16]
+ */
+#define DF3_NODE_ID_MASK GENMASK(25, 16)
+#define DF4_NODE_ID_MASK GENMASK(31, 16)
+
+/*
+ * Node ID Shift
+ *
+ * Access type: Broadcast
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * DF2 N/A
+ *
+ * D18F1x20C [System Fabric ID Mask 1]
+ * DF3 NodeIdShift [3:0]
+ *
+ * D18F1x154 [System Fabric ID Mask 1]
+ * DF3p5 NodeIdShift [3:0]
+ *
+ * D18F4x1B4 [System Fabric ID Mask 1]
+ * DF4 NodeIdShift [3:0]
+ * DF4p5 NodeIdShift [3:0]
+ */
+#define DF3_NODE_ID_SHIFT GENMASK(3, 0)
+
+/*
+ * Remap Enable
+ *
+ * Access type: Instance
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * DF2 N/A
+ * DF3 N/A
+ * DF3p5 N/A
+ *
+ * D18F7xE08 [DRAM Address Control]
+ * DF4 RemapEn [4]
+ *
+ * D18F7x208 [DRAM Address Control]
+ * DF4p5 RemapEn [4]
+ */
+#define DF4_REMAP_EN BIT(4)
+
+/*
+ * Remap Select
+ *
+ * Access type: Instance
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * DF2 N/A
+ * DF3 N/A
+ * DF3p5 N/A
+ *
+ * D18F7xE08 [DRAM Address Control]
+ * DF4 RemapSel [7:5]
+ *
+ * D18F7x208 [DRAM Address Control]
+ * DF4p5 RemapSel [6:5]
+ */
+#define DF4_REMAP_SEL GENMASK(7, 5)
+#define DF4p5_REMAP_SEL GENMASK(6, 5)
+
+/*
+ * Socket ID Mask
+ *
+ * Access type: Broadcast
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * D18F1x208 [System Fabric ID Mask]
+ * DF2 SocketIdMask [23:16]
+ *
+ * D18F1x20C [System Fabric ID Mask 1]
+ * DF3 SocketIdMask [26:24]
+ *
+ * D18F1x158 [System Fabric ID Mask 2]
+ * DF3p5 SocketIdMask [31:16]
+ *
+ * D18F4x1B8 [System Fabric ID Mask 2]
+ * DF4 SocketIdMask [31:16]
+ * DF4p5 SocketIdMask [31:16]
+ */
+#define DF2_SOCKET_ID_MASK GENMASK(23, 16)
+#define DF3_SOCKET_ID_MASK GENMASK(26, 24)
+#define DF4_SOCKET_ID_MASK GENMASK(31, 16)
+
+/*
+ * Socket ID Shift
+ *
+ * Access type: Broadcast
+ *
+ * Register
+ * Rev Fieldname Bits
+ *
+ * D18F1x208 [System Fabric ID Mask]
+ * DF2 SocketIdShift [31:28]
+ *
+ * D18F1x20C [System Fabric ID Mask 1]
+ * DF3 SocketIdShift [9:8]
+ *
+ * D18F1x158 [System Fabric ID Mask 2]
+ * DF3p5 SocketIdShift [11:8]
+ *
+ * D18F4x1B4 [System Fabric ID Mask 1]
+ * DF4 SocketIdShift [11:8]
+ * DF4p5 SocketIdShift [11:8]
+ */
+#define DF2_SOCKET_ID_SHIFT GENMASK(31, 28)
+#define DF3_SOCKET_ID_SHIFT GENMASK(9, 8)
+#define DF4_SOCKET_ID_SHIFT GENMASK(11, 8)
diff --git a/drivers/ras/amd/atl/system.c b/drivers/ras/amd/atl/system.c
new file mode 100644
index 000000000000..701349e84942
--- /dev/null
+++ b/drivers/ras/amd/atl/system.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * AMD Address Translation Library
+ *
+ * system.c : Functions to read and save system-wide data
+ *
+ * Copyright (c) 2023, Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Author: Yazen Ghannam <Yazen.Ghannam@amd.com>
+ */
+
+#include "internal.h"
+
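+/*
+ * Illustrative example with hypothetical DF settings (not read from real
+ * hardware): socket_id_shift = 8, socket_id_mask = 0x100, die_id_mask = 0,
+ * node_id_shift = 8. Then socket_id = 1 and die_id = 0 yield
+ * node_id = (1 << 8) >> 8 = 1.
+ */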
+int determine_node_id(struct addr_ctx *ctx, u8 socket_id, u8 die_id)
+{
+ u16 socket_id_bits, die_id_bits;
+
+ if (socket_id > 0 && df_cfg.socket_id_mask == 0) {
+ atl_debug(ctx, "Invalid socket inputs: socket_id=%u socket_id_mask=0x%x",
+ socket_id, df_cfg.socket_id_mask);
+ return -EINVAL;
+ }
+
+ /* Do each step independently to avoid shift out-of-bounds issues. */
+ socket_id_bits = socket_id;
+ socket_id_bits <<= df_cfg.socket_id_shift;
+ socket_id_bits &= df_cfg.socket_id_mask;
+
+ if (die_id > 0 && df_cfg.die_id_mask == 0) {
+ atl_debug(ctx, "Invalid die inputs: die_id=%u die_id_mask=0x%x",
+ die_id, df_cfg.die_id_mask);
+ return -EINVAL;
+ }
+
+ /* Do each step independently to avoid shift out-of-bounds issues. */
+ die_id_bits = die_id;
+ die_id_bits <<= df_cfg.die_id_shift;
+ die_id_bits &= df_cfg.die_id_mask;
+
+ ctx->node_id = (socket_id_bits | die_id_bits) >> df_cfg.node_id_shift;
+ return 0;
+}
+
+static void df2_get_masks_shifts(u32 mask0)
+{
+ df_cfg.socket_id_shift = FIELD_GET(DF2_SOCKET_ID_SHIFT, mask0);
+ df_cfg.socket_id_mask = FIELD_GET(DF2_SOCKET_ID_MASK, mask0);
+ df_cfg.die_id_shift = FIELD_GET(DF2_DIE_ID_SHIFT, mask0);
+ df_cfg.die_id_mask = FIELD_GET(DF2_DIE_ID_MASK, mask0);
+ df_cfg.node_id_shift = df_cfg.die_id_shift;
+ df_cfg.node_id_mask = df_cfg.socket_id_mask | df_cfg.die_id_mask;
+ df_cfg.component_id_mask = ~df_cfg.node_id_mask;
+}
+
+static void df3_get_masks_shifts(u32 mask0, u32 mask1)
+{
+ df_cfg.component_id_mask = FIELD_GET(DF3_COMPONENT_ID_MASK, mask0);
+ df_cfg.node_id_mask = FIELD_GET(DF3_NODE_ID_MASK, mask0);
+
+ df_cfg.node_id_shift = FIELD_GET(DF3_NODE_ID_SHIFT, mask1);
+ df_cfg.socket_id_shift = FIELD_GET(DF3_SOCKET_ID_SHIFT, mask1);
+ df_cfg.socket_id_mask = FIELD_GET(DF3_SOCKET_ID_MASK, mask1);
+ df_cfg.die_id_mask = FIELD_GET(DF3_DIE_ID_MASK, mask1);
+}
+
+static void df3p5_get_masks_shifts(u32 mask0, u32 mask1, u32 mask2)
+{
+ df_cfg.component_id_mask = FIELD_GET(DF4_COMPONENT_ID_MASK, mask0);
+ df_cfg.node_id_mask = FIELD_GET(DF4_NODE_ID_MASK, mask0);
+
+ df_cfg.node_id_shift = FIELD_GET(DF3_NODE_ID_SHIFT, mask1);
+ df_cfg.socket_id_shift = FIELD_GET(DF4_SOCKET_ID_SHIFT, mask1);
+
+ df_cfg.socket_id_mask = FIELD_GET(DF4_SOCKET_ID_MASK, mask2);
+ df_cfg.die_id_mask = FIELD_GET(DF4_DIE_ID_MASK, mask2);
+}
+
+static void df4_get_masks_shifts(u32 mask0, u32 mask1, u32 mask2)
+{
+ df3p5_get_masks_shifts(mask0, mask1, mask2);
+
+ if (!(df_cfg.flags.socket_id_shift_quirk && df_cfg.socket_id_shift == 1))
+ return;
+
+ df_cfg.socket_id_shift = 0;
+ df_cfg.socket_id_mask = 1;
+ df_cfg.die_id_shift = 0;
+ df_cfg.die_id_mask = 0;
+ df_cfg.node_id_shift = 8;
+ df_cfg.node_id_mask = 0x100;
+}
+
+static int df4_get_fabric_id_mask_registers(void)
+{
+ u32 mask0, mask1, mask2;
+
+ /* Read D18F4x1B0 (SystemFabricIdMask0) */
+ if (df_indirect_read_broadcast(0, 4, 0x1B0, &mask0))
+ return -EINVAL;
+
+ /* Read D18F4x1B4 (SystemFabricIdMask1) */
+ if (df_indirect_read_broadcast(0, 4, 0x1B4, &mask1))
+ return -EINVAL;
+
+ /* Read D18F4x1B8 (SystemFabricIdMask2) */
+ if (df_indirect_read_broadcast(0, 4, 0x1B8, &mask2))
+ return -EINVAL;
+
+ df4_get_masks_shifts(mask0, mask1, mask2);
+ return 0;
+}
+
+static int df4_determine_df_rev(u32 reg)
+{
+ df_cfg.rev = FIELD_GET(DF_MINOR_REVISION, reg) < 5 ? DF4 : DF4p5;
+
+	/* Check for special cases or quirks based on Device/Vendor IDs. */
+
+ /* Read D18F0x000 (DeviceVendorId0) */
+ if (df_indirect_read_broadcast(0, 0, 0, &reg))
+ return -EINVAL;
+
+ if (reg == DF_FUNC0_ID_ZEN4_SERVER)
+ df_cfg.flags.socket_id_shift_quirk = 1;
+
+ if (reg == DF_FUNC0_ID_MI300) {
+ df_cfg.flags.heterogeneous = 1;
+
+ if (get_addr_hash_mi300())
+ return -EINVAL;
+ }
+
+ return df4_get_fabric_id_mask_registers();
+}
+
+static int determine_df_rev_legacy(void)
+{
+ u32 fabric_id_mask0, fabric_id_mask1, fabric_id_mask2;
+
+ /*
+ * Check for DF3.5.
+ *
+ * Component ID Mask must be non-zero. Register D18F1x150 is
+ * reserved pre-DF3.5, so value will be Read-as-Zero.
+ */
+
+ /* Read D18F1x150 (SystemFabricIdMask0). */
+ if (df_indirect_read_broadcast(0, 1, 0x150, &fabric_id_mask0))
+ return -EINVAL;
+
+ if (FIELD_GET(DF4_COMPONENT_ID_MASK, fabric_id_mask0)) {
+ df_cfg.rev = DF3p5;
+
+ /* Read D18F1x154 (SystemFabricIdMask1) */
+ if (df_indirect_read_broadcast(0, 1, 0x154, &fabric_id_mask1))
+ return -EINVAL;
+
+ /* Read D18F1x158 (SystemFabricIdMask2) */
+ if (df_indirect_read_broadcast(0, 1, 0x158, &fabric_id_mask2))
+ return -EINVAL;
+
+ df3p5_get_masks_shifts(fabric_id_mask0, fabric_id_mask1, fabric_id_mask2);
+ return 0;
+ }
+
+ /*
+ * Check for DF3.
+ *
+ * Component ID Mask must be non-zero. Field is Read-as-Zero on DF2.
+ */
+
+ /* Read D18F1x208 (SystemFabricIdMask). */
+ if (df_indirect_read_broadcast(0, 1, 0x208, &fabric_id_mask0))
+ return -EINVAL;
+
+ if (FIELD_GET(DF3_COMPONENT_ID_MASK, fabric_id_mask0)) {
+ df_cfg.rev = DF3;
+
+ /* Read D18F1x20C (SystemFabricIdMask1) */
+ if (df_indirect_read_broadcast(0, 1, 0x20C, &fabric_id_mask1))
+ return -EINVAL;
+
+ df3_get_masks_shifts(fabric_id_mask0, fabric_id_mask1);
+ return 0;
+ }
+
+ /* Default to DF2. */
+ df_cfg.rev = DF2;
+ df2_get_masks_shifts(fabric_id_mask0);
+ return 0;
+}
+
+static int determine_df_rev(void)
+{
+ u32 reg;
+ u8 rev;
+
+ if (df_cfg.rev != UNKNOWN)
+ return 0;
+
+ /* Read D18F0x40 (FabricBlockInstanceCount). */
+ if (df_indirect_read_broadcast(0, 0, 0x40, &reg))
+ return -EINVAL;
+
+ /*
+ * Revision fields added for DF4 and later.
+ *
+ * Major revision of '0' is found pre-DF4. Field is Read-as-Zero.
+ */
+ rev = FIELD_GET(DF_MAJOR_REVISION, reg);
+ if (!rev)
+ return determine_df_rev_legacy();
+
+ /*
+ * Fail out for major revisions other than '4'.
+ *
+ * Explicit support should be added for newer systems to avoid issues.
+ */
+ if (rev == 4)
+ return df4_determine_df_rev(reg);
+
+ return -EINVAL;
+}
+
+static void get_num_maps(void)
+{
+ switch (df_cfg.rev) {
+ case DF2:
+ case DF3:
+ case DF3p5:
+ df_cfg.num_coh_st_maps = 2;
+ break;
+ case DF4:
+ case DF4p5:
+ df_cfg.num_coh_st_maps = 4;
+ break;
+ default:
+ atl_debug_on_bad_df_rev();
+ }
+}
+
+static void apply_node_id_shift(void)
+{
+ if (df_cfg.rev == DF2)
+ return;
+
+ df_cfg.die_id_shift = df_cfg.node_id_shift;
+ df_cfg.die_id_mask <<= df_cfg.node_id_shift;
+ df_cfg.socket_id_mask <<= df_cfg.node_id_shift;
+ df_cfg.socket_id_shift += df_cfg.node_id_shift;
+}
+
+static void dump_df_cfg(void)
+{
+ pr_debug("rev=0x%x", df_cfg.rev);
+
+ pr_debug("component_id_mask=0x%x", df_cfg.component_id_mask);
+ pr_debug("die_id_mask=0x%x", df_cfg.die_id_mask);
+ pr_debug("node_id_mask=0x%x", df_cfg.node_id_mask);
+ pr_debug("socket_id_mask=0x%x", df_cfg.socket_id_mask);
+
+ pr_debug("die_id_shift=0x%x", df_cfg.die_id_shift);
+ pr_debug("node_id_shift=0x%x", df_cfg.node_id_shift);
+ pr_debug("socket_id_shift=0x%x", df_cfg.socket_id_shift);
+
+ pr_debug("num_coh_st_maps=%u", df_cfg.num_coh_st_maps);
+
+ pr_debug("flags.legacy_ficaa=%u", df_cfg.flags.legacy_ficaa);
+ pr_debug("flags.socket_id_shift_quirk=%u", df_cfg.flags.socket_id_shift_quirk);
+}
+
+int get_df_system_info(void)
+{
+ if (determine_df_rev()) {
+ pr_warn("amd_atl: Failed to determine DF Revision");
+ df_cfg.rev = UNKNOWN;
+ return -EINVAL;
+ }
+
+ apply_node_id_shift();
+
+ get_num_maps();
+
+ dump_df_cfg();
+
+ return 0;
+}
diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c
new file mode 100644
index 000000000000..59b6169093f7
--- /dev/null
+++ b/drivers/ras/amd/atl/umc.c
@@ -0,0 +1,341 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * AMD Address Translation Library
+ *
+ * umc.c : Unified Memory Controller (UMC) topology helpers
+ *
+ * Copyright (c) 2023, Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Author: Yazen Ghannam <Yazen.Ghannam@amd.com>
+ */
+
+#include "internal.h"
+
+/*
+ * MI300 has a fixed, model-specific mapping between a UMC instance and
+ * its related Data Fabric Coherent Station instance.
+ *
+ * The MCA_IPID_UMC[InstanceId] field holds a unique identifier for the
+ * UMC instance within a Node. Use this to find the appropriate Coherent
+ * Station ID.
+ *
+ * Redundant bits were removed from the map below.
+ */
+static const u16 umc_coh_st_map[32] = {
+ 0x393, 0x293, 0x193, 0x093,
+ 0x392, 0x292, 0x192, 0x092,
+ 0x391, 0x291, 0x191, 0x091,
+ 0x390, 0x290, 0x190, 0x090,
+ 0x793, 0x693, 0x593, 0x493,
+ 0x792, 0x692, 0x592, 0x492,
+ 0x791, 0x691, 0x591, 0x491,
+ 0x790, 0x690, 0x590, 0x490,
+};
+
+#define UMC_ID_MI300 GENMASK(23, 12)
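+/*
+ * Example: MCA_IPID[23:12] = 0x293 matches index 1 in umc_coh_st_map[], so
+ * the error is attributed to Coherent Station instance 1.
+ */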
+static u8 get_coh_st_inst_id_mi300(struct atl_err *err)
+{
+ u16 umc_id = FIELD_GET(UMC_ID_MI300, err->ipid);
+ u8 i;
+
+ for (i = 0; i < ARRAY_SIZE(umc_coh_st_map); i++) {
+ if (umc_id == umc_coh_st_map[i])
+ break;
+ }
+
+ WARN_ON_ONCE(i >= ARRAY_SIZE(umc_coh_st_map));
+
+ return i;
+}
+
+/* XOR together the bits in @val, i.e. compute the bit parity of @val. */
+static u16 bitwise_xor_bits(u16 val)
+{
+ u16 tmp = 0;
+ u8 i;
+
+ for (i = 0; i < 16; i++)
+ tmp ^= (val >> i) & 0x1;
+
+ return tmp;
+}
+
+struct xor_bits {
+ bool xor_enable;
+ u16 col_xor;
+ u32 row_xor;
+};
+
+#define NUM_BANK_BITS 4
+
+static struct {
+ /* UMC::CH::AddrHashBank */
+ struct xor_bits bank[NUM_BANK_BITS];
+
+ /* UMC::CH::AddrHashPC */
+ struct xor_bits pc;
+
+ /* UMC::CH::AddrHashPC2 */
+ u8 bank_xor;
+} addr_hash;
+
+#define MI300_UMC_CH_BASE 0x90000
+#define MI300_ADDR_HASH_BANK0 (MI300_UMC_CH_BASE + 0xC8)
+#define MI300_ADDR_HASH_PC (MI300_UMC_CH_BASE + 0xE0)
+#define MI300_ADDR_HASH_PC2 (MI300_UMC_CH_BASE + 0xE4)
+
+#define ADDR_HASH_XOR_EN BIT(0)
+#define ADDR_HASH_COL_XOR GENMASK(13, 1)
+#define ADDR_HASH_ROW_XOR GENMASK(31, 14)
+#define ADDR_HASH_BANK_XOR GENMASK(5, 0)
+
+/*
+ * Read UMC::CH::AddrHash{Bank,PC,PC2} registers to get XOR bits used
+ * for hashing. Do this during module init, since the values will not
+ * change during run time.
+ *
+ * These registers are instantiated for each UMC across each AMD Node.
+ * However, they should be identically programmed due to the fixed hardware
+ * design of MI300 systems. So read the values from Node 0 UMC 0 and keep a
+ * single global structure for simplicity.
+ */
+int get_addr_hash_mi300(void)
+{
+ u32 temp;
+ int ret;
+ u8 i;
+
+ for (i = 0; i < NUM_BANK_BITS; i++) {
+ ret = amd_smn_read(0, MI300_ADDR_HASH_BANK0 + (i * 4), &temp);
+ if (ret)
+ return ret;
+
+ addr_hash.bank[i].xor_enable = FIELD_GET(ADDR_HASH_XOR_EN, temp);
+ addr_hash.bank[i].col_xor = FIELD_GET(ADDR_HASH_COL_XOR, temp);
+ addr_hash.bank[i].row_xor = FIELD_GET(ADDR_HASH_ROW_XOR, temp);
+ }
+
+ ret = amd_smn_read(0, MI300_ADDR_HASH_PC, &temp);
+ if (ret)
+ return ret;
+
+ addr_hash.pc.xor_enable = FIELD_GET(ADDR_HASH_XOR_EN, temp);
+ addr_hash.pc.col_xor = FIELD_GET(ADDR_HASH_COL_XOR, temp);
+ addr_hash.pc.row_xor = FIELD_GET(ADDR_HASH_ROW_XOR, temp);
+
+ ret = amd_smn_read(0, MI300_ADDR_HASH_PC2, &temp);
+ if (ret)
+ return ret;
+
+ addr_hash.bank_xor = FIELD_GET(ADDR_HASH_BANK_XOR, temp);
+
+ return 0;
+}
+
+/*
+ * MI300 systems report a DRAM address in MCA_ADDR for DRAM ECC errors. This must
+ * be converted to the intermediate normalized address (NA) before translating to a
+ * system physical address.
+ *
+ * The DRAM address includes bank, row, and column. Also included are bits for
+ * pseudochannel (PC) and stack ID (SID).
+ *
+ * Abbreviations: (S)tack ID, (P)seudochannel, (R)ow, (B)ank, (C)olumn, (Z)ero
+ *
+ * The MCA address format is as follows:
+ * MCA_ADDR[27:0] = {S[1:0], P[0], R[14:0], B[3:0], C[4:0], Z[0]}
+ *
+ * The normalized address format is fixed in hardware and is as follows:
+ * NA[30:0] = {S[1:0], R[13:0], C4, B[1:0], B[3:2], C[3:2], P, C[1:0], Z[4:0]}
+ *
+ * Additionally, the PC and Bank bits may be hashed. This must be accounted for before
+ * reconstructing the normalized address.
+ */
+#define MI300_UMC_MCA_COL GENMASK(5, 1)
+#define MI300_UMC_MCA_BANK GENMASK(9, 6)
+#define MI300_UMC_MCA_ROW GENMASK(24, 10)
+#define MI300_UMC_MCA_PC BIT(25)
+#define MI300_UMC_MCA_SID GENMASK(27, 26)
+
+#define MI300_NA_COL_1_0 GENMASK(6, 5)
+#define MI300_NA_PC BIT(7)
+#define MI300_NA_COL_3_2 GENMASK(9, 8)
+#define MI300_NA_BANK_3_2 GENMASK(11, 10)
+#define MI300_NA_BANK_1_0 GENMASK(13, 12)
+#define MI300_NA_COL_4 BIT(14)
+#define MI300_NA_ROW GENMASK(28, 15)
+#define MI300_NA_SID GENMASK(30, 29)
+
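+/*
+ * Worked example, assuming all hash XOR enables are clear: an MCA address
+ * of 0x2 decodes to Column = 1 with Bank, Row, PC, and SID all 0. The
+ * reconstruction below then sets only NA[6:5] = Column[1:0] = 0b01, giving
+ * a normalized address of 0x20.
+ */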
+static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
+{
+ u16 i, col, row, bank, pc, sid, temp;
+
+ col = FIELD_GET(MI300_UMC_MCA_COL, addr);
+ bank = FIELD_GET(MI300_UMC_MCA_BANK, addr);
+ row = FIELD_GET(MI300_UMC_MCA_ROW, addr);
+ pc = FIELD_GET(MI300_UMC_MCA_PC, addr);
+ sid = FIELD_GET(MI300_UMC_MCA_SID, addr);
+
+ /* Calculate hash for each Bank bit. */
+ for (i = 0; i < NUM_BANK_BITS; i++) {
+ if (!addr_hash.bank[i].xor_enable)
+ continue;
+
+ temp = bitwise_xor_bits(col & addr_hash.bank[i].col_xor);
+ temp ^= bitwise_xor_bits(row & addr_hash.bank[i].row_xor);
+ bank ^= temp << i;
+ }
+
+ /* Calculate hash for PC bit. */
+ if (addr_hash.pc.xor_enable) {
+ /* Bits SID[1:0] act as Bank[6:5] for PC hash, so apply them here. */
+ bank |= sid << 5;
+
+ temp = bitwise_xor_bits(col & addr_hash.pc.col_xor);
+ temp ^= bitwise_xor_bits(row & addr_hash.pc.row_xor);
+ temp ^= bitwise_xor_bits(bank & addr_hash.bank_xor);
+ pc ^= temp;
+
+ /* Drop SID bits for the sake of debug printing later. */
+ bank &= 0x1F;
+ }
+
+ /* Reconstruct the normalized address starting with NA[4:0] = 0 */
+ addr = 0;
+
+ /* NA[6:5] = Column[1:0] */
+ temp = col & 0x3;
+ addr |= FIELD_PREP(MI300_NA_COL_1_0, temp);
+
+ /* NA[7] = PC */
+ addr |= FIELD_PREP(MI300_NA_PC, pc);
+
+ /* NA[9:8] = Column[3:2] */
+ temp = (col >> 2) & 0x3;
+ addr |= FIELD_PREP(MI300_NA_COL_3_2, temp);
+
+ /* NA[11:10] = Bank[3:2] */
+ temp = (bank >> 2) & 0x3;
+ addr |= FIELD_PREP(MI300_NA_BANK_3_2, temp);
+
+ /* NA[13:12] = Bank[1:0] */
+ temp = bank & 0x3;
+ addr |= FIELD_PREP(MI300_NA_BANK_1_0, temp);
+
+ /* NA[14] = Column[4] */
+ temp = (col >> 4) & 0x1;
+ addr |= FIELD_PREP(MI300_NA_COL_4, temp);
+
+ /* NA[28:15] = Row[13:0] */
+ addr |= FIELD_PREP(MI300_NA_ROW, row);
+
+ /* NA[30:29] = SID[1:0] */
+ addr |= FIELD_PREP(MI300_NA_SID, sid);
+
+ pr_debug("Addr=0x%016lx", addr);
+ pr_debug("Bank=%u Row=%u Column=%u PC=%u SID=%u", bank, row, col, pc, sid);
+
+ return addr;
+}
+
+/*
+ * When a DRAM ECC error occurs on MI300 systems, it is recommended to retire
+ * all memory within that DRAM row. This applies to the memory within a DRAM
+ * bank.
+ *
+ * To find the memory addresses, loop through permutations of the DRAM column
+ * bits and find the System Physical address of each. The column bits are used
+ * to calculate the intermediate Normalized address, so all permutations should
+ * be checked.
+ *
+ * See amd_atl::convert_dram_to_norm_addr_mi300() for MI300 address formats.
+ */
+#define MI300_NUM_COL BIT(HWEIGHT(MI300_UMC_MCA_COL))
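+/* MI300_UMC_MCA_COL spans 5 bits, so BIT(5) = 32 column permutations. */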
+static void retire_row_mi300(struct atl_err *a_err)
+{
+ unsigned long addr;
+ struct page *p;
+ u8 col;
+
+ for (col = 0; col < MI300_NUM_COL; col++) {
+ a_err->addr &= ~MI300_UMC_MCA_COL;
+ a_err->addr |= FIELD_PREP(MI300_UMC_MCA_COL, col);
+
+ addr = amd_convert_umc_mca_addr_to_sys_addr(a_err);
+ if (IS_ERR_VALUE(addr))
+ continue;
+
+ addr = PHYS_PFN(addr);
+
+ /*
+ * Skip invalid or already poisoned pages to avoid unnecessary
+ * error messages from memory_failure().
+ */
+ p = pfn_to_online_page(addr);
+ if (!p)
+ continue;
+
+ if (PageHWPoison(p))
+ continue;
+
+ memory_failure(addr, 0);
+ }
+}
+
+void amd_retire_dram_row(struct atl_err *a_err)
+{
+ if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
+ return retire_row_mi300(a_err);
+}
+EXPORT_SYMBOL_GPL(amd_retire_dram_row);
+
+static unsigned long get_addr(unsigned long addr)
+{
+ if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
+ return convert_dram_to_norm_addr_mi300(addr);
+
+ return addr;
+}
+
+#define MCA_IPID_INST_ID_HI GENMASK_ULL(47, 44)
+static u8 get_die_id(struct atl_err *err)
+{
+ /*
+ * AMD Node ID is provided in MCA_IPID[InstanceIdHi], and this
+ * needs to be divided by 4 to get the internal Die ID.
+ */
+ if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) {
+ u8 node_id = FIELD_GET(MCA_IPID_INST_ID_HI, err->ipid);
+
+ return node_id >> 2;
+ }
+
+ /*
+ * For CPUs, this is the AMD Node ID modulo the number
+ * of AMD Nodes per socket.
+ */
+ return topology_amd_node_id(err->cpu) % topology_amd_nodes_per_pkg();
+}
+
+#define UMC_CHANNEL_NUM GENMASK(31, 20)
+static u8 get_coh_st_inst_id(struct atl_err *err)
+{
+ if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
+ return get_coh_st_inst_id_mi300(err);
+
+ return FIELD_GET(UMC_CHANNEL_NUM, err->ipid);
+}
+
+unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err)
+{
+ u8 socket_id = topology_physical_package_id(err->cpu);
+ u8 coh_st_inst_id = get_coh_st_inst_id(err);
+ unsigned long addr = get_addr(err->addr);
+ u8 die_id = get_die_id(err);
+
+ pr_debug("socket_id=0x%x die_id=0x%x coh_st_inst_id=0x%x addr=0x%016lx",
+ socket_id, die_id, coh_st_inst_id, addr);
+
+ return norm_to_sys_addr(socket_id, die_id, coh_st_inst_id, addr);
+}
diff --git a/drivers/ras/amd/fmpm.c b/drivers/ras/amd/fmpm.c
new file mode 100644
index 000000000000..2f4ac9591c8f
--- /dev/null
+++ b/drivers/ras/amd/fmpm.c
@@ -0,0 +1,1013 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * FRU (Field-Replaceable Unit) Memory Poison Manager
+ *
+ * Copyright (c) 2024, Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Authors:
+ * Naveen Krishna Chatradhi <naveenkrishna.chatradhi@amd.com>
+ * Muralidhara M K <muralidhara.mk@amd.com>
+ * Yazen Ghannam <Yazen.Ghannam@amd.com>
+ *
+ * Implementation notes, assumptions, and limitations:
+ *
+ * - FRU memory poison section and memory poison descriptor definitions are not yet
+ * included in the UEFI specification. So they are defined here. Afterwards, they
+ * may be moved to linux/cper.h, if appropriate.
+ *
+ * - Platforms based on AMD MI300 systems will be the first to use these structures.
+ * There are a number of assumptions made here that will need to be generalized
+ * to support other platforms.
+ *
+ * AMD MI300-based platform(s) assumptions:
+ * - Memory errors are reported through x86 MCA.
+ * - The entire DRAM row containing a memory error should be retired.
+ * - There will be (1) FRU memory poison section per CPER.
+ * - The FRU will be the CPU package (processor socket).
+ * - The default number of memory poison descriptor entries should be (8).
+ * - The platform will use ACPI ERST for persistent storage.
+ * - All FRU records should be saved to persistent storage. Module init will
+ * fail if any FRU record is not successfully written.
+ *
+ * - Boot time memory retirement may occur later than ideal due to dependencies
+ * on other libraries and drivers. This leaves a gap where bad memory may be
+ * accessed during early boot stages.
+ *
+ * - Enough memory should be pre-allocated for each FRU record to be able to hold
+ * the expected number of descriptor entries. This, mostly empty, record is
+ * written to storage during init time. Subsequent writes to the same record
+ * should allow the Platform to update the stored record in-place. Otherwise,
+ * if the record is extended, then the Platform may need to perform costly memory
+ * management operations on the storage. For example, the Platform may spend time
+ * in Firmware copying and invalidating memory on a relatively slow SPI ROM.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cper.h>
+#include <linux/ras.h>
+#include <linux/cpu.h>
+
+#include <acpi/apei.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/mce.h>
+
+#include "../debugfs.h"
+
+#define INVALID_CPU UINT_MAX
+
+/* Validation Bits */
+#define FMP_VALID_ARCH_TYPE BIT_ULL(0)
+#define FMP_VALID_ARCH BIT_ULL(1)
+#define FMP_VALID_ID_TYPE BIT_ULL(2)
+#define FMP_VALID_ID BIT_ULL(3)
+#define FMP_VALID_LIST_ENTRIES BIT_ULL(4)
+#define FMP_VALID_LIST BIT_ULL(5)
+
+/* FRU Architecture Types */
+#define FMP_ARCH_TYPE_X86_CPUID_1_EAX 0
+
+/* FRU ID Types */
+#define FMP_ID_TYPE_X86_PPIN 0
+
+/* FRU Memory Poison Section */
+struct cper_sec_fru_mem_poison {
+ u32 checksum;
+ u64 validation_bits;
+ u32 fru_arch_type;
+ u64 fru_arch;
+ u32 fru_id_type;
+ u64 fru_id;
+ u32 nr_entries;
+} __packed;
+
+/* FRU Descriptor ID Types */
+#define FPD_HW_ID_TYPE_MCA_IPID 0
+
+/* FRU Descriptor Address Types */
+#define FPD_ADDR_TYPE_MCA_ADDR 0
+
+/* Memory Poison Descriptor */
+struct cper_fru_poison_desc {
+ u64 timestamp;
+ u32 hw_id_type;
+ u64 hw_id;
+ u32 addr_type;
+ u64 addr;
+} __packed;
+
+/* Collection of headers and sections for easy pointer use. */
+struct fru_rec {
+ struct cper_record_header hdr;
+ struct cper_section_descriptor sec_desc;
+ struct cper_sec_fru_mem_poison fmp;
+ struct cper_fru_poison_desc entries[];
+} __packed;
+
+/*
+ * Pointers to the complete CPER record of each FRU.
+ *
+ * Memory allocation will include padded space for descriptor entries.
+ */
+static struct fru_rec **fru_records;
+
+/* System Physical Address (SPA) entries, one per possible descriptor slot. */
+static u64 *spa_entries;
+
+#define INVALID_SPA ~0ULL
+
+static struct dentry *fmpm_dfs_dir;
+static struct dentry *fmpm_dfs_entries;
+
+#define CPER_CREATOR_FMP \
+ GUID_INIT(0xcd5c2993, 0xf4b2, 0x41b2, 0xb5, 0xd4, 0xf9, 0xc3, \
+ 0xa0, 0x33, 0x08, 0x75)
+
+#define CPER_SECTION_TYPE_FMP \
+ GUID_INIT(0x5e4706c1, 0x5356, 0x48c6, 0x93, 0x0b, 0x52, 0xf2, \
+ 0x12, 0x0a, 0x44, 0x58)
+
+/**
+ * DOC: max_nr_entries (byte)
+ * Maximum number of descriptor entries possible for each FRU.
+ *
+ * Values between '1' and '255' are valid.
+ * No input or '0' will default to FMPM_DEFAULT_MAX_NR_ENTRIES.
+ */
+static u8 max_nr_entries;
+module_param(max_nr_entries, byte, 0644);
+MODULE_PARM_DESC(max_nr_entries,
+ "Maximum number of memory poison descriptor entries per FRU");
+
+#define FMPM_DEFAULT_MAX_NR_ENTRIES 8
+
+/* Maximum number of FRUs in the system. */
+#define FMPM_MAX_NR_FRU 256
+static unsigned int max_nr_fru;
+
+/* Total length of record including headers and list of descriptor entries. */
+static size_t max_rec_len;
+
+/* Total number of SPA entries across all FRUs. */
+static unsigned int spa_nr_entries;
+
+/*
+ * Protect the local records cache in fru_records and prevent concurrent
+ * writes to storage. This is only needed after init once notifier block
+ * registration is done.
+ *
+ * The majority of a record is fixed at module init and will not change
+ * during run time. The entries within a record will be updated as new
+ * errors are reported. The mutex should be held whenever the entries are
+ * accessed during run time.
+ */
+static DEFINE_MUTEX(fmpm_update_mutex);
+
+/* Check the index bound before reading fru_records[] to stay in-bounds. */
+#define for_each_fru(i, rec) \
+	for (i = 0; i < max_nr_fru && (rec = fru_records[i]); i++)
+
+static inline u32 get_fmp_len(struct fru_rec *rec)
+{
+ return rec->sec_desc.section_length - sizeof(struct cper_section_descriptor);
+}
+
+static struct fru_rec *get_fru_record(u64 fru_id)
+{
+ struct fru_rec *rec;
+ unsigned int i;
+
+ for_each_fru(i, rec) {
+ if (rec->fmp.fru_id == fru_id)
+ return rec;
+ }
+
+ pr_debug("Record not found for FRU 0x%016llx\n", fru_id);
+
+ return NULL;
+}
+
+/*
+ * Sum up all bytes within the FRU Memory Poison Section including the Memory
+ * Poison Descriptor entries.
+ *
+ * Don't include the old checksum here. It's a u32 value, so summing each of its
+ * bytes will give the wrong total.
+ */
+static u32 do_fmp_checksum(struct cper_sec_fru_mem_poison *fmp, u32 len)
+{
+ u32 checksum = 0;
+ u8 *buf, *end;
+
+ /* Skip old checksum. */
+ buf = (u8 *)fmp + sizeof(u32);
+ end = buf + len;
+
+ while (buf < end)
+ checksum += (u8)(*(buf++));
+
+ return checksum;
+}
+
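+/*
+ * The stored checksum is the two's complement of the section byte sum, so
+ * do_fmp_checksum() + fmp->checksum must equal zero (mod 2^32) for a valid
+ * record. E.g., a byte sum of 0x1234 is stored as -0x1234 = 0xFFFFEDCC.
+ */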
+static int update_record_on_storage(struct fru_rec *rec)
+{
+ u32 len, checksum;
+ int ret;
+
+ /* Calculate a new checksum. */
+ len = get_fmp_len(rec);
+
+ /* Get the current total. */
+ checksum = do_fmp_checksum(&rec->fmp, len);
+
+ /* Use the complement value. */
+ rec->fmp.checksum = -checksum;
+
+ pr_debug("Writing to storage\n");
+
+ ret = erst_write(&rec->hdr);
+ if (ret) {
+ pr_warn("Storage update failed for FRU 0x%016llx\n", rec->fmp.fru_id);
+
+ if (ret == -ENOSPC)
+ pr_warn("Not enough space on storage\n");
+ }
+
+ return ret;
+}
+
+static bool rec_has_valid_entries(struct fru_rec *rec)
+{
+ if (!(rec->fmp.validation_bits & FMP_VALID_LIST_ENTRIES))
+ return false;
+
+ if (!(rec->fmp.validation_bits & FMP_VALID_LIST))
+ return false;
+
+ return true;
+}
+
+static bool fpds_equal(struct cper_fru_poison_desc *old, struct cper_fru_poison_desc *new)
+{
+ /*
+ * Ignore timestamp field.
+ * The same physical error may be reported multiple times due to stuck bits, etc.
+ *
+ * Also, order the checks from most->least likely to fail to shortcut the code.
+ */
+ if (old->addr != new->addr)
+ return false;
+
+ if (old->hw_id != new->hw_id)
+ return false;
+
+ if (old->addr_type != new->addr_type)
+ return false;
+
+ if (old->hw_id_type != new->hw_id_type)
+ return false;
+
+ return true;
+}
+
+static bool rec_has_fpd(struct fru_rec *rec, struct cper_fru_poison_desc *fpd)
+{
+ unsigned int i;
+
+ for (i = 0; i < rec->fmp.nr_entries; i++) {
+ struct cper_fru_poison_desc *fpd_i = &rec->entries[i];
+
+ if (fpds_equal(fpd_i, fpd)) {
+ pr_debug("Found duplicate record\n");
+ return true;
+ }
+ }
+
+ return false;
+}
+
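+/*
+ * spa_entries[] is a flat array indexed as [fru_idx * max_nr_entries + entry]:
+ * each FRU owns a contiguous block of max_nr_entries slots. The loop below
+ * recovers fru_idx for @rec before computing the final index.
+ */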
+static void save_spa(struct fru_rec *rec, unsigned int entry,
+ u64 addr, u64 id, unsigned int cpu)
+{
+ unsigned int i, fru_idx, spa_entry;
+ struct atl_err a_err;
+ unsigned long spa;
+
+ if (entry >= max_nr_entries) {
+ pr_warn_once("FRU descriptor entry %d out-of-bounds (max: %d)\n",
+ entry, max_nr_entries);
+ return;
+ }
+
+	/* spa_nr_entries is always a multiple of max_nr_entries. */
+ for (i = 0; i < spa_nr_entries; i += max_nr_entries) {
+ fru_idx = i / max_nr_entries;
+ if (fru_records[fru_idx] == rec)
+ break;
+ }
+
+ if (i >= spa_nr_entries) {
+ pr_warn_once("FRU record %d not found\n", i);
+ return;
+ }
+
+ spa_entry = i + entry;
+ if (spa_entry >= spa_nr_entries) {
+ pr_warn_once("spa_entries[] index out-of-bounds\n");
+ return;
+ }
+
+ memset(&a_err, 0, sizeof(struct atl_err));
+
+ a_err.addr = addr;
+ a_err.ipid = id;
+ a_err.cpu = cpu;
+
+ spa = amd_convert_umc_mca_addr_to_sys_addr(&a_err);
+ if (IS_ERR_VALUE(spa)) {
+ pr_debug("Failed to get system address\n");
+ return;
+ }
+
+ spa_entries[spa_entry] = spa;
+ pr_debug("fru_idx: %u, entry: %u, spa_entry: %u, spa: 0x%016llx\n",
+ fru_idx, entry, spa_entry, spa_entries[spa_entry]);
+}
+
+static void update_fru_record(struct fru_rec *rec, struct mce *m)
+{
+ struct cper_sec_fru_mem_poison *fmp = &rec->fmp;
+ struct cper_fru_poison_desc fpd, *fpd_dest;
+ u32 entry = 0;
+
+ mutex_lock(&fmpm_update_mutex);
+
+ memset(&fpd, 0, sizeof(struct cper_fru_poison_desc));
+
+ fpd.timestamp = m->time;
+ fpd.hw_id_type = FPD_HW_ID_TYPE_MCA_IPID;
+ fpd.hw_id = m->ipid;
+ fpd.addr_type = FPD_ADDR_TYPE_MCA_ADDR;
+ fpd.addr = m->addr;
+
+ /* This is the first entry, so just save it. */
+ if (!rec_has_valid_entries(rec))
+ goto save_fpd;
+
+ /* Ignore already recorded errors. */
+ if (rec_has_fpd(rec, &fpd))
+ goto out_unlock;
+
+ if (rec->fmp.nr_entries >= max_nr_entries) {
+ pr_warn("Exceeded number of entries for FRU 0x%016llx\n", rec->fmp.fru_id);
+ goto out_unlock;
+ }
+
+ entry = fmp->nr_entries;
+
+save_fpd:
+ save_spa(rec, entry, m->addr, m->ipid, m->extcpu);
+ fpd_dest = &rec->entries[entry];
+ memcpy(fpd_dest, &fpd, sizeof(struct cper_fru_poison_desc));
+
+ fmp->nr_entries = entry + 1;
+ fmp->validation_bits |= FMP_VALID_LIST_ENTRIES;
+ fmp->validation_bits |= FMP_VALID_LIST;
+
+ pr_debug("Updated FRU 0x%016llx entry #%u\n", fmp->fru_id, entry);
+
+ update_record_on_storage(rec);
+
+out_unlock:
+ mutex_unlock(&fmpm_update_mutex);
+}
+
+static void retire_dram_row(u64 addr, u64 id, u32 cpu)
+{
+ struct atl_err a_err;
+
+ memset(&a_err, 0, sizeof(struct atl_err));
+
+ a_err.addr = addr;
+ a_err.ipid = id;
+ a_err.cpu = cpu;
+
+ amd_retire_dram_row(&a_err);
+}
+
+static int fru_handle_mem_poison(struct notifier_block *nb, unsigned long val, void *data)
+{
+ struct mce *m = (struct mce *)data;
+ struct fru_rec *rec;
+
+ if (!mce_is_memory_error(m))
+ return NOTIFY_DONE;
+
+ retire_dram_row(m->addr, m->ipid, m->extcpu);
+
+ /*
+ * An invalid FRU ID should not happen on real errors. But it
+ * could happen from software error injection, etc.
+ */
+ rec = get_fru_record(m->ppin);
+ if (!rec)
+ return NOTIFY_DONE;
+
+ update_fru_record(rec, m);
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block fru_mem_poison_nb = {
+ .notifier_call = fru_handle_mem_poison,
+ .priority = MCE_PRIO_LOWEST,
+};
+
+static void retire_mem_fmp(struct fru_rec *rec)
+{
+ struct cper_sec_fru_mem_poison *fmp = &rec->fmp;
+ unsigned int i, cpu;
+
+ for (i = 0; i < fmp->nr_entries; i++) {
+ struct cper_fru_poison_desc *fpd = &rec->entries[i];
+ unsigned int err_cpu = INVALID_CPU;
+
+ if (fpd->hw_id_type != FPD_HW_ID_TYPE_MCA_IPID)
+ continue;
+
+ if (fpd->addr_type != FPD_ADDR_TYPE_MCA_ADDR)
+ continue;
+
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ if (topology_ppin(cpu) == fmp->fru_id) {
+ err_cpu = cpu;
+ break;
+ }
+ }
+ cpus_read_unlock();
+
+ if (err_cpu == INVALID_CPU)
+ continue;
+
+ retire_dram_row(fpd->addr, fpd->hw_id, err_cpu);
+ save_spa(rec, i, fpd->addr, fpd->hw_id, err_cpu);
+ }
+}
+
+static void retire_mem_records(void)
+{
+ struct fru_rec *rec;
+ unsigned int i;
+
+ for_each_fru(i, rec) {
+ if (!rec_has_valid_entries(rec))
+ continue;
+
+ retire_mem_fmp(rec);
+ }
+}
+
+/* Set the CPER Record Header and CPER Section Descriptor fields. */
+static void set_rec_fields(struct fru_rec *rec)
+{
+ struct cper_section_descriptor *sec_desc = &rec->sec_desc;
+ struct cper_record_header *hdr = &rec->hdr;
+
+ memcpy(hdr->signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
+ hdr->revision = CPER_RECORD_REV;
+ hdr->signature_end = CPER_SIG_END;
+
+ /*
+ * Currently, it is assumed that there is one FRU Memory Poison
+ * section per CPER. But this may change for other implementations.
+ */
+ hdr->section_count = 1;
+
+ /* The logged errors are recoverable. Otherwise, they'd never make it here. */
+ hdr->error_severity = CPER_SEV_RECOVERABLE;
+
+ hdr->validation_bits = 0;
+ hdr->record_length = max_rec_len;
+ hdr->creator_id = CPER_CREATOR_FMP;
+ hdr->notification_type = CPER_NOTIFY_MCE;
+ hdr->record_id = cper_next_record_id();
+ hdr->flags = CPER_HW_ERROR_FLAGS_PREVERR;
+
+ sec_desc->section_offset = sizeof(struct cper_record_header);
+ sec_desc->section_length = max_rec_len - sizeof(struct cper_record_header);
+ sec_desc->revision = CPER_SEC_REV;
+ sec_desc->validation_bits = 0;
+ sec_desc->flags = CPER_SEC_PRIMARY;
+ sec_desc->section_type = CPER_SECTION_TYPE_FMP;
+ sec_desc->section_severity = CPER_SEV_RECOVERABLE;
+}
+
+static int save_new_records(void)
+{
+ DECLARE_BITMAP(new_records, FMPM_MAX_NR_FRU);
+ struct fru_rec *rec;
+ unsigned int i;
+	int ret = 0;
+
+	/* The bitmap lives on the stack; clear it before tracking new records. */
+	bitmap_zero(new_records, FMPM_MAX_NR_FRU);
+
+ for_each_fru(i, rec) {
+ if (rec->hdr.record_length)
+ continue;
+
+ set_rec_fields(rec);
+
+ ret = update_record_on_storage(rec);
+ if (ret)
+ goto out_clear;
+
+ set_bit(i, new_records);
+ }
+
+ return ret;
+
+out_clear:
+ for_each_fru(i, rec) {
+ if (!test_bit(i, new_records))
+ continue;
+
+ erst_clear(rec->hdr.record_id);
+ }
+
+ return ret;
+}
+
+/* Check that the record matches expected types for the current system. */
+static bool fmp_is_usable(struct fru_rec *rec)
+{
+ struct cper_sec_fru_mem_poison *fmp = &rec->fmp;
+ u64 cpuid;
+
+ pr_debug("Validation bits: 0x%016llx\n", fmp->validation_bits);
+
+ if (!(fmp->validation_bits & FMP_VALID_ARCH_TYPE)) {
+ pr_debug("Arch type unknown\n");
+ return false;
+ }
+
+ if (fmp->fru_arch_type != FMP_ARCH_TYPE_X86_CPUID_1_EAX) {
+ pr_debug("Arch type not 'x86 Family/Model/Stepping'\n");
+ return false;
+ }
+
+ if (!(fmp->validation_bits & FMP_VALID_ARCH)) {
+ pr_debug("Arch value unknown\n");
+ return false;
+ }
+
+ cpuid = cpuid_eax(1);
+ if (fmp->fru_arch != cpuid) {
+ pr_debug("Arch value mismatch: record = 0x%016llx, system = 0x%016llx\n",
+ fmp->fru_arch, cpuid);
+ return false;
+ }
+
+ if (!(fmp->validation_bits & FMP_VALID_ID_TYPE)) {
+ pr_debug("FRU ID type unknown\n");
+ return false;
+ }
+
+ if (fmp->fru_id_type != FMP_ID_TYPE_X86_PPIN) {
+ pr_debug("FRU ID type is not 'x86 PPIN'\n");
+ return false;
+ }
+
+ if (!(fmp->validation_bits & FMP_VALID_ID)) {
+ pr_debug("FRU ID value unknown\n");
+ return false;
+ }
+
+ return true;
+}
+
+static bool fmp_is_valid(struct fru_rec *rec)
+{
+ struct cper_sec_fru_mem_poison *fmp = &rec->fmp;
+ u32 checksum, len;
+
+ len = get_fmp_len(rec);
+ if (len < sizeof(struct cper_sec_fru_mem_poison)) {
+ pr_debug("fmp length is too small\n");
+ return false;
+ }
+
+ /* Checksum must sum to zero for the entire section. */
+ checksum = do_fmp_checksum(fmp, len) + fmp->checksum;
+ if (checksum) {
+ pr_debug("fmp checksum failed: sum = 0x%x\n", checksum);
+ print_hex_dump_debug("fmp record: ", DUMP_PREFIX_NONE, 16, 1, fmp, len, false);
+ return false;
+ }
+
+ if (!fmp_is_usable(rec))
+ return false;
+
+ return true;
+}
+
+static struct fru_rec *get_valid_record(struct fru_rec *old)
+{
+ struct fru_rec *new;
+
+ if (!fmp_is_valid(old)) {
+ pr_debug("Ignoring invalid record\n");
+ return NULL;
+ }
+
+ new = get_fru_record(old->fmp.fru_id);
+ if (!new)
+ pr_debug("Ignoring record for absent FRU\n");
+
+ return new;
+}
+
+/*
+ * Fetch saved records from persistent storage.
+ *
+ * For each found record:
+ * - If it was not created by this module, then ignore it.
+ * - If it is valid, then copy its data to the local cache.
+ * - If it is not valid, then erase it.
+ */
+static int get_saved_records(void)
+{
+ struct fru_rec *old, *new;
+ u64 record_id;
+ int ret, pos;
+ ssize_t len;
+
+ /*
+ * Assume saved records match current max size.
+ *
+ * However, this may not be true depending on module parameters.
+ */
+ old = kmalloc(max_rec_len, GFP_KERNEL);
+ if (!old) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = erst_get_record_id_begin(&pos);
+ if (ret < 0)
+ goto out_end;
+
+ while (!erst_get_record_id_next(&pos, &record_id)) {
+ if (record_id == APEI_ERST_INVALID_RECORD_ID)
+ goto out_end;
+ /*
+ * Make sure to clear temporary buffer between reads to avoid
+ * leftover data from records of various sizes.
+ */
+ memset(old, 0, max_rec_len);
+
+ len = erst_read_record(record_id, &old->hdr, max_rec_len,
+ sizeof(struct fru_rec), &CPER_CREATOR_FMP);
+ if (len < 0)
+ continue;
+
+ if (len > max_rec_len) {
+ pr_debug("Found record larger than max_rec_len\n");
+ continue;
+ }
+
+		new = get_valid_record(old);
+		if (!new) {
+			erst_clear(record_id);
+			continue;
+		}
+
+		/* Restore the record. */
+		memcpy(new, old, len);
+ }
+
+out_end:
+ erst_get_record_id_end();
+ kfree(old);
+out:
+ return ret;
+}
+
+static void set_fmp_fields(struct fru_rec *rec, unsigned int cpu)
+{
+ struct cper_sec_fru_mem_poison *fmp = &rec->fmp;
+
+ fmp->fru_arch_type = FMP_ARCH_TYPE_X86_CPUID_1_EAX;
+ fmp->validation_bits |= FMP_VALID_ARCH_TYPE;
+
+ /* Assume all CPUs in the system have the same value for now. */
+ fmp->fru_arch = cpuid_eax(1);
+ fmp->validation_bits |= FMP_VALID_ARCH;
+
+ fmp->fru_id_type = FMP_ID_TYPE_X86_PPIN;
+ fmp->validation_bits |= FMP_VALID_ID_TYPE;
+
+ fmp->fru_id = topology_ppin(cpu);
+ fmp->validation_bits |= FMP_VALID_ID;
+}
+
+static int init_fmps(void)
+{
+ struct fru_rec *rec;
+ unsigned int i, cpu;
+ int ret = 0;
+
+ for_each_fru(i, rec) {
+ unsigned int fru_cpu = INVALID_CPU;
+
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ if (topology_physical_package_id(cpu) == i) {
+ fru_cpu = cpu;
+ break;
+ }
+ }
+ cpus_read_unlock();
+
+ if (fru_cpu == INVALID_CPU) {
+ pr_debug("Failed to find matching CPU for FRU #%u\n", i);
+ ret = -ENODEV;
+ break;
+ }
+
+ set_fmp_fields(rec, fru_cpu);
+ }
+
+ return ret;
+}
+
+static int get_system_info(void)
+{
+ /* Only load on MI300A systems for now. */
+ if (!(boot_cpu_data.x86_model >= 0x90 &&
+ boot_cpu_data.x86_model <= 0x9f))
+ return -ENODEV;
+
+ if (!cpu_feature_enabled(X86_FEATURE_AMD_PPIN)) {
+ pr_debug("PPIN feature not available\n");
+ return -ENODEV;
+ }
+
+ /* Use CPU socket as FRU for MI300 systems. */
+ max_nr_fru = topology_max_packages();
+ if (!max_nr_fru)
+ return -ENODEV;
+
+ if (max_nr_fru > FMPM_MAX_NR_FRU) {
+ pr_warn("Too many FRUs to manage: found: %u, max: %u\n",
+ max_nr_fru, FMPM_MAX_NR_FRU);
+ return -ENODEV;
+ }
+
+ if (!max_nr_entries)
+ max_nr_entries = FMPM_DEFAULT_MAX_NR_ENTRIES;
+
+ spa_nr_entries = max_nr_fru * max_nr_entries;
+
+ max_rec_len = sizeof(struct fru_rec);
+ max_rec_len += sizeof(struct cper_fru_poison_desc) * max_nr_entries;
+
+ pr_info("max FRUs: %u, max entries: %u, max record length: %lu\n",
+ max_nr_fru, max_nr_entries, max_rec_len);
+
+ return 0;
+}
+
+static void free_records(void)
+{
+ struct fru_rec *rec;
+ int i;
+
+ for_each_fru(i, rec)
+ kfree(rec);
+
+ kfree(fru_records);
+ kfree(spa_entries);
+}
+
+static int allocate_records(void)
+{
+ int i, ret = 0;
+
+ fru_records = kcalloc(max_nr_fru, sizeof(struct fru_rec *), GFP_KERNEL);
+ if (!fru_records) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < max_nr_fru; i++) {
+ fru_records[i] = kzalloc(max_rec_len, GFP_KERNEL);
+ if (!fru_records[i]) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+ }
+
+ spa_entries = kcalloc(spa_nr_entries, sizeof(u64), GFP_KERNEL);
+ if (!spa_entries) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+
+ for (i = 0; i < spa_nr_entries; i++)
+ spa_entries[i] = INVALID_SPA;
+
+ return ret;
+
+out_free:
+ while (--i >= 0)
+ kfree(fru_records[i]);
+
+ kfree(fru_records);
+out:
+ return ret;
+}
+
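+/*
+ * The "entries" file is rendered one line per seq_file position: line 0 is
+ * the column header, and lines 1..spa_nr_entries map to descriptor slots.
+ */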
+static void *fmpm_start(struct seq_file *f, loff_t *pos)
+{
+ if (*pos >= (spa_nr_entries + 1))
+ return NULL;
+ return pos;
+}
+
+static void *fmpm_next(struct seq_file *f, void *data, loff_t *pos)
+{
+ if (++(*pos) >= (spa_nr_entries + 1))
+ return NULL;
+ return pos;
+}
+
+static void fmpm_stop(struct seq_file *f, void *data)
+{
+}
+
+#define SHORT_WIDTH 8
+#define U64_WIDTH 18
+#define TIMESTAMP_WIDTH 19
+#define LONG_WIDTH 24
+#define U64_PAD (LONG_WIDTH - U64_WIDTH)
+#define TS_PAD (LONG_WIDTH - TIMESTAMP_WIDTH)
+static int fmpm_show(struct seq_file *f, void *data)
+{
+ unsigned int fru_idx, entry, spa_entry, line;
+ struct cper_fru_poison_desc *fpd;
+ struct fru_rec *rec;
+
+ line = *(loff_t *)data;
+ if (line == 0) {
+ seq_printf(f, "%-*s", SHORT_WIDTH, "fru_idx");
+ seq_printf(f, "%-*s", LONG_WIDTH, "fru_id");
+ seq_printf(f, "%-*s", SHORT_WIDTH, "entry");
+ seq_printf(f, "%-*s", LONG_WIDTH, "timestamp");
+ seq_printf(f, "%-*s", LONG_WIDTH, "hw_id");
+ seq_printf(f, "%-*s", LONG_WIDTH, "addr");
+ seq_printf(f, "%-*s", LONG_WIDTH, "spa");
+ goto out_newline;
+ }
+
+ spa_entry = line - 1;
+ fru_idx = spa_entry / max_nr_entries;
+ entry = spa_entry % max_nr_entries;
+
+ rec = fru_records[fru_idx];
+ if (!rec)
+ goto out;
+
+ seq_printf(f, "%-*u", SHORT_WIDTH, fru_idx);
+ seq_printf(f, "0x%016llx%-*s", rec->fmp.fru_id, U64_PAD, "");
+ seq_printf(f, "%-*u", SHORT_WIDTH, entry);
+
+ mutex_lock(&fmpm_update_mutex);
+
+ if (entry >= rec->fmp.nr_entries) {
+ seq_printf(f, "%-*s", LONG_WIDTH, "*");
+ seq_printf(f, "%-*s", LONG_WIDTH, "*");
+ seq_printf(f, "%-*s", LONG_WIDTH, "*");
+ seq_printf(f, "%-*s", LONG_WIDTH, "*");
+ goto out_unlock;
+ }
+
+ fpd = &rec->entries[entry];
+
+ seq_printf(f, "%ptT%-*s", &fpd->timestamp, TS_PAD, "");
+ seq_printf(f, "0x%016llx%-*s", fpd->hw_id, U64_PAD, "");
+ seq_printf(f, "0x%016llx%-*s", fpd->addr, U64_PAD, "");
+
+ if (spa_entries[spa_entry] == INVALID_SPA)
+ seq_printf(f, "%-*s", LONG_WIDTH, "*");
+ else
+ seq_printf(f, "0x%016llx%-*s", spa_entries[spa_entry], U64_PAD, "");
+
+out_unlock:
+ mutex_unlock(&fmpm_update_mutex);
+out_newline:
+ seq_putc(f, '\n');
+out:
+ return 0;
+}
+
+static const struct seq_operations fmpm_seq_ops = {
+ .start = fmpm_start,
+ .next = fmpm_next,
+ .stop = fmpm_stop,
+ .show = fmpm_show,
+};
+
+static int fmpm_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &fmpm_seq_ops);
+}
+
+static const struct file_operations fmpm_fops = {
+ .open = fmpm_open,
+ .release = seq_release,
+ .read = seq_read,
+ .llseek = seq_lseek,
+};
+
+static void setup_debugfs(void)
+{
+ struct dentry *dfs = ras_get_debugfs_root();
+
+ if (!dfs)
+ return;
+
+ fmpm_dfs_dir = debugfs_create_dir("fmpm", dfs);
+ if (!fmpm_dfs_dir)
+ return;
+
+ fmpm_dfs_entries = debugfs_create_file("entries", 0400, fmpm_dfs_dir, NULL, &fmpm_fops);
+ if (!fmpm_dfs_entries)
+ debugfs_remove(fmpm_dfs_dir);
+}
+
+static const struct x86_cpu_id fmpm_cpuids[] = {
+ X86_MATCH_VENDOR_FAM(AMD, 0x19, NULL),
+ { }
+};
+MODULE_DEVICE_TABLE(x86cpu, fmpm_cpuids);
+
+static int __init fru_mem_poison_init(void)
+{
+ int ret;
+
+ if (!x86_match_cpu(fmpm_cpuids)) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ if (erst_disable) {
+ pr_debug("ERST not available\n");
+ ret = -ENODEV;
+ goto out;
+ }
+
+ ret = get_system_info();
+ if (ret)
+ goto out;
+
+ ret = allocate_records();
+ if (ret)
+ goto out;
+
+ ret = init_fmps();
+ if (ret)
+ goto out_free;
+
+ ret = get_saved_records();
+ if (ret)
+ goto out_free;
+
+ ret = save_new_records();
+ if (ret)
+ goto out_free;
+
+ setup_debugfs();
+
+ retire_mem_records();
+
+ mce_register_decode_chain(&fru_mem_poison_nb);
+
+ pr_info("FRU Memory Poison Manager initialized\n");
+ return 0;
+
+out_free:
+ free_records();
+out:
+ return ret;
+}
+
+static void __exit fru_mem_poison_exit(void)
+{
+ mce_unregister_decode_chain(&fru_mem_poison_nb);
+ debugfs_remove(fmpm_dfs_dir);
+ free_records();
+}
+
+module_init(fru_mem_poison_init);
+module_exit(fru_mem_poison_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("FRU Memory Poison Manager");
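For orientation, the seq_file iterator above prints one header line followed by one line per (FRU, entry) slot: seq position N > 0 maps to fru_idx (N-1) / max_nr_entries and entry (N-1) % max_nr_entries. Reading the resulting debugfs file might produce output along these lines (a purely hypothetical sample; slots past a record's nr_entries render as "*"):

    fru_idx fru_id                  entry   timestamp               hw_id                   addr                    spa
    0       0x0123456789abcdef      0       2024-03-01T10:15:30     0x0000000000000001      0x0000000000001234      0x0000000456789000
    0       0x0123456789abcdef      1       *                       *                       *                       *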
diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index 321af498ee11..e440b15fbabc 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -480,9 +480,15 @@ DEFINE_SHOW_ATTRIBUTE(array);
static int __init create_debugfs_nodes(void)
{
- struct dentry *d, *pfn, *decay, *count, *array;
+ struct dentry *d, *pfn, *decay, *count, *array, *dfs;
- d = debugfs_create_dir("cec", ras_debugfs_dir);
+ dfs = ras_get_debugfs_root();
+ if (!dfs) {
+ pr_warn("Error getting RAS debugfs root!\n");
+ return -1;
+ }
+
+ d = debugfs_create_dir("cec", dfs);
if (!d) {
pr_warn("Error creating cec debugfs node!\n");
return -1;
diff --git a/drivers/ras/debugfs.c b/drivers/ras/debugfs.c
index ffb973c328e3..42afd3de68b2 100644
--- a/drivers/ras/debugfs.c
+++ b/drivers/ras/debugfs.c
@@ -3,10 +3,16 @@
#include <linux/ras.h>
#include "debugfs.h"
-struct dentry *ras_debugfs_dir;
+static struct dentry *ras_debugfs_dir;
static atomic_t trace_count = ATOMIC_INIT(0);
+struct dentry *ras_get_debugfs_root(void)
+{
+ return ras_debugfs_dir;
+}
+EXPORT_SYMBOL_GPL(ras_get_debugfs_root);
+
int ras_userspace_consumers(void)
{
return atomic_read(&trace_count);
diff --git a/drivers/ras/debugfs.h b/drivers/ras/debugfs.h
index c07443b462ad..4749ccdeeba1 100644
--- a/drivers/ras/debugfs.h
+++ b/drivers/ras/debugfs.h
@@ -4,6 +4,6 @@
#include <linux/debugfs.h>
-extern struct dentry *ras_debugfs_dir;
+struct dentry *ras_get_debugfs_root(void);
#endif /* __RAS_DEBUGFS_H__ */
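With the accessor replacing the formerly exported ras_debugfs_dir variable, consumers outside debugfs.c now obtain the root on demand. A minimal sketch of a call site (the "mydrv" directory name is hypothetical):

    struct dentry *dfs = ras_get_debugfs_root();

    if (dfs)
            my_dir = debugfs_create_dir("mydrv", dfs);

This is exactly the shape of the cec.c and fmpm call sites above, and it lets the variable itself become static.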
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index 95540ea8dd9d..a6e4792a1b2e 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -10,6 +10,37 @@
#include <linux/ras.h>
#include <linux/uuid.h>
+#if IS_ENABLED(CONFIG_AMD_ATL)
+/*
+ * Once set, this function pointer should normally never be unset again.
+ *
+ * The library module sets this pointer when it loads successfully. The module
+ * should not be unloaded except for testing and debugging purposes.
+ */
+static unsigned long (*amd_atl_umc_na_to_spa)(struct atl_err *err);
+
+void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *))
+{
+ amd_atl_umc_na_to_spa = f;
+}
+EXPORT_SYMBOL_GPL(amd_atl_register_decoder);
+
+void amd_atl_unregister_decoder(void)
+{
+ amd_atl_umc_na_to_spa = NULL;
+}
+EXPORT_SYMBOL_GPL(amd_atl_unregister_decoder);
+
+unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err)
+{
+ if (!amd_atl_umc_na_to_spa)
+ return -EINVAL;
+
+ return amd_atl_umc_na_to_spa(err);
+}
+EXPORT_SYMBOL_GPL(amd_convert_umc_mca_addr_to_sys_addr);
+#endif /* CONFIG_AMD_ATL */
+
#define CREATE_TRACE_POINTS
#define TRACE_INCLUDE_PATH ../../include/ras
#include <ras/ras_event.h>
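The function-pointer handshake above decouples the RAS core from the address translation library: the library registers its translator at load time, and consumers call amd_convert_umc_mca_addr_to_sys_addr() without a hard module dependency. A minimal sketch of the registering side, assuming struct atl_err and the prototypes are visible via <linux/ras.h>; my_umc_na_to_spa is a hypothetical name, not the real AMD ATL symbol:

    #include <linux/module.h>
    #include <linux/ras.h>

    /* Hypothetical translator; the real AMD ATL logic lives elsewhere. */
    static unsigned long my_umc_na_to_spa(struct atl_err *err)
    {
            /* ... translate the normalized address carried in err ... */
            return 0;
    }

    static int __init my_atl_init(void)
    {
            amd_atl_register_decoder(my_umc_na_to_spa);
            return 0;
    }
    module_init(my_atl_init);

    static void __exit my_atl_exit(void)
    {
            amd_atl_unregister_decoder();
    }
    module_exit(my_atl_exit);

    MODULE_LICENSE("GPL");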
diff --git a/drivers/s390/char/vmur.c b/drivers/s390/char/vmur.c
index 82efdd20ad01..1d17a83569ce 100644
--- a/drivers/s390/char/vmur.c
+++ b/drivers/s390/char/vmur.c
@@ -195,7 +195,7 @@ static void free_chan_prog(struct ccw1 *cpa)
struct ccw1 *ptr = cpa;
while (ptr->cda) {
- kfree((void *)(addr_t) ptr->cda);
+ kfree(phys_to_virt(ptr->cda));
ptr++;
}
kfree(cpa);
@@ -237,7 +237,7 @@ static struct ccw1 *alloc_chan_prog(const char __user *ubuf, int rec_count,
free_chan_prog(cpa);
return ERR_PTR(-ENOMEM);
}
- cpa[i].cda = (u32)(addr_t) kbuf;
+ cpa[i].cda = (u32)virt_to_phys(kbuf);
if (copy_from_user(kbuf, ubuf, reclen)) {
free_chan_prog(cpa);
return ERR_PTR(-EFAULT);
diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c
index bc3be0330f1d..0969fa01df58 100644
--- a/drivers/s390/char/zcore.c
+++ b/drivers/s390/char/zcore.c
@@ -29,7 +29,6 @@
#include <asm/irqflags.h>
#include <asm/checksum.h>
#include <asm/os_info.h>
-#include <asm/switch_to.h>
#include <asm/maccess.h>
#include "sclp.h"
diff --git a/drivers/s390/cio/ccwgroup.c b/drivers/s390/cio/ccwgroup.c
index aa3292e57e38..6eb8bcd948dc 100644
--- a/drivers/s390/cio/ccwgroup.c
+++ b/drivers/s390/cio/ccwgroup.c
@@ -31,7 +31,7 @@
* to devices that use multiple subchannels.
*/
-static struct bus_type ccwgroup_bus_type;
+static const struct bus_type ccwgroup_bus_type;
static void __ccwgroup_remove_symlinks(struct ccwgroup_device *gdev)
{
@@ -465,7 +465,7 @@ static void ccwgroup_shutdown(struct device *dev)
gdrv->shutdown(gdev);
}
-static struct bus_type ccwgroup_bus_type = {
+static const struct bus_type ccwgroup_bus_type = {
.name = "ccwgroup",
.dev_groups = ccwgroup_dev_groups,
.remove = ccwgroup_remove,
diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c
index 64ed55c3aed6..3d88899dff7c 100644
--- a/drivers/s390/cio/chsc.c
+++ b/drivers/s390/cio/chsc.c
@@ -1091,8 +1091,8 @@ int __init chsc_init(void)
{
int ret;
- sei_page = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
- chsc_page = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
+ sei_page = (void *)get_zeroed_page(GFP_KERNEL);
+ chsc_page = (void *)get_zeroed_page(GFP_KERNEL);
if (!sei_page || !chsc_page) {
ret = -ENOMEM;
goto out_err;
diff --git a/drivers/s390/cio/chsc_sch.c b/drivers/s390/cio/chsc_sch.c
index 902237d0baef..e6c800653f98 100644
--- a/drivers/s390/cio/chsc_sch.c
+++ b/drivers/s390/cio/chsc_sch.c
@@ -293,7 +293,7 @@ static int chsc_ioctl_start(void __user *user_area)
if (!css_general_characteristics.dynio)
/* It makes no sense to try. */
return -EOPNOTSUPP;
- chsc_area = (void *)get_zeroed_page(GFP_DMA | GFP_KERNEL);
+ chsc_area = (void *)get_zeroed_page(GFP_KERNEL);
if (!chsc_area)
return -ENOMEM;
request = kzalloc(sizeof(*request), GFP_KERNEL);
@@ -341,7 +341,7 @@ static int chsc_ioctl_on_close_set(void __user *user_area)
ret = -ENOMEM;
goto out_unlock;
}
- on_close_chsc_area = (void *)get_zeroed_page(GFP_DMA | GFP_KERNEL);
+ on_close_chsc_area = (void *)get_zeroed_page(GFP_KERNEL);
if (!on_close_chsc_area) {
ret = -ENOMEM;
goto out_free_request;
@@ -393,7 +393,7 @@ static int chsc_ioctl_start_sync(void __user *user_area)
struct chsc_sync_area *chsc_area;
int ret, ccode;
- chsc_area = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
+ chsc_area = (void *)get_zeroed_page(GFP_KERNEL);
if (!chsc_area)
return -ENOMEM;
if (copy_from_user(chsc_area, user_area, PAGE_SIZE)) {
@@ -439,7 +439,7 @@ static int chsc_ioctl_info_channel_path(void __user *user_cd)
u8 data[PAGE_SIZE - 20];
} __attribute__ ((packed)) *scpcd_area;
- scpcd_area = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
+ scpcd_area = (void *)get_zeroed_page(GFP_KERNEL);
if (!scpcd_area)
return -ENOMEM;
cd = kzalloc(sizeof(*cd), GFP_KERNEL);
@@ -501,7 +501,7 @@ static int chsc_ioctl_info_cu(void __user *user_cd)
u8 data[PAGE_SIZE - 20];
} __attribute__ ((packed)) *scucd_area;
- scucd_area = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
+ scucd_area = (void *)get_zeroed_page(GFP_KERNEL);
if (!scucd_area)
return -ENOMEM;
cd = kzalloc(sizeof(*cd), GFP_KERNEL);
@@ -564,7 +564,7 @@ static int chsc_ioctl_info_sch_cu(void __user *user_cud)
u8 data[PAGE_SIZE - 20];
} __attribute__ ((packed)) *sscud_area;
- sscud_area = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
+ sscud_area = (void *)get_zeroed_page(GFP_KERNEL);
if (!sscud_area)
return -ENOMEM;
cud = kzalloc(sizeof(*cud), GFP_KERNEL);
@@ -626,7 +626,7 @@ static int chsc_ioctl_conf_info(void __user *user_ci)
u8 data[PAGE_SIZE - 20];
} __attribute__ ((packed)) *sci_area;
- sci_area = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
+ sci_area = (void *)get_zeroed_page(GFP_KERNEL);
if (!sci_area)
return -ENOMEM;
ci = kzalloc(sizeof(*ci), GFP_KERNEL);
@@ -697,7 +697,7 @@ static int chsc_ioctl_conf_comp_list(void __user *user_ccl)
u32 res;
} __attribute__ ((packed)) *cssids_parm;
- sccl_area = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
+ sccl_area = (void *)get_zeroed_page(GFP_KERNEL);
if (!sccl_area)
return -ENOMEM;
ccl = kzalloc(sizeof(*ccl), GFP_KERNEL);
@@ -757,7 +757,7 @@ static int chsc_ioctl_chpd(void __user *user_chpd)
int ret;
chpd = kzalloc(sizeof(*chpd), GFP_KERNEL);
- scpd_area = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
+ scpd_area = (void *)get_zeroed_page(GFP_KERNEL);
if (!scpd_area || !chpd) {
ret = -ENOMEM;
goto out_free;
@@ -797,7 +797,7 @@ static int chsc_ioctl_dcal(void __user *user_dcal)
u8 data[PAGE_SIZE - 36];
} __attribute__ ((packed)) *sdcal_area;
- sdcal_area = (void *)get_zeroed_page(GFP_KERNEL | GFP_DMA);
+ sdcal_area = (void *)get_zeroed_page(GFP_KERNEL);
if (!sdcal_area)
return -ENOMEM;
dcal = kzalloc(sizeof(*dcal), GFP_KERNEL);
diff --git a/drivers/s390/cio/cmf.c b/drivers/s390/cio/cmf.c
index 5584aa46c94e..f80dc18e2a76 100644
--- a/drivers/s390/cio/cmf.c
+++ b/drivers/s390/cio/cmf.c
@@ -169,7 +169,8 @@ static inline void cmf_activate(void *area, unsigned int onoff)
" lgr 2,%[mbo]\n"
" schm\n"
:
- : [r1] "d" ((unsigned long)onoff), [mbo] "d" (area)
+ : [r1] "d" ((unsigned long)onoff),
+ [mbo] "d" (virt_to_phys(area))
: "1", "2");
}
@@ -501,8 +502,7 @@ static int alloc_cmb(struct ccw_device *cdev)
WARN_ON(!list_empty(&cmb_area.list));
spin_unlock(&cmb_area.lock);
- mem = (void*)__get_free_pages(GFP_KERNEL | GFP_DMA,
- get_order(size));
+ mem = (void *)__get_free_pages(GFP_KERNEL, get_order(size));
spin_lock(&cmb_area.lock);
if (cmb_area.mem) {
diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c
index 28a88ed2c3aa..094431a62ad5 100644
--- a/drivers/s390/cio/css.c
+++ b/drivers/s390/cio/css.c
@@ -39,7 +39,7 @@ int max_ssid;
#define MAX_CSS_IDX 0
struct channel_subsystem *channel_subsystems[MAX_CSS_IDX + 1];
-static struct bus_type css_bus_type;
+static const struct bus_type css_bus_type;
int
for_each_subchannel(int(*fn)(struct subchannel_id, void *), void *data)
@@ -1409,7 +1409,7 @@ static int css_uevent(const struct device *dev, struct kobj_uevent_env *env)
return ret;
}
-static struct bus_type css_bus_type = {
+static const struct bus_type css_bus_type = {
.name = "css",
.match = css_bus_match,
.probe = css_probe,
diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c
index 0cfb179e1bcb..f95d12345d98 100644
--- a/drivers/s390/cio/device.c
+++ b/drivers/s390/cio/device.c
@@ -49,7 +49,7 @@ static const unsigned long recovery_delay[] = { 3, 30, 300 };
static atomic_t ccw_device_init_count = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(ccw_device_init_wq);
-static struct bus_type ccw_bus_type;
+static const struct bus_type ccw_bus_type;
/******************* bus type handling ***********************/
@@ -1776,7 +1776,7 @@ static void ccw_device_shutdown(struct device *dev)
__disable_cmf(cdev);
}
-static struct bus_type ccw_bus_type = {
+static const struct bus_type ccw_bus_type = {
.name = "ccw",
.match = ccw_bus_match,
.uevent = ccw_uevent,
diff --git a/drivers/s390/cio/scm.c b/drivers/s390/cio/scm.c
index 6b21ba68c1fe..c7894d61306d 100644
--- a/drivers/s390/cio/scm.c
+++ b/drivers/s390/cio/scm.c
@@ -42,7 +42,7 @@ static int scmdev_uevent(const struct device *dev, struct kobj_uevent_env *env)
return add_uevent_var(env, "MODALIAS=scm:scmdev");
}
-static struct bus_type scm_bus_type = {
+static const struct bus_type scm_bus_type = {
.name = "scm",
.probe = scmdev_probe,
.remove = scmdev_remove,
@@ -228,7 +228,7 @@ int scm_update_information(void)
size_t num;
int ret;
- scm_info = (void *)__get_free_page(GFP_KERNEL | GFP_DMA);
+ scm_info = (void *)__get_free_page(GFP_KERNEL);
if (!scm_info)
return -ENOMEM;
diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c
index f46dd6abacd7..cce0bafd4c92 100644
--- a/drivers/s390/crypto/ap_bus.c
+++ b/drivers/s390/crypto/ap_bus.c
@@ -38,6 +38,7 @@
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/module.h>
+#include <asm/uv.h>
#include "ap_bus.h"
#include "ap_debug.h"
@@ -83,14 +84,11 @@ EXPORT_SYMBOL(ap_perms);
DEFINE_MUTEX(ap_perms_mutex);
EXPORT_SYMBOL(ap_perms_mutex);
-/* # of bus scans since init */
-static atomic64_t ap_scan_bus_count;
-
/* # of bindings complete since init */
static atomic64_t ap_bindings_complete_count = ATOMIC64_INIT(0);
-/* completion for initial APQN bindings complete */
-static DECLARE_COMPLETION(ap_init_apqn_bindings_complete);
+/* completion for APQN bindings complete */
+static DECLARE_COMPLETION(ap_apqn_bindings_complete);
static struct ap_config_info *ap_qci_info;
static struct ap_config_info *ap_qci_info_old;
@@ -101,12 +99,16 @@ static struct ap_config_info *ap_qci_info_old;
debug_info_t *ap_dbf_info;
/*
- * Workqueue timer for bus rescan.
+ * AP bus rescan related things.
*/
-static struct timer_list ap_config_timer;
-static int ap_config_time = AP_CONFIG_TIME;
-static void ap_scan_bus(struct work_struct *);
-static DECLARE_WORK(ap_scan_work, ap_scan_bus);
+static bool ap_scan_bus(void);
+static bool ap_scan_bus_result; /* result of last ap_scan_bus() */
+static DEFINE_MUTEX(ap_scan_bus_mutex); /* mutex ap_scan_bus() invocations */
+static atomic64_t ap_scan_bus_count; /* counter ap_scan_bus() invocations */
+static int ap_scan_bus_time = AP_CONFIG_TIME;
+static struct timer_list ap_scan_bus_timer;
+static void ap_scan_bus_wq_callback(struct work_struct *);
+static DECLARE_WORK(ap_scan_bus_work, ap_scan_bus_wq_callback);
/*
* Tasklet & timer for AP request polling and interrupts
@@ -135,7 +137,7 @@ static int ap_max_domain_id = 15;
/* Maximum adapter id, if not given via qci */
static int ap_max_adapter_id = 63;
-static struct bus_type ap_bus_type;
+static const struct bus_type ap_bus_type;
/* Adapter interrupt definitions */
static void ap_interrupt_handler(struct airq_struct *airq,
@@ -753,7 +755,7 @@ static void ap_calc_bound_apqns(unsigned int *apqns, unsigned int *bound)
}
/*
- * After initial ap bus scan do check if all existing APQNs are
+ * After an AP bus scan, check whether all existing APQNs are
* bound to device drivers.
*/
static void ap_check_bindings_complete(void)
@@ -763,9 +765,9 @@ static void ap_check_bindings_complete(void)
if (atomic64_read(&ap_scan_bus_count) >= 1) {
ap_calc_bound_apqns(&apqns, &bound);
if (bound == apqns) {
- if (!completion_done(&ap_init_apqn_bindings_complete)) {
- complete_all(&ap_init_apqn_bindings_complete);
- AP_DBF_INFO("%s complete\n", __func__);
+ if (!completion_done(&ap_apqn_bindings_complete)) {
+ complete_all(&ap_apqn_bindings_complete);
+ pr_debug("%s all apqn bindings complete\n", __func__);
}
ap_send_bindings_complete_uevent();
}
@@ -782,27 +784,29 @@ static void ap_check_bindings_complete(void)
* -ETIME is returned. On failures negative return values are
* returned to the caller.
*/
-int ap_wait_init_apqn_bindings_complete(unsigned long timeout)
+int ap_wait_apqn_bindings_complete(unsigned long timeout)
{
+ int rc = 0;
long l;
- if (completion_done(&ap_init_apqn_bindings_complete))
+ if (completion_done(&ap_apqn_bindings_complete))
return 0;
if (timeout)
l = wait_for_completion_interruptible_timeout(
- &ap_init_apqn_bindings_complete, timeout);
+ &ap_apqn_bindings_complete, timeout);
else
l = wait_for_completion_interruptible(
- &ap_init_apqn_bindings_complete);
+ &ap_apqn_bindings_complete);
if (l < 0)
- return l == -ERESTARTSYS ? -EINTR : l;
+ rc = l == -ERESTARTSYS ? -EINTR : l;
else if (l == 0 && timeout)
- return -ETIME;
+ rc = -ETIME;
- return 0;
+ pr_debug("%s rc=%d\n", __func__, rc);
+ return rc;
}
-EXPORT_SYMBOL(ap_wait_init_apqn_bindings_complete);
+EXPORT_SYMBOL(ap_wait_apqn_bindings_complete);
static int __ap_queue_devices_with_id_unregister(struct device *dev, void *data)
{
@@ -826,8 +830,8 @@ static int __ap_revise_reserved(struct device *dev, void *dummy)
drvres = to_ap_drv(dev->driver)->flags
& AP_DRIVER_FLAG_DEFAULT;
if (!!devres != !!drvres) {
- AP_DBF_DBG("%s reprobing queue=%02x.%04x\n",
- __func__, card, queue);
+ pr_debug("%s reprobing queue=%02x.%04x\n",
+ __func__, card, queue);
rc = device_reprobe(dev);
if (rc)
AP_DBF_WARN("%s reprobing queue=%02x.%04x failed\n",
@@ -939,8 +943,6 @@ static int ap_device_probe(struct device *dev)
if (is_queue_dev(dev))
hash_del(&to_ap_queue(dev)->hnode);
spin_unlock_bh(&ap_queues_lock);
- } else {
- ap_check_bindings_complete();
}
out:
@@ -1012,16 +1014,47 @@ void ap_driver_unregister(struct ap_driver *ap_drv)
}
EXPORT_SYMBOL(ap_driver_unregister);
-void ap_bus_force_rescan(void)
+/*
+ * Enforce a synchronous AP bus rescan.
+ * Returns true if the bus scan found a change in the AP configuration,
+ * i.e. AP devices have been added or removed by the time this
+ * function returns.
+ */
+bool ap_bus_force_rescan(void)
{
+ unsigned long scan_counter = atomic64_read(&ap_scan_bus_count);
+ bool rc = false;
+
+ pr_debug(">%s scan counter=%lu\n", __func__, scan_counter);
+
/* Only trigger AP bus scans after the initial scan is done */
- if (atomic64_read(&ap_scan_bus_count) <= 0)
- return;
+ if (scan_counter <= 0)
+ goto out;
+
+ /* Try to acquire the AP scan bus mutex */
+ if (mutex_trylock(&ap_scan_bus_mutex)) {
+ /* mutex acquired, run the AP bus scan */
+ ap_scan_bus_result = ap_scan_bus();
+ rc = ap_scan_bus_result;
+ mutex_unlock(&ap_scan_bus_mutex);
+ goto out;
+ }
+
+ /*
+	 * Mutex acquisition failed, so another task is currently
+	 * running the AP bus scan. Simply wait for the lock: once it
+	 * is granted, the other task has finished and stored its
+	 * result in ap_scan_bus_result.
+ */
+ if (mutex_lock_interruptible(&ap_scan_bus_mutex)) {
+		/* interrupted while waiting for the lock, give up */
+ goto out;
+ }
+ rc = ap_scan_bus_result;
+ mutex_unlock(&ap_scan_bus_mutex);
- /* processing a asynchronous bus rescan */
- del_timer(&ap_config_timer);
- queue_work(system_long_wq, &ap_scan_work);
- flush_work(&ap_scan_work);
+out:
+ pr_debug("%s rc=%d\n", __func__, rc);
+ return rc;
}
EXPORT_SYMBOL(ap_bus_force_rescan);
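The locking scheme in ap_bus_force_rescan() — trylock and run the scan yourself, or block on the same mutex merely to wait out the current scanner and reuse its stored result — coalesces concurrent rescan requests into a single scan. Stripped of the AP specifics, the pattern reads like this sketch (do_expensive_op and all names are hypothetical):

    static DEFINE_MUTEX(op_mutex);
    static bool op_result;

    static bool run_or_wait(void)
    {
            bool rc;

            if (mutex_trylock(&op_mutex)) {
                    /* we won the race: run the operation ourselves */
                    op_result = do_expensive_op();
                    rc = op_result;
                    mutex_unlock(&op_mutex);
                    return rc;
            }

            /* another task is running it: wait, then reuse its result */
            if (mutex_lock_interruptible(&op_mutex))
                    return false;   /* interrupted, give up */
            rc = op_result;
            mutex_unlock(&op_mutex);
            return rc;
    }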
@@ -1030,7 +1063,7 @@ EXPORT_SYMBOL(ap_bus_force_rescan);
*/
void ap_bus_cfg_chg(void)
{
- AP_DBF_DBG("%s config change, forcing bus rescan\n", __func__);
+ pr_debug("%s config change, forcing bus rescan\n", __func__);
ap_bus_force_rescan();
}
@@ -1250,7 +1283,7 @@ static BUS_ATTR_RO(ap_interrupts);
static ssize_t config_time_show(const struct bus_type *bus, char *buf)
{
- return sysfs_emit(buf, "%d\n", ap_config_time);
+ return sysfs_emit(buf, "%d\n", ap_scan_bus_time);
}
static ssize_t config_time_store(const struct bus_type *bus,
@@ -1260,8 +1293,8 @@ static ssize_t config_time_store(const struct bus_type *bus,
if (sscanf(buf, "%d\n", &time) != 1 || time < 5 || time > 120)
return -EINVAL;
- ap_config_time = time;
- mod_timer(&ap_config_timer, jiffies + ap_config_time * HZ);
+ ap_scan_bus_time = time;
+ mod_timer(&ap_scan_bus_timer, jiffies + ap_scan_bus_time * HZ);
return count;
}
@@ -1603,7 +1636,7 @@ static struct attribute *ap_bus_attrs[] = {
};
ATTRIBUTE_GROUPS(ap_bus);
-static struct bus_type ap_bus_type = {
+static const struct bus_type ap_bus_type = {
.name = "ap",
.bus_groups = ap_bus_groups,
.match = &ap_bus_match,
@@ -1888,8 +1921,8 @@ static inline void ap_scan_domains(struct ap_card *ac)
aq->last_err_rc = AP_RESPONSE_CHECKSTOPPED;
}
spin_unlock_bh(&aq->lock);
- AP_DBF_DBG("%s(%d,%d) queue dev checkstop on\n",
- __func__, ac->id, dom);
+ pr_debug("%s(%d,%d) queue dev checkstop on\n",
+ __func__, ac->id, dom);
/* 'receive' pending messages with -EAGAIN */
ap_flush_queue(aq);
goto put_dev_and_continue;
@@ -1899,8 +1932,8 @@ static inline void ap_scan_domains(struct ap_card *ac)
if (aq->dev_state > AP_DEV_STATE_UNINITIATED)
_ap_queue_init_state(aq);
spin_unlock_bh(&aq->lock);
- AP_DBF_DBG("%s(%d,%d) queue dev checkstop off\n",
- __func__, ac->id, dom);
+ pr_debug("%s(%d,%d) queue dev checkstop off\n",
+ __func__, ac->id, dom);
goto put_dev_and_continue;
}
/* config state change */
@@ -1912,8 +1945,8 @@ static inline void ap_scan_domains(struct ap_card *ac)
aq->last_err_rc = AP_RESPONSE_DECONFIGURED;
}
spin_unlock_bh(&aq->lock);
- AP_DBF_DBG("%s(%d,%d) queue dev config off\n",
- __func__, ac->id, dom);
+ pr_debug("%s(%d,%d) queue dev config off\n",
+ __func__, ac->id, dom);
ap_send_config_uevent(&aq->ap_dev, aq->config);
/* 'receive' pending messages with -EAGAIN */
ap_flush_queue(aq);
@@ -1924,8 +1957,8 @@ static inline void ap_scan_domains(struct ap_card *ac)
if (aq->dev_state > AP_DEV_STATE_UNINITIATED)
_ap_queue_init_state(aq);
spin_unlock_bh(&aq->lock);
- AP_DBF_DBG("%s(%d,%d) queue dev config on\n",
- __func__, ac->id, dom);
+ pr_debug("%s(%d,%d) queue dev config on\n",
+ __func__, ac->id, dom);
ap_send_config_uevent(&aq->ap_dev, aq->config);
goto put_dev_and_continue;
}
@@ -1997,8 +2030,8 @@ static inline void ap_scan_adapter(int ap)
ap_scan_rm_card_dev_and_queue_devs(ac);
put_device(dev);
} else {
- AP_DBF_DBG("%s(%d) no type info (no APQN found), ignored\n",
- __func__, ap);
+ pr_debug("%s(%d) no type info (no APQN found), ignored\n",
+ __func__, ap);
}
return;
}
@@ -2010,8 +2043,8 @@ static inline void ap_scan_adapter(int ap)
ap_scan_rm_card_dev_and_queue_devs(ac);
put_device(dev);
} else {
- AP_DBF_DBG("%s(%d) no valid type (0) info, ignored\n",
- __func__, ap);
+ pr_debug("%s(%d) no valid type (0) info, ignored\n",
+ __func__, ap);
}
return;
}
@@ -2135,23 +2168,80 @@ static bool ap_get_configuration(void)
sizeof(struct ap_config_info)) != 0;
}
+/*
+ * ap_config_has_new_aps - Compare the current QCI info against the
+ * old one to check whether new adapters have appeared. Returns true
+ * if at least one new adapter shows up in the apm mask; adapters
+ * that persist or vanish are not counted.
+ */
+static bool ap_config_has_new_aps(void)
+{
+ unsigned long m[BITS_TO_LONGS(AP_DEVICES)];
+
+ if (!ap_qci_info)
+ return false;
+
+ bitmap_andnot(m, (unsigned long *)ap_qci_info->apm,
+ (unsigned long *)ap_qci_info_old->apm, AP_DEVICES);
+ if (!bitmap_empty(m, AP_DEVICES))
+ return true;
+
+ return false;
+}
+
+/*
+ * ap_config_has_new_doms - Compare the current QCI info against the
+ * old one to check whether new (usage) domains have appeared. Returns
+ * true if at least one new domain shows up in the aqm mask; domains
+ * that persist or vanish are not counted.
+ */
+static bool ap_config_has_new_doms(void)
+{
+ unsigned long m[BITS_TO_LONGS(AP_DOMAINS)];
+
+ if (!ap_qci_info)
+ return false;
+
+ bitmap_andnot(m, (unsigned long *)ap_qci_info->aqm,
+ (unsigned long *)ap_qci_info_old->aqm, AP_DOMAINS);
+ if (!bitmap_empty(m, AP_DOMAINS))
+ return true;
+
+ return false;
+}
+
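Both helpers reduce to a set difference: bits set in the current mask but not in the old one. A toy example with a 4-bit adapter mask (values hypothetical):

    old apm    = 0011b   (adapters 0 and 1 present)
    new apm    = 0110b   (adapter 0 vanished, adapter 2 appeared)
    new & ~old = 0100b   -> non-empty, so a new adapter is reported;
                            the vanished adapter 0 is ignored.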
/**
* ap_scan_bus(): Scan the AP bus for new devices
- * Runs periodically, workqueue timer (ap_config_time)
- * @unused: Unused pointer.
+ * Must always be run under the ap_scan_bus_mutex, which the
+ * caller is responsible for locking and unlocking!
+ * Returns true if any configuration change was detected during
+ * the scan, otherwise false.
*/
-static void ap_scan_bus(struct work_struct *unused)
+static bool ap_scan_bus(void)
{
- int ap, config_changed = 0;
+ bool config_changed;
+ int ap;
+
+ pr_debug(">%s\n", __func__);
- /* config change notify */
+ /* (re-)fetch configuration via QCI */
config_changed = ap_get_configuration();
- if (config_changed)
+ if (config_changed) {
+ if (ap_config_has_new_aps() || ap_config_has_new_doms()) {
+ /*
+			 * Newly appearing adapters and/or domains require
+			 * new ap devices to be built and bound to a device
+			 * driver. Thus reset the "APQN bindings complete"
+			 * completion.
+ */
+ reinit_completion(&ap_apqn_bindings_complete);
+ }
+ /* post a config change notify */
notify_config_changed();
+ }
ap_select_domain();
- AP_DBF_DBG("%s running\n", __func__);
-
/* loop over all possible adapters */
for (ap = 0; ap <= ap_max_adapter_id; ap++)
ap_scan_adapter(ap);
@@ -2174,23 +2264,56 @@ static void ap_scan_bus(struct work_struct *unused)
}
if (atomic64_inc_return(&ap_scan_bus_count) == 1) {
- AP_DBF_DBG("%s init scan complete\n", __func__);
+ pr_debug("%s init scan complete\n", __func__);
ap_send_init_scan_done_uevent();
- ap_check_bindings_complete();
}
- mod_timer(&ap_config_timer, jiffies + ap_config_time * HZ);
+ ap_check_bindings_complete();
+
+ mod_timer(&ap_scan_bus_timer, jiffies + ap_scan_bus_time * HZ);
+
+ pr_debug("<%s config_changed=%d\n", __func__, config_changed);
+
+ return config_changed;
}
-static void ap_config_timeout(struct timer_list *unused)
+/*
+ * Callback for the ap_scan_bus_timer
+ * Runs periodically with an interval of ap_scan_bus_time seconds.
+ */
+static void ap_scan_bus_timer_callback(struct timer_list *unused)
{
- queue_work(system_long_wq, &ap_scan_work);
+ /*
+	 * Schedule work onto the system_long_wq; when that work is
+	 * eventually executed, it runs the AP bus scan.
+ */
+ queue_work(system_long_wq, &ap_scan_bus_work);
+}
+
+/*
+ * Callback for the ap_scan_bus_work
+ */
+static void ap_scan_bus_wq_callback(struct work_struct *unused)
+{
+ /*
+	 * Try to invoke ap_scan_bus(). If the mutex acquisition
+	 * fails, another task is already running the AP bus scan,
+	 * so there is no need to wait or to re-trigger the scan.
+	 * Note that at the end of the bus scan the timer is re-armed;
+	 * it fires ap_scan_bus_timer_callback(), which enqueues work
+	 * on the system_long_wq that invokes this function again.
+ */
+ if (mutex_trylock(&ap_scan_bus_mutex)) {
+ ap_scan_bus_result = ap_scan_bus();
+ mutex_unlock(&ap_scan_bus_mutex);
+ }
}
static int __init ap_debug_init(void)
{
ap_dbf_info = debug_register("ap", 2, 1,
- DBF_MAX_SPRINTF_ARGS * sizeof(long));
+ AP_DBF_MAX_SPRINTF_ARGS * sizeof(long));
debug_register_view(ap_dbf_info, &debug_sprintf_view);
debug_set_level(ap_dbf_info, DBF_ERR);
@@ -2274,7 +2397,7 @@ static int __init ap_module_init(void)
ap_root_device->bus = &ap_bus_type;
/* Setup the AP bus rescan timer. */
- timer_setup(&ap_config_timer, ap_config_timeout, 0);
+ timer_setup(&ap_scan_bus_timer, ap_scan_bus_timer_callback, 0);
/*
* Setup the high resolution poll timer.
@@ -2292,7 +2415,7 @@ static int __init ap_module_init(void)
goto out_work;
}
- queue_work(system_long_wq, &ap_scan_work);
+ queue_work(system_long_wq, &ap_scan_bus_work);
return 0;
diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h
index 98814839ef30..59c7ed49aa02 100644
--- a/drivers/s390/crypto/ap_bus.h
+++ b/drivers/s390/crypto/ap_bus.h
@@ -266,7 +266,7 @@ int ap_sb_available(void);
bool ap_is_se_guest(void);
void ap_wait(enum ap_sm_wait wait);
void ap_request_timeout(struct timer_list *t);
-void ap_bus_force_rescan(void);
+bool ap_bus_force_rescan(void);
int ap_test_config_usage_domain(unsigned int domain);
int ap_test_config_ctrl_domain(unsigned int domain);
@@ -352,8 +352,12 @@ int ap_parse_mask_str(const char *str,
* the return value is 0. If the timeout (in jiffies) hits instead
* -ETIME is returned. On failures negative return values are
* returned to the caller.
+ * The AP bus scan may find new devices. In that case the condition
+ * that all APQNs are bound to their device drivers is reset, and
+ * this call blocks again until either all APQNs are bound to a
+ * device driver or the timeout hits.
*/
-int ap_wait_init_apqn_bindings_complete(unsigned long timeout);
+int ap_wait_apqn_bindings_complete(unsigned long timeout);
void ap_send_config_uevent(struct ap_device *ap_dev, bool cfg);
void ap_send_online_uevent(struct ap_device *ap_dev, int online);
diff --git a/drivers/s390/crypto/ap_debug.h b/drivers/s390/crypto/ap_debug.h
index c083ce88a9a6..2f66271b8564 100644
--- a/drivers/s390/crypto/ap_debug.h
+++ b/drivers/s390/crypto/ap_debug.h
@@ -16,7 +16,7 @@
#define RC2ERR(rc) ((rc) ? DBF_ERR : DBF_INFO)
#define RC2WARN(rc) ((rc) ? DBF_WARN : DBF_INFO)
-#define DBF_MAX_SPRINTF_ARGS 6
+#define AP_DBF_MAX_SPRINTF_ARGS 6
#define AP_DBF(...) \
debug_sprintf_event(ap_dbf_info, ##__VA_ARGS__)
@@ -26,8 +26,6 @@
debug_sprintf_event(ap_dbf_info, DBF_WARN, ##__VA_ARGS__)
#define AP_DBF_INFO(...) \
debug_sprintf_event(ap_dbf_info, DBF_INFO, ##__VA_ARGS__)
-#define AP_DBF_DBG(...) \
- debug_sprintf_event(ap_dbf_info, DBF_DEBUG, ##__VA_ARGS__)
extern debug_info_t *ap_dbf_info;
diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c
index 682595443145..6e4e8d324a6d 100644
--- a/drivers/s390/crypto/ap_queue.c
+++ b/drivers/s390/crypto/ap_queue.c
@@ -136,6 +136,8 @@ static struct ap_queue_status ap_sm_recv(struct ap_queue *aq)
switch (status.response_code) {
case AP_RESPONSE_NORMAL:
+ print_hex_dump_debug("aprpl: ", DUMP_PREFIX_ADDRESS, 16, 1,
+ aq->reply->msg, aq->reply->len, false);
aq->queue_count = max_t(int, 0, aq->queue_count - 1);
if (!status.queue_empty && !aq->queue_count)
aq->queue_count++;
@@ -169,6 +171,9 @@ static struct ap_queue_status ap_sm_recv(struct ap_queue *aq)
aq->queue_count = 0;
list_splice_init(&aq->pendingq, &aq->requestq);
aq->requestq_count += aq->pendingq_count;
+ pr_debug("%s queue 0x%02x.%04x rescheduled %d reqs (new req %d)\n",
+ __func__, AP_QID_CARD(aq->qid), AP_QID_QUEUE(aq->qid),
+ aq->pendingq_count, aq->requestq_count);
aq->pendingq_count = 0;
break;
default:
@@ -243,6 +248,8 @@ static enum ap_sm_wait ap_sm_write(struct ap_queue *aq)
/* Start the next request on the queue. */
ap_msg = list_entry(aq->requestq.next, struct ap_message, list);
+ print_hex_dump_debug("apreq: ", DUMP_PREFIX_ADDRESS, 16, 1,
+ ap_msg->msg, ap_msg->len, false);
status = __ap_send(qid, ap_msg->psmid,
ap_msg->msg, ap_msg->len,
ap_msg->flags & AP_MSG_FLAG_SPECIAL);
@@ -446,9 +453,9 @@ static enum ap_sm_wait ap_sm_assoc_wait(struct ap_queue *aq)
case AP_BS_Q_USABLE:
/* association is through */
aq->sm_state = AP_SM_STATE_IDLE;
- AP_DBF_DBG("%s queue 0x%02x.%04x associated with %u\n",
- __func__, AP_QID_CARD(aq->qid),
- AP_QID_QUEUE(aq->qid), aq->assoc_idx);
+ pr_debug("%s queue 0x%02x.%04x associated with %u\n",
+ __func__, AP_QID_CARD(aq->qid),
+ AP_QID_QUEUE(aq->qid), aq->assoc_idx);
return AP_SM_WAIT_NONE;
case AP_BS_Q_USABLE_NO_SECURE_KEY:
/* association still pending */
@@ -690,9 +697,9 @@ static ssize_t ap_functions_show(struct device *dev,
status = ap_test_queue(aq->qid, 1, &hwinfo);
if (status.response_code > AP_RESPONSE_BUSY) {
- AP_DBF_DBG("%s RC 0x%02x on tapq(0x%02x.%04x)\n",
- __func__, status.response_code,
- AP_QID_CARD(aq->qid), AP_QID_QUEUE(aq->qid));
+ pr_debug("%s RC 0x%02x on tapq(0x%02x.%04x)\n",
+ __func__, status.response_code,
+ AP_QID_CARD(aq->qid), AP_QID_QUEUE(aq->qid));
return -EIO;
}
@@ -846,9 +853,9 @@ static ssize_t se_bind_show(struct device *dev,
status = ap_test_queue(aq->qid, 1, &hwinfo);
if (status.response_code > AP_RESPONSE_BUSY) {
- AP_DBF_DBG("%s RC 0x%02x on tapq(0x%02x.%04x)\n",
- __func__, status.response_code,
- AP_QID_CARD(aq->qid), AP_QID_QUEUE(aq->qid));
+ pr_debug("%s RC 0x%02x on tapq(0x%02x.%04x)\n",
+ __func__, status.response_code,
+ AP_QID_CARD(aq->qid), AP_QID_QUEUE(aq->qid));
return -EIO;
}
@@ -974,9 +981,9 @@ static ssize_t se_associate_show(struct device *dev,
status = ap_test_queue(aq->qid, 1, &hwinfo);
if (status.response_code > AP_RESPONSE_BUSY) {
- AP_DBF_DBG("%s RC 0x%02x on tapq(0x%02x.%04x)\n",
- __func__, status.response_code,
- AP_QID_CARD(aq->qid), AP_QID_QUEUE(aq->qid));
+ pr_debug("%s RC 0x%02x on tapq(0x%02x.%04x)\n",
+ __func__, status.response_code,
+ AP_QID_CARD(aq->qid), AP_QID_QUEUE(aq->qid));
return -EIO;
}
diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c
index 6cfb6b2340c9..dccf664a3d95 100644
--- a/drivers/s390/crypto/pkey_api.c
+++ b/drivers/s390/crypto/pkey_api.c
@@ -42,24 +42,23 @@ MODULE_DESCRIPTION("s390 protected key interface");
* debug feature data and functions
*/
-static debug_info_t *debug_info;
+static debug_info_t *pkey_dbf_info;
-#define DEBUG_DBG(...) debug_sprintf_event(debug_info, 6, ##__VA_ARGS__)
-#define DEBUG_INFO(...) debug_sprintf_event(debug_info, 5, ##__VA_ARGS__)
-#define DEBUG_WARN(...) debug_sprintf_event(debug_info, 4, ##__VA_ARGS__)
-#define DEBUG_ERR(...) debug_sprintf_event(debug_info, 3, ##__VA_ARGS__)
+#define PKEY_DBF_INFO(...) debug_sprintf_event(pkey_dbf_info, 5, ##__VA_ARGS__)
+#define PKEY_DBF_WARN(...) debug_sprintf_event(pkey_dbf_info, 4, ##__VA_ARGS__)
+#define PKEY_DBF_ERR(...) debug_sprintf_event(pkey_dbf_info, 3, ##__VA_ARGS__)
static void __init pkey_debug_init(void)
{
/* 5 arguments per dbf entry (including the format string ptr) */
- debug_info = debug_register("pkey", 1, 1, 5 * sizeof(long));
- debug_register_view(debug_info, &debug_sprintf_view);
- debug_set_level(debug_info, 3);
+ pkey_dbf_info = debug_register("pkey", 1, 1, 5 * sizeof(long));
+ debug_register_view(pkey_dbf_info, &debug_sprintf_view);
+ debug_set_level(pkey_dbf_info, 3);
}
static void __exit pkey_debug_exit(void)
{
- debug_unregister(debug_info);
+ debug_unregister(pkey_dbf_info);
}
/* inside view of a protected key token (only type 0x00 version 0x01) */
@@ -163,14 +162,14 @@ static int pkey_clr2protkey(u32 keytype, const u8 *clrkey,
fc = CPACF_PCKMO_ENC_ECC_ED448_KEY;
break;
default:
- DEBUG_ERR("%s unknown/unsupported keytype %u\n",
- __func__, keytype);
+ PKEY_DBF_ERR("%s unknown/unsupported keytype %u\n",
+ __func__, keytype);
return -EINVAL;
}
if (*protkeylen < keysize + AES_WK_VP_SIZE) {
- DEBUG_ERR("%s prot key buffer size too small: %u < %d\n",
- __func__, *protkeylen, keysize + AES_WK_VP_SIZE);
+ PKEY_DBF_ERR("%s prot key buffer size too small: %u < %d\n",
+ __func__, *protkeylen, keysize + AES_WK_VP_SIZE);
return -EINVAL;
}
@@ -182,7 +181,7 @@ static int pkey_clr2protkey(u32 keytype, const u8 *clrkey,
}
/* check for the pckmo subfunction we need now */
if (!cpacf_test_func(&pckmo_functions, fc)) {
- DEBUG_ERR("%s pckmo functions not available\n", __func__);
+ PKEY_DBF_ERR("%s pckmo functions not available\n", __func__);
return -ENODEV;
}
@@ -244,7 +243,7 @@ static int pkey_skey2pkey(const u8 *key, u8 *protkey,
}
if (rc)
- DEBUG_DBG("%s failed rc=%d\n", __func__, rc);
+ pr_debug("%s failed rc=%d\n", __func__, rc);
return rc;
}
@@ -283,7 +282,7 @@ static int pkey_clr2ep11key(const u8 *clrkey, size_t clrkeylen,
out:
kfree(apqns);
if (rc)
- DEBUG_DBG("%s failed rc=%d\n", __func__, rc);
+ pr_debug("%s failed rc=%d\n", __func__, rc);
return rc;
}
@@ -294,33 +293,36 @@ static int pkey_ep11key2pkey(const u8 *key, size_t keylen,
u8 *protkey, u32 *protkeylen, u32 *protkeytype)
{
u32 nr_apqns, *apqns = NULL;
+ int i, j, rc = -ENODEV;
u16 card, dom;
- int i, rc;
zcrypt_wait_api_operational();
- /* build a list of apqns suitable for this key */
- rc = ep11_findcard2(&apqns, &nr_apqns, 0xFFFF, 0xFFFF,
- ZCRYPT_CEX7,
- ap_is_se_guest() ? EP11_API_V6 : EP11_API_V4,
- ep11_kb_wkvp(key, keylen));
- if (rc)
- goto out;
+ /* try two times in case of failure */
+ for (i = 0; i < 2 && rc; i++) {
- /* go through the list of apqns and try to derive an pkey */
- for (rc = -ENODEV, i = 0; i < nr_apqns; i++) {
- card = apqns[i] >> 16;
- dom = apqns[i] & 0xFFFF;
- rc = ep11_kblob2protkey(card, dom, key, keylen,
- protkey, protkeylen, protkeytype);
- if (rc == 0)
- break;
+ /* build a list of apqns suitable for this key */
+ rc = ep11_findcard2(&apqns, &nr_apqns, 0xFFFF, 0xFFFF,
+ ZCRYPT_CEX7,
+ ap_is_se_guest() ? EP11_API_V6 : EP11_API_V4,
+ ep11_kb_wkvp(key, keylen));
+ if (rc)
+ continue; /* retry findcard on failure */
+
+		/* go through the list of apqns and try to derive a pkey */
+ for (rc = -ENODEV, j = 0; j < nr_apqns && rc; j++) {
+ card = apqns[j] >> 16;
+ dom = apqns[j] & 0xFFFF;
+ rc = ep11_kblob2protkey(card, dom, key, keylen,
+ protkey, protkeylen, protkeytype);
+ }
+
+ kfree(apqns);
}
-out:
- kfree(apqns);
if (rc)
- DEBUG_DBG("%s failed rc=%d\n", __func__, rc);
+ pr_debug("%s failed rc=%d\n", __func__, rc);
+
return rc;
}
@@ -336,7 +338,7 @@ static int pkey_verifykey(const struct pkey_seckey *seckey,
int rc;
/* check the secure key for valid AES secure key */
- rc = cca_check_secaeskeytoken(debug_info, 3, (u8 *)seckey, 0);
+ rc = cca_check_secaeskeytoken(pkey_dbf_info, 3, (u8 *)seckey, 0);
if (rc)
goto out;
if (pattributes)
@@ -351,7 +353,7 @@ static int pkey_verifykey(const struct pkey_seckey *seckey,
if (rc > 0) {
/* key mkvp matches to old master key mkvp */
- DEBUG_DBG("%s secure key has old mkvp\n", __func__);
+ pr_debug("%s secure key has old mkvp\n", __func__);
if (pattributes)
*pattributes |= PKEY_VERIFY_ATTR_OLD_MKVP;
rc = 0;
@@ -363,7 +365,7 @@ static int pkey_verifykey(const struct pkey_seckey *seckey,
*pdomain = domain;
out:
- DEBUG_DBG("%s rc=%d\n", __func__, rc);
+ pr_debug("%s rc=%d\n", __func__, rc);
return rc;
}
@@ -379,8 +381,8 @@ static int pkey_genprotkey(u32 keytype, u8 *protkey,
keysize = pkey_keytype_aes_to_size(keytype);
if (!keysize) {
- DEBUG_ERR("%s unknown/unsupported keytype %d\n", __func__,
- keytype);
+ PKEY_DBF_ERR("%s unknown/unsupported keytype %d\n", __func__,
+ keytype);
return -EINVAL;
}
@@ -428,13 +430,13 @@ static int pkey_verifyprotkey(const u8 *protkey, u32 protkeylen,
fc = CPACF_KMC_PAES_256;
break;
default:
- DEBUG_ERR("%s unknown/unsupported keytype %u\n", __func__,
- protkeytype);
+ PKEY_DBF_ERR("%s unknown/unsupported keytype %u\n", __func__,
+ protkeytype);
return -EINVAL;
}
if (protkeylen != pkeylen) {
- DEBUG_ERR("%s invalid protected key size %u for keytype %u\n",
- __func__, protkeylen, protkeytype);
+ PKEY_DBF_ERR("%s invalid protected key size %u for keytype %u\n",
+ __func__, protkeylen, protkeytype);
return -EINVAL;
}
@@ -446,7 +448,7 @@ static int pkey_verifyprotkey(const u8 *protkey, u32 protkeylen,
k = cpacf_kmc(fc | CPACF_ENCRYPT, &param, null_msg, dest_buf,
sizeof(null_msg));
if (k != sizeof(null_msg)) {
- DEBUG_ERR("%s protected key is not valid\n", __func__);
+ PKEY_DBF_ERR("%s protected key is not valid\n", __func__);
return -EKEYREJECTED;
}
@@ -464,13 +466,13 @@ static int nonccatokaes2pkey(const struct clearkeytoken *t,
keysize = pkey_keytype_aes_to_size(t->keytype);
if (!keysize) {
- DEBUG_ERR("%s unknown/unsupported keytype %u\n",
- __func__, t->keytype);
+ PKEY_DBF_ERR("%s unknown/unsupported keytype %u\n",
+ __func__, t->keytype);
return -EINVAL;
}
if (t->len != keysize) {
- DEBUG_ERR("%s non clear key aes token: invalid key len %u\n",
- __func__, t->len);
+ PKEY_DBF_ERR("%s non clear key aes token: invalid key len %u\n",
+ __func__, t->len);
return -EINVAL;
}
@@ -505,7 +507,7 @@ try_via_ep11:
goto out;
failure:
- DEBUG_ERR("%s unable to build protected key from clear", __func__);
+ PKEY_DBF_ERR("%s unable to build protected key from clear", __func__);
out:
kfree(tmpbuf);
@@ -536,14 +538,14 @@ static int nonccatokecc2pkey(const struct clearkeytoken *t,
keylen = 64;
break;
default:
- DEBUG_ERR("%s unknown/unsupported keytype %u\n",
- __func__, t->keytype);
+ PKEY_DBF_ERR("%s unknown/unsupported keytype %u\n",
+ __func__, t->keytype);
return -EINVAL;
}
if (t->len != keylen) {
- DEBUG_ERR("%s non clear key ecc token: invalid key len %u\n",
- __func__, t->len);
+ PKEY_DBF_ERR("%s non clear key ecc token: invalid key len %u\n",
+ __func__, t->len);
return -EINVAL;
}
@@ -551,8 +553,8 @@ static int nonccatokecc2pkey(const struct clearkeytoken *t,
rc = pkey_clr2protkey(t->keytype, t->clearkey,
protkey, protkeylen, protkeytype);
if (rc) {
- DEBUG_ERR("%s unable to build protected key from clear",
- __func__);
+ PKEY_DBF_ERR("%s unable to build protected key from clear",
+ __func__);
}
return rc;
@@ -604,15 +606,15 @@ static int pkey_nonccatok2pkey(const u8 *key, u32 keylen,
protkeylen, protkeytype);
break;
default:
- DEBUG_ERR("%s unknown/unsupported non cca clear key type %u\n",
- __func__, t->keytype);
+ PKEY_DBF_ERR("%s unknown/unsupported non cca clear key type %u\n",
+ __func__, t->keytype);
return -EINVAL;
}
break;
}
case TOKVER_EP11_AES: {
/* check ep11 key for exportable as protected key */
- rc = ep11_check_aes_key(debug_info, 3, key, keylen, 1);
+ rc = ep11_check_aes_key(pkey_dbf_info, 3, key, keylen, 1);
if (rc)
goto out;
rc = pkey_ep11key2pkey(key, keylen,
@@ -621,15 +623,16 @@ static int pkey_nonccatok2pkey(const u8 *key, u32 keylen,
}
case TOKVER_EP11_AES_WITH_HEADER:
/* check ep11 key with header for exportable as protected key */
- rc = ep11_check_aes_key_with_hdr(debug_info, 3, key, keylen, 1);
+ rc = ep11_check_aes_key_with_hdr(pkey_dbf_info,
+ 3, key, keylen, 1);
if (rc)
goto out;
rc = pkey_ep11key2pkey(key, keylen,
protkey, protkeylen, protkeytype);
break;
default:
- DEBUG_ERR("%s unknown/unsupported non-CCA token version %d\n",
- __func__, hdr->version);
+ PKEY_DBF_ERR("%s unknown/unsupported non-CCA token version %d\n",
+ __func__, hdr->version);
}
out:
@@ -654,8 +657,8 @@ static int pkey_ccainttok2pkey(const u8 *key, u32 keylen,
return -EINVAL;
break;
default:
- DEBUG_ERR("%s unknown/unsupported CCA internal token version %d\n",
- __func__, hdr->version);
+ PKEY_DBF_ERR("%s unknown/unsupported CCA internal token version %d\n",
+ __func__, hdr->version);
return -EINVAL;
}
@@ -672,7 +675,7 @@ int pkey_keyblob2pkey(const u8 *key, u32 keylen,
int rc;
if (keylen < sizeof(struct keytoken_header)) {
- DEBUG_ERR("%s invalid keylen %d\n", __func__, keylen);
+ PKEY_DBF_ERR("%s invalid keylen %d\n", __func__, keylen);
return -EINVAL;
}
@@ -686,12 +689,12 @@ int pkey_keyblob2pkey(const u8 *key, u32 keylen,
protkey, protkeylen, protkeytype);
break;
default:
- DEBUG_ERR("%s unknown/unsupported blob type %d\n",
- __func__, hdr->type);
+ PKEY_DBF_ERR("%s unknown/unsupported blob type %d\n",
+ __func__, hdr->type);
return -EINVAL;
}
- DEBUG_DBG("%s rc=%d\n", __func__, rc);
+ pr_debug("%s rc=%d\n", __func__, rc);
return rc;
}
EXPORT_SYMBOL(pkey_keyblob2pkey);
@@ -839,7 +842,7 @@ static int pkey_verifykey2(const u8 *key, size_t keylen,
hdr->version == TOKVER_CCA_AES) {
struct secaeskeytoken *t = (struct secaeskeytoken *)key;
- rc = cca_check_secaeskeytoken(debug_info, 3, key, 0);
+ rc = cca_check_secaeskeytoken(pkey_dbf_info, 3, key, 0);
if (rc)
goto out;
if (ktype)
@@ -869,7 +872,7 @@ static int pkey_verifykey2(const u8 *key, size_t keylen,
hdr->version == TOKVER_CCA_VLSC) {
struct cipherkeytoken *t = (struct cipherkeytoken *)key;
- rc = cca_check_secaescipherkey(debug_info, 3, key, 0, 1);
+ rc = cca_check_secaescipherkey(pkey_dbf_info, 3, key, 0, 1);
if (rc)
goto out;
if (ktype)
@@ -907,7 +910,7 @@ static int pkey_verifykey2(const u8 *key, size_t keylen,
struct ep11keyblob *kb = (struct ep11keyblob *)key;
int api;
- rc = ep11_check_aes_key(debug_info, 3, key, keylen, 1);
+ rc = ep11_check_aes_key(pkey_dbf_info, 3, key, keylen, 1);
if (rc)
goto out;
if (ktype)
@@ -933,8 +936,8 @@ static int pkey_verifykey2(const u8 *key, size_t keylen,
struct ep11kblob_header *kh = (struct ep11kblob_header *)key;
int api;
- rc = ep11_check_aes_key_with_hdr(debug_info, 3,
- key, keylen, 1);
+ rc = ep11_check_aes_key_with_hdr(pkey_dbf_info,
+ 3, key, keylen, 1);
if (rc)
goto out;
if (ktype)
@@ -981,25 +984,27 @@ static int pkey_keyblob2pkey2(const struct pkey_apqn *apqns, size_t nr_apqns,
if (hdr->version == TOKVER_CCA_AES) {
if (keylen != sizeof(struct secaeskeytoken))
return -EINVAL;
- if (cca_check_secaeskeytoken(debug_info, 3, key, 0))
+ if (cca_check_secaeskeytoken(pkey_dbf_info, 3, key, 0))
return -EINVAL;
} else if (hdr->version == TOKVER_CCA_VLSC) {
if (keylen < hdr->len || keylen > MAXCCAVLSCTOKENSIZE)
return -EINVAL;
- if (cca_check_secaescipherkey(debug_info, 3, key, 0, 1))
+ if (cca_check_secaescipherkey(pkey_dbf_info,
+ 3, key, 0, 1))
return -EINVAL;
} else {
- DEBUG_ERR("%s unknown CCA internal token version %d\n",
- __func__, hdr->version);
+ PKEY_DBF_ERR("%s unknown CCA internal token version %d\n",
+ __func__, hdr->version);
return -EINVAL;
}
} else if (hdr->type == TOKTYPE_NON_CCA) {
if (hdr->version == TOKVER_EP11_AES) {
- if (ep11_check_aes_key(debug_info, 3, key, keylen, 1))
+ if (ep11_check_aes_key(pkey_dbf_info,
+ 3, key, keylen, 1))
return -EINVAL;
} else if (hdr->version == TOKVER_EP11_AES_WITH_HEADER) {
- if (ep11_check_aes_key_with_hdr(debug_info, 3,
- key, keylen, 1))
+ if (ep11_check_aes_key_with_hdr(pkey_dbf_info,
+ 3, key, keylen, 1))
return -EINVAL;
} else {
return pkey_nonccatok2pkey(key, keylen,
@@ -1007,8 +1012,8 @@ static int pkey_keyblob2pkey2(const struct pkey_apqn *apqns, size_t nr_apqns,
protkeytype);
}
} else {
- DEBUG_ERR("%s unknown/unsupported blob type %d\n",
- __func__, hdr->type);
+ PKEY_DBF_ERR("%s unknown/unsupported blob type %d\n",
+ __func__, hdr->type);
return -EINVAL;
}
@@ -1234,50 +1239,53 @@ static int pkey_keyblob2pkey3(const struct pkey_apqn *apqns, size_t nr_apqns,
hdr->version == TOKVER_EP11_AES_WITH_HEADER &&
is_ep11_keyblob(key + sizeof(struct ep11kblob_header))) {
/* EP11 AES key blob with header */
- if (ep11_check_aes_key_with_hdr(debug_info, 3, key, keylen, 1))
+ if (ep11_check_aes_key_with_hdr(pkey_dbf_info,
+ 3, key, keylen, 1))
return -EINVAL;
} else if (hdr->type == TOKTYPE_NON_CCA &&
hdr->version == TOKVER_EP11_ECC_WITH_HEADER &&
is_ep11_keyblob(key + sizeof(struct ep11kblob_header))) {
/* EP11 ECC key blob with header */
- if (ep11_check_ecc_key_with_hdr(debug_info, 3, key, keylen, 1))
+ if (ep11_check_ecc_key_with_hdr(pkey_dbf_info,
+ 3, key, keylen, 1))
return -EINVAL;
} else if (hdr->type == TOKTYPE_NON_CCA &&
hdr->version == TOKVER_EP11_AES &&
is_ep11_keyblob(key)) {
/* EP11 AES key blob with header in session field */
- if (ep11_check_aes_key(debug_info, 3, key, keylen, 1))
+ if (ep11_check_aes_key(pkey_dbf_info, 3, key, keylen, 1))
return -EINVAL;
} else if (hdr->type == TOKTYPE_CCA_INTERNAL) {
if (hdr->version == TOKVER_CCA_AES) {
/* CCA AES data key */
if (keylen != sizeof(struct secaeskeytoken))
return -EINVAL;
- if (cca_check_secaeskeytoken(debug_info, 3, key, 0))
+ if (cca_check_secaeskeytoken(pkey_dbf_info, 3, key, 0))
return -EINVAL;
} else if (hdr->version == TOKVER_CCA_VLSC) {
/* CCA AES cipher key */
if (keylen < hdr->len || keylen > MAXCCAVLSCTOKENSIZE)
return -EINVAL;
- if (cca_check_secaescipherkey(debug_info, 3, key, 0, 1))
+ if (cca_check_secaescipherkey(pkey_dbf_info,
+ 3, key, 0, 1))
return -EINVAL;
} else {
- DEBUG_ERR("%s unknown CCA internal token version %d\n",
- __func__, hdr->version);
+ PKEY_DBF_ERR("%s unknown CCA internal token version %d\n",
+ __func__, hdr->version);
return -EINVAL;
}
} else if (hdr->type == TOKTYPE_CCA_INTERNAL_PKA) {
/* CCA ECC (private) key */
if (keylen < sizeof(struct eccprivkeytoken))
return -EINVAL;
- if (cca_check_sececckeytoken(debug_info, 3, key, keylen, 1))
+ if (cca_check_sececckeytoken(pkey_dbf_info, 3, key, keylen, 1))
return -EINVAL;
} else if (hdr->type == TOKTYPE_NON_CCA) {
return pkey_nonccatok2pkey(key, keylen,
protkey, protkeylen, protkeytype);
} else {
- DEBUG_ERR("%s unknown/unsupported blob type %d\n",
- __func__, hdr->type);
+ PKEY_DBF_ERR("%s unknown/unsupported blob type %d\n",
+ __func__, hdr->type);
return -EINVAL;
}
@@ -1350,7 +1358,7 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd,
return -EFAULT;
rc = cca_genseckey(kgs.cardnr, kgs.domain,
kgs.keytype, kgs.seckey.seckey);
- DEBUG_DBG("%s cca_genseckey()=%d\n", __func__, rc);
+ pr_debug("%s cca_genseckey()=%d\n", __func__, rc);
if (rc)
break;
if (copy_to_user(ugs, &kgs, sizeof(kgs)))
@@ -1365,7 +1373,7 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd,
return -EFAULT;
rc = cca_clr2seckey(kcs.cardnr, kcs.domain, kcs.keytype,
kcs.clrkey.clrkey, kcs.seckey.seckey);
- DEBUG_DBG("%s cca_clr2seckey()=%d\n", __func__, rc);
+ pr_debug("%s cca_clr2seckey()=%d\n", __func__, rc);
if (rc)
break;
if (copy_to_user(ucs, &kcs, sizeof(kcs)))
@@ -1383,7 +1391,7 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd,
rc = cca_sec2protkey(ksp.cardnr, ksp.domain,
ksp.seckey.seckey, ksp.protkey.protkey,
&ksp.protkey.len, &ksp.protkey.type);
- DEBUG_DBG("%s cca_sec2protkey()=%d\n", __func__, rc);
+ pr_debug("%s cca_sec2protkey()=%d\n", __func__, rc);
if (rc)
break;
if (copy_to_user(usp, &ksp, sizeof(ksp)))
@@ -1400,7 +1408,7 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd,
rc = pkey_clr2protkey(kcp.keytype, kcp.clrkey.clrkey,
kcp.protkey.protkey,
&kcp.protkey.len, &kcp.protkey.type);
- DEBUG_DBG("%s pkey_clr2protkey()=%d\n", __func__, rc);
+ pr_debug("%s pkey_clr2protkey()=%d\n", __func__, rc);
if (rc)
break;
if (copy_to_user(ucp, &kcp, sizeof(kcp)))
@@ -1416,7 +1424,7 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd,
return -EFAULT;
rc = cca_findcard(kfc.seckey.seckey,
&kfc.cardnr, &kfc.domain, 1);
- DEBUG_DBG("%s cca_findcard()=%d\n", __func__, rc);
+ pr_debug("%s cca_findcard()=%d\n", __func__, rc);
if (rc < 0)
break;
if (copy_to_user(ufc, &kfc, sizeof(kfc)))
@@ -1432,7 +1440,7 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd,
ksp.protkey.len = sizeof(ksp.protkey.protkey);
rc = pkey_skey2pkey(ksp.seckey.seckey, ksp.protkey.protkey,
&ksp.protkey.len, &ksp.protkey.type);
- DEBUG_DBG("%s pkey_skey2pkey()=%d\n", __func__, rc);
+ pr_debug("%s pkey_skey2pkey()=%d\n", __func__, rc);
if (rc)
break;
if (copy_to_user(usp, &ksp, sizeof(ksp)))
@@ -1447,7 +1455,7 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd,
return -EFAULT;
rc = pkey_verifykey(&kvk.seckey, &kvk.cardnr, &kvk.domain,
&kvk.keysize, &kvk.attributes);
- DEBUG_DBG("%s pkey_verifykey()=%d\n", __func__, rc);
+ pr_debug("%s pkey_verifykey()=%d\n", __func__, rc);
if (rc)
break;
if (copy_to_user(uvk, &kvk, sizeof(kvk)))
@@ -1463,7 +1471,7 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd,
kgp.protkey.len = sizeof(kgp.protkey.protkey);
rc = pkey_genprotkey(kgp.keytype, kgp.protkey.protkey,
&kgp.protkey.len, &kgp.protkey.type);
- DEBUG_DBG("%s pkey_genprotkey()=%d\n", __func__, rc);
+ pr_debug("%s pkey_genprotkey()=%d\n", __func__, rc);
if (rc)
break;
if (copy_to_user(ugp, &kgp, sizeof(kgp)))
@@ -1478,7 +1486,7 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd,
return -EFAULT;
rc = pkey_verifyprotkey(kvp.protkey.protkey,
kvp.protkey.len, kvp.protkey.type);
- DEBUG_DBG("%s pkey_verifyprotkey()=%d\n", __func__, rc);
+ pr_debug("%s pkey_verifyprotkey()=%d\n", __func__, rc);
break;
}
case PKEY_KBLOB2PROTK: {
@@ -1494,7 +1502,7 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd,
ktp.protkey.len = sizeof(ktp.protkey.protkey);
rc = pkey_keyblob2pkey(kkey, ktp.keylen, ktp.protkey.protkey,
&ktp.protkey.len, &ktp.protkey.type);
- DEBUG_DBG("%s pkey_keyblob2pkey()=%d\n", __func__, rc);
+ pr_debug("%s pkey_keyblob2pkey()=%d\n", __func__, rc);
memzero_explicit(kkey, ktp.keylen);
kfree(kkey);
if (rc)
@@ -1523,7 +1531,7 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd,
rc = pkey_genseckey2(apqns, kgs.apqn_entries,
kgs.type, kgs.size, kgs.keygenflags,
kkey, &klen);
- DEBUG_DBG("%s pkey_genseckey2()=%d\n", __func__, rc);
+ pr_debug("%s pkey_genseckey2()=%d\n", __func__, rc);
kfree(apqns);
if (rc) {
kfree(kkey);
@@ -1565,7 +1573,7 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd,
rc = pkey_clr2seckey2(apqns, kcs.apqn_entries,
kcs.type, kcs.size, kcs.keygenflags,
kcs.clrkey.clrkey, kkey, &klen);
- DEBUG_DBG("%s pkey_clr2seckey2()=%d\n", __func__, rc);
+ pr_debug("%s pkey_clr2seckey2()=%d\n", __func__, rc);
kfree(apqns);
if (rc) {
kfree(kkey);
@@ -1601,7 +1609,7 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd,
rc = pkey_verifykey2(kkey, kvk.keylen,
&kvk.cardnr, &kvk.domain,
&kvk.type, &kvk.size, &kvk.flags);
- DEBUG_DBG("%s pkey_verifykey2()=%d\n", __func__, rc);
+ pr_debug("%s pkey_verifykey2()=%d\n", __func__, rc);
kfree(kkey);
if (rc)
break;
@@ -1630,7 +1638,7 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd,
kkey, ktp.keylen,
ktp.protkey.protkey, &ktp.protkey.len,
&ktp.protkey.type);
- DEBUG_DBG("%s pkey_keyblob2pkey2()=%d\n", __func__, rc);
+ pr_debug("%s pkey_keyblob2pkey2()=%d\n", __func__, rc);
kfree(apqns);
memzero_explicit(kkey, ktp.keylen);
kfree(kkey);
@@ -1664,7 +1672,7 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd,
}
rc = pkey_apqns4key(kkey, kak.keylen, kak.flags,
apqns, &nr_apqns);
- DEBUG_DBG("%s pkey_apqns4key()=%d\n", __func__, rc);
+ pr_debug("%s pkey_apqns4key()=%d\n", __func__, rc);
kfree(kkey);
if (rc && rc != -ENOSPC) {
kfree(apqns);
@@ -1707,7 +1715,7 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd,
}
rc = pkey_apqns4keytype(kat.type, kat.cur_mkvp, kat.alt_mkvp,
kat.flags, apqns, &nr_apqns);
- DEBUG_DBG("%s pkey_apqns4keytype()=%d\n", __func__, rc);
+ pr_debug("%s pkey_apqns4keytype()=%d\n", __func__, rc);
if (rc && rc != -ENOSPC) {
kfree(apqns);
break;
@@ -1757,7 +1765,7 @@ static long pkey_unlocked_ioctl(struct file *filp, unsigned int cmd,
rc = pkey_keyblob2pkey3(apqns, ktp.apqn_entries,
kkey, ktp.keylen,
protkey, &protkeylen, &ktp.pkeytype);
- DEBUG_DBG("%s pkey_keyblob2pkey3()=%d\n", __func__, rc);
+ pr_debug("%s pkey_keyblob2pkey3()=%d\n", __func__, rc);
kfree(apqns);
memzero_explicit(kkey, ktp.keylen);
kfree(kkey);
diff --git a/drivers/s390/crypto/vfio_ap_drv.c b/drivers/s390/crypto/vfio_ap_drv.c
index a5ab03e42ff1..4aeb3e1213c7 100644
--- a/drivers/s390/crypto/vfio_ap_drv.c
+++ b/drivers/s390/crypto/vfio_ap_drv.c
@@ -60,7 +60,7 @@ static void vfio_ap_matrix_dev_release(struct device *dev)
kfree(matrix_dev);
}
-static struct bus_type matrix_bus = {
+static const struct bus_type matrix_bus = {
.name = "matrix",
};
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index 983b3b16196c..fc169bc61593 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -659,6 +659,21 @@ static bool vfio_ap_mdev_filter_cdoms(struct ap_matrix_mdev *matrix_mdev)
AP_DOMAINS);
}
+static bool _queue_passable(struct vfio_ap_queue *q)
+{
+ if (!q)
+ return false;
+
+ switch (q->reset_status.response_code) {
+ case AP_RESPONSE_NORMAL:
+ case AP_RESPONSE_DECONFIGURED:
+ case AP_RESPONSE_CHECKSTOPPED:
+ return true;
+ default:
+ return false;
+ }
+}
+
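This helper centralizes a check that was previously scattered: a queue may be passed through once its last reset ended in NORMAL, DECONFIGURED or CHECKSTOPPED. It also supersedes the earlier trick of overwriting reset_status.response_code with 0 for deconfigured queues, whose removal can be seen in the hunks below.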
/*
* vfio_ap_mdev_filter_matrix - filter the APQNs assigned to the matrix mdev
* to ensure no queue devices are passed through to
@@ -687,7 +702,6 @@ static bool vfio_ap_mdev_filter_matrix(struct ap_matrix_mdev *matrix_mdev,
unsigned long apid, apqi, apqn;
DECLARE_BITMAP(prev_shadow_apm, AP_DEVICES);
DECLARE_BITMAP(prev_shadow_aqm, AP_DOMAINS);
- struct vfio_ap_queue *q;
bitmap_copy(prev_shadow_apm, matrix_mdev->shadow_apcb.apm, AP_DEVICES);
bitmap_copy(prev_shadow_aqm, matrix_mdev->shadow_apcb.aqm, AP_DOMAINS);
@@ -716,8 +730,7 @@ static bool vfio_ap_mdev_filter_matrix(struct ap_matrix_mdev *matrix_mdev,
* hardware device.
*/
apqn = AP_MKQID(apid, apqi);
- q = vfio_ap_mdev_get_queue(matrix_mdev, apqn);
- if (!q || q->reset_status.response_code) {
+ if (!_queue_passable(vfio_ap_mdev_get_queue(matrix_mdev, apqn))) {
clear_bit_inv(apid, matrix_mdev->shadow_apcb.apm);
/*
@@ -1691,6 +1704,7 @@ static int apq_status_check(int apqn, struct ap_queue_status *status)
switch (status->response_code) {
case AP_RESPONSE_NORMAL:
case AP_RESPONSE_DECONFIGURED:
+ case AP_RESPONSE_CHECKSTOPPED:
return 0;
case AP_RESPONSE_RESET_IN_PROGRESS:
case AP_RESPONSE_BUSY:
@@ -1747,14 +1761,6 @@ static void apq_reset_check(struct work_struct *reset_work)
memcpy(&q->reset_status, &status, sizeof(status));
continue;
}
- /*
- * When an AP adapter is deconfigured, the
- * associated queues are reset, so let's set the
- * status response code to 0 so the queue may be
- * passed through (i.e., not filtered)
- */
- if (status.response_code == AP_RESPONSE_DECONFIGURED)
- q->reset_status.response_code = 0;
if (q->saved_isc != VFIO_AP_ISC_INVALID)
vfio_ap_free_aqic_resources(q);
break;
@@ -1781,12 +1787,7 @@ static void vfio_ap_mdev_reset_queue(struct vfio_ap_queue *q)
queue_work(system_long_wq, &q->reset_work);
break;
case AP_RESPONSE_DECONFIGURED:
- /*
- * When an AP adapter is deconfigured, the associated
- * queues are reset, so let's set the status response code to 0
- * so the queue may be passed through (i.e., not filtered).
- */
- q->reset_status.response_code = 0;
+ case AP_RESPONSE_CHECKSTOPPED:
vfio_ap_free_aqic_resources(q);
break;
default:
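The _queue_passable() helper added above folds the NULL check and the set of acceptable reset response codes into one predicate, so the filter loop reduces to a single condition. A standalone sketch of the same shape — types and code names here are illustrative, not the driver's:

#include <linux/types.h>

enum resp { RESP_NORMAL, RESP_DECONFIGURED, RESP_CHECKSTOPPED, RESP_BUSY };

struct queue { enum resp reset_response; };

static bool queue_passable(const struct queue *q)
{
	if (!q)
		return false;

	switch (q->reset_response) {
	case RESP_NORMAL:
	case RESP_DECONFIGURED:
	case RESP_CHECKSTOPPED:	/* now also treated as successfully reset */
		return true;
	default:
		return false;
	}
}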
diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c
index 74200f54dfff..02c503f16bc2 100644
--- a/drivers/s390/crypto/zcrypt_api.c
+++ b/drivers/s390/crypto/zcrypt_api.c
@@ -12,6 +12,9 @@
* Multiple device nodes: Harald Freudenberger <freude@linux.ibm.com>
*/
+#define KMSG_COMPONENT "zcrypt"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
#include <linux/module.h>
#include <linux/init.h>
#include <linux/interrupt.h>
@@ -57,10 +60,6 @@ DEFINE_SPINLOCK(zcrypt_list_lock);
LIST_HEAD(zcrypt_card_list);
static atomic_t zcrypt_open_count = ATOMIC_INIT(0);
-static atomic_t zcrypt_rescan_count = ATOMIC_INIT(0);
-
-atomic_t zcrypt_rescan_req = ATOMIC_INIT(0);
-EXPORT_SYMBOL(zcrypt_rescan_req);
static LIST_HEAD(zcrypt_ops_list);
@@ -69,20 +68,15 @@ debug_info_t *zcrypt_dbf_info;
/*
* Process a rescan of the transport layer.
- *
- * Returns 1, if the rescan has been processed, otherwise 0.
+ * Runs a synchronous AP bus rescan.
+ * Returns true if something has changed (for example the
+ * bus scan found and set up new devices) and a retry is
+ * worthwhile. Otherwise false is returned, meaning nothing
+ * changed at the AP bus level.
*/
-static inline int zcrypt_process_rescan(void)
-{
- if (atomic_read(&zcrypt_rescan_req)) {
- atomic_set(&zcrypt_rescan_req, 0);
- atomic_inc(&zcrypt_rescan_count);
- ap_bus_force_rescan();
- ZCRYPT_DBF_INFO("%s rescan count=%07d\n", __func__,
- atomic_inc_return(&zcrypt_rescan_count));
- return 1;
- }
- return 0;
+static inline bool zcrypt_process_rescan(void)
+{
+ return ap_bus_force_rescan();
}
void zcrypt_msgtype_register(struct zcrypt_ops *zops)
@@ -715,8 +709,7 @@ static long zcrypt_rsa_modexpo(struct ap_perms *perms,
spin_unlock(&zcrypt_list_lock);
if (!pref_zq) {
- ZCRYPT_DBF_DBG("%s no matching queue found => ENODEV\n",
- __func__);
+ pr_debug("%s no matching queue found => ENODEV\n", __func__);
rc = -ENODEV;
goto out;
}
@@ -820,8 +813,7 @@ static long zcrypt_rsa_crt(struct ap_perms *perms,
spin_unlock(&zcrypt_list_lock);
if (!pref_zq) {
- ZCRYPT_DBF_DBG("%s no matching queue found => ENODEV\n",
- __func__);
+ pr_debug("%s no matching queue found => ENODEV\n", __func__);
rc = -ENODEV;
goto out;
}
@@ -865,6 +857,8 @@ static long _zcrypt_send_cprb(bool userspace, struct ap_perms *perms,
rc = prep_cca_ap_msg(userspace, xcrb, &ap_msg, &func_code, &domain);
if (rc)
goto out;
+ print_hex_dump_debug("ccareq: ", DUMP_PREFIX_ADDRESS, 16, 1,
+ ap_msg.msg, ap_msg.len, false);
tdom = *domain;
if (perms != &ap_perms && tdom < AP_DOMAINS) {
@@ -940,8 +934,8 @@ static long _zcrypt_send_cprb(bool userspace, struct ap_perms *perms,
spin_unlock(&zcrypt_list_lock);
if (!pref_zq) {
- ZCRYPT_DBF_DBG("%s no match for address %02x.%04x => ENODEV\n",
- __func__, xcrb->user_defined, *domain);
+ pr_debug("%s no match for address %02x.%04x => ENODEV\n",
+ __func__, xcrb->user_defined, *domain);
rc = -ENODEV;
goto out;
}
@@ -952,6 +946,10 @@ static long _zcrypt_send_cprb(bool userspace, struct ap_perms *perms,
*domain = AP_QID_QUEUE(qid);
rc = pref_zq->ops->send_cprb(userspace, pref_zq, xcrb, &ap_msg);
+ if (!rc) {
+ print_hex_dump_debug("ccarpl: ", DUMP_PREFIX_ADDRESS, 16, 1,
+ ap_msg.msg, ap_msg.len, false);
+ }
spin_lock(&zcrypt_list_lock);
zcrypt_drop_queue(pref_zc, pref_zq, mod, wgt);
@@ -970,7 +968,26 @@ out:
long zcrypt_send_cprb(struct ica_xcRB *xcrb)
{
- return _zcrypt_send_cprb(false, &ap_perms, NULL, xcrb);
+ struct zcrypt_track tr;
+ int rc;
+
+ memset(&tr, 0, sizeof(tr));
+
+ do {
+ rc = _zcrypt_send_cprb(false, &ap_perms, &tr, xcrb);
+ } while (rc == -EAGAIN && ++tr.again_counter < TRACK_AGAIN_MAX);
+
+ /* on ENODEV failure: retry once again after a requested rescan */
+ if (rc == -ENODEV && zcrypt_process_rescan())
+ do {
+ rc = _zcrypt_send_cprb(false, &ap_perms, &tr, xcrb);
+ } while (rc == -EAGAIN && ++tr.again_counter < TRACK_AGAIN_MAX);
+ if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX)
+ rc = -EIO;
+ if (rc)
+ pr_debug("%s rc=%d\n", __func__, rc);
+
+ return rc;
}
EXPORT_SYMBOL(zcrypt_send_cprb);
@@ -1045,6 +1062,8 @@ static long _zcrypt_send_ep11_cprb(bool userspace, struct ap_perms *perms,
rc = prep_ep11_ap_msg(userspace, xcrb, &ap_msg, &func_code, &domain);
if (rc)
goto out_free;
+ print_hex_dump_debug("ep11req: ", DUMP_PREFIX_ADDRESS, 16, 1,
+ ap_msg.msg, ap_msg.len, false);
if (perms != &ap_perms && domain < AUTOSEL_DOM) {
if (ap_msg.flags & AP_MSG_FLAG_ADMIN) {
@@ -1113,15 +1132,15 @@ static long _zcrypt_send_ep11_cprb(bool userspace, struct ap_perms *perms,
if (!pref_zq) {
if (targets && target_num == 1) {
- ZCRYPT_DBF_DBG("%s no match for address %02x.%04x => ENODEV\n",
- __func__, (int)targets->ap_id,
- (int)targets->dom_id);
+ pr_debug("%s no match for address %02x.%04x => ENODEV\n",
+ __func__, (int)targets->ap_id,
+ (int)targets->dom_id);
} else if (targets) {
- ZCRYPT_DBF_DBG("%s no match for %d target addrs => ENODEV\n",
- __func__, (int)target_num);
+ pr_debug("%s no match for %d target addrs => ENODEV\n",
+ __func__, (int)target_num);
} else {
- ZCRYPT_DBF_DBG("%s no match for address ff.ffff => ENODEV\n",
- __func__);
+ pr_debug("%s no match for address ff.ffff => ENODEV\n",
+ __func__);
}
rc = -ENODEV;
goto out_free;
@@ -1129,6 +1148,10 @@ static long _zcrypt_send_ep11_cprb(bool userspace, struct ap_perms *perms,
qid = pref_zq->queue->qid;
rc = pref_zq->ops->send_ep11_cprb(userspace, pref_zq, xcrb, &ap_msg);
+ if (!rc) {
+ print_hex_dump_debug("ep11rpl: ", DUMP_PREFIX_ADDRESS, 16, 1,
+ ap_msg.msg, ap_msg.len, false);
+ }
spin_lock(&zcrypt_list_lock);
zcrypt_drop_queue(pref_zc, pref_zq, mod, wgt);
@@ -1149,7 +1172,26 @@ out:
long zcrypt_send_ep11_cprb(struct ep11_urb *xcrb)
{
- return _zcrypt_send_ep11_cprb(false, &ap_perms, NULL, xcrb);
+ struct zcrypt_track tr;
+ int rc;
+
+ memset(&tr, 0, sizeof(tr));
+
+ do {
+ rc = _zcrypt_send_ep11_cprb(false, &ap_perms, &tr, xcrb);
+ } while (rc == -EAGAIN && ++tr.again_counter < TRACK_AGAIN_MAX);
+
+ /* on ENODEV failure: retry once again after a requested rescan */
+ if (rc == -ENODEV && zcrypt_process_rescan())
+ do {
+ rc = _zcrypt_send_ep11_cprb(false, &ap_perms, &tr, xcrb);
+ } while (rc == -EAGAIN && ++tr.again_counter < TRACK_AGAIN_MAX);
+ if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX)
+ rc = -EIO;
+ if (rc)
+ pr_debug("%s rc=%d\n", __func__, rc);
+
+ return rc;
}
EXPORT_SYMBOL(zcrypt_send_ep11_cprb);
@@ -1199,8 +1241,7 @@ static long zcrypt_rng(char *buffer)
spin_unlock(&zcrypt_list_lock);
if (!pref_zq) {
- ZCRYPT_DBF_DBG("%s no matching queue found => ENODEV\n",
- __func__);
+ pr_debug("%s no matching queue found => ENODEV\n", __func__);
rc = -ENODEV;
goto out;
}
@@ -1431,20 +1472,17 @@ static int icarsamodexpo_ioctl(struct ap_perms *perms, unsigned long arg)
do {
rc = zcrypt_rsa_modexpo(perms, &tr, &mex);
- if (rc == -EAGAIN)
- tr.again_counter++;
- } while (rc == -EAGAIN && tr.again_counter < TRACK_AGAIN_MAX);
- /* on failure: retry once again after a requested rescan */
- if ((rc == -ENODEV) && (zcrypt_process_rescan()))
+ } while (rc == -EAGAIN && ++tr.again_counter < TRACK_AGAIN_MAX);
+
+ /* on ENODEV failure: retry once again after a requested rescan */
+ if (rc == -ENODEV && zcrypt_process_rescan())
do {
rc = zcrypt_rsa_modexpo(perms, &tr, &mex);
- if (rc == -EAGAIN)
- tr.again_counter++;
- } while (rc == -EAGAIN && tr.again_counter < TRACK_AGAIN_MAX);
+ } while (rc == -EAGAIN && ++tr.again_counter < TRACK_AGAIN_MAX);
if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX)
rc = -EIO;
if (rc) {
- ZCRYPT_DBF_DBG("ioctl ICARSAMODEXPO rc=%d\n", rc);
+ pr_debug("ioctl ICARSAMODEXPO rc=%d\n", rc);
return rc;
}
return put_user(mex.outputdatalength, &umex->outputdatalength);
@@ -1463,20 +1501,17 @@ static int icarsacrt_ioctl(struct ap_perms *perms, unsigned long arg)
do {
rc = zcrypt_rsa_crt(perms, &tr, &crt);
- if (rc == -EAGAIN)
- tr.again_counter++;
- } while (rc == -EAGAIN && tr.again_counter < TRACK_AGAIN_MAX);
- /* on failure: retry once again after a requested rescan */
- if ((rc == -ENODEV) && (zcrypt_process_rescan()))
+ } while (rc == -EAGAIN && ++tr.again_counter < TRACK_AGAIN_MAX);
+
+ /* on ENODEV failure: retry once again after a requested rescan */
+ if (rc == -ENODEV && zcrypt_process_rescan())
do {
rc = zcrypt_rsa_crt(perms, &tr, &crt);
- if (rc == -EAGAIN)
- tr.again_counter++;
- } while (rc == -EAGAIN && tr.again_counter < TRACK_AGAIN_MAX);
+ } while (rc == -EAGAIN && ++tr.again_counter < TRACK_AGAIN_MAX);
if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX)
rc = -EIO;
if (rc) {
- ZCRYPT_DBF_DBG("ioctl ICARSACRT rc=%d\n", rc);
+ pr_debug("ioctl ICARSACRT rc=%d\n", rc);
return rc;
}
return put_user(crt.outputdatalength, &ucrt->outputdatalength);
@@ -1495,21 +1530,18 @@ static int zsecsendcprb_ioctl(struct ap_perms *perms, unsigned long arg)
do {
rc = _zcrypt_send_cprb(true, perms, &tr, &xcrb);
- if (rc == -EAGAIN)
- tr.again_counter++;
- } while (rc == -EAGAIN && tr.again_counter < TRACK_AGAIN_MAX);
- /* on failure: retry once again after a requested rescan */
- if ((rc == -ENODEV) && (zcrypt_process_rescan()))
+ } while (rc == -EAGAIN && ++tr.again_counter < TRACK_AGAIN_MAX);
+
+ /* on ENODEV failure: retry once again after a requested rescan */
+ if (rc == -ENODEV && zcrypt_process_rescan())
do {
rc = _zcrypt_send_cprb(true, perms, &tr, &xcrb);
- if (rc == -EAGAIN)
- tr.again_counter++;
- } while (rc == -EAGAIN && tr.again_counter < TRACK_AGAIN_MAX);
+ } while (rc == -EAGAIN && ++tr.again_counter < TRACK_AGAIN_MAX);
if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX)
rc = -EIO;
if (rc)
- ZCRYPT_DBF_DBG("ioctl ZSENDCPRB rc=%d status=0x%x\n",
- rc, xcrb.status);
+ pr_debug("ioctl ZSENDCPRB rc=%d status=0x%x\n",
+ rc, xcrb.status);
if (copy_to_user(uxcrb, &xcrb, sizeof(xcrb)))
return -EFAULT;
return rc;
@@ -1528,20 +1560,17 @@ static int zsendep11cprb_ioctl(struct ap_perms *perms, unsigned long arg)
do {
rc = _zcrypt_send_ep11_cprb(true, perms, &tr, &xcrb);
- if (rc == -EAGAIN)
- tr.again_counter++;
- } while (rc == -EAGAIN && tr.again_counter < TRACK_AGAIN_MAX);
- /* on failure: retry once again after a requested rescan */
- if ((rc == -ENODEV) && (zcrypt_process_rescan()))
+ } while (rc == -EAGAIN && ++tr.again_counter < TRACK_AGAIN_MAX);
+
+ /* on ENODEV failure: retry once again after a requested rescan */
+ if (rc == -ENODEV && zcrypt_process_rescan())
do {
rc = _zcrypt_send_ep11_cprb(true, perms, &tr, &xcrb);
- if (rc == -EAGAIN)
- tr.again_counter++;
- } while (rc == -EAGAIN && tr.again_counter < TRACK_AGAIN_MAX);
+ } while (rc == -EAGAIN && ++tr.again_counter < TRACK_AGAIN_MAX);
if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX)
rc = -EIO;
if (rc)
- ZCRYPT_DBF_DBG("ioctl ZSENDEP11CPRB rc=%d\n", rc);
+ pr_debug("ioctl ZSENDEP11CPRB rc=%d\n", rc);
if (copy_to_user(uxcrb, &xcrb, sizeof(xcrb)))
return -EFAULT;
return rc;
@@ -1670,7 +1699,7 @@ static long zcrypt_unlocked_ioctl(struct file *filp, unsigned int cmd,
}
/* unknown ioctl number */
default:
- ZCRYPT_DBF_DBG("unknown ioctl 0x%08x\n", cmd);
+ pr_debug("unknown ioctl 0x%08x\n", cmd);
return -ENOIOCTLCMD;
}
}
@@ -1708,16 +1737,13 @@ static long trans_modexpo32(struct ap_perms *perms, struct file *filp,
mex64.n_modulus = compat_ptr(mex32.n_modulus);
do {
rc = zcrypt_rsa_modexpo(perms, &tr, &mex64);
- if (rc == -EAGAIN)
- tr.again_counter++;
- } while (rc == -EAGAIN && tr.again_counter < TRACK_AGAIN_MAX);
- /* on failure: retry once again after a requested rescan */
- if ((rc == -ENODEV) && (zcrypt_process_rescan()))
+ } while (rc == -EAGAIN && ++tr.again_counter < TRACK_AGAIN_MAX);
+
+ /* on ENODEV failure: retry once again after a requested rescan */
+ if (rc == -ENODEV && zcrypt_process_rescan())
do {
rc = zcrypt_rsa_modexpo(perms, &tr, &mex64);
- if (rc == -EAGAIN)
- tr.again_counter++;
- } while (rc == -EAGAIN && tr.again_counter < TRACK_AGAIN_MAX);
+ } while (rc == -EAGAIN && ++tr.again_counter < TRACK_AGAIN_MAX);
if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX)
rc = -EIO;
if (rc)
@@ -1761,16 +1787,13 @@ static long trans_modexpo_crt32(struct ap_perms *perms, struct file *filp,
crt64.u_mult_inv = compat_ptr(crt32.u_mult_inv);
do {
rc = zcrypt_rsa_crt(perms, &tr, &crt64);
- if (rc == -EAGAIN)
- tr.again_counter++;
- } while (rc == -EAGAIN && tr.again_counter < TRACK_AGAIN_MAX);
- /* on failure: retry once again after a requested rescan */
- if ((rc == -ENODEV) && (zcrypt_process_rescan()))
+ } while (rc == -EAGAIN && ++tr.again_counter < TRACK_AGAIN_MAX);
+
+ /* on ENODEV failure: retry once again after a requested rescan */
+ if (rc == -ENODEV && zcrypt_process_rescan())
do {
rc = zcrypt_rsa_crt(perms, &tr, &crt64);
- if (rc == -EAGAIN)
- tr.again_counter++;
- } while (rc == -EAGAIN && tr.again_counter < TRACK_AGAIN_MAX);
+ } while (rc == -EAGAIN && ++tr.again_counter < TRACK_AGAIN_MAX);
if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX)
rc = -EIO;
if (rc)
@@ -1833,16 +1856,13 @@ static long trans_xcrb32(struct ap_perms *perms, struct file *filp,
xcrb64.status = xcrb32.status;
do {
rc = _zcrypt_send_cprb(true, perms, &tr, &xcrb64);
- if (rc == -EAGAIN)
- tr.again_counter++;
- } while (rc == -EAGAIN && tr.again_counter < TRACK_AGAIN_MAX);
- /* on failure: retry once again after a requested rescan */
- if ((rc == -ENODEV) && (zcrypt_process_rescan()))
+ } while (rc == -EAGAIN && ++tr.again_counter < TRACK_AGAIN_MAX);
+
+ /* on ENODEV failure: retry once again after a requested rescan */
+ if (rc == -ENODEV && zcrypt_process_rescan())
do {
rc = _zcrypt_send_cprb(true, perms, &tr, &xcrb64);
- if (rc == -EAGAIN)
- tr.again_counter++;
- } while (rc == -EAGAIN && tr.again_counter < TRACK_AGAIN_MAX);
+ } while (rc == -EAGAIN && ++tr.again_counter < TRACK_AGAIN_MAX);
if (rc == -EAGAIN && tr.again_counter >= TRACK_AGAIN_MAX)
rc = -EIO;
xcrb32.reply_control_blk_length = xcrb64.reply_control_blk_length;
@@ -1914,8 +1934,8 @@ static int zcrypt_rng_data_read(struct hwrng *rng, u32 *data)
*/
if (zcrypt_rng_buffer_index == 0) {
rc = zcrypt_rng((char *)zcrypt_rng_buffer);
- /* on failure: retry once again after a requested rescan */
- if ((rc == -ENODEV) && (zcrypt_process_rescan()))
+ /* on ENODEV failure: retry once again after an AP bus rescan */
+ if (rc == -ENODEV && zcrypt_process_rescan())
rc = zcrypt_rng((char *)zcrypt_rng_buffer);
if (rc < 0)
return -EIO;
@@ -1977,7 +1997,7 @@ void zcrypt_rng_device_remove(void)
* an asynchronous job. This function waits until these initial jobs
* are done and so the zcrypt api should be ready to serve crypto
* requests - if there are resources available. The function uses an
- * internal timeout of 60s. The very first caller will either wait for
+ * internal timeout of 30s. The very first caller will either wait for
* ap bus bindings complete or the timeout happens. This state will be
* remembered for further callers which will only be blocked until a
* decision is made (timeout or bindings complete).
@@ -1996,8 +2016,8 @@ int zcrypt_wait_api_operational(void)
switch (zcrypt_wait_api_state) {
case 0:
/* initial state, invoke wait for the ap bus complete */
- rc = ap_wait_init_apqn_bindings_complete(
- msecs_to_jiffies(60 * 1000));
+ rc = ap_wait_apqn_bindings_complete(
+ msecs_to_jiffies(ZCRYPT_WAIT_BINDINGS_COMPLETE_MS));
switch (rc) {
case 0:
/* ap bus bindings are complete */
@@ -2014,8 +2034,8 @@ int zcrypt_wait_api_operational(void)
break;
default:
/* other failure */
- ZCRYPT_DBF_DBG("%s ap_wait_init_apqn_bindings_complete()=%d\n",
- __func__, rc);
+ pr_debug("%s ap_wait_init_apqn_bindings_complete()=%d\n",
+ __func__, rc);
break;
}
break;
@@ -2038,7 +2058,7 @@ EXPORT_SYMBOL(zcrypt_wait_api_operational);
int __init zcrypt_debug_init(void)
{
zcrypt_dbf_info = debug_register("zcrypt", 2, 1,
- DBF_MAX_SPRINTF_ARGS * sizeof(long));
+ ZCRYPT_DBF_MAX_SPRINTF_ARGS * sizeof(long));
debug_register_view(zcrypt_dbf_info, &debug_sprintf_view);
debug_set_level(zcrypt_dbf_info, DBF_ERR);
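The zcrypt_send_cprb()/zcrypt_send_ep11_cprb() wrappers and the ioctl paths above now share one retry policy: loop on -EAGAIN up to TRACK_AGAIN_MAX, then on -ENODEV run one synchronous bus rescan and, only if the bus changed, repeat the capped loop. A self-contained sketch of that policy (the callbacks and the cap value are illustrative):

#include <linux/errno.h>
#include <linux/types.h>

#define RETRY_MAX 10	/* stand-in for TRACK_AGAIN_MAX */

static int send_with_retry(int (*send)(void *arg), void *arg,
			   bool (*rescan)(void))
{
	int again = 0, rc;

	do {
		rc = send(arg);
	} while (rc == -EAGAIN && ++again < RETRY_MAX);

	/* counter deliberately not reset, matching the patch above */
	if (rc == -ENODEV && rescan())
		do {
			rc = send(arg);
		} while (rc == -EAGAIN && ++again < RETRY_MAX);

	if (rc == -EAGAIN && again >= RETRY_MAX)
		rc = -EIO;	/* exhausted retries map to EIO */
	return rc;
}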
diff --git a/drivers/s390/crypto/zcrypt_api.h b/drivers/s390/crypto/zcrypt_api.h
index de659954c8f7..4ed481df57ca 100644
--- a/drivers/s390/crypto/zcrypt_api.h
+++ b/drivers/s390/crypto/zcrypt_api.h
@@ -38,6 +38,15 @@
*/
#define ZCRYPT_RNG_BUFFER_SIZE 4096
+/**
+ * The zcrypt_wait_api_operational() function waits this
+ * amount in milliseconds for ap_wait_apqn_bindings_complete().
+ * Also, on a CPRB send failure with ENODEV, the send functions
+ * trigger an AP bus rescan and wait this time in milliseconds
+ * for ap_wait_apqn_bindings_complete() before resending.
+ */
+#define ZCRYPT_WAIT_BINDINGS_COMPLETE_MS 30000
+
/*
* Identifier for Crypto Request Performance Index
*/
diff --git a/drivers/s390/crypto/zcrypt_ccamisc.c b/drivers/s390/crypto/zcrypt_ccamisc.c
index 263fe182648b..0a3a678ffc7e 100644
--- a/drivers/s390/crypto/zcrypt_ccamisc.c
+++ b/drivers/s390/crypto/zcrypt_ccamisc.c
@@ -23,11 +23,6 @@
#include "zcrypt_msgtype6.h"
#include "zcrypt_ccamisc.h"
-#define DEBUG_DBG(...) ZCRYPT_DBF(DBF_DEBUG, ##__VA_ARGS__)
-#define DEBUG_INFO(...) ZCRYPT_DBF(DBF_INFO, ##__VA_ARGS__)
-#define DEBUG_WARN(...) ZCRYPT_DBF(DBF_WARN, ##__VA_ARGS__)
-#define DEBUG_ERR(...) ZCRYPT_DBF(DBF_ERR, ##__VA_ARGS__)
-
/* Size of parameter block used for all cca requests/replies */
#define PARMBSIZE 512
@@ -367,8 +362,8 @@ int cca_genseckey(u16 cardnr, u16 domain,
memcpy(preqparm->lv1.key_length, "KEYLN32 ", 8);
break;
default:
- DEBUG_ERR("%s unknown/unsupported keybitsize %d\n",
- __func__, keybitsize);
+ ZCRYPT_DBF_ERR("%s unknown/unsupported keybitsize %d\n",
+ __func__, keybitsize);
rc = -EINVAL;
goto out;
}
@@ -386,15 +381,15 @@ int cca_genseckey(u16 cardnr, u16 domain,
/* forward xcrb with request CPRB and reply CPRB to zcrypt dd */
rc = zcrypt_send_cprb(&xcrb);
if (rc) {
- DEBUG_ERR("%s zcrypt_send_cprb (cardnr=%d domain=%d) failed, errno %d\n",
- __func__, (int)cardnr, (int)domain, rc);
+ ZCRYPT_DBF_ERR("%s zcrypt_send_cprb (cardnr=%d domain=%d) failed, errno %d\n",
+ __func__, (int)cardnr, (int)domain, rc);
goto out;
}
/* check response returncode and reasoncode */
if (prepcblk->ccp_rtcode != 0) {
- DEBUG_ERR("%s secure key generate failure, card response %d/%d\n",
- __func__,
+ ZCRYPT_DBF_ERR("%s secure key generate failure, card response %d/%d\n",
+ __func__,
(int)prepcblk->ccp_rtcode,
(int)prepcblk->ccp_rscode);
rc = -EIO;
@@ -411,8 +406,8 @@ int cca_genseckey(u16 cardnr, u16 domain,
- sizeof(prepparm->lv3.keyblock.toklen)
- sizeof(prepparm->lv3.keyblock.tokattr);
if (seckeysize != SECKEYBLOBSIZE) {
- DEBUG_ERR("%s secure token size mismatch %d != %d bytes\n",
- __func__, seckeysize, SECKEYBLOBSIZE);
+ ZCRYPT_DBF_ERR("%s secure token size mismatch %d != %d bytes\n",
+ __func__, seckeysize, SECKEYBLOBSIZE);
rc = -EIO;
goto out;
}
@@ -505,8 +500,8 @@ int cca_clr2seckey(u16 cardnr, u16 domain, u32 keybitsize,
keysize = 32;
break;
default:
- DEBUG_ERR("%s unknown/unsupported keybitsize %d\n",
- __func__, keybitsize);
+ ZCRYPT_DBF_ERR("%s unknown/unsupported keybitsize %d\n",
+ __func__, keybitsize);
rc = -EINVAL;
goto out;
}
@@ -524,17 +519,17 @@ int cca_clr2seckey(u16 cardnr, u16 domain, u32 keybitsize,
/* forward xcrb with request CPRB and reply CPRB to zcrypt dd */
rc = zcrypt_send_cprb(&xcrb);
if (rc) {
- DEBUG_ERR("%s zcrypt_send_cprb (cardnr=%d domain=%d) failed, rc=%d\n",
- __func__, (int)cardnr, (int)domain, rc);
+ ZCRYPT_DBF_ERR("%s zcrypt_send_cprb (cardnr=%d domain=%d) failed, rc=%d\n",
+ __func__, (int)cardnr, (int)domain, rc);
goto out;
}
/* check response returncode and reasoncode */
if (prepcblk->ccp_rtcode != 0) {
- DEBUG_ERR("%s clear key import failure, card response %d/%d\n",
- __func__,
- (int)prepcblk->ccp_rtcode,
- (int)prepcblk->ccp_rscode);
+ ZCRYPT_DBF_ERR("%s clear key import failure, card response %d/%d\n",
+ __func__,
+ (int)prepcblk->ccp_rtcode,
+ (int)prepcblk->ccp_rscode);
rc = -EIO;
goto out;
}
@@ -549,8 +544,8 @@ int cca_clr2seckey(u16 cardnr, u16 domain, u32 keybitsize,
- sizeof(prepparm->lv3.keyblock.toklen)
- sizeof(prepparm->lv3.keyblock.tokattr);
if (seckeysize != SECKEYBLOBSIZE) {
- DEBUG_ERR("%s secure token size mismatch %d != %d bytes\n",
- __func__, seckeysize, SECKEYBLOBSIZE);
+ ZCRYPT_DBF_ERR("%s secure token size mismatch %d != %d bytes\n",
+ __func__, seckeysize, SECKEYBLOBSIZE);
rc = -EIO;
goto out;
}
@@ -651,17 +646,17 @@ int cca_sec2protkey(u16 cardnr, u16 domain,
/* forward xcrb with request CPRB and reply CPRB to zcrypt dd */
rc = zcrypt_send_cprb(&xcrb);
if (rc) {
- DEBUG_ERR("%s zcrypt_send_cprb (cardnr=%d domain=%d) failed, rc=%d\n",
- __func__, (int)cardnr, (int)domain, rc);
+ ZCRYPT_DBF_ERR("%s zcrypt_send_cprb (cardnr=%d domain=%d) failed, rc=%d\n",
+ __func__, (int)cardnr, (int)domain, rc);
goto out;
}
/* check response returncode and reasoncode */
if (prepcblk->ccp_rtcode != 0) {
- DEBUG_ERR("%s unwrap secure key failure, card response %d/%d\n",
- __func__,
- (int)prepcblk->ccp_rtcode,
- (int)prepcblk->ccp_rscode);
+ ZCRYPT_DBF_ERR("%s unwrap secure key failure, card response %d/%d\n",
+ __func__,
+ (int)prepcblk->ccp_rtcode,
+ (int)prepcblk->ccp_rscode);
if (prepcblk->ccp_rtcode == 8 && prepcblk->ccp_rscode == 2290)
rc = -EAGAIN;
else
@@ -669,10 +664,10 @@ int cca_sec2protkey(u16 cardnr, u16 domain,
goto out;
}
if (prepcblk->ccp_rscode != 0) {
- DEBUG_WARN("%s unwrap secure key warning, card response %d/%d\n",
- __func__,
- (int)prepcblk->ccp_rtcode,
- (int)prepcblk->ccp_rscode);
+ ZCRYPT_DBF_WARN("%s unwrap secure key warning, card response %d/%d\n",
+ __func__,
+ (int)prepcblk->ccp_rtcode,
+ (int)prepcblk->ccp_rscode);
}
/* process response cprb param block */
@@ -683,8 +678,8 @@ int cca_sec2protkey(u16 cardnr, u16 domain,
/* check the returned keyblock */
if (prepparm->lv3.ckb.version != 0x01 &&
prepparm->lv3.ckb.version != 0x02) {
- DEBUG_ERR("%s reply param keyblock version mismatch 0x%02x\n",
- __func__, (int)prepparm->lv3.ckb.version);
+ ZCRYPT_DBF_ERR("%s reply param keyblock version mismatch 0x%02x\n",
+ __func__, (int)prepparm->lv3.ckb.version);
rc = -EIO;
goto out;
}
@@ -707,8 +702,8 @@ int cca_sec2protkey(u16 cardnr, u16 domain,
*protkeytype = PKEY_KEYTYPE_AES_256;
break;
default:
- DEBUG_ERR("%s unknown/unsupported keylen %d\n",
- __func__, prepparm->lv3.ckb.len);
+ ZCRYPT_DBF_ERR("%s unknown/unsupported keylen %d\n",
+ __func__, prepparm->lv3.ckb.len);
rc = -EIO;
goto out;
}
@@ -840,9 +835,8 @@ int cca_gencipherkey(u16 cardnr, u16 domain, u32 keybitsize, u32 keygenflags,
case 256:
break;
default:
- DEBUG_ERR(
- "%s unknown/unsupported keybitsize %d\n",
- __func__, keybitsize);
+ ZCRYPT_DBF_ERR("%s unknown/unsupported keybitsize %d\n",
+ __func__, keybitsize);
rc = -EINVAL;
goto out;
}
@@ -880,19 +874,17 @@ int cca_gencipherkey(u16 cardnr, u16 domain, u32 keybitsize, u32 keygenflags,
/* forward xcrb with request CPRB and reply CPRB to zcrypt dd */
rc = zcrypt_send_cprb(&xcrb);
if (rc) {
- DEBUG_ERR(
- "%s zcrypt_send_cprb (cardnr=%d domain=%d) failed, rc=%d\n",
- __func__, (int)cardnr, (int)domain, rc);
+ ZCRYPT_DBF_ERR("%s zcrypt_send_cprb (cardnr=%d domain=%d) failed, rc=%d\n",
+ __func__, (int)cardnr, (int)domain, rc);
goto out;
}
/* check response returncode and reasoncode */
if (prepcblk->ccp_rtcode != 0) {
- DEBUG_ERR(
- "%s cipher key generate failure, card response %d/%d\n",
- __func__,
- (int)prepcblk->ccp_rtcode,
- (int)prepcblk->ccp_rscode);
+ ZCRYPT_DBF_ERR("%s cipher key generate failure, card response %d/%d\n",
+ __func__,
+ (int)prepcblk->ccp_rtcode,
+ (int)prepcblk->ccp_rscode);
rc = -EIO;
goto out;
}
@@ -905,8 +897,8 @@ int cca_gencipherkey(u16 cardnr, u16 domain, u32 keybitsize, u32 keygenflags,
/* do some plausibility checks on the key block */
if (prepparm->kb.len < 120 + 5 * sizeof(uint16_t) ||
prepparm->kb.len > 136 + 5 * sizeof(uint16_t)) {
- DEBUG_ERR("%s reply with invalid or unknown key block\n",
- __func__);
+ ZCRYPT_DBF_ERR("%s reply with invalid or unknown key block\n",
+ __func__);
rc = -EIO;
goto out;
}
@@ -1048,19 +1040,17 @@ static int _ip_cprb_helper(u16 cardnr, u16 domain,
/* forward xcrb with request CPRB and reply CPRB to zcrypt dd */
rc = zcrypt_send_cprb(&xcrb);
if (rc) {
- DEBUG_ERR(
- "%s zcrypt_send_cprb (cardnr=%d domain=%d) failed, rc=%d\n",
- __func__, (int)cardnr, (int)domain, rc);
+ ZCRYPT_DBF_ERR("%s zcrypt_send_cprb (cardnr=%d domain=%d) failed, rc=%d\n",
+ __func__, (int)cardnr, (int)domain, rc);
goto out;
}
/* check response returncode and reasoncode */
if (prepcblk->ccp_rtcode != 0) {
- DEBUG_ERR(
- "%s CSNBKPI2 failure, card response %d/%d\n",
- __func__,
- (int)prepcblk->ccp_rtcode,
- (int)prepcblk->ccp_rscode);
+ ZCRYPT_DBF_ERR("%s CSNBKPI2 failure, card response %d/%d\n",
+ __func__,
+ (int)prepcblk->ccp_rtcode,
+ (int)prepcblk->ccp_rscode);
rc = -EIO;
goto out;
}
@@ -1073,8 +1063,8 @@ static int _ip_cprb_helper(u16 cardnr, u16 domain,
/* do some plausibility checks on the key block */
if (prepparm->kb.len < 120 + 3 * sizeof(uint16_t) ||
prepparm->kb.len > 136 + 3 * sizeof(uint16_t)) {
- DEBUG_ERR("%s reply with invalid or unknown key block\n",
- __func__);
+ ZCRYPT_DBF_ERR("%s reply with invalid or unknown key block\n",
+ __func__);
rc = -EIO;
goto out;
}
@@ -1132,33 +1122,29 @@ int cca_clr2cipherkey(u16 card, u16 dom, u32 keybitsize, u32 keygenflags,
rc = _ip_cprb_helper(card, dom, "AES ", "FIRST ", "MIN3PART",
exorbuf, keybitsize, token, &tokensize);
if (rc) {
- DEBUG_ERR(
- "%s clear key import 1/4 with CSNBKPI2 failed, rc=%d\n",
- __func__, rc);
+ ZCRYPT_DBF_ERR("%s clear key import 1/4 with CSNBKPI2 failed, rc=%d\n",
+ __func__, rc);
goto out;
}
rc = _ip_cprb_helper(card, dom, "AES ", "ADD-PART", NULL,
clrkey, keybitsize, token, &tokensize);
if (rc) {
- DEBUG_ERR(
- "%s clear key import 2/4 with CSNBKPI2 failed, rc=%d\n",
- __func__, rc);
+ ZCRYPT_DBF_ERR("%s clear key import 2/4 with CSNBKPI2 failed, rc=%d\n",
+ __func__, rc);
goto out;
}
rc = _ip_cprb_helper(card, dom, "AES ", "ADD-PART", NULL,
exorbuf, keybitsize, token, &tokensize);
if (rc) {
- DEBUG_ERR(
- "%s clear key import 3/4 with CSNBKPI2 failed, rc=%d\n",
- __func__, rc);
+ ZCRYPT_DBF_ERR("%s clear key import 3/4 with CSNBKPI2 failed, rc=%d\n",
+ __func__, rc);
goto out;
}
rc = _ip_cprb_helper(card, dom, "AES ", "COMPLETE", NULL,
NULL, keybitsize, token, &tokensize);
if (rc) {
- DEBUG_ERR(
- "%s clear key import 4/4 with CSNBKPI2 failed, rc=%d\n",
- __func__, rc);
+ ZCRYPT_DBF_ERR("%s clear key import 4/4 with CSNBKPI2 failed, rc=%d\n",
+ __func__, rc);
goto out;
}
@@ -1265,19 +1251,17 @@ int cca_cipher2protkey(u16 cardnr, u16 domain, const u8 *ckey,
/* forward xcrb with request CPRB and reply CPRB to zcrypt dd */
rc = zcrypt_send_cprb(&xcrb);
if (rc) {
- DEBUG_ERR(
- "%s zcrypt_send_cprb (cardnr=%d domain=%d) failed, rc=%d\n",
- __func__, (int)cardnr, (int)domain, rc);
+ ZCRYPT_DBF_ERR("%s zcrypt_send_cprb (cardnr=%d domain=%d) failed, rc=%d\n",
+ __func__, (int)cardnr, (int)domain, rc);
goto out;
}
/* check response returncode and reasoncode */
if (prepcblk->ccp_rtcode != 0) {
- DEBUG_ERR(
- "%s unwrap secure key failure, card response %d/%d\n",
- __func__,
- (int)prepcblk->ccp_rtcode,
- (int)prepcblk->ccp_rscode);
+ ZCRYPT_DBF_ERR("%s unwrap secure key failure, card response %d/%d\n",
+ __func__,
+ (int)prepcblk->ccp_rtcode,
+ (int)prepcblk->ccp_rscode);
if (prepcblk->ccp_rtcode == 8 && prepcblk->ccp_rscode == 2290)
rc = -EAGAIN;
else
@@ -1285,11 +1269,10 @@ int cca_cipher2protkey(u16 cardnr, u16 domain, const u8 *ckey,
goto out;
}
if (prepcblk->ccp_rscode != 0) {
- DEBUG_WARN(
- "%s unwrap secure key warning, card response %d/%d\n",
- __func__,
- (int)prepcblk->ccp_rtcode,
- (int)prepcblk->ccp_rscode);
+ ZCRYPT_DBF_WARN("%s unwrap secure key warning, card response %d/%d\n",
+ __func__,
+ (int)prepcblk->ccp_rtcode,
+ (int)prepcblk->ccp_rscode);
}
/* process response cprb param block */
@@ -1300,15 +1283,14 @@ int cca_cipher2protkey(u16 cardnr, u16 domain, const u8 *ckey,
/* check the returned keyblock */
if (prepparm->vud.ckb.version != 0x01 &&
prepparm->vud.ckb.version != 0x02) {
- DEBUG_ERR("%s reply param keyblock version mismatch 0x%02x\n",
- __func__, (int)prepparm->vud.ckb.version);
+ ZCRYPT_DBF_ERR("%s reply param keyblock version mismatch 0x%02x\n",
+ __func__, (int)prepparm->vud.ckb.version);
rc = -EIO;
goto out;
}
if (prepparm->vud.ckb.algo != 0x02) {
- DEBUG_ERR(
- "%s reply param keyblock algo mismatch 0x%02x != 0x02\n",
- __func__, (int)prepparm->vud.ckb.algo);
+ ZCRYPT_DBF_ERR("%s reply param keyblock algo mismatch 0x%02x != 0x02\n",
+ __func__, (int)prepparm->vud.ckb.algo);
rc = -EIO;
goto out;
}
@@ -1331,8 +1313,8 @@ int cca_cipher2protkey(u16 cardnr, u16 domain, const u8 *ckey,
*protkeytype = PKEY_KEYTYPE_AES_256;
break;
default:
- DEBUG_ERR("%s unknown/unsupported keylen %d\n",
- __func__, prepparm->vud.ckb.keylen);
+ ZCRYPT_DBF_ERR("%s unknown/unsupported keylen %d\n",
+ __func__, prepparm->vud.ckb.keylen);
rc = -EIO;
goto out;
}
@@ -1432,19 +1414,17 @@ int cca_ecc2protkey(u16 cardnr, u16 domain, const u8 *key,
/* forward xcrb with request CPRB and reply CPRB to zcrypt dd */
rc = zcrypt_send_cprb(&xcrb);
if (rc) {
- DEBUG_ERR(
- "%s zcrypt_send_cprb (cardnr=%d domain=%d) failed, rc=%d\n",
- __func__, (int)cardnr, (int)domain, rc);
+ ZCRYPT_DBF_ERR("%s zcrypt_send_cprb (cardnr=%d domain=%d) failed, rc=%d\n",
+ __func__, (int)cardnr, (int)domain, rc);
goto out;
}
/* check response returncode and reasoncode */
if (prepcblk->ccp_rtcode != 0) {
- DEBUG_ERR(
- "%s unwrap secure key failure, card response %d/%d\n",
- __func__,
- (int)prepcblk->ccp_rtcode,
- (int)prepcblk->ccp_rscode);
+ ZCRYPT_DBF_ERR("%s unwrap secure key failure, card response %d/%d\n",
+ __func__,
+ (int)prepcblk->ccp_rtcode,
+ (int)prepcblk->ccp_rscode);
if (prepcblk->ccp_rtcode == 8 && prepcblk->ccp_rscode == 2290)
rc = -EAGAIN;
else
@@ -1452,11 +1432,10 @@ int cca_ecc2protkey(u16 cardnr, u16 domain, const u8 *key,
goto out;
}
if (prepcblk->ccp_rscode != 0) {
- DEBUG_WARN(
- "%s unwrap secure key warning, card response %d/%d\n",
- __func__,
- (int)prepcblk->ccp_rtcode,
- (int)prepcblk->ccp_rscode);
+ ZCRYPT_DBF_WARN("%s unwrap secure key warning, card response %d/%d\n",
+ __func__,
+ (int)prepcblk->ccp_rtcode,
+ (int)prepcblk->ccp_rscode);
}
/* process response cprb param block */
@@ -1466,23 +1445,22 @@ int cca_ecc2protkey(u16 cardnr, u16 domain, const u8 *key,
/* check the returned keyblock */
if (prepparm->vud.ckb.version != 0x02) {
- DEBUG_ERR("%s reply param keyblock version mismatch 0x%02x != 0x02\n",
- __func__, (int)prepparm->vud.ckb.version);
+ ZCRYPT_DBF_ERR("%s reply param keyblock version mismatch 0x%02x != 0x02\n",
+ __func__, (int)prepparm->vud.ckb.version);
rc = -EIO;
goto out;
}
if (prepparm->vud.ckb.algo != 0x81) {
- DEBUG_ERR(
- "%s reply param keyblock algo mismatch 0x%02x != 0x81\n",
- __func__, (int)prepparm->vud.ckb.algo);
+ ZCRYPT_DBF_ERR("%s reply param keyblock algo mismatch 0x%02x != 0x81\n",
+ __func__, (int)prepparm->vud.ckb.algo);
rc = -EIO;
goto out;
}
/* copy the translated protected key */
if (prepparm->vud.ckb.keylen > *protkeylen) {
- DEBUG_ERR("%s prot keylen mismatch %d > buffersize %u\n",
- __func__, prepparm->vud.ckb.keylen, *protkeylen);
+ ZCRYPT_DBF_ERR("%s prot keylen mismatch %d > buffersize %u\n",
+ __func__, prepparm->vud.ckb.keylen, *protkeylen);
rc = -EIO;
goto out;
}
@@ -1550,17 +1528,17 @@ int cca_query_crypto_facility(u16 cardnr, u16 domain,
/* forward xcrb with request CPRB and reply CPRB to zcrypt dd */
rc = zcrypt_send_cprb(&xcrb);
if (rc) {
- DEBUG_ERR("%s zcrypt_send_cprb (cardnr=%d domain=%d) failed, rc=%d\n",
- __func__, (int)cardnr, (int)domain, rc);
+ ZCRYPT_DBF_ERR("%s zcrypt_send_cprb (cardnr=%d domain=%d) failed, rc=%d\n",
+ __func__, (int)cardnr, (int)domain, rc);
goto out;
}
/* check response returncode and reasoncode */
if (prepcblk->ccp_rtcode != 0) {
- DEBUG_ERR("%s unwrap secure key failure, card response %d/%d\n",
- __func__,
- (int)prepcblk->ccp_rtcode,
- (int)prepcblk->ccp_rscode);
+ ZCRYPT_DBF_ERR("%s unwrap secure key failure, card response %d/%d\n",
+ __func__,
+ (int)prepcblk->ccp_rtcode,
+ (int)prepcblk->ccp_rscode);
rc = -EIO;
goto out;
}
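Every CCA helper above follows the same response validation: a nonzero ccp_rtcode is an error, with the specific 8/2290 return/reason pair mapped to -EAGAIN — a condition the driver treats as transient and retryable. Sketched as a standalone helper (names illustrative):

#include <linux/errno.h>

static int check_cca_response(int rtcode, int rscode)
{
	if (rtcode == 0)
		return 0;	/* success; a nonzero rscode is only a warning */
	if (rtcode == 8 && rscode == 2290)
		return -EAGAIN;	/* transient, let the caller's retry loop run */
	return -EIO;
}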
diff --git a/drivers/s390/crypto/zcrypt_debug.h b/drivers/s390/crypto/zcrypt_debug.h
index 5cf88aabd64b..9a208dc4c200 100644
--- a/drivers/s390/crypto/zcrypt_debug.h
+++ b/drivers/s390/crypto/zcrypt_debug.h
@@ -17,7 +17,7 @@
#define RC2ERR(rc) ((rc) ? DBF_ERR : DBF_INFO)
#define RC2WARN(rc) ((rc) ? DBF_WARN : DBF_INFO)
-#define DBF_MAX_SPRINTF_ARGS 6
+#define ZCRYPT_DBF_MAX_SPRINTF_ARGS 6
#define ZCRYPT_DBF(...) \
debug_sprintf_event(zcrypt_dbf_info, ##__VA_ARGS__)
@@ -27,8 +27,6 @@
debug_sprintf_event(zcrypt_dbf_info, DBF_WARN, ##__VA_ARGS__)
#define ZCRYPT_DBF_INFO(...) \
debug_sprintf_event(zcrypt_dbf_info, DBF_INFO, ##__VA_ARGS__)
-#define ZCRYPT_DBF_DBG(...) \
- debug_sprintf_event(zcrypt_dbf_info, DBF_DEBUG, ##__VA_ARGS__)
extern debug_info_t *zcrypt_dbf_info;
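With ZCRYPT_DBF_DBG gone, the remaining ZCRYPT_DBF_* levels still ride on the s390 debug feature set up in zcrypt_debug_init() above. A sketch of that registration pattern; the name and buffer geometry below are illustrative:

#include <asm/debug.h>
#include <linux/errno.h>
#include <linux/init.h>

static debug_info_t *example_dbf;

static int __init example_debug_init(void)
{
	/* 2 pages, 1 area, room for 6 sprintf arguments per event */
	example_dbf = debug_register("example", 2, 1, 6 * sizeof(long));
	if (!example_dbf)
		return -ENOMEM;
	debug_register_view(example_dbf, &debug_sprintf_view);
	debug_set_level(example_dbf, DBF_ERR);
	return 0;
}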
diff --git a/drivers/s390/crypto/zcrypt_ep11misc.c b/drivers/s390/crypto/zcrypt_ep11misc.c
index 0a877f9792c2..eb7f5489ccf9 100644
--- a/drivers/s390/crypto/zcrypt_ep11misc.c
+++ b/drivers/s390/crypto/zcrypt_ep11misc.c
@@ -24,11 +24,6 @@
#include "zcrypt_ep11misc.h"
#include "zcrypt_ccamisc.h"
-#define DEBUG_DBG(...) ZCRYPT_DBF(DBF_DEBUG, ##__VA_ARGS__)
-#define DEBUG_INFO(...) ZCRYPT_DBF(DBF_INFO, ##__VA_ARGS__)
-#define DEBUG_WARN(...) ZCRYPT_DBF(DBF_WARN, ##__VA_ARGS__)
-#define DEBUG_ERR(...) ZCRYPT_DBF(DBF_ERR, ##__VA_ARGS__)
-
#define EP11_PINBLOB_V1_BYTES 56
/* default iv used here */
@@ -510,7 +505,7 @@ static int check_reply_pl(const u8 *pl, const char *func)
/* start tag */
if (*pl++ != 0x30) {
- DEBUG_ERR("%s reply start tag mismatch\n", func);
+ ZCRYPT_DBF_ERR("%s reply start tag mismatch\n", func);
return -EIO;
}
@@ -527,40 +522,41 @@ static int check_reply_pl(const u8 *pl, const char *func)
len = *((u16 *)pl);
pl += 2;
} else {
- DEBUG_ERR("%s reply start tag lenfmt mismatch 0x%02hhx\n",
- func, *pl);
+ ZCRYPT_DBF_ERR("%s reply start tag lenfmt mismatch 0x%02hhx\n",
+ func, *pl);
return -EIO;
}
/* len should cover at least 3 fields with 32 bit value each */
if (len < 3 * 6) {
- DEBUG_ERR("%s reply length %d too small\n", func, len);
+ ZCRYPT_DBF_ERR("%s reply length %d too small\n", func, len);
return -EIO;
}
/* function tag, length and value */
if (pl[0] != 0x04 || pl[1] != 0x04) {
- DEBUG_ERR("%s function tag or length mismatch\n", func);
+ ZCRYPT_DBF_ERR("%s function tag or length mismatch\n", func);
return -EIO;
}
pl += 6;
/* dom tag, length and value */
if (pl[0] != 0x04 || pl[1] != 0x04) {
- DEBUG_ERR("%s dom tag or length mismatch\n", func);
+ ZCRYPT_DBF_ERR("%s dom tag or length mismatch\n", func);
return -EIO;
}
pl += 6;
/* return value tag, length and value */
if (pl[0] != 0x04 || pl[1] != 0x04) {
- DEBUG_ERR("%s return value tag or length mismatch\n", func);
+ ZCRYPT_DBF_ERR("%s return value tag or length mismatch\n",
+ func);
return -EIO;
}
pl += 2;
ret = *((u32 *)pl);
if (ret != 0) {
- DEBUG_ERR("%s return value 0x%04x != 0\n", func, ret);
+ ZCRYPT_DBF_ERR("%s return value 0x%04x != 0\n", func, ret);
return -EIO;
}
@@ -626,9 +622,8 @@ static int ep11_query_info(u16 cardnr, u16 domain, u32 query_type,
rc = zcrypt_send_ep11_cprb(urb);
if (rc) {
- DEBUG_ERR(
- "%s zcrypt_send_ep11_cprb(card=%d dom=%d) failed, rc=%d\n",
- __func__, (int)cardnr, (int)domain, rc);
+ ZCRYPT_DBF_ERR("%s zcrypt_send_ep11_cprb(card=%d dom=%d) failed, rc=%d\n",
+ __func__, (int)cardnr, (int)domain, rc);
goto out;
}
@@ -636,13 +631,13 @@ static int ep11_query_info(u16 cardnr, u16 domain, u32 query_type,
if (rc)
goto out;
if (rep_pl->data_tag != 0x04 || rep_pl->data_lenfmt != 0x82) {
- DEBUG_ERR("%s unknown reply data format\n", __func__);
+ ZCRYPT_DBF_ERR("%s unknown reply data format\n", __func__);
rc = -EIO;
goto out;
}
if (rep_pl->data_len > buflen) {
- DEBUG_ERR("%s mismatch between reply data len and buffer len\n",
- __func__);
+ ZCRYPT_DBF_ERR("%s mismatch between reply data len and buffer len\n",
+ __func__);
rc = -ENOSPC;
goto out;
}
@@ -816,9 +811,8 @@ static int _ep11_genaeskey(u16 card, u16 domain,
case 256:
break;
default:
- DEBUG_ERR(
- "%s unknown/unsupported keybitsize %d\n",
- __func__, keybitsize);
+ ZCRYPT_DBF_ERR("%s unknown/unsupported keybitsize %d\n",
+ __func__, keybitsize);
rc = -EINVAL;
goto out;
}
@@ -878,9 +872,8 @@ static int _ep11_genaeskey(u16 card, u16 domain,
rc = zcrypt_send_ep11_cprb(urb);
if (rc) {
- DEBUG_ERR(
- "%s zcrypt_send_ep11_cprb(card=%d dom=%d) failed, rc=%d\n",
- __func__, (int)card, (int)domain, rc);
+ ZCRYPT_DBF_ERR("%s zcrypt_send_ep11_cprb(card=%d dom=%d) failed, rc=%d\n",
+ __func__, (int)card, (int)domain, rc);
goto out;
}
@@ -888,13 +881,13 @@ static int _ep11_genaeskey(u16 card, u16 domain,
if (rc)
goto out;
if (rep_pl->data_tag != 0x04 || rep_pl->data_lenfmt != 0x82) {
- DEBUG_ERR("%s unknown reply data format\n", __func__);
+ ZCRYPT_DBF_ERR("%s unknown reply data format\n", __func__);
rc = -EIO;
goto out;
}
if (rep_pl->data_len > *keybufsize) {
- DEBUG_ERR("%s mismatch reply data len / key buffer len\n",
- __func__);
+ ZCRYPT_DBF_ERR("%s mismatch reply data len / key buffer len\n",
+ __func__);
rc = -ENOSPC;
goto out;
}
@@ -1030,9 +1023,8 @@ static int ep11_cryptsingle(u16 card, u16 domain,
rc = zcrypt_send_ep11_cprb(urb);
if (rc) {
- DEBUG_ERR(
- "%s zcrypt_send_ep11_cprb(card=%d dom=%d) failed, rc=%d\n",
- __func__, (int)card, (int)domain, rc);
+ ZCRYPT_DBF_ERR("%s zcrypt_send_ep11_cprb(card=%d dom=%d) failed, rc=%d\n",
+ __func__, (int)card, (int)domain, rc);
goto out;
}
@@ -1040,7 +1032,7 @@ static int ep11_cryptsingle(u16 card, u16 domain,
if (rc)
goto out;
if (rep_pl->data_tag != 0x04) {
- DEBUG_ERR("%s unknown reply data format\n", __func__);
+ ZCRYPT_DBF_ERR("%s unknown reply data format\n", __func__);
rc = -EIO;
goto out;
}
@@ -1053,14 +1045,14 @@ static int ep11_cryptsingle(u16 card, u16 domain,
n = *((u16 *)p);
p += 2;
} else {
- DEBUG_ERR("%s unknown reply data length format 0x%02hhx\n",
- __func__, rep_pl->data_lenfmt);
+ ZCRYPT_DBF_ERR("%s unknown reply data length format 0x%02hhx\n",
+ __func__, rep_pl->data_lenfmt);
rc = -EIO;
goto out;
}
if (n > *outbufsize) {
- DEBUG_ERR("%s mismatch reply data len %d / output buffer %zu\n",
- __func__, n, *outbufsize);
+ ZCRYPT_DBF_ERR("%s mismatch reply data len %d / output buffer %zu\n",
+ __func__, n, *outbufsize);
rc = -ENOSPC;
goto out;
}
@@ -1188,9 +1180,8 @@ static int _ep11_unwrapkey(u16 card, u16 domain,
rc = zcrypt_send_ep11_cprb(urb);
if (rc) {
- DEBUG_ERR(
- "%s zcrypt_send_ep11_cprb(card=%d dom=%d) failed, rc=%d\n",
- __func__, (int)card, (int)domain, rc);
+ ZCRYPT_DBF_ERR("%s zcrypt_send_ep11_cprb(card=%d dom=%d) failed, rc=%d\n",
+ __func__, (int)card, (int)domain, rc);
goto out;
}
@@ -1198,13 +1189,13 @@ static int _ep11_unwrapkey(u16 card, u16 domain,
if (rc)
goto out;
if (rep_pl->data_tag != 0x04 || rep_pl->data_lenfmt != 0x82) {
- DEBUG_ERR("%s unknown reply data format\n", __func__);
+ ZCRYPT_DBF_ERR("%s unknown reply data format\n", __func__);
rc = -EIO;
goto out;
}
if (rep_pl->data_len > *keybufsize) {
- DEBUG_ERR("%s mismatch reply data len / key buffer len\n",
- __func__);
+ ZCRYPT_DBF_ERR("%s mismatch reply data len / key buffer len\n",
+ __func__);
rc = -ENOSPC;
goto out;
}
@@ -1343,9 +1334,8 @@ static int _ep11_wrapkey(u16 card, u16 domain,
rc = zcrypt_send_ep11_cprb(urb);
if (rc) {
- DEBUG_ERR(
- "%s zcrypt_send_ep11_cprb(card=%d dom=%d) failed, rc=%d\n",
- __func__, (int)card, (int)domain, rc);
+ ZCRYPT_DBF_ERR("%s zcrypt_send_ep11_cprb(card=%d dom=%d) failed, rc=%d\n",
+ __func__, (int)card, (int)domain, rc);
goto out;
}
@@ -1353,13 +1343,13 @@ static int _ep11_wrapkey(u16 card, u16 domain,
if (rc)
goto out;
if (rep_pl->data_tag != 0x04 || rep_pl->data_lenfmt != 0x82) {
- DEBUG_ERR("%s unknown reply data format\n", __func__);
+ ZCRYPT_DBF_ERR("%s unknown reply data format\n", __func__);
rc = -EIO;
goto out;
}
if (rep_pl->data_len > *datasize) {
- DEBUG_ERR("%s mismatch reply data len / data buffer len\n",
- __func__);
+ ZCRYPT_DBF_ERR("%s mismatch reply data len / data buffer len\n",
+ __func__);
rc = -ENOSPC;
goto out;
}
@@ -1386,9 +1376,8 @@ int ep11_clr2keyblob(u16 card, u16 domain, u32 keybitsize, u32 keygenflags,
if (keybitsize == 128 || keybitsize == 192 || keybitsize == 256) {
clrkeylen = keybitsize / 8;
} else {
- DEBUG_ERR(
- "%s unknown/unsupported keybitsize %d\n",
- __func__, keybitsize);
+ ZCRYPT_DBF_ERR("%s unknown/unsupported keybitsize %d\n",
+ __func__, keybitsize);
return -EINVAL;
}
@@ -1405,9 +1394,8 @@ int ep11_clr2keyblob(u16 card, u16 domain, u32 keybitsize, u32 keygenflags,
0x00006c00, /* EN/DECRYPT, WRAP/UNWRAP */
kek, &keklen);
if (rc) {
- DEBUG_ERR(
- "%s generate kek key failed, rc=%d\n",
- __func__, rc);
+ ZCRYPT_DBF_ERR("%s generate kek key failed, rc=%d\n",
+ __func__, rc);
goto out;
}
@@ -1415,9 +1403,8 @@ int ep11_clr2keyblob(u16 card, u16 domain, u32 keybitsize, u32 keygenflags,
rc = ep11_cryptsingle(card, domain, 0, 0, def_iv, kek, keklen,
clrkey, clrkeylen, encbuf, &encbuflen);
if (rc) {
- DEBUG_ERR(
- "%s encrypting key value with kek key failed, rc=%d\n",
- __func__, rc);
+ ZCRYPT_DBF_ERR("%s encrypting key value with kek key failed, rc=%d\n",
+ __func__, rc);
goto out;
}
@@ -1426,9 +1413,8 @@ int ep11_clr2keyblob(u16 card, u16 domain, u32 keybitsize, u32 keygenflags,
encbuf, encbuflen, 0, def_iv,
keybitsize, 0, keybuf, keybufsize, keytype);
if (rc) {
- DEBUG_ERR(
- "%s importing key value as new key failed,, rc=%d\n",
- __func__, rc);
+ ZCRYPT_DBF_ERR("%s importing key value as new key failed,, rc=%d\n",
+ __func__, rc);
goto out;
}
@@ -1476,17 +1462,16 @@ int ep11_kblob2protkey(u16 card, u16 dom,
rc = _ep11_wrapkey(card, dom, (u8 *)key, keylen,
0, def_iv, wkbuf, &wkbuflen);
if (rc) {
- DEBUG_ERR(
- "%s rewrapping ep11 key to pkey failed, rc=%d\n",
- __func__, rc);
+ ZCRYPT_DBF_ERR("%s rewrapping ep11 key to pkey failed, rc=%d\n",
+ __func__, rc);
goto out;
}
wki = (struct wk_info *)wkbuf;
/* check struct version and pkey type */
if (wki->version != 1 || wki->pkeytype < 1 || wki->pkeytype > 5) {
- DEBUG_ERR("%s wk info version %d or pkeytype %d mismatch.\n",
- __func__, (int)wki->version, (int)wki->pkeytype);
+ ZCRYPT_DBF_ERR("%s wk info version %d or pkeytype %d mismatch.\n",
+ __func__, (int)wki->version, (int)wki->pkeytype);
rc = -EIO;
goto out;
}
@@ -1511,8 +1496,8 @@ int ep11_kblob2protkey(u16 card, u16 dom,
*protkeytype = PKEY_KEYTYPE_AES_256;
break;
default:
- DEBUG_ERR("%s unknown/unsupported AES pkeysize %d\n",
- __func__, (int)wki->pkeysize);
+ ZCRYPT_DBF_ERR("%s unknown/unsupported AES pkeysize %d\n",
+ __func__, (int)wki->pkeysize);
rc = -EIO;
goto out;
}
@@ -1525,16 +1510,16 @@ int ep11_kblob2protkey(u16 card, u16 dom,
break;
case 2: /* TDES */
default:
- DEBUG_ERR("%s unknown/unsupported key type %d\n",
- __func__, (int)wki->pkeytype);
+ ZCRYPT_DBF_ERR("%s unknown/unsupported key type %d\n",
+ __func__, (int)wki->pkeytype);
rc = -EIO;
goto out;
}
/* copy the translated protected key */
if (wki->pkeysize > *protkeylen) {
- DEBUG_ERR("%s wk info pkeysize %llu > protkeysize %u\n",
- __func__, wki->pkeysize, *protkeylen);
+ ZCRYPT_DBF_ERR("%s wk info pkeysize %llu > protkeysize %u\n",
+ __func__, wki->pkeysize, *protkeylen);
rc = -EINVAL;
goto out;
}
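check_reply_pl() above walks a BER-style payload in which each field carries a tag, a length-format byte, and a value; the visible branches handle a two-byte length under the 0x82 format and reject other formats. A rough standalone sketch of that kind of length decoding — the field layout beyond what the hunks show is an assumption:

#include <linux/errno.h>
#include <linux/types.h>

static int decode_ber_len(const u8 *pl, u16 *len, unsigned int *used)
{
	if (pl[0] < 0x80) {		/* short form: length in one byte */
		*len = pl[0];
		*used = 1;
	} else if (pl[0] == 0x81) {	/* long form, 1 length byte */
		*len = pl[1];
		*used = 2;
	} else if (pl[0] == 0x82) {	/* long form, 2 length bytes */
		*len = (pl[1] << 8) | pl[2];
		*used = 3;
	} else {
		return -EIO;		/* unsupported length format */
	}
	return 0;
}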
diff --git a/drivers/s390/crypto/zcrypt_error.h b/drivers/s390/crypto/zcrypt_error.h
index a44fcfcec938..46e27b43a8af 100644
--- a/drivers/s390/crypto/zcrypt_error.h
+++ b/drivers/s390/crypto/zcrypt_error.h
@@ -119,10 +119,9 @@ static inline int convert_error(struct zcrypt_queue *zq,
case REP82_ERROR_MESSAGE_TYPE: /* 0x20 */
case REP82_ERROR_TRANSPORT_FAIL: /* 0x90 */
/*
- * Msg to wrong type or card/infrastructure failure.
- * Trigger rescan of the ap bus, trigger retry request.
+ * Msg to wrong type or card/infrastructure failure. Return
+ * EAGAIN so the upper layer may retry the request.
*/
- atomic_set(&zcrypt_rescan_req, 1);
/* For type 86 response show the apfs value (failure reason) */
if (ehdr->reply_code == REP82_ERROR_TRANSPORT_FAIL &&
ehdr->type == TYPE86_RSP_CODE) {
diff --git a/drivers/s390/crypto/zcrypt_msgtype50.c b/drivers/s390/crypto/zcrypt_msgtype50.c
index 2e155de8abe5..3b39cb8f926d 100644
--- a/drivers/s390/crypto/zcrypt_msgtype50.c
+++ b/drivers/s390/crypto/zcrypt_msgtype50.c
@@ -427,7 +427,7 @@ static void zcrypt_msgtype50_receive(struct ap_queue *aq,
len = t80h->len;
if (len > reply->bufsize || len > msg->bufsize ||
len != reply->len) {
- ZCRYPT_DBF_DBG("%s len mismatch => EMSGSIZE\n", __func__);
+ pr_debug("%s len mismatch => EMSGSIZE\n", __func__);
msg->rc = -EMSGSIZE;
goto out;
}
@@ -487,9 +487,9 @@ static long zcrypt_msgtype50_modexpo(struct zcrypt_queue *zq,
out:
ap_msg->private = NULL;
if (rc)
- ZCRYPT_DBF_DBG("%s send me cprb at dev=%02x.%04x rc=%d\n",
- __func__, AP_QID_CARD(zq->queue->qid),
- AP_QID_QUEUE(zq->queue->qid), rc);
+ pr_debug("%s send me cprb at dev=%02x.%04x rc=%d\n",
+ __func__, AP_QID_CARD(zq->queue->qid),
+ AP_QID_QUEUE(zq->queue->qid), rc);
return rc;
}
@@ -537,9 +537,9 @@ static long zcrypt_msgtype50_modexpo_crt(struct zcrypt_queue *zq,
out:
ap_msg->private = NULL;
if (rc)
- ZCRYPT_DBF_DBG("%s send crt cprb at dev=%02x.%04x rc=%d\n",
- __func__, AP_QID_CARD(zq->queue->qid),
- AP_QID_QUEUE(zq->queue->qid), rc);
+ pr_debug("%s send crt cprb at dev=%02x.%04x rc=%d\n",
+ __func__, AP_QID_CARD(zq->queue->qid),
+ AP_QID_QUEUE(zq->queue->qid), rc);
return rc;
}
diff --git a/drivers/s390/crypto/zcrypt_msgtype6.c b/drivers/s390/crypto/zcrypt_msgtype6.c
index 3c53abbdc342..215f257d2360 100644
--- a/drivers/s390/crypto/zcrypt_msgtype6.c
+++ b/drivers/s390/crypto/zcrypt_msgtype6.c
@@ -437,9 +437,9 @@ static int xcrb_msg_to_type6cprb_msgx(bool userspace, struct ap_message *ap_msg,
ap_msg->flags |= AP_MSG_FLAG_ADMIN;
break;
default:
- ZCRYPT_DBF_DBG("%s unknown CPRB minor version '%c%c'\n",
- __func__, msg->cprbx.func_id[0],
- msg->cprbx.func_id[1]);
+ pr_debug("%s unknown CPRB minor version '%c%c'\n",
+ __func__, msg->cprbx.func_id[0],
+ msg->cprbx.func_id[1]);
}
/* copy data block */
@@ -629,9 +629,9 @@ static int convert_type86_xcrb(bool userspace, struct zcrypt_queue *zq,
/* Copy CPRB to user */
if (xcrb->reply_control_blk_length < msg->fmt2.count1) {
- ZCRYPT_DBF_DBG("%s reply_control_blk_length %u < required %u => EMSGSIZE\n",
- __func__, xcrb->reply_control_blk_length,
- msg->fmt2.count1);
+ pr_debug("%s reply_control_blk_length %u < required %u => EMSGSIZE\n",
+ __func__, xcrb->reply_control_blk_length,
+ msg->fmt2.count1);
return -EMSGSIZE;
}
if (z_copy_to_user(userspace, xcrb->reply_control_blk_addr,
@@ -642,9 +642,9 @@ static int convert_type86_xcrb(bool userspace, struct zcrypt_queue *zq,
/* Copy data buffer to user */
if (msg->fmt2.count2) {
if (xcrb->reply_data_length < msg->fmt2.count2) {
- ZCRYPT_DBF_DBG("%s reply_data_length %u < required %u => EMSGSIZE\n",
- __func__, xcrb->reply_data_length,
- msg->fmt2.count2);
+ pr_debug("%s reply_data_length %u < required %u => EMSGSIZE\n",
+ __func__, xcrb->reply_data_length,
+ msg->fmt2.count2);
return -EMSGSIZE;
}
if (z_copy_to_user(userspace, xcrb->reply_data_addr,
@@ -673,9 +673,9 @@ static int convert_type86_ep11_xcrb(bool userspace, struct zcrypt_queue *zq,
char *data = reply->msg;
if (xcrb->resp_len < msg->fmt2.count1) {
- ZCRYPT_DBF_DBG("%s resp_len %u < required %u => EMSGSIZE\n",
- __func__, (unsigned int)xcrb->resp_len,
- msg->fmt2.count1);
+ pr_debug("%s resp_len %u < required %u => EMSGSIZE\n",
+ __func__, (unsigned int)xcrb->resp_len,
+ msg->fmt2.count1);
return -EMSGSIZE;
}
@@ -875,7 +875,8 @@ static void zcrypt_msgtype6_receive(struct ap_queue *aq,
len = sizeof(struct type86x_reply) + t86r->length;
if (len > reply->bufsize || len > msg->bufsize ||
len != reply->len) {
- ZCRYPT_DBF_DBG("%s len mismatch => EMSGSIZE\n", __func__);
+ pr_debug("%s len mismatch => EMSGSIZE\n",
+ __func__);
msg->rc = -EMSGSIZE;
goto out;
}
@@ -889,7 +890,8 @@ static void zcrypt_msgtype6_receive(struct ap_queue *aq,
len = t86r->fmt2.offset1 + t86r->fmt2.count1;
if (len > reply->bufsize || len > msg->bufsize ||
len != reply->len) {
- ZCRYPT_DBF_DBG("%s len mismatch => EMSGSIZE\n", __func__);
+ pr_debug("%s len mismatch => EMSGSIZE\n",
+ __func__);
msg->rc = -EMSGSIZE;
goto out;
}
@@ -939,7 +941,8 @@ static void zcrypt_msgtype6_receive_ep11(struct ap_queue *aq,
len = t86r->fmt2.offset1 + t86r->fmt2.count1;
if (len > reply->bufsize || len > msg->bufsize ||
len != reply->len) {
- ZCRYPT_DBF_DBG("%s len mismatch => EMSGSIZE\n", __func__);
+ pr_debug("%s len mismatch => EMSGSIZE\n",
+ __func__);
msg->rc = -EMSGSIZE;
goto out;
}
@@ -1151,9 +1154,9 @@ static long zcrypt_msgtype6_send_cprb(bool userspace, struct zcrypt_queue *zq,
out:
if (rc)
- ZCRYPT_DBF_DBG("%s send cprb at dev=%02x.%04x rc=%d\n",
- __func__, AP_QID_CARD(zq->queue->qid),
- AP_QID_QUEUE(zq->queue->qid), rc);
+ pr_debug("%s send cprb at dev=%02x.%04x rc=%d\n",
+ __func__, AP_QID_CARD(zq->queue->qid),
+ AP_QID_QUEUE(zq->queue->qid), rc);
return rc;
}
@@ -1274,9 +1277,9 @@ static long zcrypt_msgtype6_send_ep11_cprb(bool userspace, struct zcrypt_queue *
out:
if (rc)
- ZCRYPT_DBF_DBG("%s send cprb at dev=%02x.%04x rc=%d\n",
- __func__, AP_QID_CARD(zq->queue->qid),
- AP_QID_QUEUE(zq->queue->qid), rc);
+ pr_debug("%s send cprb at dev=%02x.%04x rc=%d\n",
+ __func__, AP_QID_CARD(zq->queue->qid),
+ AP_QID_QUEUE(zq->queue->qid), rc);
return rc;
}
diff --git a/drivers/zorro/zorro-driver.c b/drivers/zorro/zorro-driver.c
index 025edfccedcf..f49d19977e82 100644
--- a/drivers/zorro/zorro-driver.c
+++ b/drivers/zorro/zorro-driver.c
@@ -150,7 +150,7 @@ static int zorro_uevent(const struct device *dev, struct kobj_uevent_env *env)
return 0;
}
-struct bus_type zorro_bus_type = {
+const struct bus_type zorro_bus_type = {
.name = "zorro",
.dev_name = "zorro",
.dev_groups = zorro_device_attribute_groups,
diff --git a/drivers/zorro/zorro.h b/drivers/zorro/zorro.h
index f84df9fb4c20..df44e35203fd 100644
--- a/drivers/zorro/zorro.h
+++ b/drivers/zorro/zorro.h
@@ -4,7 +4,7 @@
* Zorro bus
*/
-extern struct bus_type zorro_bus_type;
+extern const struct bus_type zorro_bus_type;
#ifdef CONFIG_ZORRO_NAMES
diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h
index dc7ed2f46886..2b90c48a6a87 100644
--- a/include/linux/amd-iommu.h
+++ b/include/linux/amd-iommu.h
@@ -85,8 +85,10 @@ int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn,
u64 *value);
struct amd_iommu *get_amd_iommu(unsigned int idx);
-#ifdef CONFIG_AMD_MEM_ENCRYPT
-int amd_iommu_snp_enable(void);
+#ifdef CONFIG_KVM_AMD_SEV
+int amd_iommu_snp_disable(void);
+#else
+static inline int amd_iommu_snp_disable(void) { return 0; }
#endif
#endif /* _ASM_X86_AMD_IOMMU_H */
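The header change above follows the usual config-gated stub convention: a real prototype when the feature is compiled in, an inline no-op otherwise, so call sites need no #ifdef of their own. The generic shape, with illustrative names:

#ifdef CONFIG_EXAMPLE_FEATURE
int example_feature_disable(void);
#else
static inline int example_feature_disable(void) { return 0; }
#endif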
diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h
index 5e95faa959c4..956bcba5dbf2 100644
--- a/include/linux/atomic/atomic-arch-fallback.h
+++ b/include/linux/atomic/atomic-arch-fallback.h
@@ -2005,6 +2005,7 @@ raw_atomic_xchg_relaxed(atomic_t *v, int new)
* @new: int value to assign
*
* If (@v == @old), atomically updates @v to @new with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_cmpxchg() elsewhere.
*
@@ -2033,6 +2034,7 @@ raw_atomic_cmpxchg(atomic_t *v, int old, int new)
* @new: int value to assign
*
* If (@v == @old), atomically updates @v to @new with acquire ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_cmpxchg_acquire() elsewhere.
*
@@ -2061,6 +2063,7 @@ raw_atomic_cmpxchg_acquire(atomic_t *v, int old, int new)
* @new: int value to assign
*
* If (@v == @old), atomically updates @v to @new with release ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_cmpxchg_release() elsewhere.
*
@@ -2088,6 +2091,7 @@ raw_atomic_cmpxchg_release(atomic_t *v, int old, int new)
* @new: int value to assign
*
* If (@v == @old), atomically updates @v to @new with relaxed ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_cmpxchg_relaxed() elsewhere.
*
@@ -2112,7 +2116,8 @@ raw_atomic_cmpxchg_relaxed(atomic_t *v, int old, int new)
* @new: int value to assign
*
* If (@v == @old), atomically updates @v to @new with full ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_try_cmpxchg() elsewhere.
*
@@ -2145,7 +2150,8 @@ raw_atomic_try_cmpxchg(atomic_t *v, int *old, int new)
* @new: int value to assign
*
* If (@v == @old), atomically updates @v to @new with acquire ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_try_cmpxchg_acquire() elsewhere.
*
@@ -2178,7 +2184,8 @@ raw_atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new)
* @new: int value to assign
*
* If (@v == @old), atomically updates @v to @new with release ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_try_cmpxchg_release() elsewhere.
*
@@ -2210,7 +2217,8 @@ raw_atomic_try_cmpxchg_release(atomic_t *v, int *old, int new)
* @new: int value to assign
*
* If (@v == @old), atomically updates @v to @new with relaxed ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_try_cmpxchg_relaxed() elsewhere.
*
@@ -2403,6 +2411,7 @@ raw_atomic_add_negative_relaxed(int i, atomic_t *v)
* @u: int value to compare with
*
* If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_fetch_add_unless() elsewhere.
*
@@ -2432,6 +2441,7 @@ raw_atomic_fetch_add_unless(atomic_t *v, int a, int u)
* @u: int value to compare with
*
* If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_add_unless() elsewhere.
*
@@ -2452,6 +2462,7 @@ raw_atomic_add_unless(atomic_t *v, int a, int u)
* @v: pointer to atomic_t
*
* If (@v != 0), atomically updates @v to (@v + 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_inc_not_zero() elsewhere.
*
@@ -2472,6 +2483,7 @@ raw_atomic_inc_not_zero(atomic_t *v)
* @v: pointer to atomic_t
*
* If (@v >= 0), atomically updates @v to (@v + 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_inc_unless_negative() elsewhere.
*
@@ -2499,6 +2511,7 @@ raw_atomic_inc_unless_negative(atomic_t *v)
* @v: pointer to atomic_t
*
* If (@v <= 0), atomically updates @v to (@v - 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_dec_unless_positive() elsewhere.
*
@@ -2526,6 +2539,7 @@ raw_atomic_dec_unless_positive(atomic_t *v)
* @v: pointer to atomic_t
*
* If (@v > 0), atomically updates @v to (@v - 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_dec_if_positive() elsewhere.
*
@@ -4117,6 +4131,7 @@ raw_atomic64_xchg_relaxed(atomic64_t *v, s64 new)
* @new: s64 value to assign
*
* If (@v == @old), atomically updates @v to @new with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic64_cmpxchg() elsewhere.
*
@@ -4145,6 +4160,7 @@ raw_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
* @new: s64 value to assign
*
* If (@v == @old), atomically updates @v to @new with acquire ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic64_cmpxchg_acquire() elsewhere.
*
@@ -4173,6 +4189,7 @@ raw_atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new)
* @new: s64 value to assign
*
* If (@v == @old), atomically updates @v to @new with release ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic64_cmpxchg_release() elsewhere.
*
@@ -4200,6 +4217,7 @@ raw_atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new)
* @new: s64 value to assign
*
* If (@v == @old), atomically updates @v to @new with relaxed ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic64_cmpxchg_relaxed() elsewhere.
*
@@ -4224,7 +4242,8 @@ raw_atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new)
* @new: s64 value to assign
*
* If (@v == @old), atomically updates @v to @new with full ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic64_try_cmpxchg() elsewhere.
*
@@ -4257,7 +4276,8 @@ raw_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
* @new: s64 value to assign
*
* If (@v == @old), atomically updates @v to @new with acquire ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic64_try_cmpxchg_acquire() elsewhere.
*
@@ -4290,7 +4310,8 @@ raw_atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new)
* @new: s64 value to assign
*
* If (@v == @old), atomically updates @v to @new with release ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic64_try_cmpxchg_release() elsewhere.
*
@@ -4322,7 +4343,8 @@ raw_atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new)
* @new: s64 value to assign
*
* If (@v == @old), atomically updates @v to @new with relaxed ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic64_try_cmpxchg_relaxed() elsewhere.
*
@@ -4515,6 +4537,7 @@ raw_atomic64_add_negative_relaxed(s64 i, atomic64_t *v)
* @u: s64 value to compare with
*
* If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic64_fetch_add_unless() elsewhere.
*
@@ -4544,6 +4567,7 @@ raw_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
* @u: s64 value to compare with
*
* If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic64_add_unless() elsewhere.
*
@@ -4564,6 +4588,7 @@ raw_atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
* @v: pointer to atomic64_t
*
* If (@v != 0), atomically updates @v to (@v + 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic64_inc_not_zero() elsewhere.
*
@@ -4584,6 +4609,7 @@ raw_atomic64_inc_not_zero(atomic64_t *v)
* @v: pointer to atomic64_t
*
* If (@v >= 0), atomically updates @v to (@v + 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic64_inc_unless_negative() elsewhere.
*
@@ -4611,6 +4637,7 @@ raw_atomic64_inc_unless_negative(atomic64_t *v)
* @v: pointer to atomic64_t
*
* If (@v <= 0), atomically updates @v to (@v - 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic64_dec_unless_positive() elsewhere.
*
@@ -4638,6 +4665,7 @@ raw_atomic64_dec_unless_positive(atomic64_t *v)
* @v: pointer to atomic64_t
*
* If (@v > 0), atomically updates @v to (@v - 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic64_dec_if_positive() elsewhere.
*
@@ -4662,4 +4690,4 @@ raw_atomic64_dec_if_positive(atomic64_t *v)
}
#endif /* _LINUX_ATOMIC_FALLBACK_H */
-// eec048affea735b8464f58e6d96992101f8f85f1
+// 14850c0b0db20c62fdc78ccd1d42b98b88d76331
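A quick sketch of the try_cmpxchg() semantics documented above (illustrative only; the variable names are not from this patch): because @old is refreshed with the observed value on failure, a retry loop needs no explicit re-read of @v:

    int old = atomic_read(&v);

    do {
        /* on failure, try_cmpxchg() writes the observed value back to old */
    } while (!atomic_try_cmpxchg(&v, &old, old + 1));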
diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h
index 54d7bbe0aeaa..debd487fe971 100644
--- a/include/linux/atomic/atomic-instrumented.h
+++ b/include/linux/atomic/atomic-instrumented.h
@@ -1182,6 +1182,7 @@ atomic_xchg_relaxed(atomic_t *v, int new)
* @new: int value to assign
*
* If (@v == @old), atomically updates @v to @new with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_cmpxchg() there.
*
@@ -1202,6 +1203,7 @@ atomic_cmpxchg(atomic_t *v, int old, int new)
* @new: int value to assign
*
* If (@v == @old), atomically updates @v to @new with acquire ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_cmpxchg_acquire() there.
*
@@ -1221,6 +1223,7 @@ atomic_cmpxchg_acquire(atomic_t *v, int old, int new)
* @new: int value to assign
*
* If (@v == @old), atomically updates @v to @new with release ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_cmpxchg_release() there.
*
@@ -1241,6 +1244,7 @@ atomic_cmpxchg_release(atomic_t *v, int old, int new)
* @new: int value to assign
*
* If (@v == @old), atomically updates @v to @new with relaxed ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_cmpxchg_relaxed() there.
*
@@ -1260,7 +1264,8 @@ atomic_cmpxchg_relaxed(atomic_t *v, int old, int new)
* @new: int value to assign
*
* If (@v == @old), atomically updates @v to @new with full ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg() there.
*
@@ -1282,7 +1287,8 @@ atomic_try_cmpxchg(atomic_t *v, int *old, int new)
* @new: int value to assign
*
* If (@v == @old), atomically updates @v to @new with acquire ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg_acquire() there.
*
@@ -1303,7 +1309,8 @@ atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new)
* @new: int value to assign
*
* If (@v == @old), atomically updates @v to @new with release ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg_release() there.
*
@@ -1325,7 +1332,8 @@ atomic_try_cmpxchg_release(atomic_t *v, int *old, int new)
* @new: int value to assign
*
* If (@v == @old), atomically updates @v to @new with relaxed ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_try_cmpxchg_relaxed() there.
*
@@ -1475,6 +1483,7 @@ atomic_add_negative_relaxed(int i, atomic_t *v)
* @u: int value to compare with
*
* If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_fetch_add_unless() there.
*
@@ -1495,6 +1504,7 @@ atomic_fetch_add_unless(atomic_t *v, int a, int u)
* @u: int value to compare with
*
* If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_add_unless() there.
*
@@ -1513,6 +1523,7 @@ atomic_add_unless(atomic_t *v, int a, int u)
* @v: pointer to atomic_t
*
* If (@v != 0), atomically updates @v to (@v + 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_inc_not_zero() there.
*
@@ -1531,6 +1542,7 @@ atomic_inc_not_zero(atomic_t *v)
* @v: pointer to atomic_t
*
* If (@v >= 0), atomically updates @v to (@v + 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_inc_unless_negative() there.
*
@@ -1549,6 +1561,7 @@ atomic_inc_unless_negative(atomic_t *v)
* @v: pointer to atomic_t
*
* If (@v <= 0), atomically updates @v to (@v - 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_dec_unless_positive() there.
*
@@ -1567,6 +1580,7 @@ atomic_dec_unless_positive(atomic_t *v)
* @v: pointer to atomic_t
*
* If (@v > 0), atomically updates @v to (@v - 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_dec_if_positive() there.
*
@@ -2746,6 +2760,7 @@ atomic64_xchg_relaxed(atomic64_t *v, s64 new)
* @new: s64 value to assign
*
* If (@v == @old), atomically updates @v to @new with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic64_cmpxchg() there.
*
@@ -2766,6 +2781,7 @@ atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
* @new: s64 value to assign
*
* If (@v == @old), atomically updates @v to @new with acquire ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_acquire() there.
*
@@ -2785,6 +2801,7 @@ atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new)
* @new: s64 value to assign
*
* If (@v == @old), atomically updates @v to @new with release ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_release() there.
*
@@ -2805,6 +2822,7 @@ atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new)
* @new: s64 value to assign
*
* If (@v == @old), atomically updates @v to @new with relaxed ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic64_cmpxchg_relaxed() there.
*
@@ -2824,7 +2842,8 @@ atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new)
* @new: s64 value to assign
*
* If (@v == @old), atomically updates @v to @new with full ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg() there.
*
@@ -2846,7 +2865,8 @@ atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
* @new: s64 value to assign
*
* If (@v == @old), atomically updates @v to @new with acquire ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_acquire() there.
*
@@ -2867,7 +2887,8 @@ atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new)
* @new: s64 value to assign
*
* If (@v == @old), atomically updates @v to @new with release ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_release() there.
*
@@ -2889,7 +2910,8 @@ atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new)
* @new: s64 value to assign
*
* If (@v == @old), atomically updates @v to @new with relaxed ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic64_try_cmpxchg_relaxed() there.
*
@@ -3039,6 +3061,7 @@ atomic64_add_negative_relaxed(s64 i, atomic64_t *v)
* @u: s64 value to compare with
*
* If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic64_fetch_add_unless() there.
*
@@ -3059,6 +3082,7 @@ atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
* @u: s64 value to compare with
*
* If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic64_add_unless() there.
*
@@ -3077,6 +3101,7 @@ atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
* @v: pointer to atomic64_t
*
* If (@v != 0), atomically updates @v to (@v + 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic64_inc_not_zero() there.
*
@@ -3095,6 +3120,7 @@ atomic64_inc_not_zero(atomic64_t *v)
* @v: pointer to atomic64_t
*
* If (@v >= 0), atomically updates @v to (@v + 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic64_inc_unless_negative() there.
*
@@ -3113,6 +3139,7 @@ atomic64_inc_unless_negative(atomic64_t *v)
* @v: pointer to atomic64_t
*
* If (@v <= 0), atomically updates @v to (@v - 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic64_dec_unless_positive() there.
*
@@ -3131,6 +3158,7 @@ atomic64_dec_unless_positive(atomic64_t *v)
* @v: pointer to atomic64_t
*
* If (@v > 0), atomically updates @v to (@v - 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic64_dec_if_positive() there.
*
@@ -4310,6 +4338,7 @@ atomic_long_xchg_relaxed(atomic_long_t *v, long new)
* @new: long value to assign
*
* If (@v == @old), atomically updates @v to @new with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg() there.
*
@@ -4330,6 +4359,7 @@ atomic_long_cmpxchg(atomic_long_t *v, long old, long new)
* @new: long value to assign
*
* If (@v == @old), atomically updates @v to @new with acquire ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_acquire() there.
*
@@ -4349,6 +4379,7 @@ atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new)
* @new: long value to assign
*
* If (@v == @old), atomically updates @v to @new with release ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_release() there.
*
@@ -4369,6 +4400,7 @@ atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new)
* @new: long value to assign
*
* If (@v == @old), atomically updates @v to @new with relaxed ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_relaxed() there.
*
@@ -4388,7 +4420,8 @@ atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new)
* @new: long value to assign
*
* If (@v == @old), atomically updates @v to @new with full ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg() there.
*
@@ -4410,7 +4443,8 @@ atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new)
* @new: long value to assign
*
* If (@v == @old), atomically updates @v to @new with acquire ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_acquire() there.
*
@@ -4431,7 +4465,8 @@ atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new)
* @new: long value to assign
*
* If (@v == @old), atomically updates @v to @new with release ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_release() there.
*
@@ -4453,7 +4488,8 @@ atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new)
* @new: long value to assign
*
* If (@v == @old), atomically updates @v to @new with relaxed ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_relaxed() there.
*
@@ -4603,6 +4639,7 @@ atomic_long_add_negative_relaxed(long i, atomic_long_t *v)
* @u: long value to compare with
*
* If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_unless() there.
*
@@ -4623,6 +4660,7 @@ atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
* @u: long value to compare with
*
* If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_long_add_unless() there.
*
@@ -4641,6 +4679,7 @@ atomic_long_add_unless(atomic_long_t *v, long a, long u)
* @v: pointer to atomic_long_t
*
* If (@v != 0), atomically updates @v to (@v + 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_long_inc_not_zero() there.
*
@@ -4659,6 +4698,7 @@ atomic_long_inc_not_zero(atomic_long_t *v)
* @v: pointer to atomic_long_t
*
* If (@v >= 0), atomically updates @v to (@v + 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_long_inc_unless_negative() there.
*
@@ -4677,6 +4717,7 @@ atomic_long_inc_unless_negative(atomic_long_t *v)
* @v: pointer to atomic_long_t
*
* If (@v <= 0), atomically updates @v to (@v - 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_long_dec_unless_positive() there.
*
@@ -4695,6 +4736,7 @@ atomic_long_dec_unless_positive(atomic_long_t *v)
* @v: pointer to atomic_long_t
*
* If (@v > 0), atomically updates @v to (@v - 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Unsafe to use in noinstr code; use raw_atomic_long_dec_if_positive() there.
*
@@ -5008,4 +5050,4 @@ atomic_long_dec_if_positive(atomic_long_t *v)
#endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
-// 2cc4bc990fef44d3836ec108f11b610f3f438184
+// ce5b65e0f1f8a276268b667194581d24bed219d4
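By contrast, the plain cmpxchg() variants return the value they observed, so callers detect success by comparing against the expected one. A minimal sketch (names illustrative):

    if (atomic_cmpxchg(&v, expected, new) == expected) {
        /* swap happened, with full ordering as documented above */
    }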
diff --git a/include/linux/atomic/atomic-long.h b/include/linux/atomic/atomic-long.h
index c82947170ddc..3ef844b3ab8a 100644
--- a/include/linux/atomic/atomic-long.h
+++ b/include/linux/atomic/atomic-long.h
@@ -1352,6 +1352,7 @@ raw_atomic_long_xchg_relaxed(atomic_long_t *v, long new)
* @new: long value to assign
*
* If (@v == @old), atomically updates @v to @new with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_long_cmpxchg() elsewhere.
*
@@ -1374,6 +1375,7 @@ raw_atomic_long_cmpxchg(atomic_long_t *v, long old, long new)
* @new: long value to assign
*
* If (@v == @old), atomically updates @v to @new with acquire ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_long_cmpxchg_acquire() elsewhere.
*
@@ -1396,6 +1398,7 @@ raw_atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new)
* @new: long value to assign
*
* If (@v == @old), atomically updates @v to @new with release ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_long_cmpxchg_release() elsewhere.
*
@@ -1418,6 +1421,7 @@ raw_atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new)
* @new: long value to assign
*
* If (@v == @old), atomically updates @v to @new with relaxed ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_long_cmpxchg_relaxed() elsewhere.
*
@@ -1440,7 +1444,8 @@ raw_atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new)
* @new: long value to assign
*
* If (@v == @old), atomically updates @v to @new with full ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_long_try_cmpxchg() elsewhere.
*
@@ -1463,7 +1468,8 @@ raw_atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new)
* @new: long value to assign
*
* If (@v == @old), atomically updates @v to @new with acquire ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_acquire() elsewhere.
*
@@ -1486,7 +1492,8 @@ raw_atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new)
* @new: long value to assign
*
* If (@v == @old), atomically updates @v to @new with release ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_release() elsewhere.
*
@@ -1509,7 +1516,8 @@ raw_atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new)
* @new: long value to assign
*
* If (@v == @old), atomically updates @v to @new with relaxed ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_relaxed() elsewhere.
*
@@ -1677,6 +1685,7 @@ raw_atomic_long_add_negative_relaxed(long i, atomic_long_t *v)
* @u: long value to compare with
*
* If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_long_fetch_add_unless() elsewhere.
*
@@ -1699,6 +1708,7 @@ raw_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
* @u: long value to compare with
*
* If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_long_add_unless() elsewhere.
*
@@ -1719,6 +1729,7 @@ raw_atomic_long_add_unless(atomic_long_t *v, long a, long u)
* @v: pointer to atomic_long_t
*
* If (@v != 0), atomically updates @v to (@v + 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_long_inc_not_zero() elsewhere.
*
@@ -1739,6 +1750,7 @@ raw_atomic_long_inc_not_zero(atomic_long_t *v)
* @v: pointer to atomic_long_t
*
* If (@v >= 0), atomically updates @v to (@v + 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_long_inc_unless_negative() elsewhere.
*
@@ -1759,6 +1771,7 @@ raw_atomic_long_inc_unless_negative(atomic_long_t *v)
* @v: pointer to atomic_long_t
*
* If (@v <= 0), atomically updates @v to (@v - 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_long_dec_unless_positive() elsewhere.
*
@@ -1779,6 +1792,7 @@ raw_atomic_long_dec_unless_positive(atomic_long_t *v)
* @v: pointer to atomic_long_t
*
* If (@v > 0), atomically updates @v to (@v - 1) with full ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* Safe to use in noinstr code; prefer atomic_long_dec_if_positive() elsewhere.
*
@@ -1795,4 +1809,4 @@ raw_atomic_long_dec_if_positive(atomic_long_t *v)
}
#endif /* _LINUX_ATOMIC_LONG_H */
-// 4ef23f98c73cff96d239896175fd26b10b88899e
+// 1c4a26fc77f345342953770ebe3c4d08e7ce2f9a
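The conditional "unless" operations documented above are the usual building blocks for refcount-style getters. A minimal sketch, with obj->refs hypothetical:

    /* take a reference only if the object is still live */
    if (!atomic_long_inc_not_zero(&obj->refs))
        return NULL;    /* refs was 0 and was left unmodified */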
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 75bd1692d2e3..aff92b1d284f 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -35,7 +35,7 @@
(typeof(ptr)) (__ptr + (off)); \
})
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
#define __noretpoline __attribute__((__indirect_branch__("keep")))
#endif
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index bb1339c7057b..cdcdaa48b4d2 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -209,7 +209,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
*/
#define ___ADDRESSABLE(sym, __attrs) \
static void * __used __attrs \
- __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)&sym;
+ __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym;
#define __ADDRESSABLE(sym) \
___ADDRESSABLE(sym, __section(".discard.addressable"))
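__ADDRESSABLE() plants a __used pointer in a discarded section so the toolchain must keep @sym addressable; routing the cast through uintptr_t appears intended to let it shed qualifiers on @sym (such as const or a named address space) without a conversion warning. A minimal usage sketch, with my_table hypothetical:

    static const int my_table[4] = { 1, 2, 3, 4 };
    __ADDRESSABLE(my_table);    /* forces the symbol to stay addressable */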
diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index 289810685fc5..8bdf6e0918c1 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -334,6 +334,18 @@
#define __section(section) __attribute__((__section__(section)))
/*
+ * Optional: only supported since gcc >= 12
+ *
+ * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-uninitialized-variable-attribute
+ * clang: https://clang.llvm.org/docs/AttributeReference.html#uninitialized
+ */
+#if __has_attribute(__uninitialized__)
+# define __uninitialized __attribute__((__uninitialized__))
+#else
+# define __uninitialized
+#endif
+
+/*
* gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-unused-function-attribute
* gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Type-Attributes.html#index-unused-type-attribute
* gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-unused-variable-attribute
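With CONFIG_INIT_STACK_ALL_ZERO or CONFIG_INIT_STACK_ALL_PATTERN, the compiler force-initializes locals; the new __uninitialized attribute opts a single variable back out when it is provably written before use. A sketch (function and buffer names hypothetical):

    void fill_block(void)
    {
        u8 buf[256] __uninitialized;    /* fully overwritten below */

        get_random_bytes(buf, sizeof(buf));
        /* ... */
    }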
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 61a0ddf2bef6..ae5a20cf2f9c 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -75,6 +75,8 @@ extern ssize_t cpu_show_spec_rstack_overflow(struct device *dev,
struct device_attribute *attr, char *buf);
extern ssize_t cpu_show_gds(struct device *dev,
struct device_attribute *attr, char *buf);
+extern ssize_t cpu_show_reg_file_data_sampling(struct device *dev,
+ struct device_attribute *attr, char *buf);
extern __printf(4, 5)
struct device *cpu_device_create(struct device *parent, void *drvdata,
@@ -196,6 +198,8 @@ void arch_cpu_idle(void);
void arch_cpu_idle_prepare(void);
void arch_cpu_idle_enter(void);
void arch_cpu_idle_exit(void);
+void arch_tick_broadcast_enter(void);
+void arch_tick_broadcast_exit(void);
void __noreturn arch_cpu_idle_dead(void);
#ifdef CONFIG_ARCH_HAS_CPU_FINALIZE_INIT
diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h
index adb83a42a6b9..35227d47cfc9 100644
--- a/include/linux/indirect_call_wrapper.h
+++ b/include/linux/indirect_call_wrapper.h
@@ -2,7 +2,7 @@
#ifndef _LINUX_INDIRECT_CALL_WRAPPER_H
#define _LINUX_INDIRECT_CALL_WRAPPER_H
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
/*
* INDIRECT_CALL_$NR - wrapper for indirect calls with $NR known builtin
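Under CONFIG_MITIGATION_RETPOLINE these wrappers compare a function pointer against known builtin targets and call a match directly, avoiding a retpolined indirect branch. A sketch (handler and argument names hypothetical):

    /* if ops->lookup is builtin_lookup, call it directly */
    ret = INDIRECT_CALL_1(ops->lookup, builtin_lookup, net, key);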
diff --git a/include/linux/module.h b/include/linux/module.h
index 452500431077..932fabe2965a 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -888,7 +888,7 @@ static inline void module_bug_finalize(const Elf_Ehdr *hdr,
static inline void module_bug_cleanup(struct module *mod) {}
#endif /* CONFIG_GENERIC_BUG */
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
extern bool retpoline_module_ok(bool has_retpoline);
#else
static inline bool retpoline_module_ok(bool has_retpoline)
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 7e208d46ba5b..67edc4ca2bee 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -32,11 +32,9 @@
# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
#endif
-#ifndef CONFIG_PREEMPT_RT
-
#ifdef CONFIG_DEBUG_MUTEXES
-#define __DEBUG_MUTEX_INITIALIZER(lockname) \
+# define __DEBUG_MUTEX_INITIALIZER(lockname) \
, .magic = &lockname
extern void mutex_destroy(struct mutex *lock);
@@ -49,6 +47,7 @@ static inline void mutex_destroy(struct mutex *lock) {}
#endif
+#ifndef CONFIG_PREEMPT_RT
/**
* mutex_init - initialize the mutex
* @mutex: the mutex to be initialized
@@ -101,9 +100,6 @@ extern bool mutex_is_locked(struct mutex *lock);
extern void __mutex_rt_init(struct mutex *lock, const char *name,
struct lock_class_key *key);
-extern int mutex_trylock(struct mutex *lock);
-
-static inline void mutex_destroy(struct mutex *lock) { }
#define mutex_is_locked(l) rt_mutex_base_is_locked(&(l)->rtmutex)
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 33212e93f4a6..b3b8d3dab52d 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -131,7 +131,7 @@
*/
.macro VALIDATE_UNRET_BEGIN
#if defined(CONFIG_NOINSTR_VALIDATION) && \
- (defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO))
+ (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO))
.Lhere_\@:
.pushsection .discard.validate_unret
.long .Lhere_\@ - .
diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h
index 7fd17e82bab4..3705c2044fc0 100644
--- a/include/linux/psp-sev.h
+++ b/include/linux/psp-sev.h
@@ -78,6 +78,36 @@ enum sev_cmd {
SEV_CMD_DBG_DECRYPT = 0x060,
SEV_CMD_DBG_ENCRYPT = 0x061,
+ /* SNP specific commands */
+ SEV_CMD_SNP_INIT = 0x081,
+ SEV_CMD_SNP_SHUTDOWN = 0x082,
+ SEV_CMD_SNP_PLATFORM_STATUS = 0x083,
+ SEV_CMD_SNP_DF_FLUSH = 0x084,
+ SEV_CMD_SNP_INIT_EX = 0x085,
+ SEV_CMD_SNP_SHUTDOWN_EX = 0x086,
+ SEV_CMD_SNP_DECOMMISSION = 0x090,
+ SEV_CMD_SNP_ACTIVATE = 0x091,
+ SEV_CMD_SNP_GUEST_STATUS = 0x092,
+ SEV_CMD_SNP_GCTX_CREATE = 0x093,
+ SEV_CMD_SNP_GUEST_REQUEST = 0x094,
+ SEV_CMD_SNP_ACTIVATE_EX = 0x095,
+ SEV_CMD_SNP_LAUNCH_START = 0x0A0,
+ SEV_CMD_SNP_LAUNCH_UPDATE = 0x0A1,
+ SEV_CMD_SNP_LAUNCH_FINISH = 0x0A2,
+ SEV_CMD_SNP_DBG_DECRYPT = 0x0B0,
+ SEV_CMD_SNP_DBG_ENCRYPT = 0x0B1,
+ SEV_CMD_SNP_PAGE_SWAP_OUT = 0x0C0,
+ SEV_CMD_SNP_PAGE_SWAP_IN = 0x0C1,
+ SEV_CMD_SNP_PAGE_MOVE = 0x0C2,
+ SEV_CMD_SNP_PAGE_MD_INIT = 0x0C3,
+ SEV_CMD_SNP_PAGE_SET_STATE = 0x0C6,
+ SEV_CMD_SNP_PAGE_RECLAIM = 0x0C7,
+ SEV_CMD_SNP_PAGE_UNSMASH = 0x0C8,
+ SEV_CMD_SNP_CONFIG = 0x0C9,
+ SEV_CMD_SNP_DOWNLOAD_FIRMWARE_EX = 0x0CA,
+ SEV_CMD_SNP_COMMIT = 0x0CB,
+ SEV_CMD_SNP_VLEK_LOAD = 0x0CD,
+
SEV_CMD_MAX,
};
@@ -523,12 +553,269 @@ struct sev_data_attestation_report {
u32 len; /* In/Out */
} __packed;
+/**
+ * struct sev_data_snp_download_firmware - SNP_DOWNLOAD_FIRMWARE command params
+ *
+ * @address: physical address of firmware image
+ * @len: length of the firmware image
+ */
+struct sev_data_snp_download_firmware {
+ u64 address; /* In */
+ u32 len; /* In */
+} __packed;
+
+/**
+ * struct sev_data_snp_activate - SNP_ACTIVATE command params
+ *
+ * @gctx_paddr: system physical address of guest context page
+ * @asid: ASID to bind to the guest
+ */
+struct sev_data_snp_activate {
+ u64 gctx_paddr; /* In */
+ u32 asid; /* In */
+} __packed;
+
+/**
+ * struct sev_data_snp_addr - generic SNP command params
+ *
+ * @address: physical address of generic data param
+ */
+struct sev_data_snp_addr {
+ u64 address; /* In/Out */
+} __packed;
+
+/**
+ * struct sev_data_snp_launch_start - SNP_LAUNCH_START command params
+ *
+ * @gctx_paddr: system physical address of guest context page
+ * @policy: guest policy
+ * @ma_gctx_paddr: system physical address of migration agent
+ * @ma_en: the guest is associated with a migration agent
+ * @imi_en: launch flow is launching an IMI (Incoming Migration Image) for the
+ * purpose of guest-assisted migration.
+ * @rsvd: reserved
+ * @gosvw: guest OS-visible workarounds, as defined by hypervisor
+ */
+struct sev_data_snp_launch_start {
+ u64 gctx_paddr; /* In */
+ u64 policy; /* In */
+ u64 ma_gctx_paddr; /* In */
+ u32 ma_en:1; /* In */
+ u32 imi_en:1; /* In */
+ u32 rsvd:30;
+ u8 gosvw[16]; /* In */
+} __packed;
+
+/* SNP support page type */
+enum {
+ SNP_PAGE_TYPE_NORMAL = 0x1,
+ SNP_PAGE_TYPE_VMSA = 0x2,
+ SNP_PAGE_TYPE_ZERO = 0x3,
+ SNP_PAGE_TYPE_UNMEASURED = 0x4,
+ SNP_PAGE_TYPE_SECRET = 0x5,
+ SNP_PAGE_TYPE_CPUID = 0x6,
+
+ SNP_PAGE_TYPE_MAX
+};
+
+/**
+ * struct sev_data_snp_launch_update - SNP_LAUNCH_UPDATE command params
+ *
+ * @gctx_paddr: system physical address of guest context page
+ * @page_size: page size; 0 indicates a 4KB page, 1 a 2MB page
+ * @page_type: encoded page type
+ * @imi_page: indicates that this page is part of the IMI (Incoming Migration
+ * Image) of the guest
+ * @rsvd: reserved
+ * @rsvd2: reserved
+ * @address: system physical address of destination page to encrypt
+ * @rsvd3: reserved
+ * @vmpl1_perms: VMPL permission mask for VMPL1
+ * @vmpl2_perms: VMPL permission mask for VMPL2
+ * @vmpl3_perms: VMPL permission mask for VMPL3
+ * @rsvd4: reserved
+ */
+struct sev_data_snp_launch_update {
+ u64 gctx_paddr; /* In */
+ u32 page_size:1; /* In */
+ u32 page_type:3; /* In */
+ u32 imi_page:1; /* In */
+ u32 rsvd:27;
+ u32 rsvd2;
+ u64 address; /* In */
+ u32 rsvd3:8;
+ u32 vmpl1_perms:8; /* In */
+ u32 vmpl2_perms:8; /* In */
+ u32 vmpl3_perms:8; /* In */
+ u32 rsvd4;
+} __packed;
+
+/**
+ * struct sev_data_snp_launch_finish - SNP_LAUNCH_FINISH command params
+ *
+ * @gctx_paddr: system physical address of guest context page
+ * @id_block_paddr: system physical address of ID block
+ * @id_auth_paddr: system physical address of ID block authentication structure
+ * @id_block_en: indicates whether ID block is present
+ * @auth_key_en: indicates whether author key is present in authentication structure
+ * @rsvd: reserved
+ * @host_data: host-supplied data for guest, not interpreted by firmware
+ */
+struct sev_data_snp_launch_finish {
+ u64 gctx_paddr;
+ u64 id_block_paddr;
+ u64 id_auth_paddr;
+ u8 id_block_en:1;
+ u8 auth_key_en:1;
+ u64 rsvd:62;
+ u8 host_data[32];
+} __packed;
+
+/**
+ * struct sev_data_snp_guest_status - SNP_GUEST_STATUS command params
+ *
+ * @gctx_paddr: system physical address of guest context page
+ * @address: system physical address of guest status page
+ */
+struct sev_data_snp_guest_status {
+ u64 gctx_paddr;
+ u64 address;
+} __packed;
+
+/**
+ * struct sev_data_snp_page_reclaim - SNP_PAGE_RECLAIM command params
+ *
+ * @paddr: system physical address of page to be claimed. The 0th bit in the
+ * address indicates the page size. 0h indicates 4KB and 1h indicates
+ * 2MB page.
+ */
+struct sev_data_snp_page_reclaim {
+ u64 paddr;
+} __packed;
+
+/**
+ * struct sev_data_snp_page_unsmash - SNP_PAGE_UNSMASH command params
+ *
+ * @paddr: system physical address of page to be unsmashed. The 0th bit in the
+ * address indicates the page size. 0h indicates 4KB and 1h indicates
+ * 2MB page.
+ */
+struct sev_data_snp_page_unsmash {
+ u64 paddr;
+} __packed;
+
+/**
+ * struct sev_data_snp_dbg - DBG_ENCRYPT/DBG_DECRYPT command parameters
+ *
+ * @gctx_paddr: system physical address of guest context page
+ * @src_addr: source address of data to operate on
+ * @dst_addr: destination address of data to operate on
+ */
+struct sev_data_snp_dbg {
+ u64 gctx_paddr; /* In */
+ u64 src_addr; /* In */
+ u64 dst_addr; /* In */
+} __packed;
+
+/**
+ * struct sev_data_snp_guest_request - SNP_GUEST_REQUEST command params
+ *
+ * @gctx_paddr: system physical address of guest context page
+ * @req_paddr: system physical address of request page
+ * @res_paddr: system physical address of response page
+ */
+struct sev_data_snp_guest_request {
+ u64 gctx_paddr; /* In */
+ u64 req_paddr; /* In */
+ u64 res_paddr; /* In */
+} __packed;
+
+/**
+ * struct sev_data_snp_init_ex - SNP_INIT_EX structure
+ *
+ * @init_rmp: indicate that the RMP should be initialized.
+ * @list_paddr_en: indicate that list_paddr is valid
+ * @rsvd: reserved
+ * @rsvd1: reserved
+ * @list_paddr: system physical address of range list
+ * @rsvd2: reserved
+ */
+struct sev_data_snp_init_ex {
+ u32 init_rmp:1;
+ u32 list_paddr_en:1;
+ u32 rsvd:30;
+ u32 rsvd1;
+ u64 list_paddr;
+ u8 rsvd2[48];
+} __packed;
+
+/**
+ * struct sev_data_range - RANGE structure
+ *
+ * @base: system physical address of first byte of range
+ * @page_count: number of 4KB pages in this range
+ * @rsvd: reserved
+ */
+struct sev_data_range {
+ u64 base;
+ u32 page_count;
+ u32 rsvd;
+} __packed;
+
+/**
+ * struct sev_data_range_list - RANGE_LIST structure
+ *
+ * @num_elements: number of elements in RANGE_ARRAY
+ * @rsvd: reserved
+ * @ranges: array of num_elements of type RANGE
+ */
+struct sev_data_range_list {
+ u32 num_elements;
+ u32 rsvd;
+ struct sev_data_range ranges[];
+} __packed;
+
+/**
+ * struct sev_data_snp_shutdown_ex - SNP_SHUTDOWN_EX structure
+ *
+ * @len: length of the command buffer read by the PSP
+ * @iommu_snp_shutdown: Disable enforcement of SNP in the IOMMU
+ * @rsvd1: reserved
+ */
+struct sev_data_snp_shutdown_ex {
+ u32 len;
+ u32 iommu_snp_shutdown:1;
+ u32 rsvd1:31;
+} __packed;
+
+/**
+ * struct sev_platform_init_args
+ *
+ * @error: SEV firmware error code
+ * @probe: True if this is being called as part of CCP module probe, which
+ * will defer SEV_INIT/SEV_INIT_EX firmware initialization until needed
+ * unless psp_init_on_probe module param is set
+ */
+struct sev_platform_init_args {
+ int error;
+ bool probe;
+};
+
+/**
+ * struct sev_data_snp_commit - SNP_COMMIT structure
+ *
+ * @len: length of the command buffer read by the PSP
+ */
+struct sev_data_snp_commit {
+ u32 len;
+} __packed;
+
#ifdef CONFIG_CRYPTO_DEV_SP_PSP
/**
* sev_platform_init - perform SEV INIT command
*
- * @error: SEV command return code
+ * @args: struct sev_platform_init_args to pass in arguments
*
* Returns:
* 0 if the SEV successfully processed the command
@@ -537,7 +824,7 @@ struct sev_data_attestation_report {
* -%ETIMEDOUT if the SEV command timed out
* -%EIO if the SEV returned a non-zero return code
*/
-int sev_platform_init(int *error);
+int sev_platform_init(struct sev_platform_init_args *args);
/**
* sev_platform_status - perform SEV PLATFORM_STATUS command
@@ -637,14 +924,32 @@ int sev_guest_df_flush(int *error);
*/
int sev_guest_decommission(struct sev_data_decommission *data, int *error);
+/**
+ * sev_do_cmd - issue an SEV or an SEV-SNP command
+ *
+ * @cmd: SEV or SEV-SNP firmware command to issue
+ * @data: arguments for firmware command
+ * @psp_ret: SEV command return code
+ *
+ * Returns:
+ * 0 if the SEV device successfully processed the command
+ * -%ENODEV if the PSP device is not available
+ * -%ENOTSUPP if PSP device does not support SEV
+ * -%ETIMEDOUT if the SEV command timed out
+ * -%EIO if PSP device returned a non-zero return code
+ */
+int sev_do_cmd(int cmd, void *data, int *psp_ret);
+
void *psp_copy_user_blob(u64 uaddr, u32 len);
+void *snp_alloc_firmware_page(gfp_t mask);
+void snp_free_firmware_page(void *addr);
#else /* !CONFIG_CRYPTO_DEV_SP_PSP */
static inline int
sev_platform_status(struct sev_user_data_status *status, int *error) { return -ENODEV; }
-static inline int sev_platform_init(int *error) { return -ENODEV; }
+static inline int sev_platform_init(struct sev_platform_init_args *args) { return -ENODEV; }
static inline int
sev_guest_deactivate(struct sev_data_deactivate *data, int *error) { return -ENODEV; }
@@ -653,6 +958,9 @@ static inline int
sev_guest_decommission(struct sev_data_decommission *data, int *error) { return -ENODEV; }
static inline int
+sev_do_cmd(int cmd, void *data, int *psp_ret) { return -ENODEV; }
+
+static inline int
sev_guest_activate(struct sev_data_activate *data, int *error) { return -ENODEV; }
static inline int sev_guest_df_flush(int *error) { return -ENODEV; }
@@ -662,6 +970,13 @@ sev_issue_cmd_external_user(struct file *filep, unsigned int id, void *data, int
static inline void *psp_copy_user_blob(u64 __user uaddr, u32 len) { return ERR_PTR(-EINVAL); }
+static inline void *snp_alloc_firmware_page(gfp_t mask)
+{
+ return NULL;
+}
+
+static inline void snp_free_firmware_page(void *addr) { }
+
#endif /* CONFIG_CRYPTO_DEV_SP_PSP */
#endif /* __PSP_SEV_H__ */
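The reworked sev_platform_init() passes its arguments and returns the firmware error through a struct; a minimal sketch of the new calling convention:

    struct sev_platform_init_args args = { .probe = false };
    int rc = sev_platform_init(&args);

    if (rc)
        pr_err("SEV init failed: %d (fw error %d)\n", rc, args.error);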
diff --git a/include/linux/pti.h b/include/linux/pti.h
index 1a941efcaa62..1fbf9d6c20ef 100644
--- a/include/linux/pti.h
+++ b/include/linux/pti.h
@@ -2,7 +2,7 @@
#ifndef _INCLUDE_PTI_H
#define _INCLUDE_PTI_H
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
#include <asm/pti.h>
#else
static inline void pti_init(void) { }
diff --git a/include/linux/ras.h b/include/linux/ras.h
index 1f4048bf2674..a64182bc72ad 100644
--- a/include/linux/ras.h
+++ b/include/linux/ras.h
@@ -25,6 +25,7 @@ void log_non_standard_event(const guid_t *sec_type,
const guid_t *fru_id, const char *fru_text,
const u8 sev, const u8 *err, const u32 len);
void log_arm_hw_error(struct cper_sec_proc_arm *err);
+
#else
static inline void
log_non_standard_event(const guid_t *sec_type,
@@ -35,4 +36,21 @@ static inline void
log_arm_hw_error(struct cper_sec_proc_arm *err) { return; }
#endif
+struct atl_err {
+ u64 addr;
+ u64 ipid;
+ u32 cpu;
+};
+
+#if IS_ENABLED(CONFIG_AMD_ATL)
+void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *));
+void amd_atl_unregister_decoder(void);
+void amd_retire_dram_row(struct atl_err *err);
+unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err);
+#else
+static inline void amd_retire_dram_row(struct atl_err *err) { }
+static inline unsigned long
+amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; }
+#endif /* CONFIG_AMD_ATL */
+
#endif /* __RAS_H__ */
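A sketch of how a consumer of the new AMD address translation hooks might use them (the decoder name is hypothetical):

    static unsigned long my_umc_decoder(struct atl_err *err)
    {
        /* translate err->addr / err->ipid into a system physical address */
        return 0;
    }

    amd_atl_register_decoder(my_umc_decoder);
    /* ... decode reported errors ... */
    amd_atl_unregister_decoder();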
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 66942d7fba7f..a365f67131ec 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -6,6 +6,12 @@
#include <linux/list.h>
#include <linux/pid.h>
+/* CLOSID, RMID value used by the default control group */
+#define RESCTRL_RESERVED_CLOSID 0
+#define RESCTRL_RESERVED_RMID 0
+
+#define RESCTRL_PICK_ANY_CPU -1
+
#ifdef CONFIG_PROC_CPU_RESCTRL
int proc_resctrl_show(struct seq_file *m,
@@ -153,7 +159,7 @@ struct resctrl_schema;
* @cache_level: Which cache level defines scope of this resource
* @cache: Cache allocation related data
* @membw: If the component has bandwidth controls, their properties.
- * @domains: All domains for this resource
+ * @domains: RCU list of all domains for this resource
* @name: Name to use in "schemata" file.
* @data_width: Character width of data when displaying
* @default_ctrl: Specifies default cache cbm or memory B/W percent.
@@ -219,36 +225,70 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d,
u32 closid, enum resctrl_conf_type type);
int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d);
void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d);
+void resctrl_online_cpu(unsigned int cpu);
+void resctrl_offline_cpu(unsigned int cpu);
/**
* resctrl_arch_rmid_read() - Read the eventid counter corresponding to rmid
* for this resource and domain.
* @r: resource that the counter should be read from.
* @d: domain that the counter should be read from.
+ * @closid: closid that matches the rmid. Depending on the architecture, the
+ * counter may match traffic of both @closid and @rmid, or @rmid
+ * only.
* @rmid: rmid of the counter to read.
* @eventid: eventid to read, e.g. L3 occupancy.
* @val: result of the counter read in bytes.
+ * @arch_mon_ctx: An architecture specific value from
+ * resctrl_arch_mon_ctx_alloc(), for MPAM this identifies
+ * the hardware monitor allocated for this read request.
*
- * Call from process context on a CPU that belongs to domain @d.
+ * Some architectures need to sleep when first programming some of the counters.
+ * (specifically: arm64's MPAM cache occupancy counters can return 'not ready'
+ * for a short period of time). Call from a non-migratable process context on
+ * a CPU that belongs to domain @d, e.g. use smp_call_on_cpu() or
+ * schedule_work_on(). This function can be called with interrupts masked,
+ * e.g. using smp_call_function_any(), but may consistently return an error.
*
* Return:
* 0 on success, or -EIO, -EINVAL etc on error.
*/
int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
- u32 rmid, enum resctrl_event_id eventid, u64 *val);
+ u32 closid, u32 rmid, enum resctrl_event_id eventid,
+ u64 *val, void *arch_mon_ctx);
+
+/**
+ * resctrl_arch_rmid_read_context_check() - warn about invalid contexts
+ *
+ * When built with CONFIG_DEBUG_ATOMIC_SLEEP generate a warning when
+ * resctrl_arch_rmid_read() is called with preemption disabled.
+ *
+ * The contract with resctrl_arch_rmid_read() is that if interrupts
+ * are unmasked, it can sleep. This allows NOHZ_FULL systems to use an
+ * IPI (and fail if the call needed to sleep), while most of the time
+ * the work is scheduled, allowing the call to sleep.
+ */
+static inline void resctrl_arch_rmid_read_context_check(void)
+{
+ if (!irqs_disabled())
+ might_sleep();
+}
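A sketch of the calling convention the comment above asks for (helper and struct names hypothetical): routing the read through smp_call_on_cpu() provides a sleepable, non-migratable context on a CPU of domain @d:

    static int read_rmid(void *info)
    {
        struct rmid_read *rr = info;

        return resctrl_arch_rmid_read(rr->r, rr->d, rr->closid, rr->rmid,
                                      rr->evtid, &rr->val, rr->arch_mon_ctx);
    }

    err = smp_call_on_cpu(cpu, read_rmid, &rr, false);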
/**
* resctrl_arch_reset_rmid() - Reset any private state associated with rmid
* and eventid.
* @r: The domain's resource.
* @d: The rmid's domain.
+ * @closid: closid that matches the rmid. Depending on the architecture, the
+ * counter may match traffic of both @closid and @rmid, or @rmid only.
* @rmid: The rmid whose counter values should be reset.
* @eventid: The eventid whose counter values should be reset.
*
* This can be called from any CPU.
*/
void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
- u32 rmid, enum resctrl_event_id eventid);
+ u32 closid, u32 rmid,
+ enum resctrl_event_id eventid);
/**
* resctrl_arch_reset_rmid_all() - Reset all private state associated with
diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index a8b28647aafc..b04a5d04dee9 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -117,13 +117,13 @@ SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS)
/*
- * Domain members share CPU package resources (i.e. caches)
+ * Domain members share CPU Last Level Caches
*
* SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
* the same cache(s).
* NEEDS_GROUPS: Caches are shared between groups.
*/
-SD_FLAG(SD_SHARE_PKG_RESOURCES, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+SD_FLAG(SD_SHARE_LLC, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
/*
* Only a single load balancing instance
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 11e0e00e0bb9..18572c9ea724 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -38,21 +38,21 @@ extern const struct sd_flag_debug sd_flag_debug[];
#ifdef CONFIG_SCHED_SMT
static inline int cpu_smt_flags(void)
{
- return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
+ return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC;
}
#endif
#ifdef CONFIG_SCHED_CLUSTER
static inline int cpu_cluster_flags(void)
{
- return SD_CLUSTER | SD_SHARE_PKG_RESOURCES;
+ return SD_CLUSTER | SD_SHARE_LLC;
}
#endif
#ifdef CONFIG_SCHED_MC
static inline int cpu_core_flags(void)
{
- return SD_SHARE_PKG_RESOURCES;
+ return SD_SHARE_LLC;
}
#endif
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 7398ce99786c..fcd61dfe2af3 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -105,6 +105,12 @@ static inline void on_each_cpu_cond(smp_cond_func_t cond_func,
on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask);
}
+/*
+ * Architecture specific boot CPU setup. Defined as empty weak function in
+ * init/main.c. Architectures can override it.
+ */
+void smp_prepare_boot_cpu(void);
+
#ifdef CONFIG_SMP
#include <linux/preempt.h>
@@ -171,12 +177,6 @@ void generic_smp_call_function_single_interrupt(void);
#define generic_smp_call_function_interrupt \
generic_smp_call_function_single_interrupt
-/*
- * Mark the boot cpu "online" so that it can call console drivers in
- * printk() and can access its per-cpu storage.
- */
-void smp_prepare_boot_cpu(void);
-
extern unsigned int setup_max_cpus;
extern void __init setup_nr_cpu_ids(void);
extern void __init smp_init(void);
@@ -203,7 +203,6 @@ static inline void up_smp_call_function(smp_call_func_t func, void *info)
(up_smp_call_function(func, info))
static inline void smp_send_reschedule(int cpu) { }
-#define smp_prepare_boot_cpu() do {} while (0)
#define smp_call_function_many(mask, func, info, wait) \
(up_smp_call_function(func, info))
static inline void call_function_init(void) { }
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 44fddfa93e18..4924a33700b7 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -12,6 +12,7 @@
#include <linux/cpumask.h>
#include <linux/sched.h>
#include <linux/rcupdate.h>
+#include <linux/static_key.h>
#ifdef CONFIG_GENERIC_CLOCKEVENTS
extern void __init tick_init(void);
@@ -69,6 +70,8 @@ enum tick_broadcast_state {
TICK_BROADCAST_ENTER,
};
+extern struct static_key_false arch_needs_tick_broadcast;
+
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
extern void tick_broadcast_control(enum tick_broadcast_mode mode);
#else
@@ -164,9 +167,16 @@ static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
#endif /* !CONFIG_NO_HZ_COMMON */
+/*
+ * Mask of CPUs that are nohz_full.
+ *
+ * Users should be guarded by CONFIG_NO_HZ_FULL or a tick_nohz_full_cpu()
+ * check.
+ */
+extern cpumask_var_t tick_nohz_full_mask;
+
#ifdef CONFIG_NO_HZ_FULL
extern bool tick_nohz_full_running;
-extern cpumask_var_t tick_nohz_full_mask;
static inline bool tick_nohz_full_enabled(void)
{
diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index 780a5f6ad4a6..ff27cb2e1662 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -93,7 +93,7 @@ extern const struct nft_set_type nft_set_bitmap_type;
extern const struct nft_set_type nft_set_pipapo_type;
extern const struct nft_set_type nft_set_pipapo_avx2_type;
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
bool nft_rhash_lookup(const struct net *net, const struct nft_set *set,
const u32 *key, const struct nft_set_ext **ext);
bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
diff --git a/include/net/tc_wrapper.h b/include/net/tc_wrapper.h
index a608546bcefc..ffe58a02537c 100644
--- a/include/net/tc_wrapper.h
+++ b/include/net/tc_wrapper.h
@@ -4,7 +4,7 @@
#include <net/pkt_cls.h>
-#if IS_ENABLED(CONFIG_RETPOLINE)
+#if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)
#include <linux/cpufeature.h>
#include <linux/static_key.h>
diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h
index b44ba7dcdefc..b7a2c2ee35b7 100644
--- a/include/uapi/linux/psp-sev.h
+++ b/include/uapi/linux/psp-sev.h
@@ -28,6 +28,9 @@ enum {
SEV_PEK_CERT_IMPORT,
SEV_GET_ID, /* This command is deprecated, use SEV_GET_ID2 */
SEV_GET_ID2,
+ SNP_PLATFORM_STATUS,
+ SNP_COMMIT,
+ SNP_SET_CONFIG,
SEV_MAX,
};
@@ -69,6 +72,12 @@ typedef enum {
SEV_RET_RESOURCE_LIMIT,
SEV_RET_SECURE_DATA_INVALID,
SEV_RET_INVALID_KEY = 0x27,
+ SEV_RET_INVALID_PAGE_SIZE,
+ SEV_RET_INVALID_PAGE_STATE,
+ SEV_RET_INVALID_MDATA_ENTRY,
+ SEV_RET_INVALID_PAGE_OWNER,
+ SEV_RET_INVALID_PAGE_AEAD_OFLOW,
+ SEV_RET_RMP_INIT_REQUIRED,
SEV_RET_MAX,
} sev_ret_code;
@@ -156,6 +165,56 @@ struct sev_user_data_get_id2 {
} __packed;
/**
+ * struct sev_user_data_snp_status - SNP status
+ *
+ * @api_major: API major version
+ * @api_minor: API minor version
+ * @state: current platform state
+ * @is_rmp_initialized: whether RMP is initialized or not
+ * @rsvd: reserved
+ * @build_id: firmware build id for the API version
+ * @mask_chip_id: whether chip id is present in attestation reports or not
+ * @mask_chip_key: whether attestation reports are signed or not
+ * @vlek_en: VLEK (Version Loaded Endorsement Key) hashstick is loaded
+ * @rsvd1: reserved
+ * @guest_count: the number of guests currently managed by the firmware
+ * @current_tcb_version: current TCB version
+ * @reported_tcb_version: reported TCB version
+ */
+struct sev_user_data_snp_status {
+ __u8 api_major; /* Out */
+ __u8 api_minor; /* Out */
+ __u8 state; /* Out */
+ __u8 is_rmp_initialized:1; /* Out */
+ __u8 rsvd:7;
+ __u32 build_id; /* Out */
+ __u32 mask_chip_id:1; /* Out */
+ __u32 mask_chip_key:1; /* Out */
+ __u32 vlek_en:1; /* Out */
+ __u32 rsvd1:29;
+ __u32 guest_count; /* Out */
+ __u64 current_tcb_version; /* Out */
+ __u64 reported_tcb_version; /* Out */
+} __packed;
+
+/**
+ * struct sev_user_data_snp_config - system wide configuration value for SNP.
+ *
+ * @reported_tcb: the TCB version to report in the guest attestation report.
+ * @mask_chip_id: whether chip id is present in attestation reports or not
+ * @mask_chip_key: whether attestation reports are signed or not
+ * @rsvd: reserved
+ * @rsvd1: reserved
+ */
+struct sev_user_data_snp_config {
+ __u64 reported_tcb; /* In */
+ __u32 mask_chip_id:1; /* In */
+ __u32 mask_chip_key:1; /* In */
+ __u32 rsvd:30; /* In */
+ __u8 rsvd1[52];
+} __packed;
+
+/**
* struct sev_issue_cmd - SEV ioctl parameters
*
* @cmd: SEV commands to execute
diff --git a/init/main.c b/init/main.c
index b5bdc1cd7ae9..ac5bc4c3de6d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -774,6 +774,10 @@ void __init __weak smp_setup_processor_id(void)
{
}
+void __init __weak smp_prepare_boot_cpu(void)
+{
+}
+
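With the weak stub above, an architecture that needs boot-CPU setup simply provides its own definition in arch code instead of relying on a per-config #define; a minimal sketch:

    void __init smp_prepare_boot_cpu(void)
    {
        /* e.g. mark the boot CPU online and finish its per-CPU setup */
    }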
# if THREAD_SIZE >= PAGE_SIZE
void __init __weak thread_stack_cache_init(void)
{
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 185bd1c906b0..6083883c4fe0 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -223,9 +223,10 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem)
void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
{
+ bool contended = false;
+
might_sleep();
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
- trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE);
/* Notify readers to take the slow path. */
rcu_sync_enter(&sem->rss);
@@ -234,8 +235,11 @@ void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
* Try set sem->block; this provides writer-writer exclusion.
* Having sem->block set makes new readers block.
*/
- if (!__percpu_down_write_trylock(sem))
+ if (!__percpu_down_write_trylock(sem)) {
+ trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE);
percpu_rwsem_wait(sem, /* .reader = */ false);
+ contended = true;
+ }
/* smp_mb() implied by __percpu_down_write_trylock() on success -- D matches A */
@@ -247,7 +251,8 @@ void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
/* Wait for all active readers to complete. */
rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE);
- trace_contention_end(sem, 0);
+ if (contended)
+ trace_contention_end(sem, 0);
}
EXPORT_SYMBOL_GPL(percpu_down_write);
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 6a0184e9c234..ae2b12f68b90 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -294,8 +294,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
{
struct pv_node *pn = (struct pv_node *)node;
struct pv_node *pp = (struct pv_node *)prev;
+ bool __maybe_unused wait_early;
int loop;
- bool wait_early;
for (;;) {
for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 4a10e8c16fd2..88d08eeb8bc0 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -237,12 +237,13 @@ static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
*/
static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock)
{
- unsigned long owner, *p = (unsigned long *) &lock->owner;
+ unsigned long *p = (unsigned long *) &lock->owner;
+ unsigned long owner, new;
+ owner = READ_ONCE(*p);
do {
- owner = *p;
- } while (cmpxchg_relaxed(p, owner,
- owner | RT_MUTEX_HAS_WAITERS) != owner);
+ new = owner | RT_MUTEX_HAS_WAITERS;
+ } while (!try_cmpxchg_relaxed(p, &owner, new));
/*
* The cmpxchg loop above is relaxed to avoid back-to-back ACQUIRE
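The switch to try_cmpxchg_relaxed() drops the explicit reload of *p: on failure, try_cmpxchg writes the value it actually found back into @owner, so the next iteration starts from fresh state without a separate load. A sketch of the generic fallback semantics, not any per-arch implementation:

static inline bool try_cmpxchg_relaxed_sketch(unsigned long *p,
                                              unsigned long *old,
                                              unsigned long new)
{
        unsigned long cur = cmpxchg_relaxed(p, *old, new); /* one atomic op */

        if (cur == *old)
                return true;    /* swap happened */
        *old = cur;             /* failed: refresh the caller's copy */
        return false;
}

On architectures whose compare-and-swap instruction returns the old value, this also lets the compiler reuse that result instead of issuing another load.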
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 2340b6d90ec6..c6d17aee4209 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -35,7 +35,7 @@
/*
* The least significant 2 bits of the owner value has the following
* meanings when set.
- * - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers
+ * - Bit 0: RWSEM_READER_OWNED - rwsem may be owned by readers (just a hint)
* - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock
*
* When the rwsem is reader-owned and a spinning writer has timed out,
@@ -1002,8 +1002,8 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
/*
* To prevent a constant stream of readers from starving a sleeping
- * waiter, don't attempt optimistic lock stealing if the lock is
- * currently owned by readers.
+ * writer, don't attempt optimistic lock stealing if the lock is
+ * very likely owned by readers.
*/
if ((atomic_long_read(&sem->owner) & RWSEM_READER_OWNED) &&
(rcnt > 1) && !(count & RWSEM_WRITER_LOCKED))
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5f76ec5b0c0c..b35382dbe51d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1821,7 +1821,6 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css);
#endif
#ifdef CONFIG_SYSCTL
-#ifdef CONFIG_UCLAMP_TASK
#ifdef CONFIG_UCLAMP_TASK_GROUP
static void uclamp_update_root_tg(void)
{
@@ -1927,7 +1926,6 @@ undo:
return result;
}
#endif
-#endif
static int uclamp_validate(struct task_struct *p,
const struct sched_attr *attr)
@@ -2096,7 +2094,7 @@ static void __init init_uclamp(void)
}
}
-#else /* CONFIG_UCLAMP_TASK */
+#else /* !CONFIG_UCLAMP_TASK */
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
static inline int uclamp_validate(struct task_struct *p,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c0d0b577fd91..db34050ec3e6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7335,7 +7335,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
if (!available_idle_cpu(cpu)) {
idle = false;
if (*idle_cpu == -1) {
- if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) {
+ if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
*idle_cpu = cpu;
break;
}
@@ -7343,7 +7343,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
}
break;
}
- if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr))
+ if (*idle_cpu == -1 && cpumask_test_cpu(cpu, cpus))
*idle_cpu = cpu;
}
@@ -7357,13 +7357,19 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
/*
* Scan the local SMT mask for idle CPUs.
*/
-static int select_idle_smt(struct task_struct *p, int target)
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
int cpu;
for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) {
if (cpu == target)
continue;
+ /*
+ * Check if the CPU is in the LLC scheduling domain of @target.
+ * Due to isolcpus, there is no guarantee that all the siblings are in the domain.
+ */
+ if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
+ continue;
if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
return cpu;
}
@@ -7387,7 +7393,7 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma
return __select_idle_cpu(core, p);
}
-static inline int select_idle_smt(struct task_struct *p, int target)
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
return -1;
}
@@ -7637,7 +7643,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
has_idle_core = test_idle_cores(target);
if (!has_idle_core && cpus_share_cache(prev, target)) {
- i = select_idle_smt(p, prev);
+ i = select_idle_smt(p, sd, prev);
if ((unsigned int)i < nr_cpumask_bits)
return i;
}
@@ -9330,19 +9336,17 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
static inline bool others_have_blocked(struct rq *rq)
{
- if (READ_ONCE(rq->avg_rt.util_avg))
+ if (cpu_util_rt(rq))
return true;
- if (READ_ONCE(rq->avg_dl.util_avg))
+ if (cpu_util_dl(rq))
return true;
if (thermal_load_avg(rq))
return true;
-#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
- if (READ_ONCE(rq->avg_irq.util_avg))
+ if (cpu_util_irq(rq))
return true;
-#endif
return false;
}
@@ -9599,8 +9603,8 @@ static unsigned long scale_rt_capacity(int cpu)
* avg_thermal.load_avg tracks thermal pressure and the weighted
* average uses the actual delta max capacity(load).
*/
- used = READ_ONCE(rq->avg_rt.util_avg);
- used += READ_ONCE(rq->avg_dl.util_avg);
+ used = cpu_util_rt(rq);
+ used += cpu_util_dl(rq);
used += thermal_load_avg(rq);
if (unlikely(used >= max))
@@ -9833,51 +9837,49 @@ group_type group_classify(unsigned int imbalance_pct,
*/
static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
{
+ if (!(sd->flags & SD_ASYM_PACKING))
+ return false;
+
if (!sched_smt_active())
return true;
return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu);
}
+static inline bool sched_asym(struct sched_domain *sd, int dst_cpu, int src_cpu)
+{
+ /*
+ * First check if @dst_cpu can do asym_packing load balance. Only do it
+ * if it has higher priority than @src_cpu.
+ */
+ return sched_use_asym_prio(sd, dst_cpu) &&
+ sched_asym_prefer(dst_cpu, src_cpu);
+}
+
/**
- * sched_asym - Check if the destination CPU can do asym_packing load balance
+ * sched_group_asym - Check if the destination CPU can do asym_packing balance
* @env: The load balancing environment
- * @sds: Load-balancing data with statistics of the local group
* @sgs: Load-balancing statistics of the candidate busiest group
* @group: The candidate busiest group
*
* @env::dst_cpu can do asym_packing if it has higher priority than the
* preferred CPU of @group.
*
- * SMT is a special case. If we are balancing load between cores, @env::dst_cpu
- * can do asym_packing balance only if all its SMT siblings are idle. Also, it
- * can only do it if @group is an SMT group and has exactly on busy CPU. Larger
- * imbalances in the number of CPUS are dealt with in find_busiest_group().
- *
- * If we are balancing load within an SMT core, or at PKG domain level, always
- * proceed.
- *
* Return: true if @env::dst_cpu can do asym_packing load balance. False
* otherwise.
*/
static inline bool
-sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs,
- struct sched_group *group)
+sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group)
{
- /* Ensure that the whole local core is idle, if applicable. */
- if (!sched_use_asym_prio(env->sd, env->dst_cpu))
- return false;
-
/*
- * CPU priorities does not make sense for SMT cores with more than one
+ * CPU priorities do not make sense for SMT cores with more than one
* busy sibling.
*/
- if (group->flags & SD_SHARE_CPUCAPACITY) {
- if (sgs->group_weight - sgs->idle_cpus != 1)
- return false;
- }
+ if ((group->flags & SD_SHARE_CPUCAPACITY) &&
+ (sgs->group_weight - sgs->idle_cpus != 1))
+ return false;
- return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
+ return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu);
}
/* One group has more than one SMT CPU while the other group does not */
@@ -10031,11 +10033,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_weight = group->group_weight;
/* Check if dst CPU is idle and preferred to this group */
- if (!local_group && env->sd->flags & SD_ASYM_PACKING &&
- env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
- sched_asym(env, sds, sgs, group)) {
+ if (!local_group && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
+ sched_group_asym(env, sgs, group))
sgs->group_asym_packing = 1;
- }
/* Check for loaded SMT group to be balanced to dst CPU */
if (!local_group && smt_balance(env, sgs, group))
@@ -10099,9 +10099,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
switch (sgs->group_type) {
case group_overloaded:
/* Select the overloaded group with highest avg_load. */
- if (sgs->avg_load <= busiest->avg_load)
- return false;
- break;
+ return sgs->avg_load > busiest->avg_load;
case group_imbalanced:
/*
@@ -10112,18 +10110,14 @@ static bool update_sd_pick_busiest(struct lb_env *env,
case group_asym_packing:
/* Prefer to move from lowest priority CPU's work */
- if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
- return false;
- break;
+ return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu);
case group_misfit_task:
/*
* If we have more than one misfit sg go with the biggest
* misfit.
*/
- if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
- return false;
- break;
+ return sgs->group_misfit_task_load > busiest->group_misfit_task_load;
case group_smt_balance:
/*
@@ -10275,10 +10269,8 @@ static int idle_cpu_without(int cpu, struct task_struct *p)
* be computed and tested before calling idle_cpu_without().
*/
-#ifdef CONFIG_SMP
if (rq->ttwu_pending)
return 0;
-#endif
return 1;
}
@@ -10671,16 +10663,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
update_sg_lb_stats(env, sds, sg, sgs, &sg_status);
- if (local_group)
- goto next_group;
-
-
- if (update_sd_pick_busiest(env, sds, sg, sgs)) {
+ if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
sds->busiest_stat = *sgs;
}
-next_group:
/* Now, start updating sd_lb_stats */
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
@@ -10784,7 +10771,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
if (local->group_type == group_has_spare) {
if ((busiest->group_type > group_fully_busy) &&
- !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
+ !(env->sd->flags & SD_SHARE_LLC)) {
/*
* If busiest is overloaded, try to fill spare
* capacity. This might end up creating spare capacity
@@ -11140,10 +11127,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
* If balancing between cores, let lower priority CPUs help
* SMT cores with more than one busy sibling.
*/
- if ((env->sd->flags & SD_ASYM_PACKING) &&
- sched_use_asym_prio(env->sd, i) &&
- sched_asym_prefer(i, env->dst_cpu) &&
- nr_running == 1)
+ if (sched_asym(env->sd, i, env->dst_cpu) && nr_running == 1)
continue;
switch (env->migration_type) {
@@ -11239,8 +11223,7 @@ asym_active_balance(struct lb_env *env)
* the lower priority @env::dst_cpu help it. Do not follow
* CPU priority.
*/
- return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
- sched_use_asym_prio(env->sd, env->dst_cpu) &&
+ return env->idle != CPU_NOT_IDLE && sched_use_asym_prio(env->sd, env->dst_cpu) &&
(sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
!sched_use_asym_prio(env->sd, env->src_cpu));
}
@@ -12023,8 +12006,7 @@ static void nohz_balancer_kick(struct rq *rq)
* preferred CPU must be idle.
*/
for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
- if (sched_use_asym_prio(sd, i) &&
- sched_asym_prefer(i, cpu)) {
+ if (sched_asym(sd, i, cpu)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
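The open-coded READ_ONCE(rq->avg_*.util_avg) reads above become cpu_util_rt()/cpu_util_dl()/cpu_util_irq() calls; besides being shorter, the helpers hide the CONFIG_HAVE_SCHED_AVG_IRQ #ifdef that others_have_blocked() previously carried, because the !IRQ stub simply returns 0. A sketch of that accessor shape, matching the stubs in kernel/sched/sched.h:

#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
static inline unsigned long cpu_util_irq(struct rq *rq)
{
        return READ_ONCE(rq->avg_irq.util_avg);
}
#else
static inline unsigned long cpu_util_irq(struct rq *rq)
{
        return 0;       /* no IRQ PELT signal: callers need no #ifdef */
}
#endif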
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index b15d40cad7ea..6135fbe83d68 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -81,6 +81,25 @@ void __weak arch_cpu_idle(void)
cpu_idle_force_poll = 1;
}
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE
+DEFINE_STATIC_KEY_FALSE(arch_needs_tick_broadcast);
+
+static inline void cond_tick_broadcast_enter(void)
+{
+ if (static_branch_unlikely(&arch_needs_tick_broadcast))
+ tick_broadcast_enter();
+}
+
+static inline void cond_tick_broadcast_exit(void)
+{
+ if (static_branch_unlikely(&arch_needs_tick_broadcast))
+ tick_broadcast_exit();
+}
+#else
+static inline void cond_tick_broadcast_enter(void) { }
+static inline void cond_tick_broadcast_exit(void) { }
+#endif
+
/**
* default_idle_call - Default CPU idle routine.
*
@@ -90,6 +109,7 @@ void __cpuidle default_idle_call(void)
{
instrumentation_begin();
if (!current_clr_polling_and_test()) {
+ cond_tick_broadcast_enter();
trace_cpu_idle(1, smp_processor_id());
stop_critical_timings();
@@ -99,6 +119,7 @@ void __cpuidle default_idle_call(void)
start_critical_timings();
trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
+ cond_tick_broadcast_exit();
}
local_irq_enable();
instrumentation_end();
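Until an architecture flips the static key, both cond_tick_broadcast_*() helpers compile down to a patched-out branch, so the common idle path pays nothing. A hedged sketch of the arch side, assuming the key is made visible to arch code; the init function and the timer query are hypothetical names:

static int __init my_arch_idle_init(void)
{
        /*
         * If the local clockevent device stops in the idle state entered
         * from default_idle_call(), hand wakeups to the broadcast device.
         */
        if (my_arch_local_timer_stops_in_idle())
                static_branch_enable(&arch_needs_tick_broadcast);
        return 0;
}
early_initcall(my_arch_idle_init);

The matching GENERIC_CLOCKEVENTS_BROADCAST_IDLE Kconfig symbol added below is what compiles the conditional helpers in at all.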
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d60841064527..e85c6c76b878 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3153,7 +3153,7 @@ static inline bool uclamp_rq_is_idle(struct rq *rq)
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
static inline unsigned long cpu_util_irq(struct rq *rq)
{
- return rq->avg_irq.util_avg;
+ return READ_ONCE(rq->avg_irq.util_avg);
}
static inline
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index a9f0187939f6..dc6483b78aa9 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -661,13 +661,13 @@ static void destroy_sched_domains(struct sched_domain *sd)
}
/*
- * Keep a special pointer to the highest sched_domain that has
- * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
- * allows us to avoid some pointer chasing select_idle_sibling().
+ * Keep a special pointer to the highest sched_domain that has SD_SHARE_LLC set
+ * (Last Level Cache Domain), as this allows us to avoid some pointer chasing
+ * in select_idle_sibling().
*
- * Also keep a unique ID per domain (we use the first CPU number in
- * the cpumask of the domain), this allows us to quickly tell if
- * two CPUs are in the same cache domain, see cpus_share_cache().
+ * Also keep a unique ID per domain (we use the first CPU number in the cpumask
+ * of the domain); this allows us to quickly tell if two CPUs are in the same
+ * cache domain, see cpus_share_cache().
*/
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
@@ -688,7 +688,7 @@ static void update_top_cache_domain(int cpu)
int id = cpu;
int size = 1;
- sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+ sd = highest_flag_domain(cpu, SD_SHARE_LLC);
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
@@ -1555,11 +1555,12 @@ static struct cpumask ***sched_domains_numa_masks;
*
* These flags are purely descriptive of the topology and do not prescribe
* behaviour. Behaviour is artificial and mapped in the below sd_init()
- * function:
+ * function. For details, see include/linux/sched/sd_flags.h.
*
- * SD_SHARE_CPUCAPACITY - describes SMT topologies
- * SD_SHARE_PKG_RESOURCES - describes shared caches
- * SD_NUMA - describes NUMA topologies
+ * SD_SHARE_CPUCAPACITY
+ * SD_SHARE_LLC
+ * SD_CLUSTER
+ * SD_NUMA
*
* Odd one out, which besides describing the topology has a quirk also
* prescribes the desired behaviour that goes along with it:
@@ -1569,7 +1570,7 @@ static struct cpumask ***sched_domains_numa_masks;
#define TOPOLOGY_SD_FLAGS \
(SD_SHARE_CPUCAPACITY | \
SD_CLUSTER | \
- SD_SHARE_PKG_RESOURCES | \
+ SD_SHARE_LLC | \
SD_NUMA | \
SD_ASYM_PACKING)
@@ -1612,7 +1613,7 @@ sd_init(struct sched_domain_topology_level *tl,
| 0*SD_BALANCE_WAKE
| 1*SD_WAKE_AFFINE
| 0*SD_SHARE_CPUCAPACITY
- | 0*SD_SHARE_PKG_RESOURCES
+ | 0*SD_SHARE_LLC
| 0*SD_SERIALIZE
| 1*SD_PREFER_SIBLING
| 0*SD_NUMA
@@ -1649,7 +1650,7 @@ sd_init(struct sched_domain_topology_level *tl,
if (sd->flags & SD_SHARE_CPUCAPACITY) {
sd->imbalance_pct = 110;
- } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+ } else if (sd->flags & SD_SHARE_LLC) {
sd->imbalance_pct = 117;
sd->cache_nice_tries = 1;
@@ -1674,7 +1675,7 @@ sd_init(struct sched_domain_topology_level *tl,
* For all levels sharing cache; connect a sched_domain_shared
* instance.
*/
- if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+ if (sd->flags & SD_SHARE_LLC) {
sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
atomic_inc(&sd->shared->ref);
atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
@@ -2449,8 +2450,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
struct sched_domain *child = sd->child;
- if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child &&
- (child->flags & SD_SHARE_PKG_RESOURCES)) {
+ if (!(sd->flags & SD_SHARE_LLC) && child &&
+ (child->flags & SD_SHARE_LLC)) {
struct sched_domain __rcu *top_p;
unsigned int nr_llcs;
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index bae8f11070be..fc3b1a06c981 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -39,6 +39,11 @@ config GENERIC_CLOCKEVENTS_BROADCAST
bool
depends on GENERIC_CLOCKEVENTS
+# Handle broadcast in default_idle_call()
+config GENERIC_CLOCKEVENTS_BROADCAST_IDLE
+ bool
+ depends on GENERIC_CLOCKEVENTS_BROADCAST
+
# Automatically adjust the min. reprogramming time for
# clock event device
config GENERIC_CLOCKEVENTS_MIN_ADJUST
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index aa332ace108b..3103a484182e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1022,7 +1022,7 @@ static inline u64 rb_time_stamp(struct trace_buffer *buffer)
u64 ts;
/* Skip retpolines :-( */
- if (IS_ENABLED(CONFIG_RETPOLINE) && likely(buffer->clock == trace_clock_local))
+ if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && likely(buffer->clock == trace_clock_local))
ts = trace_clock_local();
else
ts = buffer->clock();
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index f3b50b47b7ea..6c596e65de8a 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1303,7 +1303,7 @@ config PROVE_LOCKING
select DEBUG_SPINLOCK
select DEBUG_MUTEXES if !PREEMPT_RT
select DEBUG_RT_MUTEXES if RT_MUTEXES
- select DEBUG_RWSEMS
+ select DEBUG_RWSEMS if !PREEMPT_RT
select DEBUG_WW_MUTEX_SLOWPATH
select DEBUG_LOCK_ALLOC
select PREEMPT_COUNT if !ARCH_NO_PREEMPT
@@ -1426,7 +1426,7 @@ config DEBUG_WW_MUTEX_SLOWPATH
config DEBUG_RWSEMS
bool "RW Semaphore debugging: basic checks"
- depends on DEBUG_KERNEL
+ depends on DEBUG_KERNEL && !PREEMPT_RT
help
This debugging feature allows mismatched rw semaphore locks
and unlocks to be detected and reported.
diff --git a/lib/raid6/s390vx.uc b/lib/raid6/s390vx.uc
index cd0b9e95f499..863e2d320938 100644
--- a/lib/raid6/s390vx.uc
+++ b/lib/raid6/s390vx.uc
@@ -12,15 +12,14 @@
*/
#include <linux/raid/pq.h>
-#include <asm/fpu/api.h>
-#include <asm/vx-insn.h>
+#include <asm/fpu.h>
#define NSIZE 16
-static inline void LOAD_CONST(void)
+static __always_inline void LOAD_CONST(void)
{
- asm volatile("VREPIB %v24,7");
- asm volatile("VREPIB %v25,0x1d");
+ fpu_vrepib(24, 0x07);
+ fpu_vrepib(25, 0x1d);
}
/*
@@ -28,10 +27,7 @@ static inline void LOAD_CONST(void)
* vector register y left by 1 bit and stores the result in
* vector register x.
*/
-static inline void SHLBYTE(int x, int y)
-{
- asm volatile ("VAB %0,%1,%1" : : "i" (x), "i" (y));
-}
+#define SHLBYTE(x, y) fpu_vab(x, y, y)
/*
* For each of the 16 bytes in the vector register y the MASK()
@@ -39,49 +35,17 @@ static inline void SHLBYTE(int x, int y)
* or 0x00 if the high bit is 0. The result is stored in vector
* register x.
*/
-static inline void MASK(int x, int y)
-{
- asm volatile ("VESRAVB %0,%1,24" : : "i" (x), "i" (y));
-}
-
-static inline void AND(int x, int y, int z)
-{
- asm volatile ("VN %0,%1,%2" : : "i" (x), "i" (y), "i" (z));
-}
-
-static inline void XOR(int x, int y, int z)
-{
- asm volatile ("VX %0,%1,%2" : : "i" (x), "i" (y), "i" (z));
-}
+#define MASK(x, y) fpu_vesravb(x, y, 24)
-static inline void LOAD_DATA(int x, u8 *ptr)
-{
- typedef struct { u8 _[16 * $#]; } addrtype;
- register addrtype *__ptr asm("1") = (addrtype *) ptr;
-
- asm volatile ("VLM %2,%3,0,%1"
- : : "m" (*__ptr), "a" (__ptr), "i" (x),
- "i" (x + $# - 1));
-}
-
-static inline void STORE_DATA(int x, u8 *ptr)
-{
- typedef struct { u8 _[16 * $#]; } addrtype;
- register addrtype *__ptr asm("1") = (addrtype *) ptr;
-
- asm volatile ("VSTM %2,%3,0,1"
- : "=m" (*__ptr) : "a" (__ptr), "i" (x),
- "i" (x + $# - 1));
-}
-
-static inline void COPY_VEC(int x, int y)
-{
- asm volatile ("VLR %0,%1" : : "i" (x), "i" (y));
-}
+#define AND(x, y, z) fpu_vn(x, y, z)
+#define XOR(x, y, z) fpu_vx(x, y, z)
+#define LOAD_DATA(x, ptr) fpu_vlm(x, x + $# - 1, ptr)
+#define STORE_DATA(x, ptr) fpu_vstm(x, x + $# - 1, ptr)
+#define COPY_VEC(x, y) fpu_vlr(x, y)
static void raid6_s390vx$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
- struct kernel_fpu vxstate;
+ DECLARE_KERNEL_FPU_ONSTACK32(vxstate);
u8 **dptr, *p, *q;
int d, z, z0;
@@ -114,7 +78,7 @@ static void raid6_s390vx$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
static void raid6_s390vx$#_xor_syndrome(int disks, int start, int stop,
size_t bytes, void **ptrs)
{
- struct kernel_fpu vxstate;
+ DECLARE_KERNEL_FPU_ONSTACK32(vxstate);
u8 **dptr, *p, *q;
int d, z, z0;
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 882ea7c80f2d..202ce0d1666c 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -101,7 +101,7 @@ endif
endif
ifdef CONFIG_NFT_CT
-ifdef CONFIG_RETPOLINE
+ifdef CONFIG_MITIGATION_RETPOLINE
nf_tables-objs += nft_ct_fast.o
endif
endif
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index c3e635364701..a48d5f0e2f3e 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -21,7 +21,7 @@
#include <net/netfilter/nf_log.h>
#include <net/netfilter/nft_meta.h>
-#if defined(CONFIG_RETPOLINE) && defined(CONFIG_X86)
+#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_X86)
static struct static_key_false nf_tables_skip_direct_calls;
@@ -207,7 +207,7 @@ static void expr_call_ops_eval(const struct nft_expr *expr,
struct nft_regs *regs,
struct nft_pktinfo *pkt)
{
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
unsigned long e;
if (nf_skip_indirect_calls())
@@ -236,7 +236,7 @@ static void expr_call_ops_eval(const struct nft_expr *expr,
X(e, nft_objref_map_eval);
#undef X
indirect_call:
-#endif /* CONFIG_RETPOLINE */
+#endif /* CONFIG_MITIGATION_RETPOLINE */
expr->ops->eval(expr, regs, pkt);
}
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 255640013ab8..452ed94c3a4d 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -754,7 +754,7 @@ static bool nft_ct_set_reduce(struct nft_regs_track *track,
return false;
}
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
static const struct nft_expr_ops nft_ct_get_fast_ops = {
.type = &nft_ct_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_ct)),
@@ -799,7 +799,7 @@ nft_ct_select_ops(const struct nft_ctx *ctx,
return ERR_PTR(-EINVAL);
if (tb[NFTA_CT_DREG]) {
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
u32 k = ntohl(nla_get_be32(tb[NFTA_CT_KEY]));
switch (k) {
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index 870e5b113d13..a0055f510e31 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -24,7 +24,7 @@ struct nft_lookup {
struct nft_set_binding binding;
};
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
bool nft_set_do_lookup(const struct net *net, const struct nft_set *set,
const u32 *key, const struct nft_set_ext **ext)
{
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 36b025cc4fd2..87f6e3c6daa8 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -2410,7 +2410,7 @@ static struct pernet_operations psched_net_ops = {
.exit = psched_net_exit,
};
-#if IS_ENABLED(CONFIG_RETPOLINE)
+#if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)
DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper);
#endif
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index da5aa5aed1e3..13067d3ef22b 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -256,7 +256,7 @@ objtool := $(objtree)/tools/objtool/objtool
objtool-args-$(CONFIG_HAVE_JUMP_LABEL_HACK) += --hacks=jump_label
objtool-args-$(CONFIG_HAVE_NOINSTR_HACK) += --hacks=noinstr
-objtool-args-$(CONFIG_CALL_DEPTH_TRACKING) += --hacks=skylake
+objtool-args-$(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) += --hacks=skylake
objtool-args-$(CONFIG_X86_KERNEL_IBT) += --ibt
objtool-args-$(CONFIG_FINEIBT) += --cfi
objtool-args-$(CONFIG_FTRACE_MCOUNT_USE_OBJTOOL) += --mcount
@@ -264,9 +264,9 @@ ifdef CONFIG_FTRACE_MCOUNT_USE_OBJTOOL
objtool-args-$(CONFIG_HAVE_OBJTOOL_NOP_MCOUNT) += --mnop
endif
objtool-args-$(CONFIG_UNWINDER_ORC) += --orc
-objtool-args-$(CONFIG_RETPOLINE) += --retpoline
-objtool-args-$(CONFIG_RETHUNK) += --rethunk
-objtool-args-$(CONFIG_SLS) += --sls
+objtool-args-$(CONFIG_MITIGATION_RETPOLINE) += --retpoline
+objtool-args-$(CONFIG_MITIGATION_RETHUNK) += --rethunk
+objtool-args-$(CONFIG_MITIGATION_SLS) += --sls
objtool-args-$(CONFIG_STACK_VALIDATION) += --stackval
objtool-args-$(CONFIG_HAVE_STATIC_CALL_INLINE) += --static-call
objtool-args-$(CONFIG_HAVE_UACCESS_VALIDATION) += --uaccess
diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o
index 25b3b587d37c..6de297916ce6 100644
--- a/scripts/Makefile.vmlinux_o
+++ b/scripts/Makefile.vmlinux_o
@@ -38,7 +38,7 @@ objtool-enabled := $(or $(delay-objtool),$(CONFIG_NOINSTR_VALIDATION))
vmlinux-objtool-args-$(delay-objtool) += $(objtool-args-y)
vmlinux-objtool-args-$(CONFIG_GCOV_KERNEL) += --no-unreachable
vmlinux-objtool-args-$(CONFIG_NOINSTR_VALIDATION) += --noinstr \
- $(if $(or $(CONFIG_CPU_UNRET_ENTRY),$(CONFIG_CPU_SRSO)), --unret)
+ $(if $(or $(CONFIG_MITIGATION_UNRET_ENTRY),$(CONFIG_MITIGATION_SRSO)), --unret)
objtool-args = $(vmlinux-objtool-args-y) --link
diff --git a/scripts/atomic/kerneldoc/add_unless b/scripts/atomic/kerneldoc/add_unless
index f828e5f6750c..fbc2fadfbdc4 100644
--- a/scripts/atomic/kerneldoc/add_unless
+++ b/scripts/atomic/kerneldoc/add_unless
@@ -10,6 +10,7 @@ cat <<EOF
* @u: ${int} value to compare with
*
* If (@v != @u), atomically updates @v to (@v + @a) with ${desc_order} ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* ${desc_noinstr}
*
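The added sentence pins down the failure path. As a kernel-side usage sketch of the documented semantics (names illustrative), the classic "take a reference unless it already hit zero" pattern relies on exactly this guarantee:

static bool example_tryget(atomic_t *refs)
{
        /* If (@refs != 0), atomically add 1; otherwise leave @refs untouched. */
        return atomic_add_unless(refs, 1, 0);
}

Because a zero count is never modified, a dying object cannot be resurrected by a racing reader.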
diff --git a/scripts/atomic/kerneldoc/cmpxchg b/scripts/atomic/kerneldoc/cmpxchg
index 3bce328f50cf..02b24ee9d8a4 100644
--- a/scripts/atomic/kerneldoc/cmpxchg
+++ b/scripts/atomic/kerneldoc/cmpxchg
@@ -6,6 +6,7 @@ cat <<EOF
* @new: ${int} value to assign
*
* If (@v == @old), atomically updates @v to @new with ${desc_order} ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* ${desc_noinstr}
*
diff --git a/scripts/atomic/kerneldoc/dec_if_positive b/scripts/atomic/kerneldoc/dec_if_positive
index 04f1aed3cf83..9468b4a69603 100644
--- a/scripts/atomic/kerneldoc/dec_if_positive
+++ b/scripts/atomic/kerneldoc/dec_if_positive
@@ -4,6 +4,7 @@ cat <<EOF
* @v: pointer to ${atomic}_t
*
* If (@v > 0), atomically updates @v to (@v - 1) with ${desc_order} ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* ${desc_noinstr}
*
diff --git a/scripts/atomic/kerneldoc/dec_unless_positive b/scripts/atomic/kerneldoc/dec_unless_positive
index ee73612f0354..06a678678f71 100644
--- a/scripts/atomic/kerneldoc/dec_unless_positive
+++ b/scripts/atomic/kerneldoc/dec_unless_positive
@@ -4,6 +4,7 @@ cat <<EOF
* @v: pointer to ${atomic}_t
*
* If (@v <= 0), atomically updates @v to (@v - 1) with ${desc_order} ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* ${desc_noinstr}
*
diff --git a/scripts/atomic/kerneldoc/inc_not_zero b/scripts/atomic/kerneldoc/inc_not_zero
index 618be08e653e..c1a30fc66ee9 100644
--- a/scripts/atomic/kerneldoc/inc_not_zero
+++ b/scripts/atomic/kerneldoc/inc_not_zero
@@ -4,6 +4,7 @@ cat <<EOF
* @v: pointer to ${atomic}_t
*
* If (@v != 0), atomically updates @v to (@v + 1) with ${desc_order} ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* ${desc_noinstr}
*
diff --git a/scripts/atomic/kerneldoc/inc_unless_negative b/scripts/atomic/kerneldoc/inc_unless_negative
index 597f23d4dc8d..ece0d2c7b38f 100644
--- a/scripts/atomic/kerneldoc/inc_unless_negative
+++ b/scripts/atomic/kerneldoc/inc_unless_negative
@@ -4,6 +4,7 @@ cat <<EOF
* @v: pointer to ${atomic}_t
*
* If (@v >= 0), atomically updates @v to (@v + 1) with ${desc_order} ordering.
+ * Otherwise, @v is not modified and relaxed ordering is provided.
*
* ${desc_noinstr}
*
diff --git a/scripts/atomic/kerneldoc/try_cmpxchg b/scripts/atomic/kerneldoc/try_cmpxchg
index 296553206c06..3ccff29538f5 100644
--- a/scripts/atomic/kerneldoc/try_cmpxchg
+++ b/scripts/atomic/kerneldoc/try_cmpxchg
@@ -6,7 +6,8 @@ cat <<EOF
* @new: ${int} value to assign
*
* If (@v == @old), atomically updates @v to @new with ${desc_order} ordering.
- * Otherwise, updates @old to the current value of @v.
+ * Otherwise, @v is not modified, @old is updated to the current value of @v,
+ * and relaxed ordering is provided.
*
* ${desc_noinstr}
*
diff --git a/scripts/generate_rust_target.rs b/scripts/generate_rust_target.rs
index 0da52b548ba5..19f72bfdbb82 100644
--- a/scripts/generate_rust_target.rs
+++ b/scripts/generate_rust_target.rs
@@ -155,7 +155,7 @@ fn main() {
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
);
let mut features = "-3dnow,-3dnowa,-mmx,+soft-float".to_string();
- if cfg.has("RETPOLINE") {
+ if cfg.has("MITIGATION_RETPOLINE") {
features += ",+retpoline-external-thunk";
}
ts.push("features", features);
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 6bf5fc5560df..dc006966e097 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -1853,7 +1853,7 @@ static void add_header(struct buffer *b, struct module *mod)
buf_printf(b,
"\n"
- "#ifdef CONFIG_RETPOLINE\n"
+ "#ifdef CONFIG_MITIGATION_RETPOLINE\n"
"MODULE_INFO(retpoline, \"Y\");\n"
"#endif\n");
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index ccbf914b3d1a..25160d26764b 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -444,6 +444,7 @@
#define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */
#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */
#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */
+#define X86_FEATURE_SEV_SNP (19*32+ 4) /* AMD Secure Encrypted Virtualization - Secure Nested Paging */
#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */
#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */
#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* AMD SEV-ES full debug state swap support */
diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h
index f40b29d3abad..1f23960d2b06 100644
--- a/tools/arch/x86/include/asm/disabled-features.h
+++ b/tools/arch/x86/include/asm/disabled-features.h
@@ -44,32 +44,32 @@
# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31))
#endif
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
# define DISABLE_PTI 0
#else
# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
#endif
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
# define DISABLE_RETPOLINE 0
#else
# define DISABLE_RETPOLINE ((1 << (X86_FEATURE_RETPOLINE & 31)) | \
(1 << (X86_FEATURE_RETPOLINE_LFENCE & 31)))
#endif
-#ifdef CONFIG_RETHUNK
+#ifdef CONFIG_MITIGATION_RETHUNK
# define DISABLE_RETHUNK 0
#else
# define DISABLE_RETHUNK (1 << (X86_FEATURE_RETHUNK & 31))
#endif
-#ifdef CONFIG_CPU_UNRET_ENTRY
+#ifdef CONFIG_MITIGATION_UNRET_ENTRY
# define DISABLE_UNRET 0
#else
# define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31))
#endif
-#ifdef CONFIG_CALL_DEPTH_TRACKING
+#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
# define DISABLE_CALL_DEPTH_TRACKING 0
#else
# define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31))
diff --git a/tools/arch/x86/lib/insn.c b/tools/arch/x86/lib/insn.c
index 8fd63a067308..ada4b4a79dd4 100644
--- a/tools/arch/x86/lib/insn.c
+++ b/tools/arch/x86/lib/insn.c
@@ -71,7 +71,7 @@ void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64)
insn->kaddr = kaddr;
insn->end_kaddr = kaddr + buf_len;
insn->next_byte = kaddr;
- insn->x86_64 = x86_64 ? 1 : 0;
+ insn->x86_64 = x86_64;
insn->opnd_bytes = 4;
if (x86_64)
insn->addr_bytes = 8;
@@ -268,11 +268,9 @@ int insn_get_opcode(struct insn *insn)
if (opcode->got)
return 0;
- if (!insn->prefixes.got) {
- ret = insn_get_prefixes(insn);
- if (ret)
- return ret;
- }
+ ret = insn_get_prefixes(insn);
+ if (ret)
+ return ret;
/* Get first opcode */
op = get_next(insn_byte_t, insn);
@@ -339,11 +337,9 @@ int insn_get_modrm(struct insn *insn)
if (modrm->got)
return 0;
- if (!insn->opcode.got) {
- ret = insn_get_opcode(insn);
- if (ret)
- return ret;
- }
+ ret = insn_get_opcode(insn);
+ if (ret)
+ return ret;
if (inat_has_modrm(insn->attr)) {
mod = get_next(insn_byte_t, insn);
@@ -386,11 +382,9 @@ int insn_rip_relative(struct insn *insn)
if (!insn->x86_64)
return 0;
- if (!modrm->got) {
- ret = insn_get_modrm(insn);
- if (ret)
- return 0;
- }
+ ret = insn_get_modrm(insn);
+ if (ret)
+ return 0;
/*
* For rip-relative instructions, the mod field (top 2 bits)
* is zero and the r/m field (bottom 3 bits) is 0x5.
@@ -417,11 +411,9 @@ int insn_get_sib(struct insn *insn)
if (insn->sib.got)
return 0;
- if (!insn->modrm.got) {
- ret = insn_get_modrm(insn);
- if (ret)
- return ret;
- }
+ ret = insn_get_modrm(insn);
+ if (ret)
+ return ret;
if (insn->modrm.nbytes) {
modrm = insn->modrm.bytes[0];
@@ -460,11 +452,9 @@ int insn_get_displacement(struct insn *insn)
if (insn->displacement.got)
return 0;
- if (!insn->sib.got) {
- ret = insn_get_sib(insn);
- if (ret)
- return ret;
- }
+ ret = insn_get_sib(insn);
+ if (ret)
+ return ret;
if (insn->modrm.nbytes) {
/*
@@ -628,11 +618,9 @@ int insn_get_immediate(struct insn *insn)
if (insn->immediate.got)
return 0;
- if (!insn->displacement.got) {
- ret = insn_get_displacement(insn);
- if (ret)
- return ret;
- }
+ ret = insn_get_displacement(insn);
+ if (ret)
+ return ret;
if (inat_has_moffset(insn->attr)) {
if (!__get_moffset(insn))
@@ -703,11 +691,9 @@ int insn_get_length(struct insn *insn)
if (insn->length)
return 0;
- if (!insn->immediate.got) {
- ret = insn_get_immediate(insn);
- if (ret)
- return ret;
- }
+ ret = insn_get_immediate(insn);
+ if (ret)
+ return ret;
insn->length = (unsigned char)((unsigned long)insn->next_byte
- (unsigned long)insn->kaddr);
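Every insn_get_*() stage already begins with an early return on its ->got flag, so the duplicated "if (!...got)" guards in the callers were redundant; after this cleanup each stage calls its prerequisite unconditionally. A compressed sketch of the resulting pattern (field and function names abbreviated, not the real decoder):

int insn_get_stage_sketch(struct insn *insn)
{
        int ret;

        if (insn->stage.got)            /* idempotent: repeat calls are no-ops */
                return 0;

        ret = insn_get_prev_stage(insn);        /* pulls in the whole chain */
        if (ret)
                return ret;

        /* ... decode this stage, then mark insn->stage.got ... */
        return 0;
}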
diff --git a/tools/objtool/arch/x86/special.c b/tools/objtool/arch/x86/special.c
index 29e949579ede..4134d27c696b 100644
--- a/tools/objtool/arch/x86/special.c
+++ b/tools/objtool/arch/x86/special.c
@@ -83,7 +83,7 @@ bool arch_support_alt_relocation(struct special_alt *special_alt,
* TODO: Once we have DWARF CFI and smarter instruction decoding logic,
* ensure the same register is used in the mov and jump instructions.
*
- * NOTE: RETPOLINE made it harder still to decode dynamic jumps.
+ * NOTE: MITIGATION_RETPOLINE made it harder still to decode dynamic jumps.
*/
struct reloc *arch_find_switch_table(struct objtool_file *file,
struct instruction *insn)
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 548ec3cd7c00..8440b7bb343c 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -3980,11 +3980,11 @@ static int validate_retpoline(struct objtool_file *file)
if (insn->type == INSN_RETURN) {
if (opts.rethunk) {
- WARN_INSN(insn, "'naked' return found in RETHUNK build");
+ WARN_INSN(insn, "'naked' return found in MITIGATION_RETHUNK build");
} else
continue;
} else {
- WARN_INSN(insn, "indirect %s found in RETPOLINE build",
+ WARN_INSN(insn, "indirect %s found in MITIGATION_RETPOLINE build",
insn->type == INSN_JUMP_DYNAMIC ? "jump" : "call");
}
diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c
index bb3ca9a5d731..b6da8f71ea19 100644
--- a/tools/testing/selftests/kvm/s390x/memop.c
+++ b/tools/testing/selftests/kvm/s390x/memop.c
@@ -375,6 +375,32 @@ static void test_copy(void)
kvm_vm_free(t.kvm_vm);
}
+static void test_copy_access_register(void)
+{
+ struct test_default t = test_default_init(guest_copy);
+
+ HOST_SYNC(t.vcpu, STAGE_INITED);
+
+ prepare_mem12();
+ t.run->psw_mask &= ~(3UL << (63 - 17));
+ t.run->psw_mask |= 1UL << (63 - 17); /* Enable AR mode */
+
+ /*
+ * Primary address space gets used if an access register
+ * contains zero. The host makes use of AR[1], so it is a good
+ * candidate to ensure the guest AR (of zero) is used.
+ */
+ CHECK_N_DO(MOP, t.vcpu, LOGICAL, WRITE, mem1, t.size,
+ GADDR_V(mem1), AR(1));
+ HOST_SYNC(t.vcpu, STAGE_COPIED);
+
+ CHECK_N_DO(MOP, t.vcpu, LOGICAL, READ, mem2, t.size,
+ GADDR_V(mem2), AR(1));
+ ASSERT_MEM_EQ(mem1, mem2, t.size);
+
+ kvm_vm_free(t.kvm_vm);
+}
+
static void set_storage_key_range(void *addr, size_t len, uint8_t key)
{
uintptr_t _addr, abs, i;
@@ -1102,6 +1128,11 @@ int main(int argc, char *argv[])
.requirements_met = extension_cap > 0,
},
{
+ .name = "copy with access register mode",
+ .test = test_copy_access_register,
+ .requirements_met = true,
+ },
+ {
.name = "error checks with key",
.test = test_errors_key,
.requirements_met = extension_cap > 0,