From 292a7d6fca33df70ca4b8e9b0d0e74adf87582dc Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Fri, 21 Apr 2023 10:50:36 +0200 Subject: [PATCH 1/2] KVM: s390: pv: fix asynchronous teardown for small VMs On machines without the Destroy Secure Configuration Fast UVC, the topmost level of page tables is set aside and freed asynchronously as last step of the asynchronous teardown. Each gmap has a host_to_guest radix tree mapping host (userspace) addresses (with 1M granularity) to gmap segment table entries (pmds). If a guest is smaller than 2GB, the topmost level of page tables is the segment table (i.e. there are only 2 levels). Replacing it means that the pointers in the host_to_guest mapping would become stale and cause all kinds of nasty issues. This patch fixes the issue by disallowing asynchronous teardown for guests with only 2 levels of page tables. Userspace should (and already does) try using the normal destroy if the asynchronous one fails. Update s390_replace_asce so it refuses to replace segment type ASCEs. This is still needed in case the normal destroy VM fails. Fixes: fb491d5500a7 ("KVM: s390: pv: asynchronous destroy for reboot") Reviewed-by: Marc Hartmayer Reviewed-by: Janosch Frank Signed-off-by: Claudio Imbrenda Message-Id: <20230421085036.52511-2-imbrenda@linux.ibm.com> --- arch/s390/kvm/pv.c | 5 +++++ arch/s390/mm/gmap.c | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index e032ebbf51b9..3ce5f4351156 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -314,6 +314,11 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc) */ if (kvm->arch.pv.set_aside) return -EINVAL; + + /* Guest with segment type ASCE, refuse to destroy asynchronously */ + if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) + return -EINVAL; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 5a716bdcba05..2267cf9819b2 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2833,6 +2833,9 @@ EXPORT_SYMBOL_GPL(s390_unlist_old_asce); * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy * @gmap: the gmap whose ASCE needs to be replaced * + * If the ASCE is a SEGMENT type then this function will return -EINVAL, + * otherwise the pointers in the host_to_guest radix tree will keep pointing + * to the wrong pages, causing use-after-free and memory corruption. * If the allocation of the new top level page table fails, the ASCE is not * replaced. * In any case, the old ASCE is always removed from the gmap CRST list. @@ -2847,6 +2850,10 @@ int s390_replace_asce(struct gmap *gmap) s390_unlist_old_asce(gmap); + /* Replacing segment type ASCEs would cause serious issues */ + if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) + return -EINVAL; + page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); if (!page) return -ENOMEM; From c148dc8e2fa403be501612ee409db866eeed35c0 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Fri, 28 Apr 2023 11:27:53 +0200 Subject: [PATCH 2/2] KVM: s390: fix race in gmap_make_secure() Fix a potential race in gmap_make_secure() and remove the last user of follow_page() without FOLL_GET. The old code is locking something it doesn't have a reference to, and as explained by Jason and David in this discussion: https://lore.kernel.org/linux-mm/Y9J4P%2FRNvY1Ztn0Q@nvidia.com/ it can lead to all kind of bad things, including the page getting unmapped (MADV_DONTNEED), freed, reallocated as a larger folio and the unlock_page() would target the wrong bit. There is also another race with the FOLL_WRITE, which could race between the follow_page() and the get_locked_pte(). The main point is to remove the last use of follow_page() without FOLL_GET or FOLL_PIN, removing the races can be considered a nice bonus. Link: https://lore.kernel.org/linux-mm/Y9J4P%2FRNvY1Ztn0Q@nvidia.com/ Suggested-by: Jason Gunthorpe Fixes: 214d9bbcd3a6 ("s390/mm: provide memory management functions for protected KVM guests") Reviewed-by: Jason Gunthorpe Signed-off-by: Claudio Imbrenda Message-Id: <20230428092753.27913-2-imbrenda@linux.ibm.com> --- arch/s390/kernel/uv.c | 32 +++++++++++--------------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index 9f18a4af9c13..cb2ee06df286 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -192,21 +192,10 @@ static int expected_page_refs(struct page *page) return res; } -static int make_secure_pte(pte_t *ptep, unsigned long addr, - struct page *exp_page, struct uv_cb_header *uvcb) +static int make_page_secure(struct page *page, struct uv_cb_header *uvcb) { - pte_t entry = READ_ONCE(*ptep); - struct page *page; int expected, cc = 0; - if (!pte_present(entry)) - return -ENXIO; - if (pte_val(entry) & _PAGE_INVALID) - return -ENXIO; - - page = pte_page(entry); - if (page != exp_page) - return -ENXIO; if (PageWriteback(page)) return -EAGAIN; expected = expected_page_refs(page); @@ -304,17 +293,18 @@ int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb) goto out; rc = -ENXIO; - page = follow_page(vma, uaddr, FOLL_WRITE); - if (IS_ERR_OR_NULL(page)) - goto out; - - lock_page(page); ptep = get_locked_pte(gmap->mm, uaddr, &ptelock); - if (should_export_before_import(uvcb, gmap->mm)) - uv_convert_from_secure(page_to_phys(page)); - rc = make_secure_pte(ptep, uaddr, page, uvcb); + if (pte_present(*ptep) && !(pte_val(*ptep) & _PAGE_INVALID) && pte_write(*ptep)) { + page = pte_page(*ptep); + rc = -EAGAIN; + if (trylock_page(page)) { + if (should_export_before_import(uvcb, gmap->mm)) + uv_convert_from_secure(page_to_phys(page)); + rc = make_page_secure(page, uvcb); + unlock_page(page); + } + } pte_unmap_unlock(ptep, ptelock); - unlock_page(page); out: mmap_read_unlock(gmap->mm);