diff options
author | Lokesh Gidra <lokeshgidra@google.com> | 2024-01-16 20:20:25 +0000 |
---|---|---|
committer | Kalesh Singh <kaleshsingh@google.com> | 2024-03-08 18:44:12 +0000 |
commit | 9f82443753a923d3eaa72004b59cc54bb52744cd (patch) | |
tree | 53a7f58ff8a105b144ae4a6a332dd83c2bc4cbb6 | |
parent | 03d375b29809a93450da370228c7f6b86ca3ed4c (diff) | |
download | gs-9f82443753a923d3eaa72004b59cc54bb52744cd.tar.gz |
ANDROID: userfaultfd: allow SPF for UFFD_FEATURE_SIGBUS on private+anon
Currently we bail out of speculative page fault when we detect that the
fault address is in a userfaultfd registered vma. However, if userfaultfd
is being used with UFFD_FEATURE_SIGBUS feature, then handle_userfault()
doesn't do much and is easiest to handle with SPF. This patch allows
MISSING userfaults on private anonymous mappings to be handled with SPF if
UFFD_FEATURE_SIGBUS is used.
With this patch we get a >99% success rate for userfaults caused during
userfaultfd GC's compaction phase. This translates into eliminating
uninterruptible sleep time in do_page_fault() due to userfaults.
Bug: 320478828
Bug: 328786602
Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
Change-Id: Ic7fde0fde03602b35179bc0cf891ddbbc434190f
(cherry picked from commit 582c6d188ec138d8ed9c6ef235bf5698d80d7d6b)
-rw-r--r-- | fs/userfaultfd.c | 96 | ||||
-rw-r--r-- | include/linux/mm_types.h | 2 | ||||
-rw-r--r-- | include/linux/userfaultfd_k.h | 12 | ||||
-rw-r--r-- | mm/memory.c | 34 | ||||
-rw-r--r-- | mm/userfaultfd.c | 2 |
5 files changed, 111 insertions, 35 deletions
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 20892669308c..6ecedf4498e3 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -71,6 +71,7 @@ struct userfaultfd_ctx { bool mmap_changing; /* mm with one ore more vmas attached to this userfaultfd_ctx */ struct mm_struct *mm; + struct rcu_head rcu_head; }; struct userfaultfd_fork_ctx { @@ -156,6 +157,13 @@ static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx) refcount_inc(&ctx->refcount); } +static void __free_userfaultfd_ctx(struct rcu_head *head) +{ + struct userfaultfd_ctx *ctx = container_of(head, struct userfaultfd_ctx, + rcu_head); + kmem_cache_free(userfaultfd_ctx_cachep, ctx); +} + /** * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd * context. @@ -176,7 +184,7 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock)); VM_BUG_ON(waitqueue_active(&ctx->fd_wqh)); mmdrop(ctx->mm); - kmem_cache_free(userfaultfd_ctx_cachep, ctx); + call_rcu(&ctx->rcu_head, __free_userfaultfd_ctx); } } @@ -350,6 +358,24 @@ static inline long userfaultfd_get_blocking_state(unsigned int flags) return TASK_UNINTERRUPTIBLE; } +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT +bool userfaultfd_using_sigbus(struct vm_area_struct *vma) +{ + struct userfaultfd_ctx *ctx; + bool ret; + + /* + * Do it inside RCU section to ensure that the ctx doesn't + * disappear under us. 
+ */ + rcu_read_lock(); + ctx = rcu_dereference(vma->vm_userfaultfd_ctx.ctx); + ret = ctx && (ctx->features & UFFD_FEATURE_SIGBUS); + rcu_read_unlock(); + return ret; +} +#endif + /* * The locking rules involved in returning VM_FAULT_RETRY depending on * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and @@ -394,7 +420,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) */ mmap_assert_locked(mm); - ctx = vmf->vma->vm_userfaultfd_ctx.ctx; + ctx = rcu_dereference_protected(vmf->vma->vm_userfaultfd_ctx.ctx, + lockdep_is_held(&mm->mmap_lock)); if (!ctx) goto out; @@ -611,8 +638,10 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, /* the various vma->vm_userfaultfd_ctx still points to it */ mmap_write_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) - if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + if (rcu_access_pointer(vma->vm_userfaultfd_ctx.ctx) == + release_new_ctx) { + rcu_assign_pointer(vma->vm_userfaultfd_ctx.ctx, + NULL); vma->vm_flags &= ~__VM_UFFD_FLAGS; } mmap_write_unlock(mm); @@ -642,10 +671,13 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) struct userfaultfd_ctx *ctx = NULL, *octx; struct userfaultfd_fork_ctx *fctx; - octx = vma->vm_userfaultfd_ctx.ctx; + octx = rcu_dereference_protected( + vma->vm_userfaultfd_ctx.ctx, + lockdep_is_held(&vma->vm_mm->mmap_lock)); + if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { vm_write_begin(vma); - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + rcu_assign_pointer(vma->vm_userfaultfd_ctx.ctx, NULL); WRITE_ONCE(vma->vm_flags, vma->vm_flags & ~__VM_UFFD_FLAGS); vm_write_end(vma); @@ -684,7 +716,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) list_add_tail(&fctx->list, fcs); } - vma->vm_userfaultfd_ctx.ctx = ctx; + rcu_assign_pointer(vma->vm_userfaultfd_ctx.ctx, ctx); return 0; } @@ -717,7 +749,8 @@ void mremap_userfaultfd_prep(struct 
vm_area_struct *vma, { struct userfaultfd_ctx *ctx; - ctx = vma->vm_userfaultfd_ctx.ctx; + ctx = rcu_dereference_protected(vma->vm_userfaultfd_ctx.ctx, + lockdep_is_held(&vma->vm_mm->mmap_lock)); if (!ctx) return; @@ -728,7 +761,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, WRITE_ONCE(ctx->mmap_changing, true); } else { /* Drop uffd context if remap feature not enabled */ - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + rcu_assign_pointer(vma->vm_userfaultfd_ctx.ctx, NULL); vma->vm_flags &= ~__VM_UFFD_FLAGS; } } @@ -765,7 +798,8 @@ bool userfaultfd_remove(struct vm_area_struct *vma, struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue ewq; - ctx = vma->vm_userfaultfd_ctx.ctx; + ctx = rcu_dereference_protected(vma->vm_userfaultfd_ctx.ctx, + lockdep_is_held(&mm->mmap_lock)); if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE)) return true; @@ -803,7 +837,9 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, { for ( ; vma && vma->vm_start < end; vma = vma->vm_next) { struct userfaultfd_unmap_ctx *unmap_ctx; - struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; + struct userfaultfd_ctx *ctx = + rcu_dereference_protected(vma->vm_userfaultfd_ctx.ctx, + lockdep_is_held(&vma->vm_mm->mmap_lock)); if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) || has_unmap_ctx(ctx, unmaps, start, end)) @@ -868,10 +904,13 @@ static int userfaultfd_release(struct inode *inode, struct file *file) mmap_write_lock(mm); prev = NULL; for (vma = mm->mmap; vma; vma = vma->vm_next) { + struct userfaultfd_ctx *cur_uffd_ctx = + rcu_dereference_protected(vma->vm_userfaultfd_ctx.ctx, + lockdep_is_held(&mm->mmap_lock)); cond_resched(); - BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ + BUG_ON(!!cur_uffd_ctx ^ !!(vma->vm_flags & __VM_UFFD_FLAGS)); - if (vma->vm_userfaultfd_ctx.ctx != ctx) { + if (cur_uffd_ctx != ctx) { prev = vma; continue; } @@ -887,7 +926,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) prev = vma; vm_write_begin(vma); 
WRITE_ONCE(vma->vm_flags, new_flags); - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + rcu_assign_pointer(vma->vm_userfaultfd_ctx.ctx, NULL); vm_write_end(vma); } mmap_write_unlock(mm); @@ -1350,9 +1389,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, found = false; basic_ioctls = false; for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { + struct userfaultfd_ctx *cur_uffd_ctx = + rcu_dereference_protected(cur->vm_userfaultfd_ctx.ctx, + lockdep_is_held(&mm->mmap_lock)); cond_resched(); - BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ + BUG_ON(!!cur_uffd_ctx ^ !!(cur->vm_flags & __VM_UFFD_FLAGS)); /* check not compatible vmas */ @@ -1395,8 +1437,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, * wouldn't know which one to deliver the userfaults to. */ ret = -EBUSY; - if (cur->vm_userfaultfd_ctx.ctx && - cur->vm_userfaultfd_ctx.ctx != ctx) + if (cur_uffd_ctx && cur_uffd_ctx != ctx) goto out_unlock; /* @@ -1414,18 +1455,20 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, ret = 0; do { + struct userfaultfd_ctx *cur_uffd_ctx = + rcu_dereference_protected(vma->vm_userfaultfd_ctx.ctx, + lockdep_is_held(&mm->mmap_lock)); cond_resched(); BUG_ON(!vma_can_userfault(vma, vm_flags)); - BUG_ON(vma->vm_userfaultfd_ctx.ctx && - vma->vm_userfaultfd_ctx.ctx != ctx); + BUG_ON(cur_uffd_ctx && cur_uffd_ctx != ctx); WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); /* * Nothing to do: this vma is already registered into this * userfaultfd and with the right tracking mode too. 
*/ - if (vma->vm_userfaultfd_ctx.ctx == ctx && + if (cur_uffd_ctx == ctx && (vma->vm_flags & vm_flags) == vm_flags) goto skip; @@ -1461,7 +1504,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, */ vm_write_begin(vma); WRITE_ONCE(vma->vm_flags, new_flags); - vma->vm_userfaultfd_ctx.ctx = ctx; + rcu_assign_pointer(vma->vm_userfaultfd_ctx.ctx, ctx); vm_write_end(vma); if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) @@ -1561,7 +1604,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { cond_resched(); - BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ + BUG_ON(!!rcu_access_pointer(cur->vm_userfaultfd_ctx.ctx) ^ !!(cur->vm_flags & __VM_UFFD_FLAGS)); /* @@ -1583,6 +1626,9 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, ret = 0; do { + struct userfaultfd_ctx *cur_uffd_ctx = + rcu_dereference_protected(vma->vm_userfaultfd_ctx.ctx, + lockdep_is_held(&mm->mmap_lock)); cond_resched(); BUG_ON(!vma_can_userfault(vma, vma->vm_flags)); @@ -1591,7 +1637,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * Nothing to do: this vma is already registered into this * userfaultfd and with the right tracking mode too. 
*/ - if (!vma->vm_userfaultfd_ctx.ctx) + if (!cur_uffd_ctx) goto skip; WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); @@ -1610,7 +1656,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, struct userfaultfd_wake_range range; range.start = start; range.len = vma_end - start; - wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); + wake_userfault(cur_uffd_ctx, &range); } new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; @@ -1640,7 +1686,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, */ vm_write_begin(vma); WRITE_ONCE(vma->vm_flags, new_flags); - vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + rcu_assign_pointer(vma->vm_userfaultfd_ctx.ctx, NULL); vm_write_end(vma); skip: diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 80126b5c2568..2648ec4d2d9f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -297,7 +297,7 @@ struct vm_region { #ifdef CONFIG_USERFAULTFD #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, }) struct vm_userfaultfd_ctx { - struct userfaultfd_ctx *ctx; + struct userfaultfd_ctx __rcu *ctx; }; #else /* CONFIG_USERFAULTFD */ #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {}) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 331d2ccf0bcc..43902f3efbd1 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -36,6 +36,9 @@ extern int sysctl_unprivileged_userfaultfd; extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT +extern bool userfaultfd_using_sigbus(struct vm_area_struct *vma); +#endif /* * The mode of operation for __mcopy_atomic and its helpers. 
@@ -75,7 +78,7 @@ extern int mwriteprotect_range(struct mm_struct *dst_mm, static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, struct vm_userfaultfd_ctx vm_ctx) { - return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx; + return rcu_access_pointer(vma->vm_userfaultfd_ctx.ctx) == vm_ctx.ctx; } /* @@ -154,6 +157,13 @@ static inline vm_fault_t handle_userfault(struct vm_fault *vmf, return VM_FAULT_SIGBUS; } +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT +static inline bool userfaultfd_using_sigbus(struct vm_area_struct *vma) +{ + return false; +} +#endif + static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, struct vm_userfaultfd_ctx vm_ctx) { diff --git a/mm/memory.c b/mm/memory.c index ea4ebb9a72eb..7c09fb676bc7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5006,6 +5006,7 @@ static vm_fault_t ___handle_speculative_fault(struct mm_struct *mm, pud_t pudval; int seq; vm_fault_t ret; + bool uffd_missing_sigbus = false; /* Clear flags that may lead to release the mmap_sem to retry */ flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE); @@ -5018,20 +5019,31 @@ static vm_fault_t ___handle_speculative_fault(struct mm_struct *mm, return VM_FAULT_RETRY; } - if (!vmf_allows_speculation(&vmf)) - return VM_FAULT_RETRY; - vmf.vma_flags = READ_ONCE(vmf.vma->vm_flags); vmf.vma_page_prot = READ_ONCE(vmf.vma->vm_page_prot); #ifdef CONFIG_USERFAULTFD - /* Can't call userland page fault handler in the speculative path */ + /* + * Only support SPF for SIGBUS+MISSING userfaults in private anonymous + * VMAs. Rest all should be retried with mmap_lock. 
+ */ if (unlikely(vmf.vma_flags & __VM_UFFD_FLAGS)) { - trace_spf_vma_notsup(_RET_IP_, vmf.vma, address); - return VM_FAULT_RETRY; + uffd_missing_sigbus = vma_is_anonymous(vmf.vma) && + (vmf.vma_flags & VM_UFFD_MISSING) && + userfaultfd_using_sigbus(vmf.vma); + if (!uffd_missing_sigbus) { + trace_spf_vma_notsup(_RET_IP_, vmf.vma, address); + return VM_FAULT_RETRY; + } + /* Not having anon_vma implies that the PTE is missing */ + if (!vmf.vma->anon_vma) + return VM_FAULT_SIGBUS; } #endif + if (!vmf_allows_speculation(&vmf)) + return VM_FAULT_RETRY; + if (vmf.vma_flags & VM_GROWSDOWN || vmf.vma_flags & VM_GROWSUP) { /* * This could be detected by the check address against VMA's @@ -5149,6 +5161,9 @@ static vm_fault_t ___handle_speculative_fault(struct mm_struct *mm, local_irq_enable(); + if (!vmf.pte && uffd_missing_sigbus) + return VM_FAULT_SIGBUS; + /* * We need to re-validate the VMA after checking the bounds, otherwise * we might have a false positive on the bounds. @@ -5184,7 +5199,12 @@ static vm_fault_t ___handle_speculative_fault(struct mm_struct *mm, out_walk: trace_spf_vma_notsup(_RET_IP_, vmf.vma, address); local_irq_enable(); - return VM_FAULT_RETRY; + /* + * Failing page-table walk is similar to page-missing so give an + * opportunity to SIGBUS+MISSING userfault to handle it before retrying + * with mmap_lock + */ + return uffd_missing_sigbus ? VM_FAULT_SIGBUS : VM_FAULT_RETRY; out_segv: trace_spf_vma_access(_RET_IP_, vmf.vma, address); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index fa707e50b102..8ab123714314 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -42,7 +42,7 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, * enforce the VM_MAYWRITE check done at uffd registration * time. */ - if (!dst_vma->vm_userfaultfd_ctx.ctx) + if (!rcu_access_pointer(dst_vma->vm_userfaultfd_ctx.ctx)) return NULL; return dst_vma; |