summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Lokesh Gidra <lokeshgidra@google.com> 2024-01-16 20:20:25 +0000
committer Kalesh Singh <kaleshsingh@google.com> 2024-03-08 18:44:12 +0000
commit 9f82443753a923d3eaa72004b59cc54bb52744cd (patch)
tree 53a7f58ff8a105b144ae4a6a332dd83c2bc4cbb6
parent 03d375b29809a93450da370228c7f6b86ca3ed4c (diff)
download gs-9f82443753a923d3eaa72004b59cc54bb52744cd.tar.gz
ANDROID: userfaultfd: allow SPF for UFFD_FEATURE_SIGBUS on private+anon
Currently we bail out of speculative page fault when we detect that the fault address is in a userfaultfd registered vma. However, if userfaultfd is being used with UFFD_FEATURE_SIGBUS feature, then handle_userfault() doesn't do much and is easiest to handle with SPF. This patch lets MISSING userfaults on private anonymous mappings be allowed with SPF if UFFD_FEATURE_SIGBUS is used. With this patch we get >99% success rate for userfaults caused during userfaultfd GC's compaction phase. This translates into eliminating uninterruptible sleep time in do_page_fault() due to userfaults. Bug: 320478828 Bug: 328786602 Signed-off-by: Lokesh Gidra <lokeshgidra@google.com> Change-Id: Ic7fde0fde03602b35179bc0cf891ddbbc434190f (cherry picked from commit 582c6d188ec138d8ed9c6ef235bf5698d80d7d6b)
-rw-r--r--fs/userfaultfd.c96
-rw-r--r--include/linux/mm_types.h2
-rw-r--r--include/linux/userfaultfd_k.h12
-rw-r--r--mm/memory.c34
-rw-r--r--mm/userfaultfd.c2
5 files changed, 111 insertions, 35 deletions
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 20892669308c..6ecedf4498e3 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -71,6 +71,7 @@ struct userfaultfd_ctx {
bool mmap_changing;
/* mm with one ore more vmas attached to this userfaultfd_ctx */
struct mm_struct *mm;
+ struct rcu_head rcu_head;
};
struct userfaultfd_fork_ctx {
@@ -156,6 +157,13 @@ static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
refcount_inc(&ctx->refcount);
}
+static void __free_userfaultfd_ctx(struct rcu_head *head)
+{
+ struct userfaultfd_ctx *ctx = container_of(head, struct userfaultfd_ctx,
+ rcu_head);
+ kmem_cache_free(userfaultfd_ctx_cachep, ctx);
+}
+
/**
* userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
* context.
@@ -176,7 +184,7 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
mmdrop(ctx->mm);
- kmem_cache_free(userfaultfd_ctx_cachep, ctx);
+ call_rcu(&ctx->rcu_head, __free_userfaultfd_ctx);
}
}
@@ -350,6 +358,24 @@ static inline long userfaultfd_get_blocking_state(unsigned int flags)
return TASK_UNINTERRUPTIBLE;
}
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+bool userfaultfd_using_sigbus(struct vm_area_struct *vma)
+{
+ struct userfaultfd_ctx *ctx;
+ bool ret;
+
+ /*
+ * Do it inside RCU section to ensure that the ctx doesn't
+ * disappear under us.
+ */
+ rcu_read_lock();
+ ctx = rcu_dereference(vma->vm_userfaultfd_ctx.ctx);
+ ret = ctx && (ctx->features & UFFD_FEATURE_SIGBUS);
+ rcu_read_unlock();
+ return ret;
+}
+#endif
+
/*
* The locking rules involved in returning VM_FAULT_RETRY depending on
* FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
@@ -394,7 +420,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
*/
mmap_assert_locked(mm);
- ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
+ ctx = rcu_dereference_protected(vmf->vma->vm_userfaultfd_ctx.ctx,
+ lockdep_is_held(&mm->mmap_lock));
if (!ctx)
goto out;
@@ -611,8 +638,10 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
/* the various vma->vm_userfaultfd_ctx still points to it */
mmap_write_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next)
- if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ if (rcu_access_pointer(vma->vm_userfaultfd_ctx.ctx) ==
+ release_new_ctx) {
+ rcu_assign_pointer(vma->vm_userfaultfd_ctx.ctx,
+ NULL);
vma->vm_flags &= ~__VM_UFFD_FLAGS;
}
mmap_write_unlock(mm);
@@ -642,10 +671,13 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
struct userfaultfd_ctx *ctx = NULL, *octx;
struct userfaultfd_fork_ctx *fctx;
- octx = vma->vm_userfaultfd_ctx.ctx;
+ octx = rcu_dereference_protected(
+ vma->vm_userfaultfd_ctx.ctx,
+ lockdep_is_held(&vma->vm_mm->mmap_lock));
+
if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
vm_write_begin(vma);
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ rcu_assign_pointer(vma->vm_userfaultfd_ctx.ctx, NULL);
WRITE_ONCE(vma->vm_flags,
vma->vm_flags & ~__VM_UFFD_FLAGS);
vm_write_end(vma);
@@ -684,7 +716,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
list_add_tail(&fctx->list, fcs);
}
- vma->vm_userfaultfd_ctx.ctx = ctx;
+ rcu_assign_pointer(vma->vm_userfaultfd_ctx.ctx, ctx);
return 0;
}
@@ -717,7 +749,8 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
{
struct userfaultfd_ctx *ctx;
- ctx = vma->vm_userfaultfd_ctx.ctx;
+ ctx = rcu_dereference_protected(vma->vm_userfaultfd_ctx.ctx,
+ lockdep_is_held(&vma->vm_mm->mmap_lock));
if (!ctx)
return;
@@ -728,7 +761,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
WRITE_ONCE(ctx->mmap_changing, true);
} else {
/* Drop uffd context if remap feature not enabled */
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ rcu_assign_pointer(vma->vm_userfaultfd_ctx.ctx, NULL);
vma->vm_flags &= ~__VM_UFFD_FLAGS;
}
}
@@ -765,7 +798,8 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
struct userfaultfd_ctx *ctx;
struct userfaultfd_wait_queue ewq;
- ctx = vma->vm_userfaultfd_ctx.ctx;
+ ctx = rcu_dereference_protected(vma->vm_userfaultfd_ctx.ctx,
+ lockdep_is_held(&mm->mmap_lock));
if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
return true;
@@ -803,7 +837,9 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma,
{
for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
struct userfaultfd_unmap_ctx *unmap_ctx;
- struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
+ struct userfaultfd_ctx *ctx =
+ rcu_dereference_protected(vma->vm_userfaultfd_ctx.ctx,
+ lockdep_is_held(&vma->vm_mm->mmap_lock));
if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
has_unmap_ctx(ctx, unmaps, start, end))
@@ -868,10 +904,13 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
mmap_write_lock(mm);
prev = NULL;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ struct userfaultfd_ctx *cur_uffd_ctx =
+ rcu_dereference_protected(vma->vm_userfaultfd_ctx.ctx,
+ lockdep_is_held(&mm->mmap_lock));
cond_resched();
- BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
+ BUG_ON(!!cur_uffd_ctx ^
!!(vma->vm_flags & __VM_UFFD_FLAGS));
- if (vma->vm_userfaultfd_ctx.ctx != ctx) {
+ if (cur_uffd_ctx != ctx) {
prev = vma;
continue;
}
@@ -887,7 +926,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
prev = vma;
vm_write_begin(vma);
WRITE_ONCE(vma->vm_flags, new_flags);
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ rcu_assign_pointer(vma->vm_userfaultfd_ctx.ctx, NULL);
vm_write_end(vma);
}
mmap_write_unlock(mm);
@@ -1350,9 +1389,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
found = false;
basic_ioctls = false;
for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
+ struct userfaultfd_ctx *cur_uffd_ctx =
+ rcu_dereference_protected(cur->vm_userfaultfd_ctx.ctx,
+ lockdep_is_held(&mm->mmap_lock));
cond_resched();
- BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
+ BUG_ON(!!cur_uffd_ctx ^
!!(cur->vm_flags & __VM_UFFD_FLAGS));
/* check not compatible vmas */
@@ -1395,8 +1437,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
* wouldn't know which one to deliver the userfaults to.
*/
ret = -EBUSY;
- if (cur->vm_userfaultfd_ctx.ctx &&
- cur->vm_userfaultfd_ctx.ctx != ctx)
+ if (cur_uffd_ctx && cur_uffd_ctx != ctx)
goto out_unlock;
/*
@@ -1414,18 +1455,20 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
ret = 0;
do {
+ struct userfaultfd_ctx *cur_uffd_ctx =
+ rcu_dereference_protected(vma->vm_userfaultfd_ctx.ctx,
+ lockdep_is_held(&mm->mmap_lock));
cond_resched();
BUG_ON(!vma_can_userfault(vma, vm_flags));
- BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
- vma->vm_userfaultfd_ctx.ctx != ctx);
+ BUG_ON(cur_uffd_ctx && cur_uffd_ctx != ctx);
WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
/*
* Nothing to do: this vma is already registered into this
* userfaultfd and with the right tracking mode too.
*/
- if (vma->vm_userfaultfd_ctx.ctx == ctx &&
+ if (cur_uffd_ctx == ctx &&
(vma->vm_flags & vm_flags) == vm_flags)
goto skip;
@@ -1461,7 +1504,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
*/
vm_write_begin(vma);
WRITE_ONCE(vma->vm_flags, new_flags);
- vma->vm_userfaultfd_ctx.ctx = ctx;
+ rcu_assign_pointer(vma->vm_userfaultfd_ctx.ctx, ctx);
vm_write_end(vma);
if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
@@ -1561,7 +1604,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
cond_resched();
- BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
+ BUG_ON(!!rcu_access_pointer(cur->vm_userfaultfd_ctx.ctx) ^
!!(cur->vm_flags & __VM_UFFD_FLAGS));
/*
@@ -1583,6 +1626,9 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
ret = 0;
do {
+ struct userfaultfd_ctx *cur_uffd_ctx =
+ rcu_dereference_protected(vma->vm_userfaultfd_ctx.ctx,
+ lockdep_is_held(&mm->mmap_lock));
cond_resched();
BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
@@ -1591,7 +1637,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
* Nothing to do: this vma is already registered into this
* userfaultfd and with the right tracking mode too.
*/
- if (!vma->vm_userfaultfd_ctx.ctx)
+ if (!cur_uffd_ctx)
goto skip;
WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
@@ -1610,7 +1656,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
struct userfaultfd_wake_range range;
range.start = start;
range.len = vma_end - start;
- wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
+ wake_userfault(cur_uffd_ctx, &range);
}
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
@@ -1640,7 +1686,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
*/
vm_write_begin(vma);
WRITE_ONCE(vma->vm_flags, new_flags);
- vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ rcu_assign_pointer(vma->vm_userfaultfd_ctx.ctx, NULL);
vm_write_end(vma);
skip:
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 80126b5c2568..2648ec4d2d9f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -297,7 +297,7 @@ struct vm_region {
#ifdef CONFIG_USERFAULTFD
#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, })
struct vm_userfaultfd_ctx {
- struct userfaultfd_ctx *ctx;
+ struct userfaultfd_ctx __rcu *ctx;
};
#else /* CONFIG_USERFAULTFD */
#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {})
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 331d2ccf0bcc..43902f3efbd1 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -36,6 +36,9 @@
extern int sysctl_unprivileged_userfaultfd;
extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+extern bool userfaultfd_using_sigbus(struct vm_area_struct *vma);
+#endif
/*
* The mode of operation for __mcopy_atomic and its helpers.
@@ -75,7 +78,7 @@ extern int mwriteprotect_range(struct mm_struct *dst_mm,
static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
struct vm_userfaultfd_ctx vm_ctx)
{
- return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx;
+ return rcu_access_pointer(vma->vm_userfaultfd_ctx.ctx) == vm_ctx.ctx;
}
/*
@@ -154,6 +157,13 @@ static inline vm_fault_t handle_userfault(struct vm_fault *vmf,
return VM_FAULT_SIGBUS;
}
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+static inline bool userfaultfd_using_sigbus(struct vm_area_struct *vma)
+{
+ return false;
+}
+#endif
+
static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
struct vm_userfaultfd_ctx vm_ctx)
{
diff --git a/mm/memory.c b/mm/memory.c
index ea4ebb9a72eb..7c09fb676bc7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5006,6 +5006,7 @@ static vm_fault_t ___handle_speculative_fault(struct mm_struct *mm,
pud_t pudval;
int seq;
vm_fault_t ret;
+ bool uffd_missing_sigbus = false;
/* Clear flags that may lead to release the mmap_sem to retry */
flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);
@@ -5018,20 +5019,31 @@ static vm_fault_t ___handle_speculative_fault(struct mm_struct *mm,
return VM_FAULT_RETRY;
}
- if (!vmf_allows_speculation(&vmf))
- return VM_FAULT_RETRY;
-
vmf.vma_flags = READ_ONCE(vmf.vma->vm_flags);
vmf.vma_page_prot = READ_ONCE(vmf.vma->vm_page_prot);
#ifdef CONFIG_USERFAULTFD
- /* Can't call userland page fault handler in the speculative path */
+ /*
+ * Only support SPF for SIGBUS+MISSING userfaults in private anonymous
+ * VMAs. All other cases must be retried with mmap_lock.
+ */
if (unlikely(vmf.vma_flags & __VM_UFFD_FLAGS)) {
- trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
- return VM_FAULT_RETRY;
+ uffd_missing_sigbus = vma_is_anonymous(vmf.vma) &&
+ (vmf.vma_flags & VM_UFFD_MISSING) &&
+ userfaultfd_using_sigbus(vmf.vma);
+ if (!uffd_missing_sigbus) {
+ trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
+ return VM_FAULT_RETRY;
+ }
+ /* Not having anon_vma implies that the PTE is missing */
+ if (!vmf.vma->anon_vma)
+ return VM_FAULT_SIGBUS;
}
#endif
+ if (!vmf_allows_speculation(&vmf))
+ return VM_FAULT_RETRY;
+
if (vmf.vma_flags & VM_GROWSDOWN || vmf.vma_flags & VM_GROWSUP) {
/*
* This could be detected by the check address against VMA's
@@ -5149,6 +5161,9 @@ static vm_fault_t ___handle_speculative_fault(struct mm_struct *mm,
local_irq_enable();
+ if (!vmf.pte && uffd_missing_sigbus)
+ return VM_FAULT_SIGBUS;
+
/*
* We need to re-validate the VMA after checking the bounds, otherwise
* we might have a false positive on the bounds.
@@ -5184,7 +5199,12 @@ static vm_fault_t ___handle_speculative_fault(struct mm_struct *mm,
out_walk:
trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
local_irq_enable();
- return VM_FAULT_RETRY;
+ /*
+ * A failed page-table walk is similar to a missing page, so give a
+ * SIGBUS+MISSING userfault the opportunity to handle it before retrying
+ * with mmap_lock.
+ */
+ return uffd_missing_sigbus ? VM_FAULT_SIGBUS : VM_FAULT_RETRY;
out_segv:
trace_spf_vma_access(_RET_IP_, vmf.vma, address);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index fa707e50b102..8ab123714314 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -42,7 +42,7 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
* enforce the VM_MAYWRITE check done at uffd registration
* time.
*/
- if (!dst_vma->vm_userfaultfd_ctx.ctx)
+ if (!rcu_access_pointer(dst_vma->vm_userfaultfd_ctx.ctx))
return NULL;
return dst_vma;