From a56e9d538b6a7d145f38f6ebf174b7bc91e2c29d Mon Sep 17 00:00:00 2001
From: UtsavBalar1231
Date: Thu, 28 Apr 2022 19:13:00 +0530
Subject: [PATCH] Revert "mm: provide speculative fault infrastructure"

This reverts commit 396b980864960745d22e32e663b384db825e9458.

Signed-off-by: UtsavBalar1231
Change-Id: I90e27e3e598f41efc131f106611fc20e33b709d8
---
 include/linux/hugetlb_inline.h |   2 +-
 include/linux/mm.h             |  31 ---
 include/linux/pagemap.h        |   4 +-
 mm/internal.h                  |  16 +-
 mm/memory.c                    | 342 +--------------------------------
 5 files changed, 7 insertions(+), 388 deletions(-)

diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index 9e25283d6fc9..0660a03d37d9 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -8,7 +8,7 @@

 static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
 {
-	return !!(READ_ONCE(vma->vm_flags) & VM_HUGETLB);
+	return !!(vma->vm_flags & VM_HUGETLB);
 }

 #else
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 969230a8738f..2c37a8a9c4cc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -331,8 +331,6 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_REMOTE	0x80	/* faulting for non current tsk/mm */
 #define FAULT_FLAG_INSTRUCTION	0x100	/* The fault was during an instruction fetch */
 #define FAULT_FLAG_PREFAULT_OLD	0x400	/* Make faultaround ptes old */
-/* Speculative fault, not holding mmap_sem */
-#define FAULT_FLAG_SPECULATIVE	0x200

 #define FAULT_FLAG_TRACE \
 	{ FAULT_FLAG_WRITE,		"WRITE" }, \
@@ -361,10 +359,6 @@ struct vm_fault {
 	gfp_t gfp_mask;			/* gfp mask to be used for allocations */
 	pgoff_t pgoff;			/* Logical page offset based on vma */
 	unsigned long address;		/* Faulting virtual address */
-#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
-	unsigned int sequence;
-	pmd_t orig_pmd;			/* value of PMD at the time of fault */
-#endif
 	pmd_t *pmd;			/* Pointer to pmd entry matching
					 * the 'address' */
 	pud_t *pud;			/* Pointer to pud entry matching
@@ -1529,31 +1523,6 @@ int invalidate_inode_page(struct page *page);
 #ifdef CONFIG_MMU
 extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
 			unsigned long address, unsigned int flags);
-
-#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
-extern int __handle_speculative_fault(struct mm_struct *mm,
-				      unsigned long address,
-				      unsigned int flags);
-static inline int handle_speculative_fault(struct mm_struct *mm,
-					   unsigned long address,
-					   unsigned int flags)
-{
-	/*
-	 * Try speculative page fault for multithreaded user space tasks only.
-	 */
-	if (!(flags & FAULT_FLAG_USER) || atomic_read(&mm->mm_users) == 1)
-		return VM_FAULT_RETRY;
-	return __handle_speculative_fault(mm, address, flags);
-}
-#else
-static inline int handle_speculative_fault(struct mm_struct *mm,
-					   unsigned long address,
-					   unsigned int flags)
-{
-	return VM_FAULT_RETRY;
-}
-#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
-
 extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
 			    unsigned long address, unsigned int fault_flags,
 			    bool *unlocked);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index c3cee35db398..e9c622da7ea7 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -460,8 +460,8 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
 	pgoff_t pgoff;
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		return linear_hugepage_index(vma, address);
-	pgoff = (address - READ_ONCE(vma->vm_start)) >> PAGE_SHIFT;
-	pgoff += READ_ONCE(vma->vm_pgoff);
+	pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
+	pgoff += vma->vm_pgoff;
 	return pgoff;
 }
diff --git a/mm/internal.h b/mm/internal.h
index dc3d42da7d91..4af6773db387 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -44,21 +44,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf);
 extern struct vm_area_struct *get_vma(struct mm_struct *mm,
 				      unsigned long addr);
 extern void put_vma(struct vm_area_struct *vma);
-
-static inline bool vma_has_changed(struct vm_fault *vmf)
-{
-	int ret = RB_EMPTY_NODE(&vmf->vma->vm_rb);
-	unsigned int seq = READ_ONCE(vmf->vma->vm_sequence.sequence);
-
-	/*
-	 * Matches both the wmb in write_seqlock_{begin,end}() and
-	 * the wmb in vma_rb_erase().
-	 */
-	smp_rmb();
-
-	return ret || seq != vmf->sequence;
-}
-#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
+#endif

 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 		unsigned long floor, unsigned long ceiling);
diff --git a/mm/memory.c b/mm/memory.c
index a217be4dc395..51cb66ef9569 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -809,8 +809,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
 	if (page)
 		dump_page(page, "bad pte");
 	pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
-		 (void *)addr, READ_ONCE(vma->vm_flags), vma->anon_vma,
-		 mapping, index);
+		 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
 	pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
 		 vma->vm_file,
 		 vma->vm_ops ? vma->vm_ops->fault : NULL,
@@ -2395,113 +2394,6 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);

-#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
-static bool pte_spinlock(struct vm_fault *vmf)
-{
-	bool ret = false;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	pmd_t pmdval;
-#endif
-
-	/* Check if vma is still valid */
-	if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
-		vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
-		spin_lock(vmf->ptl);
-		return true;
-	}
-
-	local_irq_disable();
-	if (vma_has_changed(vmf))
-		goto out;
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	/*
-	 * We check if the pmd value is still the same to ensure that there
-	 * is not a huge collapse operation in progress behind our back.
-	 */
-	pmdval = READ_ONCE(*vmf->pmd);
-	if (!pmd_same(pmdval, vmf->orig_pmd))
-		goto out;
-#endif
-
-	vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
-	if (unlikely(!spin_trylock(vmf->ptl)))
-		goto out;
-
-	if (vma_has_changed(vmf)) {
-		spin_unlock(vmf->ptl);
-		goto out;
-	}
-
-	ret = true;
-out:
-	local_irq_enable();
-	return ret;
-}
-
-static bool pte_map_lock(struct vm_fault *vmf)
-{
-	bool ret = false;
-	pte_t *pte;
-	spinlock_t *ptl;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	pmd_t pmdval;
-#endif
-
-	if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
-		vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
-					       vmf->address, &vmf->ptl);
-		return true;
-	}
-
-	/*
-	 * The first vma_has_changed() guarantees the page-tables are still
-	 * valid, having IRQs disabled ensures they stay around, hence the
-	 * second vma_has_changed() to make sure they are still valid once
-	 * we've got the lock. After that a concurrent zap_pte_range() will
-	 * block on the PTL and thus we're safe.
-	 */
-	local_irq_disable();
-	if (vma_has_changed(vmf))
-		goto out;
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	/*
-	 * We check if the pmd value is still the same to ensure that there
-	 * is not a huge collapse operation in progress behind our back.
-	 */
-	pmdval = READ_ONCE(*vmf->pmd);
-	if (!pmd_same(pmdval, vmf->orig_pmd))
-		goto out;
-#endif
-
-	/*
-	 * Same as pte_offset_map_lock() except that we call
-	 * spin_trylock() in place of spin_lock() to avoid race with
-	 * unmap path which may have the lock and wait for this CPU
-	 * to invalidate TLB but this CPU has irq disabled.
-	 * Since we are in a speculative path, accept that it could fail.
-	 */
-	ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
-	pte = pte_offset_map(vmf->pmd, vmf->address);
-	if (unlikely(!spin_trylock(ptl))) {
-		pte_unmap(pte);
-		goto out;
-	}
-
-	if (vma_has_changed(vmf)) {
-		pte_unmap_unlock(pte, ptl);
-		goto out;
-	}
-
-	vmf->pte = pte;
-	vmf->ptl = ptl;
-	ret = true;
-out:
-	local_irq_enable();
-	return ret;
-}
-#else
 static inline bool pte_spinlock(struct vm_fault *vmf)
 {
 	vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
@@ -2515,7 +2407,6 @@ static inline bool pte_map_lock(struct vm_fault *vmf)
 					       vmf->address, &vmf->ptl);
 	return true;
 }
-#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */

 /*
  * handle_pte_fault chooses page fault handler according to an entry which was
@@ -3506,14 +3397,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 		ret = check_stable_address_space(vma->vm_mm);
 		if (ret)
 			goto unlock;
-		/*
-		 * Don't call the userfaultfd during the speculative path.
-		 * We already checked for the VMA to not be managed through
-		 * userfaultfd, but it may be set behind our back once we have
-		 * locked the pte. In such a case we can ignore it this time.
-		 */
-		if (vmf->flags & FAULT_FLAG_SPECULATIVE)
-			goto setpte;
 		/* Deliver the page fault to userland, check inside PT lock */
 		if (userfaultfd_missing(vma)) {
 			pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -3556,8 +3439,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 		goto unlock_and_release;

 	/* Deliver the page fault to userland, check inside PT lock */
-	if (!(vmf->flags & FAULT_FLAG_SPECULATIVE) &&
-	    userfaultfd_missing(vma)) {
+	if (userfaultfd_missing(vma)) {
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 		mem_cgroup_cancel_charge(page, memcg, false);
 		put_page(page);
@@ -4376,15 +4258,6 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 	pte_t entry;

 	if (unlikely(pmd_none(*vmf->pmd))) {
-		/*
-		 * In the case of the speculative page fault handler we abort
-		 * the speculative path immediately as the pmd is probably
-		 * about to be converted into a huge one. We will try again
-		 * holding the mmap_sem (which implies that the collapse
-		 * operation is done).
-		 */
-		if (vmf->flags & FAULT_FLAG_SPECULATIVE)
-			return VM_FAULT_RETRY;
 		/*
 		 * Leave __pte_alloc() until later: because vm_ops->fault may
 		 * want to allocate huge page, and if we expose page table
@@ -4392,7 +4265,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 		 * concurrent faults and from rmap lookups.
 		 */
 		vmf->pte = NULL;
-	} else if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
+	} else {
 		/* See comment in pte_alloc_one_map() */
 		if (pmd_devmap_trans_unstable(vmf->pmd))
 			return 0;
@@ -4401,9 +4274,6 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 		 * pmd from under us anymore at this point because we hold the
 		 * mmap_sem read mode and khugepaged takes it in write mode.
 		 * So now it's safe to run pte_offset_map().
-		 * This is not applicable to the speculative page fault handler
-		 * but in that case, the pte is fetched earlier in
-		 * handle_speculative_fault().
 		 */
 		vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
 		vmf->orig_pte = *vmf->pte;
@@ -4426,8 +4296,6 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 	if (!vmf->pte) {
 		if (vma_is_anonymous(vmf->vma))
 			return do_anonymous_page(vmf);
-		else if (vmf->flags & FAULT_FLAG_SPECULATIVE)
-			return VM_FAULT_RETRY;
 		else
 			return do_fault(vmf);
 	}
@@ -4525,9 +4393,6 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
 	vmf.pmd = pmd_alloc(mm, vmf.pud, address);
 	if (!vmf.pmd)
 		return VM_FAULT_OOM;
-#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
-	vmf.sequence = raw_read_seqcount(&vma->vm_sequence);
-#endif
 	if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
 		ret = create_huge_pmd(&vmf);
 		if (!(ret & VM_FAULT_FALLBACK))
@@ -4561,207 +4426,6 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
 	return handle_pte_fault(&vmf);
 }

-#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
-
-#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
-/* This is required by vm_normal_page() */
-#error "Speculative page fault handler requires CONFIG_ARCH_HAS_PTE_SPECIAL"
-#endif
-
-/*
- * vm_normal_page() adds some processing which should be done while
- * holding the mmap_sem.
- */
-int __handle_speculative_fault(struct mm_struct *mm, unsigned long address,
-			       unsigned int flags)
-{
-	struct vm_fault vmf = {
-		.address = address,
-	};
-	pgd_t *pgd, pgdval;
-	p4d_t *p4d, p4dval;
-	pud_t pudval;
-	int seq, ret = VM_FAULT_RETRY;
-	struct vm_area_struct *vma;
-
-	/* Clear flags that may lead to releasing the mmap_sem to retry */
-	flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);
-	flags |= FAULT_FLAG_SPECULATIVE;
-
-	vma = get_vma(mm, address);
-	if (!vma)
-		return ret;
-
-	/* rmb <-> seqlock, vma_rb_erase() */
-	seq = raw_read_seqcount(&vma->vm_sequence);
-	if (seq & 1)
-		goto out_put;
-
-	/*
-	 * Can't call vm_ops services as we don't know what they would do
-	 * with the VMA.
-	 * This includes huge pages from hugetlbfs.
-	 */
-	if (vma->vm_ops)
-		goto out_put;
-
-	/*
-	 * __anon_vma_prepare() requires the mmap_sem to be held
-	 * because vm_next and vm_prev must be safe. This can't be guaranteed
-	 * in the speculative path.
-	 */
-	if (unlikely(!vma->anon_vma))
-		goto out_put;
-
-	vmf.vma_flags = READ_ONCE(vma->vm_flags);
-	vmf.vma_page_prot = READ_ONCE(vma->vm_page_prot);
-
-	/* Can't call userland page fault handler in the speculative path */
-	if (unlikely(vmf.vma_flags & VM_UFFD_MISSING))
-		goto out_put;
-
-	if (vmf.vma_flags & VM_GROWSDOWN || vmf.vma_flags & VM_GROWSUP)
-		/*
-		 * This could be detected by checking the address against the
-		 * VMA's boundaries, but we want to trace it as not supported
-		 * instead of changed.
-		 */
-		goto out_put;
-
-	if (address < READ_ONCE(vma->vm_start)
-	    || READ_ONCE(vma->vm_end) <= address)
-		goto out_put;
-
-	if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
-				       flags & FAULT_FLAG_INSTRUCTION,
-				       flags & FAULT_FLAG_REMOTE)) {
-		ret = VM_FAULT_SIGSEGV;
-		goto out_put;
-	}
-
-	/* This one is required to check that the VMA has write access set */
-	if (flags & FAULT_FLAG_WRITE) {
-		if (unlikely(!(vmf.vma_flags & VM_WRITE))) {
-			ret = VM_FAULT_SIGSEGV;
-			goto out_put;
-		}
-	} else if (unlikely(!(vmf.vma_flags & (VM_READ|VM_EXEC|VM_WRITE)))) {
-		ret = VM_FAULT_SIGSEGV;
-		goto out_put;
-	}
-
-#ifdef CONFIG_NUMA
-	struct mempolicy *pol;
-
-	/*
-	 * MPOL_INTERLEAVE implies additional checks in
-	 * mpol_misplaced() which are not compatible with the
-	 * speculative page fault processing.
-	 */
-	pol = __get_vma_policy(vma, address);
-	if (!pol)
-		pol = get_task_policy(current);
-	if (pol && pol->mode == MPOL_INTERLEAVE)
-		goto out_put;
-#endif
-
-	/*
-	 * Do a speculative lookup of the PTE entry.
-	 */
-	local_irq_disable();
-	pgd = pgd_offset(mm, address);
-	pgdval = READ_ONCE(*pgd);
-	if (pgd_none(pgdval) || unlikely(pgd_bad(pgdval)))
-		goto out_walk;
-
-	p4d = p4d_offset(pgd, address);
-	p4dval = READ_ONCE(*p4d);
-	if (p4d_none(p4dval) || unlikely(p4d_bad(p4dval)))
-		goto out_walk;
-
-	vmf.pud = pud_offset(p4d, address);
-	pudval = READ_ONCE(*vmf.pud);
-	if (pud_none(pudval) || unlikely(pud_bad(pudval)))
-		goto out_walk;
-
-	/* Huge pages at PUD level are not supported. */
-	if (unlikely(pud_trans_huge(pudval)))
-		goto out_walk;
-
-	vmf.pmd = pmd_offset(vmf.pud, address);
-	vmf.orig_pmd = READ_ONCE(*vmf.pmd);
-	/*
-	 * pmd_none could mean that a hugepage collapse is in progress
-	 * behind our back as collapse_huge_page() marks it before
-	 * invalidating the pte (which is done once the IPI is caught
-	 * by all CPUs and we have interrupts disabled).
-	 * For this reason we cannot handle THP in a speculative way since we
-	 * can't safely identify an in-progress collapse operation done
-	 * behind our back on that PMD.
-	 * Regarding the order of the following checks, see comment in
-	 * pmd_devmap_trans_unstable()
-	 */
-	if (unlikely(pmd_devmap(vmf.orig_pmd) ||
-		     pmd_none(vmf.orig_pmd) || pmd_trans_huge(vmf.orig_pmd) ||
-		     is_swap_pmd(vmf.orig_pmd)))
-		goto out_walk;
-
-	/*
-	 * The above does not allocate/instantiate page-tables because doing so
-	 * would lead to the possibility of instantiating page-tables after
-	 * free_pgtables() -- and consequently leaking them.
-	 *
-	 * The result is that we take at least one !speculative fault per PMD
-	 * in order to instantiate it.
-	 */
-
-	vmf.pte = pte_offset_map(vmf.pmd, address);
-	vmf.orig_pte = READ_ONCE(*vmf.pte);
-	barrier(); /* See comment in handle_pte_fault() */
-	if (pte_none(vmf.orig_pte)) {
-		pte_unmap(vmf.pte);
-		vmf.pte = NULL;
-	}
-
-	vmf.vma = vma;
-	vmf.pgoff = linear_page_index(vma, address);
-	vmf.gfp_mask = __get_fault_gfp_mask(vma);
-	vmf.sequence = seq;
-	vmf.flags = flags;
-
-	local_irq_enable();
-
-	/*
-	 * We need to re-validate the VMA after checking the bounds, otherwise
-	 * we might have a false positive on the bounds.
-	 */
-	if (read_seqcount_retry(&vma->vm_sequence, seq))
-		goto out_put;
-
-	mem_cgroup_enter_user_fault();
-	ret = handle_pte_fault(&vmf);
-	mem_cgroup_exit_user_fault();
-
-	put_vma(vma);
-
-	/*
-	 * The task may have entered a memcg OOM situation but
-	 * if the allocation error was handled gracefully (no
-	 * VM_FAULT_OOM), there is no need to kill anything.
-	 * Just clean up the OOM state peacefully.
-	 */
-	if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
-		mem_cgroup_oom_synchronize(false);
-	return ret;
-
-out_walk:
-	local_irq_enable();
-out_put:
-	put_vma(vma);
-	return ret;
-}
-#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
-
 /*
  * By the time we get here, we already hold the mm semaphore
 *
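
Illustrative appendix (not part of the revert): everything removed above hangs
off one pattern: snapshot vma->vm_sequence, walk the page tables with IRQs
disabled instead of holding mmap_sem, then revalidate the sequence and fall
back to the locked slow path (VM_FAULT_RETRY) if the VMA changed underneath.
The user-space sketch below shows just that validate-and-retry scheme, assuming
C11 atomics and pthreads; struct region, region_update() and region_lookup()
are hypothetical stand-ins for the VMA, write_seqcount_begin()/end() and
raw_read_seqcount()/read_seqcount_retry(), not kernel APIs.

/* seqlock_sketch.c -- build with: cc -pthread seqlock_sketch.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct region {
	atomic_uint seq;		/* odd while a writer is mid-update */
	pthread_mutex_t lock;		/* plays the role of mmap_sem */
	atomic_ulong start, end;	/* the "VMA" bounds being published */
};

/* Writer side: mirrors write_seqcount_begin()/end() around a bounds update. */
static void region_update(struct region *r, unsigned long start,
			  unsigned long end)
{
	pthread_mutex_lock(&r->lock);
	unsigned int seq = atomic_load_explicit(&r->seq, memory_order_relaxed);
	atomic_store_explicit(&r->seq, seq + 1, memory_order_relaxed);
	/* Like smp_wmb(): make the odd seq visible before the data stores. */
	atomic_thread_fence(memory_order_release);
	atomic_store_explicit(&r->start, start, memory_order_relaxed);
	atomic_store_explicit(&r->end, end, memory_order_relaxed);
	/* Back to even: publish; pairs with the reader's acquire fence. */
	atomic_store_explicit(&r->seq, seq + 2, memory_order_release);
	pthread_mutex_unlock(&r->lock);
}

/*
 * Speculative reader: succeeds only if no writer ran between the two
 * sequence reads; on failure the caller retries under the lock, which is
 * what returning VM_FAULT_RETRY achieves in the reverted code.
 */
static int region_lookup(struct region *r, unsigned long addr, int *contains)
{
	unsigned int seq = atomic_load_explicit(&r->seq, memory_order_acquire);

	if (seq & 1)
		return 0;	/* writer in progress: bail out, take the lock */

	*contains = addr >= atomic_load_explicit(&r->start, memory_order_relaxed) &&
		    addr <  atomic_load_explicit(&r->end, memory_order_relaxed);

	/* Like the smp_rmb() in vma_has_changed(): data reads before reread. */
	atomic_thread_fence(memory_order_acquire);
	return seq == atomic_load_explicit(&r->seq, memory_order_relaxed);
}

int main(void)
{
	struct region r = {
		.seq = 0,
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.start = 0x1000,
		.end = 0x2000,
	};
	int contains;

	region_update(&r, 0x1000, 0x3000);
	if (region_lookup(&r, 0x2800, &contains))
		printf("speculative lookup ok, contains=%d\n", contains);
	else
		printf("raced with a writer, retry under the lock\n");
	return 0;
}

The trylock in the reverted pte_map_lock() follows from the same idea: with
IRQs disabled a plain spin_lock() could deadlock against a CPU waiting on a
TLB-flush IPI, so the speculative path may only try the lock and, as with a
failed sequence check, give up and retry under mmap_sem.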