BACKPORT: mm: don't be stuck to rmap lock on reclaim path

The rmap locks (i_mmap_rwsem and anon_vma->root->rwsem) can be contended
under memory pressure if processes keep working on their vmas (e.g., fork,
mmap, munmap).  This makes the reclaim path get stuck.  In our real workload
traces, we see kswapd waiting on the lock for 300ms+ (worst case, a second),
which makes other processes enter direct reclaim, where they also get stuck
on the lock.

This patch makes the lru aging path use try_lock mode like shrink_page_list
so the reclaim context keeps working on the next lru pages without being
stuck.  If it finds the rmap lock contended, it rotates the page back to the
head of the lru in both the active and inactive lrus to make their behavior
consistent, which is a basic starting point rather than adding more heuristics.

Since this patch introduces a new "contended" field as an out-param along
with the try_lock in-param in rmap_walk_control, the structure is no longer
immutable when try_lock is set, so remove the const keywords on rmap related
functions.  Since rmap walking is already an expensive operation, I doubt the
const would provide a sizable benefit (and we didn't have it until 5.17).

In a heavy app workload on Android, a trace shows the following statistics.
It almost removes rmap lock contention from the reclaim path.

Martin Liu reported:

Before:

   max_dur(ms)  min_dur(ms)  max-min(dur)ms  avg_dur(ms)  sum_dur(ms)  count blocked_function
         1632            0            1631   151.542173        31672    209  page_lock_anon_vma_read
          601            0             601   145.544681        28817    198  rmap_walk_file

After:

   max_dur(ms)  min_dur(ms)  max-min(dur)ms  avg_dur(ms)  sum_dur(ms)  count blocked_function
          NaN          NaN              NaN          NaN          NaN    0.0             NaN
            0            0                0     0.127645            1     12  rmap_walk_file

[minchan@kernel.org: add comment, per Matthew]
  Link: https://lkml.kernel.org/r/YnNqeB5tUf6LZ57b@google.com
Link: https://lkml.kernel.org/r/20220510215423.164547-1-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: John Dias <joaodias@google.com>
Cc: Tim Murray <timmurray@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Martin Liu <liumartin@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Conflicts:
	folio->page

(cherry picked from commit 6d4675e601357834dadd2ba1d803f6484596015c)
Bug: 239681156
Signed-off-by: Minchan Kim <minchan@google.com>
Change-Id: I0c63e0291120c8a1b5f2d83b8a7b210cb56c27a2
This commit is contained in:
Minchan Kim 2022-05-19 14:08:54 -07:00 committed by spakkkk
parent 77cd7bea25
commit 6f414bfe88
8 changed files with 79 additions and 20 deletions

View File

@ -520,6 +520,11 @@ static inline void i_mmap_unlock_write(struct address_space *mapping)
up_write(&mapping->i_mmap_rwsem);
}
/*
 * Attempt to take mapping->i_mmap_rwsem for reading via down_read_trylock()
 * and hand its result back unchanged. Callers treat a nonzero return as
 * "lock acquired" and a zero return as contention, in which case they either
 * bail out or fall back to i_mmap_lock_read().
 */
static inline int i_mmap_trylock_read(struct address_space *mapping)
{
return down_read_trylock(&mapping->i_mmap_rwsem);
}
static inline void i_mmap_lock_read(struct address_space *mapping)
{
down_read(&mapping->i_mmap_rwsem);

View File

@ -135,6 +135,11 @@ static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
down_read(&anon_vma->root->rwsem);
}
/*
 * Attempt to take the root anon_vma's rwsem for reading via
 * down_read_trylock() and hand its result back unchanged. Callers treat a
 * nonzero return as "lock acquired" and a zero return as contention, in which
 * case they either bail out or fall back to anon_vma_lock_read().
 */
static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
{
return down_read_trylock(&anon_vma->root->rwsem);
}
static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
{
up_read(&anon_vma->root->rwsem);
@ -249,17 +254,14 @@ void try_to_munlock(struct page *);
void remove_migration_ptes(struct page *old, struct page *new, bool locked);
/*
* Called by memory-failure.c to kill processes.
*/
struct anon_vma *page_lock_anon_vma_read(struct page *page);
void page_unlock_anon_vma_read(struct anon_vma *anon_vma);
int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
/*
* rmap_walk_control: To control rmap traversing for specific needs
*
* arg: passed to rmap_one() and invalid_vma()
* try_lock: bail out if the rmap lock is contended
* contended: indicate the rmap traversal bailed out due to lock contention
* rmap_one: executed on each vma where page is mapped
* done: for checking traversing termination condition
* anon_lock: for getting anon_lock by optimized way rather than default
@ -267,6 +269,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
*/
struct rmap_walk_control {
void *arg;
bool try_lock;
bool contended;
/*
* Return false if page table scanning in rmap_walk should be stopped.
* Otherwise, return true.
@ -274,13 +278,21 @@ struct rmap_walk_control {
bool (*rmap_one)(struct page *page, struct vm_area_struct *vma,
unsigned long addr, void *arg);
int (*done)(struct page *page);
struct anon_vma *(*anon_lock)(struct page *page);
struct anon_vma *(*anon_lock)(struct page *page,
struct rmap_walk_control *rwc);
bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};
void rmap_walk(struct page *page, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc);
/*
* Called by memory-failure.c to kill processes.
*/
struct anon_vma *page_lock_anon_vma_read(struct page *page,
struct rmap_walk_control *rwc);
void page_unlock_anon_vma_read(struct anon_vma *anon_vma);
#else /* !CONFIG_MMU */
#define anon_vma_init() do {} while (0)

View File

@ -1585,7 +1585,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
*/
get_page(page);
spin_unlock(vmf->ptl);
anon_vma = page_lock_anon_vma_read(page);
anon_vma = page_lock_anon_vma_read(page, NULL);
/* Confirm the PMD did not change while page_table_lock was released */
spin_lock(vmf->ptl);

View File

@ -2610,7 +2610,13 @@ again:
struct vm_area_struct *vma;
cond_resched();
anon_vma_lock_read(anon_vma);
if (!anon_vma_trylock_read(anon_vma)) {
if (rwc->try_lock) {
rwc->contended = true;
return;
}
anon_vma_lock_read(anon_vma);
}
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
unsigned long addr;

View File

@ -446,7 +446,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
struct anon_vma *av;
pgoff_t pgoff;
av = page_lock_anon_vma_read(page);
av = page_lock_anon_vma_read(page, NULL);
if (av == NULL) /* Not actually mapped anymore */
return;

View File

@ -95,10 +95,10 @@ static bool page_idle_clear_pte_refs_one(struct page *page,
static void page_idle_clear_pte_refs(struct page *page)
{
/*
* Since rwc.arg is unused, rwc is effectively immutable, so we
* can make it static const to save some cycles and stack.
* Since rwc.try_lock is unused, rwc is effectively immutable, so we
* can make it static to save some cycles and stack.
*/
static const struct rmap_walk_control rwc = {
static struct rmap_walk_control rwc = {
.rmap_one = page_idle_clear_pte_refs_one,
.anon_lock = page_lock_anon_vma_read,
};

View File

@ -509,9 +509,11 @@ out:
*
* Its a little more complex as it tries to keep the fast path to a single
* atomic op -- the trylock. If we fail the trylock, we fall back to getting a
* reference like with page_get_anon_vma() and then block on the mutex.
* reference like with page_get_anon_vma() and then block on the mutex
* on !rwc->try_lock case.
*/
struct anon_vma *page_lock_anon_vma_read(struct page *page)
struct anon_vma *page_lock_anon_vma_read(struct page *page,
struct rmap_walk_control *rwc)
{
struct anon_vma *anon_vma = NULL;
struct anon_vma *root_anon_vma;
@ -539,6 +541,12 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
goto out;
}
if (rwc && rwc->try_lock) {
anon_vma = NULL;
rwc->contended = true;
goto out;
}
/* trylock failed, we got to sleep */
if (!atomic_inc_not_zero(&anon_vma->refcount)) {
anon_vma = NULL;
@ -840,8 +848,10 @@ static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
* @memcg: target memory cgroup
* @vm_flags: collect encountered vma->vm_flags who actually referenced the page
*
* Quick test_and_clear_referenced for all mappings to a page,
* returns the number of ptes which referenced the page.
* Quick test_and_clear_referenced for all mappings of a page,
*
* Return: The number of mappings which referenced the page. Return -1 if
* the function bailed out due to rmap lock contention.
*/
int page_referenced(struct page *page,
int is_locked,
@ -857,6 +867,7 @@ int page_referenced(struct page *page,
.rmap_one = page_referenced_one,
.arg = (void *)&pra,
.anon_lock = page_lock_anon_vma_read,
.try_lock = true,
};
*vm_flags = 0;
@ -887,7 +898,7 @@ int page_referenced(struct page *page,
if (we_locked)
unlock_page(page);
return pra.referenced;
return rwc.contended ? -1 : pra.referenced;
}
static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
@ -1801,7 +1812,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
struct anon_vma *anon_vma;
if (rwc->anon_lock)
return rwc->anon_lock(page);
return rwc->anon_lock(page, rwc);
/*
* Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
@ -1813,7 +1824,17 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
if (!anon_vma)
return NULL;
if (anon_vma_trylock_read(anon_vma))
goto out;
if (rwc->try_lock) {
anon_vma = NULL;
rwc->contended = true;
goto out;
}
anon_vma_lock_read(anon_vma);
out:
return anon_vma;
}
@ -1904,8 +1925,18 @@ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
pgoff_start = page_to_pgoff(page);
pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
if (!locked)
if (!locked) {
if (i_mmap_trylock_read(mapping))
goto lookup;
if (rwc->try_lock) {
rwc->contended = true;
return;
}
i_mmap_lock_read(mapping);
}
lookup:
vma_interval_tree_foreach(vma, &mapping->i_mmap,
pgoff_start, pgoff_end) {
unsigned long address = vma_address(page, vma);

View File

@ -1042,6 +1042,10 @@ static enum page_references page_check_references(struct page *page,
if (vm_flags & VM_LOCKED)
return PAGEREF_RECLAIM;
/* rmap lock contention: rotate */
if (referenced_ptes == -1)
return PAGEREF_KEEP;
if (referenced_ptes) {
if (PageSwapBacked(page))
return PAGEREF_ACTIVATE;
@ -2200,8 +2204,9 @@ static void shrink_active_list(unsigned long nr_to_scan,
}
}
/* Referenced or rmap lock contention: rotate */
if (page_referenced(page, 0, sc->target_mem_cgroup,
&vm_flags)) {
&vm_flags) != 0) {
nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and