FROMLIST: mm: multi-gen LRU: support page table walks

To further exploit spatial locality, the aging prefers to walk page
tables to search for young PTEs and promote hot pages. A kill switch
will be added in the next patch to disable this behavior. When
disabled, the aging relies on the rmap only.

NB: this behavior has nothing in common with the page table scanning in
the 2.4 kernel [1], which searches page tables for old PTEs, adds cold
pages to swapcache and unmaps them.

To avoid confusion, the term "iteration" specifically means the
traversal of an entire mm_struct list; the term "walk" will be applied
to page tables and the rmap, as usual.

An mm_struct list is maintained for each memcg, and an mm_struct
follows its owner task to the new memcg when this task is migrated.
Given an lruvec, the aging iterates lruvec_memcg()->mm_list and calls
walk_page_range() with each mm_struct on this list to promote hot
pages before it increments max_seq.

When multiple page table walkers iterate the same list, each of them
gets a unique mm_struct; therefore they can run concurrently. Page
table walkers ignore any misplaced pages, e.g., if an mm_struct was
migrated, pages it left in the previous memcg will not be promoted
when its current memcg is under reclaim. Similarly, page table walkers
will not promote pages from nodes other than the one under reclaim.

This patch uses the following optimizations when walking page tables:
1. It tracks the usage of mm_struct's between context switches so that
   page table walkers can skip processes that have been sleeping since
   the last iteration.
2. It uses generational Bloom filters to record populated branches so
   that page table walkers can reduce their search space based on the
   query results, e.g., to skip page tables containing mostly holes or
   misplaced pages.
3. It takes advantage of the accessed bit in non-leaf PMD entries when
   CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y.
4. It does not zigzag between a PGD table and the same PMD table
   spanning multiple VMAs. IOW, it finishes all the VMAs within the
   range of the same PMD table before it returns to a PGD table. This
   improves the cache performance for workloads that have large
   numbers of tiny VMAs [2], especially when CONFIG_PGTABLE_LEVELS=5.

Server benchmark results:
  Single workload:
    fio (buffered I/O): no change

  Single workload:
    memcached (anon): +[5.5, 7.5]%
                         Ops/sec      KB/sec
      patch1-7:          1014393.57   39455.42
      patch1-8:          1078507.59   41949.15

  Configurations:
    no change

Client benchmark results:
  kswapd profiles:
    patch1-7
      45.54%  lzo1x_1_do_compress (real work)
       9.56%  page_vma_mapped_walk
       6.70%  _raw_spin_unlock_irq
       2.78%  ptep_clear_flush
       2.47%  do_raw_spin_lock
       2.22%  __zram_bvec_write
       1.87%  lru_gen_look_around
       1.78%  memmove
       1.77%  obj_malloc
       1.44%  free_unref_page_list

    patch1-8
      47.02%  lzo1x_1_do_compress (real work)
       6.73%  page_vma_mapped_walk
       6.14%  _raw_spin_unlock_irq
       3.39%  walk_pte_range
       2.63%  ptep_clear_flush
       2.29%  __zram_bvec_write
       2.10%  do_raw_spin_lock
       1.81%  memmove
       1.73%  obj_malloc
       1.53%  free_unref_page_list

  Configurations:
    no change

[1] https://lwn.net/Articles/23732/
[2] https://source.android.com/devices/tech/debug/scudo

Link: https://lore.kernel.org/r/20220309021230.721028-9-yuzhao@google.com/
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Bug: 228114874
Change-Id: I5a3c97cf8ebf8d65d5f9528cd979a637c190053e
This commit is contained in:
Yu Zhao 2022-02-25 20:55:13 -07:00 committed by spakkkk
parent 4aed665140
commit 400395317f
13 changed files with 1156 additions and 17 deletions

View File

@ -1033,6 +1033,7 @@ static int exec_mmap(struct mm_struct *mm)
active_mm = tsk->active_mm;
tsk->active_mm = mm;
tsk->mm = mm;
lru_gen_add_mm(mm);
/*
* This prevents preemption while active_mm is being loaded and
* it and mm are being updated, which could cause problems for
@ -1045,6 +1046,7 @@ static int exec_mmap(struct mm_struct *mm)
activate_mm(active_mm, mm);
if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
local_irq_enable();
lru_gen_use_mm(mm);
tsk->mm->vmacache_seqnum = 0;
vmacache_flush(tsk);
task_unlock(tsk);

View File

@ -311,6 +311,11 @@ struct mem_cgroup {
struct list_head event_list;
spinlock_t event_list_lock;
#ifdef CONFIG_LRU_GEN
/* per-memcg mm_struct list */
struct lru_gen_mm_list mm_list;
#endif
struct mem_cgroup_per_node *nodeinfo[0];
/* WARNING: nodeinfo must be the last member here */
};

View File

@ -1430,6 +1430,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
* (see the comment on walk_page_range() for more details)
*/
struct mm_walk {
int (*p4d_entry)(p4d_t *p4d, unsigned long addr,
unsigned long next, struct mm_walk *walk);
int (*pud_entry)(pud_t *pud, unsigned long addr,
unsigned long next, struct mm_walk *walk);
int (*pmd_entry)(pmd_t *pmd, unsigned long addr,

View File

@ -3,6 +3,7 @@
#define _LINUX_MM_TYPES_H
#include <linux/mm_types_task.h>
#include <linux/sched.h>
#include <linux/auxvec.h>
#include <linux/list.h>
@ -14,6 +15,8 @@
#include <linux/uprobes.h>
#include <linux/page-flags-layout.h>
#include <linux/workqueue.h>
#include <linux/nodemask.h>
#include <linux/mmdebug.h>
#include <linux/android_kabi.h>
#include <asm/mmu.h>
@ -509,6 +512,23 @@ struct mm_struct {
/* HMM needs to track a few things per mm */
struct hmm *hmm;
#endif
#ifdef CONFIG_LRU_GEN
struct {
/* this mm_struct is on lru_gen_mm_list */
struct list_head list;
#ifdef CONFIG_MEMCG
/* points to the memcg of "owner" above */
struct mem_cgroup *memcg;
#endif
/*
* Set when switching to this mm_struct, as a hint of
* whether it has been used since the last time per-node
* page table walkers cleared the corresponding bits.
*/
nodemask_t nodes;
} lru_gen;
#endif /* CONFIG_LRU_GEN */
} __randomize_layout;
/*
@ -535,6 +555,65 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
return (struct cpumask *)&mm->cpu_bitmap;
}
#ifdef CONFIG_LRU_GEN
/*
 * A lock-protected list of mm_structs for page table walkers. An mm_struct
 * carries its linkage in mm->lru_gen.list, and struct mem_cgroup embeds one
 * of these as its per-memcg mm_list.
 */
struct lru_gen_mm_list {
/* mm_struct list for page table walkers */
struct list_head fifo;
/* protects the list above */
spinlock_t lock;
};
void lru_gen_add_mm(struct mm_struct *mm);
void lru_gen_del_mm(struct mm_struct *mm);
#ifdef CONFIG_MEMCG
void lru_gen_migrate_mm(struct mm_struct *mm);
#endif
/*
 * Put the multi-gen LRU part of a new mm_struct into its initial state:
 * no node hints set, no memcg recorded (CONFIG_MEMCG only) and an empty,
 * self-linked list node, i.e. not on any lru_gen_mm_list yet.
 */
static inline void lru_gen_init_mm(struct mm_struct *mm)
{
	/* no node has seen this mm_struct in use so far */
	nodes_clear(mm->lru_gen.nodes);
#ifdef CONFIG_MEMCG
	/* no memcg recorded yet */
	mm->lru_gen.memcg = NULL;
#endif
	/* list_empty() on this node is true until it is added to a list */
	INIT_LIST_HEAD(&mm->lru_gen.list);
}
/*
 * Record that @mm is being used by setting every bit of mm->lru_gen.nodes.
 * Nothing is written when the caller is a kernel thread or when all bits
 * are already set.
 */
static inline void lru_gen_use_mm(struct mm_struct *mm)
{
	/*
	 * An mm_struct that is not on a list here is unexpected; it is unlikely
	 * but not a bug when racing with lru_gen_migrate_mm().
	 */
	VM_WARN_ON(list_empty(&mm->lru_gen.list));

	/* kernel threads do not count as users of this mm_struct */
	if (current->flags & PF_KTHREAD)
		return;

	/* skip the store when there is nothing left to set */
	if (nodes_full(mm->lru_gen.nodes))
		return;

	nodes_setall(mm->lru_gen.nodes);
}
#else /* !CONFIG_LRU_GEN */
/*
 * Empty stand-ins used when CONFIG_LRU_GEN is not set, so that call sites
 * can invoke the lru_gen_*_mm() hooks without their own #ifdef.
 * lru_gen_migrate_mm() is only provided under CONFIG_MEMCG, matching the
 * CONFIG_LRU_GEN=y declarations.
 */
static inline void lru_gen_add_mm(struct mm_struct *mm)
{
}
static inline void lru_gen_del_mm(struct mm_struct *mm)
{
}
#ifdef CONFIG_MEMCG
static inline void lru_gen_migrate_mm(struct mm_struct *mm)
{
}
#endif
static inline void lru_gen_init_mm(struct mm_struct *mm)
{
}
static inline void lru_gen_use_mm(struct mm_struct *mm)
{
}
#endif /* CONFIG_LRU_GEN */
struct mmu_gather;
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
unsigned long start, unsigned long end);

View File

@ -352,6 +352,58 @@ struct lru_gen_struct {
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
};
/*
 * Indices of the page table walk statistics. They size
 * lru_gen_mm_walk::mm_stats[] and the second dimension of
 * lru_gen_mm_state::stats[], so the order of the enumerators is significant.
 */
enum {
MM_PTE_TOTAL, /* total leaf entries */
MM_PTE_OLD, /* old leaf entries */
MM_PTE_YOUNG, /* young leaf entries */
MM_PMD_TOTAL, /* total non-leaf entries */
MM_PMD_FOUND, /* non-leaf entries found in Bloom filters */
MM_PMD_ADDED, /* non-leaf entries added to Bloom filters */
NR_MM_STATS /* number of counters above; must stay last */
};
/* mnemonic codes for the mm stats above */
#define MM_STAT_CODES "toydfa"
/* double-buffering Bloom filters */
#define NR_BLOOM_FILTERS 2
/*
 * State for concurrently iterating an lru_gen_mm_list. Each struct lruvec
 * embeds one of these as mm_state.
 */
struct lru_gen_mm_state {
/* set to max_seq after each iteration */
unsigned long seq;
/* where the current iteration starts (inclusive) */
struct list_head *head;
/* where the last iteration ends (exclusive) */
struct list_head *tail;
/* to wait for the last page table walker to finish */
struct wait_queue_head wait;
/* Bloom filters flip after each iteration */
unsigned long *filters[NR_BLOOM_FILTERS];
/* the mm stats for debugging */
unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
/* the number of concurrent page table walkers */
int nr_walkers;
};
/*
 * Working data of a page table walk. pglist_data embeds one instance
 * (mm_walk) and struct reclaim_state carries a pointer to one.
 */
struct lru_gen_mm_walk {
/* the lruvec under reclaim */
struct lruvec *lruvec;
/* unstable max_seq from lru_gen_struct */
unsigned long max_seq;
/* the next address within an mm to scan */
unsigned long next_addr;
/* to batch page table entries */
unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)];
/* to batch promoted pages */
int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* to batch the mm stats */
int mm_stats[NR_MM_STATS];
/* total batched items */
int batched;
/*
 * NOTE(review): the two flags below are not described in this header.
 * Presumably can_swap says whether anon pages are eligible and full_scan
 * forces a walk that ignores the skip hints -- confirm against the users
 * in mm/vmscan.c.
 */
bool can_swap;
bool full_scan;
};
void lru_gen_init_lruvec(struct lruvec *lruvec);
void *lru_gen_eviction(struct page *page);
void lru_gen_refault(struct page *page, void *shadow);
@ -403,6 +455,8 @@ struct lruvec {
#ifdef CONFIG_LRU_GEN
/* evictable pages divided into generations */
struct lru_gen_struct lrugen;
/* to concurrently iterate lru_gen_mm_list */
struct lru_gen_mm_state mm_state;
#endif
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
@ -883,6 +937,11 @@ typedef struct pglist_data {
unsigned long flags;
#ifdef CONFIG_LRU_GEN
/* kswapd mm walk data */
struct lru_gen_mm_walk mm_walk;
#endif
ZONE_PADDING(_pad2_)
/* Per-node vmstats */

View File

@ -128,6 +128,10 @@ union swap_header {
*/
struct reclaim_state {
unsigned long reclaimed_slab;
#ifdef CONFIG_LRU_GEN
/* per-thread mm walk data */
struct lru_gen_mm_walk *mm_walk;
#endif
};
#ifdef __KERNEL__

View File

@ -489,6 +489,7 @@ assign_new_owner:
goto retry;
}
mm->owner = c;
lru_gen_migrate_mm(mm);
task_unlock(c);
put_task_struct(c);
}

View File

@ -1002,6 +1002,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
goto fail_nocontext;
mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
return mm;
fail_nocontext:
@ -1044,6 +1045,7 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
lru_gen_del_mm(mm);
mmdrop(mm);
}
@ -2386,6 +2388,13 @@ long _do_fork(unsigned long clone_flags,
get_task_struct(p);
}
if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
/* lock the task to synchronize with memcg migration */
task_lock(p);
lru_gen_add_mm(p->mm);
task_unlock(p);
}
wake_up_new_task(p);
/* forking complete and child started to run, tell ptracer */

View File

@ -3551,6 +3551,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
*/
switch_mm_irqs_off(prev->active_mm, next->mm, next);
lru_gen_use_mm(next->mm);
if (!prev->mm) { // from kernel
/* will mmdrop() in finish_task_switch(). */

View File

@ -5430,6 +5430,29 @@ static void mem_cgroup_move_task(void)
}
#endif
#ifdef CONFIG_LRU_GEN
/*
 * cgroup ->attach() hook: let the multi-gen LRU follow a migrated task.
 *
 * Only the first leader yielded by cgroup_taskset_for_each_leader() is
 * examined. If that task has an mm_struct and is its owner, the mm_struct
 * is handed to lru_gen_migrate_mm(). The task lock is held across both the
 * ownership check and the call.
 */
static void mem_cgroup_attach(struct cgroup_taskset *tset)
{
	struct task_struct *leader = NULL;
	struct cgroup_subsys_state *css;

	/* stop at the first leader; leader stays NULL if there is none */
	cgroup_taskset_for_each_leader(leader, css, tset)
		break;

	if (leader) {
		task_lock(leader);
		if (leader->mm && leader->mm->owner == leader)
			lru_gen_migrate_mm(leader->mm);
		task_unlock(leader);
	}
}
#else
/*
 * No-op variant for !CONFIG_LRU_GEN, so that memory_cgrp_subsys can set
 * .attach = mem_cgroup_attach unconditionally.
 */
static void mem_cgroup_attach(struct cgroup_taskset *tset)
{
}
#endif /* CONFIG_LRU_GEN */
/*
* Cgroup retains root cgroups across [un]mount cycles making it necessary
* to verify whether we're attached to the default hierarchy on each mount
@ -5799,6 +5822,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
.css_free = mem_cgroup_css_free,
.css_reset = mem_cgroup_css_reset,
.can_attach = mem_cgroup_can_attach,
.attach = mem_cgroup_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.post_attach = mem_cgroup_move_task,
.bind = mem_cgroup_bind,

View File

@ -4028,7 +4028,7 @@ static int
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac)
{
struct reclaim_state reclaim_state;
struct reclaim_state reclaim_state = {};
int progress;
unsigned int noreclaim_flag;
unsigned long pflags;

View File

@ -131,6 +131,11 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
break;
continue;
}
if (walk->p4d_entry) {
err = walk->p4d_entry(p4d, addr, next, walk);
if (err)
break;
}
if (walk->pmd_entry || walk->pte_entry)
err = walk_pud_range(p4d, addr, next, walk);
if (err)
@ -157,7 +162,7 @@ static int walk_pgd_range(unsigned long addr, unsigned long end,
break;
continue;
}
if (walk->pmd_entry || walk->pte_entry)
if (walk->p4d_entry || walk->pmd_entry || walk->pte_entry)
err = walk_p4d_range(pgd, addr, next, walk);
if (err)
break;

File diff suppressed because it is too large Load Diff