FROMLIST: mm: multi-gen LRU: kill switch
Add /sys/kernel/mm/lru_gen/enabled as a kill switch. Components that
can be disabled include:
  0x0001: the multi-gen LRU core
  0x0002: walking page tables, when arch_has_hw_pte_young() returns true
  0x0004: clearing the accessed bit in non-leaf PMD entries, when
          CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y
  [yYnN]: apply to all the components above

E.g.,
  echo y >/sys/kernel/mm/lru_gen/enabled
  cat /sys/kernel/mm/lru_gen/enabled
  0x0007
  echo 5 >/sys/kernel/mm/lru_gen/enabled
  cat /sys/kernel/mm/lru_gen/enabled
  0x0005

NB: the page table walks happen on the scale of seconds under heavy
memory pressure, in which case the mmap_lock contention is a lesser
concern, compared with the LRU lock contention and the I/O congestion.
So far the only well-known case of the mmap_lock contention happens on
Android, due to Scudo [1] which allocates several thousand VMAs for
merely a few hundred MBs. The SPF and the Maple Tree have also provided
their own assessments [2][3]. However, if walking page tables does
worsen the mmap_lock contention, the kill switch can be used to disable
it. In this case the multi-gen LRU will suffer a minor performance
degradation, as shown previously.

Clearing the accessed bit in non-leaf PMD entries can also be disabled,
since this behavior was not tested on x86 varieties other than Intel
and AMD.

[1] https://source.android.com/devices/tech/debug/scudo
[2] https://lore.kernel.org/lkml/20220128131006.67712-1-michel@lespinasse.org/
[3] https://lore.kernel.org/lkml/20220202024137.2516438-1-Liam.Howlett@oracle.com/

Link: https://lore.kernel.org/r/20220309021230.721028-11-yuzhao@google.com/
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Bug: 228114874
Change-Id: I71801d9470a2588cad8bfd14fbcfafc7b010aa03
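A minimal sketch of toggling a single component from the shell, assuming
bash-style arithmetic expansion: store_enable() in the diff below treats the
written value as the complete new bitmask, so clearing one bit leaves the
other components as they are.

  # read the current capabilities, e.g. 0x0007; $((...)) accepts the 0x prefix
  caps=$(cat /sys/kernel/mm/lru_gen/enabled)
  # clear only bit 0x0002 (the page table walks) and write the result back
  echo $(( caps & ~0x0002 )) >/sys/kernel/mm/lru_gen/enabled
  cat /sys/kernel/mm/lru_gen/enabled
  0x0005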
include/linux/cgroup.h
@@ -423,6 +423,18 @@ static inline void cgroup_put(struct cgroup *cgrp)
 	css_put(&cgrp->self);
 }
 
+extern struct mutex cgroup_mutex;
+
+static inline void cgroup_lock(void)
+{
+	mutex_lock(&cgroup_mutex);
+}
+
+static inline void cgroup_unlock(void)
+{
+	mutex_unlock(&cgroup_mutex);
+}
+
 /**
  * task_css_set_check - obtain a task's css_set with extra access conditions
  * @task: the task to obtain css_set for
@@ -437,7 +449,6 @@ static inline void cgroup_put(struct cgroup *cgrp)
  * as locks used during the cgroup_subsys::attach() methods.
  */
 #ifdef CONFIG_PROVE_RCU
-extern struct mutex cgroup_mutex;
 extern spinlock_t css_set_lock;
 #define task_css_set_check(task, __c)		\
 	rcu_dereference_check((task)->cgroups,	\
@@ -701,6 +712,8 @@ struct cgroup_subsys_state;
 struct cgroup;
 
 static inline void css_put(struct cgroup_subsys_state *css) {}
+static inline void cgroup_lock(void) {}
+static inline void cgroup_unlock(void) {}
 static inline int cgroup_attach_task_all(struct task_struct *from,
 					 struct task_struct *t) { return 0; }
 static inline int cgroupstats_build(struct cgroupstats *stats,
include/linux/mm_inline.h
@@ -93,7 +93,15 @@ static __always_inline enum lru_list page_lru(struct page *page)
 
 static inline bool lru_gen_enabled(void)
 {
-	return true;
+#ifdef CONFIG_LRU_GEN_ENABLED
+	DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+	return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
+#else
+	DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+	return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
+#endif
 }
 
 static inline bool lru_gen_in_fault(void)
@@ -183,7 +191,7 @@ static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bo
 	int zone = page_zonenum(page);
 	struct lru_gen_struct *lrugen = &lruvec->lrugen;
 
-	if (PageUnevictable(page))
+	if (PageUnevictable(page) || !lrugen->enabled)
 		return false;
 	/*
 	 * There are three common cases for this page:
include/linux/mmzone.h
@@ -312,6 +312,13 @@ enum {
 	LRU_GEN_FILE,
 };
 
+enum {
+	LRU_GEN_CORE,
+	LRU_GEN_MM_WALK,
+	LRU_GEN_NONLEAF_YOUNG,
+	NR_LRU_GEN_CAPS
+};
+
 #define MIN_LRU_BATCH		BITS_PER_LONG
 #define MAX_LRU_BATCH		(MIN_LRU_BATCH * 128)
 
@@ -350,6 +357,8 @@ struct lru_gen_struct {
 	/* can be modified without holding the LRU lock */
 	atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
 	atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+	/* whether the multi-gen LRU is enabled */
+	bool enabled;
 };
 
 enum {
kernel/cgroup/cgroup-internal.h
@@ -144,7 +144,6 @@ struct cgroup_sb_opts {
 	bool none;
 };
 
-extern struct mutex cgroup_mutex;
 extern spinlock_t css_set_lock;
 extern struct cgroup_subsys *cgroup_subsys[];
 extern struct list_head cgroup_roots;
mm/Kconfig
@@ -852,6 +852,12 @@ config LRU_GEN
 	help
 	  A high performance LRU implementation to overcommit memory.
 
+config LRU_GEN_ENABLED
+	bool "Enable by default"
+	depends on LRU_GEN
+	help
+	  This option enables the multi-gen LRU by default.
+
 config LRU_GEN_STATS
 	bool "Full stats for debugging"
 	depends on LRU_GEN
mm/vmscan.c
@@ -53,6 +53,7 @@
 #include <linux/psi.h>
 #include <linux/pagevec.h>
 #include <linux/shmem_fs.h>
+#include <linux/ctype.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -2551,6 +2552,12 @@ out:
 
 #ifdef CONFIG_LRU_GEN
 
+#ifdef CONFIG_LRU_GEN_ENABLED
+DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
+#else
+DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
+#endif
+
 /******************************************************************************
  *                          shorthand helpers
  ******************************************************************************/
@@ -2587,6 +2594,15 @@ static int page_lru_tier(struct page *page)
 	return lru_tier_from_refs(refs);
 }
 
+static bool get_cap(int cap)
+{
+#ifdef CONFIG_LRU_GEN_ENABLED
+	return static_branch_likely(&lru_gen_caps[cap]);
+#else
+	return static_branch_unlikely(&lru_gen_caps[cap]);
+#endif
+}
+
 static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
 {
 	struct pglist_data *pgdat = NODE_DATA(nid);
@@ -3393,7 +3409,8 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area
 			goto next;
 
 		if (!pmd_trans_huge(pmd[i])) {
-			if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
+			if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
+			    get_cap(LRU_GEN_NONLEAF_YOUNG))
 				pmdp_test_and_clear_young(vma, addr, pmd + i);
 			goto next;
 		}
@@ -3501,10 +3518,12 @@ restart:
 		priv->mm_stats[MM_PMD_TOTAL]++;
 
 #ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
-		if (!pmd_young(val))
-			continue;
+		if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
+			if (!pmd_young(val))
+				continue;
 
-		walk_pmd_range_locked(pud, addr, vma, walk, &pos);
+			walk_pmd_range_locked(pud, addr, vma, walk, &pos);
+		}
 #endif
 		if (!priv->full_scan && !test_bloom_filter(priv->lruvec, priv->max_seq, pmd + i))
 			continue;
@@ -3745,7 +3764,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
 	 * handful of PTEs. Spreading the work out over a period of time usually
 	 * is less efficient, but it avoids bursty page faults.
 	 */
-	if (!full_scan && !arch_has_hw_pte_young()) {
+	if (!full_scan && (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK))) {
 		success = iterate_mm_list_nowalk(lruvec, max_seq);
 		goto done;
 	}
@@ -4456,6 +4475,223 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
 	blk_finish_plug(&plug);
 }
 
+/******************************************************************************
+ *                          state change
+ ******************************************************************************/
+
+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
+{
+	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+	if (lrugen->enabled) {
+		enum lru_list lru;
+
+		for_each_evictable_lru(lru) {
+			if (!list_empty(&lruvec->lists[lru]))
+				return false;
+		}
+	} else {
+		int gen, type, zone;
+
+		for_each_gen_type_zone(gen, type, zone) {
+			if (!list_empty(&lrugen->lists[gen][type][zone]))
+				return false;
+
+			/* unlikely but not a bug when reset_batch_size() is pending */
+			VM_WARN_ON(lrugen->nr_pages[gen][type][zone]);
+		}
+	}
+
+	return true;
+}
+
+static bool fill_evictable(struct lruvec *lruvec)
+{
+	enum lru_list lru;
+	int remaining = MAX_LRU_BATCH;
+
+	for_each_evictable_lru(lru) {
+		int type = is_file_lru(lru);
+		bool active = is_active_lru(lru);
+		struct list_head *head = &lruvec->lists[lru];
+
+		while (!list_empty(head)) {
+			bool success;
+			struct page *page = lru_to_page(head);
+
+			VM_BUG_ON_PAGE(PageTail(page), page);
+			VM_BUG_ON_PAGE(PageUnevictable(page), page);
+			VM_BUG_ON_PAGE(PageActive(page) != active, page);
+			VM_BUG_ON_PAGE(page_is_file_cache(page) != type, page);
+			VM_BUG_ON_PAGE(page_lru_gen(page) < MAX_NR_GENS, page);
+
+			prefetchw_prev_lru_page(page, head, flags);
+
+			del_page_from_lru_list(page, lruvec);
+			success = lru_gen_add_page(lruvec, page, false);
+			VM_BUG_ON(!success);
+
+			if (!--remaining)
+				return false;
+		}
+	}
+
+	return true;
+}
+
+static bool drain_evictable(struct lruvec *lruvec)
+{
+	int gen, type, zone;
+	int remaining = MAX_LRU_BATCH;
+
+	for_each_gen_type_zone(gen, type, zone) {
+		struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
+
+		while (!list_empty(head)) {
+			bool success;
+			struct page *page = lru_to_page(head);
+
+			VM_BUG_ON_PAGE(PageTail(page), page);
+			VM_BUG_ON_PAGE(PageUnevictable(page), page);
+			VM_BUG_ON_PAGE(PageActive(page), page);
+			VM_BUG_ON_PAGE(page_is_file_cache(page) != type, page);
+			VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
+
+			prefetchw_prev_lru_page(page, head, flags);
+
+			success = lru_gen_del_page(lruvec, page, false);
+			VM_BUG_ON(!success);
+			add_page_to_lru_list(page, lruvec);
+
+			if (!--remaining)
+				return false;
+		}
+	}
+
+	return true;
+}
+
+static void lru_gen_change_state(bool enable)
+{
+	static DEFINE_MUTEX(state_mutex);
+
+	struct mem_cgroup *memcg;
+
+	cgroup_lock();
+	cpus_read_lock();
+	get_online_mems();
+	mutex_lock(&state_mutex);
+
+	if (enable == lru_gen_enabled())
+		goto unlock;
+
+	if (enable)
+		static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+	else
+		static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		int nid;
+
+		for_each_node(nid) {
+			struct pglist_data *pgdat = NODE_DATA(nid);
+			struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+			if (!lruvec)
+				continue;
+
+			if (!pgdat) {
+				lruvec->lrugen.enabled = enable;
+				continue;
+			}
+
+			spin_lock_irq(&pgdat->lru_lock);
+
+			VM_BUG_ON(!seq_is_valid(lruvec));
+			VM_BUG_ON(!state_is_valid(lruvec));
+
+			lruvec->lrugen.enabled = enable;
+
+			while (!(enable ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
+				spin_unlock_irq(&pgdat->lru_lock);
+				cond_resched();
+				spin_lock_irq(&pgdat->lru_lock);
+			}
+
+			spin_unlock_irq(&pgdat->lru_lock);
+		}
+
+		cond_resched();
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+unlock:
+	mutex_unlock(&state_mutex);
+	put_online_mems();
+	cpus_read_unlock();
+	cgroup_unlock();
+}
+
+/******************************************************************************
+ *                          sysfs interface
+ ******************************************************************************/
+
+static ssize_t show_enable(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+	unsigned int caps = 0;
+
+	if (get_cap(LRU_GEN_CORE))
+		caps |= BIT(LRU_GEN_CORE);
+
+	if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
+		caps |= BIT(LRU_GEN_MM_WALK);
+
+	if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
+		caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
+
+	return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
+}
+
+static ssize_t store_enable(struct kobject *kobj, struct kobj_attribute *attr,
+			    const char *buf, size_t len)
+{
+	int i;
+	unsigned int caps;
+
+	if (tolower(*buf) == 'n')
+		caps = 0;
+	else if (tolower(*buf) == 'y')
+		caps = -1;
+	else if (kstrtouint(buf, 0, &caps))
+		return -EINVAL;
+
+	for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
+		bool enable = caps & BIT(i);
+
+		if (i == LRU_GEN_CORE)
+			lru_gen_change_state(enable);
+		else if (enable)
+			static_branch_enable(&lru_gen_caps[i]);
+		else
+			static_branch_disable(&lru_gen_caps[i]);
+	}
+
+	return len;
+}
+
+static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
+	enabled, 0644, show_enable, store_enable
+);
+
+static struct attribute *lru_gen_attrs[] = {
+	&lru_gen_enabled_attr.attr,
+	NULL
+};
+
+static struct attribute_group lru_gen_attr_group = {
+	.name = "lru_gen",
+	.attrs = lru_gen_attrs,
+};
+
 /******************************************************************************
  *                          initialization
 ******************************************************************************/
@@ -4466,6 +4702,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
 	struct lru_gen_struct *lrugen = &lruvec->lrugen;
 
 	lrugen->max_seq = MIN_NR_GENS + 1;
+	lrugen->enabled = lru_gen_enabled();
 
 	for_each_gen_type_zone(gen, type, zone)
 		INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
@@ -4506,6 +4743,9 @@ static int __init init_lru_gen(void)
 	BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
 	BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
 
+	if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
+		pr_err("lru_gen: failed to create sysfs group\n");
+
 	return 0;
 };
 late_initcall(init_lru_gen);