diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index cc4daeb50f9a..703d58fa188d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1080,6 +1080,12 @@ static inline void mem_cgroup_split_huge_fixup(struct page *head)
 {
 }
 
+static inline void __count_memcg_events(struct mem_cgroup *memcg,
+					enum vm_event_item idx,
+					unsigned long count)
+{
+}
+
 static inline void count_memcg_events(struct mem_cgroup *memcg,
 				      enum vm_event_item idx,
 				      unsigned long count)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ab9312e072a0..e60c11dc1037 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -757,6 +757,8 @@ struct zonelist {
 extern struct page *mem_map;
 #endif
 
+struct mm_walk_args;
+
 /*
  * On NUMA machines, each NUMA node would have a pg_data_t to describe
  * it's memory layout. On UMA machines there is a single pglist_data which
@@ -853,6 +855,9 @@ typedef struct pglist_data {
 
 	unsigned long		flags;
 
+#ifdef CONFIG_LRU_GEN
+	struct mm_walk_args *mm_walk_args;
+#endif
 	ZONE_PADDING(_pad2_)
 
 	/* Per-node vmstats */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0c014310ed22..5f319bc6481c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1163,6 +1163,11 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		    !(PageAnon(page) && !PageSwapBacked(page)))
 			sc->nr_scanned++;
 
+		/* in case the page was found accessed by lru_gen_scan_around() */
+		if (lru_gen_enabled() && !skip_reference_check &&
+		    page_mapped(page) && PageReferenced(page))
+			goto keep_locked;
+
 		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
 			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
@@ -3800,6 +3805,485 @@ out:
 	set_page_dirty(pte_page(pte[i]));
 }
 
+/******************************************************************************
+ *                          the eviction
+ ******************************************************************************/
+
+static bool should_skip_page(struct page *page, struct scan_control *sc)
+{
+	if (!sc->may_unmap && page_mapped(page))
+		return true;
+
+	if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
+	    (PageDirty(page) || (PageAnon(page) && !PageSwapCache(page))))
+		return true;
+
+	if (!get_page_unless_zero(page))
+		return true;
+
+	return false;
+}
+
+static bool sort_page(struct page *page, struct lruvec *lruvec, int tier_to_isolate)
+{
+	bool success;
+	int gen = page_lru_gen(page);
+	int type = page_is_file_cache(page);
+	int zone = page_zonenum(page);
+	int tier = lru_tier_from_usage(page_tier_usage(page));
+	struct lrugen *lrugen = &lruvec->evictable;
+
+	VM_BUG_ON_PAGE(gen == -1, page);
+	VM_BUG_ON_PAGE(tier_to_isolate < 0, page);
+
+	/* a lazy-free page that has been written into? */
+	if (type && PageDirty(page) && PageAnon(page)) {
+		success = lru_gen_deletion(page, lruvec);
+		VM_BUG_ON_PAGE(!success, page);
+		SetPageSwapBacked(page);
+		add_page_to_lru_list_tail(page, lruvec);
+		return true;
+	}
+
+	/* page_update_gen() has updated the gen #? */
+	if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
+		list_move(&page->lru, &lrugen->lists[gen][type][zone]);
+		return true;
+	}
+
+	/* activate this page if its tier has a higher refault rate */
+	if (tier_to_isolate < tier) {
+		int hist = hist_from_seq_or_gen(gen);
+
+		page_inc_gen(page, lruvec, false);
+		WRITE_ONCE(lrugen->activated[hist][type][tier - 1],
+			   lrugen->activated[hist][type][tier - 1] + hpage_nr_pages(page));
+		inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
+		return true;
+	}
+
+	/* mark this page for reclaim if it's pending writeback */
+	if (PageWriteback(page) || (type && PageDirty(page))) {
+		page_inc_gen(page, lruvec, true);
+		return true;
+	}
+
+	return false;
+}
+
+static void isolate_page(struct page *page, struct lruvec *lruvec)
+{
+	bool success;
+
+	success = lru_gen_deletion(page, lruvec);
+	VM_BUG_ON_PAGE(!success, page);
+
+	ClearPageLRU(page);
+
+	if (PageActive(page)) {
+		ClearPageActive(page);
+		/* make sure shrink_page_list() rejects this page */
+		SetPageReferenced(page);
+		return;
+	}
+
+	/* make sure shrink_page_list() doesn't try to write this page */
+	ClearPageReclaim(page);
+	/* make sure shrink_page_list() doesn't reject this page */
+	ClearPageReferenced(page);
+}
+
+static int scan_pages(struct lruvec *lruvec, struct scan_control *sc, long *nr_to_scan,
+		      int type, int tier, struct list_head *list)
+{
+	bool success;
+	int gen, zone;
+	enum vm_event_item item;
+	int sorted = 0;
+	int scanned = 0;
+	int isolated = 0;
+	int batch_size = 0;
+	struct lrugen *lrugen = &lruvec->evictable;
+	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+	VM_BUG_ON(!list_empty(list));
+
+	if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
+		return -ENOENT;
+
+	gen = lru_gen_from_seq(lrugen->min_seq[type]);
+
+	for (zone = sc->reclaim_idx; zone >= 0; zone--) {
+		LIST_HEAD(moved);
+		int skipped = 0;
+		struct list_head *head = &lrugen->lists[gen][type][zone];
+
+		while (!list_empty(head)) {
+			struct page *page = lru_to_page(head);
+			int delta = hpage_nr_pages(page);
+
+			VM_BUG_ON_PAGE(PageTail(page), page);
+			VM_BUG_ON_PAGE(PageUnevictable(page), page);
+			VM_BUG_ON_PAGE(PageActive(page), page);
+			VM_BUG_ON_PAGE(page_is_file_cache(page) != type, page);
+			VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
+
+			prefetchw_prev_lru_page(page, head, flags);
+
+			scanned += delta;
+
+			if (sort_page(page, lruvec, tier))
+				sorted += delta;
+			else if (should_skip_page(page, sc)) {
+				list_move(&page->lru, &moved);
+				skipped += delta;
+			} else {
+				isolate_page(page, lruvec);
+				list_add(&page->lru, list);
+				isolated += delta;
+			}
+
+			if (scanned >= *nr_to_scan || isolated >= SWAP_CLUSTER_MAX ||
+			    ++batch_size == MAX_BATCH_SIZE)
+				break;
+		}
+
+		list_splice(&moved, head);
+		__count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
+
+		if (scanned >= *nr_to_scan || isolated >= SWAP_CLUSTER_MAX ||
+		    batch_size == MAX_BATCH_SIZE)
+			break;
+	}
+
+	success = try_inc_min_seq(lruvec, type);
+
+	item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+	if (global_reclaim(sc)) {
+		__count_vm_events(item, isolated);
+		__count_vm_events(PGREFILL, sorted);
+	}
+	__count_memcg_events(memcg, item, isolated);
+	__count_memcg_events(memcg, PGREFILL, sorted);
+
+	*nr_to_scan -= scanned;
+
+	if (*nr_to_scan <= 0 || success || isolated)
+		return isolated;
+	/*
+	 * We may have trouble finding eligible pages due to reclaim_idx,
+	 * may_unmap and may_writepage. The following check makes sure we won't
+	 * be stuck if we aren't making enough progress.
+	 */
+	return batch_size == MAX_BATCH_SIZE && sorted >= SWAP_CLUSTER_MAX ? 0 : -ENOENT;
+}
+
+static int get_tier_to_isolate(struct lruvec *lruvec, int type)
+{
+	int tier;
+	struct controller_pos sp, pv;
+
+	/*
+	 * Ideally we don't want to evict upper tiers that have higher refault
+	 * rates. However, we need to leave a margin for the fluctuations in
+	 * refault rates. So we use a larger gain factor to make sure upper
+	 * tiers are indeed more active. We choose 2 because the lowest upper
+	 * tier would have twice the refault rate of the base tier, according
+	 * to their numbers of accesses.
+	 */
+	read_controller_pos(&sp, lruvec, type, 0, 1);
+	for (tier = 1; tier < MAX_NR_TIERS; tier++) {
+		read_controller_pos(&pv, lruvec, type, tier, 2);
+		if (!positive_ctrl_err(&sp, &pv))
+			break;
+	}
+
+	return tier - 1;
+}
+
+static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_to_isolate)
+{
+	int type, tier;
+	struct controller_pos sp, pv;
+	int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
+
+	/*
+	 * Compare the refault rates between the base tiers of anon and file to
+	 * determine which type to evict. We also need to compare the refault
+	 * rates of the upper tiers of the selected type with that of the base
+	 * tier of the other type to determine which tier of the selected type
+	 * to evict.
+	 */
+	read_controller_pos(&sp, lruvec, 0, 0, gain[0]);
+	read_controller_pos(&pv, lruvec, 1, 0, gain[1]);
+	type = positive_ctrl_err(&sp, &pv);
+
+	read_controller_pos(&sp, lruvec, !type, 0, gain[!type]);
+	for (tier = 1; tier < MAX_NR_TIERS; tier++) {
+		read_controller_pos(&pv, lruvec, type, tier, gain[type]);
+		if (!positive_ctrl_err(&sp, &pv))
+			break;
+	}
+
+	*tier_to_isolate = tier - 1;
+
+	return type;
+}
+
+static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+			 long *nr_to_scan, int *type_to_scan, struct list_head *list)
+{
+	int i;
+	int type;
+	int isolated;
+	int tier = -1;
+	DEFINE_MAX_SEQ();
+	DEFINE_MIN_SEQ();
+
+	VM_BUG_ON(!seq_is_valid(lruvec));
+
+	if (get_hi_wmark(max_seq, min_seq, swappiness) == MIN_NR_GENS)
+		return 0;
+	/*
+	 * Try to select a type based on generations and swappiness, and if that
+	 * fails, fall back to get_type_to_scan(). When anon and file are both
+	 * available from the same generation, swappiness 200 is interpreted as
+	 * anon first and swappiness 1 is interpreted as file first.
+	 */
+	type = !swappiness || min_seq[0] > min_seq[1] ||
+	       (min_seq[0] == min_seq[1] && swappiness != 200 &&
+		(swappiness == 1 || get_type_to_scan(lruvec, swappiness, &tier)));
+
+	if (tier == -1)
+		tier = get_tier_to_isolate(lruvec, type);
+
+	for (i = !swappiness; i < ANON_AND_FILE; i++) {
+		isolated = scan_pages(lruvec, sc, nr_to_scan, type, tier, list);
+		if (isolated >= 0)
+			break;
+
+		type = !type;
+		tier = get_tier_to_isolate(lruvec, type);
+	}
+
+	if (isolated < 0)
+		isolated = *nr_to_scan = 0;
+
+	*type_to_scan = type;
+
+	return isolated;
+}
+
+/* Main function used by the foreground, the background and the user-triggered eviction. */
+static bool evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+			long *nr_to_scan)
+{
+	int type;
+	int isolated;
+	int reclaimed;
+	LIST_HEAD(list);
+	struct page *page;
+	enum vm_event_item item;
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+	spin_lock_irq(&pgdat->lru_lock);
+
+	isolated = isolate_pages(lruvec, sc, swappiness, nr_to_scan, &type, &list);
+	VM_BUG_ON(list_empty(&list) == !!isolated);
+
+	if (isolated)
+		__mod_node_page_state(pgdat, NR_ISOLATED_ANON + type, isolated);
+
+	spin_unlock_irq(&pgdat->lru_lock);
+
+	if (!isolated)
+		goto done;
+
+	reclaimed = shrink_page_list(&list, pgdat, sc, 0, NULL, false);
+	/*
+	 * We need to prevent rejected pages from being added back to the same
+	 * lists they were isolated from. Otherwise we may risk looping on them
+	 * forever. We use PageActive() or !PageReferenced() && PageWorkingset()
+	 * to tell lru_gen_addition() not to add them to the oldest generation.
+	 */
+	list_for_each_entry(page, &list, lru) {
+		if (PageMlocked(page))
+			continue;
+
+		if (page_mapped(page) && PageReferenced(page))
+			SetPageActive(page);
+		else if (!PageActive(page))
+			SetPageWorkingset(page);
+		ClearPageReferenced(page);
+	}
+
+	spin_lock_irq(&pgdat->lru_lock);
+
+	putback_inactive_pages(lruvec, &list);
+
+	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + type, -isolated);
+
+	item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+	if (global_reclaim(sc))
+		__count_vm_events(item, reclaimed);
+	__count_memcg_events(lruvec_memcg(lruvec), item, reclaimed);
+
+	spin_unlock_irq(&pgdat->lru_lock);
+
+	mem_cgroup_uncharge_list(&list);
+	free_unref_page_list(&list);
+
+	sc->nr_reclaimed += reclaimed;
+done:
+	return *nr_to_scan > 0 && sc->nr_reclaimed < sc->nr_to_reclaim;
+}
+
+/******************************************************************************
+ *                          page reclaim
+ ******************************************************************************/
+
+static int get_swappiness(struct lruvec *lruvec)
+{
+	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+	int swappiness = mem_cgroup_get_nr_swap_pages(memcg) >= (long)SWAP_CLUSTER_MAX ?
+			 mem_cgroup_swappiness(memcg) : 0;
+
+	VM_BUG_ON(swappiness > 200U);
+
+	return swappiness;
+}
+
+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+			   unsigned long *eligible)
+{
+	int gen, type, zone;
+	long nr_to_scan = 0;
+	struct lrugen *lrugen = &lruvec->evictable;
+	DEFINE_MAX_SEQ();
+	DEFINE_MIN_SEQ();
+
+	lru_add_drain();
+
+	for (type = !swappiness; type < ANON_AND_FILE; type++) {
+		unsigned long seq;
+
+		for (seq = min_seq[type]; seq <= max_seq; seq++) {
+			gen = lru_gen_from_seq(seq);
+
+			for (zone = 0; zone <= sc->reclaim_idx; zone++)
+				nr_to_scan += READ_ONCE(lrugen->sizes[gen][type][zone]);
+		}
+	}
+
+	*eligible = nr_to_scan = max(nr_to_scan, 0L);
+	nr_to_scan = round_up(nr_to_scan >> sc->priority, SWAP_CLUSTER_MAX);
+
+	if (get_hi_wmark(max_seq, min_seq, swappiness) > MIN_NR_GENS)
+		return nr_to_scan;
+
+	/* kswapd uses lru_gen_age_node() */
+	if (current_is_kswapd())
+		return 0;
+
+	return walk_mm_list(lruvec, max_seq, sc, swappiness, NULL) ?
+	       nr_to_scan : 0;
+}
+
+static unsigned long lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+	struct blk_plug plug;
+	unsigned long eligible;
+	long scanned = 0;
+	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+	enum mem_cgroup_protection prot = mem_cgroup_protected(sc->target_mem_cgroup, memcg);
+
+	blk_start_plug(&plug);
+
+	while (true) {
+		long nr_to_scan;
+		int swappiness = sc->may_swap ? get_swappiness(lruvec) : 0;
+
+		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &eligible) - scanned;
+		if (nr_to_scan < (long)SWAP_CLUSTER_MAX)
+			break;
+
+		scanned += nr_to_scan;
+
+		if (!evict_pages(lruvec, sc, swappiness, &nr_to_scan))
+			break;
+
+		scanned -= nr_to_scan;
+
+		if (prot == MEMCG_PROT_MIN || (prot == MEMCG_PROT_LOW && !sc->memcg_low_reclaim))
+			break;
+
+		cond_resched();
+	}
+
+	blk_finish_plug(&plug);
+
+	return eligible;
+}
+
+/******************************************************************************
+ *                          the background aging
+ ******************************************************************************/
+
+static int lru_gen_spread = MIN_NR_GENS;
+
+static void try_walk_mm_list(struct lruvec *lruvec, struct scan_control *sc)
+{
+	int gen, type, zone;
+	long old_and_young[2] = {};
+	int spread = READ_ONCE(lru_gen_spread);
+	int swappiness = get_swappiness(lruvec);
+	struct lrugen *lrugen = &lruvec->evictable;
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+	DEFINE_MAX_SEQ();
+	DEFINE_MIN_SEQ();
+
+	lru_add_drain();
+
+	for (type = !swappiness; type < ANON_AND_FILE; type++) {
+		unsigned long seq;
+
+		for (seq = min_seq[type]; seq <= max_seq; seq++) {
+			gen = lru_gen_from_seq(seq);
+
+			for (zone = 0; zone < MAX_NR_ZONES; zone++)
+				old_and_young[seq == max_seq] +=
+					READ_ONCE(lrugen->sizes[gen][type][zone]);
+		}
+	}
+
+	old_and_young[0] = max(old_and_young[0], 0L);
+	old_and_young[1] = max(old_and_young[1], 0L);
+
+	/* try to spread pages out across spread+1 generations */
+	if (old_and_young[0] >= old_and_young[1] * spread &&
+	    get_lo_wmark(max_seq, min_seq, swappiness) > max(spread, MIN_NR_GENS))
+		return;
+
+	walk_mm_list(lruvec, max_seq, sc, swappiness, pgdat->mm_walk_args);
+}
+
+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+	struct mem_cgroup *memcg;
+
+	VM_BUG_ON(!current_is_kswapd());
+
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
+		enum mem_cgroup_protection prot = mem_cgroup_protected(sc->target_mem_cgroup,
+								       memcg);
+
+		if (prot != MEMCG_PROT_MIN && (prot != MEMCG_PROT_LOW || sc->memcg_low_reclaim))
+			try_walk_mm_list(lruvec, sc);
+
+		cond_resched();
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+}
+
 /******************************************************************************
  *                          state change
  ******************************************************************************/
@@ -4003,6 +4487,21 @@ static int __meminit __maybe_unused lru_gen_online_mem(struct notifier_block *se
 	return NOTIFY_DONE;
 }
 
+static void lru_gen_start_kswapd(int nid)
+{
+	struct pglist_data *pgdat = NODE_DATA(nid);
+
+	pgdat->mm_walk_args = kvzalloc_node(size_of_mm_walk_args(), GFP_KERNEL, nid);
+	WARN_ON_ONCE(!pgdat->mm_walk_args);
+}
+
+static void lru_gen_stop_kswapd(int nid)
+{
+	struct pglist_data *pgdat = NODE_DATA(nid);
+
+	kvfree(pgdat->mm_walk_args);
+}
+
 /******************************************************************************
  *                          initialization
  ******************************************************************************/
@@ -4051,6 +4550,25 @@ static int __init init_lru_gen(void)
  */
 arch_initcall(init_lru_gen);
 
+#else /* CONFIG_LRU_GEN */
+
+static unsigned long lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+	return 0;
+}
+
+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+}
+
+static void lru_gen_start_kswapd(int nid)
+{
+}
+
+static void lru_gen_stop_kswapd(int nid)
+{
+}
+
 #endif /* CONFIG_LRU_GEN */
 
 /*
@@ -4069,6 +4587,11 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
 	struct blk_plug plug;
 	bool scan_adjusted;
 
+	if (lru_gen_enabled()) {
+		*lru_pages = lru_gen_shrink_lruvec(lruvec, sc);
+		return;
+	}
+
 	get_scan_count(lruvec, memcg, sc, nr, lru_pages);
 
 	/* Record the original scan target for proportional adjustments later */
@@ -4575,6 +5098,9 @@ static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
 {
 	struct mem_cgroup *memcg;
 
+	if (lru_gen_enabled())
+		return;
+
 	memcg = mem_cgroup_iter(root_memcg, NULL, NULL);
 	do {
 		unsigned long refaults;
@@ -4940,6 +5466,11 @@ static void age_active_anon(struct pglist_data *pgdat,
 {
 	struct mem_cgroup *memcg;
 
+	if (lru_gen_enabled()) {
+		lru_gen_age_node(pgdat, sc);
+		return;
+	}
+
 	if (!total_swap_pages)
 		return;
 
@@ -5536,6 +6067,8 @@ int kswapd_run(int nid)
 	if (pgdat->kswapd)
 		return 0;
 
+	lru_gen_start_kswapd(nid);
+
 	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
 	if (IS_ERR(pgdat->kswapd)) {
 		/* failure at boot is fatal */
@@ -5558,6 +6091,7 @@ void kswapd_stop(int nid)
 	if (kswapd) {
 		kthread_stop(kswapd);
 		NODE_DATA(nid)->kswapd = NULL;
+		lru_gen_stop_kswapd(nid);
 	}
 }
 