2026-01-29 22:25:33 +08:00

1745 lines
39 KiB
C

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* dynamic pool core file
*
* Copyright (C) 2024 Huawei Limited.
*/
#define pr_fmt(fmt) "Dynamic pool: " fmt
#include <linux/memblock.h>
#include <linux/dynamic_pool.h>
#include "internal.h"
#include "hugetlb_vmemmap.h"
#define CREATE_TRACE_POINTS
#include <trace/events/dynamic_pool.h>
/*
 * Boot-time mode switches, set by the "dynamic_hugetlb" and "dpool" early
 * parameters respectively. The setup handlers refuse to enable one mode
 * when the other is already enabled.
 */
static bool enable_dhugetlb;
static bool enable_dpagelist;
/* Static key that indicates whether the dynamic pool feature is enabled */
DEFINE_STATIC_KEY_FALSE(dynamic_pool_key);
/* Serializes dynamic pool management operations (inherit/destroy/add/reserve) */
static DEFINE_MUTEX(dpool_mutex);
/*
 * Backend-specific operations of a dynamic pool.
 *
 * dynamic_pool_destroy() calls restore_pool() first and drain_pool() second;
 * dynamic_pool_add_memory() calls fill_pool().
 */
struct dynamic_pool_ops {
/* Add memory to the pool; the type behind @arg is defined by the backend */
int (*fill_pool)(struct dynamic_pool *dpool, void *arg);
/* Release the pool's pages; returns 0 on success */
int (*drain_pool)(struct dynamic_pool *dpool);
/* Bring the pool back to a drainable state; returns 0 on success */
int (*restore_pool)(struct dynamic_pool *dpool);
};
/*
 * Records which dpool a page belongs to. The array is indexed by
 * hugepage_index(pfn), i.e. one slot per PUD_ORDER-sized pfn block.
 */
struct dpool_page_array {
/* Number of slots in dpool[] */
unsigned long count;
/* Owning pool of each block, or NULL */
struct dynamic_pool *dpool[];
};
/* Lower bound for the number of slots allocated by dynamic_pool_init() */
#define DEFAULT_PAGE_ARRAY_COUNT 4096
/* Slot index in dpool_page_array for a given pfn */
#define hugepage_index(pfn) ((pfn) >> PUD_ORDER)
static struct dpool_page_array *dpool_page_array;
/* Protects dpool_page_array (the pointer may be replaced when it grows) */
static DEFINE_RWLOCK(dpool_page_array_rwlock);
/* For dpagelist, there is only one dpool */
static struct dynamic_pool *dpool_global_pool;
/* Used for the percpu pages pool */
/* A pcp pool holding more than PCP_PAGE_MAX free pages is partially drained */
#define PCP_PAGE_MAX 1024
/* Number of pages moved per refill/drain of a pcp pool */
#define PCP_PAGE_BATCH (PCP_PAGE_MAX >> 2)
/* === reference function ============================================= */
/*
 * Try to take a reference on @dpool.
 *
 * Returns false when @dpool is NULL or its refcount has already dropped
 * to zero, true when a reference was acquired.
 */
static bool dpool_get_unless_zero(struct dynamic_pool *dpool)
{
	return dpool && refcount_inc_not_zero(&dpool->refcnt);
}
/*
 * Drop a reference on @dpool (NULL is accepted and ignored).
 *
 * When the last reference goes away the pool is unpublished from its memcg
 * and from dpool_global_pool, RCU readers are waited for, and the pool's
 * memory is released.
 */
static void dpool_put(struct dynamic_pool *dpool)
{
	if (!dpool || !refcount_dec_and_test(&dpool->refcnt))
		return;

	/* Last reference: unpublish first so that no new lookup can find it */
	dpool->memcg->dpool = NULL;
	css_put(&dpool->memcg->css);
	dpool_global_pool = NULL;

	/* Lookups run under rcu_read_lock(); wait for them before freeing */
	synchronize_rcu();

	free_percpu(dpool->pcp_pool);
	kfree(dpool->pfn_ranges);
	kfree(dpool);
}
static struct dynamic_pool *dpool_get_from_memcg(struct mem_cgroup *memcg)
{
struct dynamic_pool *dpool;
rcu_read_lock();
dpool = memcg->dpool;
if (!dpool_get_unless_zero(dpool))
dpool = NULL;
rcu_read_unlock();
return dpool;
}
/*
 * Look up the pool of the memcg that @tsk belongs to and take a reference.
 *
 * Returns NULL when the feature is disabled, the task has no memcg, or the
 * memcg has no live pool. The caller must dpool_put() the result.
 */
static struct dynamic_pool *dpool_get_from_task(struct task_struct *tsk)
{
	struct mem_cgroup *memcg;
	struct dynamic_pool *pool;

	if (!dpool_enabled)
		return NULL;

	/* Retry until we either pin the task's memcg or find there is none */
	rcu_read_lock();
	for (;;) {
		memcg = mem_cgroup_from_task(tsk);
		if (!memcg || css_tryget(&memcg->css))
			break;
	}
	rcu_read_unlock();

	if (!memcg)
		return NULL;

	pool = dpool_get_from_memcg(memcg);
	css_put(&memcg->css);

	return pool;
}
/*
 * Find the pool that @page may belong to and take a reference on it.
 *
 * In dhugetlb mode the pool is looked up in dpool_page_array; in dpagelist
 * mode the single global pool is returned for *any* page, so the caller
 * needs an additional check that the page really comes from the pool.
 *
 * Returns a referenced pool or NULL; the caller must dpool_put() it.
 */
static struct dynamic_pool *dpool_get_from_page(struct page *page)
{
	struct dynamic_pool *pool = NULL;

	rcu_read_lock();

	if (enable_dhugetlb) {
		unsigned long slot = hugepage_index(page_to_pfn(page));

		read_lock(&dpool_page_array_rwlock);
		if (slot < dpool_page_array->count)
			pool = dpool_page_array->dpool[slot];
		read_unlock(&dpool_page_array_rwlock);
	} else if (enable_dpagelist) {
		/* Not page specific, see the function comment */
		pool = dpool_global_pool;
	}

	if (!dpool_get_unless_zero(pool))
		pool = NULL;

	rcu_read_unlock();

	return pool;
}
bool __task_in_dynamic_pool(struct task_struct *tsk)
{
struct dynamic_pool *dpool;
if (!dpool_enabled)
return false;
dpool = dpool_get_from_task(tsk);
dpool_put(dpool);
return !!dpool;
}
bool page_in_dynamic_pool(struct page *page)
{
struct dynamic_pool *dpool;
bool ret;
if (!dpool_enabled)
return false;
if (PageDpool(page))
return true;
/*
* If the page don't have the flags, it may be in pcp list.
* Check it using the page range.
*/
dpool = dpool_get_from_page(page);
if (enable_dpagelist && dpool) {
unsigned long pfn = page_to_pfn(page);
int range_cnt = dpool->range_cnt;
struct range *range;
int i;
for (i = 0; i < range_cnt; i++) {
range = &dpool->pfn_ranges[i];
if (pfn >= range->start && pfn <= range->end)
goto out;
}
/* The pfn is not in the range, set dpool to NULL */
dpool = NULL;
}
out:
ret = dpool ? !PagePool(page) : false;
dpool_put(dpool);
return ret;
}
/* === demote and promote function ==================================== */
/* Forward declarations: used by dpool_promote_pool() before their definition */
static void dpool_disable_pcp_pool(struct dynamic_pool *dpool, bool drain);
static void dpool_enable_pcp_pool(struct dynamic_pool *dpool);
/*
 * Undo the compound setup of @folio (the inverse of prep_compound_page);
 * for details see destroy_compound_hugetlb_folio_for_demote.
 *
 * Resets the folio-level map/pin counters, then for each of the 1 << @order
 * pages clears the check-at-free flags, the mapping, the head/tail linkage
 * and the private field.
 */
static void clear_compound_page(struct folio *folio, unsigned int order)
{
	const int nr_pages = 1 << order;
	int idx;

	atomic_set(&folio->_entire_mapcount, 0);
	atomic_set(&folio->_nr_pages_mapped, 0);
	atomic_set(&folio->_pincount, 0);

	for (idx = 0; idx < nr_pages; idx++) {
		struct page *p = folio_page(folio, idx);

		p->flags &= ~PAGE_FLAGS_CHECK_AT_FREE;
		p->mapping = NULL;
		/* Page 0 is the head, every other page is a tail */
		if (idx == 0)
			__ClearPageHead(p);
		else
			clear_compound_head(p);
		set_page_private(p, 0);
	}
}
/*
 * Split one free PUD-sized page of @src_pool into PMD-sized huge pages and
 * queue them on @dst_pool's freelist.
 *
 * Returns 0 on success or -EHWPOISON when @page is poisoned, in which case
 * nothing is modified.
 */
static int dpool_demote_gigantic_page(struct pages_pool *src_pool,
				      struct pages_pool *dst_pool,
				      struct page *page)
{
	struct hstate *h = size_to_hstate(PMD_SIZE);
	struct folio *folio = page_folio(page);
	const int nr_pages = 1 << PUD_ORDER;
	const int step = 1 << PMD_ORDER;
	int off;

	if (PageHWPoison(page))
		return -EHWPOISON;

	/* Take the gigantic page out of the source pool */
	list_del(&page->lru);
	__ClearPageDpool(page);
	src_pool->free_pages--;

	destroy_compound_hugetlb_folio_for_demote(folio, PUD_ORDER);

	/* Rebuild it as a run of PMD-order hugetlb folios */
	for (off = 0; off < nr_pages; off += step) {
		struct page *sub = folio_page(folio, off);

		prep_compound_page(sub, PMD_ORDER);
		folio_change_private(page_folio(sub), NULL);
		__SetPageDpool(sub);
		__prep_new_hugetlb_folio(h, page_folio(sub));
		list_add_tail(&sub->lru, &dst_pool->freelist);
		dst_pool->free_pages++;
	}

	return 0;
}
/*
 * Split one free PMD-sized huge page of @src_pool into base pages and queue
 * them on @dst_pool's freelist.
 *
 * Returns 0 on success or -EHWPOISON when @page is poisoned, in which case
 * nothing is modified.
 */
static int dpool_demote_huge_page(struct pages_pool *src_pool,
				  struct pages_pool *dst_pool,
				  struct page *page)
{
	struct folio *folio = page_folio(page);
	const int nr_pages = 1 << PMD_ORDER;
	int idx;

	if (PageHWPoison(page))
		return -EHWPOISON;

	/* Take the huge page out of the source pool */
	list_del(&page->lru);
	__ClearPageDpool(page);
	src_pool->free_pages--;

	/* Strip the hugetlb and compound state before handing out base pages */
	__folio_clear_hugetlb(page_folio(page));
	clear_compound_page(page_folio(page), PMD_ORDER);

	for (idx = 0; idx < nr_pages; idx++) {
		struct page *sub = folio_page(folio, idx);

		dpool_free_page_prepare(sub);
		__SetPageDpool(sub);
		list_add_tail(&sub->lru, &dst_pool->freelist);
		dst_pool->free_pages++;
	}

	return 0;
}
/*
 * Split one free page of pool[@type] into pages of pool[@type + 1].
 *
 * Must be called with dpool->lock held. If pool[@type] has no free page,
 * the next larger pool is demoted first (recursively). On success the
 * split page is recorded on the source pool's splitlist so that it can be
 * promoted back later.
 *
 * Returns 0 on success, -EINVAL for an invalid @type, -ENOMEM when no
 * bookkeeping memory or no page is available, or the error of the last
 * demote attempt (e.g. -EHWPOISON).
 */
static int dpool_demote_pool_locked(struct dynamic_pool *dpool, int type)
{
	struct pages_pool *src_pool, *dst_pool;
	struct split_page *spage = NULL;
	struct page *page;
	/* The page that was actually split; stays NULL on every failure path */
	struct page *demoted = NULL;
	int ret = -ENOMEM;

	lockdep_assert_held(&dpool->lock);

	if (type < 0 || type >= PAGES_POOL_MAX - 1)
		return -EINVAL;

	src_pool = &dpool->pool[type];
	dst_pool = &dpool->pool[type + 1];

	/* dpool->lock is held, so the allocation must not sleep */
	spage = kzalloc(sizeof(struct split_page), GFP_ATOMIC);
	if (!spage)
		goto out;

	if (!src_pool->free_pages && dpool_demote_pool_locked(dpool, type - 1))
		goto out;

	list_for_each_entry(page, &src_pool->freelist, lru) {
		switch (type) {
		case PAGES_POOL_1G:
			ret = dpool_demote_gigantic_page(src_pool, dst_pool, page);
			break;
		case PAGES_POOL_2M:
			ret = dpool_demote_huge_page(src_pool, dst_pool, page);
			break;
		default:
			BUG();
		}
		if (!ret) {
			/*
			 * Remember the page explicitly: if the loop runs to
			 * completion the cursor does not point to a real page.
			 */
			demoted = page;
			break;
		}
	}

out:
	if (!ret) {
		spage->start_pfn = page_to_pfn(demoted);
		list_add(&spage->entry, &src_pool->splitlist);
		src_pool->split_pages++;
	} else {
		kfree(spage);
	}
	trace_dpool_demote(dpool, type, demoted, ret);

	return ret;
}
/*
 * Merge the PMD-sized pages of a previously split gigantic page back into
 * one PUD-sized page and move it from @src_pool to @dst_pool.
 *
 * Nothing is modified unless every PMD-sized block is free in the pool and
 * not poisoned. Returns 0 on success, -EBUSY when a block is not free in
 * the pool, -EHWPOISON when a block is poisoned.
 */
static int dpool_promote_gigantic_page(struct pages_pool *src_pool,
				       struct pages_pool *dst_pool,
				       struct split_page *spage)
{
	struct hstate *h = size_to_hstate(PUD_SIZE);
	const int nr_pages = 1 << PUD_ORDER;
	const int step = 1 << PMD_ORDER;
	struct page *head, *sub;
	int off;

	/* Pass 1: verify that all blocks can be merged */
	for (off = 0; off < nr_pages; off += step) {
		sub = pfn_to_page(spage->start_pfn + off);
		if (!PageDpool(sub))
			return -EBUSY;

		if (PageHWPoison(sub))
			return -EHWPOISON;
	}

	/* Pass 2: tear down every PMD-order folio and unlink it */
	for (off = 0; off < nr_pages; off += step) {
		sub = pfn_to_page(spage->start_pfn + off);
		__folio_clear_hugetlb(page_folio(sub));
		clear_compound_page(page_folio(sub), PMD_ORDER);
		__ClearPageDpool(sub);
		list_del(&sub->lru);
		src_pool->free_pages--;
	}

	/* Rebuild the gigantic folio and hand it to the destination pool */
	head = pfn_to_page(spage->start_pfn);
	prep_compound_gigantic_folio_for_demote(page_folio(head), PUD_ORDER);
	folio_change_private(page_folio(head), NULL);
	__SetPageDpool(head);
	__prep_new_hugetlb_folio(h, page_folio(head));
	list_add_tail(&head->lru, &dst_pool->freelist);
	dst_pool->free_pages++;

	return 0;
}
/*
 * Merge the base pages of a previously split huge page back into one
 * PMD-sized page and move it from @src_pool to @dst_pool.
 *
 * Nothing is modified unless every base page is free in the pool and not
 * poisoned. Returns 0 on success, -EBUSY when a page is not free in the
 * pool, -EHWPOISON when a page is poisoned.
 */
static int dpool_promote_huge_page(struct pages_pool *src_pool,
				   struct pages_pool *dst_pool,
				   struct split_page *spage)
{
	struct hstate *h = size_to_hstate(PMD_SIZE);
	const int nr_pages = 1 << PMD_ORDER;
	struct page *head, *sub;
	int idx;

	/* Pass 1: verify that all base pages can be merged */
	for (idx = 0; idx < nr_pages; idx++) {
		sub = pfn_to_page(spage->start_pfn + idx);
		if (!PageDpool(sub))
			return -EBUSY;

		if (PageHWPoison(sub))
			return -EHWPOISON;
	}

	/* Pass 2: unlink every base page from the source pool */
	for (idx = 0; idx < nr_pages; idx++) {
		sub = pfn_to_page(spage->start_pfn + idx);
		__ClearPageDpool(sub);
		list_del(&sub->lru);
		src_pool->free_pages--;
	}

	/* Build the huge folio (refcount frozen at 0) and queue it */
	head = pfn_to_page(spage->start_pfn);
	dpool_prep_new_page(head, PMD_ORDER, __GFP_COMP, 0);
	set_page_count(head, 0);
	folio_change_private(page_folio(head), NULL);
	__SetPageDpool(head);
	__prep_new_hugetlb_folio(h, page_folio(head));
	list_add_tail(&head->lru, &dst_pool->freelist);
	dst_pool->free_pages++;

	return 0;
}
/*
 * Merge one split page of pool[@type] back together from the pages held in
 * pool[@type + 1].
 *
 * Walks the splitlist of pool[@type] and stops at the first entry that can
 * be promoted; that entry is removed from the splitlist and freed.
 *
 * Returns 0 when one page was promoted, -EINVAL for an invalid @type,
 * -ENOMEM when there is no split page, otherwise the error of the last
 * promote attempt.
 */
static int dpool_promote_pool(struct dynamic_pool *dpool, int type)
{
struct pages_pool *src_pool, *dst_pool;
struct split_page *spage, *spage_next;
struct page *page = NULL;
int ret = -ENOMEM;
if (type < 0 || type >= PAGES_POOL_MAX - 1)
return -EINVAL;
/* Promotion moves pages from the smaller pool to the larger one */
src_pool = &dpool->pool[type + 1];
dst_pool = &dpool->pool[type];
spin_lock_irq(&dpool->lock);
if (!dst_pool->split_pages)
goto unlock;
list_for_each_entry_safe(spage, spage_next, &dst_pool->splitlist, entry) {
switch (type) {
case PAGES_POOL_1G:
ret = dpool_promote_gigantic_page(src_pool, dst_pool, spage);
break;
case PAGES_POOL_2M: {
unsigned long nr_pages = 1 << PMD_ORDER;
/*
 * Since the dpool_mutex is already locked,
 * there is no way to free spage_next, so
 * it is safe to unlock here.
 */
spin_unlock_irq(&dpool->lock);
cond_resched();
lru_add_drain_all();
/* Keep the pcp pools drained and disabled while pages are migrated away */
dpool_disable_pcp_pool(dpool, true);
do_migrate_range(spage->start_pfn,
spage->start_pfn + nr_pages);
spin_lock_irq(&dpool->lock);
dpool_enable_pcp_pool(dpool);
ret = dpool_promote_huge_page(src_pool, dst_pool, spage);
break;
}
default:
BUG();
}
/* Stop at the first split page that could be merged */
if (!ret)
break;
}
if (!ret) {
page = pfn_to_page(spage->start_pfn);
list_del(&spage->entry);
dst_pool->split_pages--;
}
unlock:
spin_unlock_irq(&dpool->lock);
/* spage is only valid (and already unlinked) when the promotion succeeded */
if (!ret)
kfree(spage);
trace_dpool_promote(dpool, type, page, ret);
return ret;
}
/* === percpu pool function =========================================== */
/*
 * Move up to @count free base pages from the pool's 4K freelist to
 * @pcp_pool. If the 4K pool is empty, one 2M page is demoted first; when
 * that fails nothing is moved.
 *
 * The caller must hold pcp_pool->lock.
 */
static void dpool_refill_pcp_pool(struct dynamic_pool *dpool,
				  struct pcp_pages_pool *pcp_pool,
				  unsigned long count)
{
	struct pages_pool *pool = &dpool->pool[PAGES_POOL_4K];
	struct page *cur, *tmp;
	unsigned long flags;
	int moved = 0;

	lockdep_assert_held(&pcp_pool->lock);

	spin_lock_irqsave(&dpool->lock, flags);
	if (pool->free_pages || !dpool_demote_pool_locked(dpool, PAGES_POOL_2M)) {
		list_for_each_entry_safe(cur, tmp, &pool->freelist, lru) {
			list_move_tail(&cur->lru, &pcp_pool->freelist);
			/* Pages on a pcp list do not carry the Dpool flag */
			__ClearPageDpool(cur);
			pool->free_pages--;
			pcp_pool->free_pages++;
			if (++moved == count)
				break;
		}
	}
	spin_unlock_irqrestore(&dpool->lock, flags);
}
/*
 * Move up to @count free pages from @pcp_pool back to the pool's 4K
 * freelist and fold the pcp used-page counter into the 4K pool counter.
 *
 * The caller must hold pcp_pool->lock.
 */
static void dpool_drain_pcp_pool(struct dynamic_pool *dpool,
				 struct pcp_pages_pool *pcp_pool,
				 unsigned long count)
{
	struct pages_pool *pool = &dpool->pool[PAGES_POOL_4K];
	struct page *cur, *tmp;
	unsigned long flags;
	int moved = 0;

	lockdep_assert_held(&pcp_pool->lock);

	spin_lock_irqsave(&dpool->lock, flags);

	list_for_each_entry_safe(cur, tmp, &pcp_pool->freelist, lru) {
		list_move_tail(&cur->lru, &pool->freelist);
		/* Back on the pool freelist: mark the page again */
		__SetPageDpool(cur);
		pcp_pool->free_pages--;
		pool->free_pages++;
		if (++moved == count)
			break;
	}

	/* The used counter is transferred in full, regardless of @count */
	pool->used_pages += pcp_pool->used_pages;
	pcp_pool->used_pages = 0;

	spin_unlock_irqrestore(&dpool->lock, flags);
}
/* Return every free page of every CPU's pcp pool to the 4K pool. */
static void dpool_drain_all_pcp_pool(struct dynamic_pool *dpool)
{
	unsigned long flags;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct pcp_pages_pool *pcp = per_cpu_ptr(dpool->pcp_pool, cpu);

		spin_lock_irqsave(&pcp->lock, flags);
		dpool_drain_pcp_pool(dpool, pcp, pcp->free_pages);
		spin_unlock_irqrestore(&pcp->lock, flags);
	}
}
/*
 * Acquire and immediately release every CPU's pcp lock, so that any
 * critical section that was in progress on a pcp pool has finished by the
 * time this function returns.
 */
static void dpool_wait_all_pcp_pool_unlock(struct dynamic_pool *dpool)
{
	unsigned long flags;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct pcp_pages_pool *pcp = per_cpu_ptr(dpool->pcp_pool, cpu);

		spin_lock_irqsave(&pcp->lock, flags);
		spin_unlock_irqrestore(&pcp->lock, flags);
	}
}
/*
 * Sum the free and used page counters over all CPUs' pcp pools.
 *
 * No lock is taken here; the caller has to make sure nobody else writes
 * the counters while they are being summed.
 */
static void dpool_sum_pcp_pool(struct dynamic_pool *dpool,
			       unsigned long *free_pages, long *used_pages)
{
	unsigned long nr_free = 0;
	long nr_used = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct pcp_pages_pool *pcp = per_cpu_ptr(dpool->pcp_pool, cpu);

		nr_free += pcp->free_pages;
		nr_used += pcp->used_pages;
	}

	*free_pages = nr_free;
	*used_pages = nr_used;
}
/*
 * Disable the pcp pools of @dpool. Calls nest: each one must be balanced
 * by dpool_enable_pcp_pool().
 *
 * After the disable count is raised, either drain all pcp pools (@drain)
 * or just wait for current pcp lock holders to finish.
 */
static void dpool_disable_pcp_pool(struct dynamic_pool *dpool, bool drain)
{
	atomic_inc(&dpool->pcp_refcnt);

	if (!drain) {
		dpool_wait_all_pcp_pool_unlock(dpool);
		return;
	}

	dpool_drain_all_pcp_pool(dpool);
}
/* Undo one dpool_disable_pcp_pool() call by dropping the disable count. */
static void dpool_enable_pcp_pool(struct dynamic_pool *dpool)
{
	atomic_dec(&dpool->pcp_refcnt);
}
/* The pcp pools are usable only while nobody holds a disable count. */
static bool dpool_pcp_enabled(struct dynamic_pool *dpool)
{
	return atomic_read(&dpool->pcp_refcnt) == 0;
}
static struct page *dpool_alloc_pcp_page(struct dynamic_pool *dpool)
{
struct pcp_pages_pool *pcp_pool;
struct page *page = NULL;
unsigned long flags;
pcp_pool = this_cpu_ptr(dpool->pcp_pool);
spin_lock_irqsave(&pcp_pool->lock, flags);
if (!dpool->online || !dpool_pcp_enabled(dpool))
goto unlock;
retry:
page = NULL;
if (!pcp_pool->free_pages)
dpool_refill_pcp_pool(dpool, pcp_pool, PCP_PAGE_BATCH);
page = list_first_entry_or_null(&pcp_pool->freelist, struct page, lru);
if (!page)
goto unlock;
list_del(&page->lru);
pcp_pool->free_pages--;
pcp_pool->used_pages++;
if (dpool_check_new_page(page)) {
SetPagePool(page);
goto retry;
}
SetPagePool(page);
unlock:
spin_unlock_irqrestore(&pcp_pool->lock, flags);
return page;
}
/*
 * Give @page back to this CPU's pcp pool.
 *
 * Returns -EINVAL when the pcp pools are disabled (the caller then has to
 * free the page to the 4K pool itself), 0 otherwise. A page refused by
 * dpool_free_page_prepare() is not put on the freelist and stays flagged
 * as used; 0 is still returned in that case. When the pcp pool grows
 * beyond PCP_PAGE_MAX, one batch is drained to the 4K pool.
 */
static int dpool_free_pcp_page(struct dynamic_pool *dpool, struct page *page)
{
	struct pcp_pages_pool *pcp_pool;
	unsigned long flags;
	int ret = 0;

	pcp_pool = this_cpu_ptr(dpool->pcp_pool);
	spin_lock_irqsave(&pcp_pool->lock, flags);

	if (!dpool_pcp_enabled(dpool)) {
		ret = -EINVAL;
	} else {
		ClearPagePool(page);
		if (dpool_free_page_prepare(page)) {
			list_add(&page->lru, &pcp_pool->freelist);
			pcp_pool->free_pages++;
			pcp_pool->used_pages--;
			if (pcp_pool->free_pages > PCP_PAGE_MAX)
				dpool_drain_pcp_pool(dpool, pcp_pool,
						     PCP_PAGE_BATCH);
		} else {
			/* Not reusable: keep it marked as used */
			SetPagePool(page);
		}
	}

	spin_unlock_irqrestore(&pcp_pool->lock, flags);

	return ret;
}
/* === allocation interface =========================================== */
/*
 * Check whether @tsk may be attached to @memcg.
 *
 * A task that currently uses a dynamic pool may only move to a memcg that
 * uses the very same pool. Returns 0 when the move is allowed (including
 * when the task has no pool or the feature is off), -EPERM otherwise.
 */
int dynamic_pool_can_attach(struct task_struct *tsk, struct mem_cgroup *memcg)
{
	struct dynamic_pool *from, *to;
	int ret;

	if (!dpool_enabled)
		return 0;

	from = dpool_get_from_task(tsk);
	if (!from)
		return 0;

	to = dpool_get_from_memcg(memcg);
	ret = (to == from) ? 0 : -EPERM;

	dpool_put(from);
	dpool_put(to);

	return ret;
}
/*
 * Decide whether an allocation request is eligible for the dynamic pool.
 *
 * Only order-0 requests from non-kernel threads whose gfp mask matches
 * GFP_HIGHUSER_MOVABLE (optionally without __GFP_IO/__GFP_FS) qualify.
 * With CONFIG_MEMORY_RELIABLE, GFP_RELIABLE requests are excluded while
 * memory reliable is enabled.
 */
bool dynamic_pool_should_alloc(gfp_t gfp_mask, unsigned int order)
{
	gfp_t gfp = gfp_mask & GFP_HIGHUSER_MOVABLE;

	if ((current->flags & PF_KTHREAD) || order != 0)
		return false;

	/*
	 * The cgroup only charges anonymous and file pages from userspace.
	 * Some filesystems may have masked out __GFP_IO | __GFP_FS to avoid
	 * recursive memory requests, e.g. loop device, xfs.
	 */
	if ((gfp | __GFP_IO | __GFP_FS) != GFP_HIGHUSER_MOVABLE)
		return false;

#ifdef CONFIG_MEMORY_RELIABLE
	if (mem_reliable_is_enabled() && (gfp_mask & GFP_RELIABLE))
		return false;
#endif

	return true;
}
/*
 * Allocate one base page for current from its dynamic pool.
 *
 * The pcp pool of this CPU is tried first. Otherwise the page comes from
 * the 4K pool; if that is empty a 2M page is demoted, and if that fails
 * too all pcp pools are drained back before giving up.
 *
 * Returns the prepared page, or NULL when the request is not eligible,
 * current has no pool, the pool is offline, or no page is available.
 */
struct page *dynamic_pool_alloc_page(gfp_t gfp, unsigned int order,
				     unsigned int alloc_flags)
{
	struct dynamic_pool *dpool;
	struct pages_pool *pool;
	struct page *page = NULL;
	unsigned long flags;
	bool bad;

	if (!dpool_enabled)
		return NULL;

	if (!dynamic_pool_should_alloc(gfp, order))
		return NULL;

	dpool = dpool_get_from_task(current);
	if (!dpool)
		return NULL;

	page = dpool_alloc_pcp_page(dpool);
	if (page)
		goto put;

	pool = &dpool->pool[PAGES_POOL_4K];
	spin_lock_irqsave(&dpool->lock, flags);
	if (!dpool->online)
		goto unlock;

	for (;;) {
		page = NULL;
		if (!pool->free_pages &&
		    dpool_demote_pool_locked(dpool, PAGES_POOL_2M)) {
			/*
			 * Nothing left to split: pull back whatever sits in
			 * the pcp pools. The drain takes dpool->lock itself,
			 * so it has to be dropped around the call.
			 */
			spin_unlock_irqrestore(&dpool->lock, flags);
			dpool_drain_all_pcp_pool(dpool);
			spin_lock_irqsave(&dpool->lock, flags);
			if (!dpool->online || !pool->free_pages)
				break;
		}

		page = list_first_entry_or_null(&pool->freelist,
						struct page, lru);
		if (!page)
			break;

		__ClearPageDpool(page);
		list_del(&page->lru);
		pool->free_pages--;
		pool->used_pages++;

		/* A bad page is treated as a used page and the next one is tried */
		bad = dpool_check_new_page(page);
		SetPagePool(page);
		if (!bad)
			break;
	}

unlock:
	spin_unlock_irqrestore(&dpool->lock, flags);
put:
	dpool_put(dpool);
	if (page)
		dpool_prep_new_page(page, order, gfp, alloc_flags);

	return page;
}
/*
 * Free a base page back to the dynamic pool it belongs to.
 *
 * The pcp pool of this CPU is preferred; when the pcp pools are disabled
 * the page goes to the 4K pool directly. A page refused by
 * dpool_free_page_prepare() is not queued and stays flagged as used.
 */
void dynamic_pool_free_page(struct page *page)
{
	struct dynamic_pool *dpool;
	struct pages_pool *pool;
	unsigned long flags;

	if (!dpool_enabled)
		return;

	dpool = dpool_get_from_page(page);
	if (!dpool) {
		pr_err("get dpool failed when free page 0x%px\n", page);
		return;
	}

	if (dpool_free_pcp_page(dpool, page)) {
		/* pcp path unavailable: fall back to the shared 4K pool */
		pool = &dpool->pool[PAGES_POOL_4K];
		spin_lock_irqsave(&dpool->lock, flags);

		ClearPagePool(page);
		if (dpool_free_page_prepare(page)) {
			__SetPageDpool(page);
			list_add(&page->lru, &pool->freelist);
			pool->free_pages++;
			pool->used_pages--;
		} else {
			SetPagePool(page);
		}

		spin_unlock_irqrestore(&dpool->lock, flags);
	}

	dpool_put(dpool);
}
/*
 * Bind a hugetlbfs inode to the dynamic pool of current.
 *
 * Only PMD- and PUD-sized hstates are bound; for any other size the inode
 * gets no pool. The reference taken here is released by
 * dynamic_pool_unbind_file().
 */
void dynamic_pool_bind_file(struct hugetlbfs_inode_info *p, struct hstate *h)
{
	unsigned long size;

	if (!dpool_enabled || !p)
		return;

	size = huge_page_size(h);
	if (size != PMD_SIZE && size != PUD_SIZE) {
		p->dpool = NULL;
		return;
	}

	p->dpool = dpool_get_from_task(current);
}
/* Detach a hugetlbfs inode from its dynamic pool and drop the reference. */
void dynamic_pool_unbind_file(struct hugetlbfs_inode_info *p)
{
	struct dynamic_pool *bound;

	if (!dpool_enabled || !p)
		return;

	bound = p->dpool;
	if (!bound)
		return;

	/* Clear the link before the put so the inode never sees a stale pool */
	p->dpool = NULL;
	dpool_put(bound);
}
/*
 * Account a hugetlb reservation change of @delta pages against the pool
 * bound to inode @p.
 *
 * A positive @delta reserves pages and fails with -ENOMEM when fewer than
 * @delta unreserved free huge pages exist. A non-positive @delta releases
 * reservations and always succeeds. Returns 0 when the inode has no pool
 * or the feature is disabled.
 */
int dynamic_pool_hugetlb_acct_memory(struct hstate *h, long delta,
				     struct hugetlbfs_inode_info *p)
{
	struct dynamic_pool *dpool;
	struct pages_pool *pool;
	unsigned long flags;
	int type;
	int ret = -ENOMEM;

	if (!dpool_enabled || !p || !p->dpool)
		return 0;

	dpool = p->dpool;
	spin_lock_irqsave(&dpool->lock, flags);

	if (hstate_is_gigantic(h))
		type = PAGES_POOL_1G;
	else
		type = PAGES_POOL_2M;
	pool = &dpool->pool[type];

	if (delta > 0) {
		if (delta <= pool->free_huge_pages - pool->resv_huge_pages) {
			pool->resv_huge_pages += delta;
			ret = 0;
		}
	} else {
		unsigned long nr = (unsigned long)(-delta);

		/*
		 * The counter appears to be unsigned (it is printed with %lu
		 * elsewhere in this file), so a "< 0" test after the
		 * subtraction can never trigger. Detect the underflow before
		 * the value wraps instead.
		 */
		WARN_ON(nr > pool->resv_huge_pages);
		pool->resv_huge_pages -= nr;
		ret = 0;
	}
	spin_unlock_irqrestore(&dpool->lock, flags);
	trace_dpool_acct_memory(dpool, type, delta, pool->resv_huge_pages,
				ret);

	return ret;
}
/*
 * Allocate one huge page (2M or 1G, depending on @h) from the pool bound
 * to inode @p.
 *
 * The first free folio that is not hardware-poisoned is taken. When
 * @reserved is set, one reservation is consumed and the folio is marked to
 * restore it on free.
 *
 * Returns the folio with its refcount set to 1, or NULL when the feature
 * is disabled, @p is NULL or has no pool, the pool is offline, or no
 * usable folio is free.
 */
struct folio *dynamic_pool_alloc_hugepage(struct hugetlbfs_inode_info *p,
					  struct hstate *h, bool reserved)
{
	struct dynamic_pool *dpool;
	struct pages_pool *pool;
	struct folio *folio = NULL;
	unsigned long flags;
	int type;

	/* Tolerate a NULL inode info like the other hugetlbfs hooks do */
	if (!dpool_enabled || !p)
		return NULL;

	dpool = p->dpool;
	if (!dpool)
		return NULL;

	if (hstate_is_gigantic(h))
		type = PAGES_POOL_1G;
	else
		type = PAGES_POOL_2M;
	pool = &dpool->pool[type];

	spin_lock_irqsave(&dpool->lock, flags);
	if (!dpool->online)
		goto unlock;

	list_for_each_entry(folio, &pool->freelist, lru) {
		if (folio_test_hwpoison(folio))
			continue;

		list_del(&folio->lru);
		__folio_clear_dpool(folio);
		/* Free folios are kept with a frozen refcount of 0 */
		folio_ref_unfreeze(folio, 1);
		pool->free_huge_pages--;
		pool->used_huge_pages++;
		if (reserved) {
			folio_set_hugetlb_restore_reserve(folio);
			pool->resv_huge_pages--;
		}
		folio_set_pool(folio);
		goto unlock;
	}
	/* The loop ran to completion: the cursor is not a real folio */
	folio = NULL;

unlock:
	spin_unlock_irqrestore(&dpool->lock, flags);
	trace_dpool_alloc_hugepage(dpool, type, folio, pool->free_huge_pages,
				   pool->resv_huge_pages);

	return folio;
}
/*
 * Return a huge page to the dynamic pool it belongs to.
 *
 * A hardware-poisoned folio is not put back on the freelist and the
 * counters are left untouched. When @restore_reserve is set, the
 * reservation consumed at allocation time is given back.
 */
void dynamic_pool_free_hugepage(struct folio *folio, bool restore_reserve)
{
	struct hstate *h = folio_hstate(folio);
	struct dynamic_pool *dpool;
	struct pages_pool *pool;
	unsigned long flags;
	int type;

	if (!dpool_enabled)
		return;

	dpool = dpool_get_from_page(folio_page(folio, 0));
	if (!dpool) {
		pr_err("get dpool failed when free hugepage 0x%px\n", folio);
		return;
	}

	if (hstate_is_gigantic(h))
		type = PAGES_POOL_1G;
	else
		type = PAGES_POOL_2M;
	pool = &dpool->pool[type];

	spin_lock_irqsave(&dpool->lock, flags);
	if (folio_test_hwpoison(folio))
		goto unlock;

	folio_clear_pool(folio);
	__folio_set_dpool(folio);
	list_add(&folio->lru, &pool->freelist);
	pool->free_huge_pages++;
	pool->used_huge_pages--;
	if (restore_reserve)
		pool->resv_huge_pages++;

unlock:
	spin_unlock_irqrestore(&dpool->lock, flags);
	/*
	 * Trace while our reference is still held: the put below may release
	 * the last reference and free the pool, and @pool points into it.
	 */
	trace_dpool_free_hugepage(dpool, type, folio, pool->free_huge_pages,
				  pool->resv_huge_pages);
	dpool_put(dpool);
}
/* === dynamic pool function ========================================== */
/*
 * mem_cgroup_scan_cgroups() callback: log the name of every visited memcg
 * except the root of the scan, which is passed in through @message.
 */
static void dpool_dump_child_memcg(struct mem_cgroup *memcg, void *message)
{
	struct mem_cgroup *scan_root = message;

	if (memcg == scan_root)
		return;

	pr_err("child memcg exists: ");
	pr_cont_cgroup_name(memcg->css.cgroup);
	pr_cont("\n");
}
static struct dynamic_pool *dpool_create(struct mem_cgroup *memcg,
struct dynamic_pool_ops *ops)
{
struct dynamic_pool *dpool;
int cpu;
int i;
if (memcg_has_children(memcg)) {
pr_err("create failed, memcg has children\n");
mem_cgroup_scan_cgroups(memcg, dpool_dump_child_memcg, memcg);
return NULL;
}
dpool = kzalloc(sizeof(struct dynamic_pool), GFP_KERNEL);
if (!dpool)
return NULL;
dpool->pcp_pool = alloc_percpu(struct pcp_pages_pool);
if (!dpool->pcp_pool) {
kfree(dpool);
return NULL;
}
spin_lock_init(&dpool->lock);
refcount_set(&dpool->refcnt, 1);
dpool->memcg = memcg;
dpool->ops = ops;
atomic_set(&dpool->pcp_refcnt, 0);
for (i = 0; i < PAGES_POOL_MAX; i++) {
INIT_LIST_HEAD(&dpool->pool[i].freelist);
INIT_LIST_HEAD(&dpool->pool[i].splitlist);
}
for_each_possible_cpu(cpu) {
struct pcp_pages_pool *pcp_pool;
pcp_pool = per_cpu_ptr(dpool->pcp_pool, cpu);
spin_lock_init(&pcp_pool->lock);
INIT_LIST_HEAD(&pcp_pool->freelist);
pcp_pool->free_pages = 0;
pcp_pool->used_pages = 0;
}
css_get(&memcg->css);
memcg->dpool = dpool;
dpool->online = true;
return dpool;
}
/*
 * Let a newly created @memcg share the dynamic pool of its parent.
 *
 * The child only stores the pointer; it does not own a reference on the
 * pool. Does nothing when the feature is disabled, @memcg is NULL, or
 * @memcg has no parent.
 */
void dynamic_pool_inherit(struct mem_cgroup *memcg)
{
	struct mem_cgroup *parent;
	struct dynamic_pool *dpool;

	/* Check @memcg before it is handed to parent_mem_cgroup() */
	if (!dpool_enabled || !memcg)
		return;

	parent = parent_mem_cgroup(memcg);
	if (!parent)
		return;

	mutex_lock(&dpool_mutex);
	dpool = dpool_get_from_memcg(parent);
	memcg->dpool = dpool;

	/* Don't increase refcount for child memcg */
	dpool_put(dpool);
	mutex_unlock(&dpool_mutex);
}
/*
 * Tear down the dynamic pool of the memcg behind @cgrp.
 *
 * If the memcg merely inherited the pool from another memcg, only the
 * link is cleared. Otherwise the pool is taken offline, its pcp pools are
 * drained and disabled, the memcg is force-emptied, and the backend's
 * restore_pool() and drain_pool() are run before the initial reference is
 * dropped.
 *
 * @clear_css_online is set to true when CSS_ONLINE was cleared here; it is
 * not written otherwise.
 *
 * Returns 0 on success (or when there is nothing to do), otherwise the
 * error from restore_pool()/drain_pool(). On such a failure the pool stays
 * attached but remains offline with its pcp pools disabled.
 */
int dynamic_pool_destroy(struct cgroup *cgrp, bool *clear_css_online)
{
struct cgroup_subsys_state *css = cgrp->subsys[memory_cgrp_id];
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct dynamic_pool *dpool;
int ret = 0;
if (!dpool_enabled || !memcg)
return 0;
mutex_lock(&dpool_mutex);
dpool = dpool_get_from_memcg(memcg);
if (!dpool)
goto unlock;
/* The pool belongs to another memcg: just drop the inherited link */
if (dpool->memcg != memcg) {
memcg->dpool = NULL;
goto put;
}
/* An offline dpool is not allowed for allocation */
dpool->online = false;
/* Disable pcp pool forever */
dpool_disable_pcp_pool(dpool, true);
/*
 * Even if no process exists in the memory cgroup, some pages may
 * still be occupied. Release these pages before restore pool.
 */
mem_cgroup_force_empty(dpool->memcg);
BUG_ON(!dpool->ops->restore_pool);
ret = dpool->ops->restore_pool(dpool);
if (ret) {
pr_err("restore pool failed\n");
goto put;
}
BUG_ON(!dpool->ops->drain_pool);
ret = dpool->ops->drain_pool(dpool);
if (ret) {
pr_err("drain pool failed\n");
goto put;
}
memcg->dpool = NULL;
/* Release the initial reference count */
dpool_put(dpool);
/*
 * Since dpool is destroyed and the memcg will be freed then,
 * clear CSS_ONLINE immediately to prevent race with create.
 */
if (cgrp->self.flags & CSS_ONLINE) {
cgrp->self.flags &= ~CSS_ONLINE;
*clear_css_online = true;
}
put:
/* Drops the reference taken by dpool_get_from_memcg() above */
dpool_put(dpool);
unlock:
mutex_unlock(&dpool_mutex);
return ret;
}
/*
 * Boot-time initialisation: when one of the two modes was requested on the
 * command line, set up the page-to-pool array (dhugetlb mode only) and
 * switch on the static key.
 *
 * Returns 0 on success or when neither mode is requested, -ENOMEM when the
 * page array cannot be allocated.
 */
static int __init dynamic_pool_init(void)
{
	if (!(enable_dhugetlb || enable_dpagelist))
		return 0;

	if (enable_dhugetlb) {
		unsigned long nr_slots, bytes;

		/* One slot per PUD-sized block, with a fixed lower bound */
		nr_slots = max_t(unsigned long, hugepage_index(max_pfn),
				 DEFAULT_PAGE_ARRAY_COUNT);
		bytes = sizeof(struct dpool_page_array) +
			nr_slots * sizeof(struct dynamic_pool *);

		dpool_page_array = kzalloc(bytes, GFP_KERNEL);
		if (!dpool_page_array) {
			pr_err("init failed\n");
			return -ENOMEM;
		}

		dpool_page_array->count = nr_slots;
	}

	static_branch_enable(&dynamic_pool_key);
	pr_info("enabled\n");

	return 0;
}
subsys_initcall(dynamic_pool_init);
/* === Dynamic hugetlb interface ====================================== */
/*
 * Parse the "dynamic_hugetlb" early parameter. The value is ignored when
 * the dpagelist mode has already been enabled.
 */
static int __init dynamic_hugetlb_setup(char *buf)
{
	return enable_dpagelist ? 0 : kstrtobool(buf, &enable_dhugetlb);
}
early_param("dynamic_hugetlb", dynamic_hugetlb_setup);
static int dpool_record_page(struct dynamic_pool *dpool, unsigned long idx)
{
read_lock(&dpool_page_array_rwlock);
/*
* If page's pfn is greater than dhugetlb_pagelist_t->count (which
* may occurs due to memory hotplug) then dhugetlb_pagelist_t need
* to be reallocated, so need write_lock here.
*/
if (idx >= dpool_page_array->count) {
unsigned long size;
struct dpool_page_array *tmp;
read_unlock(&dpool_page_array_rwlock);
write_lock(&dpool_page_array_rwlock);
size = sizeof(struct dpool_page_array) +
(idx + 1) * sizeof(struct dynamic_pool *);
tmp = krealloc(dpool_page_array, size, GFP_ATOMIC);
if (!tmp) {
write_unlock(&dpool_page_array_rwlock);
return -ENOMEM;
}
tmp->count = idx + 1;
dpool_page_array = tmp;
write_unlock(&dpool_page_array_rwlock);
read_lock(&dpool_page_array_rwlock);
}
dpool_page_array->dpool[idx] = dpool;
read_unlock(&dpool_page_array_rwlock);
return 0;
}
/*
 * fill_pool backend for dhugetlb: move free 1G hugetlb pages of node
 * dpool->nid into the pool's 1G freelist.
 *
 * @arg points to an unsigned long holding the number of pages requested.
 *
 * Returns -EINVAL when there is no PUD-sized hstate, -ENOMEM when the node
 * does not have enough free unreserved pages, 0 otherwise. Note that 0 is
 * returned even if some pages could not be taken over.
 */
static int dpool_fill_from_hugetlb(struct dynamic_pool *dpool, void *arg)
{
	struct hstate *h = size_to_hstate(PUD_SIZE);
	unsigned long nr_pages = *(unsigned long *)arg;
	int nid = dpool->nid;
	unsigned long count = 0;
	struct pages_pool *pool = &dpool->pool[PAGES_POOL_1G];
	struct page *page, *next;
	struct folio *folio;
	unsigned long idx;
	LIST_HEAD(page_list);

	if (!h)
		return -EINVAL;

	spin_lock_irq(&hugetlb_lock);
	if ((h->free_huge_pages_node[nid] < nr_pages) ||
	    (h->free_huge_pages - h->resv_huge_pages < nr_pages)) {
		spin_unlock_irq(&hugetlb_lock);
		return -ENOMEM;
	}

	while (count < nr_pages) {
		folio = dequeue_hugetlb_folio_node_exact(h, nid);
		if (!folio)
			break;
		page = folio_page(folio, 0);
		/* dequeue will unfreeze the page, refreeze it. */
		page_ref_freeze(page, 1);
		idx = hugepage_index(page_to_pfn(page));
		if (dpool_record_page(dpool, idx)) {
			/*
			 * NOTE(review): if the next dequeue hands back this
			 * same folio, the loop may not make progress while
			 * recording keeps failing -- confirm.
			 */
			enqueue_hugetlb_folio(h, folio);
			pr_err("dpool_page_array can't record page 0x%px\n",
				page);
			continue;
		}
		list_move(&page->lru, &page_list);
		count++;
	}
	spin_unlock_irq(&hugetlb_lock);

	list_for_each_entry_safe(page, next, &page_list, lru) {
		if (hugetlb_vmemmap_restore(h, page)) {
			/*
			 * Give back the folio of *this* page (the variable
			 * left over from the loop above refers to a different
			 * folio or is NULL) and forget its pool mapping, so
			 * that the page neither stays on the on-stack list
			 * nor stays recorded as belonging to the pool.
			 */
			idx = hugepage_index(page_to_pfn(page));
			WARN_ON(dpool_record_page(NULL, idx));
			spin_lock_irq(&hugetlb_lock);
			enqueue_hugetlb_folio(h, page_folio(page));
			spin_unlock_irq(&hugetlb_lock);
			pr_err("restore hugetlb_vmemmap failed page 0x%px\n",
				page);
			continue;
		}

		__SetPageDpool(page);
		spin_lock_irq(&dpool->lock);
		list_move(&page->lru, &pool->freelist);
		pool->free_pages++;
		dpool->total_pages++;
		spin_unlock_irq(&dpool->lock);
	}

	return 0;
}
static int dpool_drain_to_hugetlb(struct dynamic_pool *dpool)
{
struct hstate *h = size_to_hstate(PUD_SIZE);
struct pages_pool *pool = &dpool->pool[PAGES_POOL_1G];
struct page *page, *next;
unsigned long idx;
LIST_HEAD(page_list);
if (!h)
return -EINVAL;
spin_lock_irq(&dpool->lock);
list_for_each_entry_safe(page, next, &pool->freelist, lru) {
WARN_ON(PageHWPoison(page));
idx = hugepage_index(page_to_pfn(page));
WARN_ON(dpool_record_page(NULL, idx));
list_move(&page->lru, &page_list);
__ClearPageDpool(page);
pool->free_pages--;
dpool->total_pages--;
}
spin_unlock_irq(&dpool->lock);
list_for_each_entry_safe(page, next, &page_list, lru) {
hugetlb_vmemmap_optimize(h, page);
spin_lock_irq(&hugetlb_lock);
enqueue_hugetlb_folio(h, page_folio(page));
spin_unlock_irq(&hugetlb_lock);
}
return dpool->total_pages ? -ENOMEM : 0;
}
/*
 * restore_pool backend for dhugetlb: merge all split pages back, first
 * 4K -> 2M, then 2M -> 1G, and release the huge page reservations of both
 * levels.
 *
 * Returns 0 on success, the promote error when a split page cannot be
 * merged, or -ENOMEM when huge pages of a level are still used, reserved
 * or split.
 */
static int dpool_merge_all(struct dynamic_pool *dpool)
{
	struct pages_pool *pool;
	int ret;

	/* Level 1: rebuild every split 2M page from its 4K pages */
	pool = &dpool->pool[PAGES_POOL_2M];
	while (pool->split_pages) {
		cond_resched();
		ret = dpool_promote_pool(dpool, PAGES_POOL_2M);
		if (ret) {
			pr_err("some 4K pages can't merge ret: %d, delete failed: \n",
				ret);
			pr_cont_cgroup_name(dpool->memcg->css.cgroup);
			pr_cont("\n");
			return ret;
		}
	}

	spin_lock_irq(&dpool->lock);
	if (pool->split_pages || pool->used_huge_pages || pool->resv_huge_pages) {
		pr_err("some 2M pages are still in use or mmap, delete failed: ");
		pr_cont_cgroup_name(dpool->memcg->css.cgroup);
		pr_cont("\n");
		spin_unlock_irq(&dpool->lock);
		return -ENOMEM;
	}
	/* Turn the reserved 2M huge pages back into plain free pages */
	pool->free_pages += pool->nr_huge_pages;
	pool->nr_huge_pages = 0;
	pool->free_huge_pages = 0;
	spin_unlock_irq(&dpool->lock);

	/* Level 2: rebuild every split 1G page from its 2M pages */
	pool = &dpool->pool[PAGES_POOL_1G];
	while (pool->split_pages) {
		cond_resched();
		ret = dpool_promote_pool(dpool, PAGES_POOL_1G);
		if (ret) {
			pr_err("some 2M pages can't merge ret: %d, delete failed: \n",
				ret);
			pr_cont_cgroup_name(dpool->memcg->css.cgroup);
			pr_cont("\n");
			return ret;
		}
	}

	spin_lock_irq(&dpool->lock);
	if (pool->split_pages || pool->used_huge_pages || pool->resv_huge_pages) {
		pr_err("some 1G pages are still in use or mmap, delete failed: ");
		pr_cont_cgroup_name(dpool->memcg->css.cgroup);
		pr_cont("\n");
		spin_unlock_irq(&dpool->lock);
		return -ENOMEM;
	}
	/* Turn the reserved 1G huge pages back into plain free pages */
	pool->free_pages += pool->nr_huge_pages;
	pool->nr_huge_pages = 0;
	pool->free_huge_pages = 0;
	spin_unlock_irq(&dpool->lock);

	return 0;
}
/* Backend operations used for pools created by dynamic_pool_add_memory() */
static struct dynamic_pool_ops hugetlb_dpool_ops = {
.fill_pool = dpool_fill_from_hugetlb,
.drain_pool = dpool_drain_to_hugetlb,
.restore_pool = dpool_merge_all,
};
/* If dynamic pool is disabled, hide the interface */
bool dynamic_pool_hide_files(struct cftype *cft)
{
if (dpool_enabled && enable_dhugetlb)
return false;
return !!strstr(cft->name, "dhugetlb");
}
/*
 * Add @size 1G pages from node @nid to the dynamic pool of @memcg,
 * creating the pool on first use.
 *
 * An existing pool can only be extended through the memcg that owns it and
 * only from the node it was created for.
 *
 * Returns 0 on success, -EINVAL when the feature is disabled, the memcg is
 * going offline, the pool cannot be created or the memcg/node does not
 * match, otherwise the error of the backend's fill_pool().
 */
int dynamic_pool_add_memory(struct mem_cgroup *memcg, int nid,
			    unsigned long size)
{
	struct dynamic_pool *dpool;
	bool created = false;
	int ret = -EINVAL;

	if (!dpool_enabled)
		return -EINVAL;

	mutex_lock(&dpool_mutex);

	if (!(memcg->css.cgroup->self.flags & CSS_ONLINE)) {
		pr_err("add memory failed, memcg is going offline\n");
		goto unlock;
	}

	dpool = memcg->dpool;
	if (dpool) {
		if (dpool->memcg != memcg) {
			pr_err("add memory failed, not parent memcg\n");
			goto unlock;
		}
		if (dpool->nid != nid) {
			pr_err("add memory failed, not target nid(%d)\n",
				dpool->nid);
			goto unlock;
		}
	} else {
		dpool = dpool_create(memcg, &hugetlb_dpool_ops);
		if (!dpool)
			goto unlock;

		dpool->nid = nid;
		created = true;
	}

	BUG_ON(!dpool->ops->fill_pool);
	ret = dpool->ops->fill_pool(dpool, &size);
	if (ret) {
		pr_err("fill pool failed\n");
		/*
		 * A pool that was created just for this request and could
		 * not be filled is released right away.
		 */
		if (created) {
			memcg->dpool = NULL;
			dpool_put(dpool);
		}
	}

unlock:
	mutex_unlock(&dpool_mutex);

	return ret;
}
void dynamic_pool_show(struct mem_cgroup *memcg, struct seq_file *m)
{
struct dynamic_pool *dpool;
unsigned long free_pages;
long used_pages;
if (!dpool_enabled || !memcg)
return;
dpool = dpool_get_from_memcg(memcg);
if (!dpool) {
seq_puts(m, "Current hierarchial have not memory pool.\n");
return;
}
dpool_disable_pcp_pool(dpool, false);
spin_lock_irq(&dpool->lock);
/*
* no others can modify the count because pcp pool is disabled and
* dpool->lock is locked.
*/
dpool_sum_pcp_pool(dpool, &free_pages, &used_pages);
free_pages += dpool->pool[PAGES_POOL_4K].free_pages;
used_pages += dpool->pool[PAGES_POOL_4K].used_pages;
seq_printf(m, "nid %d\n", dpool->nid);
seq_printf(m, "dhugetlb_total_pages %lu\n", dpool->total_pages);
seq_printf(m, "1G_total_reserved_pages %lu\n",
dpool->pool[PAGES_POOL_1G].nr_huge_pages);
seq_printf(m, "1G_free_reserved_pages %lu\n",
dpool->pool[PAGES_POOL_1G].free_huge_pages);
seq_printf(m, "1G_mmap_reserved_pages %lu\n",
dpool->pool[PAGES_POOL_1G].resv_huge_pages);
seq_printf(m, "1G_used_pages %lu\n",
dpool->pool[PAGES_POOL_1G].used_huge_pages);
seq_printf(m, "2M_total_reserved_pages %lu\n",
dpool->pool[PAGES_POOL_2M].nr_huge_pages);
seq_printf(m, "2M_free_reserved_pages %lu\n",
dpool->pool[PAGES_POOL_2M].free_huge_pages);
seq_printf(m, "2M_mmap_reserved_pages %lu\n",
dpool->pool[PAGES_POOL_2M].resv_huge_pages);
seq_printf(m, "2M_used_pages %lu\n",
dpool->pool[PAGES_POOL_2M].used_huge_pages);
seq_printf(m, "1G_free_unreserved_pages %lu\n",
dpool->pool[PAGES_POOL_1G].free_pages);
seq_printf(m, "2M_free_unreserved_pages %lu\n",
dpool->pool[PAGES_POOL_2M].free_pages);
seq_printf(m, "4K_free_pages %lu\n", free_pages);
seq_printf(m, "4K_used_pages %ld\n", used_pages);
spin_unlock_irq(&dpool->lock);
dpool_enable_pcp_pool(dpool);
dpool_put(dpool);
}
/*
 * Adjust the number of reserved huge pages of the given @type in the
 * dynamic pool attached to @memcg so that it approaches @nr_pages.
 *
 * Growing moves pages from pool->free_pages into the huge page counters,
 * first trying to demote the pool at index @type - 1 and, for 2M pages only,
 * trying to promote into @type. Shrinking gives back at most the huge pages
 * that are free and not covered by resv_huge_pages. In both directions the
 * change is clamped to what is available, so the target may not be reached.
 *
 * Return: 0 once an adjustment was attempted, -EINVAL when dynamic pool is
 * disabled or @memcg has no pool.
 */
int dynamic_pool_reserve_hugepage(struct mem_cgroup *memcg,
				  unsigned long nr_pages, int type)
{
	struct dynamic_pool *dpool;
	struct pages_pool *hpool;
	unsigned long nr_delta;
	int err;

	if (!dpool_enabled)
		return -EINVAL;

	mutex_lock(&dpool_mutex);

	dpool = dpool_get_from_memcg(memcg);
	if (!dpool) {
		mutex_unlock(&dpool_mutex);
		return -EINVAL;
	}

	hpool = &dpool->pool[type];
	spin_lock_irq(&dpool->lock);

	if (nr_pages <= hpool->nr_huge_pages) {
		/* Shrink: only free, unreserved huge pages can be released. */
		nr_delta = min(hpool->nr_huge_pages - nr_pages,
			       hpool->free_huge_pages - hpool->resv_huge_pages);
		hpool->nr_huge_pages -= nr_delta;
		hpool->free_huge_pages -= nr_delta;
		hpool->free_pages += nr_delta;
	} else {
		/* Grow: keep demoting until enough pages or demotion fails. */
		nr_delta = nr_pages - hpool->nr_huge_pages;
		while (nr_delta > hpool->free_pages &&
		       !dpool_demote_pool_locked(dpool, type - 1)) {
			/* Briefly drop the lock to give others a chance. */
			spin_unlock_irq(&dpool->lock);
			cond_resched();
			spin_lock_irq(&dpool->lock);
		}

		/* Only try merge pages for 2M pages */
		if (type == PAGES_POOL_2M) {
			while (nr_delta > hpool->free_pages) {
				/* Promotion is called without dpool->lock. */
				spin_unlock_irq(&dpool->lock);
				cond_resched();
				err = dpool_promote_pool(dpool, type);
				spin_lock_irq(&dpool->lock);
				if (err)
					break;
			}
		}

		/* Take whatever could be gathered, up to the request. */
		nr_delta = min(nr_delta, hpool->free_pages);
		hpool->nr_huge_pages += nr_delta;
		hpool->free_huge_pages += nr_delta;
		hpool->free_pages -= nr_delta;
	}

	spin_unlock_irq(&dpool->lock);
	dpool_put(dpool);
	mutex_unlock(&dpool_mutex);

	return 0;
}
/* === Dynamic pagelist interface ===================================== */
/*
 * Parse the "dpool" early parameter into enable_dpagelist.
 *
 * The parameter is ignored (and reported as handled) when enable_dhugetlb
 * is already set; otherwise the result of kstrtobool() is returned.
 */
static int __init dynamic_pagelist_setup(char *buf)
{
	return enable_dhugetlb ? 0 : kstrtobool(buf, &enable_dpagelist);
}
early_param("dpool", dynamic_pagelist_setup);
/*
 * Fill the 4K pool of @dpool from the pfn ranges described by @arg.
 *
 * @dpool: pool to fill
 * @arg:   struct dpool_info carrying range_cnt and pfn_ranges; it arrives as
 *         void * because this is the fill_pool callback of the ops table
 *
 * The ranges are copied into @dpool, then every pfn in every range (end
 * inclusive) has its refcount and mapcount reset, is checked with
 * dpool_free_page_prepare(), flagged as a dpool page and queued on the 4K
 * freelist.
 *
 * Return: 0 on success, -ENOMEM if the range array cannot be allocated,
 * -EINVAL if a page fails dpool_free_page_prepare(). Pages queued before
 * such a failure are not rolled back here.
 */
static int dpool_fill_from_pagelist(struct dynamic_pool *dpool, void *arg)
{
	struct dpool_info *info = arg;
	struct pages_pool *pool = &dpool->pool[PAGES_POOL_4K];
	int i, ret = -EINVAL;

	dpool->range_cnt = info->range_cnt;
	dpool->pfn_ranges =
		kmalloc_array(info->range_cnt, sizeof(struct range), GFP_KERNEL);
	if (!dpool->pfn_ranges)
		return -ENOMEM;

	memcpy(dpool->pfn_ranges, info->pfn_ranges,
	       sizeof(struct range) * dpool->range_cnt);

	spin_lock_irq(&dpool->lock);

	for (i = 0; i < dpool->range_cnt; i++) {
		struct range *range = &dpool->pfn_ranges[i];
		u64 pfn;

		for (pfn = range->start; pfn <= range->end; pfn++) {
			struct page *page = pfn_to_page(pfn);

			set_page_count(page, 0);
			page_mapcount_reset(page);

			if (!dpool_free_page_prepare(page)) {
				pr_err("fill pool failed, check pages failed\n");
				goto unlock;
			}

			__SetPageDpool(page);
			list_add_tail(&page->lru, &pool->freelist);
			pool->free_pages++;

			/*
			 * The lock was taken with spin_lock_irq(), so interrupts
			 * are off. cond_resched_lock() would only drop the
			 * spinlock and then try to reschedule with interrupts
			 * still disabled. Release the lock together with the
			 * interrupt state before giving up the CPU.
			 */
			spin_unlock_irq(&dpool->lock);
			cond_resched();
			spin_lock_irq(&dpool->lock);
		}
	}

	ret = 0;

unlock:
	spin_unlock_irq(&dpool->lock);

	return ret;
}
/*
 * Check whether the pagelist pool can be drained.
 *
 * Return: 0 when the number of used 4K pages equals the number of poisoned
 * pages recorded in @dpool, -ENOMEM otherwise.
 */
static int dpool_drain_to_pagelist(struct dynamic_pool *dpool)
{
	struct pages_pool *pool_4k = &dpool->pool[PAGES_POOL_4K];

	/* Any used page beyond the poisoned ones blocks the drain. */
	if (pool_4k->used_pages != dpool->nr_poisoned_pages)
		return -ENOMEM;

	return 0;
}
/*
 * Walk every pfn of the pool's ranges (end inclusive) and try to migrate
 * the pages that do not carry the Dpool page flag.
 *
 * nr_poisoned_pages is reset first and then incremented for each such page
 * that is HWPoison. The result of do_migrate_range() is not checked.
 *
 * Return: always 0.
 */
static int dpool_migrate_used_pages(struct dynamic_pool *dpool)
{
	int nr_ranges = dpool->range_cnt;
	int idx;

	spin_lock_irq(&dpool->lock);

	dpool->nr_poisoned_pages = 0;

	for (idx = 0; idx < nr_ranges; idx++) {
		struct range *cur = &dpool->pfn_ranges[idx];
		u64 pfn;

		for (pfn = cur->start; pfn <= cur->end; pfn++) {
			struct page *page = pfn_to_page(pfn);

			/* Migration is attempted without holding dpool->lock. */
			spin_unlock_irq(&dpool->lock);
			cond_resched();

			/* Pages still flagged as Dpool are skipped. */
			if (!PageDpool(page)) {
				if (PageHWPoison(page))
					dpool->nr_poisoned_pages++;

				lru_add_drain_all();
				do_migrate_range(pfn, pfn + 1);
			}

			/* Retake the lock before evaluating the next pfn. */
			spin_lock_irq(&dpool->lock);
		}
	}

	spin_unlock_irq(&dpool->lock);

	return 0;
}
/*
 * Operation table for the pagelist flavour of dynamic pool; dpool_init()
 * passes it to dpool_create().
 *
 * fill_pool:    populate the 4K pool from caller-supplied pfn ranges
 * drain_pool:   check that only poisoned pages are still accounted as used
 * restore_pool: try to migrate pages that are not flagged as Dpool
 */
struct dynamic_pool_ops pagelist_dpool_ops = {
.fill_pool = dpool_fill_from_pagelist,
.drain_pool = dpool_drain_to_pagelist,
.restore_pool = dpool_migrate_used_pages,
};
/*
 * Create the pagelist dynamic pool for arg->memcg and fill it from the pfn
 * ranges in @arg.
 *
 * @arg: describes the owning memcg and the pfn ranges; must be non-NULL with
 *       a non-NULL memcg and a positive range_cnt
 *
 * Only one pagelist pool may exist: the call fails if dpool_global_pool is
 * already set or the memcg already has a pool. The memcg must be online.
 *
 * Return: 0 on success; -EINVAL if dynamic pool is disabled, @arg is
 * invalid, a pool already exists or the memcg is not online; -ENOMEM if
 * dpool_create() fails; otherwise the error returned by the fill_pool
 * callback.
 */
int dpool_init(struct dpool_info *arg)
{
	struct dynamic_pool *dpool;
	int ret;

	if (!dpool_enabled)
		return -EINVAL;

	if (!arg || !arg->memcg || arg->range_cnt <= 0) {
		pr_err("init failed, arg is invalid\n");
		return -EINVAL;
	}

	mutex_lock(&dpool_mutex);

	if (dpool_global_pool || arg->memcg->dpool) {
		pr_err("init failed, dpool is already exist\n");
		ret = -EINVAL;
		goto unlock;
	}

	if (!(arg->memcg->css.cgroup->self.flags & CSS_ONLINE)) {
		pr_err("init failed, memcg is not online\n");
		ret = -EINVAL;
		goto unlock;
	}

	dpool = dpool_create(arg->memcg, &pagelist_dpool_ops);
	if (!dpool) {
		/* Set the error code first so the message prints a defined value. */
		ret = -ENOMEM;
		pr_err("init failed, create failed. ret: %d\n", ret);
		goto unlock;
	}

	dpool_global_pool = dpool;

	BUG_ON(!dpool->ops->fill_pool);
	ret = dpool->ops->fill_pool(dpool, arg);
	if (ret) {
		/*
		 * Drop the pool on fill failure. NOTE(review): this relies on
		 * the put releasing the last reference so that dpool_put()
		 * clears dpool_global_pool again - confirm against
		 * dpool_create().
		 */
		dpool_put(dpool);
	}

unlock:
	mutex_unlock(&dpool_mutex);

	return ret;
}
/*
 * Report the total and free size of the pagelist dynamic pool in kB.
 *
 * @m: seq_file to print into, or NULL to emit the values with pr_info()
 *
 * Does nothing unless both dynamic pool and the pagelist mode are enabled.
 * When no pool is found the counters stay at zero and zeros are reported.
 * The page counts are the per-cpu pool sums plus the shared 4K pool,
 * gathered with the pcp pool disabled and dpool->lock held.
 */
void dynamic_pool_show_meminfo(struct seq_file *m)
{
	struct dynamic_pool *dpool;
	struct pages_pool *pool;
	unsigned long free_pages = 0;
	long used_pages = 0;
	unsigned long total_kb, free_kb;
	unsigned long flags;

	if (!dpool_enabled || !enable_dpagelist)
		return;

	/* NOTE(review): presumably resolves to the single pagelist pool - confirm. */
	dpool = dpool_get_from_page(NULL);
	if (!dpool)
		goto out;

	pool = &dpool->pool[PAGES_POOL_4K];
	dpool_disable_pcp_pool(dpool, false);
	spin_lock_irqsave(&dpool->lock, flags);
	dpool_sum_pcp_pool(dpool, &free_pages, &used_pages);
	free_pages += pool->free_pages;
	used_pages += pool->used_pages;
	spin_unlock_irqrestore(&dpool->lock, flags);
	dpool_enable_pcp_pool(dpool);

out:
	/* Pages to kB; both values are unsigned long, hence %lu below. */
	total_kb = (free_pages + used_pages) << (PAGE_SHIFT - 10);
	free_kb = free_pages << (PAGE_SHIFT - 10);

	if (m) {
		seq_printf(m,
			   "DPoolTotal: %8lu kB\n"
			   "DPoolFree: %8lu kB\n",
			   total_kb, free_kb);
	} else {
		pr_info("DPoolTotal: %lu kB\n", total_kb);
		pr_info("DPoolFree: %lu kB\n", free_kb);
	}

	/* dpool_put() accepts NULL, so the no-pool path can share this exit. */
	dpool_put(dpool);
}