// SPDX-License-Identifier: GPL-2.0-or-later /* * dynamic pool core file * * Copyright (C) 2024 Huawei Limited. */ #define pr_fmt(fmt) "Dynamic pool: " fmt #include #include #include "internal.h" #include "hugetlb_vmemmap.h" #define CREATE_TRACE_POINTS #include static bool enable_dhugetlb; static bool enable_dpagelist; /* Indicate the enabled of dynamic pool */ DEFINE_STATIC_KEY_FALSE(dynamic_pool_key); /* Protect the operation of dynamic pool */ static DEFINE_MUTEX(dpool_mutex); /* Introduce the special opeartion. */ struct dynamic_pool_ops { int (*fill_pool)(struct dynamic_pool *dpool, void *arg); int (*drain_pool)(struct dynamic_pool *dpool); int (*restore_pool)(struct dynamic_pool *dpool); }; /* Used to record the mapping of page and dpool */ struct dpool_page_array { unsigned long count; struct dynamic_pool *dpool[]; }; #define DEFAULT_PAGE_ARRAY_COUNT 4096 #define hugepage_index(pfn) ((pfn) >> PUD_ORDER) static struct dpool_page_array *dpool_page_array; static DEFINE_RWLOCK(dpool_page_array_rwlock); /* For dpagelist, there are only one dpool */ static struct dynamic_pool *dpool_global_pool; /* Used for percpu pages pool */ #define PCP_PAGE_MAX 1024 #define PCP_PAGE_BATCH (PCP_PAGE_MAX >> 2) /* === reference function ============================================= */ static bool dpool_get_unless_zero(struct dynamic_pool *dpool) { if (!dpool) return false; return refcount_inc_not_zero(&dpool->refcnt); } static void dpool_put(struct dynamic_pool *dpool) { if (!dpool) return; if (refcount_dec_and_test(&dpool->refcnt)) { dpool->memcg->dpool = NULL; css_put(&dpool->memcg->css); dpool_global_pool = NULL; synchronize_rcu(); free_percpu(dpool->pcp_pool); kfree(dpool->pfn_ranges); kfree(dpool); } } static struct dynamic_pool *dpool_get_from_memcg(struct mem_cgroup *memcg) { struct dynamic_pool *dpool; rcu_read_lock(); dpool = memcg->dpool; if (!dpool_get_unless_zero(dpool)) dpool = NULL; rcu_read_unlock(); return dpool; } static struct dynamic_pool 
*dpool_get_from_task(struct task_struct *tsk) { struct dynamic_pool *dpool = NULL; struct mem_cgroup *memcg; if (!dpool_enabled) return NULL; rcu_read_lock(); do { memcg = mem_cgroup_from_task(tsk); } while (memcg && !css_tryget(&memcg->css)); rcu_read_unlock(); if (!memcg) return NULL; dpool = dpool_get_from_memcg(memcg); css_put(&memcg->css); return dpool; } static struct dynamic_pool *dpool_get_from_page(struct page *page) { struct dynamic_pool *dpool = NULL; unsigned long idx; rcu_read_lock(); if (enable_dhugetlb) { idx = hugepage_index(page_to_pfn(page)); read_lock(&dpool_page_array_rwlock); if (idx < dpool_page_array->count) dpool = dpool_page_array->dpool[idx]; read_unlock(&dpool_page_array_rwlock); } else if (enable_dpagelist) { /* * Attention: dpool_global_pool return for any page, * so need other check to make sure it is from dpool. */ dpool = dpool_global_pool; } if (!dpool_get_unless_zero(dpool)) dpool = NULL; rcu_read_unlock(); return dpool; } bool __task_in_dynamic_pool(struct task_struct *tsk) { struct dynamic_pool *dpool; if (!dpool_enabled) return false; dpool = dpool_get_from_task(tsk); dpool_put(dpool); return !!dpool; } bool page_in_dynamic_pool(struct page *page) { struct dynamic_pool *dpool; bool ret; if (!dpool_enabled) return false; if (PageDpool(page)) return true; /* * If the page don't have the flags, it may be in pcp list. * Check it using the page range. */ dpool = dpool_get_from_page(page); if (enable_dpagelist && dpool) { unsigned long pfn = page_to_pfn(page); int range_cnt = dpool->range_cnt; struct range *range; int i; for (i = 0; i < range_cnt; i++) { range = &dpool->pfn_ranges[i]; if (pfn >= range->start && pfn <= range->end) goto out; } /* The pfn is not in the range, set dpool to NULL */ dpool = NULL; } out: ret = dpool ? 
!PagePool(page) : false; dpool_put(dpool); return ret; } /* === demote and promote function ==================================== */ static void dpool_disable_pcp_pool(struct dynamic_pool *dpool, bool drain); static void dpool_enable_pcp_pool(struct dynamic_pool *dpool); /* * Clear compound structure which is inverse of prep_compound_page, * For detail, see destroy_compound_hugetlb_folio_for_demote. */ static void clear_compound_page(struct folio *folio, unsigned int order) { int i; int nr_pages = 1 << order; struct page *p; atomic_set(&folio->_entire_mapcount, 0); atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); for (i = 0; i < nr_pages; i++) { p = folio_page(folio, i); p->flags &= ~PAGE_FLAGS_CHECK_AT_FREE; p->mapping = NULL; if (!i) __ClearPageHead(p); else clear_compound_head(p); set_page_private(p, 0); } } static int dpool_demote_gigantic_page(struct pages_pool *src_pool, struct pages_pool *dst_pool, struct page *page) { struct folio *folio = page_folio(page); struct hstate *h = size_to_hstate(PMD_SIZE); int nr_pages = 1 << PUD_ORDER; int block_size = 1 << PMD_ORDER; struct page *subpage; int i; if (PageHWPoison(page)) return -EHWPOISON; list_del(&page->lru); __ClearPageDpool(page); src_pool->free_pages--; destroy_compound_hugetlb_folio_for_demote(folio, PUD_ORDER); for (i = 0; i < nr_pages; i += block_size) { subpage = folio_page(folio, i); prep_compound_page(subpage, PMD_ORDER); folio_change_private(page_folio(subpage), NULL); __SetPageDpool(subpage); __prep_new_hugetlb_folio(h, page_folio(subpage)); list_add_tail(&subpage->lru, &dst_pool->freelist); dst_pool->free_pages++; } return 0; } static int dpool_demote_huge_page(struct pages_pool *src_pool, struct pages_pool *dst_pool, struct page *page) { struct folio *folio = page_folio(page); int nr_pages = 1 << PMD_ORDER; struct page *subpage; int i; if (PageHWPoison(page)) return -EHWPOISON; list_del(&page->lru); __ClearPageDpool(page); src_pool->free_pages--; 
__folio_clear_hugetlb(page_folio(page)); clear_compound_page(page_folio(page), PMD_ORDER); for (i = 0; i < nr_pages; i++) { subpage = folio_page(folio, i); dpool_free_page_prepare(subpage); __SetPageDpool(subpage); list_add_tail(&subpage->lru, &dst_pool->freelist); dst_pool->free_pages++; } return 0; } static int dpool_demote_pool_locked(struct dynamic_pool *dpool, int type) { struct pages_pool *src_pool, *dst_pool; struct split_page *spage = NULL; struct page *page = NULL; int ret = -ENOMEM; lockdep_assert_held(&dpool->lock); if (type < 0 || type >= PAGES_POOL_MAX - 1) return -EINVAL; src_pool = &dpool->pool[type]; dst_pool = &dpool->pool[type + 1]; spage = kzalloc(sizeof(struct split_page), GFP_ATOMIC); if (!spage) goto out; if (!src_pool->free_pages && dpool_demote_pool_locked(dpool, type - 1)) goto out; list_for_each_entry(page, &src_pool->freelist, lru) { switch (type) { case PAGES_POOL_1G: ret = dpool_demote_gigantic_page(src_pool, dst_pool, page); break; case PAGES_POOL_2M: ret = dpool_demote_huge_page(src_pool, dst_pool, page); break; default: BUG(); } if (!ret) break; } out: if (!ret) { spage->start_pfn = page_to_pfn(page); list_add(&spage->entry, &src_pool->splitlist); src_pool->split_pages++; } else { kfree(spage); } trace_dpool_demote(dpool, type, page, ret); return ret; } static int dpool_promote_gigantic_page(struct pages_pool *src_pool, struct pages_pool *dst_pool, struct split_page *spage) { struct hstate *h = size_to_hstate(PUD_SIZE); int nr_pages = 1 << PUD_ORDER; int block_size = 1 << PMD_ORDER; struct page *page, *subpage; int i; for (i = 0; i < nr_pages; i += block_size) { subpage = pfn_to_page(spage->start_pfn + i); if (!PageDpool(subpage)) return -EBUSY; if (PageHWPoison(subpage)) return -EHWPOISON; } for (i = 0; i < nr_pages; i += block_size) { subpage = pfn_to_page(spage->start_pfn + i); __folio_clear_hugetlb(page_folio(subpage)); clear_compound_page(page_folio(subpage), PMD_ORDER); __ClearPageDpool(subpage); list_del(&subpage->lru); 
src_pool->free_pages--; } page = pfn_to_page(spage->start_pfn); prep_compound_gigantic_folio_for_demote(page_folio(page), PUD_ORDER); folio_change_private(page_folio(page), NULL); __SetPageDpool(page); __prep_new_hugetlb_folio(h, page_folio(page)); list_add_tail(&page->lru, &dst_pool->freelist); dst_pool->free_pages++; return 0; } static int dpool_promote_huge_page(struct pages_pool *src_pool, struct pages_pool *dst_pool, struct split_page *spage) { struct hstate *h = size_to_hstate(PMD_SIZE); int nr_pages = 1 << PMD_ORDER; struct page *page, *subpage; int i; for (i = 0; i < nr_pages; i++) { subpage = pfn_to_page(spage->start_pfn + i); if (!PageDpool(subpage)) return -EBUSY; if (PageHWPoison(subpage)) return -EHWPOISON; } for (i = 0; i < nr_pages; i++) { subpage = pfn_to_page(spage->start_pfn + i); __ClearPageDpool(subpage); list_del(&subpage->lru); src_pool->free_pages--; } page = pfn_to_page(spage->start_pfn); dpool_prep_new_page(page, PMD_ORDER, __GFP_COMP, 0); set_page_count(page, 0); folio_change_private(page_folio(page), NULL); __SetPageDpool(page); __prep_new_hugetlb_folio(h, page_folio(page)); list_add_tail(&page->lru, &dst_pool->freelist); dst_pool->free_pages++; return 0; } static int dpool_promote_pool(struct dynamic_pool *dpool, int type) { struct pages_pool *src_pool, *dst_pool; struct split_page *spage, *spage_next; struct page *page = NULL; int ret = -ENOMEM; if (type < 0 || type >= PAGES_POOL_MAX - 1) return -EINVAL; src_pool = &dpool->pool[type + 1]; dst_pool = &dpool->pool[type]; spin_lock_irq(&dpool->lock); if (!dst_pool->split_pages) goto unlock; list_for_each_entry_safe(spage, spage_next, &dst_pool->splitlist, entry) { switch (type) { case PAGES_POOL_1G: ret = dpool_promote_gigantic_page(src_pool, dst_pool, spage); break; case PAGES_POOL_2M: { unsigned long nr_pages = 1 << PMD_ORDER; /* * Since the dpool_mutex is already locked, * there is no way to free spage_next, so * it is safe to unlock here. 
*/ spin_unlock_irq(&dpool->lock); cond_resched(); lru_add_drain_all(); dpool_disable_pcp_pool(dpool, true); do_migrate_range(spage->start_pfn, spage->start_pfn + nr_pages); spin_lock_irq(&dpool->lock); dpool_enable_pcp_pool(dpool); ret = dpool_promote_huge_page(src_pool, dst_pool, spage); break; } default: BUG(); } if (!ret) break; } if (!ret) { page = pfn_to_page(spage->start_pfn); list_del(&spage->entry); dst_pool->split_pages--; } unlock: spin_unlock_irq(&dpool->lock); if (!ret) kfree(spage); trace_dpool_promote(dpool, type, page, ret); return ret; } /* === percpu pool function =========================================== */ static void dpool_refill_pcp_pool(struct dynamic_pool *dpool, struct pcp_pages_pool *pcp_pool, unsigned long count) { struct pages_pool *pool = &dpool->pool[PAGES_POOL_4K]; struct page *page, *next; unsigned long flags; int i = 0; lockdep_assert_held(&pcp_pool->lock); spin_lock_irqsave(&dpool->lock, flags); if (!pool->free_pages && dpool_demote_pool_locked(dpool, PAGES_POOL_2M)) goto unlock; list_for_each_entry_safe(page, next, &pool->freelist, lru) { list_move_tail(&page->lru, &pcp_pool->freelist); __ClearPageDpool(page); pool->free_pages--; pcp_pool->free_pages++; if (++i == count) break; } unlock: spin_unlock_irqrestore(&dpool->lock, flags); } static void dpool_drain_pcp_pool(struct dynamic_pool *dpool, struct pcp_pages_pool *pcp_pool, unsigned long count) { struct pages_pool *pool = &dpool->pool[PAGES_POOL_4K]; struct page *page, *next; unsigned long flags; int i = 0; lockdep_assert_held(&pcp_pool->lock); spin_lock_irqsave(&dpool->lock, flags); list_for_each_entry_safe(page, next, &pcp_pool->freelist, lru) { list_move_tail(&page->lru, &pool->freelist); __SetPageDpool(page); pcp_pool->free_pages--; pool->free_pages++; if (++i == count) break; } pool->used_pages += pcp_pool->used_pages; pcp_pool->used_pages = 0; spin_unlock_irqrestore(&dpool->lock, flags); } static void dpool_drain_all_pcp_pool(struct dynamic_pool *dpool) { struct 
pcp_pages_pool *pcp_pool;
	unsigned long flags;
	int cpu;

	/* Flush every CPU's percpu free list back into the shared 4K pool. */
	for_each_possible_cpu(cpu) {
		pcp_pool = per_cpu_ptr(dpool->pcp_pool, cpu);
		spin_lock_irqsave(&pcp_pool->lock, flags);
		dpool_drain_pcp_pool(dpool, pcp_pool, pcp_pool->free_pages);
		spin_unlock_irqrestore(&pcp_pool->lock, flags);
	}
}

/*
 * Wait until every percpu pool lock has been released at least once.
 * Taking and immediately dropping each lock guarantees that any user
 * who held a pcp lock before we raised pcp_refcnt has finished.
 */
static void dpool_wait_all_pcp_pool_unlock(struct dynamic_pool *dpool)
{
	struct pcp_pages_pool *pcp_pool;
	unsigned long flags;
	int cpu;

	for_each_possible_cpu(cpu) {
		pcp_pool = per_cpu_ptr(dpool->pcp_pool, cpu);
		spin_lock_irqsave(&pcp_pool->lock, flags);
		spin_unlock_irqrestore(&pcp_pool->lock, flags);
	}
}

/* The caller has to make sure no one else writes the counts concurrently. */
static void dpool_sum_pcp_pool(struct dynamic_pool *dpool,
			       unsigned long *free_pages, long *used_pages)
{
	struct pcp_pages_pool *pcp_pool;
	int cpu;

	*free_pages = 0;
	*used_pages = 0;
	for_each_possible_cpu(cpu) {
		pcp_pool = per_cpu_ptr(dpool->pcp_pool, cpu);
		*free_pages += pcp_pool->free_pages;
		*used_pages += pcp_pool->used_pages;
	}
}

/*
 * Temporarily (or, in the destroy path, permanently) disable the percpu
 * pools.  Raising pcp_refcnt makes dpool_pcp_enabled() false for new
 * users; then either drain the percpu lists back into the shared pool
 * or just wait for current lock holders to finish.
 */
static void dpool_disable_pcp_pool(struct dynamic_pool *dpool, bool drain)
{
	atomic_inc(&dpool->pcp_refcnt);
	/* After increasing the refcount, wait for other users to unlock. */
	if (drain)
		dpool_drain_all_pcp_pool(dpool);
	else
		dpool_wait_all_pcp_pool_unlock(dpool);
}

/* Undo one dpool_disable_pcp_pool(); pcp is usable again at zero. */
static void dpool_enable_pcp_pool(struct dynamic_pool *dpool)
{
	atomic_dec(&dpool->pcp_refcnt);
}

static bool dpool_pcp_enabled(struct dynamic_pool *dpool)
{
	return !atomic_read(&dpool->pcp_refcnt);
}

/*
 * Allocate one 4K page from this CPU's percpu pool, refilling from the
 * shared pool in PCP_PAGE_BATCH chunks when empty.  Returns NULL if the
 * dpool is offline, pcp is disabled, or no page is available.
 */
static struct page *dpool_alloc_pcp_page(struct dynamic_pool *dpool)
{
	struct pcp_pages_pool *pcp_pool;
	struct page *page = NULL;
	unsigned long flags;

	pcp_pool = this_cpu_ptr(dpool->pcp_pool);
	spin_lock_irqsave(&pcp_pool->lock, flags);
	if (!dpool->online || !dpool_pcp_enabled(dpool))
		goto unlock;

retry:
	page = NULL;
	if (!pcp_pool->free_pages)
		dpool_refill_pcp_pool(dpool, pcp_pool, PCP_PAGE_BATCH);

	page = list_first_entry_or_null(&pcp_pool->freelist, struct page,
					lru);
	if (!page)
		goto unlock;

	list_del(&page->lru);
	pcp_pool->free_pages--;
	pcp_pool->used_pages++;
	if (dpool_check_new_page(page)) {
		/* Bad page: keep it accounted as used and try the next one. */
		SetPagePool(page);
		goto retry;
	}
	SetPagePool(page);

unlock:
	spin_unlock_irqrestore(&pcp_pool->lock, flags);

	return page;
}

/*
 * Free one 4K page into this CPU's percpu pool.  Returns 0 on success,
 * -EINVAL when pcp is disabled (caller then frees via the shared pool).
 * Overfull percpu lists are trimmed back by PCP_PAGE_BATCH.
 */
static int dpool_free_pcp_page(struct dynamic_pool *dpool, struct page *page)
{
	struct pcp_pages_pool *pcp_pool;
	unsigned long flags;
	int ret = 0;

	pcp_pool = this_cpu_ptr(dpool->pcp_pool);
	spin_lock_irqsave(&pcp_pool->lock, flags);
	if (!dpool_pcp_enabled(dpool)) {
		ret = -EINVAL;
		goto unlock;
	}

	ClearPagePool(page);
	if (!dpool_free_page_prepare(page)) {
		/* Preparation failed: keep treating the page as in use. */
		SetPagePool(page);
		goto unlock;
	}

	list_add(&page->lru, &pcp_pool->freelist);
	pcp_pool->free_pages++;
	pcp_pool->used_pages--;
	if (pcp_pool->free_pages > PCP_PAGE_MAX)
		dpool_drain_pcp_pool(dpool, pcp_pool, PCP_PAGE_BATCH);

unlock:
	spin_unlock_irqrestore(&pcp_pool->lock, flags);

	return ret;
}

/* === allocation interface =========================================== */

/*
 * cgroup attach hook: a task may only move into a memcg that shares its
 * current dynamic pool.  Returns 0 when allowed, -EPERM otherwise.
 */
int dynamic_pool_can_attach(struct task_struct *tsk, struct mem_cgroup *memcg)
{
	struct dynamic_pool *src_dpool, *dst_dpool;
	int ret = 0;

	if (!dpool_enabled)
		return 0;

	src_dpool = dpool_get_from_task(tsk);
	if (!src_dpool)
		return 0;

	dst_dpool =
dpool_get_from_memcg(memcg); if (dst_dpool != src_dpool) ret = -EPERM; dpool_put(src_dpool); dpool_put(dst_dpool); return ret; } bool dynamic_pool_should_alloc(gfp_t gfp_mask, unsigned int order) { gfp_t gfp = gfp_mask & GFP_HIGHUSER_MOVABLE; if (current->flags & PF_KTHREAD) return false; if (order != 0) return false; /* * The cgroup only charges anonymous and file pages from usespage. * some filesystem maybe has masked out the __GFP_IO | __GFP_FS * to avoid recursive memory request. eg: loop device, xfs. */ if ((gfp | __GFP_IO | __GFP_FS) != GFP_HIGHUSER_MOVABLE) return false; #ifdef CONFIG_MEMORY_RELIABLE if (mem_reliable_is_enabled() && (gfp_mask & GFP_RELIABLE)) return false; #endif return true; } struct page *dynamic_pool_alloc_page(gfp_t gfp, unsigned int order, unsigned int alloc_flags) { struct dynamic_pool *dpool; struct pages_pool *pool; struct page *page = NULL; unsigned long flags; if (!dpool_enabled) return NULL; if (!dynamic_pool_should_alloc(gfp, order)) return NULL; dpool = dpool_get_from_task(current); if (!dpool) return NULL; page = dpool_alloc_pcp_page(dpool); if (page) goto put; pool = &dpool->pool[PAGES_POOL_4K]; spin_lock_irqsave(&dpool->lock, flags); if (!dpool->online) goto unlock; retry: page = NULL; if (!pool->free_pages && dpool_demote_pool_locked(dpool, PAGES_POOL_2M)) { spin_unlock_irqrestore(&dpool->lock, flags); dpool_drain_all_pcp_pool(dpool); spin_lock_irqsave(&dpool->lock, flags); if (!dpool->online || !pool->free_pages) goto unlock; } page = list_first_entry_or_null(&pool->freelist, struct page, lru); if (!page) goto unlock; __ClearPageDpool(page); list_del(&page->lru); pool->free_pages--; pool->used_pages++; if (dpool_check_new_page(page)) { /* This is a bad page, treat it as a used pages */ SetPagePool(page); goto retry; } SetPagePool(page); unlock: spin_unlock_irqrestore(&dpool->lock, flags); put: dpool_put(dpool); if (page) dpool_prep_new_page(page, order, gfp, alloc_flags); return page; } void dynamic_pool_free_page(struct 
page *page) { struct dynamic_pool *dpool; struct pages_pool *pool; unsigned long flags; if (!dpool_enabled) return; dpool = dpool_get_from_page(page); if (!dpool) { pr_err("get dpool failed when free page 0x%px\n", page); return; } if (!dpool_free_pcp_page(dpool, page)) goto put; pool = &dpool->pool[PAGES_POOL_4K]; spin_lock_irqsave(&dpool->lock, flags); ClearPagePool(page); if (!dpool_free_page_prepare(page)) { SetPagePool(page); goto unlock; } __SetPageDpool(page); list_add(&page->lru, &pool->freelist); pool->free_pages++; pool->used_pages--; unlock: spin_unlock_irqrestore(&dpool->lock, flags); put: dpool_put(dpool); } void dynamic_pool_bind_file(struct hugetlbfs_inode_info *p, struct hstate *h) { unsigned long size; if (!dpool_enabled || !p) return; size = huge_page_size(h); if (size == PMD_SIZE || size == PUD_SIZE) p->dpool = dpool_get_from_task(current); else p->dpool = NULL; } void dynamic_pool_unbind_file(struct hugetlbfs_inode_info *p) { struct dynamic_pool *dpool; if (!dpool_enabled || !p || !p->dpool) return; dpool = p->dpool; p->dpool = NULL; dpool_put(dpool); } int dynamic_pool_hugetlb_acct_memory(struct hstate *h, long delta, struct hugetlbfs_inode_info *p) { struct dynamic_pool *dpool; struct pages_pool *pool; unsigned long flags; int type; int ret = -ENOMEM; if (!dpool_enabled || !p || !p->dpool) return 0; dpool = p->dpool; spin_lock_irqsave(&dpool->lock, flags); if (hstate_is_gigantic(h)) type = PAGES_POOL_1G; else type = PAGES_POOL_2M; pool = &dpool->pool[type]; if (delta > 0) { if (delta <= pool->free_huge_pages - pool->resv_huge_pages) { pool->resv_huge_pages += delta; ret = 0; } } else { pool->resv_huge_pages -= (unsigned long)(-delta); WARN_ON(pool->resv_huge_pages < 0); ret = 0; } spin_unlock_irqrestore(&dpool->lock, flags); trace_dpool_acct_memory(dpool, type, delta, pool->resv_huge_pages, ret); return ret; } struct folio *dynamic_pool_alloc_hugepage(struct hugetlbfs_inode_info *p, struct hstate *h, bool reserved) { struct dynamic_pool 
*dpool; struct pages_pool *pool; struct folio *folio = NULL; unsigned long flags; int type; if (!dpool_enabled) return NULL; dpool = p->dpool; if (!dpool) return NULL; if (hstate_is_gigantic(h)) type = PAGES_POOL_1G; else type = PAGES_POOL_2M; pool = &dpool->pool[type]; spin_lock_irqsave(&dpool->lock, flags); if (!dpool->online) goto unlock; list_for_each_entry(folio, &pool->freelist, lru) { if (folio_test_hwpoison(folio)) continue; list_del(&folio->lru); __folio_clear_dpool(folio); folio_ref_unfreeze(folio, 1); pool->free_huge_pages--; pool->used_huge_pages++; if (reserved) { folio_set_hugetlb_restore_reserve(folio); pool->resv_huge_pages--; } folio_set_pool(folio); goto unlock; } folio = NULL; unlock: spin_unlock_irqrestore(&dpool->lock, flags); trace_dpool_alloc_hugepage(dpool, type, folio, pool->free_huge_pages, pool->resv_huge_pages); return folio; } void dynamic_pool_free_hugepage(struct folio *folio, bool restore_reserve) { struct hstate *h = folio_hstate(folio); struct dynamic_pool *dpool; struct pages_pool *pool; unsigned long flags; int type; if (!dpool_enabled) return; dpool = dpool_get_from_page(folio_page(folio, 0)); if (!dpool) { pr_err("get dpool failed when free hugepage 0x%px\n", folio); return; } if (hstate_is_gigantic(h)) type = PAGES_POOL_1G; else type = PAGES_POOL_2M; pool = &dpool->pool[type]; spin_lock_irqsave(&dpool->lock, flags); if (folio_test_hwpoison(folio)) goto unlock; folio_clear_pool(folio); __folio_set_dpool(folio); list_add(&folio->lru, &pool->freelist); pool->free_huge_pages++; pool->used_huge_pages--; if (restore_reserve) pool->resv_huge_pages++; unlock: spin_unlock_irqrestore(&dpool->lock, flags); dpool_put(dpool); trace_dpool_free_hugepage(dpool, type, folio, pool->free_huge_pages, pool->resv_huge_pages); } /* === dynamic pool function ========================================== */ static void dpool_dump_child_memcg(struct mem_cgroup *memcg, void *message) { struct mem_cgroup *root = (struct mem_cgroup *)message; struct cgroup 
*cgrp; if (root == memcg) return; cgrp = memcg->css.cgroup; pr_err("child memcg exists: "); pr_cont_cgroup_name(cgrp); pr_cont("\n"); } static struct dynamic_pool *dpool_create(struct mem_cgroup *memcg, struct dynamic_pool_ops *ops) { struct dynamic_pool *dpool; int cpu; int i; if (memcg_has_children(memcg)) { pr_err("create failed, memcg has children\n"); mem_cgroup_scan_cgroups(memcg, dpool_dump_child_memcg, memcg); return NULL; } dpool = kzalloc(sizeof(struct dynamic_pool), GFP_KERNEL); if (!dpool) return NULL; dpool->pcp_pool = alloc_percpu(struct pcp_pages_pool); if (!dpool->pcp_pool) { kfree(dpool); return NULL; } spin_lock_init(&dpool->lock); refcount_set(&dpool->refcnt, 1); dpool->memcg = memcg; dpool->ops = ops; atomic_set(&dpool->pcp_refcnt, 0); for (i = 0; i < PAGES_POOL_MAX; i++) { INIT_LIST_HEAD(&dpool->pool[i].freelist); INIT_LIST_HEAD(&dpool->pool[i].splitlist); } for_each_possible_cpu(cpu) { struct pcp_pages_pool *pcp_pool; pcp_pool = per_cpu_ptr(dpool->pcp_pool, cpu); spin_lock_init(&pcp_pool->lock); INIT_LIST_HEAD(&pcp_pool->freelist); pcp_pool->free_pages = 0; pcp_pool->used_pages = 0; } css_get(&memcg->css); memcg->dpool = dpool; dpool->online = true; return dpool; } void dynamic_pool_inherit(struct mem_cgroup *memcg) { struct mem_cgroup *parent = parent_mem_cgroup(memcg); struct dynamic_pool *dpool; if (!dpool_enabled || !parent || !memcg) return; mutex_lock(&dpool_mutex); dpool = dpool_get_from_memcg(parent); memcg->dpool = dpool; /* Don't increase refcount for child memcg */ dpool_put(dpool); mutex_unlock(&dpool_mutex); } int dynamic_pool_destroy(struct cgroup *cgrp, bool *clear_css_online) { struct cgroup_subsys_state *css = cgrp->subsys[memory_cgrp_id]; struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct dynamic_pool *dpool; int ret = 0; if (!dpool_enabled || !memcg) return 0; mutex_lock(&dpool_mutex); dpool = dpool_get_from_memcg(memcg); if (!dpool) goto unlock; if (dpool->memcg != memcg) { memcg->dpool = NULL; goto put; } /* A 
offline dpool is not allowed for allocation */ dpool->online = false; /* Disable pcp pool forever */ dpool_disable_pcp_pool(dpool, true); /* * Even if no process exists in the memory cgroup, some pages may * still be occupied. Release these pages before restore pool. */ mem_cgroup_force_empty(dpool->memcg); BUG_ON(!dpool->ops->restore_pool); ret = dpool->ops->restore_pool(dpool); if (ret) { pr_err("restore pool failed\n"); goto put; } BUG_ON(!dpool->ops->drain_pool); ret = dpool->ops->drain_pool(dpool); if (ret) { pr_err("drain pool failed\n"); goto put; } memcg->dpool = NULL; /* Release the initial reference count */ dpool_put(dpool); /* * Since dpool is destroyed and the memcg will be freed then, * clear CSS_ONLINE immediately to prevent race with create. */ if (cgrp->self.flags & CSS_ONLINE) { cgrp->self.flags &= ~CSS_ONLINE; *clear_css_online = true; } put: dpool_put(dpool); unlock: mutex_unlock(&dpool_mutex); return ret; } static int __init dynamic_pool_init(void) { if (!enable_dhugetlb && !enable_dpagelist) return 0; if (enable_dhugetlb) { unsigned long count, size; count = max_t(unsigned long, hugepage_index(max_pfn), DEFAULT_PAGE_ARRAY_COUNT); size = sizeof(struct dpool_page_array) + count * sizeof(struct dynamic_pool *); dpool_page_array = kzalloc(size, GFP_KERNEL); if (!dpool_page_array) { pr_err("init failed\n"); return -ENOMEM; } dpool_page_array->count = count; } static_branch_enable(&dynamic_pool_key); pr_info("enabled\n"); return 0; } subsys_initcall(dynamic_pool_init); /* === Dynamic hugetlb interface ====================================== */ static int __init dynamic_hugetlb_setup(char *buf) { if (enable_dpagelist) return 0; return kstrtobool(buf, &enable_dhugetlb); } early_param("dynamic_hugetlb", dynamic_hugetlb_setup); static int dpool_record_page(struct dynamic_pool *dpool, unsigned long idx) { read_lock(&dpool_page_array_rwlock); /* * If page's pfn is greater than dhugetlb_pagelist_t->count (which * may occurs due to memory hotplug) then 
dhugetlb_pagelist_t need * to be reallocated, so need write_lock here. */ if (idx >= dpool_page_array->count) { unsigned long size; struct dpool_page_array *tmp; read_unlock(&dpool_page_array_rwlock); write_lock(&dpool_page_array_rwlock); size = sizeof(struct dpool_page_array) + (idx + 1) * sizeof(struct dynamic_pool *); tmp = krealloc(dpool_page_array, size, GFP_ATOMIC); if (!tmp) { write_unlock(&dpool_page_array_rwlock); return -ENOMEM; } tmp->count = idx + 1; dpool_page_array = tmp; write_unlock(&dpool_page_array_rwlock); read_lock(&dpool_page_array_rwlock); } dpool_page_array->dpool[idx] = dpool; read_unlock(&dpool_page_array_rwlock); return 0; } static int dpool_fill_from_hugetlb(struct dynamic_pool *dpool, void *arg) { struct hstate *h = size_to_hstate(PUD_SIZE); unsigned long nr_pages = *(unsigned long *)arg; int nid = dpool->nid; unsigned long count = 0; struct pages_pool *pool = &dpool->pool[PAGES_POOL_1G]; struct page *page, *next; struct folio *folio; unsigned long idx; LIST_HEAD(page_list); if (!h) return -EINVAL; spin_lock_irq(&hugetlb_lock); if ((h->free_huge_pages_node[nid] < nr_pages) || (h->free_huge_pages - h->resv_huge_pages < nr_pages)) { spin_unlock_irq(&hugetlb_lock); return -ENOMEM; } while (count < nr_pages) { folio = dequeue_hugetlb_folio_node_exact(h, nid); if (!folio) break; page = folio_page(folio, 0); /* dequeue will unfreeze the page, refreeze it. 
*/ page_ref_freeze(page, 1); idx = hugepage_index(page_to_pfn(page)); if (dpool_record_page(dpool, idx)) { enqueue_hugetlb_folio(h, folio); pr_err("dpool_page_array can't record page 0x%px\n", page); continue; } list_move(&page->lru, &page_list); count++; } spin_unlock_irq(&hugetlb_lock); list_for_each_entry_safe(page, next, &page_list, lru) { if (hugetlb_vmemmap_restore(h, page)) { spin_lock_irq(&hugetlb_lock); enqueue_hugetlb_folio(h, folio); spin_unlock_irq(&hugetlb_lock); pr_err("restore hugetlb_vmemmap failed page 0x%px\n", page); continue; } __SetPageDpool(page); spin_lock_irq(&dpool->lock); list_move(&page->lru, &pool->freelist); pool->free_pages++; dpool->total_pages++; spin_unlock_irq(&dpool->lock); } return 0; } static int dpool_drain_to_hugetlb(struct dynamic_pool *dpool) { struct hstate *h = size_to_hstate(PUD_SIZE); struct pages_pool *pool = &dpool->pool[PAGES_POOL_1G]; struct page *page, *next; unsigned long idx; LIST_HEAD(page_list); if (!h) return -EINVAL; spin_lock_irq(&dpool->lock); list_for_each_entry_safe(page, next, &pool->freelist, lru) { WARN_ON(PageHWPoison(page)); idx = hugepage_index(page_to_pfn(page)); WARN_ON(dpool_record_page(NULL, idx)); list_move(&page->lru, &page_list); __ClearPageDpool(page); pool->free_pages--; dpool->total_pages--; } spin_unlock_irq(&dpool->lock); list_for_each_entry_safe(page, next, &page_list, lru) { hugetlb_vmemmap_optimize(h, page); spin_lock_irq(&hugetlb_lock); enqueue_hugetlb_folio(h, page_folio(page)); spin_unlock_irq(&hugetlb_lock); } return dpool->total_pages ? 
-ENOMEM : 0; } static int dpool_merge_all(struct dynamic_pool *dpool) { struct pages_pool *pool; int ret = -ENOMEM; pool = &dpool->pool[PAGES_POOL_2M]; while (pool->split_pages) { cond_resched(); ret = dpool_promote_pool(dpool, PAGES_POOL_2M); if (ret) { pr_err("some 4K pages can't merge ret: %d, delete failed: \n", ret); pr_cont_cgroup_name(dpool->memcg->css.cgroup); pr_cont("\n"); goto out; } } spin_lock_irq(&dpool->lock); if (pool->split_pages || pool->used_huge_pages || pool->resv_huge_pages) { ret = -ENOMEM; pr_err("some 2M pages are still in use or mmap, delete failed: "); pr_cont_cgroup_name(dpool->memcg->css.cgroup); pr_cont("\n"); spin_unlock_irq(&dpool->lock); goto out; } pool->free_pages += pool->nr_huge_pages; pool->nr_huge_pages = 0; pool->free_huge_pages = 0; spin_unlock_irq(&dpool->lock); pool = &dpool->pool[PAGES_POOL_1G]; while (pool->split_pages) { cond_resched(); ret = dpool_promote_pool(dpool, PAGES_POOL_1G); if (ret) { pr_err("some 2M pages can't merge ret: %d, delete failed: \n", ret); pr_cont_cgroup_name(dpool->memcg->css.cgroup); pr_cont("\n"); goto out; } } spin_lock_irq(&dpool->lock); if (pool->split_pages || pool->used_huge_pages || pool->resv_huge_pages) { ret = -ENOMEM; pr_err("some 1G pages are still in use or mmap, delete failed: "); pr_cont_cgroup_name(dpool->memcg->css.cgroup); pr_cont("\n"); spin_unlock_irq(&dpool->lock); goto out; } pool->free_pages += pool->nr_huge_pages; pool->nr_huge_pages = 0; pool->free_huge_pages = 0; spin_unlock_irq(&dpool->lock); ret = 0; out: return ret; } static struct dynamic_pool_ops hugetlb_dpool_ops = { .fill_pool = dpool_fill_from_hugetlb, .drain_pool = dpool_drain_to_hugetlb, .restore_pool = dpool_merge_all, }; /* If dynamic pool is disabled, hide the interface */ bool dynamic_pool_hide_files(struct cftype *cft) { if (dpool_enabled && enable_dhugetlb) return false; return !!strstr(cft->name, "dhugetlb"); } int dynamic_pool_add_memory(struct mem_cgroup *memcg, int nid, unsigned long size) { struct 
dynamic_pool *dpool; int ret = -EINVAL; bool new_create = false; if (!dpool_enabled) return -EINVAL; mutex_lock(&dpool_mutex); if (!(memcg->css.cgroup->self.flags & CSS_ONLINE)) { pr_err("add memory failed, memcg is going offline\n"); goto unlock; } dpool = memcg->dpool; if (!dpool) { dpool = dpool_create(memcg, &hugetlb_dpool_ops); if (!dpool) goto unlock; dpool->nid = nid; new_create = true; } else if (dpool->memcg != memcg) { pr_err("add memory failed, not parent memcg\n"); goto unlock; } else if (dpool->nid != nid) { pr_err("add memory failed, not target nid(%d)\n", dpool->nid); goto unlock; } BUG_ON(!dpool->ops->fill_pool); ret = dpool->ops->fill_pool(dpool, &size); if (ret) { pr_err("fill pool failed\n"); /* * If create a new hpool here but add memory failed, * release it directly here. */ if (new_create) { memcg->dpool = NULL; dpool_put(dpool); } } unlock: mutex_unlock(&dpool_mutex); return ret; } void dynamic_pool_show(struct mem_cgroup *memcg, struct seq_file *m) { struct dynamic_pool *dpool; unsigned long free_pages; long used_pages; if (!dpool_enabled || !memcg) return; dpool = dpool_get_from_memcg(memcg); if (!dpool) { seq_puts(m, "Current hierarchial have not memory pool.\n"); return; } dpool_disable_pcp_pool(dpool, false); spin_lock_irq(&dpool->lock); /* * no others can modify the count because pcp pool is disabled and * dpool->lock is locked. 
*/ dpool_sum_pcp_pool(dpool, &free_pages, &used_pages); free_pages += dpool->pool[PAGES_POOL_4K].free_pages; used_pages += dpool->pool[PAGES_POOL_4K].used_pages; seq_printf(m, "nid %d\n", dpool->nid); seq_printf(m, "dhugetlb_total_pages %lu\n", dpool->total_pages); seq_printf(m, "1G_total_reserved_pages %lu\n", dpool->pool[PAGES_POOL_1G].nr_huge_pages); seq_printf(m, "1G_free_reserved_pages %lu\n", dpool->pool[PAGES_POOL_1G].free_huge_pages); seq_printf(m, "1G_mmap_reserved_pages %lu\n", dpool->pool[PAGES_POOL_1G].resv_huge_pages); seq_printf(m, "1G_used_pages %lu\n", dpool->pool[PAGES_POOL_1G].used_huge_pages); seq_printf(m, "2M_total_reserved_pages %lu\n", dpool->pool[PAGES_POOL_2M].nr_huge_pages); seq_printf(m, "2M_free_reserved_pages %lu\n", dpool->pool[PAGES_POOL_2M].free_huge_pages); seq_printf(m, "2M_mmap_reserved_pages %lu\n", dpool->pool[PAGES_POOL_2M].resv_huge_pages); seq_printf(m, "2M_used_pages %lu\n", dpool->pool[PAGES_POOL_2M].used_huge_pages); seq_printf(m, "1G_free_unreserved_pages %lu\n", dpool->pool[PAGES_POOL_1G].free_pages); seq_printf(m, "2M_free_unreserved_pages %lu\n", dpool->pool[PAGES_POOL_2M].free_pages); seq_printf(m, "4K_free_pages %lu\n", free_pages); seq_printf(m, "4K_used_pages %ld\n", used_pages); spin_unlock_irq(&dpool->lock); dpool_enable_pcp_pool(dpool); dpool_put(dpool); } int dynamic_pool_reserve_hugepage(struct mem_cgroup *memcg, unsigned long nr_pages, int type) { struct dynamic_pool *dpool; struct pages_pool *pool; unsigned long delta; int ret = -EINVAL; if (!dpool_enabled) return -EINVAL; mutex_lock(&dpool_mutex); dpool = dpool_get_from_memcg(memcg); if (!dpool) goto unlock; pool = &dpool->pool[type]; spin_lock_irq(&dpool->lock); if (nr_pages > pool->nr_huge_pages) { delta = nr_pages - pool->nr_huge_pages; while (delta > pool->free_pages && !dpool_demote_pool_locked(dpool, type - 1)) { spin_unlock_irq(&dpool->lock); cond_resched(); spin_lock_irq(&dpool->lock); } /* Only try merge pages for 2M pages */ if (type == 
PAGES_POOL_2M) {
			/*
			 * Try to promote (merge) 4K pages into 2M pages.
			 * dpool_promote_pool() is called with the lock
			 * dropped; a non-zero return means promotion can
			 * make no further progress, so stop trying.
			 */
			while (delta > pool->free_pages) {
				spin_unlock_irq(&dpool->lock);
				cond_resched();
				if (dpool_promote_pool(dpool, type)) {
					spin_lock_irq(&dpool->lock);
					break;
				}
				spin_lock_irq(&dpool->lock);
			}
		}
		/* Best effort: reserve only what is actually available. */
		delta = min(delta, pool->free_pages);
		pool->nr_huge_pages += delta;
		pool->free_huge_pages += delta;
		pool->free_pages -= delta;
	} else {
		/*
		 * Shrink: only pages that are free and not mmap-reserved
		 * can leave the reserved set.
		 */
		delta = min(pool->nr_huge_pages - nr_pages,
			    pool->free_huge_pages - pool->resv_huge_pages);
		pool->nr_huge_pages -= delta;
		pool->free_huge_pages -= delta;
		pool->free_pages += delta;
	}
	spin_unlock_irq(&dpool->lock);
	dpool_put(dpool);
	ret = 0;
unlock:
	mutex_unlock(&dpool_mutex);
	return ret;
}

/* === Dynamic pagelist interface ===================================== */

/* "dpool=" boot parameter; mutually exclusive with dhugetlb. */
static int __init dynamic_pagelist_setup(char *buf)
{
	if (enable_dhugetlb)
		return 0;

	return kstrtobool(buf, &enable_dpagelist);
}
early_param("dpool", dynamic_pagelist_setup);

/*
 * fill_pool callback for the pagelist flavour: take every pfn described
 * by the dpool_info ranges out of the buddy view and put it on the 4K
 * free list.  `arg` is a struct dpool_info.  Returns 0 on success,
 * -ENOMEM if the range array cannot be allocated, -EINVAL if a page
 * fails the dpool_free_page_prepare() check.
 */
static int dpool_fill_from_pagelist(struct dynamic_pool *dpool, void *arg)
{
	struct dpool_info *info = (struct dpool_info *)arg;
	struct pages_pool *pool = &dpool->pool[PAGES_POOL_4K];
	int i, ret = -EINVAL;

	dpool->range_cnt = info->range_cnt;
	dpool->pfn_ranges =
		kmalloc_array(info->range_cnt, sizeof(struct range), GFP_KERNEL);
	if (!dpool->pfn_ranges)
		return -ENOMEM;

	memcpy(dpool->pfn_ranges, info->pfn_ranges,
	       sizeof(struct range) * dpool->range_cnt);

	spin_lock_irq(&dpool->lock);
	for (i = 0; i < dpool->range_cnt; i++) {
		struct range *range = &dpool->pfn_ranges[i];
		u64 pfn;

		for (pfn = range->start; pfn <= range->end; pfn++) {
			struct page *page = pfn_to_page(pfn);

			/* Reset counts before handing the page to the pool. */
			set_page_count(page, 0);
			page_mapcount_reset(page);
			if (!dpool_free_page_prepare(page)) {
				pr_err("fill pool failed, check pages failed\n");
				goto unlock;
			}

			__SetPageDpool(page);
			list_add_tail(&page->lru, &pool->freelist);
			pool->free_pages++;

			/* Ranges can be large; yield the lock periodically. */
			cond_resched_lock(&dpool->lock);
		}
	}
	ret = 0;
unlock:
	spin_unlock_irq(&dpool->lock);
	return ret;
}

/*
 * drain_pool callback for the pagelist flavour: succeed only when all
 * pages are back in the pool.
 */
static int dpool_drain_to_pagelist(struct dynamic_pool *dpool)
{
	struct pages_pool *pool =
&dpool->pool[PAGES_POOL_4K]; /* check poisoned pages */ return (pool->used_pages == dpool->nr_poisoned_pages) ? 0 : -ENOMEM; } static int dpool_migrate_used_pages(struct dynamic_pool *dpool) { int range_cnt = dpool->range_cnt; int i; spin_lock_irq(&dpool->lock); dpool->nr_poisoned_pages = 0; for (i = 0; i < range_cnt; i++) { struct range *range = &dpool->pfn_ranges[i]; u64 pfn; for (pfn = range->start; pfn <= range->end; pfn++) { struct page *page = pfn_to_page(pfn); /* Unlock and try migration. */ spin_unlock_irq(&dpool->lock); cond_resched(); if (PageDpool(page)) { spin_lock_irq(&dpool->lock); continue; } if (PageHWPoison(page)) dpool->nr_poisoned_pages++; lru_add_drain_all(); do_migrate_range(pfn, pfn + 1); spin_lock_irq(&dpool->lock); } } spin_unlock_irq(&dpool->lock); return 0; } struct dynamic_pool_ops pagelist_dpool_ops = { .fill_pool = dpool_fill_from_pagelist, .drain_pool = dpool_drain_to_pagelist, .restore_pool = dpool_migrate_used_pages, }; int dpool_init(struct dpool_info *arg) { struct dynamic_pool *dpool; int ret; if (!dpool_enabled) return -EINVAL; if (!arg || !arg->memcg || arg->range_cnt <= 0) { pr_err("init failed, arg is invalid\n"); return -EINVAL; } mutex_lock(&dpool_mutex); if (dpool_global_pool || arg->memcg->dpool) { pr_err("init failed, dpool is already exist\n"); ret = -EINVAL; goto unlock; } if (!(arg->memcg->css.cgroup->self.flags & CSS_ONLINE)) { pr_err("init failed, memcg is not online\n"); ret = -EINVAL; goto unlock; } dpool = dpool_create(arg->memcg, &pagelist_dpool_ops); if (!dpool) { pr_err("init failed, create failed. 
ret: %d\n", ret); ret = -ENOMEM; goto unlock; } dpool_global_pool = dpool; BUG_ON(!dpool->ops->fill_pool); ret = dpool->ops->fill_pool(dpool, arg); if (ret) dpool_put(dpool); unlock: mutex_unlock(&dpool_mutex); return ret; } void dynamic_pool_show_meminfo(struct seq_file *m) { struct dynamic_pool *dpool; struct pages_pool *pool; unsigned long free_pages = 0; long used_pages = 0; unsigned long flags; if (!dpool_enabled || !enable_dpagelist) return; dpool = dpool_get_from_page(NULL); if (!dpool) goto out; pool = &dpool->pool[PAGES_POOL_4K]; dpool_disable_pcp_pool(dpool, false); spin_lock_irqsave(&dpool->lock, flags); dpool_sum_pcp_pool(dpool, &free_pages, &used_pages); free_pages += pool->free_pages; used_pages += pool->used_pages; spin_unlock_irqrestore(&dpool->lock, flags); dpool_enable_pcp_pool(dpool); out: if (m) { seq_printf(m, "DPoolTotal: %8lu kB\n" "DPoolFree: %8ld kB\n", (free_pages + used_pages) << (PAGE_SHIFT - 10), free_pages << (PAGE_SHIFT - 10)); } else { pr_info("DPoolTotal: %lu kB\n", (free_pages + used_pages) << (PAGE_SHIFT - 10)); pr_info("DPoolFree: %ld kB\n", free_pages << (PAGE_SHIFT - 10)); } dpool_put(dpool); }