// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) Huawei Technologies Co., Ltd. 2023. All rights reserved.
 *
 * userswap core file
 */

#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/userswap.h>
#include "internal.h"

DEFINE_STATIC_KEY_FALSE(userswap_enabled);
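
/*
 * Only private anonymous VMAs qualify for userswap: shared, mlock()ed,
 * stack, I/O, PFN-mapped and hugetlb mappings are all rejected.
 */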
static bool vma_uswap_compatible(struct vm_area_struct *vma)
{
	if (!vma || !vma_is_anonymous(vma) || vma->vm_file ||
	    (vma->vm_flags & (VM_SHARED | VM_LOCKED | VM_STACK | VM_IO |
	     VM_PFNMAP | VM_HUGETLB)))
		return false;
	return true;
}

/*
 * Check whether the pages in 'addr ~ addr + len' can be user-swapped. If so,
 * take a reference on each page and return the pages through the output
 * parameter 'ppages'.
 */
static unsigned long pages_can_be_swapped(struct mm_struct *mm,
					  unsigned long addr,
					  unsigned long len,
					  struct page ***ppages)
{
	struct vm_area_struct *vma;
	struct page *page = NULL;
	struct page **pages = NULL;
	unsigned long addr_end = addr + len;
	unsigned long ret;
	unsigned long i, page_num = 0;

	*ppages = NULL;
	pages = kvzalloc(sizeof(struct page *) * (len / PAGE_SIZE), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	while (addr < addr_end) {
		vma = find_vma(mm, addr);
		if (!vma || addr < vma->vm_start ||
		    !(vma->vm_flags & VM_USWAP) ||
		    !vma_uswap_compatible(vma)) {
			ret = -EINVAL;
			goto out_err;
		}

		if (!(vma->vm_flags & VM_UFFD_MISSING)) {
			ret = -EAGAIN;
			goto out_err;
		}
get_again:
		/*
		 * follow_page() takes a reference on the page; the reference
		 * is dropped after the page has been remapped.
		 */
		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
		if (IS_ERR_OR_NULL(page)) {
			ret = -ENODEV;
			goto out_err;
		}

		pages[page_num++] = page;
		if (!PageAnon(page) || !PageSwapBacked(page) ||
		    PageHuge(page) || PageSwapCache(page)) {
			ret = -EINVAL;
			goto out_err;
		}

		if (PageTransCompound(page)) {
			if (trylock_page(page)) {
				if (!split_huge_page(page)) {
					unlock_page(page);
					put_page(page);
					page_num--;
					goto get_again;
				} else {
					unlock_page(page);
				}
			}
			ret = -EINVAL;
			goto out_err;
		}

		/*
		 * Check that no O_DIRECT or similar I/O is in progress on
		 * the page.
		 */
		if (page_mapcount(page) > 1 ||
		    page_mapcount(page) + 1 != page_count(page)) {
			ret = -EBUSY;
			goto out_err;
		}
		addr += PAGE_SIZE;
	}

	*ppages = pages;
	return 0;

out_err:
	for (i = 0; i < page_num; i++)
		put_page(pages[i]);
	kvfree(pages);
	return ret;
}
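
/*
 * Return true if 'addr' is covered by a huge PUD or a (transparent) huge
 * PMD mapping in 'mm'.
 */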
static bool is_thp_or_huge(struct mm_struct *mm, unsigned long addr)
{
	pud_t *pud;
	pmd_t *pmd;

	pud = get_old_pud(mm, addr);
	if (!pud)
		return false;
	else if (pud_huge(*pud))
		return true;

	pmd = pmd_offset(pud, addr);
	if (!pmd)
		return false;
	else if (pmd_huge(*pmd) || pmd_trans_huge(*pmd))
		return true;

	return false;
}
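
/*
 * Clear the PTE mapping 'page' at 'addr' and, if 'set_to_swp' is true,
 * replace it with a SWP_USERSWAP_ENTRY swap entry recording the page's PFN.
 * The original PTE is returned through 'old_pte' so the caller can restore
 * it on failure. Called with mmap_lock held and a reference on 'page'.
 */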
static int uswap_unmap_anon_page(struct mm_struct *mm,
				 struct vm_area_struct *vma,
				 unsigned long addr, struct page *page,
				 pmd_t *pmd, pte_t *old_pte, bool set_to_swp)
{
	struct mmu_notifier_range range;
	spinlock_t *ptl;
	pte_t *pte, _old_pte;
	int ret = 0;

	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, addr,
				addr + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte_present(*pte)) {
		ret = -EINVAL;
		goto out_release_unlock;
	}
	flush_cache_page(vma, addr, pte_pfn(*pte));
	_old_pte = ptep_clear_flush(vma, addr, pte);
	if (old_pte)
		*old_pte = _old_pte;
	if (set_to_swp)
		set_pte_at(mm, addr, pte, swp_entry_to_pte(swp_entry(
			   SWP_USERSWAP_ENTRY, page_to_pfn(page))));

	dec_mm_counter(mm, MM_ANONPAGES);
	add_reliable_page_counter(page, mm, -1);
	folio_remove_rmap_pte(page_folio(page), page, vma);
	page->mapping = NULL;

out_release_unlock:
	pte_unmap_unlock(pte, ptl);
	mmu_notifier_invalidate_range_end(&range);
	return ret;
}
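
/*
 * Map 'page' at 'addr' as a new anonymous page, marking the PTE writable
 * and dirty when the VMA permits writes. Fails with -EBUSY if a mapping
 * already exists at 'addr'.
 */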
static int vm_insert_anon_page(struct vm_area_struct *vma,
			       unsigned long addr, struct page *page)
{
	struct mm_struct *mm = vma->vm_mm;
	int ret = 0;
	pte_t *pte, dst_pte;
	spinlock_t *ptl;

	if (unlikely(anon_vma_prepare(vma)))
		return -ENOMEM;

	flush_dcache_page(page);
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		return -ENOMEM;
	if (!pte_none(*pte)) {
		ret = -EBUSY;
		goto out_unlock;
	}

	inc_mm_counter(mm, MM_ANONPAGES);
	add_reliable_page_counter(page, mm, 1);
	page_add_new_anon_rmap(page, vma, addr);
	dst_pte = mk_pte(page, vma->vm_page_prot);
	if (vma->vm_flags & VM_WRITE)
		dst_pte = pte_mkwrite_novma(pte_mkdirty(dst_pte));
	set_pte_at(mm, addr, pte, dst_pte);

out_unlock:
	pte_unmap_unlock(pte, ptl);
	return ret;
}
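
/*
 * Restore 'old_pte' (saved by uswap_unmap_anon_page()) for 'page' at
 * 'addr'; used on the recovery path to undo a partial swap-out.
 */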
static void uswap_map_anon_page(struct mm_struct *mm,
				struct vm_area_struct *vma,
				unsigned long addr,
				struct page *page,
				pmd_t *pmd,
				pte_t old_pte)
{
	spinlock_t *ptl;
	pte_t *pte;

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	flush_cache_page(vma, addr, pte_pfn(*pte));
	set_pte_at(mm, addr, pte, old_pte);
	inc_mm_counter(mm, MM_ANONPAGES);
	add_reliable_page_counter(page, mm, 1);
	page_add_new_anon_rmap(page, vma, addr);
	pte_unmap_unlock(pte, ptl);
}
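
/*
 * Undo a partially completed do_user_swap(): 'len' is the number of pages
 * that were fully remapped and must be unmapped from the new address range
 * and restored at the old one. A non-zero ptes[len] means one more page was
 * unmapped but never inserted at the new address; restore it as well.
 */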
static void uswapout_recover(struct mm_struct *mm,
			     unsigned long old_addr_start, unsigned long len,
			     struct page **pages, unsigned long new_addr_start,
			     pte_t *ptes)
{
	unsigned long unmap_old_addr = old_addr_start;
	unsigned long unmap_new_addr = new_addr_start;
	struct page *page;
	pmd_t *old_pmd, *new_pmd;
	pte_t pte;
	unsigned long i;

	for (i = 0; i < len; i++) {
		page = pages[i];
		pte = ptes[i];
		new_pmd = mm_find_pmd(mm, unmap_new_addr);
		old_pmd = mm_find_pmd(mm, unmap_old_addr);

		uswap_unmap_anon_page(mm, find_vma(mm, unmap_new_addr),
				      unmap_new_addr, page, new_pmd, NULL,
				      false);
		uswap_map_anon_page(mm, find_vma(mm, unmap_old_addr),
				    unmap_old_addr, page, old_pmd, pte);
		unmap_old_addr += PAGE_SIZE;
		unmap_new_addr += PAGE_SIZE;
	}
	if (pte_val(ptes[len]) != 0) {
		page = pages[len];
		pte = ptes[len];
		old_pmd = mm_find_pmd(mm, unmap_old_addr);

		uswap_map_anon_page(mm, find_vma(mm, unmap_old_addr),
				    unmap_old_addr, page, old_pmd, pte);
		get_page(page);
	}
}

/* Unmap the pages in 'addr ~ addr + len' and remap them to a new address. */
static unsigned long do_user_swap(struct mm_struct *mm,
				  unsigned long old_addr_start,
				  unsigned long len, struct page **pages,
				  unsigned long new_addr_start)
{
	struct vm_area_struct *old_vma, *new_vma;
	unsigned long old_addr = old_addr_start;
	unsigned long new_addr = new_addr_start;
	struct page *page;
	pmd_t *pmd;
	pte_t old_pte, *ptes;
	bool pages_dirty = false;
	unsigned long i = 0, j;
	int ret;

	ptes = kvzalloc(sizeof(pte_t) * (len / PAGE_SIZE), GFP_KERNEL);
	if (!ptes)
		return -ENOMEM;

	lru_add_drain();
	for (j = 0; j < len; j += PAGE_SIZE) {
		page = pages[i];
		ret = -EINVAL;
		if (!page)
			goto out_recover;
		if (is_thp_or_huge(mm, new_addr))
			goto out_recover;
		old_vma = find_vma(mm, old_addr);
		if (!old_vma || old_addr < old_vma->vm_start)
			goto out_recover;
		new_vma = find_vma(mm, new_addr);
		if (!new_vma || new_addr < new_vma->vm_start)
			goto out_recover;
		if (!vma_uswap_compatible(new_vma))
			goto out_recover;

		ret = -EACCES;
		if (!(old_vma->vm_flags & VM_WRITE) &&
		    (new_vma->vm_flags & VM_WRITE))
			goto out_recover;

		ret = -ENXIO;
		pmd = mm_find_pmd(mm, old_addr);
		if (!pmd)
			goto out_recover;
		ret = uswap_unmap_anon_page(mm, old_vma, old_addr, page, pmd,
					    &old_pte, true);
		if (ret)
			goto out_recover;
		ptes[i] = old_pte;
		if (pte_dirty(old_pte) || PageDirty(page))
			pages_dirty = true;
		put_page(page);

		ret = vm_insert_anon_page(new_vma, new_addr, page);
		if (ret)
			goto out_recover;
		get_page(page);

		old_addr += PAGE_SIZE;
		new_addr += PAGE_SIZE;
		i++;
	}

	if (pages_dirty)
		new_addr_start = new_addr_start | USWAP_PAGES_DIRTY;

	kvfree(ptes);
	return new_addr_start;

out_recover:
	uswapout_recover(mm, old_addr_start, i, pages, new_addr_start, ptes);
	kvfree(ptes);
	return ret;
}

/*
 * uswap_mremap() is called from the mremap() syscall when flags is
 * MREMAP_USWAP_SET_PTE. It unmaps the pages in 'old_addr ~ old_addr +
 * old_len', remaps them to 'new_addr ~ new_addr + new_len', and sets each
 * old PTE to a SWP_USERSWAP_ENTRY swap entry.
 */
unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len,
			   unsigned long new_addr, unsigned long new_len)
{
	struct page **pages = NULL;
	struct mm_struct *mm = current->mm;
	unsigned long len = old_len;
	unsigned long ret = -EINVAL;
	unsigned long i;

	if (!static_branch_unlikely(&userswap_enabled))
		goto out;

	if (offset_in_page(old_addr))
		goto out;

	old_len = PAGE_ALIGN(old_len);
	new_len = PAGE_ALIGN(new_len);
	if (!new_len || old_len != new_len || offset_in_page(new_addr))
		goto out;

	if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len ||
	    old_addr > TASK_SIZE - old_len)
		goto out;

	/* Ensure the old/new locations do not overlap. */
	if (old_addr + old_len > new_addr && new_addr + new_len > old_addr)
		goto out;

	lru_add_drain_all();
	mmap_write_lock(mm);
	ret = pages_can_be_swapped(mm, old_addr, len, &pages);
	if (ret)
		goto out_release_unlock;

	ret = do_user_swap(mm, old_addr, len, pages, new_addr);
	/* follow_page() above took an extra reference; drop it here. */
	for (i = 0; i < len / PAGE_SIZE; i++)
		if (pages[i])
			put_page(pages[i]);
	kvfree(pages);

out_release_unlock:
	mmap_write_unlock(mm);
out:
	return ret;
}
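
/*
 * Filter the USWAP bit out of a userfaultfd registration request and set
 * '*uswap_mode' when it was present. Returns false to reject the request
 * when USWAP is combined with any mode other than
 * UFFDIO_REGISTER_MODE_MISSING.
 */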
bool uswap_register(struct uffdio_register *uffdio_register, bool *uswap_mode)
{
	if (!static_branch_unlikely(&userswap_enabled))
		return true;
	if (!(uffdio_register->mode & UFFDIO_REGISTER_MODE_USWAP))
		return true;
	uffdio_register->mode &= ~UFFDIO_REGISTER_MODE_USWAP;
	if (uffdio_register->mode != UFFDIO_REGISTER_MODE_MISSING)
		return false;
	*uswap_mode = true;
	return true;
}

/*
 * Register the whole of every vma overlapping the address range. This
 * avoids splitting the vmas and thereby reduces fragmentation.
 */
bool uswap_adjust_uffd_range(struct uffdio_register *uffdio_register,
			     unsigned long *vm_flags, struct mm_struct *mm)
{
	struct vm_area_struct *vma, *cur;
	unsigned long end;
	bool ret = false;

	VMA_ITERATOR(vmi, mm, uffdio_register->range.start);

	end = uffdio_register->range.start + uffdio_register->range.len - 1;

	mmap_read_lock(mm);
	vma = find_vma(mm, uffdio_register->range.start);
	if (!vma || vma->vm_start >= end)
		goto out_unlock;
	for_each_vma_range(vmi, cur, end)
		if (!vma_uswap_compatible(cur))
			goto out_unlock;

	uffdio_register->range.start = vma->vm_start;
	vma = find_vma(mm, end);
	if (vma && end >= vma->vm_start)
		uffdio_register->range.len = vma->vm_end - uffdio_register->range.start;

	*vm_flags |= VM_USWAP;
	ret = true;

out_unlock:
	mmap_read_unlock(mm);
	return ret;
}
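
/*
 * Handle a fault on a SWP_USERSWAP_ENTRY PTE by delegating to userfaultfd
 * as a missing-page fault. Nested faults raised by the uswap process
 * itself, and faults on vmas no longer registered for VM_UFFD_MISSING,
 * raise SIGBUS instead.
 */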
vm_fault_t do_uswap_page(swp_entry_t entry, struct vm_fault *vmf,
			 struct vm_area_struct *vma)
{
	const char *process_prefix = "uswap";

	/* Print an error if we come across a nested fault. */
	if (!strncmp(current->comm, process_prefix, strlen(process_prefix))) {
		pr_err("USWAP: fault %lx is triggered by %s\n", vmf->address,
		       current->comm);
		return VM_FAULT_SIGBUS;
	}

	if (!(vma->vm_flags & VM_UFFD_MISSING)) {
		pr_err("USWAP: addr %lx flags %lx is not a user swap page\n",
		       vmf->address, vma->vm_flags);
		return VM_FAULT_SIGBUS;
	}

	return handle_userfault(vmf, VM_UFFD_MISSING);
}
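
/*
 * Zero-copy variant of the UFFDIO_COPY PTE fill: instead of copying the
 * data, move the page mapped at 'src_addr' directly into 'dst_vma' at
 * 'dst_addr'.
 */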
int mfill_atomic_pte_nocopy(struct mm_struct *mm, pmd_t *dst_pmd,
			    struct vm_area_struct *dst_vma,
			    unsigned long dst_addr, unsigned long src_addr)
{
	struct vm_area_struct *src_vma;
	pte_t dst_pte, *pte, src_pte;
	struct page *page;
	spinlock_t *ptl;
	pmd_t *src_pmd;
	int ret;

	src_vma = find_vma(mm, src_addr);
	if (!src_vma || src_addr < src_vma->vm_start)
		return -EINVAL;

	if (!vma_uswap_compatible(src_vma))
		return -EINVAL;

	page = follow_page(src_vma, src_addr, FOLL_GET | FOLL_DUMP);
	if (IS_ERR_OR_NULL(page))
		return -ENODEV;

	ret = -ENXIO;
	src_pmd = mm_find_pmd(mm, src_addr);
	if (!src_pmd)
		goto out_put_page;

	if (!PageLRU(page))
		lru_add_drain_all();

	ret = -EBUSY;
	if (page_mapcount(page) > 1 ||
	    page_mapcount(page) + 1 != page_count(page))
		goto out_put_page;

	ret = uswap_unmap_anon_page(mm, src_vma, src_addr, page, src_pmd,
				    &src_pte, false);
	if (ret)
		goto out_put_page;

	if (dst_vma->vm_flags & VM_USWAP)
		ClearPageDirty(page);
	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__SetPageUptodate(page);

	dst_pte = mk_pte(page, dst_vma->vm_page_prot);
	if (dst_vma->vm_flags & VM_WRITE)
		dst_pte = pte_mkwrite_novma(pte_mkdirty(dst_pte));
	if (dst_vma->vm_flags & VM_USWAP)
		dst_pte = pte_mkclean(dst_pte);

	pte = pte_offset_map_lock(mm, dst_pmd, dst_addr, &ptl);
	/*
	 * Userspace may swap in a large area of which only part was swapped
	 * out, or may race with a concurrent swap-in, so a present PTE is
	 * simply skipped. The only states handled here are the first page
	 * fault (pte_none) and a completed userswap-out (SWP_USERSWAP_ENTRY).
	 */
	if (pte_present(*pte) || (!pte_none(*pte) &&
	    !is_userswap_entry(pte_to_swp_entry(*pte)))) {
		pte_unmap_unlock(pte, ptl);
		uswap_map_anon_page(mm, src_vma, src_addr, page, src_pmd,
				    src_pte);
		ret = -EEXIST;
		goto out_put_page;
	}

	inc_mm_counter(mm, MM_ANONPAGES);
	add_reliable_page_counter(page, mm, 1);
	page_add_new_anon_rmap(page, dst_vma, dst_addr);
	set_pte_at(mm, dst_addr, pte, dst_pte);
	/* No need to invalidate - it was non-present before. */
	update_mmu_cache(dst_vma, dst_addr, pte);
	pte_unmap_unlock(pte, ptl);
	ret = 0;

out_put_page:
	put_page(page);
	return ret;
}

static int __init enable_userswap_setup(char *str)
{
	static_branch_enable(&userswap_enabled);
	return 1;
}
__setup("enable_userswap", enable_userswap_setup);
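
/*
 * Expected userspace flow, as an illustrative sketch only. The names
 * MREMAP_USWAP_SET_PTE and UFFDIO_REGISTER_MODE_USWAP are taken from the
 * comments above; their numeric values and the uapi headers carrying them
 * are assumed to be provided by this kernel tree.
 *
 *	// Boot with "enable_userswap" on the kernel command line.
 *	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
 *
 *	// Register the buffer for missing faults in userswap mode; this
 *	// sets VM_USWAP on every vma overlapping the range.
 *	struct uffdio_register reg = {
 *		.range = { .start = (unsigned long)buf, .len = len },
 *		.mode  = UFFDIO_REGISTER_MODE_MISSING |
 *			 UFFDIO_REGISTER_MODE_USWAP,
 *	};
 *	ioctl(uffd, UFFDIO_REGISTER, &reg);
 *
 *	// Swap out: move the pages to new_addr. The old PTEs become
 *	// SWP_USERSWAP_ENTRY entries; the returned address may carry
 *	// USWAP_PAGES_DIRTY in its low bits if any page was dirty.
 *	void *out = (void *)syscall(__NR_mremap, buf, len, len,
 *				    MREMAP_USWAP_SET_PTE, new_addr);
 *
 * A later fault on the swapped-out range is reported through uffd as a
 * missing-page event and is resolved by the monitor, e.g. via UFFDIO_COPY,
 * which for VM_USWAP destinations can take the zero-copy path implemented
 * by mfill_atomic_pte_nocopy() above.
 */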