1697 lines
43 KiB
C
1697 lines
43 KiB
C
|
|
// SPDX-License-Identifier: GPL-2.0
|
||
|
|
/*
|
||
|
|
* Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved.
|
||
|
|
* This program is free software; you can redistribute it and/or modify
|
||
|
|
* it under the terms of the GNU General Public License version 2 and
|
||
|
|
* only version 2 as published by the Free Software Foundation.
|
||
|
|
*
|
||
|
|
* This program is distributed in the hope that it will be useful,
|
||
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
|
* GNU General Public License for more details.
|
||
|
|
*/
|
||
|
|
|
||
|
|
#include <linux/mm.h>
|
||
|
|
#include <linux/sched.h>
|
||
|
|
#include <linux/fs.h>
|
||
|
|
#include <linux/pfn_t.h>
|
||
|
|
#include <linux/buffer_head.h>
|
||
|
|
#include <linux/iomap.h>
|
||
|
|
#include <linux/dax.h>
|
||
|
|
#include <linux/cpufeature.h>
|
||
|
|
#include <linux/pgtable.h>
|
||
|
|
#include "euler.h"
|
||
|
|
#include "dax.h"
|
||
|
|
#include "dep.h"
|
||
|
|
#include "wear.h"
|
||
|
|
#include "alloc_interface.h"
|
||
|
|
|
||
|
|
int eufs_persist_btree_node(void *root, int sta, int len);
|
||
|
|
|
||
|
|
/*
 * Zero @size bytes of persistent memory at @addr and write the
 * zeroed range back to media via eufs_flush_range().
 */
static __always_inline void eufs_clear_pmem(void *addr, size_t size)
{
	memset(addr, 0, size);
	eufs_flush_range(addr, size);
}
|
||
|
|
|
||
|
|
/*
 * Look up the data block at file-block index @blocknr in the inode's
 * volatile file b-tree.
 *
 * On success returns the in-DRAM pointer to the data block; if @parent
 * is non-NULL it is set to the index node holding the block's slot
 * (NULL when the tree is a single data block, i.e. height 0).
 * Returns NULL (0) when an interior slot on the path is empty.
 *
 * NOTE(review): callers appear to rely on @blocknr being within the
 * tree's capacity — enforced only by NV_ASSERT below.
 */
static __always_inline void *eufs_find_data_block_btree(struct inode *inode,
							unsigned long blocknr,
							__le64 **parent)
{
	__le64 *bp;
	u32 height, bit_shift;
	unsigned int idx;
	struct eufs_inode_info *vi = EUFS_I(inode);

	/* inode must be a regular file */
	height = vi->i_volatile_height;
	bp = vi->i_volatile_root;

	/* @blocknr must fit within the capacity implied by the height */
	NV_ASSERT(blocknr < (1UL << (height * EUFS_FILE_TREE_DEGREE_SHIFT)));

	/* Height 0: the root itself is the single data block */
	if (height == 0) {
		BUG_ON(blocknr != 0);
		if (parent)
			*parent = NULL;
		return (void *)bp;
	}

	/* Height 1: the root is the only index node */
	if (height == 1) {
		if (bp[blocknr] == NULL_VAL)
			BUG();

		if (parent)
			*parent = bp;
		return s2p(inode->i_sb, bp[blocknr]);
	}
	/* General case: descend one level per iteration */
	while (height > 0) {
		/* bits contributed by the levels below this one */
		bit_shift = (height - 1) * EUFS_FILE_TREE_DEGREE_SHIFT;
		idx = blocknr >> bit_shift;
		if (parent)
			*parent = bp;
		bp = s2p(inode->i_sb, bp[idx]);
		if (bp == 0)
			return 0;
		/* keep only the offset within the chosen sub-tree */
		blocknr = blocknr & ((1 << bit_shift) - 1);
		height--;
	}
	return bp;
}
|
||
|
|
|
||
|
|
/*
 * "Blind" extension: populate slots [@sta_index, @end_index] of index
 * node @parent without consulting any existing content.
 *
 * At the leaf level (@level_left == 0) each slot is marked NULL_ADDR
 * (a hole; the data block itself is not allocated).  Above the leaf
 * level a fresh index block is allocated from @ab for each slot and
 * its sub-tree is filled recursively over the full degree.
 *
 * Returns 0 or a negative errno (-ENOSPC when @ab runs dry).
 */
static int eufs_extend_btree_recursive_blind(struct inode *inode,
					     int level_left, __le64 *parent,
					     int sta_index,
					     int end_index, /* inclusive */
					     struct alloc_batch *ab)
{
	struct super_block *sb = inode->i_sb;
	void *p;
	long r;
	int i;

	for (i = sta_index; i <= end_index; ++i) {
		if (!level_left) {
			/* Leaf slot: record a hole, allocate nothing */
			parent[i] = NULL_ADDR;
			continue;
		}
		/* level_left */
		p = eufs_alloc_batch_allocate_file_index(inode->i_sb, ab);
		if (!p)
			return -ENOSPC;
		parent[i] = p2s(sb, p);
		/* recur */
		r = eufs_extend_btree_recursive_blind(inode, level_left - 1, p,
						      0,
						      EUFS_FILE_TREE_DEGREE - 1,
						      ab);
		if (IS_ERR_VALUE(r))
			return r;
	}
	return 0;
}
|
||
|
|
|
||
|
|
/*
 * Allocate blocks from top to bottom.
 *
 * Only allocate the interior blocks which will have a leaf child block or
 * an interior child block. And the unused pointers for children will NOT
 * be zeroed.
 *
 * New leaf blocks are not allocated, and their values are set to NULL_ADDR.
 */
/*
 * Grow the sub-tree under @parent from @origin to @num_blocks leaf
 * blocks.  @level_left is the number of interior levels below @parent
 * (0 means @parent's slots point at data blocks).  @blind requests an
 * unconditional full fill via the _blind variant.
 *
 * Returns 0 or a negative errno.
 */
static int eufs_extend_btree_recursive(struct inode *inode, int level_left,
				       __le64 *parent, unsigned long origin,
				       unsigned long num_blocks,
				       struct alloc_batch *ab, bool blind)
{
	struct super_block *sb = inode->i_sb;
	/* leaf blocks covered by one slot of @parent */
	const unsigned long nblocks_per_slot =
		1 << (level_left * EUFS_FILE_TREE_DEGREE_SHIFT);
	unsigned long off;
	int sta_index, end_index;
	int i;
	long r;
	void *p;

	if (blind) {
		return eufs_extend_btree_recursive_blind(
			inode, level_left, parent, 0, EUFS_FILE_TREE_DEGREE - 1,
			ab);
	}

	/* Fresh sub-tree: no existing blocks to preserve */
	if (origin == 0) {
		/* end_index could be zero */
		end_index = (num_blocks - 1) / nblocks_per_slot;
		/* slots before the last one are filled to full capacity */
		r = eufs_extend_btree_recursive_blind(inode, level_left, parent,
						      0, end_index - 1, ab);
		if (IS_ERR_VALUE(r))
			return r;
		if (!level_left) {
			parent[end_index] = NULL_ADDR;
		} else {
			/* last slot may be only partially populated */
			p = eufs_alloc_batch_allocate_file_index(inode->i_sb,
								 ab);
			if (!p)
				return -ENOSPC;
			parent[end_index] = p2s(sb, p);
			off = nblocks_per_slot * end_index;
			r = eufs_extend_btree_recursive(inode, level_left - 1,
							p, 0, num_blocks - off,
							ab, false);
			if (IS_ERR_VALUE(r))
				return r;
		}
		return 0;
	}

	/* slot holding the last existing block / the last requested block */
	sta_index = (origin - 1) / nblocks_per_slot;
	end_index = (num_blocks - 1) / nblocks_per_slot;

	/*
	 * No need to create a new sub-tree, so descend to the sub-tree
	 * rooted in parent[sta_index]
	 */
	if (sta_index == end_index) {
		if (!level_left)
			return 0;

		/* calculate the needed block count in the sub-tree */
		off = sta_index * nblocks_per_slot;
		r = eufs_extend_btree_recursive(inode, level_left - 1,
						s2p(sb, parent[sta_index]),
						origin - off, num_blocks - off,
						ab, false);
		if (IS_ERR_VALUE(r))
			return r;
		return 0;
	}

	/* Leaf level spanning several slots: just mark the new holes */
	if (!level_left) {
		for (i = sta_index + 1; i <= end_index; ++i)
			parent[i] = NULL_ADDR;

		return 0;
	}

	/* extend sub-tree shared with existed blocks to its maximum size */
	off = sta_index * nblocks_per_slot;
	r = eufs_extend_btree_recursive(inode, level_left - 1,
					s2p(sb, parent[sta_index]),
					origin - off, nblocks_per_slot, ab,
					false);
	if (IS_ERR_VALUE(r))
		return r;

	/* new sub-trees which will be fully initialized */
	r = eufs_extend_btree_recursive_blind(inode, level_left, parent,
					      sta_index + 1, end_index - 1, ab);
	if (IS_ERR_VALUE(r))
		return r;

	/* the last new sub-tree which may only needs partial initialization */
	p = eufs_alloc_batch_allocate_file_index(inode->i_sb, ab);
	if (!p)
		return -ENOSPC;
	parent[end_index] = p2s(sb, p);

	off = end_index * nblocks_per_slot;
	r = eufs_extend_btree_recursive(inode, level_left - 1, p, 0,
					num_blocks - off, ab, false);
	if (IS_ERR_VALUE(r))
		return r;
	return 0;
}
|
||
|
|
|
||
|
|
static unsigned long eufs_count_pages(unsigned long leaf_blocks)
|
||
|
|
{
|
||
|
|
unsigned long tot = leaf_blocks;
|
||
|
|
|
||
|
|
while (leaf_blocks > 1) {
|
||
|
|
leaf_blocks = DIV_ROUND_UP(leaf_blocks, EUFS_FILE_TREE_DEGREE);
|
||
|
|
tot += leaf_blocks;
|
||
|
|
}
|
||
|
|
return tot;
|
||
|
|
}
|
||
|
|
|
||
|
|
/* So that we have page[0..num_blocks-1] */
|
||
|
|
int eufs_extend_btree(struct inode *inode, unsigned long num_blocks)
|
||
|
|
{
|
||
|
|
struct eufs_inode_info *vi = EUFS_I(inode);
|
||
|
|
unsigned long full_size;
|
||
|
|
unsigned long need_blocks;
|
||
|
|
__le64 *new_root;
|
||
|
|
long r = 0;
|
||
|
|
struct alloc_batch *ab = &vi->page_batch;
|
||
|
|
|
||
|
|
if (!num_blocks)
|
||
|
|
return 0;
|
||
|
|
if (vi->i_volatile_tree_blocks >= num_blocks)
|
||
|
|
/* already allocated */
|
||
|
|
return 0;
|
||
|
|
if (num_blocks > inode->i_sb->s_maxbytes >> EUFS_BLOCK_SIZE_BITS)
|
||
|
|
return -EFBIG;
|
||
|
|
|
||
|
|
/* Grow from vi->i_volatile_tree_blocks to num_blocks */
|
||
|
|
need_blocks = eufs_count_pages(num_blocks) -
|
||
|
|
eufs_count_pages(vi->i_volatile_tree_blocks);
|
||
|
|
|
||
|
|
/* Set NULL_ADDR for extended data blocks */
|
||
|
|
need_blocks -= (num_blocks - vi->i_volatile_tree_blocks);
|
||
|
|
|
||
|
|
r = eufs_alloc_batch_pre_allocate_begin(inode->i_sb, ab, need_blocks);
|
||
|
|
if (IS_ERR_VALUE(r))
|
||
|
|
return r;
|
||
|
|
|
||
|
|
BUG_ON(!vi->i_volatile_root);
|
||
|
|
if (!vi->i_volatile_root) {
|
||
|
|
vi->i_volatile_root =
|
||
|
|
eufs_alloc_batch_allocate_file_data(inode->i_sb, ab);
|
||
|
|
BUG_ON(!vi->i_volatile_root);
|
||
|
|
vi->i_volatile_height = 0;
|
||
|
|
}
|
||
|
|
if (num_blocks == 1) {
|
||
|
|
/* Already allocated */
|
||
|
|
goto out;
|
||
|
|
}
|
||
|
|
full_size = 1UL
|
||
|
|
<< (vi->i_volatile_height * EUFS_FILE_TREE_DEGREE_SHIFT);
|
||
|
|
while (full_size < num_blocks) {
|
||
|
|
new_root =
|
||
|
|
eufs_alloc_batch_allocate_file_index(inode->i_sb, ab);
|
||
|
|
new_root[0] = p2s(inode->i_sb, vi->i_volatile_root);
|
||
|
|
vi->i_volatile_root = new_root;
|
||
|
|
vi->i_volatile_height++;
|
||
|
|
full_size <<= EUFS_FILE_TREE_DEGREE_SHIFT;
|
||
|
|
}
|
||
|
|
BUG_ON(vi->i_volatile_height < 1);
|
||
|
|
r = eufs_extend_btree_recursive(inode, vi->i_volatile_height - 1,
|
||
|
|
vi->i_volatile_root,
|
||
|
|
vi->i_volatile_tree_blocks, num_blocks,
|
||
|
|
ab, false);
|
||
|
|
out:
|
||
|
|
eufs_alloc_batch_pre_allocate_end(inode->i_sb, ab);
|
||
|
|
vi->i_volatile_tree_blocks = num_blocks;
|
||
|
|
num_blocks <<= (inode->i_blkbits - 9);
|
||
|
|
if (num_blocks > inode->i_blocks)
|
||
|
|
inode->i_blocks = num_blocks;
|
||
|
|
return r;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
 * Ensure data blocks [start_block, start_block + num_blocks) exist in
 * the inode's file b-tree, filling any holes with freshly allocated
 * blocks.  @zero controls zeroing: EUFS_ALLOC_BLOCKS_ZERO_ALL clears
 * every new block, EUFS_ALLOC_BLOCKS_ZERO_EDGE clears only the first
 * and last of the range.
 *
 * Two passes: the first counts holes (and immediately allocates +
 * persists zeroed blocks for holes that are already covered by the
 * persisted tree, i.e. blocknr < pi_tree_blocks); the second fills the
 * remaining holes from a batch reservation, flushing index-node
 * cachelines as each one is finished.
 *
 * Returns 0 on success or a negative errno.
 */
int eufs_alloc_blocks_btree(struct inode *inode, unsigned long start_block,
			    unsigned long num_blocks, int zero)
{
	long r;
	unsigned long blocknr, need_blocks = 0,
			       end_block = start_block + num_blocks;
	/* leaf count recorded in the fresh (persisted) inode */
	long pi_tree_blocks =
		eufs_iread_tree_blocks(EUFS_FRESH_PI(EUFS_PI(inode)));
	struct eufs_inode_info *vi = EUFS_I(inode);
	struct alloc_batch *ab = &vi->page_batch;
	__le64 *parent;
	unsigned int ofs;
	void *xmem;
	/* cacheline (within the index node) awaiting a flush, or -1 */
	int last_ofs_line = -1;

	r = eufs_extend_btree(inode, start_block + num_blocks);
	if (r)
		return r;

	if (start_block == 0)
		vi->hole_at_sta = false;

	/* The 0th data block is always allocated. */
	blocknr = start_block ? start_block : 1;

	/* TODO: need optimization. */
	/* Pass 1: walk the range, count holes needing batch allocation */
	while (blocknr < end_block) {
		eufs_find_data_block_btree(inode, blocknr, &parent);
		BUG_ON(!parent);
		ofs = blocknr & (EUFS_FILE_TREE_DEGREE - 1);
		while (ofs < EUFS_FILE_TREE_DEGREE && blocknr < end_block) {
			if (parent[ofs] == NULL_ADDR) {
				/*
				 * The leaf blocks are not allocated before persist,
				 * e.g. through truncate() + fsync()
				 */
				if (blocknr < pi_tree_blocks) {
					/* hole inside the persisted size:
					 * allocate, zero and persist now */
					xmem = eufs_zalloc_file_data(
						inode->i_sb);
					if (!xmem)
						return -ENOSPC;
					eufs_alloc_persist(inode->i_sb, xmem,
							   false);
					eufs_flush_page(xmem);
					parent[ofs] = p2s(inode->i_sb, xmem);
					eufs_flush_cacheline(&parent[ofs]);

					/* drop any stale page-cache page */
					invalidate_inode_pages2_range(
						inode->i_mapping, blocknr,
						blocknr);

				} else
					need_blocks++;
			}
			ofs++;
			blocknr++;
		}
	}

	if (!need_blocks)
		return 0;

	/* TODO: need optimization. */
	r = eufs_alloc_batch_pre_allocate_begin(inode->i_sb, ab, need_blocks);
	if (IS_ERR_VALUE(r))
		return r;

	/* Pass 2: fill the remaining holes from the reservation */
	blocknr = start_block ? start_block : 1;
	while (blocknr < end_block) {
		eufs_find_data_block_btree(inode, blocknr, &parent);
		BUG_ON(!parent);
		last_ofs_line = -1;
		ofs = blocknr & (EUFS_FILE_TREE_DEGREE - 1);
		while (ofs < EUFS_FILE_TREE_DEGREE && blocknr < end_block) {
			if (parent[ofs] == NULL_ADDR) {
				xmem = eufs_alloc_batch_allocate_file_data(
					inode->i_sb, ab);
				if (zero == EUFS_ALLOC_BLOCKS_ZERO_ALL ||
				    ((zero == EUFS_ALLOC_BLOCKS_ZERO_EDGE) &&
				     (blocknr == start_block ||
				      blocknr == end_block - 1)))
					eufs_clear_pmem(xmem, PAGE_SIZE);

				parent[ofs] = p2s(inode->i_sb, xmem);

				invalidate_inode_pages2_range(inode->i_mapping,
							      blocknr, blocknr);

				/* remember the dirty cacheline */
				if (last_ofs_line == -1)
					last_ofs_line =
						(ofs >>
						 EUFS_PTR_CNT_SHIFT_PER_CACHELINE);
			}
			ofs++;
			/* crossed into the next cacheline: flush the old one */
			if (last_ofs_line != -1 &&
			    (ofs >> EUFS_PTR_CNT_SHIFT_PER_CACHELINE) !=
				    last_ofs_line) {
				eufs_flush_cacheline(&parent[ofs - 1]);
				last_ofs_line = -1;
			}
			blocknr++;
		}
		/* flush any trailing dirty cacheline of this index node */
		if (last_ofs_line != -1)
			eufs_flush_cacheline(&parent[ofs - 1]);
	}

	eufs_alloc_batch_pre_allocate_end(inode->i_sb, ab);
	return r;
}
|
||
|
|
|
||
|
|
/*
 * Prepare data blocks for a write of @len bytes at byte offset @pos:
 * every page touched by the write must exist in the file b-tree
 * afterwards.  Holes are filled; for partially overwritten edge pages
 * only the bytes outside the write are zeroed (the write itself will
 * supply the rest).
 *
 * Same two-pass structure as eufs_alloc_blocks_btree: pass 1 handles
 * holes already within the persisted tree size and counts the rest;
 * pass 2 fills the counted holes from a batch reservation, tracking
 * dirty index-node cachelines in a bitmap and flushing them per node.
 *
 * Returns 0 on success or a negative errno.
 */
static int eufs_alloc_blocks_btree_for_write(struct inode *inode, loff_t pos,
					     int len)
{
	long r;
	unsigned long blocknr, need_blocks = 0;
	/* leaf count recorded in the fresh (persisted) inode */
	long pi_tree_blocks =
		eufs_iread_tree_blocks(EUFS_FRESH_PI(EUFS_PI(inode)));
	size_t file_size_block = PAGE_DIV_ROUND_UP(inode->i_size);
	struct eufs_inode_info *vi = EUFS_I(inode);
	struct alloc_batch *ab = &vi->page_batch;
	__le64 *parent;
	void *xmem;
	/* The page first byte resides in */
	unsigned long first_page = PAGE_DIV_ROUND_DOWN(pos);
	/* The page last byte resides in */
	unsigned long last_page = PAGE_DIV_ROUND_DOWN(pos + len - 1);
	/* bitmap of dirty cachelines within the current index node */
	unsigned long pending_flush_bits;
	int start_offset;
	int end_offset;
	int ofs;

	r = eufs_extend_btree(inode, last_page + 1);
	if (r)
		return r;

	/* hole_at_sta is used by SEEK_HOLE. */
	/* FIXME: We need a durable way to present hole_at_sta. */
	if (first_page == 0)
		vi->hole_at_sta = false;

	/* The 0th data block is always allocated. */
	blocknr = first_page ? first_page : 1;

	/*
	 * Can be optimized by saving the top-down parent pointers
	 * in a cursor and advancing by moving the cursor
	 */
	while (blocknr <= last_page) {
		eufs_find_data_block_btree(inode, blocknr, &parent);
		BUG_ON(!parent);

		/* One ofs, one block */
		for (ofs = blocknr & (EUFS_FILE_TREE_DEGREE - 1);
		     ofs < EUFS_FILE_TREE_DEGREE && blocknr <= last_page;
		     ++ofs, ++blocknr) {
			/* Not a hole */
			if (parent[ofs] != NULL_ADDR)
				continue;

			/* Hole */
			if (blocknr < pi_tree_blocks) {
				/*
				 * TODO: optimize option, instead of wrting
				 * zeros here, we can write the actual data
				 * instead.
				 */
				xmem = eufs_zalloc_file_data(inode->i_sb);
				if (!xmem)
					return -ENOSPC;
				eufs_alloc_persist(inode->i_sb, xmem, false);
				eufs_flush_page(xmem);
				parent[ofs] = p2s(inode->i_sb, xmem);
				eufs_flush_cacheline(&parent[ofs]);

				/* drop any stale page-cache page */
				invalidate_inode_pages2_range(inode->i_mapping,
							      blocknr, blocknr);

			} else
				need_blocks++;
		}
	}

	if (!need_blocks)
		return 0;

	/* FIXME: This requries re-write */
	r = eufs_alloc_batch_pre_allocate_begin(inode->i_sb, ab, need_blocks);
	if (IS_ERR_VALUE(r))
		return r;

	/* in-page offsets of the write's first byte and one-past-last byte */
	start_offset = pos & (PAGE_SIZE - 1);
	end_offset = (pos + len) & (PAGE_SIZE - 1);
	blocknr = first_page ? first_page : 1;
	while (blocknr <= last_page) {
		unsigned long bit;

		eufs_find_data_block_btree(inode, blocknr, &parent);

		BUG_ON(!parent);

		/* No cacheline is pending to be flushed for this index block */
		pending_flush_bits = 0;

		for (ofs = blocknr & (EUFS_FILE_TREE_DEGREE - 1);
		     ofs < EUFS_FILE_TREE_DEGREE && blocknr <= last_page;
		     ++ofs, ++blocknr) {
			/* Not a hole */
			if (parent[ofs] != NULL_ADDR)
				continue;

			xmem = eufs_alloc_batch_allocate_file_data(inode->i_sb,
								   ab);
			/* zero the head of the first page before the write */
			if (unlikely(blocknr == first_page &&
				     (start_offset != 0)))
				eufs_clear_pmem(xmem, start_offset);

			/* Do not clear the last block which is after the EOF-block */
			if (unlikely(blocknr == last_page &&
				     (end_offset != 0) &&
				     blocknr < file_size_block))
				eufs_clear_pmem((char *)xmem + end_offset,
						PAGE_SIZE - end_offset);

			parent[ofs] = p2s(inode->i_sb, xmem);

			invalidate_inode_pages2_range(inode->i_mapping, blocknr,
						      blocknr);
			/* mark this slot's cacheline dirty */
			set_bit(ofs >> EUFS_PTR_CNT_SHIFT_PER_CACHELINE,
				&pending_flush_bits);
		}

		/* flush every dirty cacheline of this index node */
		for (bit = find_first_bit(&pending_flush_bits, 64); bit < 64;
		     bit = find_next_bit(&pending_flush_bits, 64, bit + 1)) {
			ofs = bit << EUFS_PTR_CNT_SHIFT_PER_CACHELINE;
			eufs_flush_cacheline(&parent[ofs]);
		}
	}

	eufs_alloc_batch_pre_allocate_end(inode->i_sb, ab);
	return r;
}
|
||
|
|
|
||
|
|
static void eufs_free_recursive_btree_blind(struct super_block *sb,
|
||
|
|
__le64 *root, int level_left)
|
||
|
|
{
|
||
|
|
int i;
|
||
|
|
|
||
|
|
BUG_ON(!root);
|
||
|
|
if (level_left == -1) {
|
||
|
|
if (root != NULL_ADDR_PTR)
|
||
|
|
nv_zfree(sb, root);
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
/* level_left */
|
||
|
|
BUG_ON(root == NULL_ADDR_PTR);
|
||
|
|
for (i = 0; i < EUFS_FILE_TREE_DEGREE; ++i) {
|
||
|
|
eufs_free_recursive_btree_blind(sb, s2p(sb, root[i]),
|
||
|
|
level_left - 1);
|
||
|
|
}
|
||
|
|
nv_zfree(sb, root);
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
 * Free the first @blocks_left leaf blocks of the b-tree rooted at
 * @root, along with every index block on the paths to them.
 *
 * Fully covered sub-trees are handed to the blind variant; a final
 * partially covered sub-tree recurses with the remainder.  @level_left
 * follows the same convention as the blind variant (-1 == leaf).
 */
static void eufs_free_recursive_btree(struct super_block *sb, __le64 *root,
				      int level_left, u64 blocks_left)
{
	u64 nblocks_per_slot;
	int i;

	BUG_ON(!root);
	BUG_ON(!blocks_left);
	if (level_left == -1) {
		/* Leaf: free the data block unless it is a hole */
		if (root != NULL_ADDR_PTR)
			nv_zfree(sb, root);
		return;
	}
	/* level_left */
	BUG_ON(root == NULL_ADDR_PTR);
	/* leaf blocks covered by one slot at this level */
	nblocks_per_slot = 1 << (level_left * EUFS_FILE_TREE_DEGREE_SHIFT);
	for (i = 0; i < EUFS_FILE_TREE_DEGREE; ++i) {
		if (blocks_left >= nblocks_per_slot) {
			/* the whole sub-tree needs to be freed */
			eufs_free_recursive_btree_blind(sb, s2p(sb, root[i]),
							level_left - 1);
			blocks_left -= nblocks_per_slot;
			if (blocks_left == 0)
				break;
		} else {
			/* partial sub-tree: recurse with the remainder */
			eufs_free_recursive_btree(sb, s2p(sb, root[i]),
						  level_left - 1, blocks_left);
			break;
		}
	}
	/* all covered children released; drop this index block */
	nv_zfree(sb, root);
}
|
||
|
|
|
||
|
|
/*
 * Shrink the inode's file b-tree to match inode->i_size: reduce the
 * tree height if the new size no longer needs it, free the leaf (and
 * newly empty interior) blocks beyond the new block count, and free
 * the abandoned old root chain.
 *
 * The inode state is synced to the persistent inode both before and
 * after the volatile root/height/block-count are updated, so the
 * persisted tree never references blocks freed below.
 *
 * Caller must hold the inode lock.  Returns 0.
 */
int eufs_shrink_btree(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	struct eufs_inode_info *vi = EUFS_I(inode);
	struct eufs_inode *pi = EUFS_PI(inode);

	void *root;

	u64 capacity;
	__le64 *new_root;
	__le64 *old_root;
	int new_height;
	u64 size;
	u64 blocks;
	u64 count;
	u64 blocks_left;
	u64 __maybe_unused pi_root_o;

	BUG_ON(!inode_is_locked(inode));
	BUG_ON(vi->i_volatile_height > EUFS_MAX_FILE_TREE_HEIGHT);

	pi_root_o = eufs_iread_root(pi);
	eufs_dbg("shrink btree stage 1: pi=%px vi->{s=%lld b=%lld h=%d r=%px} pi->{s=%lld b=%lld h=%d r=0x%llx}\n",
		 pi, inode->i_size, vi->i_volatile_tree_blocks,
		 vi->i_volatile_height, vi->i_volatile_root,
		 eufs_iread_size(pi), eufs_iread_tree_blocks(pi),
		 root_height(pi_root_o), root_ptr(pi_root_o));
	eufs_sync_pinode(inode, pi, false);

	/* byte capacity at the current height */
	capacity = PAGE_SIZE
		   << (EUFS_FILE_TREE_DEGREE_SHIFT * vi->i_volatile_height);
	new_root = vi->i_volatile_root;
	new_height = vi->i_volatile_height;
	old_root = vi->i_volatile_root;
	/* treat an empty file as one block for capacity purposes */
	size = inode->i_size == 0 ? 1 : inode->i_size;

	/* old block count */
	blocks = vi->i_volatile_tree_blocks;

	/* Check whether the height should be reduced */
	for (;;) {
		capacity >>= EUFS_FILE_TREE_DEGREE_SHIFT;
		if (capacity < size || capacity < PAGE_SIZE)
			break;
		/* slot 0 always holds the surviving sub-tree */
		new_root = s2p(sb, new_root[0]);
		new_height--;
	}
	vi->i_volatile_root = new_root;
	vi->i_volatile_height = new_height;
	vi->i_volatile_tree_blocks = DIV_ROUND_UP(size, PAGE_SIZE);

	eufs_sync_pinode(inode, pi, false);

	eufs_alloc_batch_persist_reset(sb, &vi->page_batch);

	/* new block count and it's greater than 0 */
	count = blocks_left = vi->i_volatile_tree_blocks;

	/* shrink from old_root/_height to new_root/_height */
	root = old_root;

	/* nothing to free below the (old) root */
	if (blocks_left == blocks)
		goto out;

	if (blocks == 1)
		goto out;

	/* Old tree fits height 1: free trailing leaves off the root */
	if (blocks <= EUFS_FILE_BCNT_WITH_HEIGHT(1)) {
		int i;

		__le64 *proot = root;

		for (i = count; i < blocks; ++i) {
			nv_free(sb, s2p(sb, proot[i]));
			proot[i] = NULL_VAL;
		}
		goto out;
	}

	/* Old tree fits height 2: free leaves, then empty L1 nodes */
	if (blocks <= EUFS_FILE_BCNT_WITH_HEIGHT(2)) {
		int i;
		__le64 *proot = root;

		for (i = EUFS_H2_INDEX_IN_L0(count); i < EUFS_FILE_TREE_DEGREE;
		     ++i) {
			__le64 *pages = s2p(sb, proot[i]);
			int j = EUFS_H2_INDEX_IN_L1(count);

			for (; j < EUFS_FILE_TREE_DEGREE; ++j) {
				nv_free(sb, s2p(sb, pages[j]));
				pages[j] = NULL_VAL;
				count++;
				if (count >= blocks)
					break;
			}
			/* L1 node fully beyond the new size: free it too */
			if (EUFS_H2_IS_FREE_L1_SUBTREE(i, blocks_left)) {
				nv_free(sb, pages);
				proot[i] = NULL_VAL;
			}
			if (count >= blocks)
				break;
		}
		goto out;
	}

	/* Old tree fits height 3: free leaves, then empty L2/L1 nodes */
	if (blocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) {
		int i, j, k;
		__le64 *pproot = root;

		for (i = EUFS_H3_INDEX_IN_L0(count); i < EUFS_FILE_TREE_DEGREE;
		     ++i) {
			__le64 *proot = s2p(sb, pproot[i]);

			j = EUFS_H3_INDEX_IN_L1(i, count);
			for (; j < EUFS_FILE_TREE_DEGREE; ++j) {
				__le64 *pages = s2p(sb, proot[j]);

				k = EUFS_H3_INDEX_IN_L2(count);
				for (; k < EUFS_FILE_TREE_DEGREE; ++k) {
					nv_free(sb, s2p(sb, pages[k]));
					pages[k] = NULL_VAL;
					count++;
					if (count >= blocks)
						break;
				}
				if (EUFS_H3_IS_FREE_L2_SUBTREE(i, j,
							       blocks_left)) {
					nv_free(sb, pages);
					proot[j] = NULL_VAL;
				}
				if (count >= blocks)
					break;
			}
			if (EUFS_H3_IS_FREE_L1_SUBTREE(i, blocks_left)) {
				nv_free(sb, proot);
				pproot[i] = NULL_VAL;
			}
			if (count >= blocks)
				break;
		}
	}

out:
	/* Release the abandoned root chain left by the height reduction */
	while (old_root != new_root) {
		__le64 *r = old_root;

		BUG_ON(!r);
		old_root = s2p(sb, r[0]);
		nv_free(sb, r);
	}

	return 0;
}
|
||
|
|
|
||
|
|
/*
 * Free a whole file b-tree: @blocks leaf blocks under @root at the
 * given @height.  A single-block file has no index nodes, so only the
 * root (data) block itself is released.  Returns 0.
 */
int eufs_free_btree(struct super_block *sb, void *root, int height, u64 blocks)
{
	NV_ASSERT(!(height < 0 || height > EUFS_MAX_FILE_TREE_HEIGHT));
	eufs_dbg("nvfree tree root: %px\n", root);

	if (!blocks)
		return 0;

	if (blocks != 1)
		eufs_free_recursive_btree(sb, (__le64 *)root, height - 1,
					  blocks);
	else
		/* height == 0: the root itself is the only data block */
		nv_free(sb, root);

	return 0;
}
|
||
|
|
|
||
|
|
int eufs_persist_btree_node(void *root, int sta, int len)
|
||
|
|
{
|
||
|
|
BUG_ON(len > EUFS_FILE_TREE_DEGREE);
|
||
|
|
BUG_ON(len < 0);
|
||
|
|
BUG_ON(sta + len > EUFS_FILE_TREE_DEGREE);
|
||
|
|
BUG_ON(sta + len < 0);
|
||
|
|
if (len == 0)
|
||
|
|
return 0;
|
||
|
|
eufs_ptr_fast_check(root);
|
||
|
|
eufs_flush_range(((void **)root) + sta, len * sizeof(void *));
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
 * Persist the L1 children of a height-2 sub-tree: nodes in slots
 * [start0, idx0) are flushed in full, and the partially filled node at
 * slot idx0 is flushed up to idx1 entries.  The root node itself is
 * NOT flushed here (see eufs_persist_btree_h2_root).
 */
static void eufs_persist_btree_h2_subtree(struct super_block *sb, void *root,
					  int start0, int idx0, int idx1)
{
	__le64 *proot = root;
	int i;
	void *p;

	/* fully populated L1 nodes */
	for (i = start0; i < idx0; ++i) {
		BUG_ON(proot[i] == 0);

		p = s2p(sb, proot[i]);
		eufs_ptr_fast_check(p);
		eufs_persist_btree_node(p, 0, EUFS_FILE_TREE_DEGREE);
	}

	/*
	 * According to the WARN_ON in eufs_persist_new_btree_h2,
	 * idx0 < EUFS_FILE_TREE_DEGREE if idx1 != 0. So the following code
	 * is safe.
	 */
	if (idx1 != 0) {
		p = s2p(sb, proot[idx0]);
		eufs_persist_btree_node(p, 0, idx1);
	}
}
|
||
|
|
|
||
|
|
/*
 * Flush the root-node slots of a height-2 sub-tree, covering
 * [start0, idx0) plus the partially filled slot idx0 when idx1 is
 * non-zero.  idx0/idx1 locate the next (first unused) block.
 */
static void eufs_persist_btree_h2_root(void *root, int start0, int idx0,
				       int idx1)
{
	/*
	 * idx1 is the L1 index of the next block; when it is non-zero
	 * the partially filled node[idx0] needs persistence as well.
	 */
	int nr = (idx1 != 0 ? idx0 + 1 : idx0) - start0;

	eufs_persist_btree_node(root, start0, nr);
}
|
||
|
|
|
||
|
|
/*
 * Persist a brand-new height-2 (sub-)tree given the L0/L1 indices of
 * the next (first unused) block: children first, then the whole root
 * node, so the root never points at unflushed children.
 */
static void eufs_persist_new_btree_h2_by_idx(struct super_block *sb, void *root,
					     int start0, int idx0, int idx1)
{
	eufs_persist_btree_h2_subtree(sb, root, start0, idx0, idx1);
	/* It's a new btree, so persist the whole root node */
	eufs_persist_btree_h2_root(root, 0, idx0, idx1);
}
|
||
|
|
|
||
|
|
/*
 * Persist a brand-new height-2 (sub-)tree holding @bcnt leaf blocks.
 * Converts the block count to L0/L1 indices of the next block and
 * delegates to the _by_idx variant.
 */
static void eufs_persist_new_btree_h2(struct super_block *sb, void *root,
				      int start0, unsigned long bcnt)
{
	/* the L0/L1 index of the next block in new tree with height 2 */
	int idx0 = EUFS_H2_INDEX_IN_L0(bcnt);
	int idx1 = EUFS_H2_INDEX_IN_L1(bcnt);

	/*
	 * Notice a corner case: bcnt == EUFS_FILE_BCNT_WITH_HEIGHT(2), in
	 * which (idx0 == EUFS_FILE_TREE_DEGREE && idx1 == 0)
	 */
	WARN_ON(idx0 == EUFS_FILE_TREE_DEGREE && idx1);

	eufs_persist_new_btree_h2_by_idx(sb, root, start0, idx0, idx1);
}
|
||
|
|
|
||
|
|
static void eufs_persist_inc_btree_h2_by_idx(struct super_block *sb, void *root,
|
||
|
|
int old_idx0, int old_idx1,
|
||
|
|
int new_idx0, int new_idx1)
|
||
|
|
{
|
||
|
|
__le64 *proot = root;
|
||
|
|
void *p;
|
||
|
|
int start;
|
||
|
|
|
||
|
|
p = s2p(sb, proot[old_idx0]);
|
||
|
|
if (old_idx0 == new_idx0) {
|
||
|
|
if (old_idx0 == EUFS_FILE_TREE_DEGREE)
|
||
|
|
return;
|
||
|
|
|
||
|
|
eufs_persist_btree_node(p, old_idx1, new_idx1 - old_idx1);
|
||
|
|
|
||
|
|
/* node[old_idx0] needs persistence */
|
||
|
|
if (!old_idx1)
|
||
|
|
eufs_persist_btree_node(root, old_idx0, 1);
|
||
|
|
|
||
|
|
return;
|
||
|
|
}
|
||
|
|
|
||
|
|
eufs_persist_btree_node(p, old_idx1, EUFS_FILE_TREE_DEGREE - old_idx1);
|
||
|
|
|
||
|
|
eufs_persist_btree_h2_subtree(sb, root, old_idx0 + 1, new_idx0,
|
||
|
|
new_idx1);
|
||
|
|
|
||
|
|
start = old_idx0;
|
||
|
|
/* if old_idx0 is not 0, root[start] must have already been persisted */
|
||
|
|
if (old_idx1)
|
||
|
|
start++;
|
||
|
|
eufs_persist_btree_h2_root(root, start, new_idx0, new_idx1);
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
 * Persist the growth of a height-2 (sub-)tree from @old_bcnt to
 * @new_bcnt leaf blocks.  Converts both counts to L0/L1 next-block
 * indices and delegates to the _by_idx variant.
 */
static void eufs_persist_inc_btree_h2(struct super_block *sb, void *root,
				      unsigned long old_bcnt,
				      unsigned long new_bcnt)
{
	/* the L0/L1 index of the next block in tree */
	int old_idx0 = EUFS_H2_INDEX_IN_L0(old_bcnt);
	int old_idx1 = EUFS_H2_INDEX_IN_L1(old_bcnt);
	int new_idx0 = EUFS_H2_INDEX_IN_L0(new_bcnt);
	int new_idx1 = EUFS_H2_INDEX_IN_L1(new_bcnt);

	/*
	 * Notice a corner case: bcnt == EUFS_FILE_BCNT_WITH_HEIGHT(2), in
	 * which (idx0 == EUFS_FILE_TREE_DEGREE && idx1 == 0)
	 */
	WARN_ON(old_idx0 == EUFS_FILE_TREE_DEGREE && old_idx1);
	WARN_ON(new_idx0 == EUFS_FILE_TREE_DEGREE && new_idx1);

	eufs_persist_inc_btree_h2_by_idx(sb, root, old_idx0, old_idx1, new_idx0,
					 new_idx1);
}
|
||
|
|
|
||
|
|
/*
 * Persist a brand-new height-3 tree holding @bcnt_left leaf blocks,
 * starting from L0 slot @start0.  Works bottom-up: each L2 node is
 * flushed (fully or up to the remaining count), then its L1 parent,
 * and finally the covered prefix of the root node.
 */
static void eufs_persist_new_btree_h3(struct super_block *sb, void *root,
				      int start0, unsigned long bcnt_left)
{
	int i;
	/* leaf blocks still to be covered */
	unsigned long left = bcnt_left;
	__le64 *pproot = root;

	for (i = start0; i < EUFS_FILE_TREE_DEGREE; ++i) {
		__le64 *proot = s2p(sb, pproot[i]);
		int j;

		for (j = 0; j < EUFS_FILE_TREE_DEGREE; ++j) {
			void *p = s2p(sb, proot[j]);

			if (left >= EUFS_FILE_TREE_DEGREE) {
				/* full L2 node */
				eufs_persist_btree_node(p, 0,
							EUFS_FILE_TREE_DEGREE);
				left -= EUFS_FILE_TREE_DEGREE;
			} else {
				/* last, partially filled L2 node */
				eufs_persist_btree_node(p, 0, left);
				left = 0;
				/* include this node in the L1 flush below */
				j++;
				break;
			}
		}

		/* flush the L1 node's covered slots */
		eufs_persist_btree_node(proot, 0, j);
		if (!left) {
			/* include this L1 node in the root flush below */
			i++;
			break;
		}
	}

	eufs_persist_btree_node(root, 0, i);
}
|
||
|
|
|
||
|
|
/*
 * Persist the growth of a height-3 tree from @old_bcnt to @new_bcnt
 * leaf blocks.  Splits the growth into: the old position's partially
 * filled L2/L1 chain, fully new height-2 sub-trees in between, the new
 * position's partially filled sub-tree, and finally the affected root
 * slots — always flushing children before the node that points at them.
 */
static void eufs_persist_inc_btree_h3(struct super_block *sb, void *root,
				      unsigned long old_bcnt,
				      unsigned long new_bcnt)
{
	/* The L0/L1/L2 position of the next block in tree */
	int o_idx0 = EUFS_H3_INDEX_IN_L0(old_bcnt);
	int o_idx1 = EUFS_H3_INDEX_IN_L1(o_idx0, old_bcnt);
	int o_idx2 = EUFS_H3_INDEX_IN_L2(old_bcnt);
	int n_idx0 = EUFS_H3_INDEX_IN_L0(new_bcnt);
	int n_idx1 = EUFS_H3_INDEX_IN_L1(n_idx0, new_bcnt);
	int n_idx2 = EUFS_H3_INDEX_IN_L2(new_bcnt);
	__le64 *pproot = root;
	__le64 *proot;
	void *p;
	int i;

	/* Growth confined to a single L2 node */
	if (o_idx0 == n_idx0 && o_idx1 == n_idx1) {
		/* persist from the bottom up */
		proot = s2p(sb, pproot[o_idx0]);
		p = s2p(sb, proot[o_idx1]);
		eufs_persist_btree_node(p, o_idx2, n_idx2 - o_idx2);

		/* node[o_idx1] needs persistence */
		if (!o_idx2) {
			eufs_persist_btree_node(proot, o_idx1, 1);

			/* node[o_idx0] needs persistence */
			if (!o_idx1)
				eufs_persist_btree_node(root, o_idx0, 1);
		}

		return;
	}

	/* Growth confined to a single L1 (height-2) sub-tree */
	if (o_idx0 == n_idx0) {
		proot = s2p(sb, pproot[o_idx0]);
		eufs_persist_inc_btree_h2_by_idx(sb, proot, o_idx1, o_idx2,
						 n_idx1, n_idx2);

		/* node[o_idx0] needs persistence */
		if (!o_idx1 && !o_idx2)
			eufs_persist_btree_node(root, o_idx0, 1);

		return;
	}

	/*
	 * A corner case: o_idx1 == EUFS_FILE_TREE_DEGREE && o_idx2 == 0. This
	 * can be handled in the function eufs_persist_inc_btree_h2_by_idx, but
	 * we still check it here for efficiency.
	 */
	if (o_idx1 < EUFS_FILE_TREE_DEGREE) {
		/* finish the height-2 sub-tree the old tree ended in */
		proot = s2p(sb, pproot[o_idx0]);
		eufs_persist_inc_btree_h2_by_idx(sb, proot, o_idx1, o_idx2,
						 EUFS_FILE_TREE_DEGREE, 0);
	} else {
		WARN_ON(o_idx2 != 0);
	}

	/* fully new height-2 sub-trees between old and new positions */
	for (i = o_idx0 + 1; i < n_idx0; ++i) {
		proot = s2p(sb, pproot[i]);
		eufs_persist_new_btree_h2_by_idx(sb, proot, 0,
						 EUFS_FILE_TREE_DEGREE, 0);
	}

	/* the last, partially filled height-2 sub-tree */
	if (n_idx1 || n_idx2) {
		proot = s2p(sb, pproot[n_idx0]);
		eufs_persist_new_btree_h2_by_idx(sb, proot, 0, n_idx1, n_idx2);
		/* root[n_idx0] needs to be persisted */
		n_idx0++;
	}

	/* root[o_idx0] has been persisted */
	if (o_idx1 || o_idx2)
		o_idx0++;

	eufs_persist_btree_node(root, o_idx0, n_idx0 - o_idx0);
}
|
||
|
|
|
||
|
|
/* Only structure persistency is needed */
/*
 * Flush the block-tree structure of a file that grew from @old_size to
 * @new_size bytes, so that the new tree-node pointers become durable.
 * Data blocks themselves are not flushed here (writers persist them).
 *
 * @root:   top tree node (pointer array) of the file's block tree
 * @height: tree height; only sanity-checked here — the actual dispatch
 *          below is driven by the old/new block counts
 *
 * Returns 0. Shrinks are not supported (treated as a no-op).
 */
int eufs_persist_btree(struct super_block *sb, void *root, int height,
		       u64 old_size, u64 new_size)
{
	unsigned long old_nblocks, new_nblocks;
	__le64 *proot;
	__le64 *pproot;

	if (old_size == 0)
		old_size = 1; /* at least one block */
	NV_ASSERT(!(height < 0 || height > EUFS_MAX_FILE_TREE_HEIGHT));
	if (!root)
		return 0;
	/* don't support for persisting for shrink */
	if (old_size > new_size)
		return 0;
	old_nblocks = DIV_ROUND_UP(old_size, PAGE_SIZE);
	new_nblocks = DIV_ROUND_UP(new_size, PAGE_SIZE);
	/* same number of blocks: no structural change to persist */
	if (old_nblocks == new_nblocks)
		return 0;
	proot = root;
	if (old_nblocks == 1) {
		/* data do not need flush */
		if (new_nblocks == 1)
			return 0;

		/* old tree was a single block; new tree is entirely new */
		if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(1)) {
			eufs_persist_btree_node(root, 0, new_nblocks);
			return 0;
		}
		if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(2)) {
			eufs_persist_new_btree_h2(sb, root, 0, new_nblocks);
			return 0;
		}
		if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) {
			eufs_persist_new_btree_h3(sb, root, 0, new_nblocks);
			return 0;
		}
	} else if (old_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(1)) {
		/* old tree had height 1: root is a single leaf-pointer node */
		if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(1)) {
			/* still height 1: flush only the new slots */
			eufs_persist_btree_node(root, old_nblocks,
						new_nblocks - old_nblocks);
			return 0;
		}
		if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(2)) {
			/* grew to height 2: old leaf node is now child 0 */
			__le64 *p = s2p(sb, proot[0]);

			/* finish the partially-filled old leaf node ... */
			eufs_persist_btree_node(p, old_nblocks,
						EUFS_FILE_TREE_DEGREE -
							old_nblocks);
			/* ... then the freshly created rest of the h2 tree */
			eufs_persist_new_btree_h2(sb, proot, 1, new_nblocks);
			return 0;
		}
		if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) {
			/* grew to height 3: old leaf node is child [0][0] */
			void *p;

			pproot = root;
			proot = s2p(sb, pproot[0]);
			p = s2p(sb, proot[0]);
			eufs_persist_btree_node(p, old_nblocks,
						EUFS_FILE_TREE_DEGREE -
							old_nblocks);
			/* fill out the first h2 subtree completely ... */
			eufs_persist_new_btree_h2(
				sb, proot, 1, EUFS_FILE_BCNT_WITH_HEIGHT(2));

			/* ... and the remaining new h2 subtrees */
			eufs_persist_new_btree_h3(
				sb, root, 1,
				new_nblocks - EUFS_FILE_BCNT_WITH_HEIGHT(2));

			return 0;
		}
	} else if (old_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(2)) {
		if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(2)) {
			/* height 2 -> height 2: incremental persist */
			eufs_persist_inc_btree_h2(sb, root, old_nblocks,
						  new_nblocks);
			return 0;
		}
		if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) {
			/* height 2 -> height 3: old h2 tree is child 0 */
			pproot = root;
			proot = s2p(sb, pproot[0]);
			eufs_persist_inc_btree_h2(
				sb, proot, old_nblocks,
				EUFS_FILE_BCNT_WITH_HEIGHT(2));

			eufs_persist_new_btree_h3(
				sb, root, 1,
				new_nblocks - EUFS_FILE_BCNT_WITH_HEIGHT(2));

			return 0;
		}
	} else if (old_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) {
		if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) {
			/* height 3 -> height 3: incremental persist */
			eufs_persist_inc_btree_h3(sb, root, old_nblocks,
						  new_nblocks);
			return 0;
		}
	}
	/* every valid combination was handled above */
	BUG();
	return 0;
}
|
||
|
|
|
||
|
|
/*
 * Copy up to @len bytes starting at *@ppos from the file's persistent
 * memory blocks directly into the user buffer @buf, bypassing the page
 * cache. Holes (NULL_ADDR_PTR blocks) read back as zeroes.
 *
 * @_ra is unused; @filp may be NULL (then no atime update is done).
 *
 * Returns the number of bytes copied, or -EFAULT if the user buffer
 * faulted before anything was copied. *@ppos is advanced by the amount
 * actually copied.
 */
static ssize_t do_mapping_read(struct address_space *mapping,
			       struct file_ra_state *_ra, struct file *filp,
			       char __user *buf, size_t len, loff_t *ppos)
{
	struct inode *inode = mapping->host;
	pgoff_t index, end_index;
	unsigned long offset;
	loff_t isize, pos;
	size_t copied = 0, error = 0;

	pos = *ppos;
	index = pos >> PAGE_SHIFT;
	/* byte offset within the first page */
	offset = pos & ~PAGE_MASK;

	isize = i_size_read(inode);
	if (!isize)
		goto out;

	/* index of the page containing the last valid byte */
	end_index = (isize - 1) >> PAGE_SHIFT;
	do {
		unsigned long nr, left;
		void *xmem;

		/* nr is the maximum number of bytes to copy from this page */
		nr = PAGE_SIZE;
		if (index >= end_index) {
			if (index > end_index)
				goto out;

			/* last page: only the bytes up to EOF are valid */
			nr = ((isize - 1) & ~PAGE_MASK) + 1;
			if (nr <= offset)
				goto out;
		}
		nr = nr - offset;
		if (nr > len - copied)
			nr = len - copied;

		xmem = eufs_find_data_block_btree(inode, index, NULL);

		BUG_ON(!eufs_access_ok(inode->i_sb, xmem, PAGE_SIZE));
		if (unlikely(!xmem))
			BUG();

		/*
		 * Ok, we have the mem, so now we can copy it to user space...
		 *
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		if (xmem != NULL_ADDR_PTR)
			left = __copy_to_user(buf + copied, xmem + offset, nr);
		else
			/* hole: reads as zeroes */
			left = __clear_user(buf + copied, nr);

		if (left) {
			error = -EFAULT;
			goto out;
		}

		/* left == 0 here, so the full nr bytes were transferred */
		copied += (nr - left);
		offset += (nr - left);
		/* advance to the next page when offset crosses a boundary */
		index += offset >> PAGE_SHIFT;
		offset &= ~PAGE_MASK;
	} while (copied < len);

out:
	*ppos = pos + copied;
	if (filp)
		file_accessed(filp);

	/* report -EFAULT only when nothing at all was copied */
	return copied ? copied : error;
}
|
||
|
|
|
||
|
|
/*
 * Wrappers. We need to use the rcu read lock to avoid a concurrent
 * truncate operation. This is not a problem for write because we hold
 * i_mutex there.
 */
|
||
|
|
/*
 * read(2) entry point: validate the user buffer, then copy data out of
 * persistent memory under the shared inode lock.
 */
ssize_t eufs_file_read(struct file *filp, char __user *buf, size_t len,
		       loff_t *ppos)
{
	struct inode *inode = file_inode(filp);
	ssize_t ret = -EFAULT;

	inode_lock_shared(inode);
	if (access_ok(buf, len))
		ret = do_mapping_read(filp->f_mapping, &filp->f_ra, filp, buf,
				      len, ppos);
	inode_unlock_shared(inode);
	return ret;
}
|
||
|
|
|
||
|
|
/*
 * Copy @bytes bytes from the user buffer @buf into persistent memory at
 * @kmem + @offset, and make the copy durable.
 *
 * Returns the number of bytes actually copied (may be short on a user
 * fault).
 */
static __always_inline size_t memcpy_to_nvmm(char *kmem, loff_t offset,
					     const char __user *buf,
					     size_t bytes)
{
	char *dst = kmem + offset;
	size_t copied;

	/* No clwb (or caching disabled): use a non-temporal copy, which
	 * bypasses the CPU cache and needs no explicit flush here. */
	if (!support_clwb || force_nocache_write)
		return bytes -
		       __copy_from_user_inatomic_nocache(dst, buf, bytes);

	/* Cached copy followed by an explicit cache-line writeback. */
	copied = bytes - __copy_from_user(dst, buf, bytes);
	eufs_flush_buffer(dst, copied, 0);
	return copied;
}
|
||
|
|
|
||
|
|
/*
 * Write (or zero) @count bytes at @pos into @inode's data blocks, one
 * page at a time, directly in persistent memory.
 *
 * @buf:  user buffer to copy from (not read when @zero is true)
 * @zero: zero the byte range instead of copying from @buf
 * @keep: do not extend i_size even when the write goes past EOF
 * @ppos: if non-NULL, receives the final file position
 *
 * The data blocks covering the range are expected to exist already
 * (the non-zero path BUG()s on a hole); callers allocate them first —
 * see eufs_alloc_blocks_btree_for_write in eufs_file_write.
 *
 * Returns the number of bytes written, or a negative errno when
 * nothing could be written.
 */
ssize_t __eufs_file_write_inode(struct inode *inode, const char __user *buf,
				size_t count, loff_t pos, loff_t *ppos,
				bool zero, bool keep)
{
	long status = 0;
	size_t bytes;
	ssize_t written = 0;

	if (!count)
		return 0;

	eufs_dbg("file write: inode=%px count=%lx pos=%llx, zero=%d keep=%d\n",
		 inode, count, pos, zero, keep);

	do {
		unsigned long index;
		unsigned long offset;
		size_t copied;
		__le64 *parent;
		void __pmem *xmem;
		void __pmem *xmem_new = NULL;

		offset = (pos & (PAGE_SIZE - 1)); /* Within page */
		index = pos >> PAGE_SHIFT;
		/* clamp this iteration to the end of the current page */
		bytes = PAGE_SIZE - offset;
		if (bytes > count)
			bytes = count;

		xmem = eufs_find_data_block_btree(inode, index, &parent);
		if (!eufs_access_ok(inode->i_sb, xmem, PAGE_SIZE)) {
			dump_stack();
			BUG();
		}

		/* do no wear leveling for 0-level btrees */
		if (xmem != NULL_ADDR_PTR && parent && !zero) {
			/* wear threshold! allocate a replacement block */
			if (!wear_inc(inode->i_sb, xmem))
				xmem_new = eufs_malloc_file_data(inode->i_sb);
		}
		if (zero) {
			copied = bytes;
			/* holes are already zero; only clear real blocks */
			if (xmem != NULL_ADDR_PTR)
				eufs_clear_pmem((char *)xmem + offset, bytes);
		} else {
			BUG_ON(xmem == NULL_ADDR_PTR);
			copied = memcpy_to_nvmm((char *)xmem, offset, buf,
						bytes);
		}

		/* wear leveling: migrate the block to xmem_new */
		if (xmem_new) {
			struct eufs_inode_info *vi = EUFS_I(inode);

			eufs_dbg(
				"inode=%px pos=%llx xmem:[%px -> %px] weared\n",
				inode, pos, xmem, xmem_new);
			eufs_alloc_persist(inode->i_sb, xmem_new, true);

			WARN_ON(xmem !=
				s2p(inode->i_sb,
				    parent[index % EUFS_FILE_TREE_DEGREE]));

			/*
			 * disable page fault, clear all related PTEs, and remove the
			 * dax entry from the radix tree before replacing the old block
			 */
			down_write(&vi->mmap_rwsem);
			invalidate_inode_pages2_range(inode->i_mapping,
						      pos / PAGE_SIZE,
						      pos / PAGE_SIZE);
			/*
			 * Copy old block content into the replacement.
			 * NOTE(review): xmem is a pmem pointer, not __user;
			 * this relies on the copy helper accepting kernel
			 * addresses — confirm.
			 */
			memcpy_to_nvmm(xmem_new, 0, xmem, PAGE_SIZE);
			parent[index % EUFS_FILE_TREE_DEGREE] =
				p2s(inode->i_sb, xmem_new);
			up_write(&vi->mmap_rwsem);

			eufs_flush_cacheline(
				&parent[index % EUFS_FILE_TREE_DEGREE]);
			eufs_pbarrier();

			/*
			 * It is important to persist all previous allocations
			 * here. Otherwise, the xmem might be freed before its
			 * information is handled in the page_batch, which will
			 * cause xmem being marked as allocated (page_batch does
			 * this) when it is in the free list.
			 * xfstests/generic/299 can trigger this.
			 */
			eufs_alloc_batch_persist_reset(
				inode->i_sb, &EUFS_I(inode)->page_batch);
			nv_free_rest(inode->i_sb, xmem);
		}

		eufs_dbg(
			"! file writing to pos=%ld xmem=%px, offset=%ld, buf=%px, bytes=%ld index=%ld, copied=%ld\n",
			(long)pos, xmem, (long)offset, buf, (long)bytes,
			(long)index, (long)copied);

		if (likely(copied > 0)) {
			written += copied;
			count -= copied;
			pos += copied;
			buf += copied;
		}

		/* short copy: user buffer faulted part-way */
		if (unlikely(copied != bytes)) {
			status = -EFAULT;
			break;
		}
	} while (count);
	if (ppos)
		*ppos = pos;
	eufs_dbg("pos: %d inode->i_size: %d written: %d\n", (int)pos,
		 (int)inode->i_size, (int)written);
	/*
	 * No need to use i_size_read() here, the i_size
	 * cannot change under us because we hold i_mutex.
	 */
	if (!keep && pos > inode->i_size)
		eufs_inode_size_write(inode, pos);

	return written ? written : status;
}
|
||
|
|
|
||
|
|
ssize_t __eufs_file_write(struct address_space *mapping,
|
||
|
|
const char __user *buf, size_t count, loff_t pos,
|
||
|
|
loff_t *ppos, bool zero, bool keep)
|
||
|
|
{
|
||
|
|
return __eufs_file_write_inode(mapping->host, buf, count, pos, ppos,
|
||
|
|
zero, keep);
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
 * write(2) entry point: allocate the needed data blocks, zero any hole
 * or tail created by the write, copy the user data into persistent
 * memory, and (for O_SYNC) persist the inode immediately.
 *
 * Returns the number of bytes written or a negative errno.
 *
 * Fix: @ret was declared size_t (unsigned) while it stores negative
 * error codes (-EFAULT, file_remove_privs() failures, a possibly
 * negative @written) and the function returns ssize_t. Declare it as
 * ssize_t so the error values are carried in a signed type end-to-end;
 * IS_ERR_VALUE() still works on it.
 */
ssize_t eufs_file_write(struct file *filp, const char __user *buf,
			size_t len, loff_t *ppos)
{
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;
	struct eufs_inode_info *vi = EUFS_I(inode);
	struct super_block *sb = inode->i_sb;
	ssize_t written = 0;
	loff_t pos;
	size_t count;
	ssize_t ret;
	bool osync = false;

	inode_lock(inode);

	if (!access_ok(buf, len)) {
		ret = -EFAULT;
		goto out;
	}

	/* O_APPEND writes always start at EOF */
	if (filp->f_flags & O_APPEND)
		pos = inode->i_size;
	else
		pos = *ppos;

	if (filp->f_flags & __O_SYNC)
		osync = true;

	count = len;
	if (count == 0) {
		ret = 0;
		goto out;
	}

	NV_ASSERT(sb->s_blocksize == PAGE_SIZE);

	/* drop setuid/setgid/caps before modifying the file */
	ret = file_remove_privs(filp);
	if (ret)
		goto out;

	inode->i_ctime = inode->i_mtime = current_time(inode);

	/*
	 * It's a little tricky here. We should use mmap_rwsem to protect
	 * the block allocation and i_size update, but mmap_rwsem can not
	 * be taken during block writing because that will lead to
	 * dead-lock. We only use mmap_rwsem to protect the block allocation,
	 * and there are two reasons we can do that:
	 * 1. mmap fault takes the mmap_rwsem before read i_size, so it
	 * can not read the updated i_size before the allocation is done.
	 * 2. write only extends the block tree, and will not remove or
	 * modify the existed block mappings.
	 */
	down_write(&vi->mmap_rwsem);
	/*
	 * Possible cases for writing [pos~pos+len)
	 *
	 * Definitions
	 *  EOF: the byte after last valid byte
	 *  EOF-page: page contains EOF
	 *  first: the page pos belongs to
	 *  last: the page pos+len belongs to
	 *  Macro EOP(p): the last byte of p's page
	 *
	 * IMPORTANT NOTICE: we do not guarantee that [EOF~EOP(EOF)] are
	 * zeroed! When we mmap a file, we will erase that (in DRAM) in the
	 * mmap syscall. This can concurrently happen with a write syscall
	 * which may cause consistency problems (especially when it's an
	 * append). Concurrent mmap-access and read-/write-access should be
	 * protected by the application.
	 *
	 * 1) EOF-page | first | last
	 *    area-to-zero: [EOF~EOP(EOF)]
	 * 2) EOF-page=first| last
	 *    area-to-zero: [EOF~pos) if EOF<pos
	 * 3) first | EOF-page | last
	 *    area-to-zero: none
	 * 4) first | EOF-page=last
	 *    area-to-zero: none
	 * 5) first | last | EOF-page
	 *    area-to-zero:
	 * And for ALL cases, if first/last page is a hole, we need to zero
	 * the part that will not be written in this write.
	 */

	/* don't zero-out the allocated blocks */
	ret = eufs_alloc_blocks_btree_for_write(inode, pos, count);
	if (IS_ERR_VALUE(ret)) {
		up_write(&vi->mmap_rwsem);
		goto out;
	}

	/* If we decide to guarantee zeroed file tail, we may use this snippet. */
	/* zeroing part of the last block goes beyond the new EOF */
	if (PAGE_ALIGN(pos + count) > PAGE_ALIGN(inode->i_size))
		eufs_inode_zero_range(inode, pos + count,
				      PAGE_ALIGN(pos + count));

	/*
	 * zeroing the hole created by write.
	 * part of the hole is included in the last page that exceeds EOF,
	 * and it has already been zeroed, so only zeroing the remaining part.
	 */
	if (pos > inode->i_size) {
		loff_t offset = inode->i_size & (PAGE_SIZE - 1);

		if (offset || !inode->i_size) {
			/*
			 * Zero EOF~EOP(EOF).
			 * This also satisfies case 2), since [EOP(EOF)+1~pos]
			 * are holes.
			 */
			eufs_inode_zero_range_len(inode, inode->i_size,
						  PAGE_SIZE - offset);
		}
	}
	up_write(&vi->mmap_rwsem);

	written = __eufs_file_write(mapping, buf, count, pos, ppos, false,
				    false);
	if (written < 0 || written != count) {
		eufs_dbg("write incomplete/failed: written %ld len %ld pos %llx\n",
			 written, count, pos);
	}
	if (osync) {
		/* O_SYNC: persist allocations and the inode right now */
		eufs_alloc_batch_persist_reset(sb, &EUFS_I(inode)->page_batch);
		eufs_sync_pinode(inode, EUFS_PI(inode), false);
	} else {
		/* otherwise let the persister handle it asynchronously */
		request_persistence(inode);
	}

	ret = written;

out:
	inode_unlock(inode);
	return ret;
}
|
||
|
|
|
||
|
|
/*
 * iomap_begin callback used by the DAX fault path: map a single block at
 * @offset. On a write fault into a hole, allocate and zero a fresh block
 * and link it into the block tree under the leaf lock.
 *
 * Always reports exactly one block (iomap->length = block size).
 * Returns 0 on success or -ENOSPC when a needed block can't be allocated.
 * @src is unused.
 */
static int eufs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
			    unsigned int flags, struct iomap *iomap,
			    struct iomap *src)
{
	struct super_block *sb = inode->i_sb;
	struct eufs_sb_info *sbi = sb->s_fs_info;
	unsigned int blkbits = inode->i_blkbits;
	unsigned long first_block = offset >> blkbits;
	bool new = false;
	void *__pmem xmem;
	__le64 *__pmem parent;

	eufs_dbg("fault: inode=%px addr=0x%llx rw=%d length=%lld\n", inode,
		 offset, flags & IOMAP_WRITE, length);
	inode_leaf_lock(inode);
	xmem = eufs_find_data_block_btree(inode, first_block, &parent);
	/* allocate a new block for write */
	if (xmem == NULL_ADDR_PTR && (flags & IOMAP_WRITE)) {
		/* slot of this block inside its parent leaf node */
		int ofs = first_block & (EUFS_FILE_TREE_DEGREE - 1);

		/*
		 * We cannot use normal allocation here because they can send
		 * IPI to gather pages and blocks. So here we need to use
		 * non-blocking version, which uses reserved pages instead of
		 * gathering pages by IPI.
		 */
		xmem = eufs_zalloc_file_data(inode->i_sb);
		if (!xmem) {
			inode_leaf_unlock(inode);
			return -ENOSPC;
		}

		eufs_alloc_persist(inode->i_sb, xmem, false);
		/*
		 * the first block is preallocated during inode initialization,
		 * so parent should not be NULL when xmem is NULL_ADDR
		 */
		BUG_ON(!parent);
		/* persist the zeroed data before publishing the pointer */
		eufs_flush_page(xmem);
		parent[ofs] = p2s(sb, xmem);
		eufs_flush_cacheline(&parent[ofs]);

		new = true;
	}
	inode_leaf_unlock(inode);

	iomap->flags = 0;
	iomap->bdev = inode->i_sb->s_bdev;
	iomap->offset = (u64)first_block << blkbits;
	iomap->dax_dev = sbi->s_dax_dev;
	/* map exactly one block per call */
	iomap->length = 1 << blkbits;

	if (xmem == NULL_ADDR_PTR) {
		/* read fault on a hole */
		iomap->type = IOMAP_HOLE;
		iomap->addr = IOMAP_NULL_ADDR;
	} else {
		iomap->type = IOMAP_MAPPED;
		/* byte offset within the pmem device */
		iomap->addr = (xmem - sbi->virt_addr);
	}

	if (new)
		iomap->flags |= IOMAP_F_NEW;

	return 0;
}
|
||
|
|
|
||
|
|
/* iomap_end callback: nothing to undo or commit for this filesystem. */
static int eufs_iomap_end(struct inode *inode, loff_t offset, loff_t length,
			  ssize_t written, unsigned int flags,
			  struct iomap *iomap)
{
	return 0;
}
|
||
|
|
|
||
|
|
/* iomap operations used by the DAX fault handler below. */
const struct iomap_ops eufs_iomap_ops = {
	.iomap_begin = eufs_iomap_begin,
	.iomap_end = eufs_iomap_end,
};
|
||
|
|
|
||
|
|
/*
 * Page-fault handler for DAX mappings (also serves page_mkwrite and
 * pfn_mkwrite). Delegates the actual mapping work to dax_iomap_fault()
 * with eufs_iomap_ops, under mmap_rwsem to serialize against truncate.
 */
static unsigned int eufs_dax_fault(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct eufs_inode_info *vi = EUFS_I(inode);
	int ret;

	if (vmf->flags & FAULT_FLAG_WRITE) {
		/* block freeze and update mtime for write faults */
		sb_start_pagefault(inode->i_sb);
		file_update_time(vmf->vma->vm_file);
	}

	/*
	 * i_size and the block tree must be consistent during mmap fault,
	 * else eulerfs may map to a freed block or a hole instead of an
	 * allocated block.
	 *
	 * Now i_rwsem is used to protect against the update of i_size and
	 * the block tree, but it can NOT been used in mmap fault path,
	 * because mmap fault may be triggered in the middle of
	 * write or read operation when the dst or src buffer is a mapped
	 * range of the same file, and that will lead to dead-lock due to
	 * two acquisitions of the same lock (i_rwsem).
	 *
	 * So mmap_rwsem is provided. The read-lock will be used in mmap
	 * fault path, and the write-lock will be used in truncate &
	 * fallocate & write paths.
	 */
	down_read(&vi->mmap_rwsem);
	/*
	 * NOTE(review): &ret is passed as the iomap error pointer and is
	 * then overwritten by the return value, so any iomap errno stored
	 * there is discarded — confirm this is intentional.
	 */
	ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &ret, &eufs_iomap_ops);
	up_read(&vi->mmap_rwsem);

	if (vmf->flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(inode->i_sb);

	return ret;
}
|
||
|
|
|
||
|
|
/* All three VM callbacks funnel into the same DAX fault handler. */
const struct vm_operations_struct eufs_file_vm_ops = {
	.fault = eufs_dax_fault,
	.page_mkwrite = eufs_dax_fault,
	.pfn_mkwrite = eufs_dax_fault,
};
|
||
|
|
|
||
|
|
int eufs_dax_file_mmap(struct file *file, struct vm_area_struct *vma)
|
||
|
|
{
|
||
|
|
file_accessed(file);
|
||
|
|
vma->vm_flags |= VM_MIXEDMAP;
|
||
|
|
vma->vm_ops = &eufs_file_vm_ops;
|
||
|
|
eufs_dbg("dax file mmaped!\n");
|
||
|
|
return 0;
|
||
|
|
}
|
||
|
|
|
||
|
|
/*
 * Implement SEEK_DATA / SEEK_HOLE by walking the file's block tree one
 * leaf at a time starting at @offset. A NULL_ADDR_PTR slot is a hole;
 * any other mapped slot is data. The implicit hole at EOF satisfies
 * SEEK_HOLE when no earlier hole is found.
 *
 * Returns the new file position via vfs_setpos(), or -ENXIO when
 * @offset is at/past EOF or no data region exists after it.
 */
static loff_t eufs_seek_block(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);
	loff_t maxbytes = inode->i_sb->s_maxbytes;
	pgoff_t pgofs;
	loff_t data_ofs = offset, isize;
	__le64 *parent;
	void *addr;
	unsigned int ofs;

	inode_lock(inode);

	isize = i_size_read(inode);

	/* seeking at or beyond EOF always fails for both whences */
	if (offset >= isize)
		goto fail;

	pgofs = (pgoff_t)(offset >> PAGE_SHIFT);

	/* special-case a known hole at the very start of the file */
	if (EUFS_I(inode)->hole_at_sta && pgofs == 0) {
		if (whence == SEEK_HOLE)
			goto found;
		/* SEEK_DATA: skip past the leading hole page */
		pgofs++;
		data_ofs = (loff_t)pgofs << PAGE_SHIFT;
	}

	while (data_ofs < isize) {
		addr = eufs_find_data_block_btree(inode, pgofs, &parent);
		/* slot index within the current leaf node */
		ofs = pgofs & (EUFS_FILE_TREE_DEGREE - 1);
		/* scan the rest of this leaf without re-walking the tree */
		while (ofs < EUFS_FILE_TREE_DEGREE && data_ofs < isize) {
			if (parent)
				addr = s2p(inode->i_sb, parent[ofs]);
			if (addr == NULL_ADDR_PTR && whence == SEEK_HOLE)
				goto found;
			if (addr && addr != NULL_ADDR_PTR &&
			    whence == SEEK_DATA)
				goto found;
			ofs++;
			pgofs++;
			data_ofs = (loff_t)pgofs << PAGE_SHIFT;
		}
	}
	/* no data found before EOF */
	if (whence == SEEK_DATA)
		goto fail;
	/* SEEK_HOLE falls through: EOF itself counts as a hole */
found:
	if (whence == SEEK_HOLE && data_ofs > isize)
		data_ofs = isize;
	inode_unlock(inode);
	return vfs_setpos(file, data_ofs, maxbytes);
fail:
	inode_unlock(inode);
	return -ENXIO;
}
|
||
|
|
|
||
|
|
/*
 * llseek(2) entry point. The classic whences are handled by the generic
 * helper; SEEK_DATA / SEEK_HOLE walk the block tree via eufs_seek_block().
 */
loff_t eufs_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);
	loff_t maxbytes = inode->i_sb->s_maxbytes;

	if (whence == SEEK_SET || whence == SEEK_CUR || whence == SEEK_END)
		return generic_file_llseek_size(file, offset, whence, maxbytes,
						i_size_read(inode));

	if (whence == SEEK_DATA || whence == SEEK_HOLE) {
		if (offset < 0)
			return -ENXIO;
		return eufs_seek_block(file, offset, whence);
	}

	return -EINVAL;
}
|