From 09ac2b60db1e857aa06633d980415b77b478b3c5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 23 Dec 2020 11:44:25 -0800 Subject: [PATCH 01/45] f2fs: handle unallocated section and zone on pinned/atgc If we have large section/zone, unallocated segment makes them corrupted. E.g., - Pinned file: -1 119304647 119304647 - ATGC data: -1 119304647 119304647 Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index e81eb0748e2a..229814b4f4a6 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -101,11 +101,11 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, #define BLKS_PER_SEC(sbi) \ ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) #define GET_SEC_FROM_SEG(sbi, segno) \ - ((segno) / (sbi)->segs_per_sec) + (((segno) == -1) ? -1: (segno) / (sbi)->segs_per_sec) #define GET_SEG_FROM_SEC(sbi, secno) \ ((secno) * (sbi)->segs_per_sec) #define GET_ZONE_FROM_SEC(sbi, secno) \ - ((secno) / (sbi)->secs_per_zone) + (((secno) == -1) ? -1: (secno) / (sbi)->secs_per_zone) #define GET_ZONE_FROM_SEG(sbi, segno) \ GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) From 1bc0d459811e15a2144227dc98f471120bc680ed Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Tue, 22 Dec 2020 21:34:15 +0800 Subject: [PATCH 02/45] f2fs: Replace expression with offsetof() Use the existing offsetof() macro instead of duplicating code. Signed-off-by: Zheng Yongjun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8861838f49c9..5f561eb33d86 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2696,7 +2696,7 @@ retry: src = F2FS_INODE(page); dst = F2FS_INODE(ipage); - memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src); + memcpy(dst, src, offsetof(struct f2fs_inode, i_ext)); dst->i_size = 0; dst->i_blocks = cpu_to_le64(1); dst->i_links = cpu_to_le32(1); From 5f1583810b404e3bca274634b6969d1b813af219 Mon Sep 17 00:00:00 2001 From: Weichao Guo Date: Mon, 14 Dec 2020 11:54:53 +0800 Subject: [PATCH 03/45] f2fs: fix to set inode->i_mode correctly for posix_acl_update_mode We should update the ~S_IRWXUGO part of inode->i_mode in __setattr_copy, because posix_acl_update_mode updates mode based on inode->i_mode, which finally overwrites the ~S_IRWXUGO part of i_acl_mode with old i_mode. Testcase to reproduce this bug: 0. adduser abc 1. mkfs.f2fs /dev/sdd 2. mount -t f2fs /dev/sdd /mnt/f2fs 3. mkdir /mnt/f2fs/test 4. setfacl -m u:abc:r /mnt/f2fs/test 5. chmod +s /mnt/f2fs/test Signed-off-by: Weichao Guo Signed-off-by: Bin Shu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 84d4043103ce..9e2d604d5760 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -872,6 +872,7 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) mode &= ~S_ISGID; + inode->i_mode = (inode->i_mode & S_IRWXUGO) | (mode & ~S_IRWXUGO); set_acl_inode(inode, mode); } } From 46000d2be7209e2f732757c0faaa22262f15fa8c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 25 Dec 2020 16:52:27 +0800 Subject: [PATCH 04/45] f2fs: enhance to update i_mode and acl atomically in f2fs_setattr() Previously, in f2fs_setattr(), we don't update S_ISUID|S_ISGID|S_ISVTX bits with S_IRWXUGO bits and acl entries atomically, so in error path, chmod() may partially success, this patch enhances to make chmod() flow being atomical. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 23 ++++++++++++++++++++++- fs/f2fs/file.c | 7 ++++--- fs/f2fs/xattr.c | 15 +++++++++------ 3 files changed, 35 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 1e5e9b1136ee..732ec10e7890 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -200,6 +200,27 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) return __f2fs_get_acl(inode, type, NULL); } +static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p, + struct posix_acl **acl) +{ + umode_t mode = inode->i_mode; + int error; + + if (is_inode_flag_set(inode, FI_ACL_MODE)) + mode = F2FS_I(inode)->i_acl_mode; + + error = posix_acl_equiv_mode(*acl, &mode); + if (error < 0) + return error; + if (error == 0) + *acl = NULL; + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + mode &= ~S_ISGID; + *mode_p = mode; + return 0; +} + static int __f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { @@ -213,7 +234,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl && !ipage) { - error = posix_acl_update_mode(inode, &mode, &acl); + error = f2fs_acl_update_mode(inode, &mode, &acl); if (error) return error; set_acl_inode(inode, mode); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9e2d604d5760..c2753e2656ff 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -872,7 +872,6 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) mode &= ~S_ISGID; - inode->i_mode = (inode->i_mode & S_IRWXUGO) | (mode & ~S_IRWXUGO); set_acl_inode(inode, mode); } } @@ -972,8 +971,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_MODE) { err = posix_acl_chmod(inode, f2fs_get_inode_mode(inode)); - if (err || is_inode_flag_set(inode, FI_ACL_MODE)) { - inode->i_mode = F2FS_I(inode)->i_acl_mode; + + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + if (!err) + inode->i_mode = F2FS_I(inode)->i_acl_mode; clear_inode_flag(inode, FI_ACL_MODE); } } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 65afcc3cc68a..2086bef6c154 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -673,7 +673,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, } if (value && f2fs_xattr_value_same(here, value, size)) - goto exit; + goto same; } else if ((flags & XATTR_REPLACE)) { error = -ENODATA; goto exit; @@ -738,17 +738,20 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (error) goto exit; - if (is_inode_flag_set(inode, FI_ACL_MODE)) { - inode->i_mode = F2FS_I(inode)->i_acl_mode; - inode->i_ctime = current_time(inode); - clear_inode_flag(inode, FI_ACL_MODE); - } if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); f2fs_mark_inode_dirty_sync(inode, true); if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); + +same: + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; + inode->i_ctime = current_time(inode); + clear_inode_flag(inode, FI_ACL_MODE); + } + exit: kfree(base_addr); return error; From a5cf21779ae2cfca9b9fe389f4a93f1f15983c42 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 26 Dec 2020 18:07:01 +0800 Subject: [PATCH 05/45] f2fs: enforce the immutable flag on open files This patch ports commit 02b016ca7f99 ("ext4: enforce the immutable flag on open files") to f2fs. According to the chattr man page, "a file with the 'i' attribute cannot be modified..." Historically, this was only enforced when the file was opened, per the rest of the description, "... and the file can not be opened in write mode". There is general agreement that we should standardize all file systems to prevent modifications even for files that were opened at the time the immutable flag is set. Eventually, a change to enforce this at the VFS layer should be landing in mainline. Cc: stable@kernel.org Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c2753e2656ff..09507d7c4c00 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -60,6 +60,9 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) bool need_alloc = true; int err = 0; + if (unlikely(IS_IMMUTABLE(inode))) + return VM_FAULT_SIGBUS; + if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; goto err; @@ -887,6 +890,14 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + if (unlikely(IS_APPEND(inode) && + (attr->ia_valid & (ATTR_MODE | ATTR_UID | + ATTR_GID | ATTR_TIMES_SET)))) + return -EPERM; + if ((attr->ia_valid & ATTR_SIZE) && !f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; @@ -4374,6 +4385,11 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); } + if (unlikely(IS_IMMUTABLE(inode))) { + ret = -EPERM; + goto unlock; + } + ret = generic_write_checks(iocb, from); if (ret > 0) { bool preallocated = false; @@ -4438,6 +4454,7 @@ write: if (ret > 0) f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); } +unlock: inode_unlock(inode); out: trace_f2fs_file_write_iter(inode, iocb->ki_pos, From 565a750b34cc7106cc2f9bf39c5ec06f1a3f3c05 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 26 Dec 2020 18:07:41 +0800 Subject: [PATCH 06/45] f2fs: relocate f2fs_precache_extents() Relocate f2fs_precache_extents() in prior to check_swap_activate(), then extent cache can be enabled before its use. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6f8d44a64303..9b1b76de66b3 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -4056,12 +4056,13 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, if (!f2fs_disable_compressed_file(inode)) return -EINVAL; + f2fs_precache_extents(inode); + ret = check_swap_activate(sis, file, span); if (ret < 0) return ret; set_inode_flag(inode, FI_PIN_FILE); - f2fs_precache_extents(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; } From 945a315ae739c7077aff28c54586cbc7068f1ee6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 22 Jan 2021 17:40:13 +0800 Subject: [PATCH 07/45] f2fs: compress: deny setting unsupported compress algorithm If kernel doesn't support certain kinds of compress algorithm, deny to set them as compress algorithm of f2fs via 'compress_algorithm=%s' mount option. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b587d94f5e6d..e0c4d303ec0f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -872,14 +872,26 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) if (!name) return -ENOMEM; if (!strcmp(name, "lzo")) { +#ifdef CONFIG_F2FS_FS_LZO F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO; +#else + f2fs_info(sbi, "kernel doesn't support lzo compression"); +#endif } else if (!strcmp(name, "lz4")) { +#ifdef CONFIG_F2FS_FS_LZ4 F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; +#else + f2fs_info(sbi, "kernel doesn't support lz4 compression"); +#endif } else if (!strcmp(name, "zstd")) { +#ifdef CONFIG_F2FS_FS_ZSTD F2FS_OPTION(sbi).compress_algorithm = COMPRESS_ZSTD; +#else + f2fs_info(sbi, "kernel doesn't support zstd compression"); +#endif } else { kfree(name); return -EINVAL; From bc20f67bfa05b0bf6ad6949a576f1dd789c2cfa5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 22 Jan 2021 17:46:43 +0800 Subject: [PATCH 08/45] f2fs: compress: support compress level Expand 'compress_algorithm' mount option to accept parameter as format of :, by this way, it gives a way to allow user to do more specified config on lz4 and zstd compression level, then f2fs compression can provide higher compress ratio. In order to set compress level for lz4 algorithm, it needs to set CONFIG_LZ4HC_COMPRESS and CONFIG_F2FS_FS_LZ4HC config to enable lz4hc compress algorithm. CR and performance number on lz4/lz4hc algorithm: dd if=enwik9 of=compressed_file conv=fsync Original blocks: 244382 lz4 lz4hc-9 compressed blocks 170647 163270 compress ratio 69.8% 66.8% speed 16.4207 s, 60.9 MB/s 26.7299 s, 37.4 MB/s compress ratio = after / before Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 867 +++++++++++++++++++++++++++++ fs/f2fs/Kconfig | 10 + fs/f2fs/compress.c | 41 +- fs/f2fs/f2fs.h | 9 + fs/f2fs/super.c | 88 ++- include/linux/f2fs_fs.h | 3 + 6 files changed, 1013 insertions(+), 5 deletions(-) create mode 100644 Documentation/filesystems/f2fs.rst diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst new file mode 100644 index 000000000000..5eff4009e77e --- /dev/null +++ b/Documentation/filesystems/f2fs.rst @@ -0,0 +1,867 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================================== +WHAT IS Flash-Friendly File System (F2FS)? +========================================== + +NAND flash memory-based storage devices, such as SSD, eMMC, and SD cards, have +been equipped on a variety systems ranging from mobile to server systems. Since +they are known to have different characteristics from the conventional rotating +disks, a file system, an upper layer to the storage device, should adapt to the +changes from the sketch in the design level. + +F2FS is a file system exploiting NAND flash memory-based storage devices, which +is based on Log-structured File System (LFS). The design has been focused on +addressing the fundamental issues in LFS, which are snowball effect of wandering +tree and high cleaning overhead. + +Since a NAND flash memory-based storage device shows different characteristic +according to its internal geometry or flash memory management scheme, namely FTL, +F2FS and its tools support various parameters not only for configuring on-disk +layout, but also for selecting allocation and cleaning algorithms. + +The following git tree provides the file system formatting tool (mkfs.f2fs), +a consistency checking tool (fsck.f2fs), and a debugging tool (dump.f2fs). + +- git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs-tools.git + +For reporting bugs and sending patches, please use the following mailing list: + +- linux-f2fs-devel@lists.sourceforge.net + +Background and Design issues +============================ + +Log-structured File System (LFS) +-------------------------------- +"A log-structured file system writes all modifications to disk sequentially in +a log-like structure, thereby speeding up both file writing and crash recovery. +The log is the only structure on disk; it contains indexing information so that +files can be read back from the log efficiently. In order to maintain large free +areas on disk for fast writing, we divide the log into segments and use a +segment cleaner to compress the live information from heavily fragmented +segments." from Rosenblum, M. and Ousterhout, J. K., 1992, "The design and +implementation of a log-structured file system", ACM Trans. Computer Systems +10, 1, 26–52. + +Wandering Tree Problem +---------------------- +In LFS, when a file data is updated and written to the end of log, its direct +pointer block is updated due to the changed location. Then the indirect pointer +block is also updated due to the direct pointer block update. In this manner, +the upper index structures such as inode, inode map, and checkpoint block are +also updated recursively. This problem is called as wandering tree problem [1], +and in order to enhance the performance, it should eliminate or relax the update +propagation as much as possible. + +[1] Bityutskiy, A. 2005. JFFS3 design issues. http://www.linux-mtd.infradead.org/ + +Cleaning Overhead +----------------- +Since LFS is based on out-of-place writes, it produces so many obsolete blocks +scattered across the whole storage. In order to serve new empty log space, it +needs to reclaim these obsolete blocks seamlessly to users. This job is called +as a cleaning process. + +The process consists of three operations as follows. + +1. A victim segment is selected through referencing segment usage table. +2. It loads parent index structures of all the data in the victim identified by + segment summary blocks. +3. It checks the cross-reference between the data and its parent index structure. +4. It moves valid data selectively. + +This cleaning job may cause unexpected long delays, so the most important goal +is to hide the latencies to users. And also definitely, it should reduce the +amount of valid data to be moved, and move them quickly as well. + +Key Features +============ + +Flash Awareness +--------------- +- Enlarge the random write area for better performance, but provide the high + spatial locality +- Align FS data structures to the operational units in FTL as best efforts + +Wandering Tree Problem +---------------------- +- Use a term, “node”, that represents inodes as well as various pointer blocks +- Introduce Node Address Table (NAT) containing the locations of all the “node” + blocks; this will cut off the update propagation. + +Cleaning Overhead +----------------- +- Support a background cleaning process +- Support greedy and cost-benefit algorithms for victim selection policies +- Support multi-head logs for static/dynamic hot and cold data separation +- Introduce adaptive logging for efficient block allocation + +Mount Options +============= + + +======================== ============================================================ +background_gc=%s Turn on/off cleaning operations, namely garbage + collection, triggered in background when I/O subsystem is + idle. If background_gc=on, it will turn on the garbage + collection and if background_gc=off, garbage collection + will be turned off. If background_gc=sync, it will turn + on synchronous garbage collection running in background. + Default value for this option is on. So garbage + collection is on by default. +disable_roll_forward Disable the roll-forward recovery routine +norecovery Disable the roll-forward recovery routine, mounted read- + only (i.e., -o ro,disable_roll_forward) +discard/nodiscard Enable/disable real-time discard in f2fs, if discard is + enabled, f2fs will issue discard/TRIM commands when a + segment is cleaned. +no_heap Disable heap-style segment allocation which finds free + segments for data from the beginning of main area, while + for node from the end of main area. +nouser_xattr Disable Extended User Attributes. Note: xattr is enabled + by default if CONFIG_F2FS_FS_XATTR is selected. +noacl Disable POSIX Access Control List. Note: acl is enabled + by default if CONFIG_F2FS_FS_POSIX_ACL is selected. +active_logs=%u Support configuring the number of active logs. In the + current design, f2fs supports only 2, 4, and 6 logs. + Default number is 6. +disable_ext_identify Disable the extension list configured by mkfs, so f2fs + is not aware of cold files such as media files. +inline_xattr Enable the inline xattrs feature. +noinline_xattr Disable the inline xattrs feature. +inline_xattr_size=%u Support configuring inline xattr size, it depends on + flexible inline xattr feature. +inline_data Enable the inline data feature: Newly created small (<~3.4k) + files can be written into inode block. +inline_dentry Enable the inline dir feature: data in newly created + directory entries can be written into inode block. The + space of inode block which is used to store inline + dentries is limited to ~3.4k. +noinline_dentry Disable the inline dentry feature. +flush_merge Merge concurrent cache_flush commands as much as possible + to eliminate redundant command issues. If the underlying + device handles the cache_flush command relatively slowly, + recommend to enable this option. +nobarrier This option can be used if underlying storage guarantees + its cached data should be written to the novolatile area. + If this option is set, no cache_flush commands are issued + but f2fs still guarantees the write ordering of all the + data writes. +fastboot This option is used when a system wants to reduce mount + time as much as possible, even though normal performance + can be sacrificed. +extent_cache Enable an extent cache based on rb-tree, it can cache + as many as extent which map between contiguous logical + address and physical address per inode, resulting in + increasing the cache hit ratio. Set by default. +noextent_cache Disable an extent cache based on rb-tree explicitly, see + the above extent_cache mount option. +noinline_data Disable the inline data feature, inline data feature is + enabled by default. +data_flush Enable data flushing before checkpoint in order to + persist data of regular and symlink. +reserve_root=%d Support configuring reserved space which is used for + allocation from a privileged user with specified uid or + gid, unit: 4KB, the default limit is 0.2% of user blocks. +resuid=%d The user ID which may use the reserved blocks. +resgid=%d The group ID which may use the reserved blocks. +fault_injection=%d Enable fault injection in all supported types with + specified injection rate. +fault_type=%d Support configuring fault injection type, should be + enabled with fault_injection option, fault type value + is shown below, it supports single or combined type. + + =================== =========== + Type_Name Type_Value + =================== =========== + FAULT_KMALLOC 0x000000001 + FAULT_KVMALLOC 0x000000002 + FAULT_PAGE_ALLOC 0x000000004 + FAULT_PAGE_GET 0x000000008 + FAULT_ALLOC_BIO 0x000000010 + FAULT_ALLOC_NID 0x000000020 + FAULT_ORPHAN 0x000000040 + FAULT_BLOCK 0x000000080 + FAULT_DIR_DEPTH 0x000000100 + FAULT_EVICT_INODE 0x000000200 + FAULT_TRUNCATE 0x000000400 + FAULT_READ_IO 0x000000800 + FAULT_CHECKPOINT 0x000001000 + FAULT_DISCARD 0x000002000 + FAULT_WRITE_IO 0x000004000 + =================== =========== +mode=%s Control block allocation mode which supports "adaptive" + and "lfs". In "lfs" mode, there should be no random + writes towards main area. +io_bits=%u Set the bit size of write IO requests. It should be set + with "mode=lfs". +usrquota Enable plain user disk quota accounting. +grpquota Enable plain group disk quota accounting. +prjquota Enable plain project quota accounting. +usrjquota= Appoint specified file and type during mount, so that quota +grpjquota= information can be properly updated during recovery flow, +prjjquota= : must be in root directory; +jqfmt= : [vfsold,vfsv0,vfsv1]. +offusrjquota Turn off user journalled quota. +offgrpjquota Turn off group journalled quota. +offprjjquota Turn off project journalled quota. +quota Enable plain user disk quota accounting. +noquota Disable all plain disk quota option. +whint_mode=%s Control which write hints are passed down to block + layer. This supports "off", "user-based", and + "fs-based". In "off" mode (default), f2fs does not pass + down hints. In "user-based" mode, f2fs tries to pass + down hints given by users. And in "fs-based" mode, f2fs + passes down hints with its policy. +alloc_mode=%s Adjust block allocation policy, which supports "reuse" + and "default". +fsync_mode=%s Control the policy of fsync. Currently supports "posix", + "strict", and "nobarrier". In "posix" mode, which is + default, fsync will follow POSIX semantics and does a + light operation to improve the filesystem performance. + In "strict" mode, fsync will be heavy and behaves in line + with xfs, ext4 and btrfs, where xfstest generic/342 will + pass, but the performance will regress. "nobarrier" is + based on "posix", but doesn't issue flush command for + non-atomic files likewise "nobarrier" mount option. +test_dummy_encryption +test_dummy_encryption=%s + Enable dummy encryption, which provides a fake fscrypt + context. The fake fscrypt context is used by xfstests. + The argument may be either "v1" or "v2", in order to + select the corresponding fscrypt policy version. +checkpoint=%s[:%u[%]] Set to "disable" to turn off checkpointing. Set to "enable" + to reenable checkpointing. Is enabled by default. While + disabled, any unmounting or unexpected shutdowns will cause + the filesystem contents to appear as they did when the + filesystem was mounted with that option. + While mounting with checkpoint=disabled, the filesystem must + run garbage collection to ensure that all available space can + be used. If this takes too much time, the mount may return + EAGAIN. You may optionally add a value to indicate how much + of the disk you would be willing to temporarily give up to + avoid additional garbage collection. This can be given as a + number of blocks, or as a percent. For instance, mounting + with checkpoint=disable:100% would always succeed, but it may + hide up to all remaining free space. The actual space that + would be unusable can be viewed at /sys/fs/f2fs//unusable + This space is reclaimed once checkpoint=enable. +compress_algorithm=%s Control compress algorithm, currently f2fs supports "lzo", + "lz4", "zstd" and "lzo-rle" algorithm. +compress_algorithm=%s:%d Control compress algorithm and its compress level, now, only + "lz4" and "zstd" support compress level config. + algorithm level range + lz4 3 - 16 + zstd 1 - 22 +compress_log_size=%u Support configuring compress cluster size, the size will + be 4KB * (1 << %u), 16KB is minimum size, also it's + default size. +compress_extension=%s Support adding specified extension, so that f2fs can enable + compression on those corresponding files, e.g. if all files + with '.ext' has high compression rate, we can set the '.ext' + on compression extension list and enable compression on + these file by default rather than to enable it via ioctl. + For other files, we can still enable compression via ioctl. + Note that, there is one reserved special extension '*', it + can be set to enable compression for all files. +compress_chksum Support verifying chksum of raw data in compressed cluster. +compress_mode=%s Control file compression mode. This supports "fs" and "user" + modes. In "fs" mode (default), f2fs does automatic compression + on the compression enabled files. In "user" mode, f2fs disables + the automaic compression and gives the user discretion of + choosing the target file and the timing. The user can do manual + compression/decompression on the compression enabled files using + ioctls. +inlinecrypt When possible, encrypt/decrypt the contents of encrypted + files using the blk-crypto framework rather than + filesystem-layer encryption. This allows the use of + inline encryption hardware. The on-disk format is + unaffected. For more details, see + Documentation/block/inline-encryption.rst. +atgc Enable age-threshold garbage collection, it provides high + effectiveness and efficiency on background GC. +======================== ============================================================ + +Debugfs Entries +=============== + +/sys/kernel/debug/f2fs/ contains information about all the partitions mounted as +f2fs. Each file shows the whole f2fs information. + +/sys/kernel/debug/f2fs/status includes: + + - major file system information managed by f2fs currently + - average SIT information about whole segments + - current memory footprint consumed by f2fs. + +Sysfs Entries +============= + +Information about mounted f2fs file systems can be found in +/sys/fs/f2fs. Each mounted filesystem will have a directory in +/sys/fs/f2fs based on its device name (i.e., /sys/fs/f2fs/sda). +The files in each per-device directory are shown in table below. + +Files in /sys/fs/f2fs/ +(see also Documentation/ABI/testing/sysfs-fs-f2fs) + +Usage +===== + +1. Download userland tools and compile them. + +2. Skip, if f2fs was compiled statically inside kernel. + Otherwise, insert the f2fs.ko module:: + + # insmod f2fs.ko + +3. Create a directory to use when mounting:: + + # mkdir /mnt/f2fs + +4. Format the block device, and then mount as f2fs:: + + # mkfs.f2fs -l label /dev/block_device + # mount -t f2fs /dev/block_device /mnt/f2fs + +mkfs.f2fs +--------- +The mkfs.f2fs is for the use of formatting a partition as the f2fs filesystem, +which builds a basic on-disk layout. + +The quick options consist of: + +=============== =========================================================== +``-l [label]`` Give a volume label, up to 512 unicode name. +``-a [0 or 1]`` Split start location of each area for heap-based allocation. + + 1 is set by default, which performs this. +``-o [int]`` Set overprovision ratio in percent over volume size. + + 5 is set by default. +``-s [int]`` Set the number of segments per section. + + 1 is set by default. +``-z [int]`` Set the number of sections per zone. + + 1 is set by default. +``-e [str]`` Set basic extension list. e.g. "mp3,gif,mov" +``-t [0 or 1]`` Disable discard command or not. + + 1 is set by default, which conducts discard. +=============== =========================================================== + +Note: please refer to the manpage of mkfs.f2fs(8) to get full option list. + +fsck.f2fs +--------- +The fsck.f2fs is a tool to check the consistency of an f2fs-formatted +partition, which examines whether the filesystem metadata and user-made data +are cross-referenced correctly or not. +Note that, initial version of the tool does not fix any inconsistency. + +The quick options consist of:: + + -d debug level [default:0] + +Note: please refer to the manpage of fsck.f2fs(8) to get full option list. + +dump.f2fs +--------- +The dump.f2fs shows the information of specific inode and dumps SSA and SIT to +file. Each file is dump_ssa and dump_sit. + +The dump.f2fs is used to debug on-disk data structures of the f2fs filesystem. +It shows on-disk inode information recognized by a given inode number, and is +able to dump all the SSA and SIT entries into predefined files, ./dump_ssa and +./dump_sit respectively. + +The options consist of:: + + -d debug level [default:0] + -i inode no (hex) + -s [SIT dump segno from #1~#2 (decimal), for all 0~-1] + -a [SSA dump segno from #1~#2 (decimal), for all 0~-1] + +Examples:: + + # dump.f2fs -i [ino] /dev/sdx + # dump.f2fs -s 0~-1 /dev/sdx (SIT dump) + # dump.f2fs -a 0~-1 /dev/sdx (SSA dump) + +Note: please refer to the manpage of dump.f2fs(8) to get full option list. + +sload.f2fs +---------- +The sload.f2fs gives a way to insert files and directories in the exisiting disk +image. This tool is useful when building f2fs images given compiled files. + +Note: please refer to the manpage of sload.f2fs(8) to get full option list. + +resize.f2fs +----------- +The resize.f2fs lets a user resize the f2fs-formatted disk image, while preserving +all the files and directories stored in the image. + +Note: please refer to the manpage of resize.f2fs(8) to get full option list. + +defrag.f2fs +----------- +The defrag.f2fs can be used to defragment scattered written data as well as +filesystem metadata across the disk. This can improve the write speed by giving +more free consecutive space. + +Note: please refer to the manpage of defrag.f2fs(8) to get full option list. + +f2fs_io +------- +The f2fs_io is a simple tool to issue various filesystem APIs as well as +f2fs-specific ones, which is very useful for QA tests. + +Note: please refer to the manpage of f2fs_io(8) to get full option list. + +Design +====== + +On-disk Layout +-------------- + +F2FS divides the whole volume into a number of segments, each of which is fixed +to 2MB in size. A section is composed of consecutive segments, and a zone +consists of a set of sections. By default, section and zone sizes are set to one +segment size identically, but users can easily modify the sizes by mkfs. + +F2FS splits the entire volume into six areas, and all the areas except superblock +consist of multiple segments as described below:: + + align with the zone size <-| + |-> align with the segment size + _________________________________________________________________________ + | | | Segment | Node | Segment | | + | Superblock | Checkpoint | Info. | Address | Summary | Main | + | (SB) | (CP) | Table (SIT) | Table (NAT) | Area (SSA) | | + |____________|_____2______|______N______|______N______|______N_____|__N___| + . . + . . + . . + ._________________________________________. + |_Segment_|_..._|_Segment_|_..._|_Segment_| + . . + ._________._________ + |_section_|__...__|_ + . . + .________. + |__zone__| + +- Superblock (SB) + It is located at the beginning of the partition, and there exist two copies + to avoid file system crash. It contains basic partition information and some + default parameters of f2fs. + +- Checkpoint (CP) + It contains file system information, bitmaps for valid NAT/SIT sets, orphan + inode lists, and summary entries of current active segments. + +- Segment Information Table (SIT) + It contains segment information such as valid block count and bitmap for the + validity of all the blocks. + +- Node Address Table (NAT) + It is composed of a block address table for all the node blocks stored in + Main area. + +- Segment Summary Area (SSA) + It contains summary entries which contains the owner information of all the + data and node blocks stored in Main area. + +- Main Area + It contains file and directory data including their indices. + +In order to avoid misalignment between file system and flash-based storage, F2FS +aligns the start block address of CP with the segment size. Also, it aligns the +start block address of Main area with the zone size by reserving some segments +in SSA area. + +Reference the following survey for additional technical details. +https://wiki.linaro.org/WorkingGroups/Kernel/Projects/FlashCardSurvey + +File System Metadata Structure +------------------------------ + +F2FS adopts the checkpointing scheme to maintain file system consistency. At +mount time, F2FS first tries to find the last valid checkpoint data by scanning +CP area. In order to reduce the scanning time, F2FS uses only two copies of CP. +One of them always indicates the last valid data, which is called as shadow copy +mechanism. In addition to CP, NAT and SIT also adopt the shadow copy mechanism. + +For file system consistency, each CP points to which NAT and SIT copies are +valid, as shown as below:: + + +--------+----------+---------+ + | CP | SIT | NAT | + +--------+----------+---------+ + . . . . + . . . . + . . . . + +-------+-------+--------+--------+--------+--------+ + | CP #0 | CP #1 | SIT #0 | SIT #1 | NAT #0 | NAT #1 | + +-------+-------+--------+--------+--------+--------+ + | ^ ^ + | | | + `----------------------------------------' + +Index Structure +--------------- + +The key data structure to manage the data locations is a "node". Similar to +traditional file structures, F2FS has three types of node: inode, direct node, +indirect node. F2FS assigns 4KB to an inode block which contains 923 data block +indices, two direct node pointers, two indirect node pointers, and one double +indirect node pointer as described below. One direct node block contains 1018 +data blocks, and one indirect node block contains also 1018 node blocks. Thus, +one inode block (i.e., a file) covers:: + + 4KB * (923 + 2 * 1018 + 2 * 1018 * 1018 + 1018 * 1018 * 1018) := 3.94TB. + + Inode block (4KB) + |- data (923) + |- direct node (2) + | `- data (1018) + |- indirect node (2) + | `- direct node (1018) + | `- data (1018) + `- double indirect node (1) + `- indirect node (1018) + `- direct node (1018) + `- data (1018) + +Note that all the node blocks are mapped by NAT which means the location of +each node is translated by the NAT table. In the consideration of the wandering +tree problem, F2FS is able to cut off the propagation of node updates caused by +leaf data writes. + +Directory Structure +------------------- + +A directory entry occupies 11 bytes, which consists of the following attributes. + +- hash hash value of the file name +- ino inode number +- len the length of file name +- type file type such as directory, symlink, etc + +A dentry block consists of 214 dentry slots and file names. Therein a bitmap is +used to represent whether each dentry is valid or not. A dentry block occupies +4KB with the following composition. + +:: + + Dentry Block(4 K) = bitmap (27 bytes) + reserved (3 bytes) + + dentries(11 * 214 bytes) + file name (8 * 214 bytes) + + [Bucket] + +--------------------------------+ + |dentry block 1 | dentry block 2 | + +--------------------------------+ + . . + . . + . [Dentry Block Structure: 4KB] . + +--------+----------+----------+------------+ + | bitmap | reserved | dentries | file names | + +--------+----------+----------+------------+ + [Dentry Block: 4KB] . . + . . + . . + +------+------+-----+------+ + | hash | ino | len | type | + +------+------+-----+------+ + [Dentry Structure: 11 bytes] + +F2FS implements multi-level hash tables for directory structure. Each level has +a hash table with dedicated number of hash buckets as shown below. Note that +"A(2B)" means a bucket includes 2 data blocks. + +:: + + ---------------------- + A : bucket + B : block + N : MAX_DIR_HASH_DEPTH + ---------------------- + + level #0 | A(2B) + | + level #1 | A(2B) - A(2B) + | + level #2 | A(2B) - A(2B) - A(2B) - A(2B) + . | . . . . + level #N/2 | A(2B) - A(2B) - A(2B) - A(2B) - A(2B) - ... - A(2B) + . | . . . . + level #N | A(4B) - A(4B) - A(4B) - A(4B) - A(4B) - ... - A(4B) + +The number of blocks and buckets are determined by:: + + ,- 2, if n < MAX_DIR_HASH_DEPTH / 2, + # of blocks in level #n = | + `- 4, Otherwise + + ,- 2^(n + dir_level), + | if n + dir_level < MAX_DIR_HASH_DEPTH / 2, + # of buckets in level #n = | + `- 2^((MAX_DIR_HASH_DEPTH / 2) - 1), + Otherwise + +When F2FS finds a file name in a directory, at first a hash value of the file +name is calculated. Then, F2FS scans the hash table in level #0 to find the +dentry consisting of the file name and its inode number. If not found, F2FS +scans the next hash table in level #1. In this way, F2FS scans hash tables in +each levels incrementally from 1 to N. In each level F2FS needs to scan only +one bucket determined by the following equation, which shows O(log(# of files)) +complexity:: + + bucket number to scan in level #n = (hash value) % (# of buckets in level #n) + +In the case of file creation, F2FS finds empty consecutive slots that cover the +file name. F2FS searches the empty slots in the hash tables of whole levels from +1 to N in the same way as the lookup operation. + +The following figure shows an example of two cases holding children:: + + --------------> Dir <-------------- + | | + child child + + child - child [hole] - child + + child - child - child [hole] - [hole] - child + + Case 1: Case 2: + Number of children = 6, Number of children = 3, + File size = 7 File size = 7 + +Default Block Allocation +------------------------ + +At runtime, F2FS manages six active logs inside "Main" area: Hot/Warm/Cold node +and Hot/Warm/Cold data. + +- Hot node contains direct node blocks of directories. +- Warm node contains direct node blocks except hot node blocks. +- Cold node contains indirect node blocks +- Hot data contains dentry blocks +- Warm data contains data blocks except hot and cold data blocks +- Cold data contains multimedia data or migrated data blocks + +LFS has two schemes for free space management: threaded log and copy-and-compac- +tion. The copy-and-compaction scheme which is known as cleaning, is well-suited +for devices showing very good sequential write performance, since free segments +are served all the time for writing new data. However, it suffers from cleaning +overhead under high utilization. Contrarily, the threaded log scheme suffers +from random writes, but no cleaning process is needed. F2FS adopts a hybrid +scheme where the copy-and-compaction scheme is adopted by default, but the +policy is dynamically changed to the threaded log scheme according to the file +system status. + +In order to align F2FS with underlying flash-based storage, F2FS allocates a +segment in a unit of section. F2FS expects that the section size would be the +same as the unit size of garbage collection in FTL. Furthermore, with respect +to the mapping granularity in FTL, F2FS allocates each section of the active +logs from different zones as much as possible, since FTL can write the data in +the active logs into one allocation unit according to its mapping granularity. + +Cleaning process +---------------- + +F2FS does cleaning both on demand and in the background. On-demand cleaning is +triggered when there are not enough free segments to serve VFS calls. Background +cleaner is operated by a kernel thread, and triggers the cleaning job when the +system is idle. + +F2FS supports two victim selection policies: greedy and cost-benefit algorithms. +In the greedy algorithm, F2FS selects a victim segment having the smallest number +of valid blocks. In the cost-benefit algorithm, F2FS selects a victim segment +according to the segment age and the number of valid blocks in order to address +log block thrashing problem in the greedy algorithm. F2FS adopts the greedy +algorithm for on-demand cleaner, while background cleaner adopts cost-benefit +algorithm. + +In order to identify whether the data in the victim segment are valid or not, +F2FS manages a bitmap. Each bit represents the validity of a block, and the +bitmap is composed of a bit stream covering whole blocks in main area. + +Write-hint Policy +----------------- + +1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET. + +2) whint_mode=user-based. F2FS tries to pass down hints given by +users. + +===================== ======================== =================== +User F2FS Block +===================== ======================== =================== + META WRITE_LIFE_NOT_SET + HOT_NODE " + WARM_NODE " + COLD_NODE " +ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME +extension list " " + +-- buffered io +WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME +WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT +WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET +WRITE_LIFE_NONE " " +WRITE_LIFE_MEDIUM " " +WRITE_LIFE_LONG " " + +-- direct io +WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME +WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT +WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET +WRITE_LIFE_NONE " WRITE_LIFE_NONE +WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM +WRITE_LIFE_LONG " WRITE_LIFE_LONG +===================== ======================== =================== + +3) whint_mode=fs-based. F2FS passes down hints with its policy. + +===================== ======================== =================== +User F2FS Block +===================== ======================== =================== + META WRITE_LIFE_MEDIUM; + HOT_NODE WRITE_LIFE_NOT_SET + WARM_NODE " + COLD_NODE WRITE_LIFE_NONE +ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME +extension list " " + +-- buffered io +WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME +WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT +WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_LONG +WRITE_LIFE_NONE " " +WRITE_LIFE_MEDIUM " " +WRITE_LIFE_LONG " " + +-- direct io +WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME +WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT +WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET +WRITE_LIFE_NONE " WRITE_LIFE_NONE +WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM +WRITE_LIFE_LONG " WRITE_LIFE_LONG +===================== ======================== =================== + +Fallocate(2) Policy +------------------- + +The default policy follows the below POSIX rule. + +Allocating disk space + The default operation (i.e., mode is zero) of fallocate() allocates + the disk space within the range specified by offset and len. The + file size (as reported by stat(2)) will be changed if offset+len is + greater than the file size. Any subregion within the range specified + by offset and len that did not contain data before the call will be + initialized to zero. This default behavior closely resembles the + behavior of the posix_fallocate(3) library function, and is intended + as a method of optimally implementing that function. + +However, once F2FS receives ioctl(fd, F2FS_IOC_SET_PIN_FILE) in prior to +fallocate(fd, DEFAULT_MODE), it allocates on-disk block addressess having +zero or random data, which is useful to the below scenario where: + + 1. create(fd) + 2. ioctl(fd, F2FS_IOC_SET_PIN_FILE) + 3. fallocate(fd, 0, 0, size) + 4. address = fibmap(fd, offset) + 5. open(blkdev) + 6. write(blkdev, address) + +Compression implementation +-------------------------- + +- New term named cluster is defined as basic unit of compression, file can + be divided into multiple clusters logically. One cluster includes 4 << n + (n >= 0) logical pages, compression size is also cluster size, each of + cluster can be compressed or not. + +- In cluster metadata layout, one special block address is used to indicate + a cluster is a compressed one or normal one; for compressed cluster, following + metadata maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs + stores data including compress header and compressed data. + +- In order to eliminate write amplification during overwrite, F2FS only + support compression on write-once file, data can be compressed only when + all logical blocks in cluster contain valid data and compress ratio of + cluster data is lower than specified threshold. + +- To enable compression on regular inode, there are three ways: + + * chattr +c file + * chattr +c dir; touch dir/file + * mount w/ -o compress_extension=ext; touch file.ext + +Compress metadata layout:: + + [Dnode Structure] + +-----------------------------------------------+ + | cluster 1 | cluster 2 | ......... | cluster N | + +-----------------------------------------------+ + . . . . + . . . . + . Compressed Cluster . . Normal Cluster . + +----------+---------+---------+---------+ +---------+---------+---------+---------+ + |compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 | + +----------+---------+---------+---------+ +---------+---------+---------+---------+ + . . + . . + . . + +-------------+-------------+----------+----------------------------+ + | data length | data chksum | reserved | compressed data | + +-------------+-------------+----------+----------------------------+ + +Compression mode +-------------------------- + +f2fs supports "fs" and "user" compression modes with "compression_mode" mount option. +With this option, f2fs provides a choice to select the way how to compress the +compression enabled files (refer to "Compression implementation" section for how to +enable compression on a regular inode). + +1) compress_mode=fs +This is the default option. f2fs does automatic compression in the writeback of the +compression enabled files. + +2) compress_mode=user +This disables the automaic compression and gives the user discretion of choosing the +target file and the timing. The user can do manual compression/decompression on the +compression enabled files using F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE +ioctls like the below. + +To decompress a file, + +fd = open(filename, O_WRONLY, 0); +ret = ioctl(fd, F2FS_IOC_DECOMPRESS_FILE); + +To compress a file, + +fd = open(filename, O_WRONLY, 0); +ret = ioctl(fd, F2FS_IOC_COMPRESS_FILE); + +NVMe Zoned Namespace devices +---------------------------- + +- ZNS defines a per-zone capacity which can be equal or less than the + zone-size. Zone-capacity is the number of usable blocks in the zone. + F2FS checks if zone-capacity is less than zone-size, if it is, then any + segment which starts after the zone-capacity is marked as not-free in + the free segment bitmap at initial mount time. These segments are marked + as permanently used so they are not allocated for writes and + consequently are not needed to be garbage collected. In case the + zone-capacity is not aligned to default segment size(2MB), then a segment + can start before the zone-capacity and span across zone-capacity boundary. + Such spanning segments are also considered as usable segments. All blocks + past the zone-capacity are considered unusable in these segments. diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 79eed58d9d69..1d2d3f879f6c 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -127,6 +127,16 @@ config F2FS_FS_LZ4 help Support LZ4 compress algorithm, if unsure, say Y. +config F2FS_FS_LZ4HC + bool "LZ4HC compression support" + depends on F2FS_FS_COMPRESSION + depends on F2FS_FS_LZ4 + select LZ4HC_COMPRESS + default y + help + Support LZ4HC compress algorithm, LZ4HC has compatible on-disk + layout with LZ4, if unsure, say Y. + config F2FS_FS_ZSTD bool "ZSTD compression support" depends on F2FS_FS_COMPRESSION diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index a4c6e9af4322..5cc43c7d2e9b 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -253,8 +253,14 @@ static const struct f2fs_compress_ops f2fs_lzo_ops = { #ifdef CONFIG_F2FS_FS_LZ4 static int lz4_init_compress_ctx(struct compress_ctx *cc) { - cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), - LZ4_MEM_COMPRESS, GFP_NOFS); + unsigned int size = LZ4_MEM_COMPRESS; + +#ifdef CONFIG_F2FS_FS_LZ4HC + if (F2FS_I(cc->inode)->i_compress_flag >> COMPRESS_LEVEL_OFFSET) + size = LZ4HC_MEM_COMPRESS; +#endif + + cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), size, GFP_NOFS); if (!cc->private) return -ENOMEM; @@ -273,10 +279,34 @@ static void lz4_destroy_compress_ctx(struct compress_ctx *cc) cc->private = NULL; } +#ifdef CONFIG_F2FS_FS_LZ4HC +static int lz4hc_compress_pages(struct compress_ctx *cc) +{ + unsigned char level = F2FS_I(cc->inode)->i_compress_flag >> + COMPRESS_LEVEL_OFFSET; + int len; + + if (level) + len = LZ4_compress_HC(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, level, cc->private); + else + len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, cc->private); + if (!len) + return -EAGAIN; + + cc->clen = len; + return 0; +} +#endif + static int lz4_compress_pages(struct compress_ctx *cc) { int len; +#ifdef CONFIG_F2FS_FS_LZ4HC + return lz4hc_compress_pages(cc); +#endif len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, cc->clen, cc->private); if (!len) @@ -326,8 +356,13 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc) ZSTD_CStream *stream; void *workspace; unsigned int workspace_size; + unsigned char level = F2FS_I(cc->inode)->i_compress_flag >> + COMPRESS_LEVEL_OFFSET; - params = ZSTD_getParams(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen, 0); + if (!level) + level = F2FS_ZSTD_DEFAULT_CLEVEL; + + params = ZSTD_getParams(level, cc->rlen, 0); workspace_size = ZSTD_CStreamWorkspaceBound(params.cParams); workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode), diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 322b51809a39..18b16deac2d1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -145,6 +145,7 @@ struct f2fs_mount_info { /* For compression */ unsigned char compress_algorithm; /* algorithm type */ unsigned char compress_log_size; /* cluster log size */ + unsigned char compress_level; /* compress level */ bool compress_chksum; /* compressed data chksum */ unsigned char compress_ext_cnt; /* extension count */ int compress_mode; /* compression mode */ @@ -734,6 +735,7 @@ struct f2fs_inode_info { atomic_t i_compr_blocks; /* # of compressed blocks */ unsigned char i_compress_algorithm; /* algorithm type */ unsigned char i_log_cluster_size; /* log of cluster size */ + unsigned char i_compress_level; /* compress level (lz4hc,zstd) */ unsigned short i_compress_flag; /* compress flag */ unsigned int i_cluster_size; /* cluster size */ }; @@ -1314,6 +1316,8 @@ struct compress_data { #define F2FS_COMPRESSED_PAGE_MAGIC 0xF5F2C000 +#define COMPRESS_LEVEL_OFFSET 8 + /* compress context */ struct compress_ctx { struct inode *inode; /* inode the context belong to */ @@ -3945,6 +3949,11 @@ static inline void set_compress_context(struct inode *inode) 1 << COMPRESS_CHKSUM : 0; F2FS_I(inode)->i_cluster_size = 1 << F2FS_I(inode)->i_log_cluster_size; + if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 && + F2FS_OPTION(sbi).compress_level) + F2FS_I(inode)->i_compress_flag |= + F2FS_OPTION(sbi).compress_level << + COMPRESS_LEVEL_OFFSET; F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; set_inode_flag(inode, FI_COMPRESSED_FILE); stat_inc_compr_inode(inode); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e0c4d303ec0f..79da50891961 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include "f2fs.h" #include "node.h" @@ -461,6 +463,74 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, return 0; } +#ifdef CONFIG_F2FS_FS_COMPRESSION +#ifdef CONFIG_F2FS_FS_LZ4 +static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) +{ +#ifdef CONFIG_F2FS_FS_LZ4HC + unsigned int level; +#endif + + if (strlen(str) == 3) { + F2FS_OPTION(sbi).compress_level = 0; + return 0; + } + +#ifdef CONFIG_F2FS_FS_LZ4HC + str += 3; + + if (str[0] != ':') { + f2fs_info(sbi, "wrong format, e.g. :"); + return -EINVAL; + } + if (kstrtouint(str + 1, 10, &level)) + return -EINVAL; + + if (level < LZ4HC_MIN_CLEVEL || level > LZ4HC_MAX_CLEVEL) { + f2fs_info(sbi, "invalid lz4hc compress level: %d", level); + return -EINVAL; + } + + F2FS_OPTION(sbi).compress_level = level; + return 0; +#else + f2fs_info(sbi, "kernel doesn't support lz4hc compression"); + return -EINVAL; +#endif +} +#endif + +#ifdef CONFIG_F2FS_FS_ZSTD +static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) +{ + unsigned int level; + int len = 4; + + if (strlen(str) == len) { + F2FS_OPTION(sbi).compress_level = 0; + return 0; + } + + str += len; + + if (str[0] != ':') { + f2fs_info(sbi, "wrong format, e.g. :"); + return -EINVAL; + } + if (kstrtouint(str + 1, 10, &level)) + return -EINVAL; + + if (!level || level > ZSTD_maxCLevel()) { + f2fs_info(sbi, "invalid zstd compress level: %d", level); + return -EINVAL; + } + + F2FS_OPTION(sbi).compress_level = level; + return 0; +} +#endif +#endif + static int parse_options(struct super_block *sb, char *options, bool is_remount) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -873,20 +943,31 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) return -ENOMEM; if (!strcmp(name, "lzo")) { #ifdef CONFIG_F2FS_FS_LZO + F2FS_OPTION(sbi).compress_level = 0; F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO; #else f2fs_info(sbi, "kernel doesn't support lzo compression"); #endif - } else if (!strcmp(name, "lz4")) { + } else if (!strncmp(name, "lz4", 3)) { #ifdef CONFIG_F2FS_FS_LZ4 + ret = f2fs_set_lz4hc_level(sbi, name); + if (ret) { + kfree(name); + return -EINVAL; + } F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; #else f2fs_info(sbi, "kernel doesn't support lz4 compression"); #endif - } else if (!strcmp(name, "zstd")) { + } else if (!strncmp(name, "zstd", 4)) { #ifdef CONFIG_F2FS_FS_ZSTD + ret = f2fs_set_zstd_level(sbi, name); + if (ret) { + kfree(name); + return -EINVAL; + } F2FS_OPTION(sbi).compress_algorithm = COMPRESS_ZSTD; #else @@ -1543,6 +1624,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, } seq_printf(seq, ",compress_algorithm=%s", algtype); + if (F2FS_OPTION(sbi).compress_level) + seq_printf(seq, ":%d", F2FS_OPTION(sbi).compress_level); + seq_printf(seq, ",compress_log_size=%u", F2FS_OPTION(sbi).compress_log_size); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 7dc2a06cf19a..c6cc0a566ef5 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -274,6 +274,9 @@ struct f2fs_inode { __u8 i_compress_algorithm; /* compress algorithm */ __u8 i_log_cluster_size; /* log of cluster size */ __le16 i_compress_flag; /* compress flag */ + /* 0 bit: chksum flag + * [10,15] bits: compress level + */ __le32 i_extra_end[0]; /* for attribute size calculation */ } __packed; __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ From 124a3e77e02c064c7c8bfa4b86d736c752188984 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 9 Dec 2020 16:43:27 +0800 Subject: [PATCH 09/45] f2fs: introduce a new per-sb directory in sysfs Add a new directory 'stat' in path of /sys/fs/f2fs//, later we can add new readonly stat sysfs file into this directory, it will make directory less mess. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 +++- fs/f2fs/sysfs.c | 68 +++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 67 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 18b16deac2d1..e97d828e607b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1549,9 +1549,12 @@ struct f2fs_sb_info { unsigned int node_io_flag; /* For sysfs suppport */ - struct kobject s_kobj; + struct kobject s_kobj; /* /sys/fs/f2fs/ */ struct completion s_kobj_unregister; + struct kobject s_stat_kobj; /* /sys/fs/f2fs//stat */ + struct completion s_stat_kobj_unregister; + /* For shrinker support */ struct list_head s_list; int s_ndevs; /* number of devices */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index db9c49f9f0de..e670634a8978 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -709,6 +709,10 @@ static struct attribute *f2fs_feat_attrs[] = { NULL, }; +static struct attribute *f2fs_stat_attrs[] = { + NULL, +}; + static const struct sysfs_ops f2fs_attr_ops = { .show = f2fs_attr_show, .store = f2fs_attr_store, @@ -737,6 +741,44 @@ static struct kobject f2fs_feat = { .kset = &f2fs_kset, }; +static ssize_t f2fs_stat_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_stat_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_stat_kobj_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_stat_kobj); + complete(&sbi->s_stat_kobj_unregister); +} + +static const struct sysfs_ops f2fs_stat_attr_ops = { + .show = f2fs_stat_attr_show, + .store = f2fs_stat_attr_store, +}; + +static struct kobj_type f2fs_stat_ktype = { + .default_attrs = f2fs_stat_attrs, + .sysfs_ops = &f2fs_stat_attr_ops, + .release = f2fs_stat_kobj_release, +}; + static int __maybe_unused segment_info_seq_show(struct seq_file *seq, void *offset) { @@ -943,11 +985,15 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, "%s", sb->s_id); - if (err) { - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); - return err; - } + if (err) + goto put_sb_kobj; + + sbi->s_stat_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_stat_kobj_unregister); + err = kobject_init_and_add(&sbi->s_stat_kobj, &f2fs_stat_ktype, + &sbi->s_kobj, "stat"); + if (err) + goto put_stat_kobj; if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -963,6 +1009,13 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) victim_bits_seq_show, sb); } return 0; +put_stat_kobj: + kobject_put(&sbi->s_stat_kobj); + wait_for_completion(&sbi->s_stat_kobj_unregister); +put_sb_kobj: + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); + return err; } void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) @@ -974,6 +1027,11 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) remove_proc_entry("victim_bits", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } + + kobject_del(&sbi->s_stat_kobj); + kobject_put(&sbi->s_stat_kobj); + wait_for_completion(&sbi->s_stat_kobj_unregister); + kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); From 03e6a4a94f47a370c065c59a72ec89bee4240730 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 14 Dec 2020 17:20:57 +0800 Subject: [PATCH 10/45] f2fs: fix to tag FIEMAP_EXTENT_MERGED in f2fs_fiemap() f2fs does not natively support extents in metadata, 'extent' in f2fs is used as a virtual concept, so in f2fs_fiemap() interface, it needs to tag FIEMAP_EXTENT_MERGED flag to indicated the extent status is a result of merging. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9b1b76de66b3..07774b574f54 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1915,6 +1915,7 @@ next: } if (size) { + flags |= FIEMAP_EXTENT_MERGED; if (IS_ENCRYPTED(inode)) flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; From 569a20e378675f18f54619fd7f6e8466d9df3033 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 16 Dec 2020 17:15:23 +0800 Subject: [PATCH 11/45] f2fs: fix out-of-repair __setattr_copy() __setattr_copy() was copied from setattr_copy() in fs/attr.c, there is two missing patches doesn't cover this inner function, fix it. Commit 7fa294c8991c ("userns: Allow chown and setgid preservation") Commit 23adbe12ef7d ("fs,userns: Change inode_capable to capable_wrt_inode_uidgid") Fixes: fbfa2cc58d53 ("f2fs: add file operations") Cc: stable@vger.kernel.org Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 09507d7c4c00..13c00f200029 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -873,7 +873,8 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + if (!in_group_p(inode->i_gid) && + !capable_wrt_inode_uidgid(inode, CAP_FSETID)) mode &= ~S_ISGID; set_acl_inode(inode, mode); } From 4e99026b3dc5ffdf1a7f15c2ec209300c00dcd88 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 30 Dec 2020 16:38:35 +0800 Subject: [PATCH 12/45] f2fs: trival cleanup in move_data_block() Trival cleanups: - relocate set_summary() before its use - relocate "allocate block address" to correct place - remove unneeded f2fs_wait_on_page_writeback() Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 3ef84e6ded41..39330ad3c44e 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1169,8 +1169,6 @@ static int move_data_block(struct inode *inode, block_t bidx, if (err) goto put_out; - set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); - /* read page */ fio.page = page; fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; @@ -1207,6 +1205,9 @@ static int move_data_block(struct inode *inode, block_t bidx, } } + set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); + + /* allocate block address */ f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, &sum, type, NULL); @@ -1233,9 +1234,6 @@ static int move_data_block(struct inode *inode, block_t bidx, set_page_writeback(fio.encrypted_page); ClearPageError(page); - /* allocate block address */ - f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true); - fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC; fio.new_blkaddr = newaddr; From 314ed3b1794b79002c30feb512da00ff6a69f286 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 4 Jan 2021 22:33:02 -0800 Subject: [PATCH 13/45] f2fs: clean up post-read processing Rework the post-read processing logic to be much easier to understand. At least one bug is fixed by this: if an I/O error occurred when reading from disk, decryption and verity would be performed on the uninitialized data, causing misleading messages in the kernel log. Signed-off-by: Eric Biggers Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 149 ++++++++++++----- fs/f2fs/data.c | 387 ++++++++++++++++++--------------------------- fs/f2fs/f2fs.h | 55 ++++++- 3 files changed, 312 insertions(+), 279 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 5cc43c7d2e9b..c6b1bfacb6d8 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -727,38 +727,27 @@ out: return ret; } -void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) +static void f2fs_decompress_cluster(struct decompress_io_ctx *dic) { - struct decompress_io_ctx *dic = - (struct decompress_io_ctx *)page_private(page); struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); - struct f2fs_inode_info *fi= F2FS_I(dic->inode); + struct f2fs_inode_info *fi = F2FS_I(dic->inode); const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; int ret; int i; - dec_page_count(sbi, F2FS_RD_DATA); - - if (bio->bi_status || PageError(page)) - dic->failed = true; - - if (atomic_dec_return(&dic->pending_pages)) - return; - trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx, dic->cluster_size, fi->i_compress_algorithm); - /* submit partial compressed pages */ if (dic->failed) { ret = -EIO; - goto out_free_dic; + goto out_end_io; } dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); if (!dic->tpages) { ret = -ENOMEM; - goto out_free_dic; + goto out_end_io; } for (i = 0; i < dic->cluster_size; i++) { @@ -770,20 +759,20 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) dic->tpages[i] = f2fs_compress_alloc_page(); if (!dic->tpages[i]) { ret = -ENOMEM; - goto out_free_dic; + goto out_end_io; } } if (cops->init_decompress_ctx) { ret = cops->init_decompress_ctx(dic); if (ret) - goto out_free_dic; + goto out_end_io; } dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); if (!dic->rbuf) { ret = -ENOMEM; - goto destroy_decompress_ctx; + goto out_destroy_decompress_ctx; } dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); @@ -822,18 +811,34 @@ out_vunmap_cbuf: vm_unmap_ram(dic->cbuf, dic->nr_cpages); out_vunmap_rbuf: vm_unmap_ram(dic->rbuf, dic->cluster_size); -destroy_decompress_ctx: +out_destroy_decompress_ctx: if (cops->destroy_decompress_ctx) cops->destroy_decompress_ctx(dic); -out_free_dic: - if (!verity) - f2fs_decompress_end_io(dic->rpages, dic->cluster_size, - ret, false); - +out_end_io: trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx, dic->clen, ret); - if (!verity) - f2fs_free_dic(dic); + f2fs_decompress_end_io(dic, ret); +} + +/* + * This is called when a page of a compressed cluster has been read from disk + * (or failed to be read from disk). It checks whether this page was the last + * page being waited on in the cluster, and if so, it decompresses the cluster + * (or in the case of a failure, cleans up without actually decompressing). + */ +void f2fs_end_read_compressed_page(struct page *page, bool failed) +{ + struct decompress_io_ctx *dic = + (struct decompress_io_ctx *)page_private(page); + struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); + + dec_page_count(sbi, F2FS_RD_DATA); + + if (failed) + WRITE_ONCE(dic->failed, true); + + if (atomic_dec_and_test(&dic->remaining_pages)) + f2fs_decompress_cluster(dic); } static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index) @@ -1500,6 +1505,8 @@ destroy_out: return err; } +static void f2fs_free_dic(struct decompress_io_ctx *dic); + struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) { struct decompress_io_ctx *dic; @@ -1518,12 +1525,14 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->magic = F2FS_COMPRESSED_PAGE_MAGIC; dic->inode = cc->inode; - atomic_set(&dic->pending_pages, cc->nr_cpages); + atomic_set(&dic->remaining_pages, cc->nr_cpages); dic->cluster_idx = cc->cluster_idx; dic->cluster_size = cc->cluster_size; dic->log_cluster_size = cc->log_cluster_size; dic->nr_cpages = cc->nr_cpages; + refcount_set(&dic->refcnt, 1); dic->failed = false; + dic->need_verity = f2fs_need_verity(cc->inode, start_idx); for (i = 0; i < dic->cluster_size; i++) dic->rpages[i] = cc->rpages[i]; @@ -1552,7 +1561,7 @@ out_free: return ERR_PTR(-ENOMEM); } -void f2fs_free_dic(struct decompress_io_ctx *dic) +static void f2fs_free_dic(struct decompress_io_ctx *dic) { int i; @@ -1580,30 +1589,88 @@ void f2fs_free_dic(struct decompress_io_ctx *dic) kmem_cache_free(dic_entry_slab, dic); } -void f2fs_decompress_end_io(struct page **rpages, - unsigned int cluster_size, bool err, bool verity) +static void f2fs_put_dic(struct decompress_io_ctx *dic) +{ + if (refcount_dec_and_test(&dic->refcnt)) + f2fs_free_dic(dic); +} + +/* + * Update and unlock the cluster's pagecache pages, and release the reference to + * the decompress_io_ctx that was being held for I/O completion. + */ +static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) { int i; - for (i = 0; i < cluster_size; i++) { - struct page *rpage = rpages[i]; + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; if (!rpage) continue; - if (err || PageError(rpage)) - goto clear_uptodate; - - if (!verity || fsverity_verify_page(rpage)) { + /* PG_error was set if verity failed. */ + if (failed || PageError(rpage)) { + ClearPageUptodate(rpage); + /* will re-read again later */ + ClearPageError(rpage); + } else { SetPageUptodate(rpage); - goto unlock; } -clear_uptodate: - ClearPageUptodate(rpage); - ClearPageError(rpage); -unlock: unlock_page(rpage); } + + f2fs_put_dic(dic); +} + +static void f2fs_verify_cluster(struct work_struct *work) +{ + struct decompress_io_ctx *dic = + container_of(work, struct decompress_io_ctx, verity_work); + int i; + + /* Verify the cluster's decompressed pages with fs-verity. */ + for (i = 0; i < dic->cluster_size; i++) { + struct page *rpage = dic->rpages[i]; + + if (rpage && !fsverity_verify_page(rpage)) + SetPageError(rpage); + } + + __f2fs_decompress_end_io(dic, false); +} + +/* + * This is called when a compressed cluster has been decompressed + * (or failed to be read and/or decompressed). + */ +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) +{ + if (!failed && dic->need_verity) { + /* + * Note that to avoid deadlocks, the verity work can't be done + * on the decompression workqueue. This is because verifying + * the data pages can involve reading metadata pages from the + * file, and these metadata pages may be compressed. + */ + INIT_WORK(&dic->verity_work, f2fs_verify_cluster); + fsverity_enqueue_verify_work(&dic->verity_work); + } else { + __f2fs_decompress_end_io(dic, failed); + } +} + +/* + * Put a reference to a compressed page's decompress_io_ctx. + * + * This is called when the page is no longer needed and can be freed. + */ +void f2fs_put_page_dic(struct page *page) +{ + struct decompress_io_ctx *dic = + (struct decompress_io_ctx *)page_private(page); + + f2fs_put_dic(dic); } int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 07774b574f54..1cd764da3247 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -113,10 +113,21 @@ static enum count_type __read_io_type(struct page *page) /* postprocessing steps for read bios */ enum bio_post_read_step { - STEP_DECRYPT, - STEP_DECOMPRESS_NOWQ, /* handle normal cluster data inplace */ - STEP_DECOMPRESS, /* handle compressed cluster data in workqueue */ - STEP_VERITY, +#ifdef CONFIG_FS_ENCRYPTION + STEP_DECRYPT = 1 << 0, +#else + STEP_DECRYPT = 0, /* compile out the decryption-related code */ +#endif +#ifdef CONFIG_F2FS_FS_COMPRESSION + STEP_DECOMPRESS = 1 << 1, +#else + STEP_DECOMPRESS = 0, /* compile out the decompression-related code */ +#endif +#ifdef CONFIG_FS_VERITY + STEP_VERITY = 1 << 2, +#else + STEP_VERITY = 0, /* compile out the verity-related code */ +#endif }; struct bio_post_read_ctx { @@ -126,25 +137,26 @@ struct bio_post_read_ctx { unsigned int enabled_steps; }; -static void __read_end_io(struct bio *bio, bool compr, bool verity) +static void f2fs_finish_read_bio(struct bio *bio) { - struct page *page; struct bio_vec *bv; - int i; + int iter_all; - bio_for_each_segment_all(bv, bio, i) { - page = bv->bv_page; + /* + * Update and unlock the bio's pagecache pages, and put the + * decompression context for any compressed pages. + */ + bio_for_each_segment_all(bv, bio, iter_all) { + struct page *page = bv->bv_page; -#ifdef CONFIG_F2FS_FS_COMPRESSION - if (compr && f2fs_is_compressed_page(page)) { - f2fs_decompress_pages(bio, page, verity); + if (f2fs_is_compressed_page(page)) { + if (bio->bi_status) + f2fs_end_read_compressed_page(page, true); + f2fs_put_page_dic(page); continue; } - if (verity) - continue; -#endif - /* PG_error was set if any post_read step failed */ + /* PG_error was set if decryption or verity failed. */ if (bio->bi_status || PageError(page)) { ClearPageUptodate(page); /* will re-read again later */ @@ -155,106 +167,104 @@ static void __read_end_io(struct bio *bio, bool compr, bool verity) dec_page_count(F2FS_P_SB(page), __read_io_type(page)); unlock_page(page); } + + if (bio->bi_private) + mempool_free(bio->bi_private, bio_post_read_ctx_pool); + bio_put(bio); } -static void f2fs_release_read_bio(struct bio *bio); -static void __f2fs_read_end_io(struct bio *bio, bool compr, bool verity) -{ - if (!compr) - __read_end_io(bio, false, verity); - f2fs_release_read_bio(bio); -} - -static void f2fs_decompress_bio(struct bio *bio, bool verity) -{ - __read_end_io(bio, true, verity); -} - -static void bio_post_read_processing(struct bio_post_read_ctx *ctx); - -static void f2fs_decrypt_work(struct bio_post_read_ctx *ctx) -{ - fscrypt_decrypt_bio(ctx->bio); -} - -static void f2fs_decompress_work(struct bio_post_read_ctx *ctx) -{ - f2fs_decompress_bio(ctx->bio, ctx->enabled_steps & (1 << STEP_VERITY)); -} - -#ifdef CONFIG_F2FS_FS_COMPRESSION -static void f2fs_verify_pages(struct page **rpages, unsigned int cluster_size) -{ - f2fs_decompress_end_io(rpages, cluster_size, false, true); -} - -static void f2fs_verify_bio(struct bio *bio) -{ - struct bio_vec *bv; - int i; - - bio_for_each_segment_all(bv, bio, i) { - struct page *page = bv->bv_page; - struct decompress_io_ctx *dic; - - dic = (struct decompress_io_ctx *)page_private(page); - - if (dic) { - if (atomic_dec_return(&dic->verity_pages)) - continue; - f2fs_verify_pages(dic->rpages, - dic->cluster_size); - f2fs_free_dic(dic); - continue; - } - - if (bio->bi_status || PageError(page)) - goto clear_uptodate; - - if (fsverity_verify_page(page)) { - SetPageUptodate(page); - goto unlock; - } -clear_uptodate: - ClearPageUptodate(page); - ClearPageError(page); -unlock: - dec_page_count(F2FS_P_SB(page), __read_io_type(page)); - unlock_page(page); - } -} -#endif - -static void f2fs_verity_work(struct work_struct *work) +static void f2fs_verify_bio(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; -#ifdef CONFIG_F2FS_FS_COMPRESSION - unsigned int enabled_steps = ctx->enabled_steps; -#endif + bool may_have_compressed_pages = (ctx->enabled_steps & STEP_DECOMPRESS); /* * fsverity_verify_bio() may call readpages() again, and while verity - * will be disabled for this, decryption may still be needed, resulting - * in another bio_post_read_ctx being allocated. So to prevent - * deadlocks we need to release the current ctx to the mempool first. - * This assumes that verity is the last post-read step. + * will be disabled for this, decryption and/or decompression may still + * be needed, resulting in another bio_post_read_ctx being allocated. + * So to prevent deadlocks we need to release the current ctx to the + * mempool first. This assumes that verity is the last post-read step. */ mempool_free(ctx, bio_post_read_ctx_pool); bio->bi_private = NULL; -#ifdef CONFIG_F2FS_FS_COMPRESSION - /* previous step is decompression */ - if (enabled_steps & (1 << STEP_DECOMPRESS)) { - f2fs_verify_bio(bio); - f2fs_release_read_bio(bio); - return; - } -#endif + /* + * Verify the bio's pages with fs-verity. Exclude compressed pages, + * as those were handled separately by f2fs_end_read_compressed_page(). + */ + if (may_have_compressed_pages) { + struct bio_vec *bv; + int iter_all; - fsverity_verify_bio(bio); - __f2fs_read_end_io(bio, false, false); + bio_for_each_segment_all(bv, bio, iter_all) { + struct page *page = bv->bv_page; + + if (!f2fs_is_compressed_page(page) && + !PageError(page) && !fsverity_verify_page(page)) + SetPageError(page); + } + } else { + fsverity_verify_bio(bio); + } + + f2fs_finish_read_bio(bio); +} + +/* + * If the bio's data needs to be verified with fs-verity, then enqueue the + * verity work for the bio. Otherwise finish the bio now. + * + * Note that to avoid deadlocks, the verity work can't be done on the + * decryption/decompression workqueue. This is because verifying the data pages + * can involve reading verity metadata pages from the file, and these verity + * metadata pages may be encrypted and/or compressed. + */ +static void f2fs_verify_and_finish_bio(struct bio *bio) +{ + struct bio_post_read_ctx *ctx = bio->bi_private; + + if (ctx && (ctx->enabled_steps & STEP_VERITY)) { + INIT_WORK(&ctx->work, f2fs_verify_bio); + fsverity_enqueue_verify_work(&ctx->work); + } else { + f2fs_finish_read_bio(bio); + } +} + +/* + * Handle STEP_DECOMPRESS by decompressing any compressed clusters whose last + * remaining page was read by @ctx->bio. + * + * Note that a bio may span clusters (even a mix of compressed and uncompressed + * clusters) or be for just part of a cluster. STEP_DECOMPRESS just indicates + * that the bio includes at least one compressed page. The actual decompression + * is done on a per-cluster basis, not a per-bio basis. + */ +static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx) +{ + struct bio_vec *bv; + int iter_all; + bool all_compressed = true; + + bio_for_each_segment_all(bv, ctx->bio, iter_all) { + struct page *page = bv->bv_page; + + /* PG_error was set if decryption failed. */ + if (f2fs_is_compressed_page(page)) + f2fs_end_read_compressed_page(page, PageError(page)); + else + all_compressed = false; + } + + /* + * Optimization: if all the bio's pages are compressed, then scheduling + * the per-bio verity work is unnecessary, as verity will be fully + * handled at the compression cluster level. + */ + if (all_compressed) + ctx->enabled_steps &= ~STEP_VERITY; } static void f2fs_post_read_work(struct work_struct *work) @@ -262,88 +272,50 @@ static void f2fs_post_read_work(struct work_struct *work) struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); - if (ctx->enabled_steps & (1 << STEP_DECRYPT)) - f2fs_decrypt_work(ctx); + if (ctx->enabled_steps & STEP_DECRYPT) + fscrypt_decrypt_bio(ctx->bio); - if (ctx->enabled_steps & (1 << STEP_DECOMPRESS)) - f2fs_decompress_work(ctx); + if (ctx->enabled_steps & STEP_DECOMPRESS) + f2fs_handle_step_decompress(ctx); - if (ctx->enabled_steps & (1 << STEP_VERITY)) { - INIT_WORK(&ctx->work, f2fs_verity_work); - fsverity_enqueue_verify_work(&ctx->work); - return; - } - - __f2fs_read_end_io(ctx->bio, - ctx->enabled_steps & (1 << STEP_DECOMPRESS), false); -} - -static void f2fs_enqueue_post_read_work(struct f2fs_sb_info *sbi, - struct work_struct *work) -{ - queue_work(sbi->post_read_wq, work); -} - -static void bio_post_read_processing(struct bio_post_read_ctx *ctx) -{ - /* - * We use different work queues for decryption and for verity because - * verity may require reading metadata pages that need decryption, and - * we shouldn't recurse to the same workqueue. - */ - - if (ctx->enabled_steps & (1 << STEP_DECRYPT) || - ctx->enabled_steps & (1 << STEP_DECOMPRESS)) { - INIT_WORK(&ctx->work, f2fs_post_read_work); - f2fs_enqueue_post_read_work(ctx->sbi, &ctx->work); - return; - } - - if (ctx->enabled_steps & (1 << STEP_VERITY)) { - INIT_WORK(&ctx->work, f2fs_verity_work); - fsverity_enqueue_verify_work(&ctx->work); - return; - } - - __f2fs_read_end_io(ctx->bio, false, false); -} - -static bool f2fs_bio_post_read_required(struct bio *bio) -{ - return bio->bi_private; + f2fs_verify_and_finish_bio(ctx->bio); } static void f2fs_read_end_io(struct bio *bio) { struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); + struct bio_post_read_ctx *ctx = bio->bi_private; if (time_to_inject(sbi, FAULT_READ_IO)) { f2fs_show_injection_info(sbi, FAULT_READ_IO); bio->bi_status = BLK_STS_IOERR; } - if (f2fs_bio_post_read_required(bio)) { - struct bio_post_read_ctx *ctx = bio->bi_private; - - bio_post_read_processing(ctx); + if (bio->bi_status) { + f2fs_finish_read_bio(bio); return; } - __f2fs_read_end_io(bio, false, false); + if (ctx && (ctx->enabled_steps & (STEP_DECRYPT | STEP_DECOMPRESS))) { + INIT_WORK(&ctx->work, f2fs_post_read_work); + queue_work(ctx->sbi->post_read_wq, &ctx->work); + } else { + f2fs_verify_and_finish_bio(bio); + } } static void f2fs_write_end_io(struct bio *bio) { struct f2fs_sb_info *sbi = bio->bi_private; struct bio_vec *bvec; - int i; + int iter_all; if (time_to_inject(sbi, FAULT_WRITE_IO)) { f2fs_show_injection_info(sbi, FAULT_WRITE_IO); bio->bi_status = BLK_STS_IOERR; } - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; enum count_type type = WB_DATA_TYPE(page); @@ -565,7 +537,7 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode, struct page *page, nid_t ino) { struct bio_vec *bvec; - int i; + int iter_all; if (!bio) return false; @@ -573,7 +545,7 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode, if (!inode && !page && !ino) return true; - bio_for_each_segment_all(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, iter_all) { struct page *target = bvec->bv_page; if (fscrypt_is_bounce_page(target)) { @@ -976,16 +948,9 @@ out: up_write(&io->io_rwsem); } -static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) -{ - return fsverity_active(inode) && - idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); -} - static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, unsigned nr_pages, unsigned op_flag, - pgoff_t first_idx, bool for_write, - bool for_verity) + pgoff_t first_idx, bool for_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; @@ -1001,13 +966,19 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, bio_set_op_attrs(bio, REQ_OP_READ, op_flag); if (f2fs_encrypted_file(inode)) - post_read_steps |= 1 << STEP_DECRYPT; - if (f2fs_compressed_file(inode)) - post_read_steps |= 1 << STEP_DECOMPRESS_NOWQ; - if (for_verity && f2fs_need_verity(inode, first_idx)) - post_read_steps |= 1 << STEP_VERITY; + post_read_steps |= STEP_DECRYPT; - if (post_read_steps) { + if (f2fs_need_verity(inode, first_idx)) + post_read_steps |= STEP_VERITY; + + /* + * STEP_DECOMPRESS is handled specially, since a compressed file might + * contain both compressed and uncompressed clusters. We'll allocate a + * bio_post_read_ctx if the file is compressed, but the caller is + * responsible for enabling STEP_DECOMPRESS if it's actually needed. + */ + + if (post_read_steps || f2fs_compressed_file(inode)) { /* Due to the mempool, this never fails. */ ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); ctx->bio = bio; @@ -1019,13 +990,6 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, return bio; } -static void f2fs_release_read_bio(struct bio *bio) -{ - if (bio->bi_private) - mempool_free(bio->bi_private, bio_post_read_ctx_pool); - bio_put(bio); -} - /* This can handle encryption stuffs */ static int f2fs_submit_page_read(struct inode *inode, struct page *page, block_t blkaddr, int op_flags, bool for_write) @@ -1034,7 +998,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, struct bio *bio; bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags, - page->index, for_write, true); + page->index, for_write); if (IS_ERR(bio)) return PTR_ERR(bio); @@ -2072,7 +2036,7 @@ submit_and_realloc: if (bio == NULL) { bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, is_readahead ? REQ_RAHEAD : 0, page->index, - false, true); + false); if (IS_ERR(bio)) { ret = PTR_ERR(bio); bio = NULL; @@ -2118,8 +2082,6 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, sector_t last_block_in_file; const unsigned blocksize = blks_to_bytes(inode, 1); struct decompress_io_ctx *dic = NULL; - struct bio_post_read_ctx *ctx; - bool for_verity = false; int i; int ret = 0; @@ -2185,29 +2147,10 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, goto out_put_dnode; } - /* - * It's possible to enable fsverity on the fly when handling a cluster, - * which requires complicated error handling. Instead of adding more - * complexity, let's give a rule where end_io post-processes fsverity - * per cluster. In order to do that, we need to submit bio, if previous - * bio sets a different post-process policy. - */ - if (fsverity_active(cc->inode)) { - atomic_set(&dic->verity_pages, cc->nr_cpages); - for_verity = true; - - if (bio) { - ctx = bio->bi_private; - if (!(ctx->enabled_steps & (1 << STEP_VERITY))) { - __submit_bio(sbi, bio, DATA); - bio = NULL; - } - } - } - for (i = 0; i < dic->nr_cpages; i++) { struct page *page = dic->cpages[i]; block_t blkaddr; + struct bio_post_read_ctx *ctx; blkaddr = data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i + 1); @@ -2222,31 +2165,10 @@ submit_and_realloc: if (!bio) { bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages, is_readahead ? REQ_RAHEAD : 0, - page->index, for_write, for_verity); + page->index, for_write); if (IS_ERR(bio)) { - unsigned int remained = dic->nr_cpages - i; - bool release = false; - ret = PTR_ERR(bio); - dic->failed = true; - - if (for_verity) { - if (!atomic_sub_return(remained, - &dic->verity_pages)) - release = true; - } else { - if (!atomic_sub_return(remained, - &dic->pending_pages)) - release = true; - } - - if (release) { - f2fs_decompress_end_io(dic->rpages, - cc->cluster_size, true, - false); - f2fs_free_dic(dic); - } - + f2fs_decompress_end_io(dic, ret); f2fs_put_dnode(&dn); *bio_ret = NULL; return ret; @@ -2258,10 +2180,9 @@ submit_and_realloc: if (bio_add_page(bio, page, blocksize, 0) < blocksize) goto submit_and_realloc; - /* tag STEP_DECOMPRESS to handle IO in wq */ ctx = bio->bi_private; - if (!(ctx->enabled_steps & (1 << STEP_DECOMPRESS))) - ctx->enabled_steps |= 1 << STEP_DECOMPRESS; + ctx->enabled_steps |= STEP_DECOMPRESS; + refcount_inc(&dic->refcnt); inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE); @@ -2278,7 +2199,13 @@ submit_and_realloc: out_put_dnode: f2fs_put_dnode(&dn); out: - f2fs_decompress_end_io(cc->rpages, cc->cluster_size, true, false); + for (i = 0; i < cc->cluster_size; i++) { + if (cc->rpages[i]) { + ClearPageUptodate(cc->rpages[i]); + ClearPageError(cc->rpages[i]); + unlock_page(cc->rpages[i]); + } + } *bio_ret = bio; return ret; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e97d828e607b..537b9365db0e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1345,7 +1345,7 @@ struct compress_io_ctx { atomic_t pending_pages; /* in-flight compressed page count */ }; -/* decompress io context for read IO path */ +/* Context for decompressing one cluster on the read IO path */ struct decompress_io_ctx { u32 magic; /* magic number to indicate page is compressed */ struct inode *inode; /* inode the context belong to */ @@ -1361,11 +1361,37 @@ struct decompress_io_ctx { struct compress_data *cbuf; /* virtual mapped address on cpages */ size_t rlen; /* valid data length in rbuf */ size_t clen; /* valid data length in cbuf */ - atomic_t pending_pages; /* in-flight compressed page count */ - atomic_t verity_pages; /* in-flight page count for verity */ - bool failed; /* indicate IO error during decompression */ + + /* + * The number of compressed pages remaining to be read in this cluster. + * This is initially nr_cpages. It is decremented by 1 each time a page + * has been read (or failed to be read). When it reaches 0, the cluster + * is decompressed (or an error is reported). + * + * If an error occurs before all the pages have been submitted for I/O, + * then this will never reach 0. In this case the I/O submitter is + * responsible for calling f2fs_decompress_end_io() instead. + */ + atomic_t remaining_pages; + + /* + * Number of references to this decompress_io_ctx. + * + * One reference is held for I/O completion. This reference is dropped + * after the pagecache pages are updated and unlocked -- either after + * decompression (and verity if enabled), or after an error. + * + * In addition, each compressed page holds a reference while it is in a + * bio. These references are necessary prevent compressed pages from + * being freed while they are still in a bio. + */ + refcount_t refcnt; + + bool failed; /* IO error occurred before decompression? */ + bool need_verity; /* need fs-verity verification after decompression? */ void *private; /* payload buffer for specified decompression algorithm */ void *private2; /* extra payload buffer */ + struct work_struct verity_work; /* work to verify the decompressed pages */ }; #define NULL_CLUSTER ((unsigned int)(~0)) @@ -3894,7 +3920,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page); bool f2fs_is_compress_backend_ready(struct inode *inode); int f2fs_init_compress_mempool(void); void f2fs_destroy_compress_mempool(void); -void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity); +void f2fs_end_read_compressed_page(struct page *page, bool failed); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page); @@ -3907,9 +3933,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, bool is_readahead, bool for_write); struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); -void f2fs_free_dic(struct decompress_io_ctx *dic); -void f2fs_decompress_end_io(struct page **rpages, - unsigned int cluster_size, bool err, bool verity); +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed); +void f2fs_put_page_dic(struct page *page); int f2fs_init_compress_ctx(struct compress_ctx *cc); void f2fs_destroy_compress_ctx(struct compress_ctx *cc); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); @@ -3933,6 +3958,14 @@ static inline struct page *f2fs_compress_control_page(struct page *page) } static inline int f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } +static inline void f2fs_end_read_compressed_page(struct page *page, bool failed) +{ + WARN_ON_ONCE(1); +} +static inline void f2fs_put_page_dic(struct page *page) +{ + WARN_ON_ONCE(1); +} static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { } static inline int __init f2fs_init_compress_cache(void) { return 0; } @@ -4153,6 +4186,12 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, return false; } +static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) +{ + return fsverity_active(inode) && + idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); +} + #ifdef CONFIG_F2FS_FAULT_INJECTION extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, unsigned int type); From 3708f23a900521d1c57202b36f47ab0dc9efe21a Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Wed, 6 Jan 2021 08:49:28 +0900 Subject: [PATCH 14/45] f2fs: fix null page reference in redirty_blocks By Colin's static analysis, we found out there is a null page reference under low memory situation in redirty_blocks. I've made the page finding loop stop immediately and return an error not to cause further memory pressure when we run into a failure to find a page under low memory condition. Signed-off-by: Daeho Jeong Reported-by: Colin Ian King Fixes: 5fdb322ff2c2 ("f2fs: add F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE") Reviewed-by: Colin Ian King Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 13c00f200029..229ebe13294a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4080,8 +4080,10 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len) for (i = 0; i < page_len; i++, redirty_idx++) { page = find_lock_page(mapping, redirty_idx); - if (!page) - ret = -ENOENT; + if (!page) { + ret = -ENOMEM; + break; + } set_page_dirty(page); f2fs_put_page(page, 1); f2fs_put_page(page, 0); From 7383fd9bc8810390f9d4ea1909b825fbb224097f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 12 Jan 2021 09:55:09 +0800 Subject: [PATCH 15/45] f2fs: fix to set/clear I_LINKABLE under i_lock fsstress + fault injection test case reports a warning message as below: WARNING: CPU: 13 PID: 6226 at fs/inode.c:361 inc_nlink+0x32/0x40 Call Trace: f2fs_init_inode_metadata+0x25c/0x4a0 [f2fs] f2fs_add_inline_entry+0x153/0x3b0 [f2fs] f2fs_add_dentry+0x75/0x80 [f2fs] f2fs_do_add_link+0x108/0x160 [f2fs] f2fs_rename2+0x6ab/0x14f0 [f2fs] vfs_rename+0x70c/0x940 do_renameat2+0x4d8/0x4f0 __x64_sys_renameat2+0x4b/0x60 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Following race case can cause this: Thread A Kworker - f2fs_rename - f2fs_create_whiteout - __f2fs_tmpfile - f2fs_i_links_write - f2fs_mark_inode_dirty_sync - mark_inode_dirty_sync - writeback_single_inode - __writeback_single_inode - spin_lock(&inode->i_lock) - inode->i_state |= I_LINKABLE - inode->i_state &= ~dirty - spin_unlock(&inode->i_lock) - f2fs_add_link - f2fs_do_add_link - f2fs_add_dentry - f2fs_add_inline_entry - f2fs_init_inode_metadata - f2fs_i_links_write - inc_nlink - WARN_ON(!(inode->i_state & I_LINKABLE)) Fix to add i_lock to avoid i_state update race condition. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 70a8e516fd32..db5d6c666ea2 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -850,7 +850,11 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, if (whiteout) { f2fs_i_links_write(inode, false); + + spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; + spin_unlock(&inode->i_lock); + *whiteout = inode; } else { d_tmpfile(dentry, inode); @@ -1036,7 +1040,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, err = f2fs_add_link(old_dentry, whiteout); if (err) goto put_out_dir; + + spin_lock(&whiteout->i_lock); whiteout->i_state &= ~I_LINKABLE; + spin_unlock(&whiteout->i_lock); + iput(whiteout); } From d0b5341c90c294524434442c597504be2bec3674 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 28 Dec 2020 15:25:29 -0800 Subject: [PATCH 16/45] libfs: unexport generic_ci_d_compare() and generic_ci_d_hash() Now that generic_set_encrypted_ci_d_ops() has been added and ext4 and f2fs are using it, it's no longer necessary to export generic_ci_d_compare() and generic_ci_d_hash() to filesystems. Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/libfs.c | 8 +++----- include/linux/fs.h | 5 ----- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index b82041854991..5743ce49a683 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1277,8 +1277,8 @@ static bool needs_casefold(const struct inode *dir) * * Return: 0 if names match, 1 if mismatch, or -ERRNO */ -int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, - const char *str, const struct qstr *name) +static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name) { const struct dentry *parent = READ_ONCE(dentry->d_parent); const struct inode *dir = READ_ONCE(parent->d_inode); @@ -1315,7 +1315,6 @@ fallback: return 1; return !!memcmp(str, name->name, len); } -EXPORT_SYMBOL(generic_ci_d_compare); /** * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems @@ -1324,7 +1323,7 @@ EXPORT_SYMBOL(generic_ci_d_compare); * * Return: 0 if hash was successful or unchanged, and -EINVAL on error */ -int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) +static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) { const struct inode *dir = READ_ONCE(dentry->d_inode); struct super_block *sb = dentry->d_sb; @@ -1339,7 +1338,6 @@ int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) return -EINVAL; return 0; } -EXPORT_SYMBOL(generic_ci_d_hash); static const struct dentry_operations generic_ci_dentry_ops = { .d_hash = generic_ci_d_hash, diff --git a/include/linux/fs.h b/include/linux/fs.h index 24e5bc507eca..a54ccfb9ea4a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3251,11 +3251,6 @@ extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); -#ifdef CONFIG_UNICODE -extern int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str); -extern int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, - const char *str, const struct qstr *name); -#endif extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry); #ifdef CONFIG_MIGRATION From 256b72b2944b247e2c5ab6fa82120d83f08987d0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 11 Jan 2021 17:42:53 +0800 Subject: [PATCH 17/45] f2fs: compress: fix potential deadlock generic/269 reports a hangtask issue, the root cause is ABBA deadlock described as below: Thread A Thread B - down_write(&sbi->gc_lock) -- A - f2fs_write_data_pages - lock all pages in cluster -- B - f2fs_write_multi_pages - f2fs_write_raw_pages - f2fs_write_single_data_page - f2fs_balance_fs - down_write(&sbi->gc_lock) -- A - f2fs_gc - do_garbage_collect - ra_data_block - pagecache_get_page -- B To fix this, it needs to avoid calling f2fs_balance_fs() if there is still cluster pages been locked in context of cluster writeback, so instead, let's call f2fs_balance_fs() in the end of f2fs_write_raw_pages() when all cluster pages were unlocked. Fixes: 4c8ff7095bef ("f2fs: support data compression") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 5 ++++- fs/f2fs/data.c | 10 ++++++---- fs/f2fs/f2fs.h | 2 +- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index c6b1bfacb6d8..f2918a62989c 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1426,7 +1426,7 @@ retry_write: ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted, NULL, NULL, wbc, io_type, - compr_blocks); + compr_blocks, false); if (ret) { if (ret == AOP_WRITEPAGE_ACTIVATE) { unlock_page(cc->rpages[i]); @@ -1461,6 +1461,9 @@ retry_write: *submitted += _submitted; } + + f2fs_balance_fs(F2FS_M_SB(mapping), true); + return 0; out_err: for (++i; i < cc->cluster_size; i++) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1cd764da3247..36efd7771116 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2623,7 +2623,8 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, sector_t *last_block, struct writeback_control *wbc, enum iostat_type io_type, - int compr_blocks) + int compr_blocks, + bool allow_balance) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -2761,7 +2762,7 @@ out: } unlock_page(page); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && - !F2FS_I(inode)->cp_task) + !F2FS_I(inode)->cp_task && allow_balance) f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { @@ -2808,7 +2809,7 @@ out: #endif return f2fs_write_single_data_page(page, NULL, NULL, NULL, - wbc, FS_DATA_IO, 0); + wbc, FS_DATA_IO, 0, true); } /* @@ -2978,7 +2979,8 @@ continue_unlock: } #endif ret = f2fs_write_single_data_page(page, &submitted, - &bio, &last_block, wbc, io_type, 0); + &bio, &last_block, wbc, io_type, + 0, true); if (ret == AOP_WRITEPAGE_ACTIVATE) unlock_page(page); #ifdef CONFIG_F2FS_FS_COMPRESSION diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 537b9365db0e..a4678bb18d8c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3513,7 +3513,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, struct bio **bio, sector_t *last_block, struct writeback_control *wbc, enum iostat_type io_type, - int compr_blocks); + int compr_blocks, bool allow_balance); void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length); int f2fs_release_page(struct page *page, gfp_t wait); From 79ac6fa9f50da225d0d96b03c9f7a1bd5989cc44 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Wed, 13 Jan 2021 13:21:54 +0800 Subject: [PATCH 18/45] f2fs: fix to use per-inode maxbytes F2FS inode may have different max size, e.g. compressed file have less blkaddr entries in all its direct-node blocks, result in being with less max filesize. So change to use per-inode maxbytes. Suggested-by: Chao Yu Signed-off-by: Chengguang Xu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 9 ++++++--- fs/f2fs/super.c | 12 ++++++++---- 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 36efd7771116..0095a1d52722 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3715,7 +3715,7 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) filemap_write_and_wait(mapping); /* Block number less than F2FS MAX BLOCKS */ - if (unlikely(block >= F2FS_I_SB(inode)->max_file_blocks)) + if (unlikely(block >= max_file_blocks(inode))) goto out; if (f2fs_compressed_file(inode)) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a4678bb18d8c..d9353d976c41 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1478,7 +1478,6 @@ struct f2fs_sb_info { unsigned int total_sections; /* total section count */ unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ - loff_t max_file_blocks; /* max block index of file */ int dir_level; /* directory level */ int readdir_ra; /* readahead inode in readdir */ u64 max_io_bytes; /* max io bytes to merge IOs */ @@ -3275,6 +3274,7 @@ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); int f2fs_quota_sync(struct super_block *sb, int type); +loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 229ebe13294a..bd6bf9ebf333 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -505,6 +505,9 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; loff_t maxbytes = inode->i_sb->s_maxbytes; + if (f2fs_compressed_file(inode)) + maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS; + switch (whence) { case SEEK_SET: case SEEK_CUR: @@ -689,7 +692,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) free_from = (pgoff_t)F2FS_BLK_ALIGN(from); - if (free_from >= sbi->max_file_blocks) + if (free_from >= max_file_blocks(inode)) goto free_partial; if (lock) @@ -2770,7 +2773,7 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) return -EINVAL; if (unlikely((range.start + range.len) >> PAGE_SHIFT > - sbi->max_file_blocks)) + max_file_blocks(inode))) return -EINVAL; err = mnt_want_write_file(filp); @@ -3333,7 +3336,7 @@ int f2fs_precache_extents(struct inode *inode) map.m_next_extent = &m_next_extent; map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; - end = F2FS_I_SB(inode)->max_file_blocks; + end = max_file_blocks(inode); while (map.m_lblk < end) { map.m_len = end - map.m_lblk; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 79da50891961..aaa52d34c3c1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2701,10 +2701,10 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static loff_t max_file_blocks(void) +loff_t max_file_blocks(struct inode *inode) { loff_t result = 0; - loff_t leaf_count = DEF_ADDRS_PER_BLOCK; + loff_t leaf_count; /* * note: previously, result is equal to (DEF_ADDRS_PER_INODE - @@ -2713,6 +2713,11 @@ static loff_t max_file_blocks(void) * result as zero. */ + if (inode && f2fs_compressed_file(inode)) + leaf_count = ADDRS_PER_BLOCK(inode); + else + leaf_count = DEF_ADDRS_PER_BLOCK; + /* two direct node blocks */ result += (leaf_count * 2); @@ -3589,8 +3594,7 @@ try_onemore: if (err) goto free_options; - sbi->max_file_blocks = max_file_blocks(); - sb->s_maxbytes = sbi->max_file_blocks << + sb->s_maxbytes = max_file_blocks(NULL) << le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; From 5e5afb124c4527b04e2d49927e1d039ba4f5d5d0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 14 Jan 2021 09:41:27 +0800 Subject: [PATCH 19/45] f2fs: introduce sb_status sysfs node Introduce /sys/fs/f2fs//stat/sb_status to show superblock status in real time as a hexadecimal value. value sb status macro description 0x1 SBI_IS_DIRTY, /* dirty flag for checkpoint */ 0x2 SBI_IS_CLOSE, /* specify unmounting */ 0x4 SBI_NEED_FSCK, /* need fsck.f2fs to fix */ 0x8 SBI_POR_DOING, /* recovery is doing or not */ 0x10 SBI_NEED_SB_WRITE, /* need to recover superblock */ 0x20 SBI_NEED_CP, /* need to checkpoint */ 0x40 SBI_IS_SHUTDOWN, /* shutdown by ioctl */ 0x80 SBI_IS_RECOVERED, /* recovered orphan/data */ 0x100 SBI_CP_DISABLED, /* CP was disabled last mount */ 0x200 SBI_CP_DISABLED_QUICK, /* CP was disabled quickly */ 0x400 SBI_QUOTA_NEED_FLUSH, /* need to flush quota info in CP */ 0x800 SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */ 0x1000 SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */ 0x2000 SBI_IS_RESIZEFS, /* resizefs is in process */ Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 23 +++++++++++++++++++++++ fs/f2fs/sysfs.c | 8 ++++++++ 2 files changed, 31 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 2618587ab55e..abb7717eb31d 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -357,3 +357,26 @@ Description: This gives a control to limit the bio size in f2fs. Default is zero, which will follow underlying block layer limit, whereas, if it has a certain bytes value, f2fs won't submit a bio larger than that size. + +What: /sys/fs/f2fs//stat/sb_status +Date: December 2020 +Contact: "Chao Yu" +Description: Show status of f2fs superblock in real time. + + ====== ===================== ================================= + value sb status macro description + 0x1 SBI_IS_DIRTY dirty flag for checkpoint + 0x2 SBI_IS_CLOSE specify unmounting + 0x4 SBI_NEED_FSCK need fsck.f2fs to fix + 0x8 SBI_POR_DOING recovery is doing or not + 0x10 SBI_NEED_SB_WRITE need to recover superblock + 0x20 SBI_NEED_CP need to checkpoint + 0x40 SBI_IS_SHUTDOWN shutdown by ioctl + 0x80 SBI_IS_RECOVERED recovered orphan/data + 0x100 SBI_CP_DISABLED CP was disabled last mount + 0x200 SBI_CP_DISABLED_QUICK CP was disabled quickly + 0x400 SBI_QUOTA_NEED_FLUSH need to flush quota info in CP + 0x800 SBI_QUOTA_SKIP_FLUSH skip flushing quota in current CP + 0x1000 SBI_QUOTA_NEED_REPAIR quota file may be corrupted + 0x2000 SBI_IS_RESIZEFS resizefs is in process + ====== ===================== ================================= diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index e670634a8978..81840ba130e4 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -101,6 +101,12 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, sbi->sectors_written_start) >> 1))); } +static ssize_t sb_status_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%lx\n", sbi->s_flag); +} + static ssize_t features_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -709,7 +715,9 @@ static struct attribute *f2fs_feat_attrs[] = { NULL, }; +F2FS_GENERAL_RO_ATTR(sb_status); static struct attribute *f2fs_stat_attrs[] = { + ATTR_LIST(sb_status), NULL, }; From 5ecffc99fb8bc2e491a1eef1ab48c389efa09adb Mon Sep 17 00:00:00 2001 From: Jack Qiu Date: Wed, 13 Jan 2021 17:58:53 +0800 Subject: [PATCH 20/45] f2fs: remove unused stat_{inc, dec}_atomic_write Just clean code, no logical change. Signed-off-by: Jack Qiu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d9353d976c41..d96254494f9d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3759,8 +3759,6 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #define stat_dec_compr_inode(inode) do { } while (0) #define stat_add_compr_blocks(inode, blocks) do { } while (0) #define stat_sub_compr_blocks(inode, blocks) do { } while (0) -#define stat_inc_atomic_write(inode) do { } while (0) -#define stat_dec_atomic_write(inode) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) #define stat_inc_volatile_write(inode) do { } while (0) #define stat_dec_volatile_write(inode) do { } while (0) From ff6bdecfa0ad51922ba6f8ee24b827ed52e5316f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 14 Jan 2021 13:59:09 -0800 Subject: [PATCH 21/45] f2fs: deprecate f2fs_trace_io This patch deprecates f2fs_trace_io, since f2fs uses page->private more broadly, resulting in more buggy cases. Acked-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/Kconfig | 10 --- fs/f2fs/Makefile | 1 - fs/f2fs/checkpoint.c | 3 - fs/f2fs/data.c | 4 -- fs/f2fs/file.c | 2 - fs/f2fs/node.c | 2 - fs/f2fs/segment.c | 3 - fs/f2fs/super.c | 6 -- fs/f2fs/trace.c | 165 ------------------------------------------- fs/f2fs/trace.h | 43 ----------- 10 files changed, 239 deletions(-) delete mode 100644 fs/f2fs/trace.c delete mode 100644 fs/f2fs/trace.h diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 1d2d3f879f6c..5c0c113cf411 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -84,16 +84,6 @@ config F2FS_FS_ENCRYPTION FS_ENCRYPTION. Use CONFIG_FS_ENCRYPTION=y in new config files. -config F2FS_IO_TRACE - bool "F2FS IO tracer" - depends on F2FS_FS - depends on FUNCTION_TRACER - help - F2FS IO trace is based on a function trace, which gathers process - information and block IO patterns in the filesystem level. - - If unsure, say N. - config F2FS_FAULT_INJECTION bool "F2FS fault injection facility" depends on F2FS_FS diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index ee7316b42f69..e5295746208b 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -7,6 +7,5 @@ f2fs-y += shrinker.o extent_cache.o sysfs.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o -f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o f2fs-$(CONFIG_FS_VERITY) += verity.o f2fs-$(CONFIG_F2FS_FS_COMPRESSION) += compress.o diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 617d0f6b0836..404420c6801b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -17,7 +17,6 @@ #include "f2fs.h" #include "node.h" #include "segment.h" -#include "trace.h" #include static struct kmem_cache *ino_entry_slab; @@ -443,7 +442,6 @@ static int f2fs_set_meta_page_dirty(struct page *page) __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); f2fs_set_page_private(page, 0); - f2fs_trace_pid(page); return 1; } return 0; @@ -1017,7 +1015,6 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page) spin_unlock(&sbi->inode_lock[type]); f2fs_set_page_private(page, 0); - f2fs_trace_pid(page); } void f2fs_remove_dirty_inode(struct inode *inode) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0095a1d52722..b7bcaa7a4113 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -23,7 +23,6 @@ #include "f2fs.h" #include "node.h" #include "segment.h" -#include "trace.h" #include #define NUM_PREALLOC_POST_READ_CTXS 128 @@ -650,7 +649,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) return -EFSCORRUPTED; trace_f2fs_submit_page_bio(page, fio); - f2fs_trace_ios(fio, 0); /* Allocate a new bio */ bio = __bio_alloc(fio, 1); @@ -845,7 +843,6 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) return -EFSCORRUPTED; trace_f2fs_submit_page_bio(page, fio); - f2fs_trace_ios(fio, 0); if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block, fio->new_blkaddr)) @@ -935,7 +932,6 @@ alloc_new: wbc_account_io(fio->io_wbc, bio_page, PAGE_SIZE); io->last_block_in_bio = fio->new_blkaddr; - f2fs_trace_ios(fio, 0); trace_f2fs_submit_page_write(fio->page, fio); skip: diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bd6bf9ebf333..3e37198a880d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -29,7 +29,6 @@ #include "xattr.h" #include "acl.h" #include "gc.h" -#include "trace.h" #include #include @@ -369,7 +368,6 @@ flush_out: f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); - f2fs_trace_ios(NULL, 1); return ret; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5f561eb33d86..0137878cb394 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -17,7 +17,6 @@ #include "node.h" #include "segment.h" #include "xattr.h" -#include "trace.h" #include #define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) @@ -2089,7 +2088,6 @@ static int f2fs_set_node_page_dirty(struct page *page) __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); f2fs_set_page_private(page, 0); - f2fs_trace_pid(page); return 1; } return 0; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6ec659bc8d8d..651514e8f226 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -20,7 +20,6 @@ #include "segment.h" #include "node.h" #include "gc.h" -#include "trace.h" #include #define __reverse_ffz(x) __reverse_ffs(~(x)) @@ -187,8 +186,6 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) { struct inmem_pages *new; - f2fs_trace_pid(page); - f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE); new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index aaa52d34c3c1..1b77db10ced1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -32,7 +32,6 @@ #include "segment.h" #include "xattr.h" #include "gc.h" -#include "trace.h" #define CREATE_TRACE_POINTS #include @@ -1439,8 +1438,6 @@ int f2fs_sync_fs(struct super_block *sb, int sync) err = f2fs_write_checkpoint(sbi, &cpc); up_write(&sbi->gc_lock); } - f2fs_trace_ios(NULL, 1); - return err; } @@ -4073,8 +4070,6 @@ static int __init init_f2fs_fs(void) return -EINVAL; } - f2fs_build_trace_ios(); - err = init_inodecache(); if (err) goto fail; @@ -4167,7 +4162,6 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_segment_manager_caches(); f2fs_destroy_node_manager_caches(); destroy_inodecache(); - f2fs_destroy_trace_ios(); } module_init(init_f2fs_fs) diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c deleted file mode 100644 index d0ab533a9ce8..000000000000 --- a/fs/f2fs/trace.c +++ /dev/null @@ -1,165 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * f2fs IO tracer - * - * Copyright (c) 2014 Motorola Mobility - * Copyright (c) 2014 Jaegeuk Kim - */ -#include -#include -#include -#include - -#include "f2fs.h" -#include "trace.h" - -static RADIX_TREE(pids, GFP_ATOMIC); -static spinlock_t pids_lock; -static struct last_io_info last_io; - -static inline void __print_last_io(void) -{ - if (!last_io.len) - return; - - trace_printk("%3x:%3x %4x %-16s %2x %5x %5x %12x %4x\n", - last_io.major, last_io.minor, - last_io.pid, "----------------", - last_io.type, - last_io.fio.op, last_io.fio.op_flags, - last_io.fio.new_blkaddr, - last_io.len); - memset(&last_io, 0, sizeof(last_io)); -} - -static int __file_type(struct inode *inode, pid_t pid) -{ - if (f2fs_is_atomic_file(inode)) - return __ATOMIC_FILE; - else if (f2fs_is_volatile_file(inode)) - return __VOLATILE_FILE; - else if (S_ISDIR(inode->i_mode)) - return __DIR_FILE; - else if (inode->i_ino == F2FS_NODE_INO(F2FS_I_SB(inode))) - return __NODE_FILE; - else if (inode->i_ino == F2FS_META_INO(F2FS_I_SB(inode))) - return __META_FILE; - else if (pid) - return __NORMAL_FILE; - else - return __MISC_FILE; -} - -void f2fs_trace_pid(struct page *page) -{ - struct inode *inode = page->mapping->host; - pid_t pid = task_pid_nr(current); - void *p; - - set_page_private(page, (unsigned long)pid); - -retry: - if (radix_tree_preload(GFP_NOFS)) - return; - - spin_lock(&pids_lock); - p = radix_tree_lookup(&pids, pid); - if (p == current) - goto out; - if (p) - radix_tree_delete(&pids, pid); - - if (radix_tree_insert(&pids, pid, current)) { - spin_unlock(&pids_lock); - radix_tree_preload_end(); - cond_resched(); - goto retry; - } - - trace_printk("%3x:%3x %4x %-16s\n", - MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), - pid, current->comm); -out: - spin_unlock(&pids_lock); - radix_tree_preload_end(); -} - -void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) -{ - struct inode *inode; - pid_t pid; - int major, minor; - - if (flush) { - __print_last_io(); - return; - } - - inode = fio->page->mapping->host; - pid = page_private(fio->page); - - major = MAJOR(inode->i_sb->s_dev); - minor = MINOR(inode->i_sb->s_dev); - - if (last_io.major == major && last_io.minor == minor && - last_io.pid == pid && - last_io.type == __file_type(inode, pid) && - last_io.fio.op == fio->op && - last_io.fio.op_flags == fio->op_flags && - last_io.fio.new_blkaddr + last_io.len == - fio->new_blkaddr) { - last_io.len++; - return; - } - - __print_last_io(); - - last_io.major = major; - last_io.minor = minor; - last_io.pid = pid; - last_io.type = __file_type(inode, pid); - last_io.fio = *fio; - last_io.len = 1; - return; -} - -void f2fs_build_trace_ios(void) -{ - spin_lock_init(&pids_lock); -} - -#define PIDVEC_SIZE 128 -static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index, - unsigned int max_items) -{ - struct radix_tree_iter iter; - void **slot; - unsigned int ret = 0; - - if (unlikely(!max_items)) - return 0; - - radix_tree_for_each_slot(slot, &pids, &iter, first_index) { - results[ret] = iter.index; - if (++ret == max_items) - break; - } - return ret; -} - -void f2fs_destroy_trace_ios(void) -{ - pid_t pid[PIDVEC_SIZE]; - pid_t next_pid = 0; - unsigned int found; - - spin_lock(&pids_lock); - while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) { - unsigned idx; - - next_pid = pid[found - 1] + 1; - for (idx = 0; idx < found; idx++) - radix_tree_delete(&pids, pid[idx]); - } - spin_unlock(&pids_lock); -} diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h deleted file mode 100644 index 789f6aa727fc..000000000000 --- a/fs/f2fs/trace.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * f2fs IO tracer - * - * Copyright (c) 2014 Motorola Mobility - * Copyright (c) 2014 Jaegeuk Kim - */ -#ifndef __F2FS_TRACE_H__ -#define __F2FS_TRACE_H__ - -#ifdef CONFIG_F2FS_IO_TRACE -#include - -enum file_type { - __NORMAL_FILE, - __DIR_FILE, - __NODE_FILE, - __META_FILE, - __ATOMIC_FILE, - __VOLATILE_FILE, - __MISC_FILE, -}; - -struct last_io_info { - int major, minor; - pid_t pid; - enum file_type type; - struct f2fs_io_info fio; - block_t len; -}; - -extern void f2fs_trace_pid(struct page *); -extern void f2fs_trace_ios(struct f2fs_io_info *, int); -extern void f2fs_build_trace_ios(void); -extern void f2fs_destroy_trace_ios(void); -#else -#define f2fs_trace_pid(p) -#define f2fs_trace_ios(i, n) -#define f2fs_build_trace_ios() -#define f2fs_destroy_trace_ios() - -#endif -#endif /* __F2FS_TRACE_H__ */ From 046c59c130006ebc26995a3d70ee0e7eb717b564 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 26 Jan 2021 17:00:42 -0800 Subject: [PATCH 22/45] f2fs: flush data when enabling checkpoint back During checkpoint=disable period, f2fs bypasses all the synchronous IOs such as sync and fsync. So, when enabling it back, we must flush all of them in order to keep the data persistent. Otherwise, suddern power-cut right after enabling checkpoint will cause data loss. Fixes: 4354994f097d ("f2fs: checkpoint disabling") Cc: stable@vger.kernel.org Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 1b77db10ced1..c371458c3413 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1876,6 +1876,9 @@ restore_flag: static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) { + /* we should flush all the data to keep data consistency */ + sync_inodes_sb(sbi->sb); + down_write(&sbi->gc_lock); f2fs_dirty_to_prefree(sbi); From e7b876906b21f3faea6f651812e0f113d2659071 Mon Sep 17 00:00:00 2001 From: Yi Chen Date: Thu, 28 Jan 2021 17:02:56 +0800 Subject: [PATCH 23/45] f2fs: fix to avoid inconsistent quota data Occasionally, quota data may be corrupted detected by fsck: Info: checkpoint state = 45 : crc compacted_summary unmount [QUOTA WARNING] Usage inconsistent for ID 0:actual (1543036928, 762) != expected (1543032832, 762) [ASSERT] (fsck_chk_quota_files:1986) --> Quota file is missing or invalid quota file content found. [QUOTA WARNING] Usage inconsistent for ID 0:actual (1352478720, 344) != expected (1352474624, 344) [ASSERT] (fsck_chk_quota_files:1986) --> Quota file is missing or invalid quota file content found. [FSCK] Unreachable nat entries [Ok..] [0x0] [FSCK] SIT valid block bitmap checking [Ok..] [FSCK] Hard link checking for regular file [Ok..] [0x0] [FSCK] valid_block_count matching with CP [Ok..] [0xdf299] [FSCK] valid_node_count matcing with CP (de lookup) [Ok..] [0x2b01] [FSCK] valid_node_count matcing with CP (nat lookup) [Ok..] [0x2b01] [FSCK] valid_inode_count matched with CP [Ok..] [0x2665] [FSCK] free segment_count matched with CP [Ok..] [0xcb04] [FSCK] next block offset is free [Ok..] [FSCK] fixing SIT types [FSCK] other corrupted bugs [Fail] The root cause is: If we open file w/ readonly flag, disk quota info won't be initialized for this file, however, following mmap() will force to convert inline inode via f2fs_convert_inline_inode(), which may increase block usage for this inode w/o updating quota data, it causes inconsistent disk quota info. The issue will happen in following stack: open(file, O_RDONLY) mmap(file) - f2fs_convert_inline_inode - f2fs_convert_inline_page - f2fs_reserve_block - f2fs_reserve_new_block - f2fs_reserve_new_blocks - f2fs_i_blocks_write - dquot_claim_block inode->i_blocks increase, but the dqb_curspace keep the size for the dquots is NULL. To fix this issue, let's call dquot_initialize() anyway in both f2fs_truncate() and f2fs_convert_inline_inode() functions to avoid potential inconsistent quota data issue. Fixes: 0abd675e97e6 ("f2fs: support plain user/group quota") Signed-off-by: Daiyue Zhang Signed-off-by: Dehe Gu Signed-off-by: Junchao Jiang Signed-off-by: Ge Qiu Signed-off-by: Yi Chen Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 4 ++++ fs/f2fs/inline.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3e37198a880d..38b06941c9f5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -790,6 +790,10 @@ int f2fs_truncate(struct inode *inode) return -EIO; } + err = dquot_initialize(inode); + if (err) + return err; + /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 6cd739114c9d..936df9342fcd 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -191,6 +191,10 @@ int f2fs_convert_inline_inode(struct inode *inode) f2fs_hw_is_readonly(sbi) || f2fs_readonly(sbi->sb)) return 0; + err = dquot_initialize(inode); + if (err) + return err; + page = f2fs_grab_cache_page(inode->i_mapping, 0, false); if (!page) return -ENOMEM; From 04dddfa5c2fad441e3ca5d68ce210b556e0442c4 Mon Sep 17 00:00:00 2001 From: Liu Song Date: Sun, 31 Jan 2021 20:26:05 +0800 Subject: [PATCH 24/45] f2fs: remove unnecessary initialization in xattr.c These variables will be explicitly assigned before use, so there is no need to initialize. Signed-off-by: Liu Song Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/xattr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 2086bef6c154..8159fae74b9a 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -327,7 +327,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, void *last_addr = NULL; nid_t xnid = F2FS_I(inode)->i_xattr_nid; unsigned int inline_size = inline_xattr_size(inode); - int err = 0; + int err; if (!xnid && !inline_size) return -ENODATA; @@ -515,7 +515,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size, struct page *ipage) { struct f2fs_xattr_entry *entry = NULL; - int error = 0; + int error; unsigned int size, len; void *base_addr = NULL; int base_size; @@ -562,7 +562,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) struct inode *inode = d_inode(dentry); struct f2fs_xattr_entry *entry; void *base_addr, *last_base_addr; - int error = 0; + int error; size_t rest = buffer_size; down_read(&F2FS_I(inode)->i_xattr_sem); @@ -632,7 +632,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, int found, newsize; size_t len; __u32 new_hsize; - int error = 0; + int error; if (name == NULL) return -EINVAL; From 406c773c874833ab79c92112c31f132bad328b5d Mon Sep 17 00:00:00 2001 From: Dehe Gu Date: Tue, 2 Feb 2021 17:39:22 +0800 Subject: [PATCH 25/45] f2fs: fix a wrong condition in __submit_bio We should use !F2FS_IO_ALIGNED() to check and submit_io directly. Fixes: 8223ecc456d0 ("f2fs: fix to add missing F2FS_IO_ALIGNED() condition") Reviewed-by: Chao Yu Signed-off-by: Dehe Gu Signed-off-by: Ge Qiu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b7bcaa7a4113..21eb4296a56d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -441,7 +441,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, if (f2fs_lfs_mode(sbi) && current->plug) blk_finish_plug(current->plug); - if (F2FS_IO_ALIGNED(sbi)) + if (!F2FS_IO_ALIGNED(sbi)) goto submit_io; start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; From 84893428bd3dc8f66b08015bc7343be37ec07d3c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 2 Feb 2021 16:03:11 +0800 Subject: [PATCH 26/45] f2fs: relocate inline conversion from mmap() to mkwrite() If there is page fault only for read case on inline inode, we don't need to convert inline inode, instead, let's do conversion for write case. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 38b06941c9f5..7726519d6168 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -72,6 +72,10 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) goto err; } + err = f2fs_convert_inline_inode(inode); + if (err) + goto err; + #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { int ret = f2fs_is_compressed_cluster(inode, page->index); @@ -525,7 +529,6 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); - int err; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; @@ -533,11 +536,6 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) if (!f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - /* we don't need to use inline_data strictly */ - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; set_inode_flag(inode, FI_MMAP_FILE); From 259b547d9abf3bec965021d2912a2dbce82d3e01 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Tue, 19 Jan 2021 09:00:42 +0900 Subject: [PATCH 27/45] f2fs: introduce checkpoint_merge mount option We've added a new mount options, "checkpoint_merge" and "nocheckpoint_merge", which creates a kernel daemon and makes it to merge concurrent checkpoint requests as much as possible to eliminate redundant checkpoint issues. Plus, we can eliminate the sluggish issue caused by slow checkpoint operation when the checkpoint is done in a process context in a cgroup having low i/o budget and cpu shares. To make this do better, we set the default i/o priority of the kernel daemon to "3", to give one higher priority than other kernel threads. The below verification result explains this. The basic idea has come from https://opensource.samsung.com. [Verification] Android Pixel Device(ARM64, 7GB RAM, 256GB UFS) Create two I/O cgroups (fg w/ weight 100, bg w/ wight 20) Set "strict_guarantees" to "1" in BFQ tunables In "fg" cgroup, - thread A => trigger 1000 checkpoint operations "for i in `seq 1 1000`; do touch test_dir1/file; fsync test_dir1; done" - thread B => gererating async. I/O "fio --rw=write --numjobs=1 --bs=128k --runtime=3600 --time_based=1 --filename=test_img --name=test" In "bg" cgroup, - thread C => trigger repeated checkpoint operations "echo $$ > /dev/blkio/bg/tasks; while true; do touch test_dir2/file; fsync test_dir2; done" We've measured thread A's execution time. [ w/o patch ] Elapsed Time: Avg. 68 seconds [ w/ patch ] Elapsed Time: Avg. 48 seconds Reported-by: kernel test robot Reported-by: Dan Carpenter [Jaegeuk Kim: fix the return value in f2fs_start_ckpt_thread, reported by Dan] Signed-off-by: Daeho Jeong Signed-off-by: Sungjong Seo Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 11 ++ fs/f2fs/checkpoint.c | 177 +++++++++++++++++++++++++++++ fs/f2fs/debug.c | 12 ++ fs/f2fs/f2fs.h | 27 +++++ fs/f2fs/super.c | 58 ++++++++-- 5 files changed, 277 insertions(+), 8 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 5eff4009e77e..f75ec244762f 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -247,6 +247,17 @@ checkpoint=%s[:%u[%]] Set to "disable" to turn off checkpointing. Set to "enabl hide up to all remaining free space. The actual space that would be unusable can be viewed at /sys/fs/f2fs//unusable This space is reclaimed once checkpoint=enable. +checkpoint_merge When checkpoint is enabled, this can be used to create a kernel + daemon and make it to merge concurrent checkpoint requests as + much as possible to eliminate redundant checkpoint issues. Plus, + we can eliminate the sluggish issue caused by slow checkpoint + operation when the checkpoint is done in a process context in + a cgroup having low i/o budget and cpu shares. To make this + do better, we set the default i/o priority of the kernel daemon + to "3", to give one higher priority than other kernel threads. + This is the same way to give a I/O priority to the jbd2 + journaling thread of ext4 filesystem. +nocheckpoint_merge Disable checkpoint merge feature. compress_algorithm=%s Control compress algorithm, currently f2fs supports "lzo", "lz4", "zstd" and "lzo-rle" algorithm. compress_algorithm=%s:%d Control compress algorithm and its compress level, now, only diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 404420c6801b..814276a5e418 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -13,12 +13,15 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" #include "segment.h" #include +#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) + static struct kmem_cache *ino_entry_slab; struct kmem_cache *f2fs_inode_entry_slab; @@ -1705,3 +1708,177 @@ void f2fs_destroy_checkpoint_caches(void) kmem_cache_destroy(ino_entry_slab); kmem_cache_destroy(f2fs_inode_entry_slab); } + +static int __write_checkpoint_sync(struct f2fs_sb_info *sbi) +{ + struct cp_control cpc = { .reason = CP_SYNC, }; + int err; + + down_write(&sbi->gc_lock); + err = f2fs_write_checkpoint(sbi, &cpc); + up_write(&sbi->gc_lock); + + return err; +} + +static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + struct ckpt_req *req, *next; + struct llist_node *dispatch_list; + u64 sum_diff = 0, diff, count = 0; + int ret; + + dispatch_list = llist_del_all(&cprc->issue_list); + if (!dispatch_list) + return; + dispatch_list = llist_reverse_order(dispatch_list); + + ret = __write_checkpoint_sync(sbi); + atomic_inc(&cprc->issued_ckpt); + + llist_for_each_entry_safe(req, next, dispatch_list, llnode) { + diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time); + req->ret = ret; + complete(&req->wait); + + sum_diff += diff; + count++; + } + atomic_sub(count, &cprc->queued_ckpt); + atomic_add(count, &cprc->total_ckpt); + + spin_lock(&cprc->stat_lock); + cprc->cur_time = (unsigned int)div64_u64(sum_diff, count); + if (cprc->peak_time < cprc->cur_time) + cprc->peak_time = cprc->cur_time; + spin_unlock(&cprc->stat_lock); +} + +static int issue_checkpoint_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct ckpt_req_control *cprc = &sbi->cprc_info; + wait_queue_head_t *q = &cprc->ckpt_wait_queue; +repeat: + if (kthread_should_stop()) + return 0; + + sb_start_intwrite(sbi->sb); + + if (!llist_empty(&cprc->issue_list)) + __checkpoint_and_complete_reqs(sbi); + + sb_end_intwrite(sbi->sb); + + wait_event_interruptible(*q, + kthread_should_stop() || !llist_empty(&cprc->issue_list)); + goto repeat; +} + +static void flush_remained_ckpt_reqs(struct f2fs_sb_info *sbi, + struct ckpt_req *wait_req) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (!llist_empty(&cprc->issue_list)) { + __checkpoint_and_complete_reqs(sbi); + } else { + /* already dispatched by issue_checkpoint_thread */ + if (wait_req) + wait_for_completion(&wait_req->wait); + } +} + +static void init_ckpt_req(struct ckpt_req *req) +{ + memset(req, 0, sizeof(struct ckpt_req)); + + init_completion(&req->wait); + req->queue_time = ktime_get(); +} + +int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + struct ckpt_req req; + struct cp_control cpc; + + cpc.reason = __get_cp_reason(sbi); + if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) { + int ret; + + down_write(&sbi->gc_lock); + ret = f2fs_write_checkpoint(sbi, &cpc); + up_write(&sbi->gc_lock); + + return ret; + } + + if (!cprc->f2fs_issue_ckpt) + return __write_checkpoint_sync(sbi); + + init_ckpt_req(&req); + + llist_add(&req.llnode, &cprc->issue_list); + atomic_inc(&cprc->queued_ckpt); + + /* update issue_list before we wake up issue_checkpoint thread */ + smp_mb(); + + if (waitqueue_active(&cprc->ckpt_wait_queue)) + wake_up(&cprc->ckpt_wait_queue); + + if (cprc->f2fs_issue_ckpt) + wait_for_completion(&req.wait); + else + flush_remained_ckpt_reqs(sbi, &req); + + return req.ret; +} + +int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (cprc->f2fs_issue_ckpt) + return 0; + + cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi, + "f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(cprc->f2fs_issue_ckpt)) { + cprc->f2fs_issue_ckpt = NULL; + return -ENOMEM; + } + + set_task_ioprio(cprc->f2fs_issue_ckpt, DEFAULT_CHECKPOINT_IOPRIO); + + return 0; +} + +void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + if (cprc->f2fs_issue_ckpt) { + struct task_struct *ckpt_task = cprc->f2fs_issue_ckpt; + + cprc->f2fs_issue_ckpt = NULL; + kthread_stop(ckpt_task); + + flush_remained_ckpt_reqs(sbi, NULL); + } +} + +void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi) +{ + struct ckpt_req_control *cprc = &sbi->cprc_info; + + atomic_set(&cprc->issued_ckpt, 0); + atomic_set(&cprc->total_ckpt, 0); + atomic_set(&cprc->queued_ckpt, 0); + init_waitqueue_head(&cprc->ckpt_wait_queue); + init_llist_head(&cprc->issue_list); + spin_lock_init(&cprc->stat_lock); +} diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 197c914119da..91855d5721cd 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -120,6 +120,13 @@ static void update_general_status(struct f2fs_sb_info *sbi) atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks; } + si->nr_issued_ckpt = atomic_read(&sbi->cprc_info.issued_ckpt); + si->nr_total_ckpt = atomic_read(&sbi->cprc_info.total_ckpt); + si->nr_queued_ckpt = atomic_read(&sbi->cprc_info.queued_ckpt); + spin_lock(&sbi->cprc_info.stat_lock); + si->cur_ckpt_time = sbi->cprc_info.cur_time; + si->peak_ckpt_time = sbi->cprc_info.peak_time; + spin_unlock(&sbi->cprc_info.stat_lock); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -417,6 +424,11 @@ static int stat_show(struct seq_file *s, void *v) si->meta_count[META_NAT]); seq_printf(s, " - ssa blocks : %u\n", si->meta_count[META_SSA]); + seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, " + "Cur time: %4d(ms), Peak time: %4d(ms))\n", + si->nr_queued_ckpt, si->nr_issued_ckpt, + si->nr_total_ckpt, si->cur_ckpt_time, + si->peak_ckpt_time); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d96254494f9d..8f8d39f22023 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -96,6 +96,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000 #define F2FS_MOUNT_NORECOVERY 0x04000000 #define F2FS_MOUNT_ATGC 0x08000000 +#define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -266,6 +267,25 @@ struct fsync_node_entry { unsigned int seq_id; /* sequence id */ }; +struct ckpt_req { + struct completion wait; /* completion for checkpoint done */ + struct llist_node llnode; /* llist_node to be linked in wait queue */ + int ret; /* return code of checkpoint */ + ktime_t queue_time; /* request queued time */ +}; + +struct ckpt_req_control { + struct task_struct *f2fs_issue_ckpt; /* checkpoint task */ + wait_queue_head_t ckpt_wait_queue; /* waiting queue for wake-up */ + atomic_t issued_ckpt; /* # of actually issued ckpts */ + atomic_t total_ckpt; /* # of total ckpts */ + atomic_t queued_ckpt; /* # of queued ckpts */ + struct llist_head issue_list; /* list for command issue */ + spinlock_t stat_lock; /* lock for below checkpoint time stats */ + unsigned int cur_time; /* cur wait time in msec for currently issued checkpoint */ + unsigned int peak_time; /* peak wait time in msec until now */ +}; + /* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ @@ -1438,6 +1458,7 @@ struct f2fs_sb_info { wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ + struct ckpt_req_control cprc_info; /* for checkpoint request control */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ @@ -3459,6 +3480,10 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi); int __init f2fs_create_checkpoint_caches(void); void f2fs_destroy_checkpoint_caches(void); +int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi); +int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi); +void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi); +void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi); /* * data.c @@ -3574,6 +3599,8 @@ struct f2fs_stat_info { int nr_discarding, nr_discarded; int nr_discard_cmd; unsigned int undiscard_blks; + int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt; + unsigned int cur_ckpt_time, peak_ckpt_time; int inline_xattr, inline_inode, inline_dir, append, update, orphans; int compr_inode; unsigned long long compr_blocks; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c371458c3413..014536d490c1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -142,6 +142,8 @@ enum { Opt_checkpoint_disable_cap, Opt_checkpoint_disable_cap_perc, Opt_checkpoint_enable, + Opt_checkpoint_merge, + Opt_nocheckpoint_merge, Opt_compress_algorithm, Opt_compress_log_size, Opt_compress_extension, @@ -211,6 +213,8 @@ static match_table_t f2fs_tokens = { {Opt_checkpoint_disable_cap, "checkpoint=disable:%u"}, {Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"}, {Opt_checkpoint_enable, "checkpoint=enable"}, + {Opt_checkpoint_merge, "checkpoint_merge"}, + {Opt_nocheckpoint_merge, "nocheckpoint_merge"}, {Opt_compress_algorithm, "compress_algorithm=%s"}, {Opt_compress_log_size, "compress_log_size=%u"}, {Opt_compress_extension, "compress_extension=%s"}, @@ -931,6 +935,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_checkpoint_enable: clear_opt(sbi, DISABLE_CHECKPOINT); break; + case Opt_checkpoint_merge: + set_opt(sbi, MERGE_CHECKPOINT); + break; + case Opt_nocheckpoint_merge: + clear_opt(sbi, MERGE_CHECKPOINT); + break; #ifdef CONFIG_F2FS_FS_COMPRESSION case Opt_compress_algorithm: if (!f2fs_sb_has_compression(sbi)) { @@ -1331,6 +1341,12 @@ static void f2fs_put_super(struct super_block *sb) /* prevent remaining shrinker jobs */ mutex_lock(&sbi->umount_mutex); + /* + * flush all issued checkpoints and stop checkpoint issue thread. + * after then, all checkpoints should be done by each process context. + */ + f2fs_stop_ckpt_thread(sbi); + /* * We don't need to do checkpoint when superblock is clean. * But, the previous checkpoint was not done by umount, it needs to do @@ -1429,15 +1445,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync) if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return -EAGAIN; - if (sync) { - struct cp_control cpc; + if (sync) + err = f2fs_issue_checkpoint(sbi); - cpc.reason = __get_cp_reason(sbi); - - down_write(&sbi->gc_lock); - err = f2fs_write_checkpoint(sbi, &cpc); - up_write(&sbi->gc_lock); - } return err; } @@ -1756,6 +1766,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, DISABLE_CHECKPOINT)) seq_printf(seq, ",checkpoint=disable:%u", F2FS_OPTION(sbi).unusable_cap); + if (test_opt(sbi, MERGE_CHECKPOINT)) + seq_puts(seq, ",checkpoint_merge"); + else + seq_puts(seq, ",nocheckpoint_merge"); if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_POSIX) seq_printf(seq, ",fsync_mode=%s", "posix"); else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) @@ -2037,6 +2051,19 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } } + if (!test_opt(sbi, DISABLE_CHECKPOINT) && + test_opt(sbi, MERGE_CHECKPOINT)) { + err = f2fs_start_ckpt_thread(sbi); + if (err) { + f2fs_err(sbi, + "Failed to start F2FS issue_checkpoint_thread (%d)", + err); + goto restore_gc; + } + } else { + f2fs_stop_ckpt_thread(sbi); + } + /* * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. @@ -3761,6 +3788,19 @@ try_onemore: f2fs_init_fsync_node_info(sbi); + /* setup checkpoint request control and start checkpoint issue thread */ + f2fs_init_ckpt_req_control(sbi); + if (!test_opt(sbi, DISABLE_CHECKPOINT) && + test_opt(sbi, MERGE_CHECKPOINT)) { + err = f2fs_start_ckpt_thread(sbi); + if (err) { + f2fs_err(sbi, + "Failed to start F2FS issue_checkpoint_thread (%d)", + err); + goto stop_ckpt_thread; + } + } + /* setup f2fs internal modules */ err = f2fs_build_segment_manager(sbi); if (err) { @@ -3959,6 +3999,8 @@ free_nm: free_sm: f2fs_destroy_segment_manager(sbi); f2fs_destroy_post_read_wq(sbi); +stop_ckpt_thread: + f2fs_stop_ckpt_thread(sbi); free_devices: destroy_device_list(sbi); kvfree(sbi->ckpt); From 0dd64c26c1488b4cc483c2d8d799541676a5a6a3 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Thu, 21 Jan 2021 22:45:29 +0900 Subject: [PATCH 28/45] f2fs: add ckpt_thread_ioprio sysfs node Added "ckpt_thread_ioprio" sysfs node to give a way to change checkpoint merge daemon's io priority. Its default value is "be,3", which means "BE" I/O class and I/O priority "3". We can select the class between "rt" and "be", and set the I/O priority within valid range of it. "," delimiter is necessary in between I/O class and priority number. Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 9 ++++ fs/f2fs/checkpoint.c | 3 +- fs/f2fs/f2fs.h | 1 + fs/f2fs/sysfs.c | 55 +++++++++++++++++++++++++ 4 files changed, 67 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index abb7717eb31d..8c90e0764050 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -380,3 +380,12 @@ Description: Show status of f2fs superblock in real time. 0x1000 SBI_QUOTA_NEED_REPAIR quota file may be corrupted 0x2000 SBI_IS_RESIZEFS resizefs is in process ====== ===================== ================================= + +What: /sys/fs/f2fs//ckpt_thread_ioprio +Date: January 2021 +Contact: "Daeho Jeong" +Description: Give a way to change checkpoint merge daemon's io priority. + Its default value is "be,3", which means "BE" I/O class and + I/O priority "3". We can select the class between "rt" and "be", + and set the I/O priority within valid range of it. "," delimiter + is necessary in between I/O class and priority number. diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 814276a5e418..e67da7ed4d61 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1852,7 +1852,7 @@ int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) return -ENOMEM; } - set_task_ioprio(cprc->f2fs_issue_ckpt, DEFAULT_CHECKPOINT_IOPRIO); + set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio); return 0; } @@ -1878,6 +1878,7 @@ void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi) atomic_set(&cprc->issued_ckpt, 0); atomic_set(&cprc->total_ckpt, 0); atomic_set(&cprc->queued_ckpt, 0); + cprc->ckpt_thread_ioprio = DEFAULT_CHECKPOINT_IOPRIO; init_waitqueue_head(&cprc->ckpt_wait_queue); init_llist_head(&cprc->issue_list); spin_lock_init(&cprc->stat_lock); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8f8d39f22023..c51cccc9d21b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -276,6 +276,7 @@ struct ckpt_req { struct ckpt_req_control { struct task_struct *f2fs_issue_ckpt; /* checkpoint task */ + int ckpt_thread_ioprio; /* checkpoint merge thread ioprio */ wait_queue_head_t ckpt_wait_queue; /* waiting queue for wake-up */ atomic_t issued_ckpt; /* # of actually issued ckpts */ atomic_t total_ckpt; /* # of total ckpts */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 81840ba130e4..d36a5a86b92c 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -34,6 +35,7 @@ enum { FAULT_INFO_TYPE, /* struct f2fs_fault_info */ #endif RESERVED_BLOCKS, /* struct f2fs_sb_info */ + CPRC_INFO, /* struct ckpt_req_control */ }; struct f2fs_attr { @@ -70,6 +72,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) else if (struct_type == STAT_INFO) return (unsigned char *)F2FS_STAT(sbi); #endif + else if (struct_type == CPRC_INFO) + return (unsigned char *)&sbi->cprc_info; return NULL; } @@ -270,6 +274,23 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return len; } + if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) { + struct ckpt_req_control *cprc = &sbi->cprc_info; + int len = 0; + int class = IOPRIO_PRIO_CLASS(cprc->ckpt_thread_ioprio); + int data = IOPRIO_PRIO_DATA(cprc->ckpt_thread_ioprio); + + if (class == IOPRIO_CLASS_RT) + len += scnprintf(buf + len, PAGE_SIZE - len, "rt,"); + else if (class == IOPRIO_CLASS_BE) + len += scnprintf(buf + len, PAGE_SIZE - len, "be,"); + else + return -EINVAL; + + len += scnprintf(buf + len, PAGE_SIZE - len, "%d\n", data); + return len; + } + ui = (unsigned int *)(ptr + a->offset); return sprintf(buf, "%u\n", *ui); @@ -323,6 +344,38 @@ out: return ret ? ret : count; } + if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) { + const char *name = strim((char *)buf); + struct ckpt_req_control *cprc = &sbi->cprc_info; + int class; + long data; + int ret; + + if (!strncmp(name, "rt,", 3)) + class = IOPRIO_CLASS_RT; + else if (!strncmp(name, "be,", 3)) + class = IOPRIO_CLASS_BE; + else + return -EINVAL; + + name += 3; + ret = kstrtol(name, 10, &data); + if (ret) + return ret; + if (data >= IOPRIO_BE_NR || data < 0) + return -EINVAL; + + cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, data); + if (test_opt(sbi, MERGE_CHECKPOINT)) { + ret = set_task_ioprio(cprc->f2fs_issue_ckpt, + cprc->ckpt_thread_ioprio); + if (ret) + return ret; + } + + return count; + } + ui = (unsigned int *)(ptr + a->offset); ret = kstrtoul(skip_spaces(buf), 0, &t); @@ -582,6 +635,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); +F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); @@ -667,6 +721,7 @@ static struct attribute *f2fs_attrs[] = { #endif ATTR_LIST(data_io_flag), ATTR_LIST(node_io_flag), + ATTR_LIST(ckpt_thread_ioprio), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), ATTR_LIST(unusable), From ee6e2341a92e5fb683e2457cf9aa776e83ac66e0 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 8 Feb 2021 13:42:21 -0800 Subject: [PATCH 29/45] f2fs: don't grab superblock freeze for flush/ckpt thread There are controlled by f2fs_freeze(). This fixes xfstests/generic/068 which is stuck at task:f2fs_ckpt-252:3 state:D stack: 0 pid: 5761 ppid: 2 flags:0x00004000 Call Trace: __schedule+0x44c/0x8a0 schedule+0x4f/0xc0 percpu_rwsem_wait+0xd8/0x140 ? percpu_down_write+0xf0/0xf0 __percpu_down_read+0x56/0x70 issue_checkpoint_thread+0x12c/0x160 [f2fs] ? wait_woken+0x80/0x80 kthread+0x114/0x150 ? __checkpoint_and_complete_reqs+0x110/0x110 [f2fs] ? kthread_park+0x90/0x90 ret_from_fork+0x22/0x30 Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ---- fs/f2fs/segment.c | 4 ---- fs/f2fs/super.c | 4 ++++ 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index e67da7ed4d61..c62be7bf819c 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1764,13 +1764,9 @@ repeat: if (kthread_should_stop()) return 0; - sb_start_intwrite(sbi->sb); - if (!llist_empty(&cprc->issue_list)) __checkpoint_and_complete_reqs(sbi); - sb_end_intwrite(sbi->sb); - wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&cprc->issue_list)); goto repeat; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 651514e8f226..c3988bd6eb80 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -607,8 +607,6 @@ repeat: if (kthread_should_stop()) return 0; - sb_start_intwrite(sbi->sb); - if (!llist_empty(&fcc->issue_list)) { struct flush_cmd *cmd, *next; int ret; @@ -629,8 +627,6 @@ repeat: fcc->dispatch_list = NULL; } - sb_end_intwrite(sbi->sb); - wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&fcc->issue_list)); goto repeat; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 014536d490c1..8910cca3bbba 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1463,6 +1463,10 @@ static int f2fs_freeze(struct super_block *sb) /* must be clean, since sync_filesystem() was already called */ if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) return -EINVAL; + + /* ensure no checkpoint required */ + if (!llist_empty(&F2FS_SB(sb)->cprc_info.issue_list)) + return -EINVAL; return 0; } From 3bdc297d53fd4dbb882bb4e321f2ed037a354b3d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 12 Feb 2021 14:09:54 -0800 Subject: [PATCH 30/45] f2fs: give a warning only for readonly partition Let's allow mounting readonly partition. We're able to recovery later once we have it as read-write back. Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8910cca3bbba..3cf23968e8a0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3890,12 +3890,10 @@ try_onemore: * previous checkpoint was not done by clean system shutdown. */ if (f2fs_hw_is_readonly(sbi)) { - if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { - err = -EROFS; + if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) f2fs_err(sbi, "Need to recover fsync data, but write access unavailable"); - goto free_meta; - } - f2fs_info(sbi, "write access unavailable, skipping recovery"); + else + f2fs_info(sbi, "write access unavailable, skipping recovery"); goto reset_checkpoint; } From 0018bbabf3415dd9c682b37497d301c845dd3510 Mon Sep 17 00:00:00 2001 From: Ed Tsai Date: Thu, 4 Feb 2021 21:25:56 +0800 Subject: [PATCH 31/45] Documentation: f2fs: fix typo s/automaic/automatic Fix typo in f2fs documentation. Signed-off-by: Ed Tsai Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index f75ec244762f..81c05baa8312 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -847,7 +847,7 @@ This is the default option. f2fs does automatic compression in the writeback of compression enabled files. 2) compress_mode=user -This disables the automaic compression and gives the user discretion of choosing the +This disables the automatic compression and gives the user discretion of choosing the target file and the timing. The user can do manual compression/decompression on the compression enabled files using F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE ioctls like the below. From f224aee7aefeb9794ef9870edbc65d6192db5167 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 21 Jul 2020 15:59:20 -0700 Subject: [PATCH 32/45] fs-verity: use smp_load_acquire() for ->i_verity_info Normally smp_store_release() or cmpxchg_release() is paired with smp_load_acquire(). Sometimes smp_load_acquire() can be replaced with the more lightweight READ_ONCE(). However, for this to be safe, all the published memory must only be accessed in a way that involves the pointer itself. This may not be the case if allocating the object also involves initializing a static or global variable, for example. fsverity_info::tree_params.hash_alg->tfm is a crypto_ahash object that's internal to and is allocated by the crypto subsystem. So by using READ_ONCE() for ->i_verity_info, we're relying on internal implementation details of the crypto subsystem. Remove this fragile assumption by using smp_load_acquire() instead. Also fix the cmpxchg logic to correctly execute an ACQUIRE barrier when losing the cmpxchg race, since cmpxchg doesn't guarantee a memory barrier on failure. (Note: I haven't seen any real-world problems here. This change is just fixing the code to be guaranteed correct and less fragile.) Fixes: fd2d1acfcadf ("fs-verity: add the hook for file ->open()") Link: https://lore.kernel.org/r/20200721225920.114347-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/verity/open.c | 15 ++++++++++++--- include/linux/fsverity.h | 9 +++++++-- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/fs/verity/open.c b/fs/verity/open.c index d007db0c9304..bfe0280c14e4 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -221,11 +221,20 @@ out: void fsverity_set_info(struct inode *inode, struct fsverity_info *vi) { /* - * Multiple processes may race to set ->i_verity_info, so use cmpxchg. - * This pairs with the READ_ONCE() in fsverity_get_info(). + * Multiple tasks may race to set ->i_verity_info, so use + * cmpxchg_release(). This pairs with the smp_load_acquire() in + * fsverity_get_info(). I.e., here we publish ->i_verity_info with a + * RELEASE barrier so that other tasks can ACQUIRE it. */ - if (cmpxchg(&inode->i_verity_info, NULL, vi) != NULL) + if (cmpxchg_release(&inode->i_verity_info, NULL, vi) != NULL) { + /* Lost the race, so free the fsverity_info we allocated. */ fsverity_free_info(vi); + /* + * Afterwards, the caller may access ->i_verity_info directly, + * so make sure to ACQUIRE the winning fsverity_info. + */ + (void)fsverity_get_info(inode); + } } void fsverity_free_info(struct fsverity_info *vi) diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index 78201a6d35f6..c1144a450392 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -115,8 +115,13 @@ struct fsverity_operations { static inline struct fsverity_info *fsverity_get_info(const struct inode *inode) { - /* pairs with the cmpxchg() in fsverity_set_info() */ - return READ_ONCE(inode->i_verity_info); + /* + * Pairs with the cmpxchg_release() in fsverity_set_info(). + * I.e., another task may publish ->i_verity_info concurrently, + * executing a RELEASE barrier. We need to use smp_load_acquire() here + * to safely ACQUIRE the memory the other task published. + */ + return smp_load_acquire(&inode->i_verity_info); } /* enable.c */ From e334646d47af63a1acdf73b3cc5c8dfb0e90d7f6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 13 Nov 2020 13:19:15 -0800 Subject: [PATCH 33/45] fs-verity: remove filenames from file comments Embedding the file path inside kernel source code files isn't particularly useful as often files are moved around and the paths become incorrect. checkpatch.pl warns about this since v5.10-rc1. Acked-by: Luca Boccassi Link: https://lore.kernel.org/r/20201113211918.71883-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- fs/verity/enable.c | 2 +- fs/verity/hash_algs.c | 2 +- fs/verity/init.c | 2 +- fs/verity/measure.c | 2 +- fs/verity/open.c | 2 +- fs/verity/signature.c | 2 +- fs/verity/verify.c | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/verity/enable.c b/fs/verity/enable.c index d734cebaae70..04c9bba6c28e 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * fs/verity/enable.c: ioctl to enable verity on a file + * Ioctl to enable verity on a file * * Copyright 2019 Google LLC */ diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index c37e186ebeb6..71d0fccb6d4c 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * fs/verity/hash_algs.c: fs-verity hash algorithms + * fs-verity hash algorithms * * Copyright 2019 Google LLC */ diff --git a/fs/verity/init.c b/fs/verity/init.c index 94c104e00861..c98b7016f446 100644 --- a/fs/verity/init.c +++ b/fs/verity/init.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * fs/verity/init.c: fs-verity module initialization and logging + * fs-verity module initialization and logging * * Copyright 2019 Google LLC */ diff --git a/fs/verity/measure.c b/fs/verity/measure.c index df409a5682ed..5300b8d38537 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * fs/verity/measure.c: ioctl to get a verity file's measurement + * Ioctl to get a verity file's measurement * * Copyright 2019 Google LLC */ diff --git a/fs/verity/open.c b/fs/verity/open.c index bfe0280c14e4..a28d5be78a09 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * fs/verity/open.c: opening fs-verity files + * Opening fs-verity files * * Copyright 2019 Google LLC */ diff --git a/fs/verity/signature.c b/fs/verity/signature.c index 9cfd2f5ff57d..5c95db52b744 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * fs/verity/signature.c: verification of builtin signatures + * Verification of builtin signatures * * Copyright 2019 Google LLC */ diff --git a/fs/verity/verify.c b/fs/verity/verify.c index 0ee4386386ec..817ce4f8cd3c 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * fs/verity/verify.c: data verification functions, i.e. hooks for ->readpages() + * Data verification functions, i.e. hooks for ->readpages() * * Copyright 2019 Google LLC */ From 8284c77ceb0495a5c1bd3416995aac548de344bd Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 13 Nov 2020 13:19:16 -0800 Subject: [PATCH 34/45] fs-verity: rename fsverity_signed_digest to fsverity_formatted_digest The name "struct fsverity_signed_digest" is causing confusion because it isn't actually a signed digest, but rather it's the way that the digest is formatted in order to be signed. Rename it to "struct fsverity_formatted_digest" to prevent this confusion. Also update the struct's comment to clarify that it's specific to the built-in signature verification support and isn't a requirement for all fs-verity users. I'll be renaming this struct in fsverity-utils too. Acked-by: Luca Boccassi Link: https://lore.kernel.org/r/20201113211918.71883-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- Documentation/filesystems/fsverity.rst | 2 +- fs/verity/fsverity_private.h | 17 ++++++++++++----- fs/verity/signature.c | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst index a95536b6443c..b5978f34d6f7 100644 --- a/Documentation/filesystems/fsverity.rst +++ b/Documentation/filesystems/fsverity.rst @@ -372,7 +372,7 @@ kernel. Specifically, it adds support for: File measurements must be signed in the following format, which is similar to the structure used by `FS_IOC_MEASURE_VERITY`_:: - struct fsverity_signed_digest { + struct fsverity_formatted_digest { char magic[8]; /* must be "FSVerity" */ __le16 digest_algorithm; __le16 digest_size; diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index e96d99d5145e..75f8e18b44a5 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -101,12 +101,19 @@ struct fsverity_descriptor { sizeof(struct fsverity_descriptor)) /* - * Format in which verity file measurements are signed. This is the same as - * 'struct fsverity_digest', except here some magic bytes are prepended to - * provide some context about what is being signed in case the same key is used - * for non-fsverity purposes, and here the fields have fixed endianness. + * Format in which verity file measurements are signed in built-in signatures. + * This is the same as 'struct fsverity_digest', except here some magic bytes + * are prepended to provide some context about what is being signed in case the + * same key is used for non-fsverity purposes, and here the fields have fixed + * endianness. + * + * This struct is specific to the built-in signature verification support, which + * is optional. fs-verity users may also verify signatures in userspace, in + * which case userspace is responsible for deciding on what bytes are signed. + * This struct may still be used, but it doesn't have to be. For example, + * userspace could instead use a string like "sha256:$digest_as_hex_string". */ -struct fsverity_signed_digest { +struct fsverity_formatted_digest { char magic[8]; /* must be "FSVerity" */ __le16 digest_algorithm; __le16 digest_size; diff --git a/fs/verity/signature.c b/fs/verity/signature.c index 5c95db52b744..4ac9dff281b5 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -44,7 +44,7 @@ int fsverity_verify_signature(const struct fsverity_info *vi, const struct inode *inode = vi->inode; const struct fsverity_hash_alg *hash_alg = vi->tree_params.hash_alg; const u32 sig_size = le32_to_cpu(desc->sig_size); - struct fsverity_signed_digest *d; + struct fsverity_formatted_digest *d; int err; if (sig_size == 0) { From 58343491fe0980bd183ad47674e1e9bcbcb50fde Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 13 Nov 2020 13:19:17 -0800 Subject: [PATCH 35/45] fs-verity: rename "file measurement" to "file digest" I originally chose the name "file measurement" to refer to the fs-verity file digest to avoid confusion with traditional full-file digests or with the bare root hash of the Merkle tree. But the name "file measurement" hasn't caught on, and usually people are calling it something else, usually the "file digest". E.g. see "struct fsverity_digest" and "struct fsverity_formatted_digest", the libfsverity_compute_digest() and libfsverity_sign_digest() functions in libfsverity, and the "fsverity digest" command. Having multiple names for the same thing is always confusing. So to hopefully avoid confusion in the future, rename "fs-verity file measurement" to "fs-verity file digest". This leaves FS_IOC_MEASURE_VERITY as the only reference to "measure" in the kernel, which makes some amount of sense since the ioctl is actively "measuring" the file. I'll be renaming this in fsverity-utils too (though similarly the 'fsverity measure' command, which is a wrapper for FS_IOC_MEASURE_VERITY, will stay). Acked-by: Luca Boccassi Link: https://lore.kernel.org/r/20201113211918.71883-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- Documentation/filesystems/fsverity.rst | 60 +++++++++++++------------- fs/verity/enable.c | 6 +-- fs/verity/fsverity_private.h | 12 +++--- fs/verity/measure.c | 12 +++--- fs/verity/open.c | 22 +++++----- fs/verity/signature.c | 10 ++--- 6 files changed, 61 insertions(+), 61 deletions(-) diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst index b5978f34d6f7..7c23f0b58ee3 100644 --- a/Documentation/filesystems/fsverity.rst +++ b/Documentation/filesystems/fsverity.rst @@ -27,9 +27,9 @@ automatically verified against the file's Merkle tree. Reads of any corrupted data, including mmap reads, will fail. Userspace can use another ioctl to retrieve the root hash (actually -the "file measurement", which is a hash that includes the root hash) -that fs-verity is enforcing for the file. This ioctl executes in -constant time, regardless of the file size. +the "fs-verity file digest", which is a hash that includes the Merkle +tree root hash) that fs-verity is enforcing for the file. This ioctl +executes in constant time, regardless of the file size. fs-verity is essentially a way to hash a file in constant time, subject to the caveat that reads which would violate the hash will @@ -177,9 +177,10 @@ FS_IOC_ENABLE_VERITY can fail with the following errors: FS_IOC_MEASURE_VERITY --------------------- -The FS_IOC_MEASURE_VERITY ioctl retrieves the measurement of a verity -file. The file measurement is a digest that cryptographically -identifies the file contents that are being enforced on reads. +The FS_IOC_MEASURE_VERITY ioctl retrieves the digest of a verity file. +The fs-verity file digest is a cryptographic digest that identifies +the file contents that are being enforced on reads; it is computed via +a Merkle tree and is different from a traditional full-file digest. This ioctl takes in a pointer to a variable-length structure:: @@ -197,7 +198,7 @@ On success, 0 is returned and the kernel fills in the structure as follows: - ``digest_algorithm`` will be the hash algorithm used for the file - measurement. It will match ``fsverity_enable_arg::hash_algorithm``. + digest. It will match ``fsverity_enable_arg::hash_algorithm``. - ``digest_size`` will be the size of the digest in bytes, e.g. 32 for SHA-256. (This can be redundant with ``digest_algorithm``.) - ``digest`` will be the actual bytes of the digest. @@ -257,25 +258,24 @@ non-verity one, with the following exceptions: with EIO (for read()) or SIGBUS (for mmap() reads). - If the sysctl "fs.verity.require_signatures" is set to 1 and the - file's verity measurement is not signed by a key in the fs-verity - keyring, then opening the file will fail. See `Built-in signature - verification`_. + file is not signed by a key in the fs-verity keyring, then opening + the file will fail. See `Built-in signature verification`_. Direct access to the Merkle tree is not supported. Therefore, if a verity file is copied, or is backed up and restored, then it will lose its "verity"-ness. fs-verity is primarily meant for files like executables that are managed by a package manager. -File measurement computation -============================ +File digest computation +======================= This section describes how fs-verity hashes the file contents using a -Merkle tree to produce the "file measurement" which cryptographically -identifies the file contents. This algorithm is the same for all -filesystems that support fs-verity. +Merkle tree to produce the digest which cryptographically identifies +the file contents. This algorithm is the same for all filesystems +that support fs-verity. Userspace only needs to be aware of this algorithm if it needs to -compute the file measurement itself, e.g. in order to sign the file. +compute fs-verity file digests itself, e.g. in order to sign files. .. _fsverity_merkle_tree: @@ -325,9 +325,9 @@ can't a distinguish a large file from a small second file whose data is exactly the top-level hash block of the first file. Ambiguities also arise from the convention of padding to the next block boundary. -To solve this problem, the verity file measurement is actually -computed as a hash of the following structure, which contains the -Merkle tree root hash as well as other fields such as the file size:: +To solve this problem, the fs-verity file digest is actually computed +as a hash of the following structure, which contains the Merkle tree +root hash as well as other fields such as the file size:: struct fsverity_descriptor { __u8 version; /* must be 1 */ @@ -359,18 +359,18 @@ kernel. Specifically, it adds support for: certificates from being added. 2. `FS_IOC_ENABLE_VERITY`_ accepts a pointer to a PKCS#7 formatted - detached signature in DER format of the file measurement. On - success, this signature is persisted alongside the Merkle tree. + detached signature in DER format of the file's fs-verity digest. + On success, this signature is persisted alongside the Merkle tree. Then, any time the file is opened, the kernel will verify the - file's actual measurement against this signature, using the - certificates in the ".fs-verity" keyring. + file's actual digest against this signature, using the certificates + in the ".fs-verity" keyring. 3. A new sysctl "fs.verity.require_signatures" is made available. When set to 1, the kernel requires that all verity files have a - correctly signed file measurement as described in (2). + correctly signed digest as described in (2). -File measurements must be signed in the following format, which is -similar to the structure used by `FS_IOC_MEASURE_VERITY`_:: +fs-verity file digests must be signed in the following format, which +is similar to the structure used by `FS_IOC_MEASURE_VERITY`_:: struct fsverity_formatted_digest { char magic[8]; /* must be "FSVerity" */ @@ -421,8 +421,8 @@ can only be set by `FS_IOC_ENABLE_VERITY`_, and it cannot be cleared. ext4 also supports encryption, which can be used simultaneously with fs-verity. In this case, the plaintext data is verified rather than -the ciphertext. This is necessary in order to make the file -measurement meaningful, since every file is encrypted differently. +the ciphertext. This is necessary in order to make the fs-verity file +digest meaningful, since every file is encrypted differently. ext4 stores the verity metadata (Merkle tree and fsverity_descriptor) past the end of the file, starting at the first 64K boundary beyond @@ -592,8 +592,8 @@ weren't already directly answered in other parts of this document. :Q: Isn't fs-verity useless because the attacker can just modify the hashes in the Merkle tree, which is stored on-disk? :A: To verify the authenticity of an fs-verity file you must verify - the authenticity of the "file measurement", which is basically the - root hash of the Merkle tree. See `Use cases`_. + the authenticity of the "fs-verity file digest", which + incorporates the root hash of the Merkle tree. See `Use cases`_. :Q: Isn't fs-verity useless because the attacker can just replace a verity file with a non-verity one? diff --git a/fs/verity/enable.c b/fs/verity/enable.c index 04c9bba6c28e..835c3399fee5 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -398,9 +398,9 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg) * Some pages of the file may have been evicted from pagecache after * being used in the Merkle tree construction, then read into pagecache * again by another process reading from the file concurrently. Since - * these pages didn't undergo verification against the file measurement - * which fs-verity now claims to be enforcing, we have to wipe the - * pagecache to ensure that all future reads are verified. + * these pages didn't undergo verification against the file digest which + * fs-verity now claims to be enforcing, we have to wipe the pagecache + * to ensure that all future reads are verified. */ filemap_write_and_wait(inode->i_mapping); invalidate_inode_pages2(inode->i_mapping); diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 75f8e18b44a5..21e9930d65fb 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -67,19 +67,19 @@ struct merkle_tree_params { * When a verity file is first opened, an instance of this struct is allocated * and stored in ->i_verity_info; it remains until the inode is evicted. It * caches information about the Merkle tree that's needed to efficiently verify - * data read from the file. It also caches the file measurement. The Merkle - * tree pages themselves are not cached here, but the filesystem may cache them. + * data read from the file. It also caches the file digest. The Merkle tree + * pages themselves are not cached here, but the filesystem may cache them. */ struct fsverity_info { struct merkle_tree_params tree_params; u8 root_hash[FS_VERITY_MAX_DIGEST_SIZE]; - u8 measurement[FS_VERITY_MAX_DIGEST_SIZE]; + u8 file_digest[FS_VERITY_MAX_DIGEST_SIZE]; const struct inode *inode; }; /* - * Merkle tree properties. The file measurement is the hash of this structure - * excluding the signature and with the sig_size field set to 0. + * Merkle tree properties. The fs-verity file digest is the hash of this + * structure excluding the signature and with the sig_size field set to 0. */ struct fsverity_descriptor { __u8 version; /* must be 1 */ @@ -101,7 +101,7 @@ struct fsverity_descriptor { sizeof(struct fsverity_descriptor)) /* - * Format in which verity file measurements are signed in built-in signatures. + * Format in which fs-verity file digests are signed in built-in signatures. * This is the same as 'struct fsverity_digest', except here some magic bytes * are prepended to provide some context about what is being signed in case the * same key is used for non-fsverity purposes, and here the fields have fixed diff --git a/fs/verity/measure.c b/fs/verity/measure.c index 5300b8d38537..f0d7b30c62db 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Ioctl to get a verity file's measurement + * Ioctl to get a verity file's digest * * Copyright 2019 Google LLC */ @@ -10,12 +10,12 @@ #include /** - * fsverity_ioctl_measure() - get a verity file's measurement - * @filp: file to get measurement of + * fsverity_ioctl_measure() - get a verity file's digest + * @filp: file to get digest of * @_uarg: user pointer to fsverity_digest * - * Retrieve the file measurement that the kernel is enforcing for reads from a - * verity file. See the "FS_IOC_MEASURE_VERITY" section of + * Retrieve the file digest that the kernel is enforcing for reads from a verity + * file. See the "FS_IOC_MEASURE_VERITY" section of * Documentation/filesystems/fsverity.rst for the documentation. * * Return: 0 on success, -errno on failure @@ -51,7 +51,7 @@ int fsverity_ioctl_measure(struct file *filp, void __user *_uarg) if (copy_to_user(uarg, &arg, sizeof(arg))) return -EFAULT; - if (copy_to_user(uarg->digest, vi->measurement, hash_alg->digest_size)) + if (copy_to_user(uarg->digest, vi->file_digest, hash_alg->digest_size)) return -EFAULT; return 0; diff --git a/fs/verity/open.c b/fs/verity/open.c index a28d5be78a09..228d0eca3e2e 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -124,18 +124,18 @@ out_err: } /* - * Compute the file measurement by hashing the fsverity_descriptor excluding the + * Compute the file digest by hashing the fsverity_descriptor excluding the * signature and with the sig_size field set to 0. */ -static int compute_file_measurement(struct fsverity_hash_alg *hash_alg, - struct fsverity_descriptor *desc, - u8 *measurement) +static int compute_file_digest(struct fsverity_hash_alg *hash_alg, + struct fsverity_descriptor *desc, + u8 *file_digest) { __le32 sig_size = desc->sig_size; int err; desc->sig_size = 0; - err = fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), measurement); + err = fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), file_digest); desc->sig_size = sig_size; return err; @@ -199,15 +199,15 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode, memcpy(vi->root_hash, desc->root_hash, vi->tree_params.digest_size); - err = compute_file_measurement(vi->tree_params.hash_alg, desc, - vi->measurement); + err = compute_file_digest(vi->tree_params.hash_alg, desc, + vi->file_digest); if (err) { - fsverity_err(inode, "Error %d computing file measurement", err); + fsverity_err(inode, "Error %d computing file digest", err); goto out; } - pr_debug("Computed file measurement: %s:%*phN\n", + pr_debug("Computed file digest: %s:%*phN\n", vi->tree_params.hash_alg->name, - vi->tree_params.digest_size, vi->measurement); + vi->tree_params.digest_size, vi->file_digest); err = fsverity_verify_signature(vi, desc, desc_size); out: @@ -354,7 +354,7 @@ int __init fsverity_init_info_cache(void) { fsverity_info_cachep = KMEM_CACHE_USERCOPY(fsverity_info, SLAB_RECLAIM_ACCOUNT, - measurement); + file_digest); if (!fsverity_info_cachep) return -ENOMEM; return 0; diff --git a/fs/verity/signature.c b/fs/verity/signature.c index 4ac9dff281b5..a8591fbe6fe5 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -32,8 +32,8 @@ static struct key *fsverity_keyring; * @desc: the file's fsverity_descriptor * @desc_size: size of @desc * - * If the file's fs-verity descriptor includes a signature of the file - * measurement, verify it against the certificates in the fs-verity keyring. + * If the file's fs-verity descriptor includes a signature of the file digest, + * verify it against the certificates in the fs-verity keyring. * * Return: 0 on success (signature valid or not required); -errno on failure */ @@ -67,7 +67,7 @@ int fsverity_verify_signature(const struct fsverity_info *vi, memcpy(d->magic, "FSVerity", 8); d->digest_algorithm = cpu_to_le16(hash_alg - fsverity_hash_algs); d->digest_size = cpu_to_le16(hash_alg->digest_size); - memcpy(d->digest, vi->measurement, hash_alg->digest_size); + memcpy(d->digest, vi->file_digest, hash_alg->digest_size); err = verify_pkcs7_signature(d, sizeof(*d) + hash_alg->digest_size, desc->signature, sig_size, @@ -90,8 +90,8 @@ int fsverity_verify_signature(const struct fsverity_info *vi, return err; } - pr_debug("Valid signature for file measurement %s:%*phN\n", - hash_alg->name, hash_alg->digest_size, vi->measurement); + pr_debug("Valid signature for file digest %s:%*phN\n", + hash_alg->name, hash_alg->digest_size, vi->file_digest); return 0; } From 8a6f3470939e8f3e2e0aa42bdb1e913883f6024f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 13 Nov 2020 13:19:18 -0800 Subject: [PATCH 36/45] fs-verity: move structs needed for file signing to UAPI header Although it isn't used directly by the ioctls, "struct fsverity_descriptor" is required by userspace programs that need to compute fs-verity file digests in a standalone way. Therefore it's also needed to sign files in a standalone way. Similarly, "struct fsverity_formatted_digest" (previously called "struct fsverity_signed_digest" which was misleading) is also needed to sign files if the built-in signature verification is being used. Therefore, move these structs to the UAPI header. While doing this, try to make it clear that the signature-related fields in fsverity_descriptor aren't used in the file digest computation. Acked-by: Luca Boccassi Link: https://lore.kernel.org/r/20201113211918.71883-5-ebiggers@kernel.org Signed-off-by: Eric Biggers --- Documentation/filesystems/fsverity.rst | 6 +--- fs/verity/fsverity_private.h | 37 ------------------- include/uapi/linux/fsverity.h | 49 ++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 42 deletions(-) diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst index 7c23f0b58ee3..28ec7ba5b6df 100644 --- a/Documentation/filesystems/fsverity.rst +++ b/Documentation/filesystems/fsverity.rst @@ -334,17 +334,13 @@ root hash as well as other fields such as the file size:: __u8 hash_algorithm; /* Merkle tree hash algorithm */ __u8 log_blocksize; /* log2 of size of data and tree blocks */ __u8 salt_size; /* size of salt in bytes; 0 if none */ - __le32 sig_size; /* must be 0 */ + __le32 __reserved_0x04; /* must be 0 */ __le64 data_size; /* size of file the Merkle tree is built over */ __u8 root_hash[64]; /* Merkle tree root hash */ __u8 salt[32]; /* salt prepended to each hashed block */ __u8 __reserved[144]; /* must be 0's */ }; -Note that the ``sig_size`` field must be set to 0 for the purpose of -computing the file measurement, even if a signature was provided (or -will be provided) to `FS_IOC_ENABLE_VERITY`_. - Built-in signature verification =============================== diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 21e9930d65fb..96f7b332f54f 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -77,49 +77,12 @@ struct fsverity_info { const struct inode *inode; }; -/* - * Merkle tree properties. The fs-verity file digest is the hash of this - * structure excluding the signature and with the sig_size field set to 0. - */ -struct fsverity_descriptor { - __u8 version; /* must be 1 */ - __u8 hash_algorithm; /* Merkle tree hash algorithm */ - __u8 log_blocksize; /* log2 of size of data and tree blocks */ - __u8 salt_size; /* size of salt in bytes; 0 if none */ - __le32 sig_size; /* size of signature in bytes; 0 if none */ - __le64 data_size; /* size of file the Merkle tree is built over */ - __u8 root_hash[64]; /* Merkle tree root hash */ - __u8 salt[32]; /* salt prepended to each hashed block */ - __u8 __reserved[144]; /* must be 0's */ - __u8 signature[]; /* optional PKCS#7 signature */ -}; - /* Arbitrary limit to bound the kmalloc() size. Can be changed. */ #define FS_VERITY_MAX_DESCRIPTOR_SIZE 16384 #define FS_VERITY_MAX_SIGNATURE_SIZE (FS_VERITY_MAX_DESCRIPTOR_SIZE - \ sizeof(struct fsverity_descriptor)) -/* - * Format in which fs-verity file digests are signed in built-in signatures. - * This is the same as 'struct fsverity_digest', except here some magic bytes - * are prepended to provide some context about what is being signed in case the - * same key is used for non-fsverity purposes, and here the fields have fixed - * endianness. - * - * This struct is specific to the built-in signature verification support, which - * is optional. fs-verity users may also verify signatures in userspace, in - * which case userspace is responsible for deciding on what bytes are signed. - * This struct may still be used, but it doesn't have to be. For example, - * userspace could instead use a string like "sha256:$digest_as_hex_string". - */ -struct fsverity_formatted_digest { - char magic[8]; /* must be "FSVerity" */ - __le16 digest_algorithm; - __le16 digest_size; - __u8 digest[]; -}; - /* hash_algs.c */ extern struct fsverity_hash_alg fsverity_hash_algs[]; diff --git a/include/uapi/linux/fsverity.h b/include/uapi/linux/fsverity.h index da0daf6c193b..33f44156f8ea 100644 --- a/include/uapi/linux/fsverity.h +++ b/include/uapi/linux/fsverity.h @@ -34,6 +34,55 @@ struct fsverity_digest { __u8 digest[]; }; +/* + * Struct containing a file's Merkle tree properties. The fs-verity file digest + * is the hash of this struct. A userspace program needs this struct only if it + * needs to compute fs-verity file digests itself, e.g. in order to sign files. + * It isn't needed just to enable fs-verity on a file. + * + * Note: when computing the file digest, 'sig_size' and 'signature' must be left + * zero and empty, respectively. These fields are present only because some + * filesystems reuse this struct as part of their on-disk format. + */ +struct fsverity_descriptor { + __u8 version; /* must be 1 */ + __u8 hash_algorithm; /* Merkle tree hash algorithm */ + __u8 log_blocksize; /* log2 of size of data and tree blocks */ + __u8 salt_size; /* size of salt in bytes; 0 if none */ +#ifdef __KERNEL__ + __le32 sig_size; +#else + __le32 __reserved_0x04; /* must be 0 */ +#endif + __le64 data_size; /* size of file the Merkle tree is built over */ + __u8 root_hash[64]; /* Merkle tree root hash */ + __u8 salt[32]; /* salt prepended to each hashed block */ + __u8 __reserved[144]; /* must be 0's */ +#ifdef __KERNEL__ + __u8 signature[]; +#endif +}; + +/* + * Format in which fs-verity file digests are signed in built-in signatures. + * This is the same as 'struct fsverity_digest', except here some magic bytes + * are prepended to provide some context about what is being signed in case the + * same key is used for non-fsverity purposes, and here the fields have fixed + * endianness. + * + * This struct is specific to the built-in signature verification support, which + * is optional. fs-verity users may also verify signatures in userspace, in + * which case userspace is responsible for deciding on what bytes are signed. + * This struct may still be used, but it doesn't have to be. For example, + * userspace could instead use a string like "sha256:$digest_as_hex_string". + */ +struct fsverity_formatted_digest { + char magic[8]; /* must be "FSVerity" */ + __le16 digest_algorithm; + __le16 digest_size; + __u8 digest[]; +}; + #define FS_IOC_ENABLE_VERITY _IOW('f', 133, struct fsverity_enable_arg) #define FS_IOC_MEASURE_VERITY _IOWR('f', 134, struct fsverity_digest) From 4a7bdcc72022babd33abf2f15cfcc1d57f3a73f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:21:57 +0100 Subject: [PATCH 37/45] f2fs: remove a few bd_part checks bd_part is never NULL for a block device in use by a file system, so remove the checks. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Chao Yu Signed-off-by: Jens Axboe --- fs/f2fs/checkpoint.c | 3 +-- fs/f2fs/sysfs.c | 9 --------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c62be7bf819c..c7fc8a33616f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1387,8 +1387,7 @@ static void commit_checkpoint(struct f2fs_sb_info *sbi, static inline u64 get_sectors_written(struct block_device *bdev) { - return bdev->bd_part ? - (u64)part_stat_read(bdev->bd_part, sectors[STAT_WRITE]) : 0; + return (u64)part_stat_read(bdev->bd_part, sectors[STAT_WRITE]); } u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index d36a5a86b92c..dc2b102f3482 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -94,11 +94,6 @@ static ssize_t free_segments_show(struct f2fs_attr *a, static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - struct super_block *sb = sbi->sb; - - if (!sb->s_bdev->bd_part) - return sprintf(buf, "0\n"); - return sprintf(buf, "%llu\n", (unsigned long long)(sbi->kbytes_written + ((f2fs_get_sectors_written(sbi) - @@ -114,12 +109,8 @@ static ssize_t sb_status_show(struct f2fs_attr *a, static ssize_t features_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - struct super_block *sb = sbi->sb; int len = 0; - if (!sb->s_bdev->bd_part) - return sprintf(buf, "0\n"); - if (f2fs_sb_has_encrypt(sbi)) len += scnprintf(buf, PAGE_SIZE - len, "%s", "encryption"); From 206329e22bb90eb383341ed44897012cc478a4da Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:37 +0100 Subject: [PATCH 38/45] f2fs: use blkdev_issue_flush in __submit_flush_wait Use the blkdev_issue_flush helper instead of duplicating it. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- fs/f2fs/data.c | 3 ++- fs/f2fs/f2fs.h | 1 - fs/f2fs/segment.c | 12 +----------- 3 files changed, 3 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 21eb4296a56d..d638ee8dd225 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -53,7 +53,8 @@ static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask, return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset); } -struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio) +static struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, + bool noio) { if (noio) { /* No failure on bio allocation */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c51cccc9d21b..24dfca37c802 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3491,7 +3491,6 @@ void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi); */ int __init f2fs_init_bioset(void); void f2fs_destroy_bioset(void); -struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio); int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_bio(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c3988bd6eb80..c6ea8bed9502 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -563,17 +563,7 @@ do_sync: static int __submit_flush_wait(struct f2fs_sb_info *sbi, struct block_device *bdev) { - struct bio *bio; - int ret; - - bio = f2fs_bio_alloc(sbi, 0, false); - if (!bio) - return -ENOMEM; - - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; - bio_set_dev(bio, bdev); - ret = submit_bio_wait(bio); - bio_put(bio); + int ret = blkdev_issue_flush(bdev, GFP_NOFS, NULL); trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), test_opt(sbi, FLUSH_MERGE), ret); From aa977bd3d85086a3b2f2a26c802df21006f25693 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Jan 2021 15:52:38 +0100 Subject: [PATCH 39/45] f2fs: remove FAULT_ALLOC_BIO Sleeping bio allocations do not fail, which means that injecting an error into sleeping bio allocations is a little silly. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Acked-by: Damien Le Moal Signed-off-by: Jens Axboe --- Documentation/filesystems/f2fs.rst | 1 - fs/f2fs/data.c | 29 ++++------------------------- fs/f2fs/f2fs.h | 1 - fs/f2fs/super.c | 1 - 4 files changed, 4 insertions(+), 28 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 81c05baa8312..35ed01a5fbc9 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -179,7 +179,6 @@ fault_type=%d Support configuring fault injection type, should be FAULT_KVMALLOC 0x000000002 FAULT_PAGE_ALLOC 0x000000004 FAULT_PAGE_GET 0x000000008 - FAULT_ALLOC_BIO 0x000000010 FAULT_ALLOC_NID 0x000000020 FAULT_ORPHAN 0x000000040 FAULT_BLOCK 0x000000080 diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d638ee8dd225..e1352ec0f210 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -47,28 +47,6 @@ void f2fs_destroy_bioset(void) bioset_exit(&f2fs_bioset); } -static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask, - unsigned int nr_iovecs) -{ - return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset); -} - -static struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, - bool noio) -{ - if (noio) { - /* No failure on bio allocation */ - return __f2fs_bio_alloc(GFP_NOIO, npages); - } - - if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { - f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO); - return NULL; - } - - return __f2fs_bio_alloc(GFP_KERNEL, npages); -} - static bool __is_cp_guaranteed(struct page *page) { struct address_space *mapping = page->mapping; @@ -412,7 +390,7 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) struct f2fs_sb_info *sbi = fio->sbi; struct bio *bio; - bio = f2fs_bio_alloc(sbi, npages, true); + bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset); f2fs_target_device(sbi, fio->new_blkaddr, bio); if (is_read_io(fio->op)) { @@ -954,8 +932,9 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, struct bio_post_read_ctx *ctx; unsigned int post_read_steps = 0; - bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), - for_write); + bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL, + min_t(int, nr_pages, BIO_MAX_PAGES), + &f2fs_bioset); if (!bio) return ERR_PTR(-ENOMEM); f2fs_target_device(sbi, blkaddr, bio); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 24dfca37c802..392bfd8acd34 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -42,7 +42,6 @@ enum { FAULT_KVMALLOC, FAULT_PAGE_ALLOC, FAULT_PAGE_GET, - FAULT_ALLOC_BIO, FAULT_ALLOC_NID, FAULT_ORPHAN, FAULT_BLOCK, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3cf23968e8a0..f43fad1cb666 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -45,7 +45,6 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_KVMALLOC] = "kvmalloc", [FAULT_PAGE_ALLOC] = "page alloc", [FAULT_PAGE_GET] = "page get", - [FAULT_ALLOC_BIO] = "alloc bio", [FAULT_ALLOC_NID] = "alloc nid", [FAULT_ORPHAN] = "orphan", [FAULT_BLOCK] = "no more block", From 9112146082565dc7e9e9f6e2ef44bf45bea13c48 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 15 Jan 2021 10:18:14 -0800 Subject: [PATCH 40/45] fs-verity: factor out fsverity_get_descriptor() The FS_IOC_READ_VERITY_METADATA ioctl will need to return the fs-verity descriptor (and signature) to userspace. There are a few ways we could implement this: - Save a copy of the descriptor (and signature) in the fsverity_info struct that hangs off of the in-memory inode. However, this would waste memory since most of the time it wouldn't be needed. - Regenerate the descriptor from the merkle_tree_params in the fsverity_info. However, this wouldn't work for the signature, nor for the salt which the merkle_tree_params only contains indirectly as part of the 'hashstate'. It would also be error-prone. - Just get them from the filesystem again. The disadvantage is that in general we can't trust that they haven't been maliciously changed since the file has opened. However, the use cases for FS_IOC_READ_VERITY_METADATA don't require that it verifies the chain of trust. So this is okay as long as we do some basic validation. In preparation for implementing the third option, factor out a helper function fsverity_get_descriptor() which gets the descriptor (and appended signature) from the filesystem and does some basic validation. As part of this, start checking the sig_size field for overflow. Currently fsverity_verify_signature() does this. But the new ioctl will need this too, so do it earlier. Link: https://lore.kernel.org/r/20210115181819.34732-2-ebiggers@kernel.org Reviewed-by: Victor Hsieh Reviewed-by: Jaegeuk Kim Reviewed-by: Chao Yu Signed-off-by: Eric Biggers --- fs/verity/fsverity_private.h | 7 +- fs/verity/open.c | 130 +++++++++++++++++++++++------------ 2 files changed, 91 insertions(+), 46 deletions(-) diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 96f7b332f54f..a647b761750a 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -122,12 +122,17 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, const u8 *salt, size_t salt_size); struct fsverity_info *fsverity_create_info(const struct inode *inode, - void *desc, size_t desc_size); + struct fsverity_descriptor *desc, + size_t desc_size); void fsverity_set_info(struct inode *inode, struct fsverity_info *vi); void fsverity_free_info(struct fsverity_info *vi); +int fsverity_get_descriptor(struct inode *inode, + struct fsverity_descriptor **desc_ret, + size_t *desc_size_ret); + int __init fsverity_init_info_cache(void); void __init fsverity_exit_info_cache(void); diff --git a/fs/verity/open.c b/fs/verity/open.c index 228d0eca3e2e..a987bb785e9b 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -142,45 +142,17 @@ static int compute_file_digest(struct fsverity_hash_alg *hash_alg, } /* - * Validate the given fsverity_descriptor and create a new fsverity_info from - * it. The signature (if present) is also checked. + * Create a new fsverity_info from the given fsverity_descriptor (with optional + * appended signature), and check the signature if present. The + * fsverity_descriptor must have already undergone basic validation. */ struct fsverity_info *fsverity_create_info(const struct inode *inode, - void *_desc, size_t desc_size) + struct fsverity_descriptor *desc, + size_t desc_size) { - struct fsverity_descriptor *desc = _desc; struct fsverity_info *vi; int err; - if (desc_size < sizeof(*desc)) { - fsverity_err(inode, "Unrecognized descriptor size: %zu bytes", - desc_size); - return ERR_PTR(-EINVAL); - } - - if (desc->version != 1) { - fsverity_err(inode, "Unrecognized descriptor version: %u", - desc->version); - return ERR_PTR(-EINVAL); - } - - if (memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) { - fsverity_err(inode, "Reserved bits set in descriptor"); - return ERR_PTR(-EINVAL); - } - - if (desc->salt_size > sizeof(desc->salt)) { - fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size); - return ERR_PTR(-EINVAL); - } - - if (le64_to_cpu(desc->data_size) != inode->i_size) { - fsverity_err(inode, - "Wrong data_size: %llu (desc) != %lld (inode)", - le64_to_cpu(desc->data_size), inode->i_size); - return ERR_PTR(-EINVAL); - } - vi = kmem_cache_zalloc(fsverity_info_cachep, GFP_KERNEL); if (!vi) return ERR_PTR(-ENOMEM); @@ -245,15 +217,57 @@ void fsverity_free_info(struct fsverity_info *vi) kmem_cache_free(fsverity_info_cachep, vi); } -/* Ensure the inode has an ->i_verity_info */ -static int ensure_verity_info(struct inode *inode) +static bool validate_fsverity_descriptor(struct inode *inode, + const struct fsverity_descriptor *desc, + size_t desc_size) { - struct fsverity_info *vi = fsverity_get_info(inode); - struct fsverity_descriptor *desc; - int res; + if (desc_size < sizeof(*desc)) { + fsverity_err(inode, "Unrecognized descriptor size: %zu bytes", + desc_size); + return false; + } - if (vi) - return 0; + if (desc->version != 1) { + fsverity_err(inode, "Unrecognized descriptor version: %u", + desc->version); + return false; + } + + if (memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) { + fsverity_err(inode, "Reserved bits set in descriptor"); + return false; + } + + if (desc->salt_size > sizeof(desc->salt)) { + fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size); + return false; + } + + if (le64_to_cpu(desc->data_size) != inode->i_size) { + fsverity_err(inode, + "Wrong data_size: %llu (desc) != %lld (inode)", + le64_to_cpu(desc->data_size), inode->i_size); + return false; + } + + if (le32_to_cpu(desc->sig_size) > desc_size - sizeof(*desc)) { + fsverity_err(inode, "Signature overflows verity descriptor"); + return false; + } + + return true; +} + +/* + * Read the inode's fsverity_descriptor (with optional appended signature) from + * the filesystem, and do basic validation of it. + */ +int fsverity_get_descriptor(struct inode *inode, + struct fsverity_descriptor **desc_ret, + size_t *desc_size_ret) +{ + int res; + struct fsverity_descriptor *desc; res = inode->i_sb->s_vop->get_verity_descriptor(inode, NULL, 0); if (res < 0) { @@ -272,20 +286,46 @@ static int ensure_verity_info(struct inode *inode) res = inode->i_sb->s_vop->get_verity_descriptor(inode, desc, res); if (res < 0) { fsverity_err(inode, "Error %d reading verity descriptor", res); - goto out_free_desc; + kfree(desc); + return res; } - vi = fsverity_create_info(inode, desc, res); + if (!validate_fsverity_descriptor(inode, desc, res)) { + kfree(desc); + return -EINVAL; + } + + *desc_ret = desc; + *desc_size_ret = res; + return 0; +} + +/* Ensure the inode has an ->i_verity_info */ +static int ensure_verity_info(struct inode *inode) +{ + struct fsverity_info *vi = fsverity_get_info(inode); + struct fsverity_descriptor *desc; + size_t desc_size; + int err; + + if (vi) + return 0; + + err = fsverity_get_descriptor(inode, &desc, &desc_size); + if (err) + return err; + + vi = fsverity_create_info(inode, desc, desc_size); if (IS_ERR(vi)) { - res = PTR_ERR(vi); + err = PTR_ERR(vi); goto out_free_desc; } fsverity_set_info(inode, vi); - res = 0; + err = 0; out_free_desc: kfree(desc); - return res; + return err; } /** From b8dc76de9074256463e4c5f64e8da5e68d174366 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 15 Jan 2021 10:18:15 -0800 Subject: [PATCH 41/45] fs-verity: don't pass whole descriptor to fsverity_verify_signature() Now that fsverity_get_descriptor() validates the sig_size field, fsverity_verify_signature() doesn't need to do it. Just change the prototype of fsverity_verify_signature() to take the signature directly rather than take a fsverity_descriptor. Link: https://lore.kernel.org/r/20210115181819.34732-3-ebiggers@kernel.org Reviewed-by: Victor Hsieh Reviewed-by: Jaegeuk Kim Reviewed-by: Amy Parker Reviewed-by: Chao Yu Signed-off-by: Eric Biggers --- fs/verity/fsverity_private.h | 6 ++---- fs/verity/open.c | 3 ++- fs/verity/signature.c | 20 ++++++-------------- 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index a647b761750a..a275cad1548a 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -140,15 +140,13 @@ void __init fsverity_exit_info_cache(void); #ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES int fsverity_verify_signature(const struct fsverity_info *vi, - const struct fsverity_descriptor *desc, - size_t desc_size); + const u8 *signature, size_t sig_size); int __init fsverity_init_signature(void); #else /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */ static inline int fsverity_verify_signature(const struct fsverity_info *vi, - const struct fsverity_descriptor *desc, - size_t desc_size) + const u8 *signature, size_t sig_size) { return 0; } diff --git a/fs/verity/open.c b/fs/verity/open.c index a987bb785e9b..60ff8af7219f 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -181,7 +181,8 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode, vi->tree_params.hash_alg->name, vi->tree_params.digest_size, vi->file_digest); - err = fsverity_verify_signature(vi, desc, desc_size); + err = fsverity_verify_signature(vi, desc->signature, + le32_to_cpu(desc->sig_size)); out: if (err) { fsverity_free_info(vi); diff --git a/fs/verity/signature.c b/fs/verity/signature.c index a8591fbe6fe5..b42f6e832825 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -29,21 +29,19 @@ static struct key *fsverity_keyring; /** * fsverity_verify_signature() - check a verity file's signature * @vi: the file's fsverity_info - * @desc: the file's fsverity_descriptor - * @desc_size: size of @desc + * @signature: the file's built-in signature + * @sig_size: size of signature in bytes, or 0 if no signature * - * If the file's fs-verity descriptor includes a signature of the file digest, - * verify it against the certificates in the fs-verity keyring. + * If the file includes a signature of its fs-verity file digest, verify it + * against the certificates in the fs-verity keyring. * * Return: 0 on success (signature valid or not required); -errno on failure */ int fsverity_verify_signature(const struct fsverity_info *vi, - const struct fsverity_descriptor *desc, - size_t desc_size) + const u8 *signature, size_t sig_size) { const struct inode *inode = vi->inode; const struct fsverity_hash_alg *hash_alg = vi->tree_params.hash_alg; - const u32 sig_size = le32_to_cpu(desc->sig_size); struct fsverity_formatted_digest *d; int err; @@ -56,11 +54,6 @@ int fsverity_verify_signature(const struct fsverity_info *vi, return 0; } - if (sig_size > desc_size - sizeof(*desc)) { - fsverity_err(inode, "Signature overflows verity descriptor"); - return -EBADMSG; - } - d = kzalloc(sizeof(*d) + hash_alg->digest_size, GFP_KERNEL); if (!d) return -ENOMEM; @@ -70,8 +63,7 @@ int fsverity_verify_signature(const struct fsverity_info *vi, memcpy(d->digest, vi->file_digest, hash_alg->digest_size); err = verify_pkcs7_signature(d, sizeof(*d) + hash_alg->digest_size, - desc->signature, sig_size, - fsverity_keyring, + signature, sig_size, fsverity_keyring, VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL); kfree(d); From aad86a03651462022be1d8a1589f538af149aac6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 15 Jan 2021 10:18:16 -0800 Subject: [PATCH 42/45] fs-verity: add FS_IOC_READ_VERITY_METADATA ioctl Add an ioctl FS_IOC_READ_VERITY_METADATA which will allow reading verity metadata from a file that has fs-verity enabled, including: - The Merkle tree - The fsverity_descriptor (not including the signature if present) - The built-in signature, if present This ioctl has similar semantics to pread(). It is passed the type of metadata to read (one of the above three), and a buffer, offset, and size. It returns the number of bytes read or an error. Separate patches will add support for each of the above metadata types. This patch just adds the ioctl itself. This ioctl doesn't make any assumption about where the metadata is stored on-disk. It does assume the metadata is in a stable format, but that's basically already the case: - The Merkle tree and fsverity_descriptor are defined by how fs-verity file digests are computed; see the "File digest computation" section of Documentation/filesystems/fsverity.rst. Technically, the way in which the levels of the tree are ordered relative to each other wasn't previously specified, but it's logical to put the root level first. - The built-in signature is the value passed to FS_IOC_ENABLE_VERITY. This ioctl is useful because it allows writing a server program that takes a verity file and serves it to a client program, such that the client can do its own fs-verity compatible verification of the file. This only makes sense if the client doesn't trust the server and if the server needs to provide the storage for the client. More concretely, there is interest in using this ability in Android to export APK files (which are protected by fs-verity) to "protected VMs". This would use Protected KVM (https://lwn.net/Articles/836693), which provides an isolated execution environment without having to trust the traditional "host". A "guest" VM can boot from a signed image and perform specific tasks in a minimum trusted environment using files that have fs-verity enabled on the host, without trusting the host or requiring that the guest has its own trusted storage. Technically, it would be possible to duplicate the metadata and store it in separate files for serving. However, that would be less efficient and would require extra care in userspace to maintain file consistency. In addition to the above, the ability to read the built-in signatures is useful because it allows a system that is using the in-kernel signature verification to migrate to userspace signature verification. Link: https://lore.kernel.org/r/20210115181819.34732-4-ebiggers@kernel.org Reviewed-by: Victor Hsieh Acked-by: Jaegeuk Kim Reviewed-by: Chao Yu Signed-off-by: Eric Biggers --- Documentation/filesystems/fsverity.rst | 57 ++++++++++++++++++++++++++ fs/ext4/ioctl.c | 7 ++++ fs/f2fs/file.c | 11 +++++ fs/verity/Makefile | 1 + fs/verity/read_metadata.c | 55 +++++++++++++++++++++++++ include/linux/fsverity.h | 12 ++++++ include/uapi/linux/fsverity.h | 10 +++++ 7 files changed, 153 insertions(+) create mode 100644 fs/verity/read_metadata.c diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst index 28ec7ba5b6df..a9a7663ec980 100644 --- a/Documentation/filesystems/fsverity.rst +++ b/Documentation/filesystems/fsverity.rst @@ -217,6 +217,63 @@ FS_IOC_MEASURE_VERITY can fail with the following errors: - ``EOVERFLOW``: the digest is longer than the specified ``digest_size`` bytes. Try providing a larger buffer. +FS_IOC_READ_VERITY_METADATA +--------------------------- + +The FS_IOC_READ_VERITY_METADATA ioctl reads verity metadata from a +verity file. This ioctl is available since Linux v5.12. + +This ioctl allows writing a server program that takes a verity file +and serves it to a client program, such that the client can do its own +fs-verity compatible verification of the file. This only makes sense +if the client doesn't trust the server and if the server needs to +provide the storage for the client. + +This is a fairly specialized use case, and most fs-verity users won't +need this ioctl. + +This ioctl takes in a pointer to the following structure:: + + struct fsverity_read_metadata_arg { + __u64 metadata_type; + __u64 offset; + __u64 length; + __u64 buf_ptr; + __u64 __reserved; + }; + +``metadata_type`` specifies the type of metadata to read. + +The semantics are similar to those of ``pread()``. ``offset`` +specifies the offset in bytes into the metadata item to read from, and +``length`` specifies the maximum number of bytes to read from the +metadata item. ``buf_ptr`` is the pointer to the buffer to read into, +cast to a 64-bit integer. ``__reserved`` must be 0. On success, the +number of bytes read is returned. 0 is returned at the end of the +metadata item. The returned length may be less than ``length``, for +example if the ioctl is interrupted. + +The metadata returned by FS_IOC_READ_VERITY_METADATA isn't guaranteed +to be authenticated against the file digest that would be returned by +`FS_IOC_MEASURE_VERITY`_, as the metadata is expected to be used to +implement fs-verity compatible verification anyway (though absent a +malicious disk, the metadata will indeed match). E.g. to implement +this ioctl, the filesystem is allowed to just read the Merkle tree +blocks from disk without actually verifying the path to the root node. + +FS_IOC_READ_VERITY_METADATA can fail with the following errors: + +- ``EFAULT``: the caller provided inaccessible memory +- ``EINTR``: the ioctl was interrupted before any data was read +- ``EINVAL``: reserved fields were set, or ``offset + length`` + overflowed +- ``ENODATA``: the file is not a verity file +- ``ENOTTY``: this type of filesystem does not implement fs-verity, or + this ioctl is not yet implemented on it +- ``EOPNOTSUPP``: the kernel was not configured with fs-verity + support, or the filesystem superblock has not had the 'verity' + feature enabled on it. (See `Filesystem support`_.) + FS_IOC_GETFLAGS --------------- diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 5214b4a93be9..1cad30aeb51b 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1102,6 +1102,12 @@ resizefs_out: return -EOPNOTSUPP; return fsverity_ioctl_measure(filp, (void __user *)arg); + case FS_IOC_READ_VERITY_METADATA: + if (!ext4_has_feature_verity(sb)) + return -EOPNOTSUPP; + return fsverity_ioctl_read_metadata(filp, + (const void __user *)arg); + default: return -ENOTTY; } @@ -1172,6 +1178,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC_GETFSMAP: case FS_IOC_ENABLE_VERITY: case FS_IOC_MEASURE_VERITY: + case FS_IOC_READ_VERITY_METADATA: break; default: return -ENOIOCTLCMD; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7726519d6168..ae9bc814fc11 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3400,6 +3400,14 @@ static int f2fs_ioc_measure_verity(struct file *filp, unsigned long arg) return fsverity_ioctl_measure(filp, (void __user *)arg); } +static int f2fs_ioc_read_verity_metadata(struct file *filp, unsigned long arg) +{ + if (!f2fs_sb_has_verity(F2FS_I_SB(file_inode(filp)))) + return -EOPNOTSUPP; + + return fsverity_ioctl_read_metadata(filp, (const void __user *)arg); +} + static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -4314,6 +4322,8 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_enable_verity(filp, arg); case FS_IOC_MEASURE_VERITY: return f2fs_ioc_measure_verity(filp, arg); + case FS_IOC_READ_VERITY_METADATA: + return f2fs_ioc_read_verity_metadata(filp, arg); case FS_IOC_GETFSLABEL: return f2fs_ioc_getfslabel(filp, arg); case FS_IOC_SETFSLABEL: @@ -4570,6 +4580,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_RESIZE_FS: case FS_IOC_ENABLE_VERITY: case FS_IOC_MEASURE_VERITY: + case FS_IOC_READ_VERITY_METADATA: case FS_IOC_GETFSLABEL: case FS_IOC_SETFSLABEL: case F2FS_IOC_GET_COMPRESS_BLOCKS: diff --git a/fs/verity/Makefile b/fs/verity/Makefile index 570e9136334d..435559a4fa9e 100644 --- a/fs/verity/Makefile +++ b/fs/verity/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_FS_VERITY) += enable.o \ init.o \ measure.o \ open.o \ + read_metadata.o \ verify.o obj-$(CONFIG_FS_VERITY_BUILTIN_SIGNATURES) += signature.o diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c new file mode 100644 index 000000000000..43be990fd53e --- /dev/null +++ b/fs/verity/read_metadata.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Ioctl to read verity metadata + * + * Copyright 2021 Google LLC + */ + +#include "fsverity_private.h" + +#include + +/** + * fsverity_ioctl_read_metadata() - read verity metadata from a file + * @filp: file to read the metadata from + * @uarg: user pointer to fsverity_read_metadata_arg + * + * Return: length read on success, 0 on EOF, -errno on failure + */ +int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg) +{ + struct inode *inode = file_inode(filp); + const struct fsverity_info *vi; + struct fsverity_read_metadata_arg arg; + int length; + void __user *buf; + + vi = fsverity_get_info(inode); + if (!vi) + return -ENODATA; /* not a verity file */ + /* + * Note that we don't have to explicitly check that the file is open for + * reading, since verity files can only be opened for reading. + */ + + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (arg.__reserved) + return -EINVAL; + + /* offset + length must not overflow. */ + if (arg.offset + arg.length < arg.offset) + return -EINVAL; + + /* Ensure that the return value will fit in INT_MAX. */ + length = min_t(u64, arg.length, INT_MAX); + + buf = u64_to_user_ptr(arg.buf_ptr); + + switch (arg.metadata_type) { + default: + return -EINVAL; + } +} +EXPORT_SYMBOL_GPL(fsverity_ioctl_read_metadata); diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h index c1144a450392..b568b3c7d095 100644 --- a/include/linux/fsverity.h +++ b/include/linux/fsverity.h @@ -138,6 +138,10 @@ int fsverity_file_open(struct inode *inode, struct file *filp); int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr); void fsverity_cleanup_inode(struct inode *inode); +/* read_metadata.c */ + +int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg); + /* verify.c */ bool fsverity_verify_page(struct page *page); @@ -183,6 +187,14 @@ static inline void fsverity_cleanup_inode(struct inode *inode) { } +/* read_metadata.c */ + +static inline int fsverity_ioctl_read_metadata(struct file *filp, + const void __user *uarg) +{ + return -EOPNOTSUPP; +} + /* verify.c */ static inline bool fsverity_verify_page(struct page *page) diff --git a/include/uapi/linux/fsverity.h b/include/uapi/linux/fsverity.h index 33f44156f8ea..e062751294d0 100644 --- a/include/uapi/linux/fsverity.h +++ b/include/uapi/linux/fsverity.h @@ -83,7 +83,17 @@ struct fsverity_formatted_digest { __u8 digest[]; }; +struct fsverity_read_metadata_arg { + __u64 metadata_type; + __u64 offset; + __u64 length; + __u64 buf_ptr; + __u64 __reserved; +}; + #define FS_IOC_ENABLE_VERITY _IOW('f', 133, struct fsverity_enable_arg) #define FS_IOC_MEASURE_VERITY _IOWR('f', 134, struct fsverity_digest) +#define FS_IOC_READ_VERITY_METADATA \ + _IOWR('f', 135, struct fsverity_read_metadata_arg) #endif /* _UAPI_LINUX_FSVERITY_H */ From 11fa0d75de393ee16a1d2d3e6fd63eed8d015373 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 15 Jan 2021 10:18:17 -0800 Subject: [PATCH 43/45] fs-verity: support reading Merkle tree with ioctl Add support for FS_VERITY_METADATA_TYPE_MERKLE_TREE to FS_IOC_READ_VERITY_METADATA. This allows a userspace server program to retrieve the Merkle tree of a verity file for serving to a client which implements fs-verity compatible verification. See the patch which introduced FS_IOC_READ_VERITY_METADATA for more details. This has been tested using a new xfstest which calls this ioctl via a new subcommand for the 'fsverity' program from fsverity-utils. Link: https://lore.kernel.org/r/20210115181819.34732-5-ebiggers@kernel.org Reviewed-by: Victor Hsieh Reviewed-by: Jaegeuk Kim Reviewed-by: Chao Yu Signed-off-by: Eric Biggers --- Documentation/filesystems/fsverity.rst | 10 +++- fs/verity/read_metadata.c | 70 ++++++++++++++++++++++++++ include/uapi/linux/fsverity.h | 2 + 3 files changed, 81 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst index a9a7663ec980..e698a4383eb0 100644 --- a/Documentation/filesystems/fsverity.rst +++ b/Documentation/filesystems/fsverity.rst @@ -234,6 +234,8 @@ need this ioctl. This ioctl takes in a pointer to the following structure:: + #define FS_VERITY_METADATA_TYPE_MERKLE_TREE 1 + struct fsverity_read_metadata_arg { __u64 metadata_type; __u64 offset; @@ -242,7 +244,13 @@ This ioctl takes in a pointer to the following structure:: __u64 __reserved; }; -``metadata_type`` specifies the type of metadata to read. +``metadata_type`` specifies the type of metadata to read: + +- ``FS_VERITY_METADATA_TYPE_MERKLE_TREE`` reads the blocks of the + Merkle tree. The blocks are returned in order from the root level + to the leaf level. Within each level, the blocks are returned in + the same order that their hashes are themselves hashed. + See `Merkle tree`_ for more information. The semantics are similar to those of ``pread()``. ``offset`` specifies the offset in bytes into the metadata item to read from, and diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c index 43be990fd53e..0f8ad2991cf9 100644 --- a/fs/verity/read_metadata.c +++ b/fs/verity/read_metadata.c @@ -7,8 +7,75 @@ #include "fsverity_private.h" +#include +#include +#include #include +static int fsverity_read_merkle_tree(struct inode *inode, + const struct fsverity_info *vi, + void __user *buf, u64 offset, int length) +{ + const struct fsverity_operations *vops = inode->i_sb->s_vop; + u64 end_offset; + unsigned int offs_in_page; + pgoff_t index, last_index; + int retval = 0; + int err = 0; + + end_offset = min(offset + length, vi->tree_params.tree_size); + if (offset >= end_offset) + return 0; + offs_in_page = offset_in_page(offset); + last_index = (end_offset - 1) >> PAGE_SHIFT; + + /* + * Iterate through each Merkle tree page in the requested range and copy + * the requested portion to userspace. Note that the Merkle tree block + * size isn't important here, as we are returning a byte stream; i.e., + * we can just work with pages even if the tree block size != PAGE_SIZE. + */ + for (index = offset >> PAGE_SHIFT; index <= last_index; index++) { + unsigned long num_ra_pages = + min_t(unsigned long, last_index - index + 1, + inode->i_sb->s_bdi->io_pages); + unsigned int bytes_to_copy = min_t(u64, end_offset - offset, + PAGE_SIZE - offs_in_page); + struct page *page; + const void *virt; + + page = vops->read_merkle_tree_page(inode, index, num_ra_pages); + if (IS_ERR(page)) { + err = PTR_ERR(page); + fsverity_err(inode, + "Error %d reading Merkle tree page %lu", + err, index); + break; + } + + virt = kmap(page); + if (copy_to_user(buf, virt + offs_in_page, bytes_to_copy)) { + kunmap(page); + put_page(page); + err = -EFAULT; + break; + } + kunmap(page); + put_page(page); + + retval += bytes_to_copy; + buf += bytes_to_copy; + offset += bytes_to_copy; + + if (fatal_signal_pending(current)) { + err = -EINTR; + break; + } + cond_resched(); + offs_in_page = 0; + } + return retval ? retval : err; +} /** * fsverity_ioctl_read_metadata() - read verity metadata from a file * @filp: file to read the metadata from @@ -48,6 +115,9 @@ int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg) buf = u64_to_user_ptr(arg.buf_ptr); switch (arg.metadata_type) { + case FS_VERITY_METADATA_TYPE_MERKLE_TREE: + return fsverity_read_merkle_tree(inode, vi, buf, arg.offset, + length); default: return -EINVAL; } diff --git a/include/uapi/linux/fsverity.h b/include/uapi/linux/fsverity.h index e062751294d0..94003b153cb3 100644 --- a/include/uapi/linux/fsverity.h +++ b/include/uapi/linux/fsverity.h @@ -83,6 +83,8 @@ struct fsverity_formatted_digest { __u8 digest[]; }; +#define FS_VERITY_METADATA_TYPE_MERKLE_TREE 1 + struct fsverity_read_metadata_arg { __u64 metadata_type; __u64 offset; From 67c2d4aa2ed77afb08a38e2d4bd530ec1168bd6a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 15 Jan 2021 10:18:18 -0800 Subject: [PATCH 44/45] fs-verity: support reading descriptor with ioctl Add support for FS_VERITY_METADATA_TYPE_DESCRIPTOR to FS_IOC_READ_VERITY_METADATA. This allows a userspace server program to retrieve the fs-verity descriptor of a file for serving to a client which implements fs-verity compatible verification. See the patch which introduced FS_IOC_READ_VERITY_METADATA for more details. "fs-verity descriptor" here means only the part that userspace cares about because it is hashed to produce the file digest. It doesn't include the signature which ext4 and f2fs append to the fsverity_descriptor struct when storing it on-disk, since that way of storing the signature is an implementation detail. The next patch adds a separate metadata_type value for retrieving the signature separately. This has been tested using a new xfstest which calls this ioctl via a new subcommand for the 'fsverity' program from fsverity-utils. Link: https://lore.kernel.org/r/20210115181819.34732-6-ebiggers@kernel.org Reviewed-by: Victor Hsieh Reviewed-by: Jaegeuk Kim Reviewed-by: Chao Yu Signed-off-by: Eric Biggers --- Documentation/filesystems/fsverity.rst | 4 +++ fs/verity/read_metadata.c | 40 ++++++++++++++++++++++++++ include/uapi/linux/fsverity.h | 1 + 3 files changed, 45 insertions(+) diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst index e698a4383eb0..633166c84c80 100644 --- a/Documentation/filesystems/fsverity.rst +++ b/Documentation/filesystems/fsverity.rst @@ -235,6 +235,7 @@ need this ioctl. This ioctl takes in a pointer to the following structure:: #define FS_VERITY_METADATA_TYPE_MERKLE_TREE 1 + #define FS_VERITY_METADATA_TYPE_DESCRIPTOR 2 struct fsverity_read_metadata_arg { __u64 metadata_type; @@ -252,6 +253,9 @@ This ioctl takes in a pointer to the following structure:: the same order that their hashes are themselves hashed. See `Merkle tree`_ for more information. +- ``FS_VERITY_METADATA_TYPE_DESCRIPTOR`` reads the fs-verity + descriptor. See `fs-verity descriptor`_. + The semantics are similar to those of ``pread()``. ``offset`` specifies the offset in bytes into the metadata item to read from, and ``length`` specifies the maximum number of bytes to read from the diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c index 0f8ad2991cf9..2dea6dd3bb05 100644 --- a/fs/verity/read_metadata.c +++ b/fs/verity/read_metadata.c @@ -76,6 +76,44 @@ static int fsverity_read_merkle_tree(struct inode *inode, } return retval ? retval : err; } + +/* Copy the requested portion of the buffer to userspace. */ +static int fsverity_read_buffer(void __user *dst, u64 offset, int length, + const void *src, size_t src_length) +{ + if (offset >= src_length) + return 0; + src += offset; + src_length -= offset; + + length = min_t(size_t, length, src_length); + + if (copy_to_user(dst, src, length)) + return -EFAULT; + + return length; +} + +static int fsverity_read_descriptor(struct inode *inode, + void __user *buf, u64 offset, int length) +{ + struct fsverity_descriptor *desc; + size_t desc_size; + int res; + + res = fsverity_get_descriptor(inode, &desc, &desc_size); + if (res) + return res; + + /* don't include the signature */ + desc_size = offsetof(struct fsverity_descriptor, signature); + desc->sig_size = 0; + + res = fsverity_read_buffer(buf, offset, length, desc, desc_size); + + kfree(desc); + return res; +} /** * fsverity_ioctl_read_metadata() - read verity metadata from a file * @filp: file to read the metadata from @@ -118,6 +156,8 @@ int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg) case FS_VERITY_METADATA_TYPE_MERKLE_TREE: return fsverity_read_merkle_tree(inode, vi, buf, arg.offset, length); + case FS_VERITY_METADATA_TYPE_DESCRIPTOR: + return fsverity_read_descriptor(inode, buf, arg.offset, length); default: return -EINVAL; } diff --git a/include/uapi/linux/fsverity.h b/include/uapi/linux/fsverity.h index 94003b153cb3..41abc283dbcc 100644 --- a/include/uapi/linux/fsverity.h +++ b/include/uapi/linux/fsverity.h @@ -84,6 +84,7 @@ struct fsverity_formatted_digest { }; #define FS_VERITY_METADATA_TYPE_MERKLE_TREE 1 +#define FS_VERITY_METADATA_TYPE_DESCRIPTOR 2 struct fsverity_read_metadata_arg { __u64 metadata_type; From 5f4d2c15ba07a95486757b027347a87f7b95a206 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 15 Jan 2021 10:18:19 -0800 Subject: [PATCH 45/45] fs-verity: support reading signature with ioctl Add support for FS_VERITY_METADATA_TYPE_SIGNATURE to FS_IOC_READ_VERITY_METADATA. This allows a userspace server program to retrieve the built-in signature (if present) of a verity file for serving to a client which implements fs-verity compatible verification. See the patch which introduced FS_IOC_READ_VERITY_METADATA for more details. The ability for userspace to read the built-in signatures is also useful because it allows a system that is using the in-kernel signature verification to migrate to userspace signature verification. This has been tested using a new xfstest which calls this ioctl via a new subcommand for the 'fsverity' program from fsverity-utils. Link: https://lore.kernel.org/r/20210115181819.34732-7-ebiggers@kernel.org Reviewed-by: Victor Hsieh Reviewed-by: Jaegeuk Kim Reviewed-by: Chao Yu Signed-off-by: Eric Biggers --- Documentation/filesystems/fsverity.rst | 9 +++++++- fs/verity/read_metadata.c | 30 ++++++++++++++++++++++++++ include/uapi/linux/fsverity.h | 1 + 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst index 633166c84c80..8e73ac0c6ae6 100644 --- a/Documentation/filesystems/fsverity.rst +++ b/Documentation/filesystems/fsverity.rst @@ -236,6 +236,7 @@ This ioctl takes in a pointer to the following structure:: #define FS_VERITY_METADATA_TYPE_MERKLE_TREE 1 #define FS_VERITY_METADATA_TYPE_DESCRIPTOR 2 + #define FS_VERITY_METADATA_TYPE_SIGNATURE 3 struct fsverity_read_metadata_arg { __u64 metadata_type; @@ -256,6 +257,10 @@ This ioctl takes in a pointer to the following structure:: - ``FS_VERITY_METADATA_TYPE_DESCRIPTOR`` reads the fs-verity descriptor. See `fs-verity descriptor`_. +- ``FS_VERITY_METADATA_TYPE_SIGNATURE`` reads the signature which was + passed to FS_IOC_ENABLE_VERITY, if any. See `Built-in signature + verification`_. + The semantics are similar to those of ``pread()``. ``offset`` specifies the offset in bytes into the metadata item to read from, and ``length`` specifies the maximum number of bytes to read from the @@ -279,7 +284,9 @@ FS_IOC_READ_VERITY_METADATA can fail with the following errors: - ``EINTR``: the ioctl was interrupted before any data was read - ``EINVAL``: reserved fields were set, or ``offset + length`` overflowed -- ``ENODATA``: the file is not a verity file +- ``ENODATA``: the file is not a verity file, or + FS_VERITY_METADATA_TYPE_SIGNATURE was requested but the file doesn't + have a built-in signature - ``ENOTTY``: this type of filesystem does not implement fs-verity, or this ioctl is not yet implemented on it - ``EOPNOTSUPP``: the kernel was not configured with fs-verity diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c index 2dea6dd3bb05..7e2d0c7bdf0d 100644 --- a/fs/verity/read_metadata.c +++ b/fs/verity/read_metadata.c @@ -114,6 +114,34 @@ static int fsverity_read_descriptor(struct inode *inode, kfree(desc); return res; } + +static int fsverity_read_signature(struct inode *inode, + void __user *buf, u64 offset, int length) +{ + struct fsverity_descriptor *desc; + size_t desc_size; + int res; + + res = fsverity_get_descriptor(inode, &desc, &desc_size); + if (res) + return res; + + if (desc->sig_size == 0) { + res = -ENODATA; + goto out; + } + + /* + * Include only the signature. Note that fsverity_get_descriptor() + * already verified that sig_size is in-bounds. + */ + res = fsverity_read_buffer(buf, offset, length, desc->signature, + le32_to_cpu(desc->sig_size)); +out: + kfree(desc); + return res; +} + /** * fsverity_ioctl_read_metadata() - read verity metadata from a file * @filp: file to read the metadata from @@ -158,6 +186,8 @@ int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg) length); case FS_VERITY_METADATA_TYPE_DESCRIPTOR: return fsverity_read_descriptor(inode, buf, arg.offset, length); + case FS_VERITY_METADATA_TYPE_SIGNATURE: + return fsverity_read_signature(inode, buf, arg.offset, length); default: return -EINVAL; } diff --git a/include/uapi/linux/fsverity.h b/include/uapi/linux/fsverity.h index 41abc283dbcc..15384e22e331 100644 --- a/include/uapi/linux/fsverity.h +++ b/include/uapi/linux/fsverity.h @@ -85,6 +85,7 @@ struct fsverity_formatted_digest { #define FS_VERITY_METADATA_TYPE_MERKLE_TREE 1 #define FS_VERITY_METADATA_TYPE_DESCRIPTOR 2 +#define FS_VERITY_METADATA_TYPE_SIGNATURE 3 struct fsverity_read_metadata_arg { __u64 metadata_type;