Merge tag '5.12-rc1-4.19' of https://kernel.googlesource.com/pub/scm/linux/kernel/git/jaegeuk/f2fs-stable into HEAD

* tag '5.12-rc1-4.19' of https://kernel.googlesource.com/pub/scm/linux/kernel/git/jaegeuk/f2fs-stable: fs-verity: support reading signature with ioctl fs-verity: support reading descriptor with ioctl fs-verity: support reading Merkle tree with ioctl fs-verity: add FS_IOC_READ_VERITY_METADATA ioctl fs-verity: don't pass whole descriptor to fsverity_verify_signature() fs-verity: factor out fsverity_get_descriptor() f2fs: remove FAULT_ALLOC_BIO f2fs: use blkdev_issue_flush in __submit_flush_wait f2fs: remove a few bd_part checks fs-verity: move structs needed for file signing to UAPI header fs-verity: rename "file measurement" to "file digest" fs-verity: rename fsverity_signed_digest to fsverity_formatted_digest fs-verity: remove filenames from file comments fs-verity: use smp_load_acquire() for ->i_verity_info Documentation: f2fs: fix typo s/automaic/automatic f2fs: give a warning only for readonly partition f2fs: don't grab superblock freeze for flush/ckpt thread f2fs: add ckpt_thread_ioprio sysfs node f2fs: introduce checkpoint_merge mount option f2fs: relocate inline conversion from mmap() to mkwrite() f2fs: fix a wrong condition in __submit_bio f2fs: remove unnecessary initialization in xattr.c f2fs: fix to avoid inconsistent quota data f2fs: flush data when enabling checkpoint back f2fs: deprecate f2fs_trace_io f2fs: remove unused stat_{inc, dec}_atomic_write f2fs: introduce sb_status sysfs node f2fs: fix to use per-inode maxbytes f2fs: compress: fix potential deadlock libfs: unexport generic_ci_d_compare() and generic_ci_d_hash() f2fs: fix to set/clear I_LINKABLE under i_lock f2fs: fix null page reference in redirty_blocks f2fs: clean up post-read processing f2fs: trival cleanup in move_data_block() f2fs: fix out-of-repair __setattr_copy() f2fs: fix to tag FIEMAP_EXTENT_MERGED in f2fs_fiemap() f2fs: introduce a new per-sb directory in sysfs f2fs: compress: support compress level f2fs: compress: deny setting unsupported compress algorithm f2fs: relocate f2fs_precache_extents() f2fs: enforce the immutable flag on open files f2fs: enhance to update i_mode and acl atomically in f2fs_setattr() f2fs: fix to set inode->i_mode correctly for posix_acl_update_mode f2fs: Replace expression with offsetof() f2fs: handle unallocated section and zone on pinned/atgc Change-Id: I88c2467fb263f9ddfda6eda8ba16037c433db8f2 Signed-off-by: UtsavBalar1231 <utsavbalar1231@gmail.com> Conflicts: fs/ext4/ioctl.c fs/f2fs/data.c fs/f2fs/file.c fs/verity/signature.c include/uapi/linux/fsverity.h
2022-07-03 14:22:19 +00:00 · 2022-07-03 14:22:19 +00:00 · 63b140889d
commit 63b140889d
parent 2e7d3e85e9 5f4d2c15ba
31 changed files with 2344 additions and 741 deletions
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@ -357,3 +357,35 @@ Description:	This gives a control to limit the bio size in f2fs.
 		Default is zero, which will follow underlying block layer limit,
 		whereas, if it has a certain bytes value, f2fs won't submit a
 		bio larger than that size.
+
+What:		/sys/fs/f2fs/<disk>/stat/sb_status
+Date:		December 2020
+Contact:	"Chao Yu" <yuchao0@huawei.com>
+Description:	Show status of f2fs superblock in real time.
+
+		====== ===================== =================================
+		value  sb status macro       description
+		0x1    SBI_IS_DIRTY          dirty flag for checkpoint
+		0x2    SBI_IS_CLOSE          specify unmounting
+		0x4    SBI_NEED_FSCK         need fsck.f2fs to fix
+		0x8    SBI_POR_DOING         recovery is doing or not
+		0x10   SBI_NEED_SB_WRITE     need to recover superblock
+		0x20   SBI_NEED_CP           need to checkpoint
+		0x40   SBI_IS_SHUTDOWN       shutdown by ioctl
+		0x80   SBI_IS_RECOVERED      recovered orphan/data
+		0x100  SBI_CP_DISABLED       CP was disabled last mount
+		0x200  SBI_CP_DISABLED_QUICK CP was disabled quickly
+		0x400  SBI_QUOTA_NEED_FLUSH  need to flush quota info in CP
+		0x800  SBI_QUOTA_SKIP_FLUSH  skip flushing quota in current CP
+		0x1000 SBI_QUOTA_NEED_REPAIR quota file may be corrupted
+		0x2000 SBI_IS_RESIZEFS       resizefs is in process
+		====== ===================== =================================
+
+What:		/sys/fs/f2fs/<disk>/ckpt_thread_ioprio
+Date:		January 2021
+Contact:	"Daeho Jeong" <daehojeong@google.com>
+Description:	Give a way to change checkpoint merge daemon's io priority.
+		Its default value is "be,3", which means "BE" I/O class and
+		I/O priority "3". We can select the class between "rt" and "be",
+		and set the I/O priority within valid range of it. "," delimiter
+		is necessary in between I/O class and priority number.
--- a/Documentation/filesystems/f2fs.rst
+++ b/Documentation/filesystems/f2fs.rst
@ -0,0 +1,877 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================================
+WHAT IS Flash-Friendly File System (F2FS)?
+==========================================
+
+NAND flash memory-based storage devices, such as SSD, eMMC, and SD cards, have
+been equipped on a variety systems ranging from mobile to server systems. Since
+they are known to have different characteristics from the conventional rotating
+disks, a file system, an upper layer to the storage device, should adapt to the
+changes from the sketch in the design level.
+
+F2FS is a file system exploiting NAND flash memory-based storage devices, which
+is based on Log-structured File System (LFS). The design has been focused on
+addressing the fundamental issues in LFS, which are snowball effect of wandering
+tree and high cleaning overhead.
+
+Since a NAND flash memory-based storage device shows different characteristic
+according to its internal geometry or flash memory management scheme, namely FTL,
+F2FS and its tools support various parameters not only for configuring on-disk
+layout, but also for selecting allocation and cleaning algorithms.
+
+The following git tree provides the file system formatting tool (mkfs.f2fs),
+a consistency checking tool (fsck.f2fs), and a debugging tool (dump.f2fs).
+
+- git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs-tools.git
+
+For reporting bugs and sending patches, please use the following mailing list:
+
+- linux-f2fs-devel@lists.sourceforge.net
+
+Background and Design issues
+============================
+
+Log-structured File System (LFS)
+--------------------------------
+"A log-structured file system writes all modifications to disk sequentially in
+a log-like structure, thereby speeding up  both file writing and crash recovery.
+The log is the only structure on disk; it contains indexing information so that
+files can be read back from the log efficiently. In order to maintain large free
+areas on disk for fast writing, we divide  the log into segments and use a
+segment cleaner to compress the live information from heavily fragmented
+segments." from Rosenblum, M. and Ousterhout, J. K., 1992, "The design and
+implementation of a log-structured file system", ACM Trans. Computer Systems
+10, 1, 26–52.
+
+Wandering Tree Problem
+----------------------
+In LFS, when a file data is updated and written to the end of log, its direct
+pointer block is updated due to the changed location. Then the indirect pointer
+block is also updated due to the direct pointer block update. In this manner,
+the upper index structures such as inode, inode map, and checkpoint block are
+also updated recursively. This problem is called as wandering tree problem [1],
+and in order to enhance the performance, it should eliminate or relax the update
+propagation as much as possible.
+
+[1] Bityutskiy, A. 2005. JFFS3 design issues. http://www.linux-mtd.infradead.org/
+
+Cleaning Overhead
+-----------------
+Since LFS is based on out-of-place writes, it produces so many obsolete blocks
+scattered across the whole storage. In order to serve new empty log space, it
+needs to reclaim these obsolete blocks seamlessly to users. This job is called
+as a cleaning process.
+
+The process consists of three operations as follows.
+
+1. A victim segment is selected through referencing segment usage table.
+2. It loads parent index structures of all the data in the victim identified by
+   segment summary blocks.
+3. It checks the cross-reference between the data and its parent index structure.
+4. It moves valid data selectively.
+
+This cleaning job may cause unexpected long delays, so the most important goal
+is to hide the latencies to users. And also definitely, it should reduce the
+amount of valid data to be moved, and move them quickly as well.
+
+Key Features
+============
+
+Flash Awareness
+---------------
+- Enlarge the random write area for better performance, but provide the high
+  spatial locality
+- Align FS data structures to the operational units in FTL as best efforts
+
+Wandering Tree Problem
+----------------------
+- Use a term, “node”, that represents inodes as well as various pointer blocks
+- Introduce Node Address Table (NAT) containing the locations of all the “node”
+  blocks; this will cut off the update propagation.
+
+Cleaning Overhead
+-----------------
+- Support a background cleaning process
+- Support greedy and cost-benefit algorithms for victim selection policies
+- Support multi-head logs for static/dynamic hot and cold data separation
+- Introduce adaptive logging for efficient block allocation
+
+Mount Options
+=============
+
+
+======================== ============================================================
+background_gc=%s	 Turn on/off cleaning operations, namely garbage
+			 collection, triggered in background when I/O subsystem is
+			 idle. If background_gc=on, it will turn on the garbage
+			 collection and if background_gc=off, garbage collection
+			 will be turned off. If background_gc=sync, it will turn
+			 on synchronous garbage collection running in background.
+			 Default value for this option is on. So garbage
+			 collection is on by default.
+disable_roll_forward	 Disable the roll-forward recovery routine
+norecovery		 Disable the roll-forward recovery routine, mounted read-
+			 only (i.e., -o ro,disable_roll_forward)
+discard/nodiscard	 Enable/disable real-time discard in f2fs, if discard is
+			 enabled, f2fs will issue discard/TRIM commands when a
+			 segment is cleaned.
+no_heap			 Disable heap-style segment allocation which finds free
+			 segments for data from the beginning of main area, while
+			 for node from the end of main area.
+nouser_xattr		 Disable Extended User Attributes. Note: xattr is enabled
+			 by default if CONFIG_F2FS_FS_XATTR is selected.
+noacl			 Disable POSIX Access Control List. Note: acl is enabled
+			 by default if CONFIG_F2FS_FS_POSIX_ACL is selected.
+active_logs=%u		 Support configuring the number of active logs. In the
+			 current design, f2fs supports only 2, 4, and 6 logs.
+			 Default number is 6.
+disable_ext_identify	 Disable the extension list configured by mkfs, so f2fs
+			 is not aware of cold files such as media files.
+inline_xattr		 Enable the inline xattrs feature.
+noinline_xattr		 Disable the inline xattrs feature.
+inline_xattr_size=%u	 Support configuring inline xattr size, it depends on
+			 flexible inline xattr feature.
+inline_data		 Enable the inline data feature: Newly created small (<~3.4k)
+			 files can be written into inode block.
+inline_dentry		 Enable the inline dir feature: data in newly created
+			 directory entries can be written into inode block. The
+			 space of inode block which is used to store inline
+			 dentries is limited to ~3.4k.
+noinline_dentry		 Disable the inline dentry feature.
+flush_merge		 Merge concurrent cache_flush commands as much as possible
+			 to eliminate redundant command issues. If the underlying
+			 device handles the cache_flush command relatively slowly,
+			 recommend to enable this option.
+nobarrier		 This option can be used if underlying storage guarantees
+			 its cached data should be written to the novolatile area.
+			 If this option is set, no cache_flush commands are issued
+			 but f2fs still guarantees the write ordering of all the
+			 data writes.
+fastboot		 This option is used when a system wants to reduce mount
+			 time as much as possible, even though normal performance
+			 can be sacrificed.
+extent_cache		 Enable an extent cache based on rb-tree, it can cache
+			 as many as extent which map between contiguous logical
+			 address and physical address per inode, resulting in
+			 increasing the cache hit ratio. Set by default.
+noextent_cache		 Disable an extent cache based on rb-tree explicitly, see
+			 the above extent_cache mount option.
+noinline_data		 Disable the inline data feature, inline data feature is
+			 enabled by default.
+data_flush		 Enable data flushing before checkpoint in order to
+			 persist data of regular and symlink.
+reserve_root=%d		 Support configuring reserved space which is used for
+			 allocation from a privileged user with specified uid or
+			 gid, unit: 4KB, the default limit is 0.2% of user blocks.
+resuid=%d		 The user ID which may use the reserved blocks.
+resgid=%d		 The group ID which may use the reserved blocks.
+fault_injection=%d	 Enable fault injection in all supported types with
+			 specified injection rate.
+fault_type=%d		 Support configuring fault injection type, should be
+			 enabled with fault_injection option, fault type value
+			 is shown below, it supports single or combined type.
+
+			 ===================	  ===========
+			 Type_Name		  Type_Value
+			 ===================	  ===========
+			 FAULT_KMALLOC		  0x000000001
+			 FAULT_KVMALLOC		  0x000000002
+			 FAULT_PAGE_ALLOC	  0x000000004
+			 FAULT_PAGE_GET		  0x000000008
+			 FAULT_ALLOC_NID	  0x000000020
+			 FAULT_ORPHAN		  0x000000040
+			 FAULT_BLOCK		  0x000000080
+			 FAULT_DIR_DEPTH	  0x000000100
+			 FAULT_EVICT_INODE	  0x000000200
+			 FAULT_TRUNCATE		  0x000000400
+			 FAULT_READ_IO		  0x000000800
+			 FAULT_CHECKPOINT	  0x000001000
+			 FAULT_DISCARD		  0x000002000
+			 FAULT_WRITE_IO		  0x000004000
+			 ===================	  ===========
+mode=%s			 Control block allocation mode which supports "adaptive"
+			 and "lfs". In "lfs" mode, there should be no random
+			 writes towards main area.
+io_bits=%u		 Set the bit size of write IO requests. It should be set
+			 with "mode=lfs".
+usrquota		 Enable plain user disk quota accounting.
+grpquota		 Enable plain group disk quota accounting.
+prjquota		 Enable plain project quota accounting.
+usrjquota=<file>	 Appoint specified file and type during mount, so that quota
+grpjquota=<file>	 information can be properly updated during recovery flow,
+prjjquota=<file>	 <quota file>: must be in root directory;
+jqfmt=<quota type>	 <quota type>: [vfsold,vfsv0,vfsv1].
+offusrjquota		 Turn off user journalled quota.
+offgrpjquota		 Turn off group journalled quota.
+offprjjquota		 Turn off project journalled quota.
+quota			 Enable plain user disk quota accounting.
+noquota			 Disable all plain disk quota option.
+whint_mode=%s		 Control which write hints are passed down to block
+			 layer. This supports "off", "user-based", and
+			 "fs-based".  In "off" mode (default), f2fs does not pass
+			 down hints. In "user-based" mode, f2fs tries to pass
+			 down hints given by users. And in "fs-based" mode, f2fs
+			 passes down hints with its policy.
+alloc_mode=%s		 Adjust block allocation policy, which supports "reuse"
+			 and "default".
+fsync_mode=%s		 Control the policy of fsync. Currently supports "posix",
+			 "strict", and "nobarrier". In "posix" mode, which is
+			 default, fsync will follow POSIX semantics and does a
+			 light operation to improve the filesystem performance.
+			 In "strict" mode, fsync will be heavy and behaves in line
+			 with xfs, ext4 and btrfs, where xfstest generic/342 will
+			 pass, but the performance will regress. "nobarrier" is
+			 based on "posix", but doesn't issue flush command for
+			 non-atomic files likewise "nobarrier" mount option.
+test_dummy_encryption
+test_dummy_encryption=%s
+			 Enable dummy encryption, which provides a fake fscrypt
+			 context. The fake fscrypt context is used by xfstests.
+			 The argument may be either "v1" or "v2", in order to
+			 select the corresponding fscrypt policy version.
+checkpoint=%s[:%u[%]]	 Set to "disable" to turn off checkpointing. Set to "enable"
+			 to reenable checkpointing. Is enabled by default. While
+			 disabled, any unmounting or unexpected shutdowns will cause
+			 the filesystem contents to appear as they did when the
+			 filesystem was mounted with that option.
+			 While mounting with checkpoint=disabled, the filesystem must
+			 run garbage collection to ensure that all available space can
+			 be used. If this takes too much time, the mount may return
+			 EAGAIN. You may optionally add a value to indicate how much
+			 of the disk you would be willing to temporarily give up to
+			 avoid additional garbage collection. This can be given as a
+			 number of blocks, or as a percent. For instance, mounting
+			 with checkpoint=disable:100% would always succeed, but it may
+			 hide up to all remaining free space. The actual space that
+			 would be unusable can be viewed at /sys/fs/f2fs/<disk>/unusable
+			 This space is reclaimed once checkpoint=enable.
+checkpoint_merge	 When checkpoint is enabled, this can be used to create a kernel
+			 daemon and make it to merge concurrent checkpoint requests as
+			 much as possible to eliminate redundant checkpoint issues. Plus,
+			 we can eliminate the sluggish issue caused by slow checkpoint
+			 operation when the checkpoint is done in a process context in
+			 a cgroup having low i/o budget and cpu shares. To make this
+			 do better, we set the default i/o priority of the kernel daemon
+			 to "3", to give one higher priority than other kernel threads.
+			 This is the same way to give a I/O priority to the jbd2
+			 journaling thread of ext4 filesystem.
+nocheckpoint_merge	 Disable checkpoint merge feature.
+compress_algorithm=%s	 Control compress algorithm, currently f2fs supports "lzo",
+			 "lz4", "zstd" and "lzo-rle" algorithm.
+compress_algorithm=%s:%d Control compress algorithm and its compress level, now, only
+			 "lz4" and "zstd" support compress level config.
+			 algorithm	level range
+			 lz4		3 - 16
+			 zstd		1 - 22
+compress_log_size=%u	 Support configuring compress cluster size, the size will
+			 be 4KB * (1 << %u), 16KB is minimum size, also it's
+			 default size.
+compress_extension=%s	 Support adding specified extension, so that f2fs can enable
+			 compression on those corresponding files, e.g. if all files
+			 with '.ext' has high compression rate, we can set the '.ext'
+			 on compression extension list and enable compression on
+			 these file by default rather than to enable it via ioctl.
+			 For other files, we can still enable compression via ioctl.
+			 Note that, there is one reserved special extension '*', it
+			 can be set to enable compression for all files.
+compress_chksum		 Support verifying chksum of raw data in compressed cluster.
+compress_mode=%s	 Control file compression mode. This supports "fs" and "user"
+			 modes. In "fs" mode (default), f2fs does automatic compression
+			 on the compression enabled files. In "user" mode, f2fs disables
+			 the automaic compression and gives the user discretion of
+			 choosing the target file and the timing. The user can do manual
+			 compression/decompression on the compression enabled files using
+			 ioctls.
+inlinecrypt		 When possible, encrypt/decrypt the contents of encrypted
+			 files using the blk-crypto framework rather than
+			 filesystem-layer encryption. This allows the use of
+			 inline encryption hardware. The on-disk format is
+			 unaffected. For more details, see
+			 Documentation/block/inline-encryption.rst.
+atgc			 Enable age-threshold garbage collection, it provides high
+			 effectiveness and efficiency on background GC.
+======================== ============================================================
+
+Debugfs Entries
+===============
+
+/sys/kernel/debug/f2fs/ contains information about all the partitions mounted as
+f2fs. Each file shows the whole f2fs information.
+
+/sys/kernel/debug/f2fs/status includes:
+
+ - major file system information managed by f2fs currently
+ - average SIT information about whole segments
+ - current memory footprint consumed by f2fs.
+
+Sysfs Entries
+=============
+
+Information about mounted f2fs file systems can be found in
+/sys/fs/f2fs.  Each mounted filesystem will have a directory in
+/sys/fs/f2fs based on its device name (i.e., /sys/fs/f2fs/sda).
+The files in each per-device directory are shown in table below.
+
+Files in /sys/fs/f2fs/<devname>
+(see also Documentation/ABI/testing/sysfs-fs-f2fs)
+
+Usage
+=====
+
+1. Download userland tools and compile them.
+
+2. Skip, if f2fs was compiled statically inside kernel.
+   Otherwise, insert the f2fs.ko module::
+
+	# insmod f2fs.ko
+
+3. Create a directory to use when mounting::
+
+	# mkdir /mnt/f2fs
+
+4. Format the block device, and then mount as f2fs::
+
+	# mkfs.f2fs -l label /dev/block_device
+	# mount -t f2fs /dev/block_device /mnt/f2fs
+
+mkfs.f2fs
+---------
+The mkfs.f2fs is for the use of formatting a partition as the f2fs filesystem,
+which builds a basic on-disk layout.
+
+The quick options consist of:
+
+===============    ===========================================================
+``-l [label]``     Give a volume label, up to 512 unicode name.
+``-a [0 or 1]``    Split start location of each area for heap-based allocation.
+
+                   1 is set by default, which performs this.
+``-o [int]``       Set overprovision ratio in percent over volume size.
+
+                   5 is set by default.
+``-s [int]``       Set the number of segments per section.
+
+                   1 is set by default.
+``-z [int]``       Set the number of sections per zone.
+
+                   1 is set by default.
+``-e [str]``       Set basic extension list. e.g. "mp3,gif,mov"
+``-t [0 or 1]``    Disable discard command or not.
+
+                   1 is set by default, which conducts discard.
+===============    ===========================================================
+
+Note: please refer to the manpage of mkfs.f2fs(8) to get full option list.
+
+fsck.f2fs
+---------
+The fsck.f2fs is a tool to check the consistency of an f2fs-formatted
+partition, which examines whether the filesystem metadata and user-made data
+are cross-referenced correctly or not.
+Note that, initial version of the tool does not fix any inconsistency.
+
+The quick options consist of::
+
+  -d debug level [default:0]
+
+Note: please refer to the manpage of fsck.f2fs(8) to get full option list.
+
+dump.f2fs
+---------
+The dump.f2fs shows the information of specific inode and dumps SSA and SIT to
+file. Each file is dump_ssa and dump_sit.
+
+The dump.f2fs is used to debug on-disk data structures of the f2fs filesystem.
+It shows on-disk inode information recognized by a given inode number, and is
+able to dump all the SSA and SIT entries into predefined files, ./dump_ssa and
+./dump_sit respectively.
+
+The options consist of::
+
+  -d debug level [default:0]
+  -i inode no (hex)
+  -s [SIT dump segno from #1~#2 (decimal), for all 0~-1]
+  -a [SSA dump segno from #1~#2 (decimal), for all 0~-1]
+
+Examples::
+
+    # dump.f2fs -i [ino] /dev/sdx
+    # dump.f2fs -s 0~-1 /dev/sdx (SIT dump)
+    # dump.f2fs -a 0~-1 /dev/sdx (SSA dump)
+
+Note: please refer to the manpage of dump.f2fs(8) to get full option list.
+
+sload.f2fs
+----------
+The sload.f2fs gives a way to insert files and directories in the exisiting disk
+image. This tool is useful when building f2fs images given compiled files.
+
+Note: please refer to the manpage of sload.f2fs(8) to get full option list.
+
+resize.f2fs
+-----------
+The resize.f2fs lets a user resize the f2fs-formatted disk image, while preserving
+all the files and directories stored in the image.
+
+Note: please refer to the manpage of resize.f2fs(8) to get full option list.
+
+defrag.f2fs
+-----------
+The defrag.f2fs can be used to defragment scattered written data as well as
+filesystem metadata across the disk. This can improve the write speed by giving
+more free consecutive space.
+
+Note: please refer to the manpage of defrag.f2fs(8) to get full option list.
+
+f2fs_io
+-------
+The f2fs_io is a simple tool to issue various filesystem APIs as well as
+f2fs-specific ones, which is very useful for QA tests.
+
+Note: please refer to the manpage of f2fs_io(8) to get full option list.
+
+Design
+======
+
+On-disk Layout
+--------------
+
+F2FS divides the whole volume into a number of segments, each of which is fixed
+to 2MB in size. A section is composed of consecutive segments, and a zone
+consists of a set of sections. By default, section and zone sizes are set to one
+segment size identically, but users can easily modify the sizes by mkfs.
+
+F2FS splits the entire volume into six areas, and all the areas except superblock
+consist of multiple segments as described below::
+
+                                            align with the zone size <-|
+                 |-> align with the segment size
+     _________________________________________________________________________
+    |            |            |   Segment   |    Node     |   Segment  |      |
+    | Superblock | Checkpoint |    Info.    |   Address   |   Summary  | Main |
+    |    (SB)    |   (CP)     | Table (SIT) | Table (NAT) | Area (SSA) |      |
+    |____________|_____2______|______N______|______N______|______N_____|__N___|
+                                                                       .      .
+                                                             .                .
+                                                 .                            .
+                                    ._________________________________________.
+                                    |_Segment_|_..._|_Segment_|_..._|_Segment_|
+                                    .           .
+                                    ._________._________
+                                    |_section_|__...__|_
+                                    .            .
+		                    .________.
+	                            |__zone__|
+
+- Superblock (SB)
+   It is located at the beginning of the partition, and there exist two copies
+   to avoid file system crash. It contains basic partition information and some
+   default parameters of f2fs.
+
+- Checkpoint (CP)
+   It contains file system information, bitmaps for valid NAT/SIT sets, orphan
+   inode lists, and summary entries of current active segments.
+
+- Segment Information Table (SIT)
+   It contains segment information such as valid block count and bitmap for the
+   validity of all the blocks.
+
+- Node Address Table (NAT)
+   It is composed of a block address table for all the node blocks stored in
+   Main area.
+
+- Segment Summary Area (SSA)
+   It contains summary entries which contains the owner information of all the
+   data and node blocks stored in Main area.
+
+- Main Area
+   It contains file and directory data including their indices.
+
+In order to avoid misalignment between file system and flash-based storage, F2FS
+aligns the start block address of CP with the segment size. Also, it aligns the
+start block address of Main area with the zone size by reserving some segments
+in SSA area.
+
+Reference the following survey for additional technical details.
+https://wiki.linaro.org/WorkingGroups/Kernel/Projects/FlashCardSurvey
+
+File System Metadata Structure
+------------------------------
+
+F2FS adopts the checkpointing scheme to maintain file system consistency. At
+mount time, F2FS first tries to find the last valid checkpoint data by scanning
+CP area. In order to reduce the scanning time, F2FS uses only two copies of CP.
+One of them always indicates the last valid data, which is called as shadow copy
+mechanism. In addition to CP, NAT and SIT also adopt the shadow copy mechanism.
+
+For file system consistency, each CP points to which NAT and SIT copies are
+valid, as shown as below::
+
+  +--------+----------+---------+
+  |   CP   |    SIT   |   NAT   |
+  +--------+----------+---------+
+  .         .          .          .
+  .            .              .              .
+  .               .                 .                 .
+  +-------+-------+--------+--------+--------+--------+
+  | CP #0 | CP #1 | SIT #0 | SIT #1 | NAT #0 | NAT #1 |
+  +-------+-------+--------+--------+--------+--------+
+     |             ^                          ^
+     |             |                          |
+     `----------------------------------------'
+
+Index Structure
+---------------
+
+The key data structure to manage the data locations is a "node". Similar to
+traditional file structures, F2FS has three types of node: inode, direct node,
+indirect node. F2FS assigns 4KB to an inode block which contains 923 data block
+indices, two direct node pointers, two indirect node pointers, and one double
+indirect node pointer as described below. One direct node block contains 1018
+data blocks, and one indirect node block contains also 1018 node blocks. Thus,
+one inode block (i.e., a file) covers::
+
+  4KB * (923 + 2 * 1018 + 2 * 1018 * 1018 + 1018 * 1018 * 1018) := 3.94TB.
+
+   Inode block (4KB)
+     |- data (923)
+     |- direct node (2)
+     |          `- data (1018)
+     |- indirect node (2)
+     |            `- direct node (1018)
+     |                       `- data (1018)
+     `- double indirect node (1)
+                         `- indirect node (1018)
+			              `- direct node (1018)
+	                                         `- data (1018)
+
+Note that all the node blocks are mapped by NAT which means the location of
+each node is translated by the NAT table. In the consideration of the wandering
+tree problem, F2FS is able to cut off the propagation of node updates caused by
+leaf data writes.
+
+Directory Structure
+-------------------
+
+A directory entry occupies 11 bytes, which consists of the following attributes.
+
+- hash		hash value of the file name
+- ino		inode number
+- len		the length of file name
+- type		file type such as directory, symlink, etc
+
+A dentry block consists of 214 dentry slots and file names. Therein a bitmap is
+used to represent whether each dentry is valid or not. A dentry block occupies
+4KB with the following composition.
+
+::
+
+  Dentry Block(4 K) = bitmap (27 bytes) + reserved (3 bytes) +
+	              dentries(11 * 214 bytes) + file name (8 * 214 bytes)
+
+                         [Bucket]
+             +--------------------------------+
+             |dentry block 1 | dentry block 2 |
+             +--------------------------------+
+             .               .
+       .                             .
+  .       [Dentry Block Structure: 4KB]       .
+  +--------+----------+----------+------------+
+  | bitmap | reserved | dentries | file names |
+  +--------+----------+----------+------------+
+  [Dentry Block: 4KB] .   .
+		 .               .
+            .                          .
+            +------+------+-----+------+
+            | hash | ino  | len | type |
+            +------+------+-----+------+
+            [Dentry Structure: 11 bytes]
+
+F2FS implements multi-level hash tables for directory structure. Each level has
+a hash table with dedicated number of hash buckets as shown below. Note that
+"A(2B)" means a bucket includes 2 data blocks.
+
+::
+
+    ----------------------
+    A : bucket
+    B : block
+    N : MAX_DIR_HASH_DEPTH
+    ----------------------
+
+    level #0   | A(2B)
+	    |
+    level #1   | A(2B) - A(2B)
+	    |
+    level #2   | A(2B) - A(2B) - A(2B) - A(2B)
+	.     |   .       .       .       .
+    level #N/2 | A(2B) - A(2B) - A(2B) - A(2B) - A(2B) - ... - A(2B)
+	.     |   .       .       .       .
+    level #N   | A(4B) - A(4B) - A(4B) - A(4B) - A(4B) - ... - A(4B)
+
+The number of blocks and buckets are determined by::
+
+                            ,- 2, if n < MAX_DIR_HASH_DEPTH / 2,
+  # of blocks in level #n = |
+                            `- 4, Otherwise
+
+                             ,- 2^(n + dir_level),
+			     |        if n + dir_level < MAX_DIR_HASH_DEPTH / 2,
+  # of buckets in level #n = |
+                             `- 2^((MAX_DIR_HASH_DEPTH / 2) - 1),
+			              Otherwise
+
+When F2FS finds a file name in a directory, at first a hash value of the file
+name is calculated. Then, F2FS scans the hash table in level #0 to find the
+dentry consisting of the file name and its inode number. If not found, F2FS
+scans the next hash table in level #1. In this way, F2FS scans hash tables in
+each levels incrementally from 1 to N. In each level F2FS needs to scan only
+one bucket determined by the following equation, which shows O(log(# of files))
+complexity::
+
+  bucket number to scan in level #n = (hash value) % (# of buckets in level #n)
+
+In the case of file creation, F2FS finds empty consecutive slots that cover the
+file name. F2FS searches the empty slots in the hash tables of whole levels from
+1 to N in the same way as the lookup operation.
+
+The following figure shows an example of two cases holding children::
+
+       --------------> Dir <--------------
+       |                                 |
+    child                             child
+
+    child - child                     [hole] - child
+
+    child - child - child             [hole] - [hole] - child
+
+   Case 1:                           Case 2:
+   Number of children = 6,           Number of children = 3,
+   File size = 7                     File size = 7
+
+Default Block Allocation
+------------------------
+
+At runtime, F2FS manages six active logs inside "Main" area: Hot/Warm/Cold node
+and Hot/Warm/Cold data.
+
+- Hot node	contains direct node blocks of directories.
+- Warm node	contains direct node blocks except hot node blocks.
+- Cold node	contains indirect node blocks
+- Hot data	contains dentry blocks
+- Warm data	contains data blocks except hot and cold data blocks
+- Cold data	contains multimedia data or migrated data blocks
+
+LFS has two schemes for free space management: threaded log and copy-and-compac-
+tion. The copy-and-compaction scheme which is known as cleaning, is well-suited
+for devices showing very good sequential write performance, since free segments
+are served all the time for writing new data. However, it suffers from cleaning
+overhead under high utilization. Contrarily, the threaded log scheme suffers
+from random writes, but no cleaning process is needed. F2FS adopts a hybrid
+scheme where the copy-and-compaction scheme is adopted by default, but the
+policy is dynamically changed to the threaded log scheme according to the file
+system status.
+
+In order to align F2FS with underlying flash-based storage, F2FS allocates a
+segment in a unit of section. F2FS expects that the section size would be the
+same as the unit size of garbage collection in FTL. Furthermore, with respect
+to the mapping granularity in FTL, F2FS allocates each section of the active
+logs from different zones as much as possible, since FTL can write the data in
+the active logs into one allocation unit according to its mapping granularity.
+
+Cleaning process
+----------------
+
+F2FS does cleaning both on demand and in the background. On-demand cleaning is
+triggered when there are not enough free segments to serve VFS calls. Background
+cleaner is operated by a kernel thread, and triggers the cleaning job when the
+system is idle.
+
+F2FS supports two victim selection policies: greedy and cost-benefit algorithms.
+In the greedy algorithm, F2FS selects a victim segment having the smallest number
+of valid blocks. In the cost-benefit algorithm, F2FS selects a victim segment
+according to the segment age and the number of valid blocks in order to address
+log block thrashing problem in the greedy algorithm. F2FS adopts the greedy
+algorithm for on-demand cleaner, while background cleaner adopts cost-benefit
+algorithm.
+
+In order to identify whether the data in the victim segment are valid or not,
+F2FS manages a bitmap. Each bit represents the validity of a block, and the
+bitmap is composed of a bit stream covering whole blocks in main area.
+
+Write-hint Policy
+-----------------
+
+1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET.
+
+2) whint_mode=user-based. F2FS tries to pass down hints given by
+users.
+
+===================== ======================== ===================
+User                  F2FS                     Block
+===================== ======================== ===================
+                      META                     WRITE_LIFE_NOT_SET
+                      HOT_NODE                 "
+                      WARM_NODE                "
+                      COLD_NODE                "
+ioctl(COLD)           COLD_DATA                WRITE_LIFE_EXTREME
+extension list        "                        "
+
+-- buffered io
+WRITE_LIFE_EXTREME    COLD_DATA                WRITE_LIFE_EXTREME
+WRITE_LIFE_SHORT      HOT_DATA                 WRITE_LIFE_SHORT
+WRITE_LIFE_NOT_SET    WARM_DATA                WRITE_LIFE_NOT_SET
+WRITE_LIFE_NONE       "                        "
+WRITE_LIFE_MEDIUM     "                        "
+WRITE_LIFE_LONG       "                        "
+
+-- direct io
+WRITE_LIFE_EXTREME    COLD_DATA                WRITE_LIFE_EXTREME
+WRITE_LIFE_SHORT      HOT_DATA                 WRITE_LIFE_SHORT
+WRITE_LIFE_NOT_SET    WARM_DATA                WRITE_LIFE_NOT_SET
+WRITE_LIFE_NONE       "                        WRITE_LIFE_NONE
+WRITE_LIFE_MEDIUM     "                        WRITE_LIFE_MEDIUM
+WRITE_LIFE_LONG       "                        WRITE_LIFE_LONG
+===================== ======================== ===================
+
+3) whint_mode=fs-based. F2FS passes down hints with its policy.
+
+===================== ======================== ===================
+User                  F2FS                     Block
+===================== ======================== ===================
+                      META                     WRITE_LIFE_MEDIUM;
+                      HOT_NODE                 WRITE_LIFE_NOT_SET
+                      WARM_NODE                "
+                      COLD_NODE                WRITE_LIFE_NONE
+ioctl(COLD)           COLD_DATA                WRITE_LIFE_EXTREME
+extension list        "                        "
+
+-- buffered io
+WRITE_LIFE_EXTREME    COLD_DATA                WRITE_LIFE_EXTREME
+WRITE_LIFE_SHORT      HOT_DATA                 WRITE_LIFE_SHORT
+WRITE_LIFE_NOT_SET    WARM_DATA                WRITE_LIFE_LONG
+WRITE_LIFE_NONE       "                        "
+WRITE_LIFE_MEDIUM     "                        "
+WRITE_LIFE_LONG       "                        "
+
+-- direct io
+WRITE_LIFE_EXTREME    COLD_DATA                WRITE_LIFE_EXTREME
+WRITE_LIFE_SHORT      HOT_DATA                 WRITE_LIFE_SHORT
+WRITE_LIFE_NOT_SET    WARM_DATA                WRITE_LIFE_NOT_SET
+WRITE_LIFE_NONE       "                        WRITE_LIFE_NONE
+WRITE_LIFE_MEDIUM     "                        WRITE_LIFE_MEDIUM
+WRITE_LIFE_LONG       "                        WRITE_LIFE_LONG
+===================== ======================== ===================
+
+Fallocate(2) Policy
+-------------------
+
+The default policy follows the below POSIX rule.
+
+Allocating disk space
+    The default operation (i.e., mode is zero) of fallocate() allocates
+    the disk space within the range specified by offset and len.  The
+    file size (as reported by stat(2)) will be changed if offset+len is
+    greater than the file size.  Any subregion within the range specified
+    by offset and len that did not contain data before the call will be
+    initialized to zero.  This default behavior closely resembles the
+    behavior of the posix_fallocate(3) library function, and is intended
+    as a method of optimally implementing that function.
+
+However, once F2FS receives ioctl(fd, F2FS_IOC_SET_PIN_FILE) in prior to
+fallocate(fd, DEFAULT_MODE), it allocates on-disk block addressess having
+zero or random data, which is useful to the below scenario where:
+
+ 1. create(fd)
+ 2. ioctl(fd, F2FS_IOC_SET_PIN_FILE)
+ 3. fallocate(fd, 0, 0, size)
+ 4. address = fibmap(fd, offset)
+ 5. open(blkdev)
+ 6. write(blkdev, address)
+
+Compression implementation
+--------------------------
+
+- New term named cluster is defined as basic unit of compression, file can
+  be divided into multiple clusters logically. One cluster includes 4 << n
+  (n >= 0) logical pages, compression size is also cluster size, each of
+  cluster can be compressed or not.
+
+- In cluster metadata layout, one special block address is used to indicate
+  a cluster is a compressed one or normal one; for compressed cluster, following
+  metadata maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs
+  stores data including compress header and compressed data.
+
+- In order to eliminate write amplification during overwrite, F2FS only
+  support compression on write-once file, data can be compressed only when
+  all logical blocks in cluster contain valid data and compress ratio of
+  cluster data is lower than specified threshold.
+
+- To enable compression on regular inode, there are three ways:
+
+  * chattr +c file
+  * chattr +c dir; touch dir/file
+  * mount w/ -o compress_extension=ext; touch file.ext
+
+Compress metadata layout::
+
+				[Dnode Structure]
+		+-----------------------------------------------+
+		| cluster 1 | cluster 2 | ......... | cluster N |
+		+-----------------------------------------------+
+		.           .                       .           .
+	.                       .                .                      .
+    .         Compressed Cluster       .        .        Normal Cluster            .
+    +----------+---------+---------+---------+  +---------+---------+---------+---------+
+    |compr flag| block 1 | block 2 | block 3 |  | block 1 | block 2 | block 3 | block 4 |
+    +----------+---------+---------+---------+  +---------+---------+---------+---------+
+	    .                             .
+	    .                                           .
+	.                                                           .
+	+-------------+-------------+----------+----------------------------+
+	| data length | data chksum | reserved |      compressed data       |
+	+-------------+-------------+----------+----------------------------+
+
+Compression mode
+--------------------------
+
+f2fs supports "fs" and "user" compression modes with "compression_mode" mount option.
+With this option, f2fs provides a choice to select the way how to compress the
+compression enabled files (refer to "Compression implementation" section for how to
+enable compression on a regular inode).
+
+1) compress_mode=fs
+This is the default option. f2fs does automatic compression in the writeback of the
+compression enabled files.
+
+2) compress_mode=user
+This disables the automatic compression and gives the user discretion of choosing the
+target file and the timing. The user can do manual compression/decompression on the
+compression enabled files using F2FS_IOC_DECOMPRESS_FILE and F2FS_IOC_COMPRESS_FILE
+ioctls like the below.
+
+To decompress a file,
+
+fd = open(filename, O_WRONLY, 0);
+ret = ioctl(fd, F2FS_IOC_DECOMPRESS_FILE);
+
+To compress a file,
+
+fd = open(filename, O_WRONLY, 0);
+ret = ioctl(fd, F2FS_IOC_COMPRESS_FILE);
+
+NVMe Zoned Namespace devices
+----------------------------
+
+- ZNS defines a per-zone capacity which can be equal or less than the
+  zone-size. Zone-capacity is the number of usable blocks in the zone.
+  F2FS checks if zone-capacity is less than zone-size, if it is, then any
+  segment which starts after the zone-capacity is marked as not-free in
+  the free segment bitmap at initial mount time. These segments are marked
+  as permanently used so they are not allocated for writes and
+  consequently are not needed to be garbage collected. In case the
+  zone-capacity is not aligned to default segment size(2MB), then a segment
+  can start before the zone-capacity and span across zone-capacity boundary.
+  Such spanning segments are also considered as usable segments. All blocks
+  past the zone-capacity are considered unusable in these segments.
--- a/Documentation/filesystems/fsverity.rst
+++ b/Documentation/filesystems/fsverity.rst
@ -217,6 +217,82 @@ FS_IOC_MEASURE_VERITY can fail with the following errors:
 - ``EOVERFLOW``: the digest is longer than the specified
  ``digest_size`` bytes.  Try providing a larger buffer.

+FS_IOC_READ_VERITY_METADATA
+---------------------------
+
+The FS_IOC_READ_VERITY_METADATA ioctl reads verity metadata from a
+verity file.  This ioctl is available since Linux v5.12.
+
+This ioctl allows writing a server program that takes a verity file
+and serves it to a client program, such that the client can do its own
+fs-verity compatible verification of the file.  This only makes sense
+if the client doesn't trust the server and if the server needs to
+provide the storage for the client.
+
+This is a fairly specialized use case, and most fs-verity users won't
+need this ioctl.
+
+This ioctl takes in a pointer to the following structure::
+
+   #define FS_VERITY_METADATA_TYPE_MERKLE_TREE     1
+   #define FS_VERITY_METADATA_TYPE_DESCRIPTOR      2
+   #define FS_VERITY_METADATA_TYPE_SIGNATURE       3
+
+   struct fsverity_read_metadata_arg {
+           __u64 metadata_type;
+           __u64 offset;
+           __u64 length;
+           __u64 buf_ptr;
+           __u64 __reserved;
+   };
+
+``metadata_type`` specifies the type of metadata to read:
+
+- ``FS_VERITY_METADATA_TYPE_MERKLE_TREE`` reads the blocks of the
+  Merkle tree.  The blocks are returned in order from the root level
+  to the leaf level.  Within each level, the blocks are returned in
+  the same order that their hashes are themselves hashed.
+  See `Merkle tree`_ for more information.
+
+- ``FS_VERITY_METADATA_TYPE_DESCRIPTOR`` reads the fs-verity
+  descriptor.  See `fs-verity descriptor`_.
+
+- ``FS_VERITY_METADATA_TYPE_SIGNATURE`` reads the signature which was
+  passed to FS_IOC_ENABLE_VERITY, if any.  See `Built-in signature
+  verification`_.
+
+The semantics are similar to those of ``pread()``.  ``offset``
+specifies the offset in bytes into the metadata item to read from, and
+``length`` specifies the maximum number of bytes to read from the
+metadata item.  ``buf_ptr`` is the pointer to the buffer to read into,
+cast to a 64-bit integer.  ``__reserved`` must be 0.  On success, the
+number of bytes read is returned.  0 is returned at the end of the
+metadata item.  The returned length may be less than ``length``, for
+example if the ioctl is interrupted.
+
+The metadata returned by FS_IOC_READ_VERITY_METADATA isn't guaranteed
+to be authenticated against the file digest that would be returned by
+`FS_IOC_MEASURE_VERITY`_, as the metadata is expected to be used to
+implement fs-verity compatible verification anyway (though absent a
+malicious disk, the metadata will indeed match).  E.g. to implement
+this ioctl, the filesystem is allowed to just read the Merkle tree
+blocks from disk without actually verifying the path to the root node.
+
+FS_IOC_READ_VERITY_METADATA can fail with the following errors:
+
+- ``EFAULT``: the caller provided inaccessible memory
+- ``EINTR``: the ioctl was interrupted before any data was read
+- ``EINVAL``: reserved fields were set, or ``offset + length``
+  overflowed
+- ``ENODATA``: the file is not a verity file, or
+  FS_VERITY_METADATA_TYPE_SIGNATURE was requested but the file doesn't
+  have a built-in signature
+- ``ENOTTY``: this type of filesystem does not implement fs-verity, or
+  this ioctl is not yet implemented on it
+- ``EOPNOTSUPP``: the kernel was not configured with fs-verity
+  support, or the filesystem superblock has not had the 'verity'
+  feature enabled on it.  (See `Filesystem support`_.)
+
 FS_IOC_GETFLAGS
 ---------------

--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@ -1238,6 +1238,12 @@ out:
 			return -EOPNOTSUPP;
 		return fsverity_ioctl_measure(filp, (void __user *)arg);

+	case FS_IOC_READ_VERITY_METADATA:
+		if (!ext4_has_feature_verity(sb))
+			return -EOPNOTSUPP;
+		return fsverity_ioctl_read_metadata(filp,
+						    (const void __user *)arg);
+
 	default:
 		return -ENOTTY;
 	}
@ -1308,6 +1314,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case FS_IOC_GETFSMAP:
 	case FS_IOC_ENABLE_VERITY:
 	case FS_IOC_MEASURE_VERITY:
+	case FS_IOC_READ_VERITY_METADATA:
 	case EXT4_IOC_FSGETXATTR:
 	case EXT4_IOC_FSSETXATTR:
 		break;
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@ -84,16 +84,6 @@ config F2FS_FS_ENCRYPTION
 	  FS_ENCRYPTION.  Use CONFIG_FS_ENCRYPTION=y in new config
 	  files.

-config F2FS_IO_TRACE
-	bool "F2FS IO tracer"
-	depends on F2FS_FS
-	depends on FUNCTION_TRACER
-	help
-	  F2FS IO trace is based on a function trace, which gathers process
-	  information and block IO patterns in the filesystem level.
-
-	  If unsure, say N.
-
 config F2FS_FAULT_INJECTION
 	bool "F2FS fault injection facility"
 	depends on F2FS_FS
@ -127,6 +117,16 @@ config F2FS_FS_LZ4
 	help
 	  Support LZ4 compress algorithm, if unsure, say Y.

+config F2FS_FS_LZ4HC
+	bool "LZ4HC compression support"
+	depends on F2FS_FS_COMPRESSION
+	depends on F2FS_FS_LZ4
+	select LZ4HC_COMPRESS
+	default y
+	help
+	  Support LZ4HC compress algorithm, LZ4HC has compatible on-disk
+	  layout with LZ4, if unsure, say Y.
+
 config F2FS_FS_ZSTD
 	bool "ZSTD compression support"
 	depends on F2FS_FS_COMPRESSION
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@ -7,6 +7,5 @@ f2fs-y		+= shrinker.o extent_cache.o sysfs.o
 f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
 f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
 f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
-f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o
 f2fs-$(CONFIG_FS_VERITY) += verity.o
 f2fs-$(CONFIG_F2FS_FS_COMPRESSION) += compress.o
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@ -200,6 +200,27 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
 	return __f2fs_get_acl(inode, type, NULL);
 }

+static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p,
+			  struct posix_acl **acl)
+{
+	umode_t mode = inode->i_mode;
+	int error;
+
+	if (is_inode_flag_set(inode, FI_ACL_MODE))
+		mode = F2FS_I(inode)->i_acl_mode;
+
+	error = posix_acl_equiv_mode(*acl, &mode);
+	if (error < 0)
+		return error;
+	if (error == 0)
+		*acl = NULL;
+	if (!in_group_p(inode->i_gid) &&
+	    !capable_wrt_inode_uidgid(inode, CAP_FSETID))
+		mode &= ~S_ISGID;
+	*mode_p = mode;
+	return 0;
+}
+
 static int __f2fs_set_acl(struct inode *inode, int type,
 			struct posix_acl *acl, struct page *ipage)
 {
@ -213,7 +234,7 @@ static int __f2fs_set_acl(struct inode *inode, int type,
 	case ACL_TYPE_ACCESS:
 		name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
 		if (acl && !ipage) {
-			error = posix_acl_update_mode(inode, &mode, &acl);
+			error = f2fs_acl_update_mode(inode, &mode, &acl);
 			if (error)
 				return error;
 			set_acl_inode(inode, mode);
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@ -13,13 +13,15 @@
 #include <linux/f2fs_fs.h>
 #include <linux/pagevec.h>
 #include <linux/swap.h>
+#include <linux/kthread.h>

 #include "f2fs.h"
 #include "node.h"
 #include "segment.h"
-#include "trace.h"
 #include <trace/events/f2fs.h>

+#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+
 static struct kmem_cache *ino_entry_slab;
 struct kmem_cache *f2fs_inode_entry_slab;

@ -443,7 +445,6 @@ static int f2fs_set_meta_page_dirty(struct page *page)
 		__set_page_dirty_nobuffers(page);
 		inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
 		f2fs_set_page_private(page, 0);
-		f2fs_trace_pid(page);
 		return 1;
 	}
 	return 0;
@ -1017,7 +1018,6 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page)
 	spin_unlock(&sbi->inode_lock[type]);

 	f2fs_set_page_private(page, 0);
-	f2fs_trace_pid(page);
 }

 void f2fs_remove_dirty_inode(struct inode *inode)
@ -1385,8 +1385,7 @@ static void commit_checkpoint(struct f2fs_sb_info *sbi,

 static inline u64 get_sectors_written(struct block_device *bdev)
 {
-	return bdev->bd_part ?
-		(u64)part_stat_read(bdev->bd_part, sectors[STAT_WRITE]) : 0;
+	return (u64)part_stat_read(bdev->bd_part, sectors[STAT_WRITE]);
 }

 u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi)
@ -1714,3 +1713,174 @@ void f2fs_destroy_checkpoint_caches(void)
 	kmem_cache_destroy(ino_entry_slab);
 	kmem_cache_destroy(f2fs_inode_entry_slab);
 }
+
+static int __write_checkpoint_sync(struct f2fs_sb_info *sbi)
+{
+	struct cp_control cpc = { .reason = CP_SYNC, };
+	int err;
+
+	down_write(&sbi->gc_lock);
+	err = f2fs_write_checkpoint(sbi, &cpc);
+	up_write(&sbi->gc_lock);
+
+	return err;
+}
+
+static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi)
+{
+	struct ckpt_req_control *cprc = &sbi->cprc_info;
+	struct ckpt_req *req, *next;
+	struct llist_node *dispatch_list;
+	u64 sum_diff = 0, diff, count = 0;
+	int ret;
+
+	dispatch_list = llist_del_all(&cprc->issue_list);
+	if (!dispatch_list)
+		return;
+	dispatch_list = llist_reverse_order(dispatch_list);
+
+	ret = __write_checkpoint_sync(sbi);
+	atomic_inc(&cprc->issued_ckpt);
+
+	llist_for_each_entry_safe(req, next, dispatch_list, llnode) {
+		diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time);
+		req->ret = ret;
+		complete(&req->wait);
+
+		sum_diff += diff;
+		count++;
+	}
+	atomic_sub(count, &cprc->queued_ckpt);
+	atomic_add(count, &cprc->total_ckpt);
+
+	spin_lock(&cprc->stat_lock);
+	cprc->cur_time = (unsigned int)div64_u64(sum_diff, count);
+	if (cprc->peak_time < cprc->cur_time)
+		cprc->peak_time = cprc->cur_time;
+	spin_unlock(&cprc->stat_lock);
+}
+
+static int issue_checkpoint_thread(void *data)
+{
+	struct f2fs_sb_info *sbi = data;
+	struct ckpt_req_control *cprc = &sbi->cprc_info;
+	wait_queue_head_t *q = &cprc->ckpt_wait_queue;
+repeat:
+	if (kthread_should_stop())
+		return 0;
+
+	if (!llist_empty(&cprc->issue_list))
+		__checkpoint_and_complete_reqs(sbi);
+
+	wait_event_interruptible(*q,
+		kthread_should_stop() || !llist_empty(&cprc->issue_list));
+	goto repeat;
+}
+
+static void flush_remained_ckpt_reqs(struct f2fs_sb_info *sbi,
+		struct ckpt_req *wait_req)
+{
+	struct ckpt_req_control *cprc = &sbi->cprc_info;
+
+	if (!llist_empty(&cprc->issue_list)) {
+		__checkpoint_and_complete_reqs(sbi);
+	} else {
+		/* already dispatched by issue_checkpoint_thread */
+		if (wait_req)
+			wait_for_completion(&wait_req->wait);
+	}
+}
+
+static void init_ckpt_req(struct ckpt_req *req)
+{
+	memset(req, 0, sizeof(struct ckpt_req));
+
+	init_completion(&req->wait);
+	req->queue_time = ktime_get();
+}
+
+int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi)
+{
+	struct ckpt_req_control *cprc = &sbi->cprc_info;
+	struct ckpt_req req;
+	struct cp_control cpc;
+
+	cpc.reason = __get_cp_reason(sbi);
+	if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) {
+		int ret;
+
+		down_write(&sbi->gc_lock);
+		ret = f2fs_write_checkpoint(sbi, &cpc);
+		up_write(&sbi->gc_lock);
+
+		return ret;
+	}
+
+	if (!cprc->f2fs_issue_ckpt)
+		return __write_checkpoint_sync(sbi);
+
+	init_ckpt_req(&req);
+
+	llist_add(&req.llnode, &cprc->issue_list);
+	atomic_inc(&cprc->queued_ckpt);
+
+	/* update issue_list before we wake up issue_checkpoint thread */
+	smp_mb();
+
+	if (waitqueue_active(&cprc->ckpt_wait_queue))
+		wake_up(&cprc->ckpt_wait_queue);
+
+	if (cprc->f2fs_issue_ckpt)
+		wait_for_completion(&req.wait);
+	else
+		flush_remained_ckpt_reqs(sbi, &req);
+
+	return req.ret;
+}
+
+int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi)
+{
+	dev_t dev = sbi->sb->s_bdev->bd_dev;
+	struct ckpt_req_control *cprc = &sbi->cprc_info;
+
+	if (cprc->f2fs_issue_ckpt)
+		return 0;
+
+	cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi,
+			"f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev));
+	if (IS_ERR(cprc->f2fs_issue_ckpt)) {
+		cprc->f2fs_issue_ckpt = NULL;
+		return -ENOMEM;
+	}
+
+	set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio);
+
+	return 0;
+}
+
+void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi)
+{
+	struct ckpt_req_control *cprc = &sbi->cprc_info;
+
+	if (cprc->f2fs_issue_ckpt) {
+		struct task_struct *ckpt_task = cprc->f2fs_issue_ckpt;
+
+		cprc->f2fs_issue_ckpt = NULL;
+		kthread_stop(ckpt_task);
+
+		flush_remained_ckpt_reqs(sbi, NULL);
+	}
+}
+
+void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi)
+{
+	struct ckpt_req_control *cprc = &sbi->cprc_info;
+
+	atomic_set(&cprc->issued_ckpt, 0);
+	atomic_set(&cprc->total_ckpt, 0);
+	atomic_set(&cprc->queued_ckpt, 0);
+	cprc->ckpt_thread_ioprio = DEFAULT_CHECKPOINT_IOPRIO;
+	init_waitqueue_head(&cprc->ckpt_wait_queue);
+	init_llist_head(&cprc->issue_list);
+	spin_lock_init(&cprc->stat_lock);
+}
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@ -253,8 +253,14 @@ static const struct f2fs_compress_ops f2fs_lzo_ops = {
 #ifdef CONFIG_F2FS_FS_LZ4
 static int lz4_init_compress_ctx(struct compress_ctx *cc)
 {
-	cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode),
-				LZ4_MEM_COMPRESS, GFP_NOFS);
+	unsigned int size = LZ4_MEM_COMPRESS;
+
+#ifdef CONFIG_F2FS_FS_LZ4HC
+	if (F2FS_I(cc->inode)->i_compress_flag >> COMPRESS_LEVEL_OFFSET)
+		size = LZ4HC_MEM_COMPRESS;
+#endif
+
+	cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), size, GFP_NOFS);
 	if (!cc->private)
 		return -ENOMEM;

@ -273,10 +279,34 @@ static void lz4_destroy_compress_ctx(struct compress_ctx *cc)
 	cc->private = NULL;
 }

+#ifdef CONFIG_F2FS_FS_LZ4HC
+static int lz4hc_compress_pages(struct compress_ctx *cc)
+{
+	unsigned char level = F2FS_I(cc->inode)->i_compress_flag >>
+						COMPRESS_LEVEL_OFFSET;
+	int len;
+
+	if (level)
+		len = LZ4_compress_HC(cc->rbuf, cc->cbuf->cdata, cc->rlen,
+					cc->clen, level, cc->private);
+	else
+		len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen,
+						cc->clen, cc->private);
+	if (!len)
+		return -EAGAIN;
+
+	cc->clen = len;
+	return 0;
+}
+#endif
+
 static int lz4_compress_pages(struct compress_ctx *cc)
 {
 	int len;

+#ifdef CONFIG_F2FS_FS_LZ4HC
+	return lz4hc_compress_pages(cc);
+#endif
 	len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen,
 						cc->clen, cc->private);
 	if (!len)
@ -326,8 +356,13 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc)
 	ZSTD_CStream *stream;
 	void *workspace;
 	unsigned int workspace_size;
+	unsigned char level = F2FS_I(cc->inode)->i_compress_flag >>
+						COMPRESS_LEVEL_OFFSET;

-	params = ZSTD_getParams(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen, 0);
+	if (!level)
+		level = F2FS_ZSTD_DEFAULT_CLEVEL;
+
+	params = ZSTD_getParams(level, cc->rlen, 0);
 	workspace_size = ZSTD_CStreamWorkspaceBound(params.cParams);

 	workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode),
@ -692,38 +727,27 @@ out:
 	return ret;
 }

-void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
+static void f2fs_decompress_cluster(struct decompress_io_ctx *dic)
 {
-	struct decompress_io_ctx *dic =
-			(struct decompress_io_ctx *)page_private(page);
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode);
-	struct f2fs_inode_info *fi= F2FS_I(dic->inode);
+	struct f2fs_inode_info *fi = F2FS_I(dic->inode);
 	const struct f2fs_compress_ops *cops =
 			f2fs_cops[fi->i_compress_algorithm];
 	int ret;
 	int i;

-	dec_page_count(sbi, F2FS_RD_DATA);
-
-	if (bio->bi_status || PageError(page))
-		dic->failed = true;
-
-	if (atomic_dec_return(&dic->pending_pages))
-		return;
-
 	trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx,
 				dic->cluster_size, fi->i_compress_algorithm);

-	/* submit partial compressed pages */
 	if (dic->failed) {
 		ret = -EIO;
-		goto out_free_dic;
+		goto out_end_io;
 	}

 	dic->tpages = page_array_alloc(dic->inode, dic->cluster_size);
 	if (!dic->tpages) {
 		ret = -ENOMEM;
-		goto out_free_dic;
+		goto out_end_io;
 	}

 	for (i = 0; i < dic->cluster_size; i++) {
@ -735,20 +759,20 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
 		dic->tpages[i] = f2fs_compress_alloc_page();
 		if (!dic->tpages[i]) {
 			ret = -ENOMEM;
-			goto out_free_dic;
+			goto out_end_io;
 		}
 	}

 	if (cops->init_decompress_ctx) {
 		ret = cops->init_decompress_ctx(dic);
 		if (ret)
-			goto out_free_dic;
+			goto out_end_io;
 	}

 	dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size);
 	if (!dic->rbuf) {
 		ret = -ENOMEM;
-		goto destroy_decompress_ctx;
+		goto out_destroy_decompress_ctx;
 	}

 	dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages);
@ -787,18 +811,34 @@ out_vunmap_cbuf:
 	vm_unmap_ram(dic->cbuf, dic->nr_cpages);
 out_vunmap_rbuf:
 	vm_unmap_ram(dic->rbuf, dic->cluster_size);
-destroy_decompress_ctx:
+out_destroy_decompress_ctx:
 	if (cops->destroy_decompress_ctx)
 		cops->destroy_decompress_ctx(dic);
-out_free_dic:
-	if (!verity)
-		f2fs_decompress_end_io(dic->rpages, dic->cluster_size,
-								ret, false);
-
+out_end_io:
 	trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx,
 							dic->clen, ret);
-	if (!verity)
-		f2fs_free_dic(dic);
+	f2fs_decompress_end_io(dic, ret);
+}
+
+/*
+ * This is called when a page of a compressed cluster has been read from disk
+ * (or failed to be read from disk).  It checks whether this page was the last
+ * page being waited on in the cluster, and if so, it decompresses the cluster
+ * (or in the case of a failure, cleans up without actually decompressing).
+ */
+void f2fs_end_read_compressed_page(struct page *page, bool failed)
+{
+	struct decompress_io_ctx *dic =
+			(struct decompress_io_ctx *)page_private(page);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode);
+
+	dec_page_count(sbi, F2FS_RD_DATA);
+
+	if (failed)
+		WRITE_ONCE(dic->failed, true);
+
+	if (atomic_dec_and_test(&dic->remaining_pages))
+		f2fs_decompress_cluster(dic);
 }

 static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index)
@ -1387,7 +1427,7 @@ retry_write:

 		ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted,
 						NULL, NULL, wbc, io_type,
-						compr_blocks);
+						compr_blocks, false);
 		if (ret) {
 			if (ret == AOP_WRITEPAGE_ACTIVATE) {
 				unlock_page(cc->rpages[i]);
@ -1422,6 +1462,9 @@ retry_write:

 		*submitted += _submitted;
 	}
+
+	f2fs_balance_fs(F2FS_M_SB(mapping), true);
+
 	return 0;
 out_err:
 	for (++i; i < cc->cluster_size; i++) {
@ -1466,6 +1509,8 @@ destroy_out:
 	return err;
 }

+static void f2fs_free_dic(struct decompress_io_ctx *dic);
+
 struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
 {
 	struct decompress_io_ctx *dic;
@ -1484,12 +1529,14 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)

 	dic->magic = F2FS_COMPRESSED_PAGE_MAGIC;
 	dic->inode = cc->inode;
-	atomic_set(&dic->pending_pages, cc->nr_cpages);
+	atomic_set(&dic->remaining_pages, cc->nr_cpages);
 	dic->cluster_idx = cc->cluster_idx;
 	dic->cluster_size = cc->cluster_size;
 	dic->log_cluster_size = cc->log_cluster_size;
 	dic->nr_cpages = cc->nr_cpages;
+	refcount_set(&dic->refcnt, 1);
 	dic->failed = false;
+	dic->need_verity = f2fs_need_verity(cc->inode, start_idx);

 	for (i = 0; i < dic->cluster_size; i++)
 		dic->rpages[i] = cc->rpages[i];
@ -1518,7 +1565,7 @@ out_free:
 	return ERR_PTR(-ENOMEM);
 }

-void f2fs_free_dic(struct decompress_io_ctx *dic)
+static void f2fs_free_dic(struct decompress_io_ctx *dic)
 {
 	int i;

@ -1546,30 +1593,88 @@ void f2fs_free_dic(struct decompress_io_ctx *dic)
 	kmem_cache_free(dic_entry_slab, dic);
 }

-void f2fs_decompress_end_io(struct page **rpages,
-			unsigned int cluster_size, bool err, bool verity)
+static void f2fs_put_dic(struct decompress_io_ctx *dic)
+{
+	if (refcount_dec_and_test(&dic->refcnt))
+		f2fs_free_dic(dic);
+}
+
+/*
+ * Update and unlock the cluster's pagecache pages, and release the reference to
+ * the decompress_io_ctx that was being held for I/O completion.
+ */
+static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
 {
 	int i;

-	for (i = 0; i < cluster_size; i++) {
-		struct page *rpage = rpages[i];
+	for (i = 0; i < dic->cluster_size; i++) {
+		struct page *rpage = dic->rpages[i];

 		if (!rpage)
 			continue;

-		if (err || PageError(rpage))
-			goto clear_uptodate;
-
-		if (!verity || fsverity_verify_page(rpage)) {
+		/* PG_error was set if verity failed. */
+		if (failed || PageError(rpage)) {
+			ClearPageUptodate(rpage);
+			/* will re-read again later */
+			ClearPageError(rpage);
+		} else {
 			SetPageUptodate(rpage);
-			goto unlock;
 		}
-clear_uptodate:
-		ClearPageUptodate(rpage);
-		ClearPageError(rpage);
-unlock:
 		unlock_page(rpage);
 	}
+
+	f2fs_put_dic(dic);
+}
+
+static void f2fs_verify_cluster(struct work_struct *work)
+{
+	struct decompress_io_ctx *dic =
+		container_of(work, struct decompress_io_ctx, verity_work);
+	int i;
+
+	/* Verify the cluster's decompressed pages with fs-verity. */
+	for (i = 0; i < dic->cluster_size; i++) {
+		struct page *rpage = dic->rpages[i];
+
+		if (rpage && !fsverity_verify_page(rpage))
+			SetPageError(rpage);
+	}
+
+	__f2fs_decompress_end_io(dic, false);
+}
+
+/*
+ * This is called when a compressed cluster has been decompressed
+ * (or failed to be read and/or decompressed).
+ */
+void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
+{
+	if (!failed && dic->need_verity) {
+		/*
+		 * Note that to avoid deadlocks, the verity work can't be done
+		 * on the decompression workqueue.  This is because verifying
+		 * the data pages can involve reading metadata pages from the
+		 * file, and these metadata pages may be compressed.
+		 */
+		INIT_WORK(&dic->verity_work, f2fs_verify_cluster);
+		fsverity_enqueue_verify_work(&dic->verity_work);
+	} else {
+		__f2fs_decompress_end_io(dic, failed);
+	}
+}
+
+/*
+ * Put a reference to a compressed page's decompress_io_ctx.
+ *
+ * This is called when the page is no longer needed and can be freed.
+ */
+void f2fs_put_page_dic(struct page *page)
+{
+	struct decompress_io_ctx *dic =
+			(struct decompress_io_ctx *)page_private(page);
+
+	f2fs_put_dic(dic);
 }

 int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi)
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@ -23,7 +23,6 @@
 #include "f2fs.h"
 #include "node.h"
 #include "segment.h"
-#include "trace.h"
 #include <trace/events/f2fs.h>
 #include <trace/events/android_fs.h>

@ -49,27 +48,6 @@ void f2fs_destroy_bioset(void)
 	bioset_exit(&f2fs_bioset);
 }

-static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask,
-						unsigned int nr_iovecs)
-{
-	return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset);
-}
-
-struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio)
-{
-	if (noio) {
-		/* No failure on bio allocation */
-		return __f2fs_bio_alloc(GFP_NOIO, npages);
-	}
-
-	if (time_to_inject(sbi, FAULT_ALLOC_BIO)) {
-		f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO);
-		return NULL;
-	}
-
-	return __f2fs_bio_alloc(GFP_KERNEL, npages);
-}
-
 static bool __is_cp_guaranteed(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
@ -114,10 +92,21 @@ static enum count_type __read_io_type(struct page *page)

 /* postprocessing steps for read bios */
 enum bio_post_read_step {
-	STEP_DECRYPT,
-	STEP_DECOMPRESS_NOWQ,		/* handle normal cluster data inplace */
-	STEP_DECOMPRESS,		/* handle compressed cluster data in workqueue */
-	STEP_VERITY,
+#ifdef CONFIG_FS_ENCRYPTION
+	STEP_DECRYPT	= 1 << 0,
+#else
+	STEP_DECRYPT	= 0,	/* compile out the decryption-related code */
+#endif
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+	STEP_DECOMPRESS	= 1 << 1,
+#else
+	STEP_DECOMPRESS	= 0,	/* compile out the decompression-related code */
+#endif
+#ifdef CONFIG_FS_VERITY
+	STEP_VERITY	= 1 << 2,
+#else
+	STEP_VERITY	= 0,	/* compile out the verity-related code */
+#endif
 };

 struct bio_post_read_ctx {
@ -127,25 +116,26 @@ struct bio_post_read_ctx {
 	unsigned int enabled_steps;
 };

-static void __read_end_io(struct bio *bio, bool compr, bool verity)
+static void f2fs_finish_read_bio(struct bio *bio)
 {
-	struct page *page;
 	struct bio_vec *bv;
-	int i;
+	int iter_all;

-	bio_for_each_segment_all(bv, bio, i) {
-		page = bv->bv_page;
+	/*
+	 * Update and unlock the bio's pagecache pages, and put the
+	 * decompression context for any compressed pages.
+	 */
+	bio_for_each_segment_all(bv, bio, iter_all) {
+		struct page *page = bv->bv_page;

-#ifdef CONFIG_F2FS_FS_COMPRESSION
-		if (compr && f2fs_is_compressed_page(page)) {
-			f2fs_decompress_pages(bio, page, verity);
+		if (f2fs_is_compressed_page(page)) {
+			if (bio->bi_status)
+				f2fs_end_read_compressed_page(page, true);
+			f2fs_put_page_dic(page);
 			continue;
 		}
-		if (verity)
-			continue;
-#endif

-		/* PG_error was set if any post_read step failed */
+		/* PG_error was set if decryption or verity failed. */
 		if (bio->bi_status || PageError(page)) {
 			ClearPageUptodate(page);
 			/* will re-read again later */
@ -156,106 +146,104 @@ static void __read_end_io(struct bio *bio, bool compr, bool verity)
 		dec_page_count(F2FS_P_SB(page), __read_io_type(page));
 		unlock_page(page);
 	}
+
+	if (bio->bi_private)
+		mempool_free(bio->bi_private, bio_post_read_ctx_pool);
+	bio_put(bio);
 }

-static void f2fs_release_read_bio(struct bio *bio);
-static void __f2fs_read_end_io(struct bio *bio, bool compr, bool verity)
-{
-	if (!compr)
-		__read_end_io(bio, false, verity);
-	f2fs_release_read_bio(bio);
-}
-
-static void f2fs_decompress_bio(struct bio *bio, bool verity)
-{
-	__read_end_io(bio, true, verity);
-}
-
-static void bio_post_read_processing(struct bio_post_read_ctx *ctx);
-
-static void f2fs_decrypt_work(struct bio_post_read_ctx *ctx)
-{
-	fscrypt_decrypt_bio(ctx->bio);
-}
-
-static void f2fs_decompress_work(struct bio_post_read_ctx *ctx)
-{
-	f2fs_decompress_bio(ctx->bio, ctx->enabled_steps & (1 << STEP_VERITY));
-}
-
-#ifdef CONFIG_F2FS_FS_COMPRESSION
-static void f2fs_verify_pages(struct page **rpages, unsigned int cluster_size)
-{
-	f2fs_decompress_end_io(rpages, cluster_size, false, true);
-}
-
-static void f2fs_verify_bio(struct bio *bio)
-{
-	struct bio_vec *bv;
-	int i;
-
-	bio_for_each_segment_all(bv, bio, i) {
-		struct page *page = bv->bv_page;
-		struct decompress_io_ctx *dic;
-
-		dic = (struct decompress_io_ctx *)page_private(page);
-
-		if (dic) {
-			if (atomic_dec_return(&dic->verity_pages))
-				continue;
-			f2fs_verify_pages(dic->rpages,
-						dic->cluster_size);
-			f2fs_free_dic(dic);
-			continue;
-		}
-
-		if (bio->bi_status || PageError(page))
-			goto clear_uptodate;
-
-		if (fsverity_verify_page(page)) {
-			SetPageUptodate(page);
-			goto unlock;
-		}
-clear_uptodate:
-		ClearPageUptodate(page);
-		ClearPageError(page);
-unlock:
-		dec_page_count(F2FS_P_SB(page), __read_io_type(page));
-		unlock_page(page);
-	}
-}
-#endif
-
-static void f2fs_verity_work(struct work_struct *work)
+static void f2fs_verify_bio(struct work_struct *work)
 {
 	struct bio_post_read_ctx *ctx =
 		container_of(work, struct bio_post_read_ctx, work);
 	struct bio *bio = ctx->bio;
-#ifdef CONFIG_F2FS_FS_COMPRESSION
-	unsigned int enabled_steps = ctx->enabled_steps;
-#endif
+	bool may_have_compressed_pages = (ctx->enabled_steps & STEP_DECOMPRESS);

 	/*
 	 * fsverity_verify_bio() may call readpages() again, and while verity
-	 * will be disabled for this, decryption may still be needed, resulting
-	 * in another bio_post_read_ctx being allocated.  So to prevent
-	 * deadlocks we need to release the current ctx to the mempool first.
-	 * This assumes that verity is the last post-read step.
+	 * will be disabled for this, decryption and/or decompression may still
+	 * be needed, resulting in another bio_post_read_ctx being allocated.
+	 * So to prevent deadlocks we need to release the current ctx to the
+	 * mempool first.  This assumes that verity is the last post-read step.
 	 */
 	mempool_free(ctx, bio_post_read_ctx_pool);
 	bio->bi_private = NULL;

-#ifdef CONFIG_F2FS_FS_COMPRESSION
-	/* previous step is decompression */
-	if (enabled_steps & (1 << STEP_DECOMPRESS)) {
-		f2fs_verify_bio(bio);
-		f2fs_release_read_bio(bio);
-		return;
-	}
-#endif
+	/*
+	 * Verify the bio's pages with fs-verity.  Exclude compressed pages,
+	 * as those were handled separately by f2fs_end_read_compressed_page().
+	 */
+	if (may_have_compressed_pages) {
+		struct bio_vec *bv;
+		int iter_all;

-	fsverity_verify_bio(bio);
-	__f2fs_read_end_io(bio, false, false);
+		bio_for_each_segment_all(bv, bio, iter_all) {
+			struct page *page = bv->bv_page;
+
+			if (!f2fs_is_compressed_page(page) &&
+			    !PageError(page) && !fsverity_verify_page(page))
+				SetPageError(page);
+		}
+	} else {
+		fsverity_verify_bio(bio);
+	}
+
+	f2fs_finish_read_bio(bio);
+}
+
+/*
+ * If the bio's data needs to be verified with fs-verity, then enqueue the
+ * verity work for the bio.  Otherwise finish the bio now.
+ *
+ * Note that to avoid deadlocks, the verity work can't be done on the
+ * decryption/decompression workqueue.  This is because verifying the data pages
+ * can involve reading verity metadata pages from the file, and these verity
+ * metadata pages may be encrypted and/or compressed.
+ */
+static void f2fs_verify_and_finish_bio(struct bio *bio)
+{
+	struct bio_post_read_ctx *ctx = bio->bi_private;
+
+	if (ctx && (ctx->enabled_steps & STEP_VERITY)) {
+		INIT_WORK(&ctx->work, f2fs_verify_bio);
+		fsverity_enqueue_verify_work(&ctx->work);
+	} else {
+		f2fs_finish_read_bio(bio);
+	}
+}
+
+/*
+ * Handle STEP_DECOMPRESS by decompressing any compressed clusters whose last
+ * remaining page was read by @ctx->bio.
+ *
+ * Note that a bio may span clusters (even a mix of compressed and uncompressed
+ * clusters) or be for just part of a cluster.  STEP_DECOMPRESS just indicates
+ * that the bio includes at least one compressed page.  The actual decompression
+ * is done on a per-cluster basis, not a per-bio basis.
+ */
+static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx)
+{
+	struct bio_vec *bv;
+	int iter_all;
+	bool all_compressed = true;
+
+	bio_for_each_segment_all(bv, ctx->bio, iter_all) {
+		struct page *page = bv->bv_page;
+
+		/* PG_error was set if decryption failed. */
+		if (f2fs_is_compressed_page(page))
+			f2fs_end_read_compressed_page(page, PageError(page));
+		else
+			all_compressed = false;
+	}
+
+	/*
+	 * Optimization: if all the bio's pages are compressed, then scheduling
+	 * the per-bio verity work is unnecessary, as verity will be fully
+	 * handled at the compression cluster level.
+	 */
+	if (all_compressed)
+		ctx->enabled_steps &= ~STEP_VERITY;
 }

 static void f2fs_post_read_work(struct work_struct *work)
@ -263,96 +251,59 @@ static void f2fs_post_read_work(struct work_struct *work)
 	struct bio_post_read_ctx *ctx =
 		container_of(work, struct bio_post_read_ctx, work);

-	if (ctx->enabled_steps & (1 << STEP_DECRYPT))
-		f2fs_decrypt_work(ctx);
+	if (ctx->enabled_steps & STEP_DECRYPT)
+		fscrypt_decrypt_bio(ctx->bio);

-	if (ctx->enabled_steps & (1 << STEP_DECOMPRESS))
-		f2fs_decompress_work(ctx);
+	if (ctx->enabled_steps & STEP_DECOMPRESS)
+		f2fs_handle_step_decompress(ctx);

-	if (ctx->enabled_steps & (1 << STEP_VERITY)) {
-		INIT_WORK(&ctx->work, f2fs_verity_work);
-		fsverity_enqueue_verify_work(&ctx->work);
-		return;
-	}
-
-	__f2fs_read_end_io(ctx->bio,
-		ctx->enabled_steps & (1 << STEP_DECOMPRESS), false);
-}
-
-static void f2fs_enqueue_post_read_work(struct f2fs_sb_info *sbi,
-						struct work_struct *work)
-{
-	queue_work(sbi->post_read_wq, work);
-}
-
-static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
-{
-	/*
-	 * We use different work queues for decryption and for verity because
-	 * verity may require reading metadata pages that need decryption, and
-	 * we shouldn't recurse to the same workqueue.
-	 */
-
-	if (ctx->enabled_steps & (1 << STEP_DECRYPT) ||
-		ctx->enabled_steps & (1 << STEP_DECOMPRESS)) {
-		INIT_WORK(&ctx->work, f2fs_post_read_work);
-		f2fs_enqueue_post_read_work(ctx->sbi, &ctx->work);
-		return;
-	}
-
-	if (ctx->enabled_steps & (1 << STEP_VERITY)) {
-		INIT_WORK(&ctx->work, f2fs_verity_work);
-		fsverity_enqueue_verify_work(&ctx->work);
-		return;
-	}
-
-	__f2fs_read_end_io(ctx->bio, false, false);
-}
-
-static bool f2fs_bio_post_read_required(struct bio *bio)
-{
-	return bio->bi_private;
+	f2fs_verify_and_finish_bio(ctx->bio);
 }

 static void f2fs_read_end_io(struct bio *bio)
 {
 	struct page *first_page = bio->bi_io_vec[0].bv_page;
 	struct f2fs_sb_info *sbi = F2FS_P_SB(first_page);
+	struct bio_post_read_ctx *ctx = bio->bi_private;

 	if (time_to_inject(sbi, FAULT_READ_IO)) {
 		f2fs_show_injection_info(sbi, FAULT_READ_IO);
 		bio->bi_status = BLK_STS_IOERR;
 	}

-	if (f2fs_bio_post_read_required(bio)) {
-		struct bio_post_read_ctx *ctx = bio->bi_private;
-
-		bio_post_read_processing(ctx);
+	if (bio->bi_status) {
+		f2fs_finish_read_bio(bio);
 		return;
 	}

 	if (first_page != NULL &&
 		__read_io_type(first_page) == F2FS_RD_DATA) {
-		trace_android_fs_dataread_end(first_page->mapping->host,
-						page_offset(first_page),
-						bio->bi_iter.bi_size);
+		trace_android_fs_dataread_end(
+					page_file_mapping(first_page)->host,
+					page_file_offset(first_page),
+					bio->bi_iter.bi_size);
 	}

-	__f2fs_read_end_io(bio, false, false);
+	if (ctx && (ctx->enabled_steps & (STEP_DECRYPT | STEP_DECOMPRESS))) {
+		INIT_WORK(&ctx->work, f2fs_post_read_work);
+		queue_work(ctx->sbi->post_read_wq, &ctx->work);
+	} else {
+		f2fs_verify_and_finish_bio(bio);
+	}
 }

 static void f2fs_write_end_io(struct bio *bio)
 {
 	struct f2fs_sb_info *sbi = bio->bi_private;
 	struct bio_vec *bvec;
-	int i;
+	int iter_all;

 	if (time_to_inject(sbi, FAULT_WRITE_IO)) {
 		f2fs_show_injection_info(sbi, FAULT_WRITE_IO);
 		bio->bi_status = BLK_STS_IOERR;
 	}

-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, iter_all) {
 		struct page *page = bvec->bv_page;
 		enum count_type type = WB_DATA_TYPE(page);

@ -449,7 +400,7 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
 	struct f2fs_sb_info *sbi = fio->sbi;
 	struct bio *bio;

-	bio = f2fs_bio_alloc(sbi, npages, true);
+	bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset);

 	f2fs_target_device(sbi, fio->new_blkaddr, bio);
 	if (is_read_io(fio->op)) {
@ -510,7 +461,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi,
 		if (f2fs_lfs_mode(sbi) && current->plug)
 			blk_finish_plug(current->plug);

-		if (F2FS_IO_ALIGNED(sbi))
+		if (!F2FS_IO_ALIGNED(sbi))
 			goto submit_io;

 		start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS;
@ -631,7 +582,7 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode,
 						struct page *page, nid_t ino)
 {
 	struct bio_vec *bvec;
-	int i;
+	int iter_all;

 	if (!bio)
 		return false;
@ -639,7 +590,7 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode,
 	if (!inode && !page && !ino)
 		return true;

-	bio_for_each_segment_all(bvec, bio, i) {
+	bio_for_each_segment_all(bvec, bio, iter_all) {
 		struct page *target = bvec->bv_page;

 		if (fscrypt_is_bounce_page(target)) {
@ -744,7 +695,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
 		return -EFSCORRUPTED;

 	trace_f2fs_submit_page_bio(page, fio);
-	f2fs_trace_ios(fio, 0);

 	/* Allocate a new bio */
 	bio = __bio_alloc(fio, 1);
@ -951,7 +901,6 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
 		return -EFSCORRUPTED;

 	trace_f2fs_submit_page_bio(page, fio);
-	f2fs_trace_ios(fio, 0);

 alloc_new:
 	if (!bio) {
@ -1047,7 +996,6 @@ alloc_new:
 		wbc_account_io(fio->io_wbc, bio_page, PAGE_SIZE);

 	io->last_block_in_bio = fio->new_blkaddr;
-	f2fs_trace_ios(fio, 0);

 	trace_f2fs_submit_page_write(fio->page, fio);
 skip:
@ -1060,24 +1008,18 @@ out:
 	up_write(&io->io_rwsem);
 }

-static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
-{
-	return fsverity_active(inode) &&
-	       idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
-}
-
 static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
 				      unsigned nr_pages, unsigned op_flag,
-				      pgoff_t first_idx, bool for_write,
-				      bool for_verity)
+				      pgoff_t first_idx, bool for_write)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct bio *bio;
 	struct bio_post_read_ctx *ctx;
 	unsigned int post_read_steps = 0;

-	bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES),
-								for_write);
+	bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL,
+			       min_t(int, nr_pages, BIO_MAX_PAGES),
+			       &f2fs_bioset);
 	if (!bio)
 		return ERR_PTR(-ENOMEM);

@ -1088,13 +1030,19 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
 	bio_set_op_attrs(bio, REQ_OP_READ, op_flag);

 	if (fscrypt_inode_uses_fs_layer_crypto(inode))
-		post_read_steps |= 1 << STEP_DECRYPT;
-	if (f2fs_compressed_file(inode))
-		post_read_steps |= 1 << STEP_DECOMPRESS_NOWQ;
-	if (for_verity && f2fs_need_verity(inode, first_idx))
-		post_read_steps |= 1 << STEP_VERITY;
+		post_read_steps |= STEP_DECRYPT;

-	if (post_read_steps) {
+	if (f2fs_need_verity(inode, first_idx))
+		post_read_steps |= STEP_VERITY;
+
+	/*
+	 * STEP_DECOMPRESS is handled specially, since a compressed file might
+	 * contain both compressed and uncompressed clusters.  We'll allocate a
+	 * bio_post_read_ctx if the file is compressed, but the caller is
+	 * responsible for enabling STEP_DECOMPRESS if it's actually needed.
+	 */
+
+	if (post_read_steps || f2fs_compressed_file(inode)) {
 		/* Due to the mempool, this never fails. */
 		ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);
 		ctx->bio = bio;
@ -1106,13 +1054,6 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
 	return bio;
 }

-static void f2fs_release_read_bio(struct bio *bio)
-{
-	if (bio->bi_private)
-		mempool_free(bio->bi_private, bio_post_read_ctx_pool);
-	bio_put(bio);
-}
-
 /* This can handle encryption stuffs */
 static int f2fs_submit_page_read(struct inode *inode, struct page *page,
 				 block_t blkaddr, int op_flags, bool for_write)
@ -1121,7 +1062,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page,
 	struct bio *bio;

 	bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags,
-					page->index, for_write, true);
+					page->index, for_write);
 	if (IS_ERR(bio))
 		return PTR_ERR(bio);

@ -2000,6 +1941,7 @@ next:
 	}

 	if (size) {
+		flags |= FIEMAP_EXTENT_MERGED;
 		if (IS_ENCRYPTED(inode))
 			flags |= FIEMAP_EXTENT_DATA_ENCRYPTED;

@ -2158,7 +2100,7 @@ submit_and_realloc:
 	if (bio == NULL) {
 		bio = f2fs_grab_read_bio(inode, block_nr, nr_pages,
 				is_readahead ? REQ_RAHEAD : 0, page->index,
-				false, true);
+				false);
 		if (IS_ERR(bio)) {
 			ret = PTR_ERR(bio);
 			bio = NULL;
@ -2204,8 +2146,6 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
 	sector_t last_block_in_file;
 	const unsigned blocksize = blks_to_bytes(inode, 1);
 	struct decompress_io_ctx *dic = NULL;
-	struct bio_post_read_ctx *ctx;
-	bool for_verity = false;
 	int i;
 	int ret = 0;

@ -2271,29 +2211,10 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
 		goto out_put_dnode;
 	}

-	/*
-	 * It's possible to enable fsverity on the fly when handling a cluster,
-	 * which requires complicated error handling. Instead of adding more
-	 * complexity, let's give a rule where end_io post-processes fsverity
-	 * per cluster. In order to do that, we need to submit bio, if previous
-	 * bio sets a different post-process policy.
-	 */
-	if (fsverity_active(cc->inode)) {
-		atomic_set(&dic->verity_pages, cc->nr_cpages);
-		for_verity = true;
-
-		if (bio) {
-			ctx = bio->bi_private;
-			if (!(ctx->enabled_steps & (1 << STEP_VERITY))) {
-				__submit_bio(sbi, bio, DATA);
-				bio = NULL;
-			}
-		}
-	}
-
 	for (i = 0; i < dic->nr_cpages; i++) {
 		struct page *page = dic->cpages[i];
 		block_t blkaddr;
+		struct bio_post_read_ctx *ctx;

 		blkaddr = data_blkaddr(dn.inode, dn.node_page,
 						dn.ofs_in_node + i + 1);
@ -2309,31 +2230,10 @@ submit_and_realloc:
 		if (!bio) {
 			bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages,
 					is_readahead ? REQ_RAHEAD : 0,
-					page->index, for_write, for_verity);
+					page->index, for_write);
 			if (IS_ERR(bio)) {
-				unsigned int remained = dic->nr_cpages - i;
-				bool release = false;
-
 				ret = PTR_ERR(bio);
-				dic->failed = true;
-
-				if (for_verity) {
-					if (!atomic_sub_return(remained,
-						&dic->verity_pages))
-						release = true;
-				} else {
-					if (!atomic_sub_return(remained,
-						&dic->pending_pages))
-						release = true;
-				}
-
-				if (release) {
-					f2fs_decompress_end_io(dic->rpages,
-						cc->cluster_size, true,
-						false);
-					f2fs_free_dic(dic);
-				}
-
+				f2fs_decompress_end_io(dic, ret);
 				f2fs_put_dnode(&dn);
 				*bio_ret = NULL;
 				return ret;
@ -2345,10 +2245,9 @@ submit_and_realloc:
 		if (bio_add_page(bio, page, blocksize, 0) < blocksize)
 			goto submit_and_realloc;

-		/* tag STEP_DECOMPRESS to handle IO in wq */
 		ctx = bio->bi_private;
-		if (!(ctx->enabled_steps & (1 << STEP_DECOMPRESS)))
-			ctx->enabled_steps |= 1 << STEP_DECOMPRESS;
+		ctx->enabled_steps |= STEP_DECOMPRESS;
+		refcount_inc(&dic->refcnt);

 		inc_page_count(sbi, F2FS_RD_DATA);
 		f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE);
@ -2365,7 +2264,13 @@ submit_and_realloc:
 out_put_dnode:
 	f2fs_put_dnode(&dn);
 out:
-	f2fs_decompress_end_io(cc->rpages, cc->cluster_size, true, false);
+	for (i = 0; i < cc->cluster_size; i++) {
+		if (cc->rpages[i]) {
+			ClearPageUptodate(cc->rpages[i]);
+			ClearPageError(cc->rpages[i]);
+			unlock_page(cc->rpages[i]);
+		}
+	}
 	*bio_ret = bio;
 	return ret;
 }
@ -2792,7 +2697,8 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
 				sector_t *last_block,
 				struct writeback_control *wbc,
 				enum iostat_type io_type,
-				int compr_blocks)
+				int compr_blocks,
+				bool allow_balance)
 {
 	struct inode *inode = page->mapping->host;
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@ -2930,7 +2836,7 @@ out:
 	}
 	unlock_page(page);
 	if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) &&
-					!F2FS_I(inode)->cp_task)
+			!F2FS_I(inode)->cp_task && allow_balance)
 		f2fs_balance_fs(sbi, need_balance_fs);

 	if (unlikely(f2fs_cp_error(sbi))) {
@ -2977,7 +2883,7 @@ out:
 #endif

 	return f2fs_write_single_data_page(page, NULL, NULL, NULL,
-						wbc, FS_DATA_IO, 0);
+						wbc, FS_DATA_IO, 0, true);
 }

 /*
@ -3147,7 +3053,8 @@ continue_unlock:
 			}
 #endif
 			ret = f2fs_write_single_data_page(page, &submitted,
-					&bio, &last_block, wbc, io_type, 0);
+					&bio, &last_block, wbc, io_type,
+					0, true);
 			if (ret == AOP_WRITEPAGE_ACTIVATE)
 				unlock_page(page);
 #ifdef CONFIG_F2FS_FS_COMPRESSION
@ -3923,7 +3830,7 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
 		filemap_write_and_wait(mapping);

 	/* Block number less than F2FS MAX BLOCKS */
-	if (unlikely(block >= F2FS_I_SB(inode)->max_file_blocks))
+	if (unlikely(block >= max_file_blocks(inode)))
 		goto out;

 	if (f2fs_compressed_file(inode)) {
@ -4194,12 +4101,13 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
 	if (!f2fs_disable_compressed_file(inode))
 		return -EINVAL;

+	f2fs_precache_extents(inode);
+
 	ret = check_swap_activate(sis, file, span);
 	if (ret < 0)
 		return ret;

 	set_inode_flag(inode, FI_PIN_FILE);
-	f2fs_precache_extents(inode);
 	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
 	return ret;
 }
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@ -120,6 +120,13 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 			atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt);
 		si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks;
 	}
+	si->nr_issued_ckpt = atomic_read(&sbi->cprc_info.issued_ckpt);
+	si->nr_total_ckpt = atomic_read(&sbi->cprc_info.total_ckpt);
+	si->nr_queued_ckpt = atomic_read(&sbi->cprc_info.queued_ckpt);
+	spin_lock(&sbi->cprc_info.stat_lock);
+	si->cur_ckpt_time = sbi->cprc_info.cur_time;
+	si->peak_ckpt_time = sbi->cprc_info.peak_time;
+	spin_unlock(&sbi->cprc_info.stat_lock);
 	si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
 	si->rsvd_segs = reserved_segments(sbi);
 	si->overp_segs = overprovision_segments(sbi);
@ -417,6 +424,11 @@ static int stat_show(struct seq_file *s, void *v)
 				si->meta_count[META_NAT]);
 		seq_printf(s, "  - ssa blocks : %u\n",
 				si->meta_count[META_SSA]);
+		seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, "
+				"Cur time: %4d(ms), Peak time: %4d(ms))\n",
+				si->nr_queued_ckpt, si->nr_issued_ckpt,
+				si->nr_total_ckpt, si->cur_ckpt_time,
+				si->peak_ckpt_time);
 		seq_printf(s, "GC calls: %d (BG: %d)\n",
 			   si->call_count, si->bg_gc);
 		seq_printf(s, "  - data segments : %d (%d)\n",
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@ -42,7 +42,6 @@ enum {
 	FAULT_KVMALLOC,
 	FAULT_PAGE_ALLOC,
 	FAULT_PAGE_GET,
-	FAULT_ALLOC_BIO,
 	FAULT_ALLOC_NID,
 	FAULT_ORPHAN,
 	FAULT_BLOCK,
@ -96,6 +95,7 @@ extern const char *f2fs_fault_name[FAULT_MAX];
 #define F2FS_MOUNT_DISABLE_CHECKPOINT	0x02000000
 #define F2FS_MOUNT_NORECOVERY		0x04000000
 #define F2FS_MOUNT_ATGC			0x08000000
+#define F2FS_MOUNT_MERGE_CHECKPOINT	0x10000000

 #define F2FS_OPTION(sbi)	((sbi)->mount_opt)
 #define clear_opt(sbi, option)	(F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@ -148,6 +148,7 @@ struct f2fs_mount_info {
 	/* For compression */
 	unsigned char compress_algorithm;	/* algorithm type */
 	unsigned char compress_log_size;	/* cluster log size */
+	unsigned char compress_level;		/* compress level */
 	bool compress_chksum;			/* compressed data chksum */
 	unsigned char compress_ext_cnt;		/* extension count */
 	int compress_mode;			/* compression mode */
@ -268,6 +269,26 @@ struct fsync_node_entry {
 	unsigned int seq_id;	/* sequence id */
 };

+struct ckpt_req {
+	struct completion wait;		/* completion for checkpoint done */
+	struct llist_node llnode;	/* llist_node to be linked in wait queue */
+	int ret;			/* return code of checkpoint */
+	ktime_t queue_time;		/* request queued time */
+};
+
+struct ckpt_req_control {
+	struct task_struct *f2fs_issue_ckpt;	/* checkpoint task */
+	int ckpt_thread_ioprio;			/* checkpoint merge thread ioprio */
+	wait_queue_head_t ckpt_wait_queue;	/* waiting queue for wake-up */
+	atomic_t issued_ckpt;		/* # of actually issued ckpts */
+	atomic_t total_ckpt;		/* # of total ckpts */
+	atomic_t queued_ckpt;		/* # of queued ckpts */
+	struct llist_head issue_list;	/* list for command issue */
+	spinlock_t stat_lock;		/* lock for below checkpoint time stats */
+	unsigned int cur_time;		/* cur wait time in msec for currently issued checkpoint */
+	unsigned int peak_time;		/* peak wait time in msec until now */
+};
+
 /* for the bitmap indicate blocks to be discarded */
 struct discard_entry {
 	struct list_head list;	/* list head */
@ -737,6 +758,7 @@ struct f2fs_inode_info {
 	atomic_t i_compr_blocks;		/* # of compressed blocks */
 	unsigned char i_compress_algorithm;	/* algorithm type */
 	unsigned char i_log_cluster_size;	/* log of cluster size */
+	unsigned char i_compress_level;		/* compress level (lz4hc,zstd) */
 	unsigned short i_compress_flag;		/* compress flag */
 	unsigned int i_cluster_size;		/* cluster size */
 };
@ -1317,6 +1339,8 @@ struct compress_data {

 #define F2FS_COMPRESSED_PAGE_MAGIC	0xF5F2C000

+#define	COMPRESS_LEVEL_OFFSET	8
+
 /* compress context */
 struct compress_ctx {
 	struct inode *inode;		/* inode the context belong to */
@ -1344,7 +1368,7 @@ struct compress_io_ctx {
 	atomic_t pending_pages;		/* in-flight compressed page count */
 };

-/* decompress io context for read IO path */
+/* Context for decompressing one cluster on the read IO path */
 struct decompress_io_ctx {
 	u32 magic;			/* magic number to indicate page is compressed */
 	struct inode *inode;		/* inode the context belong to */
@ -1360,11 +1384,37 @@ struct decompress_io_ctx {
 	struct compress_data *cbuf;	/* virtual mapped address on cpages */
 	size_t rlen;			/* valid data length in rbuf */
 	size_t clen;			/* valid data length in cbuf */
-	atomic_t pending_pages;		/* in-flight compressed page count */
-	atomic_t verity_pages;		/* in-flight page count for verity */
-	bool failed;			/* indicate IO error during decompression */
+
+	/*
+	 * The number of compressed pages remaining to be read in this cluster.
+	 * This is initially nr_cpages.  It is decremented by 1 each time a page
+	 * has been read (or failed to be read).  When it reaches 0, the cluster
+	 * is decompressed (or an error is reported).
+	 *
+	 * If an error occurs before all the pages have been submitted for I/O,
+	 * then this will never reach 0.  In this case the I/O submitter is
+	 * responsible for calling f2fs_decompress_end_io() instead.
+	 */
+	atomic_t remaining_pages;
+
+	/*
+	 * Number of references to this decompress_io_ctx.
+	 *
+	 * One reference is held for I/O completion.  This reference is dropped
+	 * after the pagecache pages are updated and unlocked -- either after
+	 * decompression (and verity if enabled), or after an error.
+	 *
+	 * In addition, each compressed page holds a reference while it is in a
+	 * bio.  These references are necessary prevent compressed pages from
+	 * being freed while they are still in a bio.
+	 */
+	refcount_t refcnt;
+
+	bool failed;			/* IO error occurred before decompression? */
+	bool need_verity;		/* need fs-verity verification after decompression? */
 	void *private;			/* payload buffer for specified decompression algorithm */
 	void *private2;			/* extra payload buffer */
+	struct work_struct verity_work;	/* work to verify the decompressed pages */
 };

 #define NULL_CLUSTER			((unsigned int)(~0))
@ -1411,6 +1461,7 @@ struct f2fs_sb_info {
 	wait_queue_head_t cp_wait;
 	unsigned long last_time[MAX_TIME];	/* to store time in jiffies */
 	long interval_time[MAX_TIME];		/* to store thresholds */
+	struct ckpt_req_control cprc_info;	/* for checkpoint request control */

 	struct inode_management im[MAX_INO_ENTRY];	/* manage inode cache */

@ -1451,7 +1502,6 @@ struct f2fs_sb_info {
 	unsigned int total_sections;		/* total section count */
 	unsigned int total_node_count;		/* total node block count */
 	unsigned int total_valid_node_count;	/* valid node block count */
-	loff_t max_file_blocks;			/* max block index of file */
 	int dir_level;				/* directory level */
 	int readdir_ra;				/* readahead inode in readdir */
 	u64 max_io_bytes;			/* max io bytes to merge IOs */
@ -1548,9 +1598,12 @@ struct f2fs_sb_info {
 	unsigned int node_io_flag;

 	/* For sysfs suppport */
-	struct kobject s_kobj;
+	struct kobject s_kobj;			/* /sys/fs/f2fs/<devname> */
 	struct completion s_kobj_unregister;

+	struct kobject s_stat_kobj;		/* /sys/fs/f2fs/<devname>/stat */
+	struct completion s_stat_kobj_unregister;
+
 	/* For shrinker support */
 	struct list_head s_list;
 	int s_ndevs;				/* number of devices */
@ -3247,6 +3300,7 @@ int f2fs_inode_dirtied(struct inode *inode, bool sync);
 void f2fs_inode_synced(struct inode *inode);
 int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly);
 int f2fs_quota_sync(struct super_block *sb, int type);
+loff_t max_file_blocks(struct inode *inode);
 void f2fs_quota_off_umount(struct super_block *sb);
 int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover);
 int f2fs_sync_fs(struct super_block *sb, int sync);
@ -3431,13 +3485,16 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc);
 void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi);
 int __init f2fs_create_checkpoint_caches(void);
 void f2fs_destroy_checkpoint_caches(void);
+int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi);
+int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi);
+void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi);
+void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi);

 /*
 * data.c
 */
 int __init f2fs_init_bioset(void);
 void f2fs_destroy_bioset(void);
-struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio);
 int f2fs_init_bio_entry_cache(void);
 void f2fs_destroy_bio_entry_cache(void);
 void f2fs_submit_bio(struct f2fs_sb_info *sbi,
@ -3485,7 +3542,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
 				struct bio **bio, sector_t *last_block,
 				struct writeback_control *wbc,
 				enum iostat_type io_type,
-				int compr_blocks);
+				int compr_blocks, bool allow_balance);
 void f2fs_invalidate_page(struct page *page, unsigned int offset,
 			unsigned int length);
 int f2fs_release_page(struct page *page, gfp_t wait);
@ -3546,6 +3603,8 @@ struct f2fs_stat_info {
 	int nr_discarding, nr_discarded;
 	int nr_discard_cmd;
 	unsigned int undiscard_blks;
+	int nr_issued_ckpt, nr_total_ckpt, nr_queued_ckpt;
+	unsigned int cur_ckpt_time, peak_ckpt_time;
 	int inline_xattr, inline_inode, inline_dir, append, update, orphans;
 	int compr_inode;
 	unsigned long long compr_blocks;
@ -3731,8 +3790,6 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi);
 #define stat_dec_compr_inode(inode)			do { } while (0)
 #define stat_add_compr_blocks(inode, blocks)		do { } while (0)
 #define stat_sub_compr_blocks(inode, blocks)		do { } while (0)
-#define stat_inc_atomic_write(inode)			do { } while (0)
-#define stat_dec_atomic_write(inode)			do { } while (0)
 #define stat_update_max_atomic_write(inode)		do { } while (0)
 #define stat_inc_volatile_write(inode)			do { } while (0)
 #define stat_dec_volatile_write(inode)			do { } while (0)
@ -3892,7 +3949,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page);
 bool f2fs_is_compress_backend_ready(struct inode *inode);
 int f2fs_init_compress_mempool(void);
 void f2fs_destroy_compress_mempool(void);
-void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity);
+void f2fs_end_read_compressed_page(struct page *page, bool failed);
 bool f2fs_cluster_is_empty(struct compress_ctx *cc);
 bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index);
 void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page);
@ -3905,9 +3962,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
 				unsigned nr_pages, sector_t *last_block_in_bio,
 				bool is_readahead, bool for_write);
 struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
-void f2fs_free_dic(struct decompress_io_ctx *dic);
-void f2fs_decompress_end_io(struct page **rpages,
-			unsigned int cluster_size, bool err, bool verity);
+void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed);
+void f2fs_put_page_dic(struct page *page);
 int f2fs_init_compress_ctx(struct compress_ctx *cc);
 void f2fs_destroy_compress_ctx(struct compress_ctx *cc);
 void f2fs_init_compress_info(struct f2fs_sb_info *sbi);
@ -3931,6 +3987,14 @@ static inline struct page *f2fs_compress_control_page(struct page *page)
 }
 static inline int f2fs_init_compress_mempool(void) { return 0; }
 static inline void f2fs_destroy_compress_mempool(void) { }
+static inline void f2fs_end_read_compressed_page(struct page *page, bool failed)
+{
+	WARN_ON_ONCE(1);
+}
+static inline void f2fs_put_page_dic(struct page *page)
+{
+	WARN_ON_ONCE(1);
+}
 static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; }
 static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { }
 static inline int __init f2fs_init_compress_cache(void) { return 0; }
@ -3950,6 +4014,11 @@ static inline void set_compress_context(struct inode *inode)
 				1 << COMPRESS_CHKSUM : 0;
 	F2FS_I(inode)->i_cluster_size =
 			1 << F2FS_I(inode)->i_log_cluster_size;
+	if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 &&
+			F2FS_OPTION(sbi).compress_level)
+		F2FS_I(inode)->i_compress_flag |=
+				F2FS_OPTION(sbi).compress_level <<
+				COMPRESS_LEVEL_OFFSET;
 	F2FS_I(inode)->i_flags |= F2FS_COMPR_FL;
 	set_inode_flag(inode, FI_COMPRESSED_FILE);
 	stat_inc_compr_inode(inode);
@ -4150,6 +4219,12 @@ static inline bool f2fs_force_buffered_io(struct inode *inode,
 	return false;
 }

+static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
+{
+	return fsverity_active(inode) &&
+	       idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+}
+
 #ifdef CONFIG_F2FS_FAULT_INJECTION
 extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
 							unsigned int type);
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@ -29,7 +29,6 @@
 #include "xattr.h"
 #include "acl.h"
 #include "gc.h"
-#include "trace.h"
 #include <trace/events/f2fs.h>
 #include <trace/events/android_fs.h>
 #include <uapi/linux/f2fs.h>
@ -61,6 +60,9 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
 	bool need_alloc = true;
 	int err = 0;

+	if (unlikely(IS_IMMUTABLE(inode)))
+		return VM_FAULT_SIGBUS;
+
 	if (unlikely(f2fs_cp_error(sbi))) {
 		err = -EIO;
 		goto err;
@ -74,6 +76,10 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
 		goto err;
 	}

+	err = f2fs_convert_inline_inode(inode);
+	if (err)
+		goto err;
+
 #ifdef CONFIG_F2FS_FS_COMPRESSION
 	if (f2fs_compressed_file(inode)) {
 		int ret = f2fs_is_compressed_cluster(inode, page->index);
@ -379,9 +385,7 @@ flush_out:
 	f2fs_update_time(sbi, REQ_TIME);
 out:
 	trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret);
-	f2fs_trace_ios(NULL, 1);
 	trace_android_fs_fsync_end(inode, start, end - start);
-
 	return ret;
 }

@ -517,6 +521,9 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
 	struct inode *inode = file->f_mapping->host;
 	loff_t maxbytes = inode->i_sb->s_maxbytes;

+	if (f2fs_compressed_file(inode))
+		maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS;
+
 	switch (whence) {
 	case SEEK_SET:
 	case SEEK_CUR:
@ -536,7 +543,6 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
 static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file_inode(file);
-	int err;

 	if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
 		return -EIO;
@ -544,11 +550,6 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!f2fs_is_compress_backend_ready(inode))
 		return -EOPNOTSUPP;

-	/* we don't need to use inline_data strictly */
-	err = f2fs_convert_inline_inode(inode);
-	if (err)
-		return err;
-
 	file_accessed(file);
 	vma->vm_ops = &f2fs_file_vm_ops;
 	set_inode_flag(inode, FI_MMAP_FILE);
@ -701,7 +702,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock)

 	free_from = (pgoff_t)F2FS_BLK_ALIGN(from);

-	if (free_from >= sbi->max_file_blocks)
+	if (free_from >= max_file_blocks(inode))
 		goto free_partial;

 	if (lock)
@ -907,6 +908,14 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
 	if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
 		return -EIO;

+	if (unlikely(IS_IMMUTABLE(inode)))
+		return -EPERM;
+
+	if (unlikely(IS_APPEND(inode) &&
+			(attr->ia_valid & (ATTR_MODE | ATTR_UID |
+				  ATTR_GID | ATTR_TIMES_SET))))
+		return -EPERM;
+
 	if ((attr->ia_valid & ATTR_SIZE) &&
 		!f2fs_is_compress_backend_ready(inode))
 		return -EOPNOTSUPP;
@ -991,8 +1000,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)

 	if (attr->ia_valid & ATTR_MODE) {
 		err = posix_acl_chmod(inode, f2fs_get_inode_mode(inode));
-		if (err || is_inode_flag_set(inode, FI_ACL_MODE)) {
-			inode->i_mode = F2FS_I(inode)->i_acl_mode;
+
+		if (is_inode_flag_set(inode, FI_ACL_MODE)) {
+			if (!err)
+				inode->i_mode = F2FS_I(inode)->i_acl_mode;
 			clear_inode_flag(inode, FI_ACL_MODE);
 		}
 	}
@ -2774,7 +2785,7 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
 		return -EINVAL;

 	if (unlikely((range.start + range.len) >> PAGE_SHIFT >
-					sbi->max_file_blocks))
+					max_file_blocks(inode)))
 		return -EINVAL;

 	err = mnt_want_write_file(filp);
@ -3337,7 +3348,7 @@ int f2fs_precache_extents(struct inode *inode)
 	map.m_next_extent = &m_next_extent;
 	map.m_seg_type = NO_CHECK_TYPE;
 	map.m_may_create = false;
-	end = F2FS_I_SB(inode)->max_file_blocks;
+	end = max_file_blocks(inode);

 	while (map.m_lblk < end) {
 		map.m_len = end - map.m_lblk;
@ -3401,6 +3412,14 @@ static int f2fs_ioc_measure_verity(struct file *filp, unsigned long arg)
 	return fsverity_ioctl_measure(filp, (void __user *)arg);
 }

+static int f2fs_ioc_read_verity_metadata(struct file *filp, unsigned long arg)
+{
+	if (!f2fs_sb_has_verity(F2FS_I_SB(file_inode(filp))))
+		return -EOPNOTSUPP;
+
+	return fsverity_ioctl_read_metadata(filp, (const void __user *)arg);
+}
+
 static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
@ -4084,8 +4103,10 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len)

 	for (i = 0; i < page_len; i++, redirty_idx++) {
 		page = find_lock_page(mapping, redirty_idx);
-		if (!page)
-			ret = -ENOENT;
+		if (!page) {
+			ret = -ENOMEM;
+			break;
+		}
 		set_page_dirty(page);
 		f2fs_put_page(page, 1);
 		f2fs_put_page(page, 0);
@ -4313,6 +4334,8 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return f2fs_ioc_enable_verity(filp, arg);
 	case FS_IOC_MEASURE_VERITY:
 		return f2fs_ioc_measure_verity(filp, arg);
+	case FS_IOC_READ_VERITY_METADATA:
+		return f2fs_ioc_read_verity_metadata(filp, arg);
 	case FS_IOC_GETFSLABEL:
 		return f2fs_ioc_getfslabel(filp, arg);
 	case FS_IOC_SETFSLABEL:
@ -4390,6 +4413,11 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		inode_lock(inode);
 	}

+	if (unlikely(IS_IMMUTABLE(inode))) {
+		ret = -EPERM;
+		goto unlock;
+	}
+
 	ret = generic_write_checks(iocb, from);
 	if (ret > 0) {
 		bool preallocated = false;
@ -4454,6 +4482,7 @@ write:
 		if (ret > 0)
 			f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret);
 	}
+unlock:
 	inode_unlock(inode);
 out:
 	trace_f2fs_file_write_iter(inode, iocb->ki_pos,
@ -4563,6 +4592,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case F2FS_IOC_RESIZE_FS:
 	case FS_IOC_ENABLE_VERITY:
 	case FS_IOC_MEASURE_VERITY:
+	case FS_IOC_READ_VERITY_METADATA:
 	case FS_IOC_GETFSLABEL:
 	case FS_IOC_SETFSLABEL:
 	case F2FS_IOC_GET_COMPRESS_BLOCKS:
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@ -1174,8 +1174,6 @@ static int move_data_block(struct inode *inode, block_t bidx,
 	if (err)
 		goto put_out;

-	set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
-
 	/* read page */
 	fio.page = page;
 	fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
@ -1212,6 +1210,9 @@ static int move_data_block(struct inode *inode, block_t bidx,
 		}
 	}

+	set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
+
+	/* allocate block address */
 	f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
 				&sum, type, NULL);

@ -1238,9 +1239,6 @@ static int move_data_block(struct inode *inode, block_t bidx,
 	set_page_writeback(fio.encrypted_page);
 	ClearPageError(page);

-	/* allocate block address */
-	f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true);
-
 	fio.op = REQ_OP_WRITE;
 	fio.op_flags = REQ_SYNC;
 	fio.new_blkaddr = newaddr;
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@ -17,7 +17,6 @@
 #include "node.h"
 #include "segment.h"
 #include "xattr.h"
-#include "trace.h"
 #include <trace/events/f2fs.h>

 #define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock)
@ -2106,7 +2105,6 @@ static int f2fs_set_node_page_dirty(struct page *page)
 		__set_page_dirty_nobuffers(page);
 		inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
 		f2fs_set_page_private(page, 0);
-		f2fs_trace_pid(page);
 		return 1;
 	}
 	return 0;
@ -2713,7 +2711,7 @@ retry:
 	src = F2FS_INODE(page);
 	dst = F2FS_INODE(ipage);

-	memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src);
+	memcpy(dst, src, offsetof(struct f2fs_inode, i_ext));
 	dst->i_size = 0;
 	dst->i_blocks = cpu_to_le64(1);
 	dst->i_links = cpu_to_le32(1);
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@ -20,7 +20,6 @@
 #include "segment.h"
 #include "node.h"
 #include "gc.h"
-#include "trace.h"
 #include <trace/events/f2fs.h>

 #define __reverse_ffz(x) __reverse_ffs(~(x))
@ -187,8 +186,6 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page)
 {
 	struct inmem_pages *new;

-	f2fs_trace_pid(page);
-
 	f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE);

 	new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
@ -569,17 +566,7 @@ do_sync:
 static int __submit_flush_wait(struct f2fs_sb_info *sbi,
 				struct block_device *bdev)
 {
-	struct bio *bio;
-	int ret;
-
-	bio = f2fs_bio_alloc(sbi, 0, false);
-	if (!bio)
-		return -ENOMEM;
-
-	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
-	bio_set_dev(bio, bdev);
-	ret = submit_bio_wait(bio);
-	bio_put(bio);
+	int ret = blkdev_issue_flush(bdev, GFP_NOFS, NULL);

 	trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER),
 				test_opt(sbi, FLUSH_MERGE), ret);
@ -613,8 +600,6 @@ repeat:
 	if (kthread_should_stop())
 		return 0;

-	sb_start_intwrite(sbi->sb);
-
 	if (!llist_empty(&fcc->issue_list)) {
 		struct flush_cmd *cmd, *next;
 		int ret;
@ -635,8 +620,6 @@ repeat:
 		fcc->dispatch_list = NULL;
 	}

-	sb_end_intwrite(sbi->sb);
-
 	wait_event_interruptible(*q,
 		kthread_should_stop() || !llist_empty(&fcc->issue_list));
 	goto repeat;
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@ -24,13 +24,14 @@
 #include <linux/sysfs.h>
 #include <linux/quota.h>
 #include <linux/unicode.h>
+#include <linux/zstd.h>
+#include <linux/lz4.h>

 #include "f2fs.h"
 #include "node.h"
 #include "segment.h"
 #include "xattr.h"
 #include "gc.h"
-#include "trace.h"

 #define CREATE_TRACE_POINTS
 #include <trace/events/f2fs.h>
@ -44,7 +45,6 @@ const char *f2fs_fault_name[FAULT_MAX] = {
 	[FAULT_KVMALLOC]	= "kvmalloc",
 	[FAULT_PAGE_ALLOC]	= "page alloc",
 	[FAULT_PAGE_GET]	= "page get",
-	[FAULT_ALLOC_BIO]	= "alloc bio",
 	[FAULT_ALLOC_NID]	= "alloc nid",
 	[FAULT_ORPHAN]		= "orphan",
 	[FAULT_BLOCK]		= "no more block",
@ -142,6 +142,8 @@ enum {
 	Opt_checkpoint_disable_cap,
 	Opt_checkpoint_disable_cap_perc,
 	Opt_checkpoint_enable,
+	Opt_checkpoint_merge,
+	Opt_nocheckpoint_merge,
 	Opt_compress_algorithm,
 	Opt_compress_log_size,
 	Opt_compress_extension,
@ -212,6 +214,8 @@ static match_table_t f2fs_tokens = {
 	{Opt_checkpoint_disable_cap, "checkpoint=disable:%u"},
 	{Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"},
 	{Opt_checkpoint_enable, "checkpoint=enable"},
+	{Opt_checkpoint_merge, "checkpoint_merge"},
+	{Opt_nocheckpoint_merge, "nocheckpoint_merge"},
 	{Opt_compress_algorithm, "compress_algorithm=%s"},
 	{Opt_compress_log_size, "compress_log_size=%u"},
 	{Opt_compress_extension, "compress_extension=%s"},
@ -463,6 +467,74 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb,
 	return 0;
 }

+#ifdef CONFIG_F2FS_FS_COMPRESSION
+#ifdef CONFIG_F2FS_FS_LZ4
+static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str)
+{
+#ifdef CONFIG_F2FS_FS_LZ4HC
+	unsigned int level;
+#endif
+
+	if (strlen(str) == 3) {
+		F2FS_OPTION(sbi).compress_level = 0;
+		return 0;
+	}
+
+#ifdef CONFIG_F2FS_FS_LZ4HC
+	str += 3;
+
+	if (str[0] != ':') {
+		f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>");
+		return -EINVAL;
+	}
+	if (kstrtouint(str + 1, 10, &level))
+		return -EINVAL;
+
+	if (level < LZ4HC_MIN_CLEVEL || level > LZ4HC_MAX_CLEVEL) {
+		f2fs_info(sbi, "invalid lz4hc compress level: %d", level);
+		return -EINVAL;
+	}
+
+	F2FS_OPTION(sbi).compress_level = level;
+	return 0;
+#else
+	f2fs_info(sbi, "kernel doesn't support lz4hc compression");
+	return -EINVAL;
+#endif
+}
+#endif
+
+#ifdef CONFIG_F2FS_FS_ZSTD
+static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str)
+{
+	unsigned int level;
+	int len = 4;
+
+	if (strlen(str) == len) {
+		F2FS_OPTION(sbi).compress_level = 0;
+		return 0;
+	}
+
+	str += len;
+
+	if (str[0] != ':') {
+		f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>");
+		return -EINVAL;
+	}
+	if (kstrtouint(str + 1, 10, &level))
+		return -EINVAL;
+
+	if (!level || level > ZSTD_maxCLevel()) {
+		f2fs_info(sbi, "invalid zstd compress level: %d", level);
+		return -EINVAL;
+	}
+
+	F2FS_OPTION(sbi).compress_level = level;
+	return 0;
+}
+#endif
+#endif
+
 static int parse_options(struct super_block *sb, char *options, bool is_remount)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
@ -871,6 +943,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
 		case Opt_checkpoint_enable:
 			clear_opt(sbi, DISABLE_CHECKPOINT);
 			break;
+		case Opt_checkpoint_merge:
+			set_opt(sbi, MERGE_CHECKPOINT);
+			break;
+		case Opt_nocheckpoint_merge:
+			clear_opt(sbi, MERGE_CHECKPOINT);
+			break;
 #ifdef CONFIG_F2FS_FS_COMPRESSION
 		case Opt_compress_algorithm:
 			if (!f2fs_sb_has_compression(sbi)) {
@ -881,14 +959,37 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
 			if (!name)
 				return -ENOMEM;
 			if (!strcmp(name, "lzo")) {
+#ifdef CONFIG_F2FS_FS_LZO
+				F2FS_OPTION(sbi).compress_level = 0;
 				F2FS_OPTION(sbi).compress_algorithm =
 								COMPRESS_LZO;
-			} else if (!strcmp(name, "lz4")) {
+#else
+				f2fs_info(sbi, "kernel doesn't support lzo compression");
+#endif
+			} else if (!strncmp(name, "lz4", 3)) {
+#ifdef CONFIG_F2FS_FS_LZ4
+				ret = f2fs_set_lz4hc_level(sbi, name);
+				if (ret) {
+					kfree(name);
+					return -EINVAL;
+				}
 				F2FS_OPTION(sbi).compress_algorithm =
 								COMPRESS_LZ4;
-			} else if (!strcmp(name, "zstd")) {
+#else
+				f2fs_info(sbi, "kernel doesn't support lz4 compression");
+#endif
+			} else if (!strncmp(name, "zstd", 4)) {
+#ifdef CONFIG_F2FS_FS_ZSTD
+				ret = f2fs_set_zstd_level(sbi, name);
+				if (ret) {
+					kfree(name);
+					return -EINVAL;
+				}
 				F2FS_OPTION(sbi).compress_algorithm =
 								COMPRESS_ZSTD;
+#else
+				f2fs_info(sbi, "kernel doesn't support zstd compression");
+#endif
 			} else {
 				kfree(name);
 				return -EINVAL;
@ -1248,6 +1349,12 @@ static void f2fs_put_super(struct super_block *sb)
 	/* prevent remaining shrinker jobs */
 	mutex_lock(&sbi->umount_mutex);

+	/*
+	 * flush all issued checkpoints and stop checkpoint issue thread.
+	 * after then, all checkpoints should be done by each process context.
+	 */
+	f2fs_stop_ckpt_thread(sbi);
+
 	/*
 	 * We don't need to do checkpoint when superblock is clean.
 	 * But, the previous checkpoint was not done by umount, it needs to do
@ -1346,16 +1453,8 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
 	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
 		return -EAGAIN;

-	if (sync) {
-		struct cp_control cpc;
-
-		cpc.reason = __get_cp_reason(sbi);
-
-		down_write(&sbi->gc_lock);
-		err = f2fs_write_checkpoint(sbi, &cpc);
-		up_write(&sbi->gc_lock);
-	}
-	f2fs_trace_ios(NULL, 1);
+	if (sync)
+		err = f2fs_issue_checkpoint(sbi);

 	return err;
 }
@ -1372,6 +1471,10 @@ static int f2fs_freeze(struct super_block *sb)
 	/* must be clean, since sync_filesystem() was already called */
 	if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY))
 		return -EINVAL;
+
+	/* ensure no checkpoint required */
+	if (!llist_empty(&F2FS_SB(sb)->cprc_info.issue_list))
+		return -EINVAL;
 	return 0;
 }

@ -1540,6 +1643,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq,
 	}
 	seq_printf(seq, ",compress_algorithm=%s", algtype);

+	if (F2FS_OPTION(sbi).compress_level)
+		seq_printf(seq, ":%d", F2FS_OPTION(sbi).compress_level);
+
 	seq_printf(seq, ",compress_log_size=%u",
 			F2FS_OPTION(sbi).compress_log_size);

@ -1677,6 +1783,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
 	if (test_opt(sbi, DISABLE_CHECKPOINT))
 		seq_printf(seq, ",checkpoint=disable:%u",
 				F2FS_OPTION(sbi).unusable_cap);
+	if (test_opt(sbi, MERGE_CHECKPOINT))
+		seq_puts(seq, ",checkpoint_merge");
+	else
+		seq_puts(seq, ",nocheckpoint_merge");
 	if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_POSIX)
 		seq_printf(seq, ",fsync_mode=%s", "posix");
 	else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT)
@ -1961,6 +2071,19 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
 		}
 	}

+	if (!test_opt(sbi, DISABLE_CHECKPOINT) &&
+			test_opt(sbi, MERGE_CHECKPOINT)) {
+		err = f2fs_start_ckpt_thread(sbi);
+		if (err) {
+			f2fs_err(sbi,
+			    "Failed to start F2FS issue_checkpoint_thread (%d)",
+			    err);
+			goto restore_gc;
+		}
+	} else {
+		f2fs_stop_ckpt_thread(sbi);
+	}
+
 	/*
 	 * We stop issue flush thread if FS is mounted as RO
 	 * or if flush_merge is not passed in mount option.
@ -2652,10 +2775,10 @@ static const struct export_operations f2fs_export_ops = {
 	.get_parent = f2fs_get_parent,
 };

-static loff_t max_file_blocks(void)
+loff_t max_file_blocks(struct inode *inode)
 {
 	loff_t result = 0;
-	loff_t leaf_count = DEF_ADDRS_PER_BLOCK;
+	loff_t leaf_count;

 	/*
 	 * note: previously, result is equal to (DEF_ADDRS_PER_INODE -
@ -2664,6 +2787,11 @@ static loff_t max_file_blocks(void)
 	 * result as zero.
 	 */

+	if (inode && f2fs_compressed_file(inode))
+		leaf_count = ADDRS_PER_BLOCK(inode);
+	else
+		leaf_count = DEF_ADDRS_PER_BLOCK;
+
 	/* two direct node blocks */
 	result += (leaf_count * 2);

@ -3546,8 +3674,7 @@ try_onemore:
 	if (err)
 		goto free_options;

-	sbi->max_file_blocks = max_file_blocks();
-	sb->s_maxbytes = sbi->max_file_blocks <<
+	sb->s_maxbytes = max_file_blocks(NULL) <<
 				le32_to_cpu(raw_super->log_blocksize);
 	sb->s_max_links = F2FS_LINK_MAX;

@ -3714,6 +3841,19 @@ try_onemore:

 	f2fs_init_fsync_node_info(sbi);

+	/* setup checkpoint request control and start checkpoint issue thread */
+	f2fs_init_ckpt_req_control(sbi);
+	if (!test_opt(sbi, DISABLE_CHECKPOINT) &&
+			test_opt(sbi, MERGE_CHECKPOINT)) {
+		err = f2fs_start_ckpt_thread(sbi);
+		if (err) {
+			f2fs_err(sbi,
+			    "Failed to start F2FS issue_checkpoint_thread (%d)",
+			    err);
+			goto stop_ckpt_thread;
+		}
+	}
+
 	/* setup f2fs internal modules */
 	err = f2fs_build_segment_manager(sbi);
 	if (err) {
@ -3799,12 +3939,10 @@ try_onemore:
 		 * previous checkpoint was not done by clean system shutdown.
 		 */
 		if (f2fs_hw_is_readonly(sbi)) {
-			if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
-				err = -EROFS;
+			if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG))
 				f2fs_err(sbi, "Need to recover fsync data, but write access unavailable");
-				goto free_meta;
-			}
-			f2fs_info(sbi, "write access unavailable, skipping recovery");
+			else
+				f2fs_info(sbi, "write access unavailable, skipping recovery");
 			goto reset_checkpoint;
 		}

@ -3912,6 +4050,8 @@ free_nm:
 free_sm:
 	f2fs_destroy_segment_manager(sbi);
 	f2fs_destroy_post_read_wq(sbi);
+stop_ckpt_thread:
+	f2fs_stop_ckpt_thread(sbi);
 free_devices:
 	destroy_device_list(sbi);
 	kvfree(sbi->ckpt);
@ -4026,8 +4166,6 @@ static int __init init_f2fs_fs(void)
 		return -EINVAL;
 	}

-	f2fs_build_trace_ios();
-
 	err = init_inodecache();
 	if (err)
 		goto fail;
@ -4120,7 +4258,6 @@ static void __exit exit_f2fs_fs(void)
 	f2fs_destroy_segment_manager_caches();
 	f2fs_destroy_node_manager_caches();
 	destroy_inodecache();
-	f2fs_destroy_trace_ios();
 }

 module_init(init_f2fs_fs)
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@ -11,6 +11,7 @@
 #include <linux/f2fs_fs.h>
 #include <linux/seq_file.h>
 #include <linux/unicode.h>
+#include <linux/ioprio.h>

 #include "f2fs.h"
 #include "segment.h"
@ -34,6 +35,7 @@ enum {
 	FAULT_INFO_TYPE,	/* struct f2fs_fault_info */
 #endif
 	RESERVED_BLOCKS,	/* struct f2fs_sb_info */
+	CPRC_INFO,	/* struct ckpt_req_control */
 };

 struct f2fs_attr {
@ -70,6 +72,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
 	else if (struct_type == STAT_INFO)
 		return (unsigned char *)F2FS_STAT(sbi);
 #endif
+	else if (struct_type == CPRC_INFO)
+		return (unsigned char *)&sbi->cprc_info;
 	return NULL;
 }

@ -90,26 +94,23 @@ static ssize_t free_segments_show(struct f2fs_attr *a,
 static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
 		struct f2fs_sb_info *sbi, char *buf)
 {
-	struct super_block *sb = sbi->sb;
-
-	if (!sb->s_bdev->bd_part)
-		return sprintf(buf, "0\n");
-
 	return sprintf(buf, "%llu\n",
 			(unsigned long long)(sbi->kbytes_written +
 			((f2fs_get_sectors_written(sbi) -
 				sbi->sectors_written_start) >> 1)));
 }

+static ssize_t sb_status_show(struct f2fs_attr *a,
+		struct f2fs_sb_info *sbi, char *buf)
+{
+	return sprintf(buf, "%lx\n", sbi->s_flag);
+}
+
 static ssize_t features_show(struct f2fs_attr *a,
 		struct f2fs_sb_info *sbi, char *buf)
 {
-	struct super_block *sb = sbi->sb;
 	int len = 0;

-	if (!sb->s_bdev->bd_part)
-		return sprintf(buf, "0\n");
-
 	if (f2fs_sb_has_encrypt(sbi))
 		len += scnprintf(buf, PAGE_SIZE - len, "%s",
 						"encryption");
@ -264,6 +265,23 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
 		return len;
 	}

+	if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) {
+		struct ckpt_req_control *cprc = &sbi->cprc_info;
+		int len = 0;
+		int class = IOPRIO_PRIO_CLASS(cprc->ckpt_thread_ioprio);
+		int data = IOPRIO_PRIO_DATA(cprc->ckpt_thread_ioprio);
+
+		if (class == IOPRIO_CLASS_RT)
+			len += scnprintf(buf + len, PAGE_SIZE - len, "rt,");
+		else if (class == IOPRIO_CLASS_BE)
+			len += scnprintf(buf + len, PAGE_SIZE - len, "be,");
+		else
+			return -EINVAL;
+
+		len += scnprintf(buf + len, PAGE_SIZE - len, "%d\n", data);
+		return len;
+	}
+
 	ui = (unsigned int *)(ptr + a->offset);

 	return sprintf(buf, "%u\n", *ui);
@ -317,6 +335,38 @@ out:
 		return ret ? ret : count;
 	}

+	if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) {
+		const char *name = strim((char *)buf);
+		struct ckpt_req_control *cprc = &sbi->cprc_info;
+		int class;
+		long data;
+		int ret;
+
+		if (!strncmp(name, "rt,", 3))
+			class = IOPRIO_CLASS_RT;
+		else if (!strncmp(name, "be,", 3))
+			class = IOPRIO_CLASS_BE;
+		else
+			return -EINVAL;
+
+		name += 3;
+		ret = kstrtol(name, 10, &data);
+		if (ret)
+			return ret;
+		if (data >= IOPRIO_BE_NR || data < 0)
+			return -EINVAL;
+
+		cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, data);
+		if (test_opt(sbi, MERGE_CHECKPOINT)) {
+			ret = set_task_ioprio(cprc->f2fs_issue_ckpt,
+					cprc->ckpt_thread_ioprio);
+			if (ret)
+				return ret;
+		}
+
+		return count;
+	}
+
 	ui = (unsigned int *)(ptr + a->offset);

 	ret = kstrtoul(skip_spaces(buf), 0, &t);
@ -576,6 +626,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type);
 #endif
 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag);
 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag);
+F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio);
 F2FS_GENERAL_RO_ATTR(dirty_segments);
 F2FS_GENERAL_RO_ATTR(free_segments);
 F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
@ -661,6 +712,7 @@ static struct attribute *f2fs_attrs[] = {
 #endif
 	ATTR_LIST(data_io_flag),
 	ATTR_LIST(node_io_flag),
+	ATTR_LIST(ckpt_thread_ioprio),
 	ATTR_LIST(dirty_segments),
 	ATTR_LIST(free_segments),
 	ATTR_LIST(unusable),
@ -709,6 +761,12 @@ static struct attribute *f2fs_feat_attrs[] = {
 	NULL,
 };

+F2FS_GENERAL_RO_ATTR(sb_status);
+static struct attribute *f2fs_stat_attrs[] = {
+	ATTR_LIST(sb_status),
+	NULL,
+};
+
 static const struct sysfs_ops f2fs_attr_ops = {
 	.show	= f2fs_attr_show,
 	.store	= f2fs_attr_store,
@ -737,6 +795,44 @@ static struct kobject f2fs_feat = {
 	.kset	= &f2fs_kset,
 };

+static ssize_t f2fs_stat_attr_show(struct kobject *kobj,
+				struct attribute *attr, char *buf)
+{
+	struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
+								s_stat_kobj);
+	struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
+
+	return a->show ? a->show(a, sbi, buf) : 0;
+}
+
+static ssize_t f2fs_stat_attr_store(struct kobject *kobj, struct attribute *attr,
+						const char *buf, size_t len)
+{
+	struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
+								s_stat_kobj);
+	struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
+
+	return a->store ? a->store(a, sbi, buf, len) : 0;
+}
+
+static void f2fs_stat_kobj_release(struct kobject *kobj)
+{
+	struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
+								s_stat_kobj);
+	complete(&sbi->s_stat_kobj_unregister);
+}
+
+static const struct sysfs_ops f2fs_stat_attr_ops = {
+	.show	= f2fs_stat_attr_show,
+	.store	= f2fs_stat_attr_store,
+};
+
+static struct kobj_type f2fs_stat_ktype = {
+	.default_attrs	= f2fs_stat_attrs,
+	.sysfs_ops	= &f2fs_stat_attr_ops,
+	.release	= f2fs_stat_kobj_release,
+};
+
 static int __maybe_unused segment_info_seq_show(struct seq_file *seq,
 						void *offset)
 {
@ -943,11 +1039,15 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
 	init_completion(&sbi->s_kobj_unregister);
 	err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL,
 				"%s", sb->s_id);
-	if (err) {
-		kobject_put(&sbi->s_kobj);
-		wait_for_completion(&sbi->s_kobj_unregister);
-		return err;
-	}
+	if (err)
+		goto put_sb_kobj;
+
+	sbi->s_stat_kobj.kset = &f2fs_kset;
+	init_completion(&sbi->s_stat_kobj_unregister);
+	err = kobject_init_and_add(&sbi->s_stat_kobj, &f2fs_stat_ktype,
+						&sbi->s_kobj, "stat");
+	if (err)
+		goto put_stat_kobj;

 	if (f2fs_proc_root)
 		sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
@ -963,6 +1063,13 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
 				victim_bits_seq_show, sb);
 	}
 	return 0;
+put_stat_kobj:
+	kobject_put(&sbi->s_stat_kobj);
+	wait_for_completion(&sbi->s_stat_kobj_unregister);
+put_sb_kobj:
+	kobject_put(&sbi->s_kobj);
+	wait_for_completion(&sbi->s_kobj_unregister);
+	return err;
 }

 void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi)
@ -974,6 +1081,11 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi)
 		remove_proc_entry("victim_bits", sbi->s_proc);
 		remove_proc_entry(sbi->sb->s_id, f2fs_proc_root);
 	}
+
+	kobject_del(&sbi->s_stat_kobj);
+	kobject_put(&sbi->s_stat_kobj);
+	wait_for_completion(&sbi->s_stat_kobj_unregister);
+
 	kobject_del(&sbi->s_kobj);
 	kobject_put(&sbi->s_kobj);
 	wait_for_completion(&sbi->s_kobj_unregister);
--- a/fs/f2fs/trace.c
+++ b/fs/f2fs/trace.c
@ -1,165 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * f2fs IO tracer
- *
- * Copyright (c) 2014 Motorola Mobility
- * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
- */
-#include <linux/fs.h>
-#include <linux/f2fs_fs.h>
-#include <linux/sched.h>
-#include <linux/radix-tree.h>
-
-#include "f2fs.h"
-#include "trace.h"
-
-static RADIX_TREE(pids, GFP_ATOMIC);
-static spinlock_t pids_lock;
-static struct last_io_info last_io;
-
-static inline void __print_last_io(void)
-{
-	if (!last_io.len)
-		return;
-
-	trace_printk("%3x:%3x %4x %-16s %2x %5x %5x %12x %4x\n",
-			last_io.major, last_io.minor,
-			last_io.pid, "----------------",
-			last_io.type,
-			last_io.fio.op, last_io.fio.op_flags,
-			last_io.fio.new_blkaddr,
-			last_io.len);
-	memset(&last_io, 0, sizeof(last_io));
-}
-
-static int __file_type(struct inode *inode, pid_t pid)
-{
-	if (f2fs_is_atomic_file(inode))
-		return __ATOMIC_FILE;
-	else if (f2fs_is_volatile_file(inode))
-		return __VOLATILE_FILE;
-	else if (S_ISDIR(inode->i_mode))
-		return __DIR_FILE;
-	else if (inode->i_ino == F2FS_NODE_INO(F2FS_I_SB(inode)))
-		return __NODE_FILE;
-	else if (inode->i_ino == F2FS_META_INO(F2FS_I_SB(inode)))
-		return __META_FILE;
-	else if (pid)
-		return __NORMAL_FILE;
-	else
-		return __MISC_FILE;
-}
-
-void f2fs_trace_pid(struct page *page)
-{
-	struct inode *inode = page->mapping->host;
-	pid_t pid = task_pid_nr(current);
-	void *p;
-
-	set_page_private(page, (unsigned long)pid);
-
-retry:
-	if (radix_tree_preload(GFP_NOFS))
-		return;
-
-	spin_lock(&pids_lock);
-	p = radix_tree_lookup(&pids, pid);
-	if (p == current)
-		goto out;
-	if (p)
-		radix_tree_delete(&pids, pid);
-
-	if (radix_tree_insert(&pids, pid, current)) {
-		spin_unlock(&pids_lock);
-		radix_tree_preload_end();
-		cond_resched();
-		goto retry;
-	}
-
-	trace_printk("%3x:%3x %4x %-16s\n",
-			MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
-			pid, current->comm);
-out:
-	spin_unlock(&pids_lock);
-	radix_tree_preload_end();
-}
-
-void f2fs_trace_ios(struct f2fs_io_info *fio, int flush)
-{
-	struct inode *inode;
-	pid_t pid;
-	int major, minor;
-
-	if (flush) {
-		__print_last_io();
-		return;
-	}
-
-	inode = fio->page->mapping->host;
-	pid = page_private(fio->page);
-
-	major = MAJOR(inode->i_sb->s_dev);
-	minor = MINOR(inode->i_sb->s_dev);
-
-	if (last_io.major == major && last_io.minor == minor &&
-			last_io.pid == pid &&
-			last_io.type == __file_type(inode, pid) &&
-			last_io.fio.op == fio->op &&
-			last_io.fio.op_flags == fio->op_flags &&
-			last_io.fio.new_blkaddr + last_io.len ==
-							fio->new_blkaddr) {
-		last_io.len++;
-		return;
-	}
-
-	__print_last_io();
-
-	last_io.major = major;
-	last_io.minor = minor;
-	last_io.pid = pid;
-	last_io.type = __file_type(inode, pid);
-	last_io.fio = *fio;
-	last_io.len = 1;
-	return;
-}
-
-void f2fs_build_trace_ios(void)
-{
-	spin_lock_init(&pids_lock);
-}
-
-#define PIDVEC_SIZE	128
-static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index,
-							unsigned int max_items)
-{
-	struct radix_tree_iter iter;
-	void **slot;
-	unsigned int ret = 0;
-
-	if (unlikely(!max_items))
-		return 0;
-
-	radix_tree_for_each_slot(slot, &pids, &iter, first_index) {
-		results[ret] = iter.index;
-		if (++ret == max_items)
-			break;
-	}
-	return ret;
-}
-
-void f2fs_destroy_trace_ios(void)
-{
-	pid_t pid[PIDVEC_SIZE];
-	pid_t next_pid = 0;
-	unsigned int found;
-
-	spin_lock(&pids_lock);
-	while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) {
-		unsigned idx;
-
-		next_pid = pid[found - 1] + 1;
-		for (idx = 0; idx < found; idx++)
-			radix_tree_delete(&pids, pid[idx]);
-	}
-	spin_unlock(&pids_lock);
-}
--- a/fs/f2fs/trace.h
+++ b/fs/f2fs/trace.h
@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * f2fs IO tracer
- *
- * Copyright (c) 2014 Motorola Mobility
- * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
- */
-#ifndef __F2FS_TRACE_H__
-#define __F2FS_TRACE_H__
-
-#ifdef CONFIG_F2FS_IO_TRACE
-#include <trace/events/f2fs.h>
-
-enum file_type {
-	__NORMAL_FILE,
-	__DIR_FILE,
-	__NODE_FILE,
-	__META_FILE,
-	__ATOMIC_FILE,
-	__VOLATILE_FILE,
-	__MISC_FILE,
-};
-
-struct last_io_info {
-	int major, minor;
-	pid_t pid;
-	enum file_type type;
-	struct f2fs_io_info fio;
-	block_t len;
-};
-
-extern void f2fs_trace_pid(struct page *);
-extern void f2fs_trace_ios(struct f2fs_io_info *, int);
-extern void f2fs_build_trace_ios(void);
-extern void f2fs_destroy_trace_ios(void);
-#else
-#define f2fs_trace_pid(p)
-#define f2fs_trace_ios(i, n)
-#define f2fs_build_trace_ios()
-#define f2fs_destroy_trace_ios()
-
-#endif
-#endif /* __F2FS_TRACE_H__ */
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@ -327,7 +327,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
 	void *last_addr = NULL;
 	nid_t xnid = F2FS_I(inode)->i_xattr_nid;
 	unsigned int inline_size = inline_xattr_size(inode);
-	int err = 0;
+	int err;

 	if (!xnid && !inline_size)
 		return -ENODATA;
@ -515,7 +515,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
 		void *buffer, size_t buffer_size, struct page *ipage)
 {
 	struct f2fs_xattr_entry *entry = NULL;
-	int error = 0;
+	int error;
 	unsigned int size, len;
 	void *base_addr = NULL;
 	int base_size;
@ -562,7 +562,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
 	struct inode *inode = d_inode(dentry);
 	struct f2fs_xattr_entry *entry;
 	void *base_addr, *last_base_addr;
-	int error = 0;
+	int error;
 	size_t rest = buffer_size;

 	down_read(&F2FS_I(inode)->i_xattr_sem);
@ -632,7 +632,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
 	int found, newsize;
 	size_t len;
 	__u32 new_hsize;
-	int error = 0;
+	int error;

 	if (name == NULL)
 		return -EINVAL;
@ -673,7 +673,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
 		}

 		if (value && f2fs_xattr_value_same(here, value, size))
-			goto exit;
+			goto same;
 	} else if ((flags & XATTR_REPLACE)) {
 		error = -ENODATA;
 		goto exit;
@ -745,17 +745,20 @@ static int __f2fs_setxattr(struct inode *inode, int index,
 	if (error)
 		goto exit;

-	if (is_inode_flag_set(inode, FI_ACL_MODE)) {
-		inode->i_mode = F2FS_I(inode)->i_acl_mode;
-		inode->i_ctime = current_time(inode);
-		clear_inode_flag(inode, FI_ACL_MODE);
-	}
 	if (index == F2FS_XATTR_INDEX_ENCRYPTION &&
 			!strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT))
 		f2fs_set_encrypted_inode(inode);
 	f2fs_mark_inode_dirty_sync(inode, true);
 	if (!error && S_ISDIR(inode->i_mode))
 		set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP);
+
+same:
+	if (is_inode_flag_set(inode, FI_ACL_MODE)) {
+		inode->i_mode = F2FS_I(inode)->i_acl_mode;
+		inode->i_ctime = current_time(inode);
+		clear_inode_flag(inode, FI_ACL_MODE);
+	}
+
 exit:
 	kfree(base_addr);
 	return error;
--- a/fs/libfs.c
+++ b/fs/libfs.c
@ -1285,8 +1285,8 @@ static bool needs_casefold(const struct inode *dir)
 *
 * Return: 0 if names match, 1 if mismatch, or -ERRNO
 */
-int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
-			  const char *str, const struct qstr *name)
+static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
+				const char *str, const struct qstr *name)
 {
 	const struct dentry *parent = READ_ONCE(dentry->d_parent);
 	const struct inode *dir = READ_ONCE(parent->d_inode);
@ -1323,7 +1323,6 @@ fallback:
 		return 1;
 	return !!memcmp(str, name->name, len);
 }
-EXPORT_SYMBOL(generic_ci_d_compare);

 /**
 * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems
@ -1332,7 +1331,7 @@ EXPORT_SYMBOL(generic_ci_d_compare);
 *
 * Return: 0 if hash was successful or unchanged, and -EINVAL on error
 */
-int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
+static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
 {
 	const struct inode *dir = READ_ONCE(dentry->d_inode);
 	struct super_block *sb = dentry->d_sb;
@ -1347,7 +1346,6 @@ int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
 		return -EINVAL;
 	return 0;
 }
-EXPORT_SYMBOL(generic_ci_d_hash);

 static const struct dentry_operations generic_ci_dentry_ops = {
 	.d_hash = generic_ci_d_hash,
--- a/fs/verity/Makefile
+++ b/fs/verity/Makefile
@ -5,6 +5,7 @@ obj-$(CONFIG_FS_VERITY) += enable.o \
 			   init.o \
 			   measure.o \
 			   open.o \
+			   read_metadata.o \
 			   verify.o

 obj-$(CONFIG_FS_VERITY_BUILTIN_SIGNATURES) += signature.o
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@ -122,12 +122,17 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
 				     const u8 *salt, size_t salt_size);

 struct fsverity_info *fsverity_create_info(const struct inode *inode,
-					   void *desc, size_t desc_size);
+					   struct fsverity_descriptor *desc,
+					   size_t desc_size);

 void fsverity_set_info(struct inode *inode, struct fsverity_info *vi);

 void fsverity_free_info(struct fsverity_info *vi);

+int fsverity_get_descriptor(struct inode *inode,
+			    struct fsverity_descriptor **desc_ret,
+			    size_t *desc_size_ret);
+
 int __init fsverity_init_info_cache(void);
 void __init fsverity_exit_info_cache(void);

@ -135,15 +140,13 @@ void __init fsverity_exit_info_cache(void);

 #ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES
 int fsverity_verify_signature(const struct fsverity_info *vi,
-			      const struct fsverity_descriptor *desc,
-			      size_t desc_size);
+			      const u8 *signature, size_t sig_size);

 int __init fsverity_init_signature(void);
 #else /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */
 static inline int
 fsverity_verify_signature(const struct fsverity_info *vi,
-			  const struct fsverity_descriptor *desc,
-			  size_t desc_size)
+			  const u8 *signature, size_t sig_size)
 {
 	return 0;
 }
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@ -142,45 +142,17 @@ static int compute_file_digest(struct fsverity_hash_alg *hash_alg,
 }

 /*
- * Validate the given fsverity_descriptor and create a new fsverity_info from
- * it.  The signature (if present) is also checked.
+ * Create a new fsverity_info from the given fsverity_descriptor (with optional
+ * appended signature), and check the signature if present.  The
+ * fsverity_descriptor must have already undergone basic validation.
 */
 struct fsverity_info *fsverity_create_info(const struct inode *inode,
-					   void *_desc, size_t desc_size)
+					   struct fsverity_descriptor *desc,
+					   size_t desc_size)
 {
-	struct fsverity_descriptor *desc = _desc;
 	struct fsverity_info *vi;
 	int err;

-	if (desc_size < sizeof(*desc)) {
-		fsverity_err(inode, "Unrecognized descriptor size: %zu bytes",
-			     desc_size);
-		return ERR_PTR(-EINVAL);
-	}
-
-	if (desc->version != 1) {
-		fsverity_err(inode, "Unrecognized descriptor version: %u",
-			     desc->version);
-		return ERR_PTR(-EINVAL);
-	}
-
-	if (memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) {
-		fsverity_err(inode, "Reserved bits set in descriptor");
-		return ERR_PTR(-EINVAL);
-	}
-
-	if (desc->salt_size > sizeof(desc->salt)) {
-		fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size);
-		return ERR_PTR(-EINVAL);
-	}
-
-	if (le64_to_cpu(desc->data_size) != inode->i_size) {
-		fsverity_err(inode,
-			     "Wrong data_size: %llu (desc) != %lld (inode)",
-			     le64_to_cpu(desc->data_size), inode->i_size);
-		return ERR_PTR(-EINVAL);
-	}
-
 	vi = kmem_cache_zalloc(fsverity_info_cachep, GFP_KERNEL);
 	if (!vi)
 		return ERR_PTR(-ENOMEM);
@ -209,7 +181,8 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode,
 		 vi->tree_params.hash_alg->name,
 		 vi->tree_params.digest_size, vi->file_digest);

-	err = fsverity_verify_signature(vi, desc, desc_size);
+	err = fsverity_verify_signature(vi, desc->signature,
+					le32_to_cpu(desc->sig_size));
 out:
 	if (err) {
 		fsverity_free_info(vi);
@ -221,11 +194,20 @@ out:
 void fsverity_set_info(struct inode *inode, struct fsverity_info *vi)
 {
 	/*
-	 * Multiple processes may race to set ->i_verity_info, so use cmpxchg.
-	 * This pairs with the READ_ONCE() in fsverity_get_info().
+	 * Multiple tasks may race to set ->i_verity_info, so use
+	 * cmpxchg_release().  This pairs with the smp_load_acquire() in
+	 * fsverity_get_info().  I.e., here we publish ->i_verity_info with a
+	 * RELEASE barrier so that other tasks can ACQUIRE it.
 	 */
-	if (cmpxchg(&inode->i_verity_info, NULL, vi) != NULL)
+	if (cmpxchg_release(&inode->i_verity_info, NULL, vi) != NULL) {
+		/* Lost the race, so free the fsverity_info we allocated. */
 		fsverity_free_info(vi);
+		/*
+		 * Afterwards, the caller may access ->i_verity_info directly,
+		 * so make sure to ACQUIRE the winning fsverity_info.
+		 */
+		(void)fsverity_get_info(inode);
+	}
 }

 void fsverity_free_info(struct fsverity_info *vi)
@ -236,15 +218,57 @@ void fsverity_free_info(struct fsverity_info *vi)
 	kmem_cache_free(fsverity_info_cachep, vi);
 }

-/* Ensure the inode has an ->i_verity_info */
-static int ensure_verity_info(struct inode *inode)
+static bool validate_fsverity_descriptor(struct inode *inode,
+					 const struct fsverity_descriptor *desc,
+					 size_t desc_size)
 {
-	struct fsverity_info *vi = fsverity_get_info(inode);
-	struct fsverity_descriptor *desc;
-	int res;
+	if (desc_size < sizeof(*desc)) {
+		fsverity_err(inode, "Unrecognized descriptor size: %zu bytes",
+			     desc_size);
+		return false;
+	}

-	if (vi)
-		return 0;
+	if (desc->version != 1) {
+		fsverity_err(inode, "Unrecognized descriptor version: %u",
+			     desc->version);
+		return false;
+	}
+
+	if (memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) {
+		fsverity_err(inode, "Reserved bits set in descriptor");
+		return false;
+	}
+
+	if (desc->salt_size > sizeof(desc->salt)) {
+		fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size);
+		return false;
+	}
+
+	if (le64_to_cpu(desc->data_size) != inode->i_size) {
+		fsverity_err(inode,
+			     "Wrong data_size: %llu (desc) != %lld (inode)",
+			     le64_to_cpu(desc->data_size), inode->i_size);
+		return false;
+	}
+
+	if (le32_to_cpu(desc->sig_size) > desc_size - sizeof(*desc)) {
+		fsverity_err(inode, "Signature overflows verity descriptor");
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Read the inode's fsverity_descriptor (with optional appended signature) from
+ * the filesystem, and do basic validation of it.
+ */
+int fsverity_get_descriptor(struct inode *inode,
+			    struct fsverity_descriptor **desc_ret,
+			    size_t *desc_size_ret)
+{
+	int res;
+	struct fsverity_descriptor *desc;

 	res = inode->i_sb->s_vop->get_verity_descriptor(inode, NULL, 0);
 	if (res < 0) {
@ -263,20 +287,46 @@ static int ensure_verity_info(struct inode *inode)
 	res = inode->i_sb->s_vop->get_verity_descriptor(inode, desc, res);
 	if (res < 0) {
 		fsverity_err(inode, "Error %d reading verity descriptor", res);
-		goto out_free_desc;
+		kfree(desc);
+		return res;
 	}

-	vi = fsverity_create_info(inode, desc, res);
+	if (!validate_fsverity_descriptor(inode, desc, res)) {
+		kfree(desc);
+		return -EINVAL;
+	}
+
+	*desc_ret = desc;
+	*desc_size_ret = res;
+	return 0;
+}
+
+/* Ensure the inode has an ->i_verity_info */
+static int ensure_verity_info(struct inode *inode)
+{
+	struct fsverity_info *vi = fsverity_get_info(inode);
+	struct fsverity_descriptor *desc;
+	size_t desc_size;
+	int err;
+
+	if (vi)
+		return 0;
+
+	err = fsverity_get_descriptor(inode, &desc, &desc_size);
+	if (err)
+		return err;
+
+	vi = fsverity_create_info(inode, desc, desc_size);
 	if (IS_ERR(vi)) {
-		res = PTR_ERR(vi);
+		err = PTR_ERR(vi);
 		goto out_free_desc;
 	}

 	fsverity_set_info(inode, vi);
-	res = 0;
+	err = 0;
 out_free_desc:
 	kfree(desc);
-	return res;
+	return err;
 }

 /**
--- a/fs/verity/read_metadata.c
+++ b/fs/verity/read_metadata.c
@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Ioctl to read verity metadata
+ *
+ * Copyright 2021 Google LLC
+ */
+
+#include "fsverity_private.h"
+
+#include <linux/backing-dev.h>
+#include <linux/highmem.h>
+#include <linux/sched/signal.h>
+#include <linux/uaccess.h>
+
+static int fsverity_read_merkle_tree(struct inode *inode,
+				     const struct fsverity_info *vi,
+				     void __user *buf, u64 offset, int length)
+{
+	const struct fsverity_operations *vops = inode->i_sb->s_vop;
+	u64 end_offset;
+	unsigned int offs_in_page;
+	pgoff_t index, last_index;
+	int retval = 0;
+	int err = 0;
+
+	end_offset = min(offset + length, vi->tree_params.tree_size);
+	if (offset >= end_offset)
+		return 0;
+	offs_in_page = offset_in_page(offset);
+	last_index = (end_offset - 1) >> PAGE_SHIFT;
+
+	/*
+	 * Iterate through each Merkle tree page in the requested range and copy
+	 * the requested portion to userspace.  Note that the Merkle tree block
+	 * size isn't important here, as we are returning a byte stream; i.e.,
+	 * we can just work with pages even if the tree block size != PAGE_SIZE.
+	 */
+	for (index = offset >> PAGE_SHIFT; index <= last_index; index++) {
+		unsigned long num_ra_pages =
+			min_t(unsigned long, last_index - index + 1,
+			      inode->i_sb->s_bdi->io_pages);
+		unsigned int bytes_to_copy = min_t(u64, end_offset - offset,
+						   PAGE_SIZE - offs_in_page);
+		struct page *page;
+		const void *virt;
+
+		page = vops->read_merkle_tree_page(inode, index, num_ra_pages);
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			fsverity_err(inode,
+				     "Error %d reading Merkle tree page %lu",
+				     err, index);
+			break;
+		}
+
+		virt = kmap(page);
+		if (copy_to_user(buf, virt + offs_in_page, bytes_to_copy)) {
+			kunmap(page);
+			put_page(page);
+			err = -EFAULT;
+			break;
+		}
+		kunmap(page);
+		put_page(page);
+
+		retval += bytes_to_copy;
+		buf += bytes_to_copy;
+		offset += bytes_to_copy;
+
+		if (fatal_signal_pending(current))  {
+			err = -EINTR;
+			break;
+		}
+		cond_resched();
+		offs_in_page = 0;
+	}
+	return retval ? retval : err;
+}
+
+/* Copy the requested portion of the buffer to userspace. */
+static int fsverity_read_buffer(void __user *dst, u64 offset, int length,
+				const void *src, size_t src_length)
+{
+	if (offset >= src_length)
+		return 0;
+	src += offset;
+	src_length -= offset;
+
+	length = min_t(size_t, length, src_length);
+
+	if (copy_to_user(dst, src, length))
+		return -EFAULT;
+
+	return length;
+}
+
+static int fsverity_read_descriptor(struct inode *inode,
+				    void __user *buf, u64 offset, int length)
+{
+	struct fsverity_descriptor *desc;
+	size_t desc_size;
+	int res;
+
+	res = fsverity_get_descriptor(inode, &desc, &desc_size);
+	if (res)
+		return res;
+
+	/* don't include the signature */
+	desc_size = offsetof(struct fsverity_descriptor, signature);
+	desc->sig_size = 0;
+
+	res = fsverity_read_buffer(buf, offset, length, desc, desc_size);
+
+	kfree(desc);
+	return res;
+}
+
+static int fsverity_read_signature(struct inode *inode,
+				   void __user *buf, u64 offset, int length)
+{
+	struct fsverity_descriptor *desc;
+	size_t desc_size;
+	int res;
+
+	res = fsverity_get_descriptor(inode, &desc, &desc_size);
+	if (res)
+		return res;
+
+	if (desc->sig_size == 0) {
+		res = -ENODATA;
+		goto out;
+	}
+
+	/*
+	 * Include only the signature.  Note that fsverity_get_descriptor()
+	 * already verified that sig_size is in-bounds.
+	 */
+	res = fsverity_read_buffer(buf, offset, length, desc->signature,
+				   le32_to_cpu(desc->sig_size));
+out:
+	kfree(desc);
+	return res;
+}
+
+/**
+ * fsverity_ioctl_read_metadata() - read verity metadata from a file
+ * @filp: file to read the metadata from
+ * @uarg: user pointer to fsverity_read_metadata_arg
+ *
+ * Return: length read on success, 0 on EOF, -errno on failure
+ */
+int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg)
+{
+	struct inode *inode = file_inode(filp);
+	const struct fsverity_info *vi;
+	struct fsverity_read_metadata_arg arg;
+	int length;
+	void __user *buf;
+
+	vi = fsverity_get_info(inode);
+	if (!vi)
+		return -ENODATA; /* not a verity file */
+	/*
+	 * Note that we don't have to explicitly check that the file is open for
+	 * reading, since verity files can only be opened for reading.
+	 */
+
+	if (copy_from_user(&arg, uarg, sizeof(arg)))
+		return -EFAULT;
+
+	if (arg.__reserved)
+		return -EINVAL;
+
+	/* offset + length must not overflow. */
+	if (arg.offset + arg.length < arg.offset)
+		return -EINVAL;
+
+	/* Ensure that the return value will fit in INT_MAX. */
+	length = min_t(u64, arg.length, INT_MAX);
+
+	buf = u64_to_user_ptr(arg.buf_ptr);
+
+	switch (arg.metadata_type) {
+	case FS_VERITY_METADATA_TYPE_MERKLE_TREE:
+		return fsverity_read_merkle_tree(inode, vi, buf, arg.offset,
+						 length);
+	case FS_VERITY_METADATA_TYPE_DESCRIPTOR:
+		return fsverity_read_descriptor(inode, buf, arg.offset, length);
+	case FS_VERITY_METADATA_TYPE_SIGNATURE:
+		return fsverity_read_signature(inode, buf, arg.offset, length);
+	default:
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL_GPL(fsverity_ioctl_read_metadata);
--- a/fs/verity/signature.c
+++ b/fs/verity/signature.c
@ -26,6 +26,27 @@ static int fsverity_require_signatures;
 */
 static struct key *fsverity_keyring;

+/**
+ * fsverity_verify_signature() - check a verity file's signature
+ * @vi: the file's fsverity_info
+ * @signature: the file's built-in signature
+ * @sig_size: size of signature in bytes, or 0 if no signature
+ *
+ * If the file includes a signature of its fs-verity file digest, verify it
+ * against the certificates in the fs-verity keyring.
+ *
+ * Return: 0 on success (signature valid or not required); -errno on failure
+ */
+int fsverity_verify_signature(const struct fsverity_info *vi,
+			      const u8 *signature, size_t sig_size)
+{
+	unsigned int digest_algorithm =
+		vi->tree_params.hash_alg - fsverity_hash_algs;
+
+	return __fsverity_verify_signature(vi->inode, signature, sig_size,
+					   vi->file_digest, digest_algorithm);
+}
+
 /**
 * __fsverity_verify_signature() - check a verity file's signature
 * @inode: the file's inode
@ -69,8 +90,7 @@ int __fsverity_verify_signature(const struct inode *inode, const u8 *signature,
 	memcpy(d->digest, file_digest, hash_alg->digest_size);

 	err = verify_pkcs7_signature(d, sizeof(*d) + hash_alg->digest_size,
-				     signature, sig_size,
-				     fsverity_keyring,
+				     signature, sig_size, fsverity_keyring,
 				     VERIFYING_UNSPECIFIED_SIGNATURE,
 				     NULL, NULL);
 	kfree(d);
@ -95,34 +115,6 @@ int __fsverity_verify_signature(const struct inode *inode, const u8 *signature,
 }
 EXPORT_SYMBOL_GPL(__fsverity_verify_signature);

-/**
- * fsverity_verify_signature() - check a verity file's signature
- * @vi: the file's fsverity_info
- * @desc: the file's fsverity_descriptor
- * @desc_size: size of @desc
- *
- * If the file's fs-verity descriptor includes a signature of the file digest,
- * verify it against the certificates in the fs-verity keyring.
- *
- * Return: 0 on success (signature valid or not required); -errno on failure
- */
-int fsverity_verify_signature(const struct fsverity_info *vi,
-			      const struct fsverity_descriptor *desc,
-			      size_t desc_size)
-{
-	const struct inode *inode = vi->inode;
-	const struct fsverity_hash_alg *hash_alg = vi->tree_params.hash_alg;
-	const u32 sig_size = le32_to_cpu(desc->sig_size);
-
-	if (sig_size > desc_size - sizeof(*desc)) {
-		fsverity_err(inode, "Signature overflows verity descriptor");
-		return -EBADMSG;
-	}
-
-	return __fsverity_verify_signature(inode, desc->signature, sig_size,
-				vi->file_digest, hash_alg - fsverity_hash_algs);
-}
-
 #ifdef CONFIG_SYSCTL
 static struct ctl_table_header *fsverity_sysctl_header;

--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@ -274,6 +274,9 @@ struct f2fs_inode {
 			__u8 i_compress_algorithm;	/* compress algorithm */
 			__u8 i_log_cluster_size;	/* log of cluster size */
 			__le16 i_compress_flag;		/* compress flag */
+						/* 0 bit: chksum flag
+						 * [10,15] bits: compress level
+						 */
 			__le32 i_extra_end[0];	/* for attribute size calculation */
 		} __packed;
 		__le32 i_addr[DEF_ADDRS_PER_INODE];	/* Pointers to data blocks */
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@ -3348,11 +3348,6 @@ extern int generic_file_fsync(struct file *, loff_t, loff_t, int);

 extern int generic_check_addressable(unsigned, u64);

-#ifdef CONFIG_UNICODE
-extern int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str);
-extern int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
-				const char *str, const struct qstr *name);
-#endif
 extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry);

 #ifdef CONFIG_MIGRATION
--- a/include/linux/fsverity.h
+++ b/include/linux/fsverity.h
@ -115,8 +115,13 @@ struct fsverity_operations {

 static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
 {
-	/* pairs with the cmpxchg() in fsverity_set_info() */
-	return READ_ONCE(inode->i_verity_info);
+	/*
+	 * Pairs with the cmpxchg_release() in fsverity_set_info().
+	 * I.e., another task may publish ->i_verity_info concurrently,
+	 * executing a RELEASE barrier.  We need to use smp_load_acquire() here
+	 * to safely ACQUIRE the memory the other task published.
+	 */
+	return smp_load_acquire(&inode->i_verity_info);
 }

 /* enable.c */
@ -133,6 +138,10 @@ int fsverity_file_open(struct inode *inode, struct file *filp);
 int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr);
 void fsverity_cleanup_inode(struct inode *inode);

+/* read_metadata.c */
+
+int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg);
+
 /* verify.c */

 bool fsverity_verify_page(struct page *page);
@ -178,6 +187,14 @@ static inline void fsverity_cleanup_inode(struct inode *inode)
 {
 }

+/* read_metadata.c */
+
+static inline int fsverity_ioctl_read_metadata(struct file *filp,
+					       const void __user *uarg)
+{
+	return -EOPNOTSUPP;
+}
+
 /* verify.c */

 static inline bool fsverity_verify_page(struct page *page)
--- a/include/uapi/linux/fsverity.h
+++ b/include/uapi/linux/fsverity.h
@ -83,7 +83,21 @@ struct fsverity_formatted_digest {
 	__u8 digest[];
 };

+#define FS_VERITY_METADATA_TYPE_MERKLE_TREE	1
+#define FS_VERITY_METADATA_TYPE_DESCRIPTOR	2
+#define FS_VERITY_METADATA_TYPE_SIGNATURE	3
+
+struct fsverity_read_metadata_arg {
+	__u64 metadata_type;
+	__u64 offset;
+	__u64 length;
+	__u64 buf_ptr;
+	__u64 __reserved;
+};
+
 #define FS_IOC_ENABLE_VERITY	_IOW('f', 133, struct fsverity_enable_arg)
 #define FS_IOC_MEASURE_VERITY	_IOWR('f', 134, struct fsverity_digest)
+#define FS_IOC_READ_VERITY_METADATA \
+	_IOWR('f', 135, struct fsverity_read_metadata_arg)

 #endif /* _UAPI_LINUX_FSVERITY_H */