android_kernel_xiaomi_sm7250/fs
Robert Ho 855af072b6 mm, proc: fix region lost in /proc/self/smaps
Recently, Red Hat reported that the nvml test suite failed on QEMU/KVM;
for more details please refer to:

   https://bugzilla.redhat.com/show_bug.cgi?id=1365721

Actually, this bug is not specific to NVDIMM/DAX; it affects any other
file system as well.  This simple test case, abstracted from nvml, can
easily reproduce the bug in a common environment:

-------------------------- testcase.c -----------------------------

/*
 * is_pmem_proc -- scan /proc/self/smaps and report whether the whole of
 * [addr, addr + len) lies in ranges whose "VmFlags:" line contains " wr"
 *
 * Returns 1 when every byte of the request was matched and each matching
 * range carried the flag; returns 0 otherwise, including when
 * /proc/self/smaps cannot be opened.  Progress is logged with printf().
 */
int
is_pmem_proc(const void *addr, size_t len)
{
        const char *caddr = addr;       /* first byte still to be matched */

        FILE *fp;
        if ((fp = fopen("/proc/self/smaps", "r")) == NULL) {
                printf("!/proc/self/smaps");
                return 0;
        }

        int retval = 0;         /* assume false until proven otherwise */
        char line[PROCMAXLEN];  /* for fgets() */
        char *lo = NULL;        /* beginning of current range in smaps file */
        char *hi = NULL;        /* end of current range in smaps file */
        int needmm = 0;         /* looking for mm flag for current range */
        while (fgets(line, PROCMAXLEN, fp) != NULL) {
                static const char vmflags[] = "VmFlags:";
                /* token searched for in the VmFlags line (presumably the
                 * "writable" flag -- confirm against the smaps format) */
                static const char mm[] = " wr";

                /* check for range line: only "lo-hi ..." lines parse as two pointers */
                if (sscanf(line, "%p-%p", &lo, &hi) == 2) {
                        if (needmm) {
                                /* last range matched, but no mm flag found */
                                printf("never found mm flag.\n");
                                break;
                        } else if (caddr < lo) {
                                /* never found the range for caddr */
                                printf("#######no match for addr %p.\n", caddr);
                                break;
                        } else if (caddr < hi) {
                                /* start address is in this range */
                                size_t rangelen = (size_t)(hi - caddr);

                                /* remember that matching has started */
                                needmm = 1;

                                /* calculate remaining range to search for */
                                if (len > rangelen) {
                                        /* request continues past this range:
                                         * advance to the first unmatched byte */
                                        len -= rangelen;
                                        caddr += rangelen;
                                        printf("matched %zu bytes in range "
                                                "%p-%p, %zu left over.\n",
                                                        rangelen, lo, hi, len);
                                } else {
                                        len = 0;
                                        printf("matched all bytes in range "
                                                        "%p-%p.\n", lo, hi);
                                }
                        }
                        /* otherwise caddr >= hi: this range ends at or below
                         * caddr, so keep scanning */
                } else if (needmm && strncmp(line, vmflags,
                                        sizeof(vmflags) - 1) == 0) {
                        /* VmFlags line of a range that overlapped the request */
                        if (strstr(&line[sizeof(vmflags) - 1], mm) != NULL) {
                                printf("mm flag found.\n");
                                if (len == 0) {
                                        /* entire range matched */
                                        retval = 1;
                                        break;
                                }
                                needmm = 0;     /* saw what was needed */
                        } else {
                                /* mm flag not set for some or all of range */
                                printf("range has no mm flag.\n");
                                break;
                        }
                }
        }

        fclose(fp);

        printf("returning %d.\n", retval);
        return retval;
}

/* start of the mapping created by main(); handed to is_pmem_proc() by worker() */
void *Addr;
/* length of that mapping, taken from the mapped file's st_size in main() */
size_t Size;

/*
 * worker -- thread entry point: runs is_pmem_proc() on the shared
 * Addr/Size mapping and stores the verdict in the int that arg points to
 */
static void *
worker(void *arg)
{
        int *result = arg;

        *result = is_pmem_proc(Addr, Size);

        return NULL;
}

/*
 * main -- map the file named by argv[1] privately, then let NTHREAD threads
 * concurrently look the mapping up through is_pmem_proc()
 *
 * Prints the result of thread 0 and reports every thread whose result
 * differs from it.  Returns 0 once all threads have run, -1 on a usage
 * error or when the file cannot be opened, stat'ed or mapped, or when a
 * thread cannot be created.
 */
int main(int argc, char *argv[])
{
        if (argc <  2 || argc > 3) {
                printf("usage: %s file [env].\n", argv[0]);
                return -1;
        }

        int fd = open(argv[1], O_RDWR);
        if (fd < 0) {
                perror("open");
                return -1;
        }

        struct stat stbuf;
        if (fstat(fd, &stbuf) < 0) {
                perror("fstat");
                close(fd);
                return -1;
        }

        Size = stbuf.st_size;
        Addr = mmap(0, stbuf.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
        if (Addr == MAP_FAILED) {
                /* report before close() gets a chance to overwrite errno */
                perror("mmap");
                close(fd);
                return -1;
        }

        /* the descriptor is no longer needed once the mapping exists */
        close(fd);

        pthread_t threads[NTHREAD];
        int ret[NTHREAD];
        int nstarted = 0;       /* threads actually created, i.e. joinable */

        /* kick off NTHREAD threads */
        for (int i = 0; i < NTHREAD; i++) {
                /* pthread_create() returns the error number, not -1/errno */
                int err = pthread_create(&threads[i], NULL, worker, &ret[i]);
                if (err != 0) {
                        printf("pthread_create failed: %d.\n", err);
                        break;
                }
                nstarted++;
        }

        /* wait for all the threads that were started to complete */
        for (int i = 0; i < nstarted; i++)
                pthread_join(threads[i], NULL);

        /* ret[] is only fully populated when every thread ran */
        if (nstarted != NTHREAD)
                return -1;

        /* verify that all the threads return the same value */
        for (int i = 1; i < NTHREAD; i++) {
                if (ret[0] != ret[i]) {
                        printf("Error i %d ret[0] = %d ret[i] = %d.\n", i,
                                ret[0], ret[i]);
                }
        }

        printf("%d", ret[0]);
        return 0;
}

It failed because some threads cannot find the memory region in
"/proc/self/smaps" which was allocated in the main process.

It is caused by proc fs, which uses 'file->version' to record the last VMA
that has already been handled by the read() system call.  When the next
read() is issued, it uses the 'version' to find that VMA, and the VMA after
it is the one we want to handle.  The related code is as follows:

        if (last_addr) {
                vma = find_vma(mm, last_addr);
                if (vma && (vma = m_next_vma(priv, vma)))
                        return vma;
        }

However, a VMA will be lost if the last VMA is gone, e.g.:

The process VMA list is A->B->C->D

CPU 0                                  CPU 1
read() system call
   handle VMA B
   version = B
return to userspace

                                   unmap VMA B

issue read() again to continue to get
the region info
   find_vma(version) will get VMA C
   m_next_vma(C) will get VMA D
   handle D
   !!! VMA C is lost !!!

In order to fix this bug, we make 'file->version' indicate the end address
of the current VMA.  m_start will then look up a vma with vma_start
< last_vm_end and move on to the next vma if we found the same or an
overlapping vma.  This guarantees that we will not miss an exclusive
vma, but we can still miss one if the previous vma was shrunk.  This is
acceptable because guaranteeing "never miss a vma" is simply not feasible.
The user has to cope with some inconsistencies if the file is not read in
one go.

[mhocko@suse.com: changelog fixes]
Link: http://lkml.kernel.org/r/1475296958-27652-1-git-send-email-robert.hu@intel.com
Acked-by: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Signed-off-by: Robert Hu <robert.hu@intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Gleb Natapov <gleb@kernel.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Stefan Hajnoczi <stefanha@redhat.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-10-07 18:46:30 -07:00
..
9p
adfs
affs
afs rxrpc: Rewrite the data and ack handling code 2016-09-08 11:10:12 +01:00
autofs4 Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace 2016-10-06 09:52:23 -07:00
befs
bfs
btrfs Merge branch 'for-linus-4.8' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs 2016-09-23 13:39:37 -07:00
cachefiles
ceph ceph: do not modify fi->frag in need_reset_readdir() 2016-09-05 14:30:35 +02:00
cifs Move check for prefix path to within cifs_get_root() 2016-09-09 23:58:07 -05:00
coda
configfs configfs: Return -EFBIG from configfs_write_bin_file. 2016-09-16 12:58:28 +02:00
cramfs
crypto fscrypto: require write access to mount to set encryption policy 2016-09-10 01:18:57 -04:00
debugfs debugfs: propagate release() call result 2016-09-27 12:45:57 +02:00
devpts devpts: Change the owner of /dev/pts/ptmx to the mounter of /dev/pts 2016-09-23 11:31:31 +02:00
dlm
ecryptfs
efivarfs fs/efivarfs: Fix double kfree() in error path 2016-09-09 16:08:48 +01:00
efs
exofs
exportfs
ext2 ext2/4, xfs: call thp_get_unmapped_area() for pmd mappings 2016-10-07 18:46:28 -07:00
ext4 ext2/4, xfs: call thp_get_unmapped_area() for pmd mappings 2016-10-07 18:46:28 -07:00
f2fs In this round, we've investigated how f2fs deals with errors given by our fault 2016-10-06 15:30:40 -07:00
fat
freevxfs
fscache
fuse fuse: limit xattr returned size 2016-10-03 11:06:05 +02:00
gfs2 We've only got six GFS2 patches for this merge window. In patch order: 2016-10-04 13:42:13 -07:00
hfs
hfsplus
hostfs
hpfs
hugetlbfs mm: remove unnecessary condition in remove_inode_hugepages 2016-10-07 18:46:29 -07:00
isofs
jbd2
jffs2
jfs jfs: Simplify code 2016-09-06 12:17:24 -05:00
kernfs kernfs: don't depend on d_find_any_alias() when generating notifications 2016-08-31 14:48:52 +02:00
lockd
logfs
minix
ncpfs
nfs mm: remove page_file_index 2016-10-07 18:46:28 -07:00
nfs_common
nfsd
nilfs2
nls
notify fsnotify: clean up spinlock assertions 2016-10-07 18:46:26 -07:00
ntfs
ocfs2 ocfs2: fix undefined struct variable in inode.h 2016-10-07 18:46:26 -07:00
omfs
openpromfs
orangefs Revert "orangefs: bump minimum userspace version" 2016-10-03 15:07:36 -04:00
overlayfs Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security 2016-10-04 14:48:27 -07:00
proc mm, proc: fix region lost in /proc/self/smaps 2016-10-07 18:46:30 -07:00
pstore ramoops: move spin_lock_init after kmalloc error checking 2016-09-08 15:01:13 -07:00
qnx4
qnx6
quota
ramfs ipc/shm: fix crash if CONFIG_SHMEM is not set 2016-09-19 15:36:17 -07:00
reiserfs reiserfs: Unlock superblock before calling reiserfs_quota_on_mount() 2016-09-16 17:20:59 +02:00
romfs
squashfs
sysfs sysfs print name of undiscoverable attribute group 2016-09-27 12:24:29 +02:00
sysv
tracefs
ubifs
udf udf: don't bother with full-page write optimisations in adinicb case 2016-09-19 10:47:01 +02:00
ufs
xfs ext2/4, xfs: call thp_get_unmapped_area() for pmd mappings 2016-10-07 18:46:28 -07:00
aio.c aio: mark AIO pseudo-fs noexec 2016-09-15 15:49:28 -07:00
anon_inodes.c
attr.c
bad_inode.c
binfmt_aout.c
binfmt_elf_fdpic.c
binfmt_elf.c x86/coredump: Use pr_reg size, rather that TIF_IA32 flag 2016-09-14 21:28:10 +02:00
binfmt_em86.c
binfmt_flat.c
binfmt_misc.c
binfmt_script.c
block_dev.c
buffer.c
char_dev.c
compat_binfmt_elf.c
compat_ioctl.c
compat.c
coredump.c
dax.c thp: reduce usage of huge zero page's atomic counter 2016-10-07 18:46:28 -07:00
dcache.c
dcookies.c
direct-io.c
drop_caches.c
eventfd.c
eventpoll.c
exec.c
fcntl.c
fhandle.c
file_table.c
file.c
filesystems.c
fs_pin.c
fs_struct.c
fs-writeback.c
inode.c
internal.h iomap: expose iomap_apply outside iomap.c 2016-09-19 11:24:49 +10:00
ioctl.c vfs: cap dedupe request structure size at PAGE_SIZE 2016-09-15 13:29:52 -07:00
iomap.c Merge branch 'iomap-4.9-dax' into for-next 2016-10-03 09:53:59 +11:00
Kconfig mm/hugetlb: introduce ARCH_HAS_GIGANTIC_PAGE 2016-10-07 18:46:29 -07:00
Kconfig.binfmt
libfs.c
locks.c File locking related changes for v4.9 2016-10-04 13:36:19 -07:00
Makefile
mbcache.c
mount.h mnt: Add a per mount namespace limit on the number of mounts 2016-09-30 12:46:48 -05:00
mpage.c
namei.c
namespace.c mnt: Add a per mount namespace limit on the number of mounts 2016-09-30 12:46:48 -05:00
no-block.c
nsfs.c nsfs: Simplify __ns_get_path 2016-09-22 20:06:20 -05:00
open.c
pipe.c
pnode.c mnt: Add a per mount namespace limit on the number of mounts 2016-09-30 12:46:48 -05:00
pnode.h mnt: Add a per mount namespace limit on the number of mounts 2016-09-30 12:46:48 -05:00
posix_acl.c
proc_namespace.c
read_write.c
readdir.c
select.c
seq_file.c seq/proc: modify seq_put_decimal_[u]ll to take a const char *, not char 2016-10-07 18:46:30 -07:00
signalfd.c
splice.c
stack.c
stat.c
statfs.c
super.c
sync.c
timerfd.c
userfaultfd.c
utimes.c
xattr.c