2018-04-03 19:23:33 +02:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2012-06-28 18:03:02 +02:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2011 STRATO. All rights reserved.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/sched.h>
|
|
|
|
#include <linux/pagemap.h>
|
|
|
|
#include <linux/writeback.h>
|
|
|
|
#include <linux/blkdev.h>
|
|
|
|
#include <linux/rbtree.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/workqueue.h>
|
2013-01-29 07:04:50 +01:00
|
|
|
#include <linux/btrfs.h>
|
2017-12-22 09:06:39 +01:00
|
|
|
#include <linux/sizes.h>
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
#include "ctree.h"
|
|
|
|
#include "transaction.h"
|
|
|
|
#include "disk-io.h"
|
|
|
|
#include "locking.h"
|
|
|
|
#include "ulist.h"
|
|
|
|
#include "backref.h"
|
2013-04-25 18:04:51 +02:00
|
|
|
#include "extent_io.h"
|
2014-05-14 02:30:47 +02:00
|
|
|
#include "qgroup.h"
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2015-04-17 04:23:16 +02:00
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
/* TODO XXX FIXME
|
|
|
|
* - subvol delete -> delete when ref goes to 0? delete limits also?
|
|
|
|
* - reorganize keys
|
|
|
|
* - compressed
|
|
|
|
* - sync
|
|
|
|
* - copy also limits on subvol creation
|
|
|
|
* - limit
|
|
|
|
* - caches for ulists
|
|
|
|
* - performance benchmarks
|
|
|
|
* - check all ioctl parameters
|
|
|
|
*/
|
|
|
|
|
2017-12-12 08:34:24 +01:00
|
|
|
/*
|
|
|
|
* Helpers to access qgroup reservation
|
|
|
|
*
|
|
|
|
* Callers should ensure the lock context and type are valid
|
|
|
|
*/
|
|
|
|
|
|
|
|
static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
|
|
|
|
{
|
|
|
|
u64 ret = 0;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
|
|
|
|
ret += qgroup->rsv.values[i];
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_BTRFS_DEBUG
/*
 * Translate a reservation type into the short name used in debug output.
 *
 * Metadata reservations are split in two: "meta_pertrans" for space reserved
 * at btrfs_start_transaction() time and "meta_prealloc" for space reserved
 * through a block_rsv before btrfs_join_transaction().
 *
 * Returns NULL for any type that has no name here.
 */
static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
{
	switch (type) {
	case BTRFS_QGROUP_RSV_DATA:
		return "data";
	case BTRFS_QGROUP_RSV_META_PERTRANS:
		return "meta_pertrans";
	case BTRFS_QGROUP_RSV_META_PREALLOC:
		return "meta_prealloc";
	default:
		return NULL;
	}
}
#endif
|
|
|
|
|
2017-12-12 08:34:27 +01:00
|
|
|
/*
 * Account @num_bytes of newly reserved space of kind @type to @qgroup.
 * The change is reported through the qgroup_update_reserve tracepoint before
 * the counter is touched.
 */
static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
			   struct btrfs_qgroup *qgroup, u64 num_bytes,
			   enum btrfs_qgroup_rsv_type type)
{
	trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);

	qgroup->rsv.values[type] = qgroup->rsv.values[type] + num_bytes;
}
|
|
|
|
|
2017-12-12 08:34:27 +01:00
|
|
|
/*
 * Give back @num_bytes of reserved space of kind @type from @qgroup.
 *
 * The tracepoint receives the change as a negative delta.  Releasing more
 * than is currently reserved does not wrap the counter: it is clamped to
 * zero, and debug builds additionally emit a rate-limited warning.
 */
static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
			       struct btrfs_qgroup *qgroup, u64 num_bytes,
			       enum btrfs_qgroup_rsv_type type)
{
	trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);

	if (qgroup->rsv.values[type] < num_bytes) {
		/* Underflow: complain (debug only) and clamp to zero. */
#ifdef CONFIG_BTRFS_DEBUG
		WARN_RATELIMIT(1,
			"qgroup %llu %s reserved space underflow, have %llu to free %llu",
			qgroup->qgroupid, qgroup_rsv_type_str(type),
			qgroup->rsv.values[type], num_bytes);
#endif
		qgroup->rsv.values[type] = 0;
		return;
	}

	qgroup->rsv.values[type] -= num_bytes;
}
|
|
|
|
|
2017-12-12 08:34:27 +01:00
|
|
|
static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_qgroup *dest,
|
|
|
|
struct btrfs_qgroup *src)
|
2017-12-12 08:34:24 +01:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
|
2017-12-12 08:34:27 +01:00
|
|
|
qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i);
|
2017-12-12 08:34:24 +01:00
|
|
|
}
|
|
|
|
|
2017-12-12 08:34:27 +01:00
|
|
|
static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_qgroup *dest,
|
2017-12-12 08:34:24 +01:00
|
|
|
struct btrfs_qgroup *src)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
|
2017-12-12 08:34:27 +01:00
|
|
|
qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i);
|
2017-12-12 08:34:24 +01:00
|
|
|
}
|
|
|
|
|
2015-03-12 09:10:13 +01:00
|
|
|
/*
 * Adjust the old reference counter of @qg by @mod.  A counter that is still
 * below @seq is first raised to @seq, so the stored value is always @seq
 * plus the accumulated modifications.
 */
static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
					   int mod)
{
	if (!(qg->old_refcnt >= seq))
		qg->old_refcnt = seq;

	qg->old_refcnt = qg->old_refcnt + mod;
}
|
|
|
|
|
|
|
|
/*
 * Adjust the new reference counter of @qg by @mod.  A counter that is still
 * below @seq is first raised to @seq, so the stored value is always @seq
 * plus the accumulated modifications.
 */
static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
					   int mod)
{
	if (!(qg->new_refcnt >= seq))
		qg->new_refcnt = seq;

	qg->new_refcnt = qg->new_refcnt + mod;
}
|
|
|
|
|
|
|
|
/*
 * Return the old reference count of @qg relative to @seq, or 0 when the
 * stored counter is below @seq.
 */
static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
{
	return qg->old_refcnt < seq ? 0 : qg->old_refcnt - seq;
}
|
|
|
|
|
|
|
|
/*
 * Return the new reference count of @qg relative to @seq, or 0 when the
 * stored counter is below @seq.
 */
static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
{
	return qg->new_refcnt < seq ? 0 : qg->new_refcnt - seq;
}
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
/*
|
|
|
|
* glue structure to represent the relations between qgroups.
|
|
|
|
*/
|
|
|
|
struct btrfs_qgroup_list {
|
|
|
|
struct list_head next_group;
|
|
|
|
struct list_head next_member;
|
|
|
|
struct btrfs_qgroup *group;
|
|
|
|
struct btrfs_qgroup *member;
|
|
|
|
};
|
|
|
|
|
2016-10-26 16:23:50 +02:00
|
|
|
static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg)
|
|
|
|
{
|
|
|
|
return (u64)(uintptr_t)qg;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n)
|
|
|
|
{
|
|
|
|
return (struct btrfs_qgroup *)(uintptr_t)n->aux;
|
|
|
|
}
|
2014-05-14 02:30:47 +02:00
|
|
|
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restructures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronization problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 17:47:24 +02:00
|
|
|
static int
|
|
|
|
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
|
|
|
|
int init_flags);
|
|
|
|
static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
|
2013-04-25 18:04:51 +02:00
|
|
|
|
2013-04-07 12:50:17 +02:00
|
|
|
/* must be called with qgroup_ioctl_lock held */
|
2012-06-28 18:03:02 +02:00
|
|
|
static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
|
|
|
|
u64 qgroupid)
|
|
|
|
{
|
|
|
|
struct rb_node *n = fs_info->qgroup_tree.rb_node;
|
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
|
|
|
|
while (n) {
|
|
|
|
qgroup = rb_entry(n, struct btrfs_qgroup, node);
|
|
|
|
if (qgroup->qgroupid < qgroupid)
|
|
|
|
n = n->rb_left;
|
|
|
|
else if (qgroup->qgroupid > qgroupid)
|
|
|
|
n = n->rb_right;
|
|
|
|
else
|
|
|
|
return qgroup;
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* must be called with qgroup_lock held */
|
|
|
|
static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
|
|
|
|
u64 qgroupid)
|
|
|
|
{
|
|
|
|
struct rb_node **p = &fs_info->qgroup_tree.rb_node;
|
|
|
|
struct rb_node *parent = NULL;
|
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
|
|
|
|
while (*p) {
|
|
|
|
parent = *p;
|
|
|
|
qgroup = rb_entry(parent, struct btrfs_qgroup, node);
|
|
|
|
|
|
|
|
if (qgroup->qgroupid < qgroupid)
|
|
|
|
p = &(*p)->rb_left;
|
|
|
|
else if (qgroup->qgroupid > qgroupid)
|
|
|
|
p = &(*p)->rb_right;
|
|
|
|
else
|
|
|
|
return qgroup;
|
|
|
|
}
|
|
|
|
|
|
|
|
qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
|
|
|
|
if (!qgroup)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
qgroup->qgroupid = qgroupid;
|
|
|
|
INIT_LIST_HEAD(&qgroup->groups);
|
|
|
|
INIT_LIST_HEAD(&qgroup->members);
|
|
|
|
INIT_LIST_HEAD(&qgroup->dirty);
|
|
|
|
|
|
|
|
rb_link_node(&qgroup->node, parent, p);
|
|
|
|
rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
|
|
|
|
|
|
|
|
return qgroup;
|
|
|
|
}
|
|
|
|
|
2013-08-14 03:13:36 +02:00
|
|
|
static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
|
|
|
struct btrfs_qgroup_list *list;
|
|
|
|
|
|
|
|
list_del(&qgroup->dirty);
|
|
|
|
while (!list_empty(&qgroup->groups)) {
|
|
|
|
list = list_first_entry(&qgroup->groups,
|
|
|
|
struct btrfs_qgroup_list, next_group);
|
|
|
|
list_del(&list->next_group);
|
|
|
|
list_del(&list->next_member);
|
|
|
|
kfree(list);
|
|
|
|
}
|
|
|
|
|
|
|
|
while (!list_empty(&qgroup->members)) {
|
|
|
|
list = list_first_entry(&qgroup->members,
|
|
|
|
struct btrfs_qgroup_list, next_member);
|
|
|
|
list_del(&list->next_group);
|
|
|
|
list_del(&list->next_member);
|
|
|
|
kfree(list);
|
|
|
|
}
|
|
|
|
kfree(qgroup);
|
2013-08-14 03:13:36 +02:00
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2013-08-14 03:13:36 +02:00
|
|
|
/* must be called with qgroup_lock held */
|
|
|
|
static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
|
|
|
|
{
|
|
|
|
struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
|
|
|
|
|
|
|
|
if (!qgroup)
|
|
|
|
return -ENOENT;
|
|
|
|
|
|
|
|
rb_erase(&qgroup->node, &fs_info->qgroup_tree);
|
|
|
|
__del_qgroup_rb(qgroup);
|
2012-06-28 18:03:02 +02:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* must be called with qgroup_lock held */
|
|
|
|
static int add_relation_rb(struct btrfs_fs_info *fs_info,
|
|
|
|
u64 memberid, u64 parentid)
|
|
|
|
{
|
|
|
|
struct btrfs_qgroup *member;
|
|
|
|
struct btrfs_qgroup *parent;
|
|
|
|
struct btrfs_qgroup_list *list;
|
|
|
|
|
|
|
|
member = find_qgroup_rb(fs_info, memberid);
|
|
|
|
parent = find_qgroup_rb(fs_info, parentid);
|
|
|
|
if (!member || !parent)
|
|
|
|
return -ENOENT;
|
|
|
|
|
|
|
|
list = kzalloc(sizeof(*list), GFP_ATOMIC);
|
|
|
|
if (!list)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
list->group = parent;
|
|
|
|
list->member = member;
|
|
|
|
list_add_tail(&list->next_group, &member->groups);
|
|
|
|
list_add_tail(&list->next_member, &parent->members);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* must be called with qgroup_lock held */
|
|
|
|
static int del_relation_rb(struct btrfs_fs_info *fs_info,
|
|
|
|
u64 memberid, u64 parentid)
|
|
|
|
{
|
|
|
|
struct btrfs_qgroup *member;
|
|
|
|
struct btrfs_qgroup *parent;
|
|
|
|
struct btrfs_qgroup_list *list;
|
|
|
|
|
|
|
|
member = find_qgroup_rb(fs_info, memberid);
|
|
|
|
parent = find_qgroup_rb(fs_info, parentid);
|
|
|
|
if (!member || !parent)
|
|
|
|
return -ENOENT;
|
|
|
|
|
|
|
|
list_for_each_entry(list, &member->groups, next_group) {
|
|
|
|
if (list->group == parent) {
|
|
|
|
list_del(&list->next_group);
|
|
|
|
list_del(&list->next_member);
|
|
|
|
kfree(list);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return -ENOENT;
|
|
|
|
}
|
|
|
|
|
2014-05-07 23:06:09 +02:00
|
|
|
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Sanity-test helper: check that qgroup @qgroupid exists and that its
 * referenced and exclusive counters equal @rfer and @excl.
 * Returns 0 on a match and -EINVAL otherwise.
 */
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
			       u64 rfer, u64 excl)
{
	struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);

	if (qgroup && qgroup->rfer == rfer && qgroup->excl == excl)
		return 0;

	return -EINVAL;
}
#endif
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
/*
|
|
|
|
* The full config is read in one go, only called from open_ctree()
|
|
|
|
* It doesn't use any locking, as at this point we're still single-threaded
|
|
|
|
*/
|
|
|
|
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_key found_key;
|
|
|
|
struct btrfs_root *quota_root = fs_info->quota_root;
|
|
|
|
struct btrfs_path *path = NULL;
|
|
|
|
struct extent_buffer *l;
|
|
|
|
int slot;
|
|
|
|
int ret = 0;
|
|
|
|
u64 flags = 0;
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 17:47:24 +02:00
|
|
|
u64 rescan_progress = 0;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2016-09-02 21:40:02 +02:00
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
|
2012-06-28 18:03:02 +02:00
|
|
|
return 0;
|
|
|
|
|
2017-02-13 12:10:20 +01:00
|
|
|
fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
|
2013-05-06 13:03:27 +02:00
|
|
|
if (!fs_info->qgroup_ulist) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* default this to quota off, in case no status key is found */
|
|
|
|
fs_info->qgroup_flags = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* pass 1: read status, all qgroup infos and limits
|
|
|
|
*/
|
|
|
|
key.objectid = 0;
|
|
|
|
key.type = 0;
|
|
|
|
key.offset = 0;
|
|
|
|
ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
|
|
|
|
slot = path->slots[0];
|
|
|
|
l = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(l, &found_key, slot);
|
|
|
|
|
|
|
|
if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
|
|
|
|
struct btrfs_qgroup_status_item *ptr;
|
|
|
|
|
|
|
|
ptr = btrfs_item_ptr(l, slot,
|
|
|
|
struct btrfs_qgroup_status_item);
|
|
|
|
|
|
|
|
if (btrfs_qgroup_status_version(l, ptr) !=
|
|
|
|
BTRFS_QGROUP_STATUS_VERSION) {
|
2013-12-20 17:37:06 +01:00
|
|
|
btrfs_err(fs_info,
|
|
|
|
"old qgroup version, quota disabled");
|
2012-06-28 18:03:02 +02:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (btrfs_qgroup_status_generation(l, ptr) !=
|
|
|
|
fs_info->generation) {
|
|
|
|
flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
2013-12-20 17:37:06 +01:00
|
|
|
btrfs_err(fs_info,
|
2016-09-20 16:05:00 +02:00
|
|
|
"qgroup generation mismatch, marked as inconsistent");
|
2012-06-28 18:03:02 +02:00
|
|
|
}
|
|
|
|
fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
|
|
|
|
ptr);
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 17:47:24 +02:00
|
|
|
rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
|
2012-06-28 18:03:02 +02:00
|
|
|
goto next1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
|
|
|
|
found_key.type != BTRFS_QGROUP_LIMIT_KEY)
|
|
|
|
goto next1;
|
|
|
|
|
|
|
|
qgroup = find_qgroup_rb(fs_info, found_key.offset);
|
|
|
|
if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
|
|
|
|
(!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
|
2015-07-06 15:38:11 +02:00
|
|
|
btrfs_err(fs_info, "inconsistent qgroup config");
|
2012-06-28 18:03:02 +02:00
|
|
|
flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
|
|
|
}
|
|
|
|
if (!qgroup) {
|
|
|
|
qgroup = add_qgroup_rb(fs_info, found_key.offset);
|
|
|
|
if (IS_ERR(qgroup)) {
|
|
|
|
ret = PTR_ERR(qgroup);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
switch (found_key.type) {
|
|
|
|
case BTRFS_QGROUP_INFO_KEY: {
|
|
|
|
struct btrfs_qgroup_info_item *ptr;
|
|
|
|
|
|
|
|
ptr = btrfs_item_ptr(l, slot,
|
|
|
|
struct btrfs_qgroup_info_item);
|
|
|
|
qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
|
|
|
|
qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
|
|
|
|
qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
|
|
|
|
qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
|
|
|
|
/* generation currently unused */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case BTRFS_QGROUP_LIMIT_KEY: {
|
|
|
|
struct btrfs_qgroup_limit_item *ptr;
|
|
|
|
|
|
|
|
ptr = btrfs_item_ptr(l, slot,
|
|
|
|
struct btrfs_qgroup_limit_item);
|
|
|
|
qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
|
|
|
|
qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
|
|
|
|
qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
|
|
|
|
qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
|
|
|
|
qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
next1:
|
|
|
|
ret = btrfs_next_item(quota_root, path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
btrfs_release_path(path);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* pass 2: read all qgroup relations
|
|
|
|
*/
|
|
|
|
key.objectid = 0;
|
|
|
|
key.type = BTRFS_QGROUP_RELATION_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
while (1) {
|
|
|
|
slot = path->slots[0];
|
|
|
|
l = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(l, &found_key, slot);
|
|
|
|
|
|
|
|
if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
|
|
|
|
goto next2;
|
|
|
|
|
|
|
|
if (found_key.objectid > found_key.offset) {
|
|
|
|
/* parent <- member, not needed to build config */
|
|
|
|
/* FIXME should we omit the key completely? */
|
|
|
|
goto next2;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = add_relation_rb(fs_info, found_key.objectid,
|
|
|
|
found_key.offset);
|
2013-01-17 09:22:08 +01:00
|
|
|
if (ret == -ENOENT) {
|
2013-12-20 17:37:06 +01:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"orphan qgroup relation 0x%llx->0x%llx",
|
2013-08-20 13:20:07 +02:00
|
|
|
found_key.objectid, found_key.offset);
|
2013-01-17 09:22:08 +01:00
|
|
|
ret = 0; /* ignore the error */
|
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
next2:
|
|
|
|
ret = btrfs_next_item(quota_root, path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
out:
|
btrfs: fix lockdep splat when reading qgroup config on mount
commit 3d05cad3c357a2b749912914356072b38435edfa upstream.
Lockdep reported the following splat when running test btrfs/190 from
fstests:
[ 9482.126098] ======================================================
[ 9482.126184] WARNING: possible circular locking dependency detected
[ 9482.126281] 5.10.0-rc4-btrfs-next-73 #1 Not tainted
[ 9482.126365] ------------------------------------------------------
[ 9482.126456] mount/24187 is trying to acquire lock:
[ 9482.126534] ffffa0c869a7dac0 (&fs_info->qgroup_rescan_lock){+.+.}-{3:3}, at: qgroup_rescan_init+0x43/0xf0 [btrfs]
[ 9482.126647]
but task is already holding lock:
[ 9482.126777] ffffa0c892ebd3a0 (btrfs-quota-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x27/0x120 [btrfs]
[ 9482.126886]
which lock already depends on the new lock.
[ 9482.127078]
the existing dependency chain (in reverse order) is:
[ 9482.127213]
-> #1 (btrfs-quota-00){++++}-{3:3}:
[ 9482.127366] lock_acquire+0xd8/0x490
[ 9482.127436] down_read_nested+0x45/0x220
[ 9482.127528] __btrfs_tree_read_lock+0x27/0x120 [btrfs]
[ 9482.127613] btrfs_read_lock_root_node+0x41/0x130 [btrfs]
[ 9482.127702] btrfs_search_slot+0x514/0xc30 [btrfs]
[ 9482.127788] update_qgroup_status_item+0x72/0x140 [btrfs]
[ 9482.127877] btrfs_qgroup_rescan_worker+0xde/0x680 [btrfs]
[ 9482.127964] btrfs_work_helper+0xf1/0x600 [btrfs]
[ 9482.128039] process_one_work+0x24e/0x5e0
[ 9482.128110] worker_thread+0x50/0x3b0
[ 9482.128181] kthread+0x153/0x170
[ 9482.128256] ret_from_fork+0x22/0x30
[ 9482.128327]
-> #0 (&fs_info->qgroup_rescan_lock){+.+.}-{3:3}:
[ 9482.128464] check_prev_add+0x91/0xc60
[ 9482.128551] __lock_acquire+0x1740/0x3110
[ 9482.128623] lock_acquire+0xd8/0x490
[ 9482.130029] __mutex_lock+0xa3/0xb30
[ 9482.130590] qgroup_rescan_init+0x43/0xf0 [btrfs]
[ 9482.131577] btrfs_read_qgroup_config+0x43a/0x550 [btrfs]
[ 9482.132175] open_ctree+0x1228/0x18a0 [btrfs]
[ 9482.132756] btrfs_mount_root.cold+0x13/0xed [btrfs]
[ 9482.133325] legacy_get_tree+0x30/0x60
[ 9482.133866] vfs_get_tree+0x28/0xe0
[ 9482.134392] fc_mount+0xe/0x40
[ 9482.134908] vfs_kern_mount.part.0+0x71/0x90
[ 9482.135428] btrfs_mount+0x13b/0x3e0 [btrfs]
[ 9482.135942] legacy_get_tree+0x30/0x60
[ 9482.136444] vfs_get_tree+0x28/0xe0
[ 9482.136949] path_mount+0x2d7/0xa70
[ 9482.137438] do_mount+0x75/0x90
[ 9482.137923] __x64_sys_mount+0x8e/0xd0
[ 9482.138400] do_syscall_64+0x33/0x80
[ 9482.138873] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 9482.139346]
other info that might help us debug this:
[ 9482.140735] Possible unsafe locking scenario:
[ 9482.141594] CPU0 CPU1
[ 9482.142011] ---- ----
[ 9482.142411] lock(btrfs-quota-00);
[ 9482.142806] lock(&fs_info->qgroup_rescan_lock);
[ 9482.143216] lock(btrfs-quota-00);
[ 9482.143629] lock(&fs_info->qgroup_rescan_lock);
[ 9482.144056]
*** DEADLOCK ***
[ 9482.145242] 2 locks held by mount/24187:
[ 9482.145637] #0: ffffa0c8411c40e8 (&type->s_umount_key#44/1){+.+.}-{3:3}, at: alloc_super+0xb9/0x400
[ 9482.146061] #1: ffffa0c892ebd3a0 (btrfs-quota-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x27/0x120 [btrfs]
[ 9482.146509]
stack backtrace:
[ 9482.147350] CPU: 1 PID: 24187 Comm: mount Not tainted 5.10.0-rc4-btrfs-next-73 #1
[ 9482.147788] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[ 9482.148709] Call Trace:
[ 9482.149169] dump_stack+0x8d/0xb5
[ 9482.149628] check_noncircular+0xff/0x110
[ 9482.150090] check_prev_add+0x91/0xc60
[ 9482.150561] ? kvm_clock_read+0x14/0x30
[ 9482.151017] ? kvm_sched_clock_read+0x5/0x10
[ 9482.151470] __lock_acquire+0x1740/0x3110
[ 9482.151941] ? __btrfs_tree_read_lock+0x27/0x120 [btrfs]
[ 9482.152402] lock_acquire+0xd8/0x490
[ 9482.152887] ? qgroup_rescan_init+0x43/0xf0 [btrfs]
[ 9482.153354] __mutex_lock+0xa3/0xb30
[ 9482.153826] ? qgroup_rescan_init+0x43/0xf0 [btrfs]
[ 9482.154301] ? qgroup_rescan_init+0x43/0xf0 [btrfs]
[ 9482.154768] ? qgroup_rescan_init+0x43/0xf0 [btrfs]
[ 9482.155226] qgroup_rescan_init+0x43/0xf0 [btrfs]
[ 9482.155690] btrfs_read_qgroup_config+0x43a/0x550 [btrfs]
[ 9482.156160] open_ctree+0x1228/0x18a0 [btrfs]
[ 9482.156643] btrfs_mount_root.cold+0x13/0xed [btrfs]
[ 9482.157108] ? rcu_read_lock_sched_held+0x5d/0x90
[ 9482.157567] ? kfree+0x31f/0x3e0
[ 9482.158030] legacy_get_tree+0x30/0x60
[ 9482.158489] vfs_get_tree+0x28/0xe0
[ 9482.158947] fc_mount+0xe/0x40
[ 9482.159403] vfs_kern_mount.part.0+0x71/0x90
[ 9482.159875] btrfs_mount+0x13b/0x3e0 [btrfs]
[ 9482.160335] ? rcu_read_lock_sched_held+0x5d/0x90
[ 9482.160805] ? kfree+0x31f/0x3e0
[ 9482.161260] ? legacy_get_tree+0x30/0x60
[ 9482.161714] legacy_get_tree+0x30/0x60
[ 9482.162166] vfs_get_tree+0x28/0xe0
[ 9482.162616] path_mount+0x2d7/0xa70
[ 9482.163070] do_mount+0x75/0x90
[ 9482.163525] __x64_sys_mount+0x8e/0xd0
[ 9482.163986] do_syscall_64+0x33/0x80
[ 9482.164437] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 9482.164902] RIP: 0033:0x7f51e907caaa
This happens because at btrfs_read_qgroup_config() we can call
qgroup_rescan_init() while holding a read lock on a quota btree leaf,
acquired by the previous call to btrfs_search_slot_for_read(), and
qgroup_rescan_init() acquires the mutex qgroup_rescan_lock.
A qgroup rescan worker does the opposite: it acquires the mutex
qgroup_rescan_lock, at btrfs_qgroup_rescan_worker(), and then tries to
update the qgroup status item in the quota btree through the call to
update_qgroup_status_item(). This inversion of locking order
between the qgroup_rescan_lock mutex and quota btree locks causes the
splat.
Fix this simply by releasing and freeing the path before calling
qgroup_rescan_init() at btrfs_read_qgroup_config().
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2020-11-23 15:28:44 +01:00
|
|
|
btrfs_free_path(path);
|
2012-06-28 18:03:02 +02:00
|
|
|
fs_info->qgroup_flags |= flags;
|
2016-09-02 21:40:02 +02:00
|
|
|
if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
|
|
|
|
clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
|
|
|
|
else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
|
|
|
|
ret >= 0)
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 17:47:24 +02:00
|
|
|
ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2013-05-28 17:47:23 +02:00
|
|
|
if (ret < 0) {
|
2013-05-06 13:03:27 +02:00
|
|
|
ulist_free(fs_info->qgroup_ulist);
|
2013-05-28 17:47:23 +02:00
|
|
|
fs_info->qgroup_ulist = NULL;
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 17:47:24 +02:00
|
|
|
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
|
2013-05-28 17:47:23 +02:00
|
|
|
}
|
2013-05-06 13:03:27 +02:00
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
return ret < 0 ? ret : 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2013-08-14 03:13:37 +02:00
|
|
|
* This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
|
|
|
|
* first two are in single-threaded paths.And for the third one, we have set
|
|
|
|
* quota_root to be null with qgroup_lock held before, so it is safe to clean
|
|
|
|
* up the in-memory structures without qgroup_lock held.
|
2012-06-28 18:03:02 +02:00
|
|
|
*/
|
|
|
|
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
struct rb_node *n;
|
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
|
|
|
|
while ((n = rb_first(&fs_info->qgroup_tree))) {
|
|
|
|
qgroup = rb_entry(n, struct btrfs_qgroup, node);
|
|
|
|
rb_erase(n, &fs_info->qgroup_tree);
|
2013-08-14 03:13:36 +02:00
|
|
|
__del_qgroup_rb(qgroup);
|
2012-06-28 18:03:02 +02:00
|
|
|
}
|
2013-07-13 15:02:54 +02:00
|
|
|
/*
|
|
|
|
* we call btrfs_free_qgroup_config() when umounting
|
2016-05-20 03:18:45 +02:00
|
|
|
* filesystem and disabling quota, so we set qgroup_ulist
|
2013-07-13 15:02:54 +02:00
|
|
|
* to be null here to avoid double free.
|
|
|
|
*/
|
2013-05-06 13:03:27 +02:00
|
|
|
ulist_free(fs_info->qgroup_ulist);
|
2013-07-13 15:02:54 +02:00
|
|
|
fs_info->qgroup_ulist = NULL;
|
2012-06-28 18:03:02 +02:00
|
|
|
}
|
|
|
|
|
2018-07-18 08:45:24 +02:00
|
|
|
/*
 * Insert an empty BTRFS_QGROUP_RELATION_KEY item with key (src, type, dst)
 * into the quota root of the transaction's filesystem.
 *
 * Returns -ENOMEM if no path could be allocated, otherwise the result of
 * btrfs_insert_empty_item().
 */
static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
				    u64 dst)
{
	int ret;
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = src;
	key.type = BTRFS_QGROUP_RELATION_KEY;
	key.offset = dst;

	ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);

	/*
	 * Only touch the leaf when the insertion succeeded: after a failure
	 * path->nodes[0] is not guaranteed to reference a valid buffer.
	 */
	if (!ret)
		btrfs_mark_buffer_dirty(path->nodes[0]);

	btrfs_free_path(path);
	return ret;
}
|
|
|
|
|
2018-07-18 08:45:25 +02:00
|
|
|
/*
 * Delete the BTRFS_QGROUP_RELATION_KEY item with key (src, type, dst) from
 * the quota root of the transaction's filesystem.
 *
 * Returns -ENOMEM if no path could be allocated, -ENOENT if the item does
 * not exist, a negative error from btrfs_search_slot(), or the result of
 * btrfs_del_item().
 */
static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
				    u64 dst)
{
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = src;
	key.type = BTRFS_QGROUP_RELATION_KEY;
	key.offset = dst;

	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	/* A positive return means the exact key was not found. */
	if (ret > 0)
		ret = -ENOENT;
	if (ret == 0)
		ret = btrfs_del_item(trans, quota_root, path);

	btrfs_free_path(path);
	return ret;
}
|
|
|
|
|
|
|
|
/*
 * Create (or re-initialize) the on-disk info and limit items of a qgroup.
 *
 * Both items use the key (0, type, qgroupid) in @quota_root. Every counter
 * and limit is written as zero, and the info item's generation is set to the
 * id of the running transaction.
 *
 * Returns 0 on success (and unconditionally for a testing fs_info), -ENOMEM
 * if no path could be allocated, or the error from btrfs_insert_empty_item()
 * other than -EEXIST.
 */
static int add_qgroup_item(struct btrfs_trans_handle *trans,
			   struct btrfs_root *quota_root, u64 qgroupid)
{
	struct btrfs_qgroup_limit_item *limit;
	struct btrfs_qgroup_info_item *info;
	struct extent_buffer *eb;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	if (btrfs_is_testing(quota_root->fs_info))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_INFO_KEY;
	key.offset = qgroupid;

	/*
	 * Avoid a transaction abort by catching -EEXIST here. In that
	 * case, we proceed by re-initializing the existing structure
	 * on disk.
	 */
	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
				      sizeof(*info));
	if (ret != 0 && ret != -EEXIST)
		goto out;

	eb = path->nodes[0];
	info = btrfs_item_ptr(eb, path->slots[0],
			      struct btrfs_qgroup_info_item);
	btrfs_set_qgroup_info_generation(eb, info, trans->transid);
	btrfs_set_qgroup_info_rfer(eb, info, 0);
	btrfs_set_qgroup_info_rfer_cmpr(eb, info, 0);
	btrfs_set_qgroup_info_excl(eb, info, 0);
	btrfs_set_qgroup_info_excl_cmpr(eb, info, 0);
	btrfs_mark_buffer_dirty(eb);

	btrfs_release_path(path);

	/* Same objectid/offset, now for the limit item. */
	key.type = BTRFS_QGROUP_LIMIT_KEY;
	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
				      sizeof(*limit));
	if (ret != 0 && ret != -EEXIST)
		goto out;

	eb = path->nodes[0];
	limit = btrfs_item_ptr(eb, path->slots[0],
			       struct btrfs_qgroup_limit_item);
	btrfs_set_qgroup_limit_flags(eb, limit, 0);
	btrfs_set_qgroup_limit_max_rfer(eb, limit, 0);
	btrfs_set_qgroup_limit_max_excl(eb, limit, 0);
	btrfs_set_qgroup_limit_rsv_rfer(eb, limit, 0);
	btrfs_set_qgroup_limit_rsv_excl(eb, limit, 0);
	btrfs_mark_buffer_dirty(eb);

	/* ret may still hold -EEXIST here, which is not a failure. */
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
|
|
|
|
|
2018-07-18 08:45:26 +02:00
|
|
|
/*
 * Remove the on-disk info item and then the limit item of qgroup @qgroupid
 * from the quota root of the transaction's filesystem.
 *
 * Returns -ENOMEM if no path could be allocated, -ENOENT if either item is
 * missing, or the first error reported by btrfs_search_slot() or
 * btrfs_del_item(). The limit item is only looked up once the info item has
 * been deleted successfully.
 */
static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_INFO_KEY;
	key.offset = qgroupid;

	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	/* A positive return means the exact key was not found. */
	if (ret > 0)
		ret = -ENOENT;
	if (ret)
		goto out;

	ret = btrfs_del_item(trans, quota_root, path);
	if (ret)
		goto out;

	btrfs_release_path(path);

	/* Same objectid/offset, now for the limit item. */
	key.type = BTRFS_QGROUP_LIMIT_KEY;
	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
	if (ret)
		goto out;

	ret = btrfs_del_item(trans, quota_root, path);

out:
	btrfs_free_path(path);
	return ret;
}
|
|
|
|
|
|
|
|
static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
|
2014-11-21 03:01:41 +01:00
|
|
|
struct btrfs_qgroup *qgroup)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
2018-07-18 08:45:27 +02:00
|
|
|
struct btrfs_root *quota_root = trans->fs_info->quota_root;
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct extent_buffer *l;
|
|
|
|
struct btrfs_qgroup_limit_item *qgroup_limit;
|
|
|
|
int ret;
|
|
|
|
int slot;
|
|
|
|
|
|
|
|
key.objectid = 0;
|
|
|
|
key.type = BTRFS_QGROUP_LIMIT_KEY;
|
2014-11-21 03:01:41 +01:00
|
|
|
key.offset = qgroup->qgroupid;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
2013-02-27 12:20:56 +01:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2018-07-18 08:45:27 +02:00
|
|
|
ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
|
2012-06-28 18:03:02 +02:00
|
|
|
if (ret > 0)
|
|
|
|
ret = -ENOENT;
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
l = path->nodes[0];
|
|
|
|
slot = path->slots[0];
|
2013-11-04 22:34:29 +01:00
|
|
|
qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
|
2014-11-21 03:01:41 +01:00
|
|
|
btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
|
|
|
|
btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
|
|
|
|
btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
|
|
|
|
btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
|
|
|
|
btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
btrfs_mark_buffer_dirty(l);
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_qgroup *qgroup)
|
|
|
|
{
|
2018-07-18 08:45:28 +02:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
|
|
|
struct btrfs_root *quota_root = fs_info->quota_root;
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct extent_buffer *l;
|
|
|
|
struct btrfs_qgroup_info_item *qgroup_info;
|
|
|
|
int ret;
|
|
|
|
int slot;
|
|
|
|
|
2018-07-18 08:45:28 +02:00
|
|
|
if (btrfs_is_testing(fs_info))
|
2014-05-07 23:06:09 +02:00
|
|
|
return 0;
|
2014-09-29 23:53:21 +02:00
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
key.objectid = 0;
|
|
|
|
key.type = BTRFS_QGROUP_INFO_KEY;
|
|
|
|
key.offset = qgroup->qgroupid;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
2013-02-27 12:20:56 +01:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2018-07-18 08:45:28 +02:00
|
|
|
ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
|
2012-06-28 18:03:02 +02:00
|
|
|
if (ret > 0)
|
|
|
|
ret = -ENOENT;
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
l = path->nodes[0];
|
|
|
|
slot = path->slots[0];
|
2013-11-04 22:34:29 +01:00
|
|
|
qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
|
2012-06-28 18:03:02 +02:00
|
|
|
btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
|
|
|
|
btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
|
|
|
|
btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
|
|
|
|
btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
|
|
|
|
btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
|
|
|
|
|
|
|
|
btrfs_mark_buffer_dirty(l);
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-07-18 08:45:29 +02:00
|
|
|
static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
2018-07-18 08:45:29 +02:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
|
|
|
struct btrfs_root *quota_root = fs_info->quota_root;
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct extent_buffer *l;
|
|
|
|
struct btrfs_qgroup_status_item *ptr;
|
|
|
|
int ret;
|
|
|
|
int slot;
|
|
|
|
|
|
|
|
key.objectid = 0;
|
|
|
|
key.type = BTRFS_QGROUP_STATUS_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
2013-02-27 12:20:56 +01:00
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2018-07-18 08:45:29 +02:00
|
|
|
ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
|
2012-06-28 18:03:02 +02:00
|
|
|
if (ret > 0)
|
|
|
|
ret = -ENOENT;
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
l = path->nodes[0];
|
|
|
|
slot = path->slots[0];
|
|
|
|
ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
|
|
|
|
btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
|
|
|
|
btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
|
2013-04-25 18:04:51 +02:00
|
|
|
btrfs_set_qgroup_status_rescan(l, ptr,
|
|
|
|
fs_info->qgroup_rescan_progress.objectid);
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
btrfs_mark_buffer_dirty(l);
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* called with qgroup_lock held
|
|
|
|
*/
|
|
|
|
static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_root *root)
|
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
2013-02-27 12:16:57 +01:00
|
|
|
struct extent_buffer *leaf = NULL;
|
2012-06-28 18:03:02 +02:00
|
|
|
int ret;
|
2013-02-27 12:16:57 +01:00
|
|
|
int nr = 0;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2013-02-27 12:16:57 +01:00
|
|
|
path->leave_spinning = 1;
|
|
|
|
|
|
|
|
key.objectid = 0;
|
|
|
|
key.offset = 0;
|
|
|
|
key.type = 0;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2013-02-27 12:16:57 +01:00
|
|
|
while (1) {
|
2012-06-28 18:03:02 +02:00
|
|
|
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
|
2013-02-27 12:16:57 +01:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
nr = btrfs_header_nritems(leaf);
|
|
|
|
if (!nr)
|
2012-06-28 18:03:02 +02:00
|
|
|
break;
|
2013-02-27 12:16:57 +01:00
|
|
|
/*
|
|
|
|
* delete the leaf one by one
|
|
|
|
* since the whole tree is going
|
|
|
|
* to be deleted.
|
|
|
|
*/
|
|
|
|
path->slots[0] = 0;
|
|
|
|
ret = btrfs_del_items(trans, root, path, 0, nr);
|
2012-06-28 18:03:02 +02:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
2013-02-27 12:16:57 +01:00
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
btrfs_release_path(path);
|
|
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-07-05 13:50:48 +02:00
|
|
|
int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
|
|
|
struct btrfs_root *quota_root;
|
2013-04-07 12:24:57 +02:00
|
|
|
struct btrfs_root *tree_root = fs_info->tree_root;
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_path *path = NULL;
|
|
|
|
struct btrfs_qgroup_status_item *ptr;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_key key;
|
2013-04-07 12:24:57 +02:00
|
|
|
struct btrfs_key found_key;
|
|
|
|
struct btrfs_qgroup *qgroup = NULL;
|
2018-07-05 13:50:48 +02:00
|
|
|
struct btrfs_trans_handle *trans = NULL;
|
2012-06-28 18:03:02 +02:00
|
|
|
int ret = 0;
|
2013-04-07 12:24:57 +02:00
|
|
|
int slot;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2013-04-07 12:50:16 +02:00
|
|
|
mutex_lock(&fs_info->qgroup_ioctl_lock);
|
2018-01-31 09:52:04 +01:00
|
|
|
if (fs_info->quota_root)
|
2012-06-28 18:03:02 +02:00
|
|
|
goto out;
|
|
|
|
|
2018-07-05 13:50:48 +02:00
|
|
|
/*
|
|
|
|
* 1 for quota root item
|
|
|
|
* 1 for BTRFS_QGROUP_STATUS item
|
|
|
|
*
|
|
|
|
* Yet we also need 2*n items for a QGROUP_INFO/QGROUP_LIMIT items
|
|
|
|
* per subvolume. However those are not currently reserved since it
|
|
|
|
* would be a lot of overkill.
|
|
|
|
*/
|
|
|
|
trans = btrfs_start_transaction(tree_root, 2);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
trans = NULL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2017-02-13 11:03:44 +01:00
|
|
|
fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
|
2013-05-06 13:03:27 +02:00
|
|
|
if (!fs_info->qgroup_ulist) {
|
|
|
|
ret = -ENOMEM;
|
2018-07-05 13:50:48 +02:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2013-05-06 13:03:27 +02:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
/*
|
|
|
|
* initially create the quota tree
|
|
|
|
*/
|
|
|
|
quota_root = btrfs_create_tree(trans, fs_info,
|
|
|
|
BTRFS_QUOTA_TREE_OBJECTID);
|
|
|
|
if (IS_ERR(quota_root)) {
|
|
|
|
ret = PTR_ERR(quota_root);
|
2018-07-05 13:50:48 +02:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-06-28 18:03:02 +02:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
2012-10-16 07:44:21 +02:00
|
|
|
if (!path) {
|
|
|
|
ret = -ENOMEM;
|
2018-07-05 13:50:48 +02:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-10-16 07:44:21 +02:00
|
|
|
goto out_free_root;
|
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
key.objectid = 0;
|
|
|
|
key.type = BTRFS_QGROUP_STATUS_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
|
|
|
|
ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
|
|
|
|
sizeof(*ptr));
|
2018-07-05 13:50:48 +02:00
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
2012-10-16 07:44:21 +02:00
|
|
|
goto out_free_path;
|
2018-07-05 13:50:48 +02:00
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
ptr = btrfs_item_ptr(leaf, path->slots[0],
|
|
|
|
struct btrfs_qgroup_status_item);
|
|
|
|
btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
|
|
|
|
btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
|
|
|
|
fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
|
|
|
|
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
|
|
|
btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
|
2013-04-25 18:04:51 +02:00
|
|
|
btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
btrfs_mark_buffer_dirty(leaf);
|
|
|
|
|
2013-04-07 12:24:57 +02:00
|
|
|
key.objectid = 0;
|
|
|
|
key.type = BTRFS_ROOT_REF_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
|
|
|
|
btrfs_release_path(path);
|
|
|
|
ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
|
|
|
|
if (ret > 0)
|
|
|
|
goto out_add_root;
|
2018-07-05 13:50:48 +02:00
|
|
|
if (ret < 0) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
2013-04-07 12:24:57 +02:00
|
|
|
goto out_free_path;
|
2018-07-05 13:50:48 +02:00
|
|
|
}
|
2013-04-07 12:24:57 +02:00
|
|
|
|
|
|
|
while (1) {
|
|
|
|
slot = path->slots[0];
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, slot);
|
|
|
|
|
|
|
|
if (found_key.type == BTRFS_ROOT_REF_KEY) {
|
|
|
|
ret = add_qgroup_item(trans, quota_root,
|
|
|
|
found_key.offset);
|
2018-07-05 13:50:48 +02:00
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
2013-04-07 12:24:57 +02:00
|
|
|
goto out_free_path;
|
2018-07-05 13:50:48 +02:00
|
|
|
}
|
2013-04-07 12:24:57 +02:00
|
|
|
|
|
|
|
qgroup = add_qgroup_rb(fs_info, found_key.offset);
|
|
|
|
if (IS_ERR(qgroup)) {
|
|
|
|
ret = PTR_ERR(qgroup);
|
2018-07-05 13:50:48 +02:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2013-04-07 12:24:57 +02:00
|
|
|
goto out_free_path;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ret = btrfs_next_item(tree_root, path);
|
2018-07-05 13:50:48 +02:00
|
|
|
if (ret < 0) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
2013-04-07 12:24:57 +02:00
|
|
|
goto out_free_path;
|
2018-07-05 13:50:48 +02:00
|
|
|
}
|
2013-04-07 12:24:57 +02:00
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
out_add_root:
|
|
|
|
btrfs_release_path(path);
|
|
|
|
ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
|
2018-07-05 13:50:48 +02:00
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
2013-04-07 12:24:57 +02:00
|
|
|
goto out_free_path;
|
2018-07-05 13:50:48 +02:00
|
|
|
}
|
2013-04-07 12:24:57 +02:00
|
|
|
|
|
|
|
qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
|
|
|
|
if (IS_ERR(qgroup)) {
|
|
|
|
ret = PTR_ERR(qgroup);
|
2018-07-05 13:50:48 +02:00
|
|
|
btrfs_abort_transaction(trans, ret);
|
2013-04-07 12:24:57 +02:00
|
|
|
goto out_free_path;
|
|
|
|
}
|
2018-07-05 13:50:48 +02:00
|
|
|
|
|
|
|
ret = btrfs_commit_transaction(trans);
|
2018-08-20 10:25:33 +02:00
|
|
|
trans = NULL;
|
|
|
|
if (ret)
|
2018-07-05 13:50:48 +02:00
|
|
|
goto out_free_path;
|
|
|
|
|
2018-11-19 15:15:36 +01:00
|
|
|
/*
|
|
|
|
* Set quota enabled flag after committing the transaction, to avoid
|
|
|
|
* deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
|
|
|
|
* creation.
|
|
|
|
*/
|
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
|
|
|
fs_info->quota_root = quota_root;
|
|
|
|
set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
|
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
|
|
|
|
2018-01-31 09:52:04 +01:00
|
|
|
ret = qgroup_rescan_init(fs_info, 0, 1);
|
|
|
|
if (!ret) {
|
|
|
|
qgroup_rescan_zero_tracking(fs_info);
|
2020-02-07 06:38:20 +01:00
|
|
|
fs_info->qgroup_rescan_running = true;
|
2018-01-31 09:52:04 +01:00
|
|
|
btrfs_queue_work(fs_info->qgroup_rescan_workers,
|
|
|
|
&fs_info->qgroup_rescan_work);
|
2022-08-23 13:45:42 +02:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* We have set both BTRFS_FS_QUOTA_ENABLED and
|
|
|
|
* BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with
|
|
|
|
* -EINPROGRESS. That can happen because someone started the
|
|
|
|
* rescan worker by calling quota rescan ioctl before we
|
|
|
|
* attempted to initialize the rescan worker. Failure due to
|
|
|
|
* quotas disabled in the meanwhile is not possible, because
|
|
|
|
* we are holding a write lock on fs_info->subvol_sem, which
|
|
|
|
* is also acquired when disabling quotas.
|
|
|
|
* Ignore such error, and any other error would need to undo
|
|
|
|
* everything we did in the transaction we just committed.
|
|
|
|
*/
|
|
|
|
ASSERT(ret == -EINPROGRESS);
|
|
|
|
ret = 0;
|
2018-01-31 09:52:04 +01:00
|
|
|
}
|
|
|
|
|
2012-10-16 07:44:21 +02:00
|
|
|
out_free_path:
|
2012-06-28 18:03:02 +02:00
|
|
|
btrfs_free_path(path);
|
2012-10-16 07:44:21 +02:00
|
|
|
out_free_root:
|
|
|
|
if (ret) {
|
|
|
|
free_extent_buffer(quota_root->node);
|
|
|
|
free_extent_buffer(quota_root->commit_root);
|
|
|
|
kfree(quota_root);
|
|
|
|
}
|
|
|
|
out:
|
2013-05-28 17:47:23 +02:00
|
|
|
if (ret) {
|
2013-05-06 13:03:27 +02:00
|
|
|
ulist_free(fs_info->qgroup_ulist);
|
2013-05-28 17:47:23 +02:00
|
|
|
fs_info->qgroup_ulist = NULL;
|
2018-07-05 13:50:48 +02:00
|
|
|
if (trans)
|
|
|
|
btrfs_end_transaction(trans);
|
2013-05-28 17:47:23 +02:00
|
|
|
}
|
2013-04-07 12:50:16 +02:00
|
|
|
mutex_unlock(&fs_info->qgroup_ioctl_lock);
|
2012-06-28 18:03:02 +02:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-07-05 13:50:48 +02:00
|
|
|
int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
|
|
|
struct btrfs_root *quota_root;
|
2018-07-05 13:50:48 +02:00
|
|
|
struct btrfs_trans_handle *trans = NULL;
|
2012-06-28 18:03:02 +02:00
|
|
|
int ret = 0;
|
|
|
|
|
2013-04-07 12:50:16 +02:00
|
|
|
mutex_lock(&fs_info->qgroup_ioctl_lock);
|
2013-04-07 12:50:17 +02:00
|
|
|
if (!fs_info->quota_root)
|
2013-04-07 12:50:16 +02:00
|
|
|
goto out;
|
2018-07-05 13:50:48 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* 1 For the root item
|
|
|
|
*
|
|
|
|
* We should also reserve enough items for the quota tree deletion in
|
|
|
|
* btrfs_clean_quota_tree but this is not done.
|
|
|
|
*/
|
|
|
|
trans = btrfs_start_transaction(fs_info->tree_root, 1);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2016-09-02 21:40:02 +02:00
|
|
|
clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
|
2016-08-09 04:08:06 +02:00
|
|
|
btrfs_qgroup_wait_for_completion(fs_info, false);
|
2015-11-06 19:36:42 +01:00
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
2012-06-28 18:03:02 +02:00
|
|
|
quota_root = fs_info->quota_root;
|
|
|
|
fs_info->quota_root = NULL;
|
2015-02-27 09:24:26 +01:00
|
|
|
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
|
2012-06-28 18:03:02 +02:00
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
|
|
|
|
2013-08-14 03:13:37 +02:00
|
|
|
btrfs_free_qgroup_config(fs_info);
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
ret = btrfs_clean_quota_tree(trans, quota_root);
|
2018-07-05 13:50:48 +02:00
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
goto end_trans;
|
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2018-08-01 05:32:27 +02:00
|
|
|
ret = btrfs_del_root(trans, "a_root->root_key);
|
2018-07-05 13:50:48 +02:00
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
goto end_trans;
|
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
list_del("a_root->dirty_list);
|
|
|
|
|
|
|
|
btrfs_tree_lock(quota_root->node);
|
2017-02-10 18:47:57 +01:00
|
|
|
clean_tree_block(fs_info, quota_root->node);
|
2012-06-28 18:03:02 +02:00
|
|
|
btrfs_tree_unlock(quota_root->node);
|
|
|
|
btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
|
|
|
|
|
|
|
|
free_extent_buffer(quota_root->node);
|
|
|
|
free_extent_buffer(quota_root->commit_root);
|
|
|
|
kfree(quota_root);
|
2018-07-05 13:50:48 +02:00
|
|
|
|
|
|
|
end_trans:
|
|
|
|
ret = btrfs_end_transaction(trans);
|
2012-06-28 18:03:02 +02:00
|
|
|
out:
|
2013-04-07 12:50:16 +02:00
|
|
|
mutex_unlock(&fs_info->qgroup_ioctl_lock);
|
2012-06-28 18:03:02 +02:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-04-25 18:04:51 +02:00
|
|
|
static void qgroup_dirty(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_qgroup *qgroup)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
2013-04-25 18:04:51 +02:00
|
|
|
if (list_empty(&qgroup->dirty))
|
|
|
|
list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
|
2012-06-28 18:03:02 +02:00
|
|
|
}
|
|
|
|
|
2015-02-27 09:24:27 +01:00
|
|
|
/*
|
btrfs: qgroup: Fix wrong qgroup reservation update for relationship modification
When modifying qgroup relationship, for qgroup which only owns exclusive
extents, we will go through quick update path.
In this path, we will add/subtract exclusive and reference number for
parent qgroup, since the source (child) qgroup only has exclusive
extents, destination (parent) qgroup will also own or lose those extents
exclusively.
The same should be the same for reservation, since later reservation
adding/releasing will also affect parent qgroup, without the reservation
carried from child, parent will underflow reservation or have dead
reservation which will never be freed.
However original code doesn't do the same thing for reservation.
It handles qgroup reservation quite differently:
It removes qgroup reservation, as it's allocating space from the
reserved qgroup for relationship adding.
But does nothing for qgroup reservation if we're removing a qgroup
relationship.
According to the original code, it looks just like because we're adding
qgroup->rfer, the code assumes we're writing new data, so it's follows
the normal write routine, by reducing qgroup->reserved and adding
qgroup->rfer/excl.
This old behavior is wrong, and should be fixed to follow the same
excl/rfer behavior.
Just fix it by using the correct behavior described above.
Fixes: 31193213f1f9 ("Btrfs: qgroup: Introduce a may_use to account space_info->bytes_may_use.")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 08:34:26 +01:00
|
|
|
* The easy accounting, we're updating qgroup relationship whose child qgroup
|
|
|
|
* only has exclusive extents.
|
|
|
|
*
|
|
|
|
* In this case, all exclusive extents will also be exclusive for parent, so
|
|
|
|
* excl/rfer just get added/removed.
|
|
|
|
*
|
|
|
|
* So is qgroup reservation space, which should also be added/removed to
|
|
|
|
* parent.
|
|
|
|
* Or when child tries to release reservation space, parent will underflow its
|
|
|
|
* reservation (for relationship adding case).
|
2015-02-27 09:24:27 +01:00
|
|
|
*
|
|
|
|
* Caller should hold fs_info->qgroup_lock.
|
|
|
|
*/
|
|
|
|
static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
|
|
|
|
struct ulist *tmp, u64 ref_root,
|
btrfs: qgroup: Fix wrong qgroup reservation update for relationship modification
When modifying qgroup relationship, for qgroup which only owns exclusive
extents, we will go through quick update path.
In this path, we will add/subtract exclusive and reference number for
parent qgroup, since the source (child) qgroup only has exclusive
extents, destination (parent) qgroup will also own or lose those extents
exclusively.
The same should be the same for reservation, since later reservation
adding/releasing will also affect parent qgroup, without the reservation
carried from child, parent will underflow reservation or have dead
reservation which will never be freed.
However original code doesn't do the same thing for reservation.
It handles qgroup reservation quite differently:
It removes qgroup reservation, as it's allocating space from the
reserved qgroup for relationship adding.
But does nothing for qgroup reservation if we're removing a qgroup
relationship.
According to the original code, it looks just like because we're adding
qgroup->rfer, the code assumes we're writing new data, so it's follows
the normal write routine, by reducing qgroup->reserved and adding
qgroup->rfer/excl.
This old behavior is wrong, and should be fixed to follow the same
excl/rfer behavior.
Just fix it by using the correct behavior described above.
Fixes: 31193213f1f9 ("Btrfs: qgroup: Introduce a may_use to account space_info->bytes_may_use.")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 08:34:26 +01:00
|
|
|
struct btrfs_qgroup *src, int sign)
|
2015-02-27 09:24:27 +01:00
|
|
|
{
|
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
struct btrfs_qgroup_list *glist;
|
|
|
|
struct ulist_node *unode;
|
|
|
|
struct ulist_iterator uiter;
|
btrfs: qgroup: Fix wrong qgroup reservation update for relationship modification
When modifying qgroup relationship, for qgroup which only owns exclusive
extents, we will go through quick update path.
In this path, we will add/subtract exclusive and reference number for
parent qgroup, since the source (child) qgroup only has exclusive
extents, destination (parent) qgroup will also own or lose those extents
exclusively.
The same should be the same for reservation, since later reservation
adding/releasing will also affect parent qgroup, without the reservation
carried from child, parent will underflow reservation or have dead
reservation which will never be freed.
However original code doesn't do the same thing for reservation.
It handles qgroup reservation quite differently:
It removes qgroup reservation, as it's allocating space from the
reserved qgroup for relationship adding.
But does nothing for qgroup reservation if we're removing a qgroup
relationship.
According to the original code, it looks just like because we're adding
qgroup->rfer, the code assumes we're writing new data, so it's follows
the normal write routine, by reducing qgroup->reserved and adding
qgroup->rfer/excl.
This old behavior is wrong, and should be fixed to follow the same
excl/rfer behavior.
Just fix it by using the correct behavior described above.
Fixes: 31193213f1f9 ("Btrfs: qgroup: Introduce a may_use to account space_info->bytes_may_use.")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 08:34:26 +01:00
|
|
|
u64 num_bytes = src->excl;
|
2015-02-27 09:24:27 +01:00
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
qgroup = find_qgroup_rb(fs_info, ref_root);
|
|
|
|
if (!qgroup)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
qgroup->rfer += sign * num_bytes;
|
|
|
|
qgroup->rfer_cmpr += sign * num_bytes;
|
|
|
|
|
|
|
|
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
|
|
|
|
qgroup->excl += sign * num_bytes;
|
|
|
|
qgroup->excl_cmpr += sign * num_bytes;
|
btrfs: qgroup: Fix wrong qgroup reservation update for relationship modification
When modifying qgroup relationship, for qgroup which only owns exclusive
extents, we will go through quick update path.
In this path, we will add/subtract exclusive and reference number for
parent qgroup, since the source (child) qgroup only has exclusive
extents, destination (parent) qgroup will also own or lose those extents
exclusively.
The same should be the same for reservation, since later reservation
adding/releasing will also affect parent qgroup, without the reservation
carried from child, parent will underflow reservation or have dead
reservation which will never be freed.
However original code doesn't do the same thing for reservation.
It handles qgroup reservation quite differently:
It removes qgroup reservation, as it's allocating space from the
reserved qgroup for relationship adding.
But does nothing for qgroup reservation if we're removing a qgroup
relationship.
According to the original code, it looks just like because we're adding
qgroup->rfer, the code assumes we're writing new data, so it's follows
the normal write routine, by reducing qgroup->reserved and adding
qgroup->rfer/excl.
This old behavior is wrong, and should be fixed to follow the same
excl/rfer behavior.
Just fix it by using the correct behavior described above.
Fixes: 31193213f1f9 ("Btrfs: qgroup: Introduce a may_use to account space_info->bytes_may_use.")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 08:34:26 +01:00
|
|
|
|
|
|
|
if (sign > 0)
|
2017-12-12 08:34:27 +01:00
|
|
|
qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
|
btrfs: qgroup: Fix wrong qgroup reservation update for relationship modification
When modifying qgroup relationship, for qgroup which only owns exclusive
extents, we will go through quick update path.
In this path, we will add/subtract exclusive and reference number for
parent qgroup, since the source (child) qgroup only has exclusive
extents, destination (parent) qgroup will also own or lose those extents
exclusively.
The same should be the same for reservation, since later reservation
adding/releasing will also affect parent qgroup, without the reservation
carried from child, parent will underflow reservation or have dead
reservation which will never be freed.
However original code doesn't do the same thing for reservation.
It handles qgroup reservation quite differently:
It removes qgroup reservation, as it's allocating space from the
reserved qgroup for relationship adding.
But does nothing for qgroup reservation if we're removing a qgroup
relationship.
According to the original code, it looks just like because we're adding
qgroup->rfer, the code assumes we're writing new data, so it's follows
the normal write routine, by reducing qgroup->reserved and adding
qgroup->rfer/excl.
This old behavior is wrong, and should be fixed to follow the same
excl/rfer behavior.
Just fix it by using the correct behavior described above.
Fixes: 31193213f1f9 ("Btrfs: qgroup: Introduce a may_use to account space_info->bytes_may_use.")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 08:34:26 +01:00
|
|
|
else
|
2017-12-12 08:34:27 +01:00
|
|
|
qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
|
2015-02-27 09:24:27 +01:00
|
|
|
|
|
|
|
qgroup_dirty(fs_info, qgroup);
|
|
|
|
|
|
|
|
/* Get all of the parent groups that contain this qgroup */
|
|
|
|
list_for_each_entry(glist, &qgroup->groups, next_group) {
|
|
|
|
ret = ulist_add(tmp, glist->group->qgroupid,
|
2016-10-26 16:23:50 +02:00
|
|
|
qgroup_to_aux(glist->group), GFP_ATOMIC);
|
2015-02-27 09:24:27 +01:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Iterate all of the parents and adjust their reference counts */
|
|
|
|
ULIST_ITER_INIT(&uiter);
|
|
|
|
while ((unode = ulist_next(tmp, &uiter))) {
|
2016-10-26 16:23:50 +02:00
|
|
|
qgroup = unode_aux_to_qgroup(unode);
|
2015-02-27 09:24:27 +01:00
|
|
|
qgroup->rfer += sign * num_bytes;
|
|
|
|
qgroup->rfer_cmpr += sign * num_bytes;
|
|
|
|
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
|
|
|
|
qgroup->excl += sign * num_bytes;
|
btrfs: qgroup: Fix wrong qgroup reservation update for relationship modification
When modifying qgroup relationship, for qgroup which only owns exclusive
extents, we will go through quick update path.
In this path, we will add/subtract exclusive and reference number for
parent qgroup, since the source (child) qgroup only has exclusive
extents, destination (parent) qgroup will also own or lose those extents
exclusively.
The same should be the same for reservation, since later reservation
adding/releasing will also affect parent qgroup, without the reservation
carried from child, parent will underflow reservation or have dead
reservation which will never be freed.
However original code doesn't do the same thing for reservation.
It handles qgroup reservation quite differently:
It removes qgroup reservation, as it's allocating space from the
reserved qgroup for relationship adding.
But does nothing for qgroup reservation if we're removing a qgroup
relationship.
According to the original code, it looks just like because we're adding
qgroup->rfer, the code assumes we're writing new data, so it's follows
the normal write routine, by reducing qgroup->reserved and adding
qgroup->rfer/excl.
This old behavior is wrong, and should be fixed to follow the same
excl/rfer behavior.
Just fix it by using the correct behavior described above.
Fixes: 31193213f1f9 ("Btrfs: qgroup: Introduce a may_use to account space_info->bytes_may_use.")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 08:34:26 +01:00
|
|
|
if (sign > 0)
|
2017-12-12 08:34:27 +01:00
|
|
|
qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
|
btrfs: qgroup: Fix wrong qgroup reservation update for relationship modification
When modifying qgroup relationship, for qgroup which only owns exclusive
extents, we will go through quick update path.
In this path, we will add/subtract exclusive and reference number for
parent qgroup, since the source (child) qgroup only has exclusive
extents, destination (parent) qgroup will also own or lose those extents
exclusively.
The same should be the same for reservation, since later reservation
adding/releasing will also affect parent qgroup, without the reservation
carried from child, parent will underflow reservation or have dead
reservation which will never be freed.
However original code doesn't do the same thing for reservation.
It handles qgroup reservation quite differently:
It removes qgroup reservation, as it's allocating space from the
reserved qgroup for relationship adding.
But does nothing for qgroup reservation if we're removing a qgroup
relationship.
According to the original code, it looks just like because we're adding
qgroup->rfer, the code assumes we're writing new data, so it's follows
the normal write routine, by reducing qgroup->reserved and adding
qgroup->rfer/excl.
This old behavior is wrong, and should be fixed to follow the same
excl/rfer behavior.
Just fix it by using the correct behavior described above.
Fixes: 31193213f1f9 ("Btrfs: qgroup: Introduce a may_use to account space_info->bytes_may_use.")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 08:34:26 +01:00
|
|
|
else
|
2017-12-12 08:34:27 +01:00
|
|
|
qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
|
2015-02-27 09:24:27 +01:00
|
|
|
qgroup->excl_cmpr += sign * num_bytes;
|
|
|
|
qgroup_dirty(fs_info, qgroup);
|
|
|
|
|
|
|
|
/* Add any parents of the parents */
|
|
|
|
list_for_each_entry(glist, &qgroup->groups, next_group) {
|
|
|
|
ret = ulist_add(tmp, glist->group->qgroupid,
|
2016-10-26 16:23:50 +02:00
|
|
|
qgroup_to_aux(glist->group), GFP_ATOMIC);
|
2015-02-27 09:24:27 +01:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Quick path for updating qgroup with only excl refs.
|
|
|
|
*
|
|
|
|
* In that case, just update all parent will be enough.
|
|
|
|
* Or we needs to do a full rescan.
|
|
|
|
* Caller should also hold fs_info->qgroup_lock.
|
|
|
|
*
|
|
|
|
* Return 0 for quick update, return >0 for need to full rescan
|
|
|
|
* and mark INCONSISTENT flag.
|
|
|
|
* Return < 0 for other error.
|
|
|
|
*/
|
|
|
|
static int quick_update_accounting(struct btrfs_fs_info *fs_info,
|
|
|
|
struct ulist *tmp, u64 src, u64 dst,
|
|
|
|
int sign)
|
|
|
|
{
|
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
int ret = 1;
|
|
|
|
int err = 0;
|
|
|
|
|
|
|
|
qgroup = find_qgroup_rb(fs_info, src);
|
|
|
|
if (!qgroup)
|
|
|
|
goto out;
|
|
|
|
if (qgroup->excl == qgroup->rfer) {
|
|
|
|
ret = 0;
|
|
|
|
err = __qgroup_excl_accounting(fs_info, tmp, dst,
|
btrfs: qgroup: Fix wrong qgroup reservation update for relationship modification
When modifying qgroup relationship, for qgroup which only owns exclusive
extents, we will go through quick update path.
In this path, we will add/subtract exclusive and reference number for
parent qgroup, since the source (child) qgroup only has exclusive
extents, destination (parent) qgroup will also own or lose those extents
exclusively.
The same should be the same for reservation, since later reservation
adding/releasing will also affect parent qgroup, without the reservation
carried from child, parent will underflow reservation or have dead
reservation which will never be freed.
However original code doesn't do the same thing for reservation.
It handles qgroup reservation quite differently:
It removes qgroup reservation, as it's allocating space from the
reserved qgroup for relationship adding.
But does nothing for qgroup reservation if we're removing a qgroup
relationship.
According to the original code, it looks just like because we're adding
qgroup->rfer, the code assumes we're writing new data, so it's follows
the normal write routine, by reducing qgroup->reserved and adding
qgroup->rfer/excl.
This old behavior is wrong, and should be fixed to follow the same
excl/rfer behavior.
Just fix it by using the correct behavior described above.
Fixes: 31193213f1f9 ("Btrfs: qgroup: Introduce a may_use to account space_info->bytes_may_use.")
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 08:34:26 +01:00
|
|
|
qgroup, sign);
|
2015-02-27 09:24:27 +01:00
|
|
|
if (err < 0) {
|
|
|
|
ret = err;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
if (ret)
|
|
|
|
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-07-18 08:45:30 +02:00
|
|
|
/*
 * Make qgroup @src a member of qgroup @dst.
 *
 * Both relation items (src -> dst and dst -> src) are inserted through
 * @trans, the in-memory relation is added under fs_info->qgroup_lock and
 * the accounting is updated via quick_update_accounting().
 *
 * Return 0 on success and > 0 when the relation was added but a full
 * rescan is needed.
 * Return -EINVAL if @src is not on a lower level than @dst, if
 * fs_info->quota_root is NULL or if either qgroup cannot be found.
 * Return -EEXIST if @src already is a member of @dst.
 * Return -ENOMEM if the temporary ulist cannot be allocated.
 * Any other negative value comes from the helpers called here.
 */
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
			      u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *child;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup_list *link;
	struct ulist *tmp;
	int ret;

	/* A member has to sit on a strictly lower level than its parent */
	if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
		return -EINVAL;

	tmp = ulist_alloc(GFP_KERNEL);
	if (!tmp)
		return -ENOMEM;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -EINVAL;
		goto unlock;
	}

	child = find_qgroup_rb(fs_info, src);
	parent = find_qgroup_rb(fs_info, dst);
	if (!child || !parent) {
		ret = -EINVAL;
		goto unlock;
	}

	/* Refuse to add a relation that is already in place */
	list_for_each_entry(link, &child->groups, next_group) {
		if (link->group == parent) {
			ret = -EEXIST;
			goto unlock;
		}
	}

	ret = add_qgroup_relation_item(trans, src, dst);
	if (ret)
		goto unlock;

	ret = add_qgroup_relation_item(trans, dst, src);
	if (ret) {
		/* Drop the first item again so no half relation is left */
		del_qgroup_relation_item(trans, src, dst);
		goto unlock;
	}

	spin_lock(&fs_info->qgroup_lock);
	ret = add_relation_rb(fs_info, src, dst);
	if (ret >= 0)
		ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
	spin_unlock(&fs_info->qgroup_lock);
unlock:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	ulist_free(tmp);
	return ret;
}
|
|
|
|
|
2018-07-18 08:45:31 +02:00
|
|
|
/*
 * Remove the membership of qgroup @src in qgroup @dst.
 *
 * Deletes both relation items (src -> dst and dst -> src) through @trans,
 * drops the in-memory relation and updates accounting via
 * quick_update_accounting(), the latter two under fs_info->qgroup_lock.
 *
 * This function does not take fs_info->qgroup_ioctl_lock itself; both
 * callers in this file acquire it before calling.
 *
 * Return 0 on success and > 0 when the relation was removed but a full
 * rescan is needed.
 * Return -ENOMEM if the temporary ulist cannot be allocated.
 * Return -EINVAL if fs_info->quota_root is NULL or either qgroup cannot be
 * found.
 * Return -ENOENT if @src is not a member of @dst.
 * Return the first error (other than -ENOENT) hit while deleting the
 * relation items; the in-memory relation is still removed in that case.
 */
static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
				 u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup *member;
	struct btrfs_qgroup_list *list;
	struct ulist *tmp;
	bool found = false;
	int ret;
	int err;

	tmp = ulist_alloc(GFP_KERNEL);
	if (!tmp)
		return -ENOMEM;

	if (!fs_info->quota_root) {
		ret = -EINVAL;
		goto out;
	}

	member = find_qgroup_rb(fs_info, src);
	parent = find_qgroup_rb(fs_info, dst);
	if (!member || !parent) {
		ret = -EINVAL;
		goto out;
	}

	/* Check that such a qgroup relation exists first */
	list_for_each_entry(list, &member->groups, next_group) {
		if (list->group == parent) {
			found = true;
			break;
		}
	}
	if (!found) {
		ret = -ENOENT;
		goto out;
	}

	/*
	 * Always try to delete both items. The relation is known to exist in
	 * memory, so a missing on-disk item (-ENOENT) is not treated as a
	 * failure; any other error is remembered and reported below.
	 */
	ret = del_qgroup_relation_item(trans, src, dst);
	if (ret == -ENOENT)
		ret = 0;
	err = del_qgroup_relation_item(trans, dst, src);
	if (err != -ENOENT && !ret)
		ret = err;

	spin_lock(&fs_info->qgroup_lock);
	del_relation_rb(fs_info, src, dst);
	err = quick_update_accounting(fs_info, tmp, src, dst, -1);
	spin_unlock(&fs_info->qgroup_lock);

	/*
	 * Do not let the accounting result overwrite an item deletion error;
	 * report it only when the items were handled cleanly.
	 */
	if (!ret)
		ret = err;
out:
	ulist_free(tmp);
	return ret;
}
|
|
|
|
|
2018-07-18 08:45:32 +02:00
|
|
|
/*
 * Remove the membership of qgroup @src in qgroup @dst.
 *
 * Locked wrapper: takes fs_info->qgroup_ioctl_lock around
 * __del_qgroup_relation() and returns its result unchanged.
 */
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
			      u64 dst)
{
	struct mutex *ioctl_lock = &trans->fs_info->qgroup_ioctl_lock;
	int ret;

	mutex_lock(ioctl_lock);
	ret = __del_qgroup_relation(trans, src, dst);
	mutex_unlock(ioctl_lock);

	return ret;
}
|
|
|
|
|
2018-07-18 08:45:33 +02:00
|
|
|
/*
 * Create the qgroup @qgroupid.
 *
 * The qgroup item is inserted into the quota root through @trans first,
 * then the in-memory qgroup is added under fs_info->qgroup_lock. The whole
 * operation runs under fs_info->qgroup_ioctl_lock.
 *
 * Return 0 on success.
 * Return -EINVAL if fs_info->quota_root is NULL.
 * Return -EEXIST if the qgroup is already known in memory.
 * Otherwise return the error from add_qgroup_item() or add_qgroup_rb().
 */
int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root;
	struct btrfs_qgroup *qgroup;
	int ret;

	mutex_lock(&fs_info->qgroup_ioctl_lock);

	quota_root = fs_info->quota_root;
	if (!quota_root) {
		ret = -EINVAL;
		goto unlock;
	}

	if (find_qgroup_rb(fs_info, qgroupid)) {
		ret = -EEXIST;
		goto unlock;
	}

	ret = add_qgroup_item(trans, quota_root, qgroupid);
	if (ret)
		goto unlock;

	spin_lock(&fs_info->qgroup_lock);
	qgroup = add_qgroup_rb(fs_info, qgroupid);
	spin_unlock(&fs_info->qgroup_lock);

	/* ret is 0 here, it only changes if the in-memory insert failed */
	if (IS_ERR(qgroup))
		ret = PTR_ERR(qgroup);
unlock:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}
|
|
|
|
|
2018-07-18 08:45:34 +02:00
|
|
|
/*
 * Remove the qgroup @qgroupid.
 *
 * A qgroup that still has member qgroups is not removed. Otherwise its
 * item is deleted through @trans, it is detached from every parent group
 * it belongs to and finally dropped from the in-memory tree. The whole
 * operation runs under fs_info->qgroup_ioctl_lock.
 *
 * Return -EINVAL if fs_info->quota_root is NULL.
 * Return -ENOENT if the qgroup is not known in memory.
 * Return -EBUSY if the qgroup still has members.
 * Otherwise return the last value obtained from del_qgroup_item() or
 * __del_qgroup_relation(). Note that -ENOENT from del_qgroup_item() does
 * not stop the removal and is returned as is when the qgroup has no
 * parent groups.
 */
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *victim;
	struct btrfs_qgroup_list *link;
	int ret;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -EINVAL;
		goto unlock;
	}

	victim = find_qgroup_rb(fs_info, qgroupid);
	if (!victim) {
		ret = -ENOENT;
		goto unlock;
	}

	/* A qgroup that still has children must not go away */
	if (!list_empty(&victim->members)) {
		ret = -EBUSY;
		goto unlock;
	}

	/* A missing on-disk item does not stop the in-memory teardown */
	ret = del_qgroup_item(trans, qgroupid);
	if (ret && ret != -ENOENT)
		goto unlock;

	/* Leave every parent group, one relation at a time */
	while (!list_empty(&victim->groups)) {
		link = list_first_entry(&victim->groups,
					struct btrfs_qgroup_list, next_group);
		ret = __del_qgroup_relation(trans, qgroupid,
					    link->group->qgroupid);
		if (ret)
			goto unlock;
	}

	spin_lock(&fs_info->qgroup_lock);
	del_qgroup_rb(fs_info, qgroupid);
	spin_unlock(&fs_info->qgroup_lock);
unlock:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}
|
|
|
|
|
2018-07-18 08:45:35 +02:00
|
|
|
/*
 * Apply the limits described by @limit to the qgroup @qgroupid.
 *
 * For each limit selected in @limit->flags the matching value is copied
 * into the qgroup. A value of -1 clears that limit instead: the value is
 * reset to 0 and the flag is removed from the qgroup and from
 * @limit->flags, so @limit is modified by this function. The updated limit
 * item is then written through @trans.
 *
 * Return 0 on success.
 * Return -EINVAL if fs_info->quota_root is NULL.
 * Return -ENOENT if the qgroup cannot be found.
 * Otherwise return the error from update_qgroup_limit_item(); in that case
 * BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT is set as well.
 */
int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
		       struct btrfs_qgroup_limit *limit)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *qgroup;
	/* Special limit value that asks for the limit to be cleared */
	const u64 CLEAR_VALUE = -1;
	int ret;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -EINVAL;
		goto unlock;
	}

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (!qgroup) {
		ret = -ENOENT;
		goto unlock;
	}

	spin_lock(&fs_info->qgroup_lock);

	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
		if (limit->max_rfer != CLEAR_VALUE) {
			qgroup->max_rfer = limit->max_rfer;
		} else {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
			qgroup->max_rfer = 0;
		}
	}

	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
		if (limit->max_excl != CLEAR_VALUE) {
			qgroup->max_excl = limit->max_excl;
		} else {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
			qgroup->max_excl = 0;
		}
	}

	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
		if (limit->rsv_rfer != CLEAR_VALUE) {
			qgroup->rsv_rfer = limit->rsv_rfer;
		} else {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
			qgroup->rsv_rfer = 0;
		}
	}

	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
		if (limit->rsv_excl != CLEAR_VALUE) {
			qgroup->rsv_excl = limit->rsv_excl;
		} else {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
			qgroup->rsv_excl = 0;
		}
	}

	/* Cleared limits were masked out of limit->flags above */
	qgroup->lim_flags |= limit->flags;

	spin_unlock(&fs_info->qgroup_lock);

	ret = update_qgroup_limit_item(trans, qgroup);
	if (ret) {
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
		btrfs_info(fs_info, "unable to update quota limit for %llu",
			   qgroupid);
	}
unlock:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}
|
2014-07-17 21:39:01 +02:00
|
|
|
|
2016-10-18 03:31:27 +02:00
|
|
|
int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
|
btrfs: qgroup: Refactor btrfs_qgroup_insert_dirty_extent()
Refactor btrfs_qgroup_insert_dirty_extent() function, to two functions:
1. btrfs_qgroup_insert_dirty_extent_nolock()
Almost the same with original code.
For delayed_ref usage, which has delayed refs locked.
Change the return value type to int, since caller never needs the
pointer, but only needs to know if they need to free the allocated
memory.
2. btrfs_qgroup_insert_dirty_extent()
The more encapsulated version.
Will do the delayed_refs lock, memory allocation, quota enabled check
and other things.
The original design is to keep exported functions to minimal, but since
more btrfs hacks exposed, like replacing path in balance, we need to
record dirty extents manually, so we have to add such functions.
Also, add comment for both functions, to info developers how to keep
qgroup correct when doing hacks.
Cc: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-and-Tested-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-08-15 04:36:50 +02:00
|
|
|
struct btrfs_delayed_ref_root *delayed_refs,
|
|
|
|
struct btrfs_qgroup_extent_record *record)
|
2015-04-16 08:34:17 +02:00
|
|
|
{
|
|
|
|
struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
|
|
|
|
struct rb_node *parent_node = NULL;
|
|
|
|
struct btrfs_qgroup_extent_record *entry;
|
|
|
|
u64 bytenr = record->bytenr;
|
|
|
|
|
2018-03-16 02:21:22 +01:00
|
|
|
lockdep_assert_held(&delayed_refs->lock);
|
2016-10-18 03:31:27 +02:00
|
|
|
trace_btrfs_qgroup_trace_extent(fs_info, record);
|
2015-11-05 23:38:00 +01:00
|
|
|
|
2015-04-16 08:34:17 +02:00
|
|
|
while (*p) {
|
|
|
|
parent_node = *p;
|
|
|
|
entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
|
|
|
|
node);
|
|
|
|
if (bytenr < entry->bytenr)
|
|
|
|
p = &(*p)->rb_left;
|
|
|
|
else if (bytenr > entry->bytenr)
|
|
|
|
p = &(*p)->rb_right;
|
|
|
|
else
|
btrfs: qgroup: Refactor btrfs_qgroup_insert_dirty_extent()
Refactor btrfs_qgroup_insert_dirty_extent() function, to two functions:
1. btrfs_qgroup_insert_dirty_extent_nolock()
Almost the same with original code.
For delayed_ref usage, which has delayed refs locked.
Change the return value type to int, since caller never needs the
pointer, but only needs to know if they need to free the allocated
memory.
2. btrfs_qgroup_insert_dirty_extent()
The more encapsulated version.
Will do the delayed_refs lock, memory allocation, quota enabled check
and other things.
The original design is to keep exported functions to minimal, but since
more btrfs hacks exposed, like replacing path in balance, we need to
record dirty extents manually, so we have to add such functions.
Also, add comment for both functions, to info developers how to keep
qgroup correct when doing hacks.
Cc: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-and-Tested-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-08-15 04:36:50 +02:00
|
|
|
return 1;
|
2015-04-16 08:34:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
rb_link_node(&record->node, parent_node, p);
|
|
|
|
rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
|
btrfs: qgroup: Refactor btrfs_qgroup_insert_dirty_extent()
Refactor btrfs_qgroup_insert_dirty_extent() function, to two functions:
1. btrfs_qgroup_insert_dirty_extent_nolock()
Almost the same with original code.
For delayed_ref usage, which has delayed refs locked.
Change the return value type to int, since caller never needs the
pointer, but only needs to know if they need to free the allocated
memory.
2. btrfs_qgroup_insert_dirty_extent()
The more encapsulated version.
Will do the delayed_refs lock, memory allocation, quota enabled check
and other things.
The original design is to keep exported functions to minimal, but since
more btrfs hacks exposed, like replacing path in balance, we need to
record dirty extents manually, so we have to add such functions.
Also, add comment for both functions, to info developers how to keep
qgroup correct when doing hacks.
Cc: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-and-Tested-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2016-08-15 04:36:50 +02:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-02-15 03:43:03 +01:00
|
|
|
int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_qgroup_extent_record *qrecord)
|
|
|
|
{
|
|
|
|
struct ulist *old_root;
|
|
|
|
u64 bytenr = qrecord->bytenr;
|
|
|
|
int ret;
|
|
|
|
|
btrfs: add a flag to iterate_inodes_from_logical to find all extent refs for uncompressed extents
The LOGICAL_INO ioctl provides a backward mapping from extent bytenr and
offset (encoded as a single logical address) to a list of extent refs.
LOGICAL_INO complements TREE_SEARCH, which provides the forward mapping
(extent ref -> extent bytenr and offset, or logical address). These are
useful capabilities for programs that manipulate extents and extent
references from userspace (e.g. dedup and defrag utilities).
When the extents are uncompressed (and not encrypted and not other),
check_extent_in_eb performs filtering of the extent refs to remove any
extent refs which do not contain the same extent offset as the 'logical'
parameter's extent offset. This prevents LOGICAL_INO from returning
references to more than a single block.
To find the set of extent references to an uncompressed extent from [a, b),
userspace has to run a loop like this pseudocode:
for (i = a; i < b; ++i)
extent_ref_set += LOGICAL_INO(i);
At each iteration of the loop (up to 32768 iterations for a 128M extent),
data we are interested in is collected in the kernel, then deleted by
the filter in check_extent_in_eb.
When the extents are compressed (or encrypted or other), the 'logical'
parameter must be an extent bytenr (the 'a' parameter in the loop).
No filtering by extent offset is done (or possible?) so the result is
the complete set of extent refs for the entire extent. This removes
the need for the loop, since we get all the extent refs in one call.
Add an 'ignore_offset' argument to iterate_inodes_from_logical,
[...several levels of function call graph...], and check_extent_in_eb, so
that we can disable the extent offset filtering for uncompressed extents.
This flag can be set by an improved version of the LOGICAL_INO ioctl to
get either behavior as desired.
There is no functional change in this patch. The new flag is always
false.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: David Sterba <dsterba@suse.com>
[ minor coding style fixes ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 19:58:45 +02:00
|
|
|
ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root, false);
|
2018-01-29 14:53:01 +01:00
|
|
|
if (ret < 0) {
|
|
|
|
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"error accounting new delayed refs extent (err code: %d), quota inconsistent",
|
|
|
|
ret);
|
|
|
|
return 0;
|
|
|
|
}
|
2017-02-15 03:43:03 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Here we don't need to get the lock of
|
|
|
|
* trans->transaction->delayed_refs, since inserted qrecord won't
|
|
|
|
* be deleted, only qrecord->node may be modified (new qrecord insert)
|
|
|
|
*
|
|
|
|
* So modifying qrecord->old_roots is safe here
|
|
|
|
*/
|
|
|
|
qrecord->old_roots = old_root;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-07-18 10:28:03 +02:00
|
|
|
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
			      u64 num_bytes, gfp_t gfp_flag)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *ref_root;
	struct btrfs_qgroup_extent_record *qrec;
	int rc;

	/* Quota disabled, or a zero bytenr/length: nothing to record. */
	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;
	if (!bytenr || !num_bytes)
		return 0;

	qrec = kmalloc(sizeof(*qrec), gfp_flag);
	if (qrec == NULL)
		return -ENOMEM;

	qrec->bytenr = bytenr;
	qrec->num_bytes = num_bytes;
	qrec->old_roots = NULL;

	/* Hand the record to the nolock helper under delayed_refs->lock. */
	ref_root = &trans->transaction->delayed_refs;
	spin_lock(&ref_root->lock);
	rc = btrfs_qgroup_trace_extent_nolock(fs_info, ref_root, qrec);
	spin_unlock(&ref_root->lock);

	/*
	 * A positive return tells us the allocated record has to be freed
	 * by us; there is nothing further to do for it.
	 */
	if (rc > 0) {
		kfree(qrec);
		return 0;
	}

	return btrfs_qgroup_trace_extent_post(fs_info, qrec);
}
|
|
|
|
|
2016-10-18 03:31:28 +02:00
|
|
|
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
|
|
|
|
struct extent_buffer *eb)
|
|
|
|
{
|
2018-07-18 08:45:37 +02:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2016-10-18 03:31:28 +02:00
|
|
|
int nr = btrfs_header_nritems(eb);
|
|
|
|
int i, extent_type, ret;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_file_extent_item *fi;
|
|
|
|
u64 bytenr, num_bytes;
|
|
|
|
|
|
|
|
/* We can be called directly from walk_up_proc() */
|
2016-06-23 00:54:23 +02:00
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
|
2016-10-18 03:31:28 +02:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
for (i = 0; i < nr; i++) {
|
|
|
|
btrfs_item_key_to_cpu(eb, &key, i);
|
|
|
|
|
|
|
|
if (key.type != BTRFS_EXTENT_DATA_KEY)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
|
|
|
|
/* filter out non qgroup-accountable extents */
|
|
|
|
extent_type = btrfs_file_extent_type(eb, fi);
|
|
|
|
|
|
|
|
if (extent_type == BTRFS_FILE_EXTENT_INLINE)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
|
|
|
|
if (!bytenr)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
|
|
|
|
|
2018-07-18 10:28:03 +02:00
|
|
|
ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes,
|
|
|
|
GFP_NOFS);
|
2016-10-18 03:31:28 +02:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
2017-06-20 14:15:26 +02:00
|
|
|
cond_resched();
|
2016-10-18 03:31:28 +02:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Walk up the tree from the bottom, freeing leaves and any interior
|
|
|
|
* nodes which have had all slots visited. If a node (leaf or
|
|
|
|
* interior) is freed, the node above it will have its slot
|
|
|
|
* incremented. The root node will never be freed.
|
|
|
|
*
|
|
|
|
* At the end of this function, we should have a path which has all
|
|
|
|
* slots incremented to the next position for a search. If we need to
|
|
|
|
* read a new node it will be NULL and the node above it will have the
|
|
|
|
* correct slot selected for a later read.
|
|
|
|
*
|
|
|
|
* If we increment the root node's slot counter past the number of
|
|
|
|
* elements, 1 is returned to signal completion of the search.
|
|
|
|
*/
|
2017-02-10 20:30:23 +01:00
|
|
|
static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
|
2016-10-18 03:31:28 +02:00
|
|
|
{
|
|
|
|
int level = 0;
|
|
|
|
int nr, slot;
|
|
|
|
struct extent_buffer *eb;
|
|
|
|
|
|
|
|
if (root_level == 0)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
while (level <= root_level) {
|
|
|
|
eb = path->nodes[level];
|
|
|
|
nr = btrfs_header_nritems(eb);
|
|
|
|
path->slots[level]++;
|
|
|
|
slot = path->slots[level];
|
|
|
|
if (slot >= nr || level == 0) {
|
|
|
|
/*
|
|
|
|
* Don't free the root - we will detect this
|
|
|
|
* condition after our loop and return a
|
|
|
|
* positive value for caller to stop walking the tree.
|
|
|
|
*/
|
|
|
|
if (level != root_level) {
|
|
|
|
btrfs_tree_unlock_rw(eb, path->locks[level]);
|
|
|
|
path->locks[level] = 0;
|
|
|
|
|
|
|
|
free_extent_buffer(eb);
|
|
|
|
path->nodes[level] = NULL;
|
|
|
|
path->slots[level] = 0;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* We have a valid slot to walk back down
|
|
|
|
* from. Stop here so caller can process these
|
|
|
|
* new nodes.
|
|
|
|
*/
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
level++;
|
|
|
|
}
|
|
|
|
|
|
|
|
eb = path->nodes[root_level];
|
|
|
|
if (path->slots[root_level] >= btrfs_header_nritems(eb))
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
|
|
|
|
struct extent_buffer *root_eb,
|
|
|
|
u64 root_gen, int root_level)
|
|
|
|
{
|
2018-07-18 08:45:38 +02:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2016-10-18 03:31:28 +02:00
|
|
|
int ret = 0;
|
|
|
|
int level;
|
|
|
|
struct extent_buffer *eb = root_eb;
|
|
|
|
struct btrfs_path *path = NULL;
|
|
|
|
|
2017-07-12 08:42:19 +02:00
|
|
|
BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
|
2016-10-18 03:31:28 +02:00
|
|
|
BUG_ON(root_eb == NULL);
|
|
|
|
|
2016-06-23 00:54:23 +02:00
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
|
2016-10-18 03:31:28 +02:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (!extent_buffer_uptodate(root_eb)) {
|
2018-03-29 03:08:11 +02:00
|
|
|
ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL);
|
2016-10-18 03:31:28 +02:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (root_level == 0) {
|
2018-07-18 08:45:37 +02:00
|
|
|
ret = btrfs_qgroup_trace_leaf_items(trans, root_eb);
|
2016-10-18 03:31:28 +02:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Walk down the tree. Missing extent blocks are filled in as
|
|
|
|
* we go. Metadata is accounted every time we read a new
|
|
|
|
* extent block.
|
|
|
|
*
|
|
|
|
* When we reach a leaf, we account for file extent items in it,
|
|
|
|
* walk back up the tree (adjusting slot pointers as we go)
|
|
|
|
* and restart the search process.
|
|
|
|
*/
|
|
|
|
extent_buffer_get(root_eb); /* For path */
|
|
|
|
path->nodes[root_level] = root_eb;
|
|
|
|
path->slots[root_level] = 0;
|
|
|
|
path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
|
|
|
|
walk_down:
|
|
|
|
level = root_level;
|
|
|
|
while (level >= 0) {
|
|
|
|
if (path->nodes[level] == NULL) {
|
2018-03-29 03:08:11 +02:00
|
|
|
struct btrfs_key first_key;
|
2016-10-18 03:31:28 +02:00
|
|
|
int parent_slot;
|
|
|
|
u64 child_gen;
|
|
|
|
u64 child_bytenr;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We need to get child blockptr/gen from parent before
|
|
|
|
* we can read it.
|
|
|
|
*/
|
|
|
|
eb = path->nodes[level + 1];
|
|
|
|
parent_slot = path->slots[level + 1];
|
|
|
|
child_bytenr = btrfs_node_blockptr(eb, parent_slot);
|
|
|
|
child_gen = btrfs_node_ptr_generation(eb, parent_slot);
|
2018-03-29 03:08:11 +02:00
|
|
|
btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
|
2016-10-18 03:31:28 +02:00
|
|
|
|
2018-03-29 03:08:11 +02:00
|
|
|
eb = read_tree_block(fs_info, child_bytenr, child_gen,
|
|
|
|
level, &first_key);
|
2016-10-18 03:31:28 +02:00
|
|
|
if (IS_ERR(eb)) {
|
|
|
|
ret = PTR_ERR(eb);
|
|
|
|
goto out;
|
|
|
|
} else if (!extent_buffer_uptodate(eb)) {
|
|
|
|
free_extent_buffer(eb);
|
|
|
|
ret = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
path->nodes[level] = eb;
|
|
|
|
path->slots[level] = 0;
|
|
|
|
|
|
|
|
btrfs_tree_read_lock(eb);
|
|
|
|
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
|
|
|
|
path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
|
|
|
|
|
2018-07-18 10:28:03 +02:00
|
|
|
ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
|
2016-06-23 00:54:23 +02:00
|
|
|
fs_info->nodesize,
|
|
|
|
GFP_NOFS);
|
2016-10-18 03:31:28 +02:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (level == 0) {
|
2018-07-18 08:45:37 +02:00
|
|
|
ret = btrfs_qgroup_trace_leaf_items(trans,
|
|
|
|
path->nodes[level]);
|
2016-10-18 03:31:28 +02:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/* Nonzero return here means we completed our search */
|
2017-02-10 20:30:23 +01:00
|
|
|
ret = adjust_slots_upwards(path, root_level);
|
2016-10-18 03:31:28 +02:00
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
|
|
|
|
/* Restart search with new slots */
|
|
|
|
goto walk_down;
|
|
|
|
}
|
|
|
|
|
|
|
|
level--;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2015-04-12 10:52:34 +02:00
|
|
|
#define UPDATE_NEW 0
|
|
|
|
#define UPDATE_OLD 1
|
|
|
|
/*
|
|
|
|
* Walk all of the roots that point to the bytenr and adjust their refcnts.
|
|
|
|
*/
|
|
|
|
static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
				struct ulist *roots, struct ulist *tmp,
				struct ulist *qgroups, u64 seq, int update_old)
{
	struct ulist_iterator root_iter;
	struct ulist_node *root_node;
	int ret;

	if (roots == NULL)
		return 0;

	ULIST_ITER_INIT(&root_iter);
	while ((root_node = ulist_next(roots, &root_iter)) != NULL) {
		struct ulist_iterator walk_iter;
		struct ulist_node *walk_node;
		struct btrfs_qgroup *root_qg;

		/* Roots without a matching qgroup are skipped. */
		root_qg = find_qgroup_rb(fs_info, root_node->val);
		if (root_qg == NULL)
			continue;

		/*
		 * @tmp is the per-root work list, restarted for every root;
		 * @qgroups accumulates every qgroup touched across all roots.
		 */
		ulist_reinit(tmp);
		ret = ulist_add(qgroups, root_qg->qgroupid,
				qgroup_to_aux(root_qg), GFP_ATOMIC);
		if (ret < 0)
			return ret;
		ret = ulist_add(tmp, root_qg->qgroupid,
				qgroup_to_aux(root_qg), GFP_ATOMIC);
		if (ret < 0)
			return ret;

		/* @tmp grows while we iterate it, pulling in linked groups. */
		ULIST_ITER_INIT(&walk_iter);
		while ((walk_node = ulist_next(tmp, &walk_iter)) != NULL) {
			struct btrfs_qgroup *cur = unode_aux_to_qgroup(walk_node);
			struct btrfs_qgroup_list *link;

			if (update_old)
				btrfs_qgroup_update_old_refcnt(cur, seq, 1);
			else
				btrfs_qgroup_update_new_refcnt(cur, seq, 1);

			/* Queue every group linked from cur->groups. */
			list_for_each_entry(link, &cur->groups, next_group) {
				struct btrfs_qgroup *grp = link->group;

				ret = ulist_add(qgroups, grp->qgroupid,
						qgroup_to_aux(grp), GFP_ATOMIC);
				if (ret < 0)
					return ret;
				ret = ulist_add(tmp, grp->qgroupid,
						qgroup_to_aux(grp), GFP_ATOMIC);
				if (ret < 0)
					return ret;
			}
		}
	}

	return 0;
}
|
|
|
|
|
2015-04-12 10:59:57 +02:00
|
|
|
/*
|
|
|
|
* Update qgroup rfer/excl counters.
|
|
|
|
* Rfer update is easy, the code is self-explanatory.
|
2015-04-17 04:23:16 +02:00
|
|
|
*
|
2015-04-12 10:59:57 +02:00
|
|
|
* Excl update is tricky, the update is split into 2 parts.
|
|
|
|
* Part 1: Possible exclusive <-> sharing detect:
|
|
|
|
* | A | !A |
|
|
|
|
* -------------------------------------
|
|
|
|
* B | * | - |
|
|
|
|
* -------------------------------------
|
|
|
|
* !B | + | ** |
|
|
|
|
* -------------------------------------
|
|
|
|
*
|
|
|
|
* Conditions:
|
|
|
|
* A: cur_old_roots < nr_old_roots (not exclusive before)
|
|
|
|
* !A: cur_old_roots == nr_old_roots (possible exclusive before)
|
|
|
|
* B: cur_new_roots < nr_new_roots (not exclusive now)
|
2016-05-20 03:18:45 +02:00
|
|
|
* !B: cur_new_roots == nr_new_roots (possible exclusive now)
|
2015-04-12 10:59:57 +02:00
|
|
|
*
|
|
|
|
* Results:
|
|
|
|
* +: Possible sharing -> exclusive -: Possible exclusive -> sharing
|
|
|
|
* *: Definitely not changed. **: Possible unchanged.
|
|
|
|
*
|
|
|
|
* For !A and !B condition, the exception is cur_old/new_roots == 0 case.
|
|
|
|
*
|
|
|
|
* To make the logic clear, we first use condition A and B to split
|
|
|
|
* combination into 4 results.
|
|
|
|
*
|
|
|
|
* Then, for result "+" and "-", check old/new_roots == 0 case, as in them
|
|
|
|
* only one variant may be 0.
|
|
|
|
*
|
|
|
|
* Lastly, check result **, since there are 2 variants that may be 0, split them
|
|
|
|
* again(2x2).
|
|
|
|
* But this time we don't need to consider other things, the codes and logic
|
|
|
|
* is easy to understand now.
|
|
|
|
*/
|
|
|
|
static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
				  struct ulist *qgroups,
				  u64 nr_old_roots,
				  u64 nr_new_roots,
				  u64 num_bytes, u64 seq)
{
	struct ulist_iterator iter;
	struct ulist_node *node;

	ULIST_ITER_INIT(&iter);
	while ((node = ulist_next(qgroups, &iter)) != NULL) {
		struct btrfs_qgroup *qg = unode_aux_to_qgroup(node);
		const u64 old_cnt = btrfs_qgroup_get_old_refcnt(qg, seq);
		const u64 new_cnt = btrfs_qgroup_get_new_refcnt(qg, seq);
		/* "all": reached by every root; "part": by fewer than all. */
		const bool old_all = (old_cnt == nr_old_roots);
		const bool old_part = (old_cnt < nr_old_roots);
		const bool new_all = (new_cnt == nr_new_roots);
		const bool new_part = (new_cnt < nr_new_roots);
		bool excl_gain;
		bool excl_loss;
		bool changed = false;

		trace_qgroup_update_counters(fs_info, qg, old_cnt, new_cnt);

		/* Referenced bytes: the extent appeared in or vanished from qg. */
		if (old_cnt == 0 && new_cnt != 0) {
			qg->rfer += num_bytes;
			qg->rfer_cmpr += num_bytes;
			changed = true;
		} else if (old_cnt != 0 && new_cnt == 0) {
			qg->rfer -= num_bytes;
			qg->rfer_cmpr -= num_bytes;
			changed = true;
		}

		/*
		 * Exclusive bytes.
		 *
		 * Lost when qg was exclusive before (all old roots, at least
		 * one) and is now either shared or not referencing at all.
		 * Gained when qg is exclusive now (all new roots, at least
		 * one) and was either shared or not referencing before.
		 * The two conditions can never hold at the same time.
		 */
		excl_loss = old_all && old_cnt != 0 &&
			    (new_part || (new_all && new_cnt == 0));
		excl_gain = new_all && new_cnt != 0 &&
			    (old_part || (old_all && old_cnt == 0));

		if (excl_loss) {
			qg->excl -= num_bytes;
			qg->excl_cmpr -= num_bytes;
			changed = true;
		}
		if (excl_gain) {
			qg->excl += num_bytes;
			qg->excl_cmpr += num_bytes;
			changed = true;
		}

		if (changed)
			qgroup_dirty(fs_info, qg);
	}

	return 0;
}
|
|
|
|
|
2017-02-27 08:10:34 +01:00
|
|
|
/*
|
|
|
|
* Check if the @roots potentially is a list of fs tree roots
|
|
|
|
*
|
|
|
|
* Return 0 for definitely not a fs/subvol tree roots ulist
|
|
|
|
* Return 1 for possible fs/subvol tree roots in the list (considering an empty
|
|
|
|
* one as well)
|
|
|
|
*/
|
|
|
|
static int maybe_fs_roots(struct ulist *roots)
|
|
|
|
{
|
|
|
|
struct ulist_node *unode;
|
|
|
|
struct ulist_iterator uiter;
|
|
|
|
|
|
|
|
/* Empty one, still possible for fs roots */
|
|
|
|
if (!roots || roots->nnodes == 0)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
ULIST_ITER_INIT(&uiter);
|
|
|
|
unode = ulist_next(roots, &uiter);
|
|
|
|
if (!unode)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If it contains fs tree roots, then it must belong to fs/subvol
|
|
|
|
* trees.
|
|
|
|
* If it contains a non-fs tree, it won't be shared with fs/subvol trees.
|
|
|
|
*/
|
|
|
|
return is_fstree(unode->val);
|
|
|
|
}
|
|
|
|
|
2018-07-18 08:45:39 +02:00
|
|
|
int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
				u64 num_bytes, struct ulist *old_roots,
				struct ulist *new_roots)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct ulist *touched = NULL;	/* every qgroup whose refcnt changed */
	struct ulist *scratch = NULL;	/* work list for the refcnt walk */
	u64 nr_old_roots = 0;
	u64 nr_new_roots = 0;
	bool skip_for_rescan;
	u64 seq;
	int ret = 0;

	/*
	 * If quotas get disabled meanwhile, the resources still need to be
	 * freed, so we can't just return here.
	 */
	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		goto out_free;

	if (new_roots) {
		if (!maybe_fs_roots(new_roots))
			goto out_free;
		nr_new_roots = new_roots->nnodes;
	}
	if (old_roots) {
		if (!maybe_fs_roots(old_roots))
			goto out_free;
		nr_old_roots = old_roots->nnodes;
	}

	/* Quick exit, either not fs tree roots, or won't affect any qgroup */
	if (nr_old_roots == 0 && nr_new_roots == 0)
		goto out_free;

	BUG_ON(!fs_info->quota_root);

	trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
					  num_bytes, nr_old_roots,
					  nr_new_roots);

	touched = ulist_alloc(GFP_NOFS);
	if (touched == NULL) {
		ret = -ENOMEM;
		goto out_free;
	}
	scratch = ulist_alloc(GFP_NOFS);
	if (scratch == NULL) {
		ret = -ENOMEM;
		goto out_free;
	}

	/*
	 * While a rescan is flagged, extents at or beyond its current
	 * progress are not accounted here.
	 */
	mutex_lock(&fs_info->qgroup_rescan_lock);
	skip_for_rescan =
		(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) &&
		fs_info->qgroup_rescan_progress.objectid <= bytenr;
	mutex_unlock(&fs_info->qgroup_rescan_lock);
	if (skip_for_rescan) {
		ret = 0;
		goto out_free;
	}

	spin_lock(&fs_info->qgroup_lock);
	seq = fs_info->qgroup_seq;

	/* Update old refcnts using old_roots */
	ret = qgroup_update_refcnt(fs_info, old_roots, scratch, touched, seq,
				   UPDATE_OLD);
	if (ret < 0)
		goto out;

	/* Update new refcnts using new_roots */
	ret = qgroup_update_refcnt(fs_info, new_roots, scratch, touched, seq,
				   UPDATE_NEW);
	if (ret < 0)
		goto out;

	qgroup_update_counters(fs_info, touched, nr_old_roots, nr_new_roots,
			       num_bytes, seq);

	/* Bump qgroup_seq to avoid seq overlap */
	fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
out:
	spin_unlock(&fs_info->qgroup_lock);
out_free:
	/* Both root lists are always freed here, on every path. */
	ulist_free(scratch);
	ulist_free(touched);
	ulist_free(old_roots);
	ulist_free(new_roots);
	return ret;
}
|
|
|
|
|
2018-03-15 15:00:25 +01:00
|
|
|
int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
|
2015-04-16 09:37:33 +02:00
|
|
|
{
|
2018-03-15 15:00:25 +01:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2015-04-16 09:37:33 +02:00
|
|
|
struct btrfs_qgroup_extent_record *record;
|
|
|
|
struct btrfs_delayed_ref_root *delayed_refs;
|
|
|
|
struct ulist *new_roots = NULL;
|
|
|
|
struct rb_node *node;
|
2015-04-20 03:53:50 +02:00
|
|
|
u64 qgroup_to_skip;
|
2015-04-16 09:37:33 +02:00
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
delayed_refs = &trans->transaction->delayed_refs;
|
2015-04-20 03:53:50 +02:00
|
|
|
qgroup_to_skip = delayed_refs->qgroup_to_skip;
|
2015-04-16 09:37:33 +02:00
|
|
|
while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
|
|
|
|
record = rb_entry(node, struct btrfs_qgroup_extent_record,
|
|
|
|
node);
|
|
|
|
|
2016-06-09 23:27:55 +02:00
|
|
|
trace_btrfs_qgroup_account_extents(fs_info, record);
|
2016-03-30 02:19:55 +02:00
|
|
|
|
2015-04-16 09:37:33 +02:00
|
|
|
if (!ret) {
|
2017-02-27 08:10:35 +01:00
|
|
|
/*
|
|
|
|
* Old roots should be searched when inserting qgroup
|
|
|
|
* extent record
|
|
|
|
*/
|
|
|
|
if (WARN_ON(!record->old_roots)) {
|
|
|
|
/* Search commit root to find old_roots */
|
|
|
|
ret = btrfs_find_all_roots(NULL, fs_info,
|
|
|
|
record->bytenr, 0,
|
btrfs: add a flag to iterate_inodes_from_logical to find all extent refs for uncompressed extents
The LOGICAL_INO ioctl provides a backward mapping from extent bytenr and
offset (encoded as a single logical address) to a list of extent refs.
LOGICAL_INO complements TREE_SEARCH, which provides the forward mapping
(extent ref -> extent bytenr and offset, or logical address). These are
useful capabilities for programs that manipulate extents and extent
references from userspace (e.g. dedup and defrag utilities).
When the extents are uncompressed (and not encrypted and not other),
check_extent_in_eb performs filtering of the extent refs to remove any
extent refs which do not contain the same extent offset as the 'logical'
parameter's extent offset. This prevents LOGICAL_INO from returning
references to more than a single block.
To find the set of extent references to an uncompressed extent from [a, b),
userspace has to run a loop like this pseudocode:
for (i = a; i < b; ++i)
extent_ref_set += LOGICAL_INO(i);
At each iteration of the loop (up to 32768 iterations for a 128M extent),
data we are interested in is collected in the kernel, then deleted by
the filter in check_extent_in_eb.
When the extents are compressed (or encrypted or other), the 'logical'
parameter must be an extent bytenr (the 'a' parameter in the loop).
No filtering by extent offset is done (or possible?) so the result is
the complete set of extent refs for the entire extent. This removes
the need for the loop, since we get all the extent refs in one call.
Add an 'ignore_offset' argument to iterate_inodes_from_logical,
[...several levels of function call graph...], and check_extent_in_eb, so
that we can disable the extent offset filtering for uncompressed extents.
This flag can be set by an improved version of the LOGICAL_INO ioctl to
get either behavior as desired.
There is no functional change in this patch. The new flag is always
false.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: David Sterba <dsterba@suse.com>
[ minor coding style fixes ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 19:58:45 +02:00
|
|
|
&record->old_roots, false);
|
2017-02-27 08:10:35 +01:00
|
|
|
if (ret < 0)
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
2015-04-16 09:37:33 +02:00
|
|
|
/*
|
2017-03-16 17:04:34 +01:00
|
|
|
* Use SEQ_LAST as time_seq to do special search, which
|
2015-04-16 09:37:33 +02:00
|
|
|
* doesn't lock tree or delayed_refs and search current
|
|
|
|
* root. It's safe inside commit_transaction().
|
|
|
|
*/
|
|
|
|
ret = btrfs_find_all_roots(trans, fs_info,
|
btrfs: add a flag to iterate_inodes_from_logical to find all extent refs for uncompressed extents
The LOGICAL_INO ioctl provides a backward mapping from extent bytenr and
offset (encoded as a single logical address) to a list of extent refs.
LOGICAL_INO complements TREE_SEARCH, which provides the forward mapping
(extent ref -> extent bytenr and offset, or logical address). These are
useful capabilities for programs that manipulate extents and extent
references from userspace (e.g. dedup and defrag utilities).
When the extents are uncompressed (and not encrypted and not other),
check_extent_in_eb performs filtering of the extent refs to remove any
extent refs which do not contain the same extent offset as the 'logical'
parameter's extent offset. This prevents LOGICAL_INO from returning
references to more than a single block.
To find the set of extent references to an uncompressed extent from [a, b),
userspace has to run a loop like this pseudocode:
for (i = a; i < b; ++i)
extent_ref_set += LOGICAL_INO(i);
At each iteration of the loop (up to 32768 iterations for a 128M extent),
data we are interested in is collected in the kernel, then deleted by
the filter in check_extent_in_eb.
When the extents are compressed (or encrypted or other), the 'logical'
parameter must be an extent bytenr (the 'a' parameter in the loop).
No filtering by extent offset is done (or possible?) so the result is
the complete set of extent refs for the entire extent. This removes
the need for the loop, since we get all the extent refs in one call.
Add an 'ignore_offset' argument to iterate_inodes_from_logical,
[...several levels of function call graph...], and check_extent_in_eb, so
that we can disable the extent offset filtering for uncompressed extents.
This flag can be set by an improved version of the LOGICAL_INO ioctl to
get either behavior as desired.
There is no functional change in this patch. The new flag is always
false.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: David Sterba <dsterba@suse.com>
[ minor coding style fixes ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 19:58:45 +02:00
|
|
|
record->bytenr, SEQ_LAST, &new_roots, false);
|
2015-04-16 09:37:33 +02:00
|
|
|
if (ret < 0)
|
|
|
|
goto cleanup;
|
2017-02-27 08:10:35 +01:00
|
|
|
if (qgroup_to_skip) {
|
2015-04-20 03:53:50 +02:00
|
|
|
ulist_del(new_roots, qgroup_to_skip, 0);
|
2017-02-27 08:10:35 +01:00
|
|
|
ulist_del(record->old_roots, qgroup_to_skip,
|
|
|
|
0);
|
|
|
|
}
|
2018-07-18 08:45:39 +02:00
|
|
|
ret = btrfs_qgroup_account_extent(trans, record->bytenr,
|
|
|
|
record->num_bytes,
|
|
|
|
record->old_roots,
|
|
|
|
new_roots);
|
2015-04-16 09:37:33 +02:00
|
|
|
record->old_roots = NULL;
|
|
|
|
new_roots = NULL;
|
|
|
|
}
|
|
|
|
cleanup:
|
|
|
|
ulist_free(record->old_roots);
|
|
|
|
ulist_free(new_roots);
|
|
|
|
new_roots = NULL;
|
|
|
|
rb_erase(node, &delayed_refs->dirty_extent_root);
|
|
|
|
kfree(record);
|
|
|
|
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
/*
|
|
|
|
* called from commit_transaction. Writes all changed qgroups to disk.
|
|
|
|
*/
|
2018-07-18 08:45:40 +02:00
|
|
|
int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
2018-07-18 08:45:40 +02:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_root *quota_root = fs_info->quota_root;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
if (!quota_root)
|
2018-01-31 09:52:04 +01:00
|
|
|
return ret;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
|
|
|
while (!list_empty(&fs_info->dirty_qgroups)) {
|
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
qgroup = list_first_entry(&fs_info->dirty_qgroups,
|
|
|
|
struct btrfs_qgroup, dirty);
|
|
|
|
list_del_init(&qgroup->dirty);
|
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
2018-07-18 08:45:28 +02:00
|
|
|
ret = update_qgroup_info_item(trans, qgroup);
|
2014-11-21 03:04:56 +01:00
|
|
|
if (ret)
|
|
|
|
fs_info->qgroup_flags |=
|
|
|
|
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
2018-07-18 08:45:27 +02:00
|
|
|
ret = update_qgroup_limit_item(trans, qgroup);
|
2012-06-28 18:03:02 +02:00
|
|
|
if (ret)
|
|
|
|
fs_info->qgroup_flags |=
|
|
|
|
BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
|
|
|
}
|
2016-09-02 21:40:02 +02:00
|
|
|
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
|
2012-06-28 18:03:02 +02:00
|
|
|
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
|
|
|
|
else
|
|
|
|
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
|
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
|
|
|
|
2018-07-18 08:45:29 +02:00
|
|
|
ret = update_qgroup_status_item(trans);
|
2012-06-28 18:03:02 +02:00
|
|
|
if (ret)
|
|
|
|
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2016-05-20 03:18:45 +02:00
|
|
|
* Copy the accounting information between qgroups. This is necessary
|
2016-03-31 02:57:48 +02:00
|
|
|
* when a snapshot or a subvolume is created. Throwing an error will
|
|
|
|
* cause a transaction abort so we take extra care here to only error
|
|
|
|
* when a readonly fs is a reasonable outcome.
|
2012-06-28 18:03:02 +02:00
|
|
|
*/
|
2018-07-18 08:45:41 +02:00
|
|
|
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
|
|
|
|
u64 objectid, struct btrfs_qgroup_inherit *inherit)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
int i;
|
|
|
|
u64 *i_qgroups;
|
btrfs: qgroup: Don't hold qgroup_ioctl_lock in btrfs_qgroup_inherit()
[ Upstream commit e88439debd0a7f969b3ddba6f147152cd0732676 ]
[BUG]
Lockdep will report the following circular locking dependency:
WARNING: possible circular locking dependency detected
5.2.0-rc2-custom #24 Tainted: G O
------------------------------------------------------
btrfs/8631 is trying to acquire lock:
000000002536438c (&fs_info->qgroup_ioctl_lock#2){+.+.}, at: btrfs_qgroup_inherit+0x40/0x620 [btrfs]
but task is already holding lock:
000000003d52cc23 (&fs_info->tree_log_mutex){+.+.}, at: create_pending_snapshot+0x8b6/0xe60 [btrfs]
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #2 (&fs_info->tree_log_mutex){+.+.}:
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_commit_transaction+0x475/0xa00 [btrfs]
btrfs_commit_super+0x71/0x80 [btrfs]
close_ctree+0x2bd/0x320 [btrfs]
btrfs_put_super+0x15/0x20 [btrfs]
generic_shutdown_super+0x72/0x110
kill_anon_super+0x18/0x30
btrfs_kill_super+0x16/0xa0 [btrfs]
deactivate_locked_super+0x3a/0x80
deactivate_super+0x51/0x60
cleanup_mnt+0x3f/0x80
__cleanup_mnt+0x12/0x20
task_work_run+0x94/0xb0
exit_to_usermode_loop+0xd8/0xe0
do_syscall_64+0x210/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #1 (&fs_info->reloc_mutex){+.+.}:
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_commit_transaction+0x40d/0xa00 [btrfs]
btrfs_quota_enable+0x2da/0x730 [btrfs]
btrfs_ioctl+0x2691/0x2b40 [btrfs]
do_vfs_ioctl+0xa9/0x6d0
ksys_ioctl+0x67/0x90
__x64_sys_ioctl+0x1a/0x20
do_syscall_64+0x65/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #0 (&fs_info->qgroup_ioctl_lock#2){+.+.}:
lock_acquire+0xa7/0x190
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_qgroup_inherit+0x40/0x620 [btrfs]
create_pending_snapshot+0x9d7/0xe60 [btrfs]
create_pending_snapshots+0x94/0xb0 [btrfs]
btrfs_commit_transaction+0x415/0xa00 [btrfs]
btrfs_mksubvol+0x496/0x4e0 [btrfs]
btrfs_ioctl_snap_create_transid+0x174/0x180 [btrfs]
btrfs_ioctl_snap_create_v2+0x11c/0x180 [btrfs]
btrfs_ioctl+0xa90/0x2b40 [btrfs]
do_vfs_ioctl+0xa9/0x6d0
ksys_ioctl+0x67/0x90
__x64_sys_ioctl+0x1a/0x20
do_syscall_64+0x65/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
other info that might help us debug this:
Chain exists of:
&fs_info->qgroup_ioctl_lock#2 --> &fs_info->reloc_mutex --> &fs_info->tree_log_mutex
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(&fs_info->tree_log_mutex);
lock(&fs_info->reloc_mutex);
lock(&fs_info->tree_log_mutex);
lock(&fs_info->qgroup_ioctl_lock#2);
*** DEADLOCK ***
6 locks held by btrfs/8631:
#0: 00000000ed8f23f6 (sb_writers#12){.+.+}, at: mnt_want_write_file+0x28/0x60
#1: 000000009fb1597a (&type->i_mutex_dir_key#10/1){+.+.}, at: btrfs_mksubvol+0x70/0x4e0 [btrfs]
#2: 0000000088c5ad88 (&fs_info->subvol_sem){++++}, at: btrfs_mksubvol+0x128/0x4e0 [btrfs]
#3: 000000009606fc3e (sb_internal#2){.+.+}, at: start_transaction+0x37a/0x520 [btrfs]
#4: 00000000f82bbdf5 (&fs_info->reloc_mutex){+.+.}, at: btrfs_commit_transaction+0x40d/0xa00 [btrfs]
#5: 000000003d52cc23 (&fs_info->tree_log_mutex){+.+.}, at: create_pending_snapshot+0x8b6/0xe60 [btrfs]
[CAUSE]
Due to the delayed subvolume creation, we need to call
btrfs_qgroup_inherit() inside the commit transaction code, with a lot
of other mutexes held.
This long lock chain can lead to the above problem.
[FIX]
On the other hand, we don't really need to hold qgroup_ioctl_lock if
we're in the context of create_pending_snapshot().
As in that context, we're the only one being able to modify qgroup.
All other qgroup functions which need qgroup_ioctl_lock are either
holding a transaction handle, or will start a new transaction:
Functions that will start a new transaction:
* btrfs_quota_enable()
* btrfs_quota_disable()
Functions that hold a transaction handle:
* btrfs_add_qgroup_relation()
* btrfs_del_qgroup_relation()
* btrfs_create_qgroup()
* btrfs_remove_qgroup()
* btrfs_limit_qgroup()
* btrfs_qgroup_inherit() call inside create_subvol()
So we have a higher level protection provided by transaction, thus we
don't need to always hold qgroup_ioctl_lock in btrfs_qgroup_inherit().
Only the btrfs_qgroup_inherit() call in create_subvol() needs to hold
qgroup_ioctl_lock, while the btrfs_qgroup_inherit() call in
create_pending_snapshot() is already protected by transaction.
So the fix is to detect the context by checking
trans->transaction->state.
If we're at TRANS_STATE_COMMIT_DOING, then we're in commit transaction
context and no need to get the mutex.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
2019-06-13 11:31:24 +02:00
|
|
|
bool committing = false;
|
2018-07-18 08:45:41 +02:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2018-11-19 17:20:34 +01:00
|
|
|
struct btrfs_root *quota_root;
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_qgroup *srcgroup;
|
|
|
|
struct btrfs_qgroup *dstgroup;
|
btrfs: qgroup: mark qgroup inconsistent if we're inheriting snapshot to a new qgroup
[ Upstream commit cbab8ade585a18c4334b085564d9d046e01a3f70 ]
[BUG]
For the following operation, qgroup is guaranteed to be screwed up due
to snapshot adding to a new qgroup:
# mkfs.btrfs -f $dev
# mount $dev $mnt
# btrfs qgroup en $mnt
# btrfs subv create $mnt/src
# xfs_io -f -c "pwrite 0 1m" $mnt/src/file
# sync
# btrfs qgroup create 1/0 $mnt/src
# btrfs subv snapshot -i 1/0 $mnt/src $mnt/snapshot
# btrfs qgroup show -prce $mnt/src
qgroupid rfer excl max_rfer max_excl parent child
-------- ---- ---- -------- -------- ------ -----
0/5 16.00KiB 16.00KiB none none --- ---
0/257 1.02MiB 16.00KiB none none --- ---
0/258 1.02MiB 16.00KiB none none 1/0 ---
1/0 0.00B 0.00B none none --- 0/258
^^^^^^^^^^^^^^^^^^^^
[CAUSE]
The problem is in btrfs_qgroup_inherit(), we don't have good enough
check to determine if the new relation would break the existing
accounting.
Unlike btrfs_add_qgroup_relation(), which has proper check to determine
if we can do quick update without a rescan, in btrfs_qgroup_inherit() we
can even assign a snapshot to multiple qgroups.
[FIX]
Fix it by manually marking qgroup inconsistent for snapshot inheritance.
For subvolume creation, since all its extents are exclusively owned, we
don't need to rescan.
In theory, we should call relation check like quick_update_accounting()
when doing qgroup inheritance and inform user about qgroup accounting
inconsistency.
But we don't have good mechanism to relay that back to the user in the
snapshot creation context, thus we can only silently mark the qgroup
inconsistent.
Anyway, user shouldn't use qgroup inheritance during snapshot creation,
and should add qgroup relationship after snapshot creation by 'btrfs
qgroup assign', which has a much better UI to inform user about qgroup
inconsistent and kick in rescan automatically.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
2020-04-02 08:37:35 +02:00
|
|
|
bool need_rescan = false;
|
2012-06-28 18:03:02 +02:00
|
|
|
u32 level_size = 0;
|
2013-04-07 12:50:19 +02:00
|
|
|
u64 nums;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
btrfs: qgroup: Don't hold qgroup_ioctl_lock in btrfs_qgroup_inherit()
[ Upstream commit e88439debd0a7f969b3ddba6f147152cd0732676 ]
[BUG]
Lockdep will report the following circular locking dependency:
WARNING: possible circular locking dependency detected
5.2.0-rc2-custom #24 Tainted: G O
------------------------------------------------------
btrfs/8631 is trying to acquire lock:
000000002536438c (&fs_info->qgroup_ioctl_lock#2){+.+.}, at: btrfs_qgroup_inherit+0x40/0x620 [btrfs]
but task is already holding lock:
000000003d52cc23 (&fs_info->tree_log_mutex){+.+.}, at: create_pending_snapshot+0x8b6/0xe60 [btrfs]
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #2 (&fs_info->tree_log_mutex){+.+.}:
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_commit_transaction+0x475/0xa00 [btrfs]
btrfs_commit_super+0x71/0x80 [btrfs]
close_ctree+0x2bd/0x320 [btrfs]
btrfs_put_super+0x15/0x20 [btrfs]
generic_shutdown_super+0x72/0x110
kill_anon_super+0x18/0x30
btrfs_kill_super+0x16/0xa0 [btrfs]
deactivate_locked_super+0x3a/0x80
deactivate_super+0x51/0x60
cleanup_mnt+0x3f/0x80
__cleanup_mnt+0x12/0x20
task_work_run+0x94/0xb0
exit_to_usermode_loop+0xd8/0xe0
do_syscall_64+0x210/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #1 (&fs_info->reloc_mutex){+.+.}:
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_commit_transaction+0x40d/0xa00 [btrfs]
btrfs_quota_enable+0x2da/0x730 [btrfs]
btrfs_ioctl+0x2691/0x2b40 [btrfs]
do_vfs_ioctl+0xa9/0x6d0
ksys_ioctl+0x67/0x90
__x64_sys_ioctl+0x1a/0x20
do_syscall_64+0x65/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #0 (&fs_info->qgroup_ioctl_lock#2){+.+.}:
lock_acquire+0xa7/0x190
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_qgroup_inherit+0x40/0x620 [btrfs]
create_pending_snapshot+0x9d7/0xe60 [btrfs]
create_pending_snapshots+0x94/0xb0 [btrfs]
btrfs_commit_transaction+0x415/0xa00 [btrfs]
btrfs_mksubvol+0x496/0x4e0 [btrfs]
btrfs_ioctl_snap_create_transid+0x174/0x180 [btrfs]
btrfs_ioctl_snap_create_v2+0x11c/0x180 [btrfs]
btrfs_ioctl+0xa90/0x2b40 [btrfs]
do_vfs_ioctl+0xa9/0x6d0
ksys_ioctl+0x67/0x90
__x64_sys_ioctl+0x1a/0x20
do_syscall_64+0x65/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
other info that might help us debug this:
Chain exists of:
&fs_info->qgroup_ioctl_lock#2 --> &fs_info->reloc_mutex --> &fs_info->tree_log_mutex
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(&fs_info->tree_log_mutex);
lock(&fs_info->reloc_mutex);
lock(&fs_info->tree_log_mutex);
lock(&fs_info->qgroup_ioctl_lock#2);
*** DEADLOCK ***
6 locks held by btrfs/8631:
#0: 00000000ed8f23f6 (sb_writers#12){.+.+}, at: mnt_want_write_file+0x28/0x60
#1: 000000009fb1597a (&type->i_mutex_dir_key#10/1){+.+.}, at: btrfs_mksubvol+0x70/0x4e0 [btrfs]
#2: 0000000088c5ad88 (&fs_info->subvol_sem){++++}, at: btrfs_mksubvol+0x128/0x4e0 [btrfs]
#3: 000000009606fc3e (sb_internal#2){.+.+}, at: start_transaction+0x37a/0x520 [btrfs]
#4: 00000000f82bbdf5 (&fs_info->reloc_mutex){+.+.}, at: btrfs_commit_transaction+0x40d/0xa00 [btrfs]
#5: 000000003d52cc23 (&fs_info->tree_log_mutex){+.+.}, at: create_pending_snapshot+0x8b6/0xe60 [btrfs]
[CAUSE]
Due to the delayed subvolume creation, we need to call
btrfs_qgroup_inherit() inside the commit transaction code, with a lot
of other mutexes held.
This long lock chain can lead to the above problem.
[FIX]
On the other hand, we don't really need to hold qgroup_ioctl_lock if
we're in the context of create_pending_snapshot().
As in that context, we're the only one being able to modify qgroup.
All other qgroup functions which need qgroup_ioctl_lock are either
holding a transaction handle, or will start a new transaction:
Functions that will start a new transaction:
* btrfs_quota_enable()
* btrfs_quota_disable()
Functions that hold a transaction handle:
* btrfs_add_qgroup_relation()
* btrfs_del_qgroup_relation()
* btrfs_create_qgroup()
* btrfs_remove_qgroup()
* btrfs_limit_qgroup()
* btrfs_qgroup_inherit() call inside create_subvol()
So we have a higher level protection provided by transaction, thus we
don't need to always hold qgroup_ioctl_lock in btrfs_qgroup_inherit().
Only the btrfs_qgroup_inherit() call in create_subvol() needs to hold
qgroup_ioctl_lock, while the btrfs_qgroup_inherit() call in
create_pending_snapshot() is already protected by transaction.
So the fix is to detect the context by checking
trans->transaction->state.
If we're at TRANS_STATE_COMMIT_DOING, then we're in commit transaction
context and no need to get the mutex.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
2019-06-13 11:31:24 +02:00
|
|
|
/*
|
|
|
|
* There are only two callers of this function.
|
|
|
|
*
|
|
|
|
* One in create_subvol() in the ioctl context, which needs to hold
|
|
|
|
* the qgroup_ioctl_lock.
|
|
|
|
*
|
|
|
|
* The other one in create_pending_snapshot() where no other qgroup
|
|
|
|
* code can modify the fs as they all need to either start a new trans
|
|
|
|
* or hold a trans handler, thus we don't need to hold
|
|
|
|
* qgroup_ioctl_lock.
|
|
|
|
* This would avoid long and complex lock chain and make lockdep happy.
|
|
|
|
*/
|
|
|
|
spin_lock(&fs_info->trans_lock);
|
|
|
|
if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
|
|
|
|
committing = true;
|
|
|
|
spin_unlock(&fs_info->trans_lock);
|
|
|
|
|
|
|
|
if (!committing)
|
|
|
|
mutex_lock(&fs_info->qgroup_ioctl_lock);
|
2016-09-02 21:40:02 +02:00
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
|
2013-04-07 12:50:16 +02:00
|
|
|
goto out;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2018-11-19 17:20:34 +01:00
|
|
|
quota_root = fs_info->quota_root;
|
2013-04-07 12:50:16 +02:00
|
|
|
if (!quota_root) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2013-04-07 12:50:19 +02:00
|
|
|
if (inherit) {
|
|
|
|
i_qgroups = (u64 *)(inherit + 1);
|
|
|
|
nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
|
|
|
|
2 * inherit->num_excl_copies;
|
|
|
|
for (i = 0; i < nums; ++i) {
|
|
|
|
srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
|
2014-11-11 13:18:22 +01:00
|
|
|
|
2016-03-31 02:57:48 +02:00
|
|
|
/*
|
|
|
|
* Zero out invalid groups so we can ignore
|
|
|
|
* them later.
|
|
|
|
*/
|
|
|
|
if (!srcgroup ||
|
|
|
|
((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
|
|
|
|
*i_qgroups = 0ULL;
|
|
|
|
|
2013-04-07 12:50:19 +02:00
|
|
|
++i_qgroups;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
/*
|
|
|
|
* create a tracking group for the subvol itself
|
|
|
|
*/
|
|
|
|
ret = add_qgroup_item(trans, quota_root, objectid);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* add qgroup to all inherited groups
|
|
|
|
*/
|
|
|
|
if (inherit) {
|
|
|
|
i_qgroups = (u64 *)(inherit + 1);
|
2016-03-31 02:57:48 +02:00
|
|
|
for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
|
|
|
|
if (*i_qgroups == 0)
|
|
|
|
continue;
|
2018-07-18 08:45:24 +02:00
|
|
|
ret = add_qgroup_relation_item(trans, objectid,
|
|
|
|
*i_qgroups);
|
2016-03-31 02:57:48 +02:00
|
|
|
if (ret && ret != -EEXIST)
|
2012-06-28 18:03:02 +02:00
|
|
|
goto out;
|
2018-07-18 08:45:24 +02:00
|
|
|
ret = add_qgroup_relation_item(trans, *i_qgroups,
|
|
|
|
objectid);
|
2016-03-31 02:57:48 +02:00
|
|
|
if (ret && ret != -EEXIST)
|
2012-06-28 18:03:02 +02:00
|
|
|
goto out;
|
|
|
|
}
|
2016-03-31 02:57:48 +02:00
|
|
|
ret = 0;
|
2012-06-28 18:03:02 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
|
|
|
|
|
|
|
dstgroup = add_qgroup_rb(fs_info, objectid);
|
2012-07-30 10:15:43 +02:00
|
|
|
if (IS_ERR(dstgroup)) {
|
|
|
|
ret = PTR_ERR(dstgroup);
|
2012-06-28 18:03:02 +02:00
|
|
|
goto unlock;
|
2012-07-30 10:15:43 +02:00
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2014-11-21 02:58:34 +01:00
|
|
|
if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
|
|
|
|
dstgroup->lim_flags = inherit->lim.flags;
|
|
|
|
dstgroup->max_rfer = inherit->lim.max_rfer;
|
|
|
|
dstgroup->max_excl = inherit->lim.max_excl;
|
|
|
|
dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
|
|
|
|
dstgroup->rsv_excl = inherit->lim.rsv_excl;
|
2014-11-21 03:01:41 +01:00
|
|
|
|
2018-07-18 08:45:27 +02:00
|
|
|
ret = update_qgroup_limit_item(trans, dstgroup);
|
2014-11-21 03:01:41 +01:00
|
|
|
if (ret) {
|
|
|
|
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
2016-09-20 16:05:00 +02:00
|
|
|
btrfs_info(fs_info,
|
|
|
|
"unable to update quota limit for %llu",
|
|
|
|
dstgroup->qgroupid);
|
2014-11-21 03:01:41 +01:00
|
|
|
goto unlock;
|
|
|
|
}
|
2014-11-21 02:58:34 +01:00
|
|
|
}
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
if (srcid) {
|
|
|
|
srcgroup = find_qgroup_rb(fs_info, srcid);
|
2012-09-15 02:06:30 +02:00
|
|
|
if (!srcgroup)
|
2012-06-28 18:03:02 +02:00
|
|
|
goto unlock;
|
2014-05-14 02:30:47 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We call inherit after we clone the root in order to make sure
|
|
|
|
* our counts don't go crazy, so at this point the only
|
|
|
|
* difference between the two roots should be the root node.
|
|
|
|
*/
|
2018-07-17 10:58:22 +02:00
|
|
|
level_size = fs_info->nodesize;
|
2014-05-14 02:30:47 +02:00
|
|
|
dstgroup->rfer = srcgroup->rfer;
|
|
|
|
dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
|
|
|
|
dstgroup->excl = level_size;
|
|
|
|
dstgroup->excl_cmpr = level_size;
|
2012-06-28 18:03:02 +02:00
|
|
|
srcgroup->excl = level_size;
|
|
|
|
srcgroup->excl_cmpr = level_size;
|
2014-11-21 02:14:38 +01:00
|
|
|
|
|
|
|
/* inherit the limit info */
|
|
|
|
dstgroup->lim_flags = srcgroup->lim_flags;
|
|
|
|
dstgroup->max_rfer = srcgroup->max_rfer;
|
|
|
|
dstgroup->max_excl = srcgroup->max_excl;
|
|
|
|
dstgroup->rsv_rfer = srcgroup->rsv_rfer;
|
|
|
|
dstgroup->rsv_excl = srcgroup->rsv_excl;
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
qgroup_dirty(fs_info, dstgroup);
|
|
|
|
qgroup_dirty(fs_info, srcgroup);
|
|
|
|
}
|
|
|
|
|
2012-09-15 02:06:30 +02:00
|
|
|
if (!inherit)
|
2012-06-28 18:03:02 +02:00
|
|
|
goto unlock;
|
|
|
|
|
|
|
|
i_qgroups = (u64 *)(inherit + 1);
|
|
|
|
for (i = 0; i < inherit->num_qgroups; ++i) {
|
2016-03-31 02:57:48 +02:00
|
|
|
if (*i_qgroups) {
|
2016-06-23 00:54:23 +02:00
|
|
|
ret = add_relation_rb(fs_info, objectid, *i_qgroups);
|
2016-03-31 02:57:48 +02:00
|
|
|
if (ret)
|
|
|
|
goto unlock;
|
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
++i_qgroups;
|
btrfs: qgroup: mark qgroup inconsistent if we're inheriting snapshot to a new qgroup
[ Upstream commit cbab8ade585a18c4334b085564d9d046e01a3f70 ]
[BUG]
For the following operation, qgroup is guaranteed to be screwed up due
to snapshot adding to a new qgroup:
# mkfs.btrfs -f $dev
# mount $dev $mnt
# btrfs qgroup en $mnt
# btrfs subv create $mnt/src
# xfs_io -f -c "pwrite 0 1m" $mnt/src/file
# sync
# btrfs qgroup create 1/0 $mnt/src
# btrfs subv snapshot -i 1/0 $mnt/src $mnt/snapshot
# btrfs qgroup show -prce $mnt/src
qgroupid rfer excl max_rfer max_excl parent child
-------- ---- ---- -------- -------- ------ -----
0/5 16.00KiB 16.00KiB none none --- ---
0/257 1.02MiB 16.00KiB none none --- ---
0/258 1.02MiB 16.00KiB none none 1/0 ---
1/0 0.00B 0.00B none none --- 0/258
^^^^^^^^^^^^^^^^^^^^
[CAUSE]
The problem is in btrfs_qgroup_inherit(), we don't have good enough
check to determine if the new relation would break the existing
accounting.
Unlike btrfs_add_qgroup_relation(), which has proper check to determine
if we can do quick update without a rescan, in btrfs_qgroup_inherit() we
can even assign a snapshot to multiple qgroups.
[FIX]
Fix it by manually marking qgroup inconsistent for snapshot inheritance.
For subvolume creation, since all its extents are exclusively owned, we
don't need to rescan.
In theory, we should call relation check like quick_update_accounting()
when doing qgroup inheritance and inform user about qgroup accounting
inconsistency.
But we don't have good mechanism to relay that back to the user in the
snapshot creation context, thus we can only silently mark the qgroup
inconsistent.
Anyway, user shouldn't use qgroup inheritance during snapshot creation,
and should add qgroup relationship after snapshot creation by 'btrfs
qgroup assign', which has a much better UI to inform user about qgroup
inconsistent and kick in rescan automatically.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
2020-04-02 08:37:35 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we're doing a snapshot, and adding the snapshot to a new
|
|
|
|
* qgroup, the numbers are guaranteed to be incorrect.
|
|
|
|
*/
|
|
|
|
if (srcid)
|
|
|
|
need_rescan = true;
|
2012-06-28 18:03:02 +02:00
|
|
|
}
|
|
|
|
|
2016-03-31 02:57:48 +02:00
|
|
|
for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_qgroup *src;
|
|
|
|
struct btrfs_qgroup *dst;
|
|
|
|
|
2016-03-31 02:57:48 +02:00
|
|
|
if (!i_qgroups[0] || !i_qgroups[1])
|
|
|
|
continue;
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
src = find_qgroup_rb(fs_info, i_qgroups[0]);
|
|
|
|
dst = find_qgroup_rb(fs_info, i_qgroups[1]);
|
|
|
|
|
|
|
|
if (!src || !dst) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto unlock;
|
|
|
|
}
|
|
|
|
|
|
|
|
dst->rfer = src->rfer - level_size;
|
|
|
|
dst->rfer_cmpr = src->rfer_cmpr - level_size;
|
btrfs: qgroup: mark qgroup inconsistent if we're inheriting snapshot to a new qgroup
[ Upstream commit cbab8ade585a18c4334b085564d9d046e01a3f70 ]
[BUG]
For the following operation, qgroup is guaranteed to be screwed up due
to snapshot adding to a new qgroup:
# mkfs.btrfs -f $dev
# mount $dev $mnt
# btrfs qgroup en $mnt
# btrfs subv create $mnt/src
# xfs_io -f -c "pwrite 0 1m" $mnt/src/file
# sync
# btrfs qgroup create 1/0 $mnt/src
# btrfs subv snapshot -i 1/0 $mnt/src $mnt/snapshot
# btrfs qgroup show -prce $mnt/src
qgroupid rfer excl max_rfer max_excl parent child
-------- ---- ---- -------- -------- ------ -----
0/5 16.00KiB 16.00KiB none none --- ---
0/257 1.02MiB 16.00KiB none none --- ---
0/258 1.02MiB 16.00KiB none none 1/0 ---
1/0 0.00B 0.00B none none --- 0/258
^^^^^^^^^^^^^^^^^^^^
[CAUSE]
The problem is in btrfs_qgroup_inherit(), we don't have good enough
check to determine if the new relation would break the existing
accounting.
Unlike btrfs_add_qgroup_relation(), which has proper check to determine
if we can do quick update without a rescan, in btrfs_qgroup_inherit() we
can even assign a snapshot to multiple qgroups.
[FIX]
Fix it by manually marking qgroup inconsistent for snapshot inheritance.
For subvolume creation, since all its extents are exclusively owned, we
don't need to rescan.
In theory, we should call relation check like quick_update_accounting()
when doing qgroup inheritance and inform user about qgroup accounting
inconsistency.
But we don't have good mechanism to relay that back to the user in the
snapshot creation context, thus we can only silently mark the qgroup
inconsistent.
Anyway, user shouldn't use qgroup inheritance during snapshot creation,
and should add qgroup relationship after snapshot creation by 'btrfs
qgroup assign', which has a much better UI to inform user about qgroup
inconsistent and kick in rescan automatically.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
2020-04-02 08:37:35 +02:00
|
|
|
|
|
|
|
/* Manually tweaking numbers certainly needs a rescan */
|
|
|
|
need_rescan = true;
|
2012-06-28 18:03:02 +02:00
|
|
|
}
|
2016-03-31 02:57:48 +02:00
|
|
|
for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) {
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_qgroup *src;
|
|
|
|
struct btrfs_qgroup *dst;
|
|
|
|
|
2016-03-31 02:57:48 +02:00
|
|
|
if (!i_qgroups[0] || !i_qgroups[1])
|
|
|
|
continue;
|
|
|
|
|
2012-06-28 18:03:02 +02:00
|
|
|
src = find_qgroup_rb(fs_info, i_qgroups[0]);
|
|
|
|
dst = find_qgroup_rb(fs_info, i_qgroups[1]);
|
|
|
|
|
|
|
|
if (!src || !dst) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto unlock;
|
|
|
|
}
|
|
|
|
|
|
|
|
dst->excl = src->excl + level_size;
|
|
|
|
dst->excl_cmpr = src->excl_cmpr + level_size;
|
btrfs: qgroup: mark qgroup inconsistent if we're inheriting snapshot to a new qgroup
[ Upstream commit cbab8ade585a18c4334b085564d9d046e01a3f70 ]
[BUG]
For the following operation, qgroup is guaranteed to be screwed up due
to snapshot adding to a new qgroup:
# mkfs.btrfs -f $dev
# mount $dev $mnt
# btrfs qgroup en $mnt
# btrfs subv create $mnt/src
# xfs_io -f -c "pwrite 0 1m" $mnt/src/file
# sync
# btrfs qgroup create 1/0 $mnt/src
# btrfs subv snapshot -i 1/0 $mnt/src $mnt/snapshot
# btrfs qgroup show -prce $mnt/src
qgroupid rfer excl max_rfer max_excl parent child
-------- ---- ---- -------- -------- ------ -----
0/5 16.00KiB 16.00KiB none none --- ---
0/257 1.02MiB 16.00KiB none none --- ---
0/258 1.02MiB 16.00KiB none none 1/0 ---
1/0 0.00B 0.00B none none --- 0/258
^^^^^^^^^^^^^^^^^^^^
[CAUSE]
The problem is in btrfs_qgroup_inherit(), we don't have good enough
check to determine if the new relation would break the existing
accounting.
Unlike btrfs_add_qgroup_relation(), which has proper check to determine
if we can do quick update without a rescan, in btrfs_qgroup_inherit() we
can even assign a snapshot to multiple qgroups.
[FIX]
Fix it by manually marking qgroup inconsistent for snapshot inheritance.
For subvolume creation, since all its extents are exclusively owned, we
don't need to rescan.
In theory, we should call relation check like quick_update_accounting()
when doing qgroup inheritance and inform user about qgroup accounting
inconsistency.
But we don't have good mechanism to relay that back to the user in the
snapshot creation context, thus we can only silently mark the qgroup
inconsistent.
Anyway, user shouldn't use qgroup inheritance during snapshot creation,
and should add qgroup relationship after snapshot creation by 'btrfs
qgroup assign', which has a much better UI to inform user about qgroup
inconsistent and kick in rescan automatically.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
2020-04-02 08:37:35 +02:00
|
|
|
need_rescan = true;
|
2012-06-28 18:03:02 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
unlock:
|
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
|
|
|
out:
|
btrfs: qgroup: Don't hold qgroup_ioctl_lock in btrfs_qgroup_inherit()
[ Upstream commit e88439debd0a7f969b3ddba6f147152cd0732676 ]
[BUG]
Lockdep will report the following circular locking dependency:
WARNING: possible circular locking dependency detected
5.2.0-rc2-custom #24 Tainted: G O
------------------------------------------------------
btrfs/8631 is trying to acquire lock:
000000002536438c (&fs_info->qgroup_ioctl_lock#2){+.+.}, at: btrfs_qgroup_inherit+0x40/0x620 [btrfs]
but task is already holding lock:
000000003d52cc23 (&fs_info->tree_log_mutex){+.+.}, at: create_pending_snapshot+0x8b6/0xe60 [btrfs]
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #2 (&fs_info->tree_log_mutex){+.+.}:
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_commit_transaction+0x475/0xa00 [btrfs]
btrfs_commit_super+0x71/0x80 [btrfs]
close_ctree+0x2bd/0x320 [btrfs]
btrfs_put_super+0x15/0x20 [btrfs]
generic_shutdown_super+0x72/0x110
kill_anon_super+0x18/0x30
btrfs_kill_super+0x16/0xa0 [btrfs]
deactivate_locked_super+0x3a/0x80
deactivate_super+0x51/0x60
cleanup_mnt+0x3f/0x80
__cleanup_mnt+0x12/0x20
task_work_run+0x94/0xb0
exit_to_usermode_loop+0xd8/0xe0
do_syscall_64+0x210/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #1 (&fs_info->reloc_mutex){+.+.}:
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_commit_transaction+0x40d/0xa00 [btrfs]
btrfs_quota_enable+0x2da/0x730 [btrfs]
btrfs_ioctl+0x2691/0x2b40 [btrfs]
do_vfs_ioctl+0xa9/0x6d0
ksys_ioctl+0x67/0x90
__x64_sys_ioctl+0x1a/0x20
do_syscall_64+0x65/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #0 (&fs_info->qgroup_ioctl_lock#2){+.+.}:
lock_acquire+0xa7/0x190
__mutex_lock+0x76/0x940
mutex_lock_nested+0x1b/0x20
btrfs_qgroup_inherit+0x40/0x620 [btrfs]
create_pending_snapshot+0x9d7/0xe60 [btrfs]
create_pending_snapshots+0x94/0xb0 [btrfs]
btrfs_commit_transaction+0x415/0xa00 [btrfs]
btrfs_mksubvol+0x496/0x4e0 [btrfs]
btrfs_ioctl_snap_create_transid+0x174/0x180 [btrfs]
btrfs_ioctl_snap_create_v2+0x11c/0x180 [btrfs]
btrfs_ioctl+0xa90/0x2b40 [btrfs]
do_vfs_ioctl+0xa9/0x6d0
ksys_ioctl+0x67/0x90
__x64_sys_ioctl+0x1a/0x20
do_syscall_64+0x65/0x240
entry_SYSCALL_64_after_hwframe+0x49/0xbe
other info that might help us debug this:
Chain exists of:
&fs_info->qgroup_ioctl_lock#2 --> &fs_info->reloc_mutex --> &fs_info->tree_log_mutex
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(&fs_info->tree_log_mutex);
lock(&fs_info->reloc_mutex);
lock(&fs_info->tree_log_mutex);
lock(&fs_info->qgroup_ioctl_lock#2);
*** DEADLOCK ***
6 locks held by btrfs/8631:
#0: 00000000ed8f23f6 (sb_writers#12){.+.+}, at: mnt_want_write_file+0x28/0x60
#1: 000000009fb1597a (&type->i_mutex_dir_key#10/1){+.+.}, at: btrfs_mksubvol+0x70/0x4e0 [btrfs]
#2: 0000000088c5ad88 (&fs_info->subvol_sem){++++}, at: btrfs_mksubvol+0x128/0x4e0 [btrfs]
#3: 000000009606fc3e (sb_internal#2){.+.+}, at: start_transaction+0x37a/0x520 [btrfs]
#4: 00000000f82bbdf5 (&fs_info->reloc_mutex){+.+.}, at: btrfs_commit_transaction+0x40d/0xa00 [btrfs]
#5: 000000003d52cc23 (&fs_info->tree_log_mutex){+.+.}, at: create_pending_snapshot+0x8b6/0xe60 [btrfs]
[CAUSE]
Due to the delayed subvolume creation, we need to call
btrfs_qgroup_inherit() inside the commit transaction code, with a lot of
other mutexes held.
This long lock chain can lead to the above problem.
[FIX]
On the other hand, we don't really need to hold qgroup_ioctl_lock if
we're in the context of create_pending_snapshot().
As in that context, we're the only one being able to modify qgroup.
All other qgroup functions which needs qgroup_ioctl_lock are either
holding a transaction handle, or will start a new transaction:
Functions will start a new transaction():
* btrfs_quota_enable()
* btrfs_quota_disable()
Functions hold a transaction handler:
* btrfs_add_qgroup_relation()
* btrfs_del_qgroup_relation()
* btrfs_create_qgroup()
* btrfs_remove_qgroup()
* btrfs_limit_qgroup()
* btrfs_qgroup_inherit() call inside create_subvol()
So we have a higher level protection provided by transaction, thus we
don't need to always hold qgroup_ioctl_lock in btrfs_qgroup_inherit().
Only the btrfs_qgroup_inherit() call in create_subvol() needs to hold
qgroup_ioctl_lock, while the btrfs_qgroup_inherit() call in
create_pending_snapshot() is already protected by transaction.
So the fix is to detect the context by checking
trans->transaction->state.
If we're at TRANS_STATE_COMMIT_DOING, then we're in commit transaction
context and no need to get the mutex.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
2019-06-13 11:31:24 +02:00
|
|
|
if (!committing)
|
|
|
|
mutex_unlock(&fs_info->qgroup_ioctl_lock);
|
btrfs: qgroup: mark qgroup inconsistent if we're inherting snapshot to a new qgroup
[ Upstream commit cbab8ade585a18c4334b085564d9d046e01a3f70 ]
[BUG]
For the following operation, qgroup is guaranteed to be screwed up due
to snapshot adding to a new qgroup:
# mkfs.btrfs -f $dev
# mount $dev $mnt
# btrfs qgroup en $mnt
# btrfs subv create $mnt/src
# xfs_io -f -c "pwrite 0 1m" $mnt/src/file
# sync
# btrfs qgroup create 1/0 $mnt/src
# btrfs subv snapshot -i 1/0 $mnt/src $mnt/snapshot
# btrfs qgroup show -prce $mnt/src
qgroupid rfer excl max_rfer max_excl parent child
-------- ---- ---- -------- -------- ------ -----
0/5 16.00KiB 16.00KiB none none --- ---
0/257 1.02MiB 16.00KiB none none --- ---
0/258 1.02MiB 16.00KiB none none 1/0 ---
1/0 0.00B 0.00B none none --- 0/258
^^^^^^^^^^^^^^^^^^^^
[CAUSE]
The problem is in btrfs_qgroup_inherit(): we don't have a good enough
check to determine if the new relation would break the existing
accounting.
Unlike btrfs_add_qgroup_relation(), which has proper check to determine
if we can do quick update without a rescan, in btrfs_qgroup_inherit() we
can even assign a snapshot to multiple qgroups.
[FIX]
Fix it by manually marking qgroup inconsistent for snapshot inheritance.
For subvolume creation, since all its extents are exclusively owned, we
don't need to rescan.
In theory, we should call relation check like quick_update_accounting()
when doing qgroup inheritance and inform user about qgroup accounting
inconsistency.
But we don't have good mechanism to relay that back to the user in the
snapshot creation context, thus we can only silently mark the qgroup
inconsistent.
Anyway, user shouldn't use qgroup inheritance during snapshot creation,
and should add qgroup relationship after snapshot creation by 'btrfs
qgroup assign', which has a much better UI to inform user about qgroup
inconsistent and kick in rescan automatically.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
2020-04-02 08:37:35 +02:00
|
|
|
if (need_rescan)
|
|
|
|
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
2012-06-28 18:03:02 +02:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2017-12-22 09:06:39 +01:00
|
|
|
/*
|
|
|
|
* Two limits to commit transaction in advance.
|
|
|
|
*
|
2019-01-25 00:55:27 +01:00
|
|
|
* For RATIO, it will be 1/RATIO of the remaining limit as threshold.
|
2017-12-22 09:06:39 +01:00
|
|
|
* For SIZE, it will be in byte unit as threshold.
|
|
|
|
*/
|
2019-01-25 00:55:27 +01:00
|
|
|
#define QGROUP_FREE_RATIO 32
|
|
|
|
#define QGROUP_FREE_SIZE SZ_32M
|
2017-12-22 09:06:39 +01:00
|
|
|
static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
|
|
|
|
const struct btrfs_qgroup *qg, u64 num_bytes)
|
2017-01-25 15:50:33 +01:00
|
|
|
{
|
2019-01-25 00:55:27 +01:00
|
|
|
u64 free;
|
2017-12-22 09:06:39 +01:00
|
|
|
u64 threshold;
|
|
|
|
|
2017-01-25 15:50:33 +01:00
|
|
|
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
|
2017-12-12 08:34:25 +01:00
|
|
|
qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
|
2017-01-25 15:50:33 +01:00
|
|
|
return false;
|
|
|
|
|
|
|
|
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
|
2017-12-12 08:34:25 +01:00
|
|
|
qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
|
2017-01-25 15:50:33 +01:00
|
|
|
return false;
|
|
|
|
|
2017-12-22 09:06:39 +01:00
|
|
|
/*
|
|
|
|
* Even if we passed the check, it's better to check if reservation
|
|
|
|
* for meta_pertrans is pushing us near limit.
|
|
|
|
* If there is too much pertrans reservation or it's near the limit,
|
|
|
|
* let's try commit transaction to free some, using transaction_kthread
|
|
|
|
*/
|
|
|
|
if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
|
|
|
|
BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
|
2019-01-25 00:55:27 +01:00
|
|
|
if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
|
|
|
|
free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl;
|
|
|
|
threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO,
|
|
|
|
QGROUP_FREE_SIZE);
|
|
|
|
} else {
|
|
|
|
free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer;
|
|
|
|
threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO,
|
|
|
|
QGROUP_FREE_SIZE);
|
|
|
|
}
|
2017-12-22 09:06:39 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Use transaction_kthread to commit transaction, so we no
|
|
|
|
* longer need to bother nested transaction nor lock context.
|
|
|
|
*/
|
2019-01-25 00:55:27 +01:00
|
|
|
if (free < threshold)
|
2017-12-22 09:06:39 +01:00
|
|
|
btrfs_commit_transaction_locksafe(fs_info);
|
|
|
|
}
|
|
|
|
|
2017-01-25 15:50:33 +01:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2017-12-12 08:34:25 +01:00
|
|
|
/*
 * Reserve @num_bytes of @type for the qgroup of @root and for every qgroup
 * reachable from it through its ->groups lists.
 *
 * Returns 0 without reserving anything when @root is not a fs tree,
 * @num_bytes is 0, there is no quota root, or no qgroup exists for @root.
 * Returns -EDQUOT when @enforce is set and a limit check fails, or a
 * negative value from ulist_add(). Limit enforcement is skipped when
 * BTRFS_FS_QUOTA_OVERRIDE is set and the caller has CAP_SYS_RESOURCE.
 */
static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
			  enum btrfs_qgroup_rsv_type type)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	const u64 ref_root = root->root_key.objectid;
	struct btrfs_qgroup *start;
	struct ulist_iterator iter;
	struct ulist_node *node;
	int ret = 0;

	if (!is_fstree(ref_root) || num_bytes == 0)
		return 0;

	if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) &&
	    capable(CAP_SYS_RESOURCE))
		enforce = false;

	spin_lock(&fs_info->qgroup_lock);
	if (!fs_info->quota_root)
		goto out;

	start = find_qgroup_rb(fs_info, ref_root);
	if (!start)
		goto out;

	/*
	 * First pass: collect every affected qgroup and check that none of
	 * them would exceed its limits.
	 */
	ulist_reinit(fs_info->qgroup_ulist);
	ret = ulist_add(fs_info->qgroup_ulist, start->qgroupid,
			qgroup_to_aux(start), GFP_ATOMIC);
	if (ret < 0)
		goto out;

	ULIST_ITER_INIT(&iter);
	while ((node = ulist_next(fs_info->qgroup_ulist, &iter))) {
		struct btrfs_qgroup *cur = unode_aux_to_qgroup(node);
		struct btrfs_qgroup_list *link;

		if (enforce && !qgroup_check_limits(fs_info, cur, num_bytes)) {
			ret = -EDQUOT;
			goto out;
		}

		/* Queue the groups listed in ->groups so they get visited too. */
		list_for_each_entry(link, &cur->groups, next_group) {
			ret = ulist_add(fs_info->qgroup_ulist,
					link->group->qgroupid,
					qgroup_to_aux(link->group), GFP_ATOMIC);
			if (ret < 0)
				goto out;
		}
	}
	/* Discard the non-negative result left over from the last ulist_add(). */
	ret = 0;

	/*
	 * Second pass: no limit was exceeded, so record the reservation in
	 * every collected qgroup.
	 */
	ULIST_ITER_INIT(&iter);
	while ((node = ulist_next(fs_info->qgroup_ulist, &iter))) {
		struct btrfs_qgroup *cur = unode_aux_to_qgroup(node);

		trace_qgroup_update_reserve(fs_info, cur, num_bytes, type);
		qgroup_rsv_add(fs_info, cur, num_bytes, type);
	}

out:
	spin_unlock(&fs_info->qgroup_lock);
	return ret;
}
|
|
|
|
|
2017-12-12 08:34:30 +01:00
|
|
|
/*
|
|
|
|
* Free @num_bytes of reserved space with @type for qgroup. (Normally level 0
|
|
|
|
* qgroup).
|
|
|
|
*
|
|
|
|
* Will handle all higher level qgroup too.
|
|
|
|
*
|
|
|
|
* NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup.
|
|
|
|
* This special case is only used for META_PERTRANS type.
|
|
|
|
*/
|
2015-09-08 11:08:37 +02:00
|
|
|
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
|
2017-12-12 08:34:23 +01:00
|
|
|
u64 ref_root, u64 num_bytes,
|
|
|
|
enum btrfs_qgroup_rsv_type type)
|
2012-06-28 18:03:02 +02:00
|
|
|
{
|
|
|
|
struct btrfs_root *quota_root;
|
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
struct ulist_node *unode;
|
|
|
|
struct ulist_iterator uiter;
|
2013-04-17 16:00:36 +02:00
|
|
|
int ret = 0;
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
if (!is_fstree(ref_root))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (num_bytes == 0)
|
|
|
|
return;
|
|
|
|
|
2017-12-12 08:34:30 +01:00
|
|
|
if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) {
|
|
|
|
WARN(1, "%s: Invalid type to free", __func__);
|
|
|
|
return;
|
|
|
|
}
|
2012-06-28 18:03:02 +02:00
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
|
|
|
|
|
|
|
quota_root = fs_info->quota_root;
|
|
|
|
if (!quota_root)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
qgroup = find_qgroup_rb(fs_info, ref_root);
|
|
|
|
if (!qgroup)
|
|
|
|
goto out;
|
|
|
|
|
2017-12-12 08:34:30 +01:00
|
|
|
if (num_bytes == (u64)-1)
|
2017-12-12 08:34:34 +01:00
|
|
|
/*
|
|
|
|
* We're freeing all pertrans rsv, get reserved value from
|
|
|
|
* level 0 qgroup as real num_bytes to free.
|
|
|
|
*/
|
2017-12-12 08:34:30 +01:00
|
|
|
num_bytes = qgroup->rsv.values[type];
|
|
|
|
|
2013-05-06 13:03:27 +02:00
|
|
|
ulist_reinit(fs_info->qgroup_ulist);
|
|
|
|
ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
|
2018-03-27 19:04:50 +02:00
|
|
|
qgroup_to_aux(qgroup), GFP_ATOMIC);
|
2013-04-17 16:00:36 +02:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2012-06-28 18:03:02 +02:00
|
|
|
ULIST_ITER_INIT(&uiter);
|
2013-05-06 13:03:27 +02:00
|
|
|
while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
|
2012-06-28 18:03:02 +02:00
|
|
|
struct btrfs_qgroup *qg;
|
|
|
|
struct btrfs_qgroup_list *glist;
|
|
|
|
|
2016-10-26 16:23:50 +02:00
|
|
|
qg = unode_aux_to_qgroup(unode);
|
2012-06-28 18:03:02 +02:00
|
|
|
|
2017-12-12 08:34:27 +01:00
|
|
|
trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes, type);
|
|
|
|
qgroup_rsv_release(fs_info, qg, num_bytes, type);
|
2012-06-28 18:03:02 +02:00
|
|
|
|
|
|
|
list_for_each_entry(glist, &qg->groups, next_group) {
|
2013-05-06 13:03:27 +02:00
|
|
|
ret = ulist_add(fs_info->qgroup_ulist,
|
|
|
|
glist->group->qgroupid,
|
2018-03-27 19:04:50 +02:00
|
|
|
qgroup_to_aux(glist->group), GFP_ATOMIC);
|
2013-04-17 16:00:36 +02:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2012-06-28 18:03:02 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
|
|
|
}
|
|
|
|
|
btrfs: qgroup: Finish rescan when hit the last leaf of extent tree
Under the following case, qgroup rescan can double account cowed tree
blocks:
In this case, extent tree only has one tree block.
-
| transid=5 last committed=4
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 4).
| Scan it, set qgroup_rescan_progress to the last
| EXTENT/META_ITEM + 1
| now qgroup_rescan_progress = A + 1.
|
| fs tree get CoWed, new tree block is at A + 16K
| transid 5 get committed
-
| transid=6 last committed=5
| btrfs_qgroup_rescan_worker()
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 5).
| scan it using qgroup_rescan_progress (A + 1).
| found new tree block beyond A, and it's fs tree block,
| account it to increase qgroup numbers.
-
In the above case, tree block A and tree block A + 16K get accounted twice,
while qgroup rescan should stop once it has already reached the last leaf,
rather than continuing to use its qgroup_rescan_progress.
Such a case can happen by just looping btrfs/017, and with some
probability it hits this double qgroup accounting problem.
Fix it by checking the path to determine if we should finish the qgroup
rescan, rather than relying on the next loop iteration to exit.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-05-14 03:38:13 +02:00
|
|
|
/*
|
|
|
|
* Check if the leaf is the last leaf. Which means all node pointers
|
|
|
|
* are at their last position.
|
|
|
|
*/
|
|
|
|
static bool is_last_leaf(struct btrfs_path *path)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
|
|
|
|
if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1)
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2013-04-25 18:04:51 +02:00
|
|
|
/*
|
|
|
|
* returns < 0 on error, 0 when more leafs are to be scanned.
|
2015-02-27 09:24:24 +01:00
|
|
|
* returns 1 when done.
|
2013-04-25 18:04:51 +02:00
|
|
|
*/
|
2018-07-18 08:45:42 +02:00
|
|
|
static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
|
|
|
|
struct btrfs_path *path)
|
2013-04-25 18:04:51 +02:00
|
|
|
{
|
2018-07-18 08:45:42 +02:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2013-04-25 18:04:51 +02:00
|
|
|
struct btrfs_key found;
|
2015-10-26 02:19:43 +01:00
|
|
|
struct extent_buffer *scratch_leaf = NULL;
|
2013-04-25 18:04:51 +02:00
|
|
|
struct ulist *roots = NULL;
|
2014-05-14 02:30:47 +02:00
|
|
|
u64 num_bytes;
|
btrfs: qgroup: Finish rescan when hit the last leaf of extent tree
Under the following case, qgroup rescan can double account cowed tree
blocks:
In this case, extent tree only has one tree block.
-
| transid=5 last committed=4
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 4).
| Scan it, set qgroup_rescan_progress to the last
| EXTENT/META_ITEM + 1
| now qgroup_rescan_progress = A + 1.
|
| fs tree get CoWed, new tree block is at A + 16K
| transid 5 get committed
-
| transid=6 last committed=5
| btrfs_qgroup_rescan_worker()
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 5).
| scan it using qgroup_rescan_progress (A + 1).
| found new tree block beyong A, and it's fs tree block,
| account it to increase qgroup numbers.
-
In above case, tree block A, and tree block A + 16K get accounted twice,
while qgroup rescan should stop when it already reach the last leaf,
other than continue using its qgroup_rescan_progress.
Such case could happen by just looping btrfs/017 and with some
possibility it can hit such double qgroup accounting problem.
Fix it by checking the path to determine if we should finish qgroup
rescan, other than relying on next loop to exit.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-05-14 03:38:13 +02:00
|
|
|
bool done;
|
2013-04-25 18:04:51 +02:00
|
|
|
int slot;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
mutex_lock(&fs_info->qgroup_rescan_lock);
|
|
|
|
ret = btrfs_search_slot_for_read(fs_info->extent_root,
|
|
|
|
&fs_info->qgroup_rescan_progress,
|
|
|
|
path, 1, 0);
|
|
|
|
|
2016-09-20 16:05:02 +02:00
|
|
|
btrfs_debug(fs_info,
|
|
|
|
"current progress key (%llu %u %llu), search_slot ret %d",
|
|
|
|
fs_info->qgroup_rescan_progress.objectid,
|
|
|
|
fs_info->qgroup_rescan_progress.type,
|
|
|
|
fs_info->qgroup_rescan_progress.offset, ret);
|
2013-04-25 18:04:51 +02:00
|
|
|
|
|
|
|
if (ret) {
|
|
|
|
/*
|
|
|
|
* The rescan is about to end, we will not be scanning any
|
|
|
|
* further blocks. We cannot unset the RESCAN flag here, because
|
|
|
|
* we want to commit the transaction if everything went well.
|
|
|
|
* To make the live accounting work in this phase, we set our
|
|
|
|
* scan progress pointer such that every real extent objectid
|
|
|
|
* will be smaller.
|
|
|
|
*/
|
|
|
|
fs_info->qgroup_rescan_progress.objectid = (u64)-1;
|
|
|
|
btrfs_release_path(path);
|
|
|
|
mutex_unlock(&fs_info->qgroup_rescan_lock);
|
|
|
|
return ret;
|
|
|
|
}
|
btrfs: qgroup: Finish rescan when hit the last leaf of extent tree
Under the following case, qgroup rescan can double account cowed tree
blocks:
In this case, extent tree only has one tree block.
-
| transid=5 last committed=4
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 4).
| Scan it, set qgroup_rescan_progress to the last
| EXTENT/META_ITEM + 1
| now qgroup_rescan_progress = A + 1.
|
| fs tree get CoWed, new tree block is at A + 16K
| transid 5 get committed
-
| transid=6 last committed=5
| btrfs_qgroup_rescan_worker()
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 5).
| scan it using qgroup_rescan_progress (A + 1).
| found new tree block beyong A, and it's fs tree block,
| account it to increase qgroup numbers.
-
In above case, tree block A, and tree block A + 16K get accounted twice,
while qgroup rescan should stop when it already reach the last leaf,
other than continue using its qgroup_rescan_progress.
Such case could happen by just looping btrfs/017 and with some
possibility it can hit such double qgroup accounting problem.
Fix it by checking the path to determine if we should finish qgroup
rescan, other than relying on next loop to exit.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-05-14 03:38:13 +02:00
|
|
|
done = is_last_leaf(path);
|
2013-04-25 18:04:51 +02:00
|
|
|
|
|
|
|
btrfs_item_key_to_cpu(path->nodes[0], &found,
|
|
|
|
btrfs_header_nritems(path->nodes[0]) - 1);
|
|
|
|
fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
|
|
|
|
|
2015-10-26 02:19:43 +01:00
|
|
|
scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
|
|
|
|
if (!scratch_leaf) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
mutex_unlock(&fs_info->qgroup_rescan_lock);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
extent_buffer_get(scratch_leaf);
|
|
|
|
btrfs_tree_read_lock(scratch_leaf);
|
|
|
|
btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK);
|
2013-04-25 18:04:51 +02:00
|
|
|
slot = path->slots[0];
|
|
|
|
btrfs_release_path(path);
|
|
|
|
mutex_unlock(&fs_info->qgroup_rescan_lock);
|
|
|
|
|
|
|
|
for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
|
|
|
|
btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
|
2014-01-23 22:45:10 +01:00
|
|
|
if (found.type != BTRFS_EXTENT_ITEM_KEY &&
|
|
|
|
found.type != BTRFS_METADATA_ITEM_KEY)
|
2013-04-25 18:04:51 +02:00
|
|
|
continue;
|
2014-01-23 22:45:10 +01:00
|
|
|
if (found.type == BTRFS_METADATA_ITEM_KEY)
|
2016-06-15 15:22:56 +02:00
|
|
|
num_bytes = fs_info->nodesize;
|
2014-01-23 22:45:10 +01:00
|
|
|
else
|
|
|
|
num_bytes = found.offset;
|
|
|
|
|
2014-05-14 02:30:47 +02:00
|
|
|
ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
|
btrfs: add a flag to iterate_inodes_from_logical to find all extent refs for uncompressed extents
The LOGICAL_INO ioctl provides a backward mapping from extent bytenr and
offset (encoded as a single logical address) to a list of extent refs.
LOGICAL_INO complements TREE_SEARCH, which provides the forward mapping
(extent ref -> extent bytenr and offset, or logical address). These are
useful capabilities for programs that manipulate extents and extent
references from userspace (e.g. dedup and defrag utilities).
When the extents are uncompressed (and not encrypted and not other),
check_extent_in_eb performs filtering of the extent refs to remove any
extent refs which do not contain the same extent offset as the 'logical'
parameter's extent offset. This prevents LOGICAL_INO from returning
references to more than a single block.
To find the set of extent references to an uncompressed extent from [a, b),
userspace has to run a loop like this pseudocode:
for (i = a; i < b; ++i)
extent_ref_set += LOGICAL_INO(i);
At each iteration of the loop (up to 32768 iterations for a 128M extent),
data we are interested in is collected in the kernel, then deleted by
the filter in check_extent_in_eb.
When the extents are compressed (or encrypted or other), the 'logical'
parameter must be an extent bytenr (the 'a' parameter in the loop).
No filtering by extent offset is done (or possible?) so the result is
the complete set of extent refs for the entire extent. This removes
the need for the loop, since we get all the extent refs in one call.
Add an 'ignore_offset' argument to iterate_inodes_from_logical,
[...several levels of function call graph...], and check_extent_in_eb, so
that we can disable the extent offset filtering for uncompressed extents.
This flag can be set by an improved version of the LOGICAL_INO ioctl to
get either behavior as desired.
There is no functional change in this patch. The new flag is always
false.
Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Reviewed-by: David Sterba <dsterba@suse.com>
[ minor coding style fixes ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-22 19:58:45 +02:00
|
|
|
&roots, false);
|
2013-04-25 18:04:51 +02:00
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
2015-04-13 05:02:16 +02:00
|
|
|
/* For rescan, just pass old_roots as NULL */
|
2018-07-18 08:45:39 +02:00
|
|
|
ret = btrfs_qgroup_account_extent(trans, found.objectid,
|
|
|
|
num_bytes, NULL, roots);
|
2015-04-13 05:02:16 +02:00
|
|
|
if (ret < 0)
|
2014-05-14 02:30:47 +02:00
|
|
|
goto out;
|
2013-04-25 18:04:51 +02:00
|
|
|
}
|
|
|
|
out:
|
2015-10-26 02:19:43 +01:00
|
|
|
if (scratch_leaf) {
|
|
|
|
btrfs_tree_read_unlock_blocking(scratch_leaf);
|
|
|
|
free_extent_buffer(scratch_leaf);
|
|
|
|
}
|
2013-04-25 18:04:51 +02:00
|
|
|
|
2018-06-27 12:19:55 +02:00
|
|
|
if (done && !ret) {
|
btrfs: qgroup: Finish rescan when hit the last leaf of extent tree
Under the following case, qgroup rescan can double account cowed tree
blocks:
In this case, extent tree only has one tree block.
-
| transid=5 last committed=4
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 4).
| Scan it, set qgroup_rescan_progress to the last
| EXTENT/META_ITEM + 1
| now qgroup_rescan_progress = A + 1.
|
| fs tree get CoWed, new tree block is at A + 16K
| transid 5 get committed
-
| transid=6 last committed=5
| btrfs_qgroup_rescan_worker()
| btrfs_qgroup_rescan_worker()
| |- btrfs_start_transaction()
| | transid = 5
| |- qgroup_rescan_leaf()
| |- btrfs_search_slot_for_read() on extent tree
| Get the only extent tree block from commit root (transid = 5).
| scan it using qgroup_rescan_progress (A + 1).
| found new tree block beyong A, and it's fs tree block,
| account it to increase qgroup numbers.
-
In above case, tree block A, and tree block A + 16K get accounted twice,
while qgroup rescan should stop when it already reach the last leaf,
other than continue using its qgroup_rescan_progress.
Such case could happen by just looping btrfs/017 and with some
possibility it can hit such double qgroup accounting problem.
Fix it by checking the path to determine if we should finish qgroup
rescan, other than relying on next loop to exit.
Reported-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2018-05-14 03:38:13 +02:00
|
|
|
ret = 1;
|
2018-06-27 12:19:55 +02:00
|
|
|
fs_info->qgroup_rescan_progress.objectid = (u64)-1;
|
|
|
|
}
|
2013-04-25 18:04:51 +02:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
btrfs: fix transaction leak and crash after RO remount caused by qgroup rescan
[ Upstream commit cb13eea3b49055bd78e6ddf39defd6340f7379fc ]
If we remount a filesystem in RO mode while the qgroup rescan worker is
running, we can end up having it still running after the remount is done,
and at unmount time we may end up with an open transaction that ends up
never getting committed. If that happens we end up with several memory
leaks and can crash when hardware acceleration is unavailable for crc32c.
Possibly it can lead to other nasty surprises too, due to use-after-free
issues.
The following steps explain how the problem happens.
1) We have a filesystem mounted in RW mode and the qgroup rescan worker is
running;
2) We remount the filesystem in RO mode, and never stop/pause the rescan
worker, so after the remount the rescan worker is still running. The
important detail here is that the rescan task is still running after
the remount operation committed any ongoing transaction through its
call to btrfs_commit_super();
3) The rescan is still running, and after the remount completed, the
rescan worker started a transaction, after it finished iterating all
leaves of the extent tree, to update the qgroup status item in the
quotas tree. It does not commit the transaction, it only releases its
handle on the transaction;
4) A filesystem unmount operation starts shortly after;
5) The unmount task, at close_ctree(), stops the transaction kthread,
which had not had a chance to commit the open transaction since it was
sleeping and the commit interval (default of 30 seconds) has not yet
elapsed since the last time it committed a transaction;
6) So after stopping the transaction kthread we still have the transaction
used to update the qgroup status item open. At close_ctree(), when the
filesystem is in RO mode and no transaction abort happened (or the
filesystem is in error mode), we do not expect to have any transaction
open, so we do not call btrfs_commit_super();
7) We then proceed to destroy the work queues, free the roots and block
groups, etc. After that we drop the last reference on the btree inode
by calling iput() on it. Since there are dirty pages for the btree
inode, corresponding to the COWed extent buffer for the quotas btree,
btree_write_cache_pages() is invoked to flush those dirty pages. This
results in creating a bio and submitting it, which makes us end up at
btrfs_submit_metadata_bio();
8) At btrfs_submit_metadata_bio() we end up at the if-then-else branch
that calls btrfs_wq_submit_bio(), because check_async_write() returned
a value of 1. This value of 1 is because we did not have hardware
acceleration available for crc32c, so BTRFS_FS_CSUM_IMPL_FAST was not
set in fs_info->flags;
9) Then at btrfs_wq_submit_bio() we call btrfs_queue_work() against the
workqueue at fs_info->workers, which was already freed before by the
call to btrfs_stop_all_workers() at close_ctree(). This results in an
invalid memory access due to a use-after-free, leading to a crash.
When this happens, before the crash there are several warnings triggered,
since we have reserved metadata space in a block group, the delayed refs
reservation, etc:
------------[ cut here ]------------
WARNING: CPU: 4 PID: 1729896 at fs/btrfs/block-group.c:125 btrfs_put_block_group+0x63/0xa0 [btrfs]
Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
CPU: 4 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:btrfs_put_block_group+0x63/0xa0 [btrfs]
Code: f0 01 00 00 48 39 c2 75 (...)
RSP: 0018:ffffb270826bbdd8 EFLAGS: 00010206
RAX: 0000000000000001 RBX: ffff947ed73e4000 RCX: ffff947ebc8b29c8
RDX: 0000000000000001 RSI: ffffffffc0b150a0 RDI: ffff947ebc8b2800
RBP: ffff947ebc8b2800 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ed73e4110
R13: ffff947ed73e4160 R14: ffff947ebc8b2988 R15: dead000000000100
FS: 00007f15edfea840(0000) GS:ffff9481ad600000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f37e2893320 CR3: 0000000138f68001 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
btrfs_free_block_groups+0x17f/0x2f0 [btrfs]
close_ctree+0x2ba/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f15ee221ee7
Code: ff 0b 00 f7 d8 64 89 01 48 (...)
RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7
RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000
RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0
R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000
R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last enabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace dd74718fef1ed5c6 ]---
------------[ cut here ]------------
WARNING: CPU: 2 PID: 1729896 at fs/btrfs/block-rsv.c:459 btrfs_release_global_block_rsv+0x70/0xc0 [btrfs]
Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
CPU: 2 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:btrfs_release_global_block_rsv+0x70/0xc0 [btrfs]
Code: 48 83 bb b0 03 00 00 00 (...)
RSP: 0018:ffffb270826bbdd8 EFLAGS: 00010206
RAX: 000000000033c000 RBX: ffff947ed73e4000 RCX: 0000000000000000
RDX: 0000000000000001 RSI: ffffffffc0b0d8c1 RDI: 00000000ffffffff
RBP: ffff947ebc8b7000 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ed73e4110
R13: ffff947ed73e5278 R14: dead000000000122 R15: dead000000000100
FS: 00007f15edfea840(0000) GS:ffff9481aca00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000561a79f76e20 CR3: 0000000138f68006 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
btrfs_free_block_groups+0x24c/0x2f0 [btrfs]
close_ctree+0x2ba/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f15ee221ee7
Code: ff 0b 00 f7 d8 64 89 01 (...)
RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7
RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000
RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0
R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000
R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last enabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace dd74718fef1ed5c7 ]---
------------[ cut here ]------------
WARNING: CPU: 2 PID: 1729896 at fs/btrfs/block-group.c:3377 btrfs_free_block_groups+0x25d/0x2f0 [btrfs]
Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
CPU: 5 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:btrfs_free_block_groups+0x25d/0x2f0 [btrfs]
Code: ad de 49 be 22 01 00 (...)
RSP: 0018:ffffb270826bbde8 EFLAGS: 00010206
RAX: ffff947ebeae1d08 RBX: ffff947ed73e4000 RCX: 0000000000000000
RDX: 0000000000000001 RSI: ffff947e9d823ae8 RDI: 0000000000000246
RBP: ffff947ebeae1d08 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ebeae1c00
R13: ffff947ed73e5278 R14: dead000000000122 R15: dead000000000100
FS: 00007f15edfea840(0000) GS:ffff9481ad200000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f1475d98ea8 CR3: 0000000138f68005 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
close_ctree+0x2ba/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f15ee221ee7
Code: ff 0b 00 f7 d8 64 89 (...)
RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7
RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000
RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0
R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000
R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last enabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace dd74718fef1ed5c8 ]---
BTRFS info (device sdc): space_info 4 has 268238848 free, is not full
BTRFS info (device sdc): space_info total=268435456, used=114688, pinned=0, reserved=16384, may_use=0, readonly=65536
BTRFS info (device sdc): global_block_rsv: size 0 reserved 0
BTRFS info (device sdc): trans_block_rsv: size 0 reserved 0
BTRFS info (device sdc): chunk_block_rsv: size 0 reserved 0
BTRFS info (device sdc): delayed_block_rsv: size 0 reserved 0
BTRFS info (device sdc): delayed_refs_rsv: size 524288 reserved 0
And the crash, which only happens when we do not have crc32c hardware
acceleration, produces the following trace immediately after those
warnings:
stack segment: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI
CPU: 2 PID: 1749129 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:btrfs_queue_work+0x36/0x190 [btrfs]
Code: 54 55 53 48 89 f3 (...)
RSP: 0018:ffffb27082443ae8 EFLAGS: 00010282
RAX: 0000000000000004 RBX: ffff94810ee9ad90 RCX: 0000000000000000
RDX: 0000000000000001 RSI: ffff94810ee9ad90 RDI: ffff947ed8ee75a0
RBP: a56b6b6b6b6b6b6b R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000007 R11: 0000000000000001 R12: ffff947fa9b435a8
R13: ffff94810ee9ad90 R14: 0000000000000000 R15: ffff947e93dc0000
FS: 00007f3cfe974840(0000) GS:ffff9481ac600000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f1b42995a70 CR3: 0000000127638003 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
btrfs_wq_submit_bio+0xb3/0xd0 [btrfs]
btrfs_submit_metadata_bio+0x44/0xc0 [btrfs]
submit_one_bio+0x61/0x70 [btrfs]
btree_write_cache_pages+0x414/0x450 [btrfs]
? kobject_put+0x9a/0x1d0
? trace_hardirqs_on+0x1b/0xf0
? _raw_spin_unlock_irqrestore+0x3c/0x60
? free_debug_processing+0x1e1/0x2b0
do_writepages+0x43/0xe0
? lock_acquired+0x199/0x490
__writeback_single_inode+0x59/0x650
writeback_single_inode+0xaf/0x120
write_inode_now+0x94/0xd0
iput+0x187/0x2b0
close_ctree+0x2c6/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f3cfebabee7
Code: ff 0b 00 f7 d8 64 89 01 (...)
RSP: 002b:00007ffc9c9a05f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 00007f3cfecd1264 RCX: 00007f3cfebabee7
RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 0000562b6b478000
RBP: 0000562b6b473a30 R08: 0000000000000000 R09: 00007f3cfec6cbe0
R10: 0000562b6b479fe0 R11: 0000000000000246 R12: 0000000000000000
R13: 0000562b6b478000 R14: 0000562b6b473b40 R15: 0000562b6b473c60
Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
---[ end trace dd74718fef1ed5cc ]---
Finally when we remove the btrfs module (rmmod btrfs), there are several
warnings about objects that were allocated from our slabs but were never
freed, consequence of the transaction that was never committed and got
leaked:
=============================================================================
BUG btrfs_delayed_ref_head (Tainted: G B W ): Objects remaining in btrfs_delayed_ref_head on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
INFO: Slab 0x0000000094c2ae56 objects=24 used=2 fp=0x000000002bfa2521 flags=0x17fffc000010200
CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
slab_err+0xb7/0xdc
? lock_acquired+0x199/0x490
__kmem_cache_shutdown+0x1ac/0x3c0
? lock_release+0x20e/0x4c0
kmem_cache_destroy+0x55/0x120
btrfs_delayed_ref_exit+0x11/0x35 [btrfs]
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
INFO: Object 0x0000000050cbdd61 @offset=12104
INFO: Allocated in btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] age=1894 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs]
btrfs_free_tree_block+0x128/0x360 [btrfs]
__btrfs_cow_block+0x489/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
btrfs_mount+0x13b/0x3e0 [btrfs]
INFO: Freed in __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] age=4292 cpu=2 pid=1729526
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0x1117/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
commit_cowonly_roots+0xfb/0x300 [btrfs]
btrfs_commit_transaction+0x367/0xc40 [btrfs]
sync_filesystem+0x74/0x90
generic_shutdown_super+0x22/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
INFO: Object 0x0000000086e9b0ff @offset=12776
INFO: Allocated in btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] age=1900 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs]
btrfs_alloc_tree_block+0x2bf/0x360 [btrfs]
alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs]
__btrfs_cow_block+0x12d/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
INFO: Freed in __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] age=3141 cpu=6 pid=1729803
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0x1117/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
btrfs_write_dirty_block_groups+0x17d/0x3d0 [btrfs]
commit_cowonly_roots+0x248/0x300 [btrfs]
btrfs_commit_transaction+0x367/0xc40 [btrfs]
close_ctree+0x113/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
kmem_cache_destroy btrfs_delayed_ref_head: Slab cache still has objects
CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
kmem_cache_destroy+0x119/0x120
btrfs_delayed_ref_exit+0x11/0x35 [btrfs]
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 0b (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
=============================================================================
BUG btrfs_delayed_tree_ref (Tainted: G B W ): Objects remaining in btrfs_delayed_tree_ref on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
INFO: Slab 0x0000000011f78dc0 objects=37 used=2 fp=0x0000000032d55d91 flags=0x17fffc000010200
CPU: 3 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
slab_err+0xb7/0xdc
? lock_acquired+0x199/0x490
__kmem_cache_shutdown+0x1ac/0x3c0
? lock_release+0x20e/0x4c0
kmem_cache_destroy+0x55/0x120
btrfs_delayed_ref_exit+0x1d/0x35 [btrfs]
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
INFO: Object 0x000000001a340018 @offset=4408
INFO: Allocated in btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] age=1917 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs]
btrfs_free_tree_block+0x128/0x360 [btrfs]
__btrfs_cow_block+0x489/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
btrfs_mount+0x13b/0x3e0 [btrfs]
INFO: Freed in __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] age=4167 cpu=4 pid=1729795
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0x63d/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
btrfs_commit_transaction+0x60/0xc40 [btrfs]
create_subvol+0x56a/0x990 [btrfs]
btrfs_mksubvol+0x3fb/0x4a0 [btrfs]
__btrfs_ioctl_snap_create+0x119/0x1a0 [btrfs]
btrfs_ioctl_snap_create+0x58/0x80 [btrfs]
btrfs_ioctl+0x1a92/0x36f0 [btrfs]
__x64_sys_ioctl+0x83/0xb0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
INFO: Object 0x000000002b46292a @offset=13648
INFO: Allocated in btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] age=1923 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs]
btrfs_alloc_tree_block+0x2bf/0x360 [btrfs]
alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs]
__btrfs_cow_block+0x12d/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
INFO: Freed in __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] age=3164 cpu=6 pid=1729803
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0x63d/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
commit_cowonly_roots+0xfb/0x300 [btrfs]
btrfs_commit_transaction+0x367/0xc40 [btrfs]
close_ctree+0x113/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
kmem_cache_destroy btrfs_delayed_tree_ref: Slab cache still has objects
CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
kmem_cache_destroy+0x119/0x120
btrfs_delayed_ref_exit+0x1d/0x35 [btrfs]
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
=============================================================================
BUG btrfs_delayed_extent_op (Tainted: G B W ): Objects remaining in btrfs_delayed_extent_op on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
INFO: Slab 0x00000000f145ce2f objects=22 used=1 fp=0x00000000af0f92cf flags=0x17fffc000010200
CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
slab_err+0xb7/0xdc
? lock_acquired+0x199/0x490
__kmem_cache_shutdown+0x1ac/0x3c0
? __mutex_unlock_slowpath+0x45/0x2a0
kmem_cache_destroy+0x55/0x120
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
INFO: Object 0x000000004cf95ea8 @offset=6264
INFO: Allocated in btrfs_alloc_tree_block+0x1e0/0x360 [btrfs] age=1931 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_alloc_tree_block+0x1e0/0x360 [btrfs]
alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs]
__btrfs_cow_block+0x12d/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
btrfs_mount+0x13b/0x3e0 [btrfs]
INFO: Freed in __btrfs_run_delayed_refs+0xabd/0x1290 [btrfs] age=3173 cpu=6 pid=1729803
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0xabd/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
commit_cowonly_roots+0xfb/0x300 [btrfs]
btrfs_commit_transaction+0x367/0xc40 [btrfs]
close_ctree+0x113/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
kmem_cache_destroy btrfs_delayed_extent_op: Slab cache still has objects
CPU: 3 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
kmem_cache_destroy+0x119/0x120
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
BTRFS: state leak: start 30408704 end 30425087 state 1 in tree 1 refs 1
Fix this issue by having the remount path stop the qgroup rescan worker
when we are remounting RO and teach the rescan worker to stop when a
remount is in progress. If later a remount in RW mode happens, we are
already resuming the qgroup rescan worker through the call to
btrfs_qgroup_rescan_resume(), so we do not need to worry about that.
Tested-by: Fabian Vogt <fvogt@suse.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
2020-12-14 11:10:45 +01:00
|
|
|
static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
return btrfs_fs_closing(fs_info) ||
|
|
|
|
test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
|
|
|
|
}
|
|
|
|
|
2014-02-28 03:46:19 +01:00
|
|
|
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
|
2013-04-25 18:04:51 +02:00
|
|
|
{
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restructures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronization problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 17:47:24 +02:00
|
|
|
struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
|
|
|
|
qgroup_rescan_work);
|
2013-04-25 18:04:51 +02:00
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_trans_handle *trans = NULL;
|
|
|
|
int err = -ENOMEM;
|
2015-02-27 09:24:25 +01:00
|
|
|
int ret = 0;
|
btrfs: fix transaction leak and crash after RO remount caused by qgroup rescan
[ Upstream commit cb13eea3b49055bd78e6ddf39defd6340f7379fc ]
If we remount a filesystem in RO mode while the qgroup rescan worker is
running, we can end up having it still running after the remount is done,
and at unmount time we may end up with an open transaction that ends up
never getting committed. If that happens we end up with several memory
leaks and can crash when hardware acceleration is unavailable for crc32c.
Possibly it can lead to other nasty surprises too, due to use-after-free
issues.
The following steps explain how the problem happens.
1) We have a filesystem mounted in RW mode and the qgroup rescan worker is
running;
2) We remount the filesystem in RO mode, and never stop/pause the rescan
worker, so after the remount the rescan worker is still running. The
important detail here is that the rescan task is still running after
the remount operation committed any ongoing transaction through its
call to btrfs_commit_super();
3) The rescan is still running, and after the remount completed, the
rescan worker started a transaction, after it finished iterating all
leaves of the extent tree, to update the qgroup status item in the
quotas tree. It does not commit the transaction, it only releases its
handle on the transaction;
4) A filesystem unmount operation starts shortly after;
5) The unmount task, at close_ctree(), stops the transaction kthread,
which had not had a chance to commit the open transaction since it was
sleeping and the commit interval (default of 30 seconds) has not yet
elapsed since the last time it committed a transaction;
6) So after stopping the transaction kthread we still have the transaction
used to update the qgroup status item open. At close_ctree(), when the
filesystem is in RO mode and no transaction abort happened (or the
filesystem is in error mode), we do not expect to have any transaction
open, so we do not call btrfs_commit_super();
7) We then proceed to destroy the work queues, free the roots and block
groups, etc. After that we drop the last reference on the btree inode
by calling iput() on it. Since there are dirty pages for the btree
inode, corresponding to the COWed extent buffer for the quotas btree,
btree_write_cache_pages() is invoked to flush those dirty pages. This
results in creating a bio and submitting it, which makes us end up at
btrfs_submit_metadata_bio();
8) At btrfs_submit_metadata_bio() we end up at the if-then-else branch
that calls btrfs_wq_submit_bio(), because check_async_write() returned
a value of 1. This value of 1 is because we did not have hardware
acceleration available for crc32c, so BTRFS_FS_CSUM_IMPL_FAST was not
set in fs_info->flags;
9) Then at btrfs_wq_submit_bio() we call btrfs_queue_work() against the
workqueue at fs_info->workers, which was already freed before by the
call to btrfs_stop_all_workers() at close_ctree(). This results in an
invalid memory access due to a use-after-free, leading to a crash.
When this happens, before the crash there are several warnings triggered,
since we have reserved metadata space in a block group, the delayed refs
reservation, etc:
------------[ cut here ]------------
WARNING: CPU: 4 PID: 1729896 at fs/btrfs/block-group.c:125 btrfs_put_block_group+0x63/0xa0 [btrfs]
Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
CPU: 4 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:btrfs_put_block_group+0x63/0xa0 [btrfs]
Code: f0 01 00 00 48 39 c2 75 (...)
RSP: 0018:ffffb270826bbdd8 EFLAGS: 00010206
RAX: 0000000000000001 RBX: ffff947ed73e4000 RCX: ffff947ebc8b29c8
RDX: 0000000000000001 RSI: ffffffffc0b150a0 RDI: ffff947ebc8b2800
RBP: ffff947ebc8b2800 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ed73e4110
R13: ffff947ed73e4160 R14: ffff947ebc8b2988 R15: dead000000000100
FS: 00007f15edfea840(0000) GS:ffff9481ad600000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f37e2893320 CR3: 0000000138f68001 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
btrfs_free_block_groups+0x17f/0x2f0 [btrfs]
close_ctree+0x2ba/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f15ee221ee7
Code: ff 0b 00 f7 d8 64 89 01 48 (...)
RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7
RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000
RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0
R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000
R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last enabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace dd74718fef1ed5c6 ]---
------------[ cut here ]------------
WARNING: CPU: 2 PID: 1729896 at fs/btrfs/block-rsv.c:459 btrfs_release_global_block_rsv+0x70/0xc0 [btrfs]
Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
CPU: 2 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:btrfs_release_global_block_rsv+0x70/0xc0 [btrfs]
Code: 48 83 bb b0 03 00 00 00 (...)
RSP: 0018:ffffb270826bbdd8 EFLAGS: 00010206
RAX: 000000000033c000 RBX: ffff947ed73e4000 RCX: 0000000000000000
RDX: 0000000000000001 RSI: ffffffffc0b0d8c1 RDI: 00000000ffffffff
RBP: ffff947ebc8b7000 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ed73e4110
R13: ffff947ed73e5278 R14: dead000000000122 R15: dead000000000100
FS: 00007f15edfea840(0000) GS:ffff9481aca00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000561a79f76e20 CR3: 0000000138f68006 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
btrfs_free_block_groups+0x24c/0x2f0 [btrfs]
close_ctree+0x2ba/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f15ee221ee7
Code: ff 0b 00 f7 d8 64 89 01 (...)
RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7
RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000
RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0
R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000
R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last enabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace dd74718fef1ed5c7 ]---
------------[ cut here ]------------
WARNING: CPU: 2 PID: 1729896 at fs/btrfs/block-group.c:3377 btrfs_free_block_groups+0x25d/0x2f0 [btrfs]
Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
CPU: 5 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:btrfs_free_block_groups+0x25d/0x2f0 [btrfs]
Code: ad de 49 be 22 01 00 (...)
RSP: 0018:ffffb270826bbde8 EFLAGS: 00010206
RAX: ffff947ebeae1d08 RBX: ffff947ed73e4000 RCX: 0000000000000000
RDX: 0000000000000001 RSI: ffff947e9d823ae8 RDI: 0000000000000246
RBP: ffff947ebeae1d08 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ebeae1c00
R13: ffff947ed73e5278 R14: dead000000000122 R15: dead000000000100
FS: 00007f15edfea840(0000) GS:ffff9481ad200000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f1475d98ea8 CR3: 0000000138f68005 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
close_ctree+0x2ba/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f15ee221ee7
Code: ff 0b 00 f7 d8 64 89 (...)
RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7
RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000
RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0
R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000
R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last enabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace dd74718fef1ed5c8 ]---
BTRFS info (device sdc): space_info 4 has 268238848 free, is not full
BTRFS info (device sdc): space_info total=268435456, used=114688, pinned=0, reserved=16384, may_use=0, readonly=65536
BTRFS info (device sdc): global_block_rsv: size 0 reserved 0
BTRFS info (device sdc): trans_block_rsv: size 0 reserved 0
BTRFS info (device sdc): chunk_block_rsv: size 0 reserved 0
BTRFS info (device sdc): delayed_block_rsv: size 0 reserved 0
BTRFS info (device sdc): delayed_refs_rsv: size 524288 reserved 0
And the crash, which only happens when we do not have crc32c hardware
acceleration, produces the following trace immediately after those
warnings:
stack segment: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI
CPU: 2 PID: 1749129 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:btrfs_queue_work+0x36/0x190 [btrfs]
Code: 54 55 53 48 89 f3 (...)
RSP: 0018:ffffb27082443ae8 EFLAGS: 00010282
RAX: 0000000000000004 RBX: ffff94810ee9ad90 RCX: 0000000000000000
RDX: 0000000000000001 RSI: ffff94810ee9ad90 RDI: ffff947ed8ee75a0
RBP: a56b6b6b6b6b6b6b R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000007 R11: 0000000000000001 R12: ffff947fa9b435a8
R13: ffff94810ee9ad90 R14: 0000000000000000 R15: ffff947e93dc0000
FS: 00007f3cfe974840(0000) GS:ffff9481ac600000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f1b42995a70 CR3: 0000000127638003 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
btrfs_wq_submit_bio+0xb3/0xd0 [btrfs]
btrfs_submit_metadata_bio+0x44/0xc0 [btrfs]
submit_one_bio+0x61/0x70 [btrfs]
btree_write_cache_pages+0x414/0x450 [btrfs]
? kobject_put+0x9a/0x1d0
? trace_hardirqs_on+0x1b/0xf0
? _raw_spin_unlock_irqrestore+0x3c/0x60
? free_debug_processing+0x1e1/0x2b0
do_writepages+0x43/0xe0
? lock_acquired+0x199/0x490
__writeback_single_inode+0x59/0x650
writeback_single_inode+0xaf/0x120
write_inode_now+0x94/0xd0
iput+0x187/0x2b0
close_ctree+0x2c6/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f3cfebabee7
Code: ff 0b 00 f7 d8 64 89 01 (...)
RSP: 002b:00007ffc9c9a05f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 00007f3cfecd1264 RCX: 00007f3cfebabee7
RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 0000562b6b478000
RBP: 0000562b6b473a30 R08: 0000000000000000 R09: 00007f3cfec6cbe0
R10: 0000562b6b479fe0 R11: 0000000000000246 R12: 0000000000000000
R13: 0000562b6b478000 R14: 0000562b6b473b40 R15: 0000562b6b473c60
Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
---[ end trace dd74718fef1ed5cc ]---
Finally when we remove the btrfs module (rmmod btrfs), there are several
warnings about objects that were allocated from our slabs but were never
freed, as a consequence of the transaction that was never committed and got
leaked:
=============================================================================
BUG btrfs_delayed_ref_head (Tainted: G B W ): Objects remaining in btrfs_delayed_ref_head on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
INFO: Slab 0x0000000094c2ae56 objects=24 used=2 fp=0x000000002bfa2521 flags=0x17fffc000010200
CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
slab_err+0xb7/0xdc
? lock_acquired+0x199/0x490
__kmem_cache_shutdown+0x1ac/0x3c0
? lock_release+0x20e/0x4c0
kmem_cache_destroy+0x55/0x120
btrfs_delayed_ref_exit+0x11/0x35 [btrfs]
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
INFO: Object 0x0000000050cbdd61 @offset=12104
INFO: Allocated in btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] age=1894 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs]
btrfs_free_tree_block+0x128/0x360 [btrfs]
__btrfs_cow_block+0x489/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
btrfs_mount+0x13b/0x3e0 [btrfs]
INFO: Freed in __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] age=4292 cpu=2 pid=1729526
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0x1117/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
commit_cowonly_roots+0xfb/0x300 [btrfs]
btrfs_commit_transaction+0x367/0xc40 [btrfs]
sync_filesystem+0x74/0x90
generic_shutdown_super+0x22/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
INFO: Object 0x0000000086e9b0ff @offset=12776
INFO: Allocated in btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] age=1900 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs]
btrfs_alloc_tree_block+0x2bf/0x360 [btrfs]
alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs]
__btrfs_cow_block+0x12d/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
INFO: Freed in __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] age=3141 cpu=6 pid=1729803
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0x1117/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
btrfs_write_dirty_block_groups+0x17d/0x3d0 [btrfs]
commit_cowonly_roots+0x248/0x300 [btrfs]
btrfs_commit_transaction+0x367/0xc40 [btrfs]
close_ctree+0x113/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
kmem_cache_destroy btrfs_delayed_ref_head: Slab cache still has objects
CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
kmem_cache_destroy+0x119/0x120
btrfs_delayed_ref_exit+0x11/0x35 [btrfs]
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 0b (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
=============================================================================
BUG btrfs_delayed_tree_ref (Tainted: G B W ): Objects remaining in btrfs_delayed_tree_ref on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
INFO: Slab 0x0000000011f78dc0 objects=37 used=2 fp=0x0000000032d55d91 flags=0x17fffc000010200
CPU: 3 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
slab_err+0xb7/0xdc
? lock_acquired+0x199/0x490
__kmem_cache_shutdown+0x1ac/0x3c0
? lock_release+0x20e/0x4c0
kmem_cache_destroy+0x55/0x120
btrfs_delayed_ref_exit+0x1d/0x35 [btrfs]
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
INFO: Object 0x000000001a340018 @offset=4408
INFO: Allocated in btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] age=1917 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs]
btrfs_free_tree_block+0x128/0x360 [btrfs]
__btrfs_cow_block+0x489/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
btrfs_mount+0x13b/0x3e0 [btrfs]
INFO: Freed in __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] age=4167 cpu=4 pid=1729795
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0x63d/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
btrfs_commit_transaction+0x60/0xc40 [btrfs]
create_subvol+0x56a/0x990 [btrfs]
btrfs_mksubvol+0x3fb/0x4a0 [btrfs]
__btrfs_ioctl_snap_create+0x119/0x1a0 [btrfs]
btrfs_ioctl_snap_create+0x58/0x80 [btrfs]
btrfs_ioctl+0x1a92/0x36f0 [btrfs]
__x64_sys_ioctl+0x83/0xb0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
INFO: Object 0x000000002b46292a @offset=13648
INFO: Allocated in btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] age=1923 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs]
btrfs_alloc_tree_block+0x2bf/0x360 [btrfs]
alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs]
__btrfs_cow_block+0x12d/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
INFO: Freed in __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] age=3164 cpu=6 pid=1729803
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0x63d/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
commit_cowonly_roots+0xfb/0x300 [btrfs]
btrfs_commit_transaction+0x367/0xc40 [btrfs]
close_ctree+0x113/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
kmem_cache_destroy btrfs_delayed_tree_ref: Slab cache still has objects
CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
kmem_cache_destroy+0x119/0x120
btrfs_delayed_ref_exit+0x1d/0x35 [btrfs]
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
=============================================================================
BUG btrfs_delayed_extent_op (Tainted: G B W ): Objects remaining in btrfs_delayed_extent_op on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
INFO: Slab 0x00000000f145ce2f objects=22 used=1 fp=0x00000000af0f92cf flags=0x17fffc000010200
CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
slab_err+0xb7/0xdc
? lock_acquired+0x199/0x490
__kmem_cache_shutdown+0x1ac/0x3c0
? __mutex_unlock_slowpath+0x45/0x2a0
kmem_cache_destroy+0x55/0x120
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
INFO: Object 0x000000004cf95ea8 @offset=6264
INFO: Allocated in btrfs_alloc_tree_block+0x1e0/0x360 [btrfs] age=1931 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_alloc_tree_block+0x1e0/0x360 [btrfs]
alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs]
__btrfs_cow_block+0x12d/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
btrfs_mount+0x13b/0x3e0 [btrfs]
INFO: Freed in __btrfs_run_delayed_refs+0xabd/0x1290 [btrfs] age=3173 cpu=6 pid=1729803
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0xabd/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
commit_cowonly_roots+0xfb/0x300 [btrfs]
btrfs_commit_transaction+0x367/0xc40 [btrfs]
close_ctree+0x113/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
kmem_cache_destroy btrfs_delayed_extent_op: Slab cache still has objects
CPU: 3 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
kmem_cache_destroy+0x119/0x120
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
BTRFS: state leak: start 30408704 end 30425087 state 1 in tree 1 refs 1
Fix this issue by having the remount path stop the qgroup rescan worker
when we are remounting RO and teach the rescan worker to stop when a
remount is in progress. If later a remount in RW mode happens, we are
already resuming the qgroup rescan worker through the call to
btrfs_qgroup_rescan_resume(), so we do not need to worry about that.
Tested-by: Fabian Vogt <fvogt@suse.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
2020-12-14 11:10:45 +01:00
|
|
|
bool stopped = false;
|
2013-04-25 18:04:51 +02:00
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path)
|
|
|
|
goto out;
|
2018-05-14 03:38:12 +02:00
|
|
|
/*
|
|
|
|
* Rescan should only search for commit root, and any later difference
|
|
|
|
* should be recorded by qgroup
|
|
|
|
*/
|
|
|
|
path->search_commit_root = 1;
|
|
|
|
path->skip_locking = 1;
|
2013-04-25 18:04:51 +02:00
|
|
|
|
|
|
|
err = 0;
|
btrfs: fix transaction leak and crash after RO remount caused by qgroup rescan
[ Upstream commit cb13eea3b49055bd78e6ddf39defd6340f7379fc ]
If we remount a filesystem in RO mode while the qgroup rescan worker is
running, we can end up having it still running after the remount is done,
and at unmount time we may end up with an open transaction that ends up
never getting committed. If that happens we end up with several memory
leaks and can crash when hardware acceleration is unavailable for crc32c.
Possibly it can lead to other nasty surprises too, due to use-after-free
issues.
The following steps explain how the problem happens.
1) We have a filesystem mounted in RW mode and the qgroup rescan worker is
running;
2) We remount the filesystem in RO mode, and never stop/pause the rescan
worker, so after the remount the rescan worker is still running. The
important detail here is that the rescan task is still running after
the remount operation committed any ongoing transaction through its
call to btrfs_commit_super();
3) The rescan is still running, and after the remount completed, the
rescan worker started a transaction, after it finished iterating all
leaves of the extent tree, to update the qgroup status item in the
quotas tree. It does not commit the transaction, it only releases its
handle on the transaction;
4) A filesystem unmount operation starts shortly after;
5) The unmount task, at close_ctree(), stops the transaction kthread,
which had not had a chance to commit the open transaction since it was
sleeping and the commit interval (default of 30 seconds) has not yet
elapsed since the last time it committed a transaction;
6) So after stopping the transaction kthread we still have the transaction
used to update the qgroup status item open. At close_ctree(), when the
filesystem is in RO mode and no transaction abort happened (or the
filesystem is in error mode), we do not expect to have any transaction
open, so we do not call btrfs_commit_super();
7) We then proceed to destroy the work queues, free the roots and block
groups, etc. After that we drop the last reference on the btree inode
by calling iput() on it. Since there are dirty pages for the btree
inode, corresponding to the COWed extent buffer for the quotas btree,
btree_write_cache_pages() is invoked to flush those dirty pages. This
results in creating a bio and submitting it, which makes us end up at
btrfs_submit_metadata_bio();
8) At btrfs_submit_metadata_bio() we end up at the if-then-else branch
that calls btrfs_wq_submit_bio(), because check_async_write() returned
a value of 1. This value of 1 is because we did not have hardware
acceleration available for crc32c, so BTRFS_FS_CSUM_IMPL_FAST was not
set in fs_info->flags;
9) Then at btrfs_wq_submit_bio() we call btrfs_queue_work() against the
workqueue at fs_info->workers, which was already freed before by the
call to btrfs_stop_all_workers() at close_ctree(). This results in an
invalid memory access due to a use-after-free, leading to a crash.
When this happens, before the crash there are several warnings triggered,
since we have reserved metadata space in a block group, the delayed refs
reservation, etc:
------------[ cut here ]------------
WARNING: CPU: 4 PID: 1729896 at fs/btrfs/block-group.c:125 btrfs_put_block_group+0x63/0xa0 [btrfs]
Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
CPU: 4 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:btrfs_put_block_group+0x63/0xa0 [btrfs]
Code: f0 01 00 00 48 39 c2 75 (...)
RSP: 0018:ffffb270826bbdd8 EFLAGS: 00010206
RAX: 0000000000000001 RBX: ffff947ed73e4000 RCX: ffff947ebc8b29c8
RDX: 0000000000000001 RSI: ffffffffc0b150a0 RDI: ffff947ebc8b2800
RBP: ffff947ebc8b2800 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ed73e4110
R13: ffff947ed73e4160 R14: ffff947ebc8b2988 R15: dead000000000100
FS: 00007f15edfea840(0000) GS:ffff9481ad600000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f37e2893320 CR3: 0000000138f68001 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
btrfs_free_block_groups+0x17f/0x2f0 [btrfs]
close_ctree+0x2ba/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f15ee221ee7
Code: ff 0b 00 f7 d8 64 89 01 48 (...)
RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7
RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000
RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0
R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000
R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last enabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace dd74718fef1ed5c6 ]---
------------[ cut here ]------------
WARNING: CPU: 2 PID: 1729896 at fs/btrfs/block-rsv.c:459 btrfs_release_global_block_rsv+0x70/0xc0 [btrfs]
Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
CPU: 2 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:btrfs_release_global_block_rsv+0x70/0xc0 [btrfs]
Code: 48 83 bb b0 03 00 00 00 (...)
RSP: 0018:ffffb270826bbdd8 EFLAGS: 00010206
RAX: 000000000033c000 RBX: ffff947ed73e4000 RCX: 0000000000000000
RDX: 0000000000000001 RSI: ffffffffc0b0d8c1 RDI: 00000000ffffffff
RBP: ffff947ebc8b7000 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ed73e4110
R13: ffff947ed73e5278 R14: dead000000000122 R15: dead000000000100
FS: 00007f15edfea840(0000) GS:ffff9481aca00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000561a79f76e20 CR3: 0000000138f68006 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
btrfs_free_block_groups+0x24c/0x2f0 [btrfs]
close_ctree+0x2ba/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f15ee221ee7
Code: ff 0b 00 f7 d8 64 89 01 (...)
RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7
RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000
RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0
R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000
R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last enabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace dd74718fef1ed5c7 ]---
------------[ cut here ]------------
WARNING: CPU: 2 PID: 1729896 at fs/btrfs/block-group.c:3377 btrfs_free_block_groups+0x25d/0x2f0 [btrfs]
Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
CPU: 5 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:btrfs_free_block_groups+0x25d/0x2f0 [btrfs]
Code: ad de 49 be 22 01 00 (...)
RSP: 0018:ffffb270826bbde8 EFLAGS: 00010206
RAX: ffff947ebeae1d08 RBX: ffff947ed73e4000 RCX: 0000000000000000
RDX: 0000000000000001 RSI: ffff947e9d823ae8 RDI: 0000000000000246
RBP: ffff947ebeae1d08 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ebeae1c00
R13: ffff947ed73e5278 R14: dead000000000122 R15: dead000000000100
FS: 00007f15edfea840(0000) GS:ffff9481ad200000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f1475d98ea8 CR3: 0000000138f68005 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
close_ctree+0x2ba/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f15ee221ee7
Code: ff 0b 00 f7 d8 64 89 (...)
RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7
RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000
RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0
R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000
R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last enabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace dd74718fef1ed5c8 ]---
BTRFS info (device sdc): space_info 4 has 268238848 free, is not full
BTRFS info (device sdc): space_info total=268435456, used=114688, pinned=0, reserved=16384, may_use=0, readonly=65536
BTRFS info (device sdc): global_block_rsv: size 0 reserved 0
BTRFS info (device sdc): trans_block_rsv: size 0 reserved 0
BTRFS info (device sdc): chunk_block_rsv: size 0 reserved 0
BTRFS info (device sdc): delayed_block_rsv: size 0 reserved 0
BTRFS info (device sdc): delayed_refs_rsv: size 524288 reserved 0
And the crash, which only happens when we do not have crc32c hardware
acceleration, produces the following trace immediately after those
warnings:
stack segment: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI
CPU: 2 PID: 1749129 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:btrfs_queue_work+0x36/0x190 [btrfs]
Code: 54 55 53 48 89 f3 (...)
RSP: 0018:ffffb27082443ae8 EFLAGS: 00010282
RAX: 0000000000000004 RBX: ffff94810ee9ad90 RCX: 0000000000000000
RDX: 0000000000000001 RSI: ffff94810ee9ad90 RDI: ffff947ed8ee75a0
RBP: a56b6b6b6b6b6b6b R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000007 R11: 0000000000000001 R12: ffff947fa9b435a8
R13: ffff94810ee9ad90 R14: 0000000000000000 R15: ffff947e93dc0000
FS: 00007f3cfe974840(0000) GS:ffff9481ac600000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f1b42995a70 CR3: 0000000127638003 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
btrfs_wq_submit_bio+0xb3/0xd0 [btrfs]
btrfs_submit_metadata_bio+0x44/0xc0 [btrfs]
submit_one_bio+0x61/0x70 [btrfs]
btree_write_cache_pages+0x414/0x450 [btrfs]
? kobject_put+0x9a/0x1d0
? trace_hardirqs_on+0x1b/0xf0
? _raw_spin_unlock_irqrestore+0x3c/0x60
? free_debug_processing+0x1e1/0x2b0
do_writepages+0x43/0xe0
? lock_acquired+0x199/0x490
__writeback_single_inode+0x59/0x650
writeback_single_inode+0xaf/0x120
write_inode_now+0x94/0xd0
iput+0x187/0x2b0
close_ctree+0x2c6/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f3cfebabee7
Code: ff 0b 00 f7 d8 64 89 01 (...)
RSP: 002b:00007ffc9c9a05f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 00007f3cfecd1264 RCX: 00007f3cfebabee7
RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 0000562b6b478000
RBP: 0000562b6b473a30 R08: 0000000000000000 R09: 00007f3cfec6cbe0
R10: 0000562b6b479fe0 R11: 0000000000000246 R12: 0000000000000000
R13: 0000562b6b478000 R14: 0000562b6b473b40 R15: 0000562b6b473c60
Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
---[ end trace dd74718fef1ed5cc ]---
Finally, when we remove the btrfs module (rmmod btrfs), there are several
warnings about objects that were allocated from our slabs but were never
freed, a consequence of the transaction that was never committed and got
leaked:
=============================================================================
BUG btrfs_delayed_ref_head (Tainted: G B W ): Objects remaining in btrfs_delayed_ref_head on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
INFO: Slab 0x0000000094c2ae56 objects=24 used=2 fp=0x000000002bfa2521 flags=0x17fffc000010200
CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
slab_err+0xb7/0xdc
? lock_acquired+0x199/0x490
__kmem_cache_shutdown+0x1ac/0x3c0
? lock_release+0x20e/0x4c0
kmem_cache_destroy+0x55/0x120
btrfs_delayed_ref_exit+0x11/0x35 [btrfs]
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
INFO: Object 0x0000000050cbdd61 @offset=12104
INFO: Allocated in btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] age=1894 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs]
btrfs_free_tree_block+0x128/0x360 [btrfs]
__btrfs_cow_block+0x489/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
btrfs_mount+0x13b/0x3e0 [btrfs]
INFO: Freed in __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] age=4292 cpu=2 pid=1729526
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0x1117/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
commit_cowonly_roots+0xfb/0x300 [btrfs]
btrfs_commit_transaction+0x367/0xc40 [btrfs]
sync_filesystem+0x74/0x90
generic_shutdown_super+0x22/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
INFO: Object 0x0000000086e9b0ff @offset=12776
INFO: Allocated in btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] age=1900 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs]
btrfs_alloc_tree_block+0x2bf/0x360 [btrfs]
alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs]
__btrfs_cow_block+0x12d/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
INFO: Freed in __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] age=3141 cpu=6 pid=1729803
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0x1117/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
btrfs_write_dirty_block_groups+0x17d/0x3d0 [btrfs]
commit_cowonly_roots+0x248/0x300 [btrfs]
btrfs_commit_transaction+0x367/0xc40 [btrfs]
close_ctree+0x113/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
kmem_cache_destroy btrfs_delayed_ref_head: Slab cache still has objects
CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
kmem_cache_destroy+0x119/0x120
btrfs_delayed_ref_exit+0x11/0x35 [btrfs]
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 0b (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
=============================================================================
BUG btrfs_delayed_tree_ref (Tainted: G B W ): Objects remaining in btrfs_delayed_tree_ref on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
INFO: Slab 0x0000000011f78dc0 objects=37 used=2 fp=0x0000000032d55d91 flags=0x17fffc000010200
CPU: 3 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
slab_err+0xb7/0xdc
? lock_acquired+0x199/0x490
__kmem_cache_shutdown+0x1ac/0x3c0
? lock_release+0x20e/0x4c0
kmem_cache_destroy+0x55/0x120
btrfs_delayed_ref_exit+0x1d/0x35 [btrfs]
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
INFO: Object 0x000000001a340018 @offset=4408
INFO: Allocated in btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] age=1917 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs]
btrfs_free_tree_block+0x128/0x360 [btrfs]
__btrfs_cow_block+0x489/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
btrfs_mount+0x13b/0x3e0 [btrfs]
INFO: Freed in __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] age=4167 cpu=4 pid=1729795
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0x63d/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
btrfs_commit_transaction+0x60/0xc40 [btrfs]
create_subvol+0x56a/0x990 [btrfs]
btrfs_mksubvol+0x3fb/0x4a0 [btrfs]
__btrfs_ioctl_snap_create+0x119/0x1a0 [btrfs]
btrfs_ioctl_snap_create+0x58/0x80 [btrfs]
btrfs_ioctl+0x1a92/0x36f0 [btrfs]
__x64_sys_ioctl+0x83/0xb0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
INFO: Object 0x000000002b46292a @offset=13648
INFO: Allocated in btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] age=1923 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs]
btrfs_alloc_tree_block+0x2bf/0x360 [btrfs]
alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs]
__btrfs_cow_block+0x12d/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
INFO: Freed in __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] age=3164 cpu=6 pid=1729803
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0x63d/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
commit_cowonly_roots+0xfb/0x300 [btrfs]
btrfs_commit_transaction+0x367/0xc40 [btrfs]
close_ctree+0x113/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
kmem_cache_destroy btrfs_delayed_tree_ref: Slab cache still has objects
CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
kmem_cache_destroy+0x119/0x120
btrfs_delayed_ref_exit+0x1d/0x35 [btrfs]
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
=============================================================================
BUG btrfs_delayed_extent_op (Tainted: G B W ): Objects remaining in btrfs_delayed_extent_op on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
INFO: Slab 0x00000000f145ce2f objects=22 used=1 fp=0x00000000af0f92cf flags=0x17fffc000010200
CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
slab_err+0xb7/0xdc
? lock_acquired+0x199/0x490
__kmem_cache_shutdown+0x1ac/0x3c0
? __mutex_unlock_slowpath+0x45/0x2a0
kmem_cache_destroy+0x55/0x120
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
INFO: Object 0x000000004cf95ea8 @offset=6264
INFO: Allocated in btrfs_alloc_tree_block+0x1e0/0x360 [btrfs] age=1931 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_alloc_tree_block+0x1e0/0x360 [btrfs]
alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs]
__btrfs_cow_block+0x12d/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
btrfs_mount+0x13b/0x3e0 [btrfs]
INFO: Freed in __btrfs_run_delayed_refs+0xabd/0x1290 [btrfs] age=3173 cpu=6 pid=1729803
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0xabd/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
commit_cowonly_roots+0xfb/0x300 [btrfs]
btrfs_commit_transaction+0x367/0xc40 [btrfs]
close_ctree+0x113/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
kmem_cache_destroy btrfs_delayed_extent_op: Slab cache still has objects
CPU: 3 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
kmem_cache_destroy+0x119/0x120
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
BTRFS: state leak: start 30408704 end 30425087 state 1 in tree 1 refs 1
Fix this issue by having the remount path stop the qgroup rescan worker
when we are remounting RO and teach the rescan worker to stop when a
remount is in progress. If later a remount in RW mode happens, we are
already resuming the qgroup rescan worker through the call to
btrfs_qgroup_rescan_resume(), so we do not need to worry about that.
Tested-by: Fabian Vogt <fvogt@suse.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
2020-12-14 11:10:45 +01:00
|
|
|
while (!err && !(stopped = rescan_should_stop(fs_info))) {
|
2013-04-25 18:04:51 +02:00
|
|
|
trans = btrfs_start_transaction(fs_info->fs_root, 0);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
err = PTR_ERR(trans);
|
|
|
|
break;
|
|
|
|
}
|
2016-09-02 21:40:02 +02:00
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
|
2013-04-25 18:04:51 +02:00
|
|
|
err = -EINTR;
|
|
|
|
} else {
|
2018-07-18 08:45:42 +02:00
|
|
|
err = qgroup_rescan_leaf(trans, path);
|
2013-04-25 18:04:51 +02:00
|
|
|
}
|
|
|
|
if (err > 0)
|
2016-09-10 03:39:03 +02:00
|
|
|
btrfs_commit_transaction(trans);
|
2013-04-25 18:04:51 +02:00
|
|
|
else
|
2016-09-10 03:39:03 +02:00
|
|
|
btrfs_end_transaction(trans);
|
2013-04-25 18:04:51 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
|
|
|
mutex_lock(&fs_info->qgroup_rescan_lock);
|
2015-02-27 09:24:24 +01:00
|
|
|
if (err > 0 &&
|
2013-04-25 18:04:51 +02:00
|
|
|
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
|
|
|
|
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
|
|
|
} else if (err < 0) {
|
|
|
|
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
|
|
|
}
|
|
|
|
mutex_unlock(&fs_info->qgroup_rescan_lock);
|
|
|
|
|
2015-02-27 09:24:25 +01:00
|
|
|
/*
|
2016-05-20 03:18:45 +02:00
|
|
|
* only update status, since the previous part has already updated the
|
2015-02-27 09:24:25 +01:00
|
|
|
* qgroup info.
|
|
|
|
*/
|
|
|
|
trans = btrfs_start_transaction(fs_info->quota_root, 1);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
err = PTR_ERR(trans);
|
Btrfs: fix race setting up and completing qgroup rescan workers
commit 13fc1d271a2e3ab8a02071e711add01fab9271f6 upstream.
There is a race between setting up a qgroup rescan worker and completing
a qgroup rescan worker that can cause callers of the qgroup rescan wait
ioctl to either not wait for the rescan worker to complete or to hang
forever due to missing wake ups. The following diagram shows a sequence
of steps that illustrates the race.
CPU 1 CPU 2 CPU 3
btrfs_ioctl_quota_rescan()
btrfs_qgroup_rescan()
qgroup_rescan_init()
mutex_lock(&fs_info->qgroup_rescan_lock)
spin_lock(&fs_info->qgroup_lock)
fs_info->qgroup_flags |=
BTRFS_QGROUP_STATUS_FLAG_RESCAN
init_completion(
&fs_info->qgroup_rescan_completion)
fs_info->qgroup_rescan_running = true
mutex_unlock(&fs_info->qgroup_rescan_lock)
spin_unlock(&fs_info->qgroup_lock)
btrfs_init_work()
--> starts the worker
btrfs_qgroup_rescan_worker()
mutex_lock(&fs_info->qgroup_rescan_lock)
fs_info->qgroup_flags &=
~BTRFS_QGROUP_STATUS_FLAG_RESCAN
mutex_unlock(&fs_info->qgroup_rescan_lock)
starts transaction, updates qgroup status
item, etc
btrfs_ioctl_quota_rescan()
btrfs_qgroup_rescan()
qgroup_rescan_init()
mutex_lock(&fs_info->qgroup_rescan_lock)
spin_lock(&fs_info->qgroup_lock)
fs_info->qgroup_flags |=
BTRFS_QGROUP_STATUS_FLAG_RESCAN
init_completion(
&fs_info->qgroup_rescan_completion)
fs_info->qgroup_rescan_running = true
mutex_unlock(&fs_info->qgroup_rescan_lock)
spin_unlock(&fs_info->qgroup_lock)
btrfs_init_work()
--> starts another worker
mutex_lock(&fs_info->qgroup_rescan_lock)
fs_info->qgroup_rescan_running = false
mutex_unlock(&fs_info->qgroup_rescan_lock)
complete_all(&fs_info->qgroup_rescan_completion)
Before the rescan worker started by the task at CPU 3 completes, if
another task calls btrfs_ioctl_quota_rescan(), it will get -EINPROGRESS
because the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN is set at
fs_info->qgroup_flags, which is expected and correct behaviour.
However, if another task calls btrfs_ioctl_quota_rescan_wait() before the
rescan worker started by the task at CPU 3 completes, it will return
immediately without waiting for the new rescan worker to complete,
because fs_info->qgroup_rescan_running is set to false by CPU 2.
This race makes test case btrfs/171 (from fstests) fail often:
btrfs/171 9s ... - output mismatch (see /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad)
# --- tests/btrfs/171.out 2018-09-16 21:30:48.505104287 +0100
# +++ /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad 2019-09-19 02:01:36.938486039 +0100
# @@ -1,2 +1,3 @@
# QA output created by 171
# +ERROR: quota rescan failed: Operation now in progress
# Silence is golden
# ...
# (Run 'diff -u /home/fdmanana/git/hub/xfstests/tests/btrfs/171.out /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad' to see the entire diff)
That is because the test calls the btrfs-progs commands "qgroup quota
rescan -w", "qgroup assign" and "qgroup remove" in a sequence that makes
calls to the rescan start ioctl fail with -EINPROGRESS (note the "btrfs"
commands 'qgroup assign' and 'qgroup remove' often call the rescan start
ioctl after calling the qgroup assign ioctl,
btrfs_ioctl_qgroup_assign()), since previous waits didn't actually wait
for a rescan worker to complete.
Another problem the race can cause is missing wake ups for waiters,
since the call to complete_all() happens outside a critical section and
after clearing the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN. In the sequence
diagram above, if we have a waiter for the first rescan task (executed
by CPU 2), then fs_info->qgroup_rescan_completion.wait is not empty, and
if after the rescan worker clears BTRFS_QGROUP_STATUS_FLAG_RESCAN and
before it calls complete_all() against
fs_info->qgroup_rescan_completion, the task at CPU 3 calls
init_completion() against fs_info->qgroup_rescan_completion which
re-initializes its wait queue to an empty queue, therefore causing the
rescan worker at CPU 2 to call complete_all() against an empty queue,
never waking up the task waiting for that rescan worker.
Fix this by clearing BTRFS_QGROUP_STATUS_FLAG_RESCAN and setting
fs_info->qgroup_rescan_running to false in the same critical section,
delimited by the mutex fs_info->qgroup_rescan_lock, as well as doing the
call to complete_all() in that same critical section. This gives the
protection needed to avoid rescan wait ioctl callers not waiting for a
running rescan worker and the lost wake ups problem, since setting that
rescan flag and boolean as well as initializing the wait queue is done
already in a critical section delimited by that mutex (at
qgroup_rescan_init()).
Fixes: 57254b6ebce4ce ("Btrfs: add ioctl to wait for qgroup rescan completion")
Fixes: d2c609b834d62f ("btrfs: properly track when rescan worker is running")
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2019-09-24 11:49:54 +02:00
|
|
|
trans = NULL;
|
2015-02-27 09:24:25 +01:00
|
|
|
btrfs_err(fs_info,
|
2017-07-13 15:32:18 +02:00
|
|
|
"fail to start transaction for status update: %d",
|
2015-02-27 09:24:25 +01:00
|
|
|
err);
|
|
|
|
}
|
Btrfs: fix race setting up and completing qgroup rescan workers
commit 13fc1d271a2e3ab8a02071e711add01fab9271f6 upstream.
There is a race between setting up a qgroup rescan worker and completing
a qgroup rescan worker that can cause callers of the qgroup rescan wait
ioctl to either not wait for the rescan worker to complete or to hang
forever due to missing wake ups. The following diagram shows a sequence
of steps that illustrates the race.
CPU 1 CPU 2 CPU 3
btrfs_ioctl_quota_rescan()
btrfs_qgroup_rescan()
qgroup_rescan_init()
mutex_lock(&fs_info->qgroup_rescan_lock)
spin_lock(&fs_info->qgroup_lock)
fs_info->qgroup_flags |=
BTRFS_QGROUP_STATUS_FLAG_RESCAN
init_completion(
&fs_info->qgroup_rescan_completion)
fs_info->qgroup_rescan_running = true
mutex_unlock(&fs_info->qgroup_rescan_lock)
spin_unlock(&fs_info->qgroup_lock)
btrfs_init_work()
--> starts the worker
btrfs_qgroup_rescan_worker()
mutex_lock(&fs_info->qgroup_rescan_lock)
fs_info->qgroup_flags &=
~BTRFS_QGROUP_STATUS_FLAG_RESCAN
mutex_unlock(&fs_info->qgroup_rescan_lock)
starts transaction, updates qgroup status
item, etc
btrfs_ioctl_quota_rescan()
btrfs_qgroup_rescan()
qgroup_rescan_init()
mutex_lock(&fs_info->qgroup_rescan_lock)
spin_lock(&fs_info->qgroup_lock)
fs_info->qgroup_flags |=
BTRFS_QGROUP_STATUS_FLAG_RESCAN
init_completion(
&fs_info->qgroup_rescan_completion)
fs_info->qgroup_rescan_running = true
mutex_unlock(&fs_info->qgroup_rescan_lock)
spin_unlock(&fs_info->qgroup_lock)
btrfs_init_work()
--> starts another worker
mutex_lock(&fs_info->qgroup_rescan_lock)
fs_info->qgroup_rescan_running = false
mutex_unlock(&fs_info->qgroup_rescan_lock)
complete_all(&fs_info->qgroup_rescan_completion)
Before the rescan worker started by the task at CPU 3 completes, if
another task calls btrfs_ioctl_quota_rescan(), it will get -EINPROGRESS
because the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN is set at
fs_info->qgroup_flags, which is expected and correct behaviour.
However, if another task calls btrfs_ioctl_quota_rescan_wait() before the
rescan worker started by the task at CPU 3 completes, it will return
immediately without waiting for the new rescan worker to complete,
because fs_info->qgroup_rescan_running is set to false by CPU 2.
This race makes test case btrfs/171 (from fstests) fail often:
btrfs/171 9s ... - output mismatch (see /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad)
# --- tests/btrfs/171.out 2018-09-16 21:30:48.505104287 +0100
# +++ /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad 2019-09-19 02:01:36.938486039 +0100
# @@ -1,2 +1,3 @@
# QA output created by 171
# +ERROR: quota rescan failed: Operation now in progress
# Silence is golden
# ...
# (Run 'diff -u /home/fdmanana/git/hub/xfstests/tests/btrfs/171.out /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad' to see the entire diff)
That is because the test calls the btrfs-progs commands "qgroup quota
rescan -w", "qgroup assign" and "qgroup remove" in a sequence that makes
calls to the rescan start ioctl fail with -EINPROGRESS (note the "btrfs"
commands 'qgroup assign' and 'qgroup remove' often call the rescan start
ioctl after calling the qgroup assign ioctl,
btrfs_ioctl_qgroup_assign()), since previous waits didn't actually wait
for a rescan worker to complete.
Another problem the race can cause is missing wake ups for waiters,
since the call to complete_all() happens outside a critical section and
after clearing the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN. In the sequence
diagram above, if we have a waiter for the first rescan task (executed
by CPU 2), then fs_info->qgroup_rescan_completion.wait is not empty, and
if after the rescan worker clears BTRFS_QGROUP_STATUS_FLAG_RESCAN and
before it calls complete_all() against
fs_info->qgroup_rescan_completion, the task at CPU 3 calls
init_completion() against fs_info->qgroup_rescan_completion which
re-initializes its wait queue to an empty queue, therefore causing the
rescan worker at CPU 2 to call complete_all() against an empty queue,
never waking up the task waiting for that rescan worker.
Fix this by clearing BTRFS_QGROUP_STATUS_FLAG_RESCAN and setting
fs_info->qgroup_rescan_running to false in the same critical section,
delimited by the mutex fs_info->qgroup_rescan_lock, as well as doing the
call to complete_all() in that same critical section. This gives the
protection needed to avoid rescan wait ioctl callers not waiting for a
running rescan worker and the lost wake ups problem, since setting that
rescan flag and boolean as well as initializing the wait queue is done
already in a critical section delimited by that mutex (at
qgroup_rescan_init()).
Fixes: 57254b6ebce4ce ("Btrfs: add ioctl to wait for qgroup rescan completion")
Fixes: d2c609b834d62f ("btrfs: properly track when rescan worker is running")
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2019-09-24 11:49:54 +02:00
|
|
|
|
|
|
|
mutex_lock(&fs_info->qgroup_rescan_lock);
|
btrfs: fix transaction leak and crash after RO remount caused by qgroup rescan
[ Upstream commit cb13eea3b49055bd78e6ddf39defd6340f7379fc ]
If we remount a filesystem in RO mode while the qgroup rescan worker is
running, we can end up having it still running after the remount is done,
and at unmount time we may end up with an open transaction that ends up
never getting committed. If that happens we end up with several memory
leaks and can crash when hardware acceleration is unavailable for crc32c.
Possibly it can lead to other nasty surprises too, due to use-after-free
issues.
The following steps explain how the problem happens.
1) We have a filesystem mounted in RW mode and the qgroup rescan worker is
running;
2) We remount the filesystem in RO mode, and never stop/pause the rescan
worker, so after the remount the rescan worker is still running. The
important detail here is that the rescan task is still running after
the remount operation committed any ongoing transaction through its
call to btrfs_commit_super();
3) The rescan is still running, and after the remount completed, the
rescan worker started a transaction, after it finished iterating all
leaves of the extent tree, to update the qgroup status item in the
quotas tree. It does not commit the transaction, it only releases its
handle on the transaction;
4) A filesystem unmount operation starts shortly after;
5) The unmount task, at close_ctree(), stops the transaction kthread,
which had not had a chance to commit the open transaction since it was
sleeping and the commit interval (default of 30 seconds) has not yet
elapsed since the last time it committed a transaction;
6) So after stopping the transaction kthread we still have the transaction
used to update the qgroup status item open. At close_ctree(), when the
filesystem is in RO mode and no transaction abort happened (or the
filesystem is in error mode), we do not expect to have any transaction
open, so we do not call btrfs_commit_super();
7) We then proceed to destroy the work queues, free the roots and block
groups, etc. After that we drop the last reference on the btree inode
by calling iput() on it. Since there are dirty pages for the btree
inode, corresponding to the COWed extent buffer for the quotas btree,
btree_write_cache_pages() is invoked to flush those dirty pages. This
results in creating a bio and submitting it, which makes us end up at
btrfs_submit_metadata_bio();
8) At btrfs_submit_metadata_bio() we end up at the if-then-else branch
that calls btrfs_wq_submit_bio(), because check_async_write() returned
a value of 1. This value of 1 is because we did not have hardware
acceleration available for crc32c, so BTRFS_FS_CSUM_IMPL_FAST was not
set in fs_info->flags;
9) Then at btrfs_wq_submit_bio() we call btrfs_queue_work() against the
workqueue at fs_info->workers, which was already freed before by the
call to btrfs_stop_all_workers() at close_ctree(). This results in an
invalid memory access due to a use-after-free, leading to a crash.
When this happens, before the crash there are several warnings triggered,
since we have reserved metadata space in a block group, the delayed refs
reservation, etc:
------------[ cut here ]------------
WARNING: CPU: 4 PID: 1729896 at fs/btrfs/block-group.c:125 btrfs_put_block_group+0x63/0xa0 [btrfs]
Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
CPU: 4 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:btrfs_put_block_group+0x63/0xa0 [btrfs]
Code: f0 01 00 00 48 39 c2 75 (...)
RSP: 0018:ffffb270826bbdd8 EFLAGS: 00010206
RAX: 0000000000000001 RBX: ffff947ed73e4000 RCX: ffff947ebc8b29c8
RDX: 0000000000000001 RSI: ffffffffc0b150a0 RDI: ffff947ebc8b2800
RBP: ffff947ebc8b2800 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ed73e4110
R13: ffff947ed73e4160 R14: ffff947ebc8b2988 R15: dead000000000100
FS: 00007f15edfea840(0000) GS:ffff9481ad600000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f37e2893320 CR3: 0000000138f68001 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
btrfs_free_block_groups+0x17f/0x2f0 [btrfs]
close_ctree+0x2ba/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f15ee221ee7
Code: ff 0b 00 f7 d8 64 89 01 48 (...)
RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7
RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000
RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0
R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000
R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last enabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace dd74718fef1ed5c6 ]---
------------[ cut here ]------------
WARNING: CPU: 2 PID: 1729896 at fs/btrfs/block-rsv.c:459 btrfs_release_global_block_rsv+0x70/0xc0 [btrfs]
Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
CPU: 2 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:btrfs_release_global_block_rsv+0x70/0xc0 [btrfs]
Code: 48 83 bb b0 03 00 00 00 (...)
RSP: 0018:ffffb270826bbdd8 EFLAGS: 00010206
RAX: 000000000033c000 RBX: ffff947ed73e4000 RCX: 0000000000000000
RDX: 0000000000000001 RSI: ffffffffc0b0d8c1 RDI: 00000000ffffffff
RBP: ffff947ebc8b7000 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ed73e4110
R13: ffff947ed73e5278 R14: dead000000000122 R15: dead000000000100
FS: 00007f15edfea840(0000) GS:ffff9481aca00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000561a79f76e20 CR3: 0000000138f68006 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
btrfs_free_block_groups+0x24c/0x2f0 [btrfs]
close_ctree+0x2ba/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f15ee221ee7
Code: ff 0b 00 f7 d8 64 89 01 (...)
RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7
RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000
RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0
R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000
R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last enabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace dd74718fef1ed5c7 ]---
------------[ cut here ]------------
WARNING: CPU: 2 PID: 1729896 at fs/btrfs/block-group.c:3377 btrfs_free_block_groups+0x25d/0x2f0 [btrfs]
Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
CPU: 5 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:btrfs_free_block_groups+0x25d/0x2f0 [btrfs]
Code: ad de 49 be 22 01 00 (...)
RSP: 0018:ffffb270826bbde8 EFLAGS: 00010206
RAX: ffff947ebeae1d08 RBX: ffff947ed73e4000 RCX: 0000000000000000
RDX: 0000000000000001 RSI: ffff947e9d823ae8 RDI: 0000000000000246
RBP: ffff947ebeae1d08 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ebeae1c00
R13: ffff947ed73e5278 R14: dead000000000122 R15: dead000000000100
FS: 00007f15edfea840(0000) GS:ffff9481ad200000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f1475d98ea8 CR3: 0000000138f68005 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
close_ctree+0x2ba/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f15ee221ee7
Code: ff 0b 00 f7 d8 64 89 (...)
RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7
RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000
RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0
R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000
R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last enabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace dd74718fef1ed5c8 ]---
BTRFS info (device sdc): space_info 4 has 268238848 free, is not full
BTRFS info (device sdc): space_info total=268435456, used=114688, pinned=0, reserved=16384, may_use=0, readonly=65536
BTRFS info (device sdc): global_block_rsv: size 0 reserved 0
BTRFS info (device sdc): trans_block_rsv: size 0 reserved 0
BTRFS info (device sdc): chunk_block_rsv: size 0 reserved 0
BTRFS info (device sdc): delayed_block_rsv: size 0 reserved 0
BTRFS info (device sdc): delayed_refs_rsv: size 524288 reserved 0
And the crash, which only happens when we do not have crc32c hardware
acceleration, produces the following trace immediately after those
warnings:
stack segment: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI
CPU: 2 PID: 1749129 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:btrfs_queue_work+0x36/0x190 [btrfs]
Code: 54 55 53 48 89 f3 (...)
RSP: 0018:ffffb27082443ae8 EFLAGS: 00010282
RAX: 0000000000000004 RBX: ffff94810ee9ad90 RCX: 0000000000000000
RDX: 0000000000000001 RSI: ffff94810ee9ad90 RDI: ffff947ed8ee75a0
RBP: a56b6b6b6b6b6b6b R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000007 R11: 0000000000000001 R12: ffff947fa9b435a8
R13: ffff94810ee9ad90 R14: 0000000000000000 R15: ffff947e93dc0000
FS: 00007f3cfe974840(0000) GS:ffff9481ac600000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f1b42995a70 CR3: 0000000127638003 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
btrfs_wq_submit_bio+0xb3/0xd0 [btrfs]
btrfs_submit_metadata_bio+0x44/0xc0 [btrfs]
submit_one_bio+0x61/0x70 [btrfs]
btree_write_cache_pages+0x414/0x450 [btrfs]
? kobject_put+0x9a/0x1d0
? trace_hardirqs_on+0x1b/0xf0
? _raw_spin_unlock_irqrestore+0x3c/0x60
? free_debug_processing+0x1e1/0x2b0
do_writepages+0x43/0xe0
? lock_acquired+0x199/0x490
__writeback_single_inode+0x59/0x650
writeback_single_inode+0xaf/0x120
write_inode_now+0x94/0xd0
iput+0x187/0x2b0
close_ctree+0x2c6/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f3cfebabee7
Code: ff 0b 00 f7 d8 64 89 01 (...)
RSP: 002b:00007ffc9c9a05f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 00007f3cfecd1264 RCX: 00007f3cfebabee7
RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 0000562b6b478000
RBP: 0000562b6b473a30 R08: 0000000000000000 R09: 00007f3cfec6cbe0
R10: 0000562b6b479fe0 R11: 0000000000000246 R12: 0000000000000000
R13: 0000562b6b478000 R14: 0000562b6b473b40 R15: 0000562b6b473c60
Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
---[ end trace dd74718fef1ed5cc ]---
Finally when we remove the btrfs module (rmmod btrfs), there are several
warnings about objects that were allocated from our slabs but were never
freed, consequence of the transaction that was never committed and got
leaked:
=============================================================================
BUG btrfs_delayed_ref_head (Tainted: G B W ): Objects remaining in btrfs_delayed_ref_head on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
INFO: Slab 0x0000000094c2ae56 objects=24 used=2 fp=0x000000002bfa2521 flags=0x17fffc000010200
CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
slab_err+0xb7/0xdc
? lock_acquired+0x199/0x490
__kmem_cache_shutdown+0x1ac/0x3c0
? lock_release+0x20e/0x4c0
kmem_cache_destroy+0x55/0x120
btrfs_delayed_ref_exit+0x11/0x35 [btrfs]
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
INFO: Object 0x0000000050cbdd61 @offset=12104
INFO: Allocated in btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] age=1894 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs]
btrfs_free_tree_block+0x128/0x360 [btrfs]
__btrfs_cow_block+0x489/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
btrfs_mount+0x13b/0x3e0 [btrfs]
INFO: Freed in __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] age=4292 cpu=2 pid=1729526
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0x1117/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
commit_cowonly_roots+0xfb/0x300 [btrfs]
btrfs_commit_transaction+0x367/0xc40 [btrfs]
sync_filesystem+0x74/0x90
generic_shutdown_super+0x22/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
INFO: Object 0x0000000086e9b0ff @offset=12776
INFO: Allocated in btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] age=1900 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs]
btrfs_alloc_tree_block+0x2bf/0x360 [btrfs]
alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs]
__btrfs_cow_block+0x12d/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
INFO: Freed in __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] age=3141 cpu=6 pid=1729803
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0x1117/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
btrfs_write_dirty_block_groups+0x17d/0x3d0 [btrfs]
commit_cowonly_roots+0x248/0x300 [btrfs]
btrfs_commit_transaction+0x367/0xc40 [btrfs]
close_ctree+0x113/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
kmem_cache_destroy btrfs_delayed_ref_head: Slab cache still has objects
CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
kmem_cache_destroy+0x119/0x120
btrfs_delayed_ref_exit+0x11/0x35 [btrfs]
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 0b (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
=============================================================================
BUG btrfs_delayed_tree_ref (Tainted: G B W ): Objects remaining in btrfs_delayed_tree_ref on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
INFO: Slab 0x0000000011f78dc0 objects=37 used=2 fp=0x0000000032d55d91 flags=0x17fffc000010200
CPU: 3 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
slab_err+0xb7/0xdc
? lock_acquired+0x199/0x490
__kmem_cache_shutdown+0x1ac/0x3c0
? lock_release+0x20e/0x4c0
kmem_cache_destroy+0x55/0x120
btrfs_delayed_ref_exit+0x1d/0x35 [btrfs]
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
INFO: Object 0x000000001a340018 @offset=4408
INFO: Allocated in btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] age=1917 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs]
btrfs_free_tree_block+0x128/0x360 [btrfs]
__btrfs_cow_block+0x489/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
btrfs_mount+0x13b/0x3e0 [btrfs]
INFO: Freed in __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] age=4167 cpu=4 pid=1729795
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0x63d/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
btrfs_commit_transaction+0x60/0xc40 [btrfs]
create_subvol+0x56a/0x990 [btrfs]
btrfs_mksubvol+0x3fb/0x4a0 [btrfs]
__btrfs_ioctl_snap_create+0x119/0x1a0 [btrfs]
btrfs_ioctl_snap_create+0x58/0x80 [btrfs]
btrfs_ioctl+0x1a92/0x36f0 [btrfs]
__x64_sys_ioctl+0x83/0xb0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
INFO: Object 0x000000002b46292a @offset=13648
INFO: Allocated in btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] age=1923 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs]
btrfs_alloc_tree_block+0x2bf/0x360 [btrfs]
alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs]
__btrfs_cow_block+0x12d/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
INFO: Freed in __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] age=3164 cpu=6 pid=1729803
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0x63d/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
commit_cowonly_roots+0xfb/0x300 [btrfs]
btrfs_commit_transaction+0x367/0xc40 [btrfs]
close_ctree+0x113/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
kmem_cache_destroy btrfs_delayed_tree_ref: Slab cache still has objects
CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
kmem_cache_destroy+0x119/0x120
btrfs_delayed_ref_exit+0x1d/0x35 [btrfs]
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
=============================================================================
BUG btrfs_delayed_extent_op (Tainted: G B W ): Objects remaining in btrfs_delayed_extent_op on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
INFO: Slab 0x00000000f145ce2f objects=22 used=1 fp=0x00000000af0f92cf flags=0x17fffc000010200
CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
slab_err+0xb7/0xdc
? lock_acquired+0x199/0x490
__kmem_cache_shutdown+0x1ac/0x3c0
? __mutex_unlock_slowpath+0x45/0x2a0
kmem_cache_destroy+0x55/0x120
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
INFO: Object 0x000000004cf95ea8 @offset=6264
INFO: Allocated in btrfs_alloc_tree_block+0x1e0/0x360 [btrfs] age=1931 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_alloc_tree_block+0x1e0/0x360 [btrfs]
alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs]
__btrfs_cow_block+0x12d/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
btrfs_mount+0x13b/0x3e0 [btrfs]
INFO: Freed in __btrfs_run_delayed_refs+0xabd/0x1290 [btrfs] age=3173 cpu=6 pid=1729803
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0xabd/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
commit_cowonly_roots+0xfb/0x300 [btrfs]
btrfs_commit_transaction+0x367/0xc40 [btrfs]
close_ctree+0x113/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
kmem_cache_destroy btrfs_delayed_extent_op: Slab cache still has objects
CPU: 3 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
kmem_cache_destroy+0x119/0x120
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
BTRFS: state leak: start 30408704 end 30425087 state 1 in tree 1 refs 1
Fix this issue by having the remount path stop the qgroup rescan worker
when we are remounting RO and teach the rescan worker to stop when a
remount is in progress. If later a remount in RW mode happens, we are
already resuming the qgroup rescan worker through the call to
btrfs_qgroup_rescan_resume(), so we do not need to worry about that.
Tested-by: Fabian Vogt <fvogt@suse.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
2020-12-14 11:10:45 +01:00
|
|
|
if (!stopped)
|
Btrfs: fix race setting up and completing qgroup rescan workers
commit 13fc1d271a2e3ab8a02071e711add01fab9271f6 upstream.
There is a race between setting up a qgroup rescan worker and completing
a qgroup rescan worker that can cause callers of the qgroup rescan wait
ioctl to either not wait for the rescan worker to complete or to hang
forever due to missing wake ups. The following diagram shows a sequence
of steps that illustrates the race.
CPU 1 CPU 2 CPU 3
btrfs_ioctl_quota_rescan()
btrfs_qgroup_rescan()
qgroup_rescan_init()
mutex_lock(&fs_info->qgroup_rescan_lock)
spin_lock(&fs_info->qgroup_lock)
fs_info->qgroup_flags |=
BTRFS_QGROUP_STATUS_FLAG_RESCAN
init_completion(
&fs_info->qgroup_rescan_completion)
fs_info->qgroup_rescan_running = true
mutex_unlock(&fs_info->qgroup_rescan_lock)
spin_unlock(&fs_info->qgroup_lock)
btrfs_init_work()
--> starts the worker
btrfs_qgroup_rescan_worker()
mutex_lock(&fs_info->qgroup_rescan_lock)
fs_info->qgroup_flags &=
~BTRFS_QGROUP_STATUS_FLAG_RESCAN
mutex_unlock(&fs_info->qgroup_rescan_lock)
starts transaction, updates qgroup status
item, etc
btrfs_ioctl_quota_rescan()
btrfs_qgroup_rescan()
qgroup_rescan_init()
mutex_lock(&fs_info->qgroup_rescan_lock)
spin_lock(&fs_info->qgroup_lock)
fs_info->qgroup_flags |=
BTRFS_QGROUP_STATUS_FLAG_RESCAN
init_completion(
&fs_info->qgroup_rescan_completion)
fs_info->qgroup_rescan_running = true
mutex_unlock(&fs_info->qgroup_rescan_lock)
spin_unlock(&fs_info->qgroup_lock)
btrfs_init_work()
--> starts another worker
mutex_lock(&fs_info->qgroup_rescan_lock)
fs_info->qgroup_rescan_running = false
mutex_unlock(&fs_info->qgroup_rescan_lock)
complete_all(&fs_info->qgroup_rescan_completion)
Before the rescan worker started by the task at CPU 3 completes, if
another task calls btrfs_ioctl_quota_rescan(), it will get -EINPROGRESS
because the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN is set at
fs_info->qgroup_flags, which is expected and correct behaviour.
However, if another task calls btrfs_ioctl_quota_rescan_wait() before the
rescan worker started by the task at CPU 3 completes, it will return
immediately without waiting for the new rescan worker to complete,
because fs_info->qgroup_rescan_running is set to false by CPU 2.
This race makes test case btrfs/171 (from fstests) fail often:
btrfs/171 9s ... - output mismatch (see /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad)
# --- tests/btrfs/171.out 2018-09-16 21:30:48.505104287 +0100
# +++ /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad 2019-09-19 02:01:36.938486039 +0100
# @@ -1,2 +1,3 @@
# QA output created by 171
# +ERROR: quota rescan failed: Operation now in progress
# Silence is golden
# ...
# (Run 'diff -u /home/fdmanana/git/hub/xfstests/tests/btrfs/171.out /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad' to see the entire diff)
That is because the test calls the btrfs-progs commands "qgroup quota
rescan -w", "qgroup assign" and "qgroup remove" in a sequence that makes
calls to the rescan start ioctl fail with -EINPROGRESS (note the "btrfs"
commands 'qgroup assign' and 'qgroup remove' often call the rescan start
ioctl after calling the qgroup assign ioctl,
btrfs_ioctl_qgroup_assign()), since previous waits didn't actually wait
for a rescan worker to complete.
Another problem the race can cause is missing wake ups for waiters,
since the call to complete_all() happens outside a critical section and
after clearing the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN. In the sequence
diagram above, if we have a waiter for the first rescan task (executed
by CPU 2), then fs_info->qgroup_rescan_completion.wait is not empty, and
if after the rescan worker clears BTRFS_QGROUP_STATUS_FLAG_RESCAN and
before it calls complete_all() against
fs_info->qgroup_rescan_completion, the task at CPU 3 calls
init_completion() against fs_info->qgroup_rescan_completion which
re-initializes its wait queue to an empty queue, therefore causing the
rescan worker at CPU 2 to call complete_all() against an empty queue,
never waking up the task waiting for that rescan worker.
Fix this by clearing BTRFS_QGROUP_STATUS_FLAG_RESCAN and setting
fs_info->qgroup_rescan_running to false in the same critical section,
delimited by the mutex fs_info->qgroup_rescan_lock, as well as doing the
call to complete_all() in that same critical section. This gives the
protection needed to avoid rescan wait ioctl callers not waiting for a
running rescan worker and the lost wake ups problem, since setting that
rescan flag and boolean as well as initializing the wait queue is done
already in a critical section delimited by that mutex (at
qgroup_rescan_init()).
Fixes: 57254b6ebce4ce ("Btrfs: add ioctl to wait for qgroup rescan completion")
Fixes: d2c609b834d62f ("btrfs: properly track when rescan worker is running")
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2019-09-24 11:49:54 +02:00
|
|
|
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
|
|
|
|
if (trans) {
|
|
|
|
ret = update_qgroup_status_item(trans);
|
|
|
|
if (ret < 0) {
|
|
|
|
err = ret;
|
|
|
|
btrfs_err(fs_info, "fail to update qgroup status: %d",
|
|
|
|
err);
|
|
|
|
}
|
2015-02-27 09:24:25 +01:00
|
|
|
}
|
Btrfs: fix race setting up and completing qgroup rescan workers
commit 13fc1d271a2e3ab8a02071e711add01fab9271f6 upstream.
There is a race between setting up a qgroup rescan worker and completing
a qgroup rescan worker that can cause callers of the qgroup rescan wait
ioctl to either not wait for the rescan worker to complete or to hang
forever due to missing wake ups. The following diagram shows a sequence
of steps that illustrates the race.
CPU 1 CPU 2 CPU 3
btrfs_ioctl_quota_rescan()
btrfs_qgroup_rescan()
qgroup_rescan_init()
mutex_lock(&fs_info->qgroup_rescan_lock)
spin_lock(&fs_info->qgroup_lock)
fs_info->qgroup_flags |=
BTRFS_QGROUP_STATUS_FLAG_RESCAN
init_completion(
&fs_info->qgroup_rescan_completion)
fs_info->qgroup_rescan_running = true
mutex_unlock(&fs_info->qgroup_rescan_lock)
spin_unlock(&fs_info->qgroup_lock)
btrfs_init_work()
--> starts the worker
btrfs_qgroup_rescan_worker()
mutex_lock(&fs_info->qgroup_rescan_lock)
fs_info->qgroup_flags &=
~BTRFS_QGROUP_STATUS_FLAG_RESCAN
mutex_unlock(&fs_info->qgroup_rescan_lock)
starts transaction, updates qgroup status
item, etc
btrfs_ioctl_quota_rescan()
btrfs_qgroup_rescan()
qgroup_rescan_init()
mutex_lock(&fs_info->qgroup_rescan_lock)
spin_lock(&fs_info->qgroup_lock)
fs_info->qgroup_flags |=
BTRFS_QGROUP_STATUS_FLAG_RESCAN
init_completion(
&fs_info->qgroup_rescan_completion)
fs_info->qgroup_rescan_running = true
mutex_unlock(&fs_info->qgroup_rescan_lock)
spin_unlock(&fs_info->qgroup_lock)
btrfs_init_work()
--> starts another worker
mutex_lock(&fs_info->qgroup_rescan_lock)
fs_info->qgroup_rescan_running = false
mutex_unlock(&fs_info->qgroup_rescan_lock)
complete_all(&fs_info->qgroup_rescan_completion)
Before the rescan worker started by the task at CPU 3 completes, if
another task calls btrfs_ioctl_quota_rescan(), it will get -EINPROGRESS
because the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN is set at
fs_info->qgroup_flags, which is expected and correct behaviour.
However, if another task calls btrfs_ioctl_quota_rescan_wait() before the
rescan worker started by the task at CPU 3 completes, it will return
immediately without waiting for the new rescan worker to complete,
because fs_info->qgroup_rescan_running is set to false by CPU 2.
This race makes test case btrfs/171 (from fstests) fail often:
btrfs/171 9s ... - output mismatch (see /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad)
# --- tests/btrfs/171.out 2018-09-16 21:30:48.505104287 +0100
# +++ /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad 2019-09-19 02:01:36.938486039 +0100
# @@ -1,2 +1,3 @@
# QA output created by 171
# +ERROR: quota rescan failed: Operation now in progress
# Silence is golden
# ...
# (Run 'diff -u /home/fdmanana/git/hub/xfstests/tests/btrfs/171.out /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad' to see the entire diff)
That is because the test calls the btrfs-progs commands "qgroup quota
rescan -w", "qgroup assign" and "qgroup remove" in a sequence that makes
calls to the rescan start ioctl fail with -EINPROGRESS (note the "btrfs"
commands 'qgroup assign' and 'qgroup remove' often call the rescan start
ioctl after calling the qgroup assign ioctl,
btrfs_ioctl_qgroup_assign()), since previous waits didn't actually wait
for a rescan worker to complete.
Another problem the race can cause is missing wake ups for waiters,
since the call to complete_all() happens outside a critical section and
after clearing the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN. In the sequence
diagram above, if we have a waiter for the first rescan task (executed
by CPU 2), then fs_info->qgroup_rescan_completion.wait is not empty, and
if after the rescan worker clears BTRFS_QGROUP_STATUS_FLAG_RESCAN and
before it calls complete_all() against
fs_info->qgroup_rescan_completion, the task at CPU 3 calls
init_completion() against fs_info->qgroup_rescan_completion which
re-initializes its wait queue to an empty queue, therefore causing the
rescan worker at CPU 2 to call complete_all() against an empty queue,
never waking up the task waiting for that rescan worker.
Fix this by clearing BTRFS_QGROUP_STATUS_FLAG_RESCAN and setting
fs_info->qgroup_rescan_running to false in the same critical section,
delimited by the mutex fs_info->qgroup_rescan_lock, as well as doing the
call to complete_all() in that same critical section. This gives the
protection needed to avoid rescan wait ioctl callers not waiting for a
running rescan worker and the lost wake ups problem, since setting that
rescan flag and boolean as well as initializing the wait queue is done
already in a critical section delimited by that mutex (at
qgroup_rescan_init()).
Fixes: 57254b6ebce4ce ("Btrfs: add ioctl to wait for qgroup rescan completion")
Fixes: d2c609b834d62f ("btrfs: properly track when rescan worker is running")
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2019-09-24 11:49:54 +02:00
|
|
|
fs_info->qgroup_rescan_running = false;
|
|
|
|
complete_all(&fs_info->qgroup_rescan_completion);
|
|
|
|
mutex_unlock(&fs_info->qgroup_rescan_lock);
|
|
|
|
|
|
|
|
if (!trans)
|
|
|
|
return;
|
|
|
|
|
2016-09-10 03:39:03 +02:00
|
|
|
btrfs_end_transaction(trans);
|
2015-02-27 09:24:25 +01:00
|
|
|
|
btrfs: fix transaction leak and crash after RO remount caused by qgroup rescan
[ Upstream commit cb13eea3b49055bd78e6ddf39defd6340f7379fc ]
If we remount a filesystem in RO mode while the qgroup rescan worker is
running, we can end up having it still running after the remount is done,
and at unmount time we may end up with an open transaction that ends up
never getting committed. If that happens we end up with several memory
leaks and can crash when hardware acceleration is unavailable for crc32c.
Possibly it can lead to other nasty surprises too, due to use-after-free
issues.
The following steps explain how the problem happens.
1) We have a filesystem mounted in RW mode and the qgroup rescan worker is
running;
2) We remount the filesystem in RO mode, and never stop/pause the rescan
worker, so after the remount the rescan worker is still running. The
important detail here is that the rescan task is still running after
the remount operation committed any ongoing transaction through its
call to btrfs_commit_super();
3) The rescan is still running, and after the remount completed, the
rescan worker started a transaction, after it finished iterating all
leaves of the extent tree, to update the qgroup status item in the
quotas tree. It does not commit the transaction, it only releases its
handle on the transaction;
4) A filesystem unmount operation starts shortly after;
5) The unmount task, at close_ctree(), stops the transaction kthread,
which had not had a chance to commit the open transaction since it was
sleeping and the commit interval (default of 30 seconds) has not yet
elapsed since the last time it committed a transaction;
6) So after stopping the transaction kthread we still have the transaction
used to update the qgroup status item open. At close_ctree(), when the
filesystem is in RO mode and no transaction abort happened (or the
filesystem is in error mode), we do not expect to have any transaction
open, so we do not call btrfs_commit_super();
7) We then proceed to destroy the work queues, free the roots and block
groups, etc. After that we drop the last reference on the btree inode
by calling iput() on it. Since there are dirty pages for the btree
inode, corresponding to the COWed extent buffer for the quotas btree,
btree_write_cache_pages() is invoked to flush those dirty pages. This
results in creating a bio and submitting it, which makes us end up at
btrfs_submit_metadata_bio();
8) At btrfs_submit_metadata_bio() we end up at the if-then-else branch
that calls btrfs_wq_submit_bio(), because check_async_write() returned
a value of 1. This value of 1 is because we did not have hardware
acceleration available for crc32c, so BTRFS_FS_CSUM_IMPL_FAST was not
set in fs_info->flags;
9) Then at btrfs_wq_submit_bio() we call btrfs_queue_work() against the
workqueue at fs_info->workers, which was already freed before by the
call to btrfs_stop_all_workers() at close_ctree(). This results in an
invalid memory access due to a use-after-free, leading to a crash.
When this happens, before the crash there are several warnings triggered,
since we have reserved metadata space in a block group, the delayed refs
reservation, etc:
------------[ cut here ]------------
WARNING: CPU: 4 PID: 1729896 at fs/btrfs/block-group.c:125 btrfs_put_block_group+0x63/0xa0 [btrfs]
Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
CPU: 4 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:btrfs_put_block_group+0x63/0xa0 [btrfs]
Code: f0 01 00 00 48 39 c2 75 (...)
RSP: 0018:ffffb270826bbdd8 EFLAGS: 00010206
RAX: 0000000000000001 RBX: ffff947ed73e4000 RCX: ffff947ebc8b29c8
RDX: 0000000000000001 RSI: ffffffffc0b150a0 RDI: ffff947ebc8b2800
RBP: ffff947ebc8b2800 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ed73e4110
R13: ffff947ed73e4160 R14: ffff947ebc8b2988 R15: dead000000000100
FS: 00007f15edfea840(0000) GS:ffff9481ad600000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f37e2893320 CR3: 0000000138f68001 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
btrfs_free_block_groups+0x17f/0x2f0 [btrfs]
close_ctree+0x2ba/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f15ee221ee7
Code: ff 0b 00 f7 d8 64 89 01 48 (...)
RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7
RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000
RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0
R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000
R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last enabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace dd74718fef1ed5c6 ]---
------------[ cut here ]------------
WARNING: CPU: 2 PID: 1729896 at fs/btrfs/block-rsv.c:459 btrfs_release_global_block_rsv+0x70/0xc0 [btrfs]
Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
CPU: 2 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:btrfs_release_global_block_rsv+0x70/0xc0 [btrfs]
Code: 48 83 bb b0 03 00 00 00 (...)
RSP: 0018:ffffb270826bbdd8 EFLAGS: 00010206
RAX: 000000000033c000 RBX: ffff947ed73e4000 RCX: 0000000000000000
RDX: 0000000000000001 RSI: ffffffffc0b0d8c1 RDI: 00000000ffffffff
RBP: ffff947ebc8b7000 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ed73e4110
R13: ffff947ed73e5278 R14: dead000000000122 R15: dead000000000100
FS: 00007f15edfea840(0000) GS:ffff9481aca00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000561a79f76e20 CR3: 0000000138f68006 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
btrfs_free_block_groups+0x24c/0x2f0 [btrfs]
close_ctree+0x2ba/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f15ee221ee7
Code: ff 0b 00 f7 d8 64 89 01 (...)
RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7
RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000
RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0
R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000
R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last enabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace dd74718fef1ed5c7 ]---
------------[ cut here ]------------
WARNING: CPU: 2 PID: 1729896 at fs/btrfs/block-group.c:3377 btrfs_free_block_groups+0x25d/0x2f0 [btrfs]
Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
CPU: 5 PID: 1729896 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:btrfs_free_block_groups+0x25d/0x2f0 [btrfs]
Code: ad de 49 be 22 01 00 (...)
RSP: 0018:ffffb270826bbde8 EFLAGS: 00010206
RAX: ffff947ebeae1d08 RBX: ffff947ed73e4000 RCX: 0000000000000000
RDX: 0000000000000001 RSI: ffff947e9d823ae8 RDI: 0000000000000246
RBP: ffff947ebeae1d08 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ebeae1c00
R13: ffff947ed73e5278 R14: dead000000000122 R15: dead000000000100
FS: 00007f15edfea840(0000) GS:ffff9481ad200000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f1475d98ea8 CR3: 0000000138f68005 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
close_ctree+0x2ba/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f15ee221ee7
Code: ff 0b 00 f7 d8 64 89 (...)
RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7
RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000
RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0
R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000
R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60
irq event stamp: 0
hardirqs last enabled at (0): [<0000000000000000>] 0x0
hardirqs last disabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last enabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace dd74718fef1ed5c8 ]---
BTRFS info (device sdc): space_info 4 has 268238848 free, is not full
BTRFS info (device sdc): space_info total=268435456, used=114688, pinned=0, reserved=16384, may_use=0, readonly=65536
BTRFS info (device sdc): global_block_rsv: size 0 reserved 0
BTRFS info (device sdc): trans_block_rsv: size 0 reserved 0
BTRFS info (device sdc): chunk_block_rsv: size 0 reserved 0
BTRFS info (device sdc): delayed_block_rsv: size 0 reserved 0
BTRFS info (device sdc): delayed_refs_rsv: size 524288 reserved 0
And the crash, which only happens when we do not have crc32c hardware
acceleration, produces the following trace immediately after those
warnings:
stack segment: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI
CPU: 2 PID: 1749129 Comm: umount Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
RIP: 0010:btrfs_queue_work+0x36/0x190 [btrfs]
Code: 54 55 53 48 89 f3 (...)
RSP: 0018:ffffb27082443ae8 EFLAGS: 00010282
RAX: 0000000000000004 RBX: ffff94810ee9ad90 RCX: 0000000000000000
RDX: 0000000000000001 RSI: ffff94810ee9ad90 RDI: ffff947ed8ee75a0
RBP: a56b6b6b6b6b6b6b R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000007 R11: 0000000000000001 R12: ffff947fa9b435a8
R13: ffff94810ee9ad90 R14: 0000000000000000 R15: ffff947e93dc0000
FS: 00007f3cfe974840(0000) GS:ffff9481ac600000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f1b42995a70 CR3: 0000000127638003 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
btrfs_wq_submit_bio+0xb3/0xd0 [btrfs]
btrfs_submit_metadata_bio+0x44/0xc0 [btrfs]
submit_one_bio+0x61/0x70 [btrfs]
btree_write_cache_pages+0x414/0x450 [btrfs]
? kobject_put+0x9a/0x1d0
? trace_hardirqs_on+0x1b/0xf0
? _raw_spin_unlock_irqrestore+0x3c/0x60
? free_debug_processing+0x1e1/0x2b0
do_writepages+0x43/0xe0
? lock_acquired+0x199/0x490
__writeback_single_inode+0x59/0x650
writeback_single_inode+0xaf/0x120
write_inode_now+0x94/0xd0
iput+0x187/0x2b0
close_ctree+0x2c6/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f3cfebabee7
Code: ff 0b 00 f7 d8 64 89 01 (...)
RSP: 002b:00007ffc9c9a05f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 00007f3cfecd1264 RCX: 00007f3cfebabee7
RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 0000562b6b478000
RBP: 0000562b6b473a30 R08: 0000000000000000 R09: 00007f3cfec6cbe0
R10: 0000562b6b479fe0 R11: 0000000000000246 R12: 0000000000000000
R13: 0000562b6b478000 R14: 0000562b6b473b40 R15: 0000562b6b473c60
Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
---[ end trace dd74718fef1ed5cc ]---
Finally when we remove the btrfs module (rmmod btrfs), there are several
warnings about objects that were allocated from our slabs but were never
freed, as a consequence of the transaction that was never committed and got
leaked:
=============================================================================
BUG btrfs_delayed_ref_head (Tainted: G B W ): Objects remaining in btrfs_delayed_ref_head on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
INFO: Slab 0x0000000094c2ae56 objects=24 used=2 fp=0x000000002bfa2521 flags=0x17fffc000010200
CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
slab_err+0xb7/0xdc
? lock_acquired+0x199/0x490
__kmem_cache_shutdown+0x1ac/0x3c0
? lock_release+0x20e/0x4c0
kmem_cache_destroy+0x55/0x120
btrfs_delayed_ref_exit+0x11/0x35 [btrfs]
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
INFO: Object 0x0000000050cbdd61 @offset=12104
INFO: Allocated in btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] age=1894 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs]
btrfs_free_tree_block+0x128/0x360 [btrfs]
__btrfs_cow_block+0x489/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
btrfs_mount+0x13b/0x3e0 [btrfs]
INFO: Freed in __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] age=4292 cpu=2 pid=1729526
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0x1117/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
commit_cowonly_roots+0xfb/0x300 [btrfs]
btrfs_commit_transaction+0x367/0xc40 [btrfs]
sync_filesystem+0x74/0x90
generic_shutdown_super+0x22/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
INFO: Object 0x0000000086e9b0ff @offset=12776
INFO: Allocated in btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] age=1900 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs]
btrfs_alloc_tree_block+0x2bf/0x360 [btrfs]
alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs]
__btrfs_cow_block+0x12d/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
INFO: Freed in __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] age=3141 cpu=6 pid=1729803
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0x1117/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
btrfs_write_dirty_block_groups+0x17d/0x3d0 [btrfs]
commit_cowonly_roots+0x248/0x300 [btrfs]
btrfs_commit_transaction+0x367/0xc40 [btrfs]
close_ctree+0x113/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
kmem_cache_destroy btrfs_delayed_ref_head: Slab cache still has objects
CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
kmem_cache_destroy+0x119/0x120
btrfs_delayed_ref_exit+0x11/0x35 [btrfs]
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 0b (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
=============================================================================
BUG btrfs_delayed_tree_ref (Tainted: G B W ): Objects remaining in btrfs_delayed_tree_ref on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
INFO: Slab 0x0000000011f78dc0 objects=37 used=2 fp=0x0000000032d55d91 flags=0x17fffc000010200
CPU: 3 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
slab_err+0xb7/0xdc
? lock_acquired+0x199/0x490
__kmem_cache_shutdown+0x1ac/0x3c0
? lock_release+0x20e/0x4c0
kmem_cache_destroy+0x55/0x120
btrfs_delayed_ref_exit+0x1d/0x35 [btrfs]
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
INFO: Object 0x000000001a340018 @offset=4408
INFO: Allocated in btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] age=1917 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs]
btrfs_free_tree_block+0x128/0x360 [btrfs]
__btrfs_cow_block+0x489/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
btrfs_mount+0x13b/0x3e0 [btrfs]
INFO: Freed in __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] age=4167 cpu=4 pid=1729795
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0x63d/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
btrfs_commit_transaction+0x60/0xc40 [btrfs]
create_subvol+0x56a/0x990 [btrfs]
btrfs_mksubvol+0x3fb/0x4a0 [btrfs]
__btrfs_ioctl_snap_create+0x119/0x1a0 [btrfs]
btrfs_ioctl_snap_create+0x58/0x80 [btrfs]
btrfs_ioctl+0x1a92/0x36f0 [btrfs]
__x64_sys_ioctl+0x83/0xb0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
INFO: Object 0x000000002b46292a @offset=13648
INFO: Allocated in btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs] age=1923 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_add_delayed_tree_ref+0x9e/0x480 [btrfs]
btrfs_alloc_tree_block+0x2bf/0x360 [btrfs]
alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs]
__btrfs_cow_block+0x12d/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
INFO: Freed in __btrfs_run_delayed_refs+0x63d/0x1290 [btrfs] age=3164 cpu=6 pid=1729803
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0x63d/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
commit_cowonly_roots+0xfb/0x300 [btrfs]
btrfs_commit_transaction+0x367/0xc40 [btrfs]
close_ctree+0x113/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
kmem_cache_destroy btrfs_delayed_tree_ref: Slab cache still has objects
CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
kmem_cache_destroy+0x119/0x120
btrfs_delayed_ref_exit+0x1d/0x35 [btrfs]
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
=============================================================================
BUG btrfs_delayed_extent_op (Tainted: G B W ): Objects remaining in btrfs_delayed_extent_op on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
INFO: Slab 0x00000000f145ce2f objects=22 used=1 fp=0x00000000af0f92cf flags=0x17fffc000010200
CPU: 5 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
slab_err+0xb7/0xdc
? lock_acquired+0x199/0x490
__kmem_cache_shutdown+0x1ac/0x3c0
? __mutex_unlock_slowpath+0x45/0x2a0
kmem_cache_destroy+0x55/0x120
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 f5 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
INFO: Object 0x000000004cf95ea8 @offset=6264
INFO: Allocated in btrfs_alloc_tree_block+0x1e0/0x360 [btrfs] age=1931 cpu=6 pid=1729873
__slab_alloc.isra.0+0x109/0x1c0
kmem_cache_alloc+0x7bb/0x830
btrfs_alloc_tree_block+0x1e0/0x360 [btrfs]
alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs]
__btrfs_cow_block+0x12d/0x5f0 [btrfs]
btrfs_cow_block+0xf7/0x220 [btrfs]
btrfs_search_slot+0x62a/0xc40 [btrfs]
btrfs_del_orphan_item+0x65/0xd0 [btrfs]
btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
open_ctree+0x125a/0x18a0 [btrfs]
btrfs_mount_root.cold+0x13/0xed [btrfs]
legacy_get_tree+0x30/0x60
vfs_get_tree+0x28/0xe0
fc_mount+0xe/0x40
vfs_kern_mount.part.0+0x71/0x90
btrfs_mount+0x13b/0x3e0 [btrfs]
INFO: Freed in __btrfs_run_delayed_refs+0xabd/0x1290 [btrfs] age=3173 cpu=6 pid=1729803
kmem_cache_free+0x34c/0x3c0
__btrfs_run_delayed_refs+0xabd/0x1290 [btrfs]
btrfs_run_delayed_refs+0x81/0x210 [btrfs]
commit_cowonly_roots+0xfb/0x300 [btrfs]
btrfs_commit_transaction+0x367/0xc40 [btrfs]
close_ctree+0x113/0x2fa [btrfs]
generic_shutdown_super+0x6c/0x100
kill_anon_super+0x14/0x30
btrfs_kill_super+0x12/0x20 [btrfs]
deactivate_locked_super+0x31/0x70
cleanup_mnt+0x100/0x160
task_work_run+0x68/0xb0
exit_to_user_mode_prepare+0x1bb/0x1c0
syscall_exit_to_user_mode+0x4b/0x260
entry_SYSCALL_64_after_hwframe+0x44/0xa9
kmem_cache_destroy btrfs_delayed_extent_op: Slab cache still has objects
CPU: 3 PID: 1729921 Comm: rmmod Tainted: G B W 5.10.0-rc4-btrfs-next-73 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0x8d/0xb5
kmem_cache_destroy+0x119/0x120
exit_btrfs_fs+0xa/0x59 [btrfs]
__x64_sys_delete_module+0x194/0x260
? fpregs_assert_state_consistent+0x1e/0x40
? exit_to_user_mode_prepare+0x55/0x1c0
? trace_hardirqs_on+0x1b/0xf0
do_syscall_64+0x33/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f693e305897
Code: 73 01 c3 48 8b 0d f9 (...)
RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
BTRFS: state leak: start 30408704 end 30425087 state 1 in tree 1 refs 1
Fix this issue by having the remount path stop the qgroup rescan worker
when we are remounting RO and teach the rescan worker to stop when a
remount is in progress. If later a remount in RW mode happens, we are
already resuming the qgroup rescan worker through the call to
btrfs_qgroup_rescan_resume(), so we do not need to worry about that.
Tested-by: Fabian Vogt <fvogt@suse.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
2020-12-14 11:10:45 +01:00
|
|
|
if (stopped) {
|
2015-11-05 00:56:16 +01:00
|
|
|
btrfs_info(fs_info, "qgroup scan paused");
|
|
|
|
} else if (err >= 0) {
|
2013-12-20 17:37:06 +01:00
|
|
|
btrfs_info(fs_info, "qgroup scan completed%s",
|
2015-02-27 09:24:24 +01:00
|
|
|
err > 0 ? " (inconsistency flag cleared)" : "");
|
2013-04-25 18:04:51 +02:00
|
|
|
} else {
|
2013-12-20 17:37:06 +01:00
|
|
|
btrfs_err(fs_info, "qgroup scan failed with %d", err);
|
2013-04-25 18:04:51 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restructures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronization problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 17:47:24 +02:00
|
|
|
/*
|
|
|
|
* Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
|
|
|
|
* memory required for the rescan context.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
|
|
|
|
int init_flags)
|
2013-04-25 18:04:51 +02:00
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
|
2018-05-02 07:28:03 +02:00
|
|
|
if (!init_flags) {
|
|
|
|
/* we're resuming qgroup rescan at mount time */
|
2018-06-27 01:43:15 +02:00
|
|
|
if (!(fs_info->qgroup_flags &
|
|
|
|
BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
|
2018-05-02 07:28:03 +02:00
|
|
|
btrfs_warn(fs_info,
|
2019-11-18 13:16:44 +01:00
|
|
|
"qgroup rescan init failed, qgroup rescan is not queued");
|
2018-06-27 01:43:15 +02:00
|
|
|
ret = -EINVAL;
|
|
|
|
} else if (!(fs_info->qgroup_flags &
|
|
|
|
BTRFS_QGROUP_STATUS_FLAG_ON)) {
|
2018-05-02 07:28:03 +02:00
|
|
|
btrfs_warn(fs_info,
|
2019-11-18 13:16:44 +01:00
|
|
|
"qgroup rescan init failed, qgroup is not enabled");
|
2018-06-27 01:43:15 +02:00
|
|
|
ret = -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 17:47:24 +02:00
|
|
|
}
|
2013-04-25 18:04:51 +02:00
|
|
|
|
|
|
|
mutex_lock(&fs_info->qgroup_rescan_lock);
|
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 17:47:24 +02:00
|
|
|
|
|
|
|
if (init_flags) {
|
2018-05-02 07:28:03 +02:00
|
|
|
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"qgroup rescan is already in progress");
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 17:47:24 +02:00
|
|
|
ret = -EINPROGRESS;
|
2018-05-02 07:28:03 +02:00
|
|
|
} else if (!(fs_info->qgroup_flags &
|
|
|
|
BTRFS_QGROUP_STATUS_FLAG_ON)) {
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"qgroup rescan init failed, qgroup is not enabled");
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 17:47:24 +02:00
|
|
|
ret = -EINVAL;
|
2018-05-02 07:28:03 +02:00
|
|
|
}
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 17:47:24 +02:00
|
|
|
|
|
|
|
if (ret) {
|
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
|
|
|
mutex_unlock(&fs_info->qgroup_rescan_lock);
|
2018-05-02 07:28:03 +02:00
|
|
|
return ret;
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 17:47:24 +02:00
|
|
|
}
|
|
|
|
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
|
2013-04-25 18:04:51 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
memset(&fs_info->qgroup_rescan_progress, 0,
|
|
|
|
sizeof(fs_info->qgroup_rescan_progress));
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 17:47:24 +02:00
|
|
|
fs_info->qgroup_rescan_progress.objectid = progress_objectid;
|
2015-11-05 11:06:23 +01:00
|
|
|
init_completion(&fs_info->qgroup_rescan_completion);
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 17:47:24 +02:00
|
|
|
|
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
|
|
|
mutex_unlock(&fs_info->qgroup_rescan_lock);
|
|
|
|
|
|
|
|
memset(&fs_info->qgroup_rescan_work, 0,
|
|
|
|
sizeof(fs_info->qgroup_rescan_work));
|
2014-02-28 03:46:16 +01:00
|
|
|
btrfs_init_work(&fs_info->qgroup_rescan_work,
|
Btrfs: fix task hang under heavy compressed write
This has been reported and discussed for a long time, and this hang occurs in
both 3.15 and 3.16.
Btrfs now migrates to use kernel workqueue, but it introduces this hang problem.
Btrfs has a kind of work queued as an ordered way, which means that its
ordered_func() must be processed in the way of FIFO, so it usually looks like --
normal_work_helper(arg)
work = container_of(arg, struct btrfs_work, normal_work);
work->func() <---- (we name it work X)
for ordered_work in wq->ordered_list
ordered_work->ordered_func()
ordered_work->ordered_free()
The hang is a rare case, first when we find free space, we get an uncached block
group, then we go to read its free space cache inode for free space information,
so it will
file a readahead request
btrfs_readpages()
for page that is not in page cache
__do_readpage()
submit_extent_page()
btrfs_submit_bio_hook()
btrfs_bio_wq_end_io()
submit_bio()
end_workqueue_bio() <--(ret by the 1st endio)
queue a work(named work Y) for the 2nd
also the real endio()
So the hang occurs when work Y's work_struct and work X's work_struct happens
to share the same address.
A bit more explanation,
A,B,C -- struct btrfs_work
arg -- struct work_struct
kthread:
worker_thread()
pick up a work_struct from @worklist
process_one_work(arg)
worker->current_work = arg; <-- arg is A->normal_work
worker->current_func(arg)
normal_work_helper(arg)
A = container_of(arg, struct btrfs_work, normal_work);
A->func()
A->ordered_func()
A->ordered_free() <-- A gets freed
B->ordered_func()
submit_compressed_extents()
find_free_extent()
load_free_space_inode()
... <-- (the above readhead stack)
end_workqueue_bio()
btrfs_queue_work(work C)
B->ordered_free()
As if work A has a high priority in wq->ordered_list and there are more ordered
works queued after it, such as B->ordered_func(), its memory could have been
freed before normal_work_helper() returns, which means that kernel workqueue
code worker_thread() still has worker->current_work pointer to be work
A->normal_work's, ie. arg's address.
Meanwhile, work C is allocated after work A is freed, work C->normal_work
and work A->normal_work are likely to share the same address(I confirmed this
with ftrace output, so I'm not just guessing, it's rare though).
When another kthread picks up work C->normal_work to process, and finds our
kthread is processing it(see find_worker_executing_work()), it'll think
work C as a collision and skip then, which ends up nobody processing work C.
So the situation is that our kthread is waiting forever on work C.
Besides, there're other cases that can lead to deadlock, but the real problem
is that all btrfs workqueue shares one work->func, -- normal_work_helper,
so this makes each workqueue to have its own helper function, but only a
wraper pf normal_work_helper.
With this patch, I no long hit the above hang.
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-15 17:36:53 +02:00
|
|
|
btrfs_qgroup_rescan_helper,
|
2014-02-28 03:46:16 +01:00
|
|
|
btrfs_qgroup_rescan_worker, NULL, NULL);
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 17:47:24 +02:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
struct rb_node *n;
|
|
|
|
struct btrfs_qgroup *qgroup;
|
|
|
|
|
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
2013-04-25 18:04:51 +02:00
|
|
|
/* clear all current qgroup tracking information */
|
|
|
|
for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
|
|
|
|
qgroup = rb_entry(n, struct btrfs_qgroup, node);
|
|
|
|
qgroup->rfer = 0;
|
|
|
|
qgroup->rfer_cmpr = 0;
|
|
|
|
qgroup->excl = 0;
|
|
|
|
qgroup->excl_cmpr = 0;
|
2018-08-10 04:20:26 +02:00
|
|
|
qgroup_dirty(fs_info, qgroup);
|
2013-04-25 18:04:51 +02:00
|
|
|
}
|
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 17:47:24 +02:00
|
|
|
}
|
2013-04-25 18:04:51 +02:00
|
|
|
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restructures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronization problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 17:47:24 +02:00
|
|
|
int
|
|
|
|
btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
|
|
|
|
ret = qgroup_rescan_init(fs_info, 0, 1);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We have set the rescan_progress to 0, which means no more
|
|
|
|
* delayed refs will be accounted by btrfs_qgroup_account_ref.
|
|
|
|
* However, btrfs_qgroup_account_ref may be right after its call
|
|
|
|
* to btrfs_find_all_roots, in which case it would still do the
|
|
|
|
* accounting.
|
|
|
|
* To solve this, we're committing the transaction, which will
|
|
|
|
* ensure we run all delayed refs and only after that, we are
|
|
|
|
* going to clear all tracking information for a clean start.
|
|
|
|
*/
|
|
|
|
|
|
|
|
trans = btrfs_join_transaction(fs_info->fs_root);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
|
|
|
|
return PTR_ERR(trans);
|
|
|
|
}
|
2016-09-10 03:39:03 +02:00
|
|
|
ret = btrfs_commit_transaction(trans);
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 17:47:24 +02:00
|
|
|
if (ret) {
|
|
|
|
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
qgroup_rescan_zero_tracking(fs_info);
|
|
|
|
|
2020-02-07 06:38:20 +01:00
|
|
|
mutex_lock(&fs_info->qgroup_rescan_lock);
|
|
|
|
fs_info->qgroup_rescan_running = true;
|
2014-02-28 03:46:16 +01:00
|
|
|
btrfs_queue_work(fs_info->qgroup_rescan_workers,
|
|
|
|
&fs_info->qgroup_rescan_work);
|
2020-02-07 06:38:20 +01:00
|
|
|
mutex_unlock(&fs_info->qgroup_rescan_lock);
|
2013-04-25 18:04:51 +02:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2013-05-06 21:14:17 +02:00
|
|
|
|
2016-08-09 04:08:06 +02:00
|
|
|
int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
|
|
|
|
bool interruptible)
|
2013-05-06 21:14:17 +02:00
|
|
|
{
|
|
|
|
int running;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
mutex_lock(&fs_info->qgroup_rescan_lock);
|
|
|
|
spin_lock(&fs_info->qgroup_lock);
|
2016-08-15 18:10:33 +02:00
|
|
|
running = fs_info->qgroup_rescan_running;
|
2013-05-06 21:14:17 +02:00
|
|
|
spin_unlock(&fs_info->qgroup_lock);
|
|
|
|
mutex_unlock(&fs_info->qgroup_rescan_lock);
|
|
|
|
|
2016-08-09 04:08:06 +02:00
|
|
|
if (!running)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (interruptible)
|
2013-05-06 21:14:17 +02:00
|
|
|
ret = wait_for_completion_interruptible(
|
|
|
|
&fs_info->qgroup_rescan_completion);
|
2016-08-09 04:08:06 +02:00
|
|
|
else
|
|
|
|
wait_for_completion(&fs_info->qgroup_rescan_completion);
|
2013-05-06 21:14:17 +02:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restructures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronization problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 17:47:24 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* this is only called from open_ctree where we're still single threaded, thus
|
|
|
|
* locking is omitted here.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
2020-02-07 06:38:20 +01:00
|
|
|
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
|
|
|
|
mutex_lock(&fs_info->qgroup_rescan_lock);
|
|
|
|
fs_info->qgroup_rescan_running = true;
|
2014-02-28 03:46:16 +01:00
|
|
|
btrfs_queue_work(fs_info->qgroup_rescan_workers,
|
|
|
|
&fs_info->qgroup_rescan_work);
|
2020-02-07 06:38:20 +01:00
|
|
|
mutex_unlock(&fs_info->qgroup_rescan_lock);
|
|
|
|
}
|
Btrfs: fix qgroup rescan resume on mount
When called during mount, we cannot start the rescan worker thread until
open_ctree is done. This commit restuctures the qgroup rescan internals to
enable a clean deferral of the rescan resume operation.
First of all, the struct qgroup_rescan is removed, saving us a malloc and
some initialization synchronizations problems. Its only element (the worker
struct) now lives within fs_info just as the rest of the rescan code.
Then setting up a rescan worker is split into several reusable stages.
Currently we have three different rescan startup scenarios:
(A) rescan ioctl
(B) rescan resume by mount
(C) rescan by quota enable
Each case needs its own combination of the four following steps:
(1) set the progress [A, C: zero; B: state of umount]
(2) commit the transaction [A]
(3) set the counters [A, C: zero; B: state of umount]
(4) start worker [A, B, C]
qgroup_rescan_init does step (1). There's no extra function added to commit
a transaction, we've got that already. qgroup_rescan_zero_tracking does
step (3). Step (4) is nothing more than a call to the generic
btrfs_queue_worker.
We also get rid of a double check for the rescan progress during
btrfs_qgroup_account_ref, which is no longer required due to having step 2
from the list above.
As a side effect, this commit prepares to move the rescan start code from
btrfs_run_qgroups (which is run during commit) to a less time critical
section.
Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-05-28 17:47:24 +02:00
|
|
|
}
|
2015-10-12 10:05:40 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Reserve qgroup space for range [start, start + len).
|
|
|
|
*
|
|
|
|
* This function will either reserve space from related qgroups or doing
|
|
|
|
* nothing if the range is already reserved.
|
|
|
|
*
|
|
|
|
* Return 0 for successful reserve
|
|
|
|
* Return <0 for error (including -EQUOT)
|
|
|
|
*
|
|
|
|
* NOTE: this function may sleep for memory allocation.
|
2017-02-27 08:10:38 +01:00
|
|
|
* if btrfs_qgroup_reserve_data() is called multiple times with
|
|
|
|
* same @reserved, caller must ensure when error happens it's OK
|
|
|
|
* to free *ALL* reserved space.
|
2015-10-12 10:05:40 +02:00
|
|
|
*/
|
2017-02-27 08:10:38 +01:00
|
|
|
int btrfs_qgroup_reserve_data(struct inode *inode,
|
|
|
|
struct extent_changeset **reserved_ret, u64 start,
|
|
|
|
u64 len)
|
2015-10-12 10:05:40 +02:00
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct ulist_node *unode;
|
|
|
|
struct ulist_iterator uiter;
|
2017-02-27 08:10:38 +01:00
|
|
|
struct extent_changeset *reserved;
|
|
|
|
u64 orig_reserved;
|
|
|
|
u64 to_reserve;
|
2015-10-12 10:05:40 +02:00
|
|
|
int ret;
|
|
|
|
|
2016-09-02 21:40:02 +02:00
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
|
|
|
|
!is_fstree(root->objectid) || len == 0)
|
2015-10-12 10:05:40 +02:00
|
|
|
return 0;
|
|
|
|
|
2017-02-27 08:10:38 +01:00
|
|
|
/* @reserved parameter is mandatory for qgroup */
|
|
|
|
if (WARN_ON(!reserved_ret))
|
|
|
|
return -EINVAL;
|
|
|
|
if (!*reserved_ret) {
|
|
|
|
*reserved_ret = extent_changeset_alloc();
|
|
|
|
if (!*reserved_ret)
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
reserved = *reserved_ret;
|
|
|
|
/* Record already reserved space */
|
|
|
|
orig_reserved = reserved->bytes_changed;
|
2015-10-12 10:05:40 +02:00
|
|
|
ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
|
2017-02-27 08:10:38 +01:00
|
|
|
start + len -1, EXTENT_QGROUP_RESERVED, reserved);
|
|
|
|
|
|
|
|
/* Newly reserved space */
|
|
|
|
to_reserve = reserved->bytes_changed - orig_reserved;
|
2015-09-28 10:57:53 +02:00
|
|
|
trace_btrfs_qgroup_reserve_data(inode, start, len,
|
2017-02-27 08:10:38 +01:00
|
|
|
to_reserve, QGROUP_RESERVE);
|
2015-10-12 10:05:40 +02:00
|
|
|
if (ret < 0)
|
|
|
|
goto cleanup;
|
2017-12-12 08:34:25 +01:00
|
|
|
ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
|
2015-10-12 10:05:40 +02:00
|
|
|
if (ret < 0)
|
|
|
|
goto cleanup;
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
cleanup:
|
2017-02-27 08:10:38 +01:00
|
|
|
/* cleanup *ALL* already reserved ranges */
|
2015-10-12 10:05:40 +02:00
|
|
|
ULIST_ITER_INIT(&uiter);
|
2017-02-27 08:10:38 +01:00
|
|
|
while ((unode = ulist_next(&reserved->range_changed, &uiter)))
|
2015-10-12 10:05:40 +02:00
|
|
|
clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
|
2017-10-31 16:37:52 +01:00
|
|
|
unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL);
|
btrfs: qgroup: Fix reserved data space leak if we have multiple reserve calls
commit d4e204948fe3e0dc8e1fbf3f8f3290c9c2823be3 upstream.
[BUG]
The following script can cause btrfs qgroup data space leak:
mkfs.btrfs -f $dev
mount $dev -o nospace_cache $mnt
btrfs subv create $mnt/subv
btrfs quota en $mnt
btrfs quota rescan -w $mnt
btrfs qgroup limit 128m $mnt/subv
for (( i = 0; i < 3; i++)); do
# Create 3 64M holes for latter fallocate to fail
truncate -s 192m $mnt/subv/file
xfs_io -c "pwrite 64m 4k" $mnt/subv/file > /dev/null
xfs_io -c "pwrite 128m 4k" $mnt/subv/file > /dev/null
sync
# it's supposed to fail, and each failure will leak at least 64M
# data space
xfs_io -f -c "falloc 0 192m" $mnt/subv/file &> /dev/null
rm $mnt/subv/file
sync
done
# Shouldn't fail after we removed the file
xfs_io -f -c "falloc 0 64m" $mnt/subv/file
[CAUSE]
Btrfs qgroup data reserve code allow multiple reservations to happen on
a single extent_changeset:
E.g:
btrfs_qgroup_reserve_data(inode, &data_reserved, 0, SZ_1M);
btrfs_qgroup_reserve_data(inode, &data_reserved, SZ_1M, SZ_2M);
btrfs_qgroup_reserve_data(inode, &data_reserved, 0, SZ_4M);
Btrfs qgroup code has its internal tracking to make sure we don't
double-reserve in above example.
The only pattern utilizing this feature is in the main while loop of
btrfs_fallocate() function.
However btrfs_qgroup_reserve_data()'s error handling has a bug in that
on error it clears all ranges in the io_tree with EXTENT_QGROUP_RESERVED
flag but doesn't free previously reserved bytes.
This bug has a two fold effect:
- Clearing EXTENT_QGROUP_RESERVED ranges
This is the correct behavior, but it prevents
btrfs_qgroup_check_reserved_leak() to catch the leakage as the
detector is purely EXTENT_QGROUP_RESERVED flag based.
- Leak the previously reserved data bytes.
The bug manifests when N calls to btrfs_qgroup_reserve_data are made and
the last one fails, leaking space reserved in the previous ones.
[FIX]
Also free previously reserved data bytes when btrfs_qgroup_reserve_data
fails.
Fixes: 524725537023 ("btrfs: qgroup: Introduce btrfs_qgroup_reserve_data function")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2019-09-16 14:02:39 +02:00
|
|
|
/* Also free data bytes of already reserved one */
|
|
|
|
btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid,
|
|
|
|
orig_reserved, BTRFS_QGROUP_RSV_DATA);
|
2017-02-27 08:10:38 +01:00
|
|
|
extent_changeset_release(reserved);
|
2015-10-12 10:05:40 +02:00
|
|
|
return ret;
|
|
|
|
}
|
2015-10-12 10:28:06 +02:00
|
|
|
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserve_data()
| | Range aligned to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analysis)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 08:10:39 +01:00
|
|
|
/* Free ranges specified by @reserved, normally in error path */
|
|
|
|
static int qgroup_free_reserved_data(struct inode *inode,
|
|
|
|
struct extent_changeset *reserved, u64 start, u64 len)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
struct ulist_node *unode;
|
|
|
|
struct ulist_iterator uiter;
|
|
|
|
struct extent_changeset changeset;
|
|
|
|
int freed = 0;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
extent_changeset_init(&changeset);
|
|
|
|
len = round_up(start + len, root->fs_info->sectorsize);
|
|
|
|
start = round_down(start, root->fs_info->sectorsize);
|
|
|
|
|
|
|
|
ULIST_ITER_INIT(&uiter);
|
|
|
|
while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
|
|
|
|
u64 range_start = unode->val;
|
|
|
|
/* unode->aux is the inclusive end */
|
|
|
|
u64 range_len = unode->aux - range_start + 1;
|
|
|
|
u64 free_start;
|
|
|
|
u64 free_len;
|
|
|
|
|
|
|
|
extent_changeset_release(&changeset);
|
|
|
|
|
|
|
|
/* Only free range in range [start, start + len) */
|
|
|
|
if (range_start >= start + len ||
|
|
|
|
range_start + range_len <= start)
|
|
|
|
continue;
|
|
|
|
free_start = max(range_start, start);
|
|
|
|
free_len = min(start + len, range_start + range_len) -
|
|
|
|
free_start;
|
|
|
|
/*
|
|
|
|
* TODO: To also modify reserved->ranges_reserved to reflect
|
|
|
|
* the modification.
|
|
|
|
*
|
|
|
|
* However as long as we free qgroup reserved according to
|
|
|
|
* EXTENT_QGROUP_RESERVED, we won't double free.
|
|
|
|
* So not need to rush.
|
|
|
|
*/
|
btrfs: qgroup: Fix the wrong target io_tree when freeing reserved data space
commit bab32fc069ce8829c416e8737c119f62a57970f9 upstream.
[BUG]
Under the following case with qgroup enabled, if some error happened
after we have reserved delalloc space, then in error handling path, we
could cause qgroup data space leakage:
From btrfs_truncate_block() in inode.c:
ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
block_start, blocksize);
if (ret)
goto out;
again:
page = find_or_create_page(mapping, index, mask);
if (!page) {
btrfs_delalloc_release_space(inode, data_reserved,
block_start, blocksize, true);
btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, true);
ret = -ENOMEM;
goto out;
}
[CAUSE]
In the above case, btrfs_delalloc_reserve_space() will call
btrfs_qgroup_reserve_data() and mark the io_tree range with
EXTENT_QGROUP_RESERVED flag.
In the error handling path, we have the following call stack:
btrfs_delalloc_release_space()
|- btrfs_free_reserved_data_space()
|- btrsf_qgroup_free_data()
|- __btrfs_qgroup_release_data(reserved=@reserved, free=1)
|- qgroup_free_reserved_data(reserved=@reserved)
|- clear_record_extent_bits();
|- freed += changeset.bytes_changed;
However due to a completion bug, qgroup_free_reserved_data() will clear
EXTENT_QGROUP_RESERVED flag in BTRFS_I(inode)->io_failure_tree, other
than the correct BTRFS_I(inode)->io_tree.
Since io_failure_tree is never marked with that flag,
btrfs_qgroup_free_data() will not free any data reserved space at all,
causing a leakage.
This type of error handling can only be triggered by errors outside of
qgroup code. So EDQUOT error from qgroup can't trigger it.
[FIX]
Fix the wrong target io_tree.
Reported-by: Josef Bacik <josef@toxicpanda.com>
Fixes: bc42bda22345 ("btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges")
CC: stable@vger.kernel.org # 4.14+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2019-09-16 14:02:38 +02:00
|
|
|
ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree,
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range alinged to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 08:10:39 +01:00
|
|
|
free_start, free_start + free_len - 1,
|
|
|
|
EXTENT_QGROUP_RESERVED, &changeset);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
freed += changeset.bytes_changed;
|
|
|
|
}
|
2017-12-12 08:34:23 +01:00
|
|
|
btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed,
|
|
|
|
BTRFS_QGROUP_RSV_DATA);
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range alinged to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 08:10:39 +01:00
|
|
|
ret = freed;
|
|
|
|
out:
|
|
|
|
extent_changeset_release(&changeset);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Common helper for releasing or freeing a reserved data range.
 *
 * Clears EXTENT_QGROUP_RESERVED for [start, start + len) in the inode's
 * io_tree. When @free is non-zero the cleared bytes are also given back to
 * the qgroups; when @free is non-zero and @reserved is given, the work is
 * delegated to qgroup_free_reserved_data() so that only ranges recorded in
 * @reserved are touched.
 *
 * Returns 0 when quota is disabled, the number of bytes whose bit was
 * cleared on success, or <0 on error.
 */
static int __btrfs_qgroup_release_data(struct inode *inode,
			struct extent_changeset *reserved, u64 start, u64 len,
			int free)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_changeset changeset;
	int ret;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags))
		return 0;

	/* In release case, we shouldn't have @reserved */
	WARN_ON(!free && reserved);
	if (free && reserved)
		return qgroup_free_reserved_data(inode, reserved, start, len);

	extent_changeset_init(&changeset);
	ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
			start + len - 1, EXTENT_QGROUP_RESERVED, &changeset);
	if (ret < 0)
		goto out;

	trace_btrfs_qgroup_release_data(inode, start, len,
					changeset.bytes_changed,
					free ? QGROUP_FREE : QGROUP_RELEASE);
	if (free)
		btrfs_qgroup_free_refroot(root->fs_info, root->objectid,
					  changeset.bytes_changed,
					  BTRFS_QGROUP_RSV_DATA);
	ret = changeset.bytes_changed;
out:
	extent_changeset_release(&changeset);
	return ret;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Free a reserved space range from io_tree and related qgroups
|
|
|
|
*
|
|
|
|
* Should be called when a range of pages get invalidated before reaching disk.
|
|
|
|
* Or for error cleanup case.
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range alinged to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 08:10:39 +01:00
|
|
|
* if @reserved is given, only reserved range in [@start, @start + @len) will
|
|
|
|
* be freed.
|
2015-10-12 10:28:06 +02:00
|
|
|
*
|
|
|
|
* For data written to disk, use btrfs_qgroup_release_data().
|
|
|
|
*
|
|
|
|
* NOTE: This function may sleep for memory allocation.
|
|
|
|
*/
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range alinged to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 08:10:39 +01:00
|
|
|
int btrfs_qgroup_free_data(struct inode *inode,
			struct extent_changeset *reserved, u64 start, u64 len)
{
	int ret;

	/*
	 * Hand the range to the shared helper with the last argument set to 1
	 * (btrfs_qgroup_release_data() passes 0 there). @reserved is forwarded
	 * untouched so only ranges recorded in it are affected.
	 */
	ret = __btrfs_qgroup_release_data(inode, reserved, start, len, 1);

	return ret;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Release a reserved space range from io_tree only.
|
|
|
|
*
|
|
|
|
* Should be called when a range of pages get written to disk and corresponding
|
|
|
|
* FILE_EXTENT is inserted into corresponding root.
|
|
|
|
*
|
|
|
|
* Since new qgroup accounting framework will only update qgroup numbers at
|
|
|
|
* commit_transaction() time, its reserved space shouldn't be freed from
|
|
|
|
* related qgroups.
|
|
|
|
*
|
|
|
|
* But we should release the range from io_tree, to allow further write to be
|
|
|
|
* COWed.
|
|
|
|
*
|
|
|
|
* NOTE: This function may sleep for memory allocation.
|
|
|
|
*/
|
|
|
|
int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
|
|
|
|
{
|
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges
[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)
Task A | Task B
----------------------------------------------------------------------
Buffered_write [0, 2K) |
|- check_data_free_space() |
| |- qgroup_reserve_data() |
| Range aligned to page |
| range [0, 4K) <<< |
| 4K bytes reserved <<< |
|- copy pages to page cache |
| Buffered_write [2K, 4K)
| |- check_data_free_space()
| | |- qgroup_reserved_data()
| | Range alinged to page
| | range [0, 4K)
| | Already reserved by A <<<
| | 0 bytes reserved <<<
| |- delalloc_reserve_metadata()
| | And it *FAILED* (Maybe EQUOTA)
| |- free_reserved_data_space()
|- qgroup_free_data()
Range aligned to page range
[0, 4K)
Freeing 4K
(Special thanks to Chandan for the detailed report and analyse)
[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.
And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.
[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.
Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 08:10:39 +01:00
|
|
|
return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
|
2015-10-12 10:28:06 +02:00
|
|
|
}
|
2015-09-08 11:08:38 +02:00
|
|
|
|
2017-12-12 08:34:34 +01:00
|
|
|
static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
|
|
|
|
enum btrfs_qgroup_rsv_type type)
|
|
|
|
{
|
|
|
|
if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
|
|
|
|
type != BTRFS_QGROUP_RSV_META_PERTRANS)
|
|
|
|
return;
|
|
|
|
if (num_bytes == 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
spin_lock(&root->qgroup_meta_rsv_lock);
|
|
|
|
if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
|
|
|
|
root->qgroup_meta_rsv_prealloc += num_bytes;
|
|
|
|
else
|
|
|
|
root->qgroup_meta_rsv_pertrans += num_bytes;
|
|
|
|
spin_unlock(&root->qgroup_meta_rsv_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
|
|
|
|
enum btrfs_qgroup_rsv_type type)
|
|
|
|
{
|
|
|
|
if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
|
|
|
|
type != BTRFS_QGROUP_RSV_META_PERTRANS)
|
|
|
|
return 0;
|
|
|
|
if (num_bytes == 0)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
spin_lock(&root->qgroup_meta_rsv_lock);
|
|
|
|
if (type == BTRFS_QGROUP_RSV_META_PREALLOC) {
|
|
|
|
num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc,
|
|
|
|
num_bytes);
|
|
|
|
root->qgroup_meta_rsv_prealloc -= num_bytes;
|
|
|
|
} else {
|
|
|
|
num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans,
|
|
|
|
num_bytes);
|
|
|
|
root->qgroup_meta_rsv_pertrans -= num_bytes;
|
|
|
|
}
|
|
|
|
spin_unlock(&root->qgroup_meta_rsv_lock);
|
|
|
|
return num_bytes;
|
|
|
|
}
|
|
|
|
|
btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans
Btrfs uses 2 different methods to reserve metadata qgroup space.
1) Reserve at btrfs_start_transaction() time
This is quite straightforward, caller will use the trans handler
allocated to modify b-trees.
In this case, reserved metadata should be kept until qgroup numbers
are updated.
2) Reserve by using block_rsv first, and later btrfs_join_transaction()
This is more complicated, caller will reserve space using block_rsv
first, and then later call btrfs_join_transaction() to get a trans
handle.
In this case, before we modify trees, the reserved space can be
modified on demand, and after btrfs_join_transaction(), such reserved
space should also be kept until qgroup numbers are updated.
Since these two types behave differently, split the original "META"
reservation type into 2 sub-types:
META_PERTRANS:
For above case 1)
META_PREALLOC:
For reservations that happened before btrfs_join_transaction() of
case 2)
NOTE: This patch will only convert existing qgroup meta reservation
callers according to its situation, not ensuring all callers are at
correct timing.
Such fix will be added in later patches.
Signed-off-by: Qu Wenruo <wqu@suse.com>
[ update comments ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 08:34:29 +01:00
|
|
|
/*
 * Reserve @num_bytes of metadata qgroup space of the given @type for @root.
 *
 * Returns 0 without doing anything when quota is not enabled, @root is not a
 * fs tree, or @num_bytes is 0. @num_bytes must be a multiple of the
 * filesystem nodesize (enforced with BUG_ON).
 *
 * Return: the result of qgroup_reserve(); a negative value means the
 * reservation failed and nothing was recorded in @root.
 */
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
				enum btrfs_qgroup_rsv_type type, bool enforce)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;
	if (!is_fstree(root->objectid))
		return 0;
	if (num_bytes == 0)
		return 0;

	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
	trace_qgroup_meta_reserve(root, (s64)num_bytes, type);

	ret = qgroup_reserve(root, num_bytes, enforce, type);
	if (ret >= 0) {
		/*
		 * Remember in the root how much was reserved.
		 *
		 * This guards against underflow across a quota
		 * disabled->enabled transition: a later free may cover space
		 * that was never reserved (because quota was disabled at the
		 * time), so the per-root record lets the release path cap
		 * itself at what was really reserved.
		 */
		add_root_meta_rsv(root, num_bytes, type);
	}

	return ret;
}
|
|
|
|
|
btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans
Btrfs uses 2 different methods to reserve metadata qgroup space.
1) Reserve at btrfs_start_transaction() time
This is quite straightforward, caller will use the trans handler
allocated to modify b-trees.
In this case, reserved metadata should be kept until qgroup numbers
are updated.
2) Reserve by using block_rsv first, and later btrfs_join_transaction()
This is more complicated, caller will reserve space using block_rsv
first, and then later call btrfs_join_transaction() to get a trans
handle.
In this case, before we modify trees, the reserved space can be
modified on demand, and after btrfs_join_transaction(), such reserved
space should also be kept until qgroup numbers are updated.
Since these two types behave differently, split the original "META"
reservation type into 2 sub-types:
META_PERTRANS:
For above case 1)
META_PREALLOC:
For reservations that happened before btrfs_join_transaction() of
case 2)
NOTE: This patch will only convert existing qgroup meta reservation
callers according to its situation, not ensuring all callers are at
correct timing.
Such fix will be added in later patches.
Signed-off-by: Qu Wenruo <wqu@suse.com>
[ update comments ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 08:34:29 +01:00
|
|
|
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
|
2015-09-08 11:08:38 +02:00
|
|
|
{
|
2016-06-23 00:54:23 +02:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
2015-09-08 11:08:38 +02:00
|
|
|
|
2016-06-23 00:54:23 +02:00
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
|
2016-09-02 21:40:02 +02:00
|
|
|
!is_fstree(root->objectid))
|
2015-09-08 11:08:38 +02:00
|
|
|
return;
|
|
|
|
|
2017-12-12 08:34:30 +01:00
|
|
|
/* TODO: Update trace point to handle such free */
|
2017-12-12 08:34:35 +01:00
|
|
|
trace_qgroup_meta_free_all_pertrans(root);
|
2017-12-12 08:34:30 +01:00
|
|
|
/* Special value -1 means to free all reserved space */
|
|
|
|
btrfs_qgroup_free_refroot(fs_info, root->objectid, (u64)-1,
|
btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans
Btrfs uses 2 different methods to reseve metadata qgroup space.
1) Reserve at btrfs_start_transaction() time
This is quite straightforward, caller will use the trans handler
allocated to modify b-trees.
In this case, reserved metadata should be kept until qgroup numbers
are updated.
2) Reserve by using block_rsv first, and later btrfs_join_transaction()
This is more complicated, caller will reserve space using block_rsv
first, and then later call btrfs_join_transaction() to get a trans
handle.
In this case, before we modify trees, the reserved space can be
modified on demand, and after btrfs_join_transaction(), such reserved
space should also be kept until qgroup numbers are updated.
Since these two types behave differently, split the original "META"
reservation type into 2 sub-types:
META_PERTRANS:
For above case 1)
META_PREALLOC:
For reservations that happened before btrfs_join_transaction() of
case 2)
NOTE: This patch will only convert existing qgroup meta reservation
callers according to its situation, not ensuring all callers are at
correct timing.
Such fix will be added in later patches.
Signed-off-by: Qu Wenruo <wqu@suse.com>
[ update comments ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 08:34:29 +01:00
|
|
|
BTRFS_QGROUP_RSV_META_PERTRANS);
|
2015-09-08 11:08:38 +02:00
|
|
|
}
|
|
|
|
|
btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans
Btrfs uses 2 different methods to reserve metadata qgroup space.
1) Reserve at btrfs_start_transaction() time
This is quite straightforward, caller will use the trans handler
allocated to modify b-trees.
In this case, reserved metadata should be kept until qgroup numbers
are updated.
2) Reserve by using block_rsv first, and later btrfs_join_transaction()
This is more complicated, caller will reserve space using block_rsv
first, and then later call btrfs_join_transaction() to get a trans
handle.
In this case, before we modify trees, the reserved space can be
modified on demand, and after btrfs_join_transaction(), such reserved
space should also be kept until qgroup numbers are updated.
Since these two types behave differently, split the original "META"
reservation type into 2 sub-types:
META_PERTRANS:
For above case 1)
META_PREALLOC:
For reservations that happened before btrfs_join_transaction() of
case 2)
NOTE: This patch will only convert existing qgroup meta reservation
callers according to its situation, not ensuring all callers are at
correct timing.
Such fix will be added in later patches.
Signed-off-by: Qu Wenruo <wqu@suse.com>
[ update comments ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 08:34:29 +01:00
|
|
|
void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
|
|
|
|
enum btrfs_qgroup_rsv_type type)
|
2015-09-08 11:08:38 +02:00
|
|
|
{
|
2016-06-23 00:54:23 +02:00
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
|
|
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
|
2016-09-02 21:40:02 +02:00
|
|
|
!is_fstree(root->objectid))
|
2015-09-08 11:08:38 +02:00
|
|
|
return;
|
|
|
|
|
2017-12-12 08:34:34 +01:00
|
|
|
/*
|
|
|
|
* reservation for META_PREALLOC can happen before quota is enabled,
|
|
|
|
* which can lead to underflow.
|
|
|
|
* Here ensure we will only free what we really have reserved.
|
|
|
|
*/
|
|
|
|
num_bytes = sub_root_meta_rsv(root, num_bytes, type);
|
2016-06-23 00:54:23 +02:00
|
|
|
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
|
2019-10-17 04:38:36 +02:00
|
|
|
trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
|
btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans
Btrfs uses 2 different methods to reseve metadata qgroup space.
1) Reserve at btrfs_start_transaction() time
This is quite straightforward, caller will use the trans handler
allocated to modify b-trees.
In this case, reserved metadata should be kept until qgroup numbers
are updated.
2) Reserve by using block_rsv first, and later btrfs_join_transaction()
This is more complicated, caller will reserve space using block_rsv
first, and then later call btrfs_join_transaction() to get a trans
handle.
In this case, before we modify trees, the reserved space can be
modified on demand, and after btrfs_join_transaction(), such reserved
space should also be kept until qgroup numbers are updated.
Since these two types behave differently, split the original "META"
reservation type into 2 sub-types:
META_PERTRANS:
For above case 1)
META_PREALLOC:
For reservations that happened before btrfs_join_transaction() of
case 2)
NOTE: This patch will only convert existing qgroup meta reservation
callers according to its situation, not ensuring all callers are at
correct timing.
Such fix will be added in later patches.
Signed-off-by: Qu Wenruo <wqu@suse.com>
[ update comments ]
Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 08:34:29 +01:00
|
|
|
btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, type);
|
2015-09-08 11:08:38 +02:00
|
|
|
}
|
2015-10-13 03:53:10 +02:00
|
|
|
|
2017-12-12 08:34:31 +01:00
|
|
|
/*
 * Move @num_bytes of reservation from META_PREALLOC to META_PERTRANS for the
 * qgroup of @ref_root and for every qgroup reachable from it through the
 * ->groups lists.
 *
 * The walk runs under fs_info->qgroup_lock and uses fs_info->qgroup_ulist as
 * the work list. If ulist_add() fails the walk stops early; no error is
 * reported to the caller.
 */
static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
				int num_bytes)
{
	struct btrfs_qgroup *start;
	struct ulist_iterator uiter;
	struct ulist_node *unode;

	if (!num_bytes || !fs_info->quota_root)
		return;

	spin_lock(&fs_info->qgroup_lock);

	start = find_qgroup_rb(fs_info, ref_root);
	if (!start)
		goto unlock;

	/* Seed the work list with the qgroup of @ref_root itself. */
	ulist_reinit(fs_info->qgroup_ulist);
	if (ulist_add(fs_info->qgroup_ulist, start->qgroupid,
		      qgroup_to_aux(start), GFP_ATOMIC) < 0)
		goto unlock;

	ULIST_ITER_INIT(&uiter);
	for (unode = ulist_next(fs_info->qgroup_ulist, &uiter); unode;
	     unode = ulist_next(fs_info->qgroup_ulist, &uiter)) {
		struct btrfs_qgroup *cur = unode_aux_to_qgroup(unode);
		struct btrfs_qgroup_list *link;

		/* Convert: drop from PREALLOC, then account as PERTRANS. */
		qgroup_rsv_release(fs_info, cur, num_bytes,
				   BTRFS_QGROUP_RSV_META_PREALLOC);
		qgroup_rsv_add(fs_info, cur, num_bytes,
			       BTRFS_QGROUP_RSV_META_PERTRANS);

		/* Queue every group this qgroup is linked to. */
		list_for_each_entry(link, &cur->groups, next_group) {
			if (ulist_add(fs_info->qgroup_ulist,
				      link->group->qgroupid,
				      qgroup_to_aux(link->group),
				      GFP_ATOMIC) < 0)
				goto unlock;
		}
	}

unlock:
	spin_unlock(&fs_info->qgroup_lock);
}
|
|
|
|
|
|
|
|
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
|
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = root->fs_info;
|
|
|
|
|
|
|
|
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
|
|
|
|
!is_fstree(root->objectid))
|
|
|
|
return;
|
2017-12-12 08:34:34 +01:00
|
|
|
/* Same as btrfs_qgroup_free_meta_prealloc() */
|
|
|
|
num_bytes = sub_root_meta_rsv(root, num_bytes,
|
|
|
|
BTRFS_QGROUP_RSV_META_PREALLOC);
|
2017-12-12 08:34:35 +01:00
|
|
|
trace_qgroup_meta_convert(root, num_bytes);
|
2017-12-12 08:34:31 +01:00
|
|
|
qgroup_convert_meta(fs_info, root->objectid, num_bytes);
|
|
|
|
}
|
|
|
|
|
2015-10-13 03:53:10 +02:00
|
|
|
/*
|
2016-05-20 03:18:45 +02:00
|
|
|
* Check qgroup reserved space leaking, normally at destroy inode
|
2015-10-13 03:53:10 +02:00
|
|
|
* time
|
|
|
|
*/
|
|
|
|
void btrfs_qgroup_check_reserved_leak(struct inode *inode)
|
|
|
|
{
|
|
|
|
struct extent_changeset changeset;
|
|
|
|
struct ulist_node *unode;
|
|
|
|
struct ulist_iterator iter;
|
|
|
|
int ret;
|
|
|
|
|
2017-02-27 08:10:38 +01:00
|
|
|
extent_changeset_init(&changeset);
|
2015-10-13 03:53:10 +02:00
|
|
|
ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
|
2016-04-26 23:54:39 +02:00
|
|
|
EXTENT_QGROUP_RESERVED, &changeset);
|
2015-10-13 03:53:10 +02:00
|
|
|
|
|
|
|
WARN_ON(ret < 0);
|
|
|
|
if (WARN_ON(changeset.bytes_changed)) {
|
|
|
|
ULIST_ITER_INIT(&iter);
|
2017-02-13 13:42:29 +01:00
|
|
|
while ((unode = ulist_next(&changeset.range_changed, &iter))) {
|
2015-10-13 03:53:10 +02:00
|
|
|
btrfs_warn(BTRFS_I(inode)->root->fs_info,
|
|
|
|
"leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu",
|
|
|
|
inode->i_ino, unode->val, unode->aux);
|
|
|
|
}
|
2017-02-13 14:24:35 +01:00
|
|
|
btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
|
|
|
|
BTRFS_I(inode)->root->objectid,
|
2017-12-12 08:34:23 +01:00
|
|
|
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
|
2017-02-13 14:24:35 +01:00
|
|
|
|
2015-10-13 03:53:10 +02:00
|
|
|
}
|
2017-02-27 08:10:38 +01:00
|
|
|
extent_changeset_release(&changeset);
|
2015-10-13 03:53:10 +02:00
|
|
|
}
|