
Commit 20cb1c2

Ming Lei authored and axboe committed
blk-cgroup: Flush stats before releasing blkcg_gq
As noted by Michal, the blkg_iostat_set's in the lockless list hold reference to blkg's to protect against their removal. Those blkg's hold reference to blkcg. When a cgroup is being destroyed, cgroup_rstat_flush() is only called at css_release_work_fn() which is called when the blkcg reference count reaches 0. This circular dependency will prevent blkcg and some blkgs from being freed after they are made offline.

It is less a problem if the cgroup to be destroyed also has other controllers like memory that will call cgroup_rstat_flush() which will clean up the reference count. If block is the only controller that uses rstat, these offline blkcg and blkgs may never be freed, leaking more and more memory over time.

To prevent this potential memory leak:

- flush blkcg per-cpu stats list in __blkg_release(), when no new stat can be added

- add global blkg_stat_lock for covering concurrent parent blkg stat update

- don't grab bio->bi_blkg reference when adding the stats into blkcg's per-cpu stat list since all stats are guaranteed to be consumed before releasing blkg instance, and grabbing blkg reference for stats was the most fragile part of original patch

Based on Waiman's patch:
https://lore.kernel.org/linux-block/20221215033132.230023-3-longman@redhat.com/

Fixes: 3b8cc62 ("blk-cgroup: Optimize blkcg_rstat_flush()")
Cc: stable@vger.kernel.org
Reported-by: Jay Shin <jaeshin@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: mkoutny@suse.com
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20230609234249.1412858-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
1 parent ccc45cb commit 20cb1c2

File tree

1 file changed: +31 -9 lines changed


block/blk-cgroup.c

Lines changed: 31 additions & 9 deletions
@@ -34,6 +34,8 @@
 #include "blk-ioprio.h"
 #include "blk-throttle.h"
 
+static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);
+
 /*
  * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
  * blkcg_pol_register_mutex nests outside of it and synchronizes entire
@@ -56,6 +58,8 @@ static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */
 
 bool blkcg_debug_stats = false;
 
+static DEFINE_RAW_SPINLOCK(blkg_stat_lock);
+
 #define BLKG_DESTROY_BATCH_SIZE  64
 
 /*
@@ -163,10 +167,20 @@ static void blkg_free(struct blkcg_gq *blkg)
 static void __blkg_release(struct rcu_head *rcu)
 {
 	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
+	struct blkcg *blkcg = blkg->blkcg;
+	int cpu;
 
 #ifdef CONFIG_BLK_CGROUP_PUNT_BIO
 	WARN_ON(!bio_list_empty(&blkg->async_bios));
 #endif
+	/*
+	 * Flush all the non-empty percpu lockless lists before releasing
+	 * us, given these stat belongs to us.
+	 *
+	 * blkg_stat_lock is for serializing blkg stat update
+	 */
+	for_each_possible_cpu(cpu)
+		__blkcg_rstat_flush(blkcg, cpu);
 
 	/* release the blkcg and parent blkg refs this blkg has been holding */
 	css_put(&blkg->blkcg->css);
@@ -951,23 +965,26 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
 	u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
 }
 
-static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
 {
-	struct blkcg *blkcg = css_to_blkcg(css);
 	struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
 	struct llist_node *lnode;
 	struct blkg_iostat_set *bisc, *next_bisc;
 
-	/* Root-level stats are sourced from system-wide IO stats */
-	if (!cgroup_parent(css->cgroup))
-		return;
-
 	rcu_read_lock();
 
 	lnode = llist_del_all(lhead);
 	if (!lnode)
 		goto out;
 
+	/*
+	 * For covering concurrent parent blkg update from blkg_release().
+	 *
+	 * When flushing from cgroup, cgroup_rstat_lock is always held, so
+	 * this lock won't cause contention most of time.
+	 */
+	raw_spin_lock(&blkg_stat_lock);
+
 	/*
 	 * Iterate only the iostat_cpu's queued in the lockless list.
 	 */
@@ -991,13 +1008,19 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 		if (parent && parent->parent)
 			blkcg_iostat_update(parent, &blkg->iostat.cur,
 					    &blkg->iostat.last);
-		percpu_ref_put(&blkg->refcnt);
 	}
-
+	raw_spin_unlock(&blkg_stat_lock);
 out:
 	rcu_read_unlock();
 }
 
+static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+	/* Root-level stats are sourced from system-wide IO stats */
+	if (cgroup_parent(css->cgroup))
+		__blkcg_rstat_flush(css_to_blkcg(css), cpu);
+}
+
 /*
  * We source root cgroup stats from the system-wide stats to avoid
  * tracking the same information twice and incurring overhead when no
@@ -2075,7 +2098,6 @@ void blk_cgroup_bio_start(struct bio *bio)
 
 		llist_add(&bis->lnode, lhead);
 		WRITE_ONCE(bis->lqueued, true);
-		percpu_ref_get(&bis->blkg->refcnt);
 	}
 
 	u64_stats_update_end_irqrestore(&bis->sync, flags);
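
The pattern the commit message relies on — consume every queued per-CPU stat entry before the owning object can be freed, instead of having each entry pin the owner with a reference, and take one lock around the shared aggregation — can be sketched outside the kernel. Below is a minimal userspace C model of that idea, not kernel code: the names owner, stat_node, stat_lock and NR_CPUS are made up for illustration, standing in roughly for blkcg/blkg, blkg_stat_lock and the per-CPU llist that llist_add()/llist_del_all() operate on.

/*
 * Minimal userspace sketch (not kernel code) of the pattern applied by
 * this commit: every per-CPU lockless stat list is drained before the
 * owning object is freed, so queued entries never need to pin the owner,
 * and a single lock serializes the shared aggregation step.  All names
 * here (owner, stat_node, stat_lock, NR_CPUS, ...) are illustrative.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4

struct stat_node {
	long bytes;
	struct stat_node *next;
};

struct owner {
	/* one lockless (xchg-based LIFO) list head per "CPU" */
	_Atomic(struct stat_node *) pending[NR_CPUS];
	long total;			/* aggregated under stat_lock */
};

static pthread_mutex_t stat_lock = PTHREAD_MUTEX_INITIALIZER;

/* producer side: queue a stat entry on the owner's per-CPU list */
static void stat_add(struct owner *o, int cpu, long bytes)
{
	struct stat_node *n = malloc(sizeof(*n));

	n->bytes = bytes;
	/* lockless push, in the spirit of llist_add() */
	n->next = atomic_load(&o->pending[cpu]);
	while (!atomic_compare_exchange_weak(&o->pending[cpu], &n->next, n))
		;
}

/* flush one per-CPU list; the lock covers the shared aggregation */
static void flush_cpu(struct owner *o, int cpu)
{
	/* take the whole list in one shot, in the spirit of llist_del_all() */
	struct stat_node *n = atomic_exchange(&o->pending[cpu], NULL);

	pthread_mutex_lock(&stat_lock);
	while (n) {
		struct stat_node *next = n->next;

		o->total += n->bytes;
		free(n);
		n = next;
	}
	pthread_mutex_unlock(&stat_lock);
}

/* release side: drain every CPU before the owner goes away */
static void owner_release(struct owner *o)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		flush_cpu(o, cpu);
	printf("total = %ld\n", o->total);
	free(o);
}

int main(void)
{
	struct owner *o = malloc(sizeof(*o));

	o->total = 0;
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		atomic_init(&o->pending[cpu], NULL);

	stat_add(o, 0, 4096);
	stat_add(o, 1, 512);
	owner_release(o);	/* prints "total = 4608" */
	return 0;
}

Built with cc -std=c11 -pthread, the sketch prints total = 4608. The relevant property is that owner_release() drains every per-CPU list itself, which is why the real patch can drop the percpu_ref_get()/percpu_ref_put() pair on the blkg and break the circular dependency described in the commit message.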
