Skip to content

Commit b817525

Browse files
htejunaxboe
authored andcommitted
writeback: bdi_writeback iteration must not skip dying ones
bdi_for_each_wb() is used in several places to wake up or issue writeback work items to all wb's (bdi_writeback's) on a given bdi. The iteration is performed by walking bdi->cgwb_tree; however, the tree only indexes wb's which are currently active. For example, when a memcg gets associated with a different blkcg, the old wb is removed from the tree so that the new one can be indexed. The old wb starts dying from then on but will linger till all its inodes are drained. As these dying wb's may still host dirty inodes, writeback operations which affect all wb's must include them. bdi_for_each_wb() skipping dying wb's led to sync(2) missing and failing to sync the inodes belonging to those wb's. This patch adds a RCU protected @bdi->wb_list which lists all wb's beloinging to that bdi. wb's are added on creation and removed on release rather than on the start of destruction. bdi_for_each_wb() usages are replaced with list_for_each[_continue]_rcu() iterations over @bdi->wb_list and bdi_for_each_wb() and its helpers are removed. v2: Updated as per Jan. last_wb ref leak in bdi_split_work_to_wbs() fixed and unnecessary list head severing in cgwb_bdi_destroy() removed. Signed-off-by: Tejun Heo <[email protected]> Reported-and-tested-by: Artem Bityutskiy <[email protected]> Fixes: ebe41ab ("writeback: implement bdi_for_each_wb()") Link: http://lkml.kernel.org/g/[email protected] Cc: Jan Kara <[email protected]> Signed-off-by: Jens Axboe <[email protected]>
1 parent 6fdf860 commit b817525

File tree

5 files changed

+39
-75
lines changed

5 files changed

+39
-75
lines changed

fs/fs-writeback.c

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -778,19 +778,24 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
778778
struct wb_writeback_work *base_work,
779779
bool skip_if_busy)
780780
{
781-
int next_memcg_id = 0;
782-
struct bdi_writeback *wb;
783-
struct wb_iter iter;
781+
struct bdi_writeback *last_wb = NULL;
782+
struct bdi_writeback *wb = list_entry_rcu(&bdi->wb_list,
783+
struct bdi_writeback, bdi_node);
784784

785785
might_sleep();
786786
restart:
787787
rcu_read_lock();
788-
bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) {
788+
list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
789789
DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
790790
struct wb_writeback_work fallback_work;
791791
struct wb_writeback_work *work;
792792
long nr_pages;
793793

794+
if (last_wb) {
795+
wb_put(last_wb);
796+
last_wb = NULL;
797+
}
798+
794799
/* SYNC_ALL writes out I_DIRTY_TIME too */
795800
if (!wb_has_dirty_io(wb) &&
796801
(base_work->sync_mode == WB_SYNC_NONE ||
@@ -819,12 +824,22 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
819824

820825
wb_queue_work(wb, work);
821826

822-
next_memcg_id = wb->memcg_css->id + 1;
827+
/*
828+
* Pin @wb so that it stays on @bdi->wb_list. This allows
829+
* continuing iteration from @wb after dropping and
830+
* regrabbing rcu read lock.
831+
*/
832+
wb_get(wb);
833+
last_wb = wb;
834+
823835
rcu_read_unlock();
824836
wb_wait_for_completion(bdi, &fallback_work_done);
825837
goto restart;
826838
}
827839
rcu_read_unlock();
840+
841+
if (last_wb)
842+
wb_put(last_wb);
828843
}
829844

830845
#else /* CONFIG_CGROUP_WRITEBACK */
@@ -1857,12 +1872,11 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
18571872
rcu_read_lock();
18581873
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
18591874
struct bdi_writeback *wb;
1860-
struct wb_iter iter;
18611875

18621876
if (!bdi_has_dirty_io(bdi))
18631877
continue;
18641878

1865-
bdi_for_each_wb(wb, bdi, &iter, 0)
1879+
list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
18661880
wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
18671881
false, reason);
18681882
}
@@ -1894,9 +1908,8 @@ static void wakeup_dirtytime_writeback(struct work_struct *w)
18941908
rcu_read_lock();
18951909
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
18961910
struct bdi_writeback *wb;
1897-
struct wb_iter iter;
18981911

1899-
bdi_for_each_wb(wb, bdi, &iter, 0)
1912+
list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
19001913
if (!list_empty(&wb->b_dirty_time))
19011914
wb_wakeup(wb);
19021915
}

include/linux/backing-dev-defs.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,8 @@ struct bdi_writeback {
116116
struct list_head work_list;
117117
struct delayed_work dwork; /* work item used for writeback */
118118

119+
struct list_head bdi_node; /* anchored at bdi->wb_list */
120+
119121
#ifdef CONFIG_CGROUP_WRITEBACK
120122
struct percpu_ref refcnt; /* used only for !root wb's */
121123
struct fprop_local_percpu memcg_completions;
@@ -150,6 +152,7 @@ struct backing_dev_info {
150152
atomic_long_t tot_write_bandwidth;
151153

152154
struct bdi_writeback wb; /* the root writeback info for this bdi */
155+
struct list_head wb_list; /* list of all wbs */
153156
#ifdef CONFIG_CGROUP_WRITEBACK
154157
struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
155158
struct rb_root cgwb_congested_tree; /* their congested states */

include/linux/backing-dev.h

Lines changed: 0 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -408,61 +408,6 @@ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
408408
rcu_read_unlock();
409409
}
410410

411-
struct wb_iter {
412-
int start_memcg_id;
413-
struct radix_tree_iter tree_iter;
414-
void **slot;
415-
};
416-
417-
static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
418-
struct backing_dev_info *bdi)
419-
{
420-
struct radix_tree_iter *titer = &iter->tree_iter;
421-
422-
WARN_ON_ONCE(!rcu_read_lock_held());
423-
424-
if (iter->start_memcg_id >= 0) {
425-
iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id);
426-
iter->start_memcg_id = -1;
427-
} else {
428-
iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
429-
}
430-
431-
if (!iter->slot)
432-
iter->slot = radix_tree_next_chunk(&bdi->cgwb_tree, titer, 0);
433-
if (iter->slot)
434-
return *iter->slot;
435-
return NULL;
436-
}
437-
438-
static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
439-
struct backing_dev_info *bdi,
440-
int start_memcg_id)
441-
{
442-
iter->start_memcg_id = start_memcg_id;
443-
444-
if (start_memcg_id)
445-
return __wb_iter_next(iter, bdi);
446-
else
447-
return &bdi->wb;
448-
}
449-
450-
/**
451-
* bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order
452-
* @wb_cur: cursor struct bdi_writeback pointer
453-
* @bdi: bdi to walk wb's of
454-
* @iter: pointer to struct wb_iter to be used as iteration buffer
455-
* @start_memcg_id: memcg ID to start iteration from
456-
*
457-
* Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
458-
* memcg ID order starting from @start_memcg_id. @iter is struct wb_iter
459-
* to be used as temp storage during iteration. rcu_read_lock() must be
460-
* held throughout iteration.
461-
*/
462-
#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id) \
463-
for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id); \
464-
(wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
465-
466411
#else /* CONFIG_CGROUP_WRITEBACK */
467412

468413
static inline bool inode_cgwb_enabled(struct inode *inode)
@@ -522,14 +467,6 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg)
522467
{
523468
}
524469

525-
struct wb_iter {
526-
int next_id;
527-
};
528-
529-
#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \
530-
for ((iter)->next_id = (start_blkcg_id); \
531-
({ (wb_cur) = !(iter)->next_id++ ? &(bdi)->wb : NULL; }); )
532-
533470
static inline int inode_congested(struct inode *inode, int cong_bits)
534471
{
535472
return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);

mm/backing-dev.c

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -480,6 +480,10 @@ static void cgwb_release_workfn(struct work_struct *work)
480480
release_work);
481481
struct backing_dev_info *bdi = wb->bdi;
482482

483+
spin_lock_irq(&cgwb_lock);
484+
list_del_rcu(&wb->bdi_node);
485+
spin_unlock_irq(&cgwb_lock);
486+
483487
wb_shutdown(wb);
484488

485489
css_put(wb->memcg_css);
@@ -575,6 +579,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
575579
ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
576580
if (!ret) {
577581
atomic_inc(&bdi->usage_cnt);
582+
list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
578583
list_add(&wb->memcg_node, memcg_cgwb_list);
579584
list_add(&wb->blkcg_node, blkcg_cgwb_list);
580585
css_get(memcg_css);
@@ -764,15 +769,22 @@ static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
764769

765770
int bdi_init(struct backing_dev_info *bdi)
766771
{
772+
int ret;
773+
767774
bdi->dev = NULL;
768775

769776
bdi->min_ratio = 0;
770777
bdi->max_ratio = 100;
771778
bdi->max_prop_frac = FPROP_FRAC_BASE;
772779
INIT_LIST_HEAD(&bdi->bdi_list);
780+
INIT_LIST_HEAD(&bdi->wb_list);
773781
init_waitqueue_head(&bdi->wb_waitq);
774782

775-
return cgwb_bdi_init(bdi);
783+
ret = cgwb_bdi_init(bdi);
784+
785+
list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
786+
787+
return ret;
776788
}
777789
EXPORT_SYMBOL(bdi_init);
778790

mm/page-writeback.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1956,7 +1956,6 @@ void laptop_mode_timer_fn(unsigned long data)
19561956
int nr_pages = global_page_state(NR_FILE_DIRTY) +
19571957
global_page_state(NR_UNSTABLE_NFS);
19581958
struct bdi_writeback *wb;
1959-
struct wb_iter iter;
19601959

19611960
/*
19621961
* We want to write everything out, not just down to the dirty
@@ -1966,7 +1965,7 @@ void laptop_mode_timer_fn(unsigned long data)
19661965
return;
19671966

19681967
rcu_read_lock();
1969-
bdi_for_each_wb(wb, &q->backing_dev_info, &iter, 0)
1968+
list_for_each_entry_rcu(wb, &q->backing_dev_info.wb_list, bdi_node)
19701969
if (wb_has_dirty_io(wb))
19711970
wb_start_writeback(wb, nr_pages, true,
19721971
WB_REASON_LAPTOP_TIMER);

0 commit comments

Comments
 (0)