
Commit 45a2966

jankara authored and torvalds committed
writeback: fix bandwidth estimate for spiky workload
Michael Stapelberg has reported that for workload with short big spikes of writes (GCC linker seem to trigger this frequently) the write throughput is heavily underestimated and tends to steadily sink until it reaches zero. This has rather bad impact on writeback throttling (causing stalls). The problem is that writeback throughput estimate gets updated at most once per 200 ms. One update happens early after we submit pages for writeback (at that point writeout of only small fraction of pages is completed and thus observed throughput is tiny). Next update happens only during the next write spike (updates happen only from inode writeback and dirty throttling code) and if that is more than 1s after previous spike, we decide system was idle and just ignore whatever was written until this moment.

Fix the problem by making sure writeback throughput estimate is also updated shortly after writeback completes to get reasonable estimate of throughput for spiky workloads.

[[email protected]: avoid division by 0 in wb_update_dirty_ratelimit()]

Link: https://lore.kernel.org/lkml/[email protected]
Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Jan Kara <[email protected]>
Reported-by: Michael Stapelberg <[email protected]>
Tested-by: Michael Stapelberg <[email protected]>
Cc: Wu Fengguang <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent fee468f commit 45a2966
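
The underestimation described in the commit message comes down to simple arithmetic: if the only throughput sample is taken roughly 200 ms after a spike is submitted, when only a small fraction of the pages has actually reached the disk, the observed bandwidth is a fraction of the real one, and without a later update it never recovers. The following stand-alone C sketch (not part of the commit; the numbers are invented for illustration) shows the difference a post-completion sample makes:

/*
 * Stand-alone illustration (not kernel code) of the underestimate described
 * in the commit message. All numbers are invented for illustration.
 */
#include <stdio.h>

int main(void)
{
        unsigned long spike_pages    = 100000; /* pages submitted in one spike */
        unsigned long sample_ms      = 200;    /* first estimate ~200 ms in */
        unsigned long done_at_sample = 5000;   /* pages completed by then */
        unsigned long spike_done_ms  = 1000;   /* whole spike finished after 1 s */

        /* Estimate taken shortly after submission: heavily skewed downwards. */
        printf("early estimate : %lu pages/s\n",
               done_at_sample * 1000 / sample_ms);

        /* Estimate taken again once writeback completes (what the fix adds). */
        printf("post-completion: %lu pages/s\n",
               spike_pages * 1000 / spike_done_ms);
        return 0;
}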

File tree

4 files changed: +37 −14 lines changed


include/linux/backing-dev-defs.h

Lines changed: 1 addition & 0 deletions
@@ -143,6 +143,7 @@ struct bdi_writeback {
 	spinlock_t work_lock;		/* protects work_list & dwork scheduling */
 	struct list_head work_list;
 	struct delayed_work dwork;	/* work item used for writeback */
+	struct delayed_work bw_dwork;	/* work item used for bandwidth estimate */
 
 	unsigned long dirty_sleep;	/* last wait */
 
include/linux/writeback.h

Lines changed: 1 addition & 0 deletions
@@ -379,6 +379,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
 unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
 
+void wb_update_bandwidth(struct bdi_writeback *wb);
 void balance_dirty_pages_ratelimited(struct address_space *mapping);
 bool wb_over_bg_thresh(struct bdi_writeback *wb);
 
mm/backing-dev.c

Lines changed: 10 additions & 0 deletions
@@ -271,6 +271,14 @@ void wb_wakeup_delayed(struct bdi_writeback *wb)
 	spin_unlock_bh(&wb->work_lock);
 }
 
+static void wb_update_bandwidth_workfn(struct work_struct *work)
+{
+	struct bdi_writeback *wb = container_of(to_delayed_work(work),
+						struct bdi_writeback, bw_dwork);
+
+	wb_update_bandwidth(wb);
+}
+
 /*
  * Initial write bandwidth: 100 MB/s
  */
@@ -303,6 +311,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
 	spin_lock_init(&wb->work_lock);
 	INIT_LIST_HEAD(&wb->work_list);
 	INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
+	INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
 	wb->dirty_sleep = jiffies;
 
 	err = fprop_local_init_percpu(&wb->completions, gfp);
@@ -351,6 +360,7 @@ static void wb_shutdown(struct bdi_writeback *wb)
 	mod_delayed_work(bdi_wq, &wb->dwork, 0);
 	flush_delayed_work(&wb->dwork);
 	WARN_ON(!list_empty(&wb->work_list));
+	flush_delayed_work(&wb->bw_dwork);
 }
 
 static void wb_exit(struct bdi_writeback *wb)
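
For readers unfamiliar with the pattern in wb_update_bandwidth_workfn() above: the work callback receives only a pointer to the embedded work item and recovers the enclosing bdi_writeback with container_of() (after to_delayed_work() converts the work_struct pointer). A minimal user-space sketch of that recovery, with hypothetical *_demo types standing in for the kernel structures, looks like this:

/*
 * User-space sketch of the container_of() recovery used by
 * wb_update_bandwidth_workfn(). The *_demo types are hypothetical stand-ins
 * for the kernel structures; to_delayed_work() is omitted for brevity.
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct delayed_work_demo { int pending; };

struct bdi_writeback_demo {
        int id;
        struct delayed_work_demo bw_dwork;      /* embedded work item */
};

/* The callback only gets the embedded member... */
static void bw_workfn(struct delayed_work_demo *work)
{
        /* ...and recovers the enclosing structure from it. */
        struct bdi_writeback_demo *wb =
                container_of(work, struct bdi_writeback_demo, bw_dwork);

        printf("bandwidth update for wb %d\n", wb->id);
}

int main(void)
{
        struct bdi_writeback_demo wb = { .id = 42 };

        bw_workfn(&wb.bw_dwork);        /* prints "bandwidth update for wb 42" */
        return 0;
}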

mm/page-writeback.c

Lines changed: 25 additions & 14 deletions
@@ -1336,18 +1336,19 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
 {
 	struct bdi_writeback *wb = gdtc->wb;
 	unsigned long now = jiffies;
-	unsigned long elapsed = now - wb->bw_time_stamp;
+	unsigned long elapsed;
 	unsigned long dirtied;
 	unsigned long written;
 
-	lockdep_assert_held(&wb->list_lock);
+	spin_lock(&wb->list_lock);
 
 	/*
-	 * rate-limit, only update once every 200ms.
+	 * Lockless checks for elapsed time are racy and delayed update after
+	 * IO completion doesn't do it at all (to make sure written pages are
+	 * accounted reasonably quickly). Make sure elapsed >= 1 to avoid
+	 * division errors.
 	 */
-	if (elapsed < BANDWIDTH_INTERVAL)
-		return;
-
+	elapsed = max(now - wb->bw_time_stamp, 1UL);
 	dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
 	written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
 
@@ -1369,15 +1370,14 @@ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
 	wb->dirtied_stamp = dirtied;
 	wb->written_stamp = written;
 	wb->bw_time_stamp = now;
+	spin_unlock(&wb->list_lock);
 }
 
-static void wb_update_bandwidth(struct bdi_writeback *wb)
+void wb_update_bandwidth(struct bdi_writeback *wb)
 {
 	struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
 
-	spin_lock(&wb->list_lock);
 	__wb_update_bandwidth(&gdtc, NULL, false);
-	spin_unlock(&wb->list_lock);
 }
 
 /* Interval after which we consider wb idle and don't estimate bandwidth */
@@ -1722,11 +1722,8 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
 			wb->dirty_exceeded = 1;
 
 		if (time_is_before_jiffies(wb->bw_time_stamp +
-					   BANDWIDTH_INTERVAL)) {
-			spin_lock(&wb->list_lock);
+					   BANDWIDTH_INTERVAL))
 			__wb_update_bandwidth(gdtc, mdtc, true);
-			spin_unlock(&wb->list_lock);
-		}
 
 		/* throttle according to the chosen dtc */
 		dirty_ratelimit = wb->dirty_ratelimit;
@@ -2374,7 +2371,13 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 		cond_resched();
 		congestion_wait(BLK_RW_ASYNC, HZ/50);
 	}
-	wb_update_bandwidth(wb);
+	/*
+	 * Usually few pages are written by now from those we've just submitted
+	 * but if there's constant writeback being submitted, this makes sure
+	 * writeback bandwidth is updated once in a while.
+	 */
+	if (time_is_before_jiffies(wb->bw_time_stamp + BANDWIDTH_INTERVAL))
+		wb_update_bandwidth(wb);
 	return ret;
 }
 
@@ -2754,6 +2757,14 @@ static void wb_inode_writeback_start(struct bdi_writeback *wb)
 static void wb_inode_writeback_end(struct bdi_writeback *wb)
 {
 	atomic_dec(&wb->writeback_inodes);
+	/*
+	 * Make sure estimate of writeback throughput gets updated after
+	 * writeback completed. We delay the update by BANDWIDTH_INTERVAL
+	 * (which is the interval other bandwidth updates use for batching) so
+	 * that if multiple inodes end writeback at a similar time, they get
+	 * batched into one bandwidth update.
+	 */
+	queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);
 }
 
 int test_clear_page_writeback(struct page *page)
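
The comment in wb_inode_writeback_end() relies on queue_delayed_work() semantics: queueing a work item that is already pending is a no-op, so several inodes finishing writeback within one BANDWIDTH_INTERVAL collapse into a single bandwidth update. A rough user-space analogue of that batching behaviour (hypothetical helper names, illustrative only) is:

/*
 * Rough user-space analogue (hypothetical helpers, illustrative only) of how
 * queue_delayed_work() batches the bandwidth update: queueing while the work
 * is already pending does not re-arm it, so inodes that finish writeback
 * close together share one update.
 */
#include <stdbool.h>
#include <stdio.h>

static bool bw_work_pending;
static unsigned long bw_work_expires;

static void queue_bw_update(unsigned long now, unsigned long delay)
{
        if (bw_work_pending)            /* already queued: request is batched */
                return;
        bw_work_pending = true;
        bw_work_expires = now + delay;
}

static void run_pending(unsigned long now)
{
        if (bw_work_pending && now >= bw_work_expires) {
                bw_work_pending = false;
                printf("one bandwidth update at t=%lu\n", now);
        }
}

int main(void)
{
        queue_bw_update(100, 200);      /* first inode ends writeback */
        queue_bw_update(150, 200);      /* second inode: batched, no re-arm */
        run_pending(300);               /* a single update fires at t=300 */
        return 0;
}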
