
Commit 6be3e21

apopple-nvidia authored and akpm00 committed
fs/dax: don't skip locked entries when scanning entries
Several functions internal to FS DAX use the following pattern when trying to obtain an unlocked entry:

    xas_for_each(&xas, entry, end_idx) {
            if (dax_is_locked(entry))
                    entry = get_unlocked_entry(&xas, 0);

This is problematic because get_unlocked_entry() will get the next present entry in the range, and that next entry may not be locked. Therefore any processing of the original locked entry will be skipped. This can cause dax_layout_busy_page_range() to miss DMA-busy pages in the range, leading file systems to free blocks whilst DMA operations are still ongoing, which can result in file system corruption.

Instead, callers from within a xas_for_each() loop should wait for the current entry to be unlocked without advancing the XArray state, so a new function is introduced to do that wait.

Also, while we are here, rename get_unlocked_entry() to get_next_unlocked_entry() to make it clear that it may advance the iterator state.

Link: https://lkml.kernel.org/r/b11b2baed7157dc900bf07a4571bf71b7cd82d97.1740713401.git-series.apopple@nvidia.com
Signed-off-by: Alistair Popple <[email protected]>
Reviewed-by: Dan Williams <[email protected]>
Tested-by: Alison Schofield <[email protected]>
Cc: Alexander Gordeev <[email protected]>
Cc: Asahi Lina <[email protected]>
Cc: Balbir Singh <[email protected]>
Cc: Bjorn Helgaas <[email protected]>
Cc: Catalin Marinas <[email protected]>
Cc: Christian Borntraeger <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Chunyan Zhang <[email protected]>
Cc: "Darrick J. Wong" <[email protected]>
Cc: Dave Chinner <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Dave Jiang <[email protected]>
Cc: David Hildenbrand <[email protected]>
Cc: Gerald Schaefer <[email protected]>
Cc: Heiko Carstens <[email protected]>
Cc: Huacai Chen <[email protected]>
Cc: Ira Weiny <[email protected]>
Cc: Jan Kara <[email protected]>
Cc: Jason Gunthorpe <[email protected]>
Cc: Jason Gunthorpe <[email protected]>
Cc: John Hubbard <[email protected]>
Cc: linmiaohe <[email protected]>
Cc: Logan Gunthorpe <[email protected]>
Cc: Matthew Wilcow (Oracle) <[email protected]>
Cc: Michael "Camp Drill Sergeant" Ellerman <[email protected]>
Cc: Nicholas Piggin <[email protected]>
Cc: Peter Xu <[email protected]>
Cc: Sven Schnelle <[email protected]>
Cc: Ted Ts'o <[email protected]>
Cc: Vasily Gorbik <[email protected]>
Cc: Vishal Verma <[email protected]>
Cc: Vivek Goyal <[email protected]>
Cc: WANG Xuerui <[email protected]>
Cc: Will Deacon <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
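To make the failure mode concrete, below is a small standalone userspace C sketch (not kernel code, and not part of this commit) that models the two scanning strategies. The array, the dma_busy flag, and the function names scan_skipping_locked()/scan_waiting_on_current() are invented for illustration; they only loosely mirror the real XArray helpers, which sleep on a wait queue and re-take the i_pages lock rather than simply clearing a flag.

/*
 * Standalone userspace mock -- NOT kernel code. Models why "skip ahead to
 * the next unlocked entry" (old pattern) loses a locked, DMA-busy entry,
 * while "wait for this entry to unlock" (new pattern) still sees it.
 */
#include <stdbool.h>
#include <stdio.h>

#define NENTRIES 4

struct mock_entry {
        bool locked;    /* stands in for dax_is_locked() */
        bool dma_busy;  /* stands in for a page still pinned for DMA */
};

static struct mock_entry entries[NENTRIES] = {
        { .locked = false, .dma_busy = false },
        { .locked = true,  .dma_busy = true  },  /* locked *and* DMA-busy */
        { .locked = false, .dma_busy = false },
        { .locked = false, .dma_busy = false },
};

/* Old pattern: on a locked entry, advance to the next unlocked one. */
static int scan_skipping_locked(void)
{
        int busy = 0;

        for (int i = 0; i < NENTRIES; i++) {
                int j = i;

                /* models get_unlocked_entry(): moves the iterator forward */
                while (j < NENTRIES && entries[j].locked)
                        j++;
                if (j < NENTRIES && entries[j].dma_busy)
                        busy++;
                i = j;          /* iterator state has already advanced */
        }
        return busy;
}

/* New pattern: wait for the *current* entry to unlock, then inspect it. */
static int scan_waiting_on_current(void)
{
        int busy = 0;

        for (int i = 0; i < NENTRIES; i++) {
                /*
                 * models wait_entry_unlocked_exclusive(): in the kernel this
                 * sleeps until the lock holder wakes us; here the "wait"
                 * simply completes and the same index is re-inspected.
                 */
                entries[i].locked = false;
                if (entries[i].dma_busy)
                        busy++;
        }
        return busy;
}

int main(void)
{
        printf("old pattern sees %d DMA-busy entries\n", scan_skipping_locked());
        printf("new pattern sees %d DMA-busy entries\n", scan_waiting_on_current());
        return 0;
}

With the skip-ahead pattern the locked entry at index 1 is never inspected, which is how dax_layout_busy_page_range() could miss a DMA-busy page. Waiting on the current entry, as the new wait_entry_unlocked_exclusive() does (using xas_pause() so the iteration can safely resume at the same index after sleeping), keeps that entry in the scan.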
1 parent cee91fa commit 6be3e21

File tree: 1 file changed, +41 -9 lines changed


fs/dax.c

Lines changed: 41 additions & 9 deletions
@@ -206,7 +206,7 @@ static void dax_wake_entry(struct xa_state *xas, void *entry,
  *
  * Must be called with the i_pages lock held.
  */
-static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
+static void *get_next_unlocked_entry(struct xa_state *xas, unsigned int order)
 {
         void *entry;
         struct wait_exceptional_entry_queue ewait;
@@ -235,6 +235,37 @@ static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
         }
 }

+/*
+ * Wait for the given entry to become unlocked. Caller must hold the i_pages
+ * lock and call either put_unlocked_entry() if it did not lock the entry or
+ * dax_unlock_entry() if it did. Returns an unlocked entry if still present.
+ */
+static void *wait_entry_unlocked_exclusive(struct xa_state *xas, void *entry)
+{
+        struct wait_exceptional_entry_queue ewait;
+        wait_queue_head_t *wq;
+
+        init_wait(&ewait.wait);
+        ewait.wait.func = wake_exceptional_entry_func;
+
+        while (unlikely(dax_is_locked(entry))) {
+                wq = dax_entry_waitqueue(xas, entry, &ewait.key);
+                prepare_to_wait_exclusive(wq, &ewait.wait,
+                                TASK_UNINTERRUPTIBLE);
+                xas_pause(xas);
+                xas_unlock_irq(xas);
+                schedule();
+                finish_wait(wq, &ewait.wait);
+                xas_lock_irq(xas);
+                entry = xas_load(xas);
+        }
+
+        if (xa_is_internal(entry))
+                return NULL;
+
+        return entry;
+}
+
 /*
  * The only thing keeping the address space around is the i_pages lock
  * (it's cycled in clear_inode() after removing the entries from i_pages)
@@ -250,7 +281,7 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry)

         wq = dax_entry_waitqueue(xas, entry, &ewait.key);
         /*
-         * Unlike get_unlocked_entry() there is no guarantee that this
+         * Unlike get_next_unlocked_entry() there is no guarantee that this
          * path ever successfully retrieves an unlocked entry before an
          * inode dies. Perform a non-exclusive wait in case this path
          * never successfully performs its own wake up.
@@ -581,7 +612,7 @@ static void *grab_mapping_entry(struct xa_state *xas,
 retry:
         pmd_downgrade = false;
         xas_lock_irq(xas);
-        entry = get_unlocked_entry(xas, order);
+        entry = get_next_unlocked_entry(xas, order);

         if (entry) {
                 if (dax_is_conflict(entry))
@@ -717,8 +748,7 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping,
         xas_for_each(&xas, entry, end_idx) {
                 if (WARN_ON_ONCE(!xa_is_value(entry)))
                         continue;
-                if (unlikely(dax_is_locked(entry)))
-                        entry = get_unlocked_entry(&xas, 0);
+                entry = wait_entry_unlocked_exclusive(&xas, entry);
                 if (entry)
                         page = dax_busy_page(entry);
                 put_unlocked_entry(&xas, entry, WAKE_NEXT);
@@ -751,7 +781,7 @@ static int __dax_invalidate_entry(struct address_space *mapping,
         void *entry;

         xas_lock_irq(&xas);
-        entry = get_unlocked_entry(&xas, 0);
+        entry = get_next_unlocked_entry(&xas, 0);
         if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
                 goto out;
         if (!trunc &&
@@ -777,7 +807,9 @@ static int __dax_clear_dirty_range(struct address_space *mapping,

         xas_lock_irq(&xas);
         xas_for_each(&xas, entry, end) {
-                entry = get_unlocked_entry(&xas, 0);
+                entry = wait_entry_unlocked_exclusive(&xas, entry);
+                if (!entry)
+                        continue;
                 xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
                 xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
                 put_unlocked_entry(&xas, entry, WAKE_NEXT);
@@ -941,7 +973,7 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
         if (unlikely(dax_is_locked(entry))) {
                 void *old_entry = entry;

-                entry = get_unlocked_entry(xas, 0);
+                entry = get_next_unlocked_entry(xas, 0);

                 /* Entry got punched out / reallocated? */
                 if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
@@ -1950,7 +1982,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
         vm_fault_t ret;

         xas_lock_irq(&xas);
-        entry = get_unlocked_entry(&xas, order);
+        entry = get_next_unlocked_entry(&xas, order);
         /* Did we race with someone splitting entry or so? */
         if (!entry || dax_is_conflict(entry) ||
             (order == 0 && !dax_is_pte_entry(entry))) {
