
Commit 1dd44c0

Baolin Wang authored and akpm00 committed
mm: shmem: skip swapcache for swapin of synchronous swap device
With fast swap devices (such as zram), swapin latency is crucial to
applications.  For shmem swapin, similar to anonymous memory swapin, we
can skip the swapcache operation to improve swapin latency.  Testing 1G
shmem sequential swapin without THP enabled, I observed approximately a
6% performance improvement (each test was repeated 5 times and the mean
taken):

	w/o patch	w/ patch	changes
	534.8ms		501ms		+6.3%

In addition, we currently always split the large swap entry stored in
the shmem mapping during shmem large-folio swapin, which is not ideal,
especially with a fast swap device.  If the swap device is synchronous,
we should swap in the whole large folio instead of splitting these
precious large folios, to take advantage of large folios and improve
swapin latency, similar to anonymous memory mTHP swapin.  Testing 1G
shmem sequential swapin with 64K mTHP and 2M mTHP, I observed an obvious
performance improvement:

mTHP=64K
	w/o patch	w/ patch	changes
	550.4ms		169.6ms		+69%

mTHP=2M
	w/o patch	w/ patch	changes
	542.8ms		126.8ms		+77%

Note that skipping the swapcache requires attention to concurrent swapin
scenarios.  Fortunately, swapcache_prepare() and
shmem_add_to_page_cache() can identify concurrent swapin and large swap
entry split scenarios, and return -EEXIST for retry.

[[email protected]: use IS_ENABLED(), tweak comment grammar]
Link: https://lkml.kernel.org/r/3d9f3bd3bc6ec953054baff5134f66feeaae7c1e.1736301701.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang <[email protected]>
Cc: David Hildenbrand <[email protected]>
Cc: "Huang, Ying" <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: Kairui Song <[email protected]>
Cc: Kefeng Wang <[email protected]>
Cc: Matthew Wilcox (Oracle) <[email protected]>
Cc: Ryan Roberts <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
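For context, a minimal userspace sketch of the kind of 1G sequential-swapin
measurement quoted above might look like the following.  This is not part of
the commit: the file path, the MADV_PAGEOUT approach to pushing shmem folios
out to swap, and the 4K stride are illustrative assumptions, and it presumes
a synchronous swap device such as zram is already configured.

/*
 * Hypothetical benchmark sketch: page a 1G tmpfs file out to swap,
 * then fault it back in sequentially and time the swapin.  Error
 * handling is trimmed for brevity.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <unistd.h>

#define SIZE	(1UL << 30)	/* 1G, matching the changelog's test */

int main(void)
{
	struct timeval t0, t1;
	volatile char sink = 0;
	size_t i;
	char *p;
	int fd;

	fd = open("/dev/shm/swapin-test", O_RDWR | O_CREAT, 0600);
	if (fd < 0 || ftruncate(fd, SIZE) < 0)
		exit(1);

	p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		exit(1);

	memset(p, 1, SIZE);			/* populate the shmem folios */
	if (madvise(p, SIZE, MADV_PAGEOUT))	/* push them out to swap */
		perror("madvise");

	gettimeofday(&t0, NULL);
	for (i = 0; i < SIZE; i += 4096)	/* sequential swapin, 4K stride */
		sink += p[i];
	gettimeofday(&t1, NULL);

	printf("sequential swapin: %ld us\n",
	       (t1.tv_sec - t0.tv_sec) * 1000000L +
	       (t1.tv_usec - t0.tv_usec));

	munmap(p, SIZE);
	close(fd);
	unlink("/dev/shm/swapin-test");
	return 0;
}

Run with zram as the only active swap device to hit the SWP_SYNCHRONOUS_IO
path; enabling a shmem mTHP size (for example via
hugepages-64kB/shmem_enabled under /sys/kernel/mm/transparent_hugepage/)
would exercise the large-folio swapin path as well.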
1 parent b2aad24 · commit 1dd44c0

File tree

1 file changed: mm/shmem.c (+105, -5 lines)

mm/shmem.c (105 additions, 5 deletions)
@@ -1967,6 +1967,65 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
 	return ERR_PTR(error);
 }
 
+static struct folio *shmem_swap_alloc_folio(struct inode *inode,
+		struct vm_area_struct *vma, pgoff_t index,
+		swp_entry_t entry, int order, gfp_t gfp)
+{
+	struct shmem_inode_info *info = SHMEM_I(inode);
+	struct folio *new;
+	void *shadow;
+	int nr_pages;
+
+	/*
+	 * We have arrived here because our zones are constrained, so don't
+	 * limit chance of success with further cpuset and node constraints.
+	 */
+	gfp &= ~GFP_CONSTRAINT_MASK;
+	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && order > 0) {
+		gfp_t huge_gfp = vma_thp_gfp_mask(vma);
+
+		gfp = limit_gfp_mask(huge_gfp, gfp);
+	}
+
+	new = shmem_alloc_folio(gfp, order, info, index);
+	if (!new)
+		return ERR_PTR(-ENOMEM);
+
+	nr_pages = folio_nr_pages(new);
+	if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
+					   gfp, entry)) {
+		folio_put(new);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/*
+	 * Prevent parallel swapin from proceeding with the swap cache flag.
+	 *
+	 * Of course there is another possible concurrent scenario as well,
+	 * that is to say, the swap cache flag of a large folio has already
+	 * been set by swapcache_prepare(), while another thread may have
+	 * already split the large swap entry stored in the shmem mapping.
+	 * In this case, shmem_add_to_page_cache() will help identify the
+	 * concurrent swapin and return -EEXIST.
+	 */
+	if (swapcache_prepare(entry, nr_pages)) {
+		folio_put(new);
+		return ERR_PTR(-EEXIST);
+	}
+
+	__folio_set_locked(new);
+	__folio_set_swapbacked(new);
+	new->swap = entry;
+
+	mem_cgroup_swapin_uncharge_swap(entry, nr_pages);
+	shadow = get_shadow_from_swap_cache(entry);
+	if (shadow)
+		workingset_refault(new, shadow);
+	folio_add_lru(new);
+	swap_read_folio(new, NULL);
+	return new;
+}
+
 /*
  * When a page is moved from swapcache to shmem filecache (either by the
  * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
@@ -2070,7 +2129,8 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
 }
 
 static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
-					 struct folio *folio, swp_entry_t swap)
+					 struct folio *folio, swp_entry_t swap,
+					 bool skip_swapcache)
 {
 	struct address_space *mapping = inode->i_mapping;
 	swp_entry_t swapin_error;
@@ -2086,7 +2146,8 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
 
 	nr_pages = folio_nr_pages(folio);
 	folio_wait_writeback(folio);
-	delete_from_swap_cache(folio);
+	if (!skip_swapcache)
+		delete_from_swap_cache(folio);
 	/*
 	 * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks
 	 * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
@@ -2190,6 +2251,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	struct swap_info_struct *si;
 	struct folio *folio = NULL;
+	bool skip_swapcache = false;
 	swp_entry_t swap;
 	int error, nr_pages;
 
@@ -2211,6 +2273,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	/* Look it up and read it in.. */
 	folio = swap_cache_get_folio(swap, NULL, 0);
 	if (!folio) {
+		int order = xa_get_order(&mapping->i_pages, index);
+		bool fallback_order0 = false;
 		int split_order;
 
 		/* Or update major stats only when swapin succeeds?? */
@@ -2220,6 +2284,33 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 			count_memcg_event_mm(fault_mm, PGMAJFAULT);
 		}
 
+		/*
+		 * If uffd is active for the vma, we need per-page fault
+		 * fidelity to maintain the uffd semantics, then fallback
+		 * to swapin order-0 folio, as well as for zswap case.
+		 */
+		if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) ||
+				  !zswap_never_enabled()))
+			fallback_order0 = true;
+
+		/* Skip swapcache for synchronous device. */
+		if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
+			folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp);
+			if (!IS_ERR(folio)) {
+				skip_swapcache = true;
+				goto alloced;
+			}
+
+			/*
+			 * Fallback to swapin order-0 folio unless the swap entry
+			 * already exists.
+			 */
+			error = PTR_ERR(folio);
+			folio = NULL;
+			if (error == -EEXIST)
+				goto failed;
+		}
+
 		/*
 		 * Now swap device can only swap in order 0 folio, then we
 		 * should split the large swap entry stored in the pagecache
@@ -2250,9 +2341,10 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 		}
 	}
 
+alloced:
 	/* We have to do this with folio locked to prevent races */
 	folio_lock(folio);
-	if (!folio_test_swapcache(folio) ||
+	if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
 	    folio->swap.val != swap.val ||
 	    !shmem_confirm_swap(mapping, index, swap)) {
 		error = -EEXIST;
@@ -2288,7 +2380,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	if (sgp == SGP_WRITE)
 		folio_mark_accessed(folio);
 
-	delete_from_swap_cache(folio);
+	if (skip_swapcache) {
+		folio->swap.val = 0;
+		swapcache_clear(si, swap, nr_pages);
+	} else {
+		delete_from_swap_cache(folio);
+	}
 	folio_mark_dirty(folio);
 	swap_free_nr(swap, nr_pages);
 	put_swap_device(si);
@@ -2299,8 +2396,11 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	if (!shmem_confirm_swap(mapping, index, swap))
 		error = -EEXIST;
 	if (error == -EIO)
-		shmem_set_folio_swapin_error(inode, index, folio, swap);
+		shmem_set_folio_swapin_error(inode, index, folio, swap,
+					     skip_swapcache);
 unlock:
+	if (skip_swapcache)
+		swapcache_clear(si, swap, folio_nr_pages(folio));
 	if (folio) {
 		folio_unlock(folio);
 		folio_put(folio);
