Skip to content

Commit ab50973

Browse files
author
Paul Dagnelie
committed
Add allocation profile export and zleak utility for import
When attempting to debug performance problems on large systems, one of the major factors that affect performance is free space fragmentation. This heavily affects the allocation process, which is an area of active development in ZFS. Unfortunately, fragmenting a large pool for testing purposes is time consuming; it usually involves filling the pool and then repeatedly overwriting data until the free space becomes fragmented, which can take many hours. And even if the time is available, artificial workloads rarely generate the same fragmentation patterns as the natural workloads they're attempting to mimic. This patch has two parts. First, in zdb, we add the ability to export the full allocation map of the pool. It iterates over each vdev, printing every allocated segment in the ms_allocatable range tree. This can be done while the pool is online, though in that case the allocation map may actually be from several different TXGs as new ones are loaded on demand. The second is a new utility called zleak (and its supporting library and kernel changes). This is a small python program that invokes a new ioctl (via libzfs_core): zfs_ioc_raw_alloc. This ioctl takes in an nvlist of allocations to perform, and then allocates them. It does not currently store those allocations anywhere to make them reversible, and there is no corresponding raw_free ioctl (which would be extremely dangerous); this is an irreversible process, only intended for performance testing. The only way to reclaim the space afterwards is to destroy the pool or roll back to a checkpoint. Signed-off-by: Paul Dagnelie <[email protected]> Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc.
1 parent b21e04e commit ab50973

File tree

23 files changed

+386
-18
lines changed

23 files changed

+386
-18
lines changed

cmd/Makefile.am

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -98,15 +98,17 @@ endif
9898

9999

100100
if USING_PYTHON
101-
bin_SCRIPTS += arc_summary arcstat dbufstat zilstat
102-
CLEANFILES += arc_summary arcstat dbufstat zilstat
103-
dist_noinst_DATA += %D%/arc_summary %D%/arcstat.in %D%/dbufstat.in %D%/zilstat.in
101+
bin_SCRIPTS += arc_summary arcstat dbufstat zilstat zleak
102+
CLEANFILES += arc_summary arcstat dbufstat zilstat zleak
103+
dist_noinst_DATA += %D%/arc_summary %D%/arcstat.in %D%/dbufstat.in %D%/zilstat.in %D%/zleak
104104

105105
$(call SUBST,arcstat,%D%/)
106106
$(call SUBST,dbufstat,%D%/)
107107
$(call SUBST,zilstat,%D%/)
108108
arc_summary: %D%/arc_summary
109109
$(AM_V_at)cp $< $@
110+
zleak: %D%/zleak
111+
$(AM_V_at)cp $< $@
110112
endif
111113

112114

cmd/zdb/zdb.c

Lines changed: 39 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,9 @@ extern uint_t zfs_reconstruct_indirect_combinations_max;
107107
extern uint_t zfs_btree_verify_intensity;
108108

109109
static const char cmdname[] = "zdb";
110-
uint8_t dump_opt[256];
110+
uint8_t dump_opt[512];
111+
112+
#define ALLOCATABLE_OPT 256
111113

112114
typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
113115

@@ -1650,6 +1652,16 @@ dump_metaslab_stats(metaslab_t *msp)
16501652
dump_histogram(rt->rt_histogram, ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0);
16511653
}
16521654

1655+
/*
 * Segment callback that reports the gap preceding each visited segment.
 *
 * 'arg' points at a uint64_t cursor holding the offset where the previous
 * segment ended.  If the segment being visited does not begin exactly at
 * the cursor, the bytes in between are printed as one "ALLOC: <offset>
 * <length>" line.  The cursor is then moved to the end of this segment so
 * the next invocation can detect the following gap.
 */
static void
dump_allocated(void *arg, uint64_t start, uint64_t size)
{
	uint64_t *cursor = arg;
	uint64_t gap_start = *cursor;

	/* A non-empty gap between the cursor and this segment is reported. */
	if (gap_start != start) {
		(void) printf("ALLOC: %"PRIu64" %"PRIu64"\n", gap_start,
		    start - gap_start);
	}

	*cursor = start + size;
}
1664+
16531665
static void
16541666
dump_metaslab(metaslab_t *msp)
16551667
{
@@ -1666,13 +1678,24 @@ dump_metaslab(metaslab_t *msp)
16661678
(u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
16671679
(u_longlong_t)space_map_object(sm), freebuf);
16681680

1669-
if (dump_opt['m'] > 2 && !dump_opt['L']) {
1681+
if (dump_opt[ALLOCATABLE_OPT] ||
1682+
(dump_opt['m'] > 2 && !dump_opt['L'])) {
16701683
mutex_enter(&msp->ms_lock);
16711684
VERIFY0(metaslab_load(msp));
1685+
}
1686+
1687+
if (dump_opt['m'] > 2 && !dump_opt['L']) {
16721688
zfs_range_tree_stat_verify(msp->ms_allocatable);
16731689
dump_metaslab_stats(msp);
1674-
metaslab_unload(msp);
1675-
mutex_exit(&msp->ms_lock);
1690+
}
1691+
1692+
if (dump_opt[ALLOCATABLE_OPT]) {
1693+
uint64_t off = msp->ms_start;
1694+
zfs_range_tree_walk(msp->ms_allocatable, dump_allocated,
1695+
&off);
1696+
if (off != msp->ms_start + msp->ms_size)
1697+
(void) printf("ALLOC: %"PRIu64" %"PRIu64"\n", off,
1698+
msp->ms_size - off);
16761699
}
16771700

16781701
if (dump_opt['m'] > 1 && sm != NULL &&
@@ -1687,6 +1710,12 @@ dump_metaslab(metaslab_t *msp)
16871710
SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
16881711
}
16891712

1713+
if (dump_opt[ALLOCATABLE_OPT] ||
1714+
(dump_opt['m'] > 2 && !dump_opt['L'])) {
1715+
metaslab_unload(msp);
1716+
mutex_exit(&msp->ms_lock);
1717+
}
1718+
16901719
if (vd->vdev_ops == &vdev_draid_ops)
16911720
ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift);
16921721
else
@@ -1723,8 +1752,9 @@ print_vdev_metaslab_header(vdev_t *vd)
17231752
}
17241753
}
17251754

1726-
(void) printf("\tvdev %10llu %s",
1727-
(u_longlong_t)vd->vdev_id, bias_str);
1755+
(void) printf("\tvdev %10llu\t%s metaslab shift %4lld",
1756+
(u_longlong_t)vd->vdev_id, bias_str,
1757+
(u_longlong_t)vd->vdev_ms_shift);
17281758

17291759
if (ms_flush_data_obj != 0) {
17301760
(void) printf(" ms_unflushed_phys object %llu",
@@ -9315,6 +9345,8 @@ main(int argc, char **argv)
93159345
{"all-reconstruction", no_argument, NULL, 'Y'},
93169346
{"livelist", no_argument, NULL, 'y'},
93179347
{"zstd-headers", no_argument, NULL, 'Z'},
9348+
{"allocatable-map", no_argument, NULL,
9349+
ALLOCATABLE_OPT},
93189350
{0, 0, 0, 0}
93199351
};
93209352

@@ -9345,6 +9377,7 @@ main(int argc, char **argv)
93459377
case 'u':
93469378
case 'y':
93479379
case 'Z':
9380+
case ALLOCATABLE_OPT:
93489381
dump_opt[c]++;
93499382
dump_all = 0;
93509383
break;

cmd/zdb/zdb.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,6 @@
2929
#define _ZDB_H
3030

3131
void dump_intent_log(zilog_t *);
32-
extern uint8_t dump_opt[256];
32+
extern uint8_t dump_opt[512];
3333

3434
#endif /* _ZDB_H */

cmd/zdb/zdb_il.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848

4949
#include "zdb.h"
5050

51-
extern uint8_t dump_opt[256];
51+
extern uint8_t dump_opt[512];
5252

5353
static char tab_prefix[4] = "\t\t\t";
5454

cmd/zleak

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
#!/usr/bin/env python3
2+
# SPDX-License-Identifier: CDDL-1.0
3+
4+
#
5+
# This file and its contents are supplied under the terms of the
6+
# Common Development and Distribution License ("CDDL"), version 1.0.
7+
# You may only use this file in accordance with the terms of version
8+
# 1.0 of the CDDL.
9+
#
10+
# A full copy of the text of the CDDL should have accompanied this
11+
# source. A copy of the CDDL is also available via the Internet at
12+
# http://www.illumos.org/license/CDDL.
13+
#
14+
15+
#
16+
# Copyright (c) 2025 by Klara, Inc.
17+
#
18+
19+
import argparse, fileinput, libzfs_core, sys, errno
20+
21+
def perform_raw_alloc(pool, ms_shift, ms_count, vdev_id, allocs, force,
                      verbose):
    """
    Submit one batch of raw allocations for a single vdev.

    :param bytes pool: name of the pool to allocate in
    :param int ms_shift: log2 of the metaslab size; ``1 << ms_shift`` is
        handed to ``lzc_raw_alloc``
    :param int ms_count: metaslab count handed to ``lzc_raw_alloc``
    :param int vdev_id: id of the vdev the allocations belong to
    :param allocs: non-empty list of ``(offset, size)`` pairs
    :param bool force: forwarded to ``lzc_raw_alloc``; E2BIG failures are
        only tolerated when this is set
    :param int verbose: 1 prints a one-line summary, 2 or more prints the
        raw call parameters instead

    Exits the process with status 1 when the library reports EINVAL and with
    status 0 when it reports E2BIG while ``force`` is set.  Any other
    library error is re-raised to the caller.
    """
    # Use the function's own arguments rather than the script's globals so
    # the reported count always matches the batch actually submitted.
    nallocs = len(allocs)
    if verbose == 1:
        print(f"Raw alloc: vdev {vdev_id}, {nallocs} starting with offset "
              f"{allocs[0][0]}")
    if verbose >= 2:
        print(f"Raw alloc: {pool} {ms_shift} {ms_count} {vdev_id} {nallocs}")
    try:
        libzfs_core.lzc_raw_alloc(pool, 1 << ms_shift, ms_count, vdev_id,
                                  allocs, force)
    except libzfs_core.exceptions.ZFSGenericError as e:
        if e.errno == errno.EINVAL:
            print("Invalid map for provided pool")
            sys.exit(1)
        # An explicit check instead of `assert`: assertions vanish under
        # `python -O`, which would turn every unexpected failure into a
        # silent successful exit.
        if not (e.errno == errno.E2BIG and force):
            raise
        # NOTE(review): E2BIG with force is treated as a normal stopping
        # point; presumably the pool could not fit further allocations --
        # confirm against the ioctl implementation.
        sys.exit(0)
37+
38+
# Pending (offset, size) pairs for the vdev currently being read, and how
# many of them there are.
allocs = []
count = 0

parser = argparse.ArgumentParser(
    prog='zleak',
    description='facility to replicate memory fragmentation in ZFS'
)
parser.add_argument('poolname')
parser.add_argument('-v', '--verbose', action='count', default=0)
parser.add_argument('-f', '--force', action='store_true', default=False)
args = parser.parse_args()

pool = args.poolname.encode('utf-8')

# The allocation map is read from stdin.  Only three kinds of lines matter:
# the per-vdev header, the "metaslabs" summary line, and the "ALLOC:" lines;
# everything else is ignored.
for line in fileinput.input('-'):
    dump = False
    line = line.rstrip()
    if not line.startswith(("ALLOC: ", "\tvdev ", "\tmetaslabs ")):
        continue

    tokens = line.split()
    if line.startswith("\tvdev "):
        next_vdev_id = int(tokens[1])
        # The header is "vdev <id> [<bias>] metaslab shift <n> ...": an
        # optional bias word may sit before "metaslab shift", so locate the
        # number relative to the "shift" keyword instead of by position.
        next_ms_shift = int(tokens[tokens.index("shift") + 1])
        next_ms_count = 0
        # A new vdev begins: flush whatever was gathered for the old one.
        dump = True
    elif line.startswith("\tmetaslabs "):
        next_ms_count = int(tokens[1])
    else:
        start = int(tokens[1])
        size = int(tokens[2])
        allocs.append((start, size))
        count = count + 1

    # Flush when the batch is full or the vdev changed.  The flush still
    # uses the previous vdev's parameters; they are advanced just below.
    if count == 1000000 or (dump and count != 0):
        perform_raw_alloc(pool, ms_shift, ms_count, vdev_id, allocs,
                          args.force, args.verbose)
        count = 0
        allocs = []
    vdev_id = next_vdev_id
    ms_shift = next_ms_shift
    ms_count = next_ms_count


# Submit the final, partially filled batch.
if count > 0:
    perform_raw_alloc(pool, ms_shift, ms_count, vdev_id, allocs,
                      args.force, args.verbose)
85+

contrib/debian/openzfs-zfsutils.install

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ usr/sbin/arc_summary
4040
usr/sbin/arcstat
4141
usr/sbin/dbufstat
4242
usr/sbin/zilstat
43+
usr/sbin/zleak
4344
usr/share/zfs/compatibility.d/
4445
usr/share/bash-completion/completions
4546
usr/share/man/man1/arcstat.1

contrib/debian/rules.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ override_dh_auto_install:
8585
mv '$(CURDIR)/debian/tmp/usr/bin/arcstat' '$(CURDIR)/debian/tmp/usr/sbin/arcstat'
8686
mv '$(CURDIR)/debian/tmp/usr/bin/dbufstat' '$(CURDIR)/debian/tmp/usr/sbin/dbufstat'
8787
mv '$(CURDIR)/debian/tmp/usr/bin/zilstat' '$(CURDIR)/debian/tmp/usr/sbin/zilstat'
88+
mv '$(CURDIR)/debian/tmp/usr/bin/zleak' '$(CURDIR)/debian/tmp/usr/sbin/zleak'
8889

8990
@# Zed has dependencies outside of the system root.
9091
mv '$(CURDIR)/debian/tmp/sbin/zed' '$(CURDIR)/debian/tmp/usr/sbin/zed'

contrib/pyzfs/libzfs_core/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@
9595
lzc_set_props,
9696
lzc_list_children,
9797
lzc_list_snaps,
98+
lzc_raw_alloc,
9899
receive_header,
99100
)
100101

@@ -151,6 +152,7 @@
151152
'lzc_set_props',
152153
'lzc_list_children',
153154
'lzc_list_snaps',
155+
'lzc_raw_alloc',
154156
'receive_header',
155157
]
156158

contrib/pyzfs/libzfs_core/_error_translation.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -696,6 +696,12 @@ def lzc_list_translate_error(ret, name, opts):
696696
raise _generic_exception(ret, name, "Error obtaining a list")
697697

698698

699+
def lzc_raw_alloc_translate_errors(ret, name):
    '''
    Convert the return code of a raw allocation request into an exception.

    :param int ret: the return code; zero means success.
    :param name: the name passed along to the generated exception.

    Returns ``None`` when ``ret`` is zero and raises the generic exception
    built from ``ret`` and ``name`` otherwise.
    '''
    if ret != 0:
        raise _generic_exception(ret, name, "Error performing raw allocations")
703+
704+
699705
def _handle_err_list(ret, errlist, names, exception, mapper):
700706
'''
701707
Convert one or more errors from an operation into the requested exception.

contrib/pyzfs/libzfs_core/_libzfs_core.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2056,6 +2056,38 @@ def lzc_list_snaps(name):
20562056
return iter(snaps)
20572057

20582058

2059+
def lzc_raw_alloc(poolname, metaslab_size, metaslab_count, vdev_id,
                  allocations, force):
    '''
    Allocate regions of the provided vdev directly; useful primarily for
    performance analysis of fragmented pools. The space consumed this way is
    leaked and cannot currently be reclaimed.

    :param bytes poolname: the name of the pool to allocate in
    :param int metaslab_size: the size of a metaslab in this pool (for
        validation)
    :param int metaslab_count: the number of metaslabs in this top level
        vdev (for validation)
    :param int vdev_id: the id of the top-level vdev to perform allocations
        from
    :param allocations: pairs of offset and size to allocate
    :type allocations: list of (int, int)
    :param bool force: handed to the underlying library call unchanged

    :raises TooManyArguments: if more than 1000000 allocations are passed in
    '''
    if len(allocations) > 1000000:
        raise exceptions.TooManyArguments()

    # The library takes one flat array: offset, size, offset, size, ...
    nwords = 2 * len(allocations)
    flat = _ffi.new(f"uint64_t[{nwords}]")
    for idx, (offset, length) in enumerate(allocations):
        flat[2 * idx] = offset
        flat[2 * idx + 1] = length

    ret = _lib.lzc_raw_alloc(poolname, uint64_t(metaslab_size),
                             uint64_t(metaslab_count), uint64_t(vdev_id),
                             flat, nwords, force)
    errors.lzc_raw_alloc_translate_errors(ret, poolname)
2089+
2090+
20592091
# TODO: a better way to init and uninit the library
20602092
def _initialize():
20612093
class LazyInit(object):

0 commit comments

Comments
 (0)