Skip to content

Commit 9138823

Browse files
author
Paul Dagnelie
committed
Respond to review feedback
Update man pages, properly handle sit-outs of non-direct children, add efficient low-res time function, and update stats every time Signed-off-by: Paul Dagnelie <[email protected]> Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc.
1 parent de36f05 commit 9138823

File tree

8 files changed

+127
-37
lines changed

8 files changed

+127
-37
lines changed

include/os/freebsd/spl/sys/time.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,23 @@ typedef longlong_t hrtime_t;
6262
#define SEC_TO_TICK(sec) ((sec) * hz)
6363
#define NSEC_TO_TICK(nsec) ((nsec) / (NANOSEC / hz))
6464

65+
static __inline hrtime_t
66+
getlrtime(void)
67+
{
68+
struct timespec ts;
69+
hrtime_t nsec;
70+
71+
getnanotime(&ts);
72+
nsec = ((hrtime_t)ts.tv_sec * NANOSEC) + ts.tv_nsec;
73+
return (nsec);
74+
}
75+
76+
static __inline void
77+
nanouptime(struct timespec *ts)
78+
{
79+
clock(&ts);
80+
}
81+
6582
static __inline hrtime_t
6683
gethrtime(void)
6784
{

include/os/linux/spl/sys/time.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,14 @@ gethrestime_sec(void)
7979
return (ts.tv_sec);
8080
}
8181

82+
static inline hrtime_t
83+
getlrtime(void)
84+
{
85+
inode_timespec_t ts;
86+
ktime_get_coarse_real_ts64(&ts);
87+
return (((hrtime_t)ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec);
88+
}
89+
8290
static inline hrtime_t
8391
gethrtime(void)
8492
{

include/sys/vdev_raidz.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ void vdev_raidz_reflow_copy_scratch(spa_t *);
6363
void raidz_dtl_reassessed(vdev_t *);
6464
boolean_t vdev_sit_out_reads(vdev_t *, zio_flag_t);
6565
extern void vdev_raidz_sit_child(vdev_t *svd);
66+
extern void vdev_raidz_unsit_child(vdev_t *vd);
6667

6768
extern const zio_vsd_ops_t vdev_raidz_vsd_ops;
6869

lib/libspl/include/sys/time.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,15 @@ gethrestime_sec(void)
9797
return (tv.tv_sec);
9898
}
9999

100+
static inline hrtime_t
101+
getlrtime(void)
102+
{
103+
struct timeval tv;
104+
(void) gettimeofday(&tv, NULL);
105+
return ((((uint64_t)tv.tv_sec) * NANOSEC) +
106+
((uint64_t)tv.tv_usec * NSEC_PER_USEC));
107+
}
108+
100109
static inline hrtime_t
101110
gethrtime(void)
102111
{

lib/libzfs/libzfs_pool.c

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5712,8 +5712,16 @@ zpool_set_vdev_prop(zpool_handle_t *zhp, const char *vdevname,
57125712
nvlist_free(nvl);
57135713
nvlist_free(outnvl);
57145714

5715-
if (ret)
5716-
(void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf);
5715+
if (ret) {
5716+
if (errno == ENOTSUP) {
5717+
zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN,
5718+
"property not supported for this vdev"));
5719+
(void) zfs_error(zhp->zpool_hdl, EZFS_PROPTYPE, errbuf);
5720+
} else {
5721+
(void) zpool_standard_error(zhp->zpool_hdl, errno,
5722+
errbuf);
5723+
}
5724+
}
57175725

57185726
return (ret);
57195727
}

man/man7/vdevprops.7

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -105,11 +105,6 @@ Comma separated list of children of this vdev
105105
The number of children belonging to this vdev
106106
.It Sy read_errors , write_errors , checksum_errors , initialize_errors , trim_errors
107107
The number of errors of each type encountered by this vdev
108-
.It Sy sit_out
109-
True when a slow disk outlier was detected and the vdev is currently in a sit
110-
out state.
111-
While sitting out, the vdev will not participate in normal reads, instead its
112-
data will be reconstructed as needed from parity.
113108
.It Sy slow_ios
114109
This indicates the number of slow I/O operations encountered by this vdev.
115110
A slow I/O is defined as an operation that did not complete within the
@@ -161,6 +156,26 @@ The amount of space to reserve for the EFI system partition
161156
.It Sy failfast
162157
If this device should propagate BIO errors back to ZFS, used to disable
163158
failfast.
159+
.It Sy sit_out
160+
True when a slow disk outlier was detected and the vdev is currently in a sit
161+
out state.
162+
This property can be manually set to cause vdevs to sit out.
163+
It will also be automatically set by the
164+
.Sy autosit
165+
logic if that is enabled.
166+
While sitting out, the vdev will not participate in normal reads, instead its
167+
data will be reconstructed as needed from parity.
168+
.It Sy autosit
169+
Only valid for
170+
.Sy RAIDZ
171+
and
172+
.Sy DRAID
173+
vdevs.
174+
If set, this enables the kernel-level slow disk detection logic.
175+
This logic automatically causes any vdevs that are significant negative
176+
performance outliers to sit out, as described in the
177+
.Sy sit_out
178+
property.
164179
.It Sy path
165180
The path to the device for this vdev
166181
.It Sy allocating

module/zfs/vdev.c

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6150,10 +6150,13 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
61506150
break;
61516151
}
61526152
if (intval == 1) {
6153+
vdev_t *ancestor = vd;
6154+
while (ancestor->vdev_parent != vd->vdev_top)
6155+
ancestor = ancestor->vdev_parent;
61536156
vdev_t *pvd = vd->vdev_top;
61546157
uint_t sitouts = 0;
61556158
for (int i = 0; i < pvd->vdev_children; i++) {
6156-
if (pvd->vdev_child[i] == vd)
6159+
if (pvd->vdev_child[i] == ancestor)
61576160
continue;
61586161
if (vdev_sit_out_reads(
61596162
pvd->vdev_child[i], 0)) {
@@ -6167,7 +6170,7 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
61676170
if (error == 0)
61686171
vdev_raidz_sit_child(vd);
61696172
} else {
6170-
vd->vdev_read_sit_out_expire = 0;
6173+
vdev_raidz_unsit_child(vd);
61716174
}
61726175
break;
61736176
case VDEV_PROP_AUTOSIT:

module/zfs/vdev_raidz.c

Lines changed: 57 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2340,6 +2340,15 @@ vdev_sit_out_reads(vdev_t *vd, zio_flag_t io_flags)
23402340
/* Avoid skipping a data column read when scrubbing */
23412341
if (io_flags & ZIO_FLAG_SCRUB)
23422342
return (B_FALSE);
2343+
2344+
if (!vd->vdev_ops->vdev_op_leaf) {
2345+
boolean_t sitting = B_FALSE;
2346+
for (int c = 0; c < vd->vdev_children; c++) {
2347+
sitting |= vdev_sit_out_reads(vd->vdev_child[c],
2348+
io_flags);
2349+
}
2350+
return (sitting);
2351+
}
23432352
if (vd->vdev_read_sit_out_expire >= gethrestime_sec())
23442353
return (B_TRUE);
23452354
vd->vdev_read_sit_out_expire = 0;
@@ -2904,6 +2913,13 @@ latency_compare(const void *arg1, const void *arg2)
29042913
void
29052914
vdev_raidz_sit_child(vdev_t *svd)
29062915
{
2916+
for (int c = 0; c < svd->vdev_children; c++) {
2917+
vdev_raidz_sit_child(svd->vdev_child[c]);
2918+
}
2919+
2920+
if (!svd->vdev_ops->vdev_op_leaf)
2921+
return;
2922+
29072923
/*
29082924
* Begin a sit out period for this slow drive
29092925
*/
@@ -2915,6 +2931,19 @@ vdev_raidz_sit_child(vdev_t *svd)
29152931
mutex_exit(&svd->vdev_stat_lock);
29162932
}
29172933

2934+
void
2935+
vdev_raidz_unsit_child(vdev_t *vd)
2936+
{
2937+
for (int c = 0; c < vd->vdev_children; c++) {
2938+
vdev_raidz_sit_child(vd->vdev_child[c]);
2939+
}
2940+
2941+
if (!vd->vdev_ops->vdev_op_leaf)
2942+
return;
2943+
2944+
vd->vdev_read_sit_out_expire = 0;
2945+
}
2946+
29182947
/*
29192948
* Check for any latency outlier from latest set of child reads.
29202949
*
@@ -2943,7 +2972,7 @@ vdev_child_slow_outlier(zio_t *zio)
29432972
vd->vdev_children < LAT_SAMPLES_MIN)
29442973
return;
29452974

2946-
hrtime_t now = gethrtime();
2975+
hrtime_t now = getlrtime();
29472976
uint64_t last = atomic_load_64(&vd->vdev_last_latency_check);
29482977

29492978
if ((now - last) < MSEC2NSEC(vdev_raidz_outlier_check_interval_ms) ||
@@ -2976,64 +3005,64 @@ vdev_child_slow_outlier(zio_t *zio)
29763005
uint64_t max = 0;
29773006
vdev_t *svd = NULL; /* suspect vdev */
29783007
uint_t sitouts = 0;
3008+
boolean_t skip = B_FALSE, svd_sitting = B_FALSE;
29793009
for (int c = 0; c < samples; c++) {
29803010
vdev_t *cvd = vd->vdev_child[c];
3011+
boolean_t sitting = vdev_sit_out_reads(cvd, 0);
29813012

2982-
if (cvd->vdev_read_sit_out_expire != 0) {
2983-
if (cvd->vdev_read_sit_out_expire < gethrestime_sec()) {
2984-
/*
2985-
* Done with our sit out, wait for new outlier
2986-
* to emerge.
2987-
*/
2988-
cvd->vdev_read_sit_out_expire = 0;
2989-
} else if (sitouts++ >= vdev_get_nparity(vd)) {
2990-
/*
2991-
* We can't sit out more disks than we have
2992-
* parity
2993-
*/
2994-
goto out;
2995-
}
3013+
if (sitting && sitouts++ >= vdev_get_nparity(vd)) {
3014+
/*
3015+
* We can't sit out more disks than we have
3016+
* parity
3017+
*/
3018+
skip = B_TRUE;
29963019
}
29973020
mutex_enter(&cvd->vdev_stat_lock);
3021+
3022+
uint64_t *prev_histo = cvd->vdev_prev_histo;
29983023
uint64_t *histo =
29993024
cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ];
3000-
uint64_t *prev_histo = cvd->vdev_prev_histo;
3025+
if (skip) {
3026+
size_t size =
3027+
sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]);
3028+
memcpy(prev_histo, histo, size);
3029+
mutex_exit(&cvd->vdev_stat_lock);
3030+
continue;
3031+
}
30013032
uint64_t count = 0;
30023033
lat_data[c] = 0;
30033034
for (int i = 0; i < VDEV_L_HISTO_BUCKETS; i++) {
30043035
uint64_t this_count = histo[i] - prev_histo[i];
30053036
lat_data[c] += (1ULL << i) * this_count;
30063037
count += this_count;
30073038
}
3039+
size_t size = sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]);
3040+
memcpy(prev_histo, histo, size);
30083041
mutex_exit(&cvd->vdev_stat_lock);
30093042
lat_data[c] /= MAX(1, count);
30103043

30113044
/* wait until all disks have been read from */
3012-
if (lat_data[c] == 0 && cvd->vdev_read_sit_out_expire == 0)
3013-
goto out;
3045+
if (lat_data[c] == 0 && !sitting) {
3046+
skip = B_TRUE;
3047+
continue;
3048+
}
30143049

30153050
/* keep track of the vdev with largest value */
30163051
if (lat_data[c] > max) {
30173052
max = lat_data[c];
30183053
svd = cvd;
3054+
svd_sitting = sitting;
30193055
}
30203056
}
3021-
3022-
for (int c = 0; c < samples; c++) {
3023-
vdev_t *cvd = vd->vdev_child[c];
3024-
mutex_enter(&cvd->vdev_stat_lock);
3025-
size_t size = sizeof (cvd->vdev_stat_ex.vsx_disk_histo[0]);
3026-
memcpy(cvd->vdev_prev_histo,
3027-
cvd->vdev_stat_ex.vsx_disk_histo[ZIO_TYPE_READ], size);
3028-
mutex_exit(&cvd->vdev_stat_lock);
3029-
}
3057+
if (skip)
3058+
goto out;
30303059

30313060
qsort((void *)lat_data, samples, sizeof (uint64_t), latency_compare);
30323061
uint64_t iqr = 0;
30333062
uint64_t fence = latency_quartiles_fence(lat_data, samples, &iqr);
30343063

30353064
ASSERT3U(lat_data[samples - 1], ==, max);
3036-
if (max > fence && svd->vdev_read_sit_out_expire == 0) {
3065+
if (max > fence && !svd_sitting) {
30373066
uint64_t incr = MAX(1, (max - fence) / iqr);
30383067
vd->vdev_outlier_count += incr;
30393068
if (vd->vdev_outlier_count >= samples) {

0 commit comments

Comments
 (0)