diff --git a/cmd/raidz_test/raidz_bench.c b/cmd/raidz_test/raidz_bench.c index db51b8818aa6..a824ff027ef4 100644 --- a/cmd/raidz_test/raidz_bench.c +++ b/cmd/raidz_test/raidz_bench.c @@ -86,7 +86,8 @@ run_gen_bench_impl(const char *impl) if (rto_opts.rto_expand) { rm_bench = vdev_raidz_map_alloc_expanded( &zio_bench, - rto_opts.rto_ashift, ncols+1, ncols, + rto_opts.rto_ashift, 1, + ncols + rto_opts.rto_expand, ncols, fn+1, rto_opts.rto_expand_offset, 0, B_FALSE); } else { @@ -174,7 +175,8 @@ run_rec_bench_impl(const char *impl) if (rto_opts.rto_expand) { rm_bench = vdev_raidz_map_alloc_expanded( &zio_bench, - BENCH_ASHIFT, ncols+1, ncols, + BENCH_ASHIFT, 1, + ncols + rto_opts.rto_expand, ncols, PARITY_PQR, rto_opts.rto_expand_offset, 0, B_FALSE); } else { diff --git a/cmd/raidz_test/raidz_test.c b/cmd/raidz_test/raidz_test.c index cf3e123c6090..eb8a35e621cb 100644 --- a/cmd/raidz_test/raidz_test.c +++ b/cmd/raidz_test/raidz_test.c @@ -119,7 +119,7 @@ static void usage(boolean_t requested) "\t[-S parameter sweep (default: %s)]\n" "\t[-t timeout for parameter sweep test]\n" "\t[-B benchmark all raidz implementations]\n" - "\t[-e use expanded raidz map (default: %s)]\n" + "\t[-e vdevs attached to expanded raidz (default: %llx)]\n" "\t[-r expanded raidz map reflow offset (default: %llx)]\n" "\t[-v increase verbosity (default: %d)]\n" "\t[-h (print help)]\n" @@ -131,7 +131,7 @@ static void usage(boolean_t requested) o->rto_dcols, /* -d */ ilog2(o->rto_dsize), /* -s */ rto_opts.rto_sweep ? "yes" : "no", /* -S */ - rto_opts.rto_expand ? 
"yes" : "no", /* -e */ + (u_longlong_t)rto_opts.rto_expand, /* -e */ (u_longlong_t)o->rto_expand_offset, /* -r */ o->rto_v); /* -v */ @@ -146,14 +146,15 @@ static void process_options(int argc, char **argv) memcpy(o, &rto_opts_defaults, sizeof (*o)); - while ((opt = getopt(argc, argv, "TDBSvha:er:o:d:s:t:")) != -1) { + while ((opt = getopt(argc, argv, "TDBSvha:e:r:o:d:s:t:")) != -1) { switch (opt) { case 'a': value = strtoull(optarg, NULL, 0); o->rto_ashift = MIN(13, MAX(9, value)); break; case 'e': - o->rto_expand = 1; + value = strtoull(optarg, NULL, 0); + o->rto_expand = MIN(255, MAX(1, value)); break; case 'r': o->rto_expand_offset = strtoull(optarg, NULL, 0); @@ -329,11 +330,11 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) if (opts->rto_expand) { opts->rm_golden = vdev_raidz_map_alloc_expanded(opts->zio_golden, - opts->rto_ashift, total_ncols+1, total_ncols, - parity, opts->rto_expand_offset, 0, B_FALSE); + opts->rto_ashift, 1, total_ncols + opts->rto_expand, + total_ncols, parity, opts->rto_expand_offset, 0, B_FALSE); rm_test = vdev_raidz_map_alloc_expanded(zio_test, - opts->rto_ashift, total_ncols+1, total_ncols, - parity, opts->rto_expand_offset, 0, B_FALSE); + opts->rto_ashift, 1, total_ncols + opts->rto_expand, + total_ncols, parity, opts->rto_expand_offset, 0, B_FALSE); } else { opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden, opts->rto_ashift, total_ncols, parity); @@ -380,8 +381,8 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) if (opts->rto_expand) { rm = vdev_raidz_map_alloc_expanded(*zio, - opts->rto_ashift, total_ncols+1, total_ncols, - parity, opts->rto_expand_offset, 0, B_FALSE); + opts->rto_ashift, 1, total_ncols + opts->rto_expand, + total_ncols, parity, opts->rto_expand_offset, 0, B_FALSE); } else { rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, total_ncols, parity); diff --git a/cmd/raidz_test/raidz_test.h b/cmd/raidz_test/raidz_test.h index f0b854cefb5d..3c693854d48d 100644 --- 
a/cmd/raidz_test/raidz_test.h
+++ b/cmd/raidz_test/raidz_test.h
@@ -58,7 +58,7 @@ typedef struct raidz_test_opts {
 	size_t rto_sweep;
 	size_t rto_sweep_timeout;
 	size_t rto_benchmark;
-	size_t rto_expand;
+	uint64_t rto_expand;
 	uint64_t rto_expand_offset;
 	size_t rto_sanity;
 	size_t rto_gdb;
diff --git a/cmd/zhack.c b/cmd/zhack.c
index 8244bc83fa0d..c36c90b83315 100644
--- a/cmd/zhack.c
+++ b/cmd/zhack.c
@@ -54,6 +54,7 @@
 #include
 #include
 #include
+#include
 
 static importargs_t g_importargs;
 static char *g_pool;
@@ -157,8 +158,10 @@ zhack_import(char *target, boolean_t readonly)
 		.lpc_printerr = B_TRUE
 	};
 	error = zpool_find_config(&lpch, target, &config, &g_importargs);
-	if (error)
+	if (error) {
+		printf("zhack_import():P0\n");
 		fatal(NULL, FTAG, "cannot import '%s'", target);
+	}
 
 	props = NULL;
 	if (readonly) {
@@ -175,9 +178,11 @@ zhack_import(char *target, boolean_t readonly)
 	if (error == EEXIST)
 		error = 0;
 
-	if (error)
+	if (error) {
+		printf("zhack_import():P1\n");
 		fatal(NULL, FTAG, "can't import '%s': %s", target,
 		    strerror(error));
+	}
 }
 
 static void
@@ -966,6 +971,176 @@ zhack_do_label(int argc, char **argv)
 	return (err);
 }
 
+/*
+ * Build a root vdev nvlist whose children are `count` file vdevs, one per
+ * entry of path[], each carrying the given ashift.  The caller releases the
+ * result with nvlist_free().
+ */
+static nvlist_t *
+make_vdev_file(char *path[], int count, uint64_t ashift)
+{
+	nvlist_t **file;
+	nvlist_t *root;
+
+	file = umem_alloc(count * sizeof (nvlist_t *), UMEM_NOFAIL);
+
+	for (int i = 0; i < count; i++) {
+		file[i] = fnvlist_alloc();
+		fnvlist_add_string(file[i], ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE);
+		fnvlist_add_string(file[i], ZPOOL_CONFIG_PATH, path[i]);
+		fnvlist_add_uint64(file[i], ZPOOL_CONFIG_ASHIFT, ashift);
+	}
+
+	root = fnvlist_alloc();
+	fnvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
+	fnvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
+	    (const nvlist_t **)file, count);
+
+	/*
+	 * The array add keeps its own copy of every child, so the temporary
+	 * child nvlists and the pointer array are released here.
+	 */
+	for (int i = 0; i < count; i++)
+		fnvlist_free(file[i]);
+	umem_free(file, count * sizeof (nvlist_t *));
+
+	return (root);
+}
+
+#define MAX_DEVS_IN_RAIDZ 255
+
+/*
+ * Handler for "raidz expand POOL DEVICE...".  Attaches the listed file
+ * vdevs to the pool's first top-level vdev, then polls the expansion
+ * statistics until the reflow leaves the DSS_SCANNING state.
+ */
+static int
+zhack_do_raidz_expand(int argc, char **argv)
+{
+	spa_t *spa;
+	char *target;
+	char *newpath[MAX_DEVS_IN_RAIDZ];
+	nvlist_t *root;
+	vdev_t *cvd, *rzvd;
+	pool_raidz_expand_stat_t rzx_stats;
+	int count, err = 0;
+
+	argc--;
+	argv++;
+
+	if (argc == 0) {
+		(void) fprintf(stderr,
+		    "error: no pool to attach specified\n");
+		usage();
+	}
+
+	target = argv[0];
+
+	argc--;
+	argv++;
+
+	/* newpath[] has a fixed size; refuse what cannot fit in it. */
+	if (argc == 0 || argc > MAX_DEVS_IN_RAIDZ) {
+		(void) fprintf(stderr,
+		    "error: expected 1 to %d devices to attach\n",
+		    MAX_DEVS_IN_RAIDZ);
+		usage();
+	}
+
+	for (count = 0; argc != 0; count++, argc--, argv++)
+		newpath[count] = argv[0];
+
+	zhack_spa_open(target, B_FALSE, FTAG, &spa);
+
+	printf("Attaching to %s:\n", target);
+	for (int i = 0; i < count; i++)
+		printf("device %s\n", newpath[i]);
+
+	rzvd = spa->spa_root_vdev->vdev_child[0];
+	/* A leaf top-level vdev has no child to take the ashift from. */
+	if (rzvd->vdev_children == 0) {
+		(void) fprintf(stderr,
+		    "raidz expand: first top-level vdev has no children\n");
+		exit(1);
+	}
+	cvd = rzvd->vdev_child[0];
+	root = make_vdev_file(newpath, count, cvd->vdev_ashift);
+
+	dump_nvlist(root, 0);
+
+	err = spa_vdev_attach(spa, rzvd->vdev_guid, root, B_FALSE, B_FALSE);
+	nvlist_free(root);
+	if (err != 0) {
+		printf("raidz expand: attach returned %d\n", err);
+		exit(1);
+	}
+
+	/*
+	 * Wait for reflow to begin
+	 */
+	while (spa->spa_raidz_expand == NULL) {
+		txg_wait_synced(spa_get_dsl(spa), 0);
+		sleep(1);
+	}
+
+	printf("Reflow started...\n");
+
+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+	(void) spa_raidz_expand_get_stats(spa, &rzx_stats);
+	spa_config_exit(spa, SCL_CONFIG, FTAG);
+	while (rzx_stats.pres_state == DSS_SCANNING) {
+		txg_wait_synced(spa_get_dsl(spa), 0);
+		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+		(void) spa_raidz_expand_get_stats(spa, &rzx_stats);
+		spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+		/* Cast so the arguments match the format on every ABI. */
+		printf("%llu/%llu,",
+		    (u_longlong_t)(rzx_stats.pres_reflowed / (1024 * 1024)),
+		    (u_longlong_t)(rzx_stats.pres_to_reflow / (1024 * 1024)));
+		fflush(stdout);
+
+		sleep(10);
+	}
+
+	printf("\n");
+	printf("Reflow done\n");
+
+	spa_close(spa, FTAG);
+
+	return (err);
+}
+
+/*
+ * Dispatcher for "zhack raidz SUBCOMMAND"; "expand" is the only
+ * subcommand handled here.
+ */
+static int
+zhack_do_rze(int argc, char **argv)
+{
+	char *subcommand;
+	int err;
+
+	argc--;
+	argv++;
+	if (argc == 0) {
+		(void) fprintf(stderr,
+		    "error: no raidz operation specified\n");
+		usage();
+	}
+
+	subcommand = argv[0];
+	if (strcmp(subcommand, "expand") == 0) {
+		err = zhack_do_raidz_expand(argc, argv);
+	} else {
+		(void) fprintf(stderr, "error: unknown subcommand: %s\n",
+		    subcommand);
+		usage();
+	}
+
+	return (err);
+}
+
 #define MAX_NUM_PATHS 1024
 
 int
@@ -1011,6 +1186,8 @@ main(int argc, char **argv)
 		rv = zhack_do_feature(argc, argv);
 	} else if (strcmp(subcommand, "label") == 0) {
 		return (zhack_do_label(argc, argv));
+	} else if (strcmp(subcommand, "raidz") == 0) {
+		return (zhack_do_rze(argc, argv));
 	} else {
 		(void) fprintf(stderr, "error: unknown subcommand: %s\n",
 		    subcommand);
@@ -1026,3 +1203,59 @@ main(int argc, char **argv)
 
 	return (rv);
 }
+
+#if 0
+#!/bin/bash
+
+POOL_NAME="test"
+REF_POOL="/home/user/Pools/Ref"
+TEST_POOL="/home/user/Pools/Test"
+VDEV_SIZE="1G"
+VDEVS=4
+
+create_ref_pool()
+{
+	for i in $(seq 0 $(($VDEVS-1))); do
+		echo "Allocate file $REF_POOL/file${i}"
+		truncate -s $VDEV_SIZE $REF_POOL/file${i}
+	done
+
+	zpool create -f $POOL_NAME raidz $REF_POOL/file*
+
+	zpool status
+
+	dd if=/dev/urandom of=/test/file bs=1M status=progress
+
+	zpool export $POOL_NAME
+}
+
+attach_raidz_vdev()
+{
+	zpool status
+
+	echo "Copy ref pool..."
+	rm -r -f $TEST_POOL
+	mkdir $TEST_POOL
+
+	pids=()
+	for i in $(seq 0 $(($VDEVS-1))); do
+		cp $REF_POOL/file${i} $TEST_POOL/ &
+		pids[${i}]=$!
+ done + + # wait for all pids + for pid in ${pids[*]}; do + wait $pid + done + + truncate -s $VDEV_SIZE $TEST_POOL/file${VDEVS} + + /home/user/Sources/zfs/zhack -d $TEST_POOL raidz expand $POOL_NAME $TEST_POOL/file${VDEVS} + + zdb -bcc -d -Y -e -p $TEST_POOL $POOL_NAME +} + +# MAIN +# create_ref_pool +attach_raidz_vdev +#endif diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 23cc590cc304..771950d1467a 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -7492,12 +7492,17 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) boolean_t wait = B_FALSE; int c; nvlist_t *nvroot; + char raidz_prefix[] = "raidz"; char *poolname, *old_disk, *new_disk; zpool_handle_t *zhp; nvlist_t *props = NULL; char *propval; int ret; + printf("=====:\n"); + for (int i = 0; i < argc; i++) + printf("i=%d, argv=%s\n", i, argv[i]); + /* check options */ while ((c = getopt(argc, argv, "fo:sw")) != -1) { switch (c) { @@ -7564,7 +7569,8 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) argv += 2; } - if (argc > 1) { + if (argc > 1 && + (replacing || strncmp(old_disk, raidz_prefix, strlen(raidz_prefix)))) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } @@ -7604,12 +7610,17 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) return (1); } + printf("nvroot:\n"); + dump_nvlist(nvroot, 0); + + printf("vdev_tree:\n"); + print_vdev_tree(zhp, NULL, nvroot, 0, "", VDEV_NAME_PATH); + ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing, rebuild); if (ret == 0 && wait) { zpool_wait_activity_t activity = ZPOOL_WAIT_RESILVER; - char raidz_prefix[] = "raidz"; if (replacing) { activity = ZPOOL_WAIT_REPLACE; } else if (strncmp(old_disk, diff --git a/cmd/ztest.c b/cmd/ztest.c index 2e88ae3e7994..cf9610638681 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -3032,6 +3032,9 @@ ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) spa_t *spa; nvlist_t *nvroot; + // XXX: SKIP + return; + if 
(zo->zo_mmp_test) return; @@ -3094,6 +3097,9 @@ ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id) ztest_shared_opts_t *zo = &ztest_opts; spa_t *spa = ztest_spa; + // XXX: SKIP + return; + if (zo->zo_mmp_test) return; @@ -3332,6 +3338,9 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) nvlist_t *nvroot; int error; + // XXX: SKIP + return; + if (ztest_opts.zo_mmp_test) return; @@ -3424,6 +3433,9 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; int error; + // XXX: SKIP + return; + /* * By default add a special vdev 50% of the time */ @@ -3507,6 +3519,9 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) uint64_t guid = 0; int error, ignore_err = 0; + // XXX: SKIP + return; + if (ztest_opts.zo_mmp_test) return; @@ -3727,6 +3742,9 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) int oldvd_is_special; int error, expected_error; + // XXX: SKIP + return; + if (ztest_opts.zo_mmp_test) return; @@ -4075,11 +4093,13 @@ ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) (void) zd, (void) id; ztest_shared_t *zs = ztest_shared; spa_t *spa = ztest_spa; - uint64_t leaves, raidz_children, newsize, ashift = ztest_get_ashift(); + uint64_t leaves, raidz_children, raidz_attach_children = 0, newsize; + uint64_t ashift = ztest_get_ashift(); kthread_t *scratch_thread = NULL; vdev_t *newvd, *pvd; - nvlist_t *root; - char *newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + nvlist_t *root = NULL; + nvlist_t **child = NULL; + char **newpath = NULL; int error, expected_error = 0; mutex_enter(&ztest_vdev_lock); @@ -4106,6 +4126,11 @@ ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) ASSERT(pvd->vdev_ops == &vdev_raidz_ops); + /* + * Get number of raidz childrent to attach + */ + raidz_attach_children = 2 + ztest_random(2); + /* * Get size of a child of the raidz group, * make sure device is a bit bigger @@ -4125,17 +4150,33 @@ ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) spa_config_exit(spa, SCL_ALL, FTAG); - 
/* - * Path to vdev to be attached - */ - (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, - ztest_opts.zo_dir, ztest_opts.zo_pool, zs->zs_vdev_next_leaf); + newpath = umem_alloc(raidz_attach_children * sizeof (char*), + UMEM_NOFAIL); + child = umem_alloc(raidz_attach_children * sizeof (nvlist_t *), + UMEM_NOFAIL); + for (int i = 0; i < raidz_attach_children; i++) { + /* + * Path to vdev to be attached + */ + newpath[i] = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + (void) snprintf(newpath[i], MAXPATHLEN, ztest_dev_template, + ztest_opts.zo_dir, ztest_opts.zo_pool, + zs->zs_vdev_next_leaf + i); - /* - * Build the nvlist describing newpath. - */ - root = make_vdev_root(newpath, NULL, NULL, newsize, ashift, NULL, - 0, 0, 1); + /* + * Build the nvlist describing newpath. + */ + child[i] = make_vdev_file(newpath[i], NULL, NULL, newsize, + ashift); + } + + root = fnvlist_alloc(); + fnvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT); + fnvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN, + (const nvlist_t **)child, raidz_attach_children); + + printf("==== ztest_vdev_raidz_attach():\n"); + dump_nvlist(root, 0); /* * 50% of the time, set raidz_expand_pause_point to cause @@ -4143,7 +4184,7 @@ ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) * then kill the test after 10 seconds so raidz_scratch_verify() * can confirm consistency when the pool is imported. 
*/ - if (ztest_random(2) == 0 && expected_error == 0) { + if (0 /*ztest_random(2) == 0 && expected_error == 0*/) { raidz_expand_pause_point = ztest_random(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2) + 1; scratch_thread = thread_create(NULL, 0, ztest_scratch_thread, @@ -4161,7 +4202,7 @@ ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) if (error != 0 && error != expected_error) { fatal(0, "raidz attach (%s %"PRIu64") returned %d, expected %d", - newpath, newsize, error, expected_error); + newpath[0], newsize, error, expected_error); } if (raidz_expand_pause_point) { @@ -4178,7 +4219,13 @@ ztest_vdev_raidz_attach(ztest_ds_t *zd, uint64_t id) out: mutex_exit(&ztest_vdev_lock); - umem_free(newpath, MAXPATHLEN); + for (int i = 0; i < raidz_attach_children; i++) { + fnvlist_free(child[i]); + umem_free(newpath[i], MAXPATHLEN); + } + + umem_free(child, raidz_attach_children * sizeof (nvlist_t *)); + umem_free(newpath, raidz_attach_children * sizeof (char*)); } void @@ -4190,6 +4237,9 @@ ztest_device_removal(ztest_ds_t *zd, uint64_t id) uint64_t guid; int error; + // XXX: SKIP + return; + mutex_enter(&ztest_vdev_lock); if (ztest_device_removal_active) { @@ -4370,6 +4420,9 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) uint64_t top; uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; + // XXX: SKIP + return; + mutex_enter(&ztest_checkpoint_lock); mutex_enter(&ztest_vdev_lock); spa_config_enter(spa, SCL_STATE, spa, RW_READER); @@ -6438,6 +6491,9 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + // XXX: SKIP + return; + mutex_enter(&ztest_vdev_lock); /* diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index c8deb5be419e..f7f0de7e468c 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -821,6 +821,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_NPARITY "nparity" #define ZPOOL_CONFIG_RAIDZ_EXPANDING "raidz_expanding" #define 
ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS "raidz_expand_txgs" +#define ZPOOL_CONFIG_RAIDZ_EXPAND_VDEVS "raidz_expand_vdevs" #define ZPOOL_CONFIG_HOSTID "hostid" #define ZPOOL_CONFIG_HOSTNAME "hostname" #define ZPOOL_CONFIG_LOADED_TIME "initial_load_time" diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h index 3b02728cdbf3..e94a94d686bb 100644 --- a/include/sys/vdev_raidz.h +++ b/include/sys/vdev_raidz.h @@ -49,7 +49,8 @@ struct kernel_param {}; struct raidz_map *vdev_raidz_map_alloc(struct zio *, uint64_t, uint64_t, uint64_t); struct raidz_map *vdev_raidz_map_alloc_expanded(struct zio *, - uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, boolean_t); + uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, + boolean_t); void vdev_raidz_map_free(struct raidz_map *); void vdev_raidz_free(struct vdev_raidz *); void vdev_raidz_generate_parity_row(struct raidz_map *, struct raidz_row *); @@ -84,6 +85,11 @@ typedef struct vdev_raidz_expand { kmutex_t vre_lock; kcondvar_t vre_cv; + /* + * Number of children attached during current expansion. + */ + uint64_t vre_children_attached; + /* * How much i/o is outstanding (issued and not completed).
*/ diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index debce6f09a22..17198b7e04e8 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -160,6 +160,7 @@ typedef struct raidz_map { */ typedef struct reflow_node { uint64_t re_txg; + uint64_t re_children_attached; uint64_t re_logical_width; avl_node_t re_link; } reflow_node_t; diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index b6fb153c4968..5907b6dd61c1 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -3640,9 +3640,12 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0 || children != 1) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "new device must be a single disk")); - return (zfs_error(hdl, EZFS_INVALCONFIG, errbuf)); + if (strcmp(type, VDEV_TYPE_RAIDZ)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "new device must be a single disk")); + return (zfs_error(hdl, EZFS_INVALCONFIG, errbuf)); + // XXX more clean logic is required + } } config_root = fnvlist_lookup_nvlist(zpool_get_config(zhp, NULL), @@ -3670,6 +3673,13 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, zcmd_write_conf_nvlist(hdl, &zc, nvroot); + printf("zc.zc_name=%s\n", zc.zc_name); + printf("zc.zc_guid=%lx\n", zc.zc_guid); + printf("zc.zc_cookie=%ld\n", zc.zc_cookie); + printf("zc.zc_simple=%d\n", zc.zc_simple); + dump_nvlist(nvroot, 0); + + printf("==== zfs_ioctl:attach\n"); ret = zfs_ioctl(hdl, ZFS_IOC_VDEV_ATTACH, &zc); zcmd_free_nvlists(&zc); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index c0876c935405..088ed41c92ba 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -7526,13 +7526,17 @@ spa_vdev_new_spare_would_cause_double_spares(vdev_t *newvd, vdev_t *pvd) * should be performed instead of traditional healing reconstruction. From * an administrators perspective these are both resilver operations. 
*/ + +/* + * XXX guid is raidz vdev guid in case of raidz expansion + */ int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, int rebuild) { uint64_t txg, dtl_max_txg; vdev_t *rvd = spa->spa_root_vdev; - vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; + vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd, *ivd, **rzcvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; int newvd_isspare = B_FALSE; @@ -7576,6 +7580,12 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops; +#if defined(_KERNEL) && defined(__linux__) + printk("====== spa_vdev_attach(+)"); +#else + printf("====== spa_vdev_attach(+):raidz=%d\n", raidz); +#endif + if (raidz) { if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION)) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); @@ -7584,6 +7594,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, * Can't expand a raidz while prior expand is in progress. 
*/ if (spa->spa_raidz_expand != NULL) { +#ifndef _KERNEL + printf("====== spa_vdev_attach(-):ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS\n"); +#endif return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); } @@ -7600,16 +7613,34 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, VDEV_ALLOC_ATTACH) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); - if (newrootvd->vdev_children != 1) + if (newrootvd->vdev_children != 1 && !raidz) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); +#ifndef _KERNEL + printf("spa_vdev_attach():attach children=%ld\n", + newrootvd->vdev_children); +#endif + newvd = newrootvd->vdev_child[0]; - if (!newvd->vdev_ops->vdev_op_leaf) - return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); + /// XXX: free it + rzcvd = kmem_zalloc((1 + newrootvd->vdev_children) * sizeof (vdev_t *), KM_SLEEP); - if ((error = vdev_create(newrootvd, txg, replacing)) != 0) + for (int i = 0; i < newrootvd->vdev_children; i++) { + ivd = newrootvd->vdev_child[i]; + rzcvd[i] = ivd; + if (!ivd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); + } + + if ((error = vdev_create(newrootvd, txg, replacing)) != 0) { +#if defined(_KERNEL) && defined(__linux__) + printk("====== spa_vdev_attach():vdev_create(), err=%d", error); +#else + printf("====== spa_vdev_attach():vdev_create(), err=%d\n", error); +#endif return (spa_vdev_exit(spa, newrootvd, txg, error)); + } /* * log, dedup and special vdevs should not be replaced by spares. @@ -7622,9 +7653,12 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, /* * A dRAID spare can only replace a child of its parent dRAID vdev. 
*/ - if (newvd->vdev_ops == &vdev_draid_spare_ops && - oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) { - return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + for (int i = 0; i < newrootvd->vdev_children; i++) { + ivd = newrootvd->vdev_child[i]; + if (ivd->vdev_ops == &vdev_draid_spare_ops && + oldvd->vdev_top != vdev_draid_spare_get_parent(ivd)) { + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + } } if (rebuild) { @@ -7657,6 +7691,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, pvops = &vdev_mirror_ops; } else { + newvd = newrootvd->vdev_child[0]; + ASSERT(newrootvd->vdev_children == 1); + /* * Active hot spares can only be replaced by inactive hot * spares. @@ -7697,25 +7734,38 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, /* * Make sure the new device is big enough. */ - vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd; - if (newvd->vdev_asize < vdev_get_min_asize(min_vdev)) - return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); - /* * The new device cannot have a higher alignment requirement * than the top-level vdev. */ - if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) { - return (spa_vdev_exit(spa, newrootvd, txg, - ZFS_ERR_ASHIFT_MISMATCH)); + for (int i = 0; i < newrootvd->vdev_children; i++) { + ivd = newrootvd->vdev_child[i]; + vdev_t *min_vdev = raidz ? 
oldvd->vdev_child[0] : oldvd; + if (ivd->vdev_asize < vdev_get_min_asize(min_vdev)) { +#if defined(_KERNEL) && defined(__linux__) + printk("====== spa_vdev_attach() => EOVERFLOW, raidz=%d, %llu < %llu", + raidz, ivd->vdev_asize, vdev_get_min_asize(min_vdev)); +#else + printf("====== spa_vdev_attach() => EOVERFLOW, raidz=%d, %lu < %lu\n", + raidz, ivd->vdev_asize, vdev_get_min_asize(min_vdev)); +#endif + return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); + } + + + if (ivd->vdev_ashift > oldvd->vdev_top->vdev_ashift) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } /* * RAIDZ-expansion-specific checks. */ if (raidz) { - if (vdev_raidz_attach_check(newvd) != 0) - return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + for (int i = 0; i < newrootvd->vdev_children; i++) { + ivd = newrootvd->vdev_child[i]; + if (vdev_raidz_attach_check(ivd) != 0) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + } /* * Fail early if a child is not healthy or being replaced @@ -7733,9 +7783,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, EADDRINUSE)); } } - } - if (raidz) { /* * Note: oldvdpath is freed by spa_strfree(), but * kmem_asprintf() is freed by kmem_strfree(), so we have to @@ -7781,23 +7829,28 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, ASSERT(pvd->vdev_top->vdev_parent == rvd); /* - * Extract the new device from its root and add it to pvd. + * Reevaluate the parent vdev state. */ - vdev_remove_child(newrootvd, newvd); - newvd->vdev_id = pvd->vdev_children; - newvd->vdev_crtxg = oldvd->vdev_crtxg; - vdev_add_child(pvd, newvd); + vdev_propagate_state(pvd); /* - * Reevaluate the parent vdev state. + * Extract the new device from its root and add it to pvd. 
*/ - vdev_propagate_state(pvd); + tvd = newvd; // XXX prevent warning about uninitilized variable + for (int i = 0; i < newrootvd->vdev_children; i++) { /// XXX it is possible that children will be changed inside the lopp + ivd = newrootvd->vdev_child[i]; - tvd = newvd->vdev_top; - ASSERT(pvd->vdev_top == tvd); - ASSERT(tvd->vdev_parent == rvd); + vdev_remove_child(newrootvd, ivd); + ivd->vdev_id = pvd->vdev_children; + ivd->vdev_crtxg = oldvd->vdev_crtxg; + vdev_add_child(pvd, ivd); - vdev_config_dirty(tvd); + tvd = ivd->vdev_top; + ASSERT(pvd->vdev_top == tvd); + ASSERT(tvd->vdev_parent == rvd); + + vdev_config_dirty(tvd); + } /* * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account @@ -7828,7 +7881,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, dtl_max_txg); dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync, - newvd, tx); + rzcvd, tx); /// XXX pass attached childrent thru void *arg dmu_tx_commit(tx); } else { vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, @@ -7870,15 +7923,22 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, } if (spa->spa_bootfs) - spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); + for (int i = 0; i < newrootvd->vdev_children; i++) { + ivd = tvd->vdev_child[i]; + spa_event_notify(spa, ivd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); + } - spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); + for (int i = 0; i < newrootvd->vdev_children; i++) { + newvd = tvd->vdev_child[i]; + spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); + } /* * Commit the config */ (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); + // XXX update spa history to support multiple devices attach spa_history_log_internal(spa, "vdev attach", NULL, "%s vdev=%s %s vdev=%s", replacing && newvd_isspare ? 
"spare in" : diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index a5fa9a604936..f0f601b8daab 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -733,7 +733,8 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, */ noinline raidz_map_t * vdev_raidz_map_alloc_expanded(zio_t *zio, - uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, + uint64_t ashift, uint64_t new_children, + uint64_t physical_cols, uint64_t logical_cols, uint64_t nparity, uint64_t reflow_offset_synced, uint64_t reflow_offset_next, boolean_t use_scratch) { @@ -741,6 +742,11 @@ vdev_raidz_map_alloc_expanded(zio_t *zio, uint64_t offset = zio->io_offset; uint64_t size = zio->io_size; +#ifndef _KERNEL +// printf("vdev_raidz_map_alloc_expanded():pcols=%ld,lcols=%ld\n", +// physical_cols, logical_cols); +#endif + /* The zio's size in units of the vdev's minimum sector size. */ uint64_t s = size >> ashift; @@ -801,7 +807,7 @@ vdev_raidz_map_alloc_expanded(zio_t *zio, */ int row_phys_cols = physical_cols; if (b + cols > reflow_offset_synced >> ashift) - row_phys_cols--; + row_phys_cols-=new_children; else if (use_scratch) row_use_scratch = B_TRUE; @@ -2178,9 +2184,11 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, } if (vd->vdev_rz_expanding) { - *asize *= vd->vdev_children - 1; - *max_asize *= vd->vdev_children - 1; + *asize *= vd->vdev_children - vdrz->vn_vre.vre_children_attached; + *max_asize *= vd->vdev_children - + vdrz->vn_vre.vre_children_attached; + ASSERT3U(vdrz->vn_vre.vre_children_attached, >=, 1); vd->vdev_min_asize = *asize; } else { *asize *= vd->vdev_children; @@ -2212,6 +2220,8 @@ vdev_raidz_close(vdev_t *vd) * which reflects when the BP was relocated, but we can ignore these because * they can't be on RAIDZ (device removal doesn't support RAIDZ). 
*/ +// #define TRACE_LOGICAL_WIDTH +#ifdef TRACE_LOGICAL_WIDTH static uint64_t vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) { @@ -2220,21 +2230,63 @@ vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) }; avl_index_t where; + int place = 0; + uint64_t width; mutex_enter(&vdrz->vd_expand_lock); reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where); if (re != NULL) { + place = 1; width = re->re_logical_width; } else { re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE); - if (re != NULL) + if (re != NULL) { + place = 2; width = re->re_logical_width; - else + } else { + place = 3; + width = vdrz->vd_original_width; + } + } + mutex_exit(&vdrz->vd_expand_lock); + +#ifndef _KERNEL + printf("vdev_raidz_get_logical_width():txg=%lu, place=%d,width=%lu\n", + txg, place, width); +#else + (void)place; +#endif + + return (width); +} +#else +static uint64_t +vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) +{ + reflow_node_t lookup = { + .re_txg = txg, + }; + avl_index_t where; + + uint64_t width; + mutex_enter(&vdrz->vd_expand_lock); + reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where); + if (re != NULL) { + width = re->re_logical_width; + } else { + re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE); + if (re != NULL) { + width = re->re_logical_width; + } else { width = vdrz->vd_original_width; + } } mutex_exit(&vdrz->vd_expand_lock); + return (width); } +#endif + /* * This code converts an asize into the largest psize that can safely be written * to an allocation of that size for this vdev. 
@@ -2571,6 +2623,7 @@ vdev_raidz_io_start(zio_t *zio) zfs_locked_range_t *lr = NULL; uint64_t synced_offset = UINT64_MAX; uint64_t next_offset = UINT64_MAX; + uint64_t new_children = 0; boolean_t use_scratch = B_FALSE; /* * Note: when the expansion is completing, we set @@ -2604,6 +2657,8 @@ vdev_raidz_io_start(zio_t *zio) if (next_offset == UINT64_MAX) { next_offset = synced_offset; } + + new_children = vdrz->vn_vre.vre_children_attached; } if (use_scratch) { zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced=" @@ -2617,8 +2672,8 @@ vdev_raidz_io_start(zio_t *zio) } rm = vdev_raidz_map_alloc_expanded(zio, - tvd->vdev_ashift, vdrz->vd_physical_width, - logical_width, vdrz->vd_nparity, + tvd->vdev_ashift, new_children, + vdrz->vd_physical_width, logical_width, vdrz->vd_nparity, synced_offset, next_offset, use_scratch); rm->rm_lr = lr; } else { @@ -3605,6 +3660,15 @@ vdev_raidz_io_done(zio_t *zio) zio->io_error = vdev_raidz_combrec(zio); if (zio->io_error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { +#ifndef _KERNEL + zbookmark_phys_t *zb = &zio->io_bookmark; + printf("======== CSUM ERROR:type=%u,off=%lu,size=%lu - <%llu, %llu, %lld, %llx>\n", + zio->io_type, zio->io_offset, zio->io_size, + (u_longlong_t)zb->zb_objset, + (u_longlong_t)zb->zb_object, + (u_longlong_t)zb->zb_level, + (u_longlong_t)zb->zb_blkid); +#endif vdev_raidz_io_done_unrecoverable(zio); } } @@ -3793,6 +3857,7 @@ raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES; + re->re_children_attached = vre->vre_children_attached; re->re_logical_width = vdrz->vd_physical_width; mutex_enter(&vdrz->vd_expand_lock); avl_add(&vdrz->vd_expand_txgs, re); @@ -4003,7 +4068,8 @@ raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, zfs_range_tree_t *rt, ASSERT(IS_P2ALIGNED(size, 1 << ashift)); uint64_t blkid = offset >> ashift; - uint_t old_children = vd->vdev_children - 1; + + int old_children = 
vd->vdev_children - vre->vre_children_attached; /* * We can only progress to the point that writes will not overlap @@ -4170,11 +4236,13 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); int ashift = raidvd->vdev_ashift; + uint64_t new_children = vre->vre_children_attached; uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift, uint64_t); uint64_t logical_size = write_size * raidvd->vdev_children; uint64_t read_size = - P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), + P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - + new_children)), 1 << ashift); /* @@ -4235,7 +4303,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) * Read from original location. */ pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); - for (int i = 0; i < raidvd->vdev_children - 1; i++) { + for (int i = 0; i < raidvd->vdev_children - new_children; i++) { ASSERT0(vdev_is_dead(raidvd->vdev_child[i])); zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], 0, abds[i], read_size, ZIO_TYPE_READ, @@ -4261,8 +4329,9 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) */ uint64_t logical_sectors = logical_size >> ashift; for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) { - int oldchild = i % (raidvd->vdev_children - 1); - uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift; + int oldchild = i % (raidvd->vdev_children - new_children); + uint64_t oldoff = + (i / (raidvd->vdev_children - new_children)) << ashift; int newchild = i % raidvd->vdev_children; uint64_t newoff = (i / raidvd->vdev_children) << ashift; @@ -4798,28 +4867,51 @@ vdev_raidz_attach_check(vdev_t *new_child) void vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) { - vdev_t *new_child = arg; - spa_t *spa = new_child->vdev_spa; - vdev_t *raidvd = new_child->vdev_parent; + vdev_t **new_child = arg; + spa_t *spa = new_child[0]->vdev_spa; + vdev_t *raidvd 
= new_child[0]->vdev_parent; vdev_raidz_t *vdrz = raidvd->vdev_tsd; + + uint_t new_children = 0; + while (new_child[new_children] != NULL) + new_children++; + ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); ASSERT3P(raidvd->vdev_top, ==, raidvd); ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width); - ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); + ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + new_children); ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, - new_child); + new_child[new_children - 1]); spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx); - vdrz->vd_physical_width++; + /// XXX: single expansion verification + // ASSERT(new_children == 1); + + for (int i = 0; i < vdrz->vd_physical_width - 1; i++) + for (int j = vdrz->vd_physical_width; j < vdrz->vd_physical_width + new_children - 1; j++) + if (raidvd->vdev_child[i]->vdev_guid == raidvd->vdev_child[j]->vdev_guid) { +#ifndef _KERNEL + printf("==== DUPLICATED DEVICE DETECTED!!!\n"); +#endif + } + + vdrz->vn_vre.vre_children_attached = new_children; + vdrz->vd_physical_width += new_children; VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info); + vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; vdrz->vn_vre.vre_offset = 0; vdrz->vn_vre.vre_failed_offset = UINT64_MAX; spa->spa_raidz_expand = &vdrz->vn_vre; zthr_wakeup(spa->spa_raidz_expand_zthr); +#ifndef _KERNEL + printf("vdev_raidz_attach_sync():pw=%d, new_children=%lu, vreoff=%lx\n", + vdrz->vd_physical_width, vdrz->vn_vre.vre_children_attached, vdrz->vn_vre.vre_offset); +#endif + /* * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get * written to the config. 
@@ -5010,35 +5102,73 @@ vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) if (reflow_in_progress) { spa->spa_raidz_expand = &vdrz->vn_vre; vdrz->vn_vre.vre_state = DSS_SCANNING; + error = nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING, + &vdrz->vn_vre.vre_children_attached); + ASSERT(error == 0); + + /// XXX: single expansion verification + // ASSERT(vdrz->vn_vre.vre_children_attached == 1); } vdrz->vd_original_width = children; uint64_t *txgs; - unsigned int txgs_size = 0; + uint64_t *widths; + unsigned int txgs_size = 0, widths_size = 0; + error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, &txgs, &txgs_size); + error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_VDEVS, + &widths, &widths_size); if (error == 0) { + ASSERT(txgs_size == widths_size); + + uint64_t logical_width = vdrz->vd_physical_width; /// ???: move reflow_in_progress here for (int i = 0; i < txgs_size; i++) { reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); re->re_txg = txgs[txgs_size - i - 1]; - re->re_logical_width = vdrz->vd_physical_width - i; + re->re_children_attached = widths[widths_size - i - 1]; + re->re_logical_width = logical_width; + logical_width -= re->re_children_attached; + ASSERT3U(logical_width, <, 255); + - if (reflow_in_progress) - re->re_logical_width--; + /// XXX: single expansion verification + // ASSERT(re->re_children_attached == 1); + + if (reflow_in_progress) { + re->re_logical_width-=re->re_children_attached; /// XXXX:HERE!!! 
+ ASSERT3U(re->re_logical_width, <, 255); + ASSERT3U(re->re_logical_width, >=, 4); + } avl_add(&vdrz->vd_expand_txgs, re); } - vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size; + // XXX: actual only for single expansion + // vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size; + for (int i = 0; i < txgs_size; i++) { + vdrz->vd_original_width-=widths[i]; + } } if (reflow_in_progress) { - vdrz->vd_original_width--; + vdrz->vd_original_width-=vdrz->vn_vre.vre_children_attached; zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions", children, txgs_size); } + ASSERT3U(vdrz->vd_original_width, <, 255); + ASSERT3U(vdrz->vd_original_width, >=, 4); + *tsd = vdrz; +#ifndef _KERNEL + printf("vdev_raidz_init():reflow_in_progress=%d,children=%u,new_children=%lu,ow=%u,pw=%u\n", + reflow_in_progress, children, vdrz->vn_vre.vre_children_attached, + vdrz->vd_original_width, vdrz->vd_physical_width); + + for (int i = 0; i < txgs_size; i++) + printf(" %lu:%lu\n", txgs[i], widths[i]); +#endif return (0); } @@ -5067,6 +5197,9 @@ vdev_raidz_fini(vdev_t *vd) static void vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) { + uint64_t *txgs, *widths; + uint64_t count = 0; + ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops); vdev_raidz_t *vdrz = vd->vdev_tsd; @@ -5088,26 +5221,48 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); if (vdrz->vn_vre.vre_state == DSS_SCANNING) { - fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING, + vdrz->vn_vre.vre_children_attached); + + /// XXX: single expansion verification + // ASSERT(vdrz->vn_vre.vre_children_attached == 1); } mutex_enter(&vdrz->vd_expand_lock); if (!avl_is_empty(&vdrz->vd_expand_txgs)) { - uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); - uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, + /*uint64_t*/ count = avl_numnodes(&vdrz->vd_expand_txgs); + /*uint64_t* */txgs = 
kmem_alloc(sizeof (uint64_t) * count, + KM_SLEEP); + /*uint64_t* */ widths = kmem_alloc(sizeof (uint64_t) * count, KM_SLEEP); uint64_t i = 0; for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs); - re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) { - txgs[i++] = re->re_txg; + re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re), i++) { + txgs[i] = re->re_txg; + widths[i] = re->re_children_attached; } fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, txgs, count); - + fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_VDEVS, + widths, count); + +#ifndef _KERNEL + printf("vdev_raidz_config_generate():dss_scanning=%d, children_attached=%lu, outstand=%lu, off=%lu\n", + vdrz->vn_vre.vre_state, + vdrz->vn_vre.vre_children_attached, + vdrz->vn_vre.vre_outstanding_bytes, + vdrz->vn_vre.vre_offset); + + for (int i = 0; i < count; i++) { + printf(" %lu:%lu\n", txgs[i], widths[i]); + } +#endif kmem_free(txgs, sizeof (uint64_t) * count); + kmem_free(widths, sizeof (uint64_t) * count); } + mutex_exit(&vdrz->vd_expand_lock); }