diff --git a/.golangci.yml b/.golangci.yml index 87a605cda55..5cbf22a13f4 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -236,3 +236,4 @@ formatters: paths: - examples$ - ^go/vt/proto/ + - ^test/antithesis/ diff --git a/CLAUDE.md b/CLAUDE.md index 355b21fe2fa..2a0e8bf24b6 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -197,6 +197,11 @@ return user.NeedsMigration() && migrate(user) || user - **Copyright header** - New Go files must include the project copyright header with the current year - **Always run `gofumpt -w`** on changed Go files before committing - this is mandatory - **Always run `goimports -local "vitess.io/vitess" -w`** on changed Go files before committing +- **Always run `golangci-lint run --path-mode=abs --timeout 10m`** (from the `go/` directory, scoped to the changed package(s)) before reporting work complete. CI runs it and will surface modernize/style issues that `go vet`, `gofumpt`, and `goimports` do not — for example: + - `waitgroup`: prefer `WaitGroup.Go(func() { ... })` over `wg.Add(1); go func() { defer wg.Done(); ... 
}()` + - `rangeint`: prefer `for range N` over `for i := 0; i < N; i++` when the index is unused + - `bloop`: prefer `b.Loop()` over `for i := 0; i < b.N; i++` in benchmarks + - `unusedparams`, `unusedwrite`, `unusedfunc`: clean these in code you touch - **Use format verbs precisely** - Use `%s` for strings and `%d` for integers, not `%v` for everything - **Structured logging** - New log messages should use structured logging with `slog`-style fields (e.g., `log.Warn("message", slog.Any("error", err))`) rather than printf-style logging with format strings - **Reuse existing helpers** - Before writing new parsing/validation code, check for existing utilities (e.g., `sqlerror` package for MySQL error codes, `mysqlctl.ParseVersionString()`, `strings.Split()`, `topoproto.TabletAliasString()` for formatting tablet aliases) diff --git a/changelog/25.0/25.0.0/summary.md b/changelog/25.0/25.0.0/summary.md index 4f4d0ca89de..46cb62ecef1 100644 --- a/changelog/25.0/25.0.0/summary.md +++ b/changelog/25.0/25.0.0/summary.md @@ -6,6 +6,7 @@ - **[Major Changes](#major-changes)** - **[New Support](#new-support)** + - [Experimental parallel VReplication applier](#vreplication-parallel-applier) - **[Breaking Changes](#breaking-changes)** - [`--watch-replication-stream` flag removed](#vttablet-watch-replication-stream-removed) - [Snapshot Topology feature removed](#vtorc-snapshot-topology-removed) @@ -15,6 +16,8 @@ - **[Minor Changes](#minor-changes)** - **[VReplication](#minor-changes-vreplication)** - [Default data protection for `_reverse` workflow cancel/complete](#vreplication-reverse-workflow-data-protection) + - [Unknown VStream event types are now hard errors in the applier](#vreplication-unknown-event-error) + - [Workflow config overrides sent to source tablets are now allowlisted](#vreplication-source-overrides-allowlist) - **[VTGate](#minor-changes-vtgate)** - [New controls for cross-keyspace reads](#vtgate-cross-keyspace-reads) - **[VTTablet](#minor-changes-vttablet)** @@ 
-26,6 +29,15 @@ ### New Support +#### Experimental parallel VReplication applier + +> [!WARNING] +> This feature is experimental. + +VReplication can now apply binlog events using multiple concurrent MySQL connections instead of a single serial connection. Set `--vreplication-parallel-replication-workers=N` (default `1` = serial, maximum `64`) on `vttablet`, or the `vreplication-parallel-replication-workers` per-workflow config override, to dispatch non-conflicting transactions to `N` worker goroutines during the replication (running) phase. Conflicts are detected with target-side writeset hashing (primary key, unique key, and foreign key values — similar to MySQL's own `WRITESET` dependency tracking), so it works regardless of the source's `binlog_transaction_dependency_tracking` setting. Commits remain strictly ordered, so the workflow position, lag metrics, and `WaitForPos` semantics are unchanged. Transactions the conflict detector cannot reason about (DDL, statement-based events, partial row images, prefix/expression unique indexes, and similar) fall back to serial application. + +Note that each worker holds two MySQL connections, so a workflow with `N` workers uses `2N+2` target-side connections. + ### Breaking Changes #### `--watch-replication-stream` flag removed @@ -84,6 +96,14 @@ When calling `cancel` or `complete` on an auto-generated `_reverse` workflow wit The `--keep-data` flag help text has been updated to note this default explicitly. This change applies to MoveTables, Reshard, and other VReplication workflow types that use the shared cancel/complete paths. +#### Unknown VStream event types are now hard errors in the applier + +The VReplication applier previously ignored VStream event types it did not recognize. It now fails the workflow with an error for unknown event types (and unknown `on-ddl` actions), failing closed instead of silently skipping events. 
All event types produced by supported Vitess versions are handled; this only affects streams from sources emitting event types unknown to the target's version. + +#### Workflow config overrides sent to source tablets are now allowlisted + +When a workflow has per-workflow config overrides, the target now sends only the source-relevant subset (packet size, timeouts, experimental flags, and similar) to the source tablet's VStreamer instead of the full override map. This keeps newer target-only override keys from failing workflows whose source tablets run an older version that rejects unknown keys. + See [#19906](https://github.com/vitessio/vitess/pull/19906) for details. ### VTGate diff --git a/examples/benchmark/bench_compare.sh b/examples/benchmark/bench_compare.sh new file mode 100755 index 00000000000..2b0a780a6b2 --- /dev/null +++ b/examples/benchmark/bench_compare.sh @@ -0,0 +1,134 @@ +#!/bin/bash + +# A/B comparison: serial (workers=1) vs parallel (workers=4) VReplication applier +# with mixed write workload (INSERT/UPDATE/DELETE/bulk operations). + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" || exit 1 + +ROW_COUNT=${ROW_COUNT:-200000} +SEED_ROWS=${SEED_ROWS:-10000} +RUN_ORDER=${RUN_ORDER:-random} +export ROW_COUNT SEED_ROWS + +echo "============================================" +echo " VReplication Parallel Applier Benchmark" +echo " ROW_COUNT=$ROW_COUNT SEED_ROWS=$SEED_ROWS" +echo "============================================" +echo "" + +run_bench() { + local workers=$1 + local label=$2 + + echo ">>> Run: $label (PARALLEL_WORKERS=$workers) <<<" + echo "" + + # Teardown any previous state + (cd "$SCRIPT_DIR/../local" && ./501_teardown.sh) 2>/dev/null + + # Setup cluster with specified worker count + PARALLEL_WORKERS=$workers ./bench_setup.sh || { echo "FAILED: setup for $label"; return 1; } + + # Run benchmark. Use pipefail so a bench_run.sh validation failure is not + # masked by tee's zero exit status. 
+ ( + set -o pipefail + ./bench_run.sh 2>&1 | tee "/tmp/bench_${workers}_workers.log" + ) || { echo "FAILED: bench_run for $label (validation or drain failure)"; return 1; } + + echo "" + echo ">>> $label complete <<<" + echo "" +} + +case "$RUN_ORDER" in + serial-first) + first_workers=1 + first_label="Serial (1 worker)" + second_workers=4 + second_label="Parallel (4 workers)" + ;; + parallel-first) + first_workers=4 + first_label="Parallel (4 workers)" + second_workers=1 + second_label="Serial (1 worker)" + ;; + random) + if (( RANDOM % 2 == 0 )); then + first_workers=1 + first_label="Serial (1 worker)" + second_workers=4 + second_label="Parallel (4 workers)" + RUN_ORDER=serial-first + else + first_workers=4 + first_label="Parallel (4 workers)" + second_workers=1 + second_label="Serial (1 worker)" + RUN_ORDER=parallel-first + fi + ;; + *) + echo "Invalid RUN_ORDER: $RUN_ORDER" + exit 1 + ;; +esac + +echo "Run order: $RUN_ORDER" + +# Run 1 +run_bench "$first_workers" "$first_label" || exit 1 + +# Teardown between runs +echo "Tearing down between runs..." +(cd "$SCRIPT_DIR/../local" && ./501_teardown.sh) 2>/dev/null +sleep 3 + +# Run 2 +run_bench "$second_workers" "$second_label" || exit 1 + +# Teardown after +echo "Tearing down after benchmark..." 
+(cd "$SCRIPT_DIR/../local" && ./501_teardown.sh) 2>/dev/null + +# Compare results +echo "" +echo "============================================" +echo " COMPARISON" +echo "============================================" + +for workers in 1 4; do + logfile="/tmp/bench_${workers}_workers.log" + if [[ -f "$logfile" ]]; then + echo "" + echo "--- Workers=$workers ---" + grep -E "(Drain time|Throughput|Backlog ops|Seed rows)" "$logfile" + fi +done + +# Calculate speedup if both logs exist +serial_log="/tmp/bench_1_workers.log" +parallel_log="/tmp/bench_4_workers.log" +if [[ -f "$serial_log" ]] && [[ -f "$parallel_log" ]]; then + serial_time=$(grep "Drain time" "$serial_log" | grep -o '[0-9]*') + parallel_time=$(grep "Drain time" "$parallel_log" | grep -o '[0-9]*') + if [[ -n "$serial_time" ]] && [[ -n "$parallel_time" ]] && [[ "$parallel_time" -gt 0 ]]; then + # Integer math: multiply by 100 for 2 decimal places + speedup_x100=$((serial_time * 100 / parallel_time)) + speedup_whole=$((speedup_x100 / 100)) + speedup_frac=$((speedup_x100 % 100)) + printf -v speedup_str '%d.%02d' "$speedup_whole" "$speedup_frac" + echo "" + echo "--- Speedup ---" + echo " Serial: ${serial_time}s" + echo " Parallel: ${parallel_time}s" + echo " Speedup: ${speedup_str}x" + fi +fi + +echo "" +echo "============================================" +echo "Full logs: /tmp/bench_1_workers.log and /tmp/bench_4_workers.log" +echo "============================================" diff --git a/examples/benchmark/bench_generate_load.sh b/examples/benchmark/bench_generate_load.sh new file mode 100755 index 00000000000..e136eaa2e24 --- /dev/null +++ b/examples/benchmark/bench_generate_load.sh @@ -0,0 +1,231 @@ +#!/bin/bash + +# Generate workload for VReplication benchmark. 
Supports two modes: +# LOAD_TYPE=seed — INSERT-only (builds base data for UPDATE/DELETE targets) +# LOAD_TYPE=mixed — Mixed INSERT/UPDATE/DELETE/bulk operations +# +# The random generator uses a FIXED SEED so output is deterministic and +# benchmark runs are repeatable for proper A/B comparisons. +# +# Environment variables: +# ROW_COUNT — total operations to generate (default 200000) +# LOAD_TYPE — "seed" or "mixed" (default "mixed") +# SEED_ROWS — rows per table available for UPDATE/DELETE (used in mixed mode, default 10000) + +source ../common/env.sh + +TOTAL_OPS=${ROW_COUNT:-200000} +OPS_PER_TABLE=$((TOTAL_OPS / 4)) +LOAD_TYPE=${LOAD_TYPE:-mixed} +SEED_ROWS=${SEED_ROWS:-10000} + +echo "=== Generating Load: $TOTAL_OPS total ops ($OPS_PER_TABLE per table, type=$LOAD_TYPE) ===" + +TMPDIR="$VTDATAROOT/tmp/bench_load" +mkdir -p "$TMPDIR" + +python3 -c " +import random +import string +import os + +ops_per_table = $OPS_PER_TABLE +load_type = '$LOAD_TYPE' +seed_rows = $SEED_ROWS +tmpdir = '$TMPDIR' + +# Fixed seed for deterministic, repeatable output. +random.seed(42) + +# Pre-compute a pool of random strings to avoid per-row generation cost. 
+_pool_size = 10000 +_str_pool = {} +def _init_pool(n): + if n not in _str_pool: + chars = string.ascii_letters + string.digits + _str_pool[n] = [''.join(random.choices(chars, k=n)) for _ in range(_pool_size)] +def rand_str(n): + pool = _str_pool.get(n) + if pool is None: + _init_pool(n) + pool = _str_pool[n] + return pool[random.randint(0, _pool_size - 1)] + +def gen_insert_orders(f): + name = rand_str(60) + sku = rand_str(40) + qty = random.randint(1, 100) + price = random.randint(100, 100000) + status = random.choice(['pending', 'shipped', 'delivered', 'cancelled', 'returned', 'processing']) + region = random.choice(['us-east-1', 'us-west-2', 'eu-west-1', 'eu-central-1', 'ap-south-1', 'ap-east-1']) + notes = rand_str(400) + f.write(f\"INSERT INTO bench_orders (customer_name, product_sku, quantity, total_price, status, region, notes) VALUES ('{name}', '{sku}', {qty}, {price}, '{status}', '{region}', '{notes}');\n\") + +def gen_insert_events(f): + etype = random.choice(['click', 'purchase', 'view', 'signup', 'logout', 'error', 'timeout', 'retry']) + source = rand_str(60) + payload = rand_str(600) + severity = random.randint(1, 10) + created = random.randint(1700000000, 1800000000) + category = rand_str(40) + f.write(f\"INSERT INTO bench_events (event_type, source, payload, severity, created_at, category) VALUES ('{etype}', '{source}', '{payload}', {severity}, {created}, '{category}');\n\") + +def gen_insert_accounts(f): + username = rand_str(40) + email = rand_str(30) + '@' + rand_str(20) + '.com' + balance = random.randint(0, 1000000) + region = random.choice(['us-east-1', 'us-west-2', 'eu-west-1', 'eu-central-1', 'ap-south-1', 'ap-east-1']) + bio = rand_str(400) + tier = random.choice(['free', 'basic', 'pro', 'enterprise', 'unlimited']) + f.write(f\"INSERT INTO bench_accounts (username, email, balance, region, bio, tier) VALUES ('{username}', '{email}', {balance}, '{region}', '{bio}', '{tier}');\n\") + +def gen_insert_logs(f): + level = random.choice(['DEBUG', 
'INFO', 'WARN', 'ERROR', 'FATAL']) + message = rand_str(400) + component = random.choice(['api', 'worker', 'scheduler', 'gateway', 'cache', 'auth', 'billing', 'storage']) + error_code = random.randint(0, 9999) + trace_id = rand_str(32) + span_id = rand_str(16) + f.write(f\"INSERT INTO bench_logs (level, message, component, error_code, trace_id, span_id) VALUES ('{level}', '{message}', '{component}', {error_code}, '{trace_id}', '{span_id}');\n\") + +insert_fns = { + 'orders': gen_insert_orders, + 'events': gen_insert_events, + 'accounts': gen_insert_accounts, + 'logs': gen_insert_logs, +} + +# UPDATE generators — modify multiple indexed columns to create significant MySQL work +def gen_update_orders(f, pk): + name = rand_str(60) + status = random.choice(['pending', 'shipped', 'delivered', 'cancelled', 'returned', 'processing']) + region = random.choice(['us-east-1', 'us-west-2', 'eu-west-1', 'eu-central-1', 'ap-south-1', 'ap-east-1']) + notes = rand_str(400) + f.write(f\"UPDATE bench_orders SET customer_name='{name}', status='{status}', region='{region}', notes='{notes}' WHERE id={pk};\n\") + +def gen_update_events(f, pk): + etype = random.choice(['click', 'purchase', 'view', 'signup', 'logout', 'error', 'timeout', 'retry']) + source = rand_str(60) + payload = rand_str(600) + category = rand_str(40) + f.write(f\"UPDATE bench_events SET event_type='{etype}', source='{source}', payload='{payload}', category='{category}' WHERE id={pk};\n\") + +def gen_update_accounts(f, pk): + username = rand_str(40) + email = rand_str(30) + '@' + rand_str(20) + '.com' + balance = random.randint(0, 1000000) + bio = rand_str(400) + tier = random.choice(['free', 'basic', 'pro', 'enterprise', 'unlimited']) + f.write(f\"UPDATE bench_accounts SET username='{username}', email='{email}', balance={balance}, bio='{bio}', tier='{tier}' WHERE id={pk};\n\") + +def gen_update_logs(f, pk): + level = random.choice(['DEBUG', 'INFO', 'WARN', 'ERROR', 'FATAL']) + message = rand_str(400) + component = 
random.choice(['api', 'worker', 'scheduler', 'gateway', 'cache', 'auth', 'billing', 'storage']) + error_code = random.randint(0, 9999) + f.write(f\"UPDATE bench_logs SET level='{level}', message='{message}', component='{component}', error_code={error_code} WHERE id={pk};\n\") + +update_fns = { + 'orders': gen_update_orders, + 'events': gen_update_events, + 'accounts': gen_update_accounts, + 'logs': gen_update_logs, +} + +# Bulk UPDATE generators — update N rows in one statement +def gen_bulk_update(table, f, pks): + pk_list = ','.join(str(p) for p in pks) + if table == 'orders': + status = random.choice(['pending', 'shipped', 'delivered', 'cancelled', 'returned', 'processing']) + region = random.choice(['us-east-1', 'us-west-2', 'eu-west-1', 'eu-central-1', 'ap-south-1', 'ap-east-1']) + notes = rand_str(400) + f.write(f\"UPDATE bench_orders SET status='{status}', region='{region}', notes='{notes}' WHERE id IN ({pk_list});\n\") + elif table == 'events': + etype = random.choice(['click', 'purchase', 'view', 'signup', 'logout', 'error', 'timeout', 'retry']) + payload = rand_str(600) + f.write(f\"UPDATE bench_events SET event_type='{etype}', payload='{payload}' WHERE id IN ({pk_list});\n\") + elif table == 'accounts': + balance = random.randint(0, 1000000) + tier = random.choice(['free', 'basic', 'pro', 'enterprise', 'unlimited']) + f.write(f\"UPDATE bench_accounts SET balance={balance}, tier='{tier}' WHERE id IN ({pk_list});\n\") + elif table == 'logs': + level = random.choice(['DEBUG', 'INFO', 'WARN', 'ERROR', 'FATAL']) + message = rand_str(400) + f.write(f\"UPDATE bench_logs SET level='{level}', message='{message}' WHERE id IN ({pk_list});\n\") + +def gen_bulk_delete(table, f, pks): + pk_list = ','.join(str(p) for p in pks) + f.write(f\"DELETE FROM bench_{table} WHERE id IN ({pk_list});\n\") + +tables = ['orders', 'events', 'accounts', 'logs'] + +if load_type == 'seed': + # Seed mode: INSERT-only, one file per table + for table in tables: + fn = insert_fns[table] + 
with open(os.path.join(tmpdir, f'{table}.sql'), 'w') as f: + for _ in range(ops_per_table): + fn(f) + print('Seed SQL files generated.') +else: + # Mixed mode: diverse write operations + # Operation mix (as fractions of total per table): + # 50% single-row INSERT — light txns, good for serial batching + # 20% single-row UPDATE — medium txns, index maintenance + # 5% single-row DELETE — light txns + # 15% bulk UPDATE (5-15 rows) — heavy txns, lots of row events + # 10% bulk DELETE (3-8 rows) — medium-heavy txns + for table in tables: + insert_fn = insert_fns[table] + update_fn = update_fns[table] + with open(os.path.join(tmpdir, f'{table}.sql'), 'w') as f: + for i in range(ops_per_table): + r = random.random() + if r < 0.50: + # Single-row INSERT + insert_fn(f) + elif r < 0.70: + # Single-row UPDATE on existing seed row + pk = random.randint(1, seed_rows) + update_fn(f, pk) + elif r < 0.75: + # Single-row DELETE + pk = random.randint(1, seed_rows) + f.write(f\"DELETE FROM bench_{table} WHERE id={pk};\n\") + elif r < 0.90: + # Bulk UPDATE (5-15 rows) + n = random.randint(5, 15) + pks = [random.randint(1, seed_rows) for _ in range(n)] + gen_bulk_update(table, f, pks) + else: + # Bulk DELETE (3-8 rows) + n = random.randint(3, 8) + pks = [random.randint(1, seed_rows) for _ in range(n)] + gen_bulk_delete(table, f, pks) + print('Mixed SQL files generated.') +" || fail "Failed to generate SQL files" + +echo "Loading data into commerce keyspace via vtgate (4 concurrent streams)..." 
+ +load_start=$(date +%s) + +# Pipe all 4 SQL files concurrently through vtgate +load_pids=() +for table in orders events accounts logs; do + command mysql --no-defaults -h 127.0.0.1 -P 15306 --binary-as-hex=false commerce < "$TMPDIR/${table}.sql" & + load_pids+=("$!") +done + +for pid in "${load_pids[@]}"; do + wait "$pid" || fail "Failed to load one or more benchmark SQL streams" +done + +load_end=$(date +%s) +load_elapsed=$((load_end - load_start)) + +echo "=== Load Generation Complete ===" +echo "Total operations: $TOTAL_OPS" +echo "Time: ${load_elapsed}s" +if [ "$load_elapsed" -gt 0 ]; then + echo "Rate: $((TOTAL_OPS / load_elapsed)) ops/sec" +fi diff --git a/examples/benchmark/bench_run.sh b/examples/benchmark/bench_run.sh new file mode 100755 index 00000000000..3d9281ff9b6 --- /dev/null +++ b/examples/benchmark/bench_run.sh @@ -0,0 +1,412 @@ +#!/bin/bash + +# Run the VReplication parallel applier benchmark with mixed write workload. +# Prerequisites: bench_setup.sh must have been run first. +# +# Flow: +# 1. Seed source tables with initial data (for UPDATE/DELETE targets) +# 2. Create MoveTables workflow, copy seed data, stop +# 3. Generate mixed backlog (INSERT/UPDATE/DELETE/bulk) while stopped +# 4. 
Start workflow, time drain until lag reaches 0 +# +# Environment variables: +# ROW_COUNT — total backlog operations (default 200000) +# SEED_ROWS — seed rows per table for UPDATE/DELETE targets (default 10000) + +source ../common/env.sh + +TOTAL_OPS=${ROW_COUNT:-200000} +SEED_ROWS=${SEED_ROWS:-10000} +TOTAL_SEED=$((SEED_ROWS * 4)) +BENCH_TABLES="bench_orders,bench_events,bench_accounts,bench_logs" + +source_mysql() { + command mysql --no-defaults -h 127.0.0.1 -P 15306 --binary-as-hex=false "$@" +} + +# Find the primary tablet for a keyspace and return its MySQL socket path +detect_tablet_socket() { + local ks=$1 + local primary_tablet + primary_tablet=$(vtctldclient GetTablets --keyspace "$ks" --shard 0 2>/dev/null | grep -w primary | awk '{print $1}') + if [[ -z "$primary_tablet" ]]; then + fail "Could not find primary tablet for $ks keyspace" + fi + local uid + uid=$(echo "$primary_tablet" | sed 's/.*-0*//') + local sock="$VTDATAROOT/vt_$(printf '%010d' "$uid")/mysql.sock" + echo "$sock" +} + +detect_primaries() { + SOURCE_SOCKET=$(detect_tablet_socket commerce) + TARGET_SOCKET=$(detect_tablet_socket customer) + echo "Source socket: $SOURCE_SOCKET" + echo "Target socket: $TARGET_SOCKET" +} + +source_direct_mysql() { + command mysql --no-defaults -u vt_dba -S "$SOURCE_SOCKET" "$@" +} + +target_mysql() { + command mysql --no-defaults -u vt_dba -S "$TARGET_SOCKET" "$@" +} + +# Extract the max GTID transaction sequence number from a GTID set string. +# Handles formats like "uuid:1-N" and "MySQL56/uuid:1-N". 
+max_gtid_seq() { + echo "$1" | tr ',' '\n' | grep -oE ':[0-9]+-[0-9]+' | grep -oE '[0-9]+$' | sort -n | tail -1 +} + +# Get the target's current replication position from _vt.vreplication +target_pos() { + target_mysql -N -e \ + "SELECT pos FROM _vt.vreplication WHERE workflow='bench_move'" 2>/dev/null +} + +# Get replication lag in seconds (for display only, not reliable for drain detection) +replication_lag() { + target_mysql -N -e \ + "SELECT UNIX_TIMESTAMP() - FLOOR(time_updated) FROM _vt.vreplication WHERE workflow='bench_move'" 2>/dev/null +} + +echo "=== Bench Run (ROW_COUNT=$TOTAL_OPS, SEED_ROWS=$SEED_ROWS) ===" + +detect_primaries + +cleanup_workflow() { + vtctldclient MoveTables --workflow bench_move --target-keyspace customer cancel 2>/dev/null +} + +timeout_failed=0 + +add_target_indexes() { + target_mysql vt_customer -e " + ALTER TABLE bench_orders + ADD INDEX idx_name_status (customer_name, status), + ADD INDEX idx_name_region_qty (customer_name, region, quantity), + ADD INDEX idx_sku_status_region (product_sku, status, region), + ADD INDEX idx_region_status_price (region, status, total_price), + ADD INDEX idx_notes_prefix (notes(255)), + ADD INDEX idx_qty_price (quantity, total_price), + ADD INDEX idx_status_qty_price (status, quantity, total_price), + ADD INDEX idx_sku_qty (product_sku, quantity), + ADD INDEX idx_name_price (customer_name, total_price), + ADD INDEX idx_region_qty_price (region, quantity, total_price), + ADD INDEX idx_status_name (status, customer_name), + ADD INDEX idx_sku_region (product_sku, region), + ADD INDEX idx_status_sku_price (status, product_sku, total_price), + ADD INDEX idx_name_qty_status (customer_name, quantity, status), + ADD INDEX idx_region_name (region, customer_name), + ADD INDEX idx_sku_name_region (product_sku, customer_name, region), + ADD INDEX idx_qty_status_region (quantity, status, region), + ADD INDEX idx_price_status (total_price, status), + ADD INDEX idx_price_region_name (total_price, region, 
customer_name), + ADD INDEX idx_notes_prefix2 (notes(128)); +" || return 1 + + target_mysql vt_customer -e " + ALTER TABLE bench_events + ADD INDEX idx_source_type (source, event_type), + ADD INDEX idx_type_category (event_type, category), + ADD INDEX idx_category_severity (category, severity), + ADD INDEX idx_created_severity (created_at, severity), + ADD INDEX idx_source_category (source, category), + ADD INDEX idx_payload_prefix (payload(255)), + ADD INDEX idx_type_created_severity (event_type, created_at, severity), + ADD INDEX idx_source_severity (source, severity), + ADD INDEX idx_category_created (category, created_at), + ADD INDEX idx_type_source_severity (event_type, source, severity), + ADD INDEX idx_severity_category (severity, category), + ADD INDEX idx_created_type (created_at, event_type), + ADD INDEX idx_source_created_type (source, created_at, event_type), + ADD INDEX idx_category_type_created (category, event_type, created_at), + ADD INDEX idx_severity_source (severity, source), + ADD INDEX idx_type_severity_created (event_type, severity, created_at), + ADD INDEX idx_created_category_severity (created_at, category, severity), + ADD INDEX idx_source_type_category (source, event_type, category), + ADD INDEX idx_severity_type_source (severity, event_type, source), + ADD INDEX idx_payload_prefix2 (payload(128)); +" || return 1 + + target_mysql vt_customer -e " + ALTER TABLE bench_accounts + ADD INDEX idx_username_tier (username, tier), + ADD INDEX idx_email_region (email, region), + ADD INDEX idx_tier_balance (tier, balance), + ADD INDEX idx_region_tier (region, tier), + ADD INDEX idx_bio_prefix (bio(255)), + ADD INDEX idx_tier_region_balance (tier, region, balance), + ADD INDEX idx_username_balance (username, balance), + ADD INDEX idx_email_tier (email, tier), + ADD INDEX idx_username_region (username, region), + ADD INDEX idx_balance_tier (balance, tier), + ADD INDEX idx_region_balance_tier (region, balance, tier), + ADD INDEX idx_tier_username 
(tier, username), + ADD INDEX idx_email_balance (email, balance), + ADD INDEX idx_region_username (region, username), + ADD INDEX idx_username_tier_balance (username, tier, balance), + ADD INDEX idx_tier_email (tier, email), + ADD INDEX idx_balance_region (balance, region), + ADD INDEX idx_email_tier_region (email, tier, region), + ADD INDEX idx_region_email_balance (region, email, balance), + ADD INDEX idx_bio_prefix2 (bio(128)); +" || return 1 + + target_mysql vt_customer -e " + ALTER TABLE bench_logs + ADD INDEX idx_component_level (component, level), + ADD INDEX idx_trace_span (trace_id, span_id), + ADD INDEX idx_level_error (level, error_code), + ADD INDEX idx_component_error (component, error_code), + ADD INDEX idx_message_prefix (message(255)), + ADD INDEX idx_span_level (span_id, level), + ADD INDEX idx_error_component_level (error_code, component, level), + ADD INDEX idx_level_component_error (level, component, error_code), + ADD INDEX idx_trace_level (trace_id, level), + ADD INDEX idx_component_trace (component, trace_id), + ADD INDEX idx_error_level (error_code, level), + ADD INDEX idx_span_component (span_id, component), + ADD INDEX idx_level_trace (level, trace_id), + ADD INDEX idx_trace_component_level (trace_id, component, level), + ADD INDEX idx_error_span (error_code, span_id), + ADD INDEX idx_component_span_level (component, span_id, level), + ADD INDEX idx_level_span_error (level, span_id, error_code), + ADD INDEX idx_span_error_component (span_id, error_code, component), + ADD INDEX idx_trace_error (trace_id, error_code), + ADD INDEX idx_message_prefix2 (message(128)); +" || return 1 +} + +# Step 1: Seed source tables with initial data +# Retry the seed step: vtgate's connection pool to the primary tablet can be +# briefly unavailable right after cluster startup, surfacing as +# "connection pool is closed" when the seed script runs too soon. +echo "" +echo "Seeding source tables ($SEED_ROWS rows per table = $TOTAL_SEED total)..." 
+seed_attempts=0 +seed_max_attempts=3 +until LOAD_TYPE=seed ROW_COUNT=$TOTAL_SEED ./bench_generate_load.sh; do + seed_attempts=$((seed_attempts+1)) + if [[ $seed_attempts -ge $seed_max_attempts ]]; then + fail "Failed to seed data after $seed_max_attempts attempts" + fi + echo "Seed failed (attempt $seed_attempts); retrying in 10s..." + sleep 10 +done + +# Step 2: Create MoveTables workflow (auto-start, copies seed data) +echo "" +echo "Creating MoveTables workflow..." +vtctldclient MoveTables --workflow bench_move --target-keyspace customer create \ + --source-keyspace commerce \ + --tables "$BENCH_TABLES" || fail "Failed to create MoveTables workflow" + +# Step 3: Wait for copy phase to complete (state transitions to Running) +echo "Waiting for copy phase to complete..." +max_wait=600 +for i in $(seq 1 $max_wait); do + state=$(target_mysql -N -e \ + "SELECT state FROM _vt.vreplication WHERE workflow='bench_move'" 2>/dev/null | head -1) + if [[ "$state" == "Running" ]]; then + echo "Copy phase complete, workflow is running." + break + fi + if [[ $((i % 10)) -eq 0 ]]; then + echo " ...still copying (state=$state, ${i}s elapsed)" + fi + sleep 1 +done + +if [[ "$state" != "Running" ]]; then + fail "Timed out waiting for copy phase to complete (state=$state)" +fi + +# Step 4: Stop the workflow so we can build a backlog +echo "Stopping workflow..." +vtctldclient MoveTables --workflow bench_move --target-keyspace customer stop || fail "Failed to stop workflow" + +for i in $(seq 1 30); do + state=$(target_mysql -N -e \ + "SELECT state FROM _vt.vreplication WHERE workflow='bench_move'" 2>/dev/null | head -1) + if [[ "$state" == "Stopped" ]]; then + break + fi + sleep 1 +done + +if [[ "$state" != "Stopped" ]]; then + fail "Workflow did not stop (state=$state)" +fi +echo "Workflow stopped." + +# Step 4b: Add extra indexes on the TARGET to increase per-statement MySQL cost. +# The source keeps lightweight indexes so the vstreamer produces events fast. 
+# Heavy target indexes make the applier the bottleneck, allowing parallel workers +# to demonstrate their advantage by overlapping expensive index maintenance. +# With ~25 indexes per table and an 8MB buffer pool, each INSERT/UPDATE/DELETE +# requires many random page reads that can be overlapped by parallel workers. +echo "" +echo "Adding extra indexes on target to increase applier workload..." +add_target_indexes || { + echo "ERROR: failed to add target indexes" + cleanup_workflow + exit 1 +} + +echo "Target indexes added (~25 per table)." + +# Step 5: Generate mixed backlog on source +echo "" +echo "Generating mixed backlog on source ($TOTAL_OPS operations)..." +LOAD_TYPE=mixed ROW_COUNT=$TOTAL_OPS SEED_ROWS=$SEED_ROWS ./bench_generate_load.sh || fail "Failed to generate backlog" + +# Step 5b: Capture source GTID position after backlog generation. +# This is the definitive marker — when the target's pos reaches this point, +# all backlog events have been applied. +source_gtid=$(source_direct_mysql -N -e "SELECT @@gtid_executed" 2>/dev/null | tr -d '[:space:]') +source_seq=$(max_gtid_seq "$source_gtid") +echo "Source GTID seq after backlog: $source_seq" + +if [[ -z "$source_seq" ]] || [[ "$source_seq" -eq 0 ]]; then + fail "Could not capture source GTID position" +fi + +# Step 6: Record start time and start the workflow +echo "" +echo "Starting workflow to drain backlog..." +start_time=$(date +%s) + +vtctldclient MoveTables --workflow bench_move --target-keyspace customer start || fail "Failed to start workflow" + +# Step 7: Poll until target GTID position catches up to source. 
+# We use GTID comparison instead of time_updated lag because: +# - time_updated is refreshed by the controller loop regardless of applier progress +# - With parallel workers, the controller doesn't block on the applier, so +# time_updated stays near-current even while the backlog is being processed +# - GTID position accurately reflects committed progress +echo "Waiting for target to catch up (source_seq=$source_seq)..." +last_report=0 +while true; do + now=$(date +%s) + elapsed=$((now - start_time)) + + # Get target's current replicated position + tpos=$(target_pos) + target_seq=$(max_gtid_seq "$tpos") + + # Check if target has caught up to source + if [[ -n "$target_seq" ]] && [[ "$target_seq" =~ ^[0-9]+$ ]] && [[ "$target_seq" -ge "$source_seq" ]]; then + echo "Target caught up! (target_seq=$target_seq >= source_seq=$source_seq, ${elapsed}s elapsed)" + break + fi + + if [[ $((elapsed - last_report)) -ge 5 ]]; then + lag=$(replication_lag) + pct="" + if [[ -n "$target_seq" ]] && [[ "$target_seq" =~ ^[0-9]+$ ]] && [[ "$source_seq" -gt 0 ]]; then + pct=" $(( target_seq * 100 / source_seq ))%" + fi + echo " ...draining (pos=${target_seq:-?}/${source_seq}${pct} lag=${lag:-?}s ${elapsed}s)" + last_report=$elapsed + fi + + if [[ "$elapsed" -ge 7200 ]]; then + echo "ERROR: Timed out after ${elapsed}s (target_seq=${target_seq:-?})" + timeout_failed=1 + break + fi + + sleep 1 +done + +if [[ "$timeout_failed" -ne 0 ]]; then + echo "" + echo "ERROR: drain timed out before reaching source GTID position" + cleanup_workflow + exit 1 +fi + +end_time=$(date +%s) + +# Step 8: Calculate and report results +elapsed_s=$((end_time - start_time)) + +echo "" +echo "============================================" +echo " BENCHMARK RESULTS" +echo "============================================" +echo " Backlog ops: $TOTAL_OPS" +echo " Seed rows: $TOTAL_SEED" +echo " Drain time: ${elapsed_s}s" +if [ "$elapsed_s" -gt 0 ]; then + echo " Throughput: $((TOTAL_OPS / elapsed_s)) ops/sec" +fi +echo 
"============================================" + +# Step 9: Validate source and target are semantically equivalent (only after +# drain, when target is idle). COUNT(*) alone is too weak: reordering errors, +# wrong-row updates, or corrupted values can preserve cardinality while changing +# row content. We compute a content checksum per table that is order-independent +# (BIT_XOR of CRC32 over all column values) so parallel apply reordering is +# not flagged as divergence as long as the final state is equivalent. +echo "" +echo "Validating row counts and content checksums..." +validation_failed=0 + +# Returns the column list for the given table. These must match +# create_bench_schema.sql exactly — keep in sync. Using a function here +# instead of an associative array for bash 3.2 compatibility (macOS). +table_columns() { + case "$1" in + bench_orders) echo "id,customer_name,product_sku,quantity,total_price,status,region,notes" ;; + bench_events) echo "id,event_type,source,payload,severity,created_at,category" ;; + bench_accounts) echo "id,username,email,balance,region,bio,tier" ;; + bench_logs) echo "id,level,message,component,error_code,trace_id,span_id" ;; + esac +} + +for table in bench_orders bench_events bench_accounts bench_logs; do + cols="$(table_columns "$table")" + # Build CONCAT_WS over all columns with IFNULL so NULLs don't collapse the row. 
+ concat_expr="CONCAT_WS('|'" + old_ifs="$IFS" + IFS=',' + for col in $cols; do + concat_expr="$concat_expr,IFNULL($col,'\\0')" + done + IFS="$old_ifs" + concat_expr="$concat_expr)" + checksum_sql="SELECT COUNT(*), COALESCE(BIT_XOR(CAST(CRC32($concat_expr) AS UNSIGNED)), 0) FROM" + + source_row=$(source_mysql -N -e "$checksum_sql commerce.$table" 2>/dev/null) + target_row=$(target_mysql -N -e "$checksum_sql vt_customer.$table" 2>/dev/null) + source_count=$(echo "$source_row" | awk '{print $1}') + source_cksum=$(echo "$source_row" | awk '{print $2}') + target_count=$(echo "$target_row" | awk '{print $1}') + target_cksum=$(echo "$target_row" | awk '{print $2}') + + match="OK" + if [[ "$source_count" != "$target_count" ]] || [[ "$source_cksum" != "$target_cksum" ]]; then + match="MISMATCH" + validation_failed=1 + fi + echo " $table: source=(count=$source_count, cksum=$source_cksum) target=(count=$target_count, cksum=$target_cksum) [$match]" +done + +if [[ "$validation_failed" -ne 0 ]]; then + echo "" + echo "ERROR: validation FAILED — source and target diverged. See mismatches above." + echo "=== Bench Run Failed ===" + # Still attempt workflow cleanup before exiting. + cleanup_workflow + exit 1 +fi + +# Step 10: Cleanup workflow +echo "" +echo "Cleaning up workflow..." +cleanup_workflow + +echo "=== Bench Run Complete ===" diff --git a/examples/benchmark/bench_scripts_test.sh b/examples/benchmark/bench_scripts_test.sh new file mode 100644 index 00000000000..6beadff6689 --- /dev/null +++ b/examples/benchmark/bench_scripts_test.sh @@ -0,0 +1,244 @@ +#!/bin/bash + +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd)" + +fail_test() { + printf 'FAIL: %s\n' "$*" >&2 + exit 1 +} + +assert_equals() { + local got=$1 + local want=$2 + local message=$3 + if [[ "$got" != "$want" ]]; then + fail_test "$message (got=$got want=$want)" + fi +} + +assert_contains() { + local haystack=$1 + local needle=$2 + local message=$3 + if [[ "$haystack" != *"$needle"* ]]; then + fail_test "$message" + fi +} + +assert_not_contains() { + local haystack=$1 + local needle=$2 + local message=$3 + if [[ "$haystack" == *"$needle"* ]]; then + fail_test "$message" + fi +} + +new_sandbox() { + local sandbox + sandbox=$(mktemp -d) + mkdir -p "$sandbox/examples/benchmark" "$sandbox/examples/common" "$sandbox/examples/local" "$sandbox/bin" "$sandbox/vtdataroot" + cp "$REPO_ROOT/examples/benchmark/bench_run.sh" "$sandbox/examples/benchmark/bench_run.sh" + cp "$REPO_ROOT/examples/benchmark/bench_compare.sh" "$sandbox/examples/benchmark/bench_compare.sh" + cat > "$sandbox/examples/common/env.sh" <<'EOF' +#!/bin/bash + +fail() { + echo "$*" >&2 + exit 1 +} + +export VTDATAROOT="${VTDATAROOT:-$PWD/vtdataroot}" +mkdir -p "$VTDATAROOT" +EOF + cat > "$sandbox/examples/benchmark/bench_generate_load.sh" <<'EOF' +#!/bin/bash +set -euo pipefail +printf '%s\n' "${LOAD_TYPE:-unset}:${ROW_COUNT:-unset}" >> "$BENCH_TEST_TMP/load_calls" +EOF + cat > "$sandbox/examples/local/501_teardown.sh" <<'EOF' +#!/bin/bash +set -euo pipefail +printf 'teardown\n' >> "$BENCH_TEST_TMP/teardown_calls" +EOF + cat > "$sandbox/bin/vtctldclient" <<'EOF' +#!/bin/bash +set -euo pipefail +printf '%s\n' "$*" >> "$BENCH_TEST_TMP/vtctld_calls" +if [[ "${1:-}" == "GetTablets" ]]; then + if [[ "$*" == *"--keyspace commerce"* ]]; then + printf 'zone1-0000000100 primary\n' + else + printf 'zone1-0000000200 primary\n' + fi +fi +EOF + cat > "$sandbox/bin/mysql" <<'EOF' +#!/bin/bash +set -euo pipefail + +query="" +while (($#)); do + case "$1" in + -e) + query=$2 + shift 2 + ;; + --no-defaults|--binary-as-hex=false|-N) + shift + ;; + -h|-P|-u|-S) + shift 
2 + ;; + *) + shift + ;; + esac +done + +state_calls_file="$BENCH_TEST_TMP/mysql_state_calls" +case "$query" in + *"SELECT state FROM _vt.vreplication WHERE workflow='bench_move'"*) + state_calls=0 + if [[ -f "$state_calls_file" ]]; then + state_calls=$(cat "$state_calls_file") + fi + state_calls=$((state_calls + 1)) + printf '%s' "$state_calls" > "$state_calls_file" + if [[ "$state_calls" -eq 1 ]]; then + printf 'Running\n' + else + printf 'Stopped\n' + fi + ;; + *"ALTER TABLE bench_orders"*) + if [[ "${BENCH_FAIL_FIRST_TARGET_ALTER:-0}" == "1" ]]; then + echo 'simulated index build failure' >&2 + exit 1 + fi + ;; + *"ALTER TABLE bench_events"*|*"ALTER TABLE bench_accounts"*|*"ALTER TABLE bench_logs"*) + ;; + *"SELECT @@gtid_executed"*) + printf 'uuid:1-100\n' + ;; + *"SELECT pos FROM _vt.vreplication WHERE workflow='bench_move'"*) + printf 'uuid:1-50\n' + ;; + *"SELECT UNIX_TIMESTAMP() - FLOOR(time_updated) FROM _vt.vreplication WHERE workflow='bench_move'"*) + printf '999\n' + ;; + *"SELECT COUNT(*), COALESCE(BIT_XOR("*) + printf '1 2\n' + ;; + esac +EOF + cat > "$sandbox/bin/date" <<'EOF' +#!/bin/bash +set -euo pipefail +if [[ "${1:-}" != "+%s" ]]; then + /bin/date "$@" + exit 0 +fi + +calls_file="$BENCH_TEST_TMP/date_calls" +calls=0 +if [[ -f "$calls_file" ]]; then + calls=$(cat "$calls_file") +fi +calls=$((calls + 1)) +printf '%s' "$calls" > "$calls_file" + +case "$calls" in + 1) + printf '0\n' + ;; + 2) + printf '%s\n' "${BENCH_TIMEOUT_ELAPSED:-7200}" + ;; + *) + printf '%s\n' "${BENCH_TIMEOUT_ELAPSED_END:-7201}" + ;; +esac +EOF + chmod +x "$sandbox/examples/common/env.sh" "$sandbox/examples/benchmark/bench_generate_load.sh" "$sandbox/examples/local/501_teardown.sh" "$sandbox/bin/vtctldclient" "$sandbox/bin/mysql" "$sandbox/bin/date" + printf '%s\n' "$sandbox" +} + +test_bench_run_timeout_fails_without_results() { + local sandbox output status + sandbox=$(new_sandbox) + trap 'rm -rf "$sandbox"' RETURN + output=$(cd "$sandbox/examples/benchmark" && 
BENCH_TEST_TMP="$sandbox" VTDATAROOT="$sandbox/vtdataroot" PATH="$sandbox/bin:$PATH" bash ./bench_run.sh 2>&1) || status=$? + status=${status:-0} + + if [[ "$status" -eq 0 ]]; then + fail_test "bench_run timeout should fail" + fi + assert_contains "$output" "Timed out after" "bench_run should report the timeout" + assert_not_contains "$output" "BENCHMARK RESULTS" "bench_run should not print results after a timeout" + assert_contains "$(cat "$sandbox/vtctld_calls")" "MoveTables --workflow bench_move --target-keyspace customer cancel" "bench_run should clean up the workflow after a timeout" + trap - RETURN + rm -rf "$sandbox" +} + +test_bench_run_index_failure_is_fatal() { + local sandbox output status load_calls + sandbox=$(new_sandbox) + trap 'rm -rf "$sandbox"' RETURN + output=$(cd "$sandbox/examples/benchmark" && BENCH_TEST_TMP="$sandbox" BENCH_FAIL_FIRST_TARGET_ALTER=1 VTDATAROOT="$sandbox/vtdataroot" PATH="$sandbox/bin:$PATH" bash ./bench_run.sh 2>&1) || status=$? + status=${status:-0} + + if [[ "$status" -eq 0 ]]; then + fail_test "bench_run should fail when target index creation fails" + fi + assert_not_contains "$output" "Target indexes added (~25 per table)." 
"bench_run should not report index success after an index build failure" + load_calls=$(cat "$sandbox/load_calls") + assert_equals "$load_calls" "seed:40000" "bench_run should stop before generating the mixed backlog when index creation fails" + assert_contains "$(cat "$sandbox/vtctld_calls")" "MoveTables --workflow bench_move --target-keyspace customer cancel" "bench_run should clean up the workflow after an index build failure" + trap - RETURN + rm -rf "$sandbox" +} + +test_bench_compare_can_run_parallel_first() { + local sandbox output order + sandbox=$(mktemp -d) + trap 'rm -rf "$sandbox"; rm -f /tmp/bench_1_workers.log /tmp/bench_4_workers.log' RETURN + mkdir -p "$sandbox/examples/benchmark" "$sandbox/examples/local" + cp "$REPO_ROOT/examples/benchmark/bench_compare.sh" "$sandbox/examples/benchmark/bench_compare.sh" + cat > "$sandbox/examples/benchmark/bench_setup.sh" <<'EOF' +#!/bin/bash +set -euo pipefail +printf '%s\n' "$PARALLEL_WORKERS" >> "$BENCH_TEST_TMP/setup_order" +EOF + cat > "$sandbox/examples/benchmark/bench_run.sh" <<'EOF' +#!/bin/bash +set -euo pipefail +echo " Backlog ops: 200000" +echo " Seed rows: 40000" +echo " Drain time: 10s" +echo " Throughput: 20000 ops/sec" +EOF + cat > "$sandbox/examples/local/501_teardown.sh" <<'EOF' +#!/bin/bash +set -euo pipefail +: +EOF + chmod +x "$sandbox/examples/benchmark/bench_setup.sh" "$sandbox/examples/benchmark/bench_run.sh" "$sandbox/examples/local/501_teardown.sh" + + output=$(cd "$sandbox/examples/benchmark" && BENCH_TEST_TMP="$sandbox" RUN_ORDER=parallel-first bash ./bench_compare.sh 2>&1) + order=$(paste -sd ',' "$sandbox/setup_order") + assert_equals "$order" "4,1" "bench_compare should honor RUN_ORDER=parallel-first" + assert_contains "$output" "Run order: parallel-first" "bench_compare should print the selected run order" + trap - RETURN + rm -rf "$sandbox" + rm -f /tmp/bench_1_workers.log /tmp/bench_4_workers.log +} + +test_bench_run_timeout_fails_without_results 
+test_bench_run_index_failure_is_fatal +test_bench_compare_can_run_parallel_first + +echo "PASS: benchmark script regressions" diff --git a/examples/benchmark/bench_setup.sh b/examples/benchmark/bench_setup.sh new file mode 100755 index 00000000000..63b19b093ca --- /dev/null +++ b/examples/benchmark/bench_setup.sh @@ -0,0 +1,144 @@ +#!/bin/bash + +# Bring up commerce keyspace + customer keyspace tablets with configurable +# parallel replication workers for benchmarking VReplication throughput. + +BENCH_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +source ../common/env.sh + +PARALLEL_WORKERS=${PARALLEL_WORKERS:-1} +SIDECAR_DB_NAME=${SIDECAR_DB_NAME:-"_vt"} +# VReplication experimental flags: 1=OptimizeInserts, 4=VPlayerBatching, 8=AllowNoBlobBinlogRowImage +# Default 13 (all enabled). Set VREPL_FLAGS=9 (13 minus VPlayerBatching) to test without multi-statement batching. +VREPL_FLAGS=${VREPL_FLAGS:-13} + +echo "=== Bench Setup (parallel_workers=$PARALLEL_WORKERS) ===" + +# Step 1: Bring up the commerce keyspace (topo, vtctld, commerce tablets, vtorc, vtgate). +# The local scripts must run from examples/local/ because they use relative paths. +(cd "$BENCH_DIR/../local" && ./101_initial_cluster.sh) || fail "Failed to bring up initial cluster" + +# Step 2: Apply bench schema and vschema to commerce. +vtctldclient ApplySchema --sql-file "$BENCH_DIR/create_bench_schema.sql" commerce || fail "Failed to apply bench schema" +vtctldclient ApplyVSchema --vschema-file "$BENCH_DIR/vschema_bench.json" commerce || fail "Failed to apply bench vschema" + +echo "Bench schema and vschema applied to commerce keyspace."
+ +# Step 3: Create customer keyspace +if vtctldclient GetKeyspace customer > /dev/null 2>&1; then + vtctldclient SetKeyspaceDurabilityPolicy --durability-policy=none customer || fail "Failed to set durability policy on customer keyspace" +else + vtctldclient CreateKeyspace --sidecar-db-name="${SIDECAR_DB_NAME}" --durability-policy=none customer || fail "Failed to create customer keyspace" +fi + +# Step 4: Start mysqlctls for customer tablets with small buffer pool. +# We set innodb_buffer_pool_chunk_size=1M at startup so that the buffer pool +# can actually be reduced below the default 128MB chunk size. +BENCH_EXTRA_CNF="$VTDATAROOT/tmp/bench_target.cnf" +cat > "$BENCH_EXTRA_CNF" <<'EOF' +# Bench: small buffer pool to force disk I/O on secondary index access. +# 32MB gives each parallel worker ~8MB (matching serial's total), avoiding +# destructive cache thrashing between workers while keeping I/O significant. +innodb_buffer_pool_chunk_size = 1048576 +innodb_buffer_pool_size = 33554432 +EOF + +for i in 200 201 202; do + EXTRA_MY_CNF="$BENCH_EXTRA_CNF" CELL=zone1 TABLET_UID=$i ../common/scripts/mysqlctl-up.sh & +done + +sleep 2 +echo "Waiting for customer mysqlctls to start..." +wait +echo "Customer mysqlctls are running!" + +# Step 5: Start customer vttablets with --vreplication-parallel-replication-workers flag +cell='zone1' +keyspace='customer' + +for uid in 200 201 202; do + mysql_port=$((17000 + uid)) + port=$((15000 + uid)) + grpc_port=$((16000 + uid)) + printf -v alias '%s-%010d' "$cell" "$uid" + printf -v tablet_dir 'vt_%010d' "$uid" + printf -v tablet_logfile 'vttablet_%010d_querylog.txt' "$uid" + + tablet_type=replica + if [[ "${uid: -1}" -gt 1 ]]; then + tablet_type=rdonly + fi + + echo "Starting vttablet for $alias with vreplication-parallel-replication-workers=$PARALLEL_WORKERS..." 
+ + # shellcheck disable=SC2086 + vttablet \ + $TOPOLOGY_FLAGS \ + --log-queries-to-file "$VTDATAROOT/tmp/$tablet_logfile" \ + --tablet-path "$alias" \ + --tablet-hostname "" \ + --init-keyspace "$keyspace" \ + --init-shard "0" \ + --init-tablet-type "$tablet_type" \ + --health-check-interval 5s \ + --backup-storage-implementation file \ + --file-backup-storage-root "$VTDATAROOT/backups" \ + --restore-from-backup \ + --port "$port" \ + --grpc-port "$grpc_port" \ + --service-map 'grpc-queryservice,grpc-tabletmanager,grpc-updatestream' \ + --pid-file "$VTDATAROOT/$tablet_dir/vttablet.pid" \ + --heartbeat-on-demand-duration=5s \ + --pprof-http \ + --log-format text \ + --vreplication-parallel-replication-workers "$PARALLEL_WORKERS" \ + --relay-log-max-size 250000 \ + --relay-log-max-items 5000 \ + --vreplication-experimental-flags "$VREPL_FLAGS" \ + >"$VTDATAROOT/$tablet_dir/vttablet.out" 2>&1 & + + # Wait for tablet to be listening + for _ in $(seq 0 300); do + curl -I "http://$(hostname -f):$port/debug/status" >/dev/null 2>&1 && break + sleep 0.1 + done + curl -I "http://$(hostname -f):$port/debug/status" || fail "vttablet for $alias could not be started!" + echo "vttablet for $alias is running!" +done + +# Step 6: Wait for healthy shard +wait_for_healthy_shard customer 0 || fail "Customer shard not healthy" + +# Step 7: Tune MySQL durability for benchmark throughput on all tablets. +# With innodb_flush_log_at_trx_commit=0 and sync_binlog=0, redo log fsyncs +# don't happen on every COMMIT. This removes fsync as a variable so we can +# isolate the applier throughput difference between serial and parallel. +echo "" +echo "Tuning MySQL settings for benchmark..." 
+for uid in 100 101 102 200 201 202; do + printf -v tablet_dir 'vt_%010d' "$uid" + sock="$VTDATAROOT/$tablet_dir/mysql.sock" + if [[ -S "$sock" ]]; then + command mysql --no-defaults -u vt_dba -S "$sock" -e \ + "SET GLOBAL innodb_flush_log_at_trx_commit = 0; SET GLOBAL sync_binlog = 0; SET GLOBAL rpl_semi_sync_source_enabled = 0;" 2>/dev/null && \ + echo " Tuned tablet $uid (durability, semi-sync off)" || echo " Warning: could not tune tablet $uid" + fi +done +# Tune target tablets: disable change buffering to force immediate B-tree +# page reads on every INSERT/UPDATE/DELETE. Combined with the 32MB buffer pool +# set at startup, this makes each applier statement very expensive. +for uid in 200 201 202; do + printf -v tablet_dir 'vt_%010d' "$uid" + sock="$VTDATAROOT/$tablet_dir/mysql.sock" + if [[ -S "$sock" ]]; then + command mysql --no-defaults -u vt_dba -S "$sock" -e \ + "SET GLOBAL innodb_change_buffering = 'none';" 2>/dev/null && \ + echo " Tuned tablet $uid (change buffering off, 32MB buffer pool)" || echo " Warning: could not tune tablet $uid" + fi +done + +echo "" +echo "=== Bench Setup Complete ===" +echo "Commerce keyspace: bench tables loaded" +echo "Customer keyspace: 3 tablets (parallel_workers=$PARALLEL_WORKERS)" diff --git a/examples/benchmark/create_bench_schema.sql b/examples/benchmark/create_bench_schema.sql new file mode 100644 index 00000000000..8739e2f825d --- /dev/null +++ b/examples/benchmark/create_bench_schema.sql @@ -0,0 +1,60 @@ +create table if not exists bench_orders( + id bigint not null auto_increment, + customer_name varchar(255), + product_sku varchar(128), + quantity int, + total_price bigint, + status varchar(64), + region varchar(64), + notes text, + primary key(id), + index idx_customer (customer_name), + index idx_sku_price (product_sku, total_price), + index idx_status_region (status, region), + index idx_region_price (region, total_price) +) ENGINE=InnoDB; + +create table if not exists bench_events( + id bigint not null
auto_increment, + event_type varchar(128), + source varchar(255), + payload text, + severity int, + created_at bigint, + category varchar(128), + primary key(id), + index idx_type_severity (event_type, severity), + index idx_source (source), + index idx_category (category), + index idx_created (created_at) +) ENGINE=InnoDB; + +create table if not exists bench_accounts( + id bigint not null auto_increment, + username varchar(128), + email varchar(255), + balance bigint, + region varchar(64), + bio text, + tier varchar(32), + primary key(id), + index idx_username (username), + index idx_email (email), + index idx_region_balance (region, balance), + index idx_tier (tier) +) ENGINE=InnoDB; + +create table if not exists bench_logs( + id bigint not null auto_increment, + level varchar(32), + message text, + component varchar(128), + error_code int, + trace_id varchar(64), + span_id varchar(64), + primary key(id), + index idx_level_component (level, component), + index idx_error_code (error_code), + index idx_trace (trace_id), + index idx_span (span_id) +) ENGINE=InnoDB; diff --git a/examples/benchmark/vschema_bench.json b/examples/benchmark/vschema_bench.json new file mode 100644 index 00000000000..094d2ddc55b --- /dev/null +++ b/examples/benchmark/vschema_bench.json @@ -0,0 +1,8 @@ +{ + "tables": { + "bench_orders": {}, + "bench_events": {}, + "bench_accounts": {}, + "bench_logs": {} + } +} diff --git a/go/cmd/vtctldclient/command/vreplication/override_request_test.go b/go/cmd/vtctldclient/command/vreplication/override_request_test.go new file mode 100644 index 00000000000..1a910eea220 --- /dev/null +++ b/go/cmd/vtctldclient/command/vreplication/override_request_test.go @@ -0,0 +1,108 @@ +/* +Copyright 2026 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package vreplication_test + +import ( + "context" + "os" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/cmd/vtctldclient/command" + vtctldclientcommon "vitess.io/vitess/go/cmd/vtctldclient/command/vreplication/common" + _ "vitess.io/vitess/go/cmd/vtctldclient/command/vreplication/movetables" + _ "vitess.io/vitess/go/cmd/vtctldclient/command/vreplication/workflow" + "vitess.io/vitess/go/vt/vtctl/localvtctldclient" + + vtctldatapb "vitess.io/vitess/go/vt/proto/vtctldata" + vtctlservicepb "vitess.io/vitess/go/vt/proto/vtctlservice" +) + +type overrideCaptureServer struct { + vtctlservicepb.UnimplementedVtctldServer + + moveTablesCreateReq *vtctldatapb.MoveTablesCreateRequest + workflowUpdateReq *vtctldatapb.WorkflowUpdateRequest +} + +func (s *overrideCaptureServer) MoveTablesCreate(_ context.Context, req *vtctldatapb.MoveTablesCreateRequest) (*vtctldatapb.WorkflowStatusResponse, error) { + s.moveTablesCreateReq = req + return &vtctldatapb.WorkflowStatusResponse{}, nil +} + +func (s *overrideCaptureServer) WorkflowUpdate(_ context.Context, req *vtctldatapb.WorkflowUpdateRequest) (*vtctldatapb.WorkflowUpdateResponse, error) { + s.workflowUpdateReq = req + return &vtctldatapb.WorkflowUpdateResponse{}, nil +} + +func TestVtctldclientConfigOverrideRequestsIncludeParallelReplicationWorkers(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + server := &overrideCaptureServer{} + localvtctldclient.SetServer(server) + + origArgs := append([]string{}, os.Args...) 
+ origProtocol := command.VtctldClientProtocol + t.Cleanup(func() { + os.Args = append([]string{}, origArgs...) + command.VtctldClientProtocol = origProtocol + }) + + command.VtctldClientProtocol = "local" + vtctldclientcommon.SetCommandCtx(ctx) + + t.Run("MoveTablesCreate", func(t *testing.T) { + os.Args = []string{ + "vtctldclient", + "--server", "ignored", + "MoveTables", + "--workflow", "wf1", + "--target-keyspace", "target", + "create", + "--source-keyspace", "source", + "--all-tables", + "--config-overrides", "vreplication-parallel-replication-workers=7", + } + + err := command.Root.Execute() + require.NoError(t, err) + require.NotNil(t, server.moveTablesCreateReq) + require.NotNil(t, server.moveTablesCreateReq.WorkflowOptions) + require.Equal(t, "7", server.moveTablesCreateReq.WorkflowOptions.Config["vreplication-parallel-replication-workers"]) + }) + + t.Run("WorkflowUpdate", func(t *testing.T) { + os.Args = []string{ + "vtctldclient", + "--server", "ignored", + "Workflow", + "--keyspace", "target", + "update", + "--workflow", "wf1", + "--config-overrides", "vreplication-parallel-replication-workers=9", + } + + err := command.Root.Execute() + require.NoError(t, err) + require.NotNil(t, server.workflowUpdateReq) + require.NotNil(t, server.workflowUpdateReq.TabletRequest) + require.Equal(t, "9", server.workflowUpdateReq.TabletRequest.ConfigOverrides["vreplication-parallel-replication-workers"]) + }) +} diff --git a/go/flags/endtoend/vtcombo.txt b/go/flags/endtoend/vtcombo.txt index 860ff5c62fa..d8c94af24f1 100644 --- a/go/flags/endtoend/vtcombo.txt +++ b/go/flags/endtoend/vtcombo.txt @@ -434,6 +434,7 @@ Flags: --vreplication-net-read-timeout int Session value of net_read_timeout for vreplication, in seconds (default 300) --vreplication-net-write-timeout int Session value of net_write_timeout for vreplication, in seconds (default 600) --vreplication-parallel-insert-workers int Number of parallel insertion workers to use during copy phase. 
Set <= 1 to disable parallelism, or > 1 to enable concurrent insertion during copy phase. (default 1) + --vreplication-parallel-replication-workers int Number of parallel replication workers to use during the replication phase. Set <= 1 to disable parallelism, or > 1 to enable concurrent apply. (default 1) --vreplication-replica-lag-tolerance duration Replica lag threshold duration: once lag is below this we switch from copy phase to the replication (streaming) phase (default 1m0s) --vreplication-retry-delay duration delay before retrying a failed workflow event in the replication phase (default 5s) --vreplication-store-compressed-gtid Store compressed gtids in the pos column of the sidecar database's vreplication table diff --git a/go/flags/endtoend/vttablet.txt b/go/flags/endtoend/vttablet.txt index 145ff5997b6..e797601289b 100644 --- a/go/flags/endtoend/vttablet.txt +++ b/go/flags/endtoend/vttablet.txt @@ -430,6 +430,7 @@ Flags: --vreplication-net-read-timeout int Session value of net_read_timeout for vreplication, in seconds (default 300) --vreplication-net-write-timeout int Session value of net_write_timeout for vreplication, in seconds (default 600) --vreplication-parallel-insert-workers int Number of parallel insertion workers to use during copy phase. Set <= 1 to disable parallelism, or > 1 to enable concurrent insertion during copy phase. (default 1) + --vreplication-parallel-replication-workers int Number of parallel replication workers to use during the replication phase. Set <= 1 to disable parallelism, or > 1 to enable concurrent apply. 
(default 1) --vreplication-replica-lag-tolerance duration Replica lag threshold duration: once lag is below this we switch from copy phase to the replication (streaming) phase (default 1m0s) --vreplication-retry-delay duration delay before retrying a failed workflow event in the replication phase (default 5s) --vreplication-store-compressed-gtid Store compressed gtids in the pos column of the sidecar database's vreplication table diff --git a/go/test/endtoend/onlineddl/vrepl_stress_suite/onlineddl_vrepl_stress_suite_test.go b/go/test/endtoend/onlineddl/vrepl_stress_suite/onlineddl_vrepl_stress_suite_test.go index 3d989a998ca..c4f4784d378 100644 --- a/go/test/endtoend/onlineddl/vrepl_stress_suite/onlineddl_vrepl_stress_suite_test.go +++ b/go/test/endtoend/onlineddl/vrepl_stress_suite/onlineddl_vrepl_stress_suite_test.go @@ -29,11 +29,13 @@ package vreplstress import ( "context" + "errors" "flag" "fmt" "math/rand/v2" "os" "path" + "strconv" "strings" "sync" "sync/atomic" @@ -70,6 +72,25 @@ type testcase struct { autoIncInsert bool } +type stressErrorState struct { + firstErr atomic.Pointer[error] +} + +func (s *stressErrorState) record(err error) { + if err == nil { + return + } + errCopy := err + s.firstErr.CompareAndSwap(nil, &errCopy) +} + +func (s *stressErrorState) err() error { + if errPtr := s.firstErr.Load(); errPtr != nil { + return *errPtr + } + return nil +} + var ( clusterInstance *cluster.LocalProcessCluster primaryTablet *cluster.Vttablet @@ -378,7 +399,7 @@ const ( maxConcurrency = 15 singleConnectionSleepInterval = 5 * time.Millisecond periodicSleepPercent = 10 // in the range (0,100). 10 means 10% sleep time throught the stress load. - waitForStatusTimeout = 180 * time.Second + waitForStatusTimeout = 300 * time.Second ) func resetOpOrder() { @@ -428,11 +449,16 @@ func TestMain(m *testing.M) { // --vstream-packet-size is set to a small value that ensures we get multiple stream iterations, // thereby examining lastPK on vcopier side. 
We will be iterating tables using non-PK order throughout // this test suite, and so the low setting ensures we hit the more interesting code paths. + parallelWorkers := 4 + txPoolSize := max(parallelWorkers, 100) clusterInstance.VtTabletExtraArgs = []string{ "--heartbeat-interval", "250ms", "--heartbeat-on-demand-duration", "5s", "--migration-check-interval", "5s", "--vstream-packet-size", "4096", // Keep this value small and below 10k to ensure multilple vstream iterations + "--queryserver-config-transaction-cap", strconv.Itoa(txPoolSize), + "--transaction-limit-per-user", "0.9", + "--vreplication-parallel-replication-workers", strconv.Itoa(parallelWorkers), } clusterInstance.VtGateExtraArgs = []string{ "--ddl-strategy", "online", @@ -526,9 +552,9 @@ func TestVreplStressSchemaChanges(t *testing.T) { } status := onlineddl.WaitForMigrationStatus(t, &vtParams, shards, uuid, waitForStatusTimeout, expectStatus) fmt.Printf("# Migration status (for debug purposes): <%s>\n", status) - onlineddl.CheckMigrationStatus(t, &vtParams, shards, uuid, expectStatus) cancel() // will cause runMultipleConnections() to terminate wg.Wait() + require.Equal(t, string(expectStatus), string(status), "migration did not reach expected status within timeout") if !testcase.expectFailure { testCompareBeforeAfterTables(t, testcase.autoIncInsert) } @@ -670,7 +696,7 @@ func generateDelete(t *testing.T, conn *mysql.Conn) error { return err } -func runSingleConnection(ctx context.Context, t *testing.T, autoIncInsert bool, done *int64) { +func runSingleConnection(ctx context.Context, t *testing.T, autoIncInsert bool, done *int64, errs *stressErrorState) { log.Info("Running single connection") conn, err := mysql.Connect(ctx, &vtParams) require.Nil(t, err) @@ -712,7 +738,10 @@ func runSingleConnection(ctx context.Context, t *testing.T, autoIncInsert bool, } } } - assert.Nil(t, err) + if err != nil { + errs.record(err) + return + } time.Sleep(singleConnectionSleepInterval) // Most o fthe time, we want 
the load to be high, so as to create real stress and potentially // expose bugs in vreplication (the objective of this test!). @@ -730,16 +759,18 @@ func runSingleConnection(ctx context.Context, t *testing.T, autoIncInsert bool, func runMultipleConnections(ctx context.Context, t *testing.T, autoIncInsert bool) { log.Info("Running multiple connections") var done int64 + errState := &stressErrorState{} var wg sync.WaitGroup for range maxConcurrency { wg.Go(func() { - runSingleConnection(ctx, t, autoIncInsert, &done) + runSingleConnection(ctx, t, autoIncInsert, &done, errState) }) } <-ctx.Done() atomic.StoreInt64(&done, 1) log.Info("Running multiple connections: done") wg.Wait() + require.NoError(t, errState.err()) log.Info("All connections cancelled") } @@ -846,3 +877,22 @@ func testCompareBeforeAfterTables(t *testing.T, autoIncInsert bool) { require.Equal(t, beforeOutput, afterOutput, "results mismatch: (%s) and (%s)", selectBeforeTable, selectAfterTable) } } + +func TestStressErrorStateRecordsFirstUnexpectedError(t *testing.T) { + state := &stressErrorState{} + firstErr := errors.New("first") + secondErr := errors.New("second") + + state.record(firstErr) + state.record(secondErr) + + require.ErrorIs(t, state.err(), firstErr) +} + +func TestStressErrorStateIgnoresNilErrors(t *testing.T) { + state := &stressErrorState{} + + state.record(nil) + + require.NoError(t, state.err()) +} diff --git a/go/test/endtoend/vreplication/cluster_test.go b/go/test/endtoend/vreplication/cluster_test.go index 23c47fb8a20..a3c8e54bfe3 100644 --- a/go/test/endtoend/vreplication/cluster_test.go +++ b/go/test/endtoend/vreplication/cluster_test.go @@ -64,7 +64,8 @@ var ( // This variable can be used within specific tests to alter vttablet behavior. 
extraVTTabletArgs = []string{} - parallelInsertWorkers = "--vreplication-parallel-insert-workers=4" + parallelInsertWorkers = "--vreplication-parallel-insert-workers=4" + parallelReplicationWorkers = "--vreplication-parallel-replication-workers=4" throttlerConfig = throttler.Config{Threshold: 15} ) diff --git a/go/test/endtoend/vreplication/fk_ext_load_generator_test.go b/go/test/endtoend/vreplication/fk_ext_load_generator_test.go index 36f4a3bc709..d51f8c8fd7b 100644 --- a/go/test/endtoend/vreplication/fk_ext_load_generator_test.go +++ b/go/test/endtoend/vreplication/fk_ext_load_generator_test.go @@ -103,6 +103,11 @@ type SimpleLoadGenerator struct { ch chan bool runCtx context.Context runCtxCancel context.CancelFunc + // vtgateConn is reused across execQueryWithRetry calls during Start()'s + // goroutine. Opening a fresh TCP connection per DML piles up thousands of + // sockets in TIME_WAIT and exhausts the macOS ephemeral port range + // (49152-65535), which in turn stalls unrelated gRPC reconnects. + vtgateConn *mysql.Conn } func (lg *SimpleLoadGenerator) SetOverrideConstraints(allow bool) { @@ -199,7 +204,6 @@ func (lg *SimpleLoadGenerator) execQueryWithRetry(query string) (*sqltypes.Resul defer cancel() errCh := make(chan error) qrCh := make(chan *sqltypes.Result) - var vtgateConn *mysql.Conn go func() { var qr *sqltypes.Result var err error @@ -219,23 +223,27 @@ func (lg *SimpleLoadGenerator) execQueryWithRetry(query string) (*sqltypes.Resul if retry { time.Sleep(tickInterval) } - // We need to parse the error as well as the output of vdiff to determine if the error is retryable, since - // sometimes it is observed that we get the error output as part of vdiff output. - vtgateConn, err = lg.getVtgateConn(ctx) - if err != nil { - if !isQueryRetryable(err) { - errCh <- err - return + // Reuse lg.vtgateConn across calls so we don't burn a TCP + // connection per DML. On error we close and null it out so the + // retry path above opens a fresh one. 
+ if lg.vtgateConn == nil { + lg.vtgateConn, err = lg.getVtgateConn(ctx) + if err != nil { + if !isQueryRetryable(err) { + errCh <- err + return + } + time.Sleep(tickInterval) + continue } - time.Sleep(tickInterval) - continue } - qr, err = vtgateConn.ExecuteFetch(query, 1000, false) - vtgateConn.Close() + qr, err = lg.vtgateConn.ExecuteFetch(query, 1000, false) if err == nil { qrCh <- qr return } + lg.vtgateConn.Close() + lg.vtgateConn = nil if !isQueryRetryable(err) { errCh <- err return @@ -276,6 +284,10 @@ func (lg *SimpleLoadGenerator) Start() error { lg.state = LoadGeneratorStateRunning go func() { defer func() { + if lg.vtgateConn != nil { + lg.vtgateConn.Close() + lg.vtgateConn = nil + } lg.state = LoadGeneratorStateStopped log.Info("Load generator stopped") }() diff --git a/go/test/endtoend/vreplication/fk_ext_test.go b/go/test/endtoend/vreplication/fk_ext_test.go index ef46cec047c..abd82e4a634 100644 --- a/go/test/endtoend/vreplication/fk_ext_test.go +++ b/go/test/endtoend/vreplication/fk_ext_test.go @@ -88,7 +88,8 @@ func TestFKExt(t *testing.T) { extraVTTabletArgs = append(extraVTTabletArgs, "--vstream-packet-size=256", "--queryserver-config-schema-change-signal", - parallelInsertWorkers) + parallelInsertWorkers, + parallelReplicationWorkers) extraVTGateArgs = append(extraVTGateArgs, "--schema-change-signal"+"=true", "--planner-version", "Gen4") defer func() { extraVTTabletArgs = nil }() initFKExtConfig(t) diff --git a/go/test/endtoend/vreplication/vreplication_test.go b/go/test/endtoend/vreplication/vreplication_test.go index 4fbfe25f4c9..fbf581eaccc 100644 --- a/go/test/endtoend/vreplication/vreplication_test.go +++ b/go/test/endtoend/vreplication/vreplication_test.go @@ -268,11 +268,12 @@ func TestBasicVreplicationWorkflow(t *testing.T) { testBasicVreplicationWorkflow(t, "noblob") } -func TestVreplicationCopyParallel(t *testing.T) { - defaultSourceKsOpts["DBTypeVersion"] = "mysql-5.7" - defaultTargetKsOpts["DBTypeVersion"] = "mysql-5.7" +func 
TestVreplicationParallel(t *testing.T) { + defaultSourceKsOpts["DBTypeVersion"] = "mysql-8.4" + defaultTargetKsOpts["DBTypeVersion"] = "mysql-8.4" extraVTTabletArgs = []string{ parallelInsertWorkers, + parallelReplicationWorkers, } testBasicVreplicationWorkflow(t, "") } diff --git a/go/test/endtoend/vreplication/vreplication_vtctldclient_cli_test.go b/go/test/endtoend/vreplication/vreplication_vtctldclient_cli_test.go index 5f185de8ab6..c5cf4ecc485 100644 --- a/go/test/endtoend/vreplication/vreplication_vtctldclient_cli_test.go +++ b/go/test/endtoend/vreplication/vreplication_vtctldclient_cli_test.go @@ -224,9 +224,10 @@ func TestVtctldclientCLI(t *testing.T) { func testMoveTablesFlags1(t *testing.T, mt *iMoveTables, sourceKeyspace, targetKeyspace, defaultWorkflowName string, targetTabs map[string]*cluster.VttabletProcess) { tables := "customer,customer2" overrides := map[string]string{ - "vreplication-net-read-timeout": "6000", - "relay-log-max-items": "10000", - "vreplication-parallel-insert-workers": "10", + "vreplication-net-read-timeout": "6000", + "relay-log-max-items": "10000", + "vreplication-parallel-insert-workers": "10", + "vreplication-parallel-replication-workers": "3", } createFlags := []string{ "--auto-start=false", "--defer-secondary-keys=false", "--stop-after-copy", @@ -496,8 +497,9 @@ func testWorkflowUpdateConfig(t *testing.T, mt *iMoveTables, targetTabs map[stri { name: "two values", config: map[string]string{ - "vreplication-heartbeat-update-interval": "100", - "vreplication-store-compressed-gtid": "true", + "vreplication-heartbeat-update-interval": "100", + "vreplication-store-compressed-gtid": "true", + "vreplication-parallel-replication-workers": "5", }, }, { diff --git a/go/vt/binlog/binlog_connection.go b/go/vt/binlog/binlog_connection.go index 1cac5bb458d..82f78111495 100644 --- a/go/vt/binlog/binlog_connection.go +++ b/go/vt/binlog/binlog_connection.go @@ -142,8 +142,11 @@ func (bc *BinlogConnection) StartBinlogDumpFromPosition(ctx 
context.Context, bin // streamEvents returns a channel on which events are streamed and a channel on // which errors are propagated. func (bc *BinlogConnection) streamEvents(ctx context.Context) (chan mysql.BinlogEvent, chan error) { - // FIXME(alainjobart) I think we can use a buffered channel for better performance. - eventChan := make(chan mysql.BinlogEvent) + // Buffer the event channel so the binlog reader goroutine can make + // progress without blocking on the consumer for every single event. + // An unbuffered channel here forces a context switch per event, which + // becomes a throughput bottleneck at high event rates. + eventChan := make(chan mysql.BinlogEvent, 10) errChan := make(chan error) // Start reading events. diff --git a/go/vt/binlog/binlog_streamer.go b/go/vt/binlog/binlog_streamer.go index b4cc4ad28e8..b2967b81183 100644 --- a/go/vt/binlog/binlog_streamer.go +++ b/go/vt/binlog/binlog_streamer.go @@ -255,6 +255,7 @@ func (bls *Streamer) parseEvents(ctx context.Context, events <-chan mysql.Binlog pos := bls.startPos autocommit := true var err error + var pendingStreamErr error // Remember the RBR state. // tableMaps is indexed by tableID. @@ -298,12 +299,33 @@ func (bls *Streamer) parseEvents(ctx context.Context, events <-chan mysql.Binlog select { case ev, ok = <-events: if !ok { + if pendingStreamErr != nil { + return pos, pendingStreamErr + } + if errs != nil { + select { + case err, ok := <-errs: + if ok && err != nil { + return pos, err + } + default: + } + } // events channel has been closed, which means the connection died. 
log.Info("reached end of binlog event stream") return pos, ErrServerEOF } - case err = <-errs: - return pos, err + case err, ok = <-errs: + if !ok { + errs = nil + continue + } + if len(events) == 0 { + return pos, err + } + pendingStreamErr = err + errs = nil + continue case <-ctx.Done(): log.Info("stopping early due to binlog Streamer service shutdown or client disconnect") return pos, ctx.Err() diff --git a/go/vt/binlog/binlog_streamer_test.go b/go/vt/binlog/binlog_streamer_test.go index 93856015a86..2eedd58f8c9 100644 --- a/go/vt/binlog/binlog_streamer_test.go +++ b/go/vt/binlog/binlog_streamer_test.go @@ -200,6 +200,70 @@ func TestStreamerParseEventsCommit(t *testing.T) { assert.Truef(t, got.equal(want), "binlogConnStreamer.parseEvents(): got %v, want %v", got, want) } +func TestStreamerParseEventsDrainsBufferedEventsBeforeTerminalError(t *testing.T) { + f := mysql.NewMySQL56BinlogFormat() + s := mysql.NewFakeBinlogStream() + s.ServerID = 62344 + + input := []mysql.BinlogEvent{ + mysql.NewRotateEvent(f, s, 0, ""), + mysql.NewFormatDescriptionEvent(f, s), + mysql.NewMariaDBGTIDEvent(f, s, replication.MariadbGTID{Domain: 0, Sequence: 0xd}, false /* hasBegin */), + mysql.NewQueryEvent(f, s, mysql.Query{ + Database: "vt_test_keyspace", + SQL: "BEGIN", + }), + mysql.NewQueryEvent(f, s, mysql.Query{ + Database: "vt_test_keyspace", + SQL: "insert into vt_a(eid, id) values (1, 1) /* _stream vt_a (eid id ) (1 1 ); */", + }), + mysql.NewXIDEvent(f, s), + } + + want := []*binlogdatapb.BinlogTransaction{ + { + Statements: []*binlogdatapb.BinlogTransaction_Statement{ + {Category: binlogdatapb.BinlogTransaction_Statement_BL_SET, Sql: []byte("SET TIMESTAMP=1407805592")}, + {Category: binlogdatapb.BinlogTransaction_Statement_BL_INSERT, Sql: []byte("insert into vt_a(eid, id) values (1, 1) /* _stream vt_a (eid id ) (1 1 ); */")}, + }, + EventToken: &querypb.EventToken{ + Timestamp: 1407805592, + Position: replication.EncodePosition(replication.Position{ + GTIDSet: 
replication.MariadbGTIDSet{ + 0: replication.MariadbGTID{ + Domain: 0, + Server: 62344, + Sequence: 0x0d, + }, + }, + }), + }, + }, + } + + mcp := &mysql.ConnParams{DbName: "vt_test_keyspace"} + dbcfgs := dbconfigs.New(mcp) + streamErr := errors.New("stream ended after buffered events") + + for i := range 64 { + events := make(chan mysql.BinlogEvent, len(input)) + errs := make(chan error, 1) + for _, ev := range input { + events <- ev + } + close(events) + errs <- streamErr + close(errs) + + var got binlogStatements + bls := NewStreamer(dbcfgs, nil, nil, replication.Position{}, 0, (&got).sendTransaction) + + _, err := bls.parseEvents(t.Context(), events, errs) + require.ErrorIs(t, err, streamErr, "iteration %d", i) + require.True(t, got.equal(want), "iteration %d: got %#v want %#v", i, got, want) + } +} + func TestStreamerStop(t *testing.T) { events := make(chan mysql.BinlogEvent) errs := make(chan error) diff --git a/go/vt/vttablet/common/config.go b/go/vt/vttablet/common/config.go index d413a94db01..511751fa0c0 100644 --- a/go/vt/vttablet/common/config.go +++ b/go/vt/vttablet/common/config.go @@ -19,13 +19,27 @@ package vttablet import ( "encoding/json" "fmt" + "log/slog" "maps" + "slices" "strconv" "strings" "sync" "time" + + "vitess.io/vitess/go/vt/log" ) +// maxParallelReplicationWorkers bounds --vreplication-parallel-replication-workers +// and its per-workflow override. Each worker holds two MySQL connections per +// workflow (double-buffered apply), plus the main connection, so an unbounded +// value would let a single workflow exhaust the target's max_connections. +const maxParallelReplicationWorkers = 64 + +// warnParallelReplicationWorkersCap rate-limits the flag-clamp warning to once +// per process; GetVReplicationConfigDefaults can be called per workflow. +var warnParallelReplicationWorkersCap sync.Once + /* This file contains the model for all the configuration parameters for VReplication workflows. 
It also provides methods to initialize the default configuration and to override the default configuration with user-provided values. The overrides @@ -37,21 +51,22 @@ import ( // target (vreplication)and the source (vstreamer) side. type VReplicationConfig struct { // Config parameters applicable to the target side (vreplication) - ExperimentalFlags int64 - NetReadTimeout int - NetWriteTimeout int - CopyPhaseDuration time.Duration - RetryDelay time.Duration - MaxTimeToRetryError time.Duration - RelayLogMaxSize int - RelayLogMaxItems int - ReplicaLagTolerance time.Duration - HeartbeatUpdateInterval int - StoreCompressedGTID bool - ParallelInsertWorkers int - TabletTypesStr string - EnableHttpLog bool // Enable the /debug/vrlog endpoint - MaxRowJSONBytes int64 + ExperimentalFlags int64 + NetReadTimeout int + NetWriteTimeout int + CopyPhaseDuration time.Duration + RetryDelay time.Duration + MaxTimeToRetryError time.Duration + RelayLogMaxSize int + RelayLogMaxItems int + ReplicaLagTolerance time.Duration + HeartbeatUpdateInterval int + StoreCompressedGTID bool + ParallelInsertWorkers int + ParallelReplicationWorkers int + TabletTypesStr string + EnableHttpLog bool // Enable the /debug/vrlog endpoint + MaxRowJSONBytes int64 // Config parameters applicable to the source side (vstreamer) // The coresponding Override fields are used to determine if the user has provided a value for the parameter so @@ -83,21 +98,22 @@ func GetVReplicationConfigDefaults(useCached bool) *VReplicationConfig { return DefaultVReplicationConfig } DefaultVReplicationConfig = &VReplicationConfig{ - ExperimentalFlags: vreplicationExperimentalFlags, - NetReadTimeout: vreplicationNetReadTimeout, - NetWriteTimeout: vreplicationNetWriteTimeout, - CopyPhaseDuration: vreplicationCopyPhaseDuration, - RetryDelay: vreplicationRetryDelay, - MaxTimeToRetryError: vreplicationMaxTimeToRetryError, - RelayLogMaxSize: vreplicationRelayLogMaxSize, - RelayLogMaxItems: vreplicationRelayLogMaxItems, - 
ReplicaLagTolerance: vreplicationReplicaLagTolerance, - HeartbeatUpdateInterval: vreplicationHeartbeatUpdateInterval, - StoreCompressedGTID: vreplicationStoreCompressedGTID, - ParallelInsertWorkers: vreplicationParallelInsertWorkers, - TabletTypesStr: vreplicationTabletTypesStr, - EnableHttpLog: vreplicationEnableHttpLog, - MaxRowJSONBytes: vreplicationMaxRowJSONBytes, + ExperimentalFlags: vreplicationExperimentalFlags, + NetReadTimeout: vreplicationNetReadTimeout, + NetWriteTimeout: vreplicationNetWriteTimeout, + CopyPhaseDuration: vreplicationCopyPhaseDuration, + RetryDelay: vreplicationRetryDelay, + MaxTimeToRetryError: vreplicationMaxTimeToRetryError, + RelayLogMaxSize: vreplicationRelayLogMaxSize, + RelayLogMaxItems: vreplicationRelayLogMaxItems, + ReplicaLagTolerance: vreplicationReplicaLagTolerance, + HeartbeatUpdateInterval: vreplicationHeartbeatUpdateInterval, + StoreCompressedGTID: vreplicationStoreCompressedGTID, + ParallelInsertWorkers: vreplicationParallelInsertWorkers, + ParallelReplicationWorkers: cappedParallelReplicationWorkers(vreplicationParallelReplicationWorkers), + TabletTypesStr: vreplicationTabletTypesStr, + EnableHttpLog: vreplicationEnableHttpLog, + MaxRowJSONBytes: vreplicationMaxRowJSONBytes, VStreamPacketSizeOverride: false, VStreamPacketSize: VStreamerDefaultPacketSize, @@ -133,7 +149,21 @@ func NewVReplicationConfig(overrides map[string]string) (*VReplicationConfig, er getError := func(k, v string) string { return fmt.Sprintf("invalid value for %s: %s", k, v) } - for k, v := range overrides { + // Iterate keys in sorted order so the resulting config is deterministic + // when the caller supplies both the hyphen and underscore variants of the + // same setting (e.g. `vstream-packet-size` and `vstream_packet_size`). + // Go map iteration is intentionally randomized; without sorting, last- + // write-wins would produce different results across runs of the same + // vttablet. 
ASCII '-' (0x2D) < '_' (0x5F), so hyphen variants are + // applied first and underscore variants override — matching the + // "UseEffectiveValues" behaviour exercised by the test suite. + keys := make([]string, 0, len(overrides)) + for k := range overrides { + keys = append(keys, k) + } + slices.Sort(keys) + for _, k := range keys { + v := overrides[k] if v == "" { continue } @@ -194,6 +224,20 @@ func NewVReplicationConfig(overrides map[string]string) (*VReplicationConfig, er } else { c.RelayLogMaxItems = value } + case "vreplication-parallel-replication-workers": + value, err := strconv.Atoi(v) + if err != nil { + errors = append(errors, getError(k, v)) + } else if value < 0 { + // Negative values are never meaningful; the flag help text + // documents "<= 1 to disable parallelism" so 0 and 1 both + // fall through to the serial applier path. + errors = append(errors, fmt.Sprintf("invalid value for %s: %d (must be >= 0; 0 or 1 disables parallel apply)", k, value)) + } else if value > maxParallelReplicationWorkers { + errors = append(errors, fmt.Sprintf("invalid value for %s: %d (must be at most %d; each worker holds two MySQL connections per workflow)", k, value, maxParallelReplicationWorkers)) + } else { + c.ParallelReplicationWorkers = value + } case "vreplication-replica-lag-tolerance": value, err := time.ParseDuration(v) if err != nil { @@ -263,31 +307,57 @@ func NewVReplicationConfig(overrides map[string]string) (*VReplicationConfig, er return c, nil } +// SourceOverrides returns only the vstreamer-side overrides that can be sent to source tablets. 
+func (c VReplicationConfig) SourceOverrides() map[string]string { + sourceOverrides := make(map[string]string) + for _, key := range []string{ + "vreplication-experimental-flags", + "vreplication-net-read-timeout", + "vreplication-net-write-timeout", + "vreplication-copy-phase-duration", + } { + if value, ok := c.Overrides[key]; ok && value != "" { + sourceOverrides[key] = value + } + } + if c.VStreamPacketSizeOverride { + sourceOverrides["vstream-packet-size"] = strconv.Itoa(c.VStreamPacketSize) + } + if c.VStreamDynamicPacketSizeOverride { + sourceOverrides["vstream-dynamic-packet-size"] = strconv.FormatBool(c.VStreamDynamicPacketSize) + } + if c.VStreamBinlogRotationThresholdOverride { + sourceOverrides["vstream_binlog_rotation_threshold"] = strconv.FormatInt(c.VStreamBinlogRotationThreshold, 10) + } + return sourceOverrides +} + // Map returns a map of the VReplicationConfig: the keys are the flag names and the values are string representations. // Used in tests to compare the expected and actual configuration values and in validations to check if the user-provided // keys are one of those that are supported. 
func (c VReplicationConfig) Map() map[string]string { return map[string]string{ - "vreplication-experimental-flags": strconv.FormatInt(c.ExperimentalFlags, 10), - "vreplication-net-read-timeout": strconv.Itoa(c.NetReadTimeout), - "vreplication-net-write-timeout": strconv.Itoa(c.NetWriteTimeout), - "vreplication-copy-phase-duration": c.CopyPhaseDuration.String(), - "vreplication-retry-delay": c.RetryDelay.String(), - "vreplication-max-time-to-retry-on-error": c.MaxTimeToRetryError.String(), - "relay-log-max-size": strconv.Itoa(c.RelayLogMaxSize), - "relay_log_max_size": strconv.Itoa(c.RelayLogMaxSize), - "relay-log-max-items": strconv.Itoa(c.RelayLogMaxItems), - "relay_log_max_items": strconv.Itoa(c.RelayLogMaxItems), - "vreplication-replica-lag-tolerance": c.ReplicaLagTolerance.String(), - "vreplication-heartbeat-update-interval": strconv.Itoa(c.HeartbeatUpdateInterval), - "vreplication-store-compressed-gtid": strconv.FormatBool(c.StoreCompressedGTID), - "vreplication-parallel-insert-workers": strconv.Itoa(c.ParallelInsertWorkers), - "vstream-packet-size": strconv.Itoa(c.VStreamPacketSize), - "vstream_packet_size": strconv.Itoa(c.VStreamPacketSize), - "vstream-dynamic-packet-size": strconv.FormatBool(c.VStreamDynamicPacketSize), - "vstream_dynamic_packet_size": strconv.FormatBool(c.VStreamDynamicPacketSize), - "vstream_binlog_rotation_threshold": strconv.FormatInt(c.VStreamBinlogRotationThreshold, 10), - "max-row-json-bytes": strconv.FormatInt(c.MaxRowJSONBytes, 10), + "vreplication-experimental-flags": strconv.FormatInt(c.ExperimentalFlags, 10), + "vreplication-net-read-timeout": strconv.Itoa(c.NetReadTimeout), + "vreplication-net-write-timeout": strconv.Itoa(c.NetWriteTimeout), + "vreplication-copy-phase-duration": c.CopyPhaseDuration.String(), + "vreplication-retry-delay": c.RetryDelay.String(), + "vreplication-max-time-to-retry-on-error": c.MaxTimeToRetryError.String(), + "relay-log-max-size": strconv.Itoa(c.RelayLogMaxSize), + "relay_log_max_size": 
strconv.Itoa(c.RelayLogMaxSize), + "relay-log-max-items": strconv.Itoa(c.RelayLogMaxItems), + "relay_log_max_items": strconv.Itoa(c.RelayLogMaxItems), + "vreplication-replica-lag-tolerance": c.ReplicaLagTolerance.String(), + "vreplication-heartbeat-update-interval": strconv.Itoa(c.HeartbeatUpdateInterval), + "vreplication-store-compressed-gtid": strconv.FormatBool(c.StoreCompressedGTID), + "vreplication-parallel-insert-workers": strconv.Itoa(c.ParallelInsertWorkers), + "vreplication-parallel-replication-workers": strconv.Itoa(c.ParallelReplicationWorkers), + "vstream-packet-size": strconv.Itoa(c.VStreamPacketSize), + "vstream_packet_size": strconv.Itoa(c.VStreamPacketSize), + "vstream-dynamic-packet-size": strconv.FormatBool(c.VStreamDynamicPacketSize), + "vstream_dynamic_packet_size": strconv.FormatBool(c.VStreamDynamicPacketSize), + "vstream_binlog_rotation_threshold": strconv.FormatInt(c.VStreamBinlogRotationThreshold, 10), + "max-row-json-bytes": strconv.FormatInt(c.MaxRowJSONBytes, 10), } } @@ -295,3 +365,21 @@ func (c VReplicationConfig) String() string { s, _ := json.Marshal(c.Map()) return string(s) } + +// cappedParallelReplicationWorkers clamps the tablet-wide flag value to +// maxParallelReplicationWorkers, warning once per process. The per-workflow +// override rejects out-of-range values outright (it has an error path); the +// flag is clamped instead so an over-eager value degrades gracefully rather +// than failing tablet startup. 
+func cappedParallelReplicationWorkers(value int) int { + if value <= maxParallelReplicationWorkers { + return value + } + warnParallelReplicationWorkersCap.Do(func() { + log.Warn("--vreplication-parallel-replication-workers exceeds the maximum; capping", + slog.Int("requested", value), + slog.Int("max", maxParallelReplicationWorkers), + ) + }) + return maxParallelReplicationWorkers +} diff --git a/go/vt/vttablet/common/config_test.go b/go/vt/vttablet/common/config_test.go index a38eb537c78..a8d75159922 100644 --- a/go/vt/vttablet/common/config_test.go +++ b/go/vt/vttablet/common/config_test.go @@ -71,6 +71,7 @@ func TestNewVReplicationConfig(t *testing.T) { HeartbeatUpdateInterval: 2, StoreCompressedGTID: true, ParallelInsertWorkers: 4, + ParallelReplicationWorkers: 1, // flag default VStreamPacketSize: 1024, VStreamDynamicPacketSize: false, VStreamBinlogRotationThreshold: 2048, @@ -125,6 +126,7 @@ func TestNewVReplicationConfig(t *testing.T) { HeartbeatUpdateInterval: DefaultVReplicationConfig.HeartbeatUpdateInterval, StoreCompressedGTID: !DefaultVReplicationConfig.StoreCompressedGTID, ParallelInsertWorkers: DefaultVReplicationConfig.ParallelInsertWorkers, + ParallelReplicationWorkers: DefaultVReplicationConfig.ParallelReplicationWorkers, VStreamPacketSize: DefaultVReplicationConfig.VStreamPacketSize, VStreamDynamicPacketSize: !DefaultVReplicationConfig.VStreamDynamicPacketSize, VStreamBinlogRotationThreshold: DefaultVReplicationConfig.VStreamBinlogRotationThreshold, @@ -160,6 +162,85 @@ func TestNewVReplicationConfig(t *testing.T) { } } +func TestVReplicationConfigSourceOverrides(t *testing.T) { + config, err := NewVReplicationConfig(map[string]string{ + "vreplication-parallel-replication-workers": "4", + "vreplication-parallel-insert-workers": "8", + "vstream-packet-size": "1024", + "vstream_dynamic_packet_size": "false", + "vstream_binlog_rotation_threshold": "2048", + }) + require.NoError(t, err) + + require.Equal(t, map[string]string{ + 
"vstream-packet-size": "1024", + "vstream-dynamic-packet-size": "false", + "vstream_binlog_rotation_threshold": "2048", + }, config.SourceOverrides()) +} + +func TestVReplicationConfigSourceOverridesUseEffectiveValues(t *testing.T) { + config, err := NewVReplicationConfig(map[string]string{ + "vstream-packet-size": "1024", + "vstream_packet_size": "2048", + }) + require.NoError(t, err) + + require.Equal(t, map[string]string{ + "vstream-packet-size": "2048", + }, config.SourceOverrides()) +} + +// TestVReplicationConfigRejectsInvalidParallelWorkers verifies that +// negative worker counts are rejected at config-parse time. 0 is allowed +// (documented as "disable parallel apply"); see +// TestVReplicationConfigAcceptsZeroParallelReplicationWorkers below. +func TestVReplicationConfigRejectsInvalidParallelWorkers(t *testing.T) { + for _, bad := range []string{"-1", "-5"} { + t.Run(bad, func(t *testing.T) { + _, err := NewVReplicationConfig(map[string]string{ + "vreplication-parallel-replication-workers": bad, + }) + require.Error(t, err, "expected error for invalid value %q", bad) + require.Contains(t, err.Error(), "vreplication-parallel-replication-workers") + require.Contains(t, err.Error(), "must be >= 0") + }) + } +} + +func TestVReplicationConfigAcceptsZeroParallelReplicationWorkers(t *testing.T) { + config, err := NewVReplicationConfig(map[string]string{ + "vreplication-parallel-replication-workers": "0", + }) + require.NoError(t, err) + require.Equal(t, 0, config.ParallelReplicationWorkers) +} + +func TestVReplicationConfigSourceOverridesIncludeSourceConsumedWorkflowKeys(t *testing.T) { + config, err := NewVReplicationConfig(map[string]string{ + "vreplication-experimental-flags": "3", + "vreplication-net-read-timeout": "123", + "vreplication-net-write-timeout": "456", + "vreplication-copy-phase-duration": "2h", + "vreplication-parallel-replication-workers": "4", + "vreplication-parallel-insert-workers": "8", + "vstream-packet-size": "1024", + 
"vstream_dynamic_packet_size": "false", + "vstream_binlog_rotation_threshold": "2048", + }) + require.NoError(t, err) + + require.Equal(t, map[string]string{ + "vreplication-experimental-flags": "3", + "vreplication-net-read-timeout": "123", + "vreplication-net-write-timeout": "456", + "vreplication-copy-phase-duration": "2h", + "vstream-packet-size": "1024", + "vstream-dynamic-packet-size": "false", + "vstream_binlog_rotation_threshold": "2048", + }, config.SourceOverrides()) +} + func TestMaxRowJSONBytesOverride(t *testing.T) { InitVReplicationConfigDefaults() cfg, err := NewVReplicationConfig(map[string]string{"max-row-json-bytes": "1048576"}) @@ -200,3 +281,18 @@ func TestVReplicationMaxRowJSONBytesFlagRejectsNegative(t *testing.T) { require.Error(t, err) require.ErrorContains(t, err, "must be non-negative") } + +// TestVReplicationConfigCapsParallelReplicationWorkers pins the upper bound: +// the per-workflow override rejects values above the cap, and the tablet-wide +// flag value is clamped (each worker holds two MySQL connections per +// workflow, so an unbounded value could exhaust the target's max_connections). 
+func TestVReplicationConfigCapsParallelReplicationWorkers(t *testing.T) { + _, err := NewVReplicationConfig(map[string]string{ + "vreplication-parallel-replication-workers": "65", + }) + require.Error(t, err) + require.Contains(t, err.Error(), "must be at most 64") + + require.Equal(t, 64, cappedParallelReplicationWorkers(1000)) + require.Equal(t, 4, cappedParallelReplicationWorkers(4)) +} diff --git a/go/vt/vttablet/common/flags.go b/go/vt/vttablet/common/flags.go index 89daa0bf4db..5c9eb9944af 100644 --- a/go/vt/vttablet/common/flags.go +++ b/go/vt/vttablet/common/flags.go @@ -81,9 +81,10 @@ var ( vreplicationHeartbeatUpdateInterval = 1 - vreplicationStoreCompressedGTID = false - vreplicationParallelInsertWorkers = 1 - vreplicationMaxRowJSONBytes = int64(0) + vreplicationStoreCompressedGTID = false + vreplicationParallelInsertWorkers = 1 + vreplicationParallelReplicationWorkers = 1 + vreplicationMaxRowJSONBytes = int64(0) // VStreamerBinlogRotationThreshold is the threshold, above which we rotate binlogs, before taking a GTID snapshot VStreamerBinlogRotationThreshold = int64(64 * 1024 * 1024) // 64MiB @@ -132,6 +133,8 @@ func registerFlags(fs *pflag.FlagSet) { fs.IntVar(&vreplicationParallelInsertWorkers, "vreplication-parallel-insert-workers", vreplicationParallelInsertWorkers, "Number of parallel insertion workers to use during copy phase. Set <= 1 to disable parallelism, or > 1 to enable concurrent insertion during copy phase.") + fs.IntVar(&vreplicationParallelReplicationWorkers, "vreplication-parallel-replication-workers", vreplicationParallelReplicationWorkers, "Number of parallel replication workers to use during the replication phase. 
Set <= 1 to disable parallelism, or > 1 to enable concurrent apply.") + fs.Uint64Var(&mysql.ZstdInMemoryDecompressorMaxSize, "binlog-in-memory-decompressor-max-size", mysql.ZstdInMemoryDecompressorMaxSize, "This value sets the uncompressed transaction payload size at which we switch from in-memory buffer based decompression to the slower streaming mode.") fs.BoolVar(&vreplicationEnableHttpLog, "vreplication-enable-http-log", vreplicationEnableHttpLog, "Enable the /debug/vrlog HTTP endpoint, which will produce a log of the events replicated on primary tablets in the target keyspace by all VReplication workflows that are in the running/replicating phase.") diff --git a/go/vt/vttablet/common/flags_test.go b/go/vt/vttablet/common/flags_test.go new file mode 100644 index 00000000000..f35c8b1e12e --- /dev/null +++ b/go/vt/vttablet/common/flags_test.go @@ -0,0 +1,33 @@ +/* +Copyright 2026 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package vttablet + +import ( + "testing" + + "github.com/spf13/pflag" + "github.com/stretchr/testify/require" +) + +func TestRegisterFlags_ParallelReplicationWorkersUsage(t *testing.T) { + fs := pflag.NewFlagSet("test", pflag.ContinueOnError) + registerFlags(fs) + + flag := fs.Lookup("vreplication-parallel-replication-workers") + require.NotNil(t, flag) + require.NotContains(t, flag.Usage, "Experimental") +} diff --git a/go/vt/vttablet/onlineddl/executor.go b/go/vt/vttablet/onlineddl/executor.go index ff17d4c1e7a..db35cc1a01e 100644 --- a/go/vt/vttablet/onlineddl/executor.go +++ b/go/vt/vttablet/onlineddl/executor.go @@ -22,8 +22,10 @@ package onlineddl import ( "context" + "encoding/json" "errors" "fmt" + "log/slog" "os" "slices" "strconv" @@ -56,6 +58,7 @@ import ( "vitess.io/vitess/go/vt/topo/topoproto" "vitess.io/vitess/go/vt/utils" "vitess.io/vitess/go/vt/vterrors" + vttablet "vitess.io/vitess/go/vt/vttablet/common" "vitess.io/vitess/go/vt/vttablet/tabletmanager/vreplication" "vitess.io/vitess/go/vt/vttablet/tabletserver/connpool" "vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv" @@ -66,6 +69,7 @@ import ( binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" querypb "vitess.io/vitess/go/vt/proto/query" topodatapb "vitess.io/vitess/go/vt/proto/topodata" + vtctldatapb "vitess.io/vitess/go/vt/proto/vtctldata" vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" ) @@ -1028,6 +1032,12 @@ func (e *Executor) cutOverVReplMigration(ctx context.Context, s *VReplStream, sh go log.Info(fmt.Sprintf("cutOverVReplMigration %v: unbuffered queries", s.workflow)) }) } + + shouldWaitForParallelApply, err := shouldPreBufferWaitForParallelApply(s) + if err != nil { + return vterrors.Wrapf(err, "failed parsing vreplication workflow options before pre-buffer wait") + } + e.updateMigrationStage(ctx, onlineDDL.UUID, "buffering queries") // stop writes on source: err = toggleBuffering(true) @@ -1075,6 +1085,23 @@ func (e *Executor) cutOverVReplMigration(ctx context.Context, s 
*VReplStream, sh return vterrors.Wrapf(err, "failed locking tables") } + if shouldWaitForParallelApply { + // With writes blocked on the original table, the stream can catch up without a queued + // RENAME holding conflicting metadata locks on the shadow table. + e.updateMigrationStage(ctx, onlineDDL.UUID, "post-lock: waiting for vreplication to catch up") + preRenamePos, err := e.primaryPosition(ctx) + if err != nil { + return vterrors.Wrapf(err, "failed reading primary position before renaming") + } + if s, err = e.readVReplStream(ctx, s.workflow, false); err != nil { + return vterrors.Wrapf(err, "failed reading vreplication stream before renaming") + } + if err := waitForPos(s, preRenamePos, onlineDDL.CutOverThreshold); err != nil { + return vterrors.Wrapf(err, "failed waiting for vreplication to catch up before renaming") + } + go log.Info("cutOverVReplMigration: post-lock waitForPos reached", slog.String("workflow", s.workflow), slog.String("position", replication.EncodePosition(preRenamePos))) + } + e.updateMigrationStage(ctx, onlineDDL.UUID, "renaming tables") killWhileRenamingContext, killWhileRenamingCancel := context.WithCancel(ctx) defer killWhileRenamingCancel() @@ -1118,12 +1145,20 @@ func (e *Executor) cutOverVReplMigration(ctx context.Context, s *VReplStream, sh return vterrors.Wrapf(err, "failed reading vreplication table after locking") } - e.updateMigrationStage(ctx, onlineDDL.UUID, "waiting for post-lock pos: %v", replication.EncodePosition(postWritesPos)) - if err := waitForPos(s, postWritesPos, onlineDDL.CutOverThreshold); err != nil { - e.updateMigrationStage(ctx, onlineDDL.UUID, "timeout while waiting for post-lock pos: %v", err) - return vterrors.Wrapf(err, "failed waiting for pos after locking") + // The parallel-apply pre-rename wait happens in-lock in the production + // branch above, so the post-lock wait is redundant there. 
The test-suite + // branch has no queued RENAME (and therefore no MDL conflict to avoid) + // but also never performed the in-lock wait, so it must still wait here — + // otherwise StopVReplication below can fire before the stream has caught + // up to postWritesPos and the cutover loses tail writes. + if !shouldWaitForParallelApply || isVreplicationTestSuite { + e.updateMigrationStage(ctx, onlineDDL.UUID, "waiting for post-lock pos: %v", replication.EncodePosition(postWritesPos)) + if err := waitForPos(s, postWritesPos, onlineDDL.CutOverThreshold); err != nil { + e.updateMigrationStage(ctx, onlineDDL.UUID, "timeout while waiting for post-lock pos: %v", err) + return vterrors.Wrapf(err, "failed waiting for pos after locking") + } + go log.Info(fmt.Sprintf("cutOverVReplMigration %v: done waiting for position %v", s.workflow, replication.EncodePosition(postWritesPos))) } - go log.Info(fmt.Sprintf("cutOverVReplMigration %v: done waiting for position %v", s.workflow, replication.EncodePosition(postWritesPos))) // Stop vreplication e.updateMigrationStage(ctx, onlineDDL.UUID, "stopping vreplication") if _, err := e.vreplicationExec(ctx, tablet.Tablet, binlogplayer.StopVReplication(s.id, "stopped for online DDL cutover")); err != nil { @@ -1211,6 +1246,42 @@ func (e *Executor) cutOverVReplMigration(ctx context.Context, s *VReplStream, sh // deferred function will re-enable writes now } +// shouldPreBufferWaitForParallelApply reports whether the VReplication +// stream backing this migration runs with parallel apply enabled (>1 +// worker), taking workflow-level config overrides into account. The +// cut-over path consults this because parallel apply introduces a +// reorder buffer that must drain cleanly before tables can be swapped; +// the serial applier has no such buffer and does not need the extra +// wait. 
+func shouldPreBufferWaitForParallelApply(s *VReplStream) (bool, error) { + workers := vttablet.InitVReplicationConfigDefaults().ParallelReplicationWorkers + if s == nil || s.options == "" { + return workers > 1, nil + } + + var options vtctldatapb.WorkflowOptions + if err := json.Unmarshal([]byte(s.options), &options); err != nil { + return false, err + } + if len(options.Config) == 0 { + return workers > 1, nil + } + + workerOverrides := map[string]string{} + if value, ok := options.Config["vreplication-parallel-replication-workers"]; ok { + workerOverrides["vreplication-parallel-replication-workers"] = value + } + if len(workerOverrides) == 0 { + return workers > 1, nil + } + + config, err := vttablet.NewVReplicationConfig(workerOverrides) + if err != nil { + return false, err + } + return config.ParallelReplicationWorkers > 1, nil +} + // initMigrationSQLMode sets sql_mode according to DDL strategy, and returns a function that // restores sql_mode to original state func (e *Executor) initMigrationSQLMode(ctx context.Context, onlineDDL *schema.OnlineDDL, conn *dbconnpool.DBConnection) (deferFunc func(), err error) { @@ -3112,6 +3183,7 @@ func (e *Executor) readVReplStream(ctx context.Context, uuid string, okIfMissing id: row.AsInt32("id", 0), workflow: row.AsString("workflow", ""), source: row.AsString("source", ""), + options: row.AsString("options", ""), pos: row.AsString("pos", ""), timeUpdated: row.AsInt64("time_updated", 0), timeHeartbeat: row.AsInt64("time_heartbeat", 0), diff --git a/go/vt/vttablet/onlineddl/executor_test.go b/go/vt/vttablet/onlineddl/executor_test.go index 08485a083f9..46b359c07e1 100644 --- a/go/vt/vttablet/onlineddl/executor_test.go +++ b/go/vt/vttablet/onlineddl/executor_test.go @@ -22,6 +22,9 @@ package onlineddl import ( "context" + "fmt" + "strings" + "sync" "testing" "time" @@ -34,13 +37,20 @@ import ( "vitess.io/vitess/go/vt/dbconfigs" "vitess.io/vitess/go/vt/dbconnpool" "vitess.io/vitess/go/vt/schema" + 
"vitess.io/vitess/go/vt/topo" "vitess.io/vitess/go/vt/topo/memorytopo" "vitess.io/vitess/go/vt/vtenv" + "vitess.io/vitess/go/vt/vterrors" + vttablet "vitess.io/vitess/go/vt/vttablet/common" + "vitess.io/vitess/go/vt/vttablet/tabletserver/connpool" "vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv" "vitess.io/vitess/go/vt/vttablet/tmclient" "vitess.io/vitess/go/vt/vttablet/tmclienttest" + binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" + querypb "vitess.io/vitess/go/vt/proto/query" topodatapb "vitess.io/vitess/go/vt/proto/topodata" + vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" ) func TestShouldCutOverAccordingToBackoff(t *testing.T) { @@ -349,6 +359,511 @@ func TestExecuteDirectlySetsLockWaitTimeout(t *testing.T) { assert.Contains(t, queryLog, "set @@session.lock_wait_timeout=@lock_wait_timeout") } +func TestShouldPreBufferWaitForParallelApply(t *testing.T) { + config := vttablet.InitVReplicationConfigDefaults() + savedParallelWorkers := config.ParallelReplicationWorkers + t.Cleanup(func() { + config.ParallelReplicationWorkers = savedParallelWorkers + }) + + config.ParallelReplicationWorkers = 1 + shouldWait, err := shouldPreBufferWaitForParallelApply(nil) + require.NoError(t, err) + require.False(t, shouldWait) + + config.ParallelReplicationWorkers = 2 + shouldWait, err = shouldPreBufferWaitForParallelApply(nil) + require.NoError(t, err) + require.True(t, shouldWait) +} + +func TestShouldPreBufferWaitForParallelApplyPrefersWorkflowOverride(t *testing.T) { + config := vttablet.InitVReplicationConfigDefaults() + savedParallelWorkers := config.ParallelReplicationWorkers + t.Cleanup(func() { + config.ParallelReplicationWorkers = savedParallelWorkers + }) + + config.ParallelReplicationWorkers = 1 + + stream := &VReplStream{} + stream.bls = &binlogdatapb.BinlogSource{} + stream.options = `{"config":{"vreplication-parallel-replication-workers":"2"}}` + + shouldWait, err := shouldPreBufferWaitForParallelApply(stream) + require.NoError(t, err) + 
require.True(t, shouldWait) +} + +func TestShouldPreBufferWaitForParallelApplyRejectsInvalidWorkflowOverride(t *testing.T) { + config := vttablet.InitVReplicationConfigDefaults() + savedParallelWorkers := config.ParallelReplicationWorkers + t.Cleanup(func() { + config.ParallelReplicationWorkers = savedParallelWorkers + }) + + config.ParallelReplicationWorkers = 2 + + stream := &VReplStream{} + stream.bls = &binlogdatapb.BinlogSource{} + stream.options = `{"config":{"vreplication-parallel-replication-workers":"not-an-int"}}` + + _, err := shouldPreBufferWaitForParallelApply(stream) + require.Error(t, err) + require.ErrorContains(t, err, "invalid value for vreplication-parallel-replication-workers") +} + +func TestShouldPreBufferWaitForParallelApplyIgnoresUnknownWorkflowOverrideKeys(t *testing.T) { + config := vttablet.InitVReplicationConfigDefaults() + savedParallelWorkers := config.ParallelReplicationWorkers + t.Cleanup(func() { + config.ParallelReplicationWorkers = savedParallelWorkers + }) + + config.ParallelReplicationWorkers = 2 + + stream := &VReplStream{} + stream.bls = &binlogdatapb.BinlogSource{} + stream.options = `{"config":{"user":"admin","password":"secret"}}` + + shouldWait, err := shouldPreBufferWaitForParallelApply(stream) + require.NoError(t, err) + require.True(t, shouldWait) +} + +type recordingTabletManagerClient struct { + tmclient.TabletManagerClient + + mu sync.Mutex + waitCalls []string + waitErr error + waitErrs []error + refreshStateCalled int +} + +func (c *recordingTabletManagerClient) Close() {} + +func (c *recordingTabletManagerClient) ReloadSchema(ctx context.Context, tablet *topodatapb.Tablet, waitPosition string) error { + return nil +} + +func (c *recordingTabletManagerClient) RefreshState(ctx context.Context, tablet *topodatapb.Tablet) error { + c.mu.Lock() + defer c.mu.Unlock() + c.refreshStateCalled++ + return nil +} + +func (c *recordingTabletManagerClient) VReplicationWaitForPos(ctx context.Context, tablet *topodatapb.Tablet, 
id int32, pos string) error { + c.mu.Lock() + defer c.mu.Unlock() + c.waitCalls = append(c.waitCalls, pos) + if len(c.waitErrs) > 0 { + err := c.waitErrs[0] + c.waitErrs = c.waitErrs[1:] + if err != nil { + return err + } + return nil + } + if c.waitErr != nil { + return c.waitErr + } + return nil +} + +func (c *recordingTabletManagerClient) VReplicationExec(ctx context.Context, tablet *topodatapb.Tablet, query string) (*querypb.QueryResult, error) { + return &querypb.QueryResult{}, nil +} + +func (c *recordingTabletManagerClient) WaitCalls() []string { + c.mu.Lock() + defer c.mu.Unlock() + return append([]string(nil), c.waitCalls...) +} + +func (c *recordingTabletManagerClient) RefreshStateCalled() int { + c.mu.Lock() + defer c.mu.Unlock() + return c.refreshStateCalled +} + +func newCutoverTestExecutor(t *testing.T, db *fakesqldb.DB, ts *topo.Server, alias *topodatapb.TabletAlias) *Executor { + t.Helper() + + cfg := tabletenv.NewDefaultConfig() + cfg.DB = dbconfigs.NewTestDBConfigs(*db.ConnParams(), *db.ConnParams(), db.ConnParams().DbName) + venv := vtenv.NewTestEnv() + + executor := &Executor{ + env: tabletenv.NewEnv(venv, cfg, "ExecutorTest"), + ts: ts, + tabletAlias: alias, + ticks: timer.NewTimer(migrationCheckInterval), + isPreparedPoolEmpty: func(tableName string) bool { + return false + }, + } + executor.execQuery = func(ctx context.Context, query string) (*sqltypes.Result, error) { + loweredQuery := strings.ToLower(query) + switch { + case strings.Contains(loweredQuery, "from _vt.schema_migrations"): + return sqltypes.MakeTestResult( + sqltypes.MakeTestFields("migration_uuid|keyspace|mysql_table|mysql_schema|migration_statement|strategy|options|migration_status|retries|ready_to_complete|was_ready_to_complete|tablet|migration_context|cutover_threshold_seconds|shadow_analyzed_timestamp", "varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|int64|int64|int64|varchar|varchar|int64|varchar"), + strings.Join([]string{ + t.Name(), + "ks", + "t1", + 
db.ConnParams().DbName, + "alter table t1 add column i int", + "vitess", + "-vreplication-test-suite", + "running", + "0", + "1", + "1", + "cell-0000000001", + "", + "5", + "null", + }, "|"), + ), nil + case strings.Contains(loweredQuery, "from _vt.vreplication_log"): + return &sqltypes.Result{Fields: sqltypes.MakeTestFields("state|message", "varchar|varchar")}, nil + case strings.Contains(loweredQuery, "from _vt.vreplication"): + return sqltypes.MakeTestResult( + sqltypes.MakeTestFields("id|workflow|source|options|pos|time_updated|transaction_timestamp|time_heartbeat|time_throttled|component_throttled|reason_throttled|state|message|rows_copied", "int64|varchar|varchar|varchar|varchar|int64|int64|int64|int64|varchar|varchar|varchar|varchar|int64"), + "1|"+t.Name()+"|keyspace:\"ks\" shard:\"0\" filter:{rules:{match:\"_vt_HOLD_"+t.Name()+"\"}}|{\"config\":{\"vreplication-parallel-replication-workers\":\"2\"}}|MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-4|1|1|1|0|||Running||10", + ), nil + default: + return &sqltypes.Result{}, nil + } + } + executor.pool = connpool.NewPool(executor.env, "OnlineDDLExecutorPoolTest", tabletenv.ConnPoolConfig{ + Size: databasePoolSize, + IdleTimeout: executor.env.Config().OltpReadPool.IdleTimeout, + }) + executor.pool.Open(executor.env.Config().DB.AppWithDB(), executor.env.Config().DB.DbaWithDB(), executor.env.Config().DB.AppDebugWithDB()) + t.Cleanup(executor.pool.Close) + + return executor +} + +func TestCutOverVReplMigrationBuffersBeforeParallelApplyCatchUpWait(t *testing.T) { + ctx := t.Context() + db := fakesqldb.New(t) + defer db.Close() + protocolName := t.Name() + resetProtocol := tmclienttest.SetProtocol(t.Name(), protocolName) + defer resetProtocol() + + waitErr := vterrors.Errorf(vtrpcpb.Code_DEADLINE_EXCEEDED, "vreplication still catching up") + tmClient := &recordingTabletManagerClient{waitErr: waitErr} + tmclient.RegisterTabletManagerClientFactory(protocolName, func() tmclient.TabletManagerClient { + return tmClient + }) 
+ + alias := &topodatapb.TabletAlias{Cell: "cell", Uid: 1} + ts := memorytopo.NewServer(ctx, "cell") + err := ts.CreateTablet(ctx, &topodatapb.Tablet{ + Alias: alias, + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + }) + require.NoError(t, err) + + addSessionTimeoutQueries := func(lockWaitSeconds int64) { + db.AddQuery("set @lock_wait_timeout=@@session.lock_wait_timeout", &sqltypes.Result{}) + db.AddQuery(fmt.Sprintf("set @@session.lock_wait_timeout=%d", lockWaitSeconds), &sqltypes.Result{}) + db.AddQuery("set @wait_timeout=@@session.wait_timeout", &sqltypes.Result{}) + db.AddQuery(fmt.Sprintf("set @@session.wait_timeout=%d", int64(waitTimeoutDuringCutOver.Seconds())), &sqltypes.Result{}) + db.AddQuery("set @@session.wait_timeout=@wait_timeout", &sqltypes.Result{}) + db.AddQuery("set @@session.lock_wait_timeout=@lock_wait_timeout", &sqltypes.Result{}) + } + addSessionTimeoutQueries(15) + addSessionTimeoutQueries(10) + + db.AddQuery("show global variables like 'rename_table_preserve_foreign_key'", &sqltypes.Result{}) + db.AddQuery("SELECT @@global.gtid_executed", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("@@global.gtid_executed", "varchar"), + "3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + )) + db.AddQueryPattern(`(?is)^drop table if exists .*`, &sqltypes.Result{}) + db.AddQueryPattern(`(?is)^unlock tables$`, &sqltypes.Result{}) + db.AddQueryPattern(`(?is)^kill \d+$`, &sqltypes.Result{}) + + executor := newCutoverTestExecutor(t, db, ts, alias) + + bufferEvents := []bool{} + var bufferMu sync.Mutex + executor.toggleBufferTableFunc = func(cancelCtx context.Context, tableName string, timeout time.Duration, bufferQueries bool) { + bufferMu.Lock() + defer bufferMu.Unlock() + bufferEvents = append(bufferEvents, bufferQueries) + } + + stream := &VReplStream{ + id: 1, + workflow: t.Name(), + options: `{"config":{"vreplication-parallel-replication-workers":"2"}}`, + bls: &binlogdatapb.BinlogSource{ + Filter: &binlogdatapb.Filter{Rules: 
[]*binlogdatapb.Rule{{Match: "_vt_HOLD_" + t.Name()}}}, + }, + } + + err = executor.cutOverVReplMigration(ctx, stream, true) + require.Error(t, err) + require.ErrorContains(t, err, "checking prepared pool for table") + + bufferMu.Lock() + bufferEventsCopy := append([]bool(nil), bufferEvents...) + bufferMu.Unlock() + assert.Equal(t, []bool{true, false}, bufferEventsCopy) + assert.Equal(t, 1, tmClient.RefreshStateCalled()) + assert.Len(t, tmClient.WaitCalls(), 0) +} + +func TestCutOverVReplMigrationWaitsForParallelApplyAfterLocking(t *testing.T) { + ctx := t.Context() + db := fakesqldb.New(t) + defer db.Close() + params := db.ConnParams() + + protocolName := t.Name() + resetProtocol := tmclienttest.SetProtocol(t.Name(), protocolName) + defer resetProtocol() + + waitErr := vterrors.Errorf(vtrpcpb.Code_DEADLINE_EXCEEDED, "vreplication still catching up") + tmClient := &recordingTabletManagerClient{waitErrs: []error{nil, waitErr}} + tmclient.RegisterTabletManagerClientFactory(protocolName, func() tmclient.TabletManagerClient { + return tmClient + }) + + alias := &topodatapb.TabletAlias{Cell: "cell", Uid: 1} + ts := memorytopo.NewServer(ctx, "cell") + err := ts.CreateTablet(ctx, &topodatapb.Tablet{ + Alias: alias, + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + }) + require.NoError(t, err) + + addSessionTimeoutQueries := func(lockWaitSeconds int64) { + db.AddQuery("set @lock_wait_timeout=@@session.lock_wait_timeout", &sqltypes.Result{}) + db.AddQuery(fmt.Sprintf("set @@session.lock_wait_timeout=%d", lockWaitSeconds), &sqltypes.Result{}) + db.AddQuery("set @wait_timeout=@@session.wait_timeout", &sqltypes.Result{}) + db.AddQuery(fmt.Sprintf("set @@session.wait_timeout=%d", int64(waitTimeoutDuringCutOver.Seconds())), &sqltypes.Result{}) + db.AddQuery("set @@session.wait_timeout=@wait_timeout", &sqltypes.Result{}) + db.AddQuery("set @@session.lock_wait_timeout=@lock_wait_timeout", &sqltypes.Result{}) + } + addSessionTimeoutQueries(15) + 
addSessionTimeoutQueries(10) + + db.AddQuery("show global variables like 'rename_table_preserve_foreign_key'", &sqltypes.Result{}) + db.AddQuery("SELECT @@global.gtid_executed", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("@@global.gtid_executed", "varchar"), + "3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + )) + db.AddQueryPattern(`(?is)^update _vt\.schema_migrations set artifacts=.*$`, &sqltypes.Result{}) + db.AddQueryPattern(`(?is)^drop table if exists .*`, &sqltypes.Result{}) + db.AddQueryPattern(`(?is)^create table if not exists .*`, &sqltypes.Result{}) + db.AddQueryPattern(`(?is)^lock tables .*`, &sqltypes.Result{}) + db.AddQueryPattern(`(?is)^unlock tables$`, &sqltypes.Result{}) + db.AddQueryPattern(`(?is)^kill \d+$`, &sqltypes.Result{}) + + executor := newCutoverTestExecutor(t, db, ts, alias) + executor.execQuery = func(ctx context.Context, query string) (*sqltypes.Result, error) { + loweredQuery := strings.ToLower(query) + switch { + case strings.Contains(loweredQuery, "from _vt.schema_migrations"): + return sqltypes.MakeTestResult( + sqltypes.MakeTestFields("migration_uuid|keyspace|mysql_table|mysql_schema|migration_statement|strategy|options|migration_status|retries|ready_to_complete|was_ready_to_complete|tablet|migration_context|cutover_threshold_seconds|shadow_analyzed_timestamp", "varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|int64|int64|int64|varchar|varchar|int64|varchar"), + strings.Join([]string{ + t.Name(), + "ks", + "t1", + params.DbName, + "alter table t1 add column i int", + "vitess", + "", + "running", + "0", + "1", + "1", + "cell-0000000001", + "", + "5", + "done", + }, "|"), + ), nil + case strings.Contains(loweredQuery, "select id, info as info from information_schema.processlist"): + return &sqltypes.Result{Fields: sqltypes.MakeTestFields("id|info", "int64|varchar")}, nil + case strings.Contains(loweredQuery, "from _vt.vreplication_log"): + return &sqltypes.Result{Fields: sqltypes.MakeTestFields("state|message", 
"varchar|varchar")}, nil + case strings.Contains(loweredQuery, "from _vt.vreplication"): + return sqltypes.MakeTestResult( + sqltypes.MakeTestFields("id|workflow|source|options|pos|time_updated|transaction_timestamp|time_heartbeat|time_throttled|component_throttled|reason_throttled|state|message|rows_copied", "int64|varchar|varchar|varchar|varchar|int64|int64|int64|int64|varchar|varchar|varchar|varchar|int64"), + "1|"+t.Name()+"|keyspace:\"ks\" shard:\"0\" filter:{rules:{match:\"_vt_HOLD_"+t.Name()+"\"}}|{\"config\":{\"vreplication-parallel-replication-workers\":\"2\"}}|MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-4|1|1|1|0|||Running||10", + ), nil + default: + return &sqltypes.Result{}, nil + } + } + + bufferEvents := []bool{} + var bufferMu sync.Mutex + executor.toggleBufferTableFunc = func(cancelCtx context.Context, tableName string, timeout time.Duration, bufferQueries bool) { + bufferMu.Lock() + defer bufferMu.Unlock() + bufferEvents = append(bufferEvents, bufferQueries) + } + + stream := &VReplStream{ + id: 1, + workflow: t.Name(), + options: `{"config":{"vreplication-parallel-replication-workers":"2"}}`, + bls: &binlogdatapb.BinlogSource{ + Filter: &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: "_vt_HOLD_" + t.Name()}}}, + }, + } + + err = executor.cutOverVReplMigration(ctx, stream, false) + require.Error(t, err) + require.ErrorContains(t, err, "failed waiting for vreplication to catch up before renaming") + + bufferMu.Lock() + bufferEventsCopy := append([]bool(nil), bufferEvents...) 
+ bufferMu.Unlock() + assert.Equal(t, []bool{true, false}, bufferEventsCopy) + assert.Equal(t, 1, tmClient.RefreshStateCalled()) + assert.Len(t, tmClient.WaitCalls(), 2) + assert.NotContains(t, db.QueryLog(), "rename table") +} + +func TestCutOverVReplMigrationSkipsSecondPostLockWaitAfterParallelApplyCatchUp(t *testing.T) { + ctx := t.Context() + db := fakesqldb.New(t) + defer db.Close() + params := db.ConnParams() + + protocolName := t.Name() + resetProtocol := tmclienttest.SetProtocol(t.Name(), protocolName) + defer resetProtocol() + + waitErr := vterrors.Errorf(vtrpcpb.Code_DEADLINE_EXCEEDED, "unexpected extra wait after parallel apply catch up") + tmClient := &recordingTabletManagerClient{waitErrs: []error{nil, nil, waitErr}} + tmclient.RegisterTabletManagerClientFactory(protocolName, func() tmclient.TabletManagerClient { + return tmClient + }) + + alias := &topodatapb.TabletAlias{Cell: "cell", Uid: 1} + ts := memorytopo.NewServer(ctx, "cell") + err := ts.CreateTablet(ctx, &topodatapb.Tablet{ + Alias: alias, + Keyspace: "ks", + Shard: "0", + Type: topodatapb.TabletType_PRIMARY, + }) + require.NoError(t, err) + + addSessionTimeoutQueries := func(lockWaitSeconds int64) { + db.AddQuery("set @lock_wait_timeout=@@session.lock_wait_timeout", &sqltypes.Result{}) + db.AddQuery(fmt.Sprintf("set @@session.lock_wait_timeout=%d", lockWaitSeconds), &sqltypes.Result{}) + db.AddQuery("set @wait_timeout=@@session.wait_timeout", &sqltypes.Result{}) + db.AddQuery(fmt.Sprintf("set @@session.wait_timeout=%d", int64(waitTimeoutDuringCutOver.Seconds())), &sqltypes.Result{}) + db.AddQuery("set @@session.wait_timeout=@wait_timeout", &sqltypes.Result{}) + db.AddQuery("set @@session.lock_wait_timeout=@lock_wait_timeout", &sqltypes.Result{}) + } + addSessionTimeoutQueries(15) + addSessionTimeoutQueries(10) + + db.AddQuery("show global variables like 'rename_table_preserve_foreign_key'", &sqltypes.Result{}) + db.AddQuery("SELECT @@global.gtid_executed", sqltypes.MakeTestResult( + 
sqltypes.MakeTestFields("@@global.gtid_executed", "varchar"), + "3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + )) + db.AddQueryPattern(`(?is)^update _vt\.schema_migrations set artifacts=.*$`, &sqltypes.Result{}) + db.AddQueryPattern(`(?is)^drop table if exists .*`, &sqltypes.Result{}) + db.AddQueryPattern(`(?is)^create table if not exists .*`, &sqltypes.Result{}) + db.AddQueryPattern(`(?is)^lock tables .*`, &sqltypes.Result{}) + db.AddQueryPattern(`(?is)^rename table .*`, &sqltypes.Result{}) + db.AddQueryPattern(`(?is)^drop table .*`, &sqltypes.Result{}) + db.AddQueryPattern(`(?is)^unlock tables$`, &sqltypes.Result{}) + db.AddQueryPattern(`(?is)^kill \d+$`, &sqltypes.Result{}) + + executor := newCutoverTestExecutor(t, db, ts, alias) + executor.execQuery = func(ctx context.Context, query string) (*sqltypes.Result, error) { + loweredQuery := strings.ToLower(query) + switch { + case strings.Contains(loweredQuery, "from _vt.schema_migrations"): + return sqltypes.MakeTestResult( + sqltypes.MakeTestFields("migration_uuid|keyspace|mysql_table|mysql_schema|migration_statement|strategy|options|migration_status|retries|ready_to_complete|was_ready_to_complete|tablet|migration_context|cutover_threshold_seconds|shadow_analyzed_timestamp", "varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|int64|int64|int64|varchar|varchar|int64|varchar"), + strings.Join([]string{ + t.Name(), + "ks", + "t1", + params.DbName, + "alter table t1 add column i int", + "vitess", + "", + "running", + "0", + "1", + "1", + "cell-0000000001", + "", + "5", + "done", + }, "|"), + ), nil + case strings.Contains(loweredQuery, "select id, info as info from information_schema.processlist"): + return sqltypes.MakeTestResult( + sqltypes.MakeTestFields("id|info", "int64|varchar"), + "3|rename table `t1` to `_vt_hld_dummy`", + ), nil + case strings.Contains(loweredQuery, "from _vt.vreplication_log"): + return &sqltypes.Result{Fields: sqltypes.MakeTestFields("state|message", "varchar|varchar")}, nil + 
case strings.Contains(loweredQuery, "from _vt.vreplication"): + return sqltypes.MakeTestResult( + sqltypes.MakeTestFields("id|workflow|source|options|pos|time_updated|transaction_timestamp|time_heartbeat|time_throttled|component_throttled|reason_throttled|state|message|rows_copied", "int64|varchar|varchar|varchar|varchar|int64|int64|int64|int64|varchar|varchar|varchar|varchar|int64"), + "1|"+t.Name()+"|keyspace:\"ks\" shard:\"0\" filter:{rules:{match:\"_vt_HOLD_"+t.Name()+"\"}}|{\"config\":{\"vreplication-parallel-replication-workers\":\"2\"}}|MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-4|1|1|1|0|||Running||10", + ), nil + default: + return &sqltypes.Result{}, nil + } + } + + bufferEvents := []bool{} + var bufferMu sync.Mutex + executor.toggleBufferTableFunc = func(cancelCtx context.Context, tableName string, timeout time.Duration, bufferQueries bool) { + bufferMu.Lock() + defer bufferMu.Unlock() + bufferEvents = append(bufferEvents, bufferQueries) + } + + stream := &VReplStream{ + id: 1, + workflow: t.Name(), + options: `{"config":{"vreplication-parallel-replication-workers":"2"}}`, + bls: &binlogdatapb.BinlogSource{ + Filter: &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: "_vt_HOLD_" + t.Name()}}}, + }, + } + + err = executor.cutOverVReplMigration(ctx, stream, false) + require.NoError(t, err) + + bufferMu.Lock() + bufferEventsCopy := append([]bool(nil), bufferEvents...) 
+ bufferMu.Unlock() + assert.Equal(t, []bool{true, false}, bufferEventsCopy) + assert.Equal(t, 1, tmClient.RefreshStateCalled()) + assert.Len(t, tmClient.WaitCalls(), 2) + assert.Contains(t, strings.ToLower(db.QueryLog()), "rename table") +} + type fakeTabletManagerClient struct { tmclient.TabletManagerClient } diff --git a/go/vt/vttablet/onlineddl/schema.go b/go/vt/vttablet/onlineddl/schema.go index f6fc0606efc..9210eed6bd7 100644 --- a/go/vt/vttablet/onlineddl/schema.go +++ b/go/vt/vttablet/onlineddl/schema.go @@ -529,6 +529,7 @@ const ( id, workflow, source, + options, pos, time_updated, transaction_timestamp, diff --git a/go/vt/vttablet/onlineddl/vrepl.go b/go/vt/vttablet/onlineddl/vrepl.go index 18fa34790f4..001b9a690b9 100644 --- a/go/vt/vttablet/onlineddl/vrepl.go +++ b/go/vt/vttablet/onlineddl/vrepl.go @@ -50,6 +50,7 @@ type VReplStream struct { id int32 workflow string source string + options string pos string timeUpdated int64 timeHeartbeat int64 diff --git a/go/vt/vttablet/tabletmanager/restore_test.go b/go/vt/vttablet/tabletmanager/restore_test.go index 7eabaa6b584..f5b059809f1 100644 --- a/go/vt/vttablet/tabletmanager/restore_test.go +++ b/go/vt/vttablet/tabletmanager/restore_test.go @@ -17,6 +17,7 @@ limitations under the License. 
package tabletmanager import ( + "context" "errors" "os" "path/filepath" @@ -153,3 +154,50 @@ func TestInvokeRestoreDoneHook_Timestamps(t *testing.T) { } } } + +func TestDisableReplicationRecoversFromRecoverableReplicationInitError(t *testing.T) { + fakeMysqlDaemon := newTestMysqlDaemon(t, 1) + fakeMysqlDaemon.ExpectedExecuteSuperQueryList = []string{ + "STOP REPLICA", + "FAKE RESET REPLICA ALL", + "FAKE SET SOURCE", + "FAKE RESET REPLICA ALL", + "FAKE SET SOURCE", + "START REPLICA", + } + + setSourceCalls := 0 + fakeMysqlDaemon.SetReplicationSourceFunc = func(ctx context.Context, host string, port int32, heartbeatInterval float64, stopReplicationBefore bool, startReplicationAfter bool) error { + setSourceCalls++ + + require.Equal(t, "//", host) + require.Zero(t, port) + require.Zero(t, heartbeatInterval) + require.False(t, stopReplicationBefore) + require.False(t, startReplicationAfter) + + if setSourceCalls == 1 { + require.NoError(t, fakeMysqlDaemon.ExecuteSuperQueryList(ctx, []string{"FAKE SET SOURCE"})) + return recoverableReplicationInitError() + } + + if setSourceCalls == 2 { + require.NoError(t, fakeMysqlDaemon.ExecuteSuperQueryList(ctx, []string{"FAKE SET SOURCE"})) + fakeMysqlDaemon.CurrentSourceHost = host + fakeMysqlDaemon.CurrentSourcePort = port + return nil + } + + return errors.New("unexpected SetReplicationSource call") + } + + tm := newTestTabletManager(t) + tm.MysqlDaemon = fakeMysqlDaemon + + err := tm.disableReplication(t.Context()) + require.NoError(t, err) + require.Equal(t, 2, setSourceCalls) + require.Equal(t, "//", fakeMysqlDaemon.CurrentSourceHost) + require.Zero(t, fakeMysqlDaemon.CurrentSourcePort) + require.NoError(t, fakeMysqlDaemon.CheckSuperQueryList()) +} diff --git a/go/vt/vttablet/tabletmanager/vreplication/controller.go b/go/vt/vttablet/tabletmanager/vreplication/controller.go index 2c02073ddb6..8da1d829261 100644 --- a/go/vt/vttablet/tabletmanager/vreplication/controller.go +++ 
b/go/vt/vttablet/tabletmanager/vreplication/controller.go @@ -344,7 +344,16 @@ func (ct *controller) runBlp(ctx context.Context) (err error) { // non-recoverable BUT it has persisted beyond the retry limit // (maxTimeToRetryError). In addition, we cannot restart a workflow // started with AtomicCopy which has _any_ error during copy phase. - if (err != nil && vr.WorkflowSubType == int32(binlogdatapb.VReplicationWorkflowSubType_AtomicCopy) && vr.state == binlogdatapb.VReplicationWorkflowState_Copying) || + // The copy-phase check consults both signals: vr.getState() is only + // updated by setState calls, and AtomicCopy's copy path (copyAll) + // never calls setState(Copying) — only initTablesForCopy does, on + // first start — so after a tablet restart the in-memory state stays + // at its zero value for the whole remaining copy and a copy-phase + // error would be misclassified as retryable. isInCopyPhase() is + // refreshed from the durable _vt.copy_state contents on every + // loadSettings call and covers the restarted-copy case. 
+ if (err != nil && vr.WorkflowSubType == int32(binlogdatapb.VReplicationWorkflowSubType_AtomicCopy) && + (vr.getState() == binlogdatapb.VReplicationWorkflowState_Copying || vr.isInCopyPhase())) || isUnrecoverableError(err) || !ct.lastWorkflowError.ShouldRetry() { err = vterrors.Wrapf(err, TerminalErrorIndicator) diff --git a/go/vt/vttablet/tabletmanager/vreplication/framework_test.go b/go/vt/vttablet/tabletmanager/vreplication/framework_test.go index be8a314873e..f1c640e5601 100644 --- a/go/vt/vttablet/tabletmanager/vreplication/framework_test.go +++ b/go/vt/vttablet/tabletmanager/vreplication/framework_test.go @@ -18,6 +18,7 @@ package vreplication import ( "context" + "errors" "fmt" "io" "os" @@ -225,6 +226,9 @@ func TestMain(m *testing.M) { return ret } cancel() + if testing.Short() { + return 0 + } runNoBlobTest = true if err := utils.SetBinlogRowImageOptions("noblob", runPartialJSONTest, tempDir); err != nil { @@ -247,7 +251,7 @@ func resetBinlogClient() { func primaryPosition(t *testing.T) string { t.Helper() - pos, err := env.Mysqld.PrimaryPosition(t.Context()) + pos, err := env.Mysqld.PrimaryPosition(context.Background()) require.NoError(t, err) return replication.EncodePosition(pos) } @@ -261,7 +265,7 @@ func primaryPositionParsed(t *testing.T) replication.Position { func execStatements(t *testing.T, queries []string) { t.Helper() - if err := env.Mysqld.ExecuteSuperQueryList(t.Context(), queries); err != nil { + if err := env.Mysqld.ExecuteSuperQueryList(context.Background(), queries); err != nil { log.Error("Error executing query: " + err.Error()) assert.NoError(t, err) } @@ -278,6 +282,174 @@ func execConnStatements(t *testing.T, conn *dbconnpool.DBConnection, queries []s } } +func TestShortModeHarnessInitialized(t *testing.T) { + if !testing.Short() { + t.Skip("short-mode only") + } + + require.NotNil(t, env) + require.NotNil(t, playerEngine) + require.NotNil(t, streamerEngine) +} + +func TestFakeTabletConnVStreamRowsForwardsOptions(t *testing.T) { + 
execStatements(t, []string{ + "create table vstream_rows_options(id int, val varbinary(9), primary key(id))", + "insert into vstream_rows_options values (1, '123456789'), (2, '123456789')", + }) + t.Cleanup(func() { + execStatements(t, []string{"drop table vstream_rows_options"}) + }) + + var packetRowCounts []int + err := (&fakeTabletConn{}).VStreamRows(context.Background(), &binlogdatapb.VStreamRowsRequest{ + Query: "select * from vstream_rows_options", + Options: &binlogdatapb.VStreamOptions{ + ConfigOverrides: map[string]string{ + "vstream-dynamic-packet-size": "false", + "vstream-packet-size": "10", + }, + }, + }, func(rows *binlogdatapb.VStreamRowsResponse) error { + if len(rows.Rows) > 0 { + packetRowCounts = append(packetRowCounts, len(rows.Rows)) + } + return nil + }) + require.NoError(t, err) + require.Equal(t, []int{1, 1}, packetRowCounts) +} + +func TestFakeTabletConnVStreamForwardsOptions(t *testing.T) { + execStatements(t, []string{ + "create table vstream_options_t1(id int, primary key(id))", + "create table vstream_options_t2(id int, primary key(id))", + "insert into vstream_options_t1 values (1)", + "insert into vstream_options_t2 values (2)", + }) + t.Cleanup(func() { + execStatements(t, []string{ + "drop table vstream_options_t1", + "drop table vstream_options_t2", + }) + }) + + copiedTables := map[string]bool{} + err := (&fakeTabletConn{tablet: &topodatapb.Tablet{Alias: &topodatapb.TabletAlias{Uid: 100}}}).VStream(context.Background(), &binlogdatapb.VStreamRequest{ + Target: &querypb.Target{Keyspace: "vttest"}, + Filter: &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{ + Match: "vstream_options_t1", + Filter: "select * from vstream_options_t1", + }, { + Match: "vstream_options_t2", + Filter: "select * from vstream_options_t2", + }}}, + Options: &binlogdatapb.VStreamOptions{TablesToCopy: []string{"vstream_options_t1"}}, + }, func(evs []*binlogdatapb.VEvent) error { + for _, ev := range evs { + switch ev.Type { + case 
binlogdatapb.VEventType_FIELD: + copiedTables[ev.FieldEvent.TableName] = true + case binlogdatapb.VEventType_ROW: + copiedTables[ev.RowEvent.TableName] = true + case binlogdatapb.VEventType_LASTPK: + copiedTables[ev.LastPKEvent.TableLastPK.TableName] = true + case binlogdatapb.VEventType_COPY_COMPLETED: + return io.EOF + } + } + return nil + }) + require.ErrorIs(t, err, io.EOF) + require.Contains(t, copiedTables, "vstream_options_t1") + require.NotContains(t, copiedTables, "vstream_options_t2") +} + +func TestVPlayerForwardsWorkflowOverridesToSourceVStream(t *testing.T) { + streamer := &capturingVStreamerClient{err: errors.New("stream failed")} + mockDB := binlogplayer.NewMockDBClient(t) + mockDB.ExpectRequest("SELECT rows_copied FROM _vt.vreplication WHERE id=1", sqltypes.MakeTestResult(sqltypes.MakeTestFields("rows_copied", "int64"), "0"), nil) + mockDB.AddInvariant("max_allowed_packet", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("max_allowed_packet", "int64"), + "4194304", + )) + config, err := vttablet.NewVReplicationConfig(map[string]string{ + "vreplication-net-read-timeout": "123", + "vreplication-parallel-replication-workers": "4", + "vstream-packet-size": "42", + }) + require.NoError(t, err) + + stats := binlogplayer.NewStats() + t.Cleanup(stats.Stop) + vr := newVReplicator(1, &binlogdatapb.BinlogSource{Filter: &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: "/.*"}}}}, streamer, stats, mockDB, nil, playerEngine, config) + vp := newVPlayer(vr, binlogplayer.VRSettings{}, nil, replication.Position{}, "replicate") + vp.replicatorPlan = &ReplicatorPlan{VStreamFilter: vr.source.Filter} + + err = vp.fetchAndApply(t.Context()) + require.ErrorContains(t, err, "stream failed") + mockDB.Wait() + require.NotNil(t, streamer.vstreamOptions) + require.Equal(t, map[string]string{ + "vreplication-net-read-timeout": "123", + "vstream-packet-size": "42", + }, streamer.vstreamOptions.ConfigOverrides) +} + +func 
TestVCopierCopyAllForwardsWorkflowOverridesToSourceVStreamTables(t *testing.T) { + streamer := &capturingVStreamerClient{vstreamTablesErr: errors.New("stream tables failed")} + mockDB := binlogplayer.NewMockDBClient(t) + mockDB.ExpectRequest("SELECT rows_copied FROM _vt.vreplication WHERE id=1", sqltypes.MakeTestResult(sqltypes.MakeTestFields("rows_copied", "int64"), "0"), nil) + mockDB.ExpectRequest(SqlMaxAllowedPacket, sqltypes.MakeTestResult(sqltypes.MakeTestFields("max_allowed_packet", "int64"), "67108864"), nil) + config, err := vttablet.NewVReplicationConfig(map[string]string{ + "vreplication-net-read-timeout": "123", + "vreplication-parallel-replication-workers": "4", + "vstream-packet-size": "42", + }) + require.NoError(t, err) + + stats := binlogplayer.NewStats() + t.Cleanup(stats.Stop) + vr := newVReplicator(1, &binlogdatapb.BinlogSource{Filter: &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: "t1"}}}}, streamer, stats, mockDB, nil, playerEngine, config) + vr.colInfoMap = map[string][]*ColumnInfo{"t1": {{Name: "id", IsPK: true}}} + + err = newVCopier(vr).copyAll(t.Context(), binlogplayer.VRSettings{WorkflowName: "copy-all"}) + require.ErrorContains(t, err, "stream tables failed") + mockDB.Wait() + require.NotNil(t, streamer.vstreamTablesOptions) + require.Equal(t, map[string]string{ + "vreplication-net-read-timeout": "123", + "vstream-packet-size": "42", + }, streamer.vstreamTablesOptions.ConfigOverrides) +} + +func TestVCopierCopyTableForwardsWorkflowOverridesToSourceVStreamRows(t *testing.T) { + streamer := &capturingVStreamerClient{vstreamRowsErr: errors.New("stream rows failed")} + mockDB := binlogplayer.NewMockDBClient(t) + mockDB.ExpectRequest("SELECT rows_copied FROM _vt.vreplication WHERE id=1", sqltypes.MakeTestResult(sqltypes.MakeTestFields("rows_copied", "int64"), "0"), nil) + mockDB.ExpectRequest(SqlMaxAllowedPacket, sqltypes.MakeTestResult(sqltypes.MakeTestFields("max_allowed_packet", "int64"), "67108864"), nil) + config, err := 
vttablet.NewVReplicationConfig(map[string]string{ + "vreplication-net-read-timeout": "123", + "vreplication-parallel-replication-workers": "4", + "vstream-packet-size": "42", + }) + require.NoError(t, err) + + stats := binlogplayer.NewStats() + t.Cleanup(stats.Stop) + vr := newVReplicator(1, &binlogdatapb.BinlogSource{Filter: &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: "t1"}}}}, streamer, stats, mockDB, nil, playerEngine, config) + vr.colInfoMap = map[string][]*ColumnInfo{"t1": {{Name: "id", IsPK: true}}} + + err = newVCopier(vr).copyTable(t.Context(), "t1", map[string]*sqltypes.Result{"t1": nil}) + require.ErrorContains(t, err, "stream rows failed") + mockDB.Wait() + require.NotNil(t, streamer.vstreamRowsOptions) + require.Equal(t, map[string]string{ + "vreplication-net-read-timeout": "123", + "vstream-packet-size": "42", + }, streamer.vstreamRowsOptions.ConfigOverrides) +} + // -------------------------------------- // Topos and tablets @@ -339,6 +511,43 @@ type fakeTabletConn struct { tablet *topodatapb.Tablet } +type capturingVStreamerClient struct { + vstreamOptions *binlogdatapb.VStreamOptions + vstreamRowsOptions *binlogdatapb.VStreamOptions + vstreamTablesOptions *binlogdatapb.VStreamOptions + err error + vstreamRowsErr error + vstreamTablesErr error +} + +func (c *capturingVStreamerClient) Open(context.Context) error { return nil } + +func (c *capturingVStreamerClient) Close(context.Context) error { return nil } + +func (c *capturingVStreamerClient) VStream(ctx context.Context, startPos string, tablePKs []*binlogdatapb.TableLastPK, filter *binlogdatapb.Filter, send func([]*binlogdatapb.VEvent) error, options *binlogdatapb.VStreamOptions) error { + c.vstreamOptions = options + if c.err != nil { + return c.err + } + return io.EOF +} + +func (c *capturingVStreamerClient) VStreamRows(ctx context.Context, query string, lastpk *querypb.QueryResult, send func(*binlogdatapb.VStreamRowsResponse) error, options *binlogdatapb.VStreamOptions) error { + 
c.vstreamRowsOptions = options + if c.vstreamRowsErr != nil { + return c.vstreamRowsErr + } + return nil +} + +func (c *capturingVStreamerClient) VStreamTables(ctx context.Context, send func(*binlogdatapb.VStreamTablesResponse) error, options *binlogdatapb.VStreamOptions) error { + c.vstreamTablesOptions = options + if c.vstreamTablesErr != nil { + return c.vstreamTablesErr + } + return nil +} + // StreamHealth is part of queryservice.QueryService. func (ftc *fakeTabletConn) StreamHealth(ctx context.Context, callback func(*querypb.StreamHealthResponse) error) error { return callback(&querypb.StreamHealthResponse{ @@ -372,7 +581,7 @@ func (ftc *fakeTabletConn) VStream(ctx context.Context, request *binlogdatapb.VS return err } } - return streamerEngine.Stream(ctx, request.Position, request.TableLastPKs, request.Filter, throttlerapp.VStreamerName, send, nil) + return streamerEngine.Stream(ctx, request.Position, request.TableLastPKs, request.Filter, throttlerapp.VStreamerName, send, request.Options) } // vstreamRowsHook allows you to do work just before calling VStreamRows. @@ -394,15 +603,12 @@ func (ftc *fakeTabletConn) VStreamRows(ctx context.Context, request *binlogdatap } row = r.Rows[0] } - vstreamOptions := &binlogdatapb.VStreamOptions{ - ConfigOverrides: vttablet.GetVReplicationConfigDefaults(false).Map(), - } return streamerEngine.StreamRows(ctx, request.Query, row, func(rows *binlogdatapb.VStreamRowsResponse) error { if vstreamRowsSendHook != nil { vstreamRowsSendHook(ctx) } return send(rows) - }, vstreamOptions) + }, request.Options) } // -------------------------------------- diff --git a/go/vt/vttablet/tabletmanager/vreplication/parallel_apply.go b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply.go new file mode 100644 index 00000000000..b28e4dc2227 --- /dev/null +++ b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply.go @@ -0,0 +1,2482 @@ +/* +Copyright 2026 The Vitess Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package vreplication + +import ( + "context" + "errors" + "fmt" + "io" + "log/slog" + "maps" + "math" + "runtime/debug" + "slices" + "strings" + "sync" + "sync/atomic" + "time" + + "vitess.io/vitess/go/mysql/replication" + "vitess.io/vitess/go/sqltypes" + "vitess.io/vitess/go/vt/binlog/binlogplayer" + "vitess.io/vitess/go/vt/log" + "vitess.io/vitess/go/vt/sqlparser" + "vitess.io/vitess/go/vt/vterrors" + "vitess.io/vitess/go/vt/vttablet/tabletserver/throttle/throttlerapp" + + binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" + vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" +) + +// recoverParallelApply is a defer helper used by every goroutine the parallel +// applier spawns. A panic in a worker, commitLoop, or scheduleLoop must not +// crash the entire vttablet process; it must turn into a normal error that +// propagates through the usual shutdown machinery. The callback receives the +// converted error and is expected to (a) push the error onto the orchestrator's +// error channel and (b) cancel the shared context so sibling goroutines +// unwind promptly. Passing nil for cb is allowed when the caller already +// returns an error it will examine itself. 
+func recoverParallelApply(name string, cb func(err error)) { + r := recover() + if r == nil { + return + } + stack := debug.Stack() + log.Error("parallel apply goroutine panicked", + slog.String("goroutine", name), + slog.Any("panic", r), + slog.String("stack", string(stack)), + ) + err := vterrors.Errorf(vtrpcpb.Code_INTERNAL, "parallel apply: %s panicked: %v", name, r) + if cb != nil { + cb(err) + } +} + +type applyTxnPayload struct { + // pos is the GTID position to record when committing this transaction. + pos replication.Position + // timestamp is the source binlog timestamp, used for lag calculation + // and the time_updated column in _vt.vreplication. + timestamp int64 + // mustSave forces an immediate position save (e.g., stop position reached + // or time-based flush bound exceeded). + mustSave bool + // events holds the VEvents that make up this transaction's data. + // For row transactions these are ROW/FIELD events; for commitOnly + // transactions this is typically a single DDL, OTHER, or COMMIT event. + events []*binlogdatapb.VEvent + // rowOnly is true when the transaction contains only ROW and FIELD events + // (no DDL, OTHER, or JOURNAL). Row-only transactions can have writesets + // computed for parallel conflict detection. FIELD events are pure metadata + // and do not affect conflict detection. + rowOnly bool + // commitOnly is true for transactions that are applied by the commitLoop + // on the main connection rather than by a worker (DDL, OTHER, JOURNAL, + // position-only saves). Workers forward these directly to commitCh + // without applying events or waiting on txn.done. + commitOnly bool + // updatePosOnly is true for position-only saves (idle timeout flush). + // The commitLoop calls updatePos without applying any events. + updatePosOnly bool + // query/commit/client are the DB connection functions for this transaction. + // For worker transactions, these are set by the worker after applying events. 
+ // For commitOnly transactions, these point to the main vplayer connection. + query func(ctx context.Context, sql string) (*sqltypes.Result, error) + commit func() error + client *vdbClient + // Pre-computed during scheduling so commitLoop doesn't need to scan + // all events to find the last qualifying timestamp for lag calculation. + // Zero means no qualifying event was found. + lastEventTimestamp int64 + lastEventCurrentTime int64 +} + +var ( + applyTxnPool = sync.Pool{ + New: func() any { return new(applyTxn) }, + } + applyTxnPayloadPool = sync.Pool{ + New: func() any { return new(applyTxnPayload) }, + } +) + +// acquireApplyTxn gets an applyTxn from the pool with a fresh done channel. +// A fresh channel is allocated each time (not reused from the pool) because +// the worker captures a reference to the done channel via pendingDone before +// the txn is returned to the pool. If the channel were reused, the worker's +// pendingDone and the new txn's done would alias the same channel, and the +// drain here would steal the signal intended for the worker. +func acquireApplyTxn() *applyTxn { + txn := applyTxnPool.Get().(*applyTxn) + txn.done = make(chan struct{}, 1) + return txn +} + +// acquireApplyTxnPayload gets an applyTxnPayload from the pool. +func acquireApplyTxnPayload() *applyTxnPayload { + return applyTxnPayloadPool.Get().(*applyTxnPayload) +} + +// releaseApplyTxn returns an applyTxn and its payload to their pools. +// Must only be called after commitLoop has fully processed the txn. +func releaseApplyTxn(txn *applyTxn) { + if txn.payload != nil { + p := txn.payload + *p = applyTxnPayload{} + applyTxnPayloadPool.Put(p) + } + // Zero out the txn completely (including done channel). acquireApplyTxn + // always creates a fresh done channel, so there's nothing to preserve. 
+ *txn = applyTxn{} + applyTxnPool.Put(txn) +} + +type postDDLStalePlan struct { + stalePlan *TablePlan + refreshedPlans map[string]*TablePlan + allowDisappear bool +} + +// clonePostDDLStalePlan deep-copies the refreshed-name set so scheduler and +// commitLoop state can evolve independently without sharing inner maps. +func clonePostDDLStalePlan(stale postDDLStalePlan) postDDLStalePlan { + clone := stale + if len(stale.refreshedPlans) == 0 { + return clone + } + clone.refreshedPlans = make(map[string]*TablePlan, len(stale.refreshedPlans)) + maps.Copy(clone.refreshedPlans, stale.refreshedPlans) + return clone +} + +// clonePostDDLStalePlans returns a detached copy of the current barrier state. +func clonePostDDLStalePlans(src map[string]postDDLStalePlan) map[string]postDDLStalePlan { + if len(src) == 0 { + return nil + } + cloned := make(map[string]postDDLStalePlan, len(src)) + for name, stale := range src { + cloned[name] = clonePostDDLStalePlan(stale) + } + return cloned +} + +// cloneDroppedTables mirrors clonePostDDLStalePlans for the dropped-table set: +// scheduler and commitLoop evolve independent copies so a resync between them +// does not alias their inner maps. +func cloneDroppedTables(src map[string]struct{}) map[string]struct{} { + if len(src) == 0 { + return nil + } + cloned := make(map[string]struct{}, len(src)) + for name := range src { + cloned[name] = struct{}{} + } + return cloned +} + +// canonicalPostDDLTableKey resolves the exact map key that corresponds to a +// DDL-parsed table name. MySQL identifiers can arrive in inconsistent casing +// between binlog events, the parser, and cached plans; without this +// reconciliation, barrier bookkeeping would silently miss entries because of +// a case mismatch. 
+func canonicalPostDDLTableKey[T any](entries map[string]T, name string) string { + if name == "" { + return "" + } + if _, ok := entries[name]; ok { + return name + } + for candidate := range entries { + if strings.EqualFold(candidate, name) { + return candidate + } + } + return name +} + +// postDDLTableKeyMatches compares two table names case-insensitively so that +// binlog-event names and tracked barrier names line up regardless of the DDL's +// original casing. +func postDDLTableKeyMatches(a, b string) bool { + return a != "" && b != "" && strings.EqualFold(a, b) +} + +// snapshotPostDDLStalePlans widens an unknown-DDL barrier to all currently +// live plans except names already known to have been dropped. +func snapshotPostDDLStalePlans(tablePlans map[string]*TablePlan, droppedTables map[string]struct{}) map[string]postDDLStalePlan { + if len(tablePlans) == 0 { + return nil + } + tracked := make(map[string]postDDLStalePlan, len(tablePlans)) + for name, plan := range tablePlans { + if _, dropped := droppedTables[canonicalPostDDLTableKey(droppedTables, name)]; dropped { + continue + } + tracked[name] = postDDLStalePlan{ + stalePlan: plan, + refreshedPlans: map[string]*TablePlan{name: plan}, + } + } + if len(tracked) == 0 { + return nil + } + return tracked +} + +// addPostDDLStalePlan registers a stale plan along with the refreshed table +// names whose FIELD event will satisfy the barrier. Centralizes the +// canonical-key, dropped-table, and missing-plan checks so +// extractDDLAffectedTables can emit consistent entries across CREATE / RENAME / +// ALTER / DROP without duplicating those rules per case. 
+func addPostDDLStalePlan(tracked map[string]postDDLStalePlan, tablePlans map[string]*TablePlan, droppedTables map[string]struct{}, allowDroppedRefreshedNames bool, staleName string, refreshedNames ...string) { + staleName = canonicalPostDDLTableKey(tablePlans, staleName) + if _, dropped := droppedTables[canonicalPostDDLTableKey(droppedTables, staleName)]; dropped { + return + } + plan, ok := tablePlans[staleName] + if !ok { + return + } + entry := postDDLStalePlan{ + stalePlan: plan, + refreshedPlans: make(map[string]*TablePlan, len(refreshedNames)), + } + for _, refreshedName := range refreshedNames { + if refreshedName == "" { + continue + } + refreshedName = canonicalPostDDLTableKey(tablePlans, refreshedName) + if !allowDroppedRefreshedNames && refreshedName != staleName { + if _, dropped := droppedTables[canonicalPostDDLTableKey(droppedTables, refreshedName)]; dropped { + continue + } + } + if tablePlans == nil { + continue + } + entry.refreshedPlans[refreshedName] = tablePlans[refreshedName] + } + if len(entry.refreshedPlans) == 0 { + return + } + tracked[staleName] = entry +} + +// extractDDLAffectedTables parses a DDL statement and returns the tracked stale +// plans plus the table names whose future FIELD refresh can satisfy each entry. +// The caller uses this to keep the stale-plan barrier scoped to the plans that +// actually matter for the DDL, including rename operations where the refreshed +// FIELD arrives under a different table name. 
+func extractDDLAffectedTables(sql string, parser *sqlparser.Parser, tablePlans map[string]*TablePlan, droppedTables map[string]struct{}) (map[string]postDDLStalePlan, bool) { + stmt, err := parser.ParseStrictDDL(sql) + if err != nil { + tracked := snapshotPostDDLStalePlans(tablePlans, droppedTables) + return tracked, len(tracked) != 0 + } + ddlStmt, ok := stmt.(sqlparser.DDLStatement) + if !ok { + tracked := snapshotPostDDLStalePlans(tablePlans, droppedTables) + return tracked, len(tracked) != 0 + } + tracked := make(map[string]postDDLStalePlan) + switch stmt := ddlStmt.(type) { + case *sqlparser.CreateTable: + // A same-name recreate can arrive before FIELD refreshes the live plan. + // Keep the stale pre-drop plan tracked until the new FIELD replaces it. + addPostDDLStalePlan(tracked, tablePlans, nil, true, stmt.Table.Name.String(), stmt.Table.Name.String()) + case *sqlparser.RenameTable: + for _, pair := range stmt.TablePairs { + addPostDDLStalePlan(tracked, tablePlans, droppedTables, true, pair.FromTable.Name.String(), pair.ToTable.Name.String()) + } + case *sqlparser.AlterTable: + refreshedNames := []string{stmt.Table.Name.String()} + allowDroppedRefreshedNames := false + for _, option := range stmt.AlterOptions { + if rename, ok := option.(*sqlparser.RenameTableName); ok { + refreshedNames = []string{rename.Table.Name.String()} + allowDroppedRefreshedNames = true + } + } + addPostDDLStalePlan(tracked, tablePlans, droppedTables, allowDroppedRefreshedNames, stmt.Table.Name.String(), refreshedNames...) 
+ case *sqlparser.DropTable: + for _, table := range stmt.FromTables { + if table.IsEmpty() { + continue + } + name := canonicalPostDDLTableKey(tablePlans, table.Name.String()) + addPostDDLStalePlan(tracked, tablePlans, nil, true, name, name) + entry := tracked[name] + entry.allowDisappear = true + tracked[name] = entry + } + default: + for _, table := range ddlStmt.AffectedTables() { + if table.IsEmpty() { + continue + } + name := table.Name.String() + addPostDDLStalePlan(tracked, tablePlans, droppedTables, false, name, name) + } + } + if len(tracked) == 0 { + return nil, false + } + return tracked, false +} + +// extractDroppedTables pulls the set of DROP TABLE names out of a DDL. A stale +// plan for a dropped table will never be satisfied by a future FIELD refresh, +// so the barrier has to allow those plans to simply disappear rather than +// stall the pipeline waiting for a refresh that will never arrive. +func extractDroppedTables(sql string, parser *sqlparser.Parser) map[string]struct{} { + stmt, err := parser.ParseStrictDDL(sql) + if err != nil { + return nil + } + dropped := map[string]struct{}{} + switch stmt := stmt.(type) { + case *sqlparser.DropTable: + for _, table := range stmt.FromTables { + if table.IsEmpty() { + continue + } + dropped[strings.ToLower(table.Name.String())] = struct{}{} + } + } + if len(dropped) == 0 { + return nil + } + return dropped +} + +// retireResolvedPostDDLTablePlans removes stale rename-source plans once the +// rename barrier is fully satisfied. This keeps later unknown-DDL snapshots +// from tracking names that no longer exist, while preserving fail-closed +// behavior until the rename target actually refreshes. 
+func retireResolvedPostDDLTablePlans(tablePlans map[string]*TablePlan, stalePlans map[string]postDDLStalePlan) bool { + retired := false + for staleName, stale := range stalePlans { + if stale.stalePlan == nil { + continue + } + if _, recreated := stale.refreshedPlans[staleName]; recreated { + continue + } + if tablePlans[staleName] != stale.stalePlan { + continue + } + delete(tablePlans, staleName) + retired = true + } + return retired +} + +// resolvedPostDDLStalePlans returns the subset of barrier entries whose +// refreshed plans have already arrived (or, for DROP entries, whose name now +// appears in droppedTables). Callers use this to retire resolved entries from +// shared state while still-unresolved ones remain active. +func resolvedPostDDLStalePlans(tablePlans map[string]*TablePlan, droppedTables map[string]struct{}, stalePlans map[string]postDDLStalePlan) map[string]postDDLStalePlan { + if len(stalePlans) == 0 { + return nil + } + resolved := make(map[string]postDDLStalePlan, len(stalePlans)) + for name, stale := range stalePlans { + if stale.allowDisappear { + if _, ok := droppedTables[canonicalPostDDLTableKey(droppedTables, name)]; ok { + resolved[name] = clonePostDDLStalePlan(stale) + continue + } + } + allRefreshed := true + for refreshedName, priorPlan := range stale.refreshedPlans { + refreshedPlan, ok := tablePlans[canonicalPostDDLTableKey(tablePlans, refreshedName)] + if !ok || refreshedPlan == priorPlan { + allRefreshed = false + break + } + } + if allRefreshed { + resolved[name] = clonePostDDLStalePlan(stale) + } + } + if len(resolved) == 0 { + return nil + } + return resolved +} + +// mergePostDDLStalePlans union-merges two barrier maps so a resync between +// scheduler and commitLoop preserves every refreshed name that either side +// observed. Losing an entry here would leave a rename or same-name recreate's +// barrier unsatisfiable. 
+func mergePostDDLStalePlans(dst, src map[string]postDDLStalePlan) map[string]postDDLStalePlan { + if len(src) == 0 { + return dst + } + if dst == nil { + dst = make(map[string]postDDLStalePlan, len(src)) + } + for name, stale := range src { + existing, ok := dst[name] + if !ok { + dst[name] = clonePostDDLStalePlan(stale) + continue + } + merged := existing + if merged.stalePlan == nil { + merged.stalePlan = stale.stalePlan + } + if len(stale.refreshedPlans) != 0 { + if merged.refreshedPlans == nil { + merged.refreshedPlans = make(map[string]*TablePlan, len(existing.refreshedPlans)+len(stale.refreshedPlans)) + maps.Copy(merged.refreshedPlans, existing.refreshedPlans) + } + for refreshedName, refreshedPlan := range stale.refreshedPlans { + if _, ok := merged.refreshedPlans[refreshedName]; !ok { + merged.refreshedPlans[refreshedName] = refreshedPlan + } + } + } + merged.allowDisappear = merged.allowDisappear || stale.allowDisappear + dst[name] = merged + } + return dst +} + +// extractDDLRenameTargets pulls the from→to pairs out of a RENAME TABLE or +// ALTER TABLE ... RENAME so barrier entries can be retargeted: the FIELD event +// that satisfies the stale plan will now arrive under the destination table +// name, and the watched refreshed-name has to follow. 
+func extractDDLRenameTargets(sql string, parser *sqlparser.Parser) map[string]string { + stmt, err := parser.ParseStrictDDL(sql) + if err != nil { + return nil + } + renames := map[string]string{} + switch stmt := stmt.(type) { + case *sqlparser.RenameTable: + for _, pair := range stmt.TablePairs { + fromName := strings.ToLower(pair.FromTable.Name.String()) + toName := strings.ToLower(pair.ToTable.Name.String()) + if fromName == "" || toName == "" { + continue + } + renames[fromName] = toName + } + case *sqlparser.AlterTable: + for _, option := range stmt.AlterOptions { + rename, ok := option.(*sqlparser.RenameTableName) + if !ok { + continue + } + fromName := strings.ToLower(stmt.Table.Name.String()) + toName := strings.ToLower(rename.Table.Name.String()) + if fromName == "" || toName == "" { + continue + } + renames[fromName] = toName + } + } + if len(renames) == 0 { + return nil + } + return renames +} + +// uniqueKeyColumnsEqual reports whether two per-index unique-key column lists +// are identical (same number of indexes, each with the same ordered columns). +func uniqueKeyColumnsEqual(a, b [][]string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if !slices.Equal(a[i], b[i]) { + return false + } + } + return true +} + +// shouldPublishExecIgnoreDDLBarrier decides whether an ALTER that adds or +// drops a unique secondary index has actually changed the cached plan's +// writeset state — either its force-serialize (HasExtraUniqueSecondary) flag +// or its hashable unique-key column lists. When the state changes, an +// ExecIgnore barrier must be published so rows planned under the old state do +// not leak into execution under the new one. Returning false means the ALTER +// does not touch correctness-relevant plan state and the barrier can be +// skipped. 
+func shouldPublishExecIgnoreDDLBarrier(ctx context.Context, vp *vplayer, statement string) (bool, error) { + if vp == nil || vp.vr == nil || vp.vr.vre == nil || vp.vr.vre.env == nil { + return false, nil + } + parser := vp.vr.vre.env.Parser() + stmt, err := parser.ParseStrictDDL(statement) + if err != nil { + return false, nil + } + alter, ok := stmt.(*sqlparser.AlterTable) + if !ok || alter.Table.IsEmpty() { + return false, nil + } + tableName := alter.Table.Name.String() + vp.tablePlansMu.RLock() + cachedPlan := vp.tablePlans[canonicalPostDDLTableKey(vp.tablePlans, tableName)] + vp.tablePlansMu.RUnlock() + if cachedPlan == nil { + return false, nil + } + for _, option := range alter.AlterOptions { + switch option := option.(type) { + case *sqlparser.AddIndexDefinition: + if option.IndexDefinition == nil || option.IndexDefinition.Info == nil || !option.IndexDefinition.Info.IsUnique() { + continue + } + uniqueKeys, mustSerialize, err := vp.vr.writesetUniqueKeys(ctx, cachedPlan.TargetName, cachedPlan) + if err != nil { + return false, err + } + return mustSerialize != cachedPlan.HasExtraUniqueSecondary || + !uniqueKeyColumnsEqual(uniqueKeys, cachedPlan.UniqueKeyColumns), nil + case *sqlparser.DropKey: + if option.Type != sqlparser.NormalKeyType && option.Type != sqlparser.ConstraintType { + continue + } + uniqueKeys, mustSerialize, err := vp.vr.writesetUniqueKeys(ctx, cachedPlan.TargetName, cachedPlan) + if err != nil { + return false, err + } + return mustSerialize != cachedPlan.HasExtraUniqueSecondary || + !uniqueKeyColumnsEqual(uniqueKeys, cachedPlan.UniqueKeyColumns), nil + } + } + return false, nil +} + +// retargetPostDDLStalePlans rewrites in-flight barrier entries after a RENAME +// has landed: the FIELD refresh that satisfies each stale plan will now +// arrive under the destination name, so the watched refreshed-name must +// follow. 
Only the entries originally watched are retargeted, so overlapping +// rename sets do not cascade based on map iteration order. +func retargetPostDDLStalePlans(stalePlans map[string]postDDLStalePlan, renameTargets map[string]string, tablePlans map[string]*TablePlan) { + if len(stalePlans) == 0 || len(renameTargets) == 0 { + return + } + for staleName, stale := range stalePlans { + if len(stale.refreshedPlans) == 0 { + continue + } + refreshedPlans := make(map[string]*TablePlan, len(stale.refreshedPlans)) + changed := false + for refreshedName, priorPlan := range stale.refreshedPlans { + if toName, ok := renameTargets[canonicalPostDDLTableKey(renameTargets, refreshedName)]; ok { + // Retarget from the original watched names only so overlapping + // rename sets do not cascade based on map iteration order. + toName = canonicalPostDDLTableKey(tablePlans, toName) + refreshedPlans[toName] = tablePlans[toName] + changed = true + continue + } + refreshedPlans[refreshedName] = priorPlan + } + if !changed { + continue + } + stale.refreshedPlans = refreshedPlans + stalePlans[staleName] = stale + } +} + +// unresolvedPostDDLStalePlans drops entries whose replacement FIELD has already +// arrived, so only still-stale table plans participate in later scheduling. 
+func unresolvedPostDDLStalePlans(tablePlans map[string]*TablePlan, droppedTables map[string]struct{}, stalePlans map[string]postDDLStalePlan) map[string]postDDLStalePlan { + if len(stalePlans) == 0 { + return nil + } + unresolved := make(map[string]postDDLStalePlan, len(stalePlans)) + for name, stale := range stalePlans { + if stale.allowDisappear { + if _, ok := droppedTables[canonicalPostDDLTableKey(droppedTables, name)]; ok { + continue + } + } + allRefreshed := true + for refreshedName, priorPlan := range stale.refreshedPlans { + refreshedPlan, ok := tablePlans[canonicalPostDDLTableKey(tablePlans, refreshedName)] + if !ok { + allRefreshed = false + break + } + if refreshedPlan == priorPlan { + allRefreshed = false + break + } + } + if allRefreshed { + continue + } + unresolved[name] = clonePostDDLStalePlan(stale) + } + if len(unresolved) == 0 { + return nil + } + return unresolved +} + +// txnTouchesPostDDLBarrier keeps known DDL barriers table-scoped while still +// letting unknown DDLs remain conservative until every tracked plan refreshes. 
+func txnTouchesPostDDLBarrier(events []*binlogdatapb.VEvent, stalePlans map[string]postDDLStalePlan, conservative bool) bool { + if len(stalePlans) == 0 { + return false + } + for _, event := range events { + var tableName string + switch event.Type { + case binlogdatapb.VEventType_FIELD: + if event.FieldEvent != nil { + tableName = event.FieldEvent.TableName + } + case binlogdatapb.VEventType_ROW: + if event.RowEvent != nil { + tableName = event.RowEvent.TableName + } + } + if tableName == "" { + continue + } + if conservative { + return true + } + for staleName, stale := range stalePlans { + if postDDLTableKeyMatches(tableName, staleName) { + return true + } + for refreshedName := range stale.refreshedPlans { + if postDDLTableKeyMatches(tableName, refreshedName) { + return true + } + } + } + } + return false +} + +// postDDLRefreshTargetMatchesCachedPlan reports whether the currently-cached +// plan for refreshedName is still the stale one tracked by a barrier entry. +// Callers use this to distinguish a genuine replacement (progress) from a +// no-op refresh that would not advance the barrier. +func postDDLRefreshTargetMatchesCachedPlan(stalePlans map[string]postDDLStalePlan, refreshedName string, cachedPlan *TablePlan) bool { + for _, stale := range stalePlans { + for trackedName, priorPlan := range stale.refreshedPlans { + if !postDDLTableKeyMatches(trackedName, refreshedName) { + continue + } + if priorPlan == cachedPlan { + return true + } + } + } + return false +} + +// mergeDroppedTables is the dropped-table counterpart to +// mergePostDDLStalePlans: a resync between scheduler and commitLoop has to +// preserve the allow-disappear bookkeeping from both sides, or a DROP +// observed by only one side could leave a barrier stuck. 
+func mergeDroppedTables(dst, src map[string]struct{}) map[string]struct{} { + if len(src) == 0 { + return dst + } + merged := make(map[string]struct{}, len(dst)+len(src)) + for name := range dst { + merged[name] = struct{}{} + } + for name := range src { + merged[name] = struct{}{} + } + return merged +} + +// extractFieldRefreshTables collects the table names refreshed by FIELD +// events in a txn. Used by txnNeedsFieldRefreshSerialization to detect the +// same-txn FIELD+ROW case that cannot be parallelized (the row would apply +// against a plan the same txn is replacing). +func extractFieldRefreshTables(events []*binlogdatapb.VEvent) map[string]struct{} { + var refreshed map[string]struct{} + for _, event := range events { + if event.Type != binlogdatapb.VEventType_FIELD || event.FieldEvent == nil || event.FieldEvent.TableName == "" { + continue + } + if refreshed == nil { + refreshed = make(map[string]struct{}) + } + refreshed[event.FieldEvent.TableName] = struct{}{} + } + return refreshed +} + +// Same-txn FIELD+ROW means the worker may build or replace the plan before it +// applies the row. The scheduler cannot safely compute a writeset from the +// pre-apply snapshot, so serialize these rare refresh transactions. +// +// This guard is also what keeps concurrent plan refreshes ordered: vstreamer +// emits FIELD in the same transaction as the first ROW for that table, so +// every plan-storing transaction is forceGlobal and two workers can never +// store table plans for the same table out of order. A FIELD-only +// transaction (no ROW for that table) would bypass this; if vstreamer ever +// emits those, the pendingFieldRefreshTables serialization still covers the +// ordering of later row transactions, but the plan-store race between two +// FIELD-only txns would need revisiting. 
+func txnNeedsFieldRefreshSerialization(events []*binlogdatapb.VEvent) bool { + refreshedTables := extractFieldRefreshTables(events) + if len(refreshedTables) == 0 { + return false + } + for _, event := range events { + if event.Type != binlogdatapb.VEventType_ROW || event.RowEvent == nil { + continue + } + if _, ok := refreshedTables[event.RowEvent.TableName]; ok { + return true + } + } + return false +} + +// txnTouchesPendingFieldRefresh reports whether any ROW event in the txn +// targets a table whose FIELD refresh is still queued ahead of it. Such txns +// must serialize behind the pending refresh; otherwise the worker would apply +// rows against a plan that is about to be replaced. +func txnTouchesPendingFieldRefresh(events []*binlogdatapb.VEvent, pending map[string]int) bool { + if len(pending) == 0 { + return false + } + for _, event := range events { + if event.Type != binlogdatapb.VEventType_ROW || event.RowEvent == nil { + continue + } + for tableName, count := range pending { + if count <= 0 { + continue + } + if postDDLTableKeyMatches(event.RowEvent.TableName, tableName) { + return true + } + } + } + return false +} + +// workerLocalVPlayer builds a worker-scoped shadow of the orchestrator's +// vplayer that exposes only the fields workers are allowed to share +// (tablePlans, replicatorPlan, serialMu, etc). Keeps worker code from +// reaching into main-goroutine-owned vplayer state by accident. 
+func workerLocalVPlayer(vp *vplayer) vplayer { + return vplayer{ + vr: vp.vr, + copyState: vp.copyState, + replicatorPlan: vp.replicatorPlan, + canAcceptStmtEvents: vp.canAcceptStmtEvents, + tablePlansMu: vp.tablePlansMu, + tablePlans: vp.tablePlans, + tablePlansVersion: vp.tablePlansVersion, + batchMode: vp.batchMode, + phase: vp.phase, + serialMu: vp.serialMu, + } +} + +// writesetErrorForcesSerialization flags the specific writeset-build errors +// (partial row image, missing streamed fields) that mean we cannot prove +// absence of conflict for the txn, so it must take the serial path. Other +// writeset errors propagate as real failures. +func writesetErrorForcesSerialization(err error) bool { + if vterrors.Code(err) != vtrpcpb.Code_FAILED_PRECONDITION { + return false + } + return strings.Contains(err.Error(), "partial row image on table ") || + strings.Contains(err.Error(), "not in streamed fields") || + strings.Contains(err.Error(), "no usable writeset identity") || + strings.Contains(err.Error(), "streamed field metadata mismatch") +} + +// computeLastEventTimestamp scans events in reverse to find the last event +// with a non-zero timestamp that isn't a throttled heartbeat. Returns the +// timestamp and currentTime from that event, or (0, 0) if none qualifies. +func computeLastEventTimestamp(events []*binlogdatapb.VEvent) (timestamp, currentTime int64) { + for _, ev := range slices.Backward(events) { + if ev.Timestamp == 0 { + continue + } + if ev.Type == binlogdatapb.VEventType_HEARTBEAT && ev.Throttled { + continue + } + return ev.Timestamp, ev.CurrentTime + } + return 0, 0 +} + +// txnNeedsWorker reports whether a transaction has work for a worker connection. +func txnNeedsWorker(events []*binlogdatapb.VEvent) bool { + for _, ev := range events { + if ev.Type != binlogdatapb.VEventType_ROWS_QUERY { + return true + } + } + return false +} + +// applyEventsParallel is the top-level orchestrator for the parallel applier. 
+// It creates N worker goroutines and a commitLoop goroutine, then runs +// scheduleLoop on the calling goroutine. On exit, it tears down the pipeline +// in order: close scheduler → wait workers → close commitCh → wait commitLoop. +func (vp *vplayer) applyEventsParallel(ctx context.Context, relay *relayLog) error { + workerCount := vp.vr.workflowConfig.ParallelReplicationWorkers + if workerCount <= 1 { + return vp.applyEvents(ctx, relay) + } + + // Mirror the serial applier: reset lag stats to MaxInt64 when we exit, + // signalling that replication is no longer running. + defer vp.vr.stats.ReplicationLagSeconds.Store(math.MaxInt64) + defer vp.vr.stats.VReplicationLagGauges.Set(vp.idStr, math.MaxInt64) + + ctx, cancel := context.WithCancel(ctx) + defer cancel() + if err := ctx.Err(); err != nil { + return err + } + + scheduler := newApplyScheduler(ctx) + // Buffer 4x worker count to decouple worker throughput from commit + // latency. Workers block when commitCh is full, stalling the pipeline. + commitCh := make(chan *applyTxn, workerCount*4) + // Cap total ordered work in the parallel pipeline to approximately one + // applying transaction per worker plus the commit buffer. This provides + // end-to-end backpressure when commitLoop is stalled on an early order. + scheduler.maxOutstandingOrders = int64(workerCount + cap(commitCh)) + applyErr := make(chan error, 2) + commitLoopErr := make(chan error, 1) + workerErr := make(chan error, workerCount) + + workers := make([]*applyWorker, 0, workerCount) + // Register the defer BEFORE the creation loop so that if creating + // worker N fails, workers 0..N-1 are still closed. Without this, + // a partial creation failure would leak DB connections. 
+ defer func() { + for _, worker := range workers { + worker.close() + } + }() + for range workerCount { + worker, err := newApplyWorker(ctx, vp.vr) + if err != nil { + return err + } + workers = append(workers, worker) + } + + // Query FK constraints from the target database so that we can + // generate writeset keys that create conflicts between child and + // parent table transactions, preventing FK constraint violations + // during parallel apply. + // + // Fail closed: if we can't read FK metadata we cannot know whether + // the schema has FK constraints that require cross-table ordering. + // Continuing with fkRefs=nil would silently degrade to PK-only + // writeset scheduling and could reorder parent/child transactions. + // Return the error so the workflow retries via the normal retry path. + fkRefs, err := queryFKRefs(vp.vr.dbClient, vp.vr.dbClient.DBName()) + if err != nil { + return vterrors.Wrapf(err, "parallel apply: failed to query FK metadata from %q", vp.vr.dbClient.DBName()) + } + if len(fkRefs) > 0 { + for table, refs := range fkRefs { + for _, ref := range refs { + log.Info("Parallel apply: FK ref", slog.String("child", table), slog.String("parent", ref.ParentTable), slog.Any("childCols", ref.ChildColumnNames), slog.Any("referencedCols", ref.ReferencedColumnNames)) + } + } + } else { + log.Info("Parallel apply: no FK refs found", slog.String("db", vp.vr.dbClient.DBName())) + } + vp.fkRefs = fkRefs + vp.parentFKRefs = buildParentFKRefs(fkRefs) + + // sendWorkerErr is a non-blocking send to workerErr. The channel is + // buffered to workerCount, so in normal operation this always succeeds; + // the non-blocking form is defensive against test hooks or double-send + // scenarios and mirrors the convention used elsewhere in this package. 
+ sendWorkerErr := func(err error) { + select { + case workerErr <- err: + default: + } + cancel() + } + sendCommitLoopErr := func(err error) { + select { + case commitLoopErr <- err: + default: + } + cancel() + } + + var wg sync.WaitGroup + for i := range workerCount { + worker := workers[i] + workerIdx := i + wg.Go(func() { + // Recover from panics so a buggy event or driver crash does not + // tear down the entire vttablet process. The recovered error is + // routed through the same path as a normal worker error. + defer recoverParallelApply(fmt.Sprintf("worker %d", workerIdx), sendWorkerErr) + err := vp.workerLoop(ctx, scheduler, commitCh, worker) + if err != nil && err != io.EOF { + sendWorkerErr(err) + } + }) + } + + commitDone := make(chan struct{}) + go func() { + defer close(commitDone) + // Recover from panics so a buggy commit path does not tear down the + // entire vttablet process. The recovered error is routed through the + // same path as a normal commitLoop error. + defer recoverParallelApply("commitLoop", sendCommitLoopErr) + if err := vp.commitLoop(ctx, scheduler, commitCh); err != nil { + commitLoopErr <- err + // Always cancel context when commitLoop exits with an error, + // including io.EOF (stop position reached). This ensures + // scheduleLoop and workers shut down promptly instead of + // blocking on a commitCh that has no reader. + cancel() + } + }() + + // Recover from panics in scheduleLoop so they become a normal applyErr + // rather than crashing the process. Routed to applyErr via the closure. 
+ var schedulePanicErr error + func() { + defer recoverParallelApply("scheduleLoop", func(err error) { + schedulePanicErr = err + cancel() + }) + schedulePanicErr = vp.scheduleLoop(ctx, relay, scheduler) + }() + schedErr := schedulePanicErr + if schedErr != nil { + applyErr <- schedErr + } + + scheduler.close() + wg.Wait() + close(commitCh) + <-commitDone + select { + case err := <-commitLoopErr: + if err == io.EOF { + return nil + } + applyErr <- err + default: + } + + // Now that commitLoop is done, it's safe to rollback any leftover + // transaction on the main connection. This must happen after commitDone + // because commitOnlyTxn in the commitLoop also uses the main connection. + vp.vr.dbClient.Rollback() + + // Drain all errors and prioritize real failures over io.EOF/context.Canceled. + // We must always inspect workerErr too: teardown after a worker failure makes + // scheduleLoop/commitLoop commonly return io.EOF/context.Canceled, and those + // benign shutdown signals must not mask the original worker error. + var realErrs []error + var hasEOF bool + var hasCanceled bool + classifyErr := func(err error) { + if err == nil { + return + } + if err == io.EOF { + hasEOF = true + return + } + if errors.Is(err, context.Canceled) { + hasCanceled = true + return + } + realErrs = append(realErrs, err) + } +drainApplyErrs: + for { + select { + case err := <-applyErr: + classifyErr(err) + default: + break drainApplyErrs + } + } +drainWorkerErrs: + for { + select { + case err := <-workerErr: + classifyErr(err) + default: + break drainWorkerErrs + } + } + if len(realErrs) > 0 { + return errors.Join(realErrs...) + } + // Convert io.EOF (stop position reached) and context.Canceled (shutdown) + // to nil. fetchAndApply's caller treats nil from applyEventsParallel + // the same as io.EOF from the serial path — it stops the controller + // without retrying. 
+ if hasEOF || hasCanceled { + return nil + } + return nil +} + +// scheduleLoop reads event batches from the relay log and dispatches them +// through scheduleItems. It also handles idle-timeout position saves and +// throttle-lag estimation. Runs on the main goroutine of applyEventsParallel. +func (vp *vplayer) scheduleLoop(ctx context.Context, relay *relayLog, scheduler *applyScheduler) error { + // Note: do NOT defer vp.vr.dbClient.Rollback() here. The main connection + // is shared with commitLoop (via commitOnlyTxn), which may still be running + // when scheduleLoop returns. The rollback is deferred in applyEventsParallel + // after commitLoop has finished. + workerCount := vp.vr.workflowConfig.ParallelReplicationWorkers + // Compute the max number of source transactions to batch into one + // mega-transaction. With parallel workers, we need enough separate + // mega-transactions per relay fetch to keep all workers busy. + // + // The relay log size limit (default 250KB) often limits each fetch to + // far fewer transactions than maxItems (5000). With 1-2KB rows, a + // typical fetch may contain only ~150-250 source transactions. To + // ensure all workers get work, we limit each mega-transaction to a + // small multiple of the worker count. This produces enough independent + // mega-transactions for the scheduler to keep all workers busy. + maxBatched := 0 // 0 means unlimited (serial behavior) + if workerCount > 1 { + // Batch multiple source transactions into each mega-transaction. + // This amortizes per-commit overhead (position update, MySQL COMMIT, + // done-signal, scheduler dispatch) across multiple source txns. + // With workerCount*4, a single relay fetch produces enough + // mega-transactions to keep all workers busy while still reducing + // commit overhead by Nx. 
The writeset for the mega-txn is the + // union of all contained source txns, so conflict detection + // remains correct — if any source txn in mega-A conflicts with + // any source txn in mega-B, they serialize. + maxBatched = workerCount * 4 + } + state := ¶llelScheduleState{ + maxBatchedCommits: maxBatched, + } + for { + if ctx.Err() != nil { + return ctx.Err() + } + vp.serialMu.Lock() + if time.Since(vp.timeLastSaved) >= idleTimeout && vp.unsavedEvent != nil { + event := vp.unsavedEvent + vp.unsavedEvent = nil + vp.timeLastSaved = time.Now() + vp.serialMu.Unlock() + if err := vp.enqueueCommitOnly(ctx, scheduler, event, true, true, 0, 0, false); err != nil { + return err + } + } else { + vp.serialMu.Unlock() + } + if checkResult, ok := vp.vr.vre.throttlerClient.ThrottleCheckOKOrWaitAppName(ctx, throttlerapp.Name(vp.throttlerAppName)); !ok { + // Must hold serialMu when calling updateTimeThrottled because + // it uses vr.dbClient, which may also be in use by the + // commitLoop for commitOnly transactions on the main connection. + vp.serialMu.Lock() + _ = vp.vr.updateTimeThrottled(throttlerapp.VPlayerName, checkResult.Summary()) + vp.serialMu.Unlock() + snap := vp.loadLagSnapshot() + // Estimate lag while throttled, same as the serial applier. + if snap.timestampNs > 0 { + behind := time.Now().UnixNano() - snap.timestampNs - snap.offsetNs + if behind >= 0 { + behindSecs := behind / 1e9 + vp.vr.stats.ReplicationLagSeconds.Store(behindSecs) + vp.vr.stats.VReplicationLagGauges.Set(vp.idStr, behindSecs) + } + } + continue + } + items, err := relay.Fetch() + if err != nil { + return err + } + if err := vp.scheduleItems(ctx, scheduler, state, items); err != nil { + return err + } + // If a DDL was in this fetch, wait for the commitLoop to process it + // (including FK metadata refresh) before starting the next fetch. + // Without this barrier, the next fetch would snapshot stale FK refs. + // + // Also drain when the post-DDL stale-plan guard is active. 
This + // ensures that all serialized work (including FIELD events for the + // DDL-affected table) has been applied by workers before the next + // fetch's scheduleItems re-evaluates the guard by comparing plan + // pointers. + if state.ddlSeen || state.postDDLStalePlans != nil { + if err := scheduler.waitForIdle(ctx); err != nil { + return err + } + } + } +} + +type parallelScheduleState struct { + // curEvents accumulates VEvents for the current transaction being built. + // Reset after each flush (COMMIT or DDL boundary). + curEvents []*binlogdatapb.VEvent + // curRowOnly tracks whether the current transaction contains only ROW + // events. Set to true on the first ROW event, false on FIELD/DDL/OTHER/ + // JOURNAL events. Only meaningful when curRowOnlySet is true. + curRowOnly bool + // curRowOnlySet indicates whether curRowOnly has been determined for the + // current transaction. False at the start of each transaction; set to + // true on the first event that classifies it. This distinguishes + // "not yet classified" from "classified as not row-only". + curRowOnlySet bool + // curTimestamp is the most recent non-zero event timestamp seen in the + // current transaction, used for the time_updated column on flush. + curTimestamp int64 + // curMustSave forces the next flush to save the position immediately + // (set when stop position is reached or time-based batch bound fires). + curMustSave bool + // curPos is the GTID position from the most recent GTID event, + // recorded in _vt.vreplication when the transaction is committed. + curPos replication.Position + // curCommitParent is the source MySQL commit parent from the GTID event, + // used for commit-parent ordering when writeset is unavailable. + curCommitParent int64 + // curSequence is the source MySQL sequence number from the GTID event, + // used to track lastCommittedSequence in the scheduler. 
+ curSequence int64 + // curHasCommitMeta is true when the current transaction's GTID event + // carried non-zero sequenceNumber or commitParent metadata. + curHasCommitMeta bool + // batchMissingCommitMeta is sticky across batched source transactions. + // Once a merged batch contains any txn without commit metadata, the + // flushed mega-transaction must stay in the no-metadata scheduler mode. + batchMissingCommitMeta bool + // lastFlushTime tracks when the last transaction was flushed, used to + // enforce the 500ms time-based batch bound during catch-up replay. + lastFlushTime time.Time + // lastHeartbeatRefresh tracks when time_updated was last refreshed via + // SQL for empty transaction streams, independent of lastFlushTime so + // that the idle timeout position save still fires normally. + lastHeartbeatRefresh time.Time + // cachedPlanSnapshot is a copy-on-write snapshot of vplayer.tablePlans, + // refreshed only when tablePlansVersion changes (new FIELD events). + cachedPlanSnapshot map[string]*TablePlan + // cachedPlanVersion tracks which tablePlansVersion the snapshot + // corresponds to, so we know when to re-snapshot. + cachedPlanVersion int64 + // fieldIdxCache caches the field-name→index map per table to avoid + // rebuilding it on every transaction. Most transactions touch the same + // tables so this eliminates redundant map construction. Invalidated + // when tablePlansVersion changes (new FIELD events arrive). + fieldIdxCache map[string]map[string]int + fieldIdxCacheVersion int64 + // planFlagsVersion, planHasExtraUniqueSecondary, and + // planHasUnsupportedWritesetMapping cache aggregate flags for the + // cached plan snapshot. Avoids a per-txn scan of every plan when the + // workflow's tables carry none of these properties (the common case). + // Recomputed lazily when the plan version changes. 
+ planFlagsVersion int64 + planHasExtraUniqueSecondary bool + planHasUnsupportedWritesetMapping bool + // curHasFieldEvent is true when the current transaction has + // accumulated at least one FIELD event. Lets the flush path skip + // txnNeedsFieldRefreshSerialization entirely for the common + // rowOnly-with-no-FIELDs case. + curHasFieldEvent bool + // batchedCommitCount tracks how many source transactions have been + // merged into the current mega-transaction via commit batching. When + // this exceeds maxBatchedCommits, the mega-transaction is flushed even + // if more consecutive commits follow. This ensures the parallel applier + // produces enough mega-transactions per relay fetch to keep all workers + // busy, rather than merging everything into one huge transaction that + // only a single worker can process. + batchedCommitCount int + // maxBatchedCommits is the maximum number of source transactions to + // merge into one mega-transaction. Set once based on the relay log + // max items and worker count. + maxBatchedCommits int + // mergedSequences tracks sequence numbers of transactions that were + // merged into the current batch. These are advanced in the scheduler + // when the batch is actually enqueued (not before), so that + // commit-parent dependencies aren't prematurely satisfied. + mergedSequences []int64 + // ddlSeen is set to true when a DDL event is seen in the current fetch. + // The scheduleLoop checks this after scheduleItems returns and waits + // for the commitLoop to drain (so FK refs are refreshed) before + // starting the next fetch. Reset at the start of each scheduleItems call. + ddlSeen bool + // postDDLStalePlans records a snapshot of the tablePlans entries at the + // time an executed DDL was observed. Parallel scheduling is force- + // serialized as long as any plan in this snapshot is still the same + // object in the live tablePlans map. 
When a FIELD event for the DDL- + // affected table arrives, vplayer.applyEvent builds a new *TablePlan + // and stores it in tablePlans, replacing the stale pointer. At that + // point the guard clears for that table. + // + // This is per-table rather than global-version-based because vstreamer + // only emits FIELD on first-seen or remapped table ids. An unrelated + // table's FIELD would bump the global tablePlansVersion but not replace + // the DDL-affected table's plan pointer. + // + // nil means no DDL barrier is active. + postDDLStalePlans map[string]postDDLStalePlan + // postDDLDroppedTables records dropped tables that have been explicitly + // satisfied for the current DDL barrier. + postDDLDroppedTables map[string]struct{} + // postDDLConservative marks barriers from unparsed DDL, where we must keep + // serializing tracked-table transactions until every captured plan refreshes. + postDDLConservative bool +} + +// scheduleItems processes one relay log fetch worth of event batches. It tracks +// transaction boundaries (GTID → events → COMMIT), classifies transactions, +// builds writesets, handles batching of consecutive commits, and enqueues +// applyTxn structs into the scheduler. Empty transactions bypass the scheduler +// and are saved via unsavedEvent / idle timeout. 
+func (vp *vplayer) scheduleItems(ctx context.Context, scheduler *applyScheduler, state *parallelScheduleState, items [][]*binlogdatapb.VEvent) error { + stopPosReached := func(pos replication.Position) bool { + return !vp.stopPos.IsZero() && !pos.IsZero() && pos.AtLeast(vp.stopPos) + } + journalTerminates := func(event *binlogdatapb.VEvent) bool { + if event.Type != binlogdatapb.VEventType_JOURNAL || event.Journal == nil { + return false + } + if event.Journal.MigrationType != binlogdatapb.MigrationType_TABLES { + return true + } + jtables := make(map[string]struct{}, len(event.Journal.Tables)) + for _, table := range event.Journal.Tables { + jtables[table] = struct{}{} + } + found := false + notFound := false + for tableName := range vp.replicatorPlan.TablePlans { + if _, ok := jtables[tableName]; ok { + found = true + } else { + notFound = true + } + } + switch { + case found && notFound: + return true + case notFound: + return false + default: + return true + } + } + ddlTerminates := func(event *binlogdatapb.VEvent) bool { + return event.Type == binlogdatapb.VEventType_DDL && vp.vr.source.OnDdl == binlogdatapb.OnDDLAction_STOP + } + + // Snapshot FK refs under serialMu so we have a consistent view for this + // relay fetch. The commitLoop may update these after DDL events. + // pendingFieldRefreshTables is needed for FIELD events during normal + // replication (initial table plan setup), so we always clone it. + // postDDLDroppedTables and postDDLStalePlans can only be populated when + // OnDdl is EXEC or EXEC_IGNORE, so we skip that work otherwise. 
+ ddlExecEnabled := vp.vr.source.OnDdl == binlogdatapb.OnDDLAction_EXEC || + vp.vr.source.OnDdl == binlogdatapb.OnDDLAction_EXEC_IGNORE + vp.serialMu.Lock() + fkRefs := vp.fkRefs + parentFKRefs := vp.parentFKRefs + pendingFieldRefreshTables := maps.Clone(vp.pendingFieldRefreshTables) + if ddlExecEnabled { + state.postDDLDroppedTables = cloneDroppedTables(vp.postDDLDroppedTables) + if len(vp.postDDLStalePlans) != 0 { + if state.postDDLStalePlans == nil { + state.postDDLStalePlans = make(map[string]postDDLStalePlan, len(vp.postDDLStalePlans)) + } + for name, stale := range vp.postDDLStalePlans { + state.postDDLStalePlans[name] = clonePostDDLStalePlan(stale) + } + } + state.postDDLConservative = state.postDDLConservative || vp.postDDLConservative + } + vp.serialMu.Unlock() + + // After DDL events that may change schema or FK topology, force all + // remaining transactions in this relay fetch to serialize. The + // commitLoop will refresh FK metadata when the DDL commits, so the + // next relay fetch will have updated snapshots. + // + // If a previous DDL that was executed on the target changed the schema, + // force-serialize until the DDL-affected table's plan pointer has been + // replaced by a new FIELD event. We check by comparing the live + // tablePlans pointers to the snapshot taken at DDL time. When the + // affected table's plan is replaced (new *TablePlan from FIELD), the + // stale pointer no longer matches and the guard clears. + forceSerialize := false + if state.postDDLStalePlans != nil { + // Keep serializing until every affected table plan captured at DDL time + // has been replaced. Unrelated FIELD events must not clear the barrier. 
+ vp.tablePlansMu.Lock() + resolvedStalePlans := resolvedPostDDLStalePlans(vp.tablePlans, state.postDDLDroppedTables, state.postDDLStalePlans) + if retireResolvedPostDDLTablePlans(vp.tablePlans, resolvedStalePlans) { + vp.tablePlansVersion.Add(1) + } + state.postDDLStalePlans = unresolvedPostDDLStalePlans(vp.tablePlans, state.postDDLDroppedTables, state.postDDLStalePlans) + vp.tablePlansMu.Unlock() + if state.postDDLStalePlans == nil { + state.postDDLStalePlans = nil + state.postDDLDroppedTables = nil + state.postDDLConservative = false + vp.serialMu.Lock() + vp.postDDLStalePlans = nil + vp.postDDLConservative = false + vp.serialMu.Unlock() + } + } + state.ddlSeen = false + var fkBatchingResolvedTables map[string]struct{} + fkBatchingResolvedVersion := int64(-1) + writesetCache := &txnWritesetCache{fieldIdxCache: state.fieldIdxCache} + getFKBatchingSnapshot := func() (map[string]*TablePlan, map[string]struct{}) { + planSnapshot := snapshotTablePlans(vp.tablePlansMu, vp.tablePlans, vp.tablePlansVersion, &state.cachedPlanVersion, state.cachedPlanSnapshot) + state.cachedPlanSnapshot = planSnapshot + if fkBatchingResolvedVersion == state.cachedPlanVersion { + return planSnapshot, fkBatchingResolvedTables + } + fkBatchingResolvedTables = buildResolvedFKRefTableSet(fkRefs, parentFKRefs, buildCanonicalTargetTableNames(planSnapshot)) + fkBatchingResolvedVersion = state.cachedPlanVersion + return planSnapshot, fkBatchingResolvedTables + } + + flush := func(commitOnly bool) error { + if len(state.curEvents) == 0 && !commitOnly { + return nil + } + order := vp.parallelOrder.Add(1) + lastTs, lastCT := computeLastEventTimestamp(state.curEvents) + payload := acquireApplyTxnPayload() + payload.pos = state.curPos + payload.timestamp = state.curTimestamp + payload.mustSave = state.curMustSave + payload.events = state.curEvents + payload.rowOnly = state.curRowOnly + payload.commitOnly = commitOnly + payload.updatePosOnly = false + payload.lastEventTimestamp = lastTs + 
payload.lastEventCurrentTime = lastCT + // query/commit/client are left nil here; the worker will + // set them to its own connection before sending to commitCh. + txn := acquireApplyTxn() + txn.order = order + if !state.batchMissingCommitMeta { + txn.sequenceNumber = state.curSequence + txn.commitParent = state.curCommitParent + txn.hasCommitMeta = state.curHasCommitMeta + } + txn.payload = payload + postDDLSerialize := state.postDDLStalePlans != nil && txnTouchesPostDDLBarrier(state.curEvents, state.postDDLStalePlans, state.postDDLConservative) + if forceSerialize { + txn.forceGlobal = true + } else if postDDLSerialize { + txn.forceGlobal = true + } else if state.curRowOnlySet && !state.curRowOnly { + txn.forceGlobal = true + } else if len(vp.copyState) != 0 { + txn.forceGlobal = true + } else if state.curHasFieldEvent && txnNeedsFieldRefreshSerialization(state.curEvents) { + txn.forceGlobal = true + } else if txnTouchesPendingFieldRefresh(state.curEvents, pendingFieldRefreshTables) { + txn.forceGlobal = true + } else { + planSnapshot := snapshotTablePlans(vp.tablePlansMu, vp.tablePlans, vp.tablePlansVersion, &state.cachedPlanVersion, state.cachedPlanSnapshot) + state.cachedPlanSnapshot = planSnapshot + if state.planFlagsVersion != state.cachedPlanVersion { + state.planHasExtraUniqueSecondary = false + state.planHasUnsupportedWritesetMapping = false + for _, plan := range planSnapshot { + if plan == nil { + continue + } + if plan.HasExtraUniqueSecondary { + state.planHasExtraUniqueSecondary = true + } + if plan.HasUnsupportedWritesetMapping { + state.planHasUnsupportedWritesetMapping = true + } + } + state.planFlagsVersion = state.cachedPlanVersion + } + extraUniqueTouched := state.planHasExtraUniqueSecondary && txnTouchesExtraUniqueSecondary(state.curEvents, planSnapshot) + unsupportedTouched := state.planHasUnsupportedWritesetMapping && txnTouchesUnsupportedWritesetMapping(state.curEvents, planSnapshot) + if extraUniqueTouched || unsupportedTouched { + 
txn.forceGlobal = true + } else { + // Invalidate fieldIdxCache when table plans change (new FIELD events). + if state.fieldIdxCacheVersion != state.cachedPlanVersion { + state.fieldIdxCache = make(map[string]map[string]int) + state.fieldIdxCacheVersion = state.cachedPlanVersion + writesetCache = &txnWritesetCache{fieldIdxCache: state.fieldIdxCache} + } else if writesetCache == nil { + writesetCache = &txnWritesetCache{fieldIdxCache: state.fieldIdxCache} + } + writeset, err := buildTxnWritesetWithCache(planSnapshot, fkRefs, parentFKRefs, state.curEvents, writesetCache) + if err != nil { + if writesetErrorForcesSerialization(err) { + txn.forceGlobal = true + } else { + releaseApplyTxn(txn) + return err + } + } else { + txn.writeset = writeset + } + } + } + // Attach any merged-away sequences to the txn so the scheduler can + // advance lastCommittedSequence for them when this batch actually + // commits (inside markCommitted), not now at enqueue time. Advancing + // at enqueue would satisfy commit-parent dependencies for later + // empty-writeset txns before the batch containing those sequences + // has actually committed. + if len(state.mergedSequences) > 0 { + txn.mergedSequences = append(txn.mergedSequences[:0], state.mergedSequences...) + state.mergedSequences = state.mergedSequences[:0] + } + if state.batchMissingCommitMeta && state.curHasCommitMeta && state.curSequence > 0 { + txn.mergedSequences = append(txn.mergedSequences, state.curSequence) + } + // Increment pendingFieldRefreshTables BEFORE scheduler.enqueue so the + // counter is visible to commitLoop's matching decrement (parallel_apply.go + // ~L2148-2160). Otherwise a worker could pick up this txn and commitLoop + // could observe an empty map (no-op decrement) before this increment + // runs, leaving the counter permanently stuck at 1 — every future ROW + // txn touching this table would then be force-serialized for the + // lifetime of the workflow. 
+ // Skip the full event scan in the common rowOnly case; curHasFieldEvent + // already tracks whether any FIELD event was accumulated (the + // commitLoop side has the analogous map-emptiness guard). + var refreshedTables map[string]struct{} + if state.curHasFieldEvent { + refreshedTables = extractFieldRefreshTables(payload.events) + } + if len(refreshedTables) != 0 { + if pendingFieldRefreshTables == nil { + pendingFieldRefreshTables = make(map[string]int, len(refreshedTables)) + } + vp.serialMu.Lock() + if vp.pendingFieldRefreshTables == nil { + vp.pendingFieldRefreshTables = make(map[string]int, len(refreshedTables)) + } + for tableName := range refreshedTables { + pendingFieldRefreshTables[tableName]++ + vp.pendingFieldRefreshTables[tableName]++ + } + vp.serialMu.Unlock() + } + if err := scheduler.enqueue(txn); err != nil { + // Roll back the increment so a transient enqueue error + // (scheduler closed during teardown, ctx cancellation) does not + // leave the table permanently force-serialized after restart. + if len(refreshedTables) != 0 { + vp.serialMu.Lock() + for tableName := range refreshedTables { + pendingFieldRefreshTables[tableName]-- + if pendingFieldRefreshTables[tableName] <= 0 { + delete(pendingFieldRefreshTables, tableName) + } + if remaining := vp.pendingFieldRefreshTables[tableName] - 1; remaining > 0 { + vp.pendingFieldRefreshTables[tableName] = remaining + } else { + delete(vp.pendingFieldRefreshTables, tableName) + } + } + vp.serialMu.Unlock() + } + // Return the unsent txn to the pool, matching the DDL/OTHER/JOURNAL + // and enqueueCommitOnly paths: a retry storm must not defeat the + // pool by leaking one applyTxn + payload per failed enqueue. + releaseApplyTxn(txn) + return err + } + // Pre-allocate with capacity 16 to avoid the nil→1→2→4→8 growth + // pattern on the hot path. We can't reuse the old slice via [:0] + // because the payload still references the backing array. 
+ state.curEvents = make([]*binlogdatapb.VEvent, 0, 16) + state.curRowOnly = false + state.curRowOnlySet = false + state.curMustSave = false + state.curTimestamp = 0 + state.curCommitParent = 0 + state.curSequence = 0 + state.curHasCommitMeta = false + state.curHasFieldEvent = false + state.batchMissingCommitMeta = false + state.batchedCommitCount = 0 + state.lastFlushTime = time.Now() + return nil + } + + for i := range items { + for j := 0; j < len(items[i]); j++ { + event := items[i][j] + switch event.Type { + case binlogdatapb.VEventType_GTID: + pos, err := binlogplayer.DecodePosition(event.Gtid) + if err != nil { + return err + } + state.curPos = pos + state.curCommitParent = event.CommitParent + state.curSequence = event.SequenceNumber + state.curHasCommitMeta = event.SequenceNumber != 0 || event.CommitParent != 0 + if !state.curHasCommitMeta { + state.batchMissingCommitMeta = true + } + vp.serialMu.Lock() + vp.pos = pos + vp.unsavedEvent = nil + vp.serialMu.Unlock() + case binlogdatapb.VEventType_ROW: + state.curEvents = append(state.curEvents, event) + if !state.curRowOnlySet { + state.curRowOnly = true + state.curRowOnlySet = true + } + case binlogdatapb.VEventType_COMMIT: + posReached := stopPosReached(state.curPos) + state.curMustSave = posReached + if !txnNeedsWorker(state.curEvents) { + if state.curMustSave { + eventCopy := event + if err := vp.enqueueCommitOnly(ctx, scheduler, eventCopy, true, true, state.curSequence, state.curCommitParent, state.curHasCommitMeta); err != nil { + return err + } + return io.EOF + } + + now := time.Now() + queuePositionSave := false + vp.serialMu.Lock() + if time.Since(vp.timeLastSaved) >= idleTimeout { + vp.timeLastSaved = now + queuePositionSave = true + } + vp.serialMu.Unlock() + if queuePositionSave { + state.lastHeartbeatRefresh = now + eventCopy := event + if err := vp.enqueueCommitOnly(ctx, scheduler, eventCopy, true, true, state.curSequence, state.curCommitParent, state.curHasCommitMeta); err != nil { + return 
err + } + } else { + // During catch-up, a stream may continuously process + // empty transactions (from other shards' data) that + // keep the scheduleLoop busy, so the idle timeout at + // the top of the loop never fires. Periodically refresh + // time_updated directly via SQL to keep + // max_v_replication_lag fresh until the next ordered + // position save is queued. + needRefresh := time.Since(state.lastHeartbeatRefresh) >= idleTimeout + if needRefresh { + state.lastHeartbeatRefresh = now + vp.serialMu.Lock() + err := vp.vr.updateHeartbeatTime(now.Unix()) + vp.serialMu.Unlock() + if err != nil { + return err + } + } + vp.serialMu.Lock() + vp.unsavedEvent = event + vp.serialMu.Unlock() + // Advance lastCommittedSequence immediately only when the + // empty transaction stays on the unsavedEvent path. + // Queued position saves publish their sequence on commit. + if state.curHasCommitMeta { + scheduler.advanceCommittedSequence(state.curSequence) + } + } + // Advance lastCommittedSequence immediately for this empty + // transaction. Empty txns have no data effects, so their + // commit-parent dependency is trivially satisfied. Deferring + // the advance (via mergedSequences) would deadlock: a later + // txn with commitParent=thisSequence and empty writeset would + // block forever waiting for lastCommittedSequence to reach + // its commitParent, but the deferred sequence only publishes + // when that later txn commits — a circular dependency. + // + // markCommitted() uses max() for lastCommittedSequence, so + // this early advance cannot regress the watermark when a + // later txn commits with a lower sequence number. 
+ state.curEvents = make([]*binlogdatapb.VEvent, 0, 16) + state.curRowOnly = false + state.curRowOnlySet = false + state.curHasFieldEvent = false + state.curMustSave = false + state.curTimestamp = 0 + state.curCommitParent = 0 + state.curSequence = 0 + state.curHasCommitMeta = false + state.batchMissingCommitMeta = false + continue + } + // Group multiple consecutive transactions into a single batch + // to reduce the number of MySQL COMMITs. This mirrors the serial + // applier's hasAnotherCommit lookahead. If another COMMIT is + // ahead in this relay batch and we don't need to force-save, + // skip the flush and let events accumulate. The next GTID will + // update curPos/curSequence/curCommitParent and the accumulated + // events will be flushed as one larger transaction. + // + // Time-based bound: during heavy catch-up, heartbeats don't + // arrive to set curMustSave. Without a time bound, a single + // batch can grow for 30+ seconds, keeping time_updated stale + // and max_v_replication_lag stuck at 1+. Force a flush every + // 500ms to keep lag fresh. + if !state.lastFlushTime.IsZero() && time.Since(state.lastFlushTime) > 500*time.Millisecond { + state.curMustSave = true + } + // When the current transaction touches FK-related tables, + // skip batching to keep writesets small. Merging parent/ + // child operations into one mega-transaction would make + // nearly all batches conflict on FK ref keys, serializing + // the workload. Flushing each source transaction + // individually lets the scheduler detect truly independent + // transactions and run them in parallel. Transactions on + // unrelated tables can still batch normally. 
+ hasFKRefs := false + if len(fkRefs) > 0 || len(parentFKRefs) > 0 { + planSnapshot, resolvedFKRefTables := getFKBatchingSnapshot() + for _, ev := range state.curEvents { + if ev.Type != binlogdatapb.VEventType_ROW || ev.RowEvent == nil { + continue + } + tableName := ev.RowEvent.TableName + if plan := planSnapshot[tableName]; plan != nil { + tableName = plan.TargetName + } + if _, ok := resolvedFKRefTables[tableName]; ok { + hasFKRefs = true + break + } + } + } + // With parallel workers, limit the mega-transaction size + // to ensure enough transactions for all workers. Without + // this limit, all consecutive commits in a relay fetch + // merge into one mega-transaction, leaving all but one + // worker idle. + if state.maxBatchedCommits > 0 { + state.batchedCommitCount++ + if state.batchedCommitCount >= state.maxBatchedCommits { + state.curMustSave = true + } + } + if !state.curMustSave && !hasFKRefs && hasAnotherCommit(items, i, j+1) { + // Track merged sequence numbers so they can be advanced + // when the batch actually commits. We must NOT advance + // lastCommittedSequence here because the batch hasn't + // committed yet. Empty-writeset transactions that depend + // on commit-parent ordering would otherwise become + // runnable too early. + if state.curHasCommitMeta { + state.mergedSequences = append(state.mergedSequences, state.curSequence) + } + // Reset only metadata — keep accumulated events and + // rowOnly state. The next GTID will set new metadata. + state.curCommitParent = 0 + state.curSequence = 0 + state.curHasCommitMeta = false + state.curMustSave = false + continue + } + if err := flush(false); err != nil { + return err + } + if posReached { + return io.EOF + } + case binlogdatapb.VEventType_BEGIN: + // No-op: BEGIN is handled on-demand by workers when they encounter + // ROW/FIELD events (via activeDBClient().Begin()). 
We intentionally + // do NOT add BEGIN to curEvents so that empty transactions + // (GTID→BEGIN→COMMIT) have curEvents=0 and take the fast path + // (unsavedEvent) instead of being enqueued through the scheduler. + case binlogdatapb.VEventType_FIELD: + // FIELD events carry table metadata (column definitions) and + // must be applied before the ROW events that follow them, but + // they are emitted routinely by MySQL at the start of each + // transaction — they do not indicate a schema change. The + // execution plan only actually changes after DDL, which + // already sets forceSerialize. Accumulate FIELD events like + // ROW events so they stay in the same applyTxn. + state.curEvents = append(state.curEvents, event) + state.curHasFieldEvent = true + case binlogdatapb.VEventType_INSERT, + binlogdatapb.VEventType_DELETE, + binlogdatapb.VEventType_UPDATE, + binlogdatapb.VEventType_REPLACE, + binlogdatapb.VEventType_SAVEPOINT: + // Statement-based DML events are supported for external MySQL + // streams. Keep them in the transaction payload, but classify the + // transaction as non-row-only so it serializes like the serial + // applier's statement path. 
+ state.curEvents = append(state.curEvents, event) + state.curRowOnly = false + state.curRowOnlySet = true + case binlogdatapb.VEventType_DDL, binlogdatapb.VEventType_OTHER, binlogdatapb.VEventType_JOURNAL: + if err := flush(false); err != nil { + return err + } + posReached := stopPosReached(state.curPos) + order := vp.parallelOrder.Add(1) + vp.serialMu.Lock() + query := vp.query + commit := vp.commit + client := vp.dbClient + vp.serialMu.Unlock() + payload := acquireApplyTxnPayload() + payload.pos = state.curPos + payload.timestamp = event.Timestamp + payload.mustSave = true + payload.events = []*binlogdatapb.VEvent{event} + payload.rowOnly = false + payload.commitOnly = true + payload.updatePosOnly = false + payload.query = query + payload.commit = commit + payload.client = client + payload.lastEventTimestamp = event.Timestamp + payload.lastEventCurrentTime = event.CurrentTime + txn := acquireApplyTxn() + txn.order = order + txn.sequenceNumber = state.curSequence + txn.commitParent = state.curCommitParent + txn.hasCommitMeta = state.curHasCommitMeta + txn.forceGlobal = true + // OTHER events and DDL events with OnDdl=IGNORE only update the + // replication position — they never touch user table data. Marking + // them noConflict lets workers pick them up immediately without + // waiting for all inflight row transactions to drain first. The + // commitLoop still enforces strict ordering, so the position write + // happens after all prior commits. This eliminates the forceGlobal + // serialization stall that occurs during Online DDL cutover when the + // RENAME TABLE DDL event arrives while workers are still applying rows. 
+ txn.noConflict = event.Type == binlogdatapb.VEventType_OTHER || + (event.Type == binlogdatapb.VEventType_DDL && vp.vr.source.OnDdl == binlogdatapb.OnDDLAction_IGNORE) + txn.payload = payload + if err := scheduler.enqueue(txn); err != nil { + // Return the unsent txn to the pool so a retry storm + // (scheduler close + workflow restart in a tight loop) + // does not defeat the pool by leaking one applyTxn + + // payload per failed enqueue. + releaseApplyTxn(txn) + return err + } + // DDL that is actually executed on the target (EXEC, EXEC_IGNORE) + // may change schema or FK topology. Force all remaining + // transactions in this relay fetch to serialize so they don't + // use stale FK refs or table plans for writeset computation. + // Also set state.ddlSeen so the scheduleLoop waits for the + // commitLoop to refresh FK metadata before the next fetch. + // Record the current tablePlansVersion so that force- + // serialization persists until workers apply new FIELD events + // that bump the version past this snapshot. + // + // IGNORE and STOP DDLs do not modify the target schema, so + // they don't need the barrier. STOP terminates the workflow + // entirely. IGNORE just advances the position. + if event.Type == binlogdatapb.VEventType_DDL && + (vp.vr.source.OnDdl == binlogdatapb.OnDDLAction_EXEC || + vp.vr.source.OnDdl == binlogdatapb.OnDDLAction_EXEC_IGNORE) { + forceSerialize = true + state.ddlSeen = true + } + if posReached || journalTerminates(event) || ddlTerminates(event) { + return io.EOF + } + case binlogdatapb.VEventType_HEARTBEAT: + // Handle heartbeats inline without enqueuing through the scheduler. + // Heartbeats are very frequent (~250/sec) and enqueuing them as + // forceGlobal transactions serializes the entire pipeline, making + // VDiff sync cycles extremely slow. + // + // If we have accumulated events, force the next COMMIT to flush + // instead of continuing to batch. 
Without this, commit batching + // can create unbounded super-transactions during catch-up replay, + // starving time_updated refreshes and causing max_v_replication_lag + // to stay high indefinitely. Heartbeats arrive regularly from the + // source (~1/sec), providing a natural bound on batch size. + if len(state.curEvents) > 0 { + state.curMustSave = true + } + // + // Must hold serialMu for DB writes (updateTimeThrottled, + // recordHeartbeat) because they use vr.dbClient, which may + // also be in use by the commitLoop for commitOnly transactions. + if event.Throttled { + vp.serialMu.Lock() + err := vp.vr.updateTimeThrottled(throttlerapp.VStreamerName, event.ThrottledReason) + vp.serialMu.Unlock() + if err != nil { + return err + } + } + vp.serialMu.Lock() + vp.numAccumulatedHeartbeats++ + err := vp.recordHeartbeat() + vp.serialMu.Unlock() + if err != nil { + return err + } + // Update lag from heartbeat timestamp. + if event.Timestamp != 0 && !event.Throttled { + tsNs := event.Timestamp * 1e9 + now := time.Now().UnixNano() + offset := now - event.CurrentTime + vp.storeLagSnapshot(tsNs, offset) + lag := now - tsNs - offset + if lag >= 0 { + lagSecs := lag / 1e9 + vp.vr.stats.ReplicationLagSeconds.Store(lagSecs) + vp.vr.stats.VReplicationLagGauges.Set(vp.idStr, lagSecs) + } + } else if event.Throttled { + // When the vstreamer is throttled, we can't determine the + // actual lag from the event. Estimate it from the last known + // timestamp, matching the serial applier's estimateLag(). + snap := vp.loadLagSnapshot() + if snap.timestampNs > 0 { + behind := time.Now().UnixNano() - snap.timestampNs - snap.offsetNs + if behind >= 0 { + behindSecs := behind / 1e9 + vp.vr.stats.ReplicationLagSeconds.Store(behindSecs) + vp.vr.stats.VReplicationLagGauges.Set(vp.idStr, behindSecs) + } + } + } + case binlogdatapb.VEventType_ROWS_QUERY: + // Informational only; keep it with the surrounding txn if present. 
+ // vstreamer emits ROWS_QUERY ahead of the ROW events it describes, + // so this metadata must not force serialization on its own. + state.curEvents = append(state.curEvents, event) + case binlogdatapb.VEventType_VERSION: + // VERSION is informational only for the applier. Preserve the + // old serial behavior by ignoring it instead of failing the stream. + default: + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "unsupported vevent type: %v", event.Type) + } + if event.Timestamp != 0 { + state.curTimestamp = event.Timestamp + } + } + } + return nil +} + +// enqueueCommitOnly creates a commitOnly transaction and enqueues it into the +// scheduler. Used for DDL, OTHER, JOURNAL events, and position-only saves +// (idle timeout). These transactions are applied by the commitLoop on the main +// connection, not by workers. +func (vp *vplayer) enqueueCommitOnly(ctx context.Context, scheduler *applyScheduler, event *binlogdatapb.VEvent, mustSave bool, updatePosOnly bool, sequenceNumber int64, commitParent int64, hasCommitMeta bool) error { + var order int64 + var pos replication.Position + var query func(ctx context.Context, sql string) (*sqltypes.Result, error) + var commit func() error + var client *vdbClient + order = vp.parallelOrder.Add(1) + vp.serialMu.Lock() + pos = vp.pos + query = vp.query + commit = vp.commit + client = vp.dbClient + vp.serialMu.Unlock() + payload := acquireApplyTxnPayload() + payload.pos = pos + payload.timestamp = event.Timestamp + payload.mustSave = mustSave + payload.events = []*binlogdatapb.VEvent{event} + payload.rowOnly = false + payload.commitOnly = true + payload.updatePosOnly = updatePosOnly + payload.query = query + payload.commit = commit + payload.client = client + payload.lastEventTimestamp = event.Timestamp + payload.lastEventCurrentTime = event.CurrentTime + txn := acquireApplyTxn() + txn.order = order + txn.sequenceNumber = sequenceNumber + txn.commitParent = commitParent + txn.hasCommitMeta = hasCommitMeta + 
txn.forceGlobal = true + txn.noConflict = updatePosOnly + txn.payload = payload + // commitOnly transactions carry a pooled done channel like any other + // txn, but it is unused: workers forward them directly to commitCh + // without waiting for completion. + if err := scheduler.enqueue(txn); err != nil { + // Match the DDL/OTHER/JOURNAL branch above: return the unsent txn + // to the pool so retry storms don't leak per-call. + releaseApplyTxn(txn) + return err + } + return nil +} + +// workerLoop runs on each of the N worker goroutines. It blocks on +// scheduler.nextReady() until a transaction is dispatched, applies the row +// events using the worker's private MySQL connection, then sends the txn +// to commitCh. Each worker has double-buffered connections: after sending +// a transaction, the worker rotates to its spare connection and immediately +// starts the next transaction, overlapping apply with the commitLoop's commit. +func (vp *vplayer) workerLoop(ctx context.Context, scheduler *applyScheduler, commitCh chan<- *applyTxn, worker *applyWorker) error { + // Workers only apply ROW/FIELD/ROWS_QUERY events. Build a narrow local + // vplayer view once and refresh the DDL barrier snapshots per transaction + // under serialMu, instead of racing on a whole-struct shallow copy. + workerVP := workerLocalVPlayer(vp) + + // pendingDone holds the done channel of the most recently sent worker + // transaction that the commitLoop may still be committing. We capture + // only the channel (not the *applyTxn) because the commitLoop returns + // the applyTxn to the pool after signaling done — if the scheduleLoop + // reacquires it, it drains the channel, which would cause waitPending + // to block forever if we were still dereferencing through the txn. 
+ var pendingDone chan struct{} + + waitPending := func() error { + if pendingDone == nil { + return nil + } + select { + case <-pendingDone: + pendingDone = nil + return nil + case <-ctx.Done(): + return ctx.Err() + } + } + + // Register a single ctx.AfterFunc for the lifetime of this worker. + // On ctx cancellation, close whichever client is currently executing + // MySQL calls — published via activeApplyClient. Registering per-txn + // allocates a new closure + runtime bookkeeping on every transaction; + // this hoists both to once per worker. + var activeApplyClient atomic.Pointer[vdbClient] + stopInterrupt := context.AfterFunc(ctx, func() { + if c := activeApplyClient.Load(); c != nil { + c.Close() + } + }) + defer stopInterrupt() + + // Hoist the OnDdl check: the source's OnDdl action is fixed for the + // lifetime of the workflow, so we compute once rather than per-txn. + // When DDL execution is disabled, the worker can skip the per-txn + // serialMu acquisition that clones postDDL bookkeeping (since those + // maps stay empty in that mode). + ddlExecEnabled := vp.vr.source.OnDdl == binlogdatapb.OnDDLAction_EXEC || + vp.vr.source.OnDdl == binlogdatapb.OnDDLAction_EXEC_IGNORE + + for { + if ctx.Err() != nil { + return ctx.Err() + } + txn, err := scheduler.nextReady(ctx) + if err != nil { + return err + } + payload := txn.payload + if payload.commitOnly { + // Forward commitOnly txns (DDL, OTHER, JOURNAL, position saves) + // to the commitLoop immediately without waiting for any pending + // worker commit. commitOnly work runs on the main connection, not + // the worker's connection, so it has no dependency on the prior + // row txn's commit. The commitLoop enforces strict ordering via + // nextOrder regardless of when the txn arrives in commitCh. + select { + case commitCh <- txn: + case <-ctx.Done(): + return ctx.Err() + } + continue + } + + // Apply events on the current active connection. 
This runs + // concurrently with the commitLoop committing the previous + // transaction on the other connection (double-buffering). + // Publish the current worker client so the worker-scoped + // context.AfterFunc can close it if ctx is cancelled. + activeApplyClient.Store(worker.client) + // DDL bookkeeping (postDDLStalePlans, postDDLDroppedTables) is only + // populated when OnDdl is EXEC or EXEC_IGNORE. In the default IGNORE + // mode, these maps stay empty for the workflow's lifetime, so we can + // skip the serialMu acquisition and per-txn clone entirely. Taking + // serialMu here on every worker txn was the dominant contention point + // under parallel apply on OnDdl=IGNORE workflows. + if ddlExecEnabled { + vp.serialMu.Lock() + workerVP.postDDLStalePlans = clonePostDDLStalePlans(vp.postDDLStalePlans) + workerVP.postDDLDroppedTables = cloneDroppedTables(vp.postDDLDroppedTables) + vp.serialMu.Unlock() + } + for _, event := range payload.events { + if err := worker.applyEvent(ctx, event, payload.mustSave, &workerVP); err != nil { + activeApplyClient.Store(nil) + worker.rollback() + if ctx.Err() != nil { + return ctx.Err() + } + return err + } + } + // In batch mode, flush all buffered SQL statements to MySQL in + // one multi-statement call. This is the key parallelism point: + // all workers execute their batches concurrently here, while the + // commitLoop only needs to do a cheap COMMIT + position update. + if err := worker.flushWorkerBatch(); err != nil { + activeApplyClient.Store(nil) + worker.rollback() + if ctx.Err() != nil { + return ctx.Err() + } + return err + } + activeApplyClient.Store(nil) + + // Wait for the previous transaction's commit to complete. Because + // we waited AFTER applying the current transaction, the apply and + // commit phases overlapped — this is the key pipelining benefit. + // If the commit finished during our apply phase, this returns + // immediately. 
We must wait here because rotate() switches to the + // connection that the commitLoop was using for the previous txn. + if err := waitPending(); err != nil { + worker.rollback() + if ctx.Err() != nil { + return ctx.Err() + } + return err + } + + // Capture the current connection for the payload before rotating. + // The commitLoop will use these to commit this transaction while + // the worker moves on to the next transaction on the spare connection. + // + // In batch mode we leave payload.query/commit nil and let commitLoop + // dispatch directly off payload.client via AddQueryToTrxBatch + + // CommitTrxQueryBatch. The commit still sends "UPDATE …;commit" in + // one multi-statement round-trip (the combine-commit win), but we + // avoid allocating two closures per mega-txn just to hold a reference + // to the worker's active connection. + payload.client = worker.client + if !worker.batchMode { + payload.query = worker.query + payload.commit = worker.commit + } + + done := txn.done + select { + case commitCh <- txn: + case <-ctx.Done(): + worker.rollback() + return ctx.Err() + } + + // done was read from txn BEFORE the send on commitCh above: once the + // commitLoop owns the txn it may return it to the pool after signaling + // done, and acquireApplyTxn drains the channel on reuse. By holding our + // own reference, we are immune to that race. + pendingDone = done + + // Rotate to the spare connection for the next transaction. + // The commitLoop will commit the current txn on the old connection + // and signal txn.done when it's safe to reuse. + worker.rotate() + } +} + +// commitLoop receives completed transactions from workers via commitCh and +// commits them in strict order (by the order field). For worker transactions, +// it executes the position update and commit on the worker's connection +// WITHOUT holding serialMu, then briefly locks to update vp state. +// For commitOnly transactions, it applies events on the main connection +// under serialMu. 
+func (vp *vplayer) commitLoop(ctx context.Context, scheduler *applyScheduler, commitCh <-chan *applyTxn) error { + updateLag := func(payload *applyTxnPayload) { + if payload.lastEventTimestamp != 0 { + tsNs := payload.lastEventTimestamp * 1e9 + now := time.Now().UnixNano() + offset := now - payload.lastEventCurrentTime + vp.storeLagSnapshot(tsNs, offset) + lag := now - tsNs - offset + if lag >= 0 { + lagSecs := lag / 1e9 + vp.vr.stats.ReplicationLagSeconds.Store(lagSecs) + vp.vr.stats.VReplicationLagGauges.Set(vp.idStr, lagSecs) + return + } + } + snap := vp.loadLagSnapshot() + behind := time.Now().UnixNano() - snap.timestampNs - snap.offsetNs + behindSecs := behind / 1e9 + vp.vr.stats.ReplicationLagSeconds.Store(behindSecs) + vp.vr.stats.VReplicationLagGauges.Set(vp.idStr, behindSecs) + } + + // commitWorkerTxn handles a worker's row transaction. It executes the + // position update SQL, optional stop-state update, and commit on the + // worker's private MySQL connection WITHOUT holding serialMu. This avoids + // blocking the scheduleLoop during slow MySQL commits. + commitWorkerTxn := func(txn *applyTxn) error { + if ctx.Err() != nil { + return ctx.Err() + } + payload := txn.payload + dbClient := payload.client + if dbClient == nil { + dbClient = vp.activeDBClient() + } + + // Worker batch-mode fast path: the worker set payload.client but left + // payload.query/commit nil so we wouldn't allocate a closure per + // mega-txn just to hold a reference to its connection. Use the client + // directly here. The AddQueryToTrxBatch + CommitTrxQueryBatch pair + // still sends "UPDATE _vt.vreplication …;commit" in a single + // multi-statement round-trip. 
+ var posReached bool + if payload.client != nil && payload.query == nil && payload.commit == nil { + if err := payload.client.AddQueryToTrxBatch(vp.generateUpdatePosQuery(payload.pos, payload.timestamp)); err != nil { + return err + } + posReached = !vp.stopPos.IsZero() && payload.pos.AtLeast(vp.stopPos) + if posReached { + if err := vp.setStopPositionStateImmediate(dbClient); err != nil { + return err + } + } + if err := payload.client.CommitTrxQueryBatch(); err != nil { + return err + } + } else { + queryFn := payload.query + if queryFn == nil { + queryFn = vp.query + } + commitFn := payload.commit + if commitFn == nil { + commitFn = vp.commit + } + var err error + posReached, err = vp.updatePosWithoutStop(ctx, payload.pos, payload.timestamp, queryFn) + if err != nil { + return err + } + if posReached { + if err := vp.setStopPositionStateImmediate(dbClient); err != nil { + return err + } + } + if err := commitFn(); err != nil { + return err + } + } + + // Briefly lock to update vp state that scheduleLoop reads. + // Do NOT clear vp.unsavedEvent here: a later empty transaction + // may have recorded a higher position that hasn't been flushed yet. + // The idle-timeout saver at the top of scheduleLoop will handle it. + vp.serialMu.Lock() + vp.recordPositionSave(payload.pos, false) + // Skip the per-commit FIELD refresh scan when neither map has entries. + // The common ROW-only steady state has no FIELD events to process, + // and extractFieldRefreshTables otherwise does a full payload scan + // that returns nil on every call. 
+ if len(vp.pendingFieldRefreshTables) != 0 || len(vp.postDDLDroppedTables) != 0 { + for refreshedName := range extractFieldRefreshTables(payload.events) { + if vp.pendingFieldRefreshTables != nil { + key := canonicalPostDDLTableKey(vp.pendingFieldRefreshTables, refreshedName) + if remaining := vp.pendingFieldRefreshTables[key] - 1; remaining > 0 { + vp.pendingFieldRefreshTables[key] = remaining + } else { + delete(vp.pendingFieldRefreshTables, key) + } + } + delete(vp.postDDLDroppedTables, canonicalPostDDLTableKey(vp.postDDLDroppedTables, refreshedName)) + } + } + vp.serialMu.Unlock() + + updateLag(payload) + + // Release scheduler inflight state BEFORE signaling the worker. If + // markCommitted errors (scheduler closed during teardown), we want + // the commitLoop to observe the error and unwind rather than letting + // the worker race ahead to its next txn. + if err := scheduler.markCommitted(txn); err != nil { + return err + } + + // Signal the worker that commit is done so it can reuse its + // DB connection for the next transaction. + txn.done <- struct{}{} + + if posReached { + return io.EOF + } + return nil + } + + // commitOnlyTxn handles commitOnly transactions (DDL, OTHER, JOURNAL, + // position-only saves). These run on the main connection under serialMu. 
+ commitOnlyTxn := func(txn *applyTxn) error { + if ctx.Err() != nil { + return ctx.Err() + } + payload := txn.payload + dbClient := payload.client + if dbClient == nil { + dbClient = vp.activeDBClient() + } + vp.serialMu.Lock() + defer vp.serialMu.Unlock() + + if payload.updatePosOnly { + savePos := payload.pos + if savePos.IsZero() { + savePos = vp.pos + } + posReached := !savePos.IsZero() && !vp.stopPos.IsZero() && savePos.AtLeast(vp.stopPos) + if posReached && vp.saveStop { + if err := dbClient.BeginImmediate(); err != nil { + return err + } + if _, err := dbClient.ExecuteFetch(vp.generateUpdatePosQuery(savePos, payload.timestamp), 1); err != nil { + return fmt.Errorf("error %v updating position", err) + } + if err := vp.setStopPositionStateImmediate(dbClient); err != nil { + return err + } + if err := dbClient.Commit(); err != nil { + return err + } + vp.recordPositionSave(savePos, false) + updateLag(payload) + if err := scheduler.markCommitted(txn); err != nil { + return err + } + return io.EOF + } + + queryFn := payload.query + if queryFn == nil { + queryFn = vp.query + } + + posReached, err := vp.updatePosWithoutStop(ctx, savePos, payload.timestamp, queryFn) + if err != nil { + return err + } + if payload.timestamp == 0 { + if err := vp.vr.updateHeartbeatTime(time.Now().Unix()); err != nil { + return err + } + } + vp.recordPositionSave(savePos, false) + if posReached { + if err := vp.setStopPositionState(dbClient); err != nil { + return err + } + } + updateLag(payload) + if err := scheduler.markCommitted(txn); err != nil { + return err + } + if posReached { + return io.EOF + } + return nil + } + + // Temporarily swap pos for the main connection's updatePos call. 
+ prevPos := vp.pos + if !payload.pos.IsZero() { + vp.pos = payload.pos + } + defer func() { vp.pos = prevPos }() + + // applyEvent handles position updates internally for DDL, OTHER, + // and JOURNAL events, and returns io.EOF when the stop position + // is reached or when a JOURNAL forces termination. We therefore + // do NOT call updatePos again below — doing so would produce a + // redundant _vt.vreplication write and create an awkward + // partial-failure window where applyEvent succeeded but a second + // position write could fail. + event := payload.events[0] + ddlExecuted := false + publishExecIgnoreDDLBarrier := false + var terminalErr error + if event.Type == binlogdatapb.VEventType_DDL { + var err error + ddlExecuted, err = vp.applyDDLEvent(ctx, event, nil) + if err != nil { + if !errors.Is(err, io.EOF) { + return err + } + terminalErr = err + } + if !ddlExecuted && vp.vr.source.OnDdl == binlogdatapb.OnDDLAction_EXEC_IGNORE { + publishExecIgnoreDDLBarrier, err = shouldPublishExecIgnoreDDLBarrier(ctx, vp, event.Statement) + if err != nil { + return vterrors.Wrapf(err, "failed to inspect EXEC_IGNORE DDL target schema") + } + } + } else if err := vp.applyEvent(ctx, event, payload.mustSave); err != nil { + if !errors.Is(err, io.EOF) { + return err + } + terminalErr = err + } + // After EXEC DDLs and all EXEC_IGNORE DDLs, refresh FK metadata so + // that ADD/DROP FOREIGN KEY changes are reflected in subsequent + // writeset conflict detection. EXEC_IGNORE still advances the stream + // position after a statement error, and that error can mean the + // target is already in the post-DDL FK state (for example, dropping + // a foreign key that's already gone). Continuing with the old FK + // cache would silently use stale conflict metadata. + // + // We hold serialMu for the DB round-trip here. DDL is rare, and + // the main connection must not be used concurrently by scheduleLoop. 
+ // Fail fast on refresh errors: stale FK topology after a schema + // change would silently compromise conflict detection. + if event.Type == binlogdatapb.VEventType_DDL && (ddlExecuted || vp.vr.source.OnDdl == binlogdatapb.OnDDLAction_EXEC_IGNORE) { + newRefs, err := queryFKRefs(vp.vr.dbClient, vp.vr.dbClient.DBName()) + if err != nil { + return vterrors.Wrapf(err, "failed to refresh FK metadata after DDL") + } + vp.fkRefs = newRefs + vp.parentFKRefs = buildParentFKRefs(newRefs) + } + if event.Type == binlogdatapb.VEventType_DDL && (ddlExecuted || publishExecIgnoreDDLBarrier) { + vp.tablePlansMu.RLock() + renameTargets := extractDDLRenameTargets(event.Statement, vp.vr.vre.env.Parser()) + retargetPostDDLStalePlans(vp.postDDLStalePlans, renameTargets, vp.tablePlans) + ddlStalePlans, conservative := extractDDLAffectedTables(event.Statement, vp.vr.vre.env.Parser(), vp.tablePlans, vp.postDDLDroppedTables) + ddlStalePlans = unresolvedPostDDLStalePlans(vp.tablePlans, vp.postDDLDroppedTables, ddlStalePlans) + vp.tablePlansMu.RUnlock() + vp.postDDLStalePlans = mergePostDDLStalePlans(vp.postDDLStalePlans, ddlStalePlans) + vp.postDDLConservative = vp.postDDLConservative || conservative + vp.postDDLDroppedTables = mergeDroppedTables(vp.postDDLDroppedTables, extractDroppedTables(event.Statement, vp.vr.vre.env.Parser())) + } + updateLag(payload) + if err := scheduler.markCommitted(txn); err != nil { + return err + } + return terminalErr + } + + commitTxn := func(txn *applyTxn) error { + if txn.payload.commitOnly { + return commitOnlyTxn(txn) + } + return commitWorkerTxn(txn) + } + + pending := make(map[int64]*applyTxn) + nextOrder := int64(1) + + // On error exit, release all remaining pending entries to return pool + // objects that would otherwise be leaked. We intentionally do NOT + // signal txn.done here: workers unblock via ctx.Done() instead (the + // caller cancels the context on commitLoop error). 
Signaling done + // would tell the worker its old connection is safe to reuse, but + // the commit may have failed leaving the connection in a dirty state. + defer func() { + for _, txn := range pending { + releaseApplyTxn(txn) + } + }() + + drainPending := func() error { + for { + next := pending[nextOrder] + if next == nil { + break + } + delete(pending, nextOrder) + if err := commitTxn(next); err != nil { + // Re-add the failed txn so the defer cleanup can + // signal its done channel and release it to the pool. + pending[nextOrder] = next + return err + } + releaseApplyTxn(next) + nextOrder++ + } + return nil + } + + for { + select { + case txn, ok := <-commitCh: + if !ok { + // The commit channel has been closed so we cannot add anything else. + // We only need to drain any already pending transactions. + if err := drainPending(); err != nil { + return err + } + if len(pending) > 0 { + return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "parallel apply commit missing order: pending=%d next=%d", len(pending), nextOrder) + } + return nil + } + if ctx.Err() != nil { + return ctx.Err() + } + if txn.order == 0 { + // All production enqueue paths assign order via + // vp.parallelOrder.Add(1), which is monotonic and starts at 1. + // Reaching here means a regression introduced an unordered + // txn — silently committing it would bypass strict ordering + // and break the monotonic position invariant on + // _vt.vreplication.pos. Fail fast so the workflow restarts + // cleanly from the last durable position. + return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "parallel apply commit txn missing order: payload=%+v", txn.payload) + } + // Add the new transaction to be committed and then drain all pending ones. 
+ pending[txn.order] = txn + if err := drainPending(); err != nil { + return err + } + case <-ctx.Done(): + return ctx.Err() + } + } +} diff --git a/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_scheduler.go b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_scheduler.go new file mode 100644 index 00000000000..8596ddb8966 --- /dev/null +++ b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_scheduler.go @@ -0,0 +1,591 @@ +/* +Copyright 2026 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package vreplication + +import ( + "context" + "errors" + "io" + "sync" +) + +// errSchedulerAbandonedPendingWork is returned by nextReady when the scheduler +// is closed with pending work that can never become ready because nothing is +// inflight to advance the scheduler's state (lastCommittedSequence, writeset, +// inflight* counters). Workers surface this to their caller so the controller +// retries the stream from the last saved position rather than silently +// treating the abandoned pending work as "stream finished cleanly". +var errSchedulerAbandonedPendingWork = errors.New("parallel apply scheduler closed with unreachable pending transactions") + +type applyTxn struct { + // order is a monotonically increasing sequence number assigned by + // scheduleLoop. The commitLoop commits transactions in strict order + // so that the position saved to _vt.vreplication only moves forward. 
+ order int64 + // sequenceNumber is the source MySQL binlog sequence number from the + // GTID event. Used to advance lastCommittedSequence after commit. + sequenceNumber int64 + // commitParent is the source MySQL commit parent from the GTID event. + // When the writeset is empty, the scheduler falls back to commit-parent + // ordering: the transaction is ready only when commitParent <= + // lastCommittedSequence. + commitParent int64 + // hasCommitMeta is true when the GTID event carried non-zero + // sequenceNumber or commitParent. Transactions with and without + // commit metadata are never run concurrently (safety boundary). + hasCommitMeta bool + // forceGlobal is true for transactions that must serialize with + // everything: non-row-only transactions (DDL, FIELD, OTHER, JOURNAL) + // and copy-phase transactions. + forceGlobal bool + // noConflict is true for position-only saves and certain pass-through + // events (OTHER, ignored DDL). These bypass all conflict checking and + // are always ready, preventing deadlocks where an earlier-order + // position save is blocked by later-order inflight data transactions. + noConflict bool + // writeset holds xxhash digests of PK-based keys (e.g. hash of "table:pk1,pk2"). + // Using uint64 hashes instead of strings eliminates per-txn heap allocations + // in the scheduler hot path, reducing GC pressure at high TPS. + writeset []uint64 + // mergedSequences tracks sequence numbers of source transactions that + // were merged into this batched mega-transaction. They must be advanced + // in lastCommittedSequence only after this txn actually commits, so that + // later empty-writeset transactions whose commitParent references one of + // these sequences don't become runnable before the batch commits. + mergedSequences []int64 + // payload carries the transaction's events and DB connection info. + // Pooled via applyTxnPayloadPool to reduce allocations. 
+ payload *applyTxnPayload + // done is a buffered channel (cap 1) used to synchronize the commitLoop + // with the worker that applied this transaction. The commitLoop sends on + // done after committing, unblocking the worker to reuse its DB connection. + // Always freshly allocated by acquireApplyTxn; commitOnly transactions + // carry one too but never use it (workers don't wait on them). + done chan struct{} +} + +type applyScheduler struct { + // ctx is the parent context for the parallel applier. When cancelled, + // all blocked nextReady/waitForIdle calls return immediately. + ctx context.Context + + mu sync.Mutex + cond *sync.Cond + // orderCond is a dedicated condition for the enqueue backpressure wait + // (maxOutstandingOrders). The shared cond's Signal in markCommitted can + // land on an idle worker that consumes the wakeup without re-signaling, + // leaving the scheduleLoop asleep until the pipeline fully drains (the + // allDrained Broadcast backstop). A dedicated cond makes the order-window + // wakeup deterministic. + orderCond *sync.Cond + + // pending is the queue of transactions waiting to be dispatched to + // workers. Entries are set to nil when consumed; pendingOff tracks + // how far into the slice consumed entries extend, and the slice is + // compacted when half its capacity is nil entries. + pending []*applyTxn + pendingOff int // offset into pending slice; entries before this index are consumed + pendingCount int // number of live (non-nil) entries in pending + // lastCommittedSequence is the highest source MySQL sequence number + // that has been committed. Used for commit-parent ordering: a + // transaction whose writeset is empty is ready only when its + // commitParent <= lastCommittedSequence. + lastCommittedSequence int64 + // lastCommittedOrder is the highest transaction order number that + // has been committed, used for diagnostics. 
+ lastCommittedOrder int64 + // maxOutstandingOrders caps how many ordered transactions may exist ahead + // of durable commit progress. Zero disables the cap. + maxOutstandingOrders int64 + + // inflightWriteset maps writeset key hashes to reference counts. + // A transaction is blocked if any of its writeset keys are present + // in this map with count > 0. + inflightWriteset map[uint64]int + // inflightGlobal counts inflight forceGlobal transactions and + // no-metadata-no-writeset transactions. When > 0, all non-noConflict + // transactions are blocked. + inflightGlobal int + // inflightMissingMeta counts inflight transactions that lack commit + // metadata. When > 0, hasCommitMeta transactions are blocked to + // maintain the safety boundary between metadata modes. + inflightMissingMeta int + // inflightCommitMeta counts inflight transactions that have commit + // metadata. When > 0, no-metadata transactions with writesets must + // wait to prevent mixing metadata modes. + inflightCommitMeta int + // inflightNoConflict counts dispatched-but-uncommitted noConflict + // transactions. They do not participate in conflict checking, but the + // abandoned-pending-work check must not fire while one is in flight: + // its markCommitted can advance lastCommittedSequence and unblock the + // pending head. + inflightNoConflict int + + // closed is set by close() to signal that no more transactions will + // be enqueued. nextReady checks this to return io.EOF instead of + // blocking forever on cond.Wait after the scheduler is shut down. + closed bool +} + +// newApplyScheduler creates a scheduler and starts a background goroutine +// that broadcasts on cond when ctx is cancelled, unblocking any workers +// waiting in nextReady. 
+func newApplyScheduler(ctx context.Context) *applyScheduler { + s := &applyScheduler{ + ctx: ctx, + inflightWriteset: make(map[uint64]int), + } + s.cond = sync.NewCond(&s.mu) + s.orderCond = sync.NewCond(&s.mu) + go func() { + <-ctx.Done() + s.mu.Lock() + defer s.mu.Unlock() + s.cond.Broadcast() + s.orderCond.Broadcast() + }() + return s +} + +// enqueue adds a transaction to the pending queue and signals one waiting +// worker. On the first hasCommitMeta transaction, it seeds lastCommittedSequence +// from commitParent so that subsequent commit-parent checks have a baseline. +func (s *applyScheduler) enqueue(txn *applyTxn) error { + s.mu.Lock() + defer s.mu.Unlock() + if err := s.ctx.Err(); err != nil { + return err + } + if s.closed { + return io.EOF + } + for s.maxOutstandingOrders > 0 && txn.order > 0 && txn.order-s.lastCommittedOrder > s.maxOutstandingOrders { + s.orderCond.Wait() + if err := s.ctx.Err(); err != nil { + return err + } + if s.closed { + return io.EOF + } + } + if txn.hasCommitMeta && s.lastCommittedSequence == 0 && s.inflightGlobal == 0 && s.inflightMissingMeta == 0 && s.inflightCommitMeta == 0 && s.pendingCount == 0 && txn.commitParent > 0 { + s.lastCommittedSequence = txn.commitParent + } + s.pending = append(s.pending, txn) + s.pendingCount++ + // Signal wakes one worker. enqueue adds at most one transaction, so at + // most one worker can dequeue it via popReadyLocked. This avoids the + // thundering-herd effect of Broadcast which wakes all N workers. + s.cond.Signal() + return nil +} + +// nextReady blocks until a transaction in the pending queue passes the +// readiness check, marks it inflight, removes it from the queue, and returns +// it to the calling worker. Returns io.EOF when the scheduler is closed and +// there is no pending work left to drain. 
+func (s *applyScheduler) nextReady(ctx context.Context) (*applyTxn, error) { + s.mu.Lock() + defer s.mu.Unlock() + + for { + if err := ctx.Err(); err != nil { + return nil, err + } + if err := s.ctx.Err(); err != nil { + return nil, err + } + txn := s.popReadyLocked() + if txn != nil { + s.markInflightLocked(txn) + // Pass the baton: one wakeup (e.g. a markCommitted that released + // a multi-key writeset) can make several pending transactions + // ready at once, but each waiter pops at most one. Signal the + // next waiter while pending work remains so independent ready + // transactions dispatch immediately instead of waiting for the + // next commit event. + if s.pendingCount > 0 { + s.cond.Signal() + } + return txn, nil + } + // Check closed only after attempting to drain any queued work so + // transactions already scheduled before shutdown still commit. + if s.closed { + if s.pendingCount == 0 { + return nil, io.EOF + } + // A closed scheduler may still have blocked pending work that + // becomes ready only after an inflight txn commits — in that + // case we keep waiting so the blocked pending txns unblock. + // But if nothing is inflight AND no pending txn is ready, + // nothing will ever advance lastCommittedSequence or release + // writeset/inflight counters, so workers would park forever. + // Return a non-EOF error so the controller retries the stream + // from the last saved position instead of silently abandoning + // the pending work. + if s.inflightGlobal == 0 && s.inflightMissingMeta == 0 && s.inflightCommitMeta == 0 && len(s.inflightWriteset) == 0 && s.inflightNoConflict == 0 { + return nil, errSchedulerAbandonedPendingWork + } + } + s.cond.Wait() + } +} + +// markCommitted releases the transaction's inflight state and advances +// lastCommittedSequence. Uses Broadcast when a global/missingMeta counter +// drops to zero (multiple txns may unblock), Signal otherwise. 
+func (s *applyScheduler) markCommitted(txn *applyTxn) error { + s.mu.Lock() + defer s.mu.Unlock() + if err := s.ctx.Err(); err != nil { + return err + } + if txn.hasCommitMeta && txn.sequenceNumber > s.lastCommittedSequence { + s.lastCommittedSequence = txn.sequenceNumber + } + // Advance any sequences that were batched (merged away) into this txn. + // These represent transactions whose events were merged into this batch + // but whose GTID sequence numbers must still become visible in + // lastCommittedSequence so that later empty-writeset commit-parent + // dependents can unblock. Doing this here (after commit) instead of at + // enqueue time preserves the invariant that commit-parent dependencies + // are only satisfied after the parent has actually committed. + for _, seq := range txn.mergedSequences { + if seq > s.lastCommittedSequence { + s.lastCommittedSequence = seq + } + } + if txn.order > 0 && txn.order > s.lastCommittedOrder { + s.lastCommittedOrder = txn.order + // Wake the scheduleLoop if it is blocked on the order window; only + // commits advance lastCommittedOrder, so this is the only wake site. + s.orderCond.Signal() + } + // Track pre-release state to decide between Signal and Broadcast. + wasForceGlobal := txn.forceGlobal + hadInflightGlobal := s.inflightGlobal > 0 + hadInflightMissingMeta := s.inflightMissingMeta > 0 + s.releaseInflightLocked(txn) + // Use Broadcast when releasing a forceGlobal txn, when a global/ + // missingMeta counter drops to zero, or when all inflight work has + // drained (so waitForIdle waiters are woken). Otherwise use Signal + // to avoid thundering-herd wakeup of N workers when only one txn + // can proceed. 
+ allDrained := s.inflightGlobal == 0 && s.inflightMissingMeta == 0 && s.inflightCommitMeta == 0 && len(s.inflightWriteset) == 0 + if wasForceGlobal || + (hadInflightGlobal && s.inflightGlobal == 0) || + (hadInflightMissingMeta && s.inflightMissingMeta == 0) || + allDrained { + s.cond.Broadcast() + } else { + s.cond.Signal() + } + return nil +} + +// popReadyLocked scans the pending queue for the first dispatchable transaction. +// Once it encounters a blocked ordered transaction, it continues scanning only +// for later noConflict transactions. This preserves the deadlock protection for +// normal ordered work while still allowing position-only and OTHER/IGNORE stop +// transactions to bypass the blocked head and reach the commitLoop. +func (s *applyScheduler) popReadyLocked() *applyTxn { + blockedOrdered := false + for i := s.pendingOff; i < len(s.pending); i++ { + txn := s.pending[i] + if txn == nil { + continue + } + if txn.noConflict { + // noConflict transactions are always ready and don't affect + // inflight counters, so we can safely skip past them when + // looking for the next ready transaction. + if s.isReadyLocked(txn) { + s.removePendingLocked(i) + return txn + } + continue + } + if blockedOrdered { + continue + } + if s.isReadyLocked(txn) { + s.removePendingLocked(i) + return txn + } + // A non-noConflict transaction is not ready. We must NOT skip past it to + // dispatch a later ordered transaction, because doing so could create a + // deadlock: the later transaction's inflight state may prevent this + // earlier transaction from ever becoming ready, while the commitLoop + // (which requires strict ordering) waits for this earlier transaction to + // be committed before it can commit the later one. Keep scanning only so + // later noConflict transactions can bypass this blocked ordered head. 
+ blockedOrdered = true + } + return nil +} + +// removePendingLocked removes the element at index i by setting it to nil and +// advancing pendingOff if it's the head element. This avoids O(n) memory shifts +// from append-based removal. The slice is compacted when half or more of its +// capacity is consumed by nil entries. +func (s *applyScheduler) removePendingLocked(i int) { + s.pending[i] = nil + s.pendingCount-- + // Advance the offset past any leading nils. + for s.pendingOff < len(s.pending) && s.pending[s.pendingOff] == nil { + s.pendingOff++ + } + // Compact when the offset has consumed half or more of the slice. + if s.pendingOff > 0 && s.pendingOff >= len(s.pending)/2 { + n := copy(s.pending, s.pending[s.pendingOff:]) + // Clear trailing pointers so GC can collect them. + for j := n; j < len(s.pending); j++ { + s.pending[j] = nil + } + s.pending = s.pending[:n] + s.pendingOff = 0 + } + // Shrink capacity after bursts to prevent permanent memory retention. + // If the backing array is >64 slots and >4x the live element count, + // allocate a right-sized slice and copy. + n := len(s.pending) + if cap(s.pending) > 64 && cap(s.pending) > 4*n { + shrunk := make([]*applyTxn, n, 2*n+1) + copy(shrunk, s.pending) + s.pending = shrunk + } +} + +// isReadyLocked checks whether a transaction can be dispatched to a worker +// based on its classification (noConflict, forceGlobal, hasCommitMeta) and +// the current inflight state. See the ready-check hierarchy in the PR docs. +func (s *applyScheduler) isReadyLocked(txn *applyTxn) bool { + // noConflict transactions (e.g., position-only saves) are always ready. + // They have no data conflicts and must not block or be blocked by other + // transactions. This prevents deadlocks where forceGlobal position saves + // (with earlier orders) are blocked by inflight data transactions (with + // later orders), while the commitLoop waits for those earlier orders. 
+ if txn.noConflict { + return true + } + if s.inflightGlobal > 0 { + return false + } + if txn.forceGlobal { + ready := s.inflightMissingMeta == 0 && s.inflightCommitMeta == 0 && len(s.inflightWriteset) == 0 + return ready + } + if txn.hasCommitMeta { + if s.inflightMissingMeta > 0 { + return false + } + for _, key := range txn.writeset { + if s.inflightWriteset[key] > 0 { + return false + } + } + // When the transaction has a non-empty writeset, we use writeset-only + // conflict detection and skip the commit-parent dependency check. This + // is critical because the source MySQL may use COMMIT_ORDER dependency + // tracking, which produces a strict serial chain where every + // transaction's commitParent equals the immediately prior sequence + // number. Under COMMIT_ORDER, the commit-parent check alone would + // serialize ALL transactions regardless of whether their writesets + // actually conflict. With a valid writeset, the writeset conflict + // checks above are sufficient for correctness — the same approach + // MySQL uses internally with WRITESET dependency tracking. + // + // When the writeset is empty, we fall back to commit-parent ordering + // as the safety net. + if len(txn.writeset) > 0 { + return true + } + if s.inflightCommitMeta > 0 { + return false + } + // NOTE: sequence_number/last_committed reset per binlog FILE on the + // source, while lastCommittedSequence only advances (max). After a + // binlog rotation the new file's small commitParent values compare + // against the old file's high watermark, making this check vacuously + // true. That is safe ONLY because of the inflightCommitMeta == 0 + // gate above: with nothing inflight, every earlier-ordered txn has + // already committed, so the parent is durably applied regardless of + // what this comparison says. Do not remove that gate without + // rethinking rotation. 
+ ready := txn.commitParent <= s.lastCommittedSequence + return ready + } + if s.inflightCommitMeta > 0 { + return false + } + if len(txn.writeset) == 0 { + return s.inflightMissingMeta == 0 && len(s.inflightWriteset) == 0 + } + for _, key := range txn.writeset { + if s.inflightWriteset[key] > 0 { + return false + } + } + return true +} + +// markInflightLocked increments the appropriate inflight counters and adds +// writeset keys to inflightWriteset. Must be called under s.mu. +func (s *applyScheduler) markInflightLocked(txn *applyTxn) { + if txn.noConflict { + s.inflightNoConflict++ + return + } + if txn.forceGlobal { + s.inflightGlobal++ + return + } + if txn.hasCommitMeta { + s.inflightCommitMeta++ + for _, key := range txn.writeset { + s.inflightWriteset[key]++ + } + return + } + if len(txn.writeset) == 0 { + s.inflightGlobal++ + s.inflightMissingMeta++ + return + } + s.inflightMissingMeta++ + for _, key := range txn.writeset { + s.inflightWriteset[key]++ + } +} + +// releaseInflightLocked decrements the inflight counters and removes +// writeset keys. The inverse of markInflightLocked. Must be called under s.mu. 
+func (s *applyScheduler) releaseInflightLocked(txn *applyTxn) { + if txn.noConflict { + if s.inflightNoConflict > 0 { + s.inflightNoConflict-- + } + return + } + if txn.forceGlobal { + if s.inflightGlobal > 0 { + s.inflightGlobal-- + } + return + } + if txn.hasCommitMeta { + if s.inflightCommitMeta > 0 { + s.inflightCommitMeta-- + } + for _, key := range txn.writeset { + count := s.inflightWriteset[key] + if count <= 1 { + delete(s.inflightWriteset, key) + } else { + s.inflightWriteset[key] = count - 1 + } + } + return + } + if len(txn.writeset) == 0 { + if s.inflightGlobal > 0 { + s.inflightGlobal-- + } + if s.inflightMissingMeta > 0 { + s.inflightMissingMeta-- + } + return + } + if s.inflightMissingMeta > 0 { + s.inflightMissingMeta-- + } + for _, key := range txn.writeset { + count := s.inflightWriteset[key] + if count <= 1 { + delete(s.inflightWriteset, key) + } else { + s.inflightWriteset[key] = count - 1 + } + } +} + +// advanceCommittedSequence advances lastCommittedSequence for transactions +// that bypass the scheduler (e.g., empty transactions handled via unsavedEvent). +// Without this, hasCommitMeta transactions whose commitParent references a +// skipped empty transaction would be blocked forever because lastCommittedSequence +// would never reach their commitParent value. +func (s *applyScheduler) advanceCommittedSequence(seq int64) { + if seq <= 0 { + return + } + s.mu.Lock() + defer s.mu.Unlock() + if seq > s.lastCommittedSequence { + s.lastCommittedSequence = seq + // Only wake waiters when there is pending work that could now be + // ready. During catch-up on a filtered shard this is called for + // every empty transaction (thousands/sec); an unconditional + // Broadcast would wake all N workers each time just to rescan an + // empty queue. + if s.pendingCount > 0 { + s.cond.Broadcast() + } + } +} + +// waitForIdle blocks until there are no pending or inflight transactions of +// any class. 
scheduleLoop calls it as a barrier after a DDL fetch so that the +// DDL, its FK-metadata refresh, and any FIELD events for DDL-affected tables +// are fully applied before the next fetch snapshots plans/FK refs. The idle +// predicate must therefore cover every inflight counter — including +// inflightNoConflict (position-only saves, OTHER/IGNORE stops) and the +// inflightWriteset map — so the barrier cannot return while any dispatched +// transaction is still uncommitted. This mirrors the fully-drained predicate +// in nextReady's abandoned-work check. +func (s *applyScheduler) waitForIdle(ctx context.Context) error { + s.mu.Lock() + defer s.mu.Unlock() + for { + if err := ctx.Err(); err != nil { + return err + } + if err := s.ctx.Err(); err != nil { + return err + } + if s.pendingCount == 0 && s.inflightGlobal == 0 && s.inflightMissingMeta == 0 && + s.inflightCommitMeta == 0 && len(s.inflightWriteset) == 0 && s.inflightNoConflict == 0 { + return nil + } + s.cond.Wait() + } +} + +// close marks the scheduler as closed and broadcasts to wake blocked workers. +// Already-enqueued work remains available so callers can drain the scheduled +// prefix before observing io.EOF. +func (s *applyScheduler) close() error { + s.mu.Lock() + defer s.mu.Unlock() + if err := s.ctx.Err(); err != nil { + return err + } + s.closed = true + s.cond.Broadcast() + s.orderCond.Broadcast() + return io.EOF +} diff --git a/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_scheduler_test.go b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_scheduler_test.go new file mode 100644 index 00000000000..873028afa74 --- /dev/null +++ b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_scheduler_test.go @@ -0,0 +1,1005 @@ +/* +Copyright 2026 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package vreplication + +import ( + "context" + "errors" + "io" + "math/rand/v2" + "runtime" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func requireNoReadyTxn(t *testing.T, s *applyScheduler) { + t.Helper() + s.mu.Lock() + defer s.mu.Unlock() + require.Nil(t, s.popReadyLocked()) +} + +func requireReadyTxn(t *testing.T, s *applyScheduler, want *applyTxn) { + t.Helper() + s.mu.Lock() + defer s.mu.Unlock() + require.Same(t, want, s.popReadyLocked()) +} + +func TestApplySchedulerCommitParentOrder(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + + // txn2 is enqueued first. Since it's the first hasCommitMeta transaction + // and the scheduler is idle, enqueue seeds lastCommittedSequence to + // txn2.commitParent (1). This makes txn2 immediately ready because + // commitParent (1) <= lastCommittedSequence (1). The scheduler dispatches + // in FIFO order, so txn2 goes first. 
+ txn2 := &applyTxn{sequenceNumber: 2, commitParent: 1, hasCommitMeta: true} + txn1 := &applyTxn{sequenceNumber: 1, commitParent: 0, hasCommitMeta: true} + + require.NoError(t, s.enqueue(txn2)) + require.NoError(t, s.enqueue(txn1)) + + got1, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, txn2, got1) + require.NoError(t, s.markCommitted(got1)) + + got2, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, txn1, got2) + require.NoError(t, s.markCommitted(got2)) +} + +func TestApplySchedulerAllowsIndependentWritesets(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + + txn1 := &applyTxn{writeset: []uint64{1}} + txn2 := &applyTxn{writeset: []uint64{2}} + + require.NoError(t, s.enqueue(txn1)) + require.NoError(t, s.enqueue(txn2)) + + got1, err := s.nextReady(ctx) + require.NoError(t, err) + got2, err := s.nextReady(ctx) + require.NoError(t, err) + + require.NotEqual(t, got1, got2) +} + +func TestApplySchedulerBlocksConflictingWritesets(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + + txn1 := &applyTxn{writeset: []uint64{100}} + txn2 := &applyTxn{writeset: []uint64{100}} + + require.NoError(t, s.enqueue(txn1)) + require.NoError(t, s.enqueue(txn2)) + + got1, err := s.nextReady(ctx) + require.NoError(t, err) + + requireNoReadyTxn(t, s) + + require.NoError(t, s.markCommitted(got1)) + + requireReadyTxn(t, s, txn2) +} + +func TestApplySchedulerBlocksCommitMetaDuringMissingMeta(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + + missing := &applyTxn{writeset: []uint64{100}} + meta := &applyTxn{sequenceNumber: 2, commitParent: 0, hasCommitMeta: true} + + require.NoError(t, s.enqueue(missing)) + require.NoError(t, s.enqueue(meta)) + + got1, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, missing, got1) + + requireNoReadyTxn(t, s) + + require.NoError(t, s.markCommitted(got1)) + + requireReadyTxn(t, s, meta) +} + +func 
TestApplySchedulerBlocksCommitMetaConflictingWritesets(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + + txn1 := &applyTxn{writeset: []uint64{100}, sequenceNumber: 1, commitParent: 0, hasCommitMeta: true} + txn2 := &applyTxn{writeset: []uint64{100}, sequenceNumber: 2, commitParent: 0, hasCommitMeta: true} + + require.NoError(t, s.enqueue(txn1)) + require.NoError(t, s.enqueue(txn2)) + + got1, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, txn1, got1) + + requireNoReadyTxn(t, s) + + require.NoError(t, s.markCommitted(got1)) + + requireReadyTxn(t, s, txn2) +} + +func TestApplySchedulerCommitMetaDoesNotAdvanceOnMissingMeta(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + require.Equal(t, int64(0), s.lastCommittedSequence) + + missing := &applyTxn{writeset: []uint64{100}} + meta := &applyTxn{sequenceNumber: 5, commitParent: 0, hasCommitMeta: true} + + require.NoError(t, s.enqueue(missing)) + require.NoError(t, s.enqueue(meta)) + + got1, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, missing, got1) + + require.NoError(t, s.markCommitted(got1)) + + got2, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, meta, got2) + + require.NoError(t, s.markCommitted(got2)) + require.Equal(t, int64(5), s.lastCommittedSequence) +} + +func TestApplySchedulerSeedsCommitParentOnFirstMeta(t *testing.T) { + ctx := t.Context() + // The scheduler seeds lastCommittedSequence from the first hasCommitMeta + // transaction when the scheduler is completely idle (no pending, no inflight). + // Enqueue meta as the very first transaction to trigger seeding. 
+ meta := &applyTxn{sequenceNumber: 6, commitParent: 5, hasCommitMeta: true} + + s := newApplyScheduler(ctx) + + require.NoError(t, s.enqueue(meta)) + + got, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, meta, got) + require.Equal(t, int64(5), s.lastCommittedSequence) +} + +func TestApplySchedulerWritesetBypassesCommitParent(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + + // Simulate COMMIT_ORDER dependency tracking: each txn's commitParent is + // the immediately prior sequence number, forming a strict serial chain. + // With non-conflicting writesets, the scheduler should allow parallelism + // by ignoring the commit-parent dependency. + txn1 := &applyTxn{order: 1, sequenceNumber: 10, commitParent: 9, hasCommitMeta: true, writeset: []uint64{1}} + txn2 := &applyTxn{order: 2, sequenceNumber: 11, commitParent: 10, hasCommitMeta: true, writeset: []uint64{2}} + txn3 := &applyTxn{order: 3, sequenceNumber: 12, commitParent: 11, hasCommitMeta: true, writeset: []uint64{3}} + + require.NoError(t, s.enqueue(txn1)) + require.NoError(t, s.enqueue(txn2)) + require.NoError(t, s.enqueue(txn3)) + + // All three should be immediately ready since their writesets don't conflict. + got1, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, txn1, got1) + + got2, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, txn2, got2) + + got3, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, txn3, got3) + + // Commit in order. + require.NoError(t, s.markCommitted(got1)) + require.NoError(t, s.markCommitted(got2)) + require.NoError(t, s.markCommitted(got3)) +} + +func TestApplySchedulerWritesetConflictStillBlocks(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + + // Even with the commit-parent bypass, conflicting writesets must still + // cause serialization. 
+ txn1 := &applyTxn{order: 1, sequenceNumber: 10, commitParent: 9, hasCommitMeta: true, writeset: []uint64{100}} + txn2 := &applyTxn{order: 2, sequenceNumber: 11, commitParent: 10, hasCommitMeta: true, writeset: []uint64{100}} + + require.NoError(t, s.enqueue(txn1)) + require.NoError(t, s.enqueue(txn2)) + + got1, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, txn1, got1) + + // txn2 should be blocked because it conflicts with inflight txn1. + requireNoReadyTxn(t, s) + + require.NoError(t, s.markCommitted(got1)) + + requireReadyTxn(t, s, txn2) +} + +func TestApplySchedulerEmptyWritesetFallsBackToCommitParent(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + + // When a hasCommitMeta transaction has an empty writeset (e.g., writeset + // build failed), it should fall back to commit-parent ordering. + txn1 := &applyTxn{order: 1, sequenceNumber: 10, commitParent: 9, hasCommitMeta: true} + txn2 := &applyTxn{order: 2, sequenceNumber: 11, commitParent: 10, hasCommitMeta: true} + + // Seed lastCommittedSequence to 9 so txn1 is ready. + require.NoError(t, s.enqueue(txn1)) + require.NoError(t, s.enqueue(txn2)) + + got1, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, txn1, got1) + + // txn2 has commitParent=10 but lastCommittedSequence is still 9 (seeded). + // txn2's writeset is empty, so it falls back to commit-parent check. + requireNoReadyTxn(t, s) + + // After committing txn1, lastCommittedSequence advances to 10, + // making txn2 ready (commitParent 10 <= 10). + require.NoError(t, s.markCommitted(got1)) + + requireReadyTxn(t, s, txn2) +} + +func TestApplySchedulerNoConflictDoesNotBlockPending(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + + // Enqueue a noConflict txn first and a normal txn second. 
+ nc := &applyTxn{order: 1, noConflict: true} + normal := &applyTxn{order: 2, writeset: []uint64{100}} + + require.NoError(t, s.enqueue(nc)) + require.NoError(t, s.enqueue(normal)) + + got1, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, nc, got1) + + // Commit noConflict should not affect inflight counters for normal txn. + require.NoError(t, s.markCommitted(got1)) + + got2, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, normal, got2) +} + +func TestApplySchedulerForceGlobalBlocksWritesets(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + + global := &applyTxn{order: 1, forceGlobal: true} + conflict := &applyTxn{order: 2, writeset: []uint64{100}} + + require.NoError(t, s.enqueue(global)) + require.NoError(t, s.enqueue(conflict)) + + got1, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, global, got1) + + requireNoReadyTxn(t, s) + + require.NoError(t, s.markCommitted(got1)) + + requireReadyTxn(t, s, conflict) +} + +func TestApplySchedulerAdvanceCommittedSequenceUnblocks(t *testing.T) { + ctx := t.Context() + // Use a non-empty pending queue to prevent commit-parent seeding. 
+ seed := &applyTxn{order: 1, noConflict: true} + meta := &applyTxn{order: 2, sequenceNumber: 6, commitParent: 5, hasCommitMeta: true} + + s := newApplyScheduler(ctx) + + require.NoError(t, s.enqueue(seed)) + require.NoError(t, s.enqueue(meta)) + + got1, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, seed, got1) + require.NoError(t, s.markCommitted(got1)) + + requireNoReadyTxn(t, s) + + s.advanceCommittedSequence(5) + + requireReadyTxn(t, s, meta) +} + +func TestApplySchedulerAdvanceCommittedSequenceDoesNotBypassInflightMetaParent(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + + metaParent := &applyTxn{order: 1, sequenceNumber: 10, commitParent: 9, hasCommitMeta: true, writeset: []uint64{1}} + metaChild := &applyTxn{order: 2, sequenceNumber: 12, commitParent: 11, hasCommitMeta: true} + + require.NoError(t, s.enqueue(metaParent)) + gotParent, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, metaParent, gotParent) + + require.NoError(t, s.enqueue(metaChild)) + s.advanceCommittedSequence(11) + + requireNoReadyTxn(t, s) + + require.NoError(t, s.markCommitted(gotParent)) + + requireReadyTxn(t, s, metaChild) +} + +func TestApplySchedulerMergedSequencesUnblockCommitParentChild(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + + batchedParent := &applyTxn{order: 1, writeset: []uint64{1}, mergedSequences: []int64{10}} + metaChild := &applyTxn{order: 2, sequenceNumber: 11, commitParent: 10, hasCommitMeta: true} + + require.NoError(t, s.enqueue(batchedParent)) + require.NoError(t, s.enqueue(metaChild)) + + gotParent, err := s.nextReady(ctx) + require.NoError(t, err) + require.Same(t, batchedParent, gotParent) + + requireNoReadyTxn(t, s) + + require.NoError(t, s.markCommitted(gotParent)) + + s.mu.Lock() + require.Equal(t, int64(10), s.lastCommittedSequence) + s.mu.Unlock() + + requireReadyTxn(t, s, metaChild) +} + +func TestApplySchedulerWaitForIdleReturnsWhenIdle(t *testing.T) { + ctx := 
t.Context() + s := newApplyScheduler(ctx) + + require.NoError(t, s.waitForIdle(ctx)) +} + +func TestApplySchedulerWaitForIdleReturnsOnSchedulerCancel(t *testing.T) { + ctx := t.Context() + sCtx, cancel := context.WithCancel(ctx) + s := newApplyScheduler(sCtx) + + require.NoError(t, s.enqueue(&applyTxn{writeset: []uint64{100}})) + + s.mu.Lock() + require.NotZero(t, s.pendingCount) + s.mu.Unlock() + + cancel() + + require.ErrorIs(t, s.waitForIdle(ctx), context.Canceled) +} + +func TestApplySchedulerClosePreservesPending(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + + txn := &applyTxn{writeset: []uint64{100}, noConflict: true} + require.NoError(t, s.enqueue(txn)) + + err := s.close() + require.ErrorIs(t, err, io.EOF) + require.Equal(t, 1, s.pendingCount) + require.Zero(t, s.pendingOff) + require.Len(t, s.pending, 1) + require.Same(t, txn, s.pending[0]) +} + +func TestApplySchedulerNextReadyDrainsPendingAfterClose(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + + txn := &applyTxn{order: 1, noConflict: true} + require.NoError(t, s.enqueue(txn)) + require.ErrorIs(t, s.close(), io.EOF) + + got, err := s.nextReady(ctx) + require.NoError(t, err) + require.Same(t, txn, got) + + _, err = s.nextReady(ctx) + require.ErrorIs(t, err, io.EOF) +} + +func TestApplySchedulerNextReadyWaitsForBlockedPendingAfterClose(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + + blocker := &applyTxn{order: 1, writeset: []uint64{100}} + blocked := &applyTxn{order: 2, writeset: []uint64{100}} + + require.NoError(t, s.enqueue(blocker)) + require.NoError(t, s.enqueue(blocked)) + + gotBlocker, err := s.nextReady(ctx) + require.NoError(t, err) + require.Same(t, blocker, gotBlocker) + + require.ErrorIs(t, s.close(), io.EOF) + + type nextReadyResult struct { + txn *applyTxn + err error + } + resultCh := make(chan nextReadyResult, 1) + go func() { + txn, err := s.nextReady(ctx) + resultCh <- nextReadyResult{txn: txn, err: err} + }() + + 
assert.Never(t, func() bool { + return len(resultCh) > 0 + }, 100*time.Millisecond, 5*time.Millisecond) + + require.NoError(t, s.markCommitted(gotBlocker)) + + assert.Eventually(t, func() bool { + return len(resultCh) > 0 + }, 200*time.Millisecond, 5*time.Millisecond) + + gotBlocked := <-resultCh + require.NoError(t, gotBlocked.err) + require.Same(t, blocked, gotBlocked.txn) + + require.NoError(t, s.markCommitted(gotBlocked.txn)) + + _, err = s.nextReady(ctx) + require.ErrorIs(t, err, io.EOF) +} + +func TestApplySchedulerEnqueueBlocksWhenOutstandingOrdersReachCap(t *testing.T) { + ctx, cancel := context.WithCancel(t.Context()) + defer cancel() + + s := newApplyScheduler(ctx) + s.maxOutstandingOrders = 2 + + require.NoError(t, s.enqueue(&applyTxn{order: 1, noConflict: true})) + require.NoError(t, s.enqueue(&applyTxn{order: 2, noConflict: true})) + + errCh := make(chan error, 1) + go func() { + errCh <- s.enqueue(&applyTxn{order: 3, noConflict: true}) + }() + + assert.Never(t, func() bool { + return len(errCh) > 0 + }, 100*time.Millisecond, 5*time.Millisecond) + + // Advance durable progress through the real path: markCommitted bumps + // lastCommittedOrder and wakes the order-window waiter (orderCond). 
+ require.NoError(t, s.markCommitted(&applyTxn{order: 1, noConflict: true})) + + assert.Eventually(t, func() bool { + return len(errCh) > 0 + }, 30*time.Second, 5*time.Millisecond) + require.NoError(t, <-errCh) + + s.mu.Lock() + require.Equal(t, 3, s.pendingCount) + s.mu.Unlock() +} + +func TestApplySchedulerLaterNoConflictBypassesBlockedEarlierTxn(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + + blocker := &applyTxn{order: 1, writeset: []uint64{100}} + require.NoError(t, s.enqueue(blocker)) + + gotBlocker, err := s.nextReady(ctx) + require.NoError(t, err) + require.Same(t, blocker, gotBlocker) + + blocked := &applyTxn{order: 2, writeset: []uint64{100}} + stopTxn1 := &applyTxn{order: 3, noConflict: true} + require.NoError(t, s.enqueue(blocked)) + require.NoError(t, s.enqueue(stopTxn1)) + + requireReadyTxn(t, s, stopTxn1) + + // The first bypass leaves a nil gap in pending. A second noConflict txn + // must still be discoverable while the earlier normal txn remains blocked. 
+ stopTxn2 := &applyTxn{order: 4, noConflict: true} + require.NoError(t, s.enqueue(stopTxn2)) + requireReadyTxn(t, s, stopTxn2) + + require.NoError(t, s.markCommitted(gotBlocker)) + requireReadyTxn(t, s, blocked) +} + +func TestApplySchedulerPendingCompaction(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + + for i := range 4 { + require.NoError(t, s.enqueue(&applyTxn{order: int64(i + 1), noConflict: true})) + } + + got1, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, int64(1), got1.order) + require.NoError(t, s.markCommitted(got1)) + + got2, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, int64(2), got2.order) + require.NoError(t, s.markCommitted(got2)) + + require.Zero(t, s.pendingOff) + require.Len(t, s.pending, 2) + require.Equal(t, 2, s.pendingCount) +} + +// TestApplySchedulerConcurrentEnqueueAndCommitStress exercises the scheduler +// under concurrent producers and real worker goroutines (nextReady + +// markCommitted, so inflight state and the writeset-refcount machinery are +// genuinely engaged) to flush out deadlocks, lost wakeups, counter-balance +// bugs, and — most importantly — conflicting dispatches. +// +// Correctness properties checked: +// - No two concurrently-dispatched transactions share a writeset key, and +// forceGlobal transactions run exclusively (verified by an external +// conflict tracker, independent of the scheduler's own bookkeeping). +// - Every enqueued transaction is dispatched exactly once. +// - After all work drains, every inflight counter is zero. 
+func TestApplySchedulerConcurrentEnqueueAndCommitStress(t *testing.T) { + ctx, cancel := context.WithTimeout(t.Context(), 60*time.Second) + defer cancel() + s := newApplyScheduler(ctx) + + const ( + numProducers = 2 + numWorkers = 6 + txnsPerProducer = 500 + maxWritesetKeys = 4 + writesetKeySpace = 32 + maxOutstandingOrder = int64(128) + ) + totalTxns := numProducers * txnsPerProducer + s.maxOutstandingOrders = maxOutstandingOrder + + // Atomically assigned order so all producers share one sequence. + var nextOrder atomic.Int64 + + // Producer goroutines enqueue a mix of writeset-based and forceGlobal + // transactions. Writeset keys are drawn from a small space so workers + // frequently conflict, exercising the writeset-refcount machinery. + var producers sync.WaitGroup + for p := range numProducers { + producers.Add(1) + go func(producerID int) { + defer producers.Done() + // Deterministic per-producer RNG so flakes are reproducible. + rng := rand.New(rand.NewPCG(uint64(producerID+1), 0x51ED)) + for i := range txnsPerProducer { + txn := &applyTxn{ + order: nextOrder.Add(1), + } + // 5% of transactions force-global, others carry a writeset. + if rng.IntN(20) == 0 { + txn.forceGlobal = true + } else { + n := 1 + rng.IntN(maxWritesetKeys) + txn.writeset = make([]uint64, 0, n) + seen := map[uint64]struct{}{} + for range n { + k := uint64(rng.IntN(writesetKeySpace)) + if _, dup := seen[k]; dup { + continue + } + seen[k] = struct{}{} + txn.writeset = append(txn.writeset, k) + } + } + if err := s.enqueue(txn); err != nil { + t.Errorf("producer %d txn %d enqueue: %v", producerID, i, err) + return + } + } + }(p) + } + + // External conflict tracker: validates, independently of the scheduler's + // own counters, that no two dispatched-and-uncommitted transactions + // conflict. Registration is atomic with the check under one mutex. 
+ var ( + trackerMu sync.Mutex + activeKeys = map[uint64]int64{} // key -> holding txn order + activeGlobal int64 // order of the active forceGlobal txn, 0 = none + activeCount int + ) + dispatch := func(txn *applyTxn) { + trackerMu.Lock() + defer trackerMu.Unlock() + if activeGlobal != 0 { + t.Errorf("txn %d dispatched while forceGlobal txn %d active", txn.order, activeGlobal) + } + if txn.forceGlobal { + if activeCount != 0 { + t.Errorf("forceGlobal txn %d dispatched with %d txns active", txn.order, activeCount) + } + activeGlobal = txn.order + } + for _, k := range txn.writeset { + if holder, conflict := activeKeys[k]; conflict { + t.Errorf("txn %d dispatched with writeset key %d held by active txn %d", txn.order, k, holder) + } + activeKeys[k] = txn.order + } + activeCount++ + } + finish := func(txn *applyTxn) { + trackerMu.Lock() + defer trackerMu.Unlock() + if txn.forceGlobal { + activeGlobal = 0 + } + for _, k := range txn.writeset { + delete(activeKeys, k) + } + activeCount-- + } + + // Worker goroutines: the REAL dispatch path. nextReady marks inflight; + // markCommitted releases it. The tracker unregisters BEFORE + // markCommitted, mirroring the real pipeline where a conflicting txn may + // dispatch the instant the commit releases the scheduler state. 
+ observed := make([]int64, 0, totalTxns) + var observedMu sync.Mutex + var workers sync.WaitGroup + for range numWorkers { + workers.Go(func() { + for { + txn, err := s.nextReady(ctx) + if err != nil { + if !errors.Is(err, io.EOF) && ctx.Err() == nil { + t.Errorf("nextReady: %v", err) + } + return + } + dispatch(txn) + if txn.order%7 == 0 { + runtime.Gosched() // widen the race window a little + } + observedMu.Lock() + observed = append(observed, txn.order) + observedMu.Unlock() + finish(txn) + if err := s.markCommitted(txn); err != nil { + t.Errorf("markCommitted: %v", err) + return + } + } + }) + } + + producers.Wait() + s.close() + workersDone := make(chan struct{}) + go func() { workers.Wait(); close(workersDone) }() + select { + case <-workersDone: + case <-ctx.Done(): + observedMu.Lock() + n := len(observed) + observedMu.Unlock() + t.Fatalf("stress test timed out: observed %d / %d transactions", n, totalTxns) + } + + // Invariants after the scheduler has drained. + s.mu.Lock() + defer s.mu.Unlock() + require.Zero(t, s.inflightGlobal, "inflightGlobal leaked") + require.Zero(t, s.inflightMissingMeta, "inflightMissingMeta leaked") + require.Zero(t, s.inflightCommitMeta, "inflightCommitMeta leaked") + require.Zero(t, s.inflightNoConflict, "inflightNoConflict leaked") + require.Empty(t, s.inflightWriteset, "inflightWriteset leaked") + require.Zero(t, s.pendingCount, "pendingCount not drained") + require.Len(t, observed, totalTxns) + + // All order numbers from 1..totalTxns must appear exactly once. 
+ seen := make(map[int64]struct{}, totalTxns) + for _, o := range observed { + if _, dup := seen[o]; dup { + t.Fatalf("order %d observed twice", o) + } + seen[o] = struct{}{} + } + require.Len(t, seen, totalTxns) +} + +func TestApplySchedulerMultiKeyReleaseWakesAllReadyWaiters(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + + // megaTxn holds two writeset keys; blocker keeps a third key inflight for + // the whole test so markCommitted(megaTxn) does not take the + // all-drained Broadcast path. + megaTxn := &applyTxn{order: 1, sequenceNumber: 10, commitParent: 9, hasCommitMeta: true, writeset: []uint64{100, 200}} + blocker := &applyTxn{order: 2, sequenceNumber: 11, commitParent: 9, hasCommitMeta: true, writeset: []uint64{900}} + require.NoError(t, s.enqueue(megaTxn)) + require.NoError(t, s.enqueue(blocker)) + + got, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, megaTxn, got) + got, err = s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, blocker, got) + + // Two pending transactions, each conflicting with a different one of + // megaTxn's keys. + waiterA := &applyTxn{order: 3, sequenceNumber: 12, commitParent: 9, hasCommitMeta: true, writeset: []uint64{100}} + waiterB := &applyTxn{order: 4, sequenceNumber: 13, commitParent: 9, hasCommitMeta: true, writeset: []uint64{200}} + require.NoError(t, s.enqueue(waiterA)) + require.NoError(t, s.enqueue(waiterB)) + + // Two workers block in nextReady before the commit. + results := make(chan *applyTxn, 2) + errs := make(chan error, 2) + for range 2 { + go func() { + txn, err := s.nextReady(ctx) + if err != nil { + errs <- err + return + } + results <- txn + }() + } + // Wait until both goroutines are parked in cond.Wait. There is no direct + // hook for "waiter count", so poll the scheduler state: both pending txns + // are still queued and neither result has arrived. 
+ require.Eventually(t, func() bool { + s.mu.Lock() + defer s.mu.Unlock() + return s.pendingCount == 2 + }, 30*time.Second, time.Millisecond) + + // One commit releases both keys; both waiters must be dispatched without + // any further commit happening (blocker stays inflight throughout). + require.NoError(t, s.markCommitted(megaTxn)) + + dispatched := make(map[int64]bool) + for range 2 { + select { + case txn := <-results: + dispatched[txn.order] = true + case err := <-errs: + t.Fatalf("nextReady returned error: %v", err) + case <-time.After(30 * time.Second): + t.Fatalf("timed out waiting for both ready transactions to be dispatched; got %v", dispatched) + } + } + require.True(t, dispatched[waiterA.order]) + require.True(t, dispatched[waiterB.order]) +} + +// TestApplySchedulerNoMetaNoWritesetIsGlobal pins ready-check case 7: a +// transaction without commit metadata and without a writeset must serialize +// as global, with BOTH inflightGlobal and inflightMissingMeta held and then +// released in balance. An unbalanced release here would silently wedge the +// scheduler (counter stuck > 0) or unsafely unblock it (counter goes +// negative-equivalent via early zero). +func TestApplySchedulerNoMetaNoWritesetIsGlobal(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + + opaque := &applyTxn{order: 1} // no meta, no writeset + other := &applyTxn{order: 2, sequenceNumber: 10, commitParent: 0, hasCommitMeta: true, writeset: []uint64{100}} + require.NoError(t, s.enqueue(opaque)) + require.NoError(t, s.enqueue(other)) + + got, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, opaque, got) + s.mu.Lock() + require.Equal(t, 1, s.inflightGlobal, "no-meta/no-writeset must count as global") + require.Equal(t, 1, s.inflightMissingMeta, "no-meta/no-writeset must count as missing-meta") + s.mu.Unlock() + + // While the opaque txn is inflight, everything else is blocked. 
+ requireNoReadyTxn(t, s) + + require.NoError(t, s.markCommitted(opaque)) + s.mu.Lock() + require.Zero(t, s.inflightGlobal, "release must balance the global count") + require.Zero(t, s.inflightMissingMeta, "release must balance the missing-meta count") + s.mu.Unlock() + requireReadyTxn(t, s, other) +} + +// TestApplySchedulerNoMetaWritesetBlockedByInflightCommitMeta pins the +// blocked direction of ready-check case 8: a transaction without commit +// metadata (even with a non-conflicting writeset) must not run alongside an +// inflight transaction that has metadata — the two metadata modes never mix. +func TestApplySchedulerNoMetaWritesetBlockedByInflightCommitMeta(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + + withMeta := &applyTxn{order: 1, sequenceNumber: 10, commitParent: 9, hasCommitMeta: true, writeset: []uint64{100}} + noMeta := &applyTxn{order: 2, writeset: []uint64{200}} // disjoint writeset, no metadata + require.NoError(t, s.enqueue(withMeta)) + require.NoError(t, s.enqueue(noMeta)) + + got, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, withMeta, got) + + requireNoReadyTxn(t, s) + + require.NoError(t, s.markCommitted(withMeta)) + requireReadyTxn(t, s, noMeta) +} + +// TestApplySchedulerForceGlobalWaitsForInflightAndThenBlocksAll pins the +// blocked direction of ready-check case 3: a forceGlobal transaction (e.g. a +// DDL) must wait until ALL inflight work drains, and while it is inflight it +// must block everything behind it. A regression here would let a DDL execute +// concurrently with inflight row transactions on other connections. 
+func TestApplySchedulerForceGlobalWaitsForInflightAndThenBlocksAll(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + + row := &applyTxn{order: 1, sequenceNumber: 10, commitParent: 9, hasCommitMeta: true, writeset: []uint64{100}} + global := &applyTxn{order: 2, forceGlobal: true} + row2 := &applyTxn{order: 3, sequenceNumber: 11, commitParent: 9, hasCommitMeta: true, writeset: []uint64{200}} + require.NoError(t, s.enqueue(row)) + require.NoError(t, s.enqueue(global)) + require.NoError(t, s.enqueue(row2)) + + got, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, row, got) + + // The DDL must not be dispatchable while the row txn is inflight, and + // head-of-line blocking must also keep row2 queued behind it. + requireNoReadyTxn(t, s) + + require.NoError(t, s.markCommitted(row)) + // Dispatch via nextReady so the DDL is actually marked inflight + // (requireReadyTxn only pops, without marking). + got, err = s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, global, got) + + // While the DDL is inflight, nothing else may start. + requireNoReadyTxn(t, s) + + require.NoError(t, s.markCommitted(global)) + requireReadyTxn(t, s, row2) +} + +// TestApplySchedulerClosedWithUnreachablePendingWorkErrors pins the abandoned +// -work escape hatch: when the scheduler is closed while pending transactions +// exist that can never become ready (nothing is inflight to advance the +// scheduler state), nextReady must return errSchedulerAbandonedPendingWork — +// not io.EOF (which would silently drop the pending suffix) and not block +// forever (which would leak the worker). +func TestApplySchedulerClosedWithUnreachablePendingWorkErrors(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + + // commitParent 0 skips enqueue's lastCommittedSequence seeding, keeping + // the watermark at 0. 
+ first := &applyTxn{order: 1, sequenceNumber: 1, commitParent: 0, hasCommitMeta: true, writeset: []uint64{100}} + // Empty writeset -> commit-parent fallback; parent 99 is never reached. + stuck := &applyTxn{order: 2, sequenceNumber: 100, commitParent: 99, hasCommitMeta: true} + require.NoError(t, s.enqueue(first)) + require.NoError(t, s.enqueue(stuck)) + + got, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, first, got) + require.NoError(t, s.markCommitted(got)) + + // Nothing inflight, the pending txn is permanently blocked, and the + // scheduler is closed: workers must get the abandoned-work error. + require.Equal(t, io.EOF, s.close()) + _, err = s.nextReady(ctx) + require.ErrorIs(t, err, errSchedulerAbandonedPendingWork) +} + +// TestApplySchedulerClosedWaitsForInflightNoConflict pins that the +// abandoned-pending-work check does NOT fire while a noConflict transaction +// is dispatched but uncommitted: a noConflict position-save carrying commit +// metadata advances lastCommittedSequence when it commits, which can unblock +// the pending head. Erroring early would convert a clean stop-drain into a +// spurious workflow restart. +func TestApplySchedulerClosedWaitsForInflightNoConflict(t *testing.T) { + ctx := t.Context() + s := newApplyScheduler(ctx) + + // Position-only save with metadata: its commit publishes sequence 99. + save := &applyTxn{order: 1, sequenceNumber: 99, commitParent: 0, hasCommitMeta: true, noConflict: true} + // Blocked on commit-parent 99 (empty writeset fallback). + stuck := &applyTxn{order: 2, sequenceNumber: 100, commitParent: 99, hasCommitMeta: true} + require.NoError(t, s.enqueue(save)) + require.NoError(t, s.enqueue(stuck)) + + got, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, save, got) + + require.Equal(t, io.EOF, s.close()) + + // A worker parks in nextReady. With the save still uncommitted it must + // WAIT, not return errSchedulerAbandonedPendingWork. 
+ type result struct { + txn *applyTxn + err error + } + resCh := make(chan result, 1) + go func() { + txn, err := s.nextReady(ctx) + resCh <- result{txn, err} + }() + select { + case r := <-resCh: + t.Fatalf("nextReady returned early (txn=%v err=%v); it must wait for the inflight noConflict txn to commit", r.txn, r.err) + case <-time.After(2 * time.Second): + } + + // Committing the save publishes sequence 99 and unblocks the head. + require.NoError(t, s.markCommitted(save)) + select { + case r := <-resCh: + require.NoError(t, r.err) + require.Equal(t, stuck, r.txn) + case <-time.After(30 * time.Second): + t.Fatal("timed out waiting for the unblocked transaction to be dispatched") + } + + // With nothing inflight and nothing pending, drain ends cleanly. + require.NoError(t, s.markCommitted(stuck)) + _, err = s.nextReady(ctx) + require.ErrorIs(t, err, io.EOF) +} diff --git a/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_test.go b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_test.go new file mode 100644 index 00000000000..1ae7e91d777 --- /dev/null +++ b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_test.go @@ -0,0 +1,7254 @@ +/* +Copyright 2026 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package vreplication + +import ( + "context" + "errors" + "fmt" + "io" + "math" + "strconv" + "strings" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/mysql/replication" + "vitess.io/vitess/go/mysql/sqlerror" + "vitess.io/vitess/go/sqltypes" + "vitess.io/vitess/go/stats" + "vitess.io/vitess/go/timer" + "vitess.io/vitess/go/vt/binlog/binlogplayer" + vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" + "vitess.io/vitess/go/vt/sqlparser" + "vitess.io/vitess/go/vt/vtenv" + "vitess.io/vitess/go/vt/vterrors" + "vitess.io/vitess/go/vt/vttablet/tabletserver/throttle" + "vitess.io/vitess/go/vt/vttablet/tabletserver/throttle/base" + "vitess.io/vitess/go/vt/vttablet/tabletserver/throttle/throttlerapp" + + binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" + querypb "vitess.io/vitess/go/vt/proto/query" + vttablet "vitess.io/vitess/go/vt/vttablet/common" +) + +// testCtx returns the test context, which is cancelled when the test ends. +// This is essential for tests that create an applyScheduler, because +// newApplyScheduler spawns a goroutine that blocks on ctx.Done(). 
+func testCtx(t *testing.T) context.Context { + t.Helper() + return t.Context() +} + +// ---------- computeLastEventTimestamp tests ---------- + +func TestComputeLastEventTimestamp_EmptyEvents(t *testing.T) { + ts, ct := computeLastEventTimestamp(nil) + assert.Equal(t, int64(0), ts) + assert.Equal(t, int64(0), ct) + + ts, ct = computeLastEventTimestamp([]*binlogdatapb.VEvent{}) + assert.Equal(t, int64(0), ts) + assert.Equal(t, int64(0), ct) +} + +func TestComputeLastEventTimestamp_LastEventHasTimestamp(t *testing.T) { + events := []*binlogdatapb.VEvent{ + {Type: binlogdatapb.VEventType_ROW, Timestamp: 100, CurrentTime: 200}, + {Type: binlogdatapb.VEventType_ROW, Timestamp: 300, CurrentTime: 400}, + } + ts, ct := computeLastEventTimestamp(events) + assert.Equal(t, int64(300), ts) + assert.Equal(t, int64(400), ct) +} + +func TestComputeLastEventTimestamp_SkipsZeroTimestamp(t *testing.T) { + events := []*binlogdatapb.VEvent{ + {Type: binlogdatapb.VEventType_ROW, Timestamp: 100, CurrentTime: 200}, + {Type: binlogdatapb.VEventType_COMMIT, Timestamp: 0, CurrentTime: 0}, + } + ts, ct := computeLastEventTimestamp(events) + assert.Equal(t, int64(100), ts) + assert.Equal(t, int64(200), ct) +} + +func TestComputeLastEventTimestamp_SkipsThrottledHeartbeat(t *testing.T) { + events := []*binlogdatapb.VEvent{ + {Type: binlogdatapb.VEventType_ROW, Timestamp: 100, CurrentTime: 200}, + {Type: binlogdatapb.VEventType_HEARTBEAT, Timestamp: 500, CurrentTime: 600, Throttled: true}, + } + ts, ct := computeLastEventTimestamp(events) + assert.Equal(t, int64(100), ts) + assert.Equal(t, int64(200), ct) +} + +func TestComputeLastEventTimestamp_NonThrottledHeartbeatCounts(t *testing.T) { + events := []*binlogdatapb.VEvent{ + {Type: binlogdatapb.VEventType_ROW, Timestamp: 100, CurrentTime: 200}, + {Type: binlogdatapb.VEventType_HEARTBEAT, Timestamp: 500, CurrentTime: 600, Throttled: false}, + } + ts, ct := computeLastEventTimestamp(events) + assert.Equal(t, int64(500), ts) + assert.Equal(t, 
int64(600), ct) +} + +func TestComputeLastEventTimestamp_AllZeroTimestamp(t *testing.T) { + events := []*binlogdatapb.VEvent{ + {Type: binlogdatapb.VEventType_COMMIT, Timestamp: 0}, + {Type: binlogdatapb.VEventType_BEGIN, Timestamp: 0}, + } + ts, ct := computeLastEventTimestamp(events) + assert.Equal(t, int64(0), ts) + assert.Equal(t, int64(0), ct) +} + +// ---------- sync.Pool helpers tests ---------- + +func TestAcquireReleaseApplyTxn(t *testing.T) { + txn := acquireApplyTxn() + require.NotNil(t, txn) + + // Set some fields + txn.order = 42 + txn.sequenceNumber = 10 + txn.forceGlobal = true + + // Release should zero out the struct + releaseApplyTxn(txn) + + // Acquire again — may get the same or a new object, but it should be zeroed + txn2 := acquireApplyTxn() + require.NotNil(t, txn2) + assert.Equal(t, int64(0), txn2.order) + assert.Equal(t, int64(0), txn2.sequenceNumber) + assert.False(t, txn2.forceGlobal) + releaseApplyTxn(txn2) +} + +func TestAcquireReleaseApplyTxnPayload(t *testing.T) { + p := acquireApplyTxnPayload() + require.NotNil(t, p) + + p.timestamp = 999 + p.mustSave = true + + // Attach to a txn and release + txn := acquireApplyTxn() + txn.payload = p + releaseApplyTxn(txn) + + p2 := acquireApplyTxnPayload() + require.NotNil(t, p2) + assert.Equal(t, int64(0), p2.timestamp) + assert.False(t, p2.mustSave) + applyTxnPayloadPool.Put(p2) +} + +func TestReleaseApplyTxnNilPayload(t *testing.T) { + txn := acquireApplyTxn() + txn.order = 5 + txn.payload = nil + // Should not panic + releaseApplyTxn(txn) +} + +// ---------- scheduler gaps: advanceCommittedSequence, waitForIdle, close ---------- + +func TestApplySchedulerAdvanceCommittedSequence(t *testing.T) { + ctx := testCtx(t) + s := newApplyScheduler(ctx) + + // Initially zero + assert.Equal(t, int64(0), s.lastCommittedSequence) + + // Advance to 5 + s.advanceCommittedSequence(5) + s.mu.Lock() + assert.Equal(t, int64(5), s.lastCommittedSequence) + s.mu.Unlock() + + // Advance to 10 + 
s.advanceCommittedSequence(10) + s.mu.Lock() + assert.Equal(t, int64(10), s.lastCommittedSequence) + s.mu.Unlock() + + // Lower value does not regress + s.advanceCommittedSequence(3) + s.mu.Lock() + assert.Equal(t, int64(10), s.lastCommittedSequence) + s.mu.Unlock() + + // Zero is a no-op + s.advanceCommittedSequence(0) + s.mu.Lock() + assert.Equal(t, int64(10), s.lastCommittedSequence) + s.mu.Unlock() + + // Negative is a no-op + s.advanceCommittedSequence(-1) + s.mu.Lock() + assert.Equal(t, int64(10), s.lastCommittedSequence) + s.mu.Unlock() +} + +func TestApplySchedulerAdvanceUnblocksMeta(t *testing.T) { + ctx := testCtx(t) + s := newApplyScheduler(ctx) + + // Enqueue a non-meta txn first AND keep it inflight so that when + // meta2 is enqueued, the seeding condition is NOT met (inflightMissingMeta > 0). + // This ensures lastCommittedSequence stays 0 and meta2 is blocked. + blocker := &applyTxn{order: 1, writeset: []uint64{100}} + require.NoError(t, s.enqueue(blocker)) + gotBlocker, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, blocker, gotBlocker) + // blocker is still inflight (inflightMissingMeta=1) + + meta2 := &applyTxn{order: 2, sequenceNumber: 5, commitParent: 3, hasCommitMeta: true} + require.NoError(t, s.enqueue(meta2)) + + // meta2 has empty writeset, commitParent=3, lastCommittedSequence=0. + // Also blocked by inflightMissingMeta > 0 from the blocker. + readyCh := make(chan *applyTxn, 1) + go func() { + txn, err := s.nextReady(ctx) + if err == nil { + readyCh <- txn + } + }() + + assert.Never(t, func() bool { + return len(readyCh) > 0 + }, 50*time.Millisecond, 5*time.Millisecond) + + // Commit the blocker to clear inflightMissingMeta, but + // lastCommittedSequence is still 0 so meta2 stays blocked. 
+ require.NoError(t, s.markCommitted(gotBlocker)) + + assert.Never(t, func() bool { + return len(readyCh) > 0 + }, 50*time.Millisecond, 5*time.Millisecond) + + // Now advance committed sequence to 3 — should unblock meta2 + s.advanceCommittedSequence(3) + + assert.Eventually(t, func() bool { + return len(readyCh) > 0 + }, 200*time.Millisecond, 5*time.Millisecond) +} + +func TestApplySchedulerWaitForIdle(t *testing.T) { + ctx := testCtx(t) + s := newApplyScheduler(ctx) + + // Empty scheduler: waitForIdle returns immediately + err := s.waitForIdle(ctx) + require.NoError(t, err) + + // Enqueue and dequeue a txn, mark committed, then waitForIdle + txn := &applyTxn{order: 1, writeset: []uint64{100}} + require.NoError(t, s.enqueue(txn)) + got, err := s.nextReady(ctx) + require.NoError(t, err) + + // With inflight txn, waitForIdle should block + doneCh := make(chan error, 1) + go func() { + doneCh <- s.waitForIdle(ctx) + }() + + assert.Never(t, func() bool { + return len(doneCh) > 0 + }, 50*time.Millisecond, 5*time.Millisecond) + + // Mark committed → idle + require.NoError(t, s.markCommitted(got)) + + assert.Eventually(t, func() bool { + return len(doneCh) > 0 + }, 200*time.Millisecond, 5*time.Millisecond) + require.NoError(t, <-doneCh) + + // A dispatched-but-uncommitted noConflict txn (e.g. a position-only save) + // must also keep waitForIdle blocked: the DDL barrier relies on it to + // guarantee ALL scheduled work has been applied before the next fetch. + // noConflict txns bump only inflightNoConflict, so an idle check that + // omits that counter would let the barrier return too early. 
+ noConflict := &applyTxn{order: 2, noConflict: true} + require.NoError(t, s.enqueue(noConflict)) + gotNoConflict, err := s.nextReady(ctx) + require.NoError(t, err) + require.Same(t, noConflict, gotNoConflict) + + noConflictDone := make(chan error, 1) + go func() { + noConflictDone <- s.waitForIdle(ctx) + }() + assert.Never(t, func() bool { + return len(noConflictDone) > 0 + }, 50*time.Millisecond, 5*time.Millisecond) + + require.NoError(t, s.markCommitted(gotNoConflict)) + assert.Eventually(t, func() bool { + return len(noConflictDone) > 0 + }, 200*time.Millisecond, 5*time.Millisecond) + require.NoError(t, <-noConflictDone) +} + +func TestApplySchedulerWaitForIdleCancelled(t *testing.T) { + ctx, cancel := context.WithCancel(t.Context()) + s := newApplyScheduler(ctx) + + txn := &applyTxn{order: 1, forceGlobal: true} + require.NoError(t, s.enqueue(txn)) + _, err := s.nextReady(ctx) + require.NoError(t, err) + + doneCh := make(chan error, 1) + go func() { + doneCh <- s.waitForIdle(ctx) + }() + + cancel() + + assert.Eventually(t, func() bool { + return len(doneCh) > 0 + }, 200*time.Millisecond, 5*time.Millisecond) + err = <-doneCh + require.Error(t, err) +} + +func TestApplySchedulerClose(t *testing.T) { + ctx := testCtx(t) + s := newApplyScheduler(ctx) + + // Enqueue some transactions + require.NoError(t, s.enqueue(&applyTxn{order: 1, writeset: []uint64{100}})) + require.NoError(t, s.enqueue(&applyTxn{order: 2, writeset: []uint64{200}})) + + s.mu.Lock() + assert.Equal(t, 2, s.pendingCount) + s.mu.Unlock() + + err := s.close() + require.Error(t, err) // returns io.EOF + + s.mu.Lock() + assert.Equal(t, 2, s.pendingCount) + assert.Len(t, s.pending, 2) + assert.Equal(t, 0, s.pendingOff) + s.mu.Unlock() +} + +// ---------- noConflict scheduling tests ---------- + +func TestApplySchedulerNoConflictAlwaysReady(t *testing.T) { + ctx := testCtx(t) + s := newApplyScheduler(ctx) + + // A forceGlobal txn is inflight + global := &applyTxn{order: 1, forceGlobal: true} + 
require.NoError(t, s.enqueue(global)) + got, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, global, got) + + // Now enqueue a noConflict txn — should be ready even with inflight global + nc := &applyTxn{order: 2, noConflict: true} + require.NoError(t, s.enqueue(nc)) + + gotNC, err := s.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, nc, gotNC) + + require.NoError(t, s.markCommitted(got)) + require.NoError(t, s.markCommitted(gotNC)) +} + +// ---------- removePendingLocked compaction tests ---------- + +func TestApplySchedulerRemovePendingCompaction(t *testing.T) { + ctx := testCtx(t) + s := newApplyScheduler(ctx) + + // Enqueue 4 transactions with independent writesets + for i := int64(1); i <= 4; i++ { + require.NoError(t, s.enqueue(&applyTxn{order: i, writeset: []uint64{uint64(i)}})) + } + + // Dequeue all 4 — this exercises removePendingLocked compaction + for range 4 { + got, err := s.nextReady(ctx) + require.NoError(t, err) + require.NotNil(t, got) + require.NoError(t, s.markCommitted(got)) + } + + s.mu.Lock() + assert.Equal(t, 0, s.pendingCount) + s.mu.Unlock() +} + +// ---------- snapshotTablePlans tests ---------- + +func TestSnapshotTablePlans_Nil(t *testing.T) { + mu := &sync.RWMutex{} + version := &atomic.Int64{} + var cachedVersion int64 + result := snapshotTablePlans(mu, nil, version, &cachedVersion, nil) + assert.Nil(t, result) +} + +func TestSnapshotTablePlans_CopiesMap(t *testing.T) { + mu := &sync.RWMutex{} + plans := map[string]*TablePlan{ + "t1": {TargetName: "t1"}, + "t2": {TargetName: "t2"}, + } + version := &atomic.Int64{} + version.Store(1) + var cachedVersion int64 + + snap := snapshotTablePlans(mu, plans, version, &cachedVersion, nil) + require.Len(t, snap, 2) + assert.Equal(t, "t1", snap["t1"].TargetName) + assert.Equal(t, "t2", snap["t2"].TargetName) + assert.Equal(t, int64(1), cachedVersion) + + // Modify original — snapshot should not be affected + plans["t3"] = &TablePlan{TargetName: "t3"} + 
assert.Len(t, snap, 2) +} + +func TestSnapshotTablePlans_UsesCacheWhenVersionMatches(t *testing.T) { + mu := &sync.RWMutex{} + plans := map[string]*TablePlan{ + "t1": {TargetName: "t1"}, + } + version := &atomic.Int64{} + version.Store(5) + var cachedVersion int64 = 5 + cached := map[string]*TablePlan{ + "cached": {TargetName: "cached"}, + } + + snap := snapshotTablePlans(mu, plans, version, &cachedVersion, cached) + // Should return the cached map since versions match + require.Len(t, snap, 1) + assert.Equal(t, "cached", snap["cached"].TargetName) +} + +func TestSnapshotTablePlans_RefreshesCacheWhenVersionChanges(t *testing.T) { + mu := &sync.RWMutex{} + plans := map[string]*TablePlan{ + "t1": {TargetName: "t1"}, + } + version := &atomic.Int64{} + version.Store(6) + var cachedVersion int64 = 5 + cached := map[string]*TablePlan{ + "stale": {TargetName: "stale"}, + } + + snap := snapshotTablePlans(mu, plans, version, &cachedVersion, cached) + require.Len(t, snap, 1) + assert.Equal(t, "t1", snap["t1"].TargetName) + assert.Equal(t, int64(6), cachedVersion) +} + +// ---------- scheduleItems tests ---------- + +// testVPlayer creates a minimal vplayer stub for testing scheduleItems. +// The returned vplayer has mocked query/commit functions and a mock DB client. 
+func testVPlayer(t *testing.T) (*vplayer, *binlogplayer.MockDBClient) {
+	t.Helper()
+	mockDB := binlogplayer.NewMockDBClient(t)
+	stats := binlogplayer.NewStats()
+	// Stop the lag gauges immediately; the rest of the stats object is
+	// stopped when the test finishes.
+	stats.VReplicationLagGauges.Stop()
+	t.Cleanup(stats.Stop)
+
+	// NOTE(review): nil presumably means "no overrides" — confirm against
+	// NewVReplicationConfig. Tests are free to mutate the returned config;
+	// TestTestVPlayerDoesNotMutateDefaultWorkflowConfig checks that doing so
+	// does not affect configs created later.
+	config, err := vttablet.NewVReplicationConfig(nil)
+	require.NoError(t, err)
+	// The vreplicator uses id 1, an empty Engine, and a source whose OnDdl
+	// action is IGNORE. Its dbClient wraps mockDB.
+	vr := &vreplicator{
+		id:             1,
+		stats:          stats,
+		dbClient:       newVDBClient(mockDB, stats, config.RelayLogMaxItems),
+		workflowConfig: config,
+		vre:            &Engine{},
+		source:         &binlogdatapb.BinlogSource{OnDdl: binlogdatapb.OnDDLAction_IGNORE},
+	}
+
+	// query and commit are stubs that never reach mockDB: query returns an
+	// empty result and commit returns nil. dbClient is the same vdbClient the
+	// vreplicator holds.
+	vp := &vplayer{
+		vr:                vr,
+		tablePlansMu:      &sync.RWMutex{},
+		tablePlans:        make(map[string]*TablePlan),
+		tablePlansVersion: &atomic.Int64{},
+		serialMu:          &sync.Mutex{},
+		parallelOrder:     &atomic.Int64{},
+		lagSnapshot:       &atomic.Pointer[lagSnapshot]{},
+		timeLastSaved:     time.Now(),
+		idStr:             "1",
+		query: func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+			return &sqltypes.Result{}, nil
+		},
+		commit: func() error {
+			return nil
+		},
+		dbClient: vr.dbClient,
+	}
+	return vp, mockDB
+}
+
+// TestTestVPlayerDoesNotMutateDefaultWorkflowConfig checks that changing the
+// workflow config of a vplayer built by testVPlayer does not leak into a config
+// created afterwards: the defaults are set to 1 worker, the vplayer's config is
+// set to 2, and a fresh NewVReplicationConfig(nil) must still report 1.
+func TestTestVPlayerDoesNotMutateDefaultWorkflowConfig(t *testing.T) {
+	defaults := vttablet.InitVReplicationConfigDefaults()
+	savedWorkers := defaults.ParallelReplicationWorkers
+	// Put the original worker count back once the test ends.
+	t.Cleanup(func() {
+		defaults.ParallelReplicationWorkers = savedWorkers
+	})
+	defaults.ParallelReplicationWorkers = 1
+
+	vp, _ := testVPlayer(t)
+	vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+
+	config, err := vttablet.NewVReplicationConfig(nil)
+	require.NoError(t, err)
+	assert.Equal(t, 1, config.ParallelReplicationWorkers)
+}
+
+// TestExtractDDLAffectedTables_MixedCaseDDLMatchesLowercasePlan checks that an
+// ALTER statement naming the table "T1" is matched to the plan keyed "t1": the
+// result must not be conservative and must track "t1" with "t1" among its
+// refreshedPlans.
+func TestExtractDDLAffectedTables_MixedCaseDDLMatchesLowercasePlan(t *testing.T) {
+	tracked, conservative := extractDDLAffectedTables(
+		"alter table T1 add column c1 bigint",
+		sqlparser.NewTestParser(),
+		map[string]*TablePlan{
+			"t1": {TargetName: "t1"},
+		},
+		nil,
+	)
+
+	require.False(t, conservative)
+	require.Contains(t, tracked, "t1")
+	require.Contains(t, tracked["t1"].refreshedPlans, "t1")
+}
+
+// TestResolvedPostDDLStalePlans_MixedCaseDroppedNameMatchesLowercaseBarrier
+// checks that resolvedPostDDLStalePlans reports the "t1" barrier entry when the
+// name set passed as the second argument spells the table "T1".
+func TestResolvedPostDDLStalePlans_MixedCaseDroppedNameMatchesLowercaseBarrier(t *testing.T) {
+	stalePlan := &TablePlan{TargetName: "t1"}
+	resolved := resolvedPostDDLStalePlans(
+		map[string]*TablePlan{"t1": stalePlan},
+		map[string]struct{}{"T1": {}},
+		map[string]postDDLStalePlan{
+			"t1": {
+				stalePlan:      stalePlan,
+				refreshedPlans: map[string]*TablePlan{"t1": stalePlan},
+				allowDisappear: true,
+			},
+		},
+	)
+
+	require.Contains(t, resolved, "t1")
+}
+
+// TestSnapshotPostDDLStalePlans_MixedCaseDroppedNameSkipsLowercasePlan checks
+// that snapshotPostDDLStalePlans returns nil when the only plan is keyed "t1"
+// and the name set passed as the second argument contains "T1".
+func TestSnapshotPostDDLStalePlans_MixedCaseDroppedNameSkipsLowercasePlan(t *testing.T) {
+	tracked := snapshotPostDDLStalePlans(
+		map[string]*TablePlan{"t1": {TargetName: "t1"}},
+		map[string]struct{}{"T1": {}},
+	)
+
+	assert.Nil(t, tracked)
+}
+
+// TestUnresolvedPostDDLStalePlans_MixedCaseRefreshNameMatchesLowercasePlan
+// checks that unresolvedPostDDLStalePlans returns nil when the barrier's
+// refreshedPlans key is "T1_NEW" and the current plans contain "t1_new".
+func TestUnresolvedPostDDLStalePlans_MixedCaseRefreshNameMatchesLowercasePlan(t *testing.T) {
+	unresolved := unresolvedPostDDLStalePlans(
+		map[string]*TablePlan{"t1_new": {TargetName: "t1_new"}},
+		nil,
+		map[string]postDDLStalePlan{
+			"t1": {
+				stalePlan:      &TablePlan{TargetName: "t1"},
+				refreshedPlans: map[string]*TablePlan{"T1_NEW": nil},
+			},
+		},
+	)
+
+	assert.Nil(t, unresolved)
+}
+
+// TestTxnTouchesPostDDLBarrier_MixedCaseRefreshTargetMatchesLowercaseRow checks
+// that a ROW event on table "t1_new" is reported as touching a barrier whose
+// refreshedPlans key is "T1_NEW".
+func TestTxnTouchesPostDDLBarrier_MixedCaseRefreshTargetMatchesLowercaseRow(t *testing.T) {
+	touches := txnTouchesPostDDLBarrier(
+		[]*binlogdatapb.VEvent{{
+			Type: binlogdatapb.VEventType_ROW,
+			RowEvent: &binlogdatapb.RowEvent{
+				TableName:  "t1_new",
+				RowChanges: []*binlogdatapb.RowChange{{}},
+			},
+		}},
+		map[string]postDDLStalePlan{
+			"t1": {
+				stalePlan:      &TablePlan{TargetName: "t1"},
+				refreshedPlans: map[string]*TablePlan{"T1_NEW": nil},
+			},
+		},
+		false,
+	)
+
+	assert.True(t, touches)
+}
+
+// TestPostDDLRefreshTargetMatchesCachedPlan_MixedCaseRefreshNameMatches checks
+// that looking up "t1_new" with the cached plan matches a barrier entry whose
+// refreshedPlans stores that same plan under the key "T1_NEW".
+func TestPostDDLRefreshTargetMatchesCachedPlan_MixedCaseRefreshNameMatches(t *testing.T) {
+	cachedPlan := &TablePlan{TargetName: "t1_new"}
+	matches := postDDLRefreshTargetMatchesCachedPlan(
+		map[string]postDDLStalePlan{
+			"t1": {
+				stalePlan:      &TablePlan{TargetName: "t1"},
+				refreshedPlans: map[string]*TablePlan{"T1_NEW": cachedPlan},
+			},
+		},
+		"t1_new",
+		cachedPlan,
+	)
+
+	assert.True(t, matches)
+}
+
+// TestApplyEvent_FieldClearsMixedCaseDroppedTableEntry checks that applying a
+// FIELD event for table "t1" leaves postDDLDroppedTables empty even though the
+// entry was recorded as "T1".
+func TestApplyEvent_FieldClearsMixedCaseDroppedTableEntry(t *testing.T) {
+	vp, mockDB := testVPlayer(t)
+	ctx := testCtx(t)
+	vp.vr.workflowConfig.ParallelReplicationWorkers = 1
+	// Seed the dropped-table set with the upper-case spelling.
+	vp.postDDLDroppedTables = map[string]struct{}{"T1": {}}
+	mockDB.AddInvariant("begin", &sqltypes.Result{})
+	vp.replicatorPlan = &ReplicatorPlan{TablePlans: map[string]*TablePlan{
+		"t1": {
+			TargetName:       "t1",
+			IdentityColumns:  []string{"id"},
+			Insert:           sqlparser.BuildParsedQuery("insert into t1 values (:a)"),
+			TablePlanBuilder: &tablePlanBuilder{},
+		},
+	}}
+
+	require.NoError(t, vp.applyEvent(ctx, &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_FIELD,
+		FieldEvent: &binlogdatapb.FieldEvent{
+			TableName: "t1",
+			Fields: []*querypb.Field{
+				{Name: "id", Type: querypb.Type_INT64},
+			},
+		},
+	}, false))
+
+	assert.Empty(t, vp.postDDLDroppedTables)
+}
+
+// publishExecutedDDLBarrier mirrors commitLoop's post-DDL publication so the
+// scheduler tests can model only DDLs that actually executed on the target.
+func publishExecutedDDLBarrier(t *testing.T, vp *vplayer, statement string) {
+	t.Helper()
+	// serialMu is held for the whole publication.
+	vp.serialMu.Lock()
+	defer vp.serialMu.Unlock()
+	// The read lock on tablePlansMu covers only the section that reads
+	// vp.tablePlans. It is released explicitly, before the postDDL* fields
+	// are assigned below.
+	vp.tablePlansMu.RLock()
+	renameTargets := extractDDLRenameTargets(statement, vp.vr.vre.env.Parser())
+	// NOTE(review): the return value (if any) is unused, so this presumably
+	// updates vp.postDDLStalePlans in place — confirm.
+	retargetPostDDLStalePlans(vp.postDDLStalePlans, renameTargets, vp.tablePlans)
+	ddlStalePlans, conservative := extractDDLAffectedTables(statement, vp.vr.vre.env.Parser(), vp.tablePlans, vp.postDDLDroppedTables)
+	// Filter the entries produced by this statement through
+	// unresolvedPostDDLStalePlans before they are merged.
+	ddlStalePlans = unresolvedPostDDLStalePlans(vp.tablePlans, vp.postDDLDroppedTables, ddlStalePlans)
+	vp.tablePlansMu.RUnlock()
+	// Publish: merge the new stale plans, OR in the conservative flag (it is
+	// never cleared here), and merge the tables dropped by this statement.
+	vp.postDDLStalePlans = mergePostDDLStalePlans(vp.postDDLStalePlans, ddlStalePlans)
+	vp.postDDLConservative = vp.postDDLConservative || conservative
+	vp.postDDLDroppedTables = mergeDroppedTables(vp.postDDLDroppedTables, extractDroppedTables(statement, vp.vr.vre.env.Parser()))
+}
+
+// commitScheduledExecutedDDL models a commitLoop DDL commit and then syncs the
+// resulting barrier into scheduler state the way the next fetch would observe it.
+func commitScheduledExecutedDDL(t *testing.T, ctx context.Context, scheduler *applyScheduler, state *parallelScheduleState, vp *vplayer) { + t.Helper() + ddlTxn, err := scheduler.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, binlogdatapb.VEventType_DDL, ddlTxn.payload.events[0].Type) + publishExecutedDDLBarrier(t, vp, ddlTxn.payload.events[0].Statement) + require.NoError(t, scheduler.markCommitted(ddlTxn)) + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, nil)) +} + +func TestApplyEventsParallelCanceledContext(t *testing.T) { + vp, _ := testVPlayer(t) + + ctx, cancel := context.WithCancel(testCtx(t)) + cancel() + + vp.vr.workflowConfig.ParallelReplicationWorkers = 1 + + relay := newRelayLog(ctx, 10, 100) + + err := vp.applyEventsParallel(ctx, relay) + require.ErrorIs(t, err, context.Canceled) +} + +func TestApplyEventsParallelReturnsScheduleError(t *testing.T) { + vp, mockDB := testVPlayer(t) + ctx := testCtx(t) + + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + + mockDB.AddInvariant("set @@session.time_zone", &sqltypes.Result{}) + mockDB.AddInvariant("set session transaction isolation level read committed", &sqltypes.Result{}) + mockDB.AddInvariant("set names 'binary'", &sqltypes.Result{}) + mockDB.AddInvariant("set @@session.net_read_timeout", &sqltypes.Result{}) + mockDB.AddInvariant("set @@session.net_write_timeout", &sqltypes.Result{}) + mockDB.AddInvariant("set @@session.sql_mode", &sqltypes.Result{}) + mockDB.AddInvariant("information_schema.key_column_usage", &sqltypes.Result{}) + mockDB.AddInvariant("select pos, stop_pos, max_tps, max_replication_lag, state, workflow_type, workflow, workflow_sub_type, defer_secondary_keys, options from _vt.vreplication where id=1", sqlModeWorkflowSettingsResult(binlogdatapb.VReplicationWorkflowType_MoveTables)) + mockDB.AddInvariant("select @@session.sql_mode as sql_mode", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("sql_mode", "varchar"), + "STRICT_TRANS_TABLES", + )) + 
mockDB.AddInvariant("select count(distinct table_name) from _vt.copy_state where vrepl_id=1", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("count(distinct table_name)", "int64"), + "0", + )) + mockDB.AddInvariant("max_allowed_packet", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("max_allowed_packet", "int64"), + "4194304", + )) + mockDB.AddInvariant("rollback", &sqltypes.Result{}) + + if vp.vr.vre == nil { + vp.vr.vre = &Engine{} + } + if vp.vr.vre.throttlerClient == nil { + vp.vr.vre.throttlerClient = throttle.NewBackgroundClient(nil, throttlerapp.VReplicationName, base.UndefinedScope) + } + if vp.vr.vre.dbClientFactoryFiltered == nil { + vp.vr.vre.dbClientFactoryFiltered = func() binlogplayer.DBClient { return mockDB } + } + + relay := newRelayLog(ctx, 10, 100) + invalidGTID := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "invalid"} + require.NoError(t, relay.Send([]*binlogdatapb.VEvent{invalidGTID})) + + err := vp.applyEventsParallel(ctx, relay) + require.Error(t, err) +} + +func TestApplyEventsParallelCommitsScheduledPrefixBeforeScheduleError(t *testing.T) { + vp, mockDB := testVPlayer(t) + ctx := testCtx(t) + + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + vp.vr.workflowConfig.StoreCompressedGTID = false + + mockDB.AddInvariant("set @@session.time_zone", &sqltypes.Result{}) + mockDB.AddInvariant("set session transaction isolation level read committed", &sqltypes.Result{}) + mockDB.AddInvariant("set names 'binary'", &sqltypes.Result{}) + mockDB.AddInvariant("set @@session.net_read_timeout", &sqltypes.Result{}) + mockDB.AddInvariant("set @@session.net_write_timeout", &sqltypes.Result{}) + mockDB.AddInvariant("set @@session.sql_mode", &sqltypes.Result{}) + mockDB.AddInvariant("set @@session.foreign_key_checks", &sqltypes.Result{}) + mockDB.AddInvariant("information_schema.key_column_usage", &sqltypes.Result{}) + mockDB.AddInvariant("select pos, stop_pos, max_tps, max_replication_lag, state, workflow_type, workflow, 
workflow_sub_type, defer_secondary_keys, options from _vt.vreplication where id=1", sqlModeWorkflowSettingsResult(binlogdatapb.VReplicationWorkflowType_MoveTables)) + mockDB.AddInvariant("select count(distinct table_name) from _vt.copy_state where vrepl_id=1", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("count(distinct table_name)", "int64"), + "0", + )) + mockDB.AddInvariant("max_allowed_packet", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("max_allowed_packet", "int64"), + "4194304", + )) + mockDB.ExpectRequestRE("update _vt\\.vreplication set pos=", &sqltypes.Result{}, nil) + mockDB.AddInvariant("rollback", &sqltypes.Result{}) + vp.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) { + return vp.dbClient.Execute(sql) + } + vp.commit = vp.dbClient.Commit + + if vp.vr.vre == nil { + vp.vr.vre = &Engine{} + } + if vp.vr.vre.throttlerClient == nil { + vp.vr.vre.throttlerClient = throttle.NewBackgroundClient(nil, throttlerapp.VReplicationName, base.UndefinedScope) + } + if vp.vr.vre.dbClientFactoryFiltered == nil { + vp.vr.vre.dbClientFactoryFiltered = func() binlogplayer.DBClient { return mockDB } + } + + relay := newRelayLog(ctx, 10, 100) + validGTID := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"} + otherEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_OTHER, Timestamp: 100} + invalidGTID := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "invalid"} + require.NoError(t, relay.Send([]*binlogdatapb.VEvent{validGTID, otherEvent, invalidGTID})) + + err := vp.applyEventsParallel(ctx, relay) + require.Error(t, err) + mockDB.Wait() +} + +func TestApplyEventsParallelReturnsNilAfterScheduledStopPosEvenIfLaterScheduleFails(t *testing.T) { + vp, mockDB := testVPlayer(t) + ctx := testCtx(t) + + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + vp.vr.workflowConfig.StoreCompressedGTID = false + + mockDB.AddInvariant("set @@session.time_zone", 
&sqltypes.Result{}) + mockDB.AddInvariant("set session transaction isolation level read committed", &sqltypes.Result{}) + mockDB.AddInvariant("set names 'binary'", &sqltypes.Result{}) + mockDB.AddInvariant("set @@session.net_read_timeout", &sqltypes.Result{}) + mockDB.AddInvariant("set @@session.net_write_timeout", &sqltypes.Result{}) + mockDB.AddInvariant("set @@session.sql_mode", &sqltypes.Result{}) + mockDB.AddInvariant("information_schema.key_column_usage", &sqltypes.Result{}) + mockDB.AddInvariant("select pos, stop_pos, max_tps, max_replication_lag, state, workflow_type, workflow, workflow_sub_type, defer_secondary_keys, options from _vt.vreplication where id=1", sqlModeWorkflowSettingsResult(binlogdatapb.VReplicationWorkflowType_MoveTables)) + mockDB.AddInvariant("select @@session.sql_mode as sql_mode", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("sql_mode", "varchar"), + "STRICT_TRANS_TABLES", + )) + mockDB.AddInvariant("select count(distinct table_name) from _vt.copy_state where vrepl_id=1", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("count(distinct table_name)", "int64"), + "0", + )) + mockDB.AddInvariant("max_allowed_packet", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("max_allowed_packet", "int64"), + "4194304", + )) + mockDB.ExpectRequestRE("update _vt\\.vreplication set pos=", &sqltypes.Result{}, nil) + mockDB.AddInvariant("rollback", &sqltypes.Result{}) + vp.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) { + return vp.dbClient.Execute(sql) + } + vp.commit = vp.dbClient.Commit + + if vp.vr.vre == nil { + vp.vr.vre = &Engine{} + } + if vp.vr.vre.throttlerClient == nil { + vp.vr.vre.throttlerClient = throttle.NewBackgroundClient(nil, throttlerapp.VReplicationName, base.UndefinedScope) + } + if vp.vr.vre.dbClientFactoryFiltered == nil { + vp.vr.vre.dbClientFactoryFiltered = func() binlogplayer.DBClient { return mockDB } + } + + stopPos, err := 
replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5") + require.NoError(t, err) + vp.stopPos = stopPos + + relay := newRelayLog(ctx, 10, 100) + validGTID := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: replication.EncodePosition(stopPos)} + otherEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_OTHER, Timestamp: 100} + invalidGTID := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "invalid"} + require.NoError(t, relay.Send([]*binlogdatapb.VEvent{validGTID, otherEvent, invalidGTID})) + + err = vp.applyEventsParallel(ctx, relay) + require.NoError(t, err) + mockDB.Wait() +} + +func TestApplyEventsParallelReturnsNilAfterEmptyTxnStopPosEvenIfLaterScheduleFails(t *testing.T) { + vp, mockDB := testVPlayer(t) + ctx := testCtx(t) + + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + vp.vr.workflowConfig.StoreCompressedGTID = false + + mockDB.AddInvariant("set @@session.time_zone", &sqltypes.Result{}) + mockDB.AddInvariant("set session transaction isolation level read committed", &sqltypes.Result{}) + mockDB.AddInvariant("set names 'binary'", &sqltypes.Result{}) + mockDB.AddInvariant("set @@session.net_read_timeout", &sqltypes.Result{}) + mockDB.AddInvariant("set @@session.net_write_timeout", &sqltypes.Result{}) + mockDB.AddInvariant("set @@session.sql_mode", &sqltypes.Result{}) + mockDB.AddInvariant("information_schema.key_column_usage", &sqltypes.Result{}) + mockDB.AddInvariant("select pos, stop_pos, max_tps, max_replication_lag, state, workflow_type, workflow, workflow_sub_type, defer_secondary_keys, options from _vt.vreplication where id=1", sqlModeWorkflowSettingsResult(binlogdatapb.VReplicationWorkflowType_MoveTables)) + mockDB.AddInvariant("select @@session.sql_mode as sql_mode", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("sql_mode", "varchar"), + "STRICT_TRANS_TABLES", + )) + mockDB.AddInvariant("select count(distinct table_name) from _vt.copy_state where vrepl_id=1", 
sqltypes.MakeTestResult( + sqltypes.MakeTestFields("count(distinct table_name)", "int64"), + "0", + )) + mockDB.AddInvariant("max_allowed_packet", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("max_allowed_packet", "int64"), + "4194304", + )) + mockDB.ExpectRequestRE("update _vt\\.vreplication set pos=", &sqltypes.Result{}, nil) + mockDB.AddInvariant("rollback", &sqltypes.Result{}) + vp.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) { + return vp.dbClient.Execute(sql) + } + vp.commit = vp.dbClient.Commit + + if vp.vr.vre == nil { + vp.vr.vre = &Engine{} + } + if vp.vr.vre.throttlerClient == nil { + vp.vr.vre.throttlerClient = throttle.NewBackgroundClient(nil, throttlerapp.VReplicationName, base.UndefinedScope) + } + if vp.vr.vre.dbClientFactoryFiltered == nil { + vp.vr.vre.dbClientFactoryFiltered = func() binlogplayer.DBClient { return mockDB } + } + + stopPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5") + require.NoError(t, err) + vp.stopPos = stopPos + + relay := newRelayLog(ctx, 10, 100) + validGTID := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: replication.EncodePosition(stopPos)} + commitEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT, Timestamp: 100} + invalidGTID := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "invalid"} + require.NoError(t, relay.Send([]*binlogdatapb.VEvent{validGTID, commitEvent, invalidGTID})) + + err = vp.applyEventsParallel(ctx, relay) + require.NoError(t, err) + mockDB.Wait() +} + +func TestApplyEventsParallelReturnsNilAfterScheduledStopDDLEvenIfLaterScheduleFails(t *testing.T) { + vp, mockDB := testVPlayer(t) + ctx := testCtx(t) + + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + vp.vr.workflowConfig.StoreCompressedGTID = false + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_STOP + + mockDB.AddInvariant("set @@session.time_zone", &sqltypes.Result{}) + mockDB.AddInvariant("set session transaction 
isolation level read committed", &sqltypes.Result{}) + mockDB.AddInvariant("set names 'binary'", &sqltypes.Result{}) + mockDB.AddInvariant("set @@session.net_read_timeout", &sqltypes.Result{}) + mockDB.AddInvariant("set @@session.net_write_timeout", &sqltypes.Result{}) + mockDB.AddInvariant("set @@session.sql_mode", &sqltypes.Result{}) + mockDB.AddInvariant("information_schema.key_column_usage", &sqltypes.Result{}) + mockDB.AddInvariant("select pos, stop_pos, max_tps, max_replication_lag, state, workflow_type, workflow, workflow_sub_type, defer_secondary_keys, options from _vt.vreplication where id=1", sqlModeWorkflowSettingsResult(binlogdatapb.VReplicationWorkflowType_MoveTables)) + mockDB.AddInvariant("select @@session.sql_mode as sql_mode", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("sql_mode", "varchar"), + "STRICT_TRANS_TABLES", + )) + mockDB.AddInvariant("select count(distinct table_name) from _vt.copy_state where vrepl_id=1", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("count(distinct table_name)", "int64"), + "0", + )) + mockDB.AddInvariant("max_allowed_packet", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("max_allowed_packet", "int64"), + "4194304", + )) + mockDB.ExpectRequestRE("update _vt\\.vreplication set pos=", &sqltypes.Result{}, nil) + mockDB.ExpectRequestRE("update _vt\\.vreplication set state=", &sqltypes.Result{}, nil) + mockDB.AddInvariant("begin", &sqltypes.Result{}) + mockDB.AddInvariant("commit", &sqltypes.Result{}) + mockDB.AddInvariant("rollback", &sqltypes.Result{}) + vp.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) { + return vp.dbClient.Execute(sql) + } + vp.commit = vp.dbClient.Commit + + if vp.vr.vre == nil { + vp.vr.vre = &Engine{} + } + if vp.vr.vre.throttlerClient == nil { + vp.vr.vre.throttlerClient = throttle.NewBackgroundClient(nil, throttlerapp.VReplicationName, base.UndefinedScope) + } + if vp.vr.vre.dbClientFactoryFiltered == nil { + vp.vr.vre.dbClientFactoryFiltered = func() 
binlogplayer.DBClient { return mockDB } + } + + relay := newRelayLog(ctx, 10, 100) + validGTID := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"} + ddlEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_DDL, Statement: "alter table t1 add column c1 int", Timestamp: 100} + invalidGTID := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "invalid"} + require.NoError(t, relay.Send([]*binlogdatapb.VEvent{validGTID, ddlEvent, invalidGTID})) + + err := vp.applyEventsParallel(ctx, relay) + require.NoError(t, err) +} + +func TestApplyEventsParallelReturnsNilAfterScheduledRelevantJournalEvenIfLaterScheduleFails(t *testing.T) { + vp, mockDB := testVPlayer(t) + ctx := testCtx(t) + + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + vp.vr.workflowConfig.StoreCompressedGTID = false + vp.replicatorPlan = &ReplicatorPlan{TablePlans: map[string]*TablePlan{ + "t1": {TargetName: "t1"}, + }} + + mockDB.AddInvariant("set @@session.time_zone", &sqltypes.Result{}) + mockDB.AddInvariant("set session transaction isolation level read committed", &sqltypes.Result{}) + mockDB.AddInvariant("set names 'binary'", &sqltypes.Result{}) + mockDB.AddInvariant("set @@session.net_read_timeout", &sqltypes.Result{}) + mockDB.AddInvariant("set @@session.net_write_timeout", &sqltypes.Result{}) + mockDB.AddInvariant("set @@session.sql_mode", &sqltypes.Result{}) + mockDB.AddInvariant("information_schema.key_column_usage", &sqltypes.Result{}) + mockDB.AddInvariant("select pos, stop_pos, max_tps, max_replication_lag, state, workflow_type, workflow, workflow_sub_type, defer_secondary_keys, options from _vt.vreplication where id=1", sqlModeWorkflowSettingsResult(binlogdatapb.VReplicationWorkflowType_MoveTables)) + mockDB.AddInvariant("select @@session.sql_mode as sql_mode", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("sql_mode", "varchar"), + "STRICT_TRANS_TABLES", + )) + mockDB.AddInvariant("select count(distinct 
table_name) from _vt.copy_state where vrepl_id=1", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("count(distinct table_name)", "int64"), + "0", + )) + mockDB.AddInvariant("max_allowed_packet", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("max_allowed_packet", "int64"), + "4194304", + )) + mockDB.AddInvariant("rollback", &sqltypes.Result{}) + vp.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) { + return vp.dbClient.Execute(sql) + } + vp.commit = vp.dbClient.Commit + + if vp.vr.vre == nil { + vp.vr.vre = &Engine{} + } + if vp.vr.vre.throttlerClient == nil { + vp.vr.vre.throttlerClient = throttle.NewBackgroundClient(nil, throttlerapp.VReplicationName, base.UndefinedScope) + } + if vp.vr.vre.dbClientFactoryFiltered == nil { + vp.vr.vre.dbClientFactoryFiltered = func() binlogplayer.DBClient { return mockDB } + } + vp.vr.vre.isOpen = true + vp.vr.vre.journaler = make(map[string]*journalEvent) + vp.vr.vre.controllers = map[int32]*controller{ + vp.vr.id: { + workflow: "wf", + source: &binlogdatapb.BinlogSource{ + Keyspace: "ks", + Shard: "0", + }, + }, + 2: { + workflow: "wf", + source: &binlogdatapb.BinlogSource{ + Keyspace: "ks", + Shard: "1", + }, + }, + } + + relay := newRelayLog(ctx, 10, 100) + validGTID := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"} + journalEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_JOURNAL, Timestamp: 100, Journal: &binlogdatapb.Journal{ + Id: 1, + MigrationType: binlogdatapb.MigrationType_TABLES, + Participants: []*binlogdatapb.KeyspaceShard{{ + Keyspace: "ks", + Shard: "0", + }, { + Keyspace: "ks", + Shard: "1", + }}, + Tables: []string{"t1"}, + }} + invalidGTID := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "invalid"} + require.NoError(t, relay.Send([]*binlogdatapb.VEvent{validGTID, journalEvent, invalidGTID})) + + err := vp.applyEventsParallel(ctx, relay) + require.NoError(t, err) +} + +func 
TestApplyEventsParallelReturnsWorkerErrorEvenIfCancellationLooksLikeEOF(t *testing.T) { + vp, mockDB := testVPlayer(t) + ctx := testCtx(t) + + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + vp.vr.workflowConfig.ExperimentalFlags = 0 + vp.canAcceptStmtEvents = true + + workerApplyErr := errors.New("worker apply failed") + mockDB.AddInvariant("information_schema.key_column_usage", &sqltypes.Result{}) + mockDB.AddInvariant("select count(distinct table_name) from _vt.copy_state where vrepl_id=1", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("count(distinct table_name)", "int64"), + "0", + )) + mockDB.AddInvariant("rollback", &sqltypes.Result{}) + + if vp.vr.vre == nil { + vp.vr.vre = &Engine{} + } + if vp.vr.vre.throttlerClient == nil { + vp.vr.vre.throttlerClient = throttle.NewBackgroundClient(nil, throttlerapp.VReplicationName, base.UndefinedScope) + } + vp.vr.vre.dbClientFactoryFiltered = func() binlogplayer.DBClient { + return &failingDBClient{failOnQuery: map[string]error{ + "insert into t1": workerApplyErr, + }} + } + + relay := newRelayLog(ctx, 10, 100) + require.NoError(t, relay.Send([]*binlogdatapb.VEvent{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_INSERT, Dml: "insert into t1(id) values (1)", Timestamp: 100}, + {Type: binlogdatapb.VEventType_COMMIT}, + })) + + err := vp.applyEventsParallel(ctx, relay) + require.ErrorContains(t, err, workerApplyErr.Error()) +} + +type blockingBatchDBClient struct { + recordingDBClient + blockMulti chan struct{} + entered chan struct{} + closed chan struct{} +} + +func (b *blockingBatchDBClient) ExecuteFetchMulti(query string, maxrows int) ([]*sqltypes.Result, error) { + b.queries = append(b.queries, query) + select { + case b.entered <- struct{}{}: + default: + } + select { + case <-b.blockMulti: + case <-b.closed: + return nil, context.Canceled + } + return []*sqltypes.Result{{}}, nil +} + +func (b *blockingBatchDBClient) 
Close() { + select { + case <-b.closed: + default: + close(b.closed) + } +} + +func TestWorkerLoopCancelDoesNotUnblockBlockedBatchFlush(t *testing.T) { + ctx, cancel := context.WithCancel(testCtx(t)) + defer cancel() + + vp, _ := testVPlayer(t) + vp.vr.storeState(binlogdatapb.VReplicationWorkflowState_Running) + vp.batchMode = true + vp.replicatorPlan = &ReplicatorPlan{TablePlans: map[string]*TablePlan{ + "t1": { + TargetName: "t1", + IdentityColumns: []string{"id"}, + Insert: sqlparser.BuildParsedQuery("insert into t1 values (:a_id)"), + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + PKReferences: []string{"id"}, + Stats: vp.vr.stats, + TablePlanBuilder: &tablePlanBuilder{}, + WorkflowConfig: vp.vr.workflowConfig, + }, + }} + vp.tablePlans["t1"] = vp.replicatorPlan.TablePlans["t1"] + vp.tablePlansVersion.Store(1) + + scheduler := newApplyScheduler(ctx) + commitCh := make(chan *applyTxn, 1) + blockingClient := &blockingBatchDBClient{ + blockMulti: make(chan struct{}), + entered: make(chan struct{}, 1), + closed: make(chan struct{}), + } + workerClient := newVDBClient(blockingClient, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + workerClient.maxBatchSize = 1024 + worker := &applyWorker{ + ctx: ctx, + conns: [2]*vdbClient{workerClient, workerClient}, + client: workerClient, + batchMode: true, + } + worker.bindFunctions() + + txn := acquireApplyTxn() + t.Cleanup(func() { + close(blockingClient.blockMulti) + }) + txn.order = 1 + txn.payload = &applyTxnPayload{events: []*binlogdatapb.VEvent{{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{ + After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}, + }}, + }, + }}} + require.NoError(t, scheduler.enqueue(txn)) + + errCh := make(chan error, 1) + go func() { + errCh <- vp.workerLoop(ctx, scheduler, commitCh, worker) + }() + + select { + case <-blockingClient.entered: + case 
<-time.After(200 * time.Millisecond): + t.Fatal("timed out waiting for blocking batch flush") + } + + cancel() + + select { + case err := <-errCh: + require.ErrorIs(t, err, context.Canceled) + case <-time.After(200 * time.Millisecond): + t.Fatal("workerLoop remained stuck after cancellation while batch flush was blocked") + } +} + +func TestApplyEventsParallelCancelledContext(t *testing.T) { + vp, _ := testVPlayer(t) + + ctx, cancel := context.WithCancel(testCtx(t)) + cancel() + + vp.vr.workflowConfig.ParallelReplicationWorkers = 1 + + relay := newRelayLog(ctx, 10, 100) + + err := vp.applyEventsParallel(ctx, relay) + require.ErrorIs(t, err, context.Canceled) +} + +func TestApplyEventsParallelParallelWorkersFailFastOnCanceledContext(t *testing.T) { + vp, _ := testVPlayer(t) + + ctx, cancel := context.WithCancel(testCtx(t)) + cancel() + + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + vp.vr.vre.dbClientFactoryFiltered = func() binlogplayer.DBClient { + panic("worker factory should not be called for canceled context") + } + + relay := newRelayLog(ctx, 10, 100) + + var err error + require.NotPanics(t, func() { + err = vp.applyEventsParallel(ctx, relay) + }) + require.ErrorIs(t, err, context.Canceled) +} + +func TestScheduleItems_GTIDAndROWAndCOMMIT(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + // Set up a table plan so writeset can be built + vp.tablePlans["t1"] = &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + vp.tablePlansVersion.Store(1) + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + } + rowEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: 
[]*binlogdatapb.RowChange{ + {After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}, + }, + }, + Timestamp: 100, + } + commitEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_COMMIT, + } + + items := [][]*binlogdatapb.VEvent{{gtidEvent, rowEvent, commitEvent}} + err := vp.scheduleItems(ctx, scheduler, state, items) + require.NoError(t, err) + + // Should have enqueued exactly one transaction + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + require.NotNil(t, got) + assert.Equal(t, int64(1), got.order) + assert.NotNil(t, got.payload) + assert.Len(t, got.payload.events, 1) // ROW event only + assert.Equal(t, binlogdatapb.VEventType_ROW, got.payload.events[0].Type) + assert.True(t, got.payload.rowOnly) +} + +func TestScheduleItemsBackpressuresOutstandingOrderedTransactions(t *testing.T) { + ctx, cancel := context.WithCancel(testCtx(t)) + defer cancel() + + vp, _ := testVPlayer(t) + scheduler := newApplyScheduler(ctx) + scheduler.maxOutstandingOrders = 3 + state := ¶llelScheduleState{ + lastFlushTime: time.Now(), + lastHeartbeatRefresh: time.Now(), + maxBatchedCommits: 1, + } + + vp.tablePlans["t1"] = &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + vp.tablePlansVersion.Store(1) + + batch := make([]*binlogdatapb.VEvent, 0, 12) + for i := 1; i <= 4; i++ { + gtid := fmt.Sprintf("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-%d", i) + value := strconv.Itoa(i) + batch = append(batch, + &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: gtid}, + &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{ + After: &querypb.Row{Values: []byte(value), Lengths: []int64{int64(len(value))}}, + }}, + }, Timestamp: int64(100 + i)}, + &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT}, + ) + } + + errCh := make(chan error, 1) + go func() { + errCh <- 
vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{batch}) + }() + + assert.Eventually(t, func() bool { + scheduler.mu.Lock() + defer scheduler.mu.Unlock() + return int64(scheduler.pendingCount) >= scheduler.maxOutstandingOrders + }, 200*time.Millisecond, 5*time.Millisecond) + + assert.Never(t, func() bool { + scheduler.mu.Lock() + defer scheduler.mu.Unlock() + return int64(scheduler.pendingCount) > scheduler.maxOutstandingOrders + }, 100*time.Millisecond, 5*time.Millisecond) + + assert.Never(t, func() bool { + return len(errCh) > 0 + }, 100*time.Millisecond, 5*time.Millisecond) + + cancel() + require.ErrorIs(t, <-errCh, context.Canceled) +} + +func TestScheduleLoopCanceledContext(t *testing.T) { + vp, _ := testVPlayer(t) + + ctx, cancel := context.WithCancel(testCtx(t)) + cancel() + + scheduler := newApplyScheduler(ctx) + relay := newRelayLog(ctx, 10, 100) + + err := vp.scheduleLoop(ctx, relay, scheduler) + require.ErrorIs(t, err, context.Canceled) +} + +func TestScheduleLoopProcessesItems(t *testing.T) { + vp, mockDB := testVPlayer(t) + + ctx, cancel := context.WithCancel(testCtx(t)) + defer cancel() + + mockDB.AddInvariant("rollback", &sqltypes.Result{}) + + if vp.vr.vre == nil { + vp.vr.vre = &Engine{} + } + if vp.vr.vre.throttlerClient == nil { + vp.vr.vre.throttlerClient = throttle.NewBackgroundClient(nil, throttlerapp.VReplicationName, base.UndefinedScope) + } + + scheduler := newApplyScheduler(ctx) + relay := newRelayLog(ctx, 10, 100) + + vp.tablePlans["t1"] = &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + vp.tablePlansVersion.Store(1) + + gtidEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"} + rowEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: 
[]byte("1"), Lengths: []int64{1}}}}, + }, + Timestamp: 100, + } + commitEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT} + + require.NoError(t, relay.Send([]*binlogdatapb.VEvent{gtidEvent, rowEvent, commitEvent})) + + errCh := make(chan error, 1) + go func() { + errCh <- vp.scheduleLoop(ctx, relay, scheduler) + }() + + ready, err := scheduler.nextReady(ctx) + require.NoError(t, err) + require.NotNil(t, ready) + + cancel() + + select { + case err := <-errCh: + require.True(t, errors.Is(err, context.Canceled) || errors.Is(err, io.EOF)) + case <-time.After(200 * time.Millisecond): + t.Fatal("timed out waiting for scheduleLoop") + } +} + +func TestScheduleLoopThrottledUpdates(t *testing.T) { + vp, mockDB := testVPlayer(t) + + ctx, cancel := context.WithCancel(testCtx(t)) + defer cancel() + + if globalStats.ThrottledCount == nil { + globalStats.ThrottledCount = stats.NewCounter("", "") + } + + mockDB.AddInvariant("rollback", &sqltypes.Result{}) + mockDB.AddInvariant("time_throttled", &sqltypes.Result{}) + + if vp.vr.vre == nil { + vp.vr.vre = &Engine{} + } + if vp.vr.vre.throttlerClient == nil { + vp.vr.vre.throttlerClient = throttle.NewBackgroundClient(nil, throttlerapp.VReplicationName, base.UndefinedScope) + } + vp.throttlerAppName = throttlerapp.TestingAlwaysThrottledName.String() + if vp.vr.throttleUpdatesRateLimiter == nil { + vp.vr.throttleUpdatesRateLimiter = timer.NewRateLimiter(time.Millisecond) + defer vp.vr.throttleUpdatesRateLimiter.Stop() + } + + scheduler := newApplyScheduler(ctx) + relay := newRelayLog(ctx, 10, 100) + + errCh := make(chan error, 1) + go func() { + errCh <- vp.scheduleLoop(ctx, relay, scheduler) + }() + + time.Sleep(10 * time.Millisecond) + cancel() + + select { + case err := <-errCh: + require.True(t, errors.Is(err, context.Canceled) || errors.Is(err, io.EOF)) + case <-time.After(2 * time.Second): + t.Fatal("timed out waiting for scheduleLoop") + } +} + +func TestScheduleLoopCancelledContext(t *testing.T) { + vp, _ := 
testVPlayer(t) + + ctx, cancel := context.WithCancel(testCtx(t)) + cancel() + + scheduler := newApplyScheduler(ctx) + relay := newRelayLog(ctx, 10, 100) + + err := vp.scheduleLoop(ctx, relay, scheduler) + require.ErrorIs(t, err, context.Canceled) +} + +func TestScheduleItems_EmptyTransaction(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + } + commitEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_COMMIT, + Timestamp: 100, + } + + items := [][]*binlogdatapb.VEvent{{gtidEvent, commitEvent}} + err := vp.scheduleItems(ctx, scheduler, state, items) + require.NoError(t, err) + + // Empty transaction should NOT be enqueued — it just sets unsavedEvent + vp.serialMu.Lock() + assert.Equal(t, commitEvent, vp.unsavedEvent) + vp.serialMu.Unlock() +} + +func TestScheduleItems_EmptyTxnAfterIdleTimeoutEnqueuesPositionSave(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + vp.serialMu.Lock() + vp.timeLastSaved = time.Now().Add(-2 * idleTimeout) + vp.serialMu.Unlock() + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + SequenceNumber: 7, + CommitParent: 6, + } + commitEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_COMMIT, + Timestamp: 100, + } + + err := vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{gtidEvent, commitEvent}}) + require.NoError(t, err) + + scheduler.mu.Lock() + require.Equal(t, 1, scheduler.pendingCount) + scheduler.mu.Unlock() + + got, gerr := scheduler.nextReady(ctx) + require.NoError(t, gerr) + require.NotNil(t, 
got) + assert.True(t, got.payload.commitOnly) + assert.True(t, got.payload.updatePosOnly) + assert.True(t, got.noConflict) + assert.Equal(t, int64(7), got.sequenceNumber) + assert.Equal(t, int64(6), got.commitParent) + assert.True(t, got.hasCommitMeta) + + vp.serialMu.Lock() + assert.Nil(t, vp.unsavedEvent) + vp.serialMu.Unlock() +} + +func TestScheduleItems_VERSIONIsIgnoredLikeEmptyTransaction(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + } + versionEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_VERSION, + Timestamp: 100, + } + commitEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_COMMIT, + Timestamp: 100, + } + + err := vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{gtidEvent, versionEvent, commitEvent}}) + require.NoError(t, err) + + vp.serialMu.Lock() + assert.Equal(t, commitEvent, vp.unsavedEvent) + vp.serialMu.Unlock() +} + +func TestScheduleItems_ROWSQUERYOnlyTransactionIsEmpty(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + } + beginEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_BEGIN, + } + rowsQueryEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROWS_QUERY, + Statement: "update t1 set id = id where id = 1", + Timestamp: 100, + } + commitEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_COMMIT, + Timestamp: 100, + } + + err := vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{gtidEvent, 
beginEvent, rowsQueryEvent, commitEvent}}) + require.NoError(t, err) + + vp.serialMu.Lock() + assert.Equal(t, commitEvent, vp.unsavedEvent) + vp.serialMu.Unlock() + + scheduler.mu.Lock() + assert.Equal(t, 0, scheduler.pendingCount) + scheduler.mu.Unlock() +} + +func TestScheduleItems_EmptyTxnWithCommitMeta_AdvancesSequence(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + SequenceNumber: 7, + CommitParent: 6, + } + commitEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_COMMIT, + } + + items := [][]*binlogdatapb.VEvent{{gtidEvent, commitEvent}} + err := vp.scheduleItems(ctx, scheduler, state, items) + require.NoError(t, err) + + // Should have advanced the committed sequence to 7 + scheduler.mu.Lock() + assert.Equal(t, int64(7), scheduler.lastCommittedSequence) + scheduler.mu.Unlock() +} + +func TestScheduleItems_BEGINIsIgnored(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + vp.tablePlans["t1"] = &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + vp.tablePlansVersion.Store(1) + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + } + beginEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_BEGIN, + } + rowEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{ + {After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}, + }, + }, + Timestamp: 100, 
+ } + commitEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_COMMIT, + } + + // BEGIN should not be added to curEvents + items := [][]*binlogdatapb.VEvent{{gtidEvent, beginEvent, rowEvent, commitEvent}} + err := vp.scheduleItems(ctx, scheduler, state, items) + require.NoError(t, err) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + // Should only have the ROW event, not BEGIN + assert.Len(t, got.payload.events, 1) + assert.Equal(t, binlogdatapb.VEventType_ROW, got.payload.events[0].Type) +} + +func TestScheduleItems_DDLIsForceGlobal(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + } + ddlEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_DDL, + Timestamp: 200, + } + + items := [][]*binlogdatapb.VEvent{{gtidEvent, ddlEvent}} + err := vp.scheduleItems(ctx, scheduler, state, items) + require.NoError(t, err) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.True(t, got.forceGlobal) + assert.True(t, got.payload.commitOnly) + assert.Len(t, got.payload.events, 1) + assert.Equal(t, binlogdatapb.VEventType_DDL, got.payload.events[0].Type) +} + +func TestScheduleItems_PostDDLComplexDDLDoesNotClearOnUnrelatedPlanRefresh(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + + staleT1 := &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: 
[]bool{true}} + staleT2 := &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + staleT3 := &TablePlan{TargetName: "t3", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t1"] = staleT1 + vp.tablePlans["t2"] = staleT2 + vp.tablePlans["t3"] = staleT3 + vp.tablePlansVersion.Store(1) + + ddlItems := [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t1 to t1_new, t2 to t2_new", Timestamp: 200}, + }} + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, ddlItems)) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.NotNil(t, state.postDDLStalePlans) + require.Contains(t, state.postDDLStalePlans, "t1") + require.Contains(t, state.postDDLStalePlans, "t2") + require.NotContains(t, state.postDDLStalePlans, "t3") + + // Simulate an unrelated plan refresh while plans for DDL-affected tables remain stale. 
+ vp.tablePlansMu.Lock() + vp.tablePlans["t3"] = &TablePlan{TargetName: "t3", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansMu.Unlock() + + rowItems := [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, Timestamp: 300}, + {Type: binlogdatapb.VEventType_COMMIT}, + }} + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, rowItems)) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.True(t, got.forceGlobal) + assert.NotNil(t, state.postDDLStalePlans) +} + +func TestScheduleItems_PostDDLRenameClearsAfterRenamedTableFieldRefresh(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + + vp.tablePlans["t1"] = &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + vp.tablePlansVersion.Store(1) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t1 to t1_new", Timestamp: 200}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.NotNil(t, state.postDDLStalePlans) + + vp.tablePlansMu.Lock() + vp.tablePlans["t1_new"] = &TablePlan{ + TargetName: "t1_new", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + vp.tablePlansVersion.Add(1) + 
vp.tablePlansMu.Unlock() + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1_new", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, Timestamp: 300}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.False(t, got.forceGlobal) + assert.Nil(t, state.postDDLStalePlans) +} + +func TestScheduleItems_PostDDLUnknownDDLRetainsConservativeBarrier(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + + staleT1 := &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + staleT2 := &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t1"] = staleT1 + vp.tablePlans["t2"] = staleT2 + vp.tablePlansVersion.Store(1) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "this is not valid ddl", Timestamp: 200}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.NotNil(t, state.postDDLStalePlans) + + vp.tablePlansMu.Lock() + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansMu.Unlock() + + require.NoError(t, vp.scheduleItems(ctx, scheduler, 
state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, Timestamp: 300}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.True(t, got.forceGlobal) + assert.NotNil(t, state.postDDLStalePlans) +} + +func TestScheduleItems_PostDDLDropDoesNotClearOnUnrelatedPlanRefresh(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + + vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansVersion.Store(1) + vp.replicatorPlan = &ReplicatorPlan{TablePlans: map[string]*TablePlan{ + "t1": {TargetName: "t1", Insert: sqlparser.BuildParsedQuery("insert into t1 values (:a)")}, + }} + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "drop table t1", Timestamp: 200}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + assert.Nil(t, state.postDDLStalePlans) + + vp.tablePlansMu.Lock() + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, 
false}} + vp.tablePlansVersion.Add(1) + vp.tablePlansMu.Unlock() + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t2", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("11"), Lengths: []int64{1, 1}}}}, + }, Timestamp: 300}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.False(t, got.forceGlobal) + assert.Nil(t, state.postDDLStalePlans) +} + +func TestScheduleItems_PostDDLDropClearsAfterDroppedTableSatisfied(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + + vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansVersion.Store(1) + vp.replicatorPlan = &ReplicatorPlan{TablePlans: map[string]*TablePlan{ + "t1": {TargetName: "t1", Insert: sqlparser.BuildParsedQuery("insert into t1 values (:a)")}, + }} + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "drop table t1", Timestamp: 200}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + assert.Nil(t, state.postDDLStalePlans) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + 
{Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t2", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, Timestamp: 300}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.False(t, got.forceGlobal) + assert.Nil(t, state.postDDLStalePlans) +} + +func TestScheduleItems_PostDDLDroppedTablesSnapshotDoesNotAliasVPlayer(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.postDDLDroppedTables = map[string]struct{}{"t1": {}} + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, nil)) + require.Equal(t, map[string]struct{}{"t1": {}}, state.postDDLDroppedTables) + + delete(vp.postDDLDroppedTables, "t1") + vp.postDDLDroppedTables["t2"] = struct{}{} + + require.Equal(t, map[string]struct{}{"t1": {}}, state.postDDLDroppedTables) +} + +func TestScheduleItems_PostDDLAlterRenameClearsAfterRenamedTableFieldRefresh(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + + vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansVersion.Store(1) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "alter 
table t1 rename to t1_new", Timestamp: 200}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.NotNil(t, state.postDDLStalePlans) + + vp.tablePlansMu.Lock() + vp.tablePlans["t1_new"] = &TablePlan{TargetName: "t1_new", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansVersion.Add(1) + vp.tablePlansMu.Unlock() + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1_new", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, Timestamp: 300}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.False(t, got.forceGlobal) + assert.Nil(t, state.postDDLStalePlans) +} + +func TestScheduleItems_PostDDLAlterClearsAfterSameTableFieldRefresh(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + + vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansVersion.Store(1) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "alter table t1 add column c1 int", Timestamp: 200}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.NotNil(t, state.postDDLStalePlans) + require.Contains(t, state.postDDLStalePlans, "t1") + + 
vp.tablePlansMu.Lock() + vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}} + vp.tablePlansVersion.Add(1) + vp.tablePlansMu.Unlock() + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("11"), Lengths: []int64{1, 1}}}}, + }, Timestamp: 300}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.False(t, got.forceGlobal) + assert.Nil(t, state.postDDLStalePlans) +} + +func TestScheduleItems_PostDDLSecondDDLDoesNotReplaceEarlierUnresolvedBarrier(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + + vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t3"] = &TablePlan{TargetName: "t3", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansVersion.Store(1) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t1 to t1_new", Timestamp: 200}, + 
}})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.NotNil(t, state.postDDLStalePlans) + require.Contains(t, state.postDDLStalePlans, "t1") + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "alter table t2 add column c1 int", Timestamp: 250}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.Contains(t, state.postDDLStalePlans, "t1") + require.Contains(t, state.postDDLStalePlans, "t2") + + vp.tablePlansMu.Lock() + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}} + vp.tablePlansVersion.Add(1) + vp.tablePlansMu.Unlock() + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-7"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t3", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, Timestamp: 300}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.False(t, got.forceGlobal) + require.NotNil(t, state.postDDLStalePlans) + assert.Contains(t, state.postDDLStalePlans, "t1") +} + +func TestScheduleItems_PostDDLUnknownSecondDDLExpandsBarrierConservatively(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + + vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: 
[]*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t3"] = &TablePlan{TargetName: "t3", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansVersion.Store(1) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t1 to t1_new", Timestamp: 200}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.Contains(t, state.postDDLStalePlans, "t1") + require.NotContains(t, state.postDDLStalePlans, "t2") + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "this is not valid ddl", Timestamp: 250}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.Contains(t, state.postDDLStalePlans, "t1") + require.Contains(t, state.postDDLStalePlans, "t2") + require.Contains(t, state.postDDLStalePlans, "t3") + + vp.tablePlansMu.Lock() + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}} + vp.tablePlans["t3"] = &TablePlan{TargetName: "t3", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}} + vp.tablePlansVersion.Add(1) + vp.tablePlansMu.Unlock() + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: 
"MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-7"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t2", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("11"), Lengths: []int64{1, 1}}}}, + }, Timestamp: 300}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.True(t, got.forceGlobal) + require.NotNil(t, state.postDDLStalePlans) + assert.Contains(t, state.postDDLStalePlans, "t1") +} + +func TestScheduleItems_PostDDLRenameThenUnknownStillBlocksAfterRenamedTableRefresh(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + + vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansVersion.Store(1) + vp.replicatorPlan = &ReplicatorPlan{TablePlans: map[string]*TablePlan{ + "t1": {TargetName: "t1", Insert: sqlparser.BuildParsedQuery("insert into t1 values (:a)")}, + }} + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t1 to t1_new", Timestamp: 200}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.Contains(t, state.postDDLStalePlans, "t1") + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: 
binlogdatapb.VEventType_DDL, Statement: "this is not valid ddl", Timestamp: 250}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + + vp.tablePlansMu.Lock() + vp.tablePlans["t1_new"] = &TablePlan{TargetName: "t1_new", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}} + vp.tablePlansVersion.Add(1) + vp.tablePlansMu.Unlock() + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-7"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1_new", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("11"), Lengths: []int64{1}}}}, + }, Timestamp: 300}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.True(t, got.forceGlobal) + require.NotNil(t, state.postDDLStalePlans) + assert.Contains(t, state.postDDLStalePlans, "t1") +} + +func TestScheduleItems_PostDDLRenameRetiresOldNameFromLaterUnknownBarrier(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + + vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansVersion.Store(1) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, 
[][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t1 to t1_new", Timestamp: 200}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.Contains(t, state.postDDLStalePlans, "t1") + + vp.tablePlansMu.Lock() + vp.tablePlans["t1_new"] = &TablePlan{TargetName: "t1_new", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansVersion.Add(1) + vp.tablePlansMu.Unlock() + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1_new", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, Timestamp: 250}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.False(t, got.forceGlobal) + require.NoError(t, scheduler.markCommitted(got)) + assert.Nil(t, state.postDDLStalePlans) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-7"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "this is not valid ddl", Timestamp: 300}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.NotNil(t, state.postDDLStalePlans) + assert.NotContains(t, state.postDDLStalePlans, "t1") + assert.Contains(t, state.postDDLStalePlans, "t1_new") + assert.Contains(t, state.postDDLStalePlans, "t2") + + vp.tablePlansMu.Lock() + vp.tablePlans["t1_new"] = &TablePlan{TargetName: "t1_new", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, 
false}} + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}} + vp.tablePlansVersion.Add(1) + vp.tablePlansMu.Unlock() + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-8"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t2", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("11"), Lengths: []int64{1, 1}}}}, + }, Timestamp: 350}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err = scheduler.nextReady(ctx) + require.NoError(t, err) + assert.False(t, got.forceGlobal) + assert.Nil(t, state.postDDLStalePlans) +} + +func TestScheduleItems_PostDDLRenameRetiresOldNameEvenWhenAnotherBarrierRemains(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + + vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t3"] = &TablePlan{TargetName: "t3", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansVersion.Store(1) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t1 to t1_new", Timestamp: 200}, + }})) + 
commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.Contains(t, state.postDDLStalePlans, "t1") + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "alter table t3 add column c1 int", Timestamp: 250}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.Contains(t, state.postDDLStalePlans, "t1") + require.Contains(t, state.postDDLStalePlans, "t3") + + vp.tablePlansMu.Lock() + vp.tablePlans["t1_new"] = &TablePlan{TargetName: "t1_new", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansVersion.Add(1) + vp.tablePlansMu.Unlock() + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-7"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1_new", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, Timestamp: 300}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.False(t, got.forceGlobal) + require.NoError(t, scheduler.markCommitted(got)) + require.NotNil(t, state.postDDLStalePlans) + assert.NotContains(t, state.postDDLStalePlans, "t1") + assert.Contains(t, state.postDDLStalePlans, "t3") + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-8"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "this is not valid ddl", Timestamp: 350}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.NotNil(t, state.postDDLStalePlans) + assert.NotContains(t, 
state.postDDLStalePlans, "t1") + assert.Contains(t, state.postDDLStalePlans, "t1_new") + assert.Contains(t, state.postDDLStalePlans, "t3") +} + +func TestScheduleItems_PostDDLRenameSwapRequiresBothTablesToRefresh(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + + vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansVersion.Store(1) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t1 to t2, t2 to t1", Timestamp: 200}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + + vp.tablePlansMu.Lock() + vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}} + vp.tablePlansVersion.Add(1) + vp.tablePlansMu.Unlock() + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t2", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, Timestamp: 300}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.True(t, 
got.forceGlobal) +} + +func TestScheduleItems_PostDDLCreateTableDoesNotBlockUnrelatedTable(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + + vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansVersion.Store(1) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "create table t3(id bigint primary key)", Timestamp: 200}, + }})) + + ddlTxn, err := scheduler.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, binlogdatapb.VEventType_DDL, ddlTxn.payload.events[0].Type) + _, err = vp.applyDDLEvent(ctx, ddlTxn.payload.events[0], nil) + require.NoError(t, err) + require.NoError(t, scheduler.markCommitted(ddlTxn)) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t2", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, Timestamp: 300}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.False(t, got.forceGlobal) +} + +func TestScheduleItems_PostDDLDropTableDoesNotBlockUnrelatedTable(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + 
scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + + vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansVersion.Store(1) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "drop table t1", Timestamp: 200}, + }})) + + ddlTxn, err := scheduler.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, binlogdatapb.VEventType_DDL, ddlTxn.payload.events[0].Type) + require.NoError(t, scheduler.markCommitted(ddlTxn)) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t2", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, Timestamp: 300}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.False(t, got.forceGlobal) +} + +func TestScheduleItems_PostDDLExecIgnoreFailureDoesNotBlockAffectedTable(t *testing.T) { + vp, mockDB := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC_IGNORE + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + vp.tablePlans["t1"] = 
&TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansVersion.Store(1) + + mockDB.AddInvariant("update _vt.vreplication set", &sqltypes.Result{}) + vp.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) { + if sql == "alter table t1 add column c1 int" { + return nil, errors.New("ddl failed") + } + return &sqltypes.Result{}, nil + } + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "alter table t1 add column c1 int", Timestamp: 200}, + }})) + + ddlTxn, err := scheduler.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, binlogdatapb.VEventType_DDL, ddlTxn.payload.events[0].Type) + require.NoError(t, vp.applyEvent(ctx, ddlTxn.payload.events[0], ddlTxn.payload.mustSave)) + require.NoError(t, scheduler.markCommitted(ddlTxn)) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, Timestamp: 300}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.False(t, got.forceGlobal) +} + +func TestScheduleItems_PostDDLDropThenUnknownStillClearsAfterDropSatisfaction(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + + vp.tablePlans["t1"] = 
&TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansVersion.Store(1) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "drop table t1", Timestamp: 200}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + assert.Nil(t, state.postDDLStalePlans) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "this is not valid ddl", Timestamp: 250}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.NotNil(t, state.postDDLStalePlans) + assert.NotContains(t, state.postDDLStalePlans, "t1") + assert.Contains(t, state.postDDLStalePlans, "t2") + + vp.tablePlansMu.Lock() + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}} + vp.tablePlansVersion.Add(1) + vp.tablePlansMu.Unlock() + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-7"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t2", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("11"), Lengths: []int64{1, 1}}}}, + }, Timestamp: 300}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.False(t, 
got.forceGlobal) + assert.Nil(t, state.postDDLStalePlans) +} + +func TestScheduleItems_PostDDLRecreatedDroppedTableIsTrackedAgain(t *testing.T) { + vp, mockDB := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + mockDB.AddInvariant("begin", &sqltypes.Result{}) + + vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansVersion.Store(1) + vp.replicatorPlan = &ReplicatorPlan{TablePlans: map[string]*TablePlan{ + "t1": {TargetName: "t1", Insert: sqlparser.BuildParsedQuery("insert into t1 values (:a)")}, + }} + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "drop table t1", Timestamp: 200}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.Nil(t, state.postDDLStalePlans) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "this is not valid ddl", Timestamp: 250}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.NotNil(t, state.postDDLStalePlans) + require.NotContains(t, state.postDDLStalePlans, "t1") + require.Contains(t, state.postDDLStalePlans, "t2") + + fieldEvent := &binlogdatapb.FieldEvent{ + TableName: "t1", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + {Name: "c1", 
Type: querypb.Type_INT64}, + }, + } + require.NoError(t, vp.applyEvent(ctx, &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_FIELD, FieldEvent: fieldEvent}, false)) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-7"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "this is not valid ddl", Timestamp: 300}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.NotNil(t, state.postDDLStalePlans) + assert.Contains(t, state.postDDLStalePlans, "t1") + assert.Contains(t, state.postDDLStalePlans, "t2") +} + +func TestScheduleItems_PostDDLDropThenCreateSameTableBlocksUntilFieldRefresh(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + + staleT1 := &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + IdentityColumns: []string{"id"}, + HasExtraUniqueSecondary: false, + } + vp.tablePlans["t1"] = staleT1 + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansVersion.Store(1) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "drop table t1", Timestamp: 200}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.Nil(t, state.postDDLStalePlans) + require.Contains(t, vp.postDDLDroppedTables, "t1") + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: 
binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "create table t1(id bigint primary key, email bigint unique)", Timestamp: 250}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.NotNil(t, state.postDDLStalePlans) + require.Contains(t, state.postDDLStalePlans, "t1") + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-7"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("11"), Lengths: []int64{1, 1}}}}, + }, Timestamp: 300}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.True(t, got.forceGlobal) + require.NotNil(t, state.postDDLStalePlans) + assert.Contains(t, state.postDDLStalePlans, "t1") + require.NoError(t, scheduler.markCommitted(got)) + + vp.tablePlansMu.Lock() + vp.tablePlans["t1"] = &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "email", Type: querypb.Type_INT64}}, + PKIndices: []bool{true, false}, + IdentityColumns: []string{"id"}, + HasExtraUniqueSecondary: true, + } + vp.tablePlansVersion.Add(1) + vp.tablePlansMu.Unlock() + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-8"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t2", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, Timestamp: 350}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err = scheduler.nextReady(ctx) + require.NoError(t, err) + assert.False(t, 
got.forceGlobal) + assert.Nil(t, state.postDDLStalePlans) +} + +func TestScheduleItems_PostDDLDropThenRenameToDroppedNameBlocksUntilFieldRefresh(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + + vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t3"] = &TablePlan{TargetName: "t3", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansVersion.Store(1) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "drop table t1", Timestamp: 200}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.Nil(t, state.postDDLStalePlans) + require.Contains(t, vp.postDDLDroppedTables, "t1") + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t2 to t1", Timestamp: 250}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.NotNil(t, state.postDDLStalePlans) + require.Contains(t, state.postDDLStalePlans, "t2") + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-7"}, + {Type: 
binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, Timestamp: 300}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.True(t, got.forceGlobal) + require.NoError(t, scheduler.markCommitted(got)) + + vp.tablePlansMu.Lock() + vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}} + vp.tablePlansVersion.Add(1) + vp.tablePlansMu.Unlock() + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-8"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t3", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, Timestamp: 350}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err = scheduler.nextReady(ctx) + require.NoError(t, err) + assert.False(t, got.forceGlobal) + assert.Nil(t, state.postDDLStalePlans) +} + +func TestScheduleItems_PostDDLRenameThenCreateSameNameRequiresBothFieldRefreshes(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + + vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansVersion.Store(1) + + 
require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t1 to t2", Timestamp: 200}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.NotNil(t, state.postDDLStalePlans) + require.Contains(t, state.postDDLStalePlans, "t1") + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "create table t1(id bigint primary key)", Timestamp: 250}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.NotNil(t, state.postDDLStalePlans) + require.Contains(t, state.postDDLStalePlans, "t1") + + vp.tablePlansMu.Lock() + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}} + vp.tablePlansVersion.Add(1) + vp.tablePlansMu.Unlock() + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-7"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t2", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("11"), Lengths: []int64{1, 1}}}}, + }, Timestamp: 300}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.True(t, got.forceGlobal) + require.NoError(t, scheduler.markCommitted(got)) + require.NotNil(t, state.postDDLStalePlans) + require.Contains(t, state.postDDLStalePlans, "t1") + + vp.tablePlansMu.Lock() + vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: 
"id", Type: querypb.Type_INT64}, {Name: "email", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}} + vp.tablePlansVersion.Add(1) + vp.tablePlansMu.Unlock() + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-8"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("11"), Lengths: []int64{1, 1}}}}, + }, Timestamp: 350}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err = scheduler.nextReady(ctx) + require.NoError(t, err) + assert.False(t, got.forceGlobal) + assert.Nil(t, state.postDDLStalePlans) +} + +func TestScheduleItems_PostDDLDropCreateRenameRetargetsBarrierToFinalName(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + + vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansVersion.Store(1) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "drop table t1", Timestamp: 200}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: 
binlogdatapb.VEventType_DDL, Statement: "create table t1(id bigint primary key)", Timestamp: 250}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.Contains(t, state.postDDLStalePlans, "t1") + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-7"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t1 to t2", Timestamp: 300}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.Contains(t, state.postDDLStalePlans, "t1") + + vp.tablePlansMu.Lock() + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}} + vp.tablePlansVersion.Add(1) + vp.tablePlansMu.Unlock() + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, nil)) + assert.Nil(t, state.postDDLStalePlans) +} + +func TestScheduleItems_PostDDLRenameChainRetargetsBarrierToFinalName(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + + vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlans["t3"] = &TablePlan{TargetName: "t3", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}} + vp.tablePlansVersion.Store(1) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: 
"MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t1 to t2", Timestamp: 200}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.Contains(t, state.postDDLStalePlans, "t1") + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t2 to t3", Timestamp: 250}, + }})) + commitScheduledExecutedDDL(t, ctx, scheduler, state, vp) + require.Contains(t, state.postDDLStalePlans, "t1") + require.Contains(t, state.postDDLStalePlans, "t2") + + vp.tablePlansMu.Lock() + vp.tablePlans["t3"] = &TablePlan{TargetName: "t3", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}} + vp.tablePlansVersion.Add(1) + vp.tablePlansMu.Unlock() + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, nil)) + assert.Nil(t, state.postDDLStalePlans) +} + +func TestMergeDroppedTables_DoesNotMutateInput(t *testing.T) { + original := map[string]struct{}{"t1": {}} + merged := mergeDroppedTables(original, map[string]struct{}{"t2": {}}) + + assert.Equal(t, map[string]struct{}{"t1": {}}, original) + assert.Equal(t, map[string]struct{}{"t1": {}, "t2": {}}, merged) +} + +func TestRetargetPostDDLStalePlans_RenameSwapUsesOriginalRefreshNames(t *testing.T) { + t1Old := &TablePlan{TargetName: "t1"} + t2Old := &TablePlan{TargetName: "t2"} + t1New := &TablePlan{TargetName: "t1"} + t2New := &TablePlan{TargetName: "t2"} + + stalePlans := map[string]postDDLStalePlan{ + "barrier": { + stalePlan: t1Old, + refreshedPlans: map[string]*TablePlan{ + "t1": t1Old, + "t2": t2Old, + }, + }, + } + + retargetPostDDLStalePlans(stalePlans, map[string]string{"t1": "t2", "t2": "t1"}, map[string]*TablePlan{"t1": t1New, "t2": t2New}) + + require.Contains(t, 
stalePlans, "barrier") + assert.Equal(t, map[string]*TablePlan{"t1": t1New, "t2": t2New}, stalePlans["barrier"].refreshedPlans) +} + +func TestRetargetPostDDLStalePlans_MixedCaseRenameTargetsUseMatchingLiveKeys(t *testing.T) { + t1Old := &TablePlan{TargetName: "t1"} + t2New := &TablePlan{TargetName: "t2"} + + stalePlans := map[string]postDDLStalePlan{ + "t1": { + stalePlan: t1Old, + refreshedPlans: map[string]*TablePlan{ + "t1": t1Old, + }, + }, + } + + retargetPostDDLStalePlans(stalePlans, map[string]string{"T1": "T2"}, map[string]*TablePlan{"t2": t2New}) + + require.Contains(t, stalePlans, "t1") + assert.Equal(t, map[string]*TablePlan{"t2": t2New}, stalePlans["t1"].refreshedPlans) +} + +func TestScheduleItems_OTHERIsForceGlobal(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + } + otherEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_OTHER, + Timestamp: 200, + } + + items := [][]*binlogdatapb.VEvent{{gtidEvent, otherEvent}} + err := vp.scheduleItems(ctx, scheduler, state, items) + require.NoError(t, err) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.True(t, got.forceGlobal) + assert.True(t, got.payload.commitOnly) +} + +func TestScheduleItems_CopyStateForceGlobal(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + // When copyState is non-empty, all transactions should be forceGlobal + vp.copyState = map[string]*sqltypes.Result{"t1": {}} + vp.tablePlans["t1"] = &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + 
vp.tablePlansVersion.Store(1) + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + } + rowEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{ + {After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}, + }, + }, + Timestamp: 100, + } + commitEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_COMMIT, + } + + items := [][]*binlogdatapb.VEvent{{gtidEvent, rowEvent, commitEvent}} + err := vp.scheduleItems(ctx, scheduler, state, items) + require.NoError(t, err) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.True(t, got.forceGlobal) +} + +// TestScheduleItems_UniqueSecondaryIndexEmitsWritesetKey pins that a plain +// unique secondary no longer force-serializes: the scheduled txn carries a +// writeset that includes the unique-key conflict key (so colliding unique +// values serialize against each other while non-colliding rows run in +// parallel), rather than being marked forceGlobal. +func TestScheduleItems_UniqueSecondaryIndexEmitsWritesetKey(t *testing.T) { + ctx := testCtx(t) + vp, _ := testVPlayer(t) + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + + tableName := "parallel_apply_unique_secondary_idx" + qualifiedTableName := vrepldb + "." 
+ tableName + execStatements(t, []string{ + "create table " + qualifiedTableName + " (id int not null, email varchar(128) not null, primary key(id), unique key uk_email(email))", + }) + t.Cleanup(func() { + execStatements(t, []string{"drop table if exists " + qualifiedTableName}) + }) + + realDB := &realDBClient{nolog: true} + require.NoError(t, realDB.Connect()) + t.Cleanup(realDB.Close) + + vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + vp.dbClient = vp.vr.dbClient + vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld} + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}} + + colInfoMap, err := vp.vr.buildColInfoMap(ctx) + require.NoError(t, err) + vp.vr.colInfoMap = colInfoMap + + vp.replicatorPlan, err = vp.vr.buildReplicatorPlan( + vp.vr.source, + vp.vr.colInfoMap, + nil, + vp.vr.stats, + vp.vr.vre.env.CollationEnv(), + vp.vr.vre.env.Parser(), + ) + require.NoError(t, err) + + require.NoError(t, vp.applyEvent(ctx, &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_FIELD, + FieldEvent: &binlogdatapb.FieldEvent{ + TableName: tableName, + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT32}, + {Name: "email", Type: querypb.Type_VARCHAR}, + }, + }, + }, false)) + require.NoError(t, vp.dbClient.Rollback()) + + // Confirm the FIELD handler classified the plain unique secondary as + // hashable (emits a writeset key) rather than force-serializing. 
+ require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary) + require.Equal(t, [][]string{{"email"}}, vp.tablePlans[tableName].UniqueKeyColumns) + + err = vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: tableName, + RowChanges: []*binlogdatapb.RowChange{{ + After: &querypb.Row{Values: []byte("1a"), Lengths: []int64{1, 1}}, + }}, + }, Timestamp: 100}, + {Type: binlogdatapb.VEventType_COMMIT}, + }}) + require.NoError(t, err) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + // The txn runs in parallel (not force-serialized) and carries the + // unique-key conflict key for email="a". + assert.False(t, got.forceGlobal) + expectedUniqueKey := map[uint64]struct{}{} + require.NoError(t, writesetKeysForUniqueKey( + tableName, 0, []int{1}, + vp.tablePlans[tableName].Fields, + nil, + []sqltypes.Value{sqltypes.NewInt32(1), sqltypes.NewVarChar("a")}, + expectedUniqueKey, + )) + require.Len(t, expectedUniqueKey, 1) + for key := range expectedUniqueKey { + assert.Contains(t, got.writeset, key, + "scheduled txn writeset must include the unique-key conflict key") + } +} + +func TestScheduleItems_UnsupportedWritesetMappingForcesGlobal(t *testing.T) { + ctx := testCtx(t) + vp, _ := testVPlayer(t) + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + env := vtenv.NewTestEnv() + plan, err := (&vreplicator{workflowConfig: vp.vr.workflowConfig}).buildReplicatorPlan( + getSource(&binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{ + Match: "t1", + Filter: "select a + b as c1, c as c2 from t1", + }}}), + map[string][]*ColumnInfo{"t1": {{Name: "c1", IsPK: true}, {Name: "c2"}}}, + nil, + vp.vr.stats, + env.CollationEnv(), + env.Parser(), + ) + require.NoError(t, err) + + tplan, err := 
plan.buildExecutionPlan(&binlogdatapb.FieldEvent{ + TableName: "t1", + Fields: []*querypb.Field{ + {Name: "a", Type: querypb.Type_INT64}, + {Name: "b", Type: querypb.Type_INT64}, + {Name: "c", Type: querypb.Type_INT64}, + }, + }) + require.NoError(t, err) + require.True(t, tplan.HasUnsupportedWritesetMapping) + + vp.tablePlans["t1"] = tplan + vp.tablePlansVersion.Store(1) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("123"), Lengths: []int64{1, 1, 1}}}}, + }, Timestamp: 100}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.True(t, got.forceGlobal) + require.NoError(t, scheduler.markCommitted(got)) +} + +func TestApplyEvent_FIELDEmitsWritesetKeyForUniqueSecondaryIndex(t *testing.T) { + ctx := testCtx(t) + vp, _ := testVPlayer(t) + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + + tableName := "parallel_apply_field_unique_secondary_idx" + qualifiedTableName := vrepldb + "." 
+ tableName + execStatements(t, []string{ + "create table " + qualifiedTableName + " (id int not null, email varchar(128) not null, primary key(id), unique key uk_email(email))", + }) + t.Cleanup(func() { + execStatements(t, []string{"drop table if exists " + qualifiedTableName}) + }) + + realDB := &realDBClient{nolog: true} + require.NoError(t, realDB.Connect()) + t.Cleanup(realDB.Close) + + vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + vp.dbClient = vp.vr.dbClient + vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld} + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}} + + colInfoMap, err := vp.vr.buildColInfoMap(ctx) + require.NoError(t, err) + vp.vr.colInfoMap = colInfoMap + + vp.replicatorPlan, err = vp.vr.buildReplicatorPlan( + vp.vr.source, + vp.vr.colInfoMap, + nil, + vp.vr.stats, + vp.vr.vre.env.CollationEnv(), + vp.vr.vre.env.Parser(), + ) + require.NoError(t, err) + + require.NoError(t, vp.applyEvent(ctx, &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_FIELD, + FieldEvent: &binlogdatapb.FieldEvent{ + TableName: tableName, + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT32}, + {Name: "email", Type: querypb.Type_VARCHAR}, + }, + }, + }, false)) + require.NoError(t, vp.dbClient.Rollback()) + + // A plain (non-prefix, non-expression) unique secondary not covered by + // the identity no longer force-serializes; it emits a writeset unique + // key instead. 
+ require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary) + require.Equal(t, [][]string{{"email"}}, vp.tablePlans[tableName].UniqueKeyColumns) +} + +func TestApplyEvent_FIELDMarksAlternateIdentityAgainstPrimaryKeyAsUnsafe(t *testing.T) { + ctx := testCtx(t) + vp, _ := testVPlayer(t) + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + + tableName := "parallel_apply_field_alt_identity_primary_key_conflict" + qualifiedTableName := vrepldb + "." + tableName + execStatements(t, []string{ + "create table " + qualifiedTableName + " (id int not null, email varchar(128) not null, primary key(id), unique key uk_email(email))", + }) + t.Cleanup(func() { + execStatements(t, []string{"drop table if exists " + qualifiedTableName}) + }) + + realDB := &realDBClient{nolog: true} + require.NoError(t, realDB.Connect()) + t.Cleanup(realDB.Close) + + vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + vp.dbClient = vp.vr.dbClient + vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld} + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{ + Match: tableName, + Filter: "select id, email from " + tableName, + TargetUniqueKeyColumns: "email", + }}} + + colInfoMap, err := vp.vr.buildColInfoMap(ctx) + require.NoError(t, err) + vp.vr.colInfoMap = colInfoMap + + vp.replicatorPlan, err = vp.vr.buildReplicatorPlan( + vp.vr.source, + vp.vr.colInfoMap, + nil, + vp.vr.stats, + vp.vr.vre.env.CollationEnv(), + vp.vr.vre.env.Parser(), + ) + require.NoError(t, err) + + require.NoError(t, vp.applyEvent(ctx, &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_FIELD, + FieldEvent: &binlogdatapb.FieldEvent{ + TableName: tableName, + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT32}, + {Name: "email", Type: querypb.Type_VARCHAR}, + }, + }, + }, false)) + require.NoError(t, vp.dbClient.Rollback()) + + require.Equal(t, []string{"email"}, 
vp.tablePlans[tableName].IdentityColumns) + // PK(id) does not match the chosen identity (email): the writeset hasher + // can't reason about it, so force-serialize and emit no unique keys. + require.True(t, vp.tablePlans[tableName].HasExtraUniqueSecondary) + require.Nil(t, vp.tablePlans[tableName].UniqueKeyColumns) +} + +func TestApplyEvent_FIELDCachesExtraUniqueSecondaryLookup(t *testing.T) { + ctx := testCtx(t) + vp, _ := testVPlayer(t) + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + + tableName := "parallel_apply_cached_unique_secondary_idx" + qualifiedTableName := vrepldb + "." + tableName + execStatements(t, []string{ + "create table " + qualifiedTableName + " (id int not null, email varchar(128) not null, primary key(id), unique key uk_email(email))", + }) + t.Cleanup(func() { + execStatements(t, []string{"drop table if exists " + qualifiedTableName}) + }) + + realDB := &realDBClient{nolog: true} + require.NoError(t, realDB.Connect()) + t.Cleanup(realDB.Close) + + vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + vp.dbClient = vp.vr.dbClient + vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld} + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}} + + colInfoMap, err := vp.vr.buildColInfoMap(ctx) + require.NoError(t, err) + vp.vr.colInfoMap = colInfoMap + + vp.replicatorPlan, err = vp.vr.buildReplicatorPlan( + vp.vr.source, + vp.vr.colInfoMap, + nil, + vp.vr.stats, + vp.vr.vre.env.CollationEnv(), + vp.vr.vre.env.Parser(), + ) + require.NoError(t, err) + + fieldEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_FIELD, + FieldEvent: &binlogdatapb.FieldEvent{ + TableName: tableName, + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT32}, + {Name: "email", Type: querypb.Type_VARCHAR}, + }, + }, + } + require.NoError(t, vp.applyEvent(ctx, fieldEvent, false)) + require.NoError(t, 
vp.dbClient.Rollback()) + + vp.vr.mysqld = nil + + // Second FIELD reuses the cached unique-key analysis (mysqld is nil, so a + // fresh schema fetch would fail): a plain unique secondary emits a + // writeset unique key, no force-serialization. + require.NoError(t, vp.applyEvent(ctx, fieldEvent, false)) + require.NoError(t, vp.dbClient.Rollback()) + require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary) + require.Equal(t, [][]string{{"email"}}, vp.tablePlans[tableName].UniqueKeyColumns) +} + +func TestApplyEvent_FIELDCachesNoExtraUniqueSecondaryLookup(t *testing.T) { + ctx := testCtx(t) + vp, _ := testVPlayer(t) + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + + tableName := "parallel_apply_cached_no_unique_secondary_idx" + qualifiedTableName := vrepldb + "." + tableName + execStatements(t, []string{ + "create table " + qualifiedTableName + " (id int not null, email varchar(128) not null, primary key(id), key idx_email(email))", + }) + t.Cleanup(func() { + execStatements(t, []string{"drop table if exists " + qualifiedTableName}) + }) + + realDB := &realDBClient{nolog: true} + require.NoError(t, realDB.Connect()) + t.Cleanup(realDB.Close) + + vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + vp.dbClient = vp.vr.dbClient + vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld} + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}} + + colInfoMap, err := vp.vr.buildColInfoMap(ctx) + require.NoError(t, err) + vp.vr.colInfoMap = colInfoMap + + vp.replicatorPlan, err = vp.vr.buildReplicatorPlan( + vp.vr.source, + vp.vr.colInfoMap, + nil, + vp.vr.stats, + vp.vr.vre.env.CollationEnv(), + vp.vr.vre.env.Parser(), + ) + require.NoError(t, err) + + fieldEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_FIELD, + FieldEvent: &binlogdatapb.FieldEvent{ + TableName: tableName, + Fields: []*querypb.Field{ + {Name: "id", 
Type: querypb.Type_INT32}, + {Name: "email", Type: querypb.Type_VARCHAR}, + }, + }, + } + require.NoError(t, vp.applyEvent(ctx, fieldEvent, false)) + require.NoError(t, vp.dbClient.Rollback()) + require.Contains(t, vp.tablePlans, tableName) + require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary) + require.Nil(t, vp.tablePlans[tableName].UniqueKeyColumns) + + vp.vr.mysqld = nil + + require.NoError(t, vp.applyEvent(ctx, fieldEvent, false)) + require.NoError(t, vp.dbClient.Rollback()) + require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary) + require.Nil(t, vp.tablePlans[tableName].UniqueKeyColumns) +} + +func TestApplyEvent_FIELDEmitsWritesetKeyForNullableUniqueSecondaryIndex(t *testing.T) { + ctx := testCtx(t) + vp, _ := testVPlayer(t) + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + + tableName := "parallel_apply_field_nullable_unique_secondary_idx" + qualifiedTableName := vrepldb + "." + tableName + execStatements(t, []string{ + "create table " + qualifiedTableName + " (id int not null, email varchar(128) null, primary key(id), unique key uk_email(email))", + }) + t.Cleanup(func() { + execStatements(t, []string{"drop table if exists " + qualifiedTableName}) + }) + + realDB := &realDBClient{nolog: true} + require.NoError(t, realDB.Connect()) + t.Cleanup(realDB.Close) + + vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + vp.dbClient = vp.vr.dbClient + vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld} + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}} + + colInfoMap, err := vp.vr.buildColInfoMap(ctx) + require.NoError(t, err) + vp.vr.colInfoMap = colInfoMap + + vp.replicatorPlan, err = vp.vr.buildReplicatorPlan( + vp.vr.source, + vp.vr.colInfoMap, + nil, + vp.vr.stats, + vp.vr.vre.env.CollationEnv(), + vp.vr.vre.env.Parser(), + ) + require.NoError(t, err) + + require.NoError(t, vp.applyEvent(ctx, 
&binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_FIELD, + FieldEvent: &binlogdatapb.FieldEvent{ + TableName: tableName, + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT32}, + {Name: "email", Type: querypb.Type_VARCHAR}, + }, + }, + }, false)) + require.NoError(t, vp.dbClient.Rollback()) + + // A NULLABLE plain unique secondary is still hashable: NULL key values + // simply emit no key at write time (MySQL permits multiple NULLs), so we + // emit a writeset unique key rather than force-serializing. + require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary) + require.Equal(t, [][]string{{"email"}}, vp.tablePlans[tableName].UniqueKeyColumns) +} + +func TestApplyEvent_FIELDIgnoresIdentityEquivalentReorderedUniqueSecondaryIndex(t *testing.T) { + ctx := testCtx(t) + vp, _ := testVPlayer(t) + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + + tableName := "parallel_apply_field_reordered_identity_equivalent_unique_idx" + qualifiedTableName := vrepldb + "." + tableName + execStatements(t, []string{ + "create table " + qualifiedTableName + " (a int not null, b int not null, c varchar(128) not null, primary key(a, b), unique key uk_b_a(b, a))", + }) + t.Cleanup(func() { + execStatements(t, []string{"drop table if exists " + qualifiedTableName}) + }) + + realDB := &realDBClient{nolog: true} + require.NoError(t, realDB.Connect()) + t.Cleanup(realDB.Close) + + vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + vp.dbClient = vp.vr.dbClient + vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld} + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}} + + colInfoMap, err := vp.vr.buildColInfoMap(ctx) + require.NoError(t, err) + vp.vr.colInfoMap = colInfoMap + + vp.replicatorPlan, err = vp.vr.buildReplicatorPlan( + vp.vr.source, + vp.vr.colInfoMap, + nil, + vp.vr.stats, + vp.vr.vre.env.CollationEnv(), + 
vp.vr.vre.env.Parser(), + ) + require.NoError(t, err) + + require.NoError(t, vp.applyEvent(ctx, &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_FIELD, + FieldEvent: &binlogdatapb.FieldEvent{ + TableName: tableName, + Fields: []*querypb.Field{ + {Name: "a", Type: querypb.Type_INT32}, + {Name: "b", Type: querypb.Type_INT32}, + {Name: "c", Type: querypb.Type_VARCHAR}, + }, + }, + }, false)) + require.NoError(t, vp.dbClient.Rollback()) + + require.Equal(t, []string{"a", "b"}, vp.tablePlans[tableName].IdentityColumns) + // The unique secondary covers the full identity, so it can't create + // cross-identity conflicts: no force-serialization, no extra unique key. + require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary) + require.Nil(t, vp.tablePlans[tableName].UniqueKeyColumns) +} + +// TestApplyEvent_FIELDIgnoresUniqueSecondaryIndexThatContainsIdentity covers a +// unique secondary index whose column set is a strict superset of the +// identity. UNIQUE(id, name) where id is the PK cannot introduce conflicts +// beyond what PK(id) already enforces, so the table must not be flagged as +// having an "extra" unique secondary index. The pre-fix code short-circuited +// on column-count mismatch, forcing unnecessary global serialization. +func TestApplyEvent_FIELDIgnoresUniqueSecondaryIndexThatContainsIdentity(t *testing.T) { + ctx := testCtx(t) + vp, _ := testVPlayer(t) + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + + tableName := "parallel_apply_field_unique_secondary_contains_identity" + qualifiedTableName := vrepldb + "." 
+ tableName + execStatements(t, []string{ + "create table " + qualifiedTableName + " (id int not null, name varchar(128) not null, primary key(id), unique key uk_id_name(id, name))", + }) + t.Cleanup(func() { + execStatements(t, []string{"drop table if exists " + qualifiedTableName}) + }) + + realDB := &realDBClient{nolog: true} + require.NoError(t, realDB.Connect()) + t.Cleanup(realDB.Close) + + vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + vp.dbClient = vp.vr.dbClient + vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld} + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}} + + colInfoMap, err := vp.vr.buildColInfoMap(ctx) + require.NoError(t, err) + vp.vr.colInfoMap = colInfoMap + + vp.replicatorPlan, err = vp.vr.buildReplicatorPlan( + vp.vr.source, + vp.vr.colInfoMap, + nil, + vp.vr.stats, + vp.vr.vre.env.CollationEnv(), + vp.vr.vre.env.Parser(), + ) + require.NoError(t, err) + + require.NoError(t, vp.applyEvent(ctx, &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_FIELD, + FieldEvent: &binlogdatapb.FieldEvent{ + TableName: tableName, + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT32}, + {Name: "name", Type: querypb.Type_VARCHAR}, + }, + }, + }, false)) + require.NoError(t, vp.dbClient.Rollback()) + + require.Equal(t, []string{"id"}, vp.tablePlans[tableName].IdentityColumns) + // UNIQUE(id, name) is a superset of the identity (id), so it adds no + // conflicts beyond the PK: no force-serialization, no extra unique key. 
+ require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary) + require.Nil(t, vp.tablePlans[tableName].UniqueKeyColumns) +} + +func TestApplyEvent_FIELDIgnoresIdentityEquivalentReorderedPrimaryKey(t *testing.T) { + ctx := testCtx(t) + vp, _ := testVPlayer(t) + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + + tableName := "parallel_apply_field_reordered_identity_equivalent_primary_key" + qualifiedTableName := vrepldb + "." + tableName + execStatements(t, []string{ + "create table " + qualifiedTableName + " (a int not null, b int not null, c varchar(128) not null, primary key(a, b), unique key uk_b_a(b, a))", + }) + t.Cleanup(func() { + execStatements(t, []string{"drop table if exists " + qualifiedTableName}) + }) + + realDB := &realDBClient{nolog: true} + require.NoError(t, realDB.Connect()) + t.Cleanup(realDB.Close) + + vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + vp.dbClient = vp.vr.dbClient + vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld} + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{ + Match: tableName, + Filter: "select a, b, c from " + tableName, + TargetUniqueKeyColumns: "b,a", + }}} + + colInfoMap, err := vp.vr.buildColInfoMap(ctx) + require.NoError(t, err) + vp.vr.colInfoMap = colInfoMap + + vp.replicatorPlan, err = vp.vr.buildReplicatorPlan( + vp.vr.source, + vp.vr.colInfoMap, + nil, + vp.vr.stats, + vp.vr.vre.env.CollationEnv(), + vp.vr.vre.env.Parser(), + ) + require.NoError(t, err) + + require.NoError(t, vp.applyEvent(ctx, &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_FIELD, + FieldEvent: &binlogdatapb.FieldEvent{ + TableName: tableName, + Fields: []*querypb.Field{ + {Name: "a", Type: querypb.Type_INT32}, + {Name: "b", Type: querypb.Type_INT32}, + {Name: "c", Type: querypb.Type_VARCHAR}, + }, + }, + }, false)) + require.NoError(t, vp.dbClient.Rollback()) + + require.Equal(t, []string{"b", "a"}, 
vp.tablePlans[tableName].IdentityColumns) + // The unique secondary's column set equals the identity (reordered), so + // it can't create cross-identity conflicts: no force-serialization, no + // extra unique key. + require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary) + require.Nil(t, vp.tablePlans[tableName].UniqueKeyColumns) +} + +func TestApplyEvent_FIELDMarksPrefixUniqueIndexAsExtraUniqueSecondary(t *testing.T) { + ctx := testCtx(t) + vp, _ := testVPlayer(t) + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + + tableName := "parallel_apply_field_prefix_unique_secondary_idx" + qualifiedTableName := vrepldb + "." + tableName + execStatements(t, []string{ + "create table " + qualifiedTableName + " (email varchar(128) not null, payload varchar(128), primary key(email), unique key uk_email_prefix(email(10)))", + }) + t.Cleanup(func() { + execStatements(t, []string{"drop table if exists " + qualifiedTableName}) + }) + + realDB := &realDBClient{nolog: true} + require.NoError(t, realDB.Connect()) + t.Cleanup(realDB.Close) + + vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + vp.dbClient = vp.vr.dbClient + vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld} + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}} + + colInfoMap, err := vp.vr.buildColInfoMap(ctx) + require.NoError(t, err) + vp.vr.colInfoMap = colInfoMap + + vp.replicatorPlan, err = vp.vr.buildReplicatorPlan( + vp.vr.source, + vp.vr.colInfoMap, + nil, + vp.vr.stats, + vp.vr.vre.env.CollationEnv(), + vp.vr.vre.env.Parser(), + ) + require.NoError(t, err) + + require.NoError(t, vp.applyEvent(ctx, &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_FIELD, + FieldEvent: &binlogdatapb.FieldEvent{ + TableName: tableName, + Fields: []*querypb.Field{ + {Name: "email", Type: querypb.Type_VARCHAR}, + {Name: "payload", Type: querypb.Type_VARCHAR}, + }, + }, + }, 
false)) + require.NoError(t, vp.dbClient.Rollback()) + + require.Equal(t, []string{"email"}, vp.tablePlans[tableName].IdentityColumns) + // A prefix unique index enforces uniqueness over a derived value the + // hasher can't reproduce, so it force-serializes and emits no unique key. + require.True(t, vp.tablePlans[tableName].HasExtraUniqueSecondary) + require.Nil(t, vp.tablePlans[tableName].UniqueKeyColumns) +} + +func TestApplyEvent_FIELDAfterExecutedDDLRefreshesUniqueSecondaryLookup(t *testing.T) { + ctx := testCtx(t) + vp, _ := testVPlayer(t) + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + + tableName := "parallel_apply_field_refresh_after_unique_ddl" + qualifiedTableName := vrepldb + "." + tableName + execStatements(t, []string{ + "create table " + qualifiedTableName + " (id int not null, email varchar(128) not null, primary key(id), key idx_email(email))", + }) + t.Cleanup(func() { + execStatements(t, []string{"drop table if exists " + qualifiedTableName}) + }) + + realDB := &realDBClient{nolog: true} + require.NoError(t, realDB.Connect()) + t.Cleanup(realDB.Close) + + vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + vp.dbClient = vp.vr.dbClient + vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld} + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}} + + colInfoMap, err := vp.vr.buildColInfoMap(ctx) + require.NoError(t, err) + vp.vr.colInfoMap = colInfoMap + + vp.replicatorPlan, err = vp.vr.buildReplicatorPlan( + vp.vr.source, + vp.vr.colInfoMap, + nil, + vp.vr.stats, + vp.vr.vre.env.CollationEnv(), + vp.vr.vre.env.Parser(), + ) + require.NoError(t, err) + + fieldEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_FIELD, + FieldEvent: &binlogdatapb.FieldEvent{ + TableName: tableName, + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT32}, + 
{Name: "email", Type: querypb.Type_VARCHAR}, + }, + }, + } + require.NoError(t, vp.applyEvent(ctx, fieldEvent, false)) + require.NoError(t, vp.dbClient.Rollback()) + // Before the DDL: only a non-unique secondary, so no unique keys. + require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary) + require.Nil(t, vp.tablePlans[tableName].UniqueKeyColumns) + + ddlEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_DDL, + Statement: "alter table " + tableName + " add unique key uk_email(email)", + Timestamp: 100, + } + execStatements(t, []string{"alter table " + qualifiedTableName + " add unique key uk_email(email)"}) + publishExecutedDDLBarrier(t, vp, ddlEvent.Statement) + + // After the DDL barrier the FIELD handler re-runs the unique-key analysis: + // the new plain unique secondary emits a writeset unique key. + require.NoError(t, vp.applyEvent(ctx, fieldEvent, false)) + require.NoError(t, vp.dbClient.Rollback()) + require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary) + require.Equal(t, [][]string{{"email"}}, vp.tablePlans[tableName].UniqueKeyColumns) + + vp.vr.mysqld = nil + + // The refreshed analysis is cached; a later FIELD reuses it (mysqld nil). + require.NoError(t, vp.applyEvent(ctx, fieldEvent, false)) + require.NoError(t, vp.dbClient.Rollback()) + require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary) + require.Equal(t, [][]string{{"email"}}, vp.tablePlans[tableName].UniqueKeyColumns) +} + +func TestWorkerLoop_FIELDRefreshesPublishedDDLBarrierState(t *testing.T) { + ctx, cancel := context.WithCancel(testCtx(t)) + defer cancel() + + vp, _ := testVPlayer(t) + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + + tableName := "parallel_apply_worker_field_refresh_after_unique_ddl" + qualifiedTableName := vrepldb + "." 
+ tableName + execStatements(t, []string{ + "create table " + qualifiedTableName + " (id int not null, email varchar(128) not null, primary key(id), key idx_email(email))", + }) + t.Cleanup(func() { + execStatements(t, []string{"drop table if exists " + qualifiedTableName}) + }) + + realDB := &realDBClient{nolog: true} + require.NoError(t, realDB.Connect()) + t.Cleanup(realDB.Close) + + vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + vp.dbClient = vp.vr.dbClient + vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld} + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}} + + colInfoMap, err := vp.vr.buildColInfoMap(ctx) + require.NoError(t, err) + vp.vr.colInfoMap = colInfoMap + + vp.replicatorPlan, err = vp.vr.buildReplicatorPlan( + vp.vr.source, + vp.vr.colInfoMap, + nil, + vp.vr.stats, + vp.vr.vre.env.CollationEnv(), + vp.vr.vre.env.Parser(), + ) + require.NoError(t, err) + + fieldEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_FIELD, + FieldEvent: &binlogdatapb.FieldEvent{ + TableName: tableName, + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT32}, + {Name: "email", Type: querypb.Type_VARCHAR}, + }, + }, + } + require.NoError(t, vp.applyEvent(ctx, fieldEvent, false)) + require.NoError(t, vp.dbClient.Rollback()) + require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary) + require.Nil(t, vp.tablePlans[tableName].UniqueKeyColumns) + + workerDB := &recordingDBClient{} + workerClient := newVDBClient(workerDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + worker := &applyWorker{ + ctx: ctx, + conns: [2]*vdbClient{workerClient, workerClient}, + client: workerClient, + query: func(ctx context.Context, sql string) (*sqltypes.Result, error) { + return &sqltypes.Result{}, nil + }, + commit: func() error { + return nil + }, + } + + scheduler := newApplyScheduler(ctx) + commitCh := make(chan 
*applyTxn, 2) + errCh := make(chan error, 1) + go func() { + errCh <- vp.workerLoop(ctx, scheduler, commitCh, worker) + }() + + commitOnlyTxn := &applyTxn{ + order: 1, + payload: &applyTxnPayload{ + commitOnly: true, + events: []*binlogdatapb.VEvent{{ + Type: binlogdatapb.VEventType_OTHER, + }}, + }, + } + require.NoError(t, scheduler.enqueue(commitOnlyTxn)) + require.Same(t, commitOnlyTxn, <-commitCh) + require.NoError(t, scheduler.markCommitted(commitOnlyTxn)) + + execStatements(t, []string{"alter table " + qualifiedTableName + " add unique key uk_email(email)"}) + publishExecutedDDLBarrier(t, vp, "alter table "+tableName+" add unique key uk_email(email)") + + fieldTxn := &applyTxn{ + order: 2, + noConflict: true, + payload: &applyTxnPayload{ + events: []*binlogdatapb.VEvent{fieldEvent}, + }, + } + require.NoError(t, scheduler.enqueue(fieldTxn)) + require.Same(t, fieldTxn, <-commitCh) + + // The worker-loop FIELD refresh re-ran the unique-key analysis after the + // published DDL barrier: the new plain unique secondary emits a writeset + // unique key (it does not force-serialize). + assert.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary) + assert.Equal(t, [][]string{{"email"}}, vp.tablePlans[tableName].UniqueKeyColumns) + + cancel() + require.ErrorIs(t, <-errCh, context.Canceled) +} + +func TestWorkerLoop_FIELDRefreshClearsPublishedDroppedTablesAfterCommit(t *testing.T) { + ctx := testCtx(t) + vp, _ := testVPlayer(t) + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + + tableName := "parallel_apply_worker_field_does_not_clear_dropped_state" + qualifiedTableName := vrepldb + "." 
+ tableName + execStatements(t, []string{ + "create table " + qualifiedTableName + " (id int not null, primary key(id))", + }) + t.Cleanup(func() { + execStatements(t, []string{"drop table if exists " + qualifiedTableName}) + }) + + realDB := &realDBClient{nolog: true} + require.NoError(t, realDB.Connect()) + t.Cleanup(realDB.Close) + + vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + vp.dbClient = vp.vr.dbClient + vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld} + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}} + + colInfoMap, err := vp.vr.buildColInfoMap(ctx) + require.NoError(t, err) + vp.vr.colInfoMap = colInfoMap + + vp.replicatorPlan, err = vp.vr.buildReplicatorPlan( + vp.vr.source, + vp.vr.colInfoMap, + nil, + vp.vr.stats, + vp.vr.vre.env.CollationEnv(), + vp.vr.vre.env.Parser(), + ) + require.NoError(t, err) + + fieldEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_FIELD, + FieldEvent: &binlogdatapb.FieldEvent{ + TableName: tableName, + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT32}}, + }, + } + require.NoError(t, vp.applyEvent(ctx, fieldEvent, false)) + require.NoError(t, vp.dbClient.Rollback()) + + vp.postDDLDroppedTables = map[string]struct{}{tableName: {}} + vp2 := *vp + vp2.postDDLDroppedTables = cloneDroppedTables(vp.postDDLDroppedTables) + + require.NoError(t, vp2.applyEvent(ctx, fieldEvent, false)) + require.NoError(t, vp.dbClient.Rollback()) + require.Contains(t, vp.postDDLDroppedTables, tableName) + + scheduler := newApplyScheduler(ctx) + payload := acquireApplyTxnPayload() + payload.pos = vp.pos + payload.timestamp = 123 + payload.events = []*binlogdatapb.VEvent{fieldEvent} + payload.query = func(context.Context, string) (*sqltypes.Result, error) { + return &sqltypes.Result{}, nil + } + payload.commit = func() error { return nil } + payload.client = vp.dbClient + txn := 
acquireApplyTxn() + txn.order = 1 + txn.payload = payload + defer releaseApplyTxn(txn) + + commitCh := make(chan *applyTxn, 1) + commitCh <- txn + close(commitCh) + require.NoError(t, vp.commitLoop(ctx, scheduler, commitCh)) + assert.NotContains(t, vp.postDDLDroppedTables, tableName) +} + +func TestApplyEvent_FIELDRefreshTargetInvalidatesUniqueSecondaryCache(t *testing.T) { + ctx := testCtx(t) + vp, _ := testVPlayer(t) + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + + tableName := "parallel_apply_field_refresh_target_unique_secondary_idx" + qualifiedTableName := vrepldb + "." + tableName + execStatements(t, []string{ + "create table " + qualifiedTableName + " (id int not null, email varchar(128) not null, primary key(id), unique key uk_email(email))", + }) + t.Cleanup(func() { + execStatements(t, []string{"drop table if exists " + qualifiedTableName}) + }) + + realDB := &realDBClient{nolog: true} + require.NoError(t, realDB.Connect()) + t.Cleanup(realDB.Close) + + vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + vp.dbClient = vp.vr.dbClient + vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld} + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}} + + colInfoMap, err := vp.vr.buildColInfoMap(ctx) + require.NoError(t, err) + vp.vr.colInfoMap = colInfoMap + + vp.replicatorPlan, err = vp.vr.buildReplicatorPlan( + vp.vr.source, + vp.vr.colInfoMap, + nil, + vp.vr.stats, + vp.vr.vre.env.CollationEnv(), + vp.vr.vre.env.Parser(), + ) + require.NoError(t, err) + + cachedPlan := &TablePlan{ + TargetName: tableName, + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT32}, {Name: "email", Type: querypb.Type_VARCHAR}}, + PKIndices: []bool{true, false}, + HasExtraUniqueSecondary: false, + } + vp.tablePlans[tableName] = cachedPlan + vp.postDDLStalePlans = map[string]postDDLStalePlan{ + "old_" + tableName: { + stalePlan: 
&TablePlan{TargetName: "old_" + tableName}, + refreshedPlans: map[string]*TablePlan{tableName: cachedPlan}, + }, + } + + require.NoError(t, vp.applyEvent(ctx, &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_FIELD, + FieldEvent: &binlogdatapb.FieldEvent{ + TableName: tableName, + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT32}, + {Name: "email", Type: querypb.Type_VARCHAR}, + }, + }, + }, false)) + require.NoError(t, vp.dbClient.Rollback()) + + // The refresh-target barrier invalidated the cached plan, so the FIELD + // handler re-ran the unique-key analysis: the plain unique secondary + // emits a writeset unique key rather than force-serializing. + require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary) + require.Equal(t, [][]string{{"email"}}, vp.tablePlans[tableName].UniqueKeyColumns) +} + +func TestApplyEvent_FIELDRefreshTargetInvalidatesUniqueSecondaryCacheAcrossMultipleBarriers(t *testing.T) { + ctx := testCtx(t) + vp, _ := testVPlayer(t) + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + + tableName := "pa_field_refresh_multi_barrier_uniq_idx" + qualifiedTableName := vrepldb + "." 
+ tableName + execStatements(t, []string{ + "create table " + qualifiedTableName + " (id int not null, email varchar(128) not null, primary key(id), unique key uk_email(email))", + }) + t.Cleanup(func() { + execStatements(t, []string{"drop table if exists " + qualifiedTableName}) + }) + + realDB := &realDBClient{nolog: true} + require.NoError(t, realDB.Connect()) + t.Cleanup(realDB.Close) + + vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + vp.dbClient = vp.vr.dbClient + vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld} + vp.vr.vre = &Engine{env: vtenv.NewTestEnv()} + vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}} + + colInfoMap, err := vp.vr.buildColInfoMap(ctx) + require.NoError(t, err) + vp.vr.colInfoMap = colInfoMap + + vp.replicatorPlan, err = vp.vr.buildReplicatorPlan( + vp.vr.source, + vp.vr.colInfoMap, + nil, + vp.vr.stats, + vp.vr.vre.env.CollationEnv(), + vp.vr.vre.env.Parser(), + ) + require.NoError(t, err) + + cachedPlan := &TablePlan{ + TargetName: tableName, + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT32}, {Name: "email", Type: querypb.Type_VARCHAR}}, + PKIndices: []bool{true, false}, + HasExtraUniqueSecondary: false, + } + vp.tablePlans[tableName] = cachedPlan + vp.postDDLStalePlans = map[string]postDDLStalePlan{ + "old_" + tableName: { + stalePlan: &TablePlan{TargetName: "old_" + tableName}, + refreshedPlans: map[string]*TablePlan{tableName: {TargetName: tableName}}, + }, + "other_old_" + tableName: { + stalePlan: cachedPlan, + refreshedPlans: map[string]*TablePlan{tableName: cachedPlan}, + }, + } + + require.NoError(t, vp.applyEvent(ctx, &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_FIELD, + FieldEvent: &binlogdatapb.FieldEvent{ + TableName: tableName, + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT32}, + {Name: "email", Type: querypb.Type_VARCHAR}, + }, + }, + }, false)) + require.NoError(t, 
vp.dbClient.Rollback()) + + // Across multiple barriers the cache was invalidated and the FIELD handler + // re-ran the unique-key analysis: the plain unique secondary emits a + // writeset unique key rather than force-serializing. + require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary) + require.Equal(t, [][]string{{"email"}}, vp.tablePlans[tableName].UniqueKeyColumns) +} + +func TestApplyEvent_FIELDWithoutParallelApplySkipsUniqueSecondaryLookup(t *testing.T) { + vp, mockDB := testVPlayer(t) + ctx := testCtx(t) + vp.vr.workflowConfig.ParallelReplicationWorkers = 1 + mockDB.AddInvariant("begin", &sqltypes.Result{}) + + vp.replicatorPlan = &ReplicatorPlan{TablePlans: map[string]*TablePlan{ + "t1": { + TargetName: "t1", + IdentityColumns: []string{"id"}, + Insert: sqlparser.BuildParsedQuery("insert into t1 values (:a)"), + TablePlanBuilder: &tablePlanBuilder{}, + }, + }} + + require.NoError(t, vp.applyEvent(ctx, &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_FIELD, + FieldEvent: &binlogdatapb.FieldEvent{ + TableName: "t1", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + }, + }, + }, false)) + + require.Contains(t, vp.tablePlans, "t1") + require.False(t, vp.tablePlans["t1"].HasExtraUniqueSecondary) + require.Nil(t, vp.tablePlans["t1"].UniqueKeyColumns) + + // During the copy phase (catchup/fastforward run the SERIAL applier and + // this vplayer's table plans die with it), the lookup must be skipped + // even when parallel workers are configured: the schema fetch is a + // wasted mysqld round-trip and a needless failure mode. vr.mysqld is + // nil here, so reaching the lookup would fail loudly. 
+ vp.vr.workflowConfig.ParallelReplicationWorkers = 4 + vp.copyState = map[string]*sqltypes.Result{"t1": nil} + delete(vp.tablePlans, "t1") + require.NoError(t, vp.applyEvent(ctx, &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_FIELD, + FieldEvent: &binlogdatapb.FieldEvent{ + TableName: "t1", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + }, + }, + }, false)) + require.Contains(t, vp.tablePlans, "t1") + require.False(t, vp.tablePlans["t1"].HasExtraUniqueSecondary) + require.Nil(t, vp.tablePlans["t1"].UniqueKeyColumns) +} + +func TestApplyEvent_VERSIONIsIgnored(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + + gtid := "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5" + require.NoError(t, vp.applyEvent(ctx, &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: gtid}, false)) + + versionEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_VERSION, Timestamp: 100} + require.NoError(t, vp.applyEvent(ctx, versionEvent, false)) + require.False(t, vp.dbClient.InTransaction) + require.Nil(t, vp.unsavedEvent) + + commitEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT, Timestamp: 100} + require.NoError(t, vp.applyEvent(ctx, commitEvent, false)) + require.Equal(t, commitEvent, vp.unsavedEvent) +} + +// TestApplyEvent_JournalDoesNotPersistPositionBeforeTransition pins that a +// JOURNAL event must NOT durably advance the saved position when it is +// registered. registerJournal returns nil as soon as THIS participant has +// joined — here participant ks:1 has not — and the engine's journaler state +// is in-memory only. If the position were saved past the journal and the +// tablet restarted before all participants joined, the stream would resume +// past the journal, never re-register, and the workflow would hang forever +// waiting for a transition that can no longer happen. 
+func TestApplyEvent_JournalDoesNotPersistPositionBeforeTransition(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + + vp.replicatorPlan = &ReplicatorPlan{TablePlans: map[string]*TablePlan{ + "t1": {TargetName: "t1"}, + }} + + oldPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5540") + require.NoError(t, err) + journalPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5541") + require.NoError(t, err) + vp.pos = oldPos + vp.stopPos = journalPos + + recording := &recordingDBClient{} + mainClient := newVDBClient(recording, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + vp.vr.dbClient = mainClient + vp.dbClient = mainClient + vp.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) { + return mainClient.Execute(sql) + } + vp.commit = mainClient.Commit + + vp.vr.vre = &Engine{ + isOpen: true, + journaler: make(map[string]*journalEvent), + controllers: map[int32]*controller{ + vp.vr.id: { + workflow: "wf", + source: &binlogdatapb.BinlogSource{ + Keyspace: "ks", + Shard: "0", + }, + }, + 2: { + workflow: "wf", + source: &binlogdatapb.BinlogSource{ + Keyspace: "ks", + Shard: "1", + }, + }, + }, + } + + err = vp.applyEvent(ctx, &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_JOURNAL, + Timestamp: 100, + EventGtid: "3e11fa47-71ca-11e1-9e33-c80aa9429562:5541", + Journal: &binlogdatapb.Journal{ + Id: 1, + MigrationType: binlogdatapb.MigrationType_TABLES, + Participants: []*binlogdatapb.KeyspaceShard{{ + Keyspace: "ks", + Shard: "0", + }, { + Keyspace: "ks", + Shard: "1", + }}, + Tables: []string{"t1"}, + }, + }, true) + require.ErrorIs(t, err, io.EOF) + + // This participant registered with the journaler... + require.Contains(t, vp.vr.vre.journaler, "wf:1") + // ...but the in-memory and durable positions must remain BEFORE the + // journal event so a restart re-delivers it and re-registers. 
+ assert.Equal(t, oldPos, vp.pos) + for _, query := range recording.queries { + assert.NotContains(t, query, "update _vt.vreplication set pos=", "no position may be persisted at journal registration; queries: %v", recording.queries) + } +} + +// TestVPlayerLagSnapshotIsAtomic pins the invariant that lag-state reads +// never observe a torn pair of (lastTimestampNs, timeOffsetNs). The two +// values are written together (commitLoop.updateLag and the heartbeat +// store path), and the throttled-path lag estimator reads them both. If +// they were independent atomics, a reader could see (new ts, old offset) +// and report nonsense lag values. We pack them into a single atomic +// snapshot so loads are naturally consistent. +// +// The test pairs ts and offset (offset = ts + sentinelOffset) on every +// write, then a concurrent reader checks the relationship on every load. +// A torn read would produce a pair that violates the invariant. +func TestVPlayerLagSnapshotIsAtomic(t *testing.T) { + vp, _ := testVPlayer(t) + const sentinelOffset = int64(0xDEADBEEF) + const iterations = 50_000 + + stop := make(chan struct{}) + var writeWG sync.WaitGroup + writeWG.Go(func() { + for i := int64(1); ; i++ { + select { + case <-stop: + return + default: + } + vp.storeLagSnapshot(i, i+sentinelOffset) + } + }) + + var readWG sync.WaitGroup + mismatches := atomic.Int64{} + readWG.Go(func() { + for range iterations { + snap := vp.loadLagSnapshot() + if snap.timestampNs == 0 { + continue + } + if snap.offsetNs != snap.timestampNs+sentinelOffset { + mismatches.Add(1) + } + } + }) + + readWG.Wait() + close(stop) + writeWG.Wait() + + require.Equal(t, int64(0), mismatches.Load(), "lag snapshot reads must always observe a consistent ts/offset pair") +} + +// TestScheduleItems_FIELDIncrementsPendingRefreshBeforeEnqueue pins the +// ordering invariant: when scheduleItems processes a FIELD event, the +// vp.pendingFieldRefreshTables increment must happen BEFORE the txn is +// enqueued into the 
scheduler. Otherwise a worker can pick up the txn and +// commitLoop's matching decrement loop can run with an empty map (no-op), +// leaving the counter permanently stuck at 1 and force-serializing every +// future ROW txn that touches this table. +// +// The test stubs the scheduler-dispatch path by reading the increment via a +// nextReady call from the same goroutine immediately after scheduleItems +// returns; the contract is that pendingFieldRefreshTables must be visible +// before the txn becomes pickable. The rollback of the speculative +// increment on a scheduler-closed enqueue error is asserted separately by +// TestScheduleItems_FIELDRollsBackPendingRefreshOnEnqueueError, so a +// transient teardown does not poison the next workflow restart. +func TestScheduleItems_FIELDIncrementsPendingRefreshBeforeEnqueue(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + } + fieldEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_FIELD, + FieldEvent: &binlogdatapb.FieldEvent{ + TableName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + }, + Timestamp: 100, + } + commitEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT} + + err := vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{gtidEvent, fieldEvent, commitEvent}}) + require.NoError(t, err) + + // As a worker would: pull the txn out of the scheduler. By the time the + // scheduler hands a worker any txn that carries a FIELD refresh, the + // pendingFieldRefreshTables count for that table MUST already be at + // least 1 — otherwise commitLoop's decrement could race past the + // increment and leave the counter permanently stuck. 
+ got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + require.NotNil(t, got) + + vp.serialMu.Lock() + count := vp.pendingFieldRefreshTables["t1"] + vp.serialMu.Unlock() + require.GreaterOrEqual(t, count, 1, "pendingFieldRefreshTables[t1] must be incremented before the txn is dispatched to a worker") +} + +func TestScheduleItems_FIELDRollsBackPendingRefreshOnEnqueueError(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + // Close the scheduler so any subsequent enqueue returns io.EOF. + require.ErrorIs(t, scheduler.close(), io.EOF) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + } + fieldEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_FIELD, + FieldEvent: &binlogdatapb.FieldEvent{ + TableName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + }, + Timestamp: 100, + } + commitEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT} + + err := vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{gtidEvent, fieldEvent, commitEvent}}) + require.ErrorIs(t, err, io.EOF) + + vp.serialMu.Lock() + count := vp.pendingFieldRefreshTables["t1"] + vp.serialMu.Unlock() + require.Equal(t, 0, count, "pendingFieldRefreshTables[t1] must roll back to 0 when enqueue fails") +} + +func TestScheduleItems_FIELDEventDoesNotForceGlobal(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + vp.tablePlans["t1"] = &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + vp.tablePlansVersion.Store(1) + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + 
Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + } + // FIELD events are metadata (table definitions). They should NOT force + // global serialization — they are harmless for conflict detection and + // just need to be applied before the ROW events that follow. + fieldEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_FIELD, + Timestamp: 100, + } + commitEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_COMMIT, + } + + items := [][]*binlogdatapb.VEvent{{gtidEvent, fieldEvent, commitEvent}} + err := vp.scheduleItems(ctx, scheduler, state, items) + require.NoError(t, err) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + // FIELD events have an explicit handler that does NOT set curRowOnly=false, + // so the transaction is scheduled normally with an empty writeset (noConflict). + assert.False(t, got.forceGlobal) +} + +func TestScheduleItems_ROWSQUERYEventDoesNotForceGlobal(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + vp.tablePlans["t1"] = &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + vp.tablePlansVersion.Store(1) + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + } + rowsQueryEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROWS_QUERY, + Statement: "insert into t1 values (1)", + Timestamp: 100, + } + rowEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{ + {After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}, + }, + }, + Timestamp: 100, + } + commitEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT} + + err := vp.scheduleItems(ctx, scheduler, 
state, [][]*binlogdatapb.VEvent{{gtidEvent, rowsQueryEvent, rowEvent, commitEvent}}) + require.NoError(t, err) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.False(t, got.forceGlobal) + require.Len(t, got.payload.events, 2) + assert.Equal(t, binlogdatapb.VEventType_ROWS_QUERY, got.payload.events[0].Type) + assert.Equal(t, binlogdatapb.VEventType_ROW, got.payload.events[1].Type) + require.Len(t, got.writeset, 1) +} + +func TestScheduleItems_UnknownVEventTypeFailsFast(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + err := vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType(12345)}, + {Type: binlogdatapb.VEventType_COMMIT}, + }}) + require.Error(t, err) + require.Contains(t, err.Error(), "unsupported vevent type") +} + +func TestScheduleItems_InsertStatementEventDoesNotFailFast(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + err := vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_INSERT, Dml: "insert into t1(id) values (1)", Timestamp: 100}, + {Type: binlogdatapb.VEventType_COMMIT}, + }}) + require.NoError(t, err) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.True(t, got.forceGlobal) + require.Len(t, got.payload.events, 1) + assert.Equal(t, binlogdatapb.VEventType_INSERT, got.payload.events[0].Type) +} + +func TestScheduleItems_TimestampTracking(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := 
newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + vp.tablePlans["t1"] = &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + vp.tablePlansVersion.Store(1) + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + Timestamp: 50, + } + rowEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{ + {After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}, + }, + }, + Timestamp: 100, + } + commitEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_COMMIT, + } + + items := [][]*binlogdatapb.VEvent{{gtidEvent, rowEvent, commitEvent}} + err := vp.scheduleItems(ctx, scheduler, state, items) + require.NoError(t, err) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + // Timestamp from the ROW event should be tracked + assert.Equal(t, int64(100), got.payload.timestamp) +} + +func TestScheduleItems_WritesetBuild(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + vp.tablePlans["t1"] = &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + vp.tablePlansVersion.Store(1) + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + } + rowEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{ + {After: &querypb.Row{Values: []byte("42"), Lengths: []int64{2}}}, + }, + }, + Timestamp: 100, + } + commitEvent := 
&binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_COMMIT, + } + + items := [][]*binlogdatapb.VEvent{{gtidEvent, rowEvent, commitEvent}} + err := vp.scheduleItems(ctx, scheduler, state, items) + require.NoError(t, err) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.False(t, got.forceGlobal) + assert.Contains(t, got.payload.events[0].RowEvent.TableName, "t1") + // Writeset should contain PK-based key + require.Len(t, got.writeset, 1) + expected := testWritesetHash("t1", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("42"))) + assert.Equal(t, expected, got.writeset[0]) +} + +func TestScheduleItems_MissingTablePlanReturnsWritesetError(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + // No table plan for "t1" — writeset build should fail closed. + vp.tablePlansVersion.Store(1) + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + } + rowEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{ + {After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}, + }, + }, + Timestamp: 100, + } + commitEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_COMMIT, + } + + items := [][]*binlogdatapb.VEvent{{gtidEvent, rowEvent, commitEvent}} + err := vp.scheduleItems(ctx, scheduler, state, items) + require.Error(t, err) + assert.Equal(t, vtrpcpb.Code_FAILED_PRECONDITION, vterrors.Code(err)) + assert.Contains(t, err.Error(), "missing table plan for t1") +} + +func TestScheduleItems_FieldThenRowWithoutCachedPlanForcesGlobal(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: 
time.Now()} + + vp.tablePlansVersion.Store(1) + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + } + fieldEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_FIELD, + FieldEvent: &binlogdatapb.FieldEvent{ + TableName: "t1", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + }, + }, + Timestamp: 100, + } + rowEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{ + After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}, + }}, + }, + Timestamp: 100, + } + commitEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT} + + err := vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{gtidEvent, fieldEvent, rowEvent, commitEvent}}) + require.NoError(t, err) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.True(t, got.forceGlobal) + require.Len(t, got.payload.events, 2) + assert.Equal(t, binlogdatapb.VEventType_FIELD, got.payload.events[0].Type) + assert.Equal(t, binlogdatapb.VEventType_ROW, got.payload.events[1].Type) + assert.Nil(t, got.writeset) +} + +func TestScheduleItems_RowAfterPendingFieldRefreshForKnownTableForcesGlobal(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + vp.replicatorPlan = &ReplicatorPlan{TablePlans: map[string]*TablePlan{ + "customer": { + TargetName: "customer", + SendRule: &binlogdatapb.Rule{Match: "customer", Filter: "select * from customer"}, + }, + }} + + fieldTxn := [][]*binlogdatapb.VEvent{{ + { + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + SequenceNumber: 5, + CommitParent: 4, + }, + { + Type: binlogdatapb.VEventType_FIELD, + FieldEvent: 
&binlogdatapb.FieldEvent{ + TableName: "customer", + Fields: []*querypb.Field{ + {Name: "cid", Type: querypb.Type_INT64}, + {Name: "name", Type: querypb.Type_VARCHAR}, + }, + }, + Timestamp: 100, + }, + {Type: binlogdatapb.VEventType_COMMIT}, + }} + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, fieldTxn)) + + fieldReady, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.False(t, fieldReady.forceGlobal) + require.Len(t, fieldReady.payload.events, 1) + assert.Equal(t, binlogdatapb.VEventType_FIELD, fieldReady.payload.events[0].Type) + + rowTxn := [][]*binlogdatapb.VEvent{{ + { + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6", + SequenceNumber: 6, + CommitParent: 5, + }, + { + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "customer", + RowChanges: []*binlogdatapb.RowChange{{ + After: &querypb.Row{Values: []byte("1alice"), Lengths: []int64{1, 5}}, + }}, + }, + Timestamp: 101, + }, + {Type: binlogdatapb.VEventType_COMMIT}, + }} + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, rowTxn)) + + scheduler.mu.Lock() + require.Equal(t, 1, scheduler.pendingCount) + var queued *applyTxn + for _, pending := range scheduler.pending { + if pending != nil { + queued = pending + break + } + } + scheduler.mu.Unlock() + require.NotNil(t, queued) + assert.True(t, queued.forceGlobal) + require.Len(t, queued.payload.events, 1) + assert.Equal(t, binlogdatapb.VEventType_ROW, queued.payload.events[0].Type) + assert.Nil(t, queued.writeset) + + require.NoError(t, scheduler.markCommitted(fieldReady)) + ready, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.Same(t, queued, ready) + require.NoError(t, scheduler.markCommitted(ready)) +} + +func TestScheduleItems_PartialRowImageFallsBackToSerializedApply(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: 
time.Now(), lastHeartbeatRefresh: time.Now()} + + vp.tablePlans["t1"] = &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{ + {Name: "a", Type: querypb.Type_INT64}, + {Name: "id", Type: querypb.Type_INT64}, + {Name: "b", Type: querypb.Type_INT64}, + }, + PKIndices: []bool{false, true, false}, + } + vp.tablePlansVersion.Store(1) + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + } + rowEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{ + After: &querypb.Row{Values: []byte("23"), Lengths: []int64{1, 1}}, + DataColumns: &binlogdatapb.RowChange_Bitmap{ + Count: 3, + Cols: []byte{0x06}, + }, + }}, + }, + Timestamp: 100, + } + commitEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_COMMIT, + } + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{gtidEvent, rowEvent, commitEvent}})) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.True(t, got.forceGlobal) + assert.Empty(t, got.writeset) + + scheduler.mu.Lock() + defer scheduler.mu.Unlock() + assert.Zero(t, scheduler.pendingCount) + assert.Equal(t, 1, scheduler.inflightGlobal) + assert.Zero(t, scheduler.inflightMissingMeta) + assert.Zero(t, scheduler.inflightCommitMeta) + assert.Empty(t, scheduler.pending) + assert.Empty(t, scheduler.inflightWriteset) +} + +func TestScheduleItems_MissingFKColumnFallsBackToSerializedApply(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + vp.tablePlans["child"] = &TablePlan{ + TargetName: "child", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + }, + PKIndices: []bool{true}, + } + vp.tablePlansVersion.Store(1) + vp.fkRefs = 
map[string][]fkConstraintRef{ + "child": {{ParentTable: "parent", ChildColumnNames: []string{"parent_id"}, ReferencedColumnNames: []string{"id"}}}, + } + vp.parentFKRefs = buildParentFKRefs(vp.fkRefs) + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "child", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, Timestamp: 100}, + {Type: binlogdatapb.VEventType_COMMIT}, + }})) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.True(t, got.forceGlobal) + assert.Empty(t, got.writeset) + + scheduler.mu.Lock() + defer scheduler.mu.Unlock() + assert.Zero(t, scheduler.pendingCount) + assert.Equal(t, 1, scheduler.inflightGlobal) + assert.Zero(t, scheduler.inflightMissingMeta) + assert.Zero(t, scheduler.inflightCommitMeta) + assert.Empty(t, scheduler.pending) + assert.Empty(t, scheduler.inflightWriteset) +} + +func TestScheduleItems_CommitMeta(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + vp.tablePlans["t1"] = &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + vp.tablePlansVersion.Store(1) + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + SequenceNumber: 10, + CommitParent: 9, + } + rowEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, + Timestamp: 100, + } + commitEvent := 
&binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_COMMIT, + } + + items := [][]*binlogdatapb.VEvent{{gtidEvent, rowEvent, commitEvent}} + err := vp.scheduleItems(ctx, scheduler, state, items) + require.NoError(t, err) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.True(t, got.hasCommitMeta) + assert.Equal(t, int64(10), got.sequenceNumber) + assert.Equal(t, int64(9), got.commitParent) +} + +func TestScheduleItems_DDLCommitOnlyPreservesCommitMetaFromGTID(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + SequenceNumber: 10, + CommitParent: 9, + } + ddlEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_DDL, + Timestamp: 200, + } + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{gtidEvent, ddlEvent}})) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.True(t, got.payload.commitOnly) + assert.True(t, got.hasCommitMeta) + assert.Equal(t, int64(10), got.sequenceNumber) + assert.Equal(t, int64(9), got.commitParent) +} + +func TestScheduleItems_BatchingMixedCommitMetaStaysMissingMeta(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + state.maxBatchedCommits = 2 + + vp.tablePlans["t1"] = &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + vp.tablePlansVersion.Store(1) + + items := [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: 
"MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, Timestamp: 100}, + {Type: binlogdatapb.VEventType_COMMIT}, + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6", SequenceNumber: 11, CommitParent: 10}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("2"), Lengths: []int64{1}}}}, + }, Timestamp: 200}, + {Type: binlogdatapb.VEventType_COMMIT}, + }} + + require.NoError(t, vp.scheduleItems(ctx, scheduler, state, items)) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + require.NotNil(t, got) + assert.False(t, got.hasCommitMeta) + assert.Zero(t, got.sequenceNumber) + assert.Zero(t, got.commitParent) + assert.Len(t, got.payload.events, 2) + assert.NotNil(t, got.writeset) + require.NoError(t, scheduler.markCommitted(got)) + scheduler.mu.Lock() + assert.Equal(t, int64(11), scheduler.lastCommittedSequence) + scheduler.mu.Unlock() +} + +func TestScheduleItems_HeartbeatSetsMustSave(t *testing.T) { + vp, mockDB := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + vp.vr.workflowConfig.HeartbeatUpdateInterval = math.MaxInt + + vp.numAccumulatedHeartbeats = 1 + + vp.tablePlans["t1"] = &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + vp.tablePlansVersion.Store(1) + + // recordHeartbeat() calls vr.stats.RecordHeartbeat (no DB) then + // mustUpdateHeartbeat() → false (numAccumulatedHeartbeats=0), so no DB call. 
+ _ = mockDB + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + } + rowEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, + Timestamp: 100, + } + heartbeatEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_HEARTBEAT, + Timestamp: 200, + } + commitEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_COMMIT, + } + + // GTID, ROW, HEARTBEAT, COMMIT — heartbeat should set curMustSave + // because there are accumulated events when heartbeat arrives + items := [][]*binlogdatapb.VEvent{{gtidEvent, rowEvent, heartbeatEvent, commitEvent}} + err := vp.scheduleItems(ctx, scheduler, state, items) + require.NoError(t, err) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + require.NotNil(t, got) + // The heartbeat forced curMustSave=true, which means the transaction was flushed + // even if batching would otherwise accumulate it + assert.Equal(t, int64(1), got.order) +} + +func TestScheduleItems_BatchingSkipsFlushWhenAnotherCommitAhead(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + vp.tablePlans["t1"] = &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + vp.tablePlansVersion.Store(1) + + // Two transactions in same batch — first COMMIT should be skipped (batched) + // since another COMMIT follows + items := [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: 
[]*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, Timestamp: 100}, + {Type: binlogdatapb.VEventType_COMMIT}, + // Second transaction in same batch + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("2"), Lengths: []int64{1}}}}, + }, Timestamp: 200}, + {Type: binlogdatapb.VEventType_COMMIT}, + }} + + err := vp.scheduleItems(ctx, scheduler, state, items) + require.NoError(t, err) + + // With batching, both transactions merge into one — only one enqueue + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + require.NotNil(t, got) + + // The batched transaction should have both ROW events + assert.Len(t, got.payload.events, 2) + assert.Equal(t, int64(1), got.order) +} + +func TestScheduleItems_FKRefsDisableBatching(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + vp.tablePlans["t1"] = &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + {Name: "parent_id", Type: querypb.Type_INT64}, + }, + PKIndices: []bool{true, false}, + } + vp.tablePlansVersion.Store(1) + + // Set FK refs — this should disable batching + vp.fkRefs = map[string][]fkConstraintRef{ + "t1": {{ParentTable: "parent", ChildColumnNames: []string{"parent_id"}}}, + } + + // Two transactions in same batch + items := [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("112"), Lengths: []int64{1, 2}}}}, + }, 
Timestamp: 100}, + {Type: binlogdatapb.VEventType_COMMIT}, + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("223"), Lengths: []int64{1, 2}}}}, + }, Timestamp: 200}, + {Type: binlogdatapb.VEventType_COMMIT}, + }} + + err := vp.scheduleItems(ctx, scheduler, state, items) + require.NoError(t, err) + + // With FK refs, batching is disabled — two separate transactions + got1, err := scheduler.nextReady(ctx) + require.NoError(t, err) + require.NoError(t, scheduler.markCommitted(got1)) + got2, err := scheduler.nextReady(ctx) + require.NoError(t, err) + + assert.Len(t, got1.payload.events, 1) + assert.Len(t, got2.payload.events, 1) + assert.Equal(t, int64(1), got1.order) + assert.Equal(t, int64(2), got2.order) +} + +func TestScheduleItems_FKRefsDisableBatchingForRenamedTable(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + vp.tablePlans["child_src"] = &TablePlan{ + TargetName: "child", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + {Name: "parent_id", Type: querypb.Type_INT64}, + }, + PKIndices: []bool{true, false}, + } + vp.tablePlansVersion.Store(1) + + vp.fkRefs = map[string][]fkConstraintRef{ + "child": {{ParentTable: "parent", ChildColumnNames: []string{"parent_id"}}}, + } + vp.parentFKRefs = buildParentFKRefs(vp.fkRefs) + + items := [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "child_src", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("112"), Lengths: []int64{1, 2}}}}, + }, Timestamp: 100}, + 
{Type: binlogdatapb.VEventType_COMMIT}, + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "child_src", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("223"), Lengths: []int64{1, 2}}}}, + }, Timestamp: 200}, + {Type: binlogdatapb.VEventType_COMMIT}, + }} + + err := vp.scheduleItems(ctx, scheduler, state, items) + require.NoError(t, err) + + got1, err := scheduler.nextReady(ctx) + require.NoError(t, err) + require.Len(t, got1.payload.events, 1) + require.Equal(t, int64(1), got1.order) + scheduler.mu.Lock() + pendingCount := scheduler.pendingCount + scheduler.mu.Unlock() + require.Equal(t, 1, pendingCount) + require.NoError(t, scheduler.markCommitted(got1)) + got2, err := scheduler.nextReady(ctx) + require.NoError(t, err) + + assert.Len(t, got2.payload.events, 1) + assert.Equal(t, int64(2), got2.order) +} + +func TestScheduleItems_FKRefsDisableBatchingForMixedCaseTargetTable(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + vp.tablePlans["child_src"] = &TablePlan{ + TargetName: "Child", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + {Name: "parent_id", Type: querypb.Type_INT64}, + }, + PKIndices: []bool{true, false}, + } + vp.tablePlansVersion.Store(1) + + vp.fkRefs = map[string][]fkConstraintRef{ + "child": {{ParentTable: "parent", ChildColumnNames: []string{"parent_id"}}}, + } + vp.parentFKRefs = buildParentFKRefs(vp.fkRefs) + + items := [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "child_src", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("112"), 
Lengths: []int64{1, 2}}}}, + }, Timestamp: 100}, + {Type: binlogdatapb.VEventType_COMMIT}, + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "child_src", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("223"), Lengths: []int64{1, 2}}}}, + }, Timestamp: 200}, + {Type: binlogdatapb.VEventType_COMMIT}, + }} + + err := vp.scheduleItems(ctx, scheduler, state, items) + require.NoError(t, err) + + got1, err := scheduler.nextReady(ctx) + require.NoError(t, err) + require.Len(t, got1.payload.events, 1) + require.Equal(t, int64(1), got1.order) + scheduler.mu.Lock() + pendingCount := scheduler.pendingCount + scheduler.mu.Unlock() + require.Equal(t, 1, pendingCount) + require.NoError(t, scheduler.markCommitted(got1)) + got2, err := scheduler.nextReady(ctx) + require.NoError(t, err) + + assert.Len(t, got2.payload.events, 1) + assert.Equal(t, int64(2), got2.order) +} + +func TestScheduleItems_BatchingMergedSequenceAdvanced(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + vp.tablePlans["t1"] = &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + vp.tablePlansVersion.Store(1) + + // Pre-advance the watermark so enqueue's idle-seeding path (which seeds + // lastCommittedSequence from the enqueued txn's commitParent) cannot + // mask the mergedSequences behavior this test pins. + scheduler.advanceCommittedSequence(9) + + // Two transactions with commit meta — the first gets merged into the + // second's batch, so its sequence (10) must ride along in mergedSequences + // and publish only when the batch commits. 
+ items := [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", SequenceNumber: 10, CommitParent: 9}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, Timestamp: 100}, + {Type: binlogdatapb.VEventType_COMMIT}, + // Second txn + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6", SequenceNumber: 11, CommitParent: 10}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("2"), Lengths: []int64{1}}}}, + }, Timestamp: 200}, + {Type: binlogdatapb.VEventType_COMMIT}, + }} + + err := vp.scheduleItems(ctx, scheduler, state, items) + require.NoError(t, err) + + // The merged-away sequence must NOT be visible yet: publishing it at + // enqueue time would let an empty-writeset dependent with commitParent=10 + // run before the batch containing sequence 10 has actually committed. + scheduler.mu.Lock() + seq := scheduler.lastCommittedSequence + scheduler.mu.Unlock() + assert.Equal(t, int64(9), seq) + + // Both source transactions were batched into a single txn that carries + // the surviving commit meta (sequence 11) plus the merged-away sequence. + txn, err := scheduler.nextReady(ctx) + require.NoError(t, err) + require.Equal(t, int64(1), txn.order) + assert.Equal(t, int64(11), txn.sequenceNumber) + assert.Len(t, txn.payload.events, 2) + require.Equal(t, []int64{10}, txn.mergedSequences) + + // Committing the batch publishes both its own and the merged sequence. 
+ require.NoError(t, scheduler.markCommitted(txn)) + scheduler.mu.Lock() + seq = scheduler.lastCommittedSequence + scheduler.mu.Unlock() + assert.Equal(t, int64(11), seq) +} + +func TestScheduleItems_StopPosSetsMustSave(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + vp.tablePlans["t1"] = &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + vp.tablePlansVersion.Store(1) + + // Set a stop position + stopPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5") + require.NoError(t, err) + vp.stopPos = stopPos + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-10", // at or past stopPos + } + rowEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, + Timestamp: 100, + } + commitEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_COMMIT, + } + + items := [][]*binlogdatapb.VEvent{{gtidEvent, rowEvent, commitEvent}} + err = vp.scheduleItems(ctx, scheduler, state, items) + require.ErrorIs(t, err, io.EOF) + + got, gerr := scheduler.nextReady(ctx) + require.NoError(t, gerr) + require.NotNil(t, got) + assert.True(t, got.payload.mustSave) +} + +func TestScheduleItems_StopPosStopsSchedulingLaterTransactionsInSameFetch(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + vp.tablePlans["t1"] = &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: 
[]bool{true}, + } + vp.tablePlansVersion.Store(1) + + stopPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5") + require.NoError(t, err) + vp.stopPos = stopPos + + items := [][]*binlogdatapb.VEvent{ + { + {Type: binlogdatapb.VEventType_GTID, Gtid: replication.EncodePosition(stopPos)}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, Timestamp: 100}, + {Type: binlogdatapb.VEventType_COMMIT}, + }, + { + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("2"), Lengths: []int64{1}}}}, + }, Timestamp: 200}, + {Type: binlogdatapb.VEventType_COMMIT}, + }, + } + + err = vp.scheduleItems(ctx, scheduler, state, items) + require.ErrorIs(t, err, io.EOF) + + scheduler.mu.Lock() + defer scheduler.mu.Unlock() + assert.Equal(t, 1, scheduler.pendingCount) +} + +func TestScheduleItems_HeartbeatUpdatesLag(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + vp.vr.workflowConfig.HeartbeatUpdateInterval = math.MaxInt + + vp.numAccumulatedHeartbeats = 1 + + hbEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_HEARTBEAT, + Timestamp: 100, + CurrentTime: time.Now().UnixNano(), + } + + items := [][]*binlogdatapb.VEvent{{hbEvent}} + err := vp.scheduleItems(ctx, scheduler, state, items) + require.NoError(t, err) + + assert.Equal(t, int64(100*1e9), vp.loadLagSnapshot().timestampNs) + assert.Equal(t, 2, vp.numAccumulatedHeartbeats) +} + +func TestScheduleItems_ThrottledHeartbeatEstimatesLag(t *testing.T) { + vp, mockDB := testVPlayer(t) + ctx := 
testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + vp.vr.workflowConfig.HeartbeatUpdateInterval = math.MaxInt + vp.vr.throttleUpdatesRateLimiter = timer.NewRateLimiter(time.Second) + t.Cleanup(vp.vr.throttleUpdatesRateLimiter.Stop) + + vp.numAccumulatedHeartbeats = 1 + + // Set last known timestamp so estimateLag works + vp.storeLagSnapshot(time.Now().Add(-5*time.Second).UnixNano(), 0) + + // updateTimeThrottled calls dbClient.ExecuteFetch + mockDB.AddInvariant("update _vt.vreplication set", &sqltypes.Result{}) + + hbEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_HEARTBEAT, + Timestamp: 100, + CurrentTime: time.Now().UnixNano(), + Throttled: true, + ThrottledReason: "test", + } + + items := [][]*binlogdatapb.VEvent{{hbEvent}} + err := vp.scheduleItems(ctx, scheduler, state, items) + require.NoError(t, err) + + // Lag should be estimated (non-zero) + lag := vp.vr.stats.ReplicationLagSeconds.Load() + assert.GreaterOrEqual(t, lag, int64(4)) +} + +func BenchmarkScheduleItems_FKBatchingCheckSkipsUnrelatedTables(b *testing.B) { + ctx := context.Background() + vp, _ := testVPlayer(&testing.T{}) + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + + vp.tablePlans["hot"] = &TablePlan{ + TargetName: "hot", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + }, + PKIndices: []bool{true}, + } + vp.tablePlans["child"] = &TablePlan{ + TargetName: "child", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + {Name: "parent_id", Type: querypb.Type_INT64}, + }, + PKIndices: []bool{true, false}, + } + vp.tablePlansVersion.Store(1) + vp.fkRefs = map[string][]fkConstraintRef{ + "child": {{ParentTable: "parent", ChildColumnNames: []string{"parent_id"}, ReferencedColumnNames: []string{"id"}}}, + } + vp.parentFKRefs = buildParentFKRefs(vp.fkRefs) + + items := make([][]*binlogdatapb.VEvent, 1) + batch := make([]*binlogdatapb.VEvent, 0, 96) 
+ for range 32 { + batch = append(batch, + &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "hot", + RowChanges: []*binlogdatapb.RowChange{{ + After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}, + }}, + }}, + &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT}, + ) + } + items[0] = batch + + b.ReportAllocs() + b.ResetTimer() + for range b.N { + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + if err := vp.scheduleItems(ctx, scheduler, state, items); err != nil { + b.Fatal(err) + } + _ = scheduler.close() + } +} + +func BenchmarkScheduleItems_WritesetFKResolutionForRepeatedTable(b *testing.B) { + const ( + tableCount = 256 + txnCount = 32 + ) + + ctx := context.Background() + vp, _ := testVPlayer(&testing.T{}) + vp.vr.workflowConfig.ParallelReplicationWorkers = 2 + + fkRefs := make(map[string][]fkConstraintRef, tableCount) + for i := range tableCount { + parentTable := fmt.Sprintf("parent%d", i) + childTable := fmt.Sprintf("child%d", i) + vp.tablePlans[parentTable] = &TablePlan{ + TargetName: parentTable, + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + }, + PKIndices: []bool{true}, + } + vp.tablePlans[childTable] = &TablePlan{ + TargetName: childTable, + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + {Name: "parent_id", Type: querypb.Type_INT64}, + }, + PKIndices: []bool{true, false}, + } + fkRefs[childTable] = []fkConstraintRef{{ + ParentTable: parentTable, + ChildColumnNames: []string{"parent_id"}, + ReferencedColumnNames: []string{"id"}, + }} + } + vp.tablePlansVersion.Store(1) + vp.fkRefs = fkRefs + vp.parentFKRefs = buildParentFKRefs(fkRefs) + + items := make([][]*binlogdatapb.VEvent, 1) + batch := make([]*binlogdatapb.VEvent, 0, txnCount*3) + for 
range txnCount { + batch = append(batch, + &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "child0", + RowChanges: []*binlogdatapb.RowChange{{ + After: &querypb.Row{Values: []byte("11"), Lengths: []int64{1, 1}}, + }}, + }}, + &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT}, + ) + } + items[0] = batch + + b.ReportAllocs() + b.ResetTimer() + for range b.N { + scheduler := newApplyScheduler(ctx) + state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + if err := vp.scheduleItems(ctx, scheduler, state, items); err != nil { + b.Fatal(err) + } + _ = scheduler.close() + } +} + +// ---------- commitLoop tests ---------- + +func TestCommitLoop_InOrderCommit(t *testing.T) { + vp, mockDB := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + + // commitLoop calls commitTxn → updatePos → vp.query/commit on each txn. + // For commitOnly+updatePosOnly, it calls vp.updatePos which calls + // vp.query (binlogplayer.GenerateUpdatePos). + // We mock the DB to accept any update/commit. 
+ mockDB.AddInvariant("update _vt.vreplication set pos=", &sqltypes.Result{}) + mockDB.AddInvariant("commit", &sqltypes.Result{}) + mockDB.AddInvariant("begin", &sqltypes.Result{}) + + commitCh := make(chan *applyTxn, 3) + + pos1, _ := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5") + + // Send 3 transactions in order + for i := int64(1); i <= 3; i++ { + txn := &applyTxn{ + order: i, + payload: &applyTxnPayload{ + pos: pos1, + timestamp: 100 * i, + commitOnly: true, + updatePosOnly: true, + lastEventTimestamp: 100 * i, + }, + done: make(chan struct{}), + } + commitCh <- txn + } + close(commitCh) + + err := vp.commitLoop(ctx, scheduler, commitCh) + require.NoError(t, err) +} + +func TestCommitLoop_OutOfOrderReordering(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + + commitCh := make(chan *applyTxn, 3) + + // Each txn records its position write through its own query closure. + // commitLoop is single-goroutine, so plain slices are safe here. 
+ var committedOrders []int64 + var committedSQL []string + makeTxn := func(order int64) *applyTxn { + pos, err := replication.DecodePosition(fmt.Sprintf("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-%d", 4+order)) + require.NoError(t, err) + return &applyTxn{ + order: order, + payload: &applyTxnPayload{ + pos: pos, + timestamp: 100 * order, + commitOnly: true, + updatePosOnly: true, + lastEventTimestamp: 100 * order, + query: func(ctx context.Context, sql string) (*sqltypes.Result, error) { + committedOrders = append(committedOrders, order) + committedSQL = append(committedSQL, sql) + return &sqltypes.Result{}, nil + }, + }, + done: make(chan struct{}), + } + } + + // Send transactions out of order: 2, 1, 3 + for _, order := range []int64{2, 1, 3} { + commitCh <- makeTxn(order) + } + close(commitCh) + + err := vp.commitLoop(ctx, scheduler, commitCh) + require.NoError(t, err) + + // The position writes must happen in strict order despite arrival order, + // and each write must carry its own txn's position. + require.Equal(t, []int64{1, 2, 3}, committedOrders) + require.Len(t, committedSQL, 3) + assert.Contains(t, committedSQL[0], ":1-5") + assert.Contains(t, committedSQL[1], ":1-6") + assert.Contains(t, committedSQL[2], ":1-7") + + scheduler.mu.Lock() + lastCommittedOrder := scheduler.lastCommittedOrder + scheduler.mu.Unlock() + assert.Equal(t, int64(3), lastCommittedOrder) +} + +// TestCommitLoop_ZeroOrderIsRejected pins the invariant that every txn +// reaching commitLoop must carry a positive order. All production enqueue +// paths use parallelOrder.Add(1) (>= 1), so an order==0 txn indicates a +// regression that would silently bypass strict commit ordering — and silently +// regress the monotonic position invariant on _vt.vreplication.pos. Fail +// fast so the workflow restarts cleanly instead of corrupting position state. 
+func TestCommitLoop_ZeroOrderIsRejected(t *testing.T) {
+	vp, mockDB := testVPlayer(t)
+	ctx := testCtx(t)
+	scheduler := newApplyScheduler(ctx)
+
+	mockDB.AddInvariant("update _vt.vreplication set pos=", &sqltypes.Result{})
+	mockDB.AddInvariant("commit", &sqltypes.Result{})
+	mockDB.AddInvariant("begin", &sqltypes.Result{})
+
+	commitCh := make(chan *applyTxn, 1)
+
+	pos1, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+
+	// order 0 is the invalid value under test.
+	txn := &applyTxn{
+		order: 0,
+		payload: &applyTxnPayload{
+			pos:                pos1,
+			timestamp:          100,
+			commitOnly:         true,
+			updatePosOnly:      true,
+			lastEventTimestamp: 100,
+		},
+		done: make(chan struct{}),
+	}
+	commitCh <- txn
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "parallel apply commit txn missing order")
+}
+
+// TestCommitLoop_PendingLeftover sends orders 3 and 1 and then closes the
+// commit channel without ever sending order 2. commitLoop must return an
+// error containing "parallel apply commit missing order".
+func TestCommitLoop_PendingLeftover(t *testing.T) {
+	vp, mockDB := testVPlayer(t)
+	ctx := testCtx(t)
+	scheduler := newApplyScheduler(ctx)
+
+	mockDB.AddInvariant("update _vt.vreplication set pos=", &sqltypes.Result{})
+	mockDB.AddInvariant("commit", &sqltypes.Result{})
+	mockDB.AddInvariant("begin", &sqltypes.Result{})
+
+	commitCh := make(chan *applyTxn, 2)
+	pos1, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+
+	// Send order 3 and 1, but no order 2 → should error about missing order
+	for _, order := range []int64{3, 1} {
+		txn := &applyTxn{
+			order: order,
+			payload: &applyTxnPayload{
+				pos:                pos1,
+				timestamp:          100,
+				commitOnly:         true,
+				updatePosOnly:      true,
+				lastEventTimestamp: 100,
+			},
+			done: make(chan struct{}),
+		}
+		commitCh <- txn
+	}
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "parallel apply commit missing order")
+}
+
+// TestCommitLoop_MarksCommittedOnScheduler commits a single transaction that
+// carries commit metadata (sequenceNumber 7) and checks that the scheduler
+// afterwards reports lastCommittedSequence == 7 and lastCommittedOrder == 1.
+func TestCommitLoop_MarksCommittedOnScheduler(t *testing.T) {
+	vp, mockDB := testVPlayer(t)
+	ctx := testCtx(t)
+	scheduler := newApplyScheduler(ctx)
+
+	mockDB.AddInvariant("update _vt.vreplication set pos=", &sqltypes.Result{})
+	mockDB.AddInvariant("commit", &sqltypes.Result{})
+	mockDB.AddInvariant("begin", &sqltypes.Result{})
+
+	commitCh := make(chan *applyTxn, 1)
+	pos1, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+
+	txn := &applyTxn{
+		order:          1,
+		sequenceNumber: 7,
+		hasCommitMeta:  true,
+		payload: &applyTxnPayload{
+			pos:                pos1,
+			timestamp:          100,
+			commitOnly:         true,
+			updatePosOnly:      true,
+			lastEventTimestamp: 100,
+		},
+		done: make(chan struct{}),
+	}
+	commitCh <- txn
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.NoError(t, err)
+
+	// Read both counters under a single critical section.
+	scheduler.mu.Lock()
+	defer scheduler.mu.Unlock()
+	// markCommitted should have advanced lastCommittedSequence
+	assert.Equal(t, int64(7), scheduler.lastCommittedSequence)
+	// lastCommittedOrder should be 1
+	assert.Equal(t, int64(1), scheduler.lastCommittedOrder)
+}
+
+// TestCommitLoop_UpdatesLag commits a transaction whose last event timestamp
+// is three seconds older than its lastEventCurrentTime and checks that the
+// ReplicationLagSeconds stat ends up between 2 and 5 seconds.
+func TestCommitLoop_UpdatesLag(t *testing.T) {
+	vp, mockDB := testVPlayer(t)
+	ctx := testCtx(t)
+	scheduler := newApplyScheduler(ctx)
+
+	mockDB.AddInvariant("update _vt.vreplication set pos=", &sqltypes.Result{})
+	mockDB.AddInvariant("commit", &sqltypes.Result{})
+	mockDB.AddInvariant("begin", &sqltypes.Result{})
+
+	commitCh := make(chan *applyTxn, 1)
+	pos1, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+
+	now := time.Now()
+	txn := &applyTxn{
+		order: 1,
+		payload: &applyTxnPayload{
+			pos:                  pos1,
+			timestamp:            100,
+			commitOnly:           true,
+			updatePosOnly:        true,
+			lastEventTimestamp:   now.Add(-3 * time.Second).Unix(),
+			lastEventCurrentTime: now.UnixNano(),
+		},
+		done: make(chan struct{}),
+	}
+	commitCh <- txn
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.NoError(t, err)
+
+	// Lag should be approximately 3 seconds
+	lag := vp.vr.stats.ReplicationLagSeconds.Load()
+	assert.GreaterOrEqual(t, lag, int64(2))
+	assert.LessOrEqual(t, lag, int64(5))
+}
+
+// TestCommitLoop_UpdatePosOnlyKeepsLaterUnsavedEvent pre-seeds
+// vp.unsavedEvent, commits one updatePosOnly transaction, and checks that
+// vp.unsavedEvent still points at the very same event afterwards.
+func TestCommitLoop_UpdatePosOnlyKeepsLaterUnsavedEvent(t *testing.T) {
+	ctx := testCtx(t)
+	vp, mockDB := testVPlayer(t)
+	scheduler := newApplyScheduler(ctx)
+
+	mockDB.AddInvariant("update _vt.vreplication set pos=", &sqltypes.Result{})
+	mockDB.AddInvariant("commit", &sqltypes.Result{})
+	mockDB.AddInvariant("begin", &sqltypes.Result{})
+
+	pos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+
+	laterUnsaved := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT, Timestamp: 200}
+	vp.serialMu.Lock()
+	vp.unsavedEvent = laterUnsaved
+	vp.serialMu.Unlock()
+
+	commitCh := make(chan *applyTxn, 1)
+	commitCh <- &applyTxn{
+		order: 1,
+		payload: &applyTxnPayload{
+			pos:           pos,
+			timestamp:     100,
+			commitOnly:    true,
+			updatePosOnly: true,
+		},
+		done: make(chan struct{}),
+	}
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.NoError(t, err)
+
+	vp.serialMu.Lock()
+	defer vp.serialMu.Unlock()
+	require.Same(t, laterUnsaved, vp.unsavedEvent)
+}
+
+// TestCommitLoop_UpdatePosOnlyDoesNotRefreshIdleTimerBehindLaterUnsavedEvent
+// pre-seeds the player with a later position (1-9), an unsaved event, and a
+// timeLastSaved two idle timeouts in the past, then commits an updatePosOnly
+// transaction at the earlier position 1-5. All three pre-seeded fields must
+// be unchanged afterwards.
+func TestCommitLoop_UpdatePosOnlyDoesNotRefreshIdleTimerBehindLaterUnsavedEvent(t *testing.T) {
+	ctx := testCtx(t)
+	vp, mockDB := testVPlayer(t)
+	scheduler := newApplyScheduler(ctx)
+
+	mockDB.AddInvariant("update _vt.vreplication set pos=", &sqltypes.Result{})
+	mockDB.AddInvariant("commit", &sqltypes.Result{})
+	mockDB.AddInvariant("begin", &sqltypes.Result{})
+
+	committedPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+	laterPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-9")
+	require.NoError(t, err)
+
+	laterUnsaved := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT, Timestamp: 200}
+	oldSavedAt := time.Now().Add(-2 * idleTimeout)
+	vp.serialMu.Lock()
+	vp.pos = laterPos
+	vp.unsavedEvent = laterUnsaved
+	vp.timeLastSaved = oldSavedAt
+	vp.serialMu.Unlock()
+
+	commitCh := make(chan *applyTxn, 1)
+	commitCh <- &applyTxn{
+		order: 1,
+		payload: &applyTxnPayload{
+			pos:           committedPos,
+			timestamp:     100,
+			commitOnly:    true,
+			updatePosOnly: true,
+		},
+		done: make(chan struct{}),
+	}
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.NoError(t, err)
+
+	vp.serialMu.Lock()
+	defer vp.serialMu.Unlock()
+	require.Same(t, laterUnsaved, vp.unsavedEvent)
+	assert.Equal(t, laterPos, vp.pos)
+	assert.Equal(t, oldSavedAt, vp.timeLastSaved)
+}
+
+// TestCommitLoop_UpdatePosOnlyWithoutTimestampRefreshesHeartbeat commits an
+// updatePosOnly transaction whose timestamp is 0 through a recording client
+// and expects exactly two recorded queries: a position update that does not
+// mention transaction_timestamp, followed by a query setting time_heartbeat.
+func TestCommitLoop_UpdatePosOnlyWithoutTimestampRefreshesHeartbeat(t *testing.T) {
+	ctx := testCtx(t)
+	vp, _ := testVPlayer(t)
+	scheduler := newApplyScheduler(ctx)
+
+	// Route every player query and commit through the recording client.
+	recording := &recordingDBClient{}
+	mainClient := newVDBClient(recording, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+	vp.vr.dbClient = mainClient
+	vp.dbClient = mainClient
+	vp.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+		return mainClient.Execute(sql)
+	}
+	vp.commit = mainClient.Commit
+
+	pos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+
+	commitCh := make(chan *applyTxn, 1)
+	commitCh <- &applyTxn{
+		order: 1,
+		payload: &applyTxnPayload{
+			pos:           pos,
+			timestamp:     0,
+			commitOnly:    true,
+			updatePosOnly: true,
+		},
+		done: make(chan struct{}),
+	}
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.NoError(t, err)
+
+	require.Len(t, recording.queries, 2)
+	assert.Contains(t, recording.queries[0], "update _vt.vreplication set pos=")
+	assert.NotContains(t, recording.queries[0], "transaction_timestamp=")
+	assert.Contains(t, recording.queries[1], "time_heartbeat=")
+}
+
+// TestCommitLoop_WorkerCommitDoesNotRefreshIdleTimerBehindLaterUnsavedEvent
+// is the worker-connection variant of the idle-timer test: the transaction
+// supplies its own query/commit/client, and after a successful commit the
+// done channel must be signaled while vp.pos, vp.unsavedEvent and
+// vp.timeLastSaved keep their pre-seeded values.
+func TestCommitLoop_WorkerCommitDoesNotRefreshIdleTimerBehindLaterUnsavedEvent(t *testing.T) {
+	ctx := testCtx(t)
+	vp, mockDB := testVPlayer(t)
+	scheduler := newApplyScheduler(ctx)
+
+	mockDB.AddInvariant("update _vt.vreplication set pos=", &sqltypes.Result{})
+	mockDB.AddInvariant("commit", &sqltypes.Result{})
+	mockDB.AddInvariant("begin", &sqltypes.Result{})
+
+	committedPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+	laterPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-9")
+	require.NoError(t, err)
+
+	workerClient := newVDBClient(&recordingDBClient{}, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+	require.NoError(t, workerClient.Begin())
+	t.Cleanup(func() {
+		_ = workerClient.Rollback()
+	})
+
+	laterUnsaved := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT, Timestamp: 200}
+	oldSavedAt := time.Now().Add(-2 * idleTimeout)
+	vp.serialMu.Lock()
+	vp.pos = laterPos
+	vp.unsavedEvent = laterUnsaved
+	vp.timeLastSaved = oldSavedAt
+	vp.serialMu.Unlock()
+
+	// Buffered so the signal can be observed after commitLoop has returned.
+	doneCh := make(chan struct{}, 1)
+	commitCh := make(chan *applyTxn, 1)
+	commitCh <- &applyTxn{
+		order: 1,
+		payload: &applyTxnPayload{
+			pos:       committedPos,
+			timestamp: 100,
+			query: func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+				return workerClient.Execute(sql)
+			},
+			commit: workerClient.Commit,
+			client: workerClient,
+		},
+		done: doneCh,
+	}
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.NoError(t, err)
+
+	select {
+	case <-doneCh:
+	default:
+		t.Fatal("worker done was not signaled")
+	}
+
+	vp.serialMu.Lock()
+	defer vp.serialMu.Unlock()
+	require.Same(t, laterUnsaved, vp.unsavedEvent)
+	assert.Equal(t, laterPos, vp.pos)
+	assert.Equal(t, oldSavedAt, vp.timeLastSaved)
+}
+
+// TestCommitLoop_CommitOnlyAppliesEvent commits a commitOnly transaction that
+// is not updatePosOnly and carries one HEARTBEAT event with mustSave set;
+// commitLoop must complete without error.
+func TestCommitLoop_CommitOnlyAppliesEvent(t *testing.T) {
+	ctx := testCtx(t)
+	vp, mockDB := testVPlayer(t)
+	scheduler := newApplyScheduler(ctx)
+
+	mockDB.AddInvariant("update _vt.vreplication set", &sqltypes.Result{})
+	mockDB.AddInvariant("commit", &sqltypes.Result{})
+	mockDB.AddInvariant("begin", &sqltypes.Result{})
+	mockDB.AddInvariant("insert", &sqltypes.Result{})
+
+	commitCh := make(chan *applyTxn, 1)
+	pos1, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+
+	heartbeatEvent := &binlogdatapb.VEvent{
+		Type:      binlogdatapb.VEventType_HEARTBEAT,
+		Timestamp: 100,
+	}
+
+	txn := &applyTxn{
+		order: 1,
+		payload: &applyTxnPayload{
+			pos:                pos1,
+			timestamp:          100,
+			commitOnly:         true,
+			updatePosOnly:      false,
+			mustSave:           true,
+			events:             []*binlogdatapb.VEvent{heartbeatEvent},
+			lastEventTimestamp: 100,
+		},
+		done: make(chan struct{}),
+	}
+	commitCh <- txn
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.NoError(t, err)
+}
+
+// TestCommitLoop_UpdatePosOnlyStopPosReached sets vp.stopPos to the same
+// position the committed transaction carries and expects commitLoop to
+// return io.EOF.
+func TestCommitLoop_UpdatePosOnlyStopPosReached(t *testing.T) {
+	ctx := testCtx(t)
+	vp, mockDB := testVPlayer(t)
+	scheduler := newApplyScheduler(ctx)
+
+	mockDB.AddInvariant("update _vt.vreplication set", &sqltypes.Result{})
+	mockDB.AddInvariant("commit", &sqltypes.Result{})
+	mockDB.AddInvariant("begin", &sqltypes.Result{})
+
+	stopPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+	vp.stopPos = stopPos
+
+	commitCh := make(chan *applyTxn, 1)
+	pos1, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+
+	txn := &applyTxn{
+		order: 1,
+		payload: &applyTxnPayload{
+			pos:                pos1,
+			timestamp:          100,
+			commitOnly:         true,
+			updatePosOnly:      true,
+			mustSave:           true,
+			lastEventTimestamp: 100,
+		},
+		done: make(chan struct{}),
+	}
+	commitCh <- txn
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.ErrorIs(t, err, io.EOF)
+}
+
+// TestCommitLoop_UpdatePosOnlyStopPosStateFailureKeepsTransactionOpen makes
+// the "update _vt.vreplication set state=" query fail on the main client
+// while the stop position is reached with saveStop set. commitLoop must
+// surface that error, the main client must still be in a transaction that
+// was begun, and none of the scheduler's commit or inflight counters may
+// have moved from zero.
+func TestCommitLoop_UpdatePosOnlyStopPosStateFailureKeepsTransactionOpen(t *testing.T) {
+	ctx := testCtx(t)
+	vp, _ := testVPlayer(t)
+	scheduler := newApplyScheduler(ctx)
+
+	stopPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+	vp.stopPos = stopPos
+	vp.saveStop = true
+
+	stateErr := errors.New("set state failed")
+	mainClient := newVDBClient(&failingDBClient{failOnQuery: map[string]error{
+		"update _vt.vreplication set state=": stateErr,
+	}}, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+	t.Cleanup(func() {
+		_ = mainClient.Rollback()
+	})
+	vp.vr.dbClient = mainClient
+	vp.dbClient = mainClient
+	vp.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+		return mainClient.Execute(sql)
+	}
+	vp.commit = mainClient.Commit
+
+	commitCh := make(chan *applyTxn, 1)
+	commitCh <- &applyTxn{
+		order: 1,
+		payload: &applyTxnPayload{
+			pos:                stopPos,
+			timestamp:          100,
+			commitOnly:         true,
+			updatePosOnly:      true,
+			mustSave:           true,
+			lastEventTimestamp: 100,
+		},
+		done: make(chan struct{}),
+	}
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.ErrorContains(t, err, stateErr.Error())
+	assert.True(t, mainClient.InTransaction)
+	assert.Contains(t, mainClient.queries, "begin")
+
+	scheduler.mu.Lock()
+	defer scheduler.mu.Unlock()
+	assert.Zero(t, scheduler.lastCommittedOrder)
+	assert.Zero(t, scheduler.lastCommittedSequence)
+	assert.Zero(t, scheduler.inflightGlobal)
+	assert.Zero(t, scheduler.inflightMissingMeta)
+	assert.Zero(t, scheduler.inflightCommitMeta)
+}
+
+// TestCommitLoop_WorkerStopPosStateFailureDoesNotCommit is the
+// worker-connection variant of the stop-state failure test: the state update
+// fails on both the main and the worker client. commitLoop must return the
+// error, the worker transaction must stay open with no "commit" issued, the
+// done channel must not be signaled, and the scheduler counters must stay
+// at zero.
+func TestCommitLoop_WorkerStopPosStateFailureDoesNotCommit(t *testing.T) {
+	ctx := testCtx(t)
+	vp, _ := testVPlayer(t)
+	scheduler := newApplyScheduler(ctx)
+
+	stopPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+	vp.stopPos = stopPos
+	vp.saveStop = true
+
+	stateErr := errors.New("set state failed")
+	vp.vr.dbClient = newVDBClient(&failingDBClient{failOnQuery: map[string]error{
+		"update _vt.vreplication set state=": stateErr,
+	}}, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+
+	workerClient := newVDBClient(&failingDBClient{failOnQuery: map[string]error{
+		"update _vt.vreplication set state=": stateErr,
+	}}, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+	require.NoError(t, workerClient.Begin())
+	t.Cleanup(func() {
+		_ = workerClient.Rollback()
+	})
+
+	doneCh := make(chan struct{}, 1)
+	txn := &applyTxn{
+		order: 1,
+		payload: &applyTxnPayload{
+			pos:       stopPos,
+			timestamp: 100,
+			query: func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+				return workerClient.Execute(sql)
+			},
+			commit:             workerClient.Commit,
+			client:             workerClient,
+			lastEventTimestamp: 100,
+		},
+		done: doneCh,
+	}
+
+	commitCh := make(chan *applyTxn, 1)
+	commitCh <- txn
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.ErrorContains(t, err, stateErr.Error())
+	assert.True(t, workerClient.InTransaction)
+	assert.NotContains(t, workerClient.queries, "commit")
+	select {
+	case <-doneCh:
+		t.Fatal("worker done signaled before stop-state update succeeded")
+	default:
+	}
+
+	scheduler.mu.Lock()
+	defer scheduler.mu.Unlock()
+	assert.Zero(t, scheduler.lastCommittedOrder)
+	assert.Zero(t, scheduler.lastCommittedSequence)
+	assert.Zero(t, scheduler.inflightGlobal)
+	assert.Zero(t, scheduler.inflightMissingMeta)
+	assert.Zero(t, scheduler.inflightCommitMeta)
+}
+
+// TestCommitLoop_WorkerPosUpdateFailureDoesNotCommit pins the failure path
+// where the position update on the worker's connection fails: the commitLoop
+// must return the error without committing the worker's transaction, without
+// signaling the worker's done channel (the connection is in an unknown state
+// and must not be reused), and without advancing the scheduler.
+func TestCommitLoop_WorkerPosUpdateFailureDoesNotCommit(t *testing.T) {
+	ctx := testCtx(t)
+	vp, _ := testVPlayer(t)
+	scheduler := newApplyScheduler(ctx)
+
+	// The worker client fails only the position update query.
+	posErr := errors.New("pos update failed")
+	workerClient := newVDBClient(&failingDBClient{failOnQuery: map[string]error{
+		"update _vt.vreplication set pos=": posErr,
+	}}, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+	require.NoError(t, workerClient.Begin())
+	t.Cleanup(func() {
+		_ = workerClient.Rollback()
+	})
+
+	pos1, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+
+	doneCh := make(chan struct{}, 1)
+	txn := &applyTxn{
+		order: 1,
+		payload: &applyTxnPayload{
+			pos:       pos1,
+			timestamp: 100,
+			query: func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+				return workerClient.Execute(sql)
+			},
+			commit:             workerClient.Commit,
+			client:             workerClient,
+			lastEventTimestamp: 100,
+		},
+		done: doneCh,
+	}
+
+	commitCh := make(chan *applyTxn, 1)
+	commitCh <- txn
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.ErrorContains(t, err, posErr.Error())
+	assert.True(t, workerClient.InTransaction)
+	assert.NotContains(t, workerClient.queries, "commit")
+	select {
+	case <-doneCh:
+		require.Fail(t, "worker done signaled after failed position update")
+	default:
+	}
+
+	scheduler.mu.Lock()
+	defer scheduler.mu.Unlock()
+	assert.Zero(t, scheduler.lastCommittedOrder)
+	assert.Zero(t, scheduler.lastCommittedSequence)
+	assert.Zero(t, scheduler.inflightGlobal)
+	assert.Zero(t, scheduler.inflightMissingMeta)
+	assert.Zero(t, scheduler.inflightCommitMeta)
+}
+
+// TestCommitLoop_WorkerCommitFailureKeepsTransactionOpen pins the failure
+// path where the position update succeeds but the COMMIT itself fails: the
+// commitLoop must return the error, the vdbClient must still consider the
+// transaction open (a failed COMMIT leaves the server-side state unknown),
+// the worker must not be signaled to reuse the connection, and the scheduler
+// must not record the txn as committed.
+func TestCommitLoop_WorkerCommitFailureKeepsTransactionOpen(t *testing.T) {
+	ctx := testCtx(t)
+	vp, _ := testVPlayer(t)
+	scheduler := newApplyScheduler(ctx)
+
+	commitErr := errors.New("commit failed")
+	workerClient := newVDBClient(&failingCommitDBClient{commitErr: commitErr}, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+	require.NoError(t, workerClient.Begin())
+	t.Cleanup(func() {
+		_ = workerClient.Rollback()
+	})
+
+	pos1, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+
+	doneCh := make(chan struct{}, 1)
+	txn := &applyTxn{
+		order: 1,
+		payload: &applyTxnPayload{
+			pos:       pos1,
+			timestamp: 100,
+			query: func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+				return workerClient.Execute(sql)
+			},
+			commit:             workerClient.Commit,
+			client:             workerClient,
+			lastEventTimestamp: 100,
+		},
+		done: doneCh,
+	}
+
+	commitCh := make(chan *applyTxn, 1)
+	commitCh <- txn
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.ErrorContains(t, err, commitErr.Error())
+	assert.True(t, workerClient.InTransaction)
+	select {
+	case <-doneCh:
+		require.Fail(t, "worker done signaled after failed commit")
+	default:
+	}
+
+	scheduler.mu.Lock()
+	defer scheduler.mu.Unlock()
+	assert.Zero(t, scheduler.lastCommittedOrder)
+	assert.Zero(t, scheduler.lastCommittedSequence)
+	assert.Zero(t, scheduler.inflightGlobal)
+	assert.Zero(t, scheduler.inflightMissingMeta)
+	assert.Zero(t, scheduler.inflightCommitMeta)
+}
+
+// TestCommitLoop_CommitOnlyEOFStillMarksCommitted enqueues a forceGlobal DDL
+// transaction with OnDdl=STOP at the stop position, hands it to commitLoop,
+// and expects io.EOF. The scheduler must nevertheless record sequence 7 and
+// order 1 as committed and report no inflight transactions.
+func TestCommitLoop_CommitOnlyEOFStillMarksCommitted(t *testing.T) {
+	ctx := testCtx(t)
+	vp, mockDB := testVPlayer(t)
+	scheduler := newApplyScheduler(ctx)
+
+	mockDB.AddInvariant("update _vt.vreplication set state=", &sqltypes.Result{})
+	mockDB.AddInvariant("update _vt.vreplication set pos=", &sqltypes.Result{})
+	mockDB.AddInvariant("commit", &sqltypes.Result{})
+	mockDB.AddInvariant("begin", &sqltypes.Result{})
+
+	vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_STOP
+
+	pos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+	vp.pos = pos
+	vp.stopPos = pos
+
+	txn := &applyTxn{
+		order:          1,
+		forceGlobal:    true,
+		hasCommitMeta:  true,
+		sequenceNumber: 7,
+		payload: &applyTxnPayload{
+			pos:           pos,
+			timestamp:     100,
+			commitOnly:    true,
+			updatePosOnly: false,
+			events: []*binlogdatapb.VEvent{{
+				Type:      binlogdatapb.VEventType_DDL,
+				Statement: "alter table t1 add column c1 int",
+				Timestamp: 100,
+			}},
+			lastEventTimestamp: 100,
+		},
+		done: make(chan struct{}),
+	}
+	// Go through enqueue/nextReady so the scheduler tracks the txn as inflight
+	// before commitLoop sees it.
+	require.NoError(t, scheduler.enqueue(txn))
+
+	ready, err := scheduler.nextReady(ctx)
+	require.NoError(t, err)
+	require.Same(t, txn, ready)
+
+	commitCh := make(chan *applyTxn, 1)
+	commitCh <- ready
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.ErrorIs(t, err, io.EOF)
+
+	scheduler.mu.Lock()
+	defer scheduler.mu.Unlock()
+	assert.Equal(t, int64(7), scheduler.lastCommittedSequence)
+	assert.Equal(t, int64(1), scheduler.lastCommittedOrder)
+	assert.Zero(t, scheduler.inflightGlobal)
+	assert.Zero(t, scheduler.inflightMissingMeta)
+	assert.Zero(t, scheduler.inflightCommitMeta)
+}
+
+// TestCommitLoop_EXECIGNOREIdempotentDropForeignKeyRefreshesFKMetadata runs a
+// "drop foreign key" DDL under OnDdl=EXEC_IGNORE whose execution returns
+// ERCantDropFieldOrKey. commitLoop must succeed, the mocked
+// information_schema lookup (returning no rows) must have been issued, and
+// vp.fkRefs/vp.parentFKRefs must be nil afterwards. A following row
+// transaction on "child" must then be scheduled without forceGlobal.
+func TestCommitLoop_EXECIGNOREIdempotentDropForeignKeyRefreshesFKMetadata(t *testing.T) {
+	ctx := testCtx(t)
+	vp, mockDB := testVPlayer(t)
+	scheduler := newApplyScheduler(ctx)
+	vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC_IGNORE
+	vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+	oldFKRefs := map[string][]fkConstraintRef{
+		"child": {{ParentTable: "parent", ChildColumnNames: []string{"parent_id"}, ReferencedColumnNames: []string{"id"}}},
+	}
+	vp.fkRefs = oldFKRefs
+	vp.parentFKRefs = buildParentFKRefs(oldFKRefs)
+
+	mockDB.RemoveInvariant("information_schema.key_column_usage")
+	mockDB.ExpectRequestRE("update _vt\\.vreplication set pos='MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5', time_updated=.*", &sqltypes.Result{}, nil)
+	mockDB.ExpectRequestRE(
+		"SELECT kcu\\.TABLE_NAME, kcu\\.CONSTRAINT_NAME, kcu\\.COLUMN_NAME, kcu\\.REFERENCED_TABLE_NAME, kcu\\.REFERENCED_COLUMN_NAME, .* FROM information_schema\\.KEY_COLUMN_USAGE kcu JOIN information_schema\\.COLUMNS child_cols .* JOIN information_schema\\.COLUMNS parent_cols .* WHERE kcu\\.TABLE_SCHEMA = 'db' AND kcu\\.REFERENCED_TABLE_NAME IS NOT NULL ORDER BY kcu\\.TABLE_NAME, kcu\\.CONSTRAINT_NAME, kcu\\.ORDINAL_POSITION",
+		&sqltypes.Result{},
+		nil,
+	)
+	// Only the DDL itself fails; every other query goes to the mock client.
+	vp.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+		if sql == "alter table child drop foreign key fk_child_parent" {
+			return nil, sqlerror.NewSQLErrorf(sqlerror.ERCantDropFieldOrKey, sqlerror.SSBadFieldError, "Can't DROP 'fk_child_parent'; check that column/key exists")
+		}
+		return vp.vr.dbClient.Execute(sql)
+	}
+
+	pos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+
+	txn := &applyTxn{
+		order:       1,
+		forceGlobal: true,
+		payload: &applyTxnPayload{
+			pos:           pos,
+			timestamp:     100,
+			commitOnly:    true,
+			updatePosOnly: false,
+			events: []*binlogdatapb.VEvent{{
+				Type:      binlogdatapb.VEventType_DDL,
+				Statement: "alter table child drop foreign key fk_child_parent",
+				Timestamp: 100,
+			}},
+			lastEventTimestamp: 100,
+		},
+		done: make(chan struct{}),
+	}
+
+	commitCh := make(chan *applyTxn, 1)
+	commitCh <- txn
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.NoError(t, err)
+	assert.Nil(t, vp.fkRefs)
+	assert.Nil(t, vp.parentFKRefs)
+	mockDB.Wait()
+
+	state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+	vp.tablePlans["child"] = &TablePlan{
+		TargetName: "child",
+		Fields: []*querypb.Field{
+			{Name: "id", Type: querypb.Type_INT64},
+			{Name: "parent_id", Type: querypb.Type_INT64},
+		},
+		PKIndices: []bool{true, false},
+	}
+	vp.tablePlansVersion.Store(1)
+
+	items := [][]*binlogdatapb.VEvent{{
+		{Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+		{Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+			TableName:  "child",
+			RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("112"), Lengths: []int64{1, 2}}}},
+		}, Timestamp: 200},
+		{Type: binlogdatapb.VEventType_COMMIT},
+	}}
+
+	require.NoError(t, vp.scheduleItems(ctx, scheduler, state, items))
+	got, err := scheduler.nextReady(ctx)
+	require.NoError(t, err)
+	assert.Len(t, got.payload.events, 1)
+	assert.False(t, got.forceGlobal)
+}
+
+// TestCommitLoop_EXECIGNOREIdempotentAddUniqueIndexInvalidatesUniqueSecondaryCache
+// runs an "add unique key" DDL under OnDdl=EXEC_IGNORE against a real table
+// that already has that key, with the DDL execution returning ERDupKeyName.
+// commitLoop must succeed and list the table in vp.postDDLStalePlans. After
+// a FIELD event is applied, the rebuilt table plan must report
+// UniqueKeyColumns [["email"]] and HasExtraUniqueSecondary == false.
+func TestCommitLoop_EXECIGNOREIdempotentAddUniqueIndexInvalidatesUniqueSecondaryCache(t *testing.T) {
+	ctx := testCtx(t)
+	vp, _ := testVPlayer(t)
+	scheduler := newApplyScheduler(ctx)
+	vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC_IGNORE
+	vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+	vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+
+	tableName := "parallel_apply_execignore_idempotent_add_unique_idx"
+	qualifiedTableName := vrepldb + "." + tableName
+	execStatements(t, []string{
+		"create table " + qualifiedTableName + " (id int not null, email varchar(128) not null, primary key(id), unique key uk_email(email))",
+	})
+	t.Cleanup(func() {
+		execStatements(t, []string{"drop table if exists " + qualifiedTableName})
+	})
+
+	realDB := &realDBClient{nolog: true}
+	require.NoError(t, realDB.Connect())
+	t.Cleanup(realDB.Close)
+
+	vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+	vp.dbClient = vp.vr.dbClient
+	vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld}
+	vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}}
+
+	colInfoMap, err := vp.vr.buildColInfoMap(ctx)
+	require.NoError(t, err)
+	vp.vr.colInfoMap = colInfoMap
+
+	vp.replicatorPlan, err = vp.vr.buildReplicatorPlan(
+		vp.vr.source,
+		vp.vr.colInfoMap,
+		nil,
+		vp.vr.stats,
+		vp.vr.vre.env.CollationEnv(),
+		vp.vr.vre.env.Parser(),
+	)
+	require.NoError(t, err)
+
+	// Seed a cached plan that has no unique-key columns recorded.
+	stalePlan := &TablePlan{
+		TargetName:              tableName,
+		Fields:                  []*querypb.Field{{Name: "id", Type: querypb.Type_INT32}, {Name: "email", Type: querypb.Type_VARCHAR}},
+		PKIndices:               []bool{true, false},
+		IdentityColumns:         []string{"id"},
+		HasExtraUniqueSecondary: false,
+	}
+	vp.tablePlans[tableName] = stalePlan
+	vp.tablePlansVersion.Store(1)
+
+	// Only the DDL itself fails; every other query goes to the real database.
+	vp.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+		if sql == "alter table "+tableName+" add unique key uk_email(email)" {
+			return nil, sqlerror.NewSQLErrorf(sqlerror.ERDupKeyName, sqlerror.SSAccessDeniedError, "Duplicate key name 'uk_email'")
+		}
+		return vp.vr.dbClient.Execute(sql)
+	}
+
+	pos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+
+	txn := &applyTxn{
+		order:       1,
+		forceGlobal: true,
+		payload: &applyTxnPayload{
+			pos:           pos,
+			timestamp:     100,
+			commitOnly:    true,
+			updatePosOnly: false,
+			events: []*binlogdatapb.VEvent{{
+				Type:      binlogdatapb.VEventType_DDL,
+				Statement: "alter table " + tableName + " add unique key uk_email(email)",
+				Timestamp: 100,
+			}},
+			lastEventTimestamp: 100,
+		},
+		done: make(chan struct{}),
+	}
+	require.NoError(t, scheduler.enqueue(txn))
+
+	ready, err := scheduler.nextReady(ctx)
+	require.NoError(t, err)
+	require.Same(t, txn, ready)
+
+	commitCh := make(chan *applyTxn, 1)
+	commitCh <- ready
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.NoError(t, err)
+	require.NotNil(t, vp.postDDLStalePlans)
+	require.Contains(t, vp.postDDLStalePlans, tableName)
+
+	require.NoError(t, vp.applyEvent(ctx, &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_FIELD,
+		FieldEvent: &binlogdatapb.FieldEvent{
+			TableName: tableName,
+			Fields: []*querypb.Field{
+				{Name: "id", Type: querypb.Type_INT32},
+				{Name: "email", Type: querypb.Type_VARCHAR},
+			},
+		},
+	}, false))
+	require.NoError(t, vp.dbClient.Rollback())
+
+	// The idempotent EXEC_IGNORE add-unique-index barrier invalidated the
+	// cached plan, so the FIELD handler re-ran the unique-key analysis: the
+	// plain unique secondary emits a writeset unique key.
+	require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary)
+	require.Equal(t, [][]string{{"email"}}, vp.tablePlans[tableName].UniqueKeyColumns)
+}
+
+// TestCommitLoop_WorkerTxnCommitProtocol drives a single worker transaction
+// through the real commitLoop and pins the commit protocol end-to-end: the
+// position update and COMMIT run on the worker's connection, the worker's
+// done channel is signaled, and the scheduler observes the committed order.
+func TestCommitLoop_WorkerTxnCommitProtocol(t *testing.T) { + ctx := testCtx(t) + vp, _ := testVPlayer(t) + scheduler := newApplyScheduler(ctx) + + recording := &recordingDBClient{} + workerClient := newVDBClient(recording, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + require.NoError(t, workerClient.Begin()) + + pos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5") + require.NoError(t, err) + + doneCh := make(chan struct{}, 1) + txn := &applyTxn{ + order: 1, + payload: &applyTxnPayload{ + pos: pos, + timestamp: 100, + query: func(ctx context.Context, sql string) (*sqltypes.Result, error) { + return workerClient.Execute(sql) + }, + commit: workerClient.Commit, + client: workerClient, + lastEventTimestamp: 100, + }, + done: doneCh, + } + + commitCh := make(chan *applyTxn, 1) + commitCh <- txn + close(commitCh) + + require.NoError(t, vp.commitLoop(ctx, scheduler, commitCh)) + + require.NotEmpty(t, recording.queries) + assert.Contains(t, recording.queries[0], "update _vt.vreplication set pos=") + assert.False(t, workerClient.InTransaction, "commitLoop must commit the worker's transaction") + select { + case <-doneCh: + default: + t.Fatal("commitLoop must signal the worker's done channel after committing") + } + scheduler.mu.Lock() + defer scheduler.mu.Unlock() + assert.Equal(t, int64(1), scheduler.lastCommittedOrder) +} + +// TestCommitLoop_WorkerStopPosSetsStateAndStops pins the stop-position path +// of the worker commit protocol: when the transaction's position reaches +// stopPos, the Stopped state update is written on the worker's connection +// (inside the same MySQL transaction as the position update), the +// transaction commits, the worker is unblocked, and commitLoop returns +// io.EOF to stop the stream. 
+func TestCommitLoop_WorkerStopPosSetsStateAndStops(t *testing.T) { + ctx := testCtx(t) + vp, _ := testVPlayer(t) + scheduler := newApplyScheduler(ctx) + + pos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5") + require.NoError(t, err) + vp.stopPos = pos + vp.saveStop = true + + recording := &recordingDBClient{} + workerClient := newVDBClient(recording, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + require.NoError(t, workerClient.Begin()) + + doneCh := make(chan struct{}, 1) + txn := &applyTxn{ + order: 1, + payload: &applyTxnPayload{ + pos: pos, + timestamp: 100, + query: func(ctx context.Context, sql string) (*sqltypes.Result, error) { + return workerClient.Execute(sql) + }, + commit: workerClient.Commit, + client: workerClient, + lastEventTimestamp: 100, + }, + done: doneCh, + } + + commitCh := make(chan *applyTxn, 1) + commitCh <- txn + close(commitCh) + + err = vp.commitLoop(ctx, scheduler, commitCh) + require.ErrorIs(t, err, io.EOF, "reaching the stop position must stop the stream") + + assert.False(t, workerClient.InTransaction, "the worker's transaction must be committed") + var sawPosUpdate, sawStateUpdate bool + for _, q := range recording.queries { + if strings.Contains(q, "update _vt.vreplication set pos=") { + sawPosUpdate = true + } + if strings.Contains(q, "update _vt.vreplication set state=") { + sawStateUpdate = true + } + } + assert.True(t, sawPosUpdate, "position update must run on the worker's connection") + assert.True(t, sawStateUpdate, "the Stopped state update must run on the worker's connection so it commits atomically with the position") + select { + case <-doneCh: + default: + t.Fatal("commitLoop must signal the worker's done channel after committing") + } +} + +// TestSetState_BatchedTransactionExecutesImmediatelyWithoutReplay pins the +// mid-batch setState contract: the pending batch is flushed first (inside +// the same open MySQL transaction, preserving stop-path atomicity), the +// state 
UPDATE and insertLog statements execute immediately, and the batch +// buffer is marked flushed so the later CommitTrxQueryBatch sends only +// "commit" — replaying nothing. Deferring the state UPDATE into the batch +// (the previous design) double-executed insertLog's SELECT/INSERT on +// replay, duplicating vreplication_log rows. +func TestSetState_BatchedTransactionExecutesImmediatelyWithoutReplay(t *testing.T) { + vp, _ := testVPlayer(t) + recording := &recordingDBClient{} + + vp.vr.dbClient = newVDBClient(recording, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + vp.vr.storeState(binlogdatapb.VReplicationWorkflowState_Stopped) + vp.vr.dbClient.maxBatchSize = 1024 + + require.NoError(t, vp.vr.dbClient.Begin()) + require.NoError(t, vp.vr.setState(binlogdatapb.VReplicationWorkflowState_Stopped, "")) + // The state UPDATE executed on the wire immediately (after the batch + // flush), within the open transaction. + require.NotEmpty(t, recording.queries) + sawStateUpdate := 0 + for _, q := range recording.queries { + if strings.Contains(q, "update _vt.vreplication set state='Stopped'") { + sawStateUpdate++ + } + } + require.Equal(t, 1, sawStateUpdate) + + // The later batch commit must replay nothing: only "commit" goes out. + preCommit := len(recording.queries) + require.NoError(t, vp.vr.dbClient.CommitTrxQueryBatch()) + require.Equal(t, preCommit+1, len(recording.queries)) + assert.Equal(t, "commit", recording.queries[len(recording.queries)-1]) + // Still exactly one state UPDATE — no double execution. 
+ sawStateUpdate = 0 + for _, q := range recording.queries { + if strings.Contains(q, "update _vt.vreplication set state='Stopped'") { + sawStateUpdate++ + } + } + require.Equal(t, 1, sawStateUpdate) +} + +// TestSetStateImmediate_BatchedTransactionDoesNotDuplicateWrites exercises +// the worker batch-mode stop-position pattern at parallel_apply.go: the +// caller buffers the position update with AddQueryToTrxBatch, then flushes +// the batch with ExecuteTrxQueryBatch so the upcoming immediate writes share +// the same MySQL transaction, runs setStateWithDBClientImmediate to emit +// the state UPDATE and vreplication_log INSERT via ExecuteFetch, marks +// those already-executed queries as flushed, and finally calls +// CommitTrxQueryBatch which must send only ";commit" — not a replay of +// every prior query. Skipping any step in this dance doubles the +// vreplication_log row and (in the previous fix) broke atomicity with the +// position save by implicit-committing the active transaction via a +// nested BEGIN. +func TestSetStateImmediate_BatchedTransactionDoesNotDuplicateWrites(t *testing.T) { + vp, _ := testVPlayer(t) + recording := &recordingDBClient{} + + dbClient := newVDBClient(recording, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + vp.vr.dbClient = dbClient + vp.vr.storeState(binlogdatapb.VReplicationWorkflowState_Running) + dbClient.maxBatchSize = 1024 + + require.NoError(t, dbClient.Begin()) + require.NoError(t, dbClient.AddQueryToTrxBatch( + "update _vt.vreplication set pos='MySQL56/x:1-5', time_updated=1 where id=1")) + require.NoError(t, vp.vr.setStateWithDBClientImmediate( + dbClient, binlogdatapb.VReplicationWorkflowState_Stopped, "Stopped at position foo")) + require.NoError(t, dbClient.CommitTrxQueryBatch()) + + joined := strings.Join(recording.queries, ";") + assert.Equal(t, 1, strings.Count(joined, "update _vt.vreplication set state='Stopped'"), + "state UPDATE must be sent exactly once. 
Queries: %v", recording.queries) + assert.Equal(t, 1, strings.Count(joined, "insert into _vt.vreplication_log"), + "vreplication_log INSERT must be sent exactly once. Queries: %v", recording.queries) + assert.Equal(t, 1, strings.Count(joined, "update _vt.vreplication set pos="), + "position UPDATE must be sent exactly once. Queries: %v", recording.queries) +} + +// TestSetStateImmediate_FollowedByAddQueryToTrxBatchPreservesNoDuplicate +// covers the failure mode the immediate path was designed to defeat: +// after setStateWithDBClientImmediate flushes pre-batched queries and +// runs the state UPDATE on the wire, queriesPos is advanced. A subsequent +// AddQueryToTrxBatch (the natural pattern: immediate write, then more +// batched work, then commit) must NOT replay the immediate queries when +// CommitTrxQueryBatch sends queries[queriesPos:]. If a future regression +// removed the markTrxBatchedQueriesFlushed call inside +// setStateWithDBClient, this test would catch it because the state UPDATE +// would appear twice in the recorded queries. +func TestSetStateImmediate_FollowedByAddQueryToTrxBatchPreservesNoDuplicate(t *testing.T) { + vp, _ := testVPlayer(t) + recording := &recordingDBClient{} + + dbClient := newVDBClient(recording, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + vp.vr.dbClient = dbClient + vp.vr.storeState(binlogdatapb.VReplicationWorkflowState_Running) + dbClient.maxBatchSize = 1024 + + require.NoError(t, dbClient.Begin()) + require.NoError(t, dbClient.AddQueryToTrxBatch( + "update _vt.vreplication set pos='MySQL56/x:1-5', time_updated=1 where id=1")) + require.NoError(t, vp.vr.setStateWithDBClientImmediate( + dbClient, binlogdatapb.VReplicationWorkflowState_Stopped, "Stopped at position foo")) + // Future-style follow-up batched work after the immediate write. 
+ require.NoError(t, dbClient.AddQueryToTrxBatch( + "insert into _vt.vreplication_log(vrepl_id, type, message) values (1, 'Note', 'after')")) + require.NoError(t, dbClient.CommitTrxQueryBatch()) + + joined := strings.Join(recording.queries, ";") + assert.Equal(t, 1, strings.Count(joined, "update _vt.vreplication set state='Stopped'"), + "state UPDATE must be sent exactly once even with later batched work. Queries: %v", recording.queries) + // We expect 2 vreplication_log inserts: one from the immediate setState + // (LogStateChange) and one from the follow-up AddQueryToTrxBatch. + assert.Equal(t, 2, strings.Count(joined, "insert into _vt.vreplication_log"), + "each vreplication_log INSERT must be sent exactly once. Queries: %v", recording.queries) +} + +// TestBeginImmediate_AdvancesQueriesPosPastBeginSeed pins that BeginImmediate +// leaves vc.queriesPos past the synthetic "begin" entry it seeds. BEGIN was +// already sent on the wire by BeginImmediate; the buffer entry only exists so +// Retry's replay loop calls vc.Begin() instead of ExecuteFetch("begin"). Any +// subsequent ExecuteTrxQueryBatch / CommitTrxQueryBatch must not include it +// in its multi-statement, or the nested BEGIN would implicit-commit the +// active transaction and break atomicity with whatever the caller has done +// so far. 
+func TestBeginImmediate_AdvancesQueriesPosPastBeginSeed(t *testing.T) { + vp, _ := testVPlayer(t) + recording := &recordingDBClient{} + dbClient := newVDBClient(recording, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + dbClient.maxBatchSize = 1024 + + require.NoError(t, dbClient.BeginImmediate()) + assert.Equal(t, []string{"begin"}, dbClient.queries) + assert.Equal(t, int64(1), dbClient.queriesPos) +} + +// ---------- enqueueCommitOnly tests ---------- + +func TestEnqueueCommitOnly(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + + // Set up a known position + pos1, _ := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5") + vp.serialMu.Lock() + vp.pos = pos1 + vp.serialMu.Unlock() + + event := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_COMMIT, + Timestamp: 200, + } + + err := vp.enqueueCommitOnly(ctx, scheduler, event, true, true, 0, 0, false) + require.NoError(t, err) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + require.NotNil(t, got) + assert.True(t, got.forceGlobal) + assert.True(t, got.noConflict) + assert.True(t, got.payload.commitOnly) + assert.True(t, got.payload.updatePosOnly) + assert.True(t, got.payload.mustSave) + assert.Equal(t, int64(200), got.payload.timestamp) +} + +func TestEnqueueCommitOnly_NotUpdatePosOnly(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + + pos1, _ := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5") + vp.serialMu.Lock() + vp.pos = pos1 + vp.serialMu.Unlock() + + event := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_COMMIT, + Timestamp: 200, + SequenceNumber: 5, + CommitParent: 4, + } + + err := vp.enqueueCommitOnly(ctx, scheduler, event, false, false, event.SequenceNumber, event.CommitParent, true) + require.NoError(t, err) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.True(t, 
got.forceGlobal) + assert.False(t, got.noConflict) // updatePosOnly=false → noConflict=false + assert.False(t, got.payload.mustSave) + assert.False(t, got.payload.updatePosOnly) + assert.True(t, got.hasCommitMeta) // SequenceNumber=5 + assert.Equal(t, int64(5), got.sequenceNumber) +} + +func TestEnqueueCommitOnly_IncrementsOrder(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + + event := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT, Timestamp: 100} + + require.NoError(t, vp.enqueueCommitOnly(ctx, scheduler, event, true, true, 0, 0, false)) + require.NoError(t, vp.enqueueCommitOnly(ctx, scheduler, event, true, true, 0, 0, false)) + + got1, err := scheduler.nextReady(ctx) + require.NoError(t, err) + got2, err := scheduler.nextReady(ctx) + require.NoError(t, err) + + assert.Equal(t, int64(1), got1.order) + assert.Equal(t, int64(2), got2.order) +} + +// ---------- workerLoop tests ---------- + +func TestWorkerLoop_CommitOnlyBypassesApply(t *testing.T) { + vp, _ := testVPlayer(t) + ctx, cancel := context.WithCancel(t.Context()) + defer cancel() + scheduler := newApplyScheduler(ctx) + commitCh := make(chan *applyTxn, 1) + + pos1, _ := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5") + + // Enqueue a commitOnly transaction + payload := &applyTxnPayload{ + pos: pos1, + commitOnly: true, + timestamp: 100, + lastEventTimestamp: 100, + } + txn := &applyTxn{ + order: 1, + payload: payload, + } + require.NoError(t, scheduler.enqueue(txn)) + + // The worker carries no connections since commitOnly doesn't need them + worker := &applyWorker{ctx: ctx} + + // Run workerLoop in background + doneCh := make(chan error, 1) + go func() { + doneCh <- vp.workerLoop(ctx, scheduler, commitCh, worker) + }() + + // Should forward to commitCh + assert.Eventually(t, func() bool { + return len(commitCh) > 0 + }, 200*time.Millisecond, 5*time.Millisecond) + + got := <-commitCh + assert.Equal(t, txn, got) + + // Cancel to stop 
worker loop + cancel() + + assert.Eventually(t, func() bool { + return len(doneCh) > 0 + }, 200*time.Millisecond, 5*time.Millisecond) +} + +func TestWorkerLoop_AppliesAndDispatches(t *testing.T) { + ctx, cancel := context.WithCancel(testCtx(t)) + defer cancel() + + vp, _ := testVPlayer(t) + scheduler := newApplyScheduler(ctx) + commitCh := make(chan *applyTxn, 1) + + worker := &applyWorker{ + ctx: ctx, + query: func(ctx context.Context, sql string) (*sqltypes.Result, error) { + return &sqltypes.Result{}, nil + }, + commit: func() error { + return nil + }, + } + activeClient := newVDBClient(&recordingDBClient{}, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + worker.client = activeClient + + event := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"} + payload := &applyTxnPayload{events: []*binlogdatapb.VEvent{event}} + gotTxn := &applyTxn{order: 1, payload: payload} + + require.NoError(t, scheduler.enqueue(gotTxn)) + + errCh := make(chan error, 1) + go func() { + errCh <- vp.workerLoop(ctx, scheduler, commitCh, worker) + }() + + select { + case txn := <-commitCh: + require.NotNil(t, txn) + assert.NotNil(t, txn.payload.query) + assert.NotNil(t, txn.payload.commit) + assert.Same(t, activeClient, txn.payload.client) + case <-time.After(200 * time.Millisecond): + t.Fatal("timed out waiting for commitCh") + } + + cancel() + + select { + case err := <-errCh: + require.ErrorIs(t, err, context.Canceled) + case <-time.After(200 * time.Millisecond): + t.Fatal("timed out waiting for workerLoop exit") + } +} + +func TestWorkerLoop_ErrorRollsBack(t *testing.T) { + ctx, cancel := context.WithCancel(testCtx(t)) + defer cancel() + + vp, _ := testVPlayer(t) + scheduler := newApplyScheduler(ctx) + commitCh := make(chan *applyTxn, 1) + + mockDB := binlogplayer.NewMockDBClient(t) + mockDB.AddInvariant("rollback", &sqltypes.Result{}) + + worker := &applyWorker{ + ctx: ctx, + client: newVDBClient(mockDB, vp.vr.stats, 
vp.vr.workflowConfig.RelayLogMaxItems), + } + + badEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "invalid"} + payload := &applyTxnPayload{events: []*binlogdatapb.VEvent{badEvent}} + gotTxn := &applyTxn{order: 1, payload: payload} + + require.NoError(t, scheduler.enqueue(gotTxn)) + + errCh := make(chan error, 1) + go func() { + errCh <- vp.workerLoop(ctx, scheduler, commitCh, worker) + }() + + select { + case err := <-errCh: + require.Error(t, err) + case <-time.After(200 * time.Millisecond): + t.Fatal("timed out waiting for workerLoop error") + } +} + +// ---------- Batch time bound test ---------- + +func TestScheduleItems_BatchTimeBoundForcesSave(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + + // Set lastFlushTime to long ago to trigger the 500ms time bound + state := &parallelScheduleState{ + lastFlushTime: time.Now().Add(-1 * time.Second), + lastHeartbeatRefresh: time.Now(), + } + + vp.tablePlans["t1"] = &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + vp.tablePlansVersion.Store(1) + + // Two transactions in same batch — but time bound should force flush + items := [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, Timestamp: 100}, + {Type: binlogdatapb.VEventType_COMMIT}, + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"}, + {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("2"), Lengths: []int64{1}}}}, + }, Timestamp: 200}, + {Type: binlogdatapb.VEventType_COMMIT}, + }} + 
+ err := vp.scheduleItems(ctx, scheduler, state, items) + require.NoError(t, err) + + // Time bound forced a flush — should have 2 separate transactions + got1, err := scheduler.nextReady(ctx) + require.NoError(t, err) + got2, err := scheduler.nextReady(ctx) + require.NoError(t, err) + + assert.Equal(t, int64(1), got1.order) + assert.Equal(t, int64(2), got2.order) +} + +// ---------- Empty txn with stop position enqueues commitOnly ---------- + +func TestScheduleItems_EmptyTxnWithStopPos(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + stopPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5") + require.NoError(t, err) + vp.stopPos = stopPos + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-10", + } + commitEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_COMMIT, + Timestamp: 300, + } + + items := [][]*binlogdatapb.VEvent{{gtidEvent, commitEvent}} + err = vp.scheduleItems(ctx, scheduler, state, items) + require.ErrorIs(t, err, io.EOF) + + // Empty txn at/past stop pos → enqueueCommitOnly should fire + got, gerr := scheduler.nextReady(ctx) + require.NoError(t, gerr) + require.NotNil(t, got) + assert.True(t, got.forceGlobal) + assert.True(t, got.payload.commitOnly) +} + +// ---------- JOURNAL is ForceGlobal ---------- + +func TestScheduleItems_JOURNALIsForceGlobal(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.replicatorPlan = &ReplicatorPlan{TablePlans: map[string]*TablePlan{ + "t1": {TargetName: "t1"}, + }} + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + } 
+ journalEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_JOURNAL, + Timestamp: 200, + Journal: &binlogdatapb.Journal{ + MigrationType: binlogdatapb.MigrationType_TABLES, + Tables: []string{"t1"}, + }, + } + + items := [][]*binlogdatapb.VEvent{{gtidEvent, journalEvent}} + err := vp.scheduleItems(ctx, scheduler, state, items) + require.ErrorIs(t, err, io.EOF) + + got, err := scheduler.nextReady(ctx) + require.NoError(t, err) + assert.True(t, got.forceGlobal) + assert.True(t, got.payload.commitOnly) +} + +func TestScheduleItems_RelevantJournalStopsSchedulingLaterEventsInSameFetch(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.replicatorPlan = &ReplicatorPlan{TablePlans: map[string]*TablePlan{ + "t1": {TargetName: "t1"}, + }} + + items := [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + { + Type: binlogdatapb.VEventType_JOURNAL, + Timestamp: 200, + Journal: &binlogdatapb.Journal{ + MigrationType: binlogdatapb.MigrationType_TABLES, + Tables: []string{"t1"}, + }, + }, + {Type: binlogdatapb.VEventType_GTID, Gtid: "invalid"}, + }} + + err := vp.scheduleItems(ctx, scheduler, state, items) + require.ErrorIs(t, err, io.EOF) + + scheduler.mu.Lock() + assert.Equal(t, 1, scheduler.pendingCount) + scheduler.mu.Unlock() + + got, gerr := scheduler.nextReady(ctx) + require.NoError(t, gerr) + require.NotNil(t, got) + assert.Equal(t, binlogdatapb.VEventType_JOURNAL, got.payload.events[0].Type) + assert.True(t, got.payload.commitOnly) +} + +func TestScheduleItems_StopDDLStopsSchedulingLaterEventsInSameFetch(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_STOP + 
+ items := [][]*binlogdatapb.VEvent{{ + {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}, + {Type: binlogdatapb.VEventType_DDL, Statement: "alter table t1 add column c1 int", Timestamp: 200}, + {Type: binlogdatapb.VEventType_GTID, Gtid: "invalid"}, + }} + + err := vp.scheduleItems(ctx, scheduler, state, items) + require.ErrorIs(t, err, io.EOF) + + scheduler.mu.Lock() + assert.Equal(t, 1, scheduler.pendingCount) + scheduler.mu.Unlock() + + got, gerr := scheduler.nextReady(ctx) + require.NoError(t, gerr) + require.NotNil(t, got) + assert.Equal(t, binlogdatapb.VEventType_DDL, got.payload.events[0].Type) + assert.True(t, got.payload.commitOnly) +} + +// ---------- DDL after accumulated ROW events flushes first ---------- + +func TestScheduleItems_DDLFlushesAccumulatedEvents(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := testCtx(t) + scheduler := newApplyScheduler(ctx) + state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()} + + vp.tablePlans["t1"] = &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + vp.tablePlansVersion.Store(1) + + gtidEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_GTID, + Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", + } + rowEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}}, + }, + Timestamp: 100, + } + ddlEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_DDL, + Timestamp: 200, + } + + items := [][]*binlogdatapb.VEvent{{gtidEvent, rowEvent, ddlEvent}} + err := vp.scheduleItems(ctx, scheduler, state, items) + require.NoError(t, err) + + // Should have 2 transactions: the flush of ROW events, then the DDL + got1, err := scheduler.nextReady(ctx) + require.NoError(t, err) + 
require.NoError(t, scheduler.markCommitted(got1)) + + got2, err := scheduler.nextReady(ctx) + require.NoError(t, err) + + // First is the row data flush + assert.Len(t, got1.payload.events, 1) + assert.Equal(t, binlogdatapb.VEventType_ROW, got1.payload.events[0].Type) + + // Second is the DDL (commitOnly, forceGlobal) + assert.True(t, got2.forceGlobal) + assert.Equal(t, binlogdatapb.VEventType_DDL, got2.payload.events[0].Type) +} + +// TestRecoverParallelApplyCatchesPanic verifies that the panic-recovery +// helper used by every parallel-applier goroutine turns a panic into a +// normal error routed through the supplied callback (which in production +// pushes onto the orchestrator's error channel and cancels ctx). Without +// this helper a panic in any worker would crash the entire vttablet. +func TestRecoverParallelApplyCatchesPanic(t *testing.T) { + t.Run("nil callback does not panic", func(t *testing.T) { + // Explicitly runs the helper with no callback supplied to ensure + // the nil-cb branch is safe. + func() { + defer recoverParallelApply("testGoroutine", nil) + panic("boom") + }() + }) + + t.Run("callback receives a wrapped error on panic", func(t *testing.T) { + var got error + func() { + defer recoverParallelApply("worker-1", func(err error) { got = err }) + panic("ouch") + }() + require.Error(t, got) + require.ErrorContains(t, got, "worker-1") + require.ErrorContains(t, got, "panicked") + }) + + t.Run("no panic means no callback invocation", func(t *testing.T) { + invoked := false + func() { + defer recoverParallelApply("happy", func(err error) { invoked = true }) + }() + require.False(t, invoked, "callback must not fire without a panic") + }) + + t.Run("runtime panic types are caught and surfaced", func(t *testing.T) { + var got error + func() { + defer recoverParallelApply("oob", func(err error) { got = err }) + // Force a runtime-panic path (slice index OOB) rather than an + // explicit panic() call to exercise Go's typed-panic surface. 
+ s := []int{1, 2, 3} + idx := len(s) + 1 // silence staticcheck's constant-OOB check + _ = s[idx] + }() + require.Error(t, got) + require.ErrorContains(t, got, "oob") + }) +} diff --git a/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_worker.go b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_worker.go new file mode 100644 index 00000000000..d7ad9e1937c --- /dev/null +++ b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_worker.go @@ -0,0 +1,257 @@ +/* +Copyright 2026 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package vreplication + +import ( + "context" + "errors" + "log/slog" + + "vitess.io/vitess/go/sqltypes" + "vitess.io/vitess/go/vt/binlog/binlogplayer" + "vitess.io/vitess/go/vt/log" + vttablet "vitess.io/vitess/go/vt/vttablet/common" + + binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" +) + +type applyWorker struct { + ctx context.Context + vr *vreplicator + // conns holds a pair of MySQL connections for double-buffering. While + // one connection is being committed by the commitLoop, the worker can + // immediately start applying the next transaction on the other. This + // decouples the worker's apply phase from the serial commitLoop, + // allowing true pipeline parallelism. + conns [2]*vdbClient + active int + // client points to conns[active] for convenience. Updated by rotate(). 
+ client *vdbClient + // batchMode indicates whether this worker buffers SQL statements and + // flushes them as a single multi-statement request. When true, the + // apply phase buffers INSERTs via AddQueryToTrxBatch (near-zero cost), + // then flushWorkerBatch sends them all to MySQL in one ExecuteFetchMulti + // call. This happens during the parallel apply phase, so all workers + // execute their multi-statement batches concurrently. The commitLoop + // then just does a quick COMMIT + position update. + batchMode bool + // query executes a SQL statement on this worker's active connection. + // Rebound by rotate() to use the new active connection. + query func(ctx context.Context, sql string) (*sqltypes.Result, error) + // commit commits the current transaction on this worker's active connection. + // Rebound by rotate() alongside query. + commit func() error +} + +// createWorkerConn creates a single configured vdbClient for a worker. +func createWorkerConn(ctx context.Context, vr *vreplicator) (*vdbClient, error) { + dbClient := vr.vre.dbClientFactoryFiltered() + if err := dbClient.Connect(); err != nil { + return nil, err + } + if err := setDBClientSettings(dbClient, vr.workflowConfig); err != nil { + dbClient.Close() + return nil, err + } + // Workers apply transactions concurrently. The writeset scheduler models + // PK/unique/FK conflicts, but it cannot model InnoDB gap/next-key locks, + // which REPEATABLE READ takes even for point operations on absent rows + // (e.g. DELETE of a row that does not exist, or delete-marking in a + // non-unique secondary index). A later-ordered transaction's gap lock can + // block an earlier-ordered transaction's INSERT while the commitLoop's + // strict ordering keeps that gap lock held until the earlier transaction + // commits — a deadlock InnoDB's detector cannot see because half the + // cycle lives in the commitLoop (MySQL's MTA has its Commit_order_manager + // for exactly this). 
READ COMMITTED takes no gap locks for row-image + // application and is MySQL's own recommendation for row-based parallel + // appliers. Statement-based events force-serialize, so RC cannot change + // their outcome either. + // + // Use the SQL-standard statement form rather than setting the + // transaction_isolation sysvar: this connection goes directly to the + // target mysqld (no vtgate sysvar compatibility layer), and the sysvar + // spelling is flavor-specific (MariaDB used tx_isolation until 11.1; + // MySQL only added transaction_isolation in 5.7.20). Keep it lowercase + // to match the other session-setup statements (set names, set @@session.*). + if _, err := dbClient.ExecuteFetch("set session transaction isolation level read committed", 1); err != nil { + dbClient.Close() + return nil, err + } + vdbc := newVDBClientWithID(dbClient, vr.stats, vr.workflowConfig.RelayLogMaxItems, vr.id) + if _, err := vr.setSQLMode(ctx, vdbc); err != nil { + dbClient.Close() + return nil, err + } + if err := vr.resetFKCheckAfterCopy(vdbc); err != nil { + dbClient.Close() + return nil, err + } + if err := vr.resetFKRestrictAfterCopy(vdbc); err != nil { + dbClient.Close() + return nil, err + } + return vdbc, nil +} + +// newApplyWorker constructs a worker with two DB connections so its apply +// phase can overlap with the commitLoop's commit phase: one connection +// handles the current txn while the other is ready for the next. In batch +// mode it also reads MySQL's max_allowed_packet to size the multi-statement +// flush so a worker's batched INSERTs cannot exceed the wire limit. +func newApplyWorker(ctx context.Context, vr *vreplicator) (*applyWorker, error) { + batchMode := vr.workflowConfig.ExperimentalFlags&vttablet.VReplicationExperimentalFlagVPlayerBatching != 0 + + var conns [2]*vdbClient + for i := range 2 { + vdbc, err := createWorkerConn(ctx, vr) + if err != nil { + // Close any previously created connections. 
+ for j := range i { + conns[j].Close() + } + return nil, err + } + conns[i] = vdbc + } + + if batchMode { + maxBatchSize := int64(vr.workflowConfig.RelayLogMaxSize) + res, err := conns[0].ExecuteFetch(SqlMaxAllowedPacket, 1) + if err != nil { + log.Error("Worker: error getting max_allowed_packet, will use relay-log-max-size value", slog.Int64("bytes", int64(vr.workflowConfig.RelayLogMaxSize)), slog.Any("error", err)) + } else { + if pkt, err := res.Rows[0][0].ToInt64(); err != nil { + log.Error("Worker: error getting max_allowed_packet, will use relay-log-max-size value", slog.Int64("bytes", int64(vr.workflowConfig.RelayLogMaxSize)), slog.Any("error", err)) + } else { + maxBatchSize = pkt + } + } + maxBatchSize -= 64 + for _, c := range conns { + c.maxBatchSize = maxBatchSize + } + } + + worker := &applyWorker{ + ctx: ctx, + vr: vr, + conns: conns, + active: 0, + client: conns[0], + batchMode: batchMode, + } + worker.bindFunctions() + return worker, nil +} + +// bindFunctions sets the query and commit closures to use the active connection. +func (w *applyWorker) bindFunctions() { + vdbc := w.client + if w.batchMode { + w.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) { + if !vdbc.InTransaction { + return vdbc.Execute(sql) + } + return nil, vdbc.AddQueryToTrxBatch(sql) + } + w.commit = func() error { + return vdbc.Commit() + } + } else { + w.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) { + return vdbc.ExecuteWithRetry(ctx, sql) + } + w.commit = func() error { + return vdbc.Commit() + } + } +} + +// rotate switches the worker to its spare connection for the next transaction. +// The commitLoop will continue committing the previous transaction on the old +// connection. This double-buffering allows the worker's apply phase to overlap +// with the commitLoop's commit phase, enabling true pipeline parallelism. 
+func (w *applyWorker) rotate() { + w.active = 1 - w.active + w.client = w.conns[w.active] + w.bindFunctions() +} + +// flushWorkerBatch sends all buffered SQL statements to MySQL in one +// multi-statement call via ExecuteTrxQueryBatch. This is called after +// the worker has finished applying all events for a transaction, moving +// the MySQL work into the parallel apply phase (before the serial +// commitLoop). If batch mode is disabled, this is a no-op. +func (w *applyWorker) flushWorkerBatch() error { + if !w.batchMode || w.client == nil { + return nil + } + _, err := w.client.ExecuteTrxQueryBatch() + return err +} + +// close releases both of the worker's DB connections, rolling back first if +// either is mid-transaction so no half-applied worker state leaks back into +// the pool. +func (w *applyWorker) close() { + for _, c := range w.conns { + if c != nil { + if c.InTransaction { + _ = c.Rollback() + } + c.Close() + } + } +} + +// rollback discards in-progress work on the worker's active connection after +// an apply error, so the next rotate() does not leave a stale partial txn +// hanging on the connection we are about to park. +func (w *applyWorker) rollback() { + if w.client != nil { + _ = w.client.Rollback() + } +} + +// applyEvent dispatches through the shared vplayer.applyEvent code path while +// temporarily rebinding vp.dbClient/query/commit to this worker's active +// connection. Bindings are restored on return so the orchestrator's vplayer +// (shared by the scheduler and commitLoop) never ends up pointing at +// worker-owned state. 
+func (w *applyWorker) applyEvent(ctx context.Context, event *binlogdatapb.VEvent, mustSave bool, vp *vplayer) error { + if w.client == nil { + return errors.New("apply worker has no active client") + } + prevLocal := vp.dbClient + prevQuery := vp.query + prevCommit := vp.commit + vp.query = w.query + vp.commit = w.commit + vp.dbClient = w.client + defer func() { + vp.dbClient = prevLocal + vp.query = prevQuery + vp.commit = prevCommit + }() + return vp.applyEvent(ctx, event, mustSave) +} + +// stats exposes the underlying vreplication stats so helpers that only hold +// an *applyWorker can record counters without reaching through w.vr. +func (w *applyWorker) stats() *binlogplayer.Stats { + return w.vr.stats +} diff --git a/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_worker_test.go b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_worker_test.go new file mode 100644 index 00000000000..6b529886f72 --- /dev/null +++ b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_worker_test.go @@ -0,0 +1,566 @@ +/* +Copyright 2026 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package vreplication + +import ( + "context" + "errors" + "fmt" + "strings" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/mysql/capabilities" + "vitess.io/vitess/go/sqltypes" + "vitess.io/vitess/go/vt/binlog/binlogplayer" + vttablet "vitess.io/vitess/go/vt/vttablet/common" + + binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" + querypb "vitess.io/vitess/go/vt/proto/query" +) + +type failingDBClient struct { + connectErr error + failOnQuery map[string]error + supportsCaps bool +} + +type recordingDBClient struct { + queries []string +} + +func (f *failingDBClient) DBName() string { return "db" } +func (f *failingDBClient) Connect() error { return f.connectErr } +func (f *failingDBClient) Begin() error { return nil } +func (f *failingDBClient) Commit() error { return nil } +func (f *failingDBClient) Rollback() error { return nil } +func (f *failingDBClient) Close() {} +func (f *failingDBClient) IsClosed() bool { return false } +func (f *failingDBClient) ExecuteFetch(query string, maxrows int) (*sqltypes.Result, error) { + for key, err := range f.failOnQuery { + if strings.Contains(query, key) { + return nil, err + } + } + if strings.Contains(query, getSQLModeQuery) { + return sqltypes.MakeTestResult( + sqltypes.MakeTestFields("sql_mode", "varchar"), + "STRICT_TRANS_TABLES,NO_ZERO_DATE,ANSI_QUOTES", + ), nil + } + if strings.Contains(query, "from _vt.vreplication where id=") { + return sqlModeWorkflowSettingsResult(binlogdatapb.VReplicationWorkflowType_MoveTables), nil + } + if strings.Contains(query, "from _vt.copy_state where vrepl_id=") { + return sqltypes.MakeTestResult( + sqltypes.MakeTestFields("count(distinct table_name)", "int64"), + "0", + ), nil + } + return &sqltypes.Result{}, nil +} + +func (f *failingDBClient) ExecuteFetchMulti(query string, maxrows int) ([]*sqltypes.Result, error) { + qr, err := f.ExecuteFetch(query, maxrows) + if err != nil { + return nil, err + } + 
return []*sqltypes.Result{qr}, nil +} + +func (f *failingDBClient) SupportsCapability(capability capabilities.FlavorCapability) (bool, error) { + return f.supportsCaps, nil +} + +func (r *recordingDBClient) DBName() string { return "db" } +func (r *recordingDBClient) Connect() error { return nil } +func (r *recordingDBClient) Begin() error { return nil } +func (r *recordingDBClient) Commit() error { return nil } +func (r *recordingDBClient) Rollback() error { return nil } +func (r *recordingDBClient) Close() {} +func (r *recordingDBClient) IsClosed() bool { return false } +func (r *recordingDBClient) ExecuteFetch(query string, maxrows int) (*sqltypes.Result, error) { + r.queries = append(r.queries, query) + return &sqltypes.Result{}, nil +} + +func (r *recordingDBClient) ExecuteFetchMulti(query string, maxrows int) ([]*sqltypes.Result, error) { + r.queries = append(r.queries, query) + return []*sqltypes.Result{{}}, nil +} + +func (r *recordingDBClient) SupportsCapability(capability capabilities.FlavorCapability) (bool, error) { + return false, nil +} + +func TestApplyWorkerCloseRollsBack(t *testing.T) { + worker := &applyWorker{} + assert.NotPanics(t, func() { + worker.close() + }) +} + +func TestApplyWorkerRollbackNoError(t *testing.T) { + worker := &applyWorker{} + assert.NotPanics(t, func() { + worker.rollback() + }) +} + +func TestApplyWorkerApplyEventRestoresVPlayer(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := t.Context() + + originalClient := vp.dbClient + vp.query = nil + vp.commit = nil + + altDB := binlogplayer.NewMockDBClient(t) + altClient := newVDBClient(altDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems) + + worker := &applyWorker{ctx: ctx, client: altClient} + worker.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) { + return &sqltypes.Result{}, nil + } + worker.commit = func() error { + return nil + } + + gtid := "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5" + event := &binlogdatapb.VEvent{Type: 
binlogdatapb.VEventType_GTID, Gtid: gtid} + + err := worker.applyEvent(ctx, event, false, vp) + require.NoError(t, err) + + expectedPos, err := binlogplayer.DecodePosition(gtid) + require.NoError(t, err) + assert.Equal(t, expectedPos.String(), vp.pos.String()) + + assert.Equal(t, originalClient, vp.dbClient) + assert.Nil(t, vp.query) + assert.Nil(t, vp.commit) +} + +func TestApplyWorkerApplyEventNilClientFailsFast(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := t.Context() + + initial := "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-1" + pos, err := binlogplayer.DecodePosition(initial) + require.NoError(t, err) + vp.pos = pos + + worker := &applyWorker{ctx: ctx} + event := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"} + + err = worker.applyEvent(ctx, event, false, vp) + require.ErrorContains(t, err, "apply worker has no active client") + assert.Equal(t, pos.String(), vp.pos.String()) +} + +func TestApplyWorkerApplyEventInsertStatementAcceptsMatchAllFilter(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := t.Context() + vp.canAcceptStmtEvents = true + + db := &recordingDBClient{} + worker := &applyWorker{ + ctx: ctx, + client: newVDBClient(db, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems), + } + worker.bindFunctions() + + event := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_INSERT, + Dml: "insert into t1(id) values (1)", + } + + workerVP := workerLocalVPlayer(vp) + err := worker.applyEvent(ctx, event, false, &workerVP) + require.NoError(t, err) + assert.Contains(t, db.queries, event.Dml) +} + +func TestApplyWorkerStatsReturnsVReplicatorStats(t *testing.T) { + vp, _ := testVPlayer(t) + worker := &applyWorker{vr: vp.vr} + + assert.Equal(t, vp.vr.stats, worker.stats()) +} + +func TestNewApplyWorker(t *testing.T) { + stats := binlogplayer.NewStats() + stats.VReplicationLagGauges.Stop() + t.Cleanup(stats.Stop) + + config := vttablet.InitVReplicationConfigDefaults() + + mockDB := 
binlogplayer.NewMockDBClient(t) + mockDB.AddInvariant("set @@session.time_zone", &sqltypes.Result{}) + mockDB.AddInvariant("set session transaction isolation level read committed", &sqltypes.Result{}) + mockDB.AddInvariant("set names 'binary'", &sqltypes.Result{}) + mockDB.AddInvariant("set @@session.net_read_timeout", &sqltypes.Result{}) + mockDB.AddInvariant("set @@session.net_write_timeout", &sqltypes.Result{}) + mockDB.AddInvariant("set @@session.sql_mode", &sqltypes.Result{}) + mockDB.AddInvariant("set @@session.foreign_key_checks", &sqltypes.Result{}) + mockDB.AddInvariant("select pos, stop_pos, max_tps, max_replication_lag, state, workflow_type, workflow, workflow_sub_type, defer_secondary_keys, options from _vt.vreplication where id=1", sqlModeWorkflowSettingsResult(binlogdatapb.VReplicationWorkflowType_MoveTables)) + mockDB.AddInvariant("select count(distinct table_name) from _vt.copy_state where vrepl_id=1", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("count(distinct table_name)", "int64"), + "0", + )) + mockDB.AddInvariant("max_allowed_packet", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("max_allowed_packet", "int64"), + "4194304", + )) + + vr := &vreplicator{ + id: 1, + stats: stats, + dbClient: newVDBClient(mockDB, stats, config.RelayLogMaxItems), + workflowConfig: config, + vre: &Engine{dbClientFactoryFiltered: func() binlogplayer.DBClient { return mockDB }}, + } + + worker, err := newApplyWorker(t.Context(), vr) + require.NoError(t, err) + require.NotNil(t, worker) + + worker.close() +} + +func TestCreateWorkerConn_UsesSerialSQLModeContract(t *testing.T) { + testCases := []struct { + name string + workflowType binlogdatapb.VReplicationWorkflowType + expectedMode string + }{ + { + name: "non-online-ddl uses exact sql mode", + workflowType: binlogdatapb.VReplicationWorkflowType_MoveTables, + expectedMode: SQLMode, + }, + { + name: "online-ddl uses exact strict sql mode", + workflowType: binlogdatapb.VReplicationWorkflowType_OnlineDDL, + 
expectedMode: StrictSQLMode, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + stats := binlogplayer.NewStats() + stats.VReplicationLagGauges.Stop() + teardownStats := stats + defer teardownStats.Stop() + + config := vttablet.InitVReplicationConfigDefaults() + workerDB := binlogplayer.NewMockDBClient(t) + workerDB.RemoveInvariants("select @@session.sql_mode", "set @@session.sql_mode", "set @@session.foreign_key_checks") + workerDB.AddInvariant("set @@session.time_zone", &sqltypes.Result{}) + workerDB.AddInvariant("set session transaction isolation level read committed", &sqltypes.Result{}) + workerDB.AddInvariant("set names 'binary'", &sqltypes.Result{}) + workerDB.AddInvariant("set @@session.net_read_timeout", &sqltypes.Result{}) + workerDB.AddInvariant("set @@session.net_write_timeout", &sqltypes.Result{}) + workerDB.AddInvariant("set @@session.sql_mode = CONCAT(@@session.sql_mode, ',NO_AUTO_VALUE_ON_ZERO')", &sqltypes.Result{}) + workerDB.AddInvariant("set @@session.sql_mode = REPLACE(REPLACE(REPLACE(@@session.sql_mode, 'NO_ZERO_DATE', ''), 'NO_ZERO_IN_DATE', ''), 'NO_BACKSLASH_ESCAPES', '')", &sqltypes.Result{}) + workerDB.ExpectRequest(getSQLModeQuery, sqltypes.MakeTestResult( + sqltypes.MakeTestFields("sql_mode", "varchar"), + "STRICT_TRANS_TABLES,NO_ZERO_DATE,ANSI_QUOTES", + ), nil) + workerDB.ExpectRequest(binlogplayer.TestGetWorkflowQueryId1, sqlModeWorkflowSettingsResult(tc.workflowType), nil) + workerDB.ExpectRequest("select count(distinct table_name) from _vt.copy_state where vrepl_id=1", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("count(distinct table_name)", "int64"), + "0", + ), nil) + workerDB.ExpectRequest(fmt.Sprintf(setSQLModeQueryf, tc.expectedMode), &sqltypes.Result{}, nil) + workerDB.ExpectRequest("set @@session.foreign_key_checks=0", &sqltypes.Result{}, nil) + + vr := &vreplicator{ + id: 1, + stats: stats, + dbClient: newVDBClient(workerDB, stats, config.RelayLogMaxItems), + workflowConfig: config, + 
vre: &Engine{dbClientFactoryFiltered: func() binlogplayer.DBClient { return workerDB }}, + } + + conn, err := createWorkerConn(t.Context(), vr) + require.NoError(t, err) + require.NotNil(t, conn) + workerDB.Wait() + conn.Close() + }) + } +} + +func TestCreateWorkerConn_UsesRunningFKSessionSettings(t *testing.T) { + stats := binlogplayer.NewStats() + stats.VReplicationLagGauges.Stop() + defer stats.Stop() + + config := vttablet.InitVReplicationConfigDefaults() + workerDB := binlogplayer.NewMockDBClient(t) + workerDB.RemoveInvariants("select @@session.sql_mode", "set @@session.sql_mode", "set @@session.foreign_key_checks") + workerDB.AddInvariant("set @@session.time_zone", &sqltypes.Result{}) + workerDB.AddInvariant("set session transaction isolation level read committed", &sqltypes.Result{}) + workerDB.AddInvariant("set names 'binary'", &sqltypes.Result{}) + workerDB.AddInvariant("set @@session.net_read_timeout", &sqltypes.Result{}) + workerDB.AddInvariant("set @@session.net_write_timeout", &sqltypes.Result{}) + workerDB.AddInvariant("set @@session.sql_mode = CONCAT(@@session.sql_mode, ',NO_AUTO_VALUE_ON_ZERO')", &sqltypes.Result{}) + workerDB.AddInvariant("set @@session.sql_mode = REPLACE(REPLACE(REPLACE(@@session.sql_mode, 'NO_ZERO_DATE', ''), 'NO_ZERO_IN_DATE', ''), 'NO_BACKSLASH_ESCAPES', '')", &sqltypes.Result{}) + workerDB.ExpectRequest(getSQLModeQuery, sqltypes.MakeTestResult( + sqltypes.MakeTestFields("sql_mode", "varchar"), + "STRICT_TRANS_TABLES,NO_ZERO_DATE,ANSI_QUOTES", + ), nil) + workerDB.ExpectRequest(binlogplayer.TestGetWorkflowQueryId1, sqlModeWorkflowSettingsResult(binlogdatapb.VReplicationWorkflowType_MoveTables), nil) + workerDB.ExpectRequest("select count(distinct table_name) from _vt.copy_state where vrepl_id=1", sqltypes.MakeTestResult( + sqltypes.MakeTestFields("count(distinct table_name)", "int64"), + "0", + ), nil) + workerDB.ExpectRequest(fmt.Sprintf(setSQLModeQueryf, SQLMode), &sqltypes.Result{}, nil) + workerDB.ExpectRequest("set 
@@session.foreign_key_checks=1", &sqltypes.Result{}, nil) + + vr := &vreplicator{ + id: 1, + stats: stats, + dbClient: newVDBClient(workerDB, stats, config.RelayLogMaxItems), + workflowConfig: config, + originalFKCheckSetting: 1, + vre: &Engine{dbClientFactoryFiltered: func() binlogplayer.DBClient { return workerDB }}, + } + + conn, err := createWorkerConn(t.Context(), vr) + require.NoError(t, err) + require.NotNil(t, conn) + workerDB.Wait() + conn.Close() +} + +func TestNewApplyWorkerConnectError(t *testing.T) { + stats := binlogplayer.NewStats() + stats.VReplicationLagGauges.Stop() + t.Cleanup(stats.Stop) + + config := vttablet.InitVReplicationConfigDefaults() + + connectErr := errors.New("connect failed") + badClient := &failingDBClient{connectErr: connectErr} + vr := &vreplicator{ + id: 1, + stats: stats, + workflowConfig: config, + vre: &Engine{dbClientFactoryFiltered: func() binlogplayer.DBClient { return badClient }}, + } + + worker, err := newApplyWorker(t.Context(), vr) + require.ErrorIs(t, err, connectErr) + require.Nil(t, worker) +} + +func TestNewApplyWorkerSettingsError(t *testing.T) { + stats := binlogplayer.NewStats() + stats.VReplicationLagGauges.Stop() + t.Cleanup(stats.Stop) + + config := vttablet.InitVReplicationConfigDefaults() + + settingsErr := errors.New("settings failed") + badClient := &failingDBClient{failOnQuery: map[string]error{"time_zone": settingsErr}} + vr := &vreplicator{ + id: 1, + stats: stats, + workflowConfig: config, + vre: &Engine{dbClientFactoryFiltered: func() binlogplayer.DBClient { return badClient }}, + } + + worker, err := newApplyWorker(t.Context(), vr) + require.ErrorIs(t, err, settingsErr) + require.Nil(t, worker) +} + +func TestNewApplyWorkerClearFKCheckError(t *testing.T) { + stats := binlogplayer.NewStats() + stats.VReplicationLagGauges.Stop() + t.Cleanup(stats.Stop) + + config := vttablet.InitVReplicationConfigDefaults() + + fkErr := errors.New("fk checks failed") + badClient := &failingDBClient{failOnQuery: 
map[string]error{"set @@session.foreign_key_checks=0": fkErr}} + vr := &vreplicator{ + id: 1, + stats: stats, + dbClient: newVDBClient(badClient, stats, config.RelayLogMaxItems), + workflowConfig: config, + vre: &Engine{dbClientFactoryFiltered: func() binlogplayer.DBClient { return badClient }}, + } + + worker, err := newApplyWorker(t.Context(), vr) + require.ErrorIs(t, err, fkErr) + require.Nil(t, worker) +} + +func TestNewApplyWorkerClearFKRestrictError(t *testing.T) { + stats := binlogplayer.NewStats() + stats.VReplicationLagGauges.Stop() + t.Cleanup(stats.Stop) + + config := vttablet.InitVReplicationConfigDefaults() + + restrictErr := errors.New("fk restrict failed") + workerClient := &failingDBClient{failOnQuery: map[string]error{"set @@session.restrict_fk_on_non_standard_key=0": restrictErr}} + capClient := &failingDBClient{supportsCaps: true} + + vr := &vreplicator{ + id: 1, + stats: stats, + dbClient: newVDBClient(capClient, stats, config.RelayLogMaxItems), + workflowConfig: config, + vre: &Engine{dbClientFactoryFiltered: func() binlogplayer.DBClient { return workerClient }}, + } + + worker, err := newApplyWorker(t.Context(), vr) + require.ErrorIs(t, err, restrictErr) + require.Nil(t, worker) +} + +func TestApplyWorkerApplyEventSetsFKChecksAfterRotate(t *testing.T) { + vp, _ := testVPlayer(t) + ctx := t.Context() + vp.tablePlans["t1"] = &TablePlan{TargetName: "t1"} + vp.vr.storeState(binlogdatapb.VReplicationWorkflowState_Running) + + db0 := &recordingDBClient{} + db1 := &recordingDBClient{} + worker := &applyWorker{ + ctx: ctx, + conns: [2]*vdbClient{newVDBClient(db0, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems), newVDBClient(db1, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)}, + active: 0, + } + worker.client = worker.conns[0] + worker.bindFunctions() + + vp.query = worker.query + vp.commit = worker.commit + vp.dbClient = worker.client + rowEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ 
+ Flags: 0, + TableName: "t1", + }, + } + + require.NoError(t, worker.applyEvent(ctx, rowEvent, false, vp)) + assert.Contains(t, db0.queries, "set @@session.foreign_key_checks=true") + + worker.rotate() + vp.query = worker.query + vp.commit = worker.commit + vp.dbClient = worker.client + + require.NoError(t, worker.applyEvent(ctx, rowEvent, false, vp)) + assert.Contains(t, db1.queries, "set @@session.foreign_key_checks=true") +} + +func sqlModeWorkflowSettingsResult(workflowType binlogdatapb.VReplicationWorkflowType) *sqltypes.Result { + return &sqltypes.Result{ + Fields: []*querypb.Field{ + {Name: "pos", Type: sqltypes.VarBinary}, + {Name: "stop_pos", Type: sqltypes.VarBinary}, + {Name: "max_tps", Type: sqltypes.Int64}, + {Name: "max_replication_lag", Type: sqltypes.Int64}, + {Name: "state", Type: sqltypes.VarBinary}, + {Name: "workflow_type", Type: sqltypes.Int64}, + {Name: "workflow", Type: sqltypes.VarChar}, + {Name: "workflow_sub_type", Type: sqltypes.Int64}, + {Name: "defer_secondary_keys", Type: sqltypes.Int64}, + {Name: "options", Type: sqltypes.VarBinary}, + }, + RowsAffected: 1, + Rows: [][]sqltypes.Value{{ + sqltypes.NewVarBinary("MariaDB/0-1-1083"), + sqltypes.NULL, + sqltypes.NewInt64(0), + sqltypes.NewInt64(0), + sqltypes.NewVarBinary(binlogdatapb.VReplicationWorkflowState_Running.String()), + sqltypes.NewInt64(int64(workflowType)), + sqltypes.NewVarChar("wf"), + sqltypes.NewInt64(0), + sqltypes.NewInt64(0), + sqltypes.NewVarBinary("{}"), + }}, + } +} + +// recordingFailingDBClient records every query while delegating behavior to +// failingDBClient (which serves the standard setup queries). 
+type recordingFailingDBClient struct { + failingDBClient + queries []string +} + +func (c *recordingFailingDBClient) ExecuteFetch(query string, maxrows int) (*sqltypes.Result, error) { + c.queries = append(c.queries, query) + return c.failingDBClient.ExecuteFetch(query, maxrows) +} + +// failingCommitDBClient delegates to failingDBClient but fails COMMIT, for +// exercising commit-failure paths (failingDBClient.Commit always succeeds). +type failingCommitDBClient struct { + failingDBClient + commitErr error +} + +func (c *failingCommitDBClient) Commit() error { return c.commitErr } + +// TestCreateWorkerConnSetsReadCommitted pins that worker connections run at +// READ COMMITTED. The writeset scheduler models PK/unique/FK conflicts, but +// it cannot model InnoDB gap/next-key locks, which REPEATABLE READ takes +// even for point operations on absent rows (e.g. DELETE of a row that does +// not exist, or delete-marking in a non-unique secondary index). A +// later-ordered transaction's gap lock can block an earlier-ordered +// transaction's INSERT while the commitLoop's strict ordering keeps that gap +// lock held until the earlier transaction commits — a deadlock InnoDB's +// detector cannot see because half the cycle lives in the commitLoop. READ +// COMMITTED takes no gap locks for row-image application and is MySQL's own +// recommendation for row-based parallel appliers. 
+func TestCreateWorkerConnSetsReadCommitted(t *testing.T) { + recording := &recordingFailingDBClient{} + stats := binlogplayer.NewStats() + vr := &vreplicator{ + id: 1, + stats: stats, + dbClient: newVDBClient(&failingDBClient{}, stats, 0), + workflowConfig: vttablet.InitVReplicationConfigDefaults(), + vre: &Engine{dbClientFactoryFiltered: func() binlogplayer.DBClient { return recording }}, + } + conn, err := createWorkerConn(t.Context(), vr) + require.NoError(t, err) + require.NotNil(t, conn) + + // Pin the SQL-standard statement form: the worker conn talks directly to + // the target mysqld (no vtgate sysvar rewriting), and the + // transaction_isolation sysvar spelling is flavor-specific (MariaDB used + // tx_isolation until 11.1; MySQL only added transaction_isolation in + // 5.7.20). The statement form works everywhere. Lowercase keeps it + // consistent with the other session-setup statements and the framework's + // globalDBQueries filter (which skips lowercase "set ..." setup queries). + require.Contains(t, recording.queries, "set session transaction isolation level read committed", + "worker connections must run at READ COMMITTED to avoid gap-lock deadlocks through the commit order") +} diff --git a/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_writeset.go b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_writeset.go new file mode 100644 index 00000000000..dc884676b50 --- /dev/null +++ b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_writeset.go @@ -0,0 +1,1211 @@ +/* +Copyright 2026 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package vreplication + +import ( + "encoding/binary" + "fmt" + "maps" + "strings" + "sync" + "sync/atomic" + + "github.com/cespare/xxhash/v2" + + "vitess.io/vitess/go/mysql/collations" + "vitess.io/vitess/go/mysql/collations/charset" + "vitess.io/vitess/go/mysql/collations/colldata" + "vitess.io/vitess/go/sqltypes" + "vitess.io/vitess/go/vt/vterrors" + "vitess.io/vitess/go/vt/vthash" + + binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" + querypb "vitess.io/vitess/go/vt/proto/query" + vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" +) + +var ( + writesetTextValueMarker = [2]byte{0xFF, 0x00} + // writesetKeySeparator separates the table name from the key values in + // the digest. A package-level array (Go has no []byte constants) so + // writesetDigestInit never allocates for it. + writesetKeySeparator = [1]byte{':'} +) + +// fieldIndexForName resolves a column-name lookup in a field-index map by +// trying the exact spelling first and falling back to lowercase. The maps are +// populated with both variants to bridge the case-sensitivity gap between +// sqlparser output and raw binlog field names. +func fieldIndexForName(fieldIdx map[string]int, colName string) (int, bool) { + if idx, ok := fieldIdx[colName]; ok { + return idx, true + } + idx, ok := fieldIdx[strings.ToLower(colName)] + return idx, ok +} + +// writesetDigestAddPayload writes a length-prefixed payload into the digest. +// The length prefix keeps concatenated payloads unambiguous so two different +// byte sequences cannot hash to the same digest by coincidental boundary +// alignment. 
+func writesetDigestAddPayload(d *xxhash.Digest, payload []byte) { + var scratch [8]byte + binary.LittleEndian.PutUint64(scratch[:], uint64(len(payload))) + d.Write(scratch[:]) + d.Write(payload) +} + +// NOTE on collisions: writeset keys are 64-bit xxhash digests. A hash +// collision between two unrelated keys creates a FALSE conflict (needless +// serialization) — never a missed one — so collisions degrade throughput, +// not correctness. +// +// writesetDigestInit initializes an xxhash digest with the table name +// followed by a ':' separator. Callers declare a stack-local xxhash.Digest +// and pass its address to avoid heap allocation. xxhash provides better +// throughput than FNV-1a for writeset keys with multiple PK columns. +func writesetDigestInit(d *xxhash.Digest, tableName string) { + d.Reset() + d.WriteString(tableName) + d.Write(writesetKeySeparator[:]) +} + +// writesetDigestAddValue folds a sqltypes.Value into the digest by writing +// its type discriminator (2 bytes, little-endian) followed by its raw bytes. +// querypb.Type is a 16-bit enum and using a 1-byte discriminator would let +// future types whose low byte collides (e.g. a hypothetical Type=N and +// Type=N+256) hash to the same key — silently letting truly conflicting +// transactions run in parallel. +func writesetDigestAddValue(d *xxhash.Digest, v sqltypes.Value) { + var scratch [8]byte + raw := v.Raw() + binary.LittleEndian.PutUint64(scratch[:], uint64(2+len(raw))) + d.Write(scratch[:]) + binary.LittleEndian.PutUint16(scratch[:2], uint16(v.Type())) + d.Write(scratch[:2]) + d.Write(raw) +} + +// writesetDigestAddFieldValue folds a column value into the digest using +// collation-aware hashing for text columns. Two rows that MySQL considers +// equal (trailing spaces under PAD SPACE, equivalent forms under *_ci +// collations) must hash to the same writeset key or conflict detection +// would let truly-conflicting txns run in parallel. 
+func writesetDigestAddFieldValue(d *xxhash.Digest, field *querypb.Field, v sqltypes.Value) error { + if field == nil || !sqltypes.IsText(field.Type) || field.Charset == 0 { + writesetDigestAddValue(d, v) + return nil + } + + collation := colldata.Lookup(collations.ID(field.Charset)) + if collation == nil { + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "unknown collation %d for field %s", field.Charset, field.Name) + } + + raw := v.Raw() + if collationUsesPadSpace(collation) { + raw = trimTrailingPadSpaceCodepoints(collation.Charset(), raw) + } + + var semanticHash vthash.Hasher + semanticHash.Reset() + collation.Hash(&semanticHash, raw, 0) + + // Fixed-size stack buffer: marker followed by the 8-byte collation hash. + var payload [len(writesetTextValueMarker) + 8]byte + copy(payload[:], writesetTextValueMarker[:]) + binary.LittleEndian.PutUint64(payload[len(writesetTextValueMarker):], semanticHash.Sum64()) + writesetDigestAddPayload(d, payload[:]) + return nil +} + +// collationUsesPadSpace reports whether the given collation compares strings +// as if right-padded with spaces. Values under such collations have trailing +// pad codepoints stripped before hashing so that e.g. 'abc' and 'abc ' +// hash to the same writeset key. +func collationUsesPadSpace(collation colldata.Collation) bool { + switch collation.(type) { + case *colldata.Collation_utf8mb4_uca_0900, *colldata.Collation_utf8mb4_0900_bin: + return false + default: + return true + } +} + +// trimTrailingPadSpaceCodepoints strips trailing space codepoints from raw +// bytes using the column's charset decoder. Used by PAD SPACE collations so +// values that compare equal in MySQL also hash equal in the writeset digest. 
+func trimTrailingPadSpaceCodepoints(cs charset.Charset, raw []byte) []byte { + trimmedEnd := 0 + for i := 0; i < len(raw); { + r, size := cs.DecodeRune(raw[i:]) + if size <= 0 { + return raw + } + i += size + if r != ' ' { + trimmedEnd = i + } + } + return raw[:trimmedEnd] +} + +// fkConstraintRef represents one foreign key constraint on a table. +// It maps one or more child columns to a parent table, allowing the +// parallel apply writeset to include FK reference keys that conflict +// with the parent table's writeset keys. +type fkConstraintRef struct { + ParentTable string // referenced parent table name + ChildColumnNames []string // child column names, in FK ordinal order + ReferencedColumnNames []string // parent column names, in FK ordinal order +} + +// parentFKRef represents a foreign key constraint from the parent table's +// perspective. When a parent row changes, we generate writeset keys using the +// referenced column values so they conflict with child-side FK keys. +type parentFKRef struct { + ParentTable string // the parent table name (same as the table being modified) + ReferencedColumnNames []string // parent column names referenced by the FK +} + +// buildParentFKRefs builds a reverse map from parent table name to the FK +// constraints that reference it. This allows parent-side rows to generate +// writeset keys that match child-side FK keys. 
+func buildParentFKRefs(fkRefs map[string][]fkConstraintRef) map[string][]parentFKRef { + if len(fkRefs) == 0 { + return nil + } + result := make(map[string][]parentFKRef) + for _, refs := range fkRefs { + for _, ref := range refs { + result[ref.ParentTable] = append(result[ref.ParentTable], parentFKRef{ + ParentTable: ref.ParentTable, + ReferencedColumnNames: ref.ReferencedColumnNames, + }) + } + } + return result +} + +// buildCanonicalTargetTableNames builds a lowercase→original-case map of +// target table names so canonicalTargetTableName can line up FK-graph lookups +// with the various case variants that arrive from DDL, binlog events, and +// replicator plans. Entries with ambiguous casing (two different target +// names sharing the same lowercase key) are dropped rather than silently +// picking one. +func buildCanonicalTargetTableNames(tablePlans map[string]*TablePlan) map[string]string { + if len(tablePlans) == 0 { + return nil + } + canonical := make(map[string]string, len(tablePlans)) + ambiguous := make(map[string]struct{}) + for _, plan := range tablePlans { + if plan == nil || plan.TargetName == "" { + continue + } + key := strings.ToLower(plan.TargetName) + if _, ok := ambiguous[key]; ok { + continue + } + if existing, ok := canonical[key]; ok { + if existing != plan.TargetName { + delete(canonical, key) + ambiguous[key] = struct{}{} + } + continue + } + canonical[key] = plan.TargetName + } + if len(canonical) == 0 { + return nil + } + return canonical +} + +// canonicalTargetTableName resolves a possibly case-varying name to the exact +// target-table key used in tablePlans. Returns the input unchanged when no +// canonical match exists so lookups miss cleanly rather than silently hitting +// a sibling table. 
+func canonicalTargetTableName(name string, canonical map[string]string) string { + if name == "" || len(canonical) == 0 { + return name + } + if resolved, ok := canonical[strings.ToLower(name)]; ok { + return resolved + } + return name +} + +// resolveFKRefsForTable collects FK constraints whose child table matches the +// given name (compared canonically). Returned refs have their ParentTable +// canonicalized so the writeset digest for a child row hashes under the same +// table-name key as the parent's writeset, which is what makes the two sides +// conflict. +func resolveFKRefsForTable(tableName string, refs map[string][]fkConstraintRef, canonical map[string]string) []fkConstraintRef { + if tableName == "" || len(refs) == 0 { + return nil + } + resolvedTableName := canonicalTargetTableName(tableName, canonical) + var resolved []fkConstraintRef + for name, tableRefs := range refs { + if canonicalTargetTableName(name, canonical) != resolvedTableName { + continue + } + start := len(resolved) + resolved = append(resolved, tableRefs...) + for i := start; i < len(resolved); i++ { + resolved[i].ParentTable = canonicalTargetTableName(resolved[i].ParentTable, canonical) + } + } + return resolved +} + +// resolveParentFKRefsForTable is the parent-side counterpart to +// resolveFKRefsForTable: when a parent row changes, we need FK-style writeset +// keys keyed on the parent's referenced columns so the change conflicts with +// the child-side FK keys. +func resolveParentFKRefsForTable(tableName string, refs map[string][]parentFKRef, canonical map[string]string) []parentFKRef { + if tableName == "" || len(refs) == 0 { + return nil + } + resolvedTableName := canonicalTargetTableName(tableName, canonical) + var resolved []parentFKRef + for name, tableRefs := range refs { + if canonicalTargetTableName(name, canonical) != resolvedTableName { + continue + } + start := len(resolved) + resolved = append(resolved, tableRefs...) 
+ for i := start; i < len(resolved); i++ { + resolved[i].ParentTable = canonicalTargetTableName(resolved[i].ParentTable, canonical) + } + } + return resolved +} + +// buildResolvedFKRefTableSet returns the set of canonicalized table names +// that participate in any FK edge, as either child or parent. The scheduler +// uses this set to decide which tables' touched-row bookkeeping must follow +// FK-induced conflicts across the txn graph. +func buildResolvedFKRefTableSet(refs map[string][]fkConstraintRef, parentRefs map[string][]parentFKRef, canonical map[string]string) map[string]struct{} { + if len(refs) == 0 && len(parentRefs) == 0 { + return nil + } + resolved := make(map[string]struct{}, len(refs)+len(parentRefs)) + for name, tableRefs := range refs { + if len(tableRefs) == 0 { + continue + } + resolved[canonicalTargetTableName(name, canonical)] = struct{}{} + } + for name, tableRefs := range parentRefs { + if len(tableRefs) == 0 { + continue + } + resolved[canonicalTargetTableName(name, canonical)] = struct{}{} + } + if len(resolved) == 0 { + return nil + } + return resolved +} + +type txnWritesetCache struct { + fieldIdxCache map[string]map[string]int + canonicalTargetNames map[string]string + resolvedFKRefs map[string][]fkConstraintRef + resolvedParentRefs map[string][]parentFKRef + // identityIdxCache caches, per source table name, the plan's identity + // column positions resolved against the streamed fields. The list is + // stable per plan, so resolving it per row change would allocate a + // slice per change for composite-identity tables. + identityIdxCache map[string][]int + // planByTarget maps canonical target table names to their plans, for + // FK parent lookups (tablePlans itself is keyed by SOURCE table name). + planByTarget map[string]*TablePlan + // fkStreamedValidated records child source-table names whose FK refs + // have been validated against their parent plans' streamed metadata. 
+ fkStreamedValidated map[string]struct{} + // relevantColsCache caches, per source table name, the set of column + // indexes the writeset depends on (PK plus FK-joined columns). Building + // it is O(columns + FK refs) with a map allocation, which is too + // expensive to repeat for every row change in the hot path. + relevantColsCache map[string]map[int]struct{} + // uniqueKeyIdxCache caches, per source table name, the resolved field + // positions of each hashable unique secondary index (plan.UniqueKeyColumns), + // so the column-name lookups happen once per table instead of per change. + uniqueKeyIdxCache map[string][][]int +} + +// writesetKeysForParentFKRef generates writeset keys for a parent table row +// based on foreign key constraints that reference this table. The hash uses +// parentTable:referencedColValues, matching the child-side FK key hash. +// Returns an error if the FK columns are missing from the streamed field list, +// so the caller can force serialization instead of silently dropping the edge. 
+func writesetKeysForParentFKRef(ref *parentFKRef, fields []*querypb.Field, fieldIdx map[string]int, beforeVals, afterVals []sqltypes.Value, keySet map[uint64]struct{}) error { + appendKey := func(vals []sqltypes.Value) error { + if len(vals) == 0 { + return nil + } + var d xxhash.Digest + writesetDigestInit(&d, ref.ParentTable) + for _, colName := range ref.ReferencedColumnNames { + idx, ok := fieldIndexForName(fieldIdx, colName) + if !ok { + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "FK referenced column %q not in streamed fields for parent table %s", colName, ref.ParentTable) + } + if idx >= len(fields) { + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "FK referenced column %q index %d out of range for parent table fields %s", colName, idx, ref.ParentTable) + } + if idx >= len(vals) { + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "FK referenced column %q index %d out of range for parent table %s", colName, idx, ref.ParentTable) + } + val := vals[idx] + if val.IsNull() { + return nil + } + if err := writesetDigestAddFieldValue(&d, fields[idx], val); err != nil { + return err + } + } + keySet[d.Sum64()] = struct{}{} + return nil + } + if err := appendKey(beforeVals); err != nil { + return err + } + return appendKey(afterVals) +} + +// writesetKeysForFKRef generates writeset keys based on a foreign key constraint. +// For each row (before and after), it looks up the child column values and produces +// a hash keyed on the parent table name and FK column values, which will conflict +// with the parent table's PK-based writeset key, forcing serialization of +// dependent txns. +// Returns an error if FK columns are missing from the streamed field list. 
+func writesetKeysForFKRef(ref *fkConstraintRef, fields []*querypb.Field, fieldIdx map[string]int, beforeVals, afterVals []sqltypes.Value, keySet map[uint64]struct{}) error { + if ref == nil { + return nil + } + appendFKKey := func(vals []sqltypes.Value) error { + if len(vals) == 0 { + return nil + } + var d xxhash.Digest + writesetDigestInit(&d, ref.ParentTable) + for _, colName := range ref.ChildColumnNames { + idx, ok := fieldIndexForName(fieldIdx, colName) + if !ok { + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "FK child column %q not in streamed fields for table referencing %s", colName, ref.ParentTable) + } + if idx >= len(fields) { + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "FK child column %q index %d out of range for table fields referencing %s", colName, idx, ref.ParentTable) + } + if idx >= len(vals) { + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "FK child column %q index %d out of range for table referencing %s", colName, idx, ref.ParentTable) + } + val := vals[idx] + // In MySQL, if any referencing column in an FK is NULL, the FK + // constraint is not enforced for that row. Skip generating a + // writeset key in that case to avoid artificial conflicts. + if val.IsNull() { + return nil + } + if err := writesetDigestAddFieldValue(&d, fields[idx], val); err != nil { + return err + } + } + keySet[d.Sum64()] = struct{}{} + return nil + } + if err := appendFKKey(beforeVals); err != nil { + return err + } + return appendFKKey(afterVals) +} + +// writesetKeysForUniqueKey emits conflict keys for one hashable unique +// secondary index, for both row images, mirroring MySQL's WRITESET tracking. +// A NULL in any key column emits no key for that image: MySQL unique indexes +// permit multiple NULLs, so a NULL-valued key cannot conflict with anything. 
+// The index ordinal is folded into the digest so different indexes on the +// same table produce distinct key spaces (a cross-index hash collision would +// only over-serialize, but unambiguous inputs are cheap). +func writesetKeysForUniqueKey(tableName string, ordinal int, colIdxs []int, fields []*querypb.Field, beforeVals, afterVals []sqltypes.Value, keySet map[uint64]struct{}) error { + appendKey := func(vals []sqltypes.Value) error { + if len(vals) == 0 { + return nil + } + var d xxhash.Digest + writesetDigestInit(&d, tableName) + var ordinalScratch [8]byte + binary.LittleEndian.PutUint64(ordinalScratch[:], uint64(ordinal)) + writesetDigestAddPayload(&d, ordinalScratch[:]) + for _, idx := range colIdxs { + if idx >= len(vals) { + return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "unique key index out of range for %s", tableName) + } + val := vals[idx] + // A NULL key column cannot conflict: MySQL unique indexes permit + // multiple NULLs. Emit no key for this image. + if val.IsNull() { + return nil + } + var field *querypb.Field + if idx < len(fields) { + field = fields[idx] + } + if err := writesetDigestAddFieldValue(&d, field, val); err != nil { + return err + } + } + keySet[d.Sum64()] = struct{}{} + return nil + } + if err := appendKey(beforeVals); err != nil { + return err + } + return appendKey(afterVals) +} + +// writesetFieldsHashCompatible reports whether two streamed fields produce +// identical digests for logically-equal values. writesetDigestAddFieldValue +// hashes text fields via their collation (driven by Field.Charset) and +// everything else as a 2-byte type discriminator plus raw bytes — so the FK +// child/parent hash equality the scheduler relies on requires matching +// textness, charset (text), or exact type (non-text). 
+func writesetFieldsHashCompatible(a, b *querypb.Field) bool { + if a == nil || b == nil { + return false + } + aText := sqltypes.IsText(a.Type) && a.Charset != 0 + bText := sqltypes.IsText(b.Type) && b.Charset != 0 + if aText != bText { + return false + } + if aText { + return a.Charset == b.Charset + } + return a.Type == b.Type +} + +// validateFKStreamedFieldCompatibility fails closed when a child table's FK +// columns and the parent's referenced columns have hash-incompatible STREAMED +// field metadata. queryFKRefs validates the TARGET schema, but the digests +// are computed from the SOURCE (FIELD-event) metadata, which can diverge for +// target-only FKs (e.g. source child latin1 vs source parent utf8mb4, or INT +// vs BIGINT): equal logical values would then hash to different keys and the +// child/parent transactions could reorder. Parents whose plan has not been +// streamed yet are skipped — they cannot generate parent-side keys until +// their FIELD event arrives, and FIELD-bearing transactions serialize. 
+func validateFKStreamedFieldCompatibility(childPlan *TablePlan, childFieldIdx map[string]int, refs []fkConstraintRef, cache *txnWritesetCache, tablePlans map[string]*TablePlan) error { + if len(refs) == 0 { + return nil + } + var planByTarget map[string]*TablePlan + if cache != nil && cache.planByTarget != nil { + planByTarget = cache.planByTarget + } else { + planByTarget = make(map[string]*TablePlan, len(tablePlans)) + for _, plan := range tablePlans { + if plan != nil && plan.TargetName != "" { + planByTarget[plan.TargetName] = plan + } + } + if cache != nil { + cache.planByTarget = planByTarget + } + } + for i := range refs { + ref := &refs[i] + parentPlan := planByTarget[ref.ParentTable] + if parentPlan == nil || len(parentPlan.Fields) == 0 { + continue + } + parentFieldIdx := make(map[string]int, len(parentPlan.Fields)) + for j, f := range parentPlan.Fields { + if f == nil { + continue + } + parentFieldIdx[f.Name] = j + parentFieldIdx[strings.ToLower(f.Name)] = j + } + for k, childCol := range ref.ChildColumnNames { + if k >= len(ref.ReferencedColumnNames) { + break + } + childIdx, ok := fieldIndexForName(childFieldIdx, childCol) + if !ok || childIdx >= len(childPlan.Fields) { + continue // missing columns are caught by the key emitters + } + parentIdx, ok := fieldIndexForName(parentFieldIdx, ref.ReferencedColumnNames[k]) + if !ok || parentIdx >= len(parentPlan.Fields) { + continue + } + if !writesetFieldsHashCompatible(childPlan.Fields[childIdx], parentPlan.Fields[parentIdx]) { + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, + "FK streamed field metadata mismatch between child column %q and parent %s.%q: forcing serialization", + childCol, ref.ParentTable, ref.ReferencedColumnNames[k]) + } + } + } + return nil +} + +// buildTxnWriteset builds writeset keys for the given events. +// fieldIdxCache is an optional cache of field-name→index maps, shared +// across transactions on the same scheduler goroutine. Pass nil to +// use a local cache (e.g. 
// in tests).
func buildTxnWriteset(tablePlans map[string]*TablePlan, fkRefs map[string][]fkConstraintRef, parentRefs map[string][]parentFKRef, events []*binlogdatapb.VEvent, fieldIdxCaches ...map[string]map[string]int) ([]uint64, error) {
	// Only the first variadic argument is consulted. It is wrapped in a
	// fresh txnWritesetCache, so only the field-index map is shared with the
	// caller; the other caches live for this call alone.
	var cache *txnWritesetCache
	if len(fieldIdxCaches) > 0 && fieldIdxCaches[0] != nil {
		cache = &txnWritesetCache{fieldIdxCache: fieldIdxCaches[0]}
	}
	return buildTxnWritesetWithCache(tablePlans, fkRefs, parentRefs, events, cache)
}

// buildTxnWritesetWithCache is the cache-aware core of buildTxnWriteset.
// canonical-name, FK-resolution, and fieldIdx maps are shared across txns
// on the same scheduler goroutine to avoid rebuilding them per txn. Fails
// closed (returns an error) on partial row images or missing FK columns so
// the caller can route the txn through the serial path instead of producing
// a writeset that misses conflict-determining columns.
//
// Only ROW events with a non-nil RowEvent are examined. The returned keys
// are de-duplicated and come out in map-iteration (unspecified) order; the
// result is nil, nil when no key was produced. cache may be nil, in which
// case every lookup table is rebuilt locally.
func buildTxnWritesetWithCache(tablePlans map[string]*TablePlan, fkRefs map[string][]fkConstraintRef, parentRefs map[string][]parentFKRef, events []*binlogdatapb.VEvent, cache *txnWritesetCache) ([]uint64, error) {
	// Pre-estimate capacity to avoid map rehashing during key insertion.
	// Each row change can produce ~2 keys (before + after).
	estimated := 0
	for _, event := range events {
		if event.Type == binlogdatapb.VEventType_ROW && event.RowEvent != nil {
			estimated += 2 * len(event.RowEvent.RowChanges)
		}
	}
	keySet := make(map[uint64]struct{}, estimated)
	// The FK resolution tables are only set up when at least one FK edge
	// exists; otherwise refs/pRefs stay nil for every table below.
	needResolvedFKRefs := len(fkRefs) > 0 || len(parentRefs) > 0
	var canonicalTargetNames map[string]string
	var resolvedFKRefs map[string][]fkConstraintRef
	var resolvedParentRefs map[string][]parentFKRef
	if needResolvedFKRefs {
		// Start from whatever the cache already holds, then fill in (and
		// write back) anything that is still missing.
		if cache != nil {
			canonicalTargetNames = cache.canonicalTargetNames
			resolvedFKRefs = cache.resolvedFKRefs
			resolvedParentRefs = cache.resolvedParentRefs
		}
		if canonicalTargetNames == nil {
			canonicalTargetNames = buildCanonicalTargetTableNames(tablePlans)
			if cache != nil {
				cache.canonicalTargetNames = canonicalTargetNames
			}
		}
		if resolvedFKRefs == nil {
			resolvedFKRefs = make(map[string][]fkConstraintRef)
			if cache != nil {
				cache.resolvedFKRefs = resolvedFKRefs
			}
		}
		if resolvedParentRefs == nil {
			resolvedParentRefs = make(map[string][]parentFKRef)
			if cache != nil {
				cache.resolvedParentRefs = resolvedParentRefs
			}
		}
	}
	// Use the shared field-index cache when the caller supplied one. A
	// locally created map is NOT written back into cache.
	var fieldIdxCache map[string]map[string]int
	if cache != nil && cache.fieldIdxCache != nil {
		fieldIdxCache = cache.fieldIdxCache
	} else {
		fieldIdxCache = map[string]map[string]int{}
	}
	for _, event := range events {
		if event.Type != binlogdatapb.VEventType_ROW {
			continue
		}
		rowEvent := event.RowEvent
		if rowEvent == nil {
			continue
		}
		// tablePlans is keyed by the FIELD event's source TableName. We rely
		// on vstreamer emitting identical-case TableName for both the FIELD
		// and the subsequent ROW events of the same table — they share a
		// single per-stream cache. We do NOT canonicalize via
		// canonicalTargetTableName here because that operates on TARGET
		// names; the SOURCE-name space is independent and a case-insensitive
		// fold could conflate distinct source tables on case-sensitive
		// filesystems (lower_case_table_names=0).
		plan := tablePlans[rowEvent.TableName]
		if plan == nil {
			return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "missing table plan for %s", rowEvent.TableName)
		}
		targetTableName := plan.TargetName
		// FK refs are resolved (and memoized) per TARGET table name; a
		// memoized nil slice counts as "resolved, no refs".
		var refs []fkConstraintRef
		var pRefs []parentFKRef
		if needResolvedFKRefs {
			var ok bool
			refs, ok = resolvedFKRefs[targetTableName]
			if !ok {
				refs = resolveFKRefsForTable(targetTableName, fkRefs, canonicalTargetNames)
				resolvedFKRefs[targetTableName] = refs
			}
			pRefs, ok = resolvedParentRefs[targetTableName]
			if !ok {
				pRefs = resolveParentFKRefsForTable(targetTableName, parentRefs, canonicalTargetNames)
				resolvedParentRefs[targetTableName] = pRefs
			}
		}
		// Build fieldIdx once per table for FK, composite identity, and
		// unique-key column lookups. It stays nil for tables that need
		// none of those. Each field is indexed under both its exact and
		// its lowercased name.
		var fieldIdx map[string]int
		if len(refs) > 0 || len(pRefs) > 0 || len(plan.IdentityColumns) > 1 || len(plan.UniqueKeyColumns) > 0 {
			var ok bool
			fieldIdx, ok = fieldIdxCache[rowEvent.TableName]
			if !ok {
				fieldIdx = make(map[string]int, len(plan.Fields))
				for i, f := range plan.Fields {
					fieldIdx[f.Name] = i
					fieldIdx[strings.ToLower(f.Name)] = i
				}
				fieldIdxCache[rowEvent.TableName] = fieldIdx
			}
		}
		// Fail closed when this child's FK columns and the parent's
		// referenced columns have hash-incompatible STREAMED metadata
		// (validated once per child table per fetch; parents without a
		// streamed plan yet are re-checked on later transactions).
		// NOTE(review): the table is recorded in fkStreamedValidated after
		// any successful validation, including one that skipped a parent
		// with no streamed fields. A later re-check therefore only happens
		// if that cache entry is dropped somewhere outside this function —
		// confirm against the cache's owner.
		if len(refs) > 0 {
			validated := false
			if cache != nil {
				_, validated = cache.fkStreamedValidated[rowEvent.TableName]
			}
			if !validated {
				if err := validateFKStreamedFieldCompatibility(plan, fieldIdx, refs, cache, tablePlans); err != nil {
					return nil, err
				}
				if cache != nil {
					if cache.fkStreamedValidated == nil {
						cache.fkStreamedValidated = make(map[string]struct{})
					}
					cache.fkStreamedValidated[rowEvent.TableName] = struct{}{}
				}
			}
		}
		// Resolve the plan's identity column positions once per table.
		// Without a cache they are resolved again for every ROW event.
		var identityIndexes []int
		if cache != nil {
			if cache.identityIdxCache == nil {
				cache.identityIdxCache = make(map[string][]int)
			}
			var ok bool
			identityIndexes, ok = cache.identityIdxCache[rowEvent.TableName]
			if !ok {
				var err error
				identityIndexes, err = writesetIdentityFieldIndexes(plan, targetTableName, fieldIdx)
				if err != nil {
					return nil, err
				}
				cache.identityIdxCache[rowEvent.TableName] = identityIndexes
			}
		} else {
			var err error
			identityIndexes, err = writesetIdentityFieldIndexes(plan, targetTableName, fieldIdx)
			if err != nil {
				return nil, err
			}
		}
		// Resolve the hashable unique secondary indexes' field positions once
		// per table.
		var uniqueKeyIndexes [][]int
		if cache != nil {
			if cache.uniqueKeyIdxCache == nil {
				cache.uniqueKeyIdxCache = make(map[string][][]int)
			}
			var ok bool
			uniqueKeyIndexes, ok = cache.uniqueKeyIdxCache[rowEvent.TableName]
			if !ok {
				var err error
				uniqueKeyIndexes, err = writesetUniqueKeyFieldIndexes(plan, targetTableName, fieldIdx)
				if err != nil {
					return nil, err
				}
				cache.uniqueKeyIdxCache[rowEvent.TableName] = uniqueKeyIndexes
			}
		} else {
			var err error
			uniqueKeyIndexes, err = writesetUniqueKeyFieldIndexes(plan, targetTableName, fieldIdx)
			if err != nil {
				return nil, err
			}
		}
		// Resolve the writeset-relevant column set once per table.
		var relevantCols map[int]struct{}
		if cache != nil {
			if cache.relevantColsCache == nil {
				cache.relevantColsCache = make(map[string]map[int]struct{})
			}
			var ok bool
			relevantCols, ok = cache.relevantColsCache[rowEvent.TableName]
			if !ok {
				relevantCols = writesetRelevantColumns(plan, fieldIdx, refs, pRefs)
				cache.relevantColsCache[rowEvent.TableName] = relevantCols
			}
		} else {
			relevantCols = writesetRelevantColumns(plan, fieldIdx, refs, pRefs)
		}
		for _, change := range rowEvent.RowChanges {
			// Partial row images (DataColumns/JsonPartialValues) omit columns
			// from the binlog payload. buildTxnWriteset decodes rows with
			// sqltypes.MakeRowTrusted(plan.Fields, change.Before/After), which
			// treats the streamed values as positional and ignores the bitmaps.
			// That makes both PK and FK hashing unsafe: omitted columns can
			// shift later values into the wrong field slots. BEFORE images are
			// ambiguous too: vstreamer can encode omitted columns as -1 lengths,
			// but it only publishes DataColumns for AFTER rows.
			// Fail closed until writeset reconstruction becomes bitmap-aware.
			isPartialRow := change.DataColumns != nil || change.JsonPartialValues != nil
			if !isPartialRow && plan.Fields != nil {
				// Use != (not <) so an over-sized row image — which can arise
				// from a stale plan that's missing a column the source still
				// streams — also fails closed instead of running MakeRowTrusted
				// past the end of plan.Fields and nil-derefing.
				isPartialRow = (change.Before != nil && len(change.Before.Lengths) != len(plan.Fields)) ||
					(change.After != nil && len(change.After.Lengths) != len(plan.Fields))
			}
			// A -1 length on a PK or FK-joined column is also treated as a
			// partial image (see rowHasNegativeRelevantLengths).
			if !isPartialRow {
				isPartialRow = rowHasNegativeRelevantLengths(change.Before, relevantCols) ||
					rowHasNegativeRelevantLengths(change.After, relevantCols)
			}
			if isPartialRow {
				return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "partial row image on table %s: forcing serialization", rowEvent.TableName)
			}
			// Decode Before/After row values once per change. An image that
			// is absent (or a plan without Fields) leaves the slice nil, and
			// every key emitter below skips an empty image.
			var beforeVals, afterVals []sqltypes.Value
			if change.Before != nil && plan.Fields != nil {
				beforeVals = sqltypes.MakeRowTrusted(plan.Fields, change.Before)
			}
			if change.After != nil && plan.Fields != nil {
				afterVals = sqltypes.MakeRowTrusted(plan.Fields, change.After)
			}
			// Identity (PK) keys first, then one key space per unique
			// secondary index, then child-side and parent-side FK keys.
			if err := writesetKeysForChangeWithFieldIdx(plan, targetTableName, identityIndexes, beforeVals, afterVals, keySet); err != nil {
				return nil, err
			}
			for ord, colIdxs := range uniqueKeyIndexes {
				if err := writesetKeysForUniqueKey(targetTableName, ord, colIdxs, plan.Fields, beforeVals, afterVals, keySet); err != nil {
					return nil, err
				}
			}
			for i := range refs {
				if err := writesetKeysForFKRef(&refs[i], plan.Fields, fieldIdx, beforeVals, afterVals, keySet); err != nil {
					return nil, err
				}
			}
			// Parent-side: generate FK-aware keys using the referenced columns
			// so parent row changes conflict with child FK keys.
			for i := range pRefs {
				if err := writesetKeysForParentFKRef(&pRefs[i], plan.Fields, fieldIdx, beforeVals, afterVals, keySet); err != nil {
					return nil, err
				}
			}
		}
	}
	if len(keySet) == 0 {
		return nil, nil
	}
	// Flatten the set into a slice; the order is unspecified.
	keys := make([]uint64, 0, len(keySet))
	for key := range keySet {
		keys = append(keys, key)
	}
	return keys, nil
}

// writesetRelevantColumns builds the set of column indexes the writeset
// depends on (PK plus FK-joined columns) for one table plan.
Callers cache +// the result per table (see txnWritesetCache.relevantColsCache) so the map +// is built once per table per fetch instead of once per row change. +// +// Hashable unique-secondary columns are intentionally NOT included here. A +// -1 length on a relevant column trips the partial-image guard and forces +// serialization, but a NULL value in a full row image is also encoded as a +// -1 length. Unique-secondary columns are commonly nullable, and a NULL +// unique value cannot conflict (MySQL permits multiple NULLs), so the +// emitter (writesetKeysForUniqueKey) skips it. Adding such columns to the +// relevance set would force-serialize every NULL unique value and negate the +// parallelism this change unlocks. +func writesetRelevantColumns(plan *TablePlan, fieldIdx map[string]int, refs []fkConstraintRef, pRefs []parentFKRef) map[int]struct{} { + relevantColumns := make(map[int]struct{}) + for i, isPK := range plan.PKIndices { + if isPK { + relevantColumns[i] = struct{}{} + } + } + for _, ref := range refs { + for _, colName := range ref.ChildColumnNames { + if idx, ok := fieldIndexForName(fieldIdx, colName); ok { + relevantColumns[idx] = struct{}{} + } + } + } + for _, ref := range pRefs { + for _, colName := range ref.ReferencedColumnNames { + if idx, ok := fieldIndexForName(fieldIdx, colName); ok { + relevantColumns[idx] = struct{}{} + } + } + } + return relevantColumns +} + +// rowHasNegativeRelevantLengths returns true when a row image has -1 +// (omitted) lengths for any column the writeset depends on (PK or FK-joined +// column). vstreamer encodes omitted columns as -1 length without publishing +// a DataColumns bitmap on BEFORE rows, and the same sentinel can appear on +// AFTER rows under partial-image producers that do not set a bitmap. Treating +// those as partial images lets us fail closed and serialize instead of +// hashing against wrong-slot (NULL) values. 
+func rowHasNegativeRelevantLengths(row *querypb.Row, relevantColumns map[int]struct{}) bool { + if row == nil { + return false + } + for i, length := range row.Lengths { + if length < 0 { + if _, ok := relevantColumns[i]; ok { + return true + } + } + } + return false +} + +// snapshotTablePlans returns a copy-on-write snapshot of tablePlans. It only +// copies the map when the version has changed since the last snapshot, avoiding +// the read-lock hold time of building writesets directly against the live map. +func snapshotTablePlans(mu *sync.RWMutex, tablePlans map[string]*TablePlan, version *atomic.Int64, cachedVersion *int64, cached map[string]*TablePlan) map[string]*TablePlan { + if tablePlans == nil { + return nil + } + mu.RLock() + defer mu.RUnlock() + v := version.Load() + if cached != nil && v == *cachedVersion { + return cached + } + cp := make(map[string]*TablePlan, len(tablePlans)) + maps.Copy(cp, tablePlans) + *cachedVersion = v + return cp +} + +// txnTouchesExtraUniqueSecondary reports whether the txn writes any table +// whose plan carries an extra unique secondary index. Those tables have to +// serialize: writeset keys built from PK alone can miss conflicts that the +// secondary unique index would otherwise enforce. +func txnTouchesExtraUniqueSecondary(events []*binlogdatapb.VEvent, tablePlans map[string]*TablePlan) bool { + for _, event := range events { + if event.Type != binlogdatapb.VEventType_ROW || event.RowEvent == nil { + continue + } + plan := tablePlans[event.RowEvent.TableName] + if plan != nil && plan.HasExtraUniqueSecondary { + return true + } + } + return false +} + +// txnTouchesUnsupportedWritesetMapping reports whether any ROW event in the +// txn targets a table whose plan uses a mapping the writeset builder can't +// reason about (expressions, generated columns, lossy casts, etc). The +// scheduler must force serialization so those txns do not slip past conflict +// detection. 
+func txnTouchesUnsupportedWritesetMapping(events []*binlogdatapb.VEvent, tablePlans map[string]*TablePlan) bool { + for _, event := range events { + if event.Type != binlogdatapb.VEventType_ROW || event.RowEvent == nil { + continue + } + plan := tablePlans[event.RowEvent.TableName] + if plan != nil && plan.HasUnsupportedWritesetMapping { + return true + } + } + return false +} + +// writesetKeysForChange extracts PK-based writeset keys from pre-decoded row +// values and inserts them directly into the caller's keySet map as uint64 hashes. +func writesetKeysForChange(plan *TablePlan, tableName string, beforeVals, afterVals []sqltypes.Value, keySet map[uint64]struct{}) error { + identityIndexes, err := writesetIdentityFieldIndexes(plan, tableName, nil) + if err != nil { + return err + } + return writesetKeysForChangeWithFieldIdx(plan, tableName, identityIndexes, beforeVals, afterVals, keySet) +} + +// writesetIdentityFieldIndexes resolves a plan's declared identity column +// names to positional indexes into the streamed fields. Multi-column +// identities go through this path; single-column identity plans use a +// simpler fast path elsewhere. Returns an error if any declared column is +// missing from the streamed fields so the caller can serialize the txn. 
+func writesetIdentityFieldIndexes(plan *TablePlan, tableName string, fieldIdx map[string]int) ([]int, error) { + if plan == nil || len(plan.IdentityColumns) <= 1 { + return nil, nil + } + if fieldIdx == nil { + fieldIdx = make(map[string]int, len(plan.Fields)) + for i, f := range plan.Fields { + if f == nil { + continue + } + fieldIdx[f.Name] = i + fieldIdx[strings.ToLower(f.Name)] = i + } + } + indexes := make([]int, 0, len(plan.IdentityColumns)) + for _, colName := range plan.IdentityColumns { + idx, ok := fieldIndexForName(fieldIdx, colName) + if !ok { + return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "writeset identity column %q not in streamed fields for %s", colName, tableName) + } + indexes = append(indexes, idx) + } + return indexes, nil +} + +// writesetUniqueKeyFieldIndexes resolves each hashable unique key's column +// names to positions in the streamed fields. Returns an error (which routes +// to serialization via writesetErrorForcesSerialization's "not in streamed +// fields" match) when a column is missing. 
+func writesetUniqueKeyFieldIndexes(plan *TablePlan, tableName string, fieldIdx map[string]int) ([][]int, error) { + if plan == nil || len(plan.UniqueKeyColumns) == 0 { + return nil, nil + } + if fieldIdx == nil { + fieldIdx = make(map[string]int, len(plan.Fields)) + for i, f := range plan.Fields { + if f == nil { + continue + } + fieldIdx[f.Name] = i + fieldIdx[strings.ToLower(f.Name)] = i + } + } + uniqueKeyIndexes := make([][]int, 0, len(plan.UniqueKeyColumns)) + for _, cols := range plan.UniqueKeyColumns { + indexes := make([]int, 0, len(cols)) + for _, colName := range cols { + idx, ok := fieldIndexForName(fieldIdx, colName) + if !ok { + return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "writeset unique key column %q not in streamed fields for %s", colName, tableName) + } + indexes = append(indexes, idx) + } + uniqueKeyIndexes = append(uniqueKeyIndexes, indexes) + } + return uniqueKeyIndexes, nil +} + +// writesetKeysForChangeWithFieldIdx is the indexed variant of +// writesetKeysForChange: it takes the plan's identity column positions +// pre-resolved (see writesetIdentityFieldIndexes) so multi-row txns do not +// re-resolve them per change. The keys it inserts into keySet are what the +// scheduler compares to decide which concurrent txns conflict. +func writesetKeysForChangeWithFieldIdx(plan *TablePlan, tableName string, identityIndexes []int, beforeVals, afterVals []sqltypes.Value, keySet map[uint64]struct{}) error { + if plan == nil { + return nil + } + if len(plan.PKIndices) == 0 { + // Fail closed: a plan with no identity must not silently contribute + // zero keys. In a txn that also touches keyed tables the writeset + // would be non-empty and this table's rows would race with no + // conflict tracking at all. The error routes the txn to the serial + // path (see writesetErrorForcesSerialization). 
+ return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "no usable writeset identity for %s", tableName) + } + appendKey := func(vals []sqltypes.Value) error { + if len(vals) == 0 { + return nil + } + var d xxhash.Digest + writesetDigestInit(&d, tableName) + hasPK := false + if len(identityIndexes) > 0 { + for _, idx := range identityIndexes { + if idx >= len(vals) { + return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "pk index out of range for %s", tableName) + } + hasPK = true + var field *querypb.Field + if idx < len(plan.Fields) { + field = plan.Fields[idx] + } + if err := writesetDigestAddFieldValue(&d, field, vals[idx]); err != nil { + return err + } + } + } else { + for i, isPK := range plan.PKIndices { + if !isPK { + continue + } + if i >= len(vals) { + return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "pk index out of range for %s", tableName) + } + hasPK = true + var field *querypb.Field + if i < len(plan.Fields) { + field = plan.Fields[i] + } + if err := writesetDigestAddFieldValue(&d, field, vals[i]); err != nil { + return err + } + } + } + if !hasPK { + // Fail closed, same as the empty-PKIndices case above. + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "no usable writeset identity for %s", tableName) + } + keySet[d.Sum64()] = struct{}{} + return nil + } + if err := appendKey(beforeVals); err != nil { + return err + } + if err := appendKey(afterVals); err != nil { + return err + } + return nil +} + +// queryFKRefs queries information_schema.KEY_COLUMN_USAGE to discover all +// foreign key constraints in the given database. It returns a map from +// child table name to a list of FK constraints. Each constraint includes +// the referenced (parent) table name and the child column names in ordinal +// order, so that writeset keys generated for child rows will match the +// parent table's PK-based writeset keys. 
+func queryFKRefs(dbClient *vdbClient, dbName string) (map[string][]fkConstraintRef, error) { + query := fmt.Sprintf( + "SELECT kcu.TABLE_NAME, kcu.CONSTRAINT_NAME, kcu.COLUMN_NAME, kcu.REFERENCED_TABLE_NAME, kcu.REFERENCED_COLUMN_NAME, "+ + "child_cols.DATA_TYPE, COALESCE(child_cols.CHARACTER_SET_NAME, ''), COALESCE(child_cols.COLLATION_NAME, ''), COALESCE(child_cols.COLUMN_TYPE, ''), "+ + "parent_cols.DATA_TYPE, COALESCE(parent_cols.CHARACTER_SET_NAME, ''), COALESCE(parent_cols.COLLATION_NAME, ''), COALESCE(parent_cols.COLUMN_TYPE, '') "+ + "FROM information_schema.KEY_COLUMN_USAGE kcu "+ + "JOIN information_schema.COLUMNS child_cols "+ + "ON child_cols.TABLE_SCHEMA = kcu.TABLE_SCHEMA AND child_cols.TABLE_NAME = kcu.TABLE_NAME AND child_cols.COLUMN_NAME = kcu.COLUMN_NAME "+ + "JOIN information_schema.COLUMNS parent_cols "+ + "ON parent_cols.TABLE_SCHEMA = kcu.TABLE_SCHEMA AND parent_cols.TABLE_NAME = kcu.REFERENCED_TABLE_NAME AND parent_cols.COLUMN_NAME = kcu.REFERENCED_COLUMN_NAME "+ + "WHERE kcu.TABLE_SCHEMA = %s AND kcu.REFERENCED_TABLE_NAME IS NOT NULL "+ + "ORDER BY kcu.TABLE_NAME, kcu.CONSTRAINT_NAME, kcu.ORDINAL_POSITION", + encodeString(dbName), + ) + qr, err := dbClient.ExecuteFetch(query, -1) + if err != nil { + return nil, vterrors.Wrapf(err, "queryFKRefs") + } + if len(qr.Rows) == 0 { + return nil, nil + } + + // Group by (childTable, constraintName) — each row is one column + // of a potentially multi-column FK. We group by constraint name + // rather than parent table because a child table can have multiple + // FK constraints referencing the same parent table with different + // column sets. + type constraintKey struct { + childTable string + constraintName string + } + type constraintEntry struct { + key constraintKey + parentTable string + cols []string // child column names in ordinal order + referencedCols []string // parent column names in ordinal order + } + + // Use a slice to preserve order; there are typically very few FK constraints. 
+ var constraints []constraintEntry + idx := map[constraintKey]int{} + + type fkColumnDigestMeta struct { + dataType string + charset string + collation string + columnType string + } + parseDigestMeta := func(offset int, row []sqltypes.Value) fkColumnDigestMeta { + return fkColumnDigestMeta{ + dataType: strings.ToLower(row[offset].ToString()), + charset: row[offset+1].ToString(), + collation: row[offset+2].ToString(), + columnType: strings.ToLower(row[offset+3].ToString()), + } + } + usesTextDigest := func(meta fkColumnDigestMeta) bool { + return meta.charset != "" || meta.collation != "" + } + columnsShareWritesetEncoding := func(child, parent fkColumnDigestMeta) bool { + if usesTextDigest(child) || usesTextDigest(parent) { + return usesTextDigest(child) && usesTextDigest(parent) && + child.charset == parent.charset && + child.collation == parent.collation + } + return child.columnType == parent.columnType + } + + for _, row := range qr.Rows { + childTable := row[0].ToString() + constraintName := row[1].ToString() + colName := row[2].ToString() + parentTable := row[3].ToString() + referencedColName := row[4].ToString() + childMeta := parseDigestMeta(5, row) + parentMeta := parseDigestMeta(9, row) + if !columnsShareWritesetEncoding(childMeta, parentMeta) { + return nil, vterrors.Errorf( + vtrpcpb.Code_FAILED_PRECONDITION, + "incompatible FK column definitions for %s.%s referencing %s.%s: child=%s/%s parent=%s/%s; align the definitions or disable parallel apply for this workflow", + childTable, + colName, + parentTable, + referencedColName, + childMeta.columnType, + childMeta.collation, + parentMeta.columnType, + parentMeta.collation, + ) + } + + k := constraintKey{childTable: childTable, constraintName: constraintName} + if i, ok := idx[k]; ok { + constraints[i].cols = append(constraints[i].cols, colName) + constraints[i].referencedCols = append(constraints[i].referencedCols, referencedColName) + } else { + idx[k] = len(constraints) + constraints = 
append(constraints, constraintEntry{ + key: k, + parentTable: parentTable, + cols: []string{colName}, + referencedCols: []string{referencedColName}, + }) + } + } + + result := make(map[string][]fkConstraintRef, len(constraints)) + for _, c := range constraints { + result[c.key.childTable] = append(result[c.key.childTable], fkConstraintRef{ + ParentTable: c.parentTable, + ChildColumnNames: c.cols, + ReferencedColumnNames: c.referencedCols, + }) + } + + return result, nil +} diff --git a/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_writeset_test.go b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_writeset_test.go new file mode 100644 index 00000000000..c683356e485 --- /dev/null +++ b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_writeset_test.go @@ -0,0 +1,1590 @@ +/* +Copyright 2026 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package vreplication + +import ( + "fmt" + "testing" + + "github.com/cespare/xxhash/v2" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "vitess.io/vitess/go/mysql/capabilities" + "vitess.io/vitess/go/mysql/collations" + "vitess.io/vitess/go/sqltypes" + "vitess.io/vitess/go/vt/binlog/binlogplayer" + "vitess.io/vitess/go/vt/sqlparser" + + binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" + querypb "vitess.io/vitess/go/vt/proto/query" + vttablet "vitess.io/vitess/go/vt/vttablet/common" +) + +// testWritesetHash mirrors production hash logic for test assertions. 
+func testWritesetHash(tableName string, vals ...sqltypes.Value) uint64 { + var d xxhash.Digest + writesetDigestInit(&d, tableName) + for _, v := range vals { + writesetDigestAddValue(&d, v) + } + return d.Sum64() +} + +func TestBuildTxnWritesetSinglePK(t *testing.T) { + plan := &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + row := &querypb.Row{Values: []byte("1"), Lengths: []int64{1}} + change := &binlogdatapb.RowChange{After: row} + rowEvent := &binlogdatapb.RowEvent{TableName: "t1", RowChanges: []*binlogdatapb.RowChange{change}} + vevent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, RowEvent: rowEvent} + + keys, err := buildTxnWriteset(map[string]*TablePlan{"t1": plan}, nil, nil, []*binlogdatapb.VEvent{vevent}) + require.NoError(t, err) + expected := testWritesetHash("t1", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("1"))) + require.Equal(t, []uint64{expected}, keys) +} + +func TestBuildTxnWritesetUsesBeforeAndAfter(t *testing.T) { + plan := &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + beforeRow := &querypb.Row{Values: []byte("1"), Lengths: []int64{1}} + afterRow := &querypb.Row{Values: []byte("2"), Lengths: []int64{1}} + change := &binlogdatapb.RowChange{Before: beforeRow, After: afterRow} + rowEvent := &binlogdatapb.RowEvent{TableName: "t1", RowChanges: []*binlogdatapb.RowChange{change}} + vevent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, RowEvent: rowEvent} + + keys, err := buildTxnWriteset(map[string]*TablePlan{"t1": plan}, nil, nil, []*binlogdatapb.VEvent{vevent}) + require.NoError(t, err) + require.Len(t, keys, 2) + h1 := testWritesetHash("t1", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("1"))) + h2 := testWritesetHash("t1", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("2"))) + assert.ElementsMatch(t, []uint64{h1, h2}, keys) +} + +func 
BenchmarkBuildTxnWriteset_NoFKRefsAvoidsPlanWideCanonicalization(b *testing.B) { + const tableCount = 256 + tablePlans := make(map[string]*TablePlan, tableCount) + for i := range tableCount { + name := fmt.Sprintf("t%d", i) + tablePlans[name] = &TablePlan{ + TargetName: name, + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + } + row := &querypb.Row{Values: []byte("1"), Lengths: []int64{1}} + events := []*binlogdatapb.VEvent{{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t0", + RowChanges: []*binlogdatapb.RowChange{{After: row}}, + }, + }} + + b.ReportAllocs() + for b.Loop() { + keys, err := buildTxnWriteset(tablePlans, nil, nil, events) + if err != nil { + b.Fatal(err) + } + if len(keys) != 1 { + b.Fatalf("unexpected key count: %d", len(keys)) + } + } +} + +func BenchmarkWritesetDigestAddFieldValue_TextAllocations(b *testing.B) { + collationID := uint32(collations.MySQL8().LookupByName("utf8mb4_general_ci")) + field := &querypb.Field{Name: "email", Type: querypb.Type_VARCHAR, Charset: collationID} + value := sqltypes.NewVarChar("user@example.com ") + + b.ReportAllocs() + for b.Loop() { + var d xxhash.Digest + writesetDigestInit(&d, "emails") + if err := writesetDigestAddFieldValue(&d, field, value); err != nil { + b.Fatal(err) + } + _ = d.Sum64() + } +} + +func TestBuildTxnWritesetRejectsPartialRowImageWithoutFKRefs(t *testing.T) { + plan := &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{ + {Name: "a", Type: querypb.Type_INT64}, + {Name: "id", Type: querypb.Type_INT64}, + {Name: "b", Type: querypb.Type_INT64}, + }, + PKIndices: []bool{false, true, false}, + } + change := &binlogdatapb.RowChange{ + After: &querypb.Row{Values: []byte("23"), Lengths: []int64{1, 1}}, + DataColumns: &binlogdatapb.RowChange_Bitmap{ + Count: 3, + Cols: []byte{0x06}, + }, + } + rowEvent := &binlogdatapb.RowEvent{TableName: "t1", RowChanges: []*binlogdatapb.RowChange{change}} + vevent := 
&binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, RowEvent: rowEvent} + + keys, err := buildTxnWriteset(map[string]*TablePlan{"t1": plan}, nil, nil, []*binlogdatapb.VEvent{vevent}) + require.Error(t, err) + require.Contains(t, err.Error(), "partial row image") + require.Nil(t, keys) + assert.NotEqual(t, []uint64{testWritesetHash("t1", sqltypes.NewInt64(3))}, keys) +} + +// TestWritesetDigestAddValueDistinguishesTypesAcrossByteBoundary pins the +// invariant that the writeset type discriminator distinguishes types whose +// values modulo-256 collide. querypb.Type is a 16-bit enum and the encoding +// MUST not silently lose the high byte — otherwise two rows with conflicting +// PK values but distinct types would hash to the same key, letting truly +// conflicting transactions run in parallel and corrupt downstream apply. +func TestWritesetDigestAddValueDistinguishesTypesAcrossByteBoundary(t *testing.T) { + // Two synthetic types whose low bytes are identical: 1 and 1+256. + // All current named querypb.Type values stay below the collision + // threshold, but the encoding must defend against future additions. + v1 := sqltypes.MakeTrusted(querypb.Type(1), []byte{0x42}) + v2 := sqltypes.MakeTrusted(querypb.Type(1+256), []byte{0x42}) + + var d1, d2 xxhash.Digest + writesetDigestInit(&d1, "t") + writesetDigestInit(&d2, "t") + writesetDigestAddValue(&d1, v1) + writesetDigestAddValue(&d2, v2) + + require.NotEqual(t, d1.Sum64(), d2.Sum64(), "writeset digest must distinguish types whose low byte collides") +} + +// TestBuildTxnWritesetRejectsSparseAfterImageOnRelevantPKColumn covers an +// AFTER image that carries a -1 (omitted) length in a PK column without +// publishing a DataColumns bitmap. Before the fix, only BEFORE images were +// scanned for negative relevant lengths, so this case fell through to +// MakeRowTrusted and silently hashed the PK as a NULL/zero value — making the +// row collide with any other row whose AFTER image was similarly sparse. 
+func TestBuildTxnWritesetRejectsSparseAfterImageOnRelevantPKColumn(t *testing.T) { + plan := &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + {Name: "name", Type: querypb.Type_VARCHAR}, + }, + PKIndices: []bool{true, false}, + } + // AFTER image omits the PK column (length=-1) but does not publish a + // DataColumns bitmap — only the "name" value is present. + change := &binlogdatapb.RowChange{ + After: &querypb.Row{Values: []byte("john"), Lengths: []int64{-1, 4}}, + } + rowEvent := &binlogdatapb.RowEvent{TableName: "t1", RowChanges: []*binlogdatapb.RowChange{change}} + vevent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, RowEvent: rowEvent} + + keys, err := buildTxnWriteset(map[string]*TablePlan{"t1": plan}, nil, nil, []*binlogdatapb.VEvent{vevent}) + require.Error(t, err) + require.Contains(t, err.Error(), "partial row image") + require.Nil(t, keys) +} + +// TestBuildTxnWritesetRejectsRowImageWithExtraLengths covers the case where the +// row image carries more length entries than the plan has fields. This can +// happen if the table plan cache is stale relative to a schema that dropped a +// column. The writeset builder must fail closed instead of indexing into +// plan.Fields out of bounds (which would nil-deref in MakeRowTrusted). +func TestBuildTxnWritesetRejectsRowImageWithExtraLengths(t *testing.T) { + plan := &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + }, + PKIndices: []bool{true}, + } + // Row has 2 length entries, but plan only knows 1 field. 
+ change := &binlogdatapb.RowChange{ + After: &querypb.Row{Values: []byte("12"), Lengths: []int64{1, 1}}, + } + rowEvent := &binlogdatapb.RowEvent{TableName: "t1", RowChanges: []*binlogdatapb.RowChange{change}} + vevent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, RowEvent: rowEvent} + + keys, err := buildTxnWriteset(map[string]*TablePlan{"t1": plan}, nil, nil, []*binlogdatapb.VEvent{vevent}) + require.Error(t, err) + require.Contains(t, err.Error(), "partial row image") + require.Nil(t, keys) +} + +func TestBuildTxnWritesetAllowsBeforeImageWithNullValue(t *testing.T) { + plan := &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + {Name: "nullable_col", Type: querypb.Type_VARCHAR}, + }, + PKIndices: []bool{true, false}, + } + change := &binlogdatapb.RowChange{ + Before: &querypb.Row{Values: []byte("1"), Lengths: []int64{1, -1}}, + } + rowEvent := &binlogdatapb.RowEvent{TableName: "t1", RowChanges: []*binlogdatapb.RowChange{change}} + vevent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, RowEvent: rowEvent} + + keys, err := buildTxnWriteset(map[string]*TablePlan{"t1": plan}, nil, nil, []*binlogdatapb.VEvent{vevent}) + require.NoError(t, err) + expected := testWritesetHash("t1", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("1"))) + require.Equal(t, []uint64{expected}, keys) +} + +func TestBuildTxnWritesetRejectsSparseBeforeImageOnRelevantFKColumn(t *testing.T) { + childPlan := &TablePlan{ + TargetName: "child", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + {Name: "parent_id", Type: querypb.Type_INT64}, + {Name: "val", Type: querypb.Type_VARCHAR}, + }, + PKIndices: []bool{true, false, false}, + } + fkRefs := map[string][]fkConstraintRef{ + "child": {{ParentTable: "parent", ChildColumnNames: []string{"parent_id"}, ReferencedColumnNames: []string{"id"}}}, + } + change := &binlogdatapb.RowChange{ + Before: &querypb.Row{ + Lengths: []int64{1, -1, 3}, + Values: 
[]byte("5aaa"), + }, + } + rowEvent := &binlogdatapb.RowEvent{TableName: "child", RowChanges: []*binlogdatapb.RowChange{change}} + vevent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, RowEvent: rowEvent} + + keys, err := buildTxnWriteset( + map[string]*TablePlan{"child": childPlan}, + fkRefs, + buildParentFKRefs(fkRefs), + []*binlogdatapb.VEvent{vevent}, + ) + require.Error(t, err) + require.Contains(t, err.Error(), "partial row image") + require.Nil(t, keys) +} + +func TestBuildTxnWritesetAllowsCaseOnlyFKColumnNameMismatch(t *testing.T) { + childPlan := &TablePlan{ + TargetName: "child", + Fields: []*querypb.Field{ + {Name: "ID", Type: querypb.Type_INT64}, + {Name: "PARENT_ID", Type: querypb.Type_INT64}, + }, + PKIndices: []bool{true, false}, + } + fkRefs := map[string][]fkConstraintRef{ + "child": {{ParentTable: "parent", ChildColumnNames: []string{"parent_id"}, ReferencedColumnNames: []string{"id"}}}, + } + change := &binlogdatapb.RowChange{ + After: &querypb.Row{Values: []byte("12"), Lengths: []int64{1, 1}}, + } + rowEvent := &binlogdatapb.RowEvent{TableName: "child", RowChanges: []*binlogdatapb.RowChange{change}} + vevent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, RowEvent: rowEvent} + + keys, err := buildTxnWriteset( + map[string]*TablePlan{"child": childPlan}, + fkRefs, + buildParentFKRefs(fkRefs), + []*binlogdatapb.VEvent{vevent}, + ) + require.NoError(t, err) + require.Len(t, keys, 2) +} + +func TestBuildTxnWritesetAllowsMixedCaseFKColumnNameMismatch(t *testing.T) { + childPlan := &TablePlan{ + TargetName: "child", + Fields: []*querypb.Field{ + {Name: "ID", Type: querypb.Type_INT64}, + {Name: "PARENT_ID", Type: querypb.Type_INT64}, + }, + PKIndices: []bool{true, false}, + } + fkRefs := map[string][]fkConstraintRef{ + "child": {{ParentTable: "parent", ChildColumnNames: []string{"Parent_ID"}, ReferencedColumnNames: []string{"ID"}}}, + } + change := &binlogdatapb.RowChange{ + After: &querypb.Row{Values: []byte("12"), Lengths: 
[]int64{1, 1}}, + } + rowEvent := &binlogdatapb.RowEvent{TableName: "child", RowChanges: []*binlogdatapb.RowChange{change}} + vevent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, RowEvent: rowEvent} + + keys, err := buildTxnWriteset( + map[string]*TablePlan{"child": childPlan}, + fkRefs, + buildParentFKRefs(fkRefs), + []*binlogdatapb.VEvent{vevent}, + ) + require.NoError(t, err) + require.Len(t, keys, 2) +} + +func TestBuildTxnWritesetAllowsFullRowImageWithNullValue(t *testing.T) { + plan := &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + {Name: "nullable_col", Type: querypb.Type_VARCHAR}, + }, + PKIndices: []bool{true, false}, + } + change := &binlogdatapb.RowChange{ + After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1, -1}}, + } + rowEvent := &binlogdatapb.RowEvent{TableName: "t1", RowChanges: []*binlogdatapb.RowChange{change}} + vevent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, RowEvent: rowEvent} + + keys, err := buildTxnWriteset(map[string]*TablePlan{"t1": plan}, nil, nil, []*binlogdatapb.VEvent{vevent}) + require.NoError(t, err) + expected := testWritesetHash("t1", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("1"))) + require.Equal(t, []uint64{expected}, keys) +} + +// TestBuildTxnWritesetNoPK pins that a table plan with no usable identity +// (no PK columns and no identity columns) fails closed instead of silently +// contributing zero keys. Silent no-keys would be a correctness hole: in a +// transaction that also touches keyed tables, the writeset would be +// non-empty, the scheduler would use writeset-only conflict detection, and +// this table's rows would race with no conflict tracking at all. +// buildColInfoMap's PK -> PK-equivalent -> all-columns fallback should make +// this unreachable for real tables, but the writeset builder must not rely +// on that staying true. 
+func TestBuildTxnWritesetNoPK(t *testing.T) { + plan := &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{false}, + } + row := &querypb.Row{Values: []byte("1"), Lengths: []int64{1}} + change := &binlogdatapb.RowChange{After: row} + rowEvent := &binlogdatapb.RowEvent{TableName: "t1", RowChanges: []*binlogdatapb.RowChange{change}} + vevent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, RowEvent: rowEvent} + + keys, err := buildTxnWriteset(map[string]*TablePlan{"t1": plan}, nil, nil, []*binlogdatapb.VEvent{vevent}) + require.Error(t, err) + require.Contains(t, err.Error(), "no usable writeset identity") + require.Nil(t, keys) + // The error must route the transaction to the serial path, not fail the + // workflow: over-serialization is safe, a bricked workflow is not. + require.True(t, writesetErrorForcesSerialization(err)) +} + +func TestBuildTxnWritesetFailsClosedWithoutUsableIdentity(t *testing.T) { + plan := &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + IdentityColumns: []string{"id"}, + } + row := &querypb.Row{Values: []byte("1"), Lengths: []int64{1}} + change := &binlogdatapb.RowChange{After: row} + rowEvent := &binlogdatapb.RowEvent{TableName: "t1", RowChanges: []*binlogdatapb.RowChange{change}} + vevent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, RowEvent: rowEvent} + + keys, err := buildTxnWriteset(map[string]*TablePlan{"t1": plan}, nil, nil, []*binlogdatapb.VEvent{vevent}) + require.Error(t, err) + require.Contains(t, err.Error(), "no usable writeset identity") + require.Nil(t, keys) + require.True(t, writesetErrorForcesSerialization(err), "missing identity must serialize the txn, not fail the workflow") +} + +func TestWritesetKeysForChangeMissingPlan(t *testing.T) { + keySet := map[uint64]struct{}{} + err := writesetKeysForChange(nil, "t1", nil, nil, keySet) + require.NoError(t, err) + require.Empty(t, 
keySet) +} + +func TestWritesetKeysForChangeMultiplePK(t *testing.T) { + plan := &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + {Name: "name", Type: querypb.Type_VARCHAR}, + }, + PKIndices: []bool{true, true}, + } + row := &querypb.Row{Values: []byte("1foo"), Lengths: []int64{1, 3}} + afterVals := sqltypes.MakeRowTrusted(plan.Fields, row) + keySet := map[uint64]struct{}{} + err := writesetKeysForChange(plan, "t1", nil, afterVals, keySet) + require.NoError(t, err) + require.Len(t, keySet, 1) + expected := testWritesetHash("t1", + sqltypes.MakeTrusted(querypb.Type_INT64, []byte("1")), + sqltypes.MakeTrusted(querypb.Type_VARCHAR, []byte("foo")), + ) + _, ok := keySet[expected] + require.True(t, ok) +} + +func TestWritesetKeysForChangeCompositeBinaryPKValuesDoNotAlias(t *testing.T) { + plan := &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{ + {Name: "id1", Type: querypb.Type_VARBINARY}, + {Name: "id2", Type: querypb.Type_VARBINARY}, + }, + PKIndices: []bool{true, true}, + } + valueType := querypb.Type_VARBINARY + typeByte := byte(valueType) + firstTuple := []sqltypes.Value{ + sqltypes.MakeTrusted(querypb.Type_VARBINARY, []byte{'a'}), + sqltypes.MakeTrusted(querypb.Type_VARBINARY, []byte{'x', ',', typeByte, 'y'}), + } + secondTuple := []sqltypes.Value{ + sqltypes.MakeTrusted(querypb.Type_VARBINARY, []byte{'a', ',', typeByte, 'x'}), + sqltypes.MakeTrusted(querypb.Type_VARBINARY, []byte{'y'}), + } + keySet := map[uint64]struct{}{} + + require.NoError(t, writesetKeysForChange(plan, "t1", nil, firstTuple, keySet)) + require.NoError(t, writesetKeysForChange(plan, "t1", nil, secondTuple, keySet)) + require.Len(t, keySet, 2) +} + +func TestWritesetKeysForChangeUsesMakeRowTrusted(t *testing.T) { + plan := &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + row := &querypb.Row{Values: []byte("1"), Lengths: []int64{1}} + afterVals 
:= sqltypes.MakeRowTrusted(plan.Fields, row) + keySet := map[uint64]struct{}{} + err := writesetKeysForChange(plan, "t1", nil, afterVals, keySet) + require.NoError(t, err) + require.Len(t, keySet, 1) + expected := testWritesetHash("t1", sqltypes.MakeRowTrusted(plan.Fields, row)[0]) + _, ok := keySet[expected] + require.True(t, ok) +} + +type stubDBClient struct { + result *sqltypes.Result + err error +} + +func (s *stubDBClient) DBName() string { return "db" } +func (s *stubDBClient) Connect() error { return nil } +func (s *stubDBClient) Begin() error { return nil } +func (s *stubDBClient) Commit() error { return nil } +func (s *stubDBClient) Rollback() error { return nil } +func (s *stubDBClient) Close() {} +func (s *stubDBClient) IsClosed() bool { return false } +func (s *stubDBClient) ExecuteFetch(query string, maxrows int) (*sqltypes.Result, error) { + if s.err != nil { + return nil, s.err + } + return s.result, nil +} + +func (s *stubDBClient) ExecuteFetchMulti(query string, maxrows int) ([]*sqltypes.Result, error) { + if s.err != nil { + return nil, s.err + } + return []*sqltypes.Result{s.result}, nil +} + +func (s *stubDBClient) SupportsCapability(capability capabilities.FlavorCapability) (bool, error) { + return false, nil +} + +func TestWritesetKeysForChangePKOutOfRange(t *testing.T) { + plan := &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "other", Type: querypb.Type_INT64}}, + PKIndices: []bool{true, true}, + } + row := &querypb.Row{Values: []byte("1"), Lengths: []int64{1}} + afterVals := sqltypes.MakeRowTrusted(plan.Fields[:1], row) + keySet := map[uint64]struct{}{} + err := writesetKeysForChange(plan, "t1", nil, afterVals, keySet) + require.Error(t, err) +} + +func TestQueryFKRefs(t *testing.T) { + stats := binlogplayer.NewStats() + stats.VReplicationLagGauges.Stop() + t.Cleanup(stats.Stop) + + qr := sqltypes.MakeTestResult( + sqltypes.MakeTestFields( + 
"TABLE_NAME|CONSTRAINT_NAME|COLUMN_NAME|REFERENCED_TABLE_NAME|REFERENCED_COLUMN_NAME|CHILD_DATA_TYPE|CHILD_CHARACTER_SET_NAME|CHILD_COLLATION_NAME|CHILD_COLUMN_TYPE|PARENT_DATA_TYPE|PARENT_CHARACTER_SET_NAME|PARENT_COLLATION_NAME|PARENT_COLUMN_TYPE", + "varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar", + ), + "child|fk_child_parent|parent_id|parent|id|int|||int|int|||int", + "child|fk_child_parent|parent_id2|parent|id2|int|||int|int|||int", + "other|fk_other_parent|parent_id|parent|id|int|||int|int|||int", + ) + client := newVDBClient(&stubDBClient{result: qr}, stats, 100) + refs, err := queryFKRefs(client, "db") + require.NoError(t, err) + require.Len(t, refs, 2) + require.Len(t, refs["child"], 1) + require.Equal(t, "parent", refs["child"][0].ParentTable) + require.Equal(t, []string{"parent_id", "parent_id2"}, refs["child"][0].ChildColumnNames) + require.Equal(t, []string{"id", "id2"}, refs["child"][0].ReferencedColumnNames) +} + +func TestQueryFKRefsError(t *testing.T) { + stats := binlogplayer.NewStats() + stats.VReplicationLagGauges.Stop() + t.Cleanup(stats.Stop) + + client := newVDBClient(&stubDBClient{err: assert.AnError}, stats, 100) + refs, err := queryFKRefs(client, "db") + require.Error(t, err) + require.Nil(t, refs) +} + +type maxRowsAssertingDBClient struct { + result *sqltypes.Result + err error + assertQuery func(query string) + assertRows func(maxrows int) error +} + +func (m *maxRowsAssertingDBClient) DBName() string { return "db" } +func (m *maxRowsAssertingDBClient) Connect() error { return nil } +func (m *maxRowsAssertingDBClient) Begin() error { return nil } +func (m *maxRowsAssertingDBClient) Commit() error { return nil } +func (m *maxRowsAssertingDBClient) Rollback() error { return nil } +func (m *maxRowsAssertingDBClient) Close() {} +func (m *maxRowsAssertingDBClient) IsClosed() bool { return false } +func (m *maxRowsAssertingDBClient) ExecuteFetch(query string, maxrows int) 
(*sqltypes.Result, error) { + if m.assertQuery != nil { + m.assertQuery(query) + } + if m.assertRows != nil { + if err := m.assertRows(maxrows); err != nil { + return nil, err + } + } + if m.err != nil { + return nil, m.err + } + return m.result, nil +} + +func (m *maxRowsAssertingDBClient) ExecuteFetchMulti(query string, maxrows int) ([]*sqltypes.Result, error) { + qr, err := m.ExecuteFetch(query, maxrows) + if err != nil { + return nil, err + } + return []*sqltypes.Result{qr}, nil +} + +func (m *maxRowsAssertingDBClient) SupportsCapability(capability capabilities.FlavorCapability) (bool, error) { + return false, nil +} + +func TestQueryFKRefsFetchesAllRows(t *testing.T) { + stats := binlogplayer.NewStats() + stats.VReplicationLagGauges.Stop() + t.Cleanup(stats.Stop) + + qr := sqltypes.MakeTestResult( + sqltypes.MakeTestFields( + "TABLE_NAME|CONSTRAINT_NAME|COLUMN_NAME|REFERENCED_TABLE_NAME|REFERENCED_COLUMN_NAME|CHILD_DATA_TYPE|CHILD_CHARACTER_SET_NAME|CHILD_COLLATION_NAME|CHILD_COLUMN_TYPE|PARENT_DATA_TYPE|PARENT_CHARACTER_SET_NAME|PARENT_COLLATION_NAME|PARENT_COLUMN_TYPE", + "varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar", + ), + "child|fk_child_parent|parent_id|parent|id|int|||int|int|||int", + ) + client := newVDBClient(&maxRowsAssertingDBClient{ + result: qr, + assertQuery: func(query string) { + require.Contains(t, query, "JOIN information_schema.COLUMNS child_cols") + require.Contains(t, query, "JOIN information_schema.COLUMNS parent_cols") + require.NotContains(t, query, "FROM information_schema.COLUMNS WHERE TABLE_SCHEMA") + }, + assertRows: func(maxrows int) error { + if maxrows != -1 { + return fmt.Errorf("expected fetch-all maxrows, got %d", maxrows) + } + return nil + }, + }, stats, 100) + + refs, err := queryFKRefs(client, "db") + require.NoError(t, err) + require.Len(t, refs["child"], 1) + require.Equal(t, "parent", refs["child"][0].ParentTable) + require.Equal(t, []string{"parent_id"}, 
refs["child"][0].ChildColumnNames) + require.Equal(t, []string{"id"}, refs["child"][0].ReferencedColumnNames) +} + +func TestQueryFKRefsRejectsHashIncompatibleFKColumnDefinitions(t *testing.T) { + stats := binlogplayer.NewStats() + stats.VReplicationLagGauges.Stop() + t.Cleanup(stats.Stop) + + qr := sqltypes.MakeTestResult( + sqltypes.MakeTestFields( + "TABLE_NAME|CONSTRAINT_NAME|COLUMN_NAME|REFERENCED_TABLE_NAME|REFERENCED_COLUMN_NAME|CHILD_DATA_TYPE|CHILD_CHARACTER_SET_NAME|CHILD_COLLATION_NAME|CHILD_COLUMN_TYPE|PARENT_DATA_TYPE|PARENT_CHARACTER_SET_NAME|PARENT_COLLATION_NAME|PARENT_COLUMN_TYPE", + "varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar", + ), + "child|fk_child_parent|parent_id|parent|id|int|||int|bigint|||bigint", + ) + + client := newVDBClient(&stubDBClient{result: qr}, stats, 100) + refs, err := queryFKRefs(client, "db") + require.Error(t, err) + require.ErrorContains(t, err, "incompatible FK column definitions") + require.Nil(t, refs) +} + +func TestQueryFKRefsAllowsCompatibleCharacterFKColumns(t *testing.T) { + stats := binlogplayer.NewStats() + stats.VReplicationLagGauges.Stop() + t.Cleanup(stats.Stop) + + qr := sqltypes.MakeTestResult( + sqltypes.MakeTestFields( + "TABLE_NAME|CONSTRAINT_NAME|COLUMN_NAME|REFERENCED_TABLE_NAME|REFERENCED_COLUMN_NAME|CHILD_DATA_TYPE|CHILD_CHARACTER_SET_NAME|CHILD_COLLATION_NAME|CHILD_COLUMN_TYPE|PARENT_DATA_TYPE|PARENT_CHARACTER_SET_NAME|PARENT_COLLATION_NAME|PARENT_COLUMN_TYPE", + "varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar", + ), + "child|fk_child_parent|parent_code|parent|code|varchar|utf8mb4|utf8mb4_0900_ai_ci|varchar(64)|char|utf8mb4|utf8mb4_0900_ai_ci|char(32)", + ) + + client := newVDBClient(&stubDBClient{result: qr}, stats, 100) + refs, err := queryFKRefs(client, "db") + require.NoError(t, err) + require.Len(t, refs["child"], 1) + require.Equal(t, []string{"parent_code"}, 
refs["child"][0].ChildColumnNames) + require.Equal(t, []string{"code"}, refs["child"][0].ReferencedColumnNames) +} + +func TestBuildTxnWritesetMissingTablePlan(t *testing.T) { + rowEvent := &binlogdatapb.RowEvent{ + TableName: "missing", + RowChanges: []*binlogdatapb.RowChange{{ + After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}, + }}, + } + vevent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, RowEvent: rowEvent} + + keys, err := buildTxnWriteset(map[string]*TablePlan{}, nil, nil, []*binlogdatapb.VEvent{vevent}) + require.Error(t, err) + require.Nil(t, keys) +} + +func TestBuildTxnWritesetNoRows(t *testing.T) { + vevent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_BEGIN} + keys, err := buildTxnWriteset(map[string]*TablePlan{}, nil, nil, []*binlogdatapb.VEvent{vevent}) + require.NoError(t, err) + require.Nil(t, keys) +} + +func TestWritesetKeysForFKRefMissingColumn(t *testing.T) { + ref := &fkConstraintRef{ParentTable: "parent", ChildColumnNames: []string{"missing"}, ReferencedColumnNames: []string{"id"}} + fieldIdx := map[string]int{"id": 0} + vals := []sqltypes.Value{sqltypes.NewInt64(1)} + keySet := map[uint64]struct{}{} + // When an FK column is missing from the streamed fields, the function + // should return an error (fail closed) instead of silently dropping the edge. 
+	err := writesetKeysForFKRef(ref, []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, fieldIdx, nil, vals, keySet)
+	require.Error(t, err)
+	require.Contains(t, err.Error(), "not in streamed fields")
+	require.Empty(t, keySet)
+}
+
+func TestWritesetKeysForFKRef(t *testing.T) {
+	// Child table has columns: id (PK), parent_id (FK -> parent.id)
+	childPlan := &TablePlan{
+		TargetName: "child",
+		Fields: []*querypb.Field{
+			{Name: "id", Type: querypb.Type_INT64},
+			{Name: "parent_id", Type: querypb.Type_INT64},
+		},
+		PKIndices: []bool{true, false},
+	}
+	ref := &fkConstraintRef{
+		ParentTable:      "parent",
+		ChildColumnNames: []string{"parent_id"},
+	}
+	// child row: id=5, parent_id=42
+	row := &querypb.Row{Values: []byte("542"), Lengths: []int64{1, 2}}
+	afterVals := sqltypes.MakeRowTrusted(childPlan.Fields, row)
+	// Build fieldIdx once per table, as buildTxnWriteset now does.
+	fieldIdx := make(map[string]int, len(childPlan.Fields))
+	for i, f := range childPlan.Fields {
+		fieldIdx[f.Name] = i
+	}
+	keySet := map[uint64]struct{}{}
+	require.NoError(t, writesetKeysForFKRef(ref, childPlan.Fields, fieldIdx, nil, afterVals, keySet)) // assert the returned error so a failure is reported here, not as a bare keySet length mismatch below
+	require.Len(t, keySet, 1)
+	expected := testWritesetHash("parent", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("42")))
+	_, ok := keySet[expected]
+	require.True(t, ok)
+}
+
+func TestBuildTxnWritesetWithFKRefs(t *testing.T) {
+	// Parent table: parent(id PK)
+	parentPlan := &TablePlan{
+		TargetName: "parent",
+		Fields:     []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+		PKIndices:  []bool{true},
+	}
+	// Child table: child(id PK, parent_id FK -> parent.id)
+	childPlan := &TablePlan{
+		TargetName: "child",
+		Fields: []*querypb.Field{
+			{Name: "id", Type: querypb.Type_INT64},
+			{Name: "parent_id", Type: querypb.Type_INT64},
+		},
+		PKIndices: []bool{true, false},
+	}
+	fkRefs := map[string][]fkConstraintRef{
+		"child": {
+			{ParentTable: "parent", ChildColumnNames: []string{"parent_id"}, ReferencedColumnNames: []string{"id"}},
+		},
+	}
+
parentRefs := buildParentFKRefs(fkRefs) + tablePlans := map[string]*TablePlan{ + "parent": parentPlan, + "child": childPlan, + } + + // Parent insert: id=42 + parentRow := &querypb.Row{Values: []byte("42"), Lengths: []int64{2}} + parentChange := &binlogdatapb.RowChange{After: parentRow} + parentEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{TableName: "parent", RowChanges: []*binlogdatapb.RowChange{parentChange}}, + } + + // Child insert: id=5, parent_id=42 + childRow := &querypb.Row{Values: []byte("542"), Lengths: []int64{1, 2}} + childChange := &binlogdatapb.RowChange{After: childRow} + childEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{TableName: "child", RowChanges: []*binlogdatapb.RowChange{childChange}}, + } + + // Build writeset for parent txn + parentKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{parentEvent}) + require.NoError(t, err) + parentHash := testWritesetHash("parent", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("42"))) + require.Equal(t, []uint64{parentHash}, parentKeys) + + // Build writeset for child txn — should have both child PK hash and parent FK ref hash + childKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{childEvent}) + require.NoError(t, err) + require.Len(t, childKeys, 2) + childPKHash := testWritesetHash("child", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("5"))) + assert.ElementsMatch(t, []uint64{childPKHash, parentHash}, childKeys) + + // The parent hash appears in both writesets — this creates a conflict + // that forces serialization, preventing FK constraint violations. 
+ parentKeySet := map[uint64]struct{}{} + for _, k := range parentKeys { + parentKeySet[k] = struct{}{} + } + conflict := false + for _, k := range childKeys { + if _, ok := parentKeySet[k]; ok { + conflict = true + break + } + } + require.True(t, conflict, "parent and child writesets should conflict on parent hash") +} + +func TestBuildTxnWritesetWithCompositeParentFKRefsUsesIdentityColumnOrder(t *testing.T) { + parentPlan := &TablePlan{ + TargetName: "parent", + Fields: []*querypb.Field{{Name: "b", Type: querypb.Type_INT64}, {Name: "a", Type: querypb.Type_INT64}}, + IdentityColumns: []string{"a", "b"}, + PKIndices: []bool{true, true}, + } + childPlan := &TablePlan{ + TargetName: "child", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + {Name: "parent_a", Type: querypb.Type_INT64}, + {Name: "parent_b", Type: querypb.Type_INT64}, + }, + PKIndices: []bool{true, false, false}, + } + fkRefs := map[string][]fkConstraintRef{ + "child": { + {ParentTable: "parent", ChildColumnNames: []string{"parent_a", "parent_b"}, ReferencedColumnNames: []string{"a", "b"}}, + }, + } + parentRefs := buildParentFKRefs(fkRefs) + tablePlans := map[string]*TablePlan{ + "parent": parentPlan, + "child": childPlan, + } + + parentEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{TableName: "parent", RowChanges: []*binlogdatapb.RowChange{{ + After: &querypb.Row{Values: []byte("12"), Lengths: []int64{1, 1}}, + }}}, + } + childEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{TableName: "child", RowChanges: []*binlogdatapb.RowChange{{ + After: &querypb.Row{Values: []byte("921"), Lengths: []int64{1, 1, 1}}, + }}}, + } + + parentKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{parentEvent}) + require.NoError(t, err) + parentHash := testWritesetHash( + "parent", + sqltypes.MakeTrusted(querypb.Type_INT64, []byte("2")), + 
sqltypes.MakeTrusted(querypb.Type_INT64, []byte("1")), + ) + require.Equal(t, []uint64{parentHash}, parentKeys) + + childKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{childEvent}) + require.NoError(t, err) + require.Len(t, childKeys, 2) + childPKHash := testWritesetHash("child", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("9"))) + assert.ElementsMatch(t, []uint64{childPKHash, parentHash}, childKeys) + + parentKeySet := map[uint64]struct{}{parentHash: {}} + conflict := false + for _, k := range childKeys { + if _, ok := parentKeySet[k]; ok { + conflict = true + break + } + } + require.True(t, conflict, "parent and child writesets should conflict on the parent identity hash") +} + +func TestBuildTxnWritesetWithRenamedTableFKRefsUsesTargetTableNames(t *testing.T) { + parentPlan := &TablePlan{ + TargetName: "parent", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + childPlan := &TablePlan{ + TargetName: "child", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + {Name: "parent_id", Type: querypb.Type_INT64}, + }, + PKIndices: []bool{true, false}, + } + fkRefs := map[string][]fkConstraintRef{ + "child": { + {ParentTable: "parent", ChildColumnNames: []string{"parent_id"}, ReferencedColumnNames: []string{"id"}}, + }, + } + parentRefs := buildParentFKRefs(fkRefs) + tablePlans := map[string]*TablePlan{ + "parent_src": parentPlan, + "child_src": childPlan, + } + + parentRow := &querypb.Row{Values: []byte("42"), Lengths: []int64{2}} + parentEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "parent_src", + RowChanges: []*binlogdatapb.RowChange{{After: parentRow}}, + }, + } + childRow := &querypb.Row{Values: []byte("542"), Lengths: []int64{1, 2}} + childEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "child_src", + RowChanges: 
[]*binlogdatapb.RowChange{{After: childRow}}, + }, + } + + parentKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{parentEvent}) + require.NoError(t, err) + parentHash := testWritesetHash("parent", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("42"))) + require.Equal(t, []uint64{parentHash}, parentKeys) + + childKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{childEvent}) + require.NoError(t, err) + require.Len(t, childKeys, 2) + childPKHash := testWritesetHash("child", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("5"))) + assert.ElementsMatch(t, []uint64{childPKHash, parentHash}, childKeys) + + parentKeySet := map[uint64]struct{}{} + for _, k := range parentKeys { + parentKeySet[k] = struct{}{} + } + conflict := false + for _, k := range childKeys { + if _, ok := parentKeySet[k]; ok { + conflict = true + break + } + } + require.True(t, conflict, "renamed parent and child writesets should still conflict on target parent hash") +} + +func TestBuildTxnWritesetWithMixedCaseFKRefsUsesTargetTableNames(t *testing.T) { + parentPlan := &TablePlan{ + TargetName: "Parent", + Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, + PKIndices: []bool{true}, + } + childPlan := &TablePlan{ + TargetName: "Child", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + {Name: "parent_id", Type: querypb.Type_INT64}, + }, + PKIndices: []bool{true, false}, + } + fkRefs := map[string][]fkConstraintRef{ + "child": { + {ParentTable: "parent", ChildColumnNames: []string{"parent_id"}, ReferencedColumnNames: []string{"id"}}, + }, + } + parentRefs := buildParentFKRefs(fkRefs) + tablePlans := map[string]*TablePlan{ + "parent_src": parentPlan, + "child_src": childPlan, + } + + parentRow := &querypb.Row{Values: []byte("42"), Lengths: []int64{2}} + parentEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "parent_src", + 
RowChanges: []*binlogdatapb.RowChange{{After: parentRow}}, + }, + } + childRow := &querypb.Row{Values: []byte("542"), Lengths: []int64{1, 2}} + childEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "child_src", + RowChanges: []*binlogdatapb.RowChange{{After: childRow}}, + }, + } + + parentKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{parentEvent}) + require.NoError(t, err) + parentHash := testWritesetHash("Parent", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("42"))) + require.Equal(t, []uint64{parentHash}, parentKeys) + + childKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{childEvent}) + require.NoError(t, err) + require.Len(t, childKeys, 2) + childPKHash := testWritesetHash("Child", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("5"))) + assert.ElementsMatch(t, []uint64{childPKHash, parentHash}, childKeys) + + parentKeySet := map[uint64]struct{}{parentHash: {}} + conflict := false + for _, k := range childKeys { + if _, ok := parentKeySet[k]; ok { + conflict = true + break + } + } + require.True(t, conflict, "mixed-case FK metadata should still conflict on the target parent hash") +} + +func TestBuildTxnWritesetTextPrimaryKeyUsesCollationEquality(t *testing.T) { + collationID := uint32(collations.MySQL8().LookupByName("utf8mb4_0900_ai_ci")) + require.NotZero(t, collationID) + + plan := &TablePlan{ + TargetName: "emails", + Fields: []*querypb.Field{{ + Name: "email", + Type: querypb.Type_VARCHAR, + Charset: collationID, + }}, + PKIndices: []bool{true}, + } + + upperEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{TableName: "emails", RowChanges: []*binlogdatapb.RowChange{{ + After: &querypb.Row{Values: []byte("A"), Lengths: []int64{1}}, + }}}, + } + lowerEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: 
&binlogdatapb.RowEvent{TableName: "emails", RowChanges: []*binlogdatapb.RowChange{{ + After: &querypb.Row{Values: []byte("a"), Lengths: []int64{1}}, + }}}, + } + + upperKeys, err := buildTxnWriteset(map[string]*TablePlan{"emails": plan}, nil, nil, []*binlogdatapb.VEvent{upperEvent}) + require.NoError(t, err) + lowerKeys, err := buildTxnWriteset(map[string]*TablePlan{"emails": plan}, nil, nil, []*binlogdatapb.VEvent{lowerEvent}) + require.NoError(t, err) + require.Equal(t, upperKeys, lowerKeys, "text primary keys that compare equal under MySQL collation rules must hash identically") +} + +func TestBuildTxnWritesetPadSpaceTextPrimaryKeyUsesTrailingSpaceEquality(t *testing.T) { + collationID := uint32(collations.MySQL8().LookupByName("utf8mb4_general_ci")) + require.NotZero(t, collationID) + + plan := &TablePlan{ + TargetName: "emails", + Fields: []*querypb.Field{{ + Name: "email", + Type: querypb.Type_VARCHAR, + Charset: collationID, + }}, + PKIndices: []bool{true}, + } + + trimmedEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{TableName: "emails", RowChanges: []*binlogdatapb.RowChange{{ + After: &querypb.Row{Values: []byte("a"), Lengths: []int64{1}}, + }}}, + } + spacedEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{TableName: "emails", RowChanges: []*binlogdatapb.RowChange{{ + After: &querypb.Row{Values: []byte("a "), Lengths: []int64{2}}, + }}}, + } + + trimmedKeys, err := buildTxnWriteset(map[string]*TablePlan{"emails": plan}, nil, nil, []*binlogdatapb.VEvent{trimmedEvent}) + require.NoError(t, err) + spacedKeys, err := buildTxnWriteset(map[string]*TablePlan{"emails": plan}, nil, nil, []*binlogdatapb.VEvent{spacedEvent}) + require.NoError(t, err) + require.Equal(t, trimmedKeys, spacedKeys, "text primary keys that compare equal under PAD SPACE collation rules must hash identically") +} + +func 
TestBuildTxnWritesetWithStringFKRefsUsesCollationEqualityAcrossCompatibleTypes(t *testing.T) { + collationID := uint32(collations.MySQL8().LookupByName("utf8mb4_0900_ai_ci")) + require.NotZero(t, collationID) + + parentPlan := &TablePlan{ + TargetName: "parent", + Fields: []*querypb.Field{{ + Name: "email", + Type: querypb.Type_CHAR, + Charset: collationID, + }}, + PKIndices: []bool{true}, + } + childPlan := &TablePlan{ + TargetName: "child", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + {Name: "parent_email", Type: querypb.Type_VARCHAR, Charset: collationID}, + }, + PKIndices: []bool{true, false}, + } + fkRefs := map[string][]fkConstraintRef{ + "child": {{ParentTable: "parent", ChildColumnNames: []string{"parent_email"}, ReferencedColumnNames: []string{"email"}}}, + } + parentRefs := buildParentFKRefs(fkRefs) + tablePlans := map[string]*TablePlan{ + "parent": parentPlan, + "child": childPlan, + } + + parentEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{TableName: "parent", RowChanges: []*binlogdatapb.RowChange{{ + After: &querypb.Row{Values: []byte("A"), Lengths: []int64{1}}, + }}}, + } + childEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{TableName: "child", RowChanges: []*binlogdatapb.RowChange{{ + After: &querypb.Row{Values: []byte("1a"), Lengths: []int64{1, 1}}, + }}}, + } + + parentKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{parentEvent}) + require.NoError(t, err) + childKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{childEvent}) + require.NoError(t, err) + + parentKeySet := map[uint64]struct{}{} + for _, k := range parentKeys { + parentKeySet[k] = struct{}{} + } + conflict := false + for _, k := range childKeys { + if _, ok := parentKeySet[k]; ok { + conflict = true + break + } + } + require.True(t, conflict, "compatible string FK values that 
compare equal under MySQL collation rules must conflict") +} + +func TestBuildTxnWritesetWithPadSpaceStringFKRefsUsesTrailingSpaceEqualityAcrossCompatibleTypes(t *testing.T) { + collationID := uint32(collations.MySQL8().LookupByName("utf8mb4_general_ci")) + require.NotZero(t, collationID) + + parentPlan := &TablePlan{ + TargetName: "parent", + Fields: []*querypb.Field{{ + Name: "email", + Type: querypb.Type_CHAR, + Charset: collationID, + }}, + PKIndices: []bool{true}, + } + childPlan := &TablePlan{ + TargetName: "child", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + {Name: "parent_email", Type: querypb.Type_VARCHAR, Charset: collationID}, + }, + PKIndices: []bool{true, false}, + } + fkRefs := map[string][]fkConstraintRef{ + "child": {{ParentTable: "parent", ChildColumnNames: []string{"parent_email"}, ReferencedColumnNames: []string{"email"}}}, + } + parentRefs := buildParentFKRefs(fkRefs) + tablePlans := map[string]*TablePlan{ + "parent": parentPlan, + "child": childPlan, + } + + parentEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{TableName: "parent", RowChanges: []*binlogdatapb.RowChange{{ + After: &querypb.Row{Values: []byte("A"), Lengths: []int64{1}}, + }}}, + } + childEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{TableName: "child", RowChanges: []*binlogdatapb.RowChange{{ + After: &querypb.Row{Values: []byte("1a "), Lengths: []int64{1, 2}}, + }}}, + } + + parentKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{parentEvent}) + require.NoError(t, err) + childKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{childEvent}) + require.NoError(t, err) + + parentKeySet := map[uint64]struct{}{} + for _, k := range parentKeys { + parentKeySet[k] = struct{}{} + } + conflict := false + for _, k := range childKeys { + if _, ok := parentKeySet[k]; ok { + conflict = 
true + break + } + } + require.True(t, conflict, "compatible PAD SPACE string FK values that compare equal under MySQL rules must conflict") +} + +func TestBuildTxnWritesetExpressionPlanIsMarkedUnsupported(t *testing.T) { + vttablet.InitVReplicationConfigDefaults() + vr := &vreplicator{workflowConfig: vttablet.DefaultVReplicationConfig} + plan, err := vr.buildReplicatorPlan( + getSource(&binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{ + Match: "t1", + Filter: "select a + b as c1, c as c2 from t1", + }}}), + map[string][]*ColumnInfo{"t1": {{Name: "c1", IsPK: true}, {Name: "c2"}}}, + nil, + binlogplayer.NewStats(), + collations.MySQL8(), + sqlparser.NewTestParser(), + ) + require.NoError(t, err) + + tplan, err := plan.buildExecutionPlan(&binlogdatapb.FieldEvent{ + TableName: "t1", + Fields: []*querypb.Field{ + {Name: "a", Type: querypb.Type_INT64}, + {Name: "b", Type: querypb.Type_INT64}, + {Name: "c", Type: querypb.Type_INT64}, + }, + }) + require.NoError(t, err) + assert.True(t, tplan.HasUnsupportedWritesetMapping) +} + +func TestBuildTxnWritesetAliasedFKColumnPlanIsMarkedUnsupported(t *testing.T) { + vttablet.InitVReplicationConfigDefaults() + vr := &vreplicator{workflowConfig: vttablet.DefaultVReplicationConfig} + plan, err := vr.buildReplicatorPlan( + getSource(&binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{ + Match: "child", + Filter: "select id, parent_id as pid from child", + }}}), + map[string][]*ColumnInfo{"child": {{Name: "id", IsPK: true}, {Name: "pid"}}}, + nil, + binlogplayer.NewStats(), + collations.MySQL8(), + sqlparser.NewTestParser(), + ) + require.NoError(t, err) + + tplan, err := plan.buildExecutionPlan(&binlogdatapb.FieldEvent{ + TableName: "child", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + {Name: "parent_id", Type: querypb.Type_INT64}, + }, + }) + require.NoError(t, err) + assert.True(t, tplan.HasUnsupportedWritesetMapping) +} + +func TestBuildTxnWritesetMatchingAliasExpressionPlanIsMarkedUnsupported(t 
*testing.T) { + vttablet.InitVReplicationConfigDefaults() + vr := &vreplicator{workflowConfig: vttablet.DefaultVReplicationConfig} + plan, err := vr.buildReplicatorPlan( + getSource(&binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{ + Match: "t1", + Filter: "select lower(email) as email from t1", + }}}), + map[string][]*ColumnInfo{"t1": {{Name: "email", IsPK: true}}}, + nil, + binlogplayer.NewStats(), + collations.MySQL8(), + sqlparser.NewTestParser(), + ) + require.NoError(t, err) + + tplan, err := plan.buildExecutionPlan(&binlogdatapb.FieldEvent{ + TableName: "t1", + Fields: []*querypb.Field{ + {Name: "email", Type: querypb.Type_VARCHAR}, + }, + }) + require.NoError(t, err) + assert.True(t, tplan.HasUnsupportedWritesetMapping) +} + +func TestBuildTxnWritesetBacktickedDirectColumnPlanStaysSupported(t *testing.T) { + vttablet.InitVReplicationConfigDefaults() + vr := &vreplicator{workflowConfig: vttablet.DefaultVReplicationConfig} + plan, err := vr.buildReplicatorPlan( + getSource(&binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{ + Match: "t1", + Filter: "select id, email from t1", + }}}), + map[string][]*ColumnInfo{"t1": {{Name: "id", IsPK: true}, {Name: "email"}}}, + nil, + binlogplayer.NewStats(), + collations.MySQL8(), + sqlparser.NewTestParser(), + ) + require.NoError(t, err) + + tplan, err := plan.buildExecutionPlan(&binlogdatapb.FieldEvent{ + TableName: "t1", + Fields: []*querypb.Field{ + {Name: "`id`", Type: querypb.Type_INT64}, + {Name: "`email`", Type: querypb.Type_VARCHAR}, + }, + }) + require.NoError(t, err) + assert.False(t, tplan.HasUnsupportedWritesetMapping) + require.Len(t, tplan.Fields, 2) + assert.Equal(t, "id", tplan.Fields[0].Name) + assert.Equal(t, "email", tplan.Fields[1].Name) +} + +// keySetsIntersect reports whether two writeset key slices share any key. 
+func keySetsIntersect(a, b []uint64) bool { + set := make(map[uint64]struct{}, len(a)) + for _, k := range a { + set[k] = struct{}{} + } + for _, k := range b { + if _, ok := set[k]; ok { + return true + } + } + return false +} + +// uniqueKeyRowEvent builds a single-change ROW event for an (id, email) table. +func uniqueKeyRowEvent(id, email string) *binlogdatapb.VEvent { + values := append([]byte(id), []byte(email)...) + row := &querypb.Row{Values: values, Lengths: []int64{int64(len(id)), int64(len(email))}} + return &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: row}}, + }, + } +} + +// uniqueKeyPlan builds an (id PK, email) table plan with a hashable unique +// secondary on email. +func uniqueKeyPlan() *TablePlan { + return &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + {Name: "email", Type: querypb.Type_VARCHAR}, + }, + PKIndices: []bool{true, false}, + IdentityColumns: []string{"id"}, + UniqueKeyColumns: [][]string{{"email"}}, + } +} + +// TestBuildTxnWritesetUniqueKeySameValueDifferentIdentityConflicts pins the +// core MySQL-WRITESET behavior: two changes on DIFFERENT identities but the +// SAME unique secondary value must produce intersecting writesets (so they +// serialize), while different unique values must stay disjoint. +func TestBuildTxnWritesetUniqueKeySameValueDifferentIdentityConflicts(t *testing.T) { + plan := uniqueKeyPlan() + plans := map[string]*TablePlan{"t1": plan} + + // id=1 and id=2 both claim email "a@x". 
+ sameValueA, err := buildTxnWriteset(plans, nil, nil, []*binlogdatapb.VEvent{uniqueKeyRowEvent("1", "a@x")}) + require.NoError(t, err) + sameValueB, err := buildTxnWriteset(plans, nil, nil, []*binlogdatapb.VEvent{uniqueKeyRowEvent("2", "a@x")}) + require.NoError(t, err) + require.True(t, keySetsIntersect(sameValueA, sameValueB), + "changes on different identities sharing a unique value must conflict") + + // id=2 with a different email "b@x" must not conflict with id=1/"a@x". + differentValue, err := buildTxnWriteset(plans, nil, nil, []*binlogdatapb.VEvent{uniqueKeyRowEvent("2", "b@x")}) + require.NoError(t, err) + require.False(t, keySetsIntersect(sameValueA, differentValue), + "changes with different unique values must not conflict") +} + +// TestBuildTxnWritesetUniqueKeyUpdateEmitsBothImages pins that an UPDATE moving +// a unique value emits keys for BOTH the before holder and the after holder, so +// it conflicts with both the txn freeing the old value and the txn claiming the +// new one. +func TestBuildTxnWritesetUniqueKeyUpdateEmitsBothImages(t *testing.T) { + plan := uniqueKeyPlan() + plans := map[string]*TablePlan{"t1": plan} + + // UPDATE id=1 moving email from "old@x" to "new@x". + beforeRow := &querypb.Row{Values: []byte("1old@x"), Lengths: []int64{1, 5}} + afterRow := &querypb.Row{Values: []byte("1new@x"), Lengths: []int64{1, 5}} + updateEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{Before: beforeRow, After: afterRow}}, + }, + } + updateKeys, err := buildTxnWriteset(plans, nil, nil, []*binlogdatapb.VEvent{updateEvent}) + require.NoError(t, err) + + // A concurrent txn claiming the freed "old@x" value (different identity). 
+ oldHolder, err := buildTxnWriteset(plans, nil, nil, []*binlogdatapb.VEvent{uniqueKeyRowEvent("7", "old@x")}) + require.NoError(t, err) + // A concurrent txn that already holds the "new@x" value (different identity). + newHolder, err := buildTxnWriteset(plans, nil, nil, []*binlogdatapb.VEvent{uniqueKeyRowEvent("8", "new@x")}) + require.NoError(t, err) + + require.True(t, keySetsIntersect(updateKeys, oldHolder), + "the UPDATE must conflict with a txn claiming the freed before-image value") + require.True(t, keySetsIntersect(updateKeys, newHolder), + "the UPDATE must conflict with a txn holding the after-image value") +} + +// TestBuildTxnWritesetUniqueKeyNullEmitsNoKey pins that a NULL unique value +// emits no unique-key key (two NULL rows do not conflict, since MySQL unique +// indexes permit multiple NULLs) while the PK key is still emitted. +func TestBuildTxnWritesetUniqueKeyNullEmitsNoKey(t *testing.T) { + plan := uniqueKeyPlan() + plans := map[string]*TablePlan{"t1": plan} + + // id=1 with NULL email, id=2 with NULL email: -1 length encodes NULL. + nullRowA := &querypb.Row{Values: []byte("1"), Lengths: []int64{1, -1}} + nullRowB := &querypb.Row{Values: []byte("2"), Lengths: []int64{1, -1}} + eventA := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: nullRowA}}, + }, + } + eventB := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: nullRowB}}, + }, + } + + keysA, err := buildTxnWriteset(plans, nil, nil, []*binlogdatapb.VEvent{eventA}) + require.NoError(t, err) + keysB, err := buildTxnWriteset(plans, nil, nil, []*binlogdatapb.VEvent{eventB}) + require.NoError(t, err) + + // Only the PK key is emitted (one key each), and the two NULL-email rows + // on different identities do not conflict. 
+ require.Len(t, keysA, 1, "NULL unique value must emit no unique-key key, only the PK key") + require.Len(t, keysB, 1) + require.False(t, keySetsIntersect(keysA, keysB), + "two NULL unique values on different identities must not conflict") + + // Sanity: the single emitted key is the PK key. + pkKeyA := testWritesetHash("t1", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("1"))) + require.Equal(t, []uint64{pkKeyA}, keysA) +} + +// TestBuildTxnWritesetUniqueKeyCaseInsensitiveCollationConflicts pins that two +// unique values differing only by case under a case-insensitive collation hash +// to the same unique key and therefore conflict. +func TestBuildTxnWritesetUniqueKeyCaseInsensitiveCollationConflicts(t *testing.T) { + collationID := uint32(collations.MySQL8().LookupByName("utf8mb4_general_ci")) + require.NotZero(t, collationID) + + plan := &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + {Name: "email", Type: querypb.Type_VARCHAR, Charset: collationID}, + }, + PKIndices: []bool{true, false}, + IdentityColumns: []string{"id"}, + UniqueKeyColumns: [][]string{{"email"}}, + } + plans := map[string]*TablePlan{"t1": plan} + + // Different identities, unique values "A@X" vs "a@x". + upperKeys, err := buildTxnWriteset(plans, nil, nil, []*binlogdatapb.VEvent{uniqueKeyRowEvent("1", "A@X")}) + require.NoError(t, err) + lowerKeys, err := buildTxnWriteset(plans, nil, nil, []*binlogdatapb.VEvent{uniqueKeyRowEvent("2", "a@x")}) + require.NoError(t, err) + + require.True(t, keySetsIntersect(upperKeys, lowerKeys), + "unique values equal under a case-insensitive collation must hash to the same unique key") +} + +// TestBuildTxnWritesetUniqueKeyColumnMissingForcesSerialization pins that a +// unique-key column absent from the streamed fields produces a "not in streamed +// fields" error that routes the txn to the serial path. 
+func TestBuildTxnWritesetUniqueKeyColumnMissingForcesSerialization(t *testing.T) { + plan := &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + {Name: "email", Type: querypb.Type_VARCHAR}, + }, + PKIndices: []bool{true, false}, + IdentityColumns: []string{"id"}, + // The unique key references a column the stream never sends. + UniqueKeyColumns: [][]string{{"missing_col"}}, + } + + _, err := buildTxnWriteset(map[string]*TablePlan{"t1": plan}, nil, nil, []*binlogdatapb.VEvent{uniqueKeyRowEvent("1", "a@x")}) + require.Error(t, err) + require.Contains(t, err.Error(), "not in streamed fields") + require.True(t, writesetErrorForcesSerialization(err), + "a missing unique-key column must route the txn to the serial path") +} + +// TestBuildTxnWritesetUniqueKeyOrdinalDiscriminatesIndexes pins that two +// different unique indexes with coincidentally equal values produce distinct +// keys: the index ordinal is folded into the digest, so equal values on +// different indexes do not over-serialize by colliding. +func TestBuildTxnWritesetUniqueKeyOrdinalDiscriminatesIndexes(t *testing.T) { + // Two single-column unique secondaries (a, b), both INT64. A row with + // a == b would, without the ordinal discriminator, hash both unique keys + // to the same value. + plan := &TablePlan{ + TargetName: "t1", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + {Name: "a", Type: querypb.Type_INT64}, + {Name: "b", Type: querypb.Type_INT64}, + }, + PKIndices: []bool{true, false, false}, + IdentityColumns: []string{"id"}, + UniqueKeyColumns: [][]string{{"a"}, {"b"}}, + } + + // id=1, a=7, b=7. 
+ row := &querypb.Row{Values: []byte("177"), Lengths: []int64{1, 1, 1}} + event := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{After: row}}, + }, + } + + keys, err := buildTxnWriteset(map[string]*TablePlan{"t1": plan}, nil, nil, []*binlogdatapb.VEvent{event}) + require.NoError(t, err) + // PK key + two distinct unique-key keys = 3 keys, none colliding despite + // a == b. + require.Len(t, keys, 3, "equal values on different unique indexes must produce distinct keys") +} + +// TestBuildTxnWritesetFKStreamedMetadataMismatchSerializes pins the +// fail-closed path for target-only FKs whose SOURCE column metadata +// diverges: queryFKRefs validates the TARGET schema, but the digests hash +// the streamed (FIELD-event) metadata, so a child column streamed as INT64 +// referencing a parent column streamed as VARCHAR would hash equal logical +// values to different keys and let the child/parent transactions reorder. +// Such transactions must serialize instead. +func TestBuildTxnWritesetFKStreamedMetadataMismatchSerializes(t *testing.T) { + collationID := uint32(collations.MySQL8().LookupByName("utf8mb4_general_ci")) + childPlan := &TablePlan{ + TargetName: "child", + Fields: []*querypb.Field{ + {Name: "id", Type: querypb.Type_INT64}, + {Name: "parent_id", Type: querypb.Type_INT64}, + }, + PKIndices: []bool{true, false}, + } + parentPlan := &TablePlan{ + TargetName: "parent", + Fields: []*querypb.Field{ + // The parent's referenced column streams as text: hash-incompatible + // with the child's INT64. 
+ {Name: "id", Type: querypb.Type_VARCHAR, Charset: collationID}, + }, + PKIndices: []bool{true}, + } + tablePlans := map[string]*TablePlan{"child": childPlan, "parent": parentPlan} + fkRefs := map[string][]fkConstraintRef{ + "child": {{ParentTable: "parent", ChildColumnNames: []string{"parent_id"}, ReferencedColumnNames: []string{"id"}}}, + } + + row := &querypb.Row{Values: []byte("142"), Lengths: []int64{1, 2}} + change := &binlogdatapb.RowChange{After: row} + rowEvent := &binlogdatapb.RowEvent{TableName: "child", RowChanges: []*binlogdatapb.RowChange{change}} + vevent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, RowEvent: rowEvent} + + keys, err := buildTxnWriteset(tablePlans, fkRefs, buildParentFKRefs(fkRefs), []*binlogdatapb.VEvent{vevent}) + require.Error(t, err) + require.Contains(t, err.Error(), "streamed field metadata mismatch") + require.Nil(t, keys) + require.True(t, writesetErrorForcesSerialization(err), "metadata mismatch must serialize the txn, not fail the workflow") +} diff --git a/go/vt/vttablet/tabletmanager/vreplication/relaylog_test.go b/go/vt/vttablet/tabletmanager/vreplication/relaylog_test.go new file mode 100644 index 00000000000..af0aaf0b1af --- /dev/null +++ b/go/vt/vttablet/tabletmanager/vreplication/relaylog_test.go @@ -0,0 +1,132 @@ +/* +Copyright 2026 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package vreplication + +import ( + "context" + "io" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" + querypb "vitess.io/vitess/go/vt/proto/query" +) + +func TestRelayLogSendFetch(t *testing.T) { + ctx := t.Context() + rl := newRelayLog(ctx, 5, 10) + + event := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{ + After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}, + }}, + }, + } + + require.NoError(t, rl.Send([]*binlogdatapb.VEvent{event})) + + items, err := rl.Fetch() + require.NoError(t, err) + require.Len(t, items, 1) + require.Len(t, items[0], 1) + assert.Equal(t, binlogdatapb.VEventType_ROW, items[0][0].Type) +} + +func TestRelayLogSendTimeout(t *testing.T) { + ctx := t.Context() + oldDeadline := vplayerProgressDeadline + vplayerProgressDeadline = 100 * time.Millisecond + t.Cleanup(func() { + vplayerProgressDeadline = oldDeadline + }) + + rl := newRelayLog(ctx, 1, 1) + + event := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{{ + After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}, + }}, + }, + } + + require.NoError(t, rl.Send([]*binlogdatapb.VEvent{event})) + + errCh := make(chan error, 1) + go func() { + errCh <- rl.Send([]*binlogdatapb.VEvent{event}) + }() + + select { + case err := <-errCh: + require.Error(t, err) + assert.ErrorContains(t, err, relayLogIOStalledMsg) + case <-time.After(5 * time.Second): + t.Fatal("timed out waiting for send") + } +} + +func TestRelayLogFetchTimeout(t *testing.T) { + ctx := t.Context() + oldIdle := idleTimeout + idleTimeout = 100 * time.Millisecond + t.Cleanup(func() { + idleTimeout = oldIdle + }) + + rl := newRelayLog(ctx, 1, 1) + + items, err := rl.Fetch() + 
require.NoError(t, err) + assert.Len(t, items, 0) +} + +func TestRelayLogDoneReturnsEOF(t *testing.T) { + ctx, cancel := context.WithCancel(t.Context()) + cancel() + + rl := newRelayLog(ctx, 1, 1) + + items, err := rl.Fetch() + assert.ErrorIs(t, err, io.EOF) + assert.Nil(t, items) +} + +func TestRelayLogEventsSize(t *testing.T) { + rowEvent := &binlogdatapb.VEvent{ + Type: binlogdatapb.VEventType_ROW, + RowEvent: &binlogdatapb.RowEvent{ + TableName: "t1", + RowChanges: []*binlogdatapb.RowChange{ + {Before: &querypb.Row{Values: []byte("ab"), Lengths: []int64{2}}}, + {After: &querypb.Row{Values: []byte("cde"), Lengths: []int64{3}}}, + }, + }, + } + otherEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT} + + size := eventsSize([]*binlogdatapb.VEvent{rowEvent, otherEvent}) + assert.Equal(t, 5, size) +} diff --git a/go/vt/vttablet/tabletmanager/vreplication/replicator_plan.go b/go/vt/vttablet/tabletmanager/vreplication/replicator_plan.go index c6d0675b94c..1fe7a3deaac 100644 --- a/go/vt/vttablet/tabletmanager/vreplication/replicator_plan.go +++ b/go/vt/vttablet/tabletmanager/vreplication/replicator_plan.go @@ -23,6 +23,7 @@ import ( "slices" "sort" "strings" + "sync" "vitess.io/vitess/go/bytes2" "vitess.io/vitess/go/mysql/collations" @@ -88,6 +89,7 @@ func (rp *ReplicatorPlan) buildExecutionPlan(fieldEvent *binlogdatapb.FieldEvent trimmed.Name = strings.Trim(trimmed.Name, "`") tplanv.Fields = append(tplanv.Fields, trimmed) } + tplanv.HasUnsupportedWritesetMapping = hasUnsupportedWritesetMapping(&tplanv, tplanv.Fields) return &tplanv, nil } // select * construct was used. We need to use the field names. @@ -99,6 +101,35 @@ func (rp *ReplicatorPlan) buildExecutionPlan(fieldEvent *binlogdatapb.FieldEvent return tplan, nil } +// hasUnsupportedWritesetMapping reports whether the plan's source→target +// column mapping is something the parallel applier's writeset hasher +// cannot reason about safely. 
Plans that rewrite, project, or reorder +// columns produce hash inputs that do not correspond 1:1 with the row +// image bytes, so the scheduler falls back to serialization rather +// than compute a misleading writeset that could miss conflicts. +func hasUnsupportedWritesetMapping(plan *TablePlan, streamedFields []*querypb.Field) bool { + if plan == nil || len(streamedFields) == 0 || len(plan.PKIndices) == 0 { + return false + } + if len(streamedFields) != len(plan.PKIndices) { + return true + } + for i, field := range streamedFields { + if field == nil || i >= len(plan.TablePlanBuilder.colExprs) { + return true + } + cexpr := plan.TablePlanBuilder.colExprs[i] + if cexpr == nil || !cexpr.colName.Equal(sqlparser.NewIdentifierCI(field.Name)) { + return true + } + sourceCol, ok := cexpr.expr.(*sqlparser.ColName) + if !ok || !sourceCol.Name.Equal(sqlparser.NewIdentifierCI(field.Name)) || !sourceCol.Qualifier.IsEmpty() { + return true + } + } + return false +} + // buildFromFields builds a full TablePlan, but uses the field info as the // full column list. This happens when the query used was a 'select *', which // requires us to wait for the field info sent by the source. @@ -210,17 +241,35 @@ type TablePlan struct { // PKReferences is used to check if an event changed // a primary key column (row move). PKReferences []string + // IdentityColumns stores the chosen replication identity columns in key order. + IdentityColumns []string // PKIndices is an array, length = #columns, true if column is part of the PK - PKIndices []bool - Stats *binlogplayer.Stats - FieldsToSkip map[string]bool - ConvertCharset map[string](*binlogdatapb.CharsetConversion) - HasExtraSourcePkColumns bool + PKIndices []bool + // HasExtraUniqueSecondary means the table has uniqueness the writeset + // hasher cannot reason about (prefix/expression unique indexes, PK/identity + // mismatch); transactions touching it force-serialize. 
+ HasExtraUniqueSecondary bool + // UniqueKeyColumns holds, per hashable unique secondary index, the ordered + // column names whose values get extra writeset keys (MySQL-WRITESET-style) + // so cross-row unique-value conflicts serialize against each other. + UniqueKeyColumns [][]string + // HasUnsupportedWritesetMapping means the streamed FIELD layout cannot be + // mapped positionally back to target PK/FK columns for safe writeset hashing. + HasUnsupportedWritesetMapping bool + Stats *binlogplayer.Stats + FieldsToSkip map[string]bool + ConvertCharset map[string](*binlogdatapb.CharsetConversion) + HasExtraSourcePkColumns bool TablePlanBuilder *tablePlanBuilder // PartialInserts is a dynamically generated cache of insert ParsedQueries, which update only some columns. // This is when we use a binlog_row_image which is not "full". The key is a serialized bitmap of data columns // which are sent as part of the RowEvent. + // partialMu protects PartialInserts and PartialUpdates from concurrent + // access when multiple parallel-apply workers process partial-row-image + // events for the same table simultaneously. Pointer to avoid copying + // the lock when TablePlan values are cloned in buildExecutionPlan. + partialMu *sync.Mutex PartialInserts map[string]*sqlparser.ParsedQuery // PartialUpdates are same as PartialInserts, but for update statements PartialUpdates map[string]*sqlparser.ParsedQuery @@ -852,6 +901,11 @@ func (tp *TablePlan) applyBulkDeleteChanges(rowDeletes []*binlogdatapb.RowChange baseQuerySize := int64(len(tp.MultiDelete.Query)) querySize := baseQuerySize + // lastQR captures the most recent successful flush. The oversized-row + // edge case below can leave pkVals empty at the end of the loop, and + // we must not call execQuery on an empty buffer (it would build an + // invalid "IN ()" clause). The final check returns lastQR in that case. 
+ var lastQR *sqltypes.Result execQuery := func(pkVals *[]sqltypes.Value) (*sqltypes.Result, error) { pksBV, err := sqltypes.BuildBindVariable(*pkVals) @@ -863,7 +917,12 @@ func (tp *TablePlan) applyBulkDeleteChanges(rowDeletes []*binlogdatapb.RowChange return nil, err } tp.TablePlanBuilder.stats.BulkQueryCount.Add("delete", 1) - return executor(query) + qr, err := executor(query) + if err != nil { + return nil, err + } + lastQR = qr + return qr, nil } pkIndex := -1 @@ -880,6 +939,20 @@ func (tp *TablePlan) applyBulkDeleteChanges(rowDeletes []*binlogdatapb.RowChange } addedSize := int64(len(vals[pkIndex].Raw()) + 2) // Plus 2 for the comma and space if querySize+addedSize > maxQuerySize { + // Edge case: a single PK value is large enough to exceed the + // query size budget on its own (pkVals is still empty). Flush + // it as a one-row query, slightly exceeding maxQuerySize, rather + // than flushing an empty pkVals and producing an invalid empty + // "IN ()" clause. + if len(pkVals) == 0 { + pkVals = append(pkVals, vals[pkIndex]) + if _, err := execQuery(&pkVals); err != nil { + return nil, err + } + pkVals = nil + querySize = baseQuerySize + continue + } if _, err := execQuery(&pkVals); err != nil { return nil, err } @@ -890,6 +963,16 @@ func (tp *TablePlan) applyBulkDeleteChanges(rowDeletes []*binlogdatapb.RowChange querySize += addedSize } + // If pkVals is empty here, every row in this batch was flushed solo via + // the oversized-row edge case above. Return the last successful result + // instead of calling execQuery on an empty buffer (which would produce + // an invalid empty "IN ()" clause). + if len(pkVals) == 0 { + if lastQR != nil { + return lastQR, nil + } + return &sqltypes.Result{}, nil + } return execQuery(&pkVals) } @@ -913,12 +996,23 @@ func (tp *TablePlan) applyBulkInsertChanges(rowInserts []*binlogdatapb.RowChange maxQuerySize -= int64(len(insertPrefix)) values := &strings.Builder{} + // lastQR captures the most recent successful flush. 
The oversized-row + // edge case below can leave the values buffer empty at the end of the + // loop, and we must not call execQuery on an empty buffer (it would + // build an invalid INSERT with no VALUES). The final check returns + // lastQR in that case. + var lastQR *sqltypes.Result execQuery := func(vals *strings.Builder) (*sqltypes.Result, error) { if tp.BulkInsertOnDup != nil { vals.WriteString(tp.BulkInsertOnDup.Query) } tp.TablePlanBuilder.stats.BulkQueryCount.Add("insert", 1) - return executor(insertPrefix + vals.String()) + qr, err := executor(insertPrefix + vals.String()) + if err != nil { + return nil, err + } + lastQR = qr + return qr, nil } limit := tp.maxRowJSONBytes() @@ -962,7 +1056,19 @@ func (tp *TablePlan) applyBulkInsertChanges(rowInserts []*binlogdatapb.RowChange if err := tp.BulkInsertValues.Append(rowValues, bindvars, nil); err != nil { return nil, err } - if !newStmt && int64(values.Len()+2+rowValues.Len()) > maxQuerySize { // Plus 2 for the comma and space + if int64(values.Len()+2+rowValues.Len()) > maxQuerySize { // Plus 2 for the comma and space + // Edge case: a single row's VALUES clause is large enough to + // exceed the query size budget on its own (values buffer is + // still empty). Flush it as a one-row INSERT, slightly exceeding + // maxQuerySize, rather than flushing an empty VALUES buffer and + // producing an invalid INSERT with no VALUES. + if values.Len() == 0 { + if _, err := execQuery(rowValues); err != nil { + return nil, err + } + newStmt = true + continue + } if _, err := execQuery(values); err != nil { return nil, err } @@ -976,6 +1082,16 @@ func (tp *TablePlan) applyBulkInsertChanges(rowInserts []*binlogdatapb.RowChange newStmt = false } + // If the values buffer is empty here, every row in this batch was flushed + // solo via the oversized-row edge case above. Return the last successful + // result instead of calling execQuery on an empty buffer (which would + // produce an INSERT with no VALUES). 
+ if values.Len() == 0 { + if lastQR != nil { + return lastQR, nil + } + return &sqltypes.Result{}, nil + } return execQuery(values) } diff --git a/go/vt/vttablet/tabletmanager/vreplication/replicator_plan_test.go b/go/vt/vttablet/tabletmanager/vreplication/replicator_plan_test.go index 609d6851ced..c5823c96190 100644 --- a/go/vt/vttablet/tabletmanager/vreplication/replicator_plan_test.go +++ b/go/vt/vttablet/tabletmanager/vreplication/replicator_plan_test.go @@ -807,6 +807,30 @@ func TestBuildPlayerPlanNoDup(t *testing.T) { assert.ErrorContainsf(t, err, want, "buildReplicatorPlan err: %v, must contain: %v", err, want) } +func TestBuildPlayerPlanInsertIgnorePreservesPKIndices(t *testing.T) { + vttablet.InitVReplicationConfigDefaults() + vr := &vreplicator{workflowConfig: vttablet.DefaultVReplicationConfig} + plan, err := vr.buildReplicatorPlan( + getSource(&binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{ + Match: "t1", + Filter: "select id, c2 from t1 group by id, c2", + }}}), + map[string][]*ColumnInfo{"t1": {{Name: "id", IsPK: true}, {Name: "c2"}}}, + nil, + binlogplayer.NewStats(), + collations.MySQL8(), + sqlparser.NewTestParser(), + ) + require.NoError(t, err) + + tplan := plan.TablePlans["t1"] + require.NotNil(t, tplan) + require.Equal(t, []string{"id"}, tplan.IdentityColumns) + require.Equal(t, []bool{true, false}, tplan.PKIndices) + require.NotNil(t, tplan.Insert) + require.NotNil(t, tplan.Update) +} + func TestBuildPlayerPlanExclude(t *testing.T) { PrimaryKeyInfos := map[string][]*ColumnInfo{ "t1": {&ColumnInfo{Name: "c1"}}, diff --git a/go/vt/vttablet/tabletmanager/vreplication/table_plan_builder.go b/go/vt/vttablet/tabletmanager/vreplication/table_plan_builder.go index 7ad2177fe5e..24b8857789c 100644 --- a/go/vt/vttablet/tabletmanager/vreplication/table_plan_builder.go +++ b/go/vt/vttablet/tabletmanager/vreplication/table_plan_builder.go @@ -22,6 +22,7 @@ import ( "regexp" "sort" "strings" + "sync" "vitess.io/vitess/go/mysql/collations" 
"vitess.io/vitess/go/sqltypes" @@ -354,6 +355,16 @@ func (tpb *tablePlanBuilder) generate() *TablePlan { pkrefs = append(pkrefs, k) } sort.Strings(pkrefs) + identityCols := make([]string, 0, len(tpb.pkCols)) + for _, pkCol := range tpb.pkCols { + identityCols = append(identityCols, pkCol.colName.Lowered()) + } + tpb.pkIndices = make([]bool, len(tpb.colExprs)) + for i, cexpr := range tpb.colExprs { + if cexpr.isPK { + tpb.pkIndices[i] = true + } + } bvf := &bindvarFormatter{} @@ -374,11 +385,13 @@ func (tpb *tablePlanBuilder) generate() *TablePlan { Delete: tpb.generateDeleteStatement(), MultiDelete: tpb.generateMultiDeleteStatement(), PKReferences: pkrefs, + IdentityColumns: identityCols, PKIndices: tpb.pkIndices, Stats: tpb.stats, FieldsToSkip: fieldsToSkip, HasExtraSourcePkColumns: len(tpb.extraSourcePkCols) > 0, TablePlanBuilder: tpb, + partialMu: &sync.Mutex{}, PartialInserts: make(map[string]*sqlparser.ParsedQuery, 0), PartialUpdates: make(map[string]*sqlparser.ParsedQuery, 0), CollationEnv: tpb.collationEnv, @@ -811,11 +824,7 @@ func (tpb *tablePlanBuilder) generateUpdateStatement() *sqlparser.ParsedQuery { buf := sqlparser.NewTrackedBuffer(bvf.formatter) buf.Myprintf("update %v set ", tpb.name) separator := "" - tpb.pkIndices = make([]bool, len(tpb.colExprs)) - for i, cexpr := range tpb.colExprs { - if cexpr.isPK { - tpb.pkIndices[i] = true - } + for _, cexpr := range tpb.colExprs { if cexpr.isGrouped || cexpr.isPK || cexpr.isGenerated { continue } diff --git a/go/vt/vttablet/tabletmanager/vreplication/table_plan_partial.go b/go/vt/vttablet/tabletmanager/vreplication/table_plan_partial.go index 3f401192fdf..645b00b63cb 100644 --- a/go/vt/vttablet/tabletmanager/vreplication/table_plan_partial.go +++ b/go/vt/vttablet/tabletmanager/vreplication/table_plan_partial.go @@ -178,6 +178,8 @@ func (tpb *tablePlanBuilder) createPartialUpdateQuery(dataColumns *binlogdatapb. 
func (tp *TablePlan) getPartialInsertQuery(dataColumns *binlogdatapb.RowChange_Bitmap) (*sqlparser.ParsedQuery, error) { key := hex.EncodeToString(dataColumns.Cols) + tp.partialMu.Lock() + defer tp.partialMu.Unlock() ins, ok := tp.PartialInserts[key] if ok { return ins, nil @@ -193,6 +195,8 @@ func (tp *TablePlan) getPartialInsertQuery(dataColumns *binlogdatapb.RowChange_B func (tp *TablePlan) getPartialUpdateQuery(dataColumns *binlogdatapb.RowChange_Bitmap) (*sqlparser.ParsedQuery, error) { key := hex.EncodeToString(dataColumns.Cols) + tp.partialMu.Lock() + defer tp.partialMu.Unlock() upd, ok := tp.PartialUpdates[key] if ok { return upd, nil diff --git a/go/vt/vttablet/tabletmanager/vreplication/vcopier.go b/go/vt/vttablet/tabletmanager/vreplication/vcopier.go index d42fb349c80..2d58de024f5 100644 --- a/go/vt/vttablet/tabletmanager/vreplication/vcopier.go +++ b/go/vt/vttablet/tabletmanager/vreplication/vcopier.go @@ -425,7 +425,7 @@ func (vc *vcopier) copyTable(ctx context.Context, tableName string, copyState ma var prevCh <-chan *vcopierCopyTaskResult vstreamOptions := &binlogdatapb.VStreamOptions{ - ConfigOverrides: vc.vr.workflowConfig.Overrides, + ConfigOverrides: vc.vr.workflowConfig.SourceOverrides(), } serr := vc.vr.sourceVStreamer.VStreamRows(ctx, initialPlan.SendRule.Filter, lastpkpb, func(rows *binlogdatapb.VStreamRowsResponse) error { for { diff --git a/go/vt/vttablet/tabletmanager/vreplication/vcopier_atomic.go b/go/vt/vttablet/tabletmanager/vreplication/vcopier_atomic.go index 382dab60b67..16a531af9d0 100644 --- a/go/vt/vttablet/tabletmanager/vreplication/vcopier_atomic.go +++ b/go/vt/vttablet/tabletmanager/vreplication/vcopier_atomic.go @@ -30,10 +30,11 @@ import ( "vitess.io/vitess/go/sqltypes" "vitess.io/vitess/go/vt/binlog/binlogplayer" "vitess.io/vitess/go/vt/log" - binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" - querypb "vitess.io/vitess/go/vt/proto/query" "vitess.io/vitess/go/vt/sqlparser" "vitess.io/vitess/go/vt/vterrors" + + 
binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" + querypb "vitess.io/vitess/go/vt/proto/query" ) /* @@ -103,7 +104,7 @@ func (vc *vcopier) copyAll(ctx context.Context, settings binlogplayer.VRSettings var gtid string vstreamOptions := &binlogdatapb.VStreamOptions{ - ConfigOverrides: vc.vr.workflowConfig.Overrides, + ConfigOverrides: vc.vr.workflowConfig.SourceOverrides(), } serr := vc.vr.sourceVStreamer.VStreamTables(ctx, func(resp *binlogdatapb.VStreamTablesResponse) error { defer vc.vr.stats.PhaseTimings.Record("copy", time.Now()) diff --git a/go/vt/vttablet/tabletmanager/vreplication/vcopier_test.go b/go/vt/vttablet/tabletmanager/vreplication/vcopier_test.go index a2738686a4c..f6c23e1212b 100644 --- a/go/vt/vttablet/tabletmanager/vreplication/vcopier_test.go +++ b/go/vt/vttablet/tabletmanager/vreplication/vcopier_test.go @@ -18,6 +18,7 @@ package vreplication import ( "context" + "encoding/json" "fmt" "os" "regexp" @@ -36,6 +37,7 @@ import ( "vitess.io/vitess/go/sqltypes" "vitess.io/vitess/go/vt/binlog/binlogplayer" binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" + "vitess.io/vitess/go/vt/proto/vtctldata" qh "vitess.io/vitess/go/vt/vttablet/tabletmanager/vreplication/queryhistory" "vitess.io/vitess/go/vt/vttablet/tabletserver/vstreamer" ) @@ -258,6 +260,31 @@ func testVcopierTestCases(t *testing.T, test func(*testing.T), cases []vcopierTe } } +func copyTestSourceOverrides() map[string]string { + return map[string]string{ + "vstream-dynamic-packet-size": "false", + "vstream-packet-size": "1", + } +} + +func createVReplicationStateWithSourceOverrides(t *testing.T, workflow string, bls *binlogdatapb.BinlogSource, state binlogdatapb.VReplicationWorkflowState, dbName string, overrides map[string]string) string { + t.Helper() + + query := binlogplayer.CreateVReplicationState(workflow, bls, "", state, dbName, 0, 0) + if len(overrides) == 0 { + return query + } + + options, err := json.Marshal(vtctldata.WorkflowOptions{Config: overrides}) + 
require.NoError(t, err) + + emptyOptions := sqltypes.EncodeStringSQL("{}") + idx := strings.LastIndex(query, emptyOptions) + require.NotEqual(t, -1, idx) + + return query[:idx] + sqltypes.EncodeStringSQL(string(options)) + query[idx+len(emptyOptions):] +} + func TestPlayerCopyCharPK(t *testing.T) { testVcopierTestCases(t, testPlayerCopyCharPK, commonVcopierTestCases()) } @@ -328,7 +355,7 @@ func testPlayerCopyCharPK(t *testing.T) { OnDdl: binlogdatapb.OnDDLAction_IGNORE, } - query := binlogplayer.CreateVReplicationState("test", bls, "", binlogdatapb.VReplicationWorkflowState_Init, playerEngine.dbName, 0, 0) + query := createVReplicationStateWithSourceOverrides(t, "test", bls, binlogdatapb.VReplicationWorkflowState_Init, playerEngine.dbName, copyTestSourceOverrides()) qr, err := playerEngine.Exec(query) require.NoError(t, err) defer func() { @@ -431,7 +458,7 @@ func testPlayerCopyVarcharPKCaseInsensitive(t *testing.T) { OnDdl: binlogdatapb.OnDDLAction_IGNORE, } - query := binlogplayer.CreateVReplicationState("test", bls, "", binlogdatapb.VReplicationWorkflowState_Init, playerEngine.dbName, 0, 0) + query := createVReplicationStateWithSourceOverrides(t, "test", bls, binlogdatapb.VReplicationWorkflowState_Init, playerEngine.dbName, copyTestSourceOverrides()) qr, err := playerEngine.Exec(query) require.NoError(t, err) defer func() { @@ -551,7 +578,7 @@ func testPlayerCopyVarcharCompositePKCaseSensitiveCollation(t *testing.T) { OnDdl: binlogdatapb.OnDDLAction_IGNORE, } - query := binlogplayer.CreateVReplicationState("test", bls, "", binlogdatapb.VReplicationWorkflowState_Init, playerEngine.dbName, 0, 0) + query := createVReplicationStateWithSourceOverrides(t, "test", bls, binlogdatapb.VReplicationWorkflowState_Init, playerEngine.dbName, copyTestSourceOverrides()) qr, err := playerEngine.Exec(query) require.NoError(t, err) defer func() { @@ -912,7 +939,7 @@ func testPlayerCopyBigTable(t *testing.T) { OnDdl: binlogdatapb.OnDDLAction_IGNORE, } - query := 
binlogplayer.CreateVReplicationState("test", bls, "", binlogdatapb.VReplicationWorkflowState_Init, playerEngine.dbName, 0, 0) + query := createVReplicationStateWithSourceOverrides(t, "test", bls, binlogdatapb.VReplicationWorkflowState_Init, playerEngine.dbName, copyTestSourceOverrides()) qr, err := playerEngine.Exec(query) require.NoError(t, err) defer func() { @@ -1046,7 +1073,7 @@ func testPlayerCopyWildcardRule(t *testing.T) { Filter: filter, OnDdl: binlogdatapb.OnDDLAction_IGNORE, } - query := binlogplayer.CreateVReplicationState("test", bls, "", binlogdatapb.VReplicationWorkflowState_Init, playerEngine.dbName, 0, 0) + query := createVReplicationStateWithSourceOverrides(t, "test", bls, binlogdatapb.VReplicationWorkflowState_Init, playerEngine.dbName, copyTestSourceOverrides()) qr, err := playerEngine.Exec(query) require.NoError(t, err) defer func() { diff --git a/go/vt/vttablet/tabletmanager/vreplication/vdbclient.go b/go/vt/vttablet/tabletmanager/vreplication/vdbclient.go index bb05fd5897d..92cfd9b882b 100644 --- a/go/vt/vttablet/tabletmanager/vreplication/vdbclient.go +++ b/go/vt/vttablet/tabletmanager/vreplication/vdbclient.go @@ -37,14 +37,17 @@ const beginStmtLen = int64(len("begin;")) // It allows us to retry a failed transactions on lock errors. 
type vdbClient struct { binlogplayer.DBClient - stats *binlogplayer.Stats - InTransaction bool - startTime time.Time - queries []string - queriesPos int64 - batchSize int64 - maxBatchSize int64 - relayLogMaxItems int + stats *binlogplayer.Stats + vreplicationID int32 + InTransaction bool + foreignKeyChecksEnabled bool + foreignKeyChecksStateInitialized bool + startTime time.Time + queries []string + queriesPos int64 + batchSize int64 + maxBatchSize int64 + relayLogMaxItems int } func newVDBClient(dbclient binlogplayer.DBClient, stats *binlogplayer.Stats, relayLogMaxItems int) *vdbClient { @@ -55,6 +58,15 @@ func newVDBClient(dbclient binlogplayer.DBClient, stats *binlogplayer.Stats, rel } } +// newVDBClientWithID creates a vdbClient with a pre-set vreplicationID. +// Used by parallel apply workers so each worker's connection is associated +// with the correct vreplication stream for relay log batching. +func newVDBClientWithID(dbclient binlogplayer.DBClient, stats *binlogplayer.Stats, relayLogMaxItems int, vreplicationID int32) *vdbClient { + client := newVDBClient(dbclient, stats, relayLogMaxItems) + client.vreplicationID = vreplicationID + return client +} + func (vc *vdbClient) Begin() error { if vc.InTransaction { return nil @@ -78,6 +90,31 @@ func (vc *vdbClient) Begin() error { return nil } +// BeginImmediate starts a real transaction on the server even when batch mode +// is enabled. This is needed for commit paths that must execute a couple of +// statements immediately on one connection and still commit them atomically. +func (vc *vdbClient) BeginImmediate() error { + if vc.InTransaction { + return nil + } + if err := vc.DBClient.Begin(); err != nil { + return err + } + // The "begin" entry is for Retry's replay loop, which calls vc.Begin() + // when it sees "begin" in the buffer. 
BEGIN has already gone down the + // wire above, so advance queriesPos past it: any later + // ExecuteTrxQueryBatch / CommitTrxQueryBatch must not include this + // "begin" in its multi-statement, because a nested BEGIN would + // implicit-commit the current transaction and break atomicity with + // the immediate writes the caller is about to do. + vc.queries = []string{"begin"} + vc.queriesPos = 1 + vc.batchSize = 0 + vc.InTransaction = true + vc.startTime = time.Now() + return nil +} + func (vc *vdbClient) Commit() error { if err := vc.DBClient.Commit(); err != nil { return err @@ -96,7 +133,7 @@ func (vc *vdbClient) Commit() error { func (vc *vdbClient) CommitTrxQueryBatch() error { vc.queries = append(vc.queries, "commit") queries := strings.Join(vc.queries[vc.queriesPos:], ";") - for _, err := vc.ExecuteFetchMulti(queries, -1); err != nil; { + if _, err := vc.ExecuteFetchMulti(queries, -1); err != nil { return err } vc.InTransaction = false @@ -128,7 +165,8 @@ func (vc *vdbClient) ExecuteFetch(query string, maxrows int) (*sqltypes.Result, } else { vc.queries = append(vc.queries, query) } - return vc.DBClient.ExecuteFetch(query, maxrows) + qr, err := vc.DBClient.ExecuteFetch(query, maxrows) + return qr, err } // AddQueryToTrxBatch adds the query to the current transaction's query @@ -157,7 +195,8 @@ func (vc *vdbClient) AddQueryToTrxBatch(query string) error { func (vc *vdbClient) ExecuteTrxQueryBatch() ([]*sqltypes.Result, error) { defer vc.stats.Timings.Record(binlogplayer.BlplMultiQuery, time.Now()) - qrs, err := vc.ExecuteFetchMulti(strings.Join(vc.queries[vc.queriesPos:], ";"), -1) + queries := strings.Join(vc.queries[vc.queriesPos:], ";") + qrs, err := vc.ExecuteFetchMulti(queries, -1) if err != nil { return nil, err } @@ -168,6 +207,19 @@ func (vc *vdbClient) ExecuteTrxQueryBatch() ([]*sqltypes.Result, error) { return qrs, nil } +// markTrxBatchedQueriesFlushed advances the batch position past every +// query currently buffered. 
ExecuteFetch appends each query it runs to +// the trx batch buffer (so Retry can replay them), but in batch-commit +// mode that buffer is also what CommitTrxQueryBatch sends as a single +// multi-statement, which double-executes any query that was already +// run on the wire via ExecuteFetch. Callers that have already executed +// queries through ExecuteFetch mid-batch use this to keep them out of +// the upcoming CommitTrxQueryBatch replay. +func (vc *vdbClient) markTrxBatchedQueriesFlushed() { + vc.queriesPos = int64(len(vc.queries)) + vc.batchSize = 0 +} + // Execute is ExecuteFetch without the maxrows. func (vc *vdbClient) Execute(query string) (*sqltypes.Result, error) { // Number of rows should never exceed relayLogMaxItems. diff --git a/go/vt/vttablet/tabletmanager/vreplication/vplayer.go b/go/vt/vttablet/tabletmanager/vreplication/vplayer.go index f1ecad333b5..76b8459e93c 100644 --- a/go/vt/vttablet/tabletmanager/vreplication/vplayer.go +++ b/go/vt/vttablet/tabletmanager/vreplication/vplayer.go @@ -24,6 +24,8 @@ import ( "math" "strconv" "strings" + "sync" + "sync/atomic" "time" "vitess.io/vitess/go/mysql/replication" @@ -35,6 +37,7 @@ import ( "vitess.io/vitess/go/vt/vttablet/tabletserver/throttle/throttlerapp" binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" + vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" ) const failedToRecordHeartbeatMsg = "failed to record heartbeat" @@ -57,13 +60,16 @@ type vplayer struct { saveStop bool copyState map[string]*sqltypes.Result - replicatorPlan *ReplicatorPlan - tablePlans map[string]*TablePlan + replicatorPlan *ReplicatorPlan + tablePlansMu *sync.RWMutex + tablePlans map[string]*TablePlan + tablePlansVersion *atomic.Int64 // These are set when creating the VPlayer based on whether the VPlayer // is in batch (stmt and trx) execution mode or not. 
- query func(ctx context.Context, sql string) (*sqltypes.Result, error) - commit func() error + query func(ctx context.Context, sql string) (*sqltypes.Result, error) + commit func() error + dbClient *vdbClient // If the VPlayer is in batch mode, we accumulate each transaction's statements // that are then sent as a single multi-statement protocol request to the database. batchMode bool @@ -74,12 +80,16 @@ type vplayer struct { // If nothing else happens for idleTimeout since timeLastSaved, // the position of the unsavedEvent gets saved. unsavedEvent *binlogdatapb.VEvent - // timeLastSaved is set every time a GTID is saved. + // timeLastSaved tracks when the latest pending position was durably saved. + // Older saves behind a later unsavedEvent must not refresh it. timeLastSaved time.Time - // lastTimestampNs is the last timestamp seen so far. - lastTimestampNs int64 - // timeOffsetNs keeps track of the clock difference with respect to source tablet. - timeOffsetNs int64 + // lagSnapshot packs the last timestamp seen and the clock offset to the + // source tablet into a single atomic struct. Storing them together (vs + // two independent atomic.Int64 fields) prevents the parallel applier's + // throttled-path lag estimator from seeing a torn pair (new ts with + // stale offset, or vice versa) when the commitLoop's updateLag races + // with the scheduleLoop's reader. + lagSnapshot *atomic.Pointer[lagSnapshot] // numAccumulatedHeartbeats keeps track of how many heartbeats have been received since we updated the time_updated column of _vt.vreplication numAccumulatedHeartbeats int @@ -90,15 +100,64 @@ type vplayer struct { throttlerAppName string - // See updateFKCheck for more details on how the two fields below are used. + serialMu *sync.Mutex + parallelOrder *atomic.Int64 - // foreignKeyChecksEnabled is the current state of the foreign key checks for the current session. - // It reflects what we have set the @@session.foreign_key_checks session variable to. 
- foreignKeyChecksEnabled bool + // fkRefs maps child table name → FK constraints for that table. + // Used by the parallel applier to generate writeset keys that + // create conflicts between child and parent table transactions. + fkRefs map[string][]fkConstraintRef + // parentFKRefs is the reverse map: parent table name → FK constraints + // that reference it. Used to generate parent-side writeset keys that + // match child FK keys, ensuring correct conflict detection even when + // FKs reference non-PK unique keys. + parentFKRefs map[string][]parentFKRef + // postDDLDroppedTables records dropped table names from executed DDLs so the + // parallel scheduler can clear post-DDL barriers without mutating tablePlans. + postDDLDroppedTables map[string]struct{} + // postDDLStalePlans records the still-stale table plans left behind by the + // most recently executed EXEC* DDLs. scheduleLoop snapshots this under + // serialMu so commitLoop can publish real runtime DDL effects without + // racing the scheduler. + postDDLStalePlans map[string]postDDLStalePlan + // postDDLConservative keeps unknown DDL barriers fail-closed until every + // currently tracked plan refreshes. + postDDLConservative bool + // pendingFieldRefreshTables tracks tables whose FIELD refresh was scheduled + // but has not committed yet, so later row transactions do not hash against a + // still-cold table-plan cache. + pendingFieldRefreshTables map[string]int - // foreignKeyChecksStateInitialized is set to true once we have initialized the foreignKeyChecksEnabled. - // The initialization is done on the first row event that this vplayer sees. - foreignKeyChecksStateInitialized bool + // idStr is the decimal string form of vr.id, cached to avoid repeated + // conversions on every lag gauge update. + idStr string +} + +// lagSnapshot pairs the most-recent source-side timestamp seen by the +// applier with the corresponding clock offset to the source. 
It is stored +// behind an atomic.Pointer so readers always see a consistent (ts, offset) +// pair instead of a torn mix from two concurrent writers. +type lagSnapshot struct { + timestampNs int64 + offsetNs int64 +} + +// loadLagSnapshot returns the latest snapshot, or a zero-value snapshot if +// nothing has been stored yet. Callers can compare timestampNs against zero +// to detect "no data yet". +func (vp *vplayer) loadLagSnapshot() lagSnapshot { + snap := vp.lagSnapshot.Load() + if snap == nil { + return lagSnapshot{} + } + return *snap +} + +// storeLagSnapshot atomically replaces the lag snapshot with a new (ts, offset) +// pair. A reader's loadLagSnapshot will either see the entire previous +// snapshot or the entire new one — never a mix. +func (vp *vplayer) storeLagSnapshot(timestampNs, offsetNs int64) { + vp.lagSnapshot.Store(&lagSnapshot{timestampNs: timestampNs, offsetNs: offsetNs}) } // NoForeignKeyCheckFlagBitmask is the bitmask for the 2nd bit (least significant) of the flags in a binlog row event. 
@@ -151,22 +210,41 @@ func newVPlayer(vr *vreplicator, settings binlogplayer.VRSettings, copyState map } return &vplayer{ - vr: vr, - startPos: settings.StartPos, - pos: settings.StartPos, - stopPos: settings.StopPos, - saveStop: saveStop, - copyState: copyState, - timeLastSaved: time.Now(), - tablePlans: make(map[string]*TablePlan), - phase: phase, - throttlerAppName: throttlerapp.VPlayerName.ConcatenateString(vr.throttlerAppName()), - query: queryFunc, - commit: commitFunc, - batchMode: batchMode, + vr: vr, + startPos: settings.StartPos, + pos: settings.StartPos, + stopPos: settings.StopPos, + saveStop: saveStop, + copyState: copyState, + timeLastSaved: time.Now(), + lagSnapshot: &atomic.Pointer[lagSnapshot]{}, + tablePlansMu: &sync.RWMutex{}, + tablePlans: make(map[string]*TablePlan), + tablePlansVersion: &atomic.Int64{}, + serialMu: &sync.Mutex{}, + parallelOrder: &atomic.Int64{}, + phase: phase, + throttlerAppName: throttlerapp.VPlayerName.ConcatenateString(vr.throttlerAppName()), + pendingFieldRefreshTables: make(map[string]int), + query: queryFunc, + commit: commitFunc, + batchMode: batchMode, + dbClient: vr.dbClient, + idStr: strconv.Itoa(int(vr.id)), } } +// activeDBClient returns the vplayer's current DB connection. In the parallel +// applier, workers swap vp.dbClient to their own connection before applying +// events, so this returns whichever connection is currently active. Falls back +// to vr.dbClient (the main connection) when vp.dbClient is nil. +func (vp *vplayer) activeDBClient() *vdbClient { + if vp.dbClient != nil { + return vp.dbClient + } + return vp.vr.dbClient +} + // play is the entry point for playing binlogs. func (vp *vplayer) play(ctx context.Context) error { if !vp.stopPos.IsZero() && vp.startPos.AtLeast(vp.stopPos) { @@ -197,7 +275,7 @@ func (vp *vplayer) play(ctx context.Context) error { } // updateFKCheck updates the @@session.foreign_key_checks variable based on the binlog row event flags. 
-// The function only does it if it has changed to avoid redundant updates, using the cached vplayer.foreignKeyChecksEnabled +// The function only does it if it has changed to avoid redundant updates, using the cached state on the active db session. // The foreign_key_checks value for a transaction is determined by the 2nd bit (least significant) of the flags: // - If set (1), foreign key checks are disabled. // - If unset (0), foreign key checks are enabled. @@ -208,7 +286,7 @@ func (vp *vplayer) updateFKCheck(ctx context.Context, flags2 uint32) error { // If this is an atomic copy, we must update the foreign_key_checks state even when the vplayer runs during // the copy phase, i.e., for catchup and fastforward. mustUpdate = true - } else if vp.vr.state == binlogdatapb.VReplicationWorkflowState_Running { + } else if vp.vr.getState() == binlogdatapb.VReplicationWorkflowState_Running { // If the vreplication workflow is in Running state, we must update the foreign_key_checks // state for all workflow types. 
mustUpdate = true @@ -218,18 +296,19 @@ func (vp *vplayer) updateFKCheck(ctx context.Context, flags2 uint32) error { } dbForeignKeyChecksEnabled := flags2&NoForeignKeyCheckFlagBitmask != NoForeignKeyCheckFlagBitmask - if vp.foreignKeyChecksStateInitialized /* already set earlier */ && - dbForeignKeyChecksEnabled == vp.foreignKeyChecksEnabled /* no change in the state, no need to update */ { + activeClient := vp.activeDBClient() + if activeClient.foreignKeyChecksStateInitialized /* already set earlier */ && + dbForeignKeyChecksEnabled == activeClient.foreignKeyChecksEnabled /* no change in the state, no need to update */ { return nil } log.Info("Setting this session's foreign_key_checks to " + strconv.FormatBool(dbForeignKeyChecksEnabled)) if _, err := vp.query(ctx, "set @@session.foreign_key_checks="+strconv.FormatBool(dbForeignKeyChecksEnabled)); err != nil { return fmt.Errorf("failed to set session foreign_key_checks: %w", err) } - vp.foreignKeyChecksEnabled = dbForeignKeyChecksEnabled - if !vp.foreignKeyChecksStateInitialized { + activeClient.foreignKeyChecksEnabled = dbForeignKeyChecksEnabled + if !activeClient.foreignKeyChecksStateInitialized { log.Info("First foreign_key_checks update to: " + strconv.FormatBool(dbForeignKeyChecksEnabled)) - vp.foreignKeyChecksStateInitialized = true + activeClient.foreignKeyChecksStateInitialized = true } return nil } @@ -255,16 +334,21 @@ func (vp *vplayer) fetchAndApply(ctx context.Context) (err error) { streamErr := make(chan error, 1) go func() { vstreamOptions := &binlogdatapb.VStreamOptions{ - ConfigOverrides: vp.vr.workflowConfig.Overrides, + ConfigOverrides: vp.vr.workflowConfig.SourceOverrides(), } - streamErr <- vp.vr.sourceVStreamer.VStream(ctx, replication.EncodePosition(vp.startPos), nil, + err := vp.vr.sourceVStreamer.VStream(ctx, replication.EncodePosition(vp.startPos), nil, vp.replicatorPlan.VStreamFilter, func(events []*binlogdatapb.VEvent) error { return relay.Send(events) }, vstreamOptions) + streamErr <- 
err }() applyErr := make(chan error, 1) go func() { + if vp.vr.workflowConfig.ParallelReplicationWorkers > 1 && len(vp.copyState) == 0 { + applyErr <- vp.applyEventsParallel(ctx, relay) + return + } applyErr <- vp.applyEvents(ctx, relay) }() @@ -296,6 +380,13 @@ func (vp *vplayer) fetchAndApply(ctx context.Context) (err error) { return nil default: } + // If the vstream received a gRPC CANCELED error, it means the + // context was canceled but the Go context hasn't propagated yet. + // Treat this the same as ctx.Done() — return nil to avoid a + // spurious retry. + if vterrors.Code(err) == vtrpcpb.Code_CANCELED && ctx.Err() != nil { + return nil + } // If the stream ends normally we have to return an error indicating // that the controller has to retry a different vttablet. if err == nil || err == io.EOF { @@ -325,7 +416,9 @@ func (vp *vplayer) applyRowEvent(ctx context.Context, rowEvent *binlogdatapb.Row if err := vp.updateFKCheck(ctx, rowEvent.Flags); err != nil { return err } + vp.tablePlansMu.RLock() tplan := vp.tablePlans[rowEvent.TableName] + vp.tablePlansMu.RUnlock() if tplan == nil { return fmt.Errorf("unexpected event on table %s", rowEvent.TableName) } @@ -346,14 +439,14 @@ func (vp *vplayer) applyRowEvent(ctx context.Context, rowEvent *binlogdatapb.Row // then we can perform a simple bulk DELETE using an IN clause. if (rowEvent.RowChanges[0].Before != nil && rowEvent.RowChanges[0].After == nil) && tplan.MultiDelete != nil { - _, err := tplan.applyBulkDeleteChanges(rowEvent.RowChanges, applyFunc, vp.vr.dbClient.maxBatchSize) + _, err := tplan.applyBulkDeleteChanges(rowEvent.RowChanges, applyFunc, vp.activeDBClient().maxBatchSize) return err } // If we're done with the copy phase then we will be replicating all INSERTS // regardless of the PK value and can use a single INSERT statment with // multiple VALUES clauses. 
if len(vp.copyState) == 0 && (rowEvent.RowChanges[0].Before == nil && rowEvent.RowChanges[0].After != nil) { - _, err := tplan.applyBulkInsertChanges(rowEvent.RowChanges, applyFunc, vp.vr.dbClient.maxBatchSize) + _, err := tplan.applyBulkInsertChanges(rowEvent.RowChanges, applyFunc, vp.activeDBClient().maxBatchSize) return err } } @@ -368,22 +461,79 @@ func (vp *vplayer) applyRowEvent(ctx context.Context, rowEvent *binlogdatapb.Row } // updatePos should get called at a minimum of vreplicationMinimumHeartbeatUpdateInterval. -func (vp *vplayer) updatePos(ctx context.Context, ts int64) (posReached bool, err error) { - update := binlogplayer.GenerateUpdatePos(vp.vr.id, vp.pos, time.Now().Unix(), ts, vp.vr.stats.CopyRowCount.Get(), vp.vr.workflowConfig.StoreCompressedGTID) - if _, err := vp.query(ctx, update); err != nil { +func (vp *vplayer) generateUpdatePosQuery(pos replication.Position, ts int64) string { + return binlogplayer.GenerateUpdatePos(vp.vr.id, pos, time.Now().Unix(), ts, vp.vr.stats.CopyRowCount.Get(), vp.vr.workflowConfig.StoreCompressedGTID) +} + +// updatePosWithoutStop writes the position update through the supplied +// query function without applying the stop-position state transition. +// The parallel commitLoop uses this because the position update, +// COMMIT, and workflow state update must all run on the worker's +// connection — activeDBClient() would pick the wrong one here. 
+func (vp *vplayer) updatePosWithoutStop(ctx context.Context, pos replication.Position, ts int64, query func(context.Context, string) (*sqltypes.Result, error)) (posReached bool, err error) { + if _, err := query(ctx, vp.generateUpdatePosQuery(pos, ts)); err != nil { return false, fmt.Errorf("error %v updating position", err) } + return !vp.stopPos.IsZero() && pos.AtLeast(vp.stopPos), nil +} + +// recordPositionSave updates the in-memory bookkeeping that follows a +// successful position write (clear unsaved-event state, refresh the +// idle-flush timer, advance the lag gauge). Split out of updatePos so +// the parallel commitLoop can record the save after committing the +// worker's transaction instead of during apply. +func (vp *vplayer) recordPositionSave(pos replication.Position, clearUnsavedEvent bool) { vp.numAccumulatedHeartbeats = 0 - vp.unsavedEvent = nil - vp.timeLastSaved = time.Now() - vp.vr.stats.SetLastPosition(vp.pos) - posReached = !vp.stopPos.IsZero() && vp.pos.AtLeast(vp.stopPos) + refreshIdleTimer := clearUnsavedEvent || vp.unsavedEvent == nil || !vp.pos.AtLeast(pos) || vp.pos.Equal(pos) + if clearUnsavedEvent { + vp.unsavedEvent = nil + } + if refreshIdleTimer { + vp.timeLastSaved = time.Now() + } + vp.vr.stats.SetLastPosition(pos) +} + +// setStopPositionState marks the workflow as Stopped using the given +// dbClient's batch mode (if any). Used from the serial applier path +// where the stop-state write can ride along with the rest of the +// batched flush. +func (vp *vplayer) setStopPositionState(dbClient *vdbClient) error { + log.Info(fmt.Sprintf("Stopped at position: %v", vp.stopPos)) + if !vp.saveStop { + return nil + } + return vp.vr.setStateWithDBClient(dbClient, binlogdatapb.VReplicationWorkflowState_Stopped, fmt.Sprintf("Stopped at position %v", vp.stopPos)) +} + +// setStopPositionStateImmediate marks the workflow as Stopped using a +// direct (non-batched) write. 
The parallel commitLoop uses this after +// the worker has flushed its batch and is about to COMMIT, so the +// state row update has to stay inside the same transaction rather than +// deferring to a later batch flush. +func (vp *vplayer) setStopPositionStateImmediate(dbClient *vdbClient) error { + log.Info(fmt.Sprintf("Stopped at position: %v", vp.stopPos)) + if !vp.saveStop { + return nil + } + return vp.vr.setStateWithDBClientImmediate(dbClient, binlogdatapb.VReplicationWorkflowState_Stopped, fmt.Sprintf("Stopped at position %v", vp.stopPos)) +} + +// updatePos persists the current position, records the save, and — +// if the stop position has been reached — transitions the workflow to +// Stopped on the active DB client. The serial applier uses this +// end-to-end; the parallel flow calls the constituent helpers +// (updatePosWithoutStop, recordPositionSave, +// setStopPositionStateImmediate) on the worker connection instead. +func (vp *vplayer) updatePos(ctx context.Context, ts int64) (posReached bool, err error) { + posReached, err = vp.updatePosWithoutStop(ctx, vp.pos, ts, vp.query) + if err != nil { + return false, err + } + vp.recordPositionSave(vp.pos, true) if posReached { - log.Info(fmt.Sprintf("Stopped at position: %v", vp.stopPos)) - if vp.saveStop { - if err := vp.vr.setState(binlogdatapb.VReplicationWorkflowState_Stopped, fmt.Sprintf("Stopped at position %v", vp.stopPos)); err != nil { - return false, err - } + if err := vp.setStopPositionState(vp.activeDBClient()); err != nil { + return false, err } } return posReached, nil @@ -464,17 +614,18 @@ func (vp *vplayer) applyEvents(ctx context.Context, relay *relayLog) error { defer vp.vr.dbClient.Rollback() estimateLag := func() { - behind := time.Now().UnixNano() - vp.lastTimestampNs - vp.timeOffsetNs + snap := vp.loadLagSnapshot() + behind := time.Now().UnixNano() - snap.timestampNs - snap.offsetNs behindSecs := behind / 1e9 vp.vr.stats.ReplicationLagSeconds.Store(behindSecs) - 
vp.vr.stats.VReplicationLagGauges.Set(strconv.Itoa(int(vp.vr.id)), behindSecs) + vp.vr.stats.VReplicationLagGauges.Set(vp.idStr, behindSecs) } // If we're not running, set ReplicationLagSeconds to be very high. // TODO(sougou): if we also stored the time of the last event, we // can estimate this value more accurately. defer vp.vr.stats.ReplicationLagSeconds.Store(math.MaxInt64) - defer vp.vr.stats.VReplicationLagGauges.Set(strconv.Itoa(int(vp.vr.id)), math.MaxInt64) + defer vp.vr.stats.VReplicationLagGauges.Set(vp.idStr, math.MaxInt64) var lag int64 for { if ctx.Err() != nil { @@ -561,10 +712,11 @@ func (vp *vplayer) applyEvents(ctx context.Context, relay *relayLog) error { // determine the actual lag, as the vstreamer is fully throttled, and we // will estimate it after processing the batch. if event.Type != binlogdatapb.VEventType_HEARTBEAT || !event.Throttled { - vp.lastTimestampNs = event.Timestamp * 1e9 + tsNs := event.Timestamp * 1e9 now := time.Now().UnixNano() - vp.timeOffsetNs = now - event.CurrentTime - lag = now - vp.lastTimestampNs - vp.timeOffsetNs + offset := now - event.CurrentTime + vp.storeLagSnapshot(tsNs, offset) + lag = now - tsNs - offset } } } @@ -573,7 +725,7 @@ func (vp *vplayer) applyEvents(ctx context.Context, relay *relayLog) error { if lag >= 0 { lagSecs := lag / 1e9 vp.vr.stats.ReplicationLagSeconds.Store(lagSecs) - vp.vr.stats.VReplicationLagGauges.Set(strconv.Itoa(int(vp.vr.id)), lagSecs) + vp.vr.stats.VReplicationLagGauges.Set(vp.idStr, lagSecs) } else { // We couldn't determine the lag, so we need to estimate it estimateLag() } @@ -647,12 +799,12 @@ func (vp *vplayer) applyEvent(ctx context.Context, event *binlogdatapb.VEvent, m // No-op: begin is called as needed. 
case binlogdatapb.VEventType_COMMIT: if mustSave { - if err := vp.vr.dbClient.Begin(); err != nil { + if err := vp.activeDBClient().Begin(); err != nil { return err } } - if !vp.vr.dbClient.InTransaction { + if !vp.activeDBClient().InTransaction { // We're skipping an empty transaction. We may have to save the position on inactivity. vp.unsavedEvent = event return nil @@ -668,14 +820,50 @@ func (vp *vplayer) applyEvent(ctx context.Context, event *binlogdatapb.VEvent, m return io.EOF } case binlogdatapb.VEventType_FIELD: - if err := vp.vr.dbClient.Begin(); err != nil { + if err := vp.activeDBClient().Begin(); err != nil { return err } tplan, err := vp.replicatorPlan.buildExecutionPlan(event.FieldEvent) if err != nil { return err } - vp.tablePlans[event.FieldEvent.TableName] = tplan + // HasExtraUniqueSecondary only matters to the parallel applier's + // writeset scheduling, which runs only in the replication phase + // (fetchAndApply requires len(copyState) == 0). During copy-phase + // catchup/fastforward this vplayer is serial and its table plans + // die with it, so the schema lookup would be a wasted mysqld + // round-trip and a needless failure mode. 
+ if vp.vr.workflowConfig.ParallelReplicationWorkers > 1 && len(vp.copyState) == 0 { + vp.tablePlansMu.RLock() + cachedPlan := vp.tablePlans[event.FieldEvent.TableName] + vp.tablePlansMu.RUnlock() + vp.serialMu.Lock() + staleEntry, hasStaleEntry := vp.postDDLStalePlans[event.FieldEvent.TableName] + cacheInvalidatedByRefreshTarget := !hasStaleEntry && postDDLRefreshTargetMatchesCachedPlan(vp.postDDLStalePlans, event.FieldEvent.TableName, cachedPlan) + vp.serialMu.Unlock() + cacheInvalidatedByDDL := (hasStaleEntry && staleEntry.stalePlan == cachedPlan) || cacheInvalidatedByRefreshTarget + if cachedPlan != nil && cachedPlan.TargetName == tplan.TargetName && !cacheInvalidatedByDDL { + tplan.HasExtraUniqueSecondary = cachedPlan.HasExtraUniqueSecondary + tplan.UniqueKeyColumns = cachedPlan.UniqueKeyColumns + } else { + uniqueKeys, mustSerialize, err := vp.vr.writesetUniqueKeys(ctx, tplan.TargetName, tplan) + if err != nil { + return err + } + tplan.UniqueKeyColumns = uniqueKeys + tplan.HasExtraUniqueSecondary = mustSerialize + } + } + fieldTableName := event.FieldEvent.TableName + vp.tablePlansMu.Lock() + vp.tablePlans[fieldTableName] = tplan + vp.tablePlansVersion.Add(1) + vp.tablePlansMu.Unlock() + vp.serialMu.Lock() + // FIELD means this table name is live again, so later DDL barriers must + // treat it as tracked instead of as a previously dropped name. 
+ delete(vp.postDDLDroppedTables, canonicalPostDDLTableKey(vp.postDDLDroppedTables, fieldTableName)) + vp.serialMu.Unlock() if stats != nil { stats.Send(fmt.Sprintf("%v", event.FieldEvent)) } @@ -690,7 +878,7 @@ func (vp *vplayer) applyEvent(ctx context.Context, event *binlogdatapb.VEvent, m // If the event is for one of the AWS RDS "special" or pt-table-checksum tables, we skip if !strings.Contains(sql, " mysql.rds_") && !strings.Contains(sql, " percona.checksums") { // This is a player using statement based replication - if err := vp.vr.dbClient.Begin(); err != nil { + if err := vp.activeDBClient().Begin(); err != nil { return err } if err := vp.applyStmtEvent(ctx, event); err != nil { @@ -702,7 +890,7 @@ func (vp *vplayer) applyEvent(ctx context.Context, event *binlogdatapb.VEvent, m } case binlogdatapb.VEventType_ROW: // This player is configured for row based replication - if err := vp.vr.dbClient.Begin(); err != nil { + if err := vp.activeDBClient().Begin(); err != nil { return err } if err := vp.applyRowEvent(ctx, event.RowEvent); err != nil { @@ -715,7 +903,7 @@ func (vp *vplayer) applyEvent(ctx context.Context, event *binlogdatapb.VEvent, m stats.Send(fmt.Sprintf("%v", event.RowEvent)) } case binlogdatapb.VEventType_OTHER: - if vp.vr.dbClient.InTransaction { + if vp.activeDBClient().InTransaction { // Unreachable log.Error(fmt.Sprintf("internal error: vplayer is in a transaction on event: %v", event)) return fmt.Errorf("internal error: vplayer is in a transaction on event: %v", event) @@ -729,73 +917,20 @@ func (vp *vplayer) applyEvent(ctx context.Context, event *binlogdatapb.VEvent, m return io.EOF } case binlogdatapb.VEventType_DDL: - if vp.vr.dbClient.InTransaction { + if vp.activeDBClient().InTransaction { // Unreachable log.Error(fmt.Sprintf("internal error: vplayer is in a transaction on event: %v", event)) return fmt.Errorf("internal error: vplayer is in a transaction on event: %v", event) } - 
vp.vr.stats.DDLEventActions.Add(vp.vr.source.OnDdl.String(), 1) // Record the DDL handling - switch vp.vr.source.OnDdl { - case binlogdatapb.OnDDLAction_IGNORE: - // We still have to update the position. - posReached, err := vp.updatePos(ctx, event.Timestamp) - if err != nil { - return err - } - if posReached { - return io.EOF - } - case binlogdatapb.OnDDLAction_STOP: - if err := vp.vr.dbClient.Begin(); err != nil { - return err - } - if _, err := vp.updatePos(ctx, event.Timestamp); err != nil { - return err - } - if err := vp.vr.setState(binlogdatapb.VReplicationWorkflowState_Stopped, "Stopped at DDL "+event.Statement); err != nil { - return err - } - if err := vp.commit(); err != nil { - return err - } - return io.EOF - case binlogdatapb.OnDDLAction_EXEC: - // It's impossible to save the position transactionally with the statement. - // So, we apply the DDL first, and then save the position. - // Manual intervention may be needed if there is a partial - // failure here. - if _, err := vp.query(ctx, event.Statement); err != nil { - return err - } - if stats != nil { - stats.Send(event.Statement) - } - posReached, err := vp.updatePos(ctx, event.Timestamp) - if err != nil { - return err - } - if posReached { - return io.EOF - } - case binlogdatapb.OnDDLAction_EXEC_IGNORE: - if _, err := vp.query(ctx, event.Statement); err != nil { - log.Info(fmt.Sprintf("Ignoring error: %v for DDL: %s", err, event.Statement)) - } - if stats != nil { - stats.Send(event.Statement) - } - posReached, err := vp.updatePos(ctx, event.Timestamp) - if err != nil { - return err - } - if posReached { - return io.EOF - } - } + _, err := vp.applyDDLEvent(ctx, event, stats) + return err case binlogdatapb.VEventType_ROWS_QUERY: // The original SQL query is informational only; VReplication applies row changes directly. + case binlogdatapb.VEventType_VERSION: + // VERSION only tells downstream consumers that schema_version changed. + // vplayer does not apply any data for it. 
case binlogdatapb.VEventType_JOURNAL: - if vp.vr.dbClient.InTransaction { + if vp.activeDBClient().InTransaction { // Unreachable log.Error(fmt.Sprintf("internal error: vplayer is in a transaction on event: %v", event)) return fmt.Errorf("internal error: vplayer is in a transaction on event: %v", event) @@ -832,6 +967,19 @@ func (vp *vplayer) applyEvent(ctx context.Context, event *binlogdatapb.VEvent, m } // All were found. We must register journal. } + // We must NOT persist the position past the journal event here. + // registerJournal returns nil as soon as THIS participant has + // registered, even when other participants of the journal have not + // joined yet, and the engine's journaler state is in-memory only. + // The position is only safe to advance once transitionJournal has + // durably rewritten the participating streams. If we saved the + // position now and the tablet restarted before all participants + // joined, this stream would resume past the journal, never + // re-register, and the workflow would hang forever waiting for a + // transition that can no longer happen. Keeping the saved position + // before the journal means a restart re-delivers the journal event, + // and registerJournal is idempotent (per-key lookup, + // existing-participant guard), so re-registering is safe. 
log.Info(fmt.Sprintf("Binlog event registering journal event %+v", event.Journal)) if err := vp.vr.vre.registerJournal(event.Journal, vp.vr.id); err != nil { if err := vp.vr.setState(binlogdatapb.VReplicationWorkflowState_Stopped, err.Error()); err != nil { @@ -849,13 +997,83 @@ func (vp *vplayer) applyEvent(ctx context.Context, event *binlogdatapb.VEvent, m return err } } - if !vp.vr.dbClient.InTransaction { + if !vp.activeDBClient().InTransaction { vp.numAccumulatedHeartbeats++ if err := vp.recordHeartbeat(); err != nil { return err } } + default: + return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "unsupported vevent type: %v", event.Type) } return nil } + +// applyDDLEvent executes the DDL handling policy and reports whether the target +// schema was actually changed, so commitLoop can publish only real EXEC* side effects. +func (vp *vplayer) applyDDLEvent(ctx context.Context, event *binlogdatapb.VEvent, stats *VrLogStats) (bool, error) { + vp.vr.stats.DDLEventActions.Add(vp.vr.source.OnDdl.String(), 1) + sendStats := func() { + if stats != nil { + stats.Send(event.Statement) + } + } + switch vp.vr.source.OnDdl { + case binlogdatapb.OnDDLAction_IGNORE: + posReached, err := vp.updatePos(ctx, event.Timestamp) + if err != nil { + return false, err + } + if posReached { + return false, io.EOF + } + return false, nil + case binlogdatapb.OnDDLAction_STOP: + if err := vp.activeDBClient().Begin(); err != nil { + return false, err + } + if _, err := vp.updatePos(ctx, event.Timestamp); err != nil { + return false, err + } + if err := vp.vr.setState(binlogdatapb.VReplicationWorkflowState_Stopped, "Stopped at DDL "+event.Statement); err != nil { + return false, err + } + if err := vp.commit(); err != nil { + return false, err + } + return false, io.EOF + case binlogdatapb.OnDDLAction_EXEC: + // DDL and position save cannot be committed atomically, so we only + // publish the post-DDL barrier after the statement itself succeeds. 
+ if _, err := vp.query(ctx, event.Statement); err != nil { + return false, err + } + sendStats() + posReached, err := vp.updatePos(ctx, event.Timestamp) + if err != nil { + return false, err + } + if posReached { + return true, io.EOF + } + return true, nil + case binlogdatapb.OnDDLAction_EXEC_IGNORE: + executed := true + if _, err := vp.query(ctx, event.Statement); err != nil { + executed = false + log.Info(fmt.Sprintf("Ignoring error: %v for DDL: %s", err, event.Statement)) + } + sendStats() + posReached, err := vp.updatePos(ctx, event.Timestamp) + if err != nil { + return executed, err + } + if posReached { + return executed, io.EOF + } + return executed, nil + default: + return false, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "unsupported ddl action: %v", vp.vr.source.OnDdl) + } +} diff --git a/go/vt/vttablet/tabletmanager/vreplication/vplayer_flaky_test.go b/go/vt/vttablet/tabletmanager/vreplication/vplayer_flaky_test.go index 02ae1c5b9af..869bf7f857e 100644 --- a/go/vt/vttablet/tabletmanager/vreplication/vplayer_flaky_test.go +++ b/go/vt/vttablet/tabletmanager/vreplication/vplayer_flaky_test.go @@ -39,10 +39,12 @@ import ( "vitess.io/vitess/go/vt/binlog/binlogplayer" "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/logutil" + "vitess.io/vitess/go/vt/vterrors" vttablet "vitess.io/vitess/go/vt/vttablet/common" "vitess.io/vitess/go/vt/vttablet/tabletserver/vstreamer/testenv" binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" + vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" qh "vitess.io/vitess/go/vt/vttablet/tabletmanager/vreplication/queryhistory" ) @@ -218,7 +220,7 @@ func TestHeartbeatFrequencyFlag(t *testing.T) { stats := binlogplayer.NewStats() defer stats.Stop() - vp := &vplayer{vr: &vreplicator{ + vp := &vplayer{tablePlansMu: &sync.RWMutex{}, serialMu: &sync.Mutex{}, vr: &vreplicator{ dbClient: newVDBClient(realDBClientFactory(), stats, vttablet.DefaultVReplicationConfig.RelayLogMaxItems), stats: stats, workflowConfig: 
vttablet.DefaultVReplicationConfig, @@ -2941,6 +2943,50 @@ func TestTimestamp(t *testing.T) { expectData(t, "t1", [][]string{{"1", want, want}}) } +func TestVPlayerDoesNotTreatRemoteCanceledStreamAsLocalShutdown(t *testing.T) { + tablet := addTablet(100) + defer deleteTablet(tablet) + + filter := &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: "/.*"}}} + bls := &binlogdatapb.BinlogSource{Keyspace: env.KeyspaceName, Shard: env.ShardName, Filter: filter} + stats := binlogplayer.NewStats() + defer stats.Stop() + dbClient := playerEngine.dbClientFactoryFiltered() + err := dbClient.Connect() + require.NoError(t, err) + defer dbClient.Close() + + _, err = dbClient.ExecuteFetch(fmt.Sprintf("insert into _vt.vreplication (id, workflow, source, pos, max_tps, max_replication_lag, time_updated, transaction_timestamp, state, db_name, options) values (1, 'test', '', '', 99999, 99999, 0, 0, 'Stopped', '%s', '{}') on duplicate key update workflow='test', source='', pos='', max_tps=99999, max_replication_lag=99999, time_updated=0, transaction_timestamp=0, state='Stopped', db_name='%s'", dbClient.DBName(), dbClient.DBName()), 1) + require.NoError(t, err) + drainDBQueries() + defer func() { + _, err := dbClient.ExecuteFetch("delete from _vt.vreplication where id = 1", 1) + require.NoError(t, err) + drainDBQueries() + }() + + oldErrors := vstreamErrorsByTablet + defer func() { vstreamErrorsByTablet = oldErrors }() + vstreamErrorsByTablet = map[uint32]error{ + tablet.Alias.Uid: vterrors.New(vtrpcpb.Code_CANCELED, "remote canceled"), + } + + vsClient := newTabletConnector(tablet) + require.NoError(t, vsClient.Open(t.Context())) + defer func() { _ = vsClient.Close(t.Context()) }() + + vr := newVReplicator(1, bls, vsClient, stats, dbClient, env.Mysqld, playerEngine, vttablet.DefaultVReplicationConfig) + settings, _, err := vr.loadSettings(t.Context(), newVDBClient(dbClient, stats, vttablet.DefaultVReplicationConfig.RelayLogMaxItems)) + require.NoError(t, err) + + vp := 
newVPlayer(vr, settings, nil, replication.Position{}, "replicate") + vp.replicatorPlan = &ReplicatorPlan{VStreamFilter: filter} + + err = vp.fetchAndApply(t.Context()) + require.Error(t, err) + require.Equal(t, vtrpcpb.Code_CANCELED, vterrors.Code(err)) +} + // TestPlayerJSONDocs validates more complex and 'large' json docs. It only validates that the data on target matches that on source. // TestPlayerTypes, above, also verifies the sql queries applied on the target. func TestPlayerJSONDocs(t *testing.T) { diff --git a/go/vt/vttablet/tabletmanager/vreplication/vreplicator.go b/go/vt/vttablet/tabletmanager/vreplication/vreplicator.go index 47686336fb2..b34a14e8298 100644 --- a/go/vt/vttablet/tabletmanager/vreplication/vreplicator.go +++ b/go/vt/vttablet/tabletmanager/vreplication/vreplicator.go @@ -25,6 +25,7 @@ import ( "sort" "strconv" "strings" + "sync/atomic" "time" "vitess.io/vitess/go/mysql/capabilities" @@ -109,8 +110,22 @@ type vreplicator struct { // source source *binlogdatapb.BinlogSource sourceVStreamer VStreamerClient - state binlogdatapb.VReplicationWorkflowState - stats *binlogplayer.Stats + // state is the workflow state as last written by setState*. It is read + // by worker goroutines (updateFKCheck) and the controller while the + // parallel commitLoop may be writing it, so access goes through + // getState/storeState. + state atomic.Int32 // binlogdatapb.VReplicationWorkflowState + // inCopyPhase reports whether the workflow still has tables to copy + // (_vt.copy_state is non-empty). It is refreshed from the durable row + // on every loadSettings call, so — unlike state, which is only updated + // by setState calls — it is truthful immediately after a tablet + // restart. That matters for AtomicCopy: its copy path (copyAll) never + // calls setState(Copying) — only initTablesForCopy does, on first + // start — so after a restart the in-memory state stays at zero for the + // whole remaining copy. 
The controller's AtomicCopy terminal-error + // guard reads this from another goroutine, hence atomic. + inCopyPhase atomic.Bool + stats *binlogplayer.Stats // mysqld is used to fetch the local schema. mysqld mysqlctl.MysqlDaemon colInfoMap map[string][]*ColumnInfo @@ -165,7 +180,7 @@ func newVReplicator(id int32, source *binlogdatapb.BinlogSource, sourceVStreamer source: source, sourceVStreamer: sourceVStreamer, stats: stats, - dbClient: newVDBClient(dbClient, stats, workflowConfig.RelayLogMaxItems), + dbClient: newVDBClientWithID(dbClient, stats, workflowConfig.RelayLogMaxItems, id), mysqld: mysqld, workflowConfig: workflowConfig, } @@ -314,7 +329,7 @@ func (vr *vreplicator) replicate(ctx context.Context) error { return err } } else { - if vr.state != binlogdatapb.VReplicationWorkflowState_Copying { + if vr.getState() != binlogdatapb.VReplicationWorkflowState_Copying { if err := vr.setState(binlogdatapb.VReplicationWorkflowState_Copying, ""); err != nil { vr.stats.ErrorCounts.Add([]string{"Copy"}, 1) return err @@ -496,6 +511,7 @@ func (vr *vreplicator) loadSettings(ctx context.Context, dbClient *vdbClient) (s vr.WorkflowType = int32(settings.WorkflowType) vr.WorkflowSubType = int32(settings.WorkflowSubType) vr.WorkflowName = settings.WorkflowName + vr.inCopyPhase.Store(numTablesToCopy != 0) } return settings, numTablesToCopy, err } @@ -533,7 +549,7 @@ func (vr *vreplicator) setMessage(message string) (err error) { if _, err := vr.dbClient.Execute(query); err != nil { return fmt.Errorf("could not set message: %v: %v", query, err) } - insertLog(vr.dbClient, LogMessage, vr.id, vr.state.String(), message) + insertLog(vr.dbClient, LogMessage, vr.id, vr.getState().String(), message) return nil } @@ -555,10 +571,42 @@ func (vr *vreplicator) maxQuerySize(dbc *vdbClient) int64 { } func (vr *vreplicator) insertLog(typ, message string) { - insertLog(vr.dbClient, typ, vr.id, vr.state.String(), message) + insertLog(vr.dbClient, typ, vr.id, vr.getState().String(), message) +} 
+ +// isInCopyPhase reports whether the workflow had tables left to copy as of +// the last loadSettings call. +func (vr *vreplicator) isInCopyPhase() bool { + return vr.inCopyPhase.Load() +} + +// getState returns the workflow state as last recorded by setState*. +func (vr *vreplicator) getState() binlogdatapb.VReplicationWorkflowState { + return binlogdatapb.VReplicationWorkflowState(vr.state.Load()) +} + +// storeState records the workflow state. Use setState* to also persist it. +func (vr *vreplicator) storeState(state binlogdatapb.VReplicationWorkflowState) { + vr.state.Store(int32(state)) } func (vr *vreplicator) setState(state binlogdatapb.VReplicationWorkflowState, message string) error { + return vr.setStateWithDBClient(vr.dbClient, state, message) +} + +// setStateWithDBClientImmediate is setStateWithDBClient; the name survives at +// call sites (e.g. the parallel commitLoop) to document that the stop-state +// write executes immediately within the connection's open transaction. +func (vr *vreplicator) setStateWithDBClientImmediate(dbClient *vdbClient, state binlogdatapb.VReplicationWorkflowState, message string) error { + return vr.setStateWithDBClient(dbClient, state, message) +} + +// setStateWithDBClient writes the workflow's state/message row to +// _vt.vreplication using the supplied connection. Mid-batch, it flushes the +// pending batch first and executes its own writes immediately (still inside +// the same open MySQL transaction), marking the buffer flushed so nothing +// double-executes on the later batch commit. 
+func (vr *vreplicator) setStateWithDBClient(dbClient *vdbClient, state binlogdatapb.VReplicationWorkflowState, message string) error { if message != "" { vr.stats.History.Add(&binlogplayer.StatsHistoryRecord{ Time: time.Now(), @@ -567,20 +615,33 @@ func (vr *vreplicator) setState(state binlogdatapb.VReplicationWorkflowState, me } vr.stats.State.Store(state.String()) query := fmt.Sprintf("update _vt.vreplication set state=%v, message=left(%v, 1000) where id=%v", encodeString(state.String()), encodeString(binlogplayer.MessageTruncate(message)), vr.id) - // If we're batching a transaction, then include the state update - // in the current transaction batch. - if vr.dbClient.InTransaction && vr.dbClient.maxBatchSize > 0 { - vr.dbClient.AddQueryToTrxBatch(query) - } else { // Otherwise, send it down the wire - if _, err := vr.dbClient.ExecuteFetch(query, 1); err != nil { - return fmt.Errorf("could not set state: %v: %v", query, err) + // In batch-commit mode, queries run via ExecuteFetch execute on the wire + // AND get appended to the trx batch buffer (for Retry). A later + // CommitTrxQueryBatch would replay them in a fresh MySQL transaction, + // doubling the state UPDATE and the vreplication_log SELECT/INSERT that + // insertLog below issues, and breaking atomicity with the position + // write. So mid-batch we always: flush the pending batch first (the + // flush stays inside the same open MySQL transaction, preserving + // stop-path atomicity with the position update), run the state write + // and insertLog immediately, and mark the buffer flushed on EVERY exit + // path so the caller's CommitTrxQueryBatch only sends "commit". + // (Deferring the state UPDATE into the batch instead is not an option: + // insertLog must read getLastLog and cannot be batched, so its + // statements would double-execute on replay.) 
+ if dbClient.InTransaction && dbClient.maxBatchSize > 0 { + if _, err := dbClient.ExecuteTrxQueryBatch(); err != nil { + return fmt.Errorf("could not flush pending batched queries before set state: %v: %v", query, err) } + defer dbClient.markTrxBatchedQueriesFlushed() } - if state == vr.state { + if _, err := dbClient.ExecuteFetch(query, 1); err != nil { + return fmt.Errorf("could not set state: %v: %v", query, err) + } + if state == vr.getState() { return nil } - insertLog(vr.dbClient, LogStateChange, vr.id, state.String(), message) - vr.state = state + insertLog(dbClient, LogStateChange, vr.id, state.String(), message) + vr.storeState(state) return nil } @@ -632,7 +693,16 @@ func (vr *vreplicator) getSettingFKRestrict() error { func (vr *vreplicator) resetFKCheckAfterCopy(dbClient *vdbClient) error { _, err := dbClient.Execute(fmt.Sprintf("set @@session.foreign_key_checks=%d", vr.originalFKCheckSetting)) - return err + if err != nil { + return err + } + // Keep the connection's cached FK session state coherent: updateFKCheck + // skips its SET when the cache says the session already matches, so a + // session mutation here must be reflected in the cache or the applier + // will silently run with the wrong foreign_key_checks setting. + dbClient.foreignKeyChecksEnabled = vr.originalFKCheckSetting != 0 + dbClient.foreignKeyChecksStateInitialized = true + return nil } func (vr *vreplicator) resetFKRestrictAfterCopy(dbClient *vdbClient) error { @@ -738,7 +808,14 @@ func (vr *vreplicator) updateHeartbeatTime(tm int64) error { func (vr *vreplicator) clearFKCheck(dbClient *vdbClient) error { _, err := dbClient.Execute("set @@session.foreign_key_checks=0") - return err + if err != nil { + return err + } + // See resetFKCheckAfterCopy: the cached FK session state must follow + // every out-of-band session mutation. 
+ dbClient.foreignKeyChecksEnabled = false + dbClient.foreignKeyChecksStateInitialized = true + return nil } func (vr *vreplicator) clearFKRestrict(dbClient *vdbClient) error { @@ -861,6 +938,57 @@ func (vr *vreplicator) stashSecondaryKeys(ctx context.Context, tableName string) } func (vr *vreplicator) getTableSecondaryKeys(ctx context.Context, tableName string) ([]*sqlparser.IndexDefinition, error) { + tableSpec, err := vr.getTargetTableSpec(ctx, tableName) + if err != nil { + return nil, err + } + return extractSecondaryKeys(tableSpec), nil +} + +// extractSecondaryKeys returns the non-PK, non-FK-backed secondary +// indexes on a parsed CreateTable. Indexes that exist only to satisfy +// a foreign-key constraint are filtered out because dropping them +// would break the constraint. +func extractSecondaryKeys(tableSpec *sqlparser.TableSpec) []*sqlparser.IndexDefinition { + if tableSpec == nil { + return nil + } + var secondaryKeys []*sqlparser.IndexDefinition + fkIndexCols := make(map[string]bool) + for _, constraint := range tableSpec.Constraints { + if fkDef, ok := constraint.Details.(*sqlparser.ForeignKeyDefinition); ok { + fkCols := make([]string, len(fkDef.Source)) + for i, fkCol := range fkDef.Source { + fkCols[i] = fkCol.Lowered() + } + fkIndexCols[strings.Join(fkCols, ",")] = true + } + } + for _, index := range tableSpec.Indexes { + if index.Info.Type != sqlparser.IndexTypePrimary { + cols := make([]string, len(index.Columns)) + for i, col := range index.Columns { + cols[i] = col.Column.Lowered() + } + if fkIndexCols[strings.Join(cols, ",")] { + // This index is needed for a FK constraint so we cannot drop it. + continue + } + secondaryKeys = append(secondaryKeys, index) + } + } + return secondaryKeys +} + +// getTargetTableSpec fetches the target-side CREATE TABLE for the +// named table and returns its parsed TableSpec. Used by helpers that +// need to reason about target structure after the stream is running — +// e.g. 
detecting extra unique secondary indexes that affect the +// parallel applier's conflict detection. +func (vr *vreplicator) getTargetTableSpec(ctx context.Context, tableName string) (*sqlparser.TableSpec, error) { + if vr.mysqld == nil || vr.vre == nil || vr.vre.env == nil { + return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "missing schema lookup dependencies for %s", tableName) + } req := &tabletmanagerdatapb.GetSchemaRequest{Tables: []string{tableName}} schema, err := vr.mysqld.GetSchema(ctx, vr.dbClient.DBName(), req) if err != nil { @@ -872,10 +1000,9 @@ func (vr *vreplicator) getTableSecondaryKeys(ctx context.Context, tableName stri tableName, len(schema.TableDefinitions)) } tableSchema := schema.TableDefinitions[0].Schema - var secondaryKeys []*sqlparser.IndexDefinition parsedDDL, err := vr.vre.env.Parser().ParseStrictDDL(tableSchema) if err != nil { - return secondaryKeys, err + return nil, err } createTable, ok := parsedDDL.(*sqlparser.CreateTable) // createTable or createTable.TableSpec should never be nil @@ -883,32 +1010,149 @@ func (vr *vreplicator) getTableSecondaryKeys(ctx context.Context, tableName stri if !ok || createTable == nil || createTable.GetTableSpec() == nil { return nil, fmt.Errorf("could not determine CREATE TABLE statement from table schema %q", tableSchema) } + return createTable.GetTableSpec(), nil +} - tableSpec := createTable.GetTableSpec() - fkIndexCols := make(map[string]bool) - for _, constraint := range tableSpec.Constraints { - if fkDef, ok := constraint.Details.(*sqlparser.ForeignKeyDefinition); ok { - fkCols := make([]string, len(fkDef.Source)) - for i, fkCol := range fkDef.Source { - fkCols[i] = fkCol.Lowered() +// writesetUniqueKeys analyzes the target table's unique secondary indexes +// for parallel-apply writeset hashing. nil plan -> (nil, false, nil). 
+func (vr *vreplicator) writesetUniqueKeys(ctx context.Context, tableName string, plan *TablePlan) (uniqueKeys [][]string, mustSerialize bool, err error) { + if plan == nil { + return nil, false, nil + } + tableSpec, err := vr.getTargetTableSpec(ctx, tableName) + if err != nil { + return nil, false, err + } + uniqueKeys, mustSerialize = writesetUniqueKeysFromSpec(plan, tableSpec) + return uniqueKeys, mustSerialize, nil +} + +// writesetUniqueKeysFromSpec analyzes the target table's unique secondary +// indexes for parallel-apply writeset hashing, mirroring MySQL's WRITESET +// dependency tracking (which hashes every unique key, not just the PK: +// uniqueness constraints make transactions on DIFFERENT rows order-dependent, +// e.g. one txn freeing a unique value and another claiming it). +// It returns: +// - uniqueKeys: ordered column-name lists (lowercased, index order) of each +// plain-column unique secondary index not covered by the identity. The +// writeset builder emits additional conflict keys for these. +// - mustSerialize: true when the table carries uniqueness the hasher cannot +// reason about — prefix or expression index columns, a PK that does not +// match the replication identity, or unique secondaries with no usable +// identity — in which case the table's transactions force-serialize. +func writesetUniqueKeysFromSpec(plan *TablePlan, tableSpec *sqlparser.TableSpec) (uniqueKeys [][]string, mustSerialize bool) { + if plan == nil || tableSpec == nil { + return nil, false + } + secondaryKeys := extractSecondaryKeys(tableSpec) + if len(secondaryKeys) == 0 { + return nil, false + } + + identityCols := plan.IdentityColumns + if len(identityCols) == 0 { + // No usable identity but the table has secondary indexes that may + // enforce uniqueness we cannot reason about via PK-based writeset + // keys. Force serialization for any unique-not-null secondary so two + // parallel inserts cannot collide at apply time. 
+ for _, secondaryKey := range secondaryKeys { + if secondaryKey == nil || secondaryKey.Info == nil { + continue + } + if secondaryKey.Info.IsUnique() { + return nil, true } - fkIndexCols[strings.Join(fkCols, ",")] = true } + return nil, false } + + identityColSet := make(map[string]struct{}, len(identityCols)) + for _, col := range identityCols { + identityColSet[col] = struct{}{} + } + + primaryKeyMatchesIdentity := true + primaryKeyMatchesIdentitySet := len(identityColSet) == len(identityCols) + primaryKeyColumnCount := 0 for _, index := range tableSpec.Indexes { - if index.Info.Type != sqlparser.IndexTypePrimary { - cols := make([]string, len(index.Columns)) - for i, col := range index.Columns { - cols[i] = col.Column.Lowered() + if index == nil || index.Info == nil || index.Info.Type != sqlparser.IndexTypePrimary { + continue + } + primaryKeyColumnCount = len(index.Columns) + if primaryKeyColumnCount != len(identityCols) { + return nil, true + } + for i, idxCol := range index.Columns { + if idxCol.Expression != nil { + primaryKeyMatchesIdentity = false + primaryKeyMatchesIdentitySet = false + break } - if fkIndexCols[strings.Join(cols, ",")] { - // This index is needed for a FK constraint so we cannot drop it. + if idxCol.Length != nil { + primaryKeyMatchesIdentity = false + primaryKeyMatchesIdentitySet = false + break + } + colName := idxCol.Column.Lowered() + if colName != identityCols[i] { + primaryKeyMatchesIdentity = false + } + if _, ok := identityColSet[colName]; !ok { + primaryKeyMatchesIdentitySet = false + } + } + break + } + if primaryKeyColumnCount > 0 && !primaryKeyMatchesIdentity && !primaryKeyMatchesIdentitySet { + return nil, true + } + + for _, secondaryKey := range secondaryKeys { + if secondaryKey == nil || secondaryKey.Info == nil || !secondaryKey.Info.IsUnique() { + continue + } + // A unique secondary index can only enforce conflicts beyond the + // identity if its raw column set does not contain the identity. 
If + // the index covers (id, anything-else) and id is the identity, two + // rows with different identity values cannot collide on the index. + // Functional expressions and prefix lengths break that reasoning + // because uniqueness is enforced over a derived value rather than + // the raw column, so identity uniqueness no longer implies index + // uniqueness, and we cannot hash a faithful writeset key for them. + indexColNames := make([]string, 0, len(secondaryKey.Columns)) + indexColSet := make(map[string]struct{}, len(secondaryKey.Columns)) + hasDerivedColumn := false + for _, idxCol := range secondaryKey.Columns { + if idxCol == nil { continue } - secondaryKeys = append(secondaryKeys, index) + if idxCol.Expression != nil || idxCol.Length != nil { + hasDerivedColumn = true + break + } + colName := idxCol.Column.Lowered() + indexColNames = append(indexColNames, colName) + indexColSet[colName] = struct{}{} + } + if hasDerivedColumn { + return nil, true + } + containsIdentity := true + for _, col := range identityCols { + if _, ok := indexColSet[col]; !ok { + containsIdentity = false + break + } + } + if containsIdentity { + // The index's column set contains all identity columns, so two + // rows with different identities cannot collide on it. No extra + // writeset key needed. 
+ continue } + uniqueKeys = append(uniqueKeys, indexColNames) } - return secondaryKeys, err + return uniqueKeys, false } func (vr *vreplicator) execPostCopyActions(ctx context.Context, tableName string) error { @@ -1177,7 +1421,7 @@ func (vr *vreplicator) newClientConnection(ctx context.Context) (*vdbClient, err if err := dbc.Connect(); err != nil { return nil, vterrors.Wrap(err, "can't connect to database") } - dbClient := newVDBClient(dbc, vr.stats, vr.workflowConfig.RelayLogMaxItems) + dbClient := newVDBClientWithID(dbc, vr.stats, vr.workflowConfig.RelayLogMaxItems, vr.id) if _, err := vr.setSQLMode(ctx, dbClient); err != nil { return nil, vterrors.Wrap(err, "failed to set sql_mode") } diff --git a/go/vt/vttablet/tabletmanager/vreplication/vreplicator_test.go b/go/vt/vttablet/tabletmanager/vreplication/vreplicator_test.go index a1d28a2fd67..78d64c80b7e 100644 --- a/go/vt/vttablet/tabletmanager/vreplication/vreplicator_test.go +++ b/go/vt/vttablet/tabletmanager/vreplication/vreplicator_test.go @@ -39,12 +39,115 @@ import ( "vitess.io/vitess/go/vt/dbconfigs" "vitess.io/vitess/go/vt/mysqlctl" "vitess.io/vitess/go/vt/schemadiff" + "vitess.io/vitess/go/vt/sqlparser" vttablet "vitess.io/vitess/go/vt/vttablet/common" binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata" tabletmanagerdatapb "vitess.io/vitess/go/vt/proto/tabletmanagerdata" ) +// TestWritesetUniqueKeysFromSpec pins the spec-analysis rules the parallel +// applier relies on. Plain-column unique secondaries that aren't covered by +// the identity now emit writeset unique keys (uniqueKeys set, mustSerialize +// false) instead of force-serializing; only uniqueness the hasher cannot +// reason about (prefix/expression indexes, PK/identity mismatch, no usable +// identity) still forces serialization. 
+func TestWritesetUniqueKeysFromSpec(t *testing.T) { + parser := sqlparser.NewTestParser() + specFor := func(t *testing.T, ddl string) *sqlparser.TableSpec { + t.Helper() + parsedDDL, err := parser.ParseStrictDDL(ddl) + require.NoError(t, err) + createTable, ok := parsedDDL.(*sqlparser.CreateTable) + require.True(t, ok) + tableSpec := createTable.GetTableSpec() + require.NotNil(t, tableSpec) + return tableSpec + } + + tests := []struct { + name string + ddl string + identityCols []string + wantUniqueKeys [][]string + wantMustSerialize bool + }{ + { + // No usable identity but a unique-not-null secondary the + // PK-based writeset can't reason about: force serialization. + name: "no identity with unique secondary", + ddl: "create table t1 (id int, email varchar(64) not null, unique key uk_email(email))", + identityCols: nil, + wantMustSerialize: true, + }, + { + // Plain single-column unique secondary not covered by the + // identity: emit a writeset unique key, don't serialize. + name: "plain unique secondary emits key", + ddl: "create table t1 (id int not null, email varchar(64) not null, primary key(id), unique key uk_email(email))", + identityCols: []string{"id"}, + wantUniqueKeys: [][]string{{"email"}}, + }, + { + // Multi-column plain unique secondary: ordered column list. + name: "composite unique secondary emits ordered key", + ddl: "create table t1 (id int not null, a int not null, b int not null, primary key(id), unique key uk_ab(a, b))", + identityCols: []string{"id"}, + wantUniqueKeys: [][]string{{"a", "b"}}, + }, + { + // Unique secondary whose column set contains the identity can't + // create cross-identity conflicts: skip it (no key, no serialize). + name: "unique secondary covering identity is skipped", + ddl: "create table t1 (id int not null, b int not null, primary key(id), unique key uk_idb(id, b))", + identityCols: []string{"id"}, + }, + { + // Prefix index on the unique secondary: uniqueness is over a + // derived value, force serialization. 
+ name: "prefix unique secondary serializes", + ddl: "create table t1 (id int not null, email varchar(64) not null, primary key(id), unique key uk_email(email(8)))", + identityCols: []string{"id"}, + wantMustSerialize: true, + }, + { + // Expression/functional unique index: force serialization. + name: "expression unique secondary serializes", + ddl: "create table t1 (id int not null, email varchar(64) not null, primary key(id), unique key uk_email((lower(email))))", + identityCols: []string{"id"}, + wantMustSerialize: true, + }, + { + // PK does not match the chosen replication identity: the + // PK-based writeset key is unreliable, force serialization. + name: "pk identity mismatch serializes", + ddl: "create table t1 (id int not null, email varchar(64) not null, primary key(id), unique key uk_email(email))", + identityCols: []string{"email"}, + wantMustSerialize: true, + }, + { + // A mix: one hashable key plus one covered-by-identity key. + name: "mixed hashable and covered keys", + ddl: "create table t1 (id int not null, email varchar(64) not null, b int not null, primary key(id), unique key uk_email(email), unique key uk_idb(id, b))", + identityCols: []string{"id"}, + wantUniqueKeys: [][]string{{"email"}}, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + tableSpec := specFor(t, tc.ddl) + plan := &TablePlan{ + TargetName: "t1", + IdentityColumns: tc.identityCols, + } + uniqueKeys, mustSerialize := writesetUniqueKeysFromSpec(plan, tableSpec) + assert.Equal(t, tc.wantMustSerialize, mustSerialize) + assert.Equal(t, tc.wantUniqueKeys, uniqueKeys) + }) + } +} + func TestMaxQuerySize(t *testing.T) { makeVR := func(dbClient binlogplayer.DBClient, relayLogMaxSize int) *vreplicator { stats := binlogplayer.NewStats() @@ -984,3 +1087,77 @@ func TestThrottlerAppNames(t *testing.T) { assert.Contains(t, vc.throttlerAppName, "vcopier") assert.NotContains(t, vc.throttlerAppName, "vplayer") } + +// TestFKCheckHelpersUpdateSessionCache pins that 
clearFKCheck and +// resetFKCheckAfterCopy keep the vdbClient's cached foreign_key_checks +// session state coherent. updateFKCheck skips the SET when the cache says +// the session already matches, so any helper that mutates the session +// out-of-band MUST update the cache — otherwise an atomic-copy workflow's +// catchup -> clearFKCheck -> copy -> resetFKCheckAfterCopy -> catchup cycle +// leaves the session FK state out of sync with what the applier believes, +// silently applying with the wrong foreign_key_checks setting. +func TestFKCheckHelpersUpdateSessionCache(t *testing.T) { + dbc := binlogplayer.NewMockDBClient(t) + vdbc := newVDBClient(dbc, binlogplayer.NewStats(), 0) + vr := &vreplicator{originalFKCheckSetting: 1} + + // Simulate updateFKCheck having initialized the cache with FK checks ON. + vdbc.foreignKeyChecksEnabled = true + vdbc.foreignKeyChecksStateInitialized = true + + // The mock treats "set @@session.foreign_key_checks..." as an invariant, + // so no per-query expectations are needed. + require.NoError(t, vr.clearFKCheck(vdbc)) + assert.False(t, vdbc.foreignKeyChecksEnabled, "clearFKCheck must record FK checks as disabled in the session cache") + assert.True(t, vdbc.foreignKeyChecksStateInitialized) + + require.NoError(t, vr.resetFKCheckAfterCopy(vdbc)) + assert.True(t, vdbc.foreignKeyChecksEnabled, "resetFKCheckAfterCopy must record the restored FK state in the session cache") + assert.True(t, vdbc.foreignKeyChecksStateInitialized) +} + +// copyPhaseDBClient serves loadSettings like failingDBClient but reports a +// non-empty _vt.copy_state, simulating a workflow restarted mid-copy. 
+type copyPhaseDBClient struct { + failingDBClient +} + +func (c *copyPhaseDBClient) ExecuteFetch(query string, maxrows int) (*sqltypes.Result, error) { + if strings.Contains(query, "from _vt.copy_state where vrepl_id=") { + return sqltypes.MakeTestResult( + sqltypes.MakeTestFields("count(distinct table_name)", "int64"), + "2", + ), nil + } + return c.failingDBClient.ExecuteFetch(query, maxrows) +} + +// TestLoadSettingsTracksCopyPhase is a restart-style regression test: a fresh +// vreplicator (in-memory state at its zero value, as after a tablet or +// controller restart) whose durable _vt.copy_state still has rows must report +// that it is in the copy phase. The controller's AtomicCopy terminal-error +// guard needs this durable-evidence signal because the AtomicCopy copy path +// (copyAll) never calls setState(Copying) — only initTablesForCopy does, on +// first start — so after a restart the entire remaining copy phase would +// otherwise run with vr.state at zero and copy-phase errors would be +// misclassified as retryable. +func TestLoadSettingsTracksCopyPhase(t *testing.T) { + vr := &vreplicator{ + id: 1, + dbClient: newVDBClient(&copyPhaseDBClient{}, binlogplayer.NewStats(), 0), + } + require.False(t, vr.isInCopyPhase(), "zero value must preserve existing (retryable) behavior") + + _, numTablesToCopy, err := vr.loadSettings(t.Context(), vr.dbClient) + require.NoError(t, err) + require.Equal(t, int64(2), numTablesToCopy) + require.True(t, vr.isInCopyPhase(), "loadSettings must record that tables remain to be copied") + + // Once the copy completes (no copy_state rows), the next loadSettings + // clears the flag so post-copy errors are classified as before. 
+ vr.dbClient = newVDBClient(&failingDBClient{}, binlogplayer.NewStats(), 0) + _, numTablesToCopy, err = vr.loadSettings(t.Context(), vr.dbClient) + require.NoError(t, err) + require.Equal(t, int64(0), numTablesToCopy) + require.False(t, vr.isInCopyPhase()) +} diff --git a/go/vt/vttablet/tabletserver/vstreamer/vstreamer.go b/go/vt/vttablet/tabletserver/vstreamer/vstreamer.go index 5816afd8091..6cbe1baf391 100644 --- a/go/vt/vttablet/tabletserver/vstreamer/vstreamer.go +++ b/go/vt/vttablet/tabletserver/vstreamer/vstreamer.go @@ -24,6 +24,7 @@ import ( "io" "slices" "strings" + "sync" "sync/atomic" "time" @@ -260,11 +261,27 @@ func (vs *vstreamer) refreshHistorianForStartup(ctx context.Context) error { // parseEvents parses and sends events. func (vs *vstreamer) parseEvents(ctx context.Context, events <-chan mysql.BinlogEvent, errs <-chan error) error { + ctx, cancel := context.WithCancel(ctx) // bufferAndTransmit uses bufferedEvents and curSize to buffer events. var ( bufferedEvents []*binlogdatapb.VEvent curSize int ) + var pendingStreamErr error + drainSourceEvents := make(chan struct{}) + var drainSourceEventsOnce sync.Once + signalDrainSourceEvents := func() { + drainSourceEventsOnce.Do(func() { + close(drainSourceEvents) + }) + } + recordSourceStreamErr := func(err error, ok bool) { + if ok && err != nil && pendingStreamErr == nil { + pendingStreamErr = err + signalDrainSourceEvents() + } + errs = nil + } // Only the following patterns are possible: // BEGIN->ROWs or Statements->GTID->COMMIT. In the case of large transactions, this can be broken into chunks. 
@@ -400,29 +417,56 @@ func (vs *vstreamer) parseEvents(ctx context.Context, events <-chan mysql.Binlog wfNameLog = " in workflow " + vs.filter.WorkflowName } throttlerErrs := make(chan error, 1) // How we share the error when we've been fully throttled too long - defer close(throttlerErrs) throttleEvents := func(throttledEvents chan mysql.BinlogEvent) { + drainingAfterSourceError := false throttledTime := atomic.Int64{} for { - // Check throttler. - if checkResult, ok := vs.vse.throttlerClient.ThrottleCheckOKOrWaitAppName(ctx, vs.throttlerApp); !ok { - // Make sure to leave if context is cancelled. + if !drainingAfterSourceError { select { - case <-ctx.Done(): - return + case <-drainSourceEvents: + drainingAfterSourceError = true default: - // Do nothing special. } - vs.vse.throttledCounts.Add(1) - curtime := time.Now().Unix() - if !throttledTime.CompareAndSwap(0, curtime) { - if curtime-throttledTime.Load() > int64(fullyThrottledTimeout.Seconds()) { - throttlerErrs <- vterrors.Errorf(vtrpcpb.Code_INTERNAL, "vstreamer has been fully throttled for more than %v, giving up so that we can retry", fullyThrottledTimeout) + } + // Check throttler. + if !drainingAfterSourceError { + if checkResult, ok := vs.vse.throttlerClient.ThrottleCheckOKOrWaitAppName(ctx, vs.throttlerApp); !ok { + // Make sure to leave if context is cancelled. + select { + case <-ctx.Done(): return + default: + // Do nothing special. + } + select { + case <-drainSourceEvents: + drainingAfterSourceError = true + throttledTime.Store(0) + continue + default: + } + // Count only iterations that remain throttled: the drain transition + // above is not a throttle wait. 
+ vs.vse.throttledCounts.Add(1) + curtime := time.Now().Unix() + if !throttledTime.CompareAndSwap(0, curtime) { + if curtime-throttledTime.Load() > int64(fullyThrottledTimeout.Seconds()) { + throttlerErrs <- vterrors.Errorf(vtrpcpb.Code_INTERNAL, "vstreamer has been fully throttled for more than %v, giving up so that we can retry", fullyThrottledTimeout) + // Close throttledEvents so the main parseEvents loop's + // `case ev, ok := <-throttledEvents` fires with ok=false + // and can return the throttler error (or a pending + // source error). Without this close, if pendingStreamErr + // is already set the main loop's throttlerErrs case + // `continue`s and the only remaining live select case + // is hbTimer.C, which spins forever swallowing the + // pending error until the caller cancels. + close(throttledEvents) + return + } } + logger.Infof("vstreamer throttled%s: %s.", wfNameLog, checkResult.Summary()) + continue } - logger.Infof("vstreamer throttled%s: %s.", wfNameLog, checkResult.Summary()) - continue } throttledTime.Store(0) // We are no longer fully throttled select { @@ -446,7 +490,32 @@ func (vs *vstreamer) parseEvents(ctx context.Context, events <-chan mysql.Binlog // throttledEvents pulls data from events, but throttles pulling data, // which in turn blocks the BinlogConnection from pushing events to the channel throttledEvents := make(chan mysql.BinlogEvent) - go throttleEvents(throttledEvents) + throttleEventsDone := make(chan struct{}) + go func() { + defer close(throttleEventsDone) + throttleEvents(throttledEvents) + }() + defer func() { + cancel() + <-throttleEventsDone + }() + handleThrottledEvent := func(ev mysql.BinlogEvent) error { + vevents, err := vs.parseEvent(ev, bufferAndTransmit) + if err != nil { + vs.vse.errorCounts.Add("ParseEvent", 1) + return err + } + for _, vevent := range vevents { + if err := bufferAndTransmit(vevent); err != nil { + if err == io.EOF { + return err + } + vs.vse.errorCounts.Add("BufferAndTransmit", 1) + return 
vterrors.Wrapf(err, "error sending event: %+v", vevent) + } + } + return nil + } for { hbTimer.Reset(HeartbeatTime) @@ -459,48 +528,79 @@ func (vs *vstreamer) parseEvents(ctx context.Context, events <-chan mysql.Binlog select { case ev, ok := <-throttledEvents: if !ok { + if pendingStreamErr != nil { + return pendingStreamErr + } + if errs != nil { + select { + case err, ok := <-errs: + recordSourceStreamErr(err, ok) + default: + } + } + if pendingStreamErr != nil { + return pendingStreamErr + } + // throttleEvents closes throttledEvents right after sending its + // timeout error to throttlerErrs; both select cases become ready + // at once and Go picks randomly, so when the closed-channel case + // wins we must surface the real throttler error rather than a + // misleading "unexpected server EOF". + select { + case err := <-throttlerErrs: + if err != nil { + return err + } + default: + } select { - case err := <-errs: - return err case <-ctx.Done(): return nil default: } return vterrors.Errorf(vtrpcpb.Code_ABORTED, "unexpected server EOF while parsing events") } - vevents, err := vs.parseEvent(ev, bufferAndTransmit) - if err != nil { - vs.vse.errorCounts.Add("ParseEvent", 1) + if err := func() error { + return handleThrottledEvent(ev) + }(); err != nil { + if err == io.EOF { + return nil + } return err } - for _, vevent := range vevents { - if err := bufferAndTransmit(vevent); err != nil { - if err == io.EOF { - return nil + case vs.vschema = <-vs.vevents: + if pendingStreamErr != nil { + continue + } + if errs != nil { + select { + case err, ok := <-errs: + recordSourceStreamErr(err, ok) + if pendingStreamErr != nil { + continue } - vs.vse.errorCounts.Add("BufferAndTransmit", 1) - return vterrors.Wrapf(err, "error sending event: %+v", vevent) + case <-ctx.Done(): + return nil + default: } } - case vs.vschema = <-vs.vevents: - select { - case err := <-errs: - return err - case <-ctx.Done(): - return nil - default: - if err := vs.rebuildPlans(); err != nil { - 
return vterrors.Wrap(err, "failed to rebuild replication plans after vschema change notification") - } + if err := vs.rebuildPlans(); err != nil { + return vterrors.Wrap(err, "failed to rebuild replication plans after vschema change notification") } - case err := <-errs: - return err + case err, ok := <-errs: + recordSourceStreamErr(err, ok) case throttlerErr := <-throttlerErrs: + if pendingStreamErr != nil { + continue + } vs.vse.errorCounts.Add(fullyThrottledMetricLabel, 1) return throttlerErr case <-ctx.Done(): return nil case <-hbTimer.C: + if pendingStreamErr != nil { + continue + } checkResult, ok := vs.vse.throttlerClient.ThrottleCheckOK(ctx, vs.throttlerApp) if err := injectHeartbeat(!ok, checkResult.Summary()); err != nil { if err == io.EOF { diff --git a/go/vt/vttablet/tabletserver/vstreamer/vstreamer_test.go b/go/vt/vttablet/tabletserver/vstreamer/vstreamer_test.go index 7c3e6edda72..c84ea33eff8 100644 --- a/go/vt/vttablet/tabletserver/vstreamer/vstreamer_test.go +++ b/go/vt/vttablet/tabletserver/vstreamer/vstreamer_test.go @@ -39,6 +39,7 @@ import ( "vitess.io/vitess/go/mysql/fakesqldb" "vitess.io/vitess/go/mysql/replication" "vitess.io/vitess/go/sqltypes" + "vitess.io/vitess/go/stats" "vitess.io/vitess/go/vt/dbconfigs" "vitess.io/vitess/go/vt/log" "vitess.io/vitess/go/vt/sqlparser" @@ -46,6 +47,8 @@ import ( "vitess.io/vitess/go/vt/vterrors" "vitess.io/vitess/go/vt/vttablet/tabletserver/schema" "vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv" + "vitess.io/vitess/go/vt/vttablet/tabletserver/throttle" + throttlebase "vitess.io/vitess/go/vt/vttablet/tabletserver/throttle/base" "vitess.io/vitess/go/vt/vttablet/tabletserver/throttle/throttlerapp" "vitess.io/vitess/go/vt/vttablet/tabletserver/vstreamer/testenv" @@ -58,6 +61,286 @@ type testcase struct { output [][]string } +func TestParseEventsDrainsBufferedEventsBeforeTerminalError(t *testing.T) { + f := mysql.NewMySQL56BinlogFormat() + s := mysql.NewFakeBinlogStream() + s.ServerID = 62344 + + input 
:= []mysql.BinlogEvent{ + mysql.NewRotateEvent(f, s, 0, ""), + mysql.NewFormatDescriptionEvent(f, s), + mysql.NewMariaDBGTIDEvent(f, s, replication.MariadbGTID{Domain: 0, Sequence: 0xd}, false /* hasBegin */), + mysql.NewXIDEvent(f, s), + } + + streamErr := errors.New("stream ended after buffered events") + cp := dbconfigs.New(&mysql.ConnParams{DbName: testenv.DBName}) + // A nil throttlerClient is intentional and safe: Client.ThrottleCheckOK + // nil-checks its receiver and reports "not throttled". + vse := &Engine{keyspace: testenv.DBName, shard: testenv.DefaultShard, throttledCounts: stats.NewCounter("", "")} + + for i := range 64 { + events := make(chan mysql.BinlogEvent, len(input)) + errs := make(chan error, 1) + for _, ev := range input { + events <- ev + } + close(events) + errs <- streamErr + close(errs) + + var got [][]*binlogdatapb.VEvent + vs := &vstreamer{ + ctx: t.Context(), + cp: cp, + send: func(vevents []*binlogdatapb.VEvent) error { + got = append(got, vevents) + return nil + }, + vse: vse, + } + + err := vs.parseEvents(t.Context(), events, errs) + require.ErrorIs(t, err, streamErr, "iteration %d", i) + require.Len(t, got, 1, "iteration %d", i) + require.Len(t, got[0], 2, "iteration %d", i) + require.Equal(t, binlogdatapb.VEventType_GTID, got[0][0].Type, "iteration %d", i) + require.Equal(t, binlogdatapb.VEventType_COMMIT, got[0][1].Type, "iteration %d", i) + require.Equal(t, testenv.DBName, got[0][0].Keyspace, "iteration %d", i) + require.Equal(t, testenv.DefaultShard, got[0][0].Shard, "iteration %d", i) + require.Equal(t, testenv.DBName, got[0][1].Keyspace, "iteration %d", i) + require.Equal(t, testenv.DefaultShard, got[0][1].Shard, "iteration %d", i) + } +} + +func TestParseEventsDrainsBufferedEventsBeforeTerminalErrorWhenThrottled(t *testing.T) { + f := mysql.NewMySQL56BinlogFormat() + s := mysql.NewFakeBinlogStream() + s.ServerID = 62344 + + input := []mysql.BinlogEvent{ + mysql.NewRotateEvent(f, s, 0, ""), + 
mysql.NewFormatDescriptionEvent(f, s), + mysql.NewMariaDBGTIDEvent(f, s, replication.MariadbGTID{Domain: 0, Sequence: 0xd}, false /* hasBegin */), + mysql.NewXIDEvent(f, s), + } + + streamErr := errors.New("stream ended after buffered events") + cp := dbconfigs.New(&mysql.ConnParams{DbName: testenv.DBName}) + vse := &Engine{ + keyspace: testenv.DBName, + shard: testenv.DefaultShard, + throttlerClient: throttle.NewBackgroundClient(nil, throttlerapp.VStreamerName, throttlebase.UndefinedScope), + // Unpublished counter (empty name skips stats registration): this bare + // Engine bypasses NewEngine, so any counter the production code touches + // must be non-nil here. + throttledCounts: stats.NewCounter("", ""), + } + + events := make(chan mysql.BinlogEvent, len(input)) + for _, ev := range input { + events <- ev + } + close(events) + errCh := make(chan error, 1) + + ctx, cancel := context.WithCancel(t.Context()) + defer cancel() + done := make(chan error, 1) + var got [][]*binlogdatapb.VEvent + vs := &vstreamer{ + ctx: ctx, + cp: cp, + throttlerApp: throttlerapp.TestingAlwaysThrottledName, + send: func(vevents []*binlogdatapb.VEvent) error { + got = append(got, vevents) + return nil + }, + vse: vse, + } + + go func() { + done <- vs.parseEvents(ctx, events, errCh) + }() + go func() { + tmr := time.NewTimer(100 * time.Millisecond) + defer tmr.Stop() + select { + case <-ctx.Done(): + case <-tmr.C: + errCh <- streamErr + close(errCh) + } + }() + + var err error + require.Eventually(t, func() bool { + select { + case err = <-done: + return true + default: + return false + } + }, 2*time.Second, 50*time.Millisecond) + require.ErrorIs(t, err, streamErr) + require.Len(t, got, 1) + require.Len(t, got[0], 2) + require.Equal(t, binlogdatapb.VEventType_GTID, got[0][0].Type) + require.Equal(t, binlogdatapb.VEventType_COMMIT, got[0][1].Type) +} + +func TestParseEventsReturnsNilOnClientEOF(t *testing.T) { + f := mysql.NewMySQL56BinlogFormat() + s := mysql.NewFakeBinlogStream() + 
s.ServerID = 62344 + + input := []mysql.BinlogEvent{ + mysql.NewRotateEvent(f, s, 0, ""), + mysql.NewFormatDescriptionEvent(f, s), + mysql.NewMariaDBGTIDEvent(f, s, replication.MariadbGTID{Domain: 0, Sequence: 0xd}, false /* hasBegin */), + mysql.NewXIDEvent(f, s), + } + + events := make(chan mysql.BinlogEvent, len(input)) + for _, ev := range input { + events <- ev + } + close(events) + errCh := make(chan error) + close(errCh) + + cp := dbconfigs.New(&mysql.ConnParams{DbName: testenv.DBName}) + // A nil throttlerClient is intentional and safe: Client.ThrottleCheckOK + // nil-checks its receiver and reports "not throttled". + vse := &Engine{keyspace: testenv.DBName, shard: testenv.DefaultShard, throttledCounts: stats.NewCounter("", "")} + + sendCalls := 0 + vs := &vstreamer{ + ctx: t.Context(), + cp: cp, + send: func(vevents []*binlogdatapb.VEvent) error { + sendCalls++ + return io.EOF + }, + vse: vse, + } + + err := vs.parseEvents(t.Context(), events, errCh) + require.NoError(t, err) + require.Equal(t, 1, sendCalls) +} + +func TestParseEventsClientEOFDuringThrottleDoesNotPanicAfterReturn(t *testing.T) { + origTimeout := fullyThrottledTimeout + origHeartbeatTime := HeartbeatTime + fullyThrottledTimeout = -time.Second + HeartbeatTime = 10 * time.Millisecond + t.Cleanup(func() { + fullyThrottledTimeout = origTimeout + HeartbeatTime = origHeartbeatTime + }) + + events := make(chan mysql.BinlogEvent) + close(events) + errCh := make(chan error) + + cp := dbconfigs.New(&mysql.ConnParams{DbName: testenv.DBName}) + vse := &Engine{ + keyspace: testenv.DBName, + shard: testenv.DefaultShard, + throttlerClient: throttle.NewBackgroundClient(nil, throttlerapp.VStreamerName, throttlebase.UndefinedScope), + // Unpublished counter: bare Engines bypass NewEngine, so counters the + // production code touches must be non-nil. 
+ throttledCounts: stats.NewCounter("", ""), + } + + vs := &vstreamer{ + ctx: t.Context(), + cp: cp, + throttlerApp: throttlerapp.TestingAlwaysThrottledName, + send: func(vevents []*binlogdatapb.VEvent) error { + require.Len(t, vevents, 1) + require.Equal(t, binlogdatapb.VEventType_HEARTBEAT, vevents[0].Type) + return io.EOF + }, + vse: vse, + } + + err := vs.parseEvents(t.Context(), events, errCh) + require.NoError(t, err) + + // Give the throttling goroutine time to hit its fully-throttled timeout path. + // The background client sleeps once per throttle check, so the sender needs + // two iterations before it reaches the send. Before the fix, parseEvents + // closed throttlerErrs on return, so the sender panicked here with + // "send on closed channel". + time.Sleep(750 * time.Millisecond) +} + +func TestParseEventsReturnsPendingSourceErrorAfterFullyThrottledTimeout(t *testing.T) { + origTimeout := fullyThrottledTimeout + origHeartbeatTime := HeartbeatTime + fullyThrottledTimeout = -time.Second + HeartbeatTime = time.Hour + t.Cleanup(func() { + fullyThrottledTimeout = origTimeout + HeartbeatTime = origHeartbeatTime + }) + + ctx, cancel := context.WithTimeout(t.Context(), time.Second) + t.Cleanup(cancel) + + streamErr := errors.New("stream ended while throttler wait was sleeping") + events := make(chan mysql.BinlogEvent) + errCh := make(chan error, 1) + cp := dbconfigs.New(&mysql.ConnParams{DbName: testenv.DBName}) + vse := &Engine{ + keyspace: testenv.DBName, + shard: testenv.DefaultShard, + throttlerClient: throttle.NewBackgroundClient(nil, throttlerapp.VStreamerName, throttlebase.UndefinedScope), + // Unpublished counter: bare Engines bypass NewEngine, so counters the + // production code touches must be non-nil. 
+ throttledCounts: stats.NewCounter("", ""), + } + + vs := &vstreamer{ + ctx: ctx, + cp: cp, + throttlerApp: throttlerapp.TestingAlwaysThrottledName, + send: func(vevents []*binlogdatapb.VEvent) error { + return nil + }, + vse: vse, + } + + done := make(chan error, 1) + go func() { + done <- vs.parseEvents(ctx, events, errCh) + }() + go func() { + tmr := time.NewTimer(400 * time.Millisecond) + defer tmr.Stop() + select { + case <-ctx.Done(): + case <-tmr.C: + close(events) + errCh <- streamErr + close(errCh) + } + }() + + var err error + require.Eventually(t, func() bool { + select { + case err = <-done: + return true + default: + return false + } + }, 2*time.Second, 10*time.Millisecond) + require.ErrorIs(t, err, streamErr) +} + func checkIfOptionIsSupported(t *testing.T, variable string) bool { qr, err := env.Mysqld.FetchSuperQuery(t.Context(), fmt.Sprintf("show variables like '%s'", variable)) require.NoError(t, err) diff --git a/test/config.json b/test/config.json index 0b6de16beea..fad122d5f63 100644 --- a/test/config.json +++ b/test/config.json @@ -243,6 +243,7 @@ "Shard": "xb_backup", "Tags": [], "Needs": [ + "larger-runner", "xtrabackup" ] }, @@ -262,6 +263,7 @@ "Shard": "xb_backup", "Tags": [], "Needs": [ + "larger-runner", "xtrabackup" ] }, @@ -2108,20 +2110,20 @@ "binlog-compression" ] }, - "vreplication_copy_parallel": { + "vreplication_parallel": { "File": "unused.go", "Packages": [ "vitess.io/vitess/go/test/endtoend/vreplication" ], "Args": [ "-run", - "TestVreplicationCopyParallel", + "TestVreplicationParallel", "-timeout", "20m" ], "Command": [], "Manual": false, - "Shard": "vreplication_copy_parallel", + "Shard": "vreplication_parallel", "Tags": [], "Needs": [ "larger-runner",