diff --git a/.golangci.yml b/.golangci.yml
index 87a605cda55..5cbf22a13f4 100644
--- a/.golangci.yml
+++ b/.golangci.yml
@@ -236,3 +236,4 @@ formatters:
paths:
- examples$
- ^go/vt/proto/
+ - ^test/antithesis/
diff --git a/CLAUDE.md b/CLAUDE.md
index 355b21fe2fa..2a0e8bf24b6 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -197,6 +197,11 @@ return user.NeedsMigration() && migrate(user) || user
- **Copyright header** - New Go files must include the project copyright header with the current year
- **Always run `gofumpt -w`** on changed Go files before committing - this is mandatory
- **Always run `goimports -local "vitess.io/vitess" -w`** on changed Go files before committing
+- **Always run `golangci-lint run --path-mode=abs --timeout 10m`** (from the `go/` directory, scoped to the changed package(s)) before reporting work complete. CI runs it and will surface modernize/style issues that `go vet`, `gofumpt`, and `goimports` do not — for example:
+ - `waitgroup`: prefer `WaitGroup.Go(func() { ... })` over `wg.Add(1); go func() { defer wg.Done(); ... }()`
+ - `rangeint`: prefer `for range N` over `for i := 0; i < N; i++` when the index is unused
+ - `bloop`: prefer `b.Loop()` over `for i := 0; i < b.N; i++` in benchmarks
+ - `unusedparams`, `unusedwrite`, `unusedfunc`: clean these in code you touch
- **Use format verbs precisely** - Use `%s` for strings and `%d` for integers, not `%v` for everything
- **Structured logging** - New log messages should use structured logging with `slog`-style fields (e.g., `log.Warn("message", slog.Any("error", err))`) rather than printf-style logging with format strings
- **Reuse existing helpers** - Before writing new parsing/validation code, check for existing utilities (e.g., `sqlerror` package for MySQL error codes, `mysqlctl.ParseVersionString()`, `strings.Split()`, `topoproto.TabletAliasString()` for formatting tablet aliases)
diff --git a/changelog/25.0/25.0.0/summary.md b/changelog/25.0/25.0.0/summary.md
index 4f4d0ca89de..46cb62ecef1 100644
--- a/changelog/25.0/25.0.0/summary.md
+++ b/changelog/25.0/25.0.0/summary.md
@@ -6,6 +6,7 @@
- **[Major Changes](#major-changes)**
- **[New Support](#new-support)**
+ - [Experimental parallel VReplication applier](#vreplication-parallel-applier)
- **[Breaking Changes](#breaking-changes)**
- [`--watch-replication-stream` flag removed](#vttablet-watch-replication-stream-removed)
- [Snapshot Topology feature removed](#vtorc-snapshot-topology-removed)
@@ -15,6 +16,8 @@
- **[Minor Changes](#minor-changes)**
- **[VReplication](#minor-changes-vreplication)**
- [Default data protection for `_reverse` workflow cancel/complete](#vreplication-reverse-workflow-data-protection)
+ - [Unknown VStream event types are now hard errors in the applier](#vreplication-unknown-event-error)
+ - [Workflow config overrides sent to source tablets are now allowlisted](#vreplication-source-overrides-allowlist)
- **[VTGate](#minor-changes-vtgate)**
- [New controls for cross-keyspace reads](#vtgate-cross-keyspace-reads)
- **[VTTablet](#minor-changes-vttablet)**
@@ -26,6 +29,15 @@
### New Support
+#### Experimental parallel VReplication applier
+
+> [!WARNING]
+> This feature is experimental.
+
+VReplication can now apply binlog events using multiple concurrent MySQL connections instead of a single serial connection. Set `--vreplication-parallel-replication-workers=N` (default `1` = serial, maximum `64`) on `vttablet`, or the `vreplication-parallel-replication-workers` per-workflow config override, to dispatch non-conflicting transactions to `N` worker goroutines during the replication (running) phase. Conflicts are detected with target-side writeset hashing (primary key, unique key, and foreign key values — similar to MySQL's own `WRITESET` dependency tracking), so it works regardless of the source's `binlog_transaction_dependency_tracking` setting. Commits remain strictly ordered, so the workflow position, lag metrics, and `WaitForPos` semantics are unchanged. Transactions the conflict detector cannot reason about (DDL, statement-based events, partial row images, prefix/expression unique indexes, and similar) fall back to serial application.
+
+Note that each worker holds two MySQL connections, so a workflow with `N` workers uses `2N+2` target-side connections.
+
### Breaking Changes
#### `--watch-replication-stream` flag removed
@@ -84,6 +96,14 @@ When calling `cancel` or `complete` on an auto-generated `_reverse` workflow wit
The `--keep-data` flag help text has been updated to note this default explicitly. This change applies to MoveTables, Reshard, and other VReplication workflow types that use the shared cancel/complete paths.
+#### Unknown VStream event types are now hard errors in the applier
+
+The VReplication applier previously ignored VStream event types it did not recognize. It now fails the workflow with an error for unknown event types (and unknown `on-ddl` actions), failing closed instead of silently skipping events. All event types produced by supported Vitess versions are handled; this only affects streams from sources emitting event types unknown to the target's version.
+
+#### Workflow config overrides sent to source tablets are now allowlisted
+
+When a workflow has per-workflow config overrides, the target now sends only the source-relevant subset (packet size, timeouts, experimental flags, and similar) to the source tablet's VStreamer instead of the full override map. This keeps newer target-only override keys from failing workflows whose source tablets run an older version that rejects unknown keys.
+
See [#19906](https://github.com/vitessio/vitess/pull/19906) for details.
### VTGate
diff --git a/examples/benchmark/bench_compare.sh b/examples/benchmark/bench_compare.sh
new file mode 100755
index 00000000000..2b0a780a6b2
--- /dev/null
+++ b/examples/benchmark/bench_compare.sh
@@ -0,0 +1,134 @@
+#!/bin/bash
+
+# A/B comparison: serial (workers=1) vs parallel (workers=4) VReplication applier
+# with mixed write workload (INSERT/UPDATE/DELETE/bulk operations).
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR" || exit 1
+
+ROW_COUNT=${ROW_COUNT:-200000}
+SEED_ROWS=${SEED_ROWS:-10000}
+RUN_ORDER=${RUN_ORDER:-random}
+export ROW_COUNT SEED_ROWS
+
+echo "============================================"
+echo " VReplication Parallel Applier Benchmark"
+echo " ROW_COUNT=$ROW_COUNT SEED_ROWS=$SEED_ROWS"
+echo "============================================"
+echo ""
+
+run_bench() {
+    # Run one benchmark pass against a freshly built cluster.
+    #   $1 - value passed to bench_setup.sh as PARALLEL_WORKERS
+    #   $2 - human-readable label used in progress and failure messages
+    # Output of bench_run.sh is mirrored to /tmp/bench_<workers>_workers.log.
+    # Returns 1 when setup or the benchmark itself fails.
+    local worker_count=$1
+    local run_label=$2
+    local log_path="/tmp/bench_${worker_count}_workers.log"
+
+    echo ">>> Run: $run_label (PARALLEL_WORKERS=$worker_count) <<<"
+    echo ""
+
+    # Best-effort removal of any earlier cluster; its stderr is discarded.
+    (cd "$SCRIPT_DIR/../local" && ./501_teardown.sh) 2>/dev/null
+
+    # Bring the cluster up with the requested worker count.
+    if ! PARALLEL_WORKERS=$worker_count ./bench_setup.sh; then
+        echo "FAILED: setup for $run_label"
+        return 1
+    fi
+
+    # pipefail lives in a subshell so tee's zero exit status cannot hide a
+    # bench_run.sh failure and the option does not leak into the caller.
+    if ! (
+        set -o pipefail
+        ./bench_run.sh 2>&1 | tee "$log_path"
+    ); then
+        echo "FAILED: bench_run for $run_label (validation or drain failure)"
+        return 1
+    fi
+
+    echo ""
+    echo ">>> $run_label complete <<<"
+    echo ""
+}
+
+case "$RUN_ORDER" in
+ serial-first)
+ first_workers=1
+ first_label="Serial (1 worker)"
+ second_workers=4
+ second_label="Parallel (4 workers)"
+ ;;
+ parallel-first)
+ first_workers=4
+ first_label="Parallel (4 workers)"
+ second_workers=1
+ second_label="Serial (1 worker)"
+ ;;
+ random)
+ if (( RANDOM % 2 == 0 )); then
+ first_workers=1
+ first_label="Serial (1 worker)"
+ second_workers=4
+ second_label="Parallel (4 workers)"
+ RUN_ORDER=serial-first
+ else
+ first_workers=4
+ first_label="Parallel (4 workers)"
+ second_workers=1
+ second_label="Serial (1 worker)"
+ RUN_ORDER=parallel-first
+ fi
+ ;;
+ *)
+ echo "Invalid RUN_ORDER: $RUN_ORDER"
+ exit 1
+ ;;
+esac
+
+echo "Run order: $RUN_ORDER"
+
+# Run 1
+run_bench "$first_workers" "$first_label" || exit 1
+
+# Teardown between runs
+echo "Tearing down between runs..."
+(cd "$SCRIPT_DIR/../local" && ./501_teardown.sh) 2>/dev/null
+sleep 3
+
+# Run 2
+run_bench "$second_workers" "$second_label" || exit 1
+
+# Teardown after
+echo "Tearing down after benchmark..."
+(cd "$SCRIPT_DIR/../local" && ./501_teardown.sh) 2>/dev/null
+
+# Compare results
+echo ""
+echo "============================================"
+echo " COMPARISON"
+echo "============================================"
+
+for workers in 1 4; do
+ logfile="/tmp/bench_${workers}_workers.log"
+ if [[ -f "$logfile" ]]; then
+ echo ""
+ echo "--- Workers=$workers ---"
+ grep -E "(Drain time|Throughput|Backlog ops|Seed rows)" "$logfile"
+ fi
+done
+
+# Report the serial/parallel drain-time ratio when both runs left a log.
+serial_log="/tmp/bench_1_workers.log"
+parallel_log="/tmp/bench_4_workers.log"
+if [[ -f "$serial_log" && -f "$parallel_log" ]]; then
+    # Pull the number of seconds out of each log's "Drain time" result line.
+    serial_time=$(grep "Drain time" "$serial_log" | grep -o '[0-9]*')
+    parallel_time=$(grep "Drain time" "$parallel_log" | grep -o '[0-9]*')
+    if [[ -n "$serial_time" && -n "$parallel_time" && "$parallel_time" -gt 0 ]]; then
+        # Bash has integer arithmetic only: scale the ratio by 100 and split
+        # it into whole and fractional parts for two decimal places.
+        ratio_x100=$((serial_time * 100 / parallel_time))
+        printf -v speedup_str '%d.%02d' "$((ratio_x100 / 100))" "$((ratio_x100 % 100))"
+        echo ""
+        echo "--- Speedup ---"
+        echo " Serial: ${serial_time}s"
+        echo " Parallel: ${parallel_time}s"
+        echo " Speedup: ${speedup_str}x"
+    fi
+fi
+
+echo ""
+echo "============================================"
+echo "Full logs: /tmp/bench_1_workers.log and /tmp/bench_4_workers.log"
+echo "============================================"
diff --git a/examples/benchmark/bench_generate_load.sh b/examples/benchmark/bench_generate_load.sh
new file mode 100755
index 00000000000..e136eaa2e24
--- /dev/null
+++ b/examples/benchmark/bench_generate_load.sh
@@ -0,0 +1,231 @@
+#!/bin/bash
+
+# Generate workload for VReplication benchmark. Supports two modes:
+# LOAD_TYPE=seed — INSERT-only (builds base data for UPDATE/DELETE targets)
+# LOAD_TYPE=mixed — Mixed INSERT/UPDATE/DELETE/bulk operations
+#
+# The random generator uses a FIXED SEED so output is deterministic and
+# benchmark runs are repeatable for proper A/B comparisons.
+#
+# Environment variables:
+# ROW_COUNT — total operations to generate (default 200000)
+# LOAD_TYPE — "seed" or "mixed" (default "mixed")
+# SEED_ROWS — rows per table available for UPDATE/DELETE (used in mixed mode)
+
+source ../common/env.sh
+
+TOTAL_OPS=${ROW_COUNT:-200000}
+OPS_PER_TABLE=$((TOTAL_OPS / 4))
+LOAD_TYPE=${LOAD_TYPE:-mixed}
+SEED_ROWS=${SEED_ROWS:-10000}
+
+echo "=== Generating Load: $TOTAL_OPS total ops ($OPS_PER_TABLE per table, type=$LOAD_TYPE) ==="
+
+TMPDIR="$VTDATAROOT/tmp/bench_load"
+mkdir -p "$TMPDIR"
+
+python3 -c "
+import random
+import string
+import os
+
+ops_per_table = $OPS_PER_TABLE
+load_type = '$LOAD_TYPE'
+seed_rows = $SEED_ROWS
+tmpdir = '$TMPDIR'
+
+# Fixed seed for deterministic, repeatable output.
+random.seed(42)
+
+# Pre-compute a pool of random strings to avoid per-row generation cost.
+_pool_size = 10000
+_str_pool = {}
+def _init_pool(n):
+ if n not in _str_pool:
+ chars = string.ascii_letters + string.digits
+ _str_pool[n] = [''.join(random.choices(chars, k=n)) for _ in range(_pool_size)]
+def rand_str(n):
+ pool = _str_pool.get(n)
+ if pool is None:
+ _init_pool(n)
+ pool = _str_pool[n]
+ return pool[random.randint(0, _pool_size - 1)]
+
+def gen_insert_orders(f):
+ name = rand_str(60)
+ sku = rand_str(40)
+ qty = random.randint(1, 100)
+ price = random.randint(100, 100000)
+ status = random.choice(['pending', 'shipped', 'delivered', 'cancelled', 'returned', 'processing'])
+ region = random.choice(['us-east-1', 'us-west-2', 'eu-west-1', 'eu-central-1', 'ap-south-1', 'ap-east-1'])
+ notes = rand_str(400)
+ f.write(f\"INSERT INTO bench_orders (customer_name, product_sku, quantity, total_price, status, region, notes) VALUES ('{name}', '{sku}', {qty}, {price}, '{status}', '{region}', '{notes}');\n\")
+
+def gen_insert_events(f):
+ etype = random.choice(['click', 'purchase', 'view', 'signup', 'logout', 'error', 'timeout', 'retry'])
+ source = rand_str(60)
+ payload = rand_str(600)
+ severity = random.randint(1, 10)
+ created = random.randint(1700000000, 1800000000)
+ category = rand_str(40)
+ f.write(f\"INSERT INTO bench_events (event_type, source, payload, severity, created_at, category) VALUES ('{etype}', '{source}', '{payload}', {severity}, {created}, '{category}');\n\")
+
+def gen_insert_accounts(f):
+ username = rand_str(40)
+ email = rand_str(30) + '@' + rand_str(20) + '.com'
+ balance = random.randint(0, 1000000)
+ region = random.choice(['us-east-1', 'us-west-2', 'eu-west-1', 'eu-central-1', 'ap-south-1', 'ap-east-1'])
+ bio = rand_str(400)
+ tier = random.choice(['free', 'basic', 'pro', 'enterprise', 'unlimited'])
+ f.write(f\"INSERT INTO bench_accounts (username, email, balance, region, bio, tier) VALUES ('{username}', '{email}', {balance}, '{region}', '{bio}', '{tier}');\n\")
+
+def gen_insert_logs(f):
+ level = random.choice(['DEBUG', 'INFO', 'WARN', 'ERROR', 'FATAL'])
+ message = rand_str(400)
+ component = random.choice(['api', 'worker', 'scheduler', 'gateway', 'cache', 'auth', 'billing', 'storage'])
+ error_code = random.randint(0, 9999)
+ trace_id = rand_str(32)
+ span_id = rand_str(16)
+ f.write(f\"INSERT INTO bench_logs (level, message, component, error_code, trace_id, span_id) VALUES ('{level}', '{message}', '{component}', {error_code}, '{trace_id}', '{span_id}');\n\")
+
+insert_fns = {
+ 'orders': gen_insert_orders,
+ 'events': gen_insert_events,
+ 'accounts': gen_insert_accounts,
+ 'logs': gen_insert_logs,
+}
+
+# UPDATE generators — modify multiple indexed columns to create significant MySQL work
+def gen_update_orders(f, pk):
+ name = rand_str(60)
+ status = random.choice(['pending', 'shipped', 'delivered', 'cancelled', 'returned', 'processing'])
+ region = random.choice(['us-east-1', 'us-west-2', 'eu-west-1', 'eu-central-1', 'ap-south-1', 'ap-east-1'])
+ notes = rand_str(400)
+ f.write(f\"UPDATE bench_orders SET customer_name='{name}', status='{status}', region='{region}', notes='{notes}' WHERE id={pk};\n\")
+
+def gen_update_events(f, pk):
+ etype = random.choice(['click', 'purchase', 'view', 'signup', 'logout', 'error', 'timeout', 'retry'])
+ source = rand_str(60)
+ payload = rand_str(600)
+ category = rand_str(40)
+ f.write(f\"UPDATE bench_events SET event_type='{etype}', source='{source}', payload='{payload}', category='{category}' WHERE id={pk};\n\")
+
+def gen_update_accounts(f, pk):
+ username = rand_str(40)
+ email = rand_str(30) + '@' + rand_str(20) + '.com'
+ balance = random.randint(0, 1000000)
+ bio = rand_str(400)
+ tier = random.choice(['free', 'basic', 'pro', 'enterprise', 'unlimited'])
+ f.write(f\"UPDATE bench_accounts SET username='{username}', email='{email}', balance={balance}, bio='{bio}', tier='{tier}' WHERE id={pk};\n\")
+
+def gen_update_logs(f, pk):
+ level = random.choice(['DEBUG', 'INFO', 'WARN', 'ERROR', 'FATAL'])
+ message = rand_str(400)
+ component = random.choice(['api', 'worker', 'scheduler', 'gateway', 'cache', 'auth', 'billing', 'storage'])
+ error_code = random.randint(0, 9999)
+ f.write(f\"UPDATE bench_logs SET level='{level}', message='{message}', component='{component}', error_code={error_code} WHERE id={pk};\n\")
+
+update_fns = {
+ 'orders': gen_update_orders,
+ 'events': gen_update_events,
+ 'accounts': gen_update_accounts,
+ 'logs': gen_update_logs,
+}
+
+# Bulk UPDATE generators — update N rows in one statement
+def gen_bulk_update(table, f, pks):
+ pk_list = ','.join(str(p) for p in pks)
+ if table == 'orders':
+ status = random.choice(['pending', 'shipped', 'delivered', 'cancelled', 'returned', 'processing'])
+ region = random.choice(['us-east-1', 'us-west-2', 'eu-west-1', 'eu-central-1', 'ap-south-1', 'ap-east-1'])
+ notes = rand_str(400)
+ f.write(f\"UPDATE bench_orders SET status='{status}', region='{region}', notes='{notes}' WHERE id IN ({pk_list});\n\")
+ elif table == 'events':
+ etype = random.choice(['click', 'purchase', 'view', 'signup', 'logout', 'error', 'timeout', 'retry'])
+ payload = rand_str(600)
+ f.write(f\"UPDATE bench_events SET event_type='{etype}', payload='{payload}' WHERE id IN ({pk_list});\n\")
+ elif table == 'accounts':
+ balance = random.randint(0, 1000000)
+ tier = random.choice(['free', 'basic', 'pro', 'enterprise', 'unlimited'])
+ f.write(f\"UPDATE bench_accounts SET balance={balance}, tier='{tier}' WHERE id IN ({pk_list});\n\")
+ elif table == 'logs':
+ level = random.choice(['DEBUG', 'INFO', 'WARN', 'ERROR', 'FATAL'])
+ message = rand_str(400)
+ f.write(f\"UPDATE bench_logs SET level='{level}', message='{message}' WHERE id IN ({pk_list});\n\")
+
+def gen_bulk_delete(table, f, pks):
+ pk_list = ','.join(str(p) for p in pks)
+ f.write(f\"DELETE FROM bench_{table} WHERE id IN ({pk_list});\n\")
+
+tables = ['orders', 'events', 'accounts', 'logs']
+
+if load_type == 'seed':
+ # Seed mode: INSERT-only, one file per table
+ for table in tables:
+ fn = insert_fns[table]
+ with open(os.path.join(tmpdir, f'{table}.sql'), 'w') as f:
+ for _ in range(ops_per_table):
+ fn(f)
+ print('Seed SQL files generated.')
+else:
+ # Mixed mode: diverse write operations
+ # Operation mix (as fractions of total per table):
+ # 50% single-row INSERT — light txns, good for serial batching
+ # 20% single-row UPDATE — medium txns, index maintenance
+ # 5% single-row DELETE — light txns
+ # 15% bulk UPDATE (5-15 rows) — heavy txns, lots of row events
+ # 10% bulk DELETE (3-8 rows) — medium-heavy txns
+ for table in tables:
+ insert_fn = insert_fns[table]
+ update_fn = update_fns[table]
+ with open(os.path.join(tmpdir, f'{table}.sql'), 'w') as f:
+ for i in range(ops_per_table):
+ r = random.random()
+ if r < 0.50:
+ # Single-row INSERT
+ insert_fn(f)
+ elif r < 0.70:
+ # Single-row UPDATE on existing seed row
+ pk = random.randint(1, seed_rows)
+ update_fn(f, pk)
+ elif r < 0.75:
+ # Single-row DELETE
+ pk = random.randint(1, seed_rows)
+ f.write(f\"DELETE FROM bench_{table} WHERE id={pk};\n\")
+ elif r < 0.90:
+ # Bulk UPDATE (5-15 rows)
+ n = random.randint(5, 15)
+ pks = [random.randint(1, seed_rows) for _ in range(n)]
+ gen_bulk_update(table, f, pks)
+ else:
+ # Bulk DELETE (3-8 rows)
+ n = random.randint(3, 8)
+ pks = [random.randint(1, seed_rows) for _ in range(n)]
+ gen_bulk_delete(table, f, pks)
+ print('Mixed SQL files generated.')
+" || fail "Failed to generate SQL files"
+
+echo "Loading data into commerce keyspace via vtgate (4 concurrent streams)..."
+
+load_start=$(date +%s)
+
+# Pipe all 4 SQL files concurrently through vtgate
+load_pids=()
+for table in orders events accounts logs; do
+    command mysql --no-defaults -h 127.0.0.1 -P 15306 --binary-as-hex=false commerce < "$TMPDIR/${table}.sql" &
+    load_pids+=("$!")
+done
+
+# Reap every loader before reporting a failure. Exiting on the first failed
+# stream would leave the remaining background mysql clients still writing to
+# the source, where they could overlap with a caller's retry of this script.
+load_failed=0
+for pid in "${load_pids[@]}"; do
+    wait "$pid" || load_failed=1
+done
+if [[ "$load_failed" -ne 0 ]]; then
+    fail "Failed to load one or more benchmark SQL streams"
+fi
+
+load_end=$(date +%s)
+load_elapsed=$((load_end - load_start))
+
+echo "=== Load Generation Complete ==="
+echo "Total operations: $TOTAL_OPS"
+echo "Time: ${load_elapsed}s"
+if [ "$load_elapsed" -gt 0 ]; then
+ echo "Rate: $((TOTAL_OPS / load_elapsed)) ops/sec"
+fi
diff --git a/examples/benchmark/bench_run.sh b/examples/benchmark/bench_run.sh
new file mode 100755
index 00000000000..3d9281ff9b6
--- /dev/null
+++ b/examples/benchmark/bench_run.sh
@@ -0,0 +1,412 @@
+#!/bin/bash
+
+# Run the VReplication parallel applier benchmark with mixed write workload.
+# Prerequisites: bench_setup.sh must have been run first.
+#
+# Flow:
+# 1. Seed source tables with initial data (for UPDATE/DELETE targets)
+# 2. Create MoveTables workflow, copy seed data, stop
+# 3. Generate mixed backlog (INSERT/UPDATE/DELETE/bulk) while stopped
+# 4. Start workflow, time drain until lag reaches 0
+#
+# Environment variables:
+# ROW_COUNT — total backlog operations (default 200000)
+# SEED_ROWS — seed rows per table for UPDATE/DELETE targets (default 10000)
+
+source ../common/env.sh
+
+TOTAL_OPS=${ROW_COUNT:-200000}
+SEED_ROWS=${SEED_ROWS:-10000}
+TOTAL_SEED=$((SEED_ROWS * 4))
+BENCH_TABLES="bench_orders,bench_events,bench_accounts,bench_logs"
+
+source_mysql() {
+ # Run the mysql client over TCP against 127.0.0.1:15306 (presumably vtgate's
+ # MySQL port - confirm against the cluster setup). All arguments are passed
+ # through. `command` bypasses any shell function or alias named mysql, and
+ # --no-defaults ignores local option files.
+ command mysql --no-defaults -h 127.0.0.1 -P 15306 --binary-as-hex=false "$@"
+}
+
+# Find the primary tablet for a keyspace and return its MySQL socket path
+detect_tablet_socket() {
+ # $1 - keyspace name. Prints "$VTDATAROOT/vt_<uid>/mysql.sock" on stdout,
+ # where <uid> is the primary tablet's numeric uid zero-padded to 10 digits.
+ # NOTE(review): `fail` is defined outside this file (presumably in
+ # ../common/env.sh). When this function is called inside $(...), an exit
+ # performed by `fail` ends only that subshell.
+ local ks=$1
+ local primary_tablet
+ # First column of the GetTablets line(s) for shard 0 containing the word "primary".
+ primary_tablet=$(vtctldclient GetTablets --keyspace "$ks" --shard 0 2>/dev/null | grep -w primary | awk '{print $1}')
+ if [[ -z "$primary_tablet" ]]; then
+ fail "Could not find primary tablet for $ks keyspace"
+ fi
+ local uid
+ # Drop everything up to the last '-' plus any zeros that follow it,
+ # e.g. "zone1-0000000100" becomes "100".
+ uid=$(echo "$primary_tablet" | sed 's/.*-0*//')
+ local sock="$VTDATAROOT/vt_$(printf '%010d' "$uid")/mysql.sock"
+ echo "$sock"
+}
+
+detect_primaries() {
+    # Resolve the MySQL socket paths of the source (commerce) and target
+    # (customer) primaries into the globals SOURCE_SOCKET and TARGET_SOCKET.
+    #
+    # detect_tablet_socket runs inside a command substitution, so an exit
+    # inside it only terminates that subshell. Check the substitution's status
+    # here so the script stops instead of continuing with an empty or bogus
+    # socket path.
+    SOURCE_SOCKET=$(detect_tablet_socket commerce) || fail "Could not detect source primary socket (commerce)"
+    TARGET_SOCKET=$(detect_tablet_socket customer) || fail "Could not detect target primary socket (customer)"
+    echo "Source socket: $SOURCE_SOCKET"
+    echo "Target socket: $TARGET_SOCKET"
+}
+
+source_direct_mysql() {
+ # Run the mysql client as user vt_dba over the unix socket in SOURCE_SOCKET
+ # (set by detect_primaries). All arguments are passed through.
+ command mysql --no-defaults -u vt_dba -S "$SOURCE_SOCKET" "$@"
+}
+
+target_mysql() {
+ # Run the mysql client as user vt_dba over the unix socket in TARGET_SOCKET
+ # (set by detect_primaries). All arguments are passed through.
+ command mysql --no-defaults -u vt_dba -S "$TARGET_SOCKET" "$@"
+}
+
+# Extract the max GTID transaction sequence number from a GTID set string.
+# Handles formats like "uuid:1-N" and "MySQL56/uuid:1-N", multi-interval sets
+# such as "uuid:1-5:7-9", and single-transaction intervals such as "uuid:7".
+# Prints nothing when the input contains no interval.
+max_gtid_seq() {
+    # Every interval is introduced by ':' followed by digits, and the UUID
+    # part contains no ':', so only intervals match. The range suffix is
+    # optional so that a lone transaction number is not skipped. Keep the last
+    # number of each interval and print the largest.
+    echo "$1" | tr ',' '\n' | grep -oE ':[0-9]+(-[0-9]+)?' | grep -oE '[0-9]+$' | sort -n | tail -1
+}
+
+# Get the target's current replication position from _vt.vreplication
+target_pos() {
+ # Print the `pos` column of the bench_move row(s) in _vt.vreplication, read
+ # through target_mysql. -N suppresses the column header; client errors are
+ # discarded, so a failed query prints nothing.
+ target_mysql -N -e \
+ "SELECT pos FROM _vt.vreplication WHERE workflow='bench_move'" 2>/dev/null
+}
+
+# Get replication lag in seconds (for display only, not reliable for drain detection)
+replication_lag() {
+ # Print UNIX_TIMESTAMP() minus FLOOR(time_updated) for the bench_move row(s)
+ # in _vt.vreplication, read through target_mysql. Client errors are
+ # discarded, so a failed query prints nothing.
+ target_mysql -N -e \
+ "SELECT UNIX_TIMESTAMP() - FLOOR(time_updated) FROM _vt.vreplication WHERE workflow='bench_move'" 2>/dev/null
+}
+
+echo "=== Bench Run (ROW_COUNT=$TOTAL_OPS, SEED_ROWS=$SEED_ROWS) ==="
+
+detect_primaries
+
+cleanup_workflow() {
+ # Cancel the bench_move MoveTables workflow on the customer keyspace.
+ # stderr is discarded; the function's status is that of vtctldclient.
+ vtctldclient MoveTables --workflow bench_move --target-keyspace customer cancel 2>/dev/null
+}
+
+timeout_failed=0
+
+add_target_indexes() {
+ # Add 20 secondary indexes to each of the four bench tables in vt_customer,
+ # using one ALTER TABLE statement per table sent through target_mysql.
+ # Each statement includes two prefix indexes on the table's long text column
+ # (lengths 255 and 128). Returns 1 as soon as one ALTER fails; later tables
+ # are then left untouched.
+ target_mysql vt_customer -e "
+ ALTER TABLE bench_orders
+ ADD INDEX idx_name_status (customer_name, status),
+ ADD INDEX idx_name_region_qty (customer_name, region, quantity),
+ ADD INDEX idx_sku_status_region (product_sku, status, region),
+ ADD INDEX idx_region_status_price (region, status, total_price),
+ ADD INDEX idx_notes_prefix (notes(255)),
+ ADD INDEX idx_qty_price (quantity, total_price),
+ ADD INDEX idx_status_qty_price (status, quantity, total_price),
+ ADD INDEX idx_sku_qty (product_sku, quantity),
+ ADD INDEX idx_name_price (customer_name, total_price),
+ ADD INDEX idx_region_qty_price (region, quantity, total_price),
+ ADD INDEX idx_status_name (status, customer_name),
+ ADD INDEX idx_sku_region (product_sku, region),
+ ADD INDEX idx_status_sku_price (status, product_sku, total_price),
+ ADD INDEX idx_name_qty_status (customer_name, quantity, status),
+ ADD INDEX idx_region_name (region, customer_name),
+ ADD INDEX idx_sku_name_region (product_sku, customer_name, region),
+ ADD INDEX idx_qty_status_region (quantity, status, region),
+ ADD INDEX idx_price_status (total_price, status),
+ ADD INDEX idx_price_region_name (total_price, region, customer_name),
+ ADD INDEX idx_notes_prefix2 (notes(128));
+" || return 1
+
+ target_mysql vt_customer -e "
+ ALTER TABLE bench_events
+ ADD INDEX idx_source_type (source, event_type),
+ ADD INDEX idx_type_category (event_type, category),
+ ADD INDEX idx_category_severity (category, severity),
+ ADD INDEX idx_created_severity (created_at, severity),
+ ADD INDEX idx_source_category (source, category),
+ ADD INDEX idx_payload_prefix (payload(255)),
+ ADD INDEX idx_type_created_severity (event_type, created_at, severity),
+ ADD INDEX idx_source_severity (source, severity),
+ ADD INDEX idx_category_created (category, created_at),
+ ADD INDEX idx_type_source_severity (event_type, source, severity),
+ ADD INDEX idx_severity_category (severity, category),
+ ADD INDEX idx_created_type (created_at, event_type),
+ ADD INDEX idx_source_created_type (source, created_at, event_type),
+ ADD INDEX idx_category_type_created (category, event_type, created_at),
+ ADD INDEX idx_severity_source (severity, source),
+ ADD INDEX idx_type_severity_created (event_type, severity, created_at),
+ ADD INDEX idx_created_category_severity (created_at, category, severity),
+ ADD INDEX idx_source_type_category (source, event_type, category),
+ ADD INDEX idx_severity_type_source (severity, event_type, source),
+ ADD INDEX idx_payload_prefix2 (payload(128));
+" || return 1
+
+ target_mysql vt_customer -e "
+ ALTER TABLE bench_accounts
+ ADD INDEX idx_username_tier (username, tier),
+ ADD INDEX idx_email_region (email, region),
+ ADD INDEX idx_tier_balance (tier, balance),
+ ADD INDEX idx_region_tier (region, tier),
+ ADD INDEX idx_bio_prefix (bio(255)),
+ ADD INDEX idx_tier_region_balance (tier, region, balance),
+ ADD INDEX idx_username_balance (username, balance),
+ ADD INDEX idx_email_tier (email, tier),
+ ADD INDEX idx_username_region (username, region),
+ ADD INDEX idx_balance_tier (balance, tier),
+ ADD INDEX idx_region_balance_tier (region, balance, tier),
+ ADD INDEX idx_tier_username (tier, username),
+ ADD INDEX idx_email_balance (email, balance),
+ ADD INDEX idx_region_username (region, username),
+ ADD INDEX idx_username_tier_balance (username, tier, balance),
+ ADD INDEX idx_tier_email (tier, email),
+ ADD INDEX idx_balance_region (balance, region),
+ ADD INDEX idx_email_tier_region (email, tier, region),
+ ADD INDEX idx_region_email_balance (region, email, balance),
+ ADD INDEX idx_bio_prefix2 (bio(128));
+" || return 1
+
+ target_mysql vt_customer -e "
+ ALTER TABLE bench_logs
+ ADD INDEX idx_component_level (component, level),
+ ADD INDEX idx_trace_span (trace_id, span_id),
+ ADD INDEX idx_level_error (level, error_code),
+ ADD INDEX idx_component_error (component, error_code),
+ ADD INDEX idx_message_prefix (message(255)),
+ ADD INDEX idx_span_level (span_id, level),
+ ADD INDEX idx_error_component_level (error_code, component, level),
+ ADD INDEX idx_level_component_error (level, component, error_code),
+ ADD INDEX idx_trace_level (trace_id, level),
+ ADD INDEX idx_component_trace (component, trace_id),
+ ADD INDEX idx_error_level (error_code, level),
+ ADD INDEX idx_span_component (span_id, component),
+ ADD INDEX idx_level_trace (level, trace_id),
+ ADD INDEX idx_trace_component_level (trace_id, component, level),
+ ADD INDEX idx_error_span (error_code, span_id),
+ ADD INDEX idx_component_span_level (component, span_id, level),
+ ADD INDEX idx_level_span_error (level, span_id, error_code),
+ ADD INDEX idx_span_error_component (span_id, error_code, component),
+ ADD INDEX idx_trace_error (trace_id, error_code),
+ ADD INDEX idx_message_prefix2 (message(128));
+" || return 1
+}
+
+# Step 1: Seed source tables with initial data
+# Retry the seed step: vtgate's connection pool to the primary tablet can be
+# briefly unavailable right after cluster startup, surfacing as
+# "connection pool is closed" when the seed script runs too soon.
+echo ""
+echo "Seeding source tables ($SEED_ROWS rows per table = $TOTAL_SEED total)..."
+seed_attempts=0
+seed_max_attempts=3
+until LOAD_TYPE=seed ROW_COUNT=$TOTAL_SEED ./bench_generate_load.sh; do
+ seed_attempts=$((seed_attempts+1))
+ if [[ $seed_attempts -ge $seed_max_attempts ]]; then
+ fail "Failed to seed data after $seed_max_attempts attempts"
+ fi
+ echo "Seed failed (attempt $seed_attempts); retrying in 10s..."
+ sleep 10
+done
+
+# Step 2: Create MoveTables workflow (auto-start, copies seed data)
+echo ""
+echo "Creating MoveTables workflow..."
+vtctldclient MoveTables --workflow bench_move --target-keyspace customer create \
+ --source-keyspace commerce \
+ --tables "$BENCH_TABLES" || fail "Failed to create MoveTables workflow"
+
+# Step 3: Wait for copy phase to complete (state transitions to Running)
+echo "Waiting for copy phase to complete..."
+max_wait=600
+for i in $(seq 1 $max_wait); do
+ state=$(target_mysql -N -e \
+ "SELECT state FROM _vt.vreplication WHERE workflow='bench_move'" 2>/dev/null | head -1)
+ if [[ "$state" == "Running" ]]; then
+ echo "Copy phase complete, workflow is running."
+ break
+ fi
+ if [[ $((i % 10)) -eq 0 ]]; then
+ echo " ...still copying (state=$state, ${i}s elapsed)"
+ fi
+ sleep 1
+done
+
+if [[ "$state" != "Running" ]]; then
+ fail "Timed out waiting for copy phase to complete (state=$state)"
+fi
+
+# Step 4: Stop the workflow so we can build a backlog
+echo "Stopping workflow..."
+vtctldclient MoveTables --workflow bench_move --target-keyspace customer stop || fail "Failed to stop workflow"
+
+for i in $(seq 1 30); do
+ state=$(target_mysql -N -e \
+ "SELECT state FROM _vt.vreplication WHERE workflow='bench_move'" 2>/dev/null | head -1)
+ if [[ "$state" == "Stopped" ]]; then
+ break
+ fi
+ sleep 1
+done
+
+if [[ "$state" != "Stopped" ]]; then
+ fail "Workflow did not stop (state=$state)"
+fi
+echo "Workflow stopped."
+
+# Step 4b: Add extra indexes on the TARGET to increase per-statement MySQL cost.
+# The source keeps lightweight indexes so the vstreamer produces events fast.
+# Heavy target indexes make the applier the bottleneck, allowing parallel workers
+# to demonstrate their advantage by overlapping expensive index maintenance.
+# With ~25 indexes per table and an 8MB buffer pool, each INSERT/UPDATE/DELETE
+# requires many random page reads that can be overlapped by parallel workers.
+echo ""
+echo "Adding extra indexes on target to increase applier workload..."
+add_target_indexes || {
+ echo "ERROR: failed to add target indexes"
+ cleanup_workflow
+ exit 1
+}
+
+echo "Target indexes added (~25 per table)."
+
+# Step 5: Generate mixed backlog on source
+echo ""
+echo "Generating mixed backlog on source ($TOTAL_OPS operations)..."
+LOAD_TYPE=mixed ROW_COUNT=$TOTAL_OPS SEED_ROWS=$SEED_ROWS ./bench_generate_load.sh || fail "Failed to generate backlog"
+
+# Step 5b: Capture source GTID position after backlog generation.
+# This is the definitive marker — when the target's pos reaches this point,
+# all backlog events have been applied.
+source_gtid=$(source_direct_mysql -N -e "SELECT @@gtid_executed" 2>/dev/null | tr -d '[:space:]')
+source_seq=$(max_gtid_seq "$source_gtid")
+echo "Source GTID seq after backlog: $source_seq"
+
+if [[ -z "$source_seq" ]] || [[ "$source_seq" -eq 0 ]]; then
+ fail "Could not capture source GTID position"
+fi
+
+# Step 6: Record start time and start the workflow
+echo ""
+echo "Starting workflow to drain backlog..."
+start_time=$(date +%s)
+
+vtctldclient MoveTables --workflow bench_move --target-keyspace customer start || fail "Failed to start workflow"
+
+# Step 7: Poll until target GTID position catches up to source.
+# We use GTID comparison instead of time_updated lag because:
+# - time_updated is refreshed by the controller loop regardless of applier progress
+# - With parallel workers, the controller doesn't block on the applier, so
+# time_updated stays near-current even while the backlog is being processed
+# - GTID position accurately reflects committed progress
+echo "Waiting for target to catch up (source_seq=$source_seq)..."
+last_report=0
+while true; do
+ now=$(date +%s)
+ elapsed=$((now - start_time))
+
+ # Get target's current replicated position
+ tpos=$(target_pos)
+ target_seq=$(max_gtid_seq "$tpos")
+
+ # Check if target has caught up to source
+ if [[ -n "$target_seq" ]] && [[ "$target_seq" =~ ^[0-9]+$ ]] && [[ "$target_seq" -ge "$source_seq" ]]; then
+ echo "Target caught up! (target_seq=$target_seq >= source_seq=$source_seq, ${elapsed}s elapsed)"
+ break
+ fi
+
+ if [[ $((elapsed - last_report)) -ge 5 ]]; then
+ lag=$(replication_lag)
+ pct=""
+ if [[ -n "$target_seq" ]] && [[ "$target_seq" =~ ^[0-9]+$ ]] && [[ "$source_seq" -gt 0 ]]; then
+ pct=" $(( target_seq * 100 / source_seq ))%"
+ fi
+ echo " ...draining (pos=${target_seq:-?}/${source_seq}${pct} lag=${lag:-?}s ${elapsed}s)"
+ last_report=$elapsed
+ fi
+
+ if [[ "$elapsed" -ge 7200 ]]; then
+ echo "ERROR: Timed out after ${elapsed}s (target_seq=${target_seq:-?})"
+ timeout_failed=1
+ break
+ fi
+
+ sleep 1
+done
+
+if [[ "$timeout_failed" -ne 0 ]]; then
+ echo ""
+ echo "ERROR: drain timed out before reaching source GTID position"
+ cleanup_workflow
+ exit 1
+fi
+
+end_time=$(date +%s)
+
+# Step 8: Calculate and report results
+elapsed_s=$((end_time - start_time))
+
+echo ""
+echo "============================================"
+echo " BENCHMARK RESULTS"
+echo "============================================"
+echo " Backlog ops: $TOTAL_OPS"
+echo " Seed rows: $TOTAL_SEED"
+echo " Drain time: ${elapsed_s}s"
+if [ "$elapsed_s" -gt 0 ]; then
+ echo " Throughput: $((TOTAL_OPS / elapsed_s)) ops/sec"
+fi
+echo "============================================"
+
+# Step 9: Validate source and target are semantically equivalent (only after
+# drain, when target is idle). COUNT(*) alone is too weak: reordering errors,
+# wrong-row updates, or corrupted values can preserve cardinality while changing
+# row content. We compute a content checksum per table that is order-independent
+# (BIT_XOR of CRC32 over all column values) so parallel apply reordering is
+# not flagged as divergence as long as the final state is equivalent.
+echo ""
+echo "Validating row counts and content checksums..."
+validation_failed=0
+
+# Returns the column list for the given table. These must match
+# create_bench_schema.sql exactly — keep in sync. Using a function here
+# instead of an associative array for bash 3.2 compatibility (macOS).
+table_columns() {
+ # $1 - table name. Prints that table's comma-separated column list on
+ # stdout. There is no default branch, so an unrecognized name prints nothing.
+ case "$1" in
+ bench_orders) echo "id,customer_name,product_sku,quantity,total_price,status,region,notes" ;;
+ bench_events) echo "id,event_type,source,payload,severity,created_at,category" ;;
+ bench_accounts) echo "id,username,email,balance,region,bio,tier" ;;
+ bench_logs) echo "id,level,message,component,error_code,trace_id,span_id" ;;
+ esac
+}
+
+# Compare row count and an order-independent content checksum per table.
+# Sets validation_failed=1 on any mismatch or unusable query result.
+numeric_re='^[0-9]+$'
+for table in bench_orders bench_events bench_accounts bench_logs; do
+    cols="$(table_columns "$table")"
+    # Build CONCAT_WS over all columns with IFNULL so NULLs don't collapse the row.
+    concat_expr="CONCAT_WS('|'"
+    old_ifs="$IFS"
+    IFS=','
+    for col in $cols; do
+        concat_expr="$concat_expr,IFNULL($col,'\\0')"
+    done
+    IFS="$old_ifs"
+    concat_expr="$concat_expr)"
+    checksum_sql="SELECT COUNT(*), COALESCE(BIT_XOR(CAST(CRC32($concat_expr) AS UNSIGNED)), 0) FROM"
+
+    source_row=$(source_mysql -N -e "$checksum_sql commerce.$table" 2>/dev/null)
+    target_row=$(target_mysql -N -e "$checksum_sql vt_customer.$table" 2>/dev/null)
+    source_count=$(echo "$source_row" | awk '{print $1}')
+    source_cksum=$(echo "$source_row" | awk '{print $2}')
+    target_count=$(echo "$target_row" | awk '{print $1}')
+    target_cksum=$(echo "$target_row" | awk '{print $2}')
+
+    match="OK"
+    # Client errors are discarded above, so a failed query shows up as empty
+    # fields. If both sides failed, the empty strings would compare equal and
+    # the table would be reported OK, so require numeric results first.
+    if ! [[ "$source_count" =~ $numeric_re && "$source_cksum" =~ $numeric_re &&
+            "$target_count" =~ $numeric_re && "$target_cksum" =~ $numeric_re ]]; then
+        match="QUERY FAILED"
+        validation_failed=1
+    elif [[ "$source_count" != "$target_count" ]] || [[ "$source_cksum" != "$target_cksum" ]]; then
+        match="MISMATCH"
+        validation_failed=1
+    fi
+    echo " $table: source=(count=$source_count, cksum=$source_cksum) target=(count=$target_count, cksum=$target_cksum) [$match]"
+done
+
+if [[ "$validation_failed" -ne 0 ]]; then
+ echo ""
+ echo "ERROR: validation FAILED — source and target diverged. See mismatches above."
+ echo "=== Bench Run Failed ==="
+ # Still attempt workflow cleanup before exiting.
+ cleanup_workflow
+ exit 1
+fi
+
+# Step 10: Cleanup workflow
+echo ""
+echo "Cleaning up workflow..."
+cleanup_workflow
+
+echo "=== Bench Run Complete ==="
diff --git a/examples/benchmark/bench_scripts_test.sh b/examples/benchmark/bench_scripts_test.sh
new file mode 100644
index 00000000000..6beadff6689
--- /dev/null
+++ b/examples/benchmark/bench_scripts_test.sh
@@ -0,0 +1,244 @@
+#!/bin/bash
+
+set -euo pipefail
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+
+fail_test() { # Print "FAIL: <all arguments>" on stderr and abort the test script.
+ printf 'FAIL: %s\n' "$*" >&2
+ exit 1 # exit, not return: the first failing assertion stops the whole script
+}
+
+assert_equals() {
+    # Args: got, want, message. Aborts via fail_test (showing both values) unless got equals want.
+    local actual=$1 expected=$2 label=$3
+    if [[ "$actual" == "$expected" ]]; then
+        return 0
+    fi
+    fail_test "$label (got=$actual want=$expected)"
+}
+
+assert_contains() {
+    # Args: haystack, needle, message. Aborts via fail_test unless needle occurs inside haystack.
+    local text=$1 fragment=$2 label=$3
+    case "$text" in
+        *"$fragment"*) return 0 ;;
+    esac
+    fail_test "$label"
+}
+
+assert_not_contains() {
+    # Args: haystack, needle, message. Aborts via fail_test when needle occurs inside haystack.
+    local text=$1 fragment=$2 label=$3
+    case "$text" in
+        *"$fragment"*) fail_test "$label" ;;
+    esac
+    return 0
+}
+
+new_sandbox() { # Build a temp tree holding the real bench scripts plus stub tools; prints its path.
+ local sandbox
+ sandbox=$(mktemp -d)
+ mkdir -p "$sandbox/examples/benchmark" "$sandbox/examples/common" "$sandbox/examples/local" "$sandbox/bin" "$sandbox/vtdataroot"
+ cp "$REPO_ROOT/examples/benchmark/bench_run.sh" "$sandbox/examples/benchmark/bench_run.sh" # real script under test
+ cp "$REPO_ROOT/examples/benchmark/bench_compare.sh" "$sandbox/examples/benchmark/bench_compare.sh" # real script under test
+ cat > "$sandbox/examples/common/env.sh" <<'EOF'
+#!/bin/bash
+
+fail() {
+ echo "$*" >&2
+ exit 1
+}
+
+export VTDATAROOT="${VTDATAROOT:-$PWD/vtdataroot}"
+mkdir -p "$VTDATAROOT"
+EOF
+ cat > "$sandbox/examples/benchmark/bench_generate_load.sh" <<'EOF'
+#!/bin/bash
+set -euo pipefail
+printf '%s\n' "${LOAD_TYPE:-unset}:${ROW_COUNT:-unset}" >> "$BENCH_TEST_TMP/load_calls"
+EOF
+ cat > "$sandbox/examples/local/501_teardown.sh" <<'EOF'
+#!/bin/bash
+set -euo pipefail
+printf 'teardown\n' >> "$BENCH_TEST_TMP/teardown_calls"
+EOF
+ cat > "$sandbox/bin/vtctldclient" <<'EOF'
+#!/bin/bash
+set -euo pipefail
+printf '%s\n' "$*" >> "$BENCH_TEST_TMP/vtctld_calls"
+if [[ "${1:-}" == "GetTablets" ]]; then
+ if [[ "$*" == *"--keyspace commerce"* ]]; then
+ printf 'zone1-0000000100 primary\n'
+ else
+ printf 'zone1-0000000200 primary\n'
+ fi
+fi
+EOF
+ cat > "$sandbox/bin/mysql" <<'EOF'
+#!/bin/bash
+set -euo pipefail
+
+query=""
+while (($#)); do
+ case "$1" in
+ -e)
+ query=$2
+ shift 2
+ ;;
+ --no-defaults|--binary-as-hex=false|-N)
+ shift
+ ;;
+ -h|-P|-u|-S)
+ shift 2
+ ;;
+ *)
+ shift
+ ;;
+ esac
+done
+
+state_calls_file="$BENCH_TEST_TMP/mysql_state_calls"
+case "$query" in
+ *"SELECT state FROM _vt.vreplication WHERE workflow='bench_move'"*)
+ state_calls=0
+ if [[ -f "$state_calls_file" ]]; then
+ state_calls=$(cat "$state_calls_file")
+ fi
+ state_calls=$((state_calls + 1))
+ printf '%s' "$state_calls" > "$state_calls_file"
+ if [[ "$state_calls" -eq 1 ]]; then
+ printf 'Running\n'
+ else
+ printf 'Stopped\n'
+ fi
+ ;;
+ *"ALTER TABLE bench_orders"*)
+ if [[ "${BENCH_FAIL_FIRST_TARGET_ALTER:-0}" == "1" ]]; then
+ echo 'simulated index build failure' >&2
+ exit 1
+ fi
+ ;;
+ *"ALTER TABLE bench_events"*|*"ALTER TABLE bench_accounts"*|*"ALTER TABLE bench_logs"*)
+ ;;
+ *"SELECT @@gtid_executed"*)
+ printf 'uuid:1-100\n'
+ ;;
+ *"SELECT pos FROM _vt.vreplication WHERE workflow='bench_move'"*)
+ printf 'uuid:1-50\n'
+ ;;
+ *"SELECT UNIX_TIMESTAMP() - FLOOR(time_updated) FROM _vt.vreplication WHERE workflow='bench_move'"*)
+ printf '999\n'
+ ;;
+ *"SELECT COUNT(*), COALESCE(BIT_XOR("*)
+ printf '1 2\n'
+ ;;
+ esac
+EOF
+ cat > "$sandbox/bin/date" <<'EOF'
+#!/bin/bash
+set -euo pipefail
+if [[ "${1:-}" != "+%s" ]]; then
+ /bin/date "$@"
+ exit 0
+fi
+
+calls_file="$BENCH_TEST_TMP/date_calls"
+calls=0
+if [[ -f "$calls_file" ]]; then
+ calls=$(cat "$calls_file")
+fi
+calls=$((calls + 1))
+printf '%s' "$calls" > "$calls_file"
+
+case "$calls" in
+ 1)
+ printf '0\n'
+ ;;
+ 2)
+ printf '%s\n' "${BENCH_TIMEOUT_ELAPSED:-7200}"
+ ;;
+ *)
+ printf '%s\n' "${BENCH_TIMEOUT_ELAPSED_END:-7201}"
+ ;;
+esac
+EOF
+ chmod +x "$sandbox/examples/common/env.sh" "$sandbox/examples/benchmark/bench_generate_load.sh" "$sandbox/examples/local/501_teardown.sh" "$sandbox/bin/vtctldclient" "$sandbox/bin/mysql" "$sandbox/bin/date"
+ printf '%s\n' "$sandbox" # stdout is the result: callers capture it with $(new_sandbox)
+}
+
+test_bench_run_timeout_fails_without_results() {
+    # With the stubbed clock past the deadline, bench_run.sh must fail, print no results, and cancel the workflow.
+    local sandbox output status=0
+    sandbox=$(new_sandbox)
+    trap "rm -rf -- '$sandbox'" EXIT # EXIT, not RETURN: fail_test and set -e leave via exit, which skips RETURN traps
+    output=$(cd "$sandbox/examples/benchmark" && BENCH_TEST_TMP="$sandbox" VTDATAROOT="$sandbox/vtdataroot" PATH="$sandbox/bin:$PATH" bash ./bench_run.sh 2>&1) || status=$?
+
+    if [[ "$status" -eq 0 ]]; then
+        fail_test "bench_run timeout should fail"
+    fi
+    assert_contains "$output" "Timed out after" "bench_run should report the timeout"
+    assert_not_contains "$output" "BENCHMARK RESULTS" "bench_run should not print results after a timeout"
+    assert_contains "$(cat "$sandbox/vtctld_calls")" "MoveTables --workflow bench_move --target-keyspace customer cancel" "bench_run should clean up the workflow after a timeout"
+    rm -rf "$sandbox"
+    trap - EXIT
+}
+
+test_bench_run_index_failure_is_fatal() {
+    # With the first target ALTER failing, bench_run.sh must stop after the seed load and cancel the workflow.
+    local sandbox output load_calls status=0
+    sandbox=$(new_sandbox)
+    trap "rm -rf -- '$sandbox'" EXIT # EXIT, not RETURN: fail_test and set -e leave via exit, which skips RETURN traps
+    output=$(cd "$sandbox/examples/benchmark" && BENCH_TEST_TMP="$sandbox" BENCH_FAIL_FIRST_TARGET_ALTER=1 VTDATAROOT="$sandbox/vtdataroot" PATH="$sandbox/bin:$PATH" bash ./bench_run.sh 2>&1) || status=$?
+
+    if [[ "$status" -eq 0 ]]; then
+        fail_test "bench_run should fail when target index creation fails"
+    fi
+    assert_not_contains "$output" "Target indexes added (~25 per table)." "bench_run should not report index success after an index build failure"
+    load_calls=$(cat "$sandbox/load_calls")
+    assert_equals "$load_calls" "seed:40000" "bench_run should stop before generating the mixed backlog when index creation fails"
+    assert_contains "$(cat "$sandbox/vtctld_calls")" "MoveTables --workflow bench_move --target-keyspace customer cancel" "bench_run should clean up the workflow after an index build failure"
+    rm -rf "$sandbox"
+    trap - EXIT
+}
+
+test_bench_compare_can_run_parallel_first() {
+    # With RUN_ORDER=parallel-first, bench_compare.sh must call the stubbed setup with 4 workers, then 1.
+    local sandbox output order
+    sandbox=$(mktemp -d)
+    trap "rm -rf -- '$sandbox' /tmp/bench_1_workers.log /tmp/bench_4_workers.log" EXIT # EXIT: failures leave via exit, which skips RETURN traps
+    mkdir -p "$sandbox/examples/benchmark" "$sandbox/examples/local"
+    cp "$REPO_ROOT/examples/benchmark/bench_compare.sh" "$sandbox/examples/benchmark/bench_compare.sh"
+    cat > "$sandbox/examples/benchmark/bench_setup.sh" <<'EOF'
+#!/bin/bash
+set -euo pipefail
+printf '%s\n' "$PARALLEL_WORKERS" >> "$BENCH_TEST_TMP/setup_order"
+EOF
+    cat > "$sandbox/examples/benchmark/bench_run.sh" <<'EOF'
+#!/bin/bash
+set -euo pipefail
+echo " Backlog ops: 200000"
+echo " Seed rows: 40000"
+echo " Drain time: 10s"
+echo " Throughput: 20000 ops/sec"
+EOF
+    cat > "$sandbox/examples/local/501_teardown.sh" <<'EOF'
+#!/bin/bash
+set -euo pipefail
+:
+EOF
+    chmod +x "$sandbox/examples/benchmark/bench_setup.sh" "$sandbox/examples/benchmark/bench_run.sh" "$sandbox/examples/local/501_teardown.sh"
+
+    output=$(cd "$sandbox/examples/benchmark" && BENCH_TEST_TMP="$sandbox" RUN_ORDER=parallel-first bash ./bench_compare.sh 2>&1) || fail_test "bench_compare.sh exited non-zero: $output"
+    order=$(paste -sd ',' "$sandbox/setup_order")
+    assert_equals "$order" "4,1" "bench_compare should honor RUN_ORDER=parallel-first"
+    assert_contains "$output" "Run order: parallel-first" "bench_compare should print the selected run order"
+    rm -rf "$sandbox" /tmp/bench_1_workers.log /tmp/bench_4_workers.log
+    trap - EXIT
+}
+
+test_bench_run_timeout_fails_without_results
+test_bench_run_index_failure_is_fatal
+test_bench_compare_can_run_parallel_first
+
+echo "PASS: benchmark script regressions"
diff --git a/examples/benchmark/bench_setup.sh b/examples/benchmark/bench_setup.sh
new file mode 100755
index 00000000000..63b19b093ca
--- /dev/null
+++ b/examples/benchmark/bench_setup.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+
+# Bring up commerce keyspace + customer keyspace tablets with configurable
+# parallel replication workers for benchmarking VReplication throughput.
+
+BENCH_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+source ../common/env.sh
+
+PARALLEL_WORKERS=${PARALLEL_WORKERS:-1}
+SIDECAR_DB_NAME=${SIDECAR_DB_NAME:-"_vt"}
+# VReplication experimental flags (bitmask): 1=OptimizeInserts, 4=VPlayerBatching, 8=AllowNoBlobBinlogRowImage.
+# Default 13 (all enabled). This script only reads VREPL_FLAGS; set VREPL_FLAGS=9 (13 minus 4) to test without multi-statement batching.
+VREPL_FLAGS=${VREPL_FLAGS:-13}
+
+echo "=== Bench Setup (parallel_workers=$PARALLEL_WORKERS) ==="
+
+# Step 1: Bring up the commerce keyspace (topo, vtctld, commerce tablets, vtorc, vtgate).
+# The local scripts must run from examples/local/ because they use relative paths.
+(cd "$BENCH_DIR/../local" && ./101_initial_cluster.sh) || fail "Failed to bring up initial cluster"
+
+# Step 2: Apply bench schema and vschema to commerce.
+vtctldclient ApplySchema --sql-file "$BENCH_DIR/create_bench_schema.sql" commerce || fail "Failed to apply bench schema"
+vtctldclient ApplyVSchema --vschema-file "$BENCH_DIR/vschema_bench.json" commerce || fail "Failed to apply bench vschema"
+
+echo "Bench schema and vschema applied to commerce keyspace."
+
+# Step 3: Create customer keyspace
+if vtctldclient GetKeyspace customer > /dev/null 2>&1; then
+ vtctldclient SetKeyspaceDurabilityPolicy --durability-policy=none customer || fail "Failed to set durability policy on customer keyspace"
+else
+ vtctldclient CreateKeyspace --sidecar-db-name="${SIDECAR_DB_NAME}" --durability-policy=none customer || fail "Failed to create customer keyspace"
+fi
+
+# Step 4: Start mysqlctls for customer tablets with small buffer pool.
+# We set innodb_buffer_pool_chunk_size=1M at startup so that the buffer pool
+# can actually be reduced below the default 128MB chunk size.
+BENCH_EXTRA_CNF="$VTDATAROOT/tmp/bench_target.cnf"
+cat > "$BENCH_EXTRA_CNF" <<'EOF'
+# Bench: small buffer pool to force disk I/O on secondary index access.
+# 32MB gives each parallel worker ~8MB (matching serial's total), avoiding
+# destructive cache thrashing between workers while keeping I/O significant.
+innodb_buffer_pool_chunk_size = 1048576
+innodb_buffer_pool_size = 33554432
+EOF
+
+for i in 200 201 202; do
+ EXTRA_MY_CNF="$BENCH_EXTRA_CNF" CELL=zone1 TABLET_UID=$i ../common/scripts/mysqlctl-up.sh &
+done
+
+sleep 2
+echo "Waiting for customer mysqlctls to start..."
+wait
+echo "Customer mysqlctls are running!"
+
+# Step 5: Start customer vttablets with --vreplication-parallel-replication-workers flag
+cell='zone1'
+keyspace='customer'
+
+for uid in 200 201 202; do
+ mysql_port=$((17000 + uid))
+ port=$((15000 + uid))
+ grpc_port=$((16000 + uid))
+ printf -v alias '%s-%010d' "$cell" "$uid"
+ printf -v tablet_dir 'vt_%010d' "$uid"
+ printf -v tablet_logfile 'vttablet_%010d_querylog.txt' "$uid"
+
+ tablet_type=replica
+ if [[ "${uid: -1}" -gt 1 ]]; then
+ tablet_type=rdonly
+ fi
+
+ echo "Starting vttablet for $alias with vreplication-parallel-replication-workers=$PARALLEL_WORKERS..."
+
+ # shellcheck disable=SC2086
+ vttablet \
+ $TOPOLOGY_FLAGS \
+ --log-queries-to-file "$VTDATAROOT/tmp/$tablet_logfile" \
+ --tablet-path "$alias" \
+ --tablet-hostname "" \
+ --init-keyspace "$keyspace" \
+ --init-shard "0" \
+ --init-tablet-type "$tablet_type" \
+ --health-check-interval 5s \
+ --backup-storage-implementation file \
+ --file-backup-storage-root "$VTDATAROOT/backups" \
+ --restore-from-backup \
+ --port "$port" \
+ --grpc-port "$grpc_port" \
+ --service-map 'grpc-queryservice,grpc-tabletmanager,grpc-updatestream' \
+ --pid-file "$VTDATAROOT/$tablet_dir/vttablet.pid" \
+ --heartbeat-on-demand-duration=5s \
+ --pprof-http \
+ --log-format text \
+ --vreplication-parallel-replication-workers "$PARALLEL_WORKERS" \
+ --relay-log-max-size 250000 \
+ --relay-log-max-items 5000 \
+ --vreplication-experimental-flags "$VREPL_FLAGS" \
+ >"$VTDATAROOT/$tablet_dir/vttablet.out" 2>&1 &
+
+ # Wait for tablet to be listening
+ for _ in $(seq 0 300); do
+ curl -I "http://$(hostname -f):$port/debug/status" >/dev/null 2>&1 && break
+ sleep 0.1
+ done
+ curl -I "http://$(hostname -f):$port/debug/status" || fail "vttablet for $alias could not be started!"
+ echo "vttablet for $alias is running!"
+done
+
+# Step 6: Wait for healthy shard
+wait_for_healthy_shard customer 0 || fail "Customer shard not healthy"
+
+# Step 7: Tune MySQL durability for benchmark throughput on all tablets.
+# With innodb_flush_log_at_trx_commit=0 and sync_binlog=0, redo log fsyncs
+# don't happen on every COMMIT. This removes fsync as a variable so we can
+# isolate the applier throughput difference between serial and parallel.
+echo ""
+echo "Tuning MySQL settings for benchmark..."
+for uid in 100 101 102 200 201 202; do
+ printf -v tablet_dir 'vt_%010d' "$uid"
+ sock="$VTDATAROOT/$tablet_dir/mysql.sock"
+ if [[ -S "$sock" ]]; then
+ command mysql --no-defaults -u vt_dba -S "$sock" -e \
+ "SET GLOBAL innodb_flush_log_at_trx_commit = 0; SET GLOBAL sync_binlog = 0; SET GLOBAL rpl_semi_sync_source_enabled = 0;" 2>/dev/null && \
+ echo " Tuned tablet $uid (durability, semi-sync off)" || echo " Warning: could not tune tablet $uid"
+ fi
+done
+# Tune target tablets: disable change buffering to force immediate B-tree
+# page reads on every INSERT/UPDATE/DELETE. Combined with the 32MB buffer pool
+# (innodb_buffer_pool_size = 33554432) set at startup, this makes each applier statement very expensive.
+for uid in 200 201 202; do
+    printf -v tablet_dir 'vt_%010d' "$uid"
+    sock="$VTDATAROOT/$tablet_dir/mysql.sock"
+    if [[ -S "$sock" ]]; then # tablets without a live socket are skipped silently
+        command mysql --no-defaults -u vt_dba -S "$sock" -e \
+            "SET GLOBAL innodb_change_buffering = 'none';" 2>/dev/null && \
+            echo " Tuned tablet $uid (change buffering off, 32MB buffer pool)" || echo " Warning: could not tune tablet $uid"
+    fi
+done
+
+echo ""
+echo "=== Bench Setup Complete ==="
+echo "Commerce keyspace: bench tables loaded"
+echo "Customer keyspace: 3 tablets (parallel_workers=$PARALLEL_WORKERS)"
diff --git a/examples/benchmark/create_bench_schema.sql b/examples/benchmark/create_bench_schema.sql
new file mode 100644
index 00000000000..8739e2f825d
--- /dev/null
+++ b/examples/benchmark/create_bench_schema.sql
@@ -0,0 +1,60 @@
+create table if not exists bench_orders(
+ id bigint not null auto_increment,
+ customer_name varchar(255),
+ product_sku varchar(128),
+ quantity int,
+ total_price bigint,
+ status varchar(64),
+ region varchar(64),
+ notes text,
+ primary key(id),
+ index idx_customer (customer_name),
+ index idx_sku_price (product_sku, total_price),
+ index idx_status_region (status, region),
+ index idx_region_price (region, total_price)
+) ENGINE=InnoDB;
+
+create table if not exists bench_events(
+ id bigint not null auto_increment,
+ event_type varchar(128),
+ source varchar(255),
+ payload text,
+ severity int,
+ created_at bigint,
+ category varchar(128),
+ primary key(id),
+ index idx_type_severity (event_type, severity),
+ index idx_source (source),
+ index idx_category (category),
+ index idx_created (created_at)
+) ENGINE=InnoDB;
+
+create table if not exists bench_accounts(
+ id bigint not null auto_increment,
+ username varchar(128),
+ email varchar(255),
+ balance bigint,
+ region varchar(64),
+ bio text,
+ tier varchar(32),
+ primary key(id),
+ index idx_username (username),
+ index idx_email (email),
+ index idx_region_balance (region, balance),
+ index idx_tier (tier)
+) ENGINE=InnoDB;
+
+create table if not exists bench_logs(
+ id bigint not null auto_increment,
+ level varchar(32),
+ message text,
+ component varchar(128),
+ error_code int,
+ trace_id varchar(64),
+ span_id varchar(64),
+ primary key(id),
+ index idx_level_component (level, component),
+ index idx_error_code (error_code),
+ index idx_trace (trace_id),
+ index idx_span (span_id)
+) ENGINE=InnoDB;
diff --git a/examples/benchmark/vschema_bench.json b/examples/benchmark/vschema_bench.json
new file mode 100644
index 00000000000..094d2ddc55b
--- /dev/null
+++ b/examples/benchmark/vschema_bench.json
@@ -0,0 +1,8 @@
+{
+ "tables": {
+ "bench_orders": {},
+ "bench_events": {},
+ "bench_accounts": {},
+ "bench_logs": {}
+ }
+}
diff --git a/go/cmd/vtctldclient/command/vreplication/override_request_test.go b/go/cmd/vtctldclient/command/vreplication/override_request_test.go
new file mode 100644
index 00000000000..1a910eea220
--- /dev/null
+++ b/go/cmd/vtctldclient/command/vreplication/override_request_test.go
@@ -0,0 +1,108 @@
+/*
+Copyright 2026 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package vreplication_test
+
+import (
+ "context"
+ "os"
+ "testing"
+ "time"
+
+ "github.com/stretchr/testify/require"
+
+ "vitess.io/vitess/go/cmd/vtctldclient/command"
+ vtctldclientcommon "vitess.io/vitess/go/cmd/vtctldclient/command/vreplication/common"
+ _ "vitess.io/vitess/go/cmd/vtctldclient/command/vreplication/movetables"
+ _ "vitess.io/vitess/go/cmd/vtctldclient/command/vreplication/workflow"
+ "vitess.io/vitess/go/vt/vtctl/localvtctldclient"
+
+ vtctldatapb "vitess.io/vitess/go/vt/proto/vtctldata"
+ vtctlservicepb "vitess.io/vitess/go/vt/proto/vtctlservice"
+)
+
+type overrideCaptureServer struct {
+	vtctlservicepb.UnimplementedVtctldServer // embedded; only the two RPCs below are overridden in this file
+
+	moveTablesCreateReq *vtctldatapb.MoveTablesCreateRequest // last request seen by MoveTablesCreate
+	workflowUpdateReq   *vtctldatapb.WorkflowUpdateRequest   // last request seen by WorkflowUpdate
+}
+
+func (s *overrideCaptureServer) MoveTablesCreate(_ context.Context, req *vtctldatapb.MoveTablesCreateRequest) (*vtctldatapb.WorkflowStatusResponse, error) {
+	s.moveTablesCreateReq = req // keep the request so the test can inspect it
+	return &vtctldatapb.WorkflowStatusResponse{}, nil
+}
+
+func (s *overrideCaptureServer) WorkflowUpdate(_ context.Context, req *vtctldatapb.WorkflowUpdateRequest) (*vtctldatapb.WorkflowUpdateResponse, error) {
+	s.workflowUpdateReq = req // keep the request so the test can inspect it
+	return &vtctldatapb.WorkflowUpdateResponse{}, nil
+}
+
+// TestVtctldclientConfigOverrideRequestsIncludeParallelReplicationWorkers runs the vtctldclient root command against a
+// capturing local server and checks that --config-overrides reaches the MoveTablesCreate and WorkflowUpdate requests.
+func TestVtctldclientConfigOverrideRequestsIncludeParallelReplicationWorkers(t *testing.T) {
+	// Derive from the test's own context so the command context is cancelled when the test ends.
+	ctx, cancel := context.WithTimeout(t.Context(), 30*time.Second)
+	defer cancel()
+	server := &overrideCaptureServer{}
+	localvtctldclient.SetServer(server)
+
+	// The command reads os.Args and a package-level protocol variable; restore both afterwards.
+	origArgs := append([]string{}, os.Args...)
+	origProtocol := command.VtctldClientProtocol
+	t.Cleanup(func() {
+		os.Args = append([]string{}, origArgs...)
+		command.VtctldClientProtocol = origProtocol
+	})
+
+	command.VtctldClientProtocol = "local"
+	vtctldclientcommon.SetCommandCtx(ctx)
+
+	t.Run("MoveTablesCreate", func(t *testing.T) {
+		os.Args = []string{
+			"vtctldclient",
+			"--server", "ignored",
+			"MoveTables",
+			"--workflow", "wf1",
+			"--target-keyspace", "target",
+			"create",
+			"--source-keyspace", "source",
+			"--all-tables",
+			"--config-overrides", "vreplication-parallel-replication-workers=7",
+		}
+
+		require.NoError(t, command.Root.Execute())
+		require.NotNil(t, server.moveTablesCreateReq)
+		require.NotNil(t, server.moveTablesCreateReq.WorkflowOptions)
+		require.Equal(t, "7", server.moveTablesCreateReq.WorkflowOptions.Config["vreplication-parallel-replication-workers"])
+	})
+
+	t.Run("WorkflowUpdate", func(t *testing.T) {
+		os.Args = []string{
+			"vtctldclient",
+			"--server", "ignored",
+			"Workflow",
+			"--keyspace", "target",
+			"update",
+			"--workflow", "wf1",
+			"--config-overrides", "vreplication-parallel-replication-workers=9",
+		}
+
+		require.NoError(t, command.Root.Execute())
+		require.NotNil(t, server.workflowUpdateReq)
+		require.NotNil(t, server.workflowUpdateReq.TabletRequest)
+		require.Equal(t, "9", server.workflowUpdateReq.TabletRequest.ConfigOverrides["vreplication-parallel-replication-workers"])
+	})
+}
diff --git a/go/flags/endtoend/vtcombo.txt b/go/flags/endtoend/vtcombo.txt
index 860ff5c62fa..d8c94af24f1 100644
--- a/go/flags/endtoend/vtcombo.txt
+++ b/go/flags/endtoend/vtcombo.txt
@@ -434,6 +434,7 @@ Flags:
--vreplication-net-read-timeout int Session value of net_read_timeout for vreplication, in seconds (default 300)
--vreplication-net-write-timeout int Session value of net_write_timeout for vreplication, in seconds (default 600)
--vreplication-parallel-insert-workers int Number of parallel insertion workers to use during copy phase. Set <= 1 to disable parallelism, or > 1 to enable concurrent insertion during copy phase. (default 1)
+ --vreplication-parallel-replication-workers int Number of parallel replication workers to use during the replication phase. Set <= 1 to disable parallelism, or > 1 to enable concurrent apply. (default 1)
--vreplication-replica-lag-tolerance duration Replica lag threshold duration: once lag is below this we switch from copy phase to the replication (streaming) phase (default 1m0s)
--vreplication-retry-delay duration delay before retrying a failed workflow event in the replication phase (default 5s)
--vreplication-store-compressed-gtid Store compressed gtids in the pos column of the sidecar database's vreplication table
diff --git a/go/flags/endtoend/vttablet.txt b/go/flags/endtoend/vttablet.txt
index 145ff5997b6..e797601289b 100644
--- a/go/flags/endtoend/vttablet.txt
+++ b/go/flags/endtoend/vttablet.txt
@@ -430,6 +430,7 @@ Flags:
--vreplication-net-read-timeout int Session value of net_read_timeout for vreplication, in seconds (default 300)
--vreplication-net-write-timeout int Session value of net_write_timeout for vreplication, in seconds (default 600)
--vreplication-parallel-insert-workers int Number of parallel insertion workers to use during copy phase. Set <= 1 to disable parallelism, or > 1 to enable concurrent insertion during copy phase. (default 1)
+ --vreplication-parallel-replication-workers int Number of parallel replication workers to use during the replication phase. Set <= 1 to disable parallelism, or > 1 to enable concurrent apply. (default 1)
--vreplication-replica-lag-tolerance duration Replica lag threshold duration: once lag is below this we switch from copy phase to the replication (streaming) phase (default 1m0s)
--vreplication-retry-delay duration delay before retrying a failed workflow event in the replication phase (default 5s)
--vreplication-store-compressed-gtid Store compressed gtids in the pos column of the sidecar database's vreplication table
diff --git a/go/test/endtoend/onlineddl/vrepl_stress_suite/onlineddl_vrepl_stress_suite_test.go b/go/test/endtoend/onlineddl/vrepl_stress_suite/onlineddl_vrepl_stress_suite_test.go
index 3d989a998ca..c4f4784d378 100644
--- a/go/test/endtoend/onlineddl/vrepl_stress_suite/onlineddl_vrepl_stress_suite_test.go
+++ b/go/test/endtoend/onlineddl/vrepl_stress_suite/onlineddl_vrepl_stress_suite_test.go
@@ -29,11 +29,13 @@ package vreplstress
import (
"context"
+ "errors"
"flag"
"fmt"
"math/rand/v2"
"os"
"path"
+ "strconv"
"strings"
"sync"
"sync/atomic"
@@ -70,6 +72,25 @@ type testcase struct {
autoIncInsert bool
}
+type stressErrorState struct {
+	firstErr atomic.Pointer[error] // nil until record stores the first non-nil error
+}
+
+// record stores err unless it is nil or an earlier error is already held.
+func (s *stressErrorState) record(err error) {
+	if err != nil {
+		s.firstErr.CompareAndSwap(nil, &err)
+	}
+}
+
+// err returns the first recorded error, or nil when nothing was recorded.
+func (s *stressErrorState) err() error {
+	if stored := s.firstErr.Load(); stored != nil {
+		return *stored
+	}
+	return nil
+}
+
var (
clusterInstance *cluster.LocalProcessCluster
primaryTablet *cluster.Vttablet
@@ -378,7 +399,7 @@ const (
maxConcurrency = 15
singleConnectionSleepInterval = 5 * time.Millisecond
periodicSleepPercent = 10 // in the range (0,100). 10 means 10% sleep time throught the stress load.
- waitForStatusTimeout = 180 * time.Second
+ waitForStatusTimeout = 300 * time.Second
)
func resetOpOrder() {
@@ -428,11 +449,16 @@ func TestMain(m *testing.M) {
// --vstream-packet-size is set to a small value that ensures we get multiple stream iterations,
// thereby examining lastPK on vcopier side. We will be iterating tables using non-PK order throughout
// this test suite, and so the low setting ensures we hit the more interesting code paths.
+ parallelWorkers := 4
+ txPoolSize := max(parallelWorkers, 100)
clusterInstance.VtTabletExtraArgs = []string{
"--heartbeat-interval", "250ms",
"--heartbeat-on-demand-duration", "5s",
"--migration-check-interval", "5s",
"--vstream-packet-size", "4096", // Keep this value small and below 10k to ensure multilple vstream iterations
+ "--queryserver-config-transaction-cap", strconv.Itoa(txPoolSize),
+ "--transaction-limit-per-user", "0.9",
+ "--vreplication-parallel-replication-workers", strconv.Itoa(parallelWorkers),
}
clusterInstance.VtGateExtraArgs = []string{
"--ddl-strategy", "online",
@@ -526,9 +552,9 @@ func TestVreplStressSchemaChanges(t *testing.T) {
}
status := onlineddl.WaitForMigrationStatus(t, &vtParams, shards, uuid, waitForStatusTimeout, expectStatus)
fmt.Printf("# Migration status (for debug purposes): <%s>\n", status)
- onlineddl.CheckMigrationStatus(t, &vtParams, shards, uuid, expectStatus)
cancel() // will cause runMultipleConnections() to terminate
wg.Wait()
+ require.Equal(t, string(expectStatus), string(status), "migration did not reach expected status within timeout")
if !testcase.expectFailure {
testCompareBeforeAfterTables(t, testcase.autoIncInsert)
}
@@ -670,7 +696,7 @@ func generateDelete(t *testing.T, conn *mysql.Conn) error {
return err
}
-func runSingleConnection(ctx context.Context, t *testing.T, autoIncInsert bool, done *int64) {
+func runSingleConnection(ctx context.Context, t *testing.T, autoIncInsert bool, done *int64, errs *stressErrorState) {
log.Info("Running single connection")
conn, err := mysql.Connect(ctx, &vtParams)
require.Nil(t, err)
@@ -712,7 +738,10 @@ func runSingleConnection(ctx context.Context, t *testing.T, autoIncInsert bool,
}
}
}
- assert.Nil(t, err)
+ if err != nil {
+ errs.record(err)
+ return
+ }
time.Sleep(singleConnectionSleepInterval)
// Most o fthe time, we want the load to be high, so as to create real stress and potentially
// expose bugs in vreplication (the objective of this test!).
@@ -730,16 +759,18 @@ func runSingleConnection(ctx context.Context, t *testing.T, autoIncInsert bool,
func runMultipleConnections(ctx context.Context, t *testing.T, autoIncInsert bool) {
log.Info("Running multiple connections")
var done int64
+ errState := &stressErrorState{}
var wg sync.WaitGroup
for range maxConcurrency {
wg.Go(func() {
- runSingleConnection(ctx, t, autoIncInsert, &done)
+ runSingleConnection(ctx, t, autoIncInsert, &done, errState)
})
}
<-ctx.Done()
atomic.StoreInt64(&done, 1)
log.Info("Running multiple connections: done")
wg.Wait()
+ require.NoError(t, errState.err())
log.Info("All connections cancelled")
}
@@ -846,3 +877,22 @@ func testCompareBeforeAfterTables(t *testing.T, autoIncInsert bool) {
require.Equal(t, beforeOutput, afterOutput, "results mismatch: (%s) and (%s)", selectBeforeTable, selectAfterTable)
}
}
+
+// TestStressErrorStateRecordsFirstUnexpectedError checks that a later error never replaces the first one.
+func TestStressErrorStateRecordsFirstUnexpectedError(t *testing.T) {
+	var state stressErrorState
+	firstErr, secondErr := errors.New("first"), errors.New("second")
+
+	for _, err := range []error{firstErr, secondErr} {
+		state.record(err)
+	}
+	require.ErrorIs(t, state.err(), firstErr)
+}
+
+// TestStressErrorStateIgnoresNilErrors checks that recording nil leaves the state without an error.
+func TestStressErrorStateIgnoresNilErrors(t *testing.T) {
+	var state stressErrorState
+
+	state.record(nil)
+	require.NoError(t, state.err())
+}
diff --git a/go/test/endtoend/vreplication/cluster_test.go b/go/test/endtoend/vreplication/cluster_test.go
index 23c47fb8a20..a3c8e54bfe3 100644
--- a/go/test/endtoend/vreplication/cluster_test.go
+++ b/go/test/endtoend/vreplication/cluster_test.go
@@ -64,7 +64,8 @@ var (
// This variable can be used within specific tests to alter vttablet behavior.
extraVTTabletArgs = []string{}
- parallelInsertWorkers = "--vreplication-parallel-insert-workers=4"
+ parallelInsertWorkers = "--vreplication-parallel-insert-workers=4"
+ parallelReplicationWorkers = "--vreplication-parallel-replication-workers=4"
throttlerConfig = throttler.Config{Threshold: 15}
)
diff --git a/go/test/endtoend/vreplication/fk_ext_load_generator_test.go b/go/test/endtoend/vreplication/fk_ext_load_generator_test.go
index 36f4a3bc709..d51f8c8fd7b 100644
--- a/go/test/endtoend/vreplication/fk_ext_load_generator_test.go
+++ b/go/test/endtoend/vreplication/fk_ext_load_generator_test.go
@@ -103,6 +103,11 @@ type SimpleLoadGenerator struct {
ch chan bool
runCtx context.Context
runCtxCancel context.CancelFunc
+ // vtgateConn is reused across execQueryWithRetry calls during Start()'s
+ // goroutine. Opening a fresh TCP connection per DML piles up thousands of
+ // sockets in TIME_WAIT and exhausts the macOS ephemeral port range
+ // (49152-65535), which in turn stalls unrelated gRPC reconnects.
+ vtgateConn *mysql.Conn
}
func (lg *SimpleLoadGenerator) SetOverrideConstraints(allow bool) {
@@ -199,7 +204,6 @@ func (lg *SimpleLoadGenerator) execQueryWithRetry(query string) (*sqltypes.Resul
defer cancel()
errCh := make(chan error)
qrCh := make(chan *sqltypes.Result)
- var vtgateConn *mysql.Conn
go func() {
var qr *sqltypes.Result
var err error
@@ -219,23 +223,27 @@ func (lg *SimpleLoadGenerator) execQueryWithRetry(query string) (*sqltypes.Resul
if retry {
time.Sleep(tickInterval)
}
- // We need to parse the error as well as the output of vdiff to determine if the error is retryable, since
- // sometimes it is observed that we get the error output as part of vdiff output.
- vtgateConn, err = lg.getVtgateConn(ctx)
- if err != nil {
- if !isQueryRetryable(err) {
- errCh <- err
- return
+ // Reuse lg.vtgateConn across calls so we don't burn a TCP
+ // connection per DML. On error we close and null it out so the
+ // retry path above opens a fresh one.
+ if lg.vtgateConn == nil {
+ lg.vtgateConn, err = lg.getVtgateConn(ctx)
+ if err != nil {
+ if !isQueryRetryable(err) {
+ errCh <- err
+ return
+ }
+ time.Sleep(tickInterval)
+ continue
}
- time.Sleep(tickInterval)
- continue
}
- qr, err = vtgateConn.ExecuteFetch(query, 1000, false)
- vtgateConn.Close()
+ qr, err = lg.vtgateConn.ExecuteFetch(query, 1000, false)
if err == nil {
qrCh <- qr
return
}
+ lg.vtgateConn.Close()
+ lg.vtgateConn = nil
if !isQueryRetryable(err) {
errCh <- err
return
@@ -276,6 +284,10 @@ func (lg *SimpleLoadGenerator) Start() error {
lg.state = LoadGeneratorStateRunning
go func() {
defer func() {
+ if lg.vtgateConn != nil {
+ lg.vtgateConn.Close()
+ lg.vtgateConn = nil
+ }
lg.state = LoadGeneratorStateStopped
log.Info("Load generator stopped")
}()
diff --git a/go/test/endtoend/vreplication/fk_ext_test.go b/go/test/endtoend/vreplication/fk_ext_test.go
index ef46cec047c..abd82e4a634 100644
--- a/go/test/endtoend/vreplication/fk_ext_test.go
+++ b/go/test/endtoend/vreplication/fk_ext_test.go
@@ -88,7 +88,8 @@ func TestFKExt(t *testing.T) {
extraVTTabletArgs = append(extraVTTabletArgs,
"--vstream-packet-size=256",
"--queryserver-config-schema-change-signal",
- parallelInsertWorkers)
+ parallelInsertWorkers,
+ parallelReplicationWorkers)
extraVTGateArgs = append(extraVTGateArgs, "--schema-change-signal"+"=true", "--planner-version", "Gen4")
defer func() { extraVTTabletArgs = nil }()
initFKExtConfig(t)
diff --git a/go/test/endtoend/vreplication/vreplication_test.go b/go/test/endtoend/vreplication/vreplication_test.go
index 4fbfe25f4c9..fbf581eaccc 100644
--- a/go/test/endtoend/vreplication/vreplication_test.go
+++ b/go/test/endtoend/vreplication/vreplication_test.go
@@ -268,11 +268,12 @@ func TestBasicVreplicationWorkflow(t *testing.T) {
testBasicVreplicationWorkflow(t, "noblob")
}
-func TestVreplicationCopyParallel(t *testing.T) {
- defaultSourceKsOpts["DBTypeVersion"] = "mysql-5.7"
- defaultTargetKsOpts["DBTypeVersion"] = "mysql-5.7"
+func TestVreplicationParallel(t *testing.T) {
+ defaultSourceKsOpts["DBTypeVersion"] = "mysql-8.4"
+ defaultTargetKsOpts["DBTypeVersion"] = "mysql-8.4"
extraVTTabletArgs = []string{
parallelInsertWorkers,
+ parallelReplicationWorkers,
}
testBasicVreplicationWorkflow(t, "")
}
diff --git a/go/test/endtoend/vreplication/vreplication_vtctldclient_cli_test.go b/go/test/endtoend/vreplication/vreplication_vtctldclient_cli_test.go
index 5f185de8ab6..c5cf4ecc485 100644
--- a/go/test/endtoend/vreplication/vreplication_vtctldclient_cli_test.go
+++ b/go/test/endtoend/vreplication/vreplication_vtctldclient_cli_test.go
@@ -224,9 +224,10 @@ func TestVtctldclientCLI(t *testing.T) {
func testMoveTablesFlags1(t *testing.T, mt *iMoveTables, sourceKeyspace, targetKeyspace, defaultWorkflowName string, targetTabs map[string]*cluster.VttabletProcess) {
tables := "customer,customer2"
overrides := map[string]string{
- "vreplication-net-read-timeout": "6000",
- "relay-log-max-items": "10000",
- "vreplication-parallel-insert-workers": "10",
+ "vreplication-net-read-timeout": "6000",
+ "relay-log-max-items": "10000",
+ "vreplication-parallel-insert-workers": "10",
+ "vreplication-parallel-replication-workers": "3",
}
createFlags := []string{
"--auto-start=false", "--defer-secondary-keys=false", "--stop-after-copy",
@@ -496,8 +497,9 @@ func testWorkflowUpdateConfig(t *testing.T, mt *iMoveTables, targetTabs map[stri
{
name: "two values",
config: map[string]string{
- "vreplication-heartbeat-update-interval": "100",
- "vreplication-store-compressed-gtid": "true",
+ "vreplication-heartbeat-update-interval": "100",
+ "vreplication-store-compressed-gtid": "true",
+ "vreplication-parallel-replication-workers": "5",
},
},
{
diff --git a/go/vt/binlog/binlog_connection.go b/go/vt/binlog/binlog_connection.go
index 1cac5bb458d..82f78111495 100644
--- a/go/vt/binlog/binlog_connection.go
+++ b/go/vt/binlog/binlog_connection.go
@@ -142,8 +142,11 @@ func (bc *BinlogConnection) StartBinlogDumpFromPosition(ctx context.Context, bin
// streamEvents returns a channel on which events are streamed and a channel on
// which errors are propagated.
func (bc *BinlogConnection) streamEvents(ctx context.Context) (chan mysql.BinlogEvent, chan error) {
- // FIXME(alainjobart) I think we can use a buffered channel for better performance.
- eventChan := make(chan mysql.BinlogEvent)
+ // Buffer the event channel so the binlog reader goroutine can make
+ // progress without blocking on the consumer for every single event.
+ // An unbuffered channel here forces a context switch per event, which
+ // becomes a throughput bottleneck at high event rates.
+ eventChan := make(chan mysql.BinlogEvent, 10)
errChan := make(chan error)
// Start reading events.
diff --git a/go/vt/binlog/binlog_streamer.go b/go/vt/binlog/binlog_streamer.go
index b4cc4ad28e8..b2967b81183 100644
--- a/go/vt/binlog/binlog_streamer.go
+++ b/go/vt/binlog/binlog_streamer.go
@@ -255,6 +255,7 @@ func (bls *Streamer) parseEvents(ctx context.Context, events <-chan mysql.Binlog
pos := bls.startPos
autocommit := true
var err error
+ var pendingStreamErr error
// Remember the RBR state.
// tableMaps is indexed by tableID.
@@ -298,12 +299,33 @@ func (bls *Streamer) parseEvents(ctx context.Context, events <-chan mysql.Binlog
select {
case ev, ok = <-events:
if !ok {
+ if pendingStreamErr != nil {
+ return pos, pendingStreamErr
+ }
+ if errs != nil {
+ select {
+ case err, ok := <-errs:
+ if ok && err != nil {
+ return pos, err
+ }
+ default:
+ }
+ }
// events channel has been closed, which means the connection died.
log.Info("reached end of binlog event stream")
return pos, ErrServerEOF
}
- case err = <-errs:
- return pos, err
+ case err, ok = <-errs:
+ if !ok {
+ errs = nil
+ continue
+ }
+ if len(events) == 0 {
+ return pos, err
+ }
+ pendingStreamErr = err
+ errs = nil
+ continue
case <-ctx.Done():
log.Info("stopping early due to binlog Streamer service shutdown or client disconnect")
return pos, ctx.Err()
diff --git a/go/vt/binlog/binlog_streamer_test.go b/go/vt/binlog/binlog_streamer_test.go
index 93856015a86..2eedd58f8c9 100644
--- a/go/vt/binlog/binlog_streamer_test.go
+++ b/go/vt/binlog/binlog_streamer_test.go
@@ -200,6 +200,70 @@ func TestStreamerParseEventsCommit(t *testing.T) {
assert.Truef(t, got.equal(want), "binlogConnStreamer.parseEvents(): got %v, want %v", got, want)
}
+// TestStreamerParseEventsDrainsBufferedEventsBeforeTerminalError checks that
+// parseEvents still processes events already sitting in the events channel
+// when an error is available on the errs channel at the same time: the call
+// must return streamErr, and the buffered transaction must have been handed
+// to the sendTransaction callback.
+func TestStreamerParseEventsDrainsBufferedEventsBeforeTerminalError(t *testing.T) {
+ f := mysql.NewMySQL56BinlogFormat()
+ s := mysql.NewFakeBinlogStream()
+ s.ServerID = 62344
+
+ // One complete transaction: BEGIN, a single insert, and the closing XID.
+ input := []mysql.BinlogEvent{
+ mysql.NewRotateEvent(f, s, 0, ""),
+ mysql.NewFormatDescriptionEvent(f, s),
+ mysql.NewMariaDBGTIDEvent(f, s, replication.MariadbGTID{Domain: 0, Sequence: 0xd}, false /* hasBegin */),
+ mysql.NewQueryEvent(f, s, mysql.Query{
+ Database: "vt_test_keyspace",
+ SQL: "BEGIN",
+ }),
+ mysql.NewQueryEvent(f, s, mysql.Query{
+ Database: "vt_test_keyspace",
+ SQL: "insert into vt_a(eid, id) values (1, 1) /* _stream vt_a (eid id ) (1 1 ); */",
+ }),
+ mysql.NewXIDEvent(f, s),
+ }
+
+ // The single transaction expected to reach sendTransaction.
+ want := []*binlogdatapb.BinlogTransaction{
+ {
+ Statements: []*binlogdatapb.BinlogTransaction_Statement{
+ {Category: binlogdatapb.BinlogTransaction_Statement_BL_SET, Sql: []byte("SET TIMESTAMP=1407805592")},
+ {Category: binlogdatapb.BinlogTransaction_Statement_BL_INSERT, Sql: []byte("insert into vt_a(eid, id) values (1, 1) /* _stream vt_a (eid id ) (1 1 ); */")},
+ },
+ EventToken: &querypb.EventToken{
+ Timestamp: 1407805592,
+ Position: replication.EncodePosition(replication.Position{
+ GTIDSet: replication.MariadbGTIDSet{
+ 0: replication.MariadbGTID{
+ Domain: 0,
+ Server: 62344,
+ Sequence: 0x0d,
+ },
+ },
+ }),
+ },
+ },
+ }
+
+ mcp := &mysql.ConnParams{DbName: "vt_test_keyspace"}
+ dbcfgs := dbconfigs.New(mcp)
+ streamErr := errors.New("stream ended after buffered events")
+
+ // Both channels are fully populated and closed before parseEvents starts,
+ // so either one may be read first. Presumably the loop repeats the
+ // scenario because Go's select picks at random among ready cases, and a
+ // single run could pass by luck.
+ for i := range 64 {
+ events := make(chan mysql.BinlogEvent, len(input))
+ errs := make(chan error, 1)
+ for _, ev := range input {
+ events <- ev
+ }
+ close(events)
+ errs <- streamErr
+ close(errs)
+
+ var got binlogStatements
+ bls := NewStreamer(dbcfgs, nil, nil, replication.Position{}, 0, (&got).sendTransaction)
+
+ _, err := bls.parseEvents(t.Context(), events, errs)
+ require.ErrorIs(t, err, streamErr, "iteration %d", i)
+ require.True(t, got.equal(want), "iteration %d: got %#v want %#v", i, got, want)
+ }
+}
+
func TestStreamerStop(t *testing.T) {
events := make(chan mysql.BinlogEvent)
errs := make(chan error)
diff --git a/go/vt/vttablet/common/config.go b/go/vt/vttablet/common/config.go
index d413a94db01..511751fa0c0 100644
--- a/go/vt/vttablet/common/config.go
+++ b/go/vt/vttablet/common/config.go
@@ -19,13 +19,27 @@ package vttablet
import (
"encoding/json"
"fmt"
+ "log/slog"
"maps"
+ "slices"
"strconv"
"strings"
"sync"
"time"
+
+ "vitess.io/vitess/go/vt/log"
)
+// maxParallelReplicationWorkers bounds --vreplication-parallel-replication-workers
+// and its per-workflow override. Each worker holds two MySQL connections per
+// workflow (double-buffered apply), plus the main connection, so an unbounded
+// value would let a single workflow exhaust the target's max_connections.
+const maxParallelReplicationWorkers = 64
+
+// warnParallelReplicationWorkersCap rate-limits the flag-clamp warning to once
+// per process; GetVReplicationConfigDefaults can be called per workflow.
+var warnParallelReplicationWorkersCap sync.Once
+
/*
This file contains the model for all the configuration parameters for VReplication workflows. It also provides methods to
initialize the default configuration and to override the default configuration with user-provided values. The overrides
@@ -37,21 +51,22 @@ import (
// target (vreplication)and the source (vstreamer) side.
type VReplicationConfig struct {
// Config parameters applicable to the target side (vreplication)
- ExperimentalFlags int64
- NetReadTimeout int
- NetWriteTimeout int
- CopyPhaseDuration time.Duration
- RetryDelay time.Duration
- MaxTimeToRetryError time.Duration
- RelayLogMaxSize int
- RelayLogMaxItems int
- ReplicaLagTolerance time.Duration
- HeartbeatUpdateInterval int
- StoreCompressedGTID bool
- ParallelInsertWorkers int
- TabletTypesStr string
- EnableHttpLog bool // Enable the /debug/vrlog endpoint
- MaxRowJSONBytes int64
+ ExperimentalFlags int64
+ NetReadTimeout int
+ NetWriteTimeout int
+ CopyPhaseDuration time.Duration
+ RetryDelay time.Duration
+ MaxTimeToRetryError time.Duration
+ RelayLogMaxSize int
+ RelayLogMaxItems int
+ ReplicaLagTolerance time.Duration
+ HeartbeatUpdateInterval int
+ StoreCompressedGTID bool
+ ParallelInsertWorkers int
+ ParallelReplicationWorkers int
+ TabletTypesStr string
+ EnableHttpLog bool // Enable the /debug/vrlog endpoint
+ MaxRowJSONBytes int64
// Config parameters applicable to the source side (vstreamer)
// The coresponding Override fields are used to determine if the user has provided a value for the parameter so
@@ -83,21 +98,22 @@ func GetVReplicationConfigDefaults(useCached bool) *VReplicationConfig {
return DefaultVReplicationConfig
}
DefaultVReplicationConfig = &VReplicationConfig{
- ExperimentalFlags: vreplicationExperimentalFlags,
- NetReadTimeout: vreplicationNetReadTimeout,
- NetWriteTimeout: vreplicationNetWriteTimeout,
- CopyPhaseDuration: vreplicationCopyPhaseDuration,
- RetryDelay: vreplicationRetryDelay,
- MaxTimeToRetryError: vreplicationMaxTimeToRetryError,
- RelayLogMaxSize: vreplicationRelayLogMaxSize,
- RelayLogMaxItems: vreplicationRelayLogMaxItems,
- ReplicaLagTolerance: vreplicationReplicaLagTolerance,
- HeartbeatUpdateInterval: vreplicationHeartbeatUpdateInterval,
- StoreCompressedGTID: vreplicationStoreCompressedGTID,
- ParallelInsertWorkers: vreplicationParallelInsertWorkers,
- TabletTypesStr: vreplicationTabletTypesStr,
- EnableHttpLog: vreplicationEnableHttpLog,
- MaxRowJSONBytes: vreplicationMaxRowJSONBytes,
+ ExperimentalFlags: vreplicationExperimentalFlags,
+ NetReadTimeout: vreplicationNetReadTimeout,
+ NetWriteTimeout: vreplicationNetWriteTimeout,
+ CopyPhaseDuration: vreplicationCopyPhaseDuration,
+ RetryDelay: vreplicationRetryDelay,
+ MaxTimeToRetryError: vreplicationMaxTimeToRetryError,
+ RelayLogMaxSize: vreplicationRelayLogMaxSize,
+ RelayLogMaxItems: vreplicationRelayLogMaxItems,
+ ReplicaLagTolerance: vreplicationReplicaLagTolerance,
+ HeartbeatUpdateInterval: vreplicationHeartbeatUpdateInterval,
+ StoreCompressedGTID: vreplicationStoreCompressedGTID,
+ ParallelInsertWorkers: vreplicationParallelInsertWorkers,
+ ParallelReplicationWorkers: cappedParallelReplicationWorkers(vreplicationParallelReplicationWorkers),
+ TabletTypesStr: vreplicationTabletTypesStr,
+ EnableHttpLog: vreplicationEnableHttpLog,
+ MaxRowJSONBytes: vreplicationMaxRowJSONBytes,
VStreamPacketSizeOverride: false,
VStreamPacketSize: VStreamerDefaultPacketSize,
@@ -133,7 +149,21 @@ func NewVReplicationConfig(overrides map[string]string) (*VReplicationConfig, er
getError := func(k, v string) string {
return fmt.Sprintf("invalid value for %s: %s", k, v)
}
- for k, v := range overrides {
+ // Iterate keys in sorted order so the resulting config is deterministic
+ // when the caller supplies both the hyphen and underscore variants of the
+ // same setting (e.g. `vstream-packet-size` and `vstream_packet_size`).
+ // Go map iteration is intentionally randomized; without sorting, last-
+ // write-wins would produce different results across runs of the same
+ // vttablet. ASCII '-' (0x2D) < '_' (0x5F), so hyphen variants are
+ // applied first and underscore variants override — matching the
+ // "UseEffectiveValues" behaviour exercised by the test suite.
+ keys := make([]string, 0, len(overrides))
+ for k := range overrides {
+ keys = append(keys, k)
+ }
+ slices.Sort(keys)
+ for _, k := range keys {
+ v := overrides[k]
if v == "" {
continue
}
@@ -194,6 +224,20 @@ func NewVReplicationConfig(overrides map[string]string) (*VReplicationConfig, er
} else {
c.RelayLogMaxItems = value
}
+ case "vreplication-parallel-replication-workers":
+ value, err := strconv.Atoi(v)
+ if err != nil {
+ errors = append(errors, getError(k, v))
+ } else if value < 0 {
+ // Negative values are never meaningful; the flag help text
+ // documents "<= 1 to disable parallelism" so 0 and 1 both
+ // fall through to the serial applier path.
+ errors = append(errors, fmt.Sprintf("invalid value for %s: %d (must be >= 0; 0 or 1 disables parallel apply)", k, value))
+ } else if value > maxParallelReplicationWorkers {
+ errors = append(errors, fmt.Sprintf("invalid value for %s: %d (must be at most %d; each worker holds two MySQL connections per workflow)", k, value, maxParallelReplicationWorkers))
+ } else {
+ c.ParallelReplicationWorkers = value
+ }
case "vreplication-replica-lag-tolerance":
value, err := time.ParseDuration(v)
if err != nil {
@@ -263,31 +307,57 @@ func NewVReplicationConfig(overrides map[string]string) (*VReplicationConfig, er
return c, nil
}
+// SourceOverrides returns only the vstreamer-side overrides that can be sent to source tablets.
+//
+// The vstream settings are emitted from the parsed field values, and only when
+// the matching *Override flag is set. A fixed list of vreplication-* keys is
+// copied verbatim from c.Overrides when a non-empty value is present. Keys not
+// handled here never appear in the result.
+func (c VReplicationConfig) SourceOverrides() map[string]string {
+	out := map[string]string{}
+
+	// Settings tracked through dedicated fields: report the effective value.
+	if c.VStreamBinlogRotationThresholdOverride {
+		out["vstream_binlog_rotation_threshold"] = strconv.FormatInt(c.VStreamBinlogRotationThreshold, 10)
+	}
+	if c.VStreamDynamicPacketSizeOverride {
+		out["vstream-dynamic-packet-size"] = strconv.FormatBool(c.VStreamDynamicPacketSize)
+	}
+	if c.VStreamPacketSizeOverride {
+		out["vstream-packet-size"] = strconv.Itoa(c.VStreamPacketSize)
+	}
+
+	// Settings forwarded as the raw override string. A missing key reads
+	// back as "", so one emptiness test covers both absent and empty values.
+	passthrough := [...]string{
+		"vreplication-experimental-flags",
+		"vreplication-net-read-timeout",
+		"vreplication-net-write-timeout",
+		"vreplication-copy-phase-duration",
+	}
+	for i := range passthrough {
+		name := passthrough[i]
+		if raw := c.Overrides[name]; raw != "" {
+			out[name] = raw
+		}
+	}
+	return out
+}
+
+
// Map returns a map of the VReplicationConfig: the keys are the flag names and the values are string representations.
// Used in tests to compare the expected and actual configuration values and in validations to check if the user-provided
// keys are one of those that are supported.
func (c VReplicationConfig) Map() map[string]string {
return map[string]string{
- "vreplication-experimental-flags": strconv.FormatInt(c.ExperimentalFlags, 10),
- "vreplication-net-read-timeout": strconv.Itoa(c.NetReadTimeout),
- "vreplication-net-write-timeout": strconv.Itoa(c.NetWriteTimeout),
- "vreplication-copy-phase-duration": c.CopyPhaseDuration.String(),
- "vreplication-retry-delay": c.RetryDelay.String(),
- "vreplication-max-time-to-retry-on-error": c.MaxTimeToRetryError.String(),
- "relay-log-max-size": strconv.Itoa(c.RelayLogMaxSize),
- "relay_log_max_size": strconv.Itoa(c.RelayLogMaxSize),
- "relay-log-max-items": strconv.Itoa(c.RelayLogMaxItems),
- "relay_log_max_items": strconv.Itoa(c.RelayLogMaxItems),
- "vreplication-replica-lag-tolerance": c.ReplicaLagTolerance.String(),
- "vreplication-heartbeat-update-interval": strconv.Itoa(c.HeartbeatUpdateInterval),
- "vreplication-store-compressed-gtid": strconv.FormatBool(c.StoreCompressedGTID),
- "vreplication-parallel-insert-workers": strconv.Itoa(c.ParallelInsertWorkers),
- "vstream-packet-size": strconv.Itoa(c.VStreamPacketSize),
- "vstream_packet_size": strconv.Itoa(c.VStreamPacketSize),
- "vstream-dynamic-packet-size": strconv.FormatBool(c.VStreamDynamicPacketSize),
- "vstream_dynamic_packet_size": strconv.FormatBool(c.VStreamDynamicPacketSize),
- "vstream_binlog_rotation_threshold": strconv.FormatInt(c.VStreamBinlogRotationThreshold, 10),
- "max-row-json-bytes": strconv.FormatInt(c.MaxRowJSONBytes, 10),
+ "vreplication-experimental-flags": strconv.FormatInt(c.ExperimentalFlags, 10),
+ "vreplication-net-read-timeout": strconv.Itoa(c.NetReadTimeout),
+ "vreplication-net-write-timeout": strconv.Itoa(c.NetWriteTimeout),
+ "vreplication-copy-phase-duration": c.CopyPhaseDuration.String(),
+ "vreplication-retry-delay": c.RetryDelay.String(),
+ "vreplication-max-time-to-retry-on-error": c.MaxTimeToRetryError.String(),
+ "relay-log-max-size": strconv.Itoa(c.RelayLogMaxSize),
+ "relay_log_max_size": strconv.Itoa(c.RelayLogMaxSize),
+ "relay-log-max-items": strconv.Itoa(c.RelayLogMaxItems),
+ "relay_log_max_items": strconv.Itoa(c.RelayLogMaxItems),
+ "vreplication-replica-lag-tolerance": c.ReplicaLagTolerance.String(),
+ "vreplication-heartbeat-update-interval": strconv.Itoa(c.HeartbeatUpdateInterval),
+ "vreplication-store-compressed-gtid": strconv.FormatBool(c.StoreCompressedGTID),
+ "vreplication-parallel-insert-workers": strconv.Itoa(c.ParallelInsertWorkers),
+ "vreplication-parallel-replication-workers": strconv.Itoa(c.ParallelReplicationWorkers),
+ "vstream-packet-size": strconv.Itoa(c.VStreamPacketSize),
+ "vstream_packet_size": strconv.Itoa(c.VStreamPacketSize),
+ "vstream-dynamic-packet-size": strconv.FormatBool(c.VStreamDynamicPacketSize),
+ "vstream_dynamic_packet_size": strconv.FormatBool(c.VStreamDynamicPacketSize),
+ "vstream_binlog_rotation_threshold": strconv.FormatInt(c.VStreamBinlogRotationThreshold, 10),
+ "max-row-json-bytes": strconv.FormatInt(c.MaxRowJSONBytes, 10),
}
}
@@ -295,3 +365,21 @@ func (c VReplicationConfig) String() string {
s, _ := json.Marshal(c.Map())
return string(s)
}
+
+// cappedParallelReplicationWorkers limits the tablet-wide flag value to
+// maxParallelReplicationWorkers. Values at or below the maximum are returned
+// unchanged; anything larger is replaced by the maximum, and a warning is
+// logged the first time that happens in the process. The per-workflow
+// override rejects out-of-range values with an error instead; the flag is
+// clamped so that an oversized value does not fail tablet startup.
+func cappedParallelReplicationWorkers(value int) int {
+	if value > maxParallelReplicationWorkers {
+		logCapWarning := func() {
+			log.Warn("--vreplication-parallel-replication-workers exceeds the maximum; capping",
+				slog.Int("requested", value),
+				slog.Int("max", maxParallelReplicationWorkers),
+			)
+		}
+		// sync.Once ensures the warning is emitted a single time.
+		warnParallelReplicationWorkersCap.Do(logCapWarning)
+		return maxParallelReplicationWorkers
+	}
+	return value
+}
diff --git a/go/vt/vttablet/common/config_test.go b/go/vt/vttablet/common/config_test.go
index a38eb537c78..a8d75159922 100644
--- a/go/vt/vttablet/common/config_test.go
+++ b/go/vt/vttablet/common/config_test.go
@@ -71,6 +71,7 @@ func TestNewVReplicationConfig(t *testing.T) {
HeartbeatUpdateInterval: 2,
StoreCompressedGTID: true,
ParallelInsertWorkers: 4,
+ ParallelReplicationWorkers: 1, // flag default
VStreamPacketSize: 1024,
VStreamDynamicPacketSize: false,
VStreamBinlogRotationThreshold: 2048,
@@ -125,6 +126,7 @@ func TestNewVReplicationConfig(t *testing.T) {
HeartbeatUpdateInterval: DefaultVReplicationConfig.HeartbeatUpdateInterval,
StoreCompressedGTID: !DefaultVReplicationConfig.StoreCompressedGTID,
ParallelInsertWorkers: DefaultVReplicationConfig.ParallelInsertWorkers,
+ ParallelReplicationWorkers: DefaultVReplicationConfig.ParallelReplicationWorkers,
VStreamPacketSize: DefaultVReplicationConfig.VStreamPacketSize,
VStreamDynamicPacketSize: !DefaultVReplicationConfig.VStreamDynamicPacketSize,
VStreamBinlogRotationThreshold: DefaultVReplicationConfig.VStreamBinlogRotationThreshold,
@@ -160,6 +162,85 @@ func TestNewVReplicationConfig(t *testing.T) {
}
}
+// TestVReplicationConfigSourceOverrides verifies that SourceOverrides keeps the
+// vstream settings and leaves out the parallel worker settings supplied in the
+// same override map.
+func TestVReplicationConfigSourceOverrides(t *testing.T) {
+	overrides := map[string]string{
+		"vreplication-parallel-replication-workers": "4",
+		"vreplication-parallel-insert-workers":      "8",
+		"vstream-packet-size":                       "1024",
+		"vstream_dynamic_packet_size":               "false",
+		"vstream_binlog_rotation_threshold":         "2048",
+	}
+	expected := map[string]string{
+		"vstream-packet-size":               "1024",
+		"vstream-dynamic-packet-size":       "false",
+		"vstream_binlog_rotation_threshold": "2048",
+	}
+
+	config, err := NewVReplicationConfig(overrides)
+	require.NoError(t, err)
+
+	actual := config.SourceOverrides()
+	require.Equal(t, expected, actual)
+}
+
+
+// TestVReplicationConfigSourceOverridesUseEffectiveValues supplies both the
+// hyphen and the underscore spelling of the packet-size setting with different
+// values and expects SourceOverrides to report the underscore variant's value
+// under the hyphenated key.
+func TestVReplicationConfigSourceOverridesUseEffectiveValues(t *testing.T) {
+	overrides := map[string]string{
+		"vstream-packet-size": "1024",
+		"vstream_packet_size": "2048",
+	}
+	expected := map[string]string{
+		"vstream-packet-size": "2048",
+	}
+
+	config, err := NewVReplicationConfig(overrides)
+	require.NoError(t, err)
+
+	actual := config.SourceOverrides()
+	require.Equal(t, expected, actual)
+}
+
+
+// TestVReplicationConfigRejectsInvalidParallelWorkers verifies that
+// negative worker counts are rejected at config-parse time. 0 is allowed
+// (documented as "disable parallel apply"); see
+// TestVReplicationConfigAcceptsZeroParallelReplicationWorkers below.
+func TestVReplicationConfigRejectsInvalidParallelWorkers(t *testing.T) {
+ for _, bad := range []string{"-1", "-5"} {
+ t.Run(bad, func(t *testing.T) {
+ _, err := NewVReplicationConfig(map[string]string{
+ "vreplication-parallel-replication-workers": bad,
+ })
+ require.Error(t, err, "expected error for invalid value %q", bad)
+ require.Contains(t, err.Error(), "vreplication-parallel-replication-workers")
+ require.Contains(t, err.Error(), "must be >= 0")
+ })
+ }
+}
+
+// TestVReplicationConfigAcceptsZeroParallelReplicationWorkers checks that an
+// override of "0" is accepted and stored as 0 in the resulting config.
+func TestVReplicationConfigAcceptsZeroParallelReplicationWorkers(t *testing.T) {
+	overrides := map[string]string{
+		"vreplication-parallel-replication-workers": "0",
+	}
+
+	config, err := NewVReplicationConfig(overrides)
+	require.NoError(t, err)
+
+	workers := config.ParallelReplicationWorkers
+	require.Equal(t, 0, workers)
+}
+
+
+// TestVReplicationConfigSourceOverridesIncludeSourceConsumedWorkflowKeys checks
+// that SourceOverrides forwards the experimental-flags, net read/write timeout
+// and copy-phase-duration overrides alongside the vstream settings, while the
+// parallel worker settings are still left out.
+func TestVReplicationConfigSourceOverridesIncludeSourceConsumedWorkflowKeys(t *testing.T) {
+	overrides := map[string]string{
+		"vreplication-experimental-flags":           "3",
+		"vreplication-net-read-timeout":             "123",
+		"vreplication-net-write-timeout":            "456",
+		"vreplication-copy-phase-duration":          "2h",
+		"vreplication-parallel-replication-workers": "4",
+		"vreplication-parallel-insert-workers":      "8",
+		"vstream-packet-size":                       "1024",
+		"vstream_dynamic_packet_size":               "false",
+		"vstream_binlog_rotation_threshold":         "2048",
+	}
+	expected := map[string]string{
+		"vreplication-experimental-flags":   "3",
+		"vreplication-net-read-timeout":     "123",
+		"vreplication-net-write-timeout":    "456",
+		"vreplication-copy-phase-duration":  "2h",
+		"vstream-packet-size":               "1024",
+		"vstream-dynamic-packet-size":       "false",
+		"vstream_binlog_rotation_threshold": "2048",
+	}
+
+	config, err := NewVReplicationConfig(overrides)
+	require.NoError(t, err)
+
+	actual := config.SourceOverrides()
+	require.Equal(t, expected, actual)
+}
+
+
func TestMaxRowJSONBytesOverride(t *testing.T) {
InitVReplicationConfigDefaults()
cfg, err := NewVReplicationConfig(map[string]string{"max-row-json-bytes": "1048576"})
@@ -200,3 +281,18 @@ func TestVReplicationMaxRowJSONBytesFlagRejectsNegative(t *testing.T) {
require.Error(t, err)
require.ErrorContains(t, err, "must be non-negative")
}
+
+// TestVReplicationConfigCapsParallelReplicationWorkers pins the upper bound:
+// the per-workflow override rejects values above the cap, and the tablet-wide
+// flag value is clamped (each worker holds two MySQL connections per
+// workflow, so an unbounded value could exhaust the target's max_connections).
+// The cap value itself (64) is exercised on both paths so that an off-by-one
+// in either comparison is caught.
+func TestVReplicationConfigCapsParallelReplicationWorkers(t *testing.T) {
+	// One above the cap is rejected by the per-workflow override.
+	_, err := NewVReplicationConfig(map[string]string{
+		"vreplication-parallel-replication-workers": "65",
+	})
+	require.Error(t, err)
+	require.Contains(t, err.Error(), "must be at most 64")
+
+	// Exactly the cap is still a valid override and is stored unchanged.
+	config, err := NewVReplicationConfig(map[string]string{
+		"vreplication-parallel-replication-workers": "64",
+	})
+	require.NoError(t, err)
+	require.Equal(t, 64, config.ParallelReplicationWorkers)
+
+	// The flag path clamps instead of failing; values at or below the cap
+	// pass through untouched.
+	require.Equal(t, 64, cappedParallelReplicationWorkers(1000))
+	require.Equal(t, 64, cappedParallelReplicationWorkers(64))
+	require.Equal(t, 4, cappedParallelReplicationWorkers(4))
+}
diff --git a/go/vt/vttablet/common/flags.go b/go/vt/vttablet/common/flags.go
index 89daa0bf4db..5c9eb9944af 100644
--- a/go/vt/vttablet/common/flags.go
+++ b/go/vt/vttablet/common/flags.go
@@ -81,9 +81,10 @@ var (
vreplicationHeartbeatUpdateInterval = 1
- vreplicationStoreCompressedGTID = false
- vreplicationParallelInsertWorkers = 1
- vreplicationMaxRowJSONBytes = int64(0)
+ vreplicationStoreCompressedGTID = false
+ vreplicationParallelInsertWorkers = 1
+ vreplicationParallelReplicationWorkers = 1
+ vreplicationMaxRowJSONBytes = int64(0)
// VStreamerBinlogRotationThreshold is the threshold, above which we rotate binlogs, before taking a GTID snapshot
VStreamerBinlogRotationThreshold = int64(64 * 1024 * 1024) // 64MiB
@@ -132,6 +133,8 @@ func registerFlags(fs *pflag.FlagSet) {
fs.IntVar(&vreplicationParallelInsertWorkers, "vreplication-parallel-insert-workers", vreplicationParallelInsertWorkers, "Number of parallel insertion workers to use during copy phase. Set <= 1 to disable parallelism, or > 1 to enable concurrent insertion during copy phase.")
+ fs.IntVar(&vreplicationParallelReplicationWorkers, "vreplication-parallel-replication-workers", vreplicationParallelReplicationWorkers, "Number of parallel replication workers to use during the replication phase. Set <= 1 to disable parallelism, or > 1 to enable concurrent apply.")
+
fs.Uint64Var(&mysql.ZstdInMemoryDecompressorMaxSize, "binlog-in-memory-decompressor-max-size", mysql.ZstdInMemoryDecompressorMaxSize, "This value sets the uncompressed transaction payload size at which we switch from in-memory buffer based decompression to the slower streaming mode.")
fs.BoolVar(&vreplicationEnableHttpLog, "vreplication-enable-http-log", vreplicationEnableHttpLog, "Enable the /debug/vrlog HTTP endpoint, which will produce a log of the events replicated on primary tablets in the target keyspace by all VReplication workflows that are in the running/replicating phase.")
diff --git a/go/vt/vttablet/common/flags_test.go b/go/vt/vttablet/common/flags_test.go
new file mode 100644
index 00000000000..f35c8b1e12e
--- /dev/null
+++ b/go/vt/vttablet/common/flags_test.go
@@ -0,0 +1,33 @@
+/*
+Copyright 2026 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package vttablet
+
+import (
+ "testing"
+
+ "github.com/spf13/pflag"
+ "github.com/stretchr/testify/require"
+)
+
+func TestRegisterFlags_ParallelReplicationWorkersUsage(t *testing.T) {
+ fs := pflag.NewFlagSet("test", pflag.ContinueOnError)
+ registerFlags(fs)
+
+ flag := fs.Lookup("vreplication-parallel-replication-workers")
+ require.NotNil(t, flag)
+ require.NotContains(t, flag.Usage, "Experimental")
+}
diff --git a/go/vt/vttablet/onlineddl/executor.go b/go/vt/vttablet/onlineddl/executor.go
index ff17d4c1e7a..db35cc1a01e 100644
--- a/go/vt/vttablet/onlineddl/executor.go
+++ b/go/vt/vttablet/onlineddl/executor.go
@@ -22,8 +22,10 @@ package onlineddl
import (
"context"
+ "encoding/json"
"errors"
"fmt"
+ "log/slog"
"os"
"slices"
"strconv"
@@ -56,6 +58,7 @@ import (
"vitess.io/vitess/go/vt/topo/topoproto"
"vitess.io/vitess/go/vt/utils"
"vitess.io/vitess/go/vt/vterrors"
+ vttablet "vitess.io/vitess/go/vt/vttablet/common"
"vitess.io/vitess/go/vt/vttablet/tabletmanager/vreplication"
"vitess.io/vitess/go/vt/vttablet/tabletserver/connpool"
"vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv"
@@ -66,6 +69,7 @@ import (
binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
querypb "vitess.io/vitess/go/vt/proto/query"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
+ vtctldatapb "vitess.io/vitess/go/vt/proto/vtctldata"
vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"
)
@@ -1028,6 +1032,12 @@ func (e *Executor) cutOverVReplMigration(ctx context.Context, s *VReplStream, sh
go log.Info(fmt.Sprintf("cutOverVReplMigration %v: unbuffered queries", s.workflow))
})
}
+
+ shouldWaitForParallelApply, err := shouldPreBufferWaitForParallelApply(s)
+ if err != nil {
+ return vterrors.Wrapf(err, "failed parsing vreplication workflow options before pre-buffer wait")
+ }
+
e.updateMigrationStage(ctx, onlineDDL.UUID, "buffering queries")
// stop writes on source:
err = toggleBuffering(true)
@@ -1075,6 +1085,23 @@ func (e *Executor) cutOverVReplMigration(ctx context.Context, s *VReplStream, sh
return vterrors.Wrapf(err, "failed locking tables")
}
+ if shouldWaitForParallelApply {
+ // With writes blocked on the original table, the stream can catch up without a queued
+ // RENAME holding conflicting metadata locks on the shadow table.
+ e.updateMigrationStage(ctx, onlineDDL.UUID, "post-lock: waiting for vreplication to catch up")
+ preRenamePos, err := e.primaryPosition(ctx)
+ if err != nil {
+ return vterrors.Wrapf(err, "failed reading primary position before renaming")
+ }
+ if s, err = e.readVReplStream(ctx, s.workflow, false); err != nil {
+ return vterrors.Wrapf(err, "failed reading vreplication stream before renaming")
+ }
+ if err := waitForPos(s, preRenamePos, onlineDDL.CutOverThreshold); err != nil {
+ return vterrors.Wrapf(err, "failed waiting for vreplication to catch up before renaming")
+ }
+ go log.Info("cutOverVReplMigration: post-lock waitForPos reached", slog.String("workflow", s.workflow), slog.String("position", replication.EncodePosition(preRenamePos)))
+ }
+
e.updateMigrationStage(ctx, onlineDDL.UUID, "renaming tables")
killWhileRenamingContext, killWhileRenamingCancel := context.WithCancel(ctx)
defer killWhileRenamingCancel()
@@ -1118,12 +1145,20 @@ func (e *Executor) cutOverVReplMigration(ctx context.Context, s *VReplStream, sh
return vterrors.Wrapf(err, "failed reading vreplication table after locking")
}
- e.updateMigrationStage(ctx, onlineDDL.UUID, "waiting for post-lock pos: %v", replication.EncodePosition(postWritesPos))
- if err := waitForPos(s, postWritesPos, onlineDDL.CutOverThreshold); err != nil {
- e.updateMigrationStage(ctx, onlineDDL.UUID, "timeout while waiting for post-lock pos: %v", err)
- return vterrors.Wrapf(err, "failed waiting for pos after locking")
+ // The parallel-apply pre-rename wait happens in-lock in the production
+ // branch above, so the post-lock wait is redundant there. The test-suite
+ // branch has no queued RENAME (and therefore no MDL conflict to avoid)
+ // but also never performed the in-lock wait, so it must still wait here —
+ // otherwise StopVReplication below can fire before the stream has caught
+ // up to postWritesPos and the cutover loses tail writes.
+ if !shouldWaitForParallelApply || isVreplicationTestSuite {
+ e.updateMigrationStage(ctx, onlineDDL.UUID, "waiting for post-lock pos: %v", replication.EncodePosition(postWritesPos))
+ if err := waitForPos(s, postWritesPos, onlineDDL.CutOverThreshold); err != nil {
+ e.updateMigrationStage(ctx, onlineDDL.UUID, "timeout while waiting for post-lock pos: %v", err)
+ return vterrors.Wrapf(err, "failed waiting for pos after locking")
+ }
+ go log.Info(fmt.Sprintf("cutOverVReplMigration %v: done waiting for position %v", s.workflow, replication.EncodePosition(postWritesPos)))
}
- go log.Info(fmt.Sprintf("cutOverVReplMigration %v: done waiting for position %v", s.workflow, replication.EncodePosition(postWritesPos)))
// Stop vreplication
e.updateMigrationStage(ctx, onlineDDL.UUID, "stopping vreplication")
if _, err := e.vreplicationExec(ctx, tablet.Tablet, binlogplayer.StopVReplication(s.id, "stopped for online DDL cutover")); err != nil {
@@ -1211,6 +1246,42 @@ func (e *Executor) cutOverVReplMigration(ctx context.Context, s *VReplStream, sh
// deferred function will re-enable writes now
}
+// shouldPreBufferWaitForParallelApply reports whether the VReplication stream
+// backing this migration runs with more than one parallel replication worker.
+//
+// The tablet-wide default applies when s is nil, when it carries no options,
+// or when its options do not set vreplication-parallel-replication-workers.
+// Otherwise the workflow-level value is validated through
+// vttablet.NewVReplicationConfig and decides the result. An error is returned
+// when the options JSON cannot be decoded or the override value is rejected.
+//
+// The cut-over path uses the result to decide whether it must wait for the
+// stream to catch up before tables are swapped.
+func shouldPreBufferWaitForParallelApply(s *VReplStream) (bool, error) {
+	const workersKey = "vreplication-parallel-replication-workers"
+
+	defaultIsParallel := vttablet.InitVReplicationConfigDefaults().ParallelReplicationWorkers > 1
+	if s == nil || s.options == "" {
+		return defaultIsParallel, nil
+	}
+
+	var parsed vtctldatapb.WorkflowOptions
+	if err := json.Unmarshal([]byte(s.options), &parsed); err != nil {
+		return false, err
+	}
+
+	// Looking up a key in an empty or nil Config map simply reports "absent",
+	// which falls back to the tablet-wide default.
+	override, present := parsed.Config[workersKey]
+	if !present {
+		return defaultIsParallel, nil
+	}
+
+	// Only the worker setting is passed on, so unrelated workflow options
+	// cannot influence (or fail) this decision.
+	effective, err := vttablet.NewVReplicationConfig(map[string]string{workersKey: override})
+	if err != nil {
+		return false, err
+	}
+	return effective.ParallelReplicationWorkers > 1, nil
+}
+
+
// initMigrationSQLMode sets sql_mode according to DDL strategy, and returns a function that
// restores sql_mode to original state
func (e *Executor) initMigrationSQLMode(ctx context.Context, onlineDDL *schema.OnlineDDL, conn *dbconnpool.DBConnection) (deferFunc func(), err error) {
@@ -3112,6 +3183,7 @@ func (e *Executor) readVReplStream(ctx context.Context, uuid string, okIfMissing
id: row.AsInt32("id", 0),
workflow: row.AsString("workflow", ""),
source: row.AsString("source", ""),
+ options: row.AsString("options", ""),
pos: row.AsString("pos", ""),
timeUpdated: row.AsInt64("time_updated", 0),
timeHeartbeat: row.AsInt64("time_heartbeat", 0),
diff --git a/go/vt/vttablet/onlineddl/executor_test.go b/go/vt/vttablet/onlineddl/executor_test.go
index 08485a083f9..46b359c07e1 100644
--- a/go/vt/vttablet/onlineddl/executor_test.go
+++ b/go/vt/vttablet/onlineddl/executor_test.go
@@ -22,6 +22,9 @@ package onlineddl
import (
"context"
+ "fmt"
+ "strings"
+ "sync"
"testing"
"time"
@@ -34,13 +37,20 @@ import (
"vitess.io/vitess/go/vt/dbconfigs"
"vitess.io/vitess/go/vt/dbconnpool"
"vitess.io/vitess/go/vt/schema"
+ "vitess.io/vitess/go/vt/topo"
"vitess.io/vitess/go/vt/topo/memorytopo"
"vitess.io/vitess/go/vt/vtenv"
+ "vitess.io/vitess/go/vt/vterrors"
+ vttablet "vitess.io/vitess/go/vt/vttablet/common"
+ "vitess.io/vitess/go/vt/vttablet/tabletserver/connpool"
"vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv"
"vitess.io/vitess/go/vt/vttablet/tmclient"
"vitess.io/vitess/go/vt/vttablet/tmclienttest"
+ binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
+ querypb "vitess.io/vitess/go/vt/proto/query"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
+ vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"
)
func TestShouldCutOverAccordingToBackoff(t *testing.T) {
@@ -349,6 +359,511 @@ func TestExecuteDirectlySetsLockWaitTimeout(t *testing.T) {
assert.Contains(t, queryLog, "set @@session.lock_wait_timeout=@lock_wait_timeout")
}
+// TestShouldPreBufferWaitForParallelApply covers the nil-stream case, where the
+// answer follows the default ParallelReplicationWorkers setting: one worker
+// yields false, two workers yield true. The default is restored on cleanup.
+func TestShouldPreBufferWaitForParallelApply(t *testing.T) {
+	defaults := vttablet.InitVReplicationConfigDefaults()
+	originalWorkers := defaults.ParallelReplicationWorkers
+	t.Cleanup(func() { defaults.ParallelReplicationWorkers = originalWorkers })
+
+	// One worker: no extra wait expected.
+	defaults.ParallelReplicationWorkers = 1
+	got, err := shouldPreBufferWaitForParallelApply(nil)
+	require.NoError(t, err)
+	require.False(t, got)
+
+	// Two workers: the extra wait is expected.
+	defaults.ParallelReplicationWorkers = 2
+	got, err = shouldPreBufferWaitForParallelApply(nil)
+	require.NoError(t, err)
+	require.True(t, got)
+}
+
+// TestShouldPreBufferWaitForParallelApplyPrefersWorkflowOverride checks that a
+// workflow-level worker count of 2 wins over a default of 1.
+func TestShouldPreBufferWaitForParallelApplyPrefersWorkflowOverride(t *testing.T) {
+	defaults := vttablet.InitVReplicationConfigDefaults()
+	originalWorkers := defaults.ParallelReplicationWorkers
+	t.Cleanup(func() { defaults.ParallelReplicationWorkers = originalWorkers })
+
+	defaults.ParallelReplicationWorkers = 1
+
+	overridden := &VReplStream{
+		bls:     &binlogdatapb.BinlogSource{},
+		options: `{"config":{"vreplication-parallel-replication-workers":"2"}}`,
+	}
+
+	got, err := shouldPreBufferWaitForParallelApply(overridden)
+	require.NoError(t, err)
+	require.True(t, got)
+}
+
+// TestShouldPreBufferWaitForParallelApplyRejectsInvalidWorkflowOverride checks
+// that a non-numeric workflow worker count surfaces as an error naming the
+// offending config key.
+func TestShouldPreBufferWaitForParallelApplyRejectsInvalidWorkflowOverride(t *testing.T) {
+	defaults := vttablet.InitVReplicationConfigDefaults()
+	originalWorkers := defaults.ParallelReplicationWorkers
+	t.Cleanup(func() { defaults.ParallelReplicationWorkers = originalWorkers })
+
+	defaults.ParallelReplicationWorkers = 2
+
+	malformed := &VReplStream{
+		bls:     &binlogdatapb.BinlogSource{},
+		options: `{"config":{"vreplication-parallel-replication-workers":"not-an-int"}}`,
+	}
+
+	_, err := shouldPreBufferWaitForParallelApply(malformed)
+	require.Error(t, err)
+	require.ErrorContains(t, err, "invalid value for vreplication-parallel-replication-workers")
+}
+
+// TestShouldPreBufferWaitForParallelApplyIgnoresUnknownWorkflowOverrideKeys
+// checks that workflow config entries other than the worker count neither
+// cause an error nor change the answer derived from the default (2 workers).
+func TestShouldPreBufferWaitForParallelApplyIgnoresUnknownWorkflowOverrideKeys(t *testing.T) {
+	defaults := vttablet.InitVReplicationConfigDefaults()
+	originalWorkers := defaults.ParallelReplicationWorkers
+	t.Cleanup(func() { defaults.ParallelReplicationWorkers = originalWorkers })
+
+	defaults.ParallelReplicationWorkers = 2
+
+	unrelated := &VReplStream{
+		bls:     &binlogdatapb.BinlogSource{},
+		options: `{"config":{"user":"admin","password":"secret"}}`,
+	}
+
+	got, err := shouldPreBufferWaitForParallelApply(unrelated)
+	require.NoError(t, err)
+	require.True(t, got)
+}
+
+// recordingTabletManagerClient is a test double for tmclient.TabletManagerClient.
+// It records the calls it receives and returns scripted results from
+// VReplicationWaitForPos.
+type recordingTabletManagerClient struct {
+ tmclient.TabletManagerClient
+
+ // mu guards all fields below.
+ mu sync.Mutex
+ // waitCalls holds the pos argument of every VReplicationWaitForPos call, in call order.
+ waitCalls []string
+ // waitErr is what VReplicationWaitForPos returns once waitErrs is empty.
+ waitErr error
+ // waitErrs is a queue of per-call results, consumed front first; a nil
+ // entry makes that call succeed.
+ waitErrs []error
+ // refreshStateCalled counts RefreshState invocations.
+ refreshStateCalled int
+}
+
+// Close is a no-op.
+func (c *recordingTabletManagerClient) Close() {}
+
+// ReloadSchema ignores its arguments and always reports success.
+func (c *recordingTabletManagerClient) ReloadSchema(ctx context.Context, tablet *topodatapb.Tablet, waitPosition string) error {
+ return nil
+}
+
+// RefreshState counts the call under the mutex and always reports success.
+func (c *recordingTabletManagerClient) RefreshState(ctx context.Context, tablet *topodatapb.Tablet) error {
+ c.mu.Lock()
+ defer c.mu.Unlock()
+ c.refreshStateCalled++
+ return nil
+}
+
+// VReplicationWaitForPos records pos and returns the next scripted result.
+// While waitErrs still has entries, the front entry is removed and returned
+// (a nil entry means success); once the queue is empty, waitErr is returned.
+func (c *recordingTabletManagerClient) VReplicationWaitForPos(ctx context.Context, tablet *topodatapb.Tablet, id int32, pos string) error {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	c.waitCalls = append(c.waitCalls, pos)
+	if len(c.waitErrs) == 0 {
+		return c.waitErr
+	}
+	next := c.waitErrs[0]
+	c.waitErrs = c.waitErrs[1:]
+	return next
+}
+
+// VReplicationExec ignores the query and returns an empty result with no error.
+func (c *recordingTabletManagerClient) VReplicationExec(ctx context.Context, tablet *topodatapb.Tablet, query string) (*querypb.QueryResult, error) {
+ return &querypb.QueryResult{}, nil
+}
+
+// WaitCalls returns a copy of the positions recorded by VReplicationWaitForPos,
+// taken under the mutex so callers can inspect it without further locking.
+func (c *recordingTabletManagerClient) WaitCalls() []string {
+ c.mu.Lock()
+ defer c.mu.Unlock()
+ return append([]string(nil), c.waitCalls...)
+}
+
+// RefreshStateCalled returns how many times RefreshState has been invoked.
+func (c *recordingTabletManagerClient) RefreshStateCalled() int {
+ c.mu.Lock()
+ defer c.mu.Unlock()
+ return c.refreshStateCalled
+}
+
+// newCutoverTestExecutor builds an Executor for cut-over tests on top of the
+// given fake database, topo server and tablet alias.
+//
+// The executor reports the prepared pool as never empty, and its execQuery hook
+// is stubbed by substring match on the lower-cased query:
+//   - "from _vt.schema_migrations" returns one running migration named after the test,
+//   - "from _vt.vreplication_log" returns an empty result with state/message fields,
+//   - "from _vt.vreplication" returns one running stream whose options set
+//     vreplication-parallel-replication-workers to 2,
+//   - anything else returns an empty result.
+//
+// A connection pool is opened against the fake database and closed through
+// t.Cleanup.
+func newCutoverTestExecutor(t *testing.T, db *fakesqldb.DB, ts *topo.Server, alias *topodatapb.TabletAlias) *Executor {
+ t.Helper()
+
+ cfg := tabletenv.NewDefaultConfig()
+ cfg.DB = dbconfigs.NewTestDBConfigs(*db.ConnParams(), *db.ConnParams(), db.ConnParams().DbName)
+ venv := vtenv.NewTestEnv()
+
+ executor := &Executor{
+ env: tabletenv.NewEnv(venv, cfg, "ExecutorTest"),
+ ts: ts,
+ tabletAlias: alias,
+ ticks: timer.NewTimer(migrationCheckInterval),
+ isPreparedPoolEmpty: func(tableName string) bool {
+ return false
+ },
+ }
+ executor.execQuery = func(ctx context.Context, query string) (*sqltypes.Result, error) {
+ loweredQuery := strings.ToLower(query)
+ switch {
+ case strings.Contains(loweredQuery, "from _vt.schema_migrations"):
+ return sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("migration_uuid|keyspace|mysql_table|mysql_schema|migration_statement|strategy|options|migration_status|retries|ready_to_complete|was_ready_to_complete|tablet|migration_context|cutover_threshold_seconds|shadow_analyzed_timestamp", "varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|int64|int64|int64|varchar|varchar|int64|varchar"),
+ strings.Join([]string{
+ t.Name(),
+ "ks",
+ "t1",
+ db.ConnParams().DbName,
+ "alter table t1 add column i int",
+ "vitess",
+ "-vreplication-test-suite",
+ "running",
+ "0",
+ "1",
+ "1",
+ "cell-0000000001",
+ "",
+ "5",
+ "null",
+ }, "|"),
+ ), nil
+ // The _log case must come before the plain _vt.vreplication case: the
+ // latter's substring also matches queries against the log table.
+ case strings.Contains(loweredQuery, "from _vt.vreplication_log"):
+ return &sqltypes.Result{Fields: sqltypes.MakeTestFields("state|message", "varchar|varchar")}, nil
+ case strings.Contains(loweredQuery, "from _vt.vreplication"):
+ return sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("id|workflow|source|options|pos|time_updated|transaction_timestamp|time_heartbeat|time_throttled|component_throttled|reason_throttled|state|message|rows_copied", "int64|varchar|varchar|varchar|varchar|int64|int64|int64|int64|varchar|varchar|varchar|varchar|int64"),
+ "1|"+t.Name()+"|keyspace:\"ks\" shard:\"0\" filter:{rules:{match:\"_vt_HOLD_"+t.Name()+"\"}}|{\"config\":{\"vreplication-parallel-replication-workers\":\"2\"}}|MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-4|1|1|1|0|||Running||10",
+ ), nil
+ default:
+ return &sqltypes.Result{}, nil
+ }
+ }
+ executor.pool = connpool.NewPool(executor.env, "OnlineDDLExecutorPoolTest", tabletenv.ConnPoolConfig{
+ Size: databasePoolSize,
+ IdleTimeout: executor.env.Config().OltpReadPool.IdleTimeout,
+ })
+ executor.pool.Open(executor.env.Config().DB.AppWithDB(), executor.env.Config().DB.DbaWithDB(), executor.env.Config().DB.AppDebugWithDB())
+ t.Cleanup(executor.pool.Close)
+
+ return executor
+}
+
+// TestCutOverVReplMigrationBuffersBeforeParallelApplyCatchUpWait runs
+// cutOverVReplMigration against a stream configured with two parallel workers
+// and a tablet manager client whose VReplicationWaitForPos always fails.
+//
+// It expects the cut-over to fail with "checking prepared pool for table"
+// (the test executor reports the prepared pool as never empty), after
+// buffering was switched on and then off again, RefreshState was called once,
+// and VReplicationWaitForPos was never reached.
+func TestCutOverVReplMigrationBuffersBeforeParallelApplyCatchUpWait(t *testing.T) {
+	ctx := t.Context()
+	db := fakesqldb.New(t)
+	defer db.Close()
+	protocolName := t.Name()
+	resetProtocol := tmclienttest.SetProtocol(t.Name(), protocolName)
+	defer resetProtocol()
+
+	// Any wait that does happen fails, so a premature wait would show up
+	// both as a recorded call and as a different error.
+	waitErr := vterrors.Errorf(vtrpcpb.Code_DEADLINE_EXCEEDED, "vreplication still catching up")
+	tmClient := &recordingTabletManagerClient{waitErr: waitErr}
+	tmclient.RegisterTabletManagerClientFactory(protocolName, func() tmclient.TabletManagerClient {
+		return tmClient
+	})
+
+	alias := &topodatapb.TabletAlias{Cell: "cell", Uid: 1}
+	ts := memorytopo.NewServer(ctx, "cell")
+	err := ts.CreateTablet(ctx, &topodatapb.Tablet{
+		Alias:    alias,
+		Keyspace: "ks",
+		Shard:    "0",
+		Type:     topodatapb.TabletType_PRIMARY,
+	})
+	require.NoError(t, err)
+
+	// Register the session-timeout save/set/restore statements for a given
+	// lock_wait_timeout value.
+	addSessionTimeoutQueries := func(lockWaitSeconds int64) {
+		db.AddQuery("set @lock_wait_timeout=@@session.lock_wait_timeout", &sqltypes.Result{})
+		db.AddQuery(fmt.Sprintf("set @@session.lock_wait_timeout=%d", lockWaitSeconds), &sqltypes.Result{})
+		db.AddQuery("set @wait_timeout=@@session.wait_timeout", &sqltypes.Result{})
+		db.AddQuery(fmt.Sprintf("set @@session.wait_timeout=%d", int64(waitTimeoutDuringCutOver.Seconds())), &sqltypes.Result{})
+		db.AddQuery("set @@session.wait_timeout=@wait_timeout", &sqltypes.Result{})
+		db.AddQuery("set @@session.lock_wait_timeout=@lock_wait_timeout", &sqltypes.Result{})
+	}
+	addSessionTimeoutQueries(15)
+	addSessionTimeoutQueries(10)
+
+	db.AddQuery("show global variables like 'rename_table_preserve_foreign_key'", &sqltypes.Result{})
+	db.AddQuery("SELECT @@global.gtid_executed", sqltypes.MakeTestResult(
+		sqltypes.MakeTestFields("@@global.gtid_executed", "varchar"),
+		"3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+	))
+	db.AddQueryPattern(`(?is)^drop table if exists .*`, &sqltypes.Result{})
+	db.AddQueryPattern(`(?is)^unlock tables$`, &sqltypes.Result{})
+	db.AddQueryPattern(`(?is)^kill \d+$`, &sqltypes.Result{})
+
+	executor := newCutoverTestExecutor(t, db, ts, alias)
+
+	// Record every buffering toggle in order; guarded by a mutex because the
+	// hook may be invoked from another goroutine.
+	bufferEvents := []bool{}
+	var bufferMu sync.Mutex
+	executor.toggleBufferTableFunc = func(cancelCtx context.Context, tableName string, timeout time.Duration, bufferQueries bool) {
+		bufferMu.Lock()
+		defer bufferMu.Unlock()
+		bufferEvents = append(bufferEvents, bufferQueries)
+	}
+
+	stream := &VReplStream{
+		id:       1,
+		workflow: t.Name(),
+		options:  `{"config":{"vreplication-parallel-replication-workers":"2"}}`,
+		bls: &binlogdatapb.BinlogSource{
+			Filter: &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: "_vt_HOLD_" + t.Name()}}},
+		},
+	}
+
+	err = executor.cutOverVReplMigration(ctx, stream, true)
+	require.Error(t, err)
+	require.ErrorContains(t, err, "checking prepared pool for table")
+
+	bufferMu.Lock()
+	bufferEventsCopy := append([]bool(nil), bufferEvents...)
+	bufferMu.Unlock()
+	assert.Equal(t, []bool{true, false}, bufferEventsCopy)
+	assert.Equal(t, 1, tmClient.RefreshStateCalled())
+	// assert.Empty is the idiomatic zero-length check and prints the
+	// unexpected calls themselves on failure.
+	assert.Empty(t, tmClient.WaitCalls())
+}
+
+// TestCutOverVReplMigrationWaitsForParallelApplyAfterLocking runs
+// cutOverVReplMigration against a stream configured with two parallel workers
+// where the first VReplicationWaitForPos call succeeds and the second fails.
+//
+// It expects the cut-over to fail with "failed waiting for vreplication to
+// catch up before renaming", with buffering switched on and then off,
+// RefreshState called once, exactly two wait calls recorded, and no
+// "rename table" statement in the fake database's query log.
+func TestCutOverVReplMigrationWaitsForParallelApplyAfterLocking(t *testing.T) {
+	ctx := t.Context()
+	db := fakesqldb.New(t)
+	defer db.Close()
+	params := db.ConnParams()
+
+	protocolName := t.Name()
+	resetProtocol := tmclienttest.SetProtocol(t.Name(), protocolName)
+	defer resetProtocol()
+
+	// Scripted wait results: first call succeeds, second call times out.
+	waitErr := vterrors.Errorf(vtrpcpb.Code_DEADLINE_EXCEEDED, "vreplication still catching up")
+	tmClient := &recordingTabletManagerClient{waitErrs: []error{nil, waitErr}}
+	tmclient.RegisterTabletManagerClientFactory(protocolName, func() tmclient.TabletManagerClient {
+		return tmClient
+	})
+
+	alias := &topodatapb.TabletAlias{Cell: "cell", Uid: 1}
+	ts := memorytopo.NewServer(ctx, "cell")
+	err := ts.CreateTablet(ctx, &topodatapb.Tablet{
+		Alias:    alias,
+		Keyspace: "ks",
+		Shard:    "0",
+		Type:     topodatapb.TabletType_PRIMARY,
+	})
+	require.NoError(t, err)
+
+	// Register the session-timeout save/set/restore statements for a given
+	// lock_wait_timeout value.
+	addSessionTimeoutQueries := func(lockWaitSeconds int64) {
+		db.AddQuery("set @lock_wait_timeout=@@session.lock_wait_timeout", &sqltypes.Result{})
+		db.AddQuery(fmt.Sprintf("set @@session.lock_wait_timeout=%d", lockWaitSeconds), &sqltypes.Result{})
+		db.AddQuery("set @wait_timeout=@@session.wait_timeout", &sqltypes.Result{})
+		db.AddQuery(fmt.Sprintf("set @@session.wait_timeout=%d", int64(waitTimeoutDuringCutOver.Seconds())), &sqltypes.Result{})
+		db.AddQuery("set @@session.wait_timeout=@wait_timeout", &sqltypes.Result{})
+		db.AddQuery("set @@session.lock_wait_timeout=@lock_wait_timeout", &sqltypes.Result{})
+	}
+	addSessionTimeoutQueries(15)
+	addSessionTimeoutQueries(10)
+
+	db.AddQuery("show global variables like 'rename_table_preserve_foreign_key'", &sqltypes.Result{})
+	db.AddQuery("SELECT @@global.gtid_executed", sqltypes.MakeTestResult(
+		sqltypes.MakeTestFields("@@global.gtid_executed", "varchar"),
+		"3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+	))
+	db.AddQueryPattern(`(?is)^update _vt\.schema_migrations set artifacts=.*$`, &sqltypes.Result{})
+	db.AddQueryPattern(`(?is)^drop table if exists .*`, &sqltypes.Result{})
+	db.AddQueryPattern(`(?is)^create table if not exists .*`, &sqltypes.Result{})
+	db.AddQueryPattern(`(?is)^lock tables .*`, &sqltypes.Result{})
+	db.AddQueryPattern(`(?is)^unlock tables$`, &sqltypes.Result{})
+	db.AddQueryPattern(`(?is)^kill \d+$`, &sqltypes.Result{})
+
+	executor := newCutoverTestExecutor(t, db, ts, alias)
+	// Replace the helper's execQuery stub: the migration row here carries
+	// empty options and a "done" shadow_analyzed_timestamp column, and the
+	// processlist query returns no rows.
+	executor.execQuery = func(ctx context.Context, query string) (*sqltypes.Result, error) {
+		loweredQuery := strings.ToLower(query)
+		switch {
+		case strings.Contains(loweredQuery, "from _vt.schema_migrations"):
+			return sqltypes.MakeTestResult(
+				sqltypes.MakeTestFields("migration_uuid|keyspace|mysql_table|mysql_schema|migration_statement|strategy|options|migration_status|retries|ready_to_complete|was_ready_to_complete|tablet|migration_context|cutover_threshold_seconds|shadow_analyzed_timestamp", "varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|int64|int64|int64|varchar|varchar|int64|varchar"),
+				strings.Join([]string{
+					t.Name(),
+					"ks",
+					"t1",
+					params.DbName,
+					"alter table t1 add column i int",
+					"vitess",
+					"",
+					"running",
+					"0",
+					"1",
+					"1",
+					"cell-0000000001",
+					"",
+					"5",
+					"done",
+				}, "|"),
+			), nil
+		case strings.Contains(loweredQuery, "select id, info as info from information_schema.processlist"):
+			return &sqltypes.Result{Fields: sqltypes.MakeTestFields("id|info", "int64|varchar")}, nil
+		// The _log case must come before the plain _vt.vreplication case:
+		// the latter's substring also matches queries against the log table.
+		case strings.Contains(loweredQuery, "from _vt.vreplication_log"):
+			return &sqltypes.Result{Fields: sqltypes.MakeTestFields("state|message", "varchar|varchar")}, nil
+		case strings.Contains(loweredQuery, "from _vt.vreplication"):
+			return sqltypes.MakeTestResult(
+				sqltypes.MakeTestFields("id|workflow|source|options|pos|time_updated|transaction_timestamp|time_heartbeat|time_throttled|component_throttled|reason_throttled|state|message|rows_copied", "int64|varchar|varchar|varchar|varchar|int64|int64|int64|int64|varchar|varchar|varchar|varchar|int64"),
+				"1|"+t.Name()+"|keyspace:\"ks\" shard:\"0\" filter:{rules:{match:\"_vt_HOLD_"+t.Name()+"\"}}|{\"config\":{\"vreplication-parallel-replication-workers\":\"2\"}}|MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-4|1|1|1|0|||Running||10",
+			), nil
+		default:
+			return &sqltypes.Result{}, nil
+		}
+	}
+
+	// Record every buffering toggle in order; guarded by a mutex because the
+	// hook may be invoked from another goroutine.
+	bufferEvents := []bool{}
+	var bufferMu sync.Mutex
+	executor.toggleBufferTableFunc = func(cancelCtx context.Context, tableName string, timeout time.Duration, bufferQueries bool) {
+		bufferMu.Lock()
+		defer bufferMu.Unlock()
+		bufferEvents = append(bufferEvents, bufferQueries)
+	}
+
+	stream := &VReplStream{
+		id:       1,
+		workflow: t.Name(),
+		options:  `{"config":{"vreplication-parallel-replication-workers":"2"}}`,
+		bls: &binlogdatapb.BinlogSource{
+			Filter: &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: "_vt_HOLD_" + t.Name()}}},
+		},
+	}
+
+	err = executor.cutOverVReplMigration(ctx, stream, false)
+	require.Error(t, err)
+	require.ErrorContains(t, err, "failed waiting for vreplication to catch up before renaming")
+
+	bufferMu.Lock()
+	bufferEventsCopy := append([]bool(nil), bufferEvents...)
+	bufferMu.Unlock()
+	assert.Equal(t, []bool{true, false}, bufferEventsCopy)
+	assert.Equal(t, 1, tmClient.RefreshStateCalled())
+	assert.Len(t, tmClient.WaitCalls(), 2)
+	// Lower-case the log before the negative check, as the sibling
+	// success-path test does, so the assertion cannot pass merely because
+	// the statement was logged in a different letter case.
+	assert.NotContains(t, strings.ToLower(db.QueryLog()), "rename table")
+}
+
+// TestCutOverVReplMigrationSkipsSecondPostLockWaitAfterParallelApplyCatchUp runs
+// cutOverVReplMigration against a stream configured with two parallel workers
+// where the first two VReplicationWaitForPos calls succeed and a third would
+// fail.
+//
+// It expects the cut-over to succeed with buffering switched on and then off,
+// RefreshState called once, exactly two wait calls recorded (so the failing
+// third result is never consumed), and a "rename table" statement present in
+// the lower-cased query log.
+func TestCutOverVReplMigrationSkipsSecondPostLockWaitAfterParallelApplyCatchUp(t *testing.T) {
+ ctx := t.Context()
+ db := fakesqldb.New(t)
+ defer db.Close()
+ params := db.ConnParams()
+
+ protocolName := t.Name()
+ resetProtocol := tmclienttest.SetProtocol(t.Name(), protocolName)
+ defer resetProtocol()
+
+ // Scripted wait results: two successes, then an error that only an
+ // unexpected third wait would hit.
+ waitErr := vterrors.Errorf(vtrpcpb.Code_DEADLINE_EXCEEDED, "unexpected extra wait after parallel apply catch up")
+ tmClient := &recordingTabletManagerClient{waitErrs: []error{nil, nil, waitErr}}
+ tmclient.RegisterTabletManagerClientFactory(protocolName, func() tmclient.TabletManagerClient {
+ return tmClient
+ })
+
+ alias := &topodatapb.TabletAlias{Cell: "cell", Uid: 1}
+ ts := memorytopo.NewServer(ctx, "cell")
+ err := ts.CreateTablet(ctx, &topodatapb.Tablet{
+ Alias: alias,
+ Keyspace: "ks",
+ Shard: "0",
+ Type: topodatapb.TabletType_PRIMARY,
+ })
+ require.NoError(t, err)
+
+ // Register the session-timeout save/set/restore statements for a given
+ // lock_wait_timeout value.
+ addSessionTimeoutQueries := func(lockWaitSeconds int64) {
+ db.AddQuery("set @lock_wait_timeout=@@session.lock_wait_timeout", &sqltypes.Result{})
+ db.AddQuery(fmt.Sprintf("set @@session.lock_wait_timeout=%d", lockWaitSeconds), &sqltypes.Result{})
+ db.AddQuery("set @wait_timeout=@@session.wait_timeout", &sqltypes.Result{})
+ db.AddQuery(fmt.Sprintf("set @@session.wait_timeout=%d", int64(waitTimeoutDuringCutOver.Seconds())), &sqltypes.Result{})
+ db.AddQuery("set @@session.wait_timeout=@wait_timeout", &sqltypes.Result{})
+ db.AddQuery("set @@session.lock_wait_timeout=@lock_wait_timeout", &sqltypes.Result{})
+ }
+ addSessionTimeoutQueries(15)
+ addSessionTimeoutQueries(10)
+
+ db.AddQuery("show global variables like 'rename_table_preserve_foreign_key'", &sqltypes.Result{})
+ db.AddQuery("SELECT @@global.gtid_executed", sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("@@global.gtid_executed", "varchar"),
+ "3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ ))
+ db.AddQueryPattern(`(?is)^update _vt\.schema_migrations set artifacts=.*$`, &sqltypes.Result{})
+ db.AddQueryPattern(`(?is)^drop table if exists .*`, &sqltypes.Result{})
+ db.AddQueryPattern(`(?is)^create table if not exists .*`, &sqltypes.Result{})
+ db.AddQueryPattern(`(?is)^lock tables .*`, &sqltypes.Result{})
+ db.AddQueryPattern(`(?is)^rename table .*`, &sqltypes.Result{})
+ db.AddQueryPattern(`(?is)^drop table .*`, &sqltypes.Result{})
+ db.AddQueryPattern(`(?is)^unlock tables$`, &sqltypes.Result{})
+ db.AddQueryPattern(`(?is)^kill \d+$`, &sqltypes.Result{})
+
+ executor := newCutoverTestExecutor(t, db, ts, alias)
+ // Replace the helper's execQuery stub: the migration row here carries
+ // empty options, and the processlist query reports one in-flight rename.
+ executor.execQuery = func(ctx context.Context, query string) (*sqltypes.Result, error) {
+ loweredQuery := strings.ToLower(query)
+ switch {
+ case strings.Contains(loweredQuery, "from _vt.schema_migrations"):
+ return sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("migration_uuid|keyspace|mysql_table|mysql_schema|migration_statement|strategy|options|migration_status|retries|ready_to_complete|was_ready_to_complete|tablet|migration_context|cutover_threshold_seconds|shadow_analyzed_timestamp", "varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|int64|int64|int64|varchar|varchar|int64|varchar"),
+ strings.Join([]string{
+ t.Name(),
+ "ks",
+ "t1",
+ params.DbName,
+ "alter table t1 add column i int",
+ "vitess",
+ "",
+ "running",
+ "0",
+ "1",
+ "1",
+ "cell-0000000001",
+ "",
+ "5",
+ "done",
+ }, "|"),
+ ), nil
+ case strings.Contains(loweredQuery, "select id, info as info from information_schema.processlist"):
+ return sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("id|info", "int64|varchar"),
+ "3|rename table `t1` to `_vt_hld_dummy`",
+ ), nil
+ // The _log case must come before the plain _vt.vreplication case: the
+ // latter's substring also matches queries against the log table.
+ case strings.Contains(loweredQuery, "from _vt.vreplication_log"):
+ return &sqltypes.Result{Fields: sqltypes.MakeTestFields("state|message", "varchar|varchar")}, nil
+ case strings.Contains(loweredQuery, "from _vt.vreplication"):
+ return sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("id|workflow|source|options|pos|time_updated|transaction_timestamp|time_heartbeat|time_throttled|component_throttled|reason_throttled|state|message|rows_copied", "int64|varchar|varchar|varchar|varchar|int64|int64|int64|int64|varchar|varchar|varchar|varchar|int64"),
+ "1|"+t.Name()+"|keyspace:\"ks\" shard:\"0\" filter:{rules:{match:\"_vt_HOLD_"+t.Name()+"\"}}|{\"config\":{\"vreplication-parallel-replication-workers\":\"2\"}}|MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-4|1|1|1|0|||Running||10",
+ ), nil
+ default:
+ return &sqltypes.Result{}, nil
+ }
+ }
+
+ // Record every buffering toggle in order, under a mutex.
+ bufferEvents := []bool{}
+ var bufferMu sync.Mutex
+ executor.toggleBufferTableFunc = func(cancelCtx context.Context, tableName string, timeout time.Duration, bufferQueries bool) {
+ bufferMu.Lock()
+ defer bufferMu.Unlock()
+ bufferEvents = append(bufferEvents, bufferQueries)
+ }
+
+ stream := &VReplStream{
+ id: 1,
+ workflow: t.Name(),
+ options: `{"config":{"vreplication-parallel-replication-workers":"2"}}`,
+ bls: &binlogdatapb.BinlogSource{
+ Filter: &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: "_vt_HOLD_" + t.Name()}}},
+ },
+ }
+
+ err = executor.cutOverVReplMigration(ctx, stream, false)
+ require.NoError(t, err)
+
+ bufferMu.Lock()
+ bufferEventsCopy := append([]bool(nil), bufferEvents...)
+ bufferMu.Unlock()
+ assert.Equal(t, []bool{true, false}, bufferEventsCopy)
+ assert.Equal(t, 1, tmClient.RefreshStateCalled())
+ assert.Len(t, tmClient.WaitCalls(), 2)
+ assert.Contains(t, strings.ToLower(db.QueryLog()), "rename table")
+}
+
type fakeTabletManagerClient struct {
tmclient.TabletManagerClient
}
diff --git a/go/vt/vttablet/onlineddl/schema.go b/go/vt/vttablet/onlineddl/schema.go
index f6fc0606efc..9210eed6bd7 100644
--- a/go/vt/vttablet/onlineddl/schema.go
+++ b/go/vt/vttablet/onlineddl/schema.go
@@ -529,6 +529,7 @@ const (
id,
workflow,
source,
+ options,
pos,
time_updated,
transaction_timestamp,
diff --git a/go/vt/vttablet/onlineddl/vrepl.go b/go/vt/vttablet/onlineddl/vrepl.go
index 18fa34790f4..001b9a690b9 100644
--- a/go/vt/vttablet/onlineddl/vrepl.go
+++ b/go/vt/vttablet/onlineddl/vrepl.go
@@ -50,6 +50,7 @@ type VReplStream struct {
id int32
workflow string
source string
+ options string
pos string
timeUpdated int64
timeHeartbeat int64
diff --git a/go/vt/vttablet/tabletmanager/restore_test.go b/go/vt/vttablet/tabletmanager/restore_test.go
index 7eabaa6b584..f5b059809f1 100644
--- a/go/vt/vttablet/tabletmanager/restore_test.go
+++ b/go/vt/vttablet/tabletmanager/restore_test.go
@@ -17,6 +17,7 @@ limitations under the License.
package tabletmanager
import (
+ "context"
"errors"
"os"
"path/filepath"
@@ -153,3 +154,50 @@ func TestInvokeRestoreDoneHook_Timestamps(t *testing.T) {
}
}
}
+
+// TestDisableReplicationRecoversFromRecoverableReplicationInitError verifies that
+// disableReplication succeeds when the first SetReplicationSource attempt
+// returns recoverableReplicationInitError() and the second attempt succeeds.
+//
+// Every attempt must be made with host "//", port 0, heartbeat 0 and both
+// stop/start flags false. The fake daemon's expected super-query list pins the
+// full statement sequence, including the repeated reset-and-set pair.
+func TestDisableReplicationRecoversFromRecoverableReplicationInitError(t *testing.T) {
+	daemon := newTestMysqlDaemon(t, 1)
+	daemon.ExpectedExecuteSuperQueryList = []string{
+		"STOP REPLICA",
+		"FAKE RESET REPLICA ALL",
+		"FAKE SET SOURCE",
+		"FAKE RESET REPLICA ALL",
+		"FAKE SET SOURCE",
+		"START REPLICA",
+	}
+
+	var attempts int
+	daemon.SetReplicationSourceFunc = func(ctx context.Context, host string, port int32, heartbeatInterval float64, stopReplicationBefore bool, startReplicationAfter bool) error {
+		attempts++
+
+		require.Equal(t, "//", host)
+		require.Zero(t, port)
+		require.Zero(t, heartbeatInterval)
+		require.False(t, stopReplicationBefore)
+		require.False(t, startReplicationAfter)
+
+		switch attempts {
+		case 1:
+			// First attempt: consume the expected statement, then fail recoverably.
+			require.NoError(t, daemon.ExecuteSuperQueryList(ctx, []string{"FAKE SET SOURCE"}))
+			return recoverableReplicationInitError()
+		case 2:
+			// Retry: consume the statement again and record the new source.
+			require.NoError(t, daemon.ExecuteSuperQueryList(ctx, []string{"FAKE SET SOURCE"}))
+			daemon.CurrentSourceHost = host
+			daemon.CurrentSourcePort = port
+			return nil
+		default:
+			return errors.New("unexpected SetReplicationSource call")
+		}
+	}
+
+	tm := newTestTabletManager(t)
+	tm.MysqlDaemon = daemon
+
+	err := tm.disableReplication(t.Context())
+	require.NoError(t, err)
+	require.Equal(t, 2, attempts)
+	require.Equal(t, "//", daemon.CurrentSourceHost)
+	require.Zero(t, daemon.CurrentSourcePort)
+	require.NoError(t, daemon.CheckSuperQueryList())
+}
diff --git a/go/vt/vttablet/tabletmanager/vreplication/controller.go b/go/vt/vttablet/tabletmanager/vreplication/controller.go
index 2c02073ddb6..8da1d829261 100644
--- a/go/vt/vttablet/tabletmanager/vreplication/controller.go
+++ b/go/vt/vttablet/tabletmanager/vreplication/controller.go
@@ -344,7 +344,16 @@ func (ct *controller) runBlp(ctx context.Context) (err error) {
// non-recoverable BUT it has persisted beyond the retry limit
// (maxTimeToRetryError). In addition, we cannot restart a workflow
// started with AtomicCopy which has _any_ error during copy phase.
- if (err != nil && vr.WorkflowSubType == int32(binlogdatapb.VReplicationWorkflowSubType_AtomicCopy) && vr.state == binlogdatapb.VReplicationWorkflowState_Copying) ||
+ // The copy-phase check consults both signals: vr.getState() is only
+ // updated by setState calls, and AtomicCopy's copy path (copyAll)
+ // never calls setState(Copying) — only initTablesForCopy does, on
+ // first start — so after a tablet restart the in-memory state stays
+ // at its zero value for the whole remaining copy and a copy-phase
+ // error would be misclassified as retryable. isInCopyPhase() is
+ // refreshed from the durable _vt.copy_state contents on every
+ // loadSettings call and covers the restarted-copy case.
+ if (err != nil && vr.WorkflowSubType == int32(binlogdatapb.VReplicationWorkflowSubType_AtomicCopy) &&
+ (vr.getState() == binlogdatapb.VReplicationWorkflowState_Copying || vr.isInCopyPhase())) ||
isUnrecoverableError(err) ||
!ct.lastWorkflowError.ShouldRetry() {
err = vterrors.Wrapf(err, TerminalErrorIndicator)
diff --git a/go/vt/vttablet/tabletmanager/vreplication/framework_test.go b/go/vt/vttablet/tabletmanager/vreplication/framework_test.go
index be8a314873e..f1c640e5601 100644
--- a/go/vt/vttablet/tabletmanager/vreplication/framework_test.go
+++ b/go/vt/vttablet/tabletmanager/vreplication/framework_test.go
@@ -18,6 +18,7 @@ package vreplication
import (
"context"
+ "errors"
"fmt"
"io"
"os"
@@ -225,6 +226,9 @@ func TestMain(m *testing.M) {
return ret
}
cancel()
+ if testing.Short() {
+ return 0
+ }
runNoBlobTest = true
if err := utils.SetBinlogRowImageOptions("noblob", runPartialJSONTest, tempDir); err != nil {
@@ -247,7 +251,7 @@ func resetBinlogClient() {
func primaryPosition(t *testing.T) string {
t.Helper()
- pos, err := env.Mysqld.PrimaryPosition(t.Context())
+ pos, err := env.Mysqld.PrimaryPosition(context.Background())
require.NoError(t, err)
return replication.EncodePosition(pos)
}
@@ -261,7 +265,7 @@ func primaryPositionParsed(t *testing.T) replication.Position {
func execStatements(t *testing.T, queries []string) {
t.Helper()
- if err := env.Mysqld.ExecuteSuperQueryList(t.Context(), queries); err != nil {
+ if err := env.Mysqld.ExecuteSuperQueryList(context.Background(), queries); err != nil {
log.Error("Error executing query: " + err.Error())
assert.NoError(t, err)
}
@@ -278,6 +282,174 @@ func execConnStatements(t *testing.T, conn *dbconnpool.DBConnection, queries []s
}
}
+// TestShortModeHarnessInitialized runs only under -short and checks that the
+// package-level env, playerEngine and streamerEngine have been set up.
+func TestShortModeHarnessInitialized(t *testing.T) {
+ if !testing.Short() {
+ t.Skip("short-mode only")
+ }
+
+ require.NotNil(t, env)
+ require.NotNil(t, playerEngine)
+ require.NotNil(t, streamerEngine)
+}
+
+// TestFakeTabletConnVStreamRowsForwardsOptions streams a two-row table through
+// fakeTabletConn.VStreamRows with config overrides that disable dynamic packet
+// sizing and set the packet size to 10, and expects each row to arrive in a
+// packet of its own.
+func TestFakeTabletConnVStreamRowsForwardsOptions(t *testing.T) {
+	execStatements(t, []string{
+		"create table vstream_rows_options(id int, val varbinary(9), primary key(id))",
+		"insert into vstream_rows_options values (1, '123456789'), (2, '123456789')",
+	})
+	t.Cleanup(func() {
+		execStatements(t, []string{"drop table vstream_rows_options"})
+	})
+
+	request := &binlogdatapb.VStreamRowsRequest{
+		Query: "select * from vstream_rows_options",
+		Options: &binlogdatapb.VStreamOptions{
+			ConfigOverrides: map[string]string{
+				"vstream-dynamic-packet-size": "false",
+				"vstream-packet-size":         "10",
+			},
+		},
+	}
+
+	// Row count of every non-empty packet, in arrival order.
+	var rowsPerPacket []int
+	collect := func(rows *binlogdatapb.VStreamRowsResponse) error {
+		if n := len(rows.Rows); n > 0 {
+			rowsPerPacket = append(rowsPerPacket, n)
+		}
+		return nil
+	}
+
+	err := (&fakeTabletConn{}).VStreamRows(context.Background(), request, collect)
+	require.NoError(t, err)
+	require.Equal(t, []int{1, 1}, rowsPerPacket)
+}
+
+// TestFakeTabletConnVStreamForwardsOptions streams two tables through
+// fakeTabletConn.VStream with TablesToCopy naming only the first one. It
+// collects the table names seen in FIELD, ROW and LASTPK events until
+// COPY_COMPLETED arrives, and expects the first table to appear and the second
+// not to.
+func TestFakeTabletConnVStreamForwardsOptions(t *testing.T) {
+ execStatements(t, []string{
+ "create table vstream_options_t1(id int, primary key(id))",
+ "create table vstream_options_t2(id int, primary key(id))",
+ "insert into vstream_options_t1 values (1)",
+ "insert into vstream_options_t2 values (2)",
+ })
+ t.Cleanup(func() {
+ execStatements(t, []string{
+ "drop table vstream_options_t1",
+ "drop table vstream_options_t2",
+ })
+ })
+
+ copiedTables := map[string]bool{}
+ err := (&fakeTabletConn{tablet: &topodatapb.Tablet{Alias: &topodatapb.TabletAlias{Uid: 100}}}).VStream(context.Background(), &binlogdatapb.VStreamRequest{
+ Target: &querypb.Target{Keyspace: "vttest"},
+ Filter: &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{
+ Match: "vstream_options_t1",
+ Filter: "select * from vstream_options_t1",
+ }, {
+ Match: "vstream_options_t2",
+ Filter: "select * from vstream_options_t2",
+ }}},
+ Options: &binlogdatapb.VStreamOptions{TablesToCopy: []string{"vstream_options_t1"}},
+ }, func(evs []*binlogdatapb.VEvent) error {
+ for _, ev := range evs {
+ switch ev.Type {
+ case binlogdatapb.VEventType_FIELD:
+ copiedTables[ev.FieldEvent.TableName] = true
+ case binlogdatapb.VEventType_ROW:
+ copiedTables[ev.RowEvent.TableName] = true
+ case binlogdatapb.VEventType_LASTPK:
+ copiedTables[ev.LastPKEvent.TableLastPK.TableName] = true
+ case binlogdatapb.VEventType_COPY_COMPLETED:
+ // Returning io.EOF from the callback ends the stream; the
+ // assertion below expects that same error back.
+ return io.EOF
+ }
+ }
+ return nil
+ })
+ require.ErrorIs(t, err, io.EOF)
+ require.Contains(t, copiedTables, "vstream_options_t1")
+ require.NotContains(t, copiedTables, "vstream_options_t2")
+}
+
+// TestVPlayerForwardsWorkflowOverridesToSourceVStream builds a vplayer whose
+// config was created from three workflow overrides and whose source client
+// fails every VStream call. After fetchAndApply returns that failure, it
+// checks the options captured by the client: ConfigOverrides must hold the
+// net-read-timeout and packet-size entries but not the parallel-workers one.
+func TestVPlayerForwardsWorkflowOverridesToSourceVStream(t *testing.T) {
+ streamer := &capturingVStreamerClient{err: errors.New("stream failed")}
+ mockDB := binlogplayer.NewMockDBClient(t)
+ mockDB.ExpectRequest("SELECT rows_copied FROM _vt.vreplication WHERE id=1", sqltypes.MakeTestResult(sqltypes.MakeTestFields("rows_copied", "int64"), "0"), nil)
+ mockDB.AddInvariant("max_allowed_packet", sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("max_allowed_packet", "int64"),
+ "4194304",
+ ))
+ config, err := vttablet.NewVReplicationConfig(map[string]string{
+ "vreplication-net-read-timeout": "123",
+ "vreplication-parallel-replication-workers": "4",
+ "vstream-packet-size": "42",
+ })
+ require.NoError(t, err)
+
+ stats := binlogplayer.NewStats()
+ t.Cleanup(stats.Stop)
+ vr := newVReplicator(1, &binlogdatapb.BinlogSource{Filter: &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: "/.*"}}}}, streamer, stats, mockDB, nil, playerEngine, config)
+ vp := newVPlayer(vr, binlogplayer.VRSettings{}, nil, replication.Position{}, "replicate")
+ vp.replicatorPlan = &ReplicatorPlan{VStreamFilter: vr.source.Filter}
+
+ err = vp.fetchAndApply(t.Context())
+ require.ErrorContains(t, err, "stream failed")
+ mockDB.Wait()
+ require.NotNil(t, streamer.vstreamOptions)
+ require.Equal(t, map[string]string{
+ "vreplication-net-read-timeout": "123",
+ "vstream-packet-size": "42",
+ }, streamer.vstreamOptions.ConfigOverrides)
+}
+
+// TestVCopierCopyAllForwardsWorkflowOverridesToSourceVStreamTables runs
+// vcopier.copyAll with a config created from three workflow overrides and a
+// source client whose VStreamTables call fails. After copyAll returns that
+// failure, it checks the options captured by the client: ConfigOverrides must
+// hold the net-read-timeout and packet-size entries but not the
+// parallel-workers one.
+func TestVCopierCopyAllForwardsWorkflowOverridesToSourceVStreamTables(t *testing.T) {
+ streamer := &capturingVStreamerClient{vstreamTablesErr: errors.New("stream tables failed")}
+ mockDB := binlogplayer.NewMockDBClient(t)
+ mockDB.ExpectRequest("SELECT rows_copied FROM _vt.vreplication WHERE id=1", sqltypes.MakeTestResult(sqltypes.MakeTestFields("rows_copied", "int64"), "0"), nil)
+ mockDB.ExpectRequest(SqlMaxAllowedPacket, sqltypes.MakeTestResult(sqltypes.MakeTestFields("max_allowed_packet", "int64"), "67108864"), nil)
+ config, err := vttablet.NewVReplicationConfig(map[string]string{
+ "vreplication-net-read-timeout": "123",
+ "vreplication-parallel-replication-workers": "4",
+ "vstream-packet-size": "42",
+ })
+ require.NoError(t, err)
+
+ stats := binlogplayer.NewStats()
+ t.Cleanup(stats.Stop)
+ vr := newVReplicator(1, &binlogdatapb.BinlogSource{Filter: &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: "t1"}}}}, streamer, stats, mockDB, nil, playerEngine, config)
+ vr.colInfoMap = map[string][]*ColumnInfo{"t1": {{Name: "id", IsPK: true}}}
+
+ err = newVCopier(vr).copyAll(t.Context(), binlogplayer.VRSettings{WorkflowName: "copy-all"})
+ require.ErrorContains(t, err, "stream tables failed")
+ mockDB.Wait()
+ require.NotNil(t, streamer.vstreamTablesOptions)
+ require.Equal(t, map[string]string{
+ "vreplication-net-read-timeout": "123",
+ "vstream-packet-size": "42",
+ }, streamer.vstreamTablesOptions.ConfigOverrides)
+}
+
+// TestVCopierCopyTableForwardsWorkflowOverridesToSourceVStreamRows runs
+// vcopier.copyTable for table t1 with a config created from three workflow
+// overrides and a source client whose VStreamRows call fails. After copyTable
+// returns that failure, it checks the options captured by the client:
+// ConfigOverrides must hold the net-read-timeout and packet-size entries but
+// not the parallel-workers one.
+func TestVCopierCopyTableForwardsWorkflowOverridesToSourceVStreamRows(t *testing.T) {
+ streamer := &capturingVStreamerClient{vstreamRowsErr: errors.New("stream rows failed")}
+ mockDB := binlogplayer.NewMockDBClient(t)
+ mockDB.ExpectRequest("SELECT rows_copied FROM _vt.vreplication WHERE id=1", sqltypes.MakeTestResult(sqltypes.MakeTestFields("rows_copied", "int64"), "0"), nil)
+ mockDB.ExpectRequest(SqlMaxAllowedPacket, sqltypes.MakeTestResult(sqltypes.MakeTestFields("max_allowed_packet", "int64"), "67108864"), nil)
+ config, err := vttablet.NewVReplicationConfig(map[string]string{
+ "vreplication-net-read-timeout": "123",
+ "vreplication-parallel-replication-workers": "4",
+ "vstream-packet-size": "42",
+ })
+ require.NoError(t, err)
+
+ stats := binlogplayer.NewStats()
+ t.Cleanup(stats.Stop)
+ vr := newVReplicator(1, &binlogdatapb.BinlogSource{Filter: &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: "t1"}}}}, streamer, stats, mockDB, nil, playerEngine, config)
+ vr.colInfoMap = map[string][]*ColumnInfo{"t1": {{Name: "id", IsPK: true}}}
+
+ err = newVCopier(vr).copyTable(t.Context(), "t1", map[string]*sqltypes.Result{"t1": nil})
+ require.ErrorContains(t, err, "stream rows failed")
+ mockDB.Wait()
+ require.NotNil(t, streamer.vstreamRowsOptions)
+ require.Equal(t, map[string]string{
+ "vreplication-net-read-timeout": "123",
+ "vstream-packet-size": "42",
+ }, streamer.vstreamRowsOptions.ConfigOverrides)
+}
+
// --------------------------------------
// Topos and tablets
@@ -339,6 +511,43 @@ type fakeTabletConn struct {
tablet *topodatapb.Tablet
}
+type capturingVStreamerClient struct { // test double: records the options of each stream call and returns preset errors
+ vstreamOptions *binlogdatapb.VStreamOptions // options passed to the most recent VStream call
+ vstreamRowsOptions *binlogdatapb.VStreamOptions // options passed to the most recent VStreamRows call
+ vstreamTablesOptions *binlogdatapb.VStreamOptions // options passed to the most recent VStreamTables call
+ err error // returned by VStream when non-nil; VStream returns io.EOF otherwise
+ vstreamRowsErr error // returned by VStreamRows (nil means success)
+ vstreamTablesErr error // returned by VStreamTables (nil means success)
+}
+
+func (c *capturingVStreamerClient) Open(context.Context) error { return nil } // no-op: always succeeds
+
+func (c *capturingVStreamerClient) Close(context.Context) error { return nil } // no-op: always succeeds
+
+func (c *capturingVStreamerClient) VStream(ctx context.Context, startPos string, tablePKs []*binlogdatapb.TableLastPK, filter *binlogdatapb.Filter, send func([]*binlogdatapb.VEvent) error, options *binlogdatapb.VStreamOptions) error {
+	c.vstreamOptions = options // kept so tests can assert on what was forwarded
+	if c.err == nil {
+		return io.EOF // no injected error: end the stream without sending any event
+	}
+	return c.err
+}
+
+func (c *capturingVStreamerClient) VStreamRows(ctx context.Context, query string, lastpk *querypb.QueryResult, send func(*binlogdatapb.VStreamRowsResponse) error, options *binlogdatapb.VStreamOptions) error {
+	// Keep the options so tests can assert on what the caller forwarded.
+	c.vstreamRowsOptions = options
+	// No rows are ever sent: the injected error (nil when unset) is the
+	// entire outcome of the call.
+	return c.vstreamRowsErr
+}
+
+func (c *capturingVStreamerClient) VStreamTables(ctx context.Context, send func(*binlogdatapb.VStreamTablesResponse) error, options *binlogdatapb.VStreamOptions) error {
+	// Keep the options so tests can assert on what the caller forwarded.
+	c.vstreamTablesOptions = options
+	// No table data is ever sent: the injected error (nil when unset) is
+	// the entire outcome of the call.
+	return c.vstreamTablesErr
+}
+
// StreamHealth is part of queryservice.QueryService.
func (ftc *fakeTabletConn) StreamHealth(ctx context.Context, callback func(*querypb.StreamHealthResponse) error) error {
return callback(&querypb.StreamHealthResponse{
@@ -372,7 +581,7 @@ func (ftc *fakeTabletConn) VStream(ctx context.Context, request *binlogdatapb.VS
return err
}
}
- return streamerEngine.Stream(ctx, request.Position, request.TableLastPKs, request.Filter, throttlerapp.VStreamerName, send, nil)
+ return streamerEngine.Stream(ctx, request.Position, request.TableLastPKs, request.Filter, throttlerapp.VStreamerName, send, request.Options)
}
// vstreamRowsHook allows you to do work just before calling VStreamRows.
@@ -394,15 +603,12 @@ func (ftc *fakeTabletConn) VStreamRows(ctx context.Context, request *binlogdatap
}
row = r.Rows[0]
}
- vstreamOptions := &binlogdatapb.VStreamOptions{
- ConfigOverrides: vttablet.GetVReplicationConfigDefaults(false).Map(),
- }
return streamerEngine.StreamRows(ctx, request.Query, row, func(rows *binlogdatapb.VStreamRowsResponse) error {
if vstreamRowsSendHook != nil {
vstreamRowsSendHook(ctx)
}
return send(rows)
- }, vstreamOptions)
+ }, request.Options)
}
// --------------------------------------
diff --git a/go/vt/vttablet/tabletmanager/vreplication/parallel_apply.go b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply.go
new file mode 100644
index 00000000000..b28e4dc2227
--- /dev/null
+++ b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply.go
@@ -0,0 +1,2482 @@
+/*
+Copyright 2026 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package vreplication
+
+import (
+ "context"
+ "errors"
+ "fmt"
+ "io"
+ "log/slog"
+ "maps"
+ "math"
+ "runtime/debug"
+ "slices"
+ "strings"
+ "sync"
+ "sync/atomic"
+ "time"
+
+ "vitess.io/vitess/go/mysql/replication"
+ "vitess.io/vitess/go/sqltypes"
+ "vitess.io/vitess/go/vt/binlog/binlogplayer"
+ "vitess.io/vitess/go/vt/log"
+ "vitess.io/vitess/go/vt/sqlparser"
+ "vitess.io/vitess/go/vt/vterrors"
+ "vitess.io/vitess/go/vt/vttablet/tabletserver/throttle/throttlerapp"
+
+ binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
+ vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"
+)
+
+// recoverParallelApply must be installed with defer at the top of every
+// goroutine the parallel applier starts (workers, commitLoop, scheduleLoop).
+// It converts a panic into an ordinary INTERNAL error instead of letting it
+// take down the whole vttablet process: the panic value and stack are logged,
+// then cb (if non-nil) receives the converted error. cb is expected to forward
+// that error to the orchestrator's error channel and cancel the shared context
+// so sibling goroutines unwind; pass nil when the caller inspects its own
+// returned error. When nothing panicked this is a no-op and cb is not called.
+func recoverParallelApply(name string, cb func(err error)) {
+	panicValue := recover()
+	if panicValue == nil {
+		return
+	}
+	log.Error("parallel apply goroutine panicked",
+		slog.String("goroutine", name),
+		slog.Any("panic", panicValue),
+		slog.String("stack", string(debug.Stack())),
+	)
+	converted := vterrors.Errorf(vtrpcpb.Code_INTERNAL, "parallel apply: %s panicked: %v", name, panicValue)
+	if cb == nil {
+		return
+	}
+	cb(converted)
+}
+
+// applyTxnPayload is the pooled per-transaction data reached via applyTxn.payload.
+type applyTxnPayload struct {
+ // pos is the GTID position to record when committing this transaction.
+ pos replication.Position
+ // timestamp is the source binlog timestamp, used for lag calculation
+ // and the time_updated column in _vt.vreplication.
+ timestamp int64
+ // mustSave forces an immediate position save (e.g., stop position reached
+ // or time-based flush bound exceeded).
+ mustSave bool
+ // events holds the VEvents that make up this transaction's data.
+ // For row transactions these are ROW/FIELD events; for commitOnly
+ // transactions this is typically a single DDL, OTHER, or COMMIT event.
+ events []*binlogdatapb.VEvent
+ // rowOnly is true when the transaction contains only ROW and FIELD events
+ // (no DDL, OTHER, or JOURNAL), so a writeset can be computed for parallel
+ // conflict detection; FIELD events are pure metadata and do not affect it.
+ rowOnly bool
+ // commitOnly is true for transactions that are applied by the commitLoop
+ // on the main connection rather than by a worker (DDL, OTHER, JOURNAL,
+ // position-only saves). Workers forward these directly to commitCh
+ // without applying events or waiting on txn.done.
+ commitOnly bool
+ // updatePosOnly is true for position-only saves (idle timeout flush).
+ // The commitLoop calls updatePos without applying any events.
+ updatePosOnly bool
+ // query/commit/client are the DB connection functions for this transaction.
+ // For worker transactions, these are set by the worker after applying events.
+ // For commitOnly transactions, these point to the main vplayer connection.
+ query func(ctx context.Context, sql string) (*sqltypes.Result, error)
+ commit func() error
+ client *vdbClient
+ // Pre-computed during scheduling so commitLoop doesn't need to scan
+ // all events to find the last qualifying timestamp for lag calculation.
+ // Zero means no qualifying event was found.
+ lastEventTimestamp int64
+ lastEventCurrentTime int64
+}
+
+// Pools recycling applyTxn and applyTxnPayload values between transactions;
+// see acquireApplyTxn, acquireApplyTxnPayload and releaseApplyTxn.
+var (
+	// applyTxnPool hands out *applyTxn values.
+	applyTxnPool = sync.Pool{New: func() any { return &applyTxn{} }}
+	// applyTxnPayloadPool hands out *applyTxnPayload values.
+	applyTxnPayloadPool = sync.Pool{New: func() any { return &applyTxnPayload{} }}
+)
+
+// acquireApplyTxn takes an applyTxn from applyTxnPool and gives it a brand-new
+// done channel (buffer size 1). The channel is deliberately never recycled
+// with the struct: per the original design note, a worker keeps its own
+// reference to done (pendingDone) past the point where the txn goes back to
+// the pool, so a reused channel would be shared between that worker and the
+// next owner of the txn, and a signal meant for one could reach the other.
+func acquireApplyTxn() *applyTxn {
+	pooled := applyTxnPool.Get().(*applyTxn)
+	pooled.done = make(chan struct{}, 1)
+	return pooled
+}
+
+// acquireApplyTxnPayload takes a payload from applyTxnPayloadPool; releaseApplyTxn zeroes payloads before pooling them.
+func acquireApplyTxnPayload() *applyTxnPayload {
+ return applyTxnPayloadPool.Get().(*applyTxnPayload)
+}
+
+// releaseApplyTxn resets txn (and its payload, when one is attached) to the
+// zero value and hands both back to their pools. The caller must not touch
+// txn afterwards; call it only once commitLoop has fully processed the txn.
+func releaseApplyTxn(txn *applyTxn) {
+	if payload := txn.payload; payload != nil {
+		*payload = applyTxnPayload{}
+		applyTxnPayloadPool.Put(payload)
+	}
+	// Wiping the whole struct also drops the done channel; that is fine
+	// because acquireApplyTxn installs a fresh one on every acquisition.
+	*txn = applyTxn{}
+	applyTxnPool.Put(txn)
+}
+
+type postDDLStalePlan struct { // one post-DDL barrier entry, stored in maps keyed by the stale table name
+ stalePlan *TablePlan // plan that was cached for the table when the entry was registered
+ refreshedPlans map[string]*TablePlan // watched table name -> plan cached for it at registration; a different cached plan later counts as refreshed
+ allowDisappear bool // set for DROP TABLE entries: resolved once the name appears in the dropped-table set
+}
+
+// clonePostDDLStalePlan returns a copy of stale whose refreshedPlans map is
+// detached from the original, so the scheduler and commitLoop can each mutate
+// their own barrier state. The copy is shallow: the *TablePlan values are
+// shared, only the map holding them is duplicated. A nil or empty
+// refreshedPlans is carried over unchanged, as there is nothing to detach.
+func clonePostDDLStalePlan(stale postDDLStalePlan) postDDLStalePlan {
+	if len(stale.refreshedPlans) > 0 {
+		stale.refreshedPlans = maps.Clone(stale.refreshedPlans)
+	}
+	return stale
+}
+
+// clonePostDDLStalePlans deep-copies the barrier map via clonePostDDLStalePlan; empty input yields nil.
+func clonePostDDLStalePlans(src map[string]postDDLStalePlan) map[string]postDDLStalePlan {
+	var detached map[string]postDDLStalePlan
+	for tableName, entry := range src {
+		if detached == nil {
+			detached = make(map[string]postDDLStalePlan, len(src))
+		}
+		detached[tableName] = clonePostDDLStalePlan(entry)
+	}
+	return detached
+}
+
+// cloneDroppedTables returns an independent copy of the dropped-table set so
+// that the scheduler and commitLoop never share the same underlying map after
+// a resync. It is the dropped-table counterpart of clonePostDDLStalePlans.
+//
+// A nil or empty src yields nil rather than an empty map, so callers can
+// treat "no dropped tables" uniformly.
+func cloneDroppedTables(src map[string]struct{}) map[string]struct{} {
+	if len(src) == 0 {
+		return nil
+	}
+	// The values are empty structs, so a shallow clone is a complete copy.
+	return maps.Clone(src)
+}
+
+// canonicalPostDDLTableKey maps a DDL-parsed table name onto the key actually
+// used in entries. Identifier casing can differ between binlog events, the
+// parser and cached plans, so an exact hit wins and otherwise the lookup is
+// case-insensitive. If several keys match ignoring case, the lexically
+// smallest is returned so the answer does not depend on Go's randomized map
+// iteration order. An empty or unmatched name is returned unchanged.
+func canonicalPostDDLTableKey[T any](entries map[string]T, name string) string {
+	if _, exact := entries[name]; exact || name == "" {
+		return name
+	}
+	// name itself is not a key here, so match == name means "nothing found yet".
+	match := name
+	for candidate := range entries {
+		if strings.EqualFold(candidate, name) && (match == name || candidate < match) {
+			match = candidate
+		}
+	}
+	return match
+}
+
+// postDDLTableKeyMatches reports whether a and b name the same table when case
+// is ignored. An empty name never matches anything, not even another empty name.
+func postDDLTableKeyMatches(a, b string) bool {
+	bothNamed := a != "" && b != ""
+	return bothNamed && strings.EqualFold(a, b)
+}
+
+// snapshotPostDDLStalePlans builds the barrier used when a DDL cannot be
+// scoped to specific tables: every currently cached plan is tracked, each one
+// watching its own name for a refresh. Tables already recorded in
+// droppedTables (matched ignoring case) are left out. Nil if nothing qualifies.
+func snapshotPostDDLStalePlans(tablePlans map[string]*TablePlan, droppedTables map[string]struct{}) map[string]postDDLStalePlan {
+	var tracked map[string]postDDLStalePlan
+	for name, plan := range tablePlans {
+		droppedKey := canonicalPostDDLTableKey(droppedTables, name)
+		if _, dropped := droppedTables[droppedKey]; dropped {
+			continue
+		}
+		if tracked == nil {
+			tracked = make(map[string]postDDLStalePlan, len(tablePlans))
+		}
+		tracked[name] = postDDLStalePlan{
+			stalePlan:      plan,
+			refreshedPlans: map[string]*TablePlan{name: plan},
+		}
+	}
+	return tracked
+}
+
+// addPostDDLStalePlan records in tracked the plan currently cached for
+// staleName, together with the table names whose refreshed plan will satisfy
+// the barrier. It centralizes the rules shared by the DDL cases of extractDDLAffectedTables:
+//   - names are resolved to their tablePlans key, ignoring case;
+//   - nothing is recorded when staleName is in droppedTables, has no cached
+//     plan, or ends up with no refreshed name;
+//   - a refreshed name found in droppedTables is skipped unless it equals
+//     staleName or allowDroppedRefreshedNames is set; empty names are skipped.
+func addPostDDLStalePlan(tracked map[string]postDDLStalePlan, tablePlans map[string]*TablePlan, droppedTables map[string]struct{}, allowDroppedRefreshedNames bool, staleName string, refreshedNames ...string) {
+	staleName = canonicalPostDDLTableKey(tablePlans, staleName)
+	if _, dropped := droppedTables[canonicalPostDDLTableKey(droppedTables, staleName)]; dropped {
+		return
+	}
+	plan, ok := tablePlans[staleName]
+	if !ok {
+		// Also covers a nil tablePlans: indexing a nil map reports !ok, so no
+		// separate nil guard is needed further down.
+		return
+	}
+	refreshed := make(map[string]*TablePlan, len(refreshedNames))
+	for _, refreshedName := range refreshedNames {
+		if refreshedName == "" {
+			continue
+		}
+		refreshedName = canonicalPostDDLTableKey(tablePlans, refreshedName)
+		if !allowDroppedRefreshedNames && refreshedName != staleName {
+			if _, dropped := droppedTables[canonicalPostDDLTableKey(droppedTables, refreshedName)]; dropped {
+				continue
+			}
+		}
+		// A name without a cached plan records nil, so any plan cached later differs from it.
+		refreshed[refreshedName] = tablePlans[refreshedName]
+	}
+	if len(refreshed) == 0 {
+		return
+	}
+	tracked[staleName] = postDDLStalePlan{stalePlan: plan, refreshedPlans: refreshed}
+}
+
+// extractDDLAffectedTables parses a DDL and returns the stale-plan barrier
+// entries it creates, keyed by stale table name, each listing the table names
+// whose later refresh satisfies it (for renames that is the destination name).
+// The bool is true only when the statement could not be scoped (parse failure
+// or not a DDLStatement) and every non-dropped cached plan was tracked instead.
+func extractDDLAffectedTables(sql string, parser *sqlparser.Parser, tablePlans map[string]*TablePlan, droppedTables map[string]struct{}) (map[string]postDDLStalePlan, bool) {
+ stmt, err := parser.ParseStrictDDL(sql)
+ if err != nil {
+ tracked := snapshotPostDDLStalePlans(tablePlans, droppedTables) // unparseable: fall back to tracking every live plan
+ return tracked, len(tracked) != 0
+ }
+ ddlStmt, ok := stmt.(sqlparser.DDLStatement)
+ if !ok {
+ tracked := snapshotPostDDLStalePlans(tablePlans, droppedTables) // parsed but not a DDL statement: same fallback
+ return tracked, len(tracked) != 0
+ }
+ tracked := make(map[string]postDDLStalePlan)
+ switch stmt := ddlStmt.(type) {
+ case *sqlparser.CreateTable:
+ // A same-name recreate can arrive before FIELD refreshes the live plan.
+ // Keep the stale pre-drop plan tracked until the new FIELD replaces it.
+ addPostDDLStalePlan(tracked, tablePlans, nil, true, stmt.Table.Name.String(), stmt.Table.Name.String()) // nil droppedTables: an earlier DROP must not suppress the entry
+ case *sqlparser.RenameTable:
+ for _, pair := range stmt.TablePairs {
+ addPostDDLStalePlan(tracked, tablePlans, droppedTables, true, pair.FromTable.Name.String(), pair.ToTable.Name.String()) // stale plan is the source, refresh is watched on the destination
+ }
+ case *sqlparser.AlterTable:
+ refreshedNames := []string{stmt.Table.Name.String()} // by default the refresh arrives under the altered table's own name
+ allowDroppedRefreshedNames := false
+ for _, option := range stmt.AlterOptions {
+ if rename, ok := option.(*sqlparser.RenameTableName); ok {
+ refreshedNames = []string{rename.Table.Name.String()} // ALTER ... RENAME: watch the new name; a later RENAME option overwrites an earlier one
+ allowDroppedRefreshedNames = true
+ }
+ }
+ addPostDDLStalePlan(tracked, tablePlans, droppedTables, allowDroppedRefreshedNames, stmt.Table.Name.String(), refreshedNames...)
+ case *sqlparser.DropTable:
+ for _, table := range stmt.FromTables {
+ if table.IsEmpty() {
+ continue
+ }
+ name := canonicalPostDDLTableKey(tablePlans, table.Name.String())
+ addPostDDLStalePlan(tracked, tablePlans, nil, true, name, name)
+ entry := tracked[name] // zero value when nothing was registered above (e.g. no cached plan) — NOTE(review): confirm storing an entry with a nil stalePlan is intended
+ entry.allowDisappear = true
+ tracked[name] = entry
+ }
+ default:
+ for _, table := range ddlStmt.AffectedTables() {
+ if table.IsEmpty() {
+ continue
+ }
+ name := table.Name.String()
+ addPostDDLStalePlan(tracked, tablePlans, droppedTables, false, name, name) // any other DDL: each affected table watches its own name
+ }
+ }
+ if len(tracked) == 0 {
+ return nil, false // normalise "nothing tracked" to a nil map
+ }
+ return tracked, false
+}
+
+// extractDroppedTables returns the lower-cased table names of a DROP TABLE
+// statement, or nil if sql fails to parse, is another statement, or names no table.
+// Such tables never get a FIELD refresh, so barriers must let their stale plans disappear.
+func extractDroppedTables(sql string, parser *sqlparser.Parser) map[string]struct{} {
+	stmt, err := parser.ParseStrictDDL(sql)
+	if err != nil {
+		return nil
+	}
+	drop, ok := stmt.(*sqlparser.DropTable)
+	if !ok {
+		return nil
+	}
+	var dropped map[string]struct{}
+	for _, table := range drop.FromTables {
+		if table.IsEmpty() {
+			continue
+		}
+		if dropped == nil {
+			dropped = make(map[string]struct{}, len(drop.FromTables))
+		}
+		dropped[strings.ToLower(table.Name.String())] = struct{}{}
+	}
+	return dropped
+}
+
+// retireResolvedPostDDLTablePlans deletes from tablePlans the stale plans of
+// barrier entries that do not watch their own name (rename sources), provided
+// the cache still holds that exact plan. It does not check resolution itself:
+// callers are expected to pass satisfied entries only. Reports any deletion.
+func retireResolvedPostDDLTablePlans(tablePlans map[string]*TablePlan, stalePlans map[string]postDDLStalePlan) bool {
+	retired := false
+	for staleName, stale := range stalePlans {
+		// Entries that watch their own name describe a table that is
+		// refreshed or recreated in place; their key stays in tablePlans.
+		_, watchesOwnName := stale.refreshedPlans[staleName]
+		if stale.stalePlan == nil || watchesOwnName {
+			continue
+		}
+		// Only delete while the cache still holds the exact stale plan.
+		if tablePlans[staleName] == stale.stalePlan {
+			delete(tablePlans, staleName)
+			retired = true
+		}
+	}
+	return retired
+}
+
+// resolvedPostDDLStalePlans picks out the barrier entries that are already
+// satisfied and returns detached copies of them (nil when there are none).
+// An entry is satisfied when it allows disappearance and its name is in
+// droppedTables, or when every watched name has a cached plan that differs
+// from the one recorded at registration. Callers retire the returned entries.
+func resolvedPostDDLStalePlans(tablePlans map[string]*TablePlan, droppedTables map[string]struct{}, stalePlans map[string]postDDLStalePlan) map[string]postDDLStalePlan {
+	var resolved map[string]postDDLStalePlan
+	for name, stale := range stalePlans {
+		satisfied := false
+		if stale.allowDisappear {
+			_, satisfied = droppedTables[canonicalPostDDLTableKey(droppedTables, name)]
+		}
+		if !satisfied {
+			// Vacuously satisfied when the entry watches no names at all.
+			satisfied = true
+			for refreshedName, priorPlan := range stale.refreshedPlans {
+				current, cached := tablePlans[canonicalPostDDLTableKey(tablePlans, refreshedName)]
+				if !cached || current == priorPlan {
+					satisfied = false
+					break
+				}
+			}
+		}
+		if !satisfied {
+			continue
+		}
+		if resolved == nil {
+			resolved = make(map[string]postDDLStalePlan, len(stalePlans))
+		}
+		resolved[name] = clonePostDDLStalePlan(stale)
+	}
+	return resolved
+}
+
+// mergePostDDLStalePlans folds src into dst and returns dst (allocated when
+// nil and src is non-empty). Entries only present in src are cloned in. For
+// names present on both sides the union is kept: dst's stale plan and watched
+// plans win, src fills in what is missing, and allowDisappear is OR-ed. This
+// keeps every refreshed name either side observed during a resync.
+func mergePostDDLStalePlans(dst, src map[string]postDDLStalePlan) map[string]postDDLStalePlan {
+	if len(src) == 0 {
+		return dst
+	}
+	if dst == nil {
+		dst = make(map[string]postDDLStalePlan, len(src))
+	}
+	for name, incoming := range src {
+		merged, exists := dst[name]
+		if !exists {
+			dst[name] = clonePostDDLStalePlan(incoming)
+			continue
+		}
+		// Values already present in dst win; src only fills the gaps.
+		if merged.stalePlan == nil {
+			merged.stalePlan = incoming.stalePlan
+		}
+		// The watched-name map held by dst's entry is extended in place; it
+		// is only allocated here when dst had none and src has names to add.
+		if merged.refreshedPlans == nil && len(incoming.refreshedPlans) != 0 {
+			merged.refreshedPlans = make(map[string]*TablePlan, len(incoming.refreshedPlans))
+		}
+		for refreshedName, refreshedPlan := range incoming.refreshedPlans {
+			if _, known := merged.refreshedPlans[refreshedName]; !known {
+				merged.refreshedPlans[refreshedName] = refreshedPlan
+			}
+		}
+		merged.allowDisappear = merged.allowDisappear || incoming.allowDisappear
+		dst[name] = merged
+	}
+	return dst
+}
+
+// extractDDLRenameTargets returns the lower-cased source -> destination table
+// names of a RENAME TABLE or ALTER TABLE ... RENAME statement, or nil when
+// sql does not parse, is another kind of statement, or contains no usable
+// pair. Barrier entries use the result to follow a renamed table: the FIELD
+// event that refreshes its plan will arrive under the destination name, so
+// the watched name has to be switched accordingly.
+func extractDDLRenameTargets(sql string, parser *sqlparser.Parser) map[string]string {
+	stmt, err := parser.ParseStrictDDL(sql)
+	if err != nil {
+		return nil
+	}
+	var renames map[string]string
+	// record stores one lower-cased pair, allocating the map on first use and
+	// skipping pairs with an empty side.
+	record := func(from, to string) {
+		from, to = strings.ToLower(from), strings.ToLower(to)
+		if from == "" || to == "" {
+			return
+		}
+		if renames == nil {
+			renames = make(map[string]string)
+		}
+		renames[from] = to
+	}
+	switch stmt := stmt.(type) {
+	case *sqlparser.RenameTable:
+		for _, pair := range stmt.TablePairs {
+			record(pair.FromTable.Name.String(), pair.ToTable.Name.String())
+		}
+	case *sqlparser.AlterTable:
+		for _, option := range stmt.AlterOptions {
+			// With several RENAME options the last one recorded wins.
+			if rename, ok := option.(*sqlparser.RenameTableName); ok {
+				record(stmt.Table.Name.String(), rename.Table.Name.String())
+			}
+		}
+	}
+	return renames
+}
+
+// uniqueKeyColumnsEqual reports whether a and b describe the same unique
+// keys: the same number of indexes, and for every index position the same
+// column names in the same order. Index order matters as well; the
+// comparison is positional, not set-based. Two empty (or nil) lists are
+// considered equal.
+func uniqueKeyColumnsEqual(a, b [][]string) bool {
+	// slices.EqualFunc handles the length check and stops at the first
+	// mismatching index; slices.Equal then compares that index's columns in
+	// order, treating nil and empty column lists as equal.
+	return slices.EqualFunc(a, b, func(left, right []string) bool {
+		return slices.Equal(left, right)
+	})
+}
+
+// shouldPublishExecIgnoreDDLBarrier reports whether an ALTER TABLE statement
+// changes the writeset-relevant state of the plan cached for its table, in
+// which case an ExecIgnore barrier must be published so rows planned under
+// the old state are not executed under the new one.
+//
+// Only the first option that adds a unique index or drops a key/constraint is
+// examined: the unique keys are recomputed and compared with the cached
+// plan's HasExtraUniqueSecondary flag and UniqueKeyColumns. A recompute error
+// is returned; anything that cannot be evaluated yields (false, nil).
+func shouldPublishExecIgnoreDDLBarrier(ctx context.Context, vp *vplayer, statement string) (bool, error) {
+	if vp == nil || vp.vr == nil || vp.vr.vre == nil || vp.vr.vre.env == nil {
+		return false, nil
+	}
+	// Unparseable statements and non-ALTER DDLs never publish this barrier.
+	stmt, err := vp.vr.vre.env.Parser().ParseStrictDDL(statement)
+	if err != nil {
+		return false, nil
+	}
+	alter, isAlter := stmt.(*sqlparser.AlterTable)
+	if !isAlter || alter.Table.IsEmpty() {
+		return false, nil
+	}
+	tableName := alter.Table.Name.String()
+	vp.tablePlansMu.RLock()
+	cachedPlan := vp.tablePlans[canonicalPostDDLTableKey(vp.tablePlans, tableName)]
+	vp.tablePlansMu.RUnlock()
+	if cachedPlan == nil {
+		// No cached plan means there is no prior writeset state to protect.
+		return false, nil
+	}
+	for _, option := range alter.AlterOptions {
+		// Decide whether this option can change the table's unique keys.
+		affectsUniqueKeys := false
+		switch option := option.(type) {
+		case *sqlparser.AddIndexDefinition:
+			definition := option.IndexDefinition
+			affectsUniqueKeys = definition != nil && definition.Info != nil && definition.Info.IsUnique()
+		case *sqlparser.DropKey:
+			affectsUniqueKeys = option.Type == sqlparser.NormalKeyType || option.Type == sqlparser.ConstraintType
+		}
+		if !affectsUniqueKeys {
+			continue
+		}
+		// Recompute the unique keys and compare them with what the cached
+		// plan carries; the first relevant option decides the outcome.
+		uniqueKeys, mustSerialize, err := vp.vr.writesetUniqueKeys(ctx, cachedPlan.TargetName, cachedPlan)
+		if err != nil {
+			return false, err
+		}
+		return mustSerialize != cachedPlan.HasExtraUniqueSecondary ||
+			!uniqueKeyColumnsEqual(uniqueKeys, cachedPlan.UniqueKeyColumns), nil
+	}
+	return false, nil
+}
+
+// retargetPostDDLStalePlans updates in-flight barrier entries after a rename:
+// each watched name found in renameTargets (ignoring case) is replaced by the
+// destination name, recorded with whatever plan tablePlans currently holds
+// for it. Entries are rewritten from their original watched names only, so
+// overlapping renames do not cascade depending on map iteration order.
+func retargetPostDDLStalePlans(stalePlans map[string]postDDLStalePlan, renameTargets map[string]string, tablePlans map[string]*TablePlan) {
+	if len(stalePlans) == 0 || len(renameTargets) == 0 {
+		return
+	}
+	for staleName, stale := range stalePlans {
+		if len(stale.refreshedPlans) == 0 {
+			continue
+		}
+		// Build the replacement set on the side so lookups always see the
+		// names as they were before this rename.
+		retargeted := make(map[string]*TablePlan, len(stale.refreshedPlans))
+		changed := false
+		for watchedName, priorPlan := range stale.refreshedPlans {
+			toName, renamed := renameTargets[canonicalPostDDLTableKey(renameTargets, watchedName)]
+			if !renamed {
+				retargeted[watchedName] = priorPlan
+				continue
+			}
+			toName = canonicalPostDDLTableKey(tablePlans, toName)
+			retargeted[toName] = tablePlans[toName]
+			changed = true
+		}
+		if changed {
+			stale.refreshedPlans = retargeted
+			stalePlans[staleName] = stale
+		}
+	}
+}
+
+// unresolvedPostDDLStalePlans is the complement of resolvedPostDDLStalePlans:
+// it returns detached copies of the barrier entries that are still pending
+// (nil when there are none). An entry stays pending unless it allows
+// disappearance and its name is in droppedTables, or every watched name has a
+// cached plan different from the one recorded at registration.
+func unresolvedPostDDLStalePlans(tablePlans map[string]*TablePlan, droppedTables map[string]struct{}, stalePlans map[string]postDDLStalePlan) map[string]postDDLStalePlan {
+	var unresolved map[string]postDDLStalePlan
+	for name, stale := range stalePlans {
+		// A table recorded as dropped satisfies an allow-disappear entry.
+		if stale.allowDisappear {
+			if _, gone := droppedTables[canonicalPostDDLTableKey(droppedTables, name)]; gone {
+				continue
+			}
+		}
+		// The entry is pending while any watched name has no cached plan or
+		// still has the plan recorded when the entry was registered.
+		pending := false
+		for refreshedName, priorPlan := range stale.refreshedPlans {
+			current, cached := tablePlans[canonicalPostDDLTableKey(tablePlans, refreshedName)]
+			if !cached || current == priorPlan {
+				pending = true
+				break
+			}
+		}
+		if !pending {
+			continue
+		}
+		// Allocate lazily so "nothing pending" is reported as nil.
+		if unresolved == nil {
+			unresolved = make(map[string]postDDLStalePlan, len(stalePlans))
+		}
+		unresolved[name] = clonePostDDLStalePlan(stale)
+	}
+	return unresolved
+}
+
+// txnTouchesPostDDLBarrier reports whether the txn must wait behind the
+// post-DDL barrier. With conservative set (unknown DDL) any FIELD or ROW event
+// that names a table qualifies; otherwise the table must match, ignoring case,
+// a stale name or one of its watched names. False when no barrier is active.
+func txnTouchesPostDDLBarrier(events []*binlogdatapb.VEvent, stalePlans map[string]postDDLStalePlan, conservative bool) bool {
+	if len(stalePlans) == 0 {
+		return false
+	}
+	for _, event := range events {
+		// Only FIELD and ROW events with a payload carry a table name.
+		tableName := ""
+		switch {
+		case event.Type == binlogdatapb.VEventType_FIELD && event.FieldEvent != nil:
+			tableName = event.FieldEvent.TableName
+		case event.Type == binlogdatapb.VEventType_ROW && event.RowEvent != nil:
+			tableName = event.RowEvent.TableName
+		}
+		if tableName == "" {
+			continue
+		}
+		if conservative {
+			return true
+		}
+		// Table-scoped barrier: compare against every tracked name.
+		for staleName, stale := range stalePlans {
+			if postDDLTableKeyMatches(tableName, staleName) {
+				return true
+			}
+			for refreshedName := range stale.refreshedPlans {
+				if postDDLTableKeyMatches(tableName, refreshedName) {
+					return true
+				}
+			}
+		}
+	}
+	return false
+}
+
+// postDDLRefreshTargetMatchesCachedPlan reports whether any barrier entry
+// watches refreshedName (compared ignoring case) and recorded cachedPlan as
+// the plan for it. A true result means the cache has not moved on since the
+// barrier was set up, so what the caller is looking at is the stale plan and
+// not a genuine replacement that would advance the barrier.
+func postDDLRefreshTargetMatchesCachedPlan(stalePlans map[string]postDDLStalePlan, refreshedName string, cachedPlan *TablePlan) bool {
+	for _, stale := range stalePlans {
+		for trackedName, priorPlan := range stale.refreshedPlans {
+			// Pointer identity: the cache still holds the very plan that was
+			// recorded when the barrier entry was created.
+			if priorPlan == cachedPlan && postDDLTableKeyMatches(trackedName, refreshedName) {
+				return true
+			}
+		}
+	}
+	return false
+}
+
+// mergeDroppedTables returns the union of two dropped-table sets. It is the
+// dropped-table counterpart of mergePostDDLStalePlans: after a resync between
+// scheduler and commitLoop, a DROP seen by only one side must not be lost or
+// a barrier waiting for that table could stay stuck.
+//
+// When src is empty, dst itself is returned (not a copy).
+func mergeDroppedTables(dst, src map[string]struct{}) map[string]struct{} {
+	if len(src) == 0 {
+		return dst
+	}
+	// Build the union in a new map so neither input is modified and the
+	// result can evolve independently of both.
+	merged := make(map[string]struct{}, len(dst)+len(src))
+	maps.Copy(merged, dst)
+	maps.Copy(merged, src)
+	return merged
+}
+
+// extractFieldRefreshTables returns the set of table names that have a FIELD
+// event in the txn, or nil when there is none. FIELD events without a payload
+// or without a table name are ignored. txnNeedsFieldRefreshSerialization uses
+// the result to spot a txn whose ROW events target a table whose plan the
+// same txn refreshes.
+func extractFieldRefreshTables(events []*binlogdatapb.VEvent) map[string]struct{} {
+	var refreshed map[string]struct{}
+	for _, event := range events {
+		if event.Type == binlogdatapb.VEventType_FIELD && event.FieldEvent != nil && event.FieldEvent.TableName != "" {
+			if refreshed == nil {
+				refreshed = make(map[string]struct{})
+			}
+			refreshed[event.FieldEvent.TableName] = struct{}{}
+		}
+	}
+	return refreshed
+}
+
+// txnNeedsFieldRefreshSerialization reports whether the txn carries both a
+// FIELD event and a ROW event for the same table. In that case the worker may
+// build or replace the table plan just before applying the row, so a writeset
+// computed by the scheduler from the pre-apply snapshot cannot be trusted and
+// the (rare) transaction has to be serialized.
+//
+// Design note carried over from the original author: this check also keeps
+// concurrent plan refreshes ordered. vstreamer is said to emit FIELD in the
+// same transaction as the first ROW for a table, so every plan-storing
+// transaction is forced global and two workers cannot store plans for one
+// table out of order. A FIELD-only transaction would slip past this check;
+// pendingFieldRefreshTables still orders the later row transactions, but the
+// plan-store race between two FIELD-only txns would need to be revisited.
+func txnNeedsFieldRefreshSerialization(events []*binlogdatapb.VEvent) bool {
+	refreshedTables := extractFieldRefreshTables(events)
+	if len(refreshedTables) == 0 {
+		return false
+	}
+	// Table names are compared exactly here: both come from the same txn.
+	return slices.ContainsFunc(events, func(event *binlogdatapb.VEvent) bool {
+		if event.Type != binlogdatapb.VEventType_ROW || event.RowEvent == nil {
+			return false
+		}
+		_, refreshed := refreshedTables[event.RowEvent.TableName]
+		return refreshed
+	})
+}
+
+// txnTouchesPendingFieldRefresh reports whether a ROW event in the txn targets
+// a table (compared ignoring case) that has a positive counter in pending,
+// i.e. a FIELD refresh still queued ahead of it. Such a txn must serialize
+// behind the refresh; otherwise its rows would be applied against a plan that
+// is about to be replaced.
+func txnTouchesPendingFieldRefresh(events []*binlogdatapb.VEvent, pending map[string]int) bool {
+	if len(pending) == 0 {
+		return false
+	}
+	for _, event := range events {
+		if event.Type != binlogdatapb.VEventType_ROW || event.RowEvent == nil {
+			continue
+		}
+		rowTable := event.RowEvent.TableName
+		for tableName, count := range pending {
+			// Entries whose counter has dropped to zero or below are inactive.
+			if count > 0 && postDDLTableKeyMatches(rowTable, tableName) {
+				return true
+			}
+		}
+	}
+	return false
+}
+
+// workerLocalVPlayer returns a vplayer value for a worker that carries over
+// only the fields assigned below from the orchestrator's vplayer; every other
+// field is left at its zero value, which keeps worker code from reaching into
+// state owned by the main goroutine by accident.
+func workerLocalVPlayer(vp *vplayer) vplayer {
+	var local vplayer
+	local.vr = vp.vr
+	local.copyState = vp.copyState
+	local.replicatorPlan = vp.replicatorPlan
+	local.canAcceptStmtEvents = vp.canAcceptStmtEvents
+	local.tablePlansMu = vp.tablePlansMu // NOTE(review): this and serialMu are shared only if the fields are pointers — confirm
+	local.tablePlans = vp.tablePlans
+	local.tablePlansVersion = vp.tablePlansVersion
+	local.batchMode = vp.batchMode
+	local.phase = vp.phase
+	local.serialMu = vp.serialMu
+	return local
+}
+
+// writesetErrorForcesSerialization reports whether err is one of the
+// FAILED_PRECONDITION writeset-build errors (matched by message) that mean the
+// txn must take the serial path; any other error is a real failure.
+func writesetErrorForcesSerialization(err error) bool {
+	if vterrors.Code(err) != vtrpcpb.Code_FAILED_PRECONDITION {
+		return false
+	}
+	msg := err.Error()
+	return strings.Contains(msg, "partial row image on table ") ||
+		strings.Contains(msg, "not in streamed fields") ||
+		strings.Contains(msg, "no usable writeset identity") ||
+		strings.Contains(msg, "streamed field metadata mismatch")
+}
+
+// computeLastEventTimestamp walks events from last to first and returns the
+// Timestamp and CurrentTime of the first event it meets that has a non-zero
+// Timestamp and is not a throttled heartbeat. Events with a zero Timestamp
+// and throttled HEARTBEAT events are skipped. It returns (0, 0) when the
+// slice is empty or no event qualifies.
+func computeLastEventTimestamp(events []*binlogdatapb.VEvent) (timestamp, currentTime int64) {
+	for i := len(events) - 1; i >= 0; i-- {
+		ev := events[i]
+		throttledHeartbeat := ev.Type == binlogdatapb.VEventType_HEARTBEAT && ev.Throttled
+		if ev.Timestamp != 0 && !throttledHeartbeat {
+			return ev.Timestamp, ev.CurrentTime
+		}
+	}
+	return 0, 0
+}
+
+// txnNeedsWorker reports whether a transaction has work for a worker
+// connection: true as soon as any accumulated event is something other than
+// ROWS_QUERY, false for an empty list or one made up only of ROWS_QUERY events.
+func txnNeedsWorker(events []*binlogdatapb.VEvent) bool {
+	return slices.ContainsFunc(events, func(ev *binlogdatapb.VEvent) bool {
+		return ev.Type != binlogdatapb.VEventType_ROWS_QUERY
+	})
+}
+
+// applyEventsParallel is the top-level orchestrator for the parallel applier.
+// It creates N worker goroutines and a commitLoop goroutine, then runs
+// scheduleLoop on the calling goroutine. On exit, it tears down the pipeline
+// in order: close scheduler → wait workers → close commitCh → wait commitLoop.
+//
+// When ParallelReplicationWorkers is <= 1 it delegates to the serial
+// applyEvents. Otherwise:
+//   - an io.EOF from commitLoop returns nil right after teardown;
+//   - any other real errors from scheduleLoop, commitLoop and the workers are
+//     joined and returned;
+//   - io.EOF / context.Canceled on their own are converted to nil.
+//
+// The main connection is always rolled back once commitLoop has exited.
+func (vp *vplayer) applyEventsParallel(ctx context.Context, relay *relayLog) error {
+	workerCount := vp.vr.workflowConfig.ParallelReplicationWorkers
+	if workerCount <= 1 {
+		return vp.applyEvents(ctx, relay)
+	}
+
+	// Mirror the serial applier: reset lag stats to MaxInt64 when we exit,
+	// signalling that replication is no longer running.
+	defer vp.vr.stats.ReplicationLagSeconds.Store(math.MaxInt64)
+	defer vp.vr.stats.VReplicationLagGauges.Set(vp.idStr, math.MaxInt64)
+
+	ctx, cancel := context.WithCancel(ctx)
+	defer cancel()
+	if err := ctx.Err(); err != nil {
+		return err
+	}
+
+	scheduler := newApplyScheduler(ctx)
+	// Buffer 4x worker count to decouple worker throughput from commit
+	// latency. Workers block when commitCh is full, stalling the pipeline.
+	commitCh := make(chan *applyTxn, workerCount*4)
+	// Cap total ordered work in the parallel pipeline to approximately one
+	// applying transaction per worker plus the commit buffer. This provides
+	// end-to-end backpressure when commitLoop is stalled on an early order.
+	scheduler.maxOutstandingOrders = int64(workerCount + cap(commitCh))
+	// applyErr receives at most two values: the scheduleLoop result and a
+	// forwarded commitLoop error.
+	applyErr := make(chan error, 2)
+	commitLoopErr := make(chan error, 1)
+	workerErr := make(chan error, workerCount)
+
+	workers := make([]*applyWorker, 0, workerCount)
+	// Register the defer BEFORE the creation loop so that if creating
+	// worker N fails, workers 0..N-1 are still closed. Without this,
+	// a partial creation failure would leak DB connections.
+	defer func() {
+		for _, worker := range workers {
+			worker.close()
+		}
+	}()
+	for range workerCount {
+		worker, err := newApplyWorker(ctx, vp.vr)
+		if err != nil {
+			return err
+		}
+		workers = append(workers, worker)
+	}
+
+	// Query FK constraints from the target database so that we can
+	// generate writeset keys that create conflicts between child and
+	// parent table transactions, preventing FK constraint violations
+	// during parallel apply.
+	//
+	// Fail closed: if we can't read FK metadata we cannot know whether
+	// the schema has FK constraints that require cross-table ordering.
+	// Continuing with fkRefs=nil would silently degrade to PK-only
+	// writeset scheduling and could reorder parent/child transactions.
+	// Return the error so the workflow retries via the normal retry path.
+	fkRefs, err := queryFKRefs(vp.vr.dbClient, vp.vr.dbClient.DBName())
+	if err != nil {
+		return vterrors.Wrapf(err, "parallel apply: failed to query FK metadata from %q", vp.vr.dbClient.DBName())
+	}
+	if len(fkRefs) > 0 {
+		for table, refs := range fkRefs {
+			for _, ref := range refs {
+				log.Info("Parallel apply: FK ref", slog.String("child", table), slog.String("parent", ref.ParentTable), slog.Any("childCols", ref.ChildColumnNames), slog.Any("referencedCols", ref.ReferencedColumnNames))
+			}
+		}
+	} else {
+		log.Info("Parallel apply: no FK refs found", slog.String("db", vp.vr.dbClient.DBName()))
+	}
+	vp.fkRefs = fkRefs
+	vp.parentFKRefs = buildParentFKRefs(fkRefs)
+
+	// sendWorkerErr is a non-blocking send to workerErr. The channel is
+	// buffered to workerCount, so in normal operation this always succeeds;
+	// the non-blocking form is defensive against test hooks or double-send
+	// scenarios and mirrors the convention used elsewhere in this package.
+	sendWorkerErr := func(err error) {
+		select {
+		case workerErr <- err:
+		default:
+		}
+		cancel()
+	}
+	// sendCommitLoopErr is the commitLoop counterpart: non-blocking send,
+	// then cancel so scheduleLoop and the workers shut down.
+	sendCommitLoopErr := func(err error) {
+		select {
+		case commitLoopErr <- err:
+		default:
+		}
+		cancel()
+	}
+
+	var wg sync.WaitGroup
+	for workerIdx, worker := range workers {
+		wg.Go(func() {
+			// Recover from panics so a buggy event or driver crash does not
+			// tear down the entire vttablet process. The recovered error is
+			// routed through the same path as a normal worker error.
+			defer recoverParallelApply(fmt.Sprintf("worker %d", workerIdx), sendWorkerErr)
+			err := vp.workerLoop(ctx, scheduler, commitCh, worker)
+			if err != nil && err != io.EOF {
+				sendWorkerErr(err)
+			}
+		})
+	}
+
+	commitDone := make(chan struct{})
+	go func() {
+		defer close(commitDone)
+		// Recover from panics so a buggy commit path does not tear down the
+		// entire vttablet process. The recovered error is routed through the
+		// same path as a normal commitLoop error.
+		defer recoverParallelApply("commitLoop", sendCommitLoopErr)
+		if err := vp.commitLoop(ctx, scheduler, commitCh); err != nil {
+			// Always cancel context when commitLoop exits with an error,
+			// including io.EOF (stop position reached). This ensures
+			// scheduleLoop and workers shut down promptly instead of
+			// blocking on a commitCh that has no reader.
+			sendCommitLoopErr(err)
+		}
+	}()
+
+	// Run scheduleLoop on this goroutine. A panic inside it is recovered and
+	// stored in schedErr so it is reported like any other scheduleLoop error
+	// rather than crashing the process.
+	var schedErr error
+	func() {
+		defer recoverParallelApply("scheduleLoop", func(err error) {
+			schedErr = err
+			cancel()
+		})
+		schedErr = vp.scheduleLoop(ctx, relay, scheduler)
+	}()
+	if schedErr != nil {
+		applyErr <- schedErr
+	}
+
+	scheduler.close()
+	wg.Wait()
+	close(commitCh)
+	<-commitDone
+
+	// Now that commitLoop is done, it's safe to rollback any leftover
+	// transaction on the main connection. This must happen after commitDone
+	// because commitOnlyTxn in the commitLoop also uses the main connection.
+	// It runs before commitLoopErr is inspected so the io.EOF early return
+	// below cannot skip it.
+	vp.vr.dbClient.Rollback()
+
+	select {
+	case err := <-commitLoopErr:
+		if err == io.EOF {
+			// NOTE(review): this returns before workerErr is drained, so a
+			// real worker error recorded alongside a commitLoop io.EOF is
+			// not reported — confirm that is intended.
+			return nil
+		}
+		applyErr <- err
+	default:
+	}
+
+	// Drain all errors and prioritize real failures over io.EOF/context.Canceled.
+	// We must always inspect workerErr too: teardown after a worker failure makes
+	// scheduleLoop/commitLoop commonly return io.EOF/context.Canceled, and those
+	// benign shutdown signals must not mask the original worker error.
+	var realErrs []error
+	classifyErr := func(err error) {
+		if err == nil || err == io.EOF || errors.Is(err, context.Canceled) {
+			return
+		}
+		realErrs = append(realErrs, err)
+	}
+	// drain empties a channel without blocking; all senders have exited by now.
+	drain := func(ch <-chan error) {
+		for {
+			select {
+			case err := <-ch:
+				classifyErr(err)
+			default:
+				return
+			}
+		}
+	}
+	drain(applyErr)
+	drain(workerErr)
+	if len(realErrs) > 0 {
+		return errors.Join(realErrs...)
+	}
+	// Convert io.EOF (stop position reached) and context.Canceled (shutdown)
+	// to nil. fetchAndApply's caller treats nil from applyEventsParallel
+	// the same as io.EOF from the serial path — it stops the controller
+	// without retrying.
+	return nil
+}
+
+// scheduleLoop reads event batches from the relay log and dispatches them
+// through scheduleItems. It also handles idle-timeout position saves and
+// throttle-lag estimation. Runs on the main goroutine of applyEventsParallel.
+//
+// It loops until the context is done or one of enqueueCommitOnly,
+// relay.Fetch, scheduleItems or scheduler.waitForIdle returns an error, and
+// returns that error (ctx.Err() in the context case).
+func (vp *vplayer) scheduleLoop(ctx context.Context, relay *relayLog, scheduler *applyScheduler) error {
+	// Note: do NOT defer vp.vr.dbClient.Rollback() here. The main connection
+	// is shared with commitLoop (via commitOnlyTxn), which may still be running
+	// when scheduleLoop returns. The rollback is deferred in applyEventsParallel
+	// after commitLoop has finished.
+	workerCount := vp.vr.workflowConfig.ParallelReplicationWorkers
+	// Compute the max number of source transactions to batch into one
+	// mega-transaction. Batching amortizes per-commit overhead (position
+	// update, MySQL COMMIT, done-signal, scheduler dispatch) across several
+	// source txns, but with parallel workers we also need enough separate
+	// mega-transactions per relay fetch to keep all workers busy.
+	//
+	// The relay log size limit (default 250KB) often limits each fetch to
+	// far fewer transactions than maxItems (5000). With 1-2KB rows, a
+	// typical fetch may contain only ~150-250 source transactions, so each
+	// mega-transaction is limited to a small multiple of the worker count.
+	// The writeset for the mega-txn is the union of all contained source
+	// txns, so conflict detection remains correct — if any source txn in
+	// mega-A conflicts with any source txn in mega-B, they serialize.
+	maxBatched := 0 // 0 means unlimited (serial behavior)
+	if workerCount > 1 {
+		maxBatched = workerCount * 4
+	}
+	state := &parallelScheduleState{
+		maxBatchedCommits: maxBatched,
+	}
+	for {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		// Claim the unsaved event under serialMu once the idle timeout has
+		// elapsed, then enqueue the position save outside the lock.
+		var idleEvent *binlogdatapb.VEvent
+		vp.serialMu.Lock()
+		if time.Since(vp.timeLastSaved) >= idleTimeout && vp.unsavedEvent != nil {
+			idleEvent = vp.unsavedEvent
+			vp.unsavedEvent = nil
+			vp.timeLastSaved = time.Now()
+		}
+		vp.serialMu.Unlock()
+		if idleEvent != nil {
+			if err := vp.enqueueCommitOnly(ctx, scheduler, idleEvent, true, true, 0, 0, false); err != nil {
+				return err
+			}
+		}
+		if checkResult, ok := vp.vr.vre.throttlerClient.ThrottleCheckOKOrWaitAppName(ctx, throttlerapp.Name(vp.throttlerAppName)); !ok {
+			// Must hold serialMu when calling updateTimeThrottled because
+			// it uses vr.dbClient, which may also be in use by the
+			// commitLoop for commitOnly transactions on the main connection.
+			// The returned error is deliberately ignored (best effort).
+			vp.serialMu.Lock()
+			_ = vp.vr.updateTimeThrottled(throttlerapp.VPlayerName, checkResult.Summary())
+			vp.serialMu.Unlock()
+			snap := vp.loadLagSnapshot()
+			// Estimate lag while throttled, same as the serial applier.
+			if snap.timestampNs > 0 {
+				behind := time.Now().UnixNano() - snap.timestampNs - snap.offsetNs
+				if behind >= 0 {
+					behindSecs := behind / 1e9
+					vp.vr.stats.ReplicationLagSeconds.Store(behindSecs)
+					vp.vr.stats.VReplicationLagGauges.Set(vp.idStr, behindSecs)
+				}
+			}
+			continue
+		}
+		items, err := relay.Fetch()
+		if err != nil {
+			return err
+		}
+		if err := vp.scheduleItems(ctx, scheduler, state, items); err != nil {
+			return err
+		}
+		// If a DDL was in this fetch, wait for the commitLoop to process it
+		// (including FK metadata refresh) before starting the next fetch.
+		// Without this barrier, the next fetch would snapshot stale FK refs.
+		//
+		// Also drain when the post-DDL stale-plan guard is active. This
+		// ensures that all serialized work (including FIELD events for the
+		// DDL-affected table) has been applied by workers before the next
+		// fetch's scheduleItems re-evaluates the guard by comparing plan
+		// pointers.
+		if state.ddlSeen || state.postDDLStalePlans != nil {
+			if err := scheduler.waitForIdle(ctx); err != nil {
+				return err
+			}
+		}
+	}
+}
+
+// parallelScheduleState carries the scheduling state that scheduleLoop keeps
+// across successive scheduleItems calls: the transaction currently being
+// accumulated, commit-batching counters, cached plan snapshots, and the
+// post-DDL serialization barrier.
+type parallelScheduleState struct {
+ // curEvents accumulates VEvents for the current transaction being built.
+ // Reset after each flush (COMMIT or DDL boundary).
+ curEvents []*binlogdatapb.VEvent
+ // curRowOnly tracks whether the current transaction contains only ROW
+ // events. Set to true on the first ROW event, false on FIELD/DDL/OTHER/
+ // JOURNAL events. Only meaningful when curRowOnlySet is true.
+ curRowOnly bool
+ // curRowOnlySet indicates whether curRowOnly has been determined for the
+ // current transaction. False at the start of each transaction; set to
+ // true on the first event that classifies it. This distinguishes
+ // "not yet classified" from "classified as not row-only".
+ curRowOnlySet bool
+ // curTimestamp is the most recent non-zero event timestamp seen in the
+ // current transaction, used for the time_updated column on flush.
+ curTimestamp int64
+ // curMustSave forces the next flush to save the position immediately
+ // (set when stop position is reached or time-based batch bound fires).
+ curMustSave bool
+ // curPos is the GTID position from the most recent GTID event,
+ // recorded in _vt.vreplication when the transaction is committed.
+ curPos replication.Position
+ // curCommitParent is the source MySQL commit parent from the GTID event,
+ // used for commit-parent ordering when writeset is unavailable.
+ curCommitParent int64
+ // curSequence is the source MySQL sequence number from the GTID event,
+ // used to track lastCommittedSequence in the scheduler.
+ curSequence int64
+ // curHasCommitMeta is true when the current transaction's GTID event
+ // carried non-zero sequenceNumber or commitParent metadata.
+ curHasCommitMeta bool
+ // batchMissingCommitMeta is sticky across batched source transactions.
+ // Once a merged batch contains any txn without commit metadata, the
+ // flushed mega-transaction must stay in the no-metadata scheduler mode.
+ batchMissingCommitMeta bool
+ // lastFlushTime tracks when the last transaction was flushed, used to
+ // enforce the 500ms time-based batch bound during catch-up replay.
+ lastFlushTime time.Time
+ // lastHeartbeatRefresh tracks when time_updated was last refreshed via
+ // SQL for empty transaction streams, independent of lastFlushTime so
+ // that the idle timeout position save still fires normally.
+ lastHeartbeatRefresh time.Time
+ // cachedPlanSnapshot is a copy-on-write snapshot of vplayer.tablePlans,
+ // refreshed only when tablePlansVersion changes (new FIELD events).
+ cachedPlanSnapshot map[string]*TablePlan
+ // cachedPlanVersion tracks which tablePlansVersion the snapshot
+ // corresponds to, so we know when to re-snapshot.
+ cachedPlanVersion int64
+ // fieldIdxCache caches the field-name→index map per table to avoid
+ // rebuilding it on every transaction. Most transactions touch the same
+ // tables so this eliminates redundant map construction. Invalidated
+ // when tablePlansVersion changes (new FIELD events arrive).
+ fieldIdxCache map[string]map[string]int
+ // fieldIdxCacheVersion is the plan version fieldIdxCache was built for.
+ fieldIdxCacheVersion int64
+ // planFlagsVersion, planHasExtraUniqueSecondary, and
+ // planHasUnsupportedWritesetMapping cache aggregate flags for the
+ // cached plan snapshot. Avoids a per-txn scan of every plan when the
+ // workflow's tables carry none of these properties (the common case).
+ // Recomputed lazily when the plan version changes.
+ planFlagsVersion int64
+ planHasExtraUniqueSecondary bool
+ planHasUnsupportedWritesetMapping bool
+ // curHasFieldEvent is true when the current transaction has
+ // accumulated at least one FIELD event. Lets the flush path skip
+ // txnNeedsFieldRefreshSerialization entirely for the common
+ // rowOnly-with-no-FIELDs case.
+ curHasFieldEvent bool
+ // batchedCommitCount tracks how many source transactions have been
+ // merged into the current mega-transaction via commit batching. When
+ // this exceeds maxBatchedCommits, the mega-transaction is flushed even
+ // if more consecutive commits follow. This ensures the parallel applier
+ // produces enough mega-transactions per relay fetch to keep all workers
+ // busy, rather than merging everything into one huge transaction that
+ // only a single worker can process.
+ batchedCommitCount int
+ // maxBatchedCommits is the maximum number of source transactions to
+ // merge into one mega-transaction. Set once by scheduleLoop when the
+ // state is created: workerCount*4 with more than one worker, otherwise
+ // 0, which means unlimited.
+ maxBatchedCommits int
+ // mergedSequences tracks sequence numbers of transactions that were
+ // merged into the current batch. These are advanced in the scheduler
+ // when the batch is actually enqueued (not before), so that
+ // commit-parent dependencies aren't prematurely satisfied.
+ mergedSequences []int64
+ // ddlSeen is set to true when a DDL event is seen in the current fetch.
+ // The scheduleLoop checks this after scheduleItems returns and waits
+ // for the commitLoop to drain (so FK refs are refreshed) before
+ // starting the next fetch. Reset at the start of each scheduleItems call.
+ ddlSeen bool
+ // postDDLStalePlans records a snapshot of the tablePlans entries at the
+ // time an executed DDL was observed. Parallel scheduling is force-
+ // serialized as long as any plan in this snapshot is still the same
+ // object in the live tablePlans map. When a FIELD event for the DDL-
+ // affected table arrives, vplayer.applyEvent builds a new *TablePlan
+ // and stores it in tablePlans, replacing the stale pointer. At that
+ // point the guard clears for that table.
+ //
+ // This is per-table rather than global-version-based because vstreamer
+ // only emits FIELD on first-seen or remapped table ids. An unrelated
+ // table's FIELD would bump the global tablePlansVersion but not replace
+ // the DDL-affected table's plan pointer.
+ //
+ // nil means no DDL barrier is active.
+ postDDLStalePlans map[string]postDDLStalePlan
+ // postDDLDroppedTables records dropped tables that have been explicitly
+ // satisfied for the current DDL barrier.
+ postDDLDroppedTables map[string]struct{}
+ // postDDLConservative marks barriers from unparsed DDL, where we must keep
+ // serializing tracked-table transactions until every captured plan refreshes.
+ postDDLConservative bool
+}
+
+// scheduleItems processes one relay log fetch worth of event batches. It tracks
+// transaction boundaries (GTID → events → COMMIT), classifies transactions,
+// builds writesets, handles batching of consecutive commits, and enqueues
+// applyTxn structs into the scheduler. Empty transactions bypass the scheduler
+// and are saved via unsavedEvent / idle timeout.
+func (vp *vplayer) scheduleItems(ctx context.Context, scheduler *applyScheduler, state *parallelScheduleState, items [][]*binlogdatapb.VEvent) error {
+ stopPosReached := func(pos replication.Position) bool {
+ return !vp.stopPos.IsZero() && !pos.IsZero() && pos.AtLeast(vp.stopPos)
+ }
+ journalTerminates := func(event *binlogdatapb.VEvent) bool {
+ if event.Type != binlogdatapb.VEventType_JOURNAL || event.Journal == nil {
+ return false
+ }
+ if event.Journal.MigrationType != binlogdatapb.MigrationType_TABLES {
+ return true
+ }
+ jtables := make(map[string]struct{}, len(event.Journal.Tables))
+ for _, table := range event.Journal.Tables {
+ jtables[table] = struct{}{}
+ }
+ found := false
+ notFound := false
+ for tableName := range vp.replicatorPlan.TablePlans {
+ if _, ok := jtables[tableName]; ok {
+ found = true
+ } else {
+ notFound = true
+ }
+ }
+ switch {
+ case found && notFound:
+ return true
+ case notFound:
+ return false
+ default:
+ return true
+ }
+ }
+ ddlTerminates := func(event *binlogdatapb.VEvent) bool {
+ return event.Type == binlogdatapb.VEventType_DDL && vp.vr.source.OnDdl == binlogdatapb.OnDDLAction_STOP
+ }
+
+ // Snapshot FK refs under serialMu so we have a consistent view for this
+ // relay fetch. The commitLoop may update these after DDL events.
+ // pendingFieldRefreshTables is needed for FIELD events during normal
+ // replication (initial table plan setup), so we always clone it.
+ // postDDLDroppedTables and postDDLStalePlans can only be populated when
+ // OnDdl is EXEC or EXEC_IGNORE, so we skip that work otherwise.
+ ddlExecEnabled := vp.vr.source.OnDdl == binlogdatapb.OnDDLAction_EXEC ||
+ vp.vr.source.OnDdl == binlogdatapb.OnDDLAction_EXEC_IGNORE
+ vp.serialMu.Lock()
+ fkRefs := vp.fkRefs
+ parentFKRefs := vp.parentFKRefs
+ pendingFieldRefreshTables := maps.Clone(vp.pendingFieldRefreshTables)
+ if ddlExecEnabled {
+ state.postDDLDroppedTables = cloneDroppedTables(vp.postDDLDroppedTables)
+ if len(vp.postDDLStalePlans) != 0 {
+ if state.postDDLStalePlans == nil {
+ state.postDDLStalePlans = make(map[string]postDDLStalePlan, len(vp.postDDLStalePlans))
+ }
+ for name, stale := range vp.postDDLStalePlans {
+ state.postDDLStalePlans[name] = clonePostDDLStalePlan(stale)
+ }
+ }
+ state.postDDLConservative = state.postDDLConservative || vp.postDDLConservative
+ }
+ vp.serialMu.Unlock()
+
+ // After DDL events that may change schema or FK topology, force all
+ // remaining transactions in this relay fetch to serialize. The
+ // commitLoop will refresh FK metadata when the DDL commits, so the
+ // next relay fetch will have updated snapshots.
+ //
+ // If a previous DDL that was executed on the target changed the schema,
+ // force-serialize until the DDL-affected table's plan pointer has been
+ // replaced by a new FIELD event. We check by comparing the live
+ // tablePlans pointers to the snapshot taken at DDL time. When the
+ // affected table's plan is replaced (new *TablePlan from FIELD), the
+ // stale pointer no longer matches and the guard clears.
+ forceSerialize := false
+ if state.postDDLStalePlans != nil {
+ // Keep serializing until every affected table plan captured at DDL time
+ // has been replaced. Unrelated FIELD events must not clear the barrier.
+ vp.tablePlansMu.Lock()
+ resolvedStalePlans := resolvedPostDDLStalePlans(vp.tablePlans, state.postDDLDroppedTables, state.postDDLStalePlans)
+ if retireResolvedPostDDLTablePlans(vp.tablePlans, resolvedStalePlans) {
+ vp.tablePlansVersion.Add(1)
+ }
+ state.postDDLStalePlans = unresolvedPostDDLStalePlans(vp.tablePlans, state.postDDLDroppedTables, state.postDDLStalePlans)
+ vp.tablePlansMu.Unlock()
+ if state.postDDLStalePlans == nil {
+ state.postDDLStalePlans = nil
+ state.postDDLDroppedTables = nil
+ state.postDDLConservative = false
+ vp.serialMu.Lock()
+ vp.postDDLStalePlans = nil
+ vp.postDDLConservative = false
+ vp.serialMu.Unlock()
+ }
+ }
+ state.ddlSeen = false
+ var fkBatchingResolvedTables map[string]struct{}
+ fkBatchingResolvedVersion := int64(-1)
+ writesetCache := &txnWritesetCache{fieldIdxCache: state.fieldIdxCache}
+ getFKBatchingSnapshot := func() (map[string]*TablePlan, map[string]struct{}) {
+ planSnapshot := snapshotTablePlans(vp.tablePlansMu, vp.tablePlans, vp.tablePlansVersion, &state.cachedPlanVersion, state.cachedPlanSnapshot)
+ state.cachedPlanSnapshot = planSnapshot
+ if fkBatchingResolvedVersion == state.cachedPlanVersion {
+ return planSnapshot, fkBatchingResolvedTables
+ }
+ fkBatchingResolvedTables = buildResolvedFKRefTableSet(fkRefs, parentFKRefs, buildCanonicalTargetTableNames(planSnapshot))
+ fkBatchingResolvedVersion = state.cachedPlanVersion
+ return planSnapshot, fkBatchingResolvedTables
+ }
+
+ flush := func(commitOnly bool) error {
+ if len(state.curEvents) == 0 && !commitOnly {
+ return nil
+ }
+ order := vp.parallelOrder.Add(1)
+ lastTs, lastCT := computeLastEventTimestamp(state.curEvents)
+ payload := acquireApplyTxnPayload()
+ payload.pos = state.curPos
+ payload.timestamp = state.curTimestamp
+ payload.mustSave = state.curMustSave
+ payload.events = state.curEvents
+ payload.rowOnly = state.curRowOnly
+ payload.commitOnly = commitOnly
+ payload.updatePosOnly = false
+ payload.lastEventTimestamp = lastTs
+ payload.lastEventCurrentTime = lastCT
+ // query/commit/client are left nil here; the worker will
+ // set them to its own connection before sending to commitCh.
+ txn := acquireApplyTxn()
+ txn.order = order
+ if !state.batchMissingCommitMeta {
+ txn.sequenceNumber = state.curSequence
+ txn.commitParent = state.curCommitParent
+ txn.hasCommitMeta = state.curHasCommitMeta
+ }
+ txn.payload = payload
+ postDDLSerialize := state.postDDLStalePlans != nil && txnTouchesPostDDLBarrier(state.curEvents, state.postDDLStalePlans, state.postDDLConservative)
+ if forceSerialize {
+ txn.forceGlobal = true
+ } else if postDDLSerialize {
+ txn.forceGlobal = true
+ } else if state.curRowOnlySet && !state.curRowOnly {
+ txn.forceGlobal = true
+ } else if len(vp.copyState) != 0 {
+ txn.forceGlobal = true
+ } else if state.curHasFieldEvent && txnNeedsFieldRefreshSerialization(state.curEvents) {
+ txn.forceGlobal = true
+ } else if txnTouchesPendingFieldRefresh(state.curEvents, pendingFieldRefreshTables) {
+ txn.forceGlobal = true
+ } else {
+ planSnapshot := snapshotTablePlans(vp.tablePlansMu, vp.tablePlans, vp.tablePlansVersion, &state.cachedPlanVersion, state.cachedPlanSnapshot)
+ state.cachedPlanSnapshot = planSnapshot
+ if state.planFlagsVersion != state.cachedPlanVersion {
+ state.planHasExtraUniqueSecondary = false
+ state.planHasUnsupportedWritesetMapping = false
+ for _, plan := range planSnapshot {
+ if plan == nil {
+ continue
+ }
+ if plan.HasExtraUniqueSecondary {
+ state.planHasExtraUniqueSecondary = true
+ }
+ if plan.HasUnsupportedWritesetMapping {
+ state.planHasUnsupportedWritesetMapping = true
+ }
+ }
+ state.planFlagsVersion = state.cachedPlanVersion
+ }
+ extraUniqueTouched := state.planHasExtraUniqueSecondary && txnTouchesExtraUniqueSecondary(state.curEvents, planSnapshot)
+ unsupportedTouched := state.planHasUnsupportedWritesetMapping && txnTouchesUnsupportedWritesetMapping(state.curEvents, planSnapshot)
+ if extraUniqueTouched || unsupportedTouched {
+ txn.forceGlobal = true
+ } else {
+ // Invalidate fieldIdxCache when table plans change (new FIELD events).
+ if state.fieldIdxCacheVersion != state.cachedPlanVersion {
+ state.fieldIdxCache = make(map[string]map[string]int)
+ state.fieldIdxCacheVersion = state.cachedPlanVersion
+ writesetCache = &txnWritesetCache{fieldIdxCache: state.fieldIdxCache}
+ } else if writesetCache == nil {
+ writesetCache = &txnWritesetCache{fieldIdxCache: state.fieldIdxCache}
+ }
+ writeset, err := buildTxnWritesetWithCache(planSnapshot, fkRefs, parentFKRefs, state.curEvents, writesetCache)
+ if err != nil {
+ if writesetErrorForcesSerialization(err) {
+ txn.forceGlobal = true
+ } else {
+ releaseApplyTxn(txn)
+ return err
+ }
+ } else {
+ txn.writeset = writeset
+ }
+ }
+ }
+ // Attach any merged-away sequences to the txn so the scheduler can
+ // advance lastCommittedSequence for them when this batch actually
+ // commits (inside markCommitted), not now at enqueue time. Advancing
+ // at enqueue would satisfy commit-parent dependencies for later
+ // empty-writeset txns before the batch containing those sequences
+ // has actually committed.
+ if len(state.mergedSequences) > 0 {
+ txn.mergedSequences = append(txn.mergedSequences[:0], state.mergedSequences...)
+ state.mergedSequences = state.mergedSequences[:0]
+ }
+ if state.batchMissingCommitMeta && state.curHasCommitMeta && state.curSequence > 0 {
+ txn.mergedSequences = append(txn.mergedSequences, state.curSequence)
+ }
+ // Increment pendingFieldRefreshTables BEFORE scheduler.enqueue so the
+ // counter is visible to commitLoop's matching decrement (parallel_apply.go
+ // ~L2148-2160). Otherwise a worker could pick up this txn and commitLoop
+ // could observe an empty map (no-op decrement) before this increment
+ // runs, leaving the counter permanently stuck at 1 — every future ROW
+ // txn touching this table would then be force-serialized for the
+ // lifetime of the workflow.
+ // Skip the full event scan in the common rowOnly case; curHasFieldEvent
+ // already tracks whether any FIELD event was accumulated (the
+ // commitLoop side has the analogous map-emptiness guard).
+ var refreshedTables map[string]struct{}
+ if state.curHasFieldEvent {
+ refreshedTables = extractFieldRefreshTables(payload.events)
+ }
+ if len(refreshedTables) != 0 {
+ if pendingFieldRefreshTables == nil {
+ pendingFieldRefreshTables = make(map[string]int, len(refreshedTables))
+ }
+ vp.serialMu.Lock()
+ if vp.pendingFieldRefreshTables == nil {
+ vp.pendingFieldRefreshTables = make(map[string]int, len(refreshedTables))
+ }
+ for tableName := range refreshedTables {
+ pendingFieldRefreshTables[tableName]++
+ vp.pendingFieldRefreshTables[tableName]++
+ }
+ vp.serialMu.Unlock()
+ }
+ if err := scheduler.enqueue(txn); err != nil {
+ // Roll back the increment so a transient enqueue error
+ // (scheduler closed during teardown, ctx cancellation) does not
+ // leave the table permanently force-serialized after restart.
+ if len(refreshedTables) != 0 {
+ vp.serialMu.Lock()
+ for tableName := range refreshedTables {
+ pendingFieldRefreshTables[tableName]--
+ if pendingFieldRefreshTables[tableName] <= 0 {
+ delete(pendingFieldRefreshTables, tableName)
+ }
+ if remaining := vp.pendingFieldRefreshTables[tableName] - 1; remaining > 0 {
+ vp.pendingFieldRefreshTables[tableName] = remaining
+ } else {
+ delete(vp.pendingFieldRefreshTables, tableName)
+ }
+ }
+ vp.serialMu.Unlock()
+ }
+ // Return the unsent txn to the pool, matching the DDL/OTHER/JOURNAL
+ // and enqueueCommitOnly paths: a retry storm must not defeat the
+ // pool by leaking one applyTxn + payload per failed enqueue.
+ releaseApplyTxn(txn)
+ return err
+ }
+ // Pre-allocate with capacity 16 to avoid the nil→1→2→4→8 growth
+ // pattern on the hot path. We can't reuse the old slice via [:0]
+ // because the payload still references the backing array.
+ state.curEvents = make([]*binlogdatapb.VEvent, 0, 16)
+ state.curRowOnly = false
+ state.curRowOnlySet = false
+ state.curMustSave = false
+ state.curTimestamp = 0
+ state.curCommitParent = 0
+ state.curSequence = 0
+ state.curHasCommitMeta = false
+ state.curHasFieldEvent = false
+ state.batchMissingCommitMeta = false
+ state.batchedCommitCount = 0
+ state.lastFlushTime = time.Now()
+ return nil
+ }
+
+ for i := range items {
+ for j := 0; j < len(items[i]); j++ {
+ event := items[i][j]
+ switch event.Type {
+ case binlogdatapb.VEventType_GTID:
+ pos, err := binlogplayer.DecodePosition(event.Gtid)
+ if err != nil {
+ return err
+ }
+ state.curPos = pos
+ state.curCommitParent = event.CommitParent
+ state.curSequence = event.SequenceNumber
+ state.curHasCommitMeta = event.SequenceNumber != 0 || event.CommitParent != 0
+ if !state.curHasCommitMeta {
+ state.batchMissingCommitMeta = true
+ }
+ vp.serialMu.Lock()
+ vp.pos = pos
+ vp.unsavedEvent = nil
+ vp.serialMu.Unlock()
+ case binlogdatapb.VEventType_ROW:
+ state.curEvents = append(state.curEvents, event)
+ if !state.curRowOnlySet {
+ state.curRowOnly = true
+ state.curRowOnlySet = true
+ }
+ case binlogdatapb.VEventType_COMMIT:
+ posReached := stopPosReached(state.curPos)
+ state.curMustSave = posReached
+ if !txnNeedsWorker(state.curEvents) {
+ if state.curMustSave {
+ eventCopy := event
+ if err := vp.enqueueCommitOnly(ctx, scheduler, eventCopy, true, true, state.curSequence, state.curCommitParent, state.curHasCommitMeta); err != nil {
+ return err
+ }
+ return io.EOF
+ }
+
+ now := time.Now()
+ queuePositionSave := false
+ vp.serialMu.Lock()
+ if time.Since(vp.timeLastSaved) >= idleTimeout {
+ vp.timeLastSaved = now
+ queuePositionSave = true
+ }
+ vp.serialMu.Unlock()
+ if queuePositionSave {
+ state.lastHeartbeatRefresh = now
+ eventCopy := event
+ if err := vp.enqueueCommitOnly(ctx, scheduler, eventCopy, true, true, state.curSequence, state.curCommitParent, state.curHasCommitMeta); err != nil {
+ return err
+ }
+ } else {
+ // During catch-up, a stream may continuously process
+ // empty transactions (from other shards' data) that
+ // keep the scheduleLoop busy, so the idle timeout at
+ // the top of the loop never fires. Periodically refresh
+ // time_updated directly via SQL to keep
+ // max_v_replication_lag fresh until the next ordered
+ // position save is queued.
+ needRefresh := time.Since(state.lastHeartbeatRefresh) >= idleTimeout
+ if needRefresh {
+ state.lastHeartbeatRefresh = now
+ vp.serialMu.Lock()
+ err := vp.vr.updateHeartbeatTime(now.Unix())
+ vp.serialMu.Unlock()
+ if err != nil {
+ return err
+ }
+ }
+ vp.serialMu.Lock()
+ vp.unsavedEvent = event
+ vp.serialMu.Unlock()
+ // Advance lastCommittedSequence immediately only when the
+ // empty transaction stays on the unsavedEvent path.
+ // Queued position saves publish their sequence on commit.
+ if state.curHasCommitMeta {
+ scheduler.advanceCommittedSequence(state.curSequence)
+ }
+ }
+ // Advance lastCommittedSequence immediately for this empty
+ // transaction. Empty txns have no data effects, so their
+ // commit-parent dependency is trivially satisfied. Deferring
+ // the advance (via mergedSequences) would deadlock: a later
+ // txn with commitParent=thisSequence and empty writeset would
+ // block forever waiting for lastCommittedSequence to reach
+ // its commitParent, but the deferred sequence only publishes
+ // when that later txn commits — a circular dependency.
+ //
+ // markCommitted() uses max() for lastCommittedSequence, so
+ // this early advance cannot regress the watermark when a
+ // later txn commits with a lower sequence number.
+ state.curEvents = make([]*binlogdatapb.VEvent, 0, 16)
+ state.curRowOnly = false
+ state.curRowOnlySet = false
+ state.curHasFieldEvent = false
+ state.curMustSave = false
+ state.curTimestamp = 0
+ state.curCommitParent = 0
+ state.curSequence = 0
+ state.curHasCommitMeta = false
+ state.batchMissingCommitMeta = false
+ continue
+ }
+ // Group multiple consecutive transactions into a single batch
+ // to reduce the number of MySQL COMMITs. This mirrors the serial
+ // applier's hasAnotherCommit lookahead. If another COMMIT is
+ // ahead in this relay batch and we don't need to force-save,
+ // skip the flush and let events accumulate. The next GTID will
+ // update curPos/curSequence/curCommitParent and the accumulated
+ // events will be flushed as one larger transaction.
+ //
+ // Time-based bound: during heavy catch-up, heartbeats don't
+ // arrive to set curMustSave. Without a time bound, a single
+ // batch can grow for 30+ seconds, keeping time_updated stale
+ // and max_v_replication_lag stuck at 1+. Force a flush every
+ // 500ms to keep lag fresh.
+ if !state.lastFlushTime.IsZero() && time.Since(state.lastFlushTime) > 500*time.Millisecond {
+ state.curMustSave = true
+ }
+ // When the current transaction touches FK-related tables,
+ // skip batching to keep writesets small. Merging parent/
+ // child operations into one mega-transaction would make
+ // nearly all batches conflict on FK ref keys, serializing
+ // the workload. Flushing each source transaction
+ // individually lets the scheduler detect truly independent
+ // transactions and run them in parallel. Transactions on
+ // unrelated tables can still batch normally.
+ hasFKRefs := false
+ if len(fkRefs) > 0 || len(parentFKRefs) > 0 {
+ planSnapshot, resolvedFKRefTables := getFKBatchingSnapshot()
+ for _, ev := range state.curEvents {
+ if ev.Type != binlogdatapb.VEventType_ROW || ev.RowEvent == nil {
+ continue
+ }
+ tableName := ev.RowEvent.TableName
+ if plan := planSnapshot[tableName]; plan != nil {
+ tableName = plan.TargetName
+ }
+ if _, ok := resolvedFKRefTables[tableName]; ok {
+ hasFKRefs = true
+ break
+ }
+ }
+ }
+ // With parallel workers, limit the mega-transaction size
+ // to ensure enough transactions for all workers. Without
+ // this limit, all consecutive commits in a relay fetch
+ // merge into one mega-transaction, leaving all but one
+ // worker idle.
+ if state.maxBatchedCommits > 0 {
+ state.batchedCommitCount++
+ if state.batchedCommitCount >= state.maxBatchedCommits {
+ state.curMustSave = true
+ }
+ }
+ if !state.curMustSave && !hasFKRefs && hasAnotherCommit(items, i, j+1) {
+ // Track merged sequence numbers so they can be advanced
+ // when the batch actually commits. We must NOT advance
+ // lastCommittedSequence here because the batch hasn't
+ // committed yet. Empty-writeset transactions that depend
+ // on commit-parent ordering would otherwise become
+ // runnable too early.
+ if state.curHasCommitMeta {
+ state.mergedSequences = append(state.mergedSequences, state.curSequence)
+ }
+ // Reset only metadata — keep accumulated events and
+ // rowOnly state. The next GTID will set new metadata.
+ state.curCommitParent = 0
+ state.curSequence = 0
+ state.curHasCommitMeta = false
+ state.curMustSave = false
+ continue
+ }
+ if err := flush(false); err != nil {
+ return err
+ }
+ if posReached {
+ return io.EOF
+ }
+ case binlogdatapb.VEventType_BEGIN:
+ // No-op: BEGIN is handled on-demand by workers when they encounter
+ // ROW/FIELD events (via activeDBClient().Begin()). We intentionally
+ // do NOT add BEGIN to curEvents so that empty transactions
+ // (GTID→BEGIN→COMMIT) have curEvents=0 and take the fast path
+ // (unsavedEvent) instead of being enqueued through the scheduler.
+ case binlogdatapb.VEventType_FIELD:
+ // FIELD events carry table metadata (column definitions) and
+ // must be applied before the ROW events that follow them, but
+ // they are emitted routinely by MySQL at the start of each
+ // transaction — they do not indicate a schema change. The
+ // execution plan only actually changes after DDL, which
+ // already sets forceSerialize. Accumulate FIELD events like
+ // ROW events so they stay in the same applyTxn.
+ state.curEvents = append(state.curEvents, event)
+ state.curHasFieldEvent = true
+ case binlogdatapb.VEventType_INSERT,
+ binlogdatapb.VEventType_DELETE,
+ binlogdatapb.VEventType_UPDATE,
+ binlogdatapb.VEventType_REPLACE,
+ binlogdatapb.VEventType_SAVEPOINT:
+ // Statement-based DML events are supported for external MySQL
+ // streams. Keep them in the transaction payload, but classify the
+ // transaction as non-row-only so it serializes like the serial
+ // applier's statement path.
+ state.curEvents = append(state.curEvents, event)
+ state.curRowOnly = false
+ state.curRowOnlySet = true
+ case binlogdatapb.VEventType_DDL, binlogdatapb.VEventType_OTHER, binlogdatapb.VEventType_JOURNAL:
+ if err := flush(false); err != nil {
+ return err
+ }
+ posReached := stopPosReached(state.curPos)
+ order := vp.parallelOrder.Add(1)
+ vp.serialMu.Lock()
+ query := vp.query
+ commit := vp.commit
+ client := vp.dbClient
+ vp.serialMu.Unlock()
+ payload := acquireApplyTxnPayload()
+ payload.pos = state.curPos
+ payload.timestamp = event.Timestamp
+ payload.mustSave = true
+ payload.events = []*binlogdatapb.VEvent{event}
+ payload.rowOnly = false
+ payload.commitOnly = true
+ payload.updatePosOnly = false
+ payload.query = query
+ payload.commit = commit
+ payload.client = client
+ payload.lastEventTimestamp = event.Timestamp
+ payload.lastEventCurrentTime = event.CurrentTime
+ txn := acquireApplyTxn()
+ txn.order = order
+ txn.sequenceNumber = state.curSequence
+ txn.commitParent = state.curCommitParent
+ txn.hasCommitMeta = state.curHasCommitMeta
+ txn.forceGlobal = true
+ // OTHER events and DDL events with OnDdl=IGNORE only update the
+ // replication position — they never touch user table data. Marking
+ // them noConflict lets workers pick them up immediately without
+ // waiting for all inflight row transactions to drain first. The
+ // commitLoop still enforces strict ordering, so the position write
+ // happens after all prior commits. This eliminates the forceGlobal
+ // serialization stall that occurs during Online DDL cutover when the
+ // RENAME TABLE DDL event arrives while workers are still applying rows.
+ txn.noConflict = event.Type == binlogdatapb.VEventType_OTHER ||
+ (event.Type == binlogdatapb.VEventType_DDL && vp.vr.source.OnDdl == binlogdatapb.OnDDLAction_IGNORE)
+ txn.payload = payload
+ if err := scheduler.enqueue(txn); err != nil {
+ // Return the unsent txn to the pool so a retry storm
+ // (scheduler close + workflow restart in a tight loop)
+ // does not defeat the pool by leaking one applyTxn +
+ // payload per failed enqueue.
+ releaseApplyTxn(txn)
+ return err
+ }
+ // DDL that is actually executed on the target (EXEC, EXEC_IGNORE)
+ // may change schema or FK topology. Force all remaining
+ // transactions in this relay fetch to serialize so they don't
+ // use stale FK refs or table plans for writeset computation.
+ // Also set state.ddlSeen so the scheduleLoop waits for the
+ // commitLoop to refresh FK metadata before the next fetch.
+ // Record the current tablePlansVersion so that force-
+ // serialization persists until workers apply new FIELD events
+ // that bump the version past this snapshot.
+ //
+ // IGNORE and STOP DDLs do not modify the target schema, so
+ // they don't need the barrier. STOP terminates the workflow
+ // entirely. IGNORE just advances the position.
+ if event.Type == binlogdatapb.VEventType_DDL &&
+ (vp.vr.source.OnDdl == binlogdatapb.OnDDLAction_EXEC ||
+ vp.vr.source.OnDdl == binlogdatapb.OnDDLAction_EXEC_IGNORE) {
+ forceSerialize = true
+ state.ddlSeen = true
+ }
+ if posReached || journalTerminates(event) || ddlTerminates(event) {
+ return io.EOF
+ }
+ case binlogdatapb.VEventType_HEARTBEAT:
+ // Handle heartbeats inline without enqueuing through the scheduler.
+ // Heartbeats are very frequent (~250/sec) and enqueuing them as
+ // forceGlobal transactions serializes the entire pipeline, making
+ // VDiff sync cycles extremely slow.
+ //
+ // If we have accumulated events, force the next COMMIT to flush
+ // instead of continuing to batch. Without this, commit batching
+ // can create unbounded super-transactions during catch-up replay,
+ // starving time_updated refreshes and causing max_v_replication_lag
+ // to stay high indefinitely. Heartbeats arrive regularly from the
+ // source (~1/sec), providing a natural bound on batch size.
+ if len(state.curEvents) > 0 {
+ state.curMustSave = true
+ }
+ //
+ // Must hold serialMu for DB writes (updateTimeThrottled,
+ // recordHeartbeat) because they use vr.dbClient, which may
+ // also be in use by the commitLoop for commitOnly transactions.
+ if event.Throttled {
+ vp.serialMu.Lock()
+ err := vp.vr.updateTimeThrottled(throttlerapp.VStreamerName, event.ThrottledReason)
+ vp.serialMu.Unlock()
+ if err != nil {
+ return err
+ }
+ }
+ vp.serialMu.Lock()
+ vp.numAccumulatedHeartbeats++
+ err := vp.recordHeartbeat()
+ vp.serialMu.Unlock()
+ if err != nil {
+ return err
+ }
+ // Update lag from heartbeat timestamp.
+ if event.Timestamp != 0 && !event.Throttled {
+ tsNs := event.Timestamp * 1e9
+ now := time.Now().UnixNano()
+ offset := now - event.CurrentTime
+ vp.storeLagSnapshot(tsNs, offset)
+ lag := now - tsNs - offset
+ if lag >= 0 {
+ lagSecs := lag / 1e9
+ vp.vr.stats.ReplicationLagSeconds.Store(lagSecs)
+ vp.vr.stats.VReplicationLagGauges.Set(vp.idStr, lagSecs)
+ }
+ } else if event.Throttled {
+ // When the vstreamer is throttled, we can't determine the
+ // actual lag from the event. Estimate it from the last known
+ // timestamp, matching the serial applier's estimateLag().
+ snap := vp.loadLagSnapshot()
+ if snap.timestampNs > 0 {
+ behind := time.Now().UnixNano() - snap.timestampNs - snap.offsetNs
+ if behind >= 0 {
+ behindSecs := behind / 1e9
+ vp.vr.stats.ReplicationLagSeconds.Store(behindSecs)
+ vp.vr.stats.VReplicationLagGauges.Set(vp.idStr, behindSecs)
+ }
+ }
+ }
+ case binlogdatapb.VEventType_ROWS_QUERY:
+ // Informational only; keep it with the surrounding txn if present.
+ // vstreamer emits ROWS_QUERY ahead of the ROW events it describes,
+ // so this metadata must not force serialization on its own.
+ state.curEvents = append(state.curEvents, event)
+ case binlogdatapb.VEventType_VERSION:
+ // VERSION is informational only for the applier. Preserve the
+ // old serial behavior by ignoring it instead of failing the stream.
+ default:
+ return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "unsupported vevent type: %v", event.Type)
+ }
+ if event.Timestamp != 0 {
+ state.curTimestamp = event.Timestamp
+ }
+ }
+ }
+ return nil
+}
+
+// enqueueCommitOnly wraps a single event in a commitOnly transaction and hands
+// it to the scheduler. The payload captures the current vp.pos together with
+// the main connection's query/commit/client under serialMu; commitLoop applies
+// commitOnly transactions on that main connection rather than a worker's.
+func (vp *vplayer) enqueueCommitOnly(ctx context.Context, scheduler *applyScheduler, event *binlogdatapb.VEvent, mustSave bool, updatePosOnly bool, sequenceNumber int64, commitParent int64, hasCommitMeta bool) error {
+	var (
+		curPos     replication.Position
+		queryFn    func(ctx context.Context, sql string) (*sqltypes.Result, error)
+		commitFn   func() error
+		mainClient *vdbClient
+	)
+	// Reserve the commit order first, then snapshot the main-connection state.
+	seq := vp.parallelOrder.Add(1)
+	vp.serialMu.Lock()
+	curPos, queryFn, commitFn, mainClient = vp.pos, vp.query, vp.commit, vp.dbClient
+	vp.serialMu.Unlock()
+
+	p := acquireApplyTxnPayload()
+	p.events = []*binlogdatapb.VEvent{event}
+	p.pos = curPos
+	p.timestamp = event.Timestamp
+	p.lastEventTimestamp = event.Timestamp
+	p.lastEventCurrentTime = event.CurrentTime
+	p.mustSave = mustSave
+	p.updatePosOnly = updatePosOnly
+	p.commitOnly = true
+	p.rowOnly = false
+	p.query = queryFn
+	p.commit = commitFn
+	p.client = mainClient
+
+	t := acquireApplyTxn()
+	t.payload = p
+	t.order = seq
+	t.sequenceNumber = sequenceNumber
+	t.commitParent = commitParent
+	t.hasCommitMeta = hasCommitMeta
+	t.forceGlobal = true
+	t.noConflict = updatePosOnly
+	// The txn's pooled done channel goes unused for commitOnly work: workers
+	// pass such txns straight to commitCh and never wait on completion.
+	if err := scheduler.enqueue(t); err != nil {
+		// Hand the never-enqueued txn back to the pool so repeated enqueue
+		// failures do not leak one applyTxn per call.
+		releaseApplyTxn(t)
+		return err
+	}
+	return nil
+}
+
+// workerLoop runs on each of the N worker goroutines. It blocks on
+// scheduler.nextReady() until a transaction is dispatched. commitOnly txns
+// are forwarded to commitCh untouched; row txns are applied on the worker's
+// private MySQL connection and then sent to commitCh. Workers double-buffer
+// connections: after sending a txn the worker rotates to its spare connection
+// and starts the next txn, overlapping apply with the commitLoop's commit.
+func (vp *vplayer) workerLoop(ctx context.Context, scheduler *applyScheduler, commitCh chan<- *applyTxn, worker *applyWorker) error {
+ // Workers only apply ROW/FIELD/ROWS_QUERY events. Build a narrow local
+ // vplayer view once and refresh the DDL barrier snapshots per transaction
+ // under serialMu, instead of racing on a whole-struct shallow copy.
+ workerVP := workerLocalVPlayer(vp)
+
+ // pendingDone holds the done channel of the most recently sent worker
+ // transaction that the commitLoop may still be committing. We capture
+ // only the channel (not the *applyTxn) because the commitLoop returns
+ // the applyTxn to the pool after signaling done — if the scheduleLoop
+ // reacquires it, it drains the channel, which would cause waitPending
+ // to block forever if we were still dereferencing through the txn.
+ var pendingDone chan struct{}
+
+ waitPending := func() error {
+ if pendingDone == nil {
+ return nil
+ }
+ select {
+ case <-pendingDone:
+ pendingDone = nil
+ return nil
+ case <-ctx.Done():
+ return ctx.Err()
+ }
+ }
+
+ // Register a single ctx.AfterFunc for the lifetime of this worker.
+ // On ctx cancellation, close whichever client is currently executing
+ // MySQL calls — published via activeApplyClient. Registering per-txn
+ // allocates a new closure + runtime bookkeeping on every transaction;
+ // this hoists both to once per worker.
+ var activeApplyClient atomic.Pointer[vdbClient]
+ stopInterrupt := context.AfterFunc(ctx, func() {
+ if c := activeApplyClient.Load(); c != nil {
+ c.Close()
+ }
+ })
+ defer stopInterrupt()
+
+ // Hoist the OnDdl check: the source's OnDdl action is fixed for the
+ // lifetime of the workflow, so we compute once rather than per-txn.
+ // When DDL execution is disabled, the worker can skip the per-txn
+ // serialMu acquisition that clones postDDL bookkeeping (since those
+ // maps stay empty in that mode).
+ ddlExecEnabled := vp.vr.source.OnDdl == binlogdatapb.OnDDLAction_EXEC ||
+ vp.vr.source.OnDdl == binlogdatapb.OnDDLAction_EXEC_IGNORE
+
+ for {
+ if ctx.Err() != nil {
+ return ctx.Err()
+ }
+ txn, err := scheduler.nextReady(ctx)
+ if err != nil {
+ return err
+ }
+ payload := txn.payload
+ if payload.commitOnly {
+ // Forward commitOnly txns (DDL, OTHER, JOURNAL, position saves)
+ // to the commitLoop immediately without waiting for any pending
+ // worker commit. commitOnly work runs on the main connection, not
+ // the worker's connection, so it has no dependency on the prior
+ // row txn's commit. The commitLoop enforces strict ordering via
+ // nextOrder regardless of when the txn arrives in commitCh.
+ select {
+ case commitCh <- txn:
+ case <-ctx.Done():
+ return ctx.Err()
+ }
+ continue
+ }
+
+ // Apply events on the current active connection. This runs
+ // concurrently with the commitLoop committing the previous
+ // transaction on the other connection (double-buffering).
+ // Publish the current worker client so the worker-scoped
+ // context.AfterFunc can close it if ctx is cancelled.
+ activeApplyClient.Store(worker.client)
+ // DDL bookkeeping (postDDLStalePlans, postDDLDroppedTables) is only
+ // populated when OnDdl is EXEC or EXEC_IGNORE. In the default IGNORE
+ // mode, these maps stay empty for the workflow's lifetime, so we can
+ // skip the serialMu acquisition and per-txn clone entirely. Taking
+ // serialMu here on every worker txn was the dominant contention point
+ // under parallel apply on OnDdl=IGNORE workflows.
+ if ddlExecEnabled {
+ vp.serialMu.Lock()
+ workerVP.postDDLStalePlans = clonePostDDLStalePlans(vp.postDDLStalePlans)
+ workerVP.postDDLDroppedTables = cloneDroppedTables(vp.postDDLDroppedTables)
+ vp.serialMu.Unlock()
+ }
+ for _, event := range payload.events {
+ if err := worker.applyEvent(ctx, event, payload.mustSave, &workerVP); err != nil {
+ activeApplyClient.Store(nil)
+ worker.rollback()
+ if ctx.Err() != nil {
+ return ctx.Err()
+ }
+ return err
+ }
+ }
+ // In batch mode, flush all buffered SQL statements to MySQL in
+ // one multi-statement call. This is the key parallelism point:
+ // all workers execute their batches concurrently here, while the
+ // commitLoop only needs to do a cheap COMMIT + position update.
+ if err := worker.flushWorkerBatch(); err != nil {
+ activeApplyClient.Store(nil)
+ worker.rollback()
+ if ctx.Err() != nil {
+ return ctx.Err()
+ }
+ return err
+ }
+ activeApplyClient.Store(nil)
+
+ // Wait for the previous transaction's commit to complete. Because
+ // we waited AFTER applying the current transaction, the apply and
+ // commit phases overlapped — this is the key pipelining benefit.
+ // If the commit finished during our apply phase, this returns
+ // immediately. We must wait here because rotate() switches to the
+ // connection that the commitLoop was using for the previous txn.
+ if err := waitPending(); err != nil {
+ worker.rollback()
+ if ctx.Err() != nil {
+ return ctx.Err()
+ }
+ return err
+ }
+
+ // Capture the current connection for the payload before rotating.
+ // The commitLoop will use these to commit this transaction while
+ // the worker moves on to the next transaction on the spare connection.
+ //
+ // In batch mode we leave payload.query/commit nil and let commitLoop
+ // dispatch directly off payload.client via AddQueryToTrxBatch +
+ // CommitTrxQueryBatch. The commit still sends "UPDATE …;commit" in
+ // one multi-statement round-trip (the combine-commit win), but we
+ // avoid allocating two closures per mega-txn just to hold a reference
+ // to the worker's active connection.
+ payload.client = worker.client
+ if !worker.batchMode {
+ payload.query = worker.query
+ payload.commit = worker.commit
+ }
+
+ done := txn.done
+ select {
+ case commitCh <- txn:
+ case <-ctx.Done():
+ worker.rollback()
+ return ctx.Err()
+ }
+
+ // done was captured before the send above on purpose: once the txn is
+ // on commitCh, the commitLoop may return it to the pool after signaling
+ // done, and acquireApplyTxn drains the channel on reuse. Holding our
+ // own reference means we never read txn.done after that handoff.
+ pendingDone = done
+
+ // Rotate to the spare connection for the next transaction.
+ // The commitLoop will commit the current txn on the old connection
+ // and signal txn.done when it's safe to reuse.
+ worker.rotate()
+ }
+}
+
+// commitLoop receives transactions from workers via commitCh and commits them
+// in strict order (by the order field). Worker transactions get their position
+// update and commit executed on the worker's connection WITHOUT holding
+// serialMu, followed by a brief lock to update vp state. commitOnly
+// transactions are applied on the main connection under serialMu.
+func (vp *vplayer) commitLoop(ctx context.Context, scheduler *applyScheduler, commitCh <-chan *applyTxn) error {
+ // updateLag publishes lag from the payload's last event timestamp, else from the stored snapshot.
+ // NOTE(review): the fallback lacks the snap.timestampNs > 0 / behind >= 0 guards the HEARTBEAT path uses — confirm intended.
+ updateLag := func(payload *applyTxnPayload) {
+ if payload.lastEventTimestamp != 0 {
+ tsNs := payload.lastEventTimestamp * 1e9
+ now := time.Now().UnixNano()
+ offset := now - payload.lastEventCurrentTime
+ vp.storeLagSnapshot(tsNs, offset)
+ lag := now - tsNs - offset
+ if lag >= 0 {
+ lagSecs := lag / 1e9
+ vp.vr.stats.ReplicationLagSeconds.Store(lagSecs)
+ vp.vr.stats.VReplicationLagGauges.Set(vp.idStr, lagSecs)
+ return
+ }
+ }
+ snap := vp.loadLagSnapshot()
+ behind := time.Now().UnixNano() - snap.timestampNs - snap.offsetNs
+ behindSecs := behind / 1e9
+ vp.vr.stats.ReplicationLagSeconds.Store(behindSecs)
+ vp.vr.stats.VReplicationLagGauges.Set(vp.idStr, behindSecs)
+ }
+
+ // commitWorkerTxn handles a worker's row transaction. It executes the
+ // position update SQL, optional stop-state update, and commit on the
+ // worker's private MySQL connection WITHOUT holding serialMu. This avoids
+ // blocking the scheduleLoop during slow MySQL commits.
+ commitWorkerTxn := func(txn *applyTxn) error {
+ if ctx.Err() != nil {
+ return ctx.Err()
+ }
+ payload := txn.payload
+ dbClient := payload.client
+ if dbClient == nil {
+ dbClient = vp.activeDBClient()
+ }
+
+ // Worker batch-mode fast path: the worker set payload.client but left
+ // payload.query/commit nil so we wouldn't allocate a closure per
+ // mega-txn just to hold a reference to its connection. Use the client
+ // directly here. The AddQueryToTrxBatch + CommitTrxQueryBatch pair
+ // still sends "UPDATE _vt.vreplication …;commit" in a single
+ // multi-statement round-trip.
+ var posReached bool
+ if payload.client != nil && payload.query == nil && payload.commit == nil {
+ if err := payload.client.AddQueryToTrxBatch(vp.generateUpdatePosQuery(payload.pos, payload.timestamp)); err != nil {
+ return err
+ }
+ posReached = !vp.stopPos.IsZero() && payload.pos.AtLeast(vp.stopPos)
+ if posReached {
+ if err := vp.setStopPositionStateImmediate(dbClient); err != nil {
+ return err
+ }
+ }
+ if err := payload.client.CommitTrxQueryBatch(); err != nil {
+ return err
+ }
+ } else {
+ queryFn := payload.query
+ if queryFn == nil {
+ queryFn = vp.query
+ }
+ commitFn := payload.commit
+ if commitFn == nil {
+ commitFn = vp.commit
+ }
+ var err error
+ posReached, err = vp.updatePosWithoutStop(ctx, payload.pos, payload.timestamp, queryFn)
+ if err != nil {
+ return err
+ }
+ if posReached {
+ if err := vp.setStopPositionStateImmediate(dbClient); err != nil {
+ return err
+ }
+ }
+ if err := commitFn(); err != nil {
+ return err
+ }
+ }
+
+ // Briefly lock to update vp state that scheduleLoop reads.
+ // Do NOT clear vp.unsavedEvent here: a later empty transaction
+ // may have recorded a higher position that hasn't been flushed yet.
+ // The idle-timeout saver at the top of scheduleLoop will handle it.
+ vp.serialMu.Lock()
+ vp.recordPositionSave(payload.pos, false)
+ // Skip the per-commit FIELD refresh scan when neither map has entries:
+ // in the common ROW-only steady state extractFieldRefreshTables would
+ // otherwise do a full payload scan that returns nil on every call.
+ if len(vp.pendingFieldRefreshTables) != 0 || len(vp.postDDLDroppedTables) != 0 {
+ for refreshedName := range extractFieldRefreshTables(payload.events) {
+ if vp.pendingFieldRefreshTables != nil {
+ key := canonicalPostDDLTableKey(vp.pendingFieldRefreshTables, refreshedName)
+ if remaining := vp.pendingFieldRefreshTables[key] - 1; remaining > 0 {
+ vp.pendingFieldRefreshTables[key] = remaining
+ } else {
+ delete(vp.pendingFieldRefreshTables, key)
+ }
+ }
+ delete(vp.postDDLDroppedTables, canonicalPostDDLTableKey(vp.postDDLDroppedTables, refreshedName))
+ }
+ }
+ vp.serialMu.Unlock()
+
+ updateLag(payload)
+
+ // Release scheduler inflight state BEFORE signaling the worker. If
+ // markCommitted errors (scheduler closed during teardown), we want
+ // the commitLoop to observe the error and unwind rather than letting
+ // the worker race ahead to its next txn.
+ if err := scheduler.markCommitted(txn); err != nil {
+ return err
+ }
+
+ // Signal the worker that commit is done so it can reuse its
+ // DB connection for the next transaction.
+ txn.done <- struct{}{}
+
+ if posReached {
+ return io.EOF
+ }
+ return nil
+ }
+
+ // commitOnlyTxn handles commitOnly transactions (DDL, OTHER, JOURNAL,
+ // position-only saves). These run on the main connection under serialMu.
+ commitOnlyTxn := func(txn *applyTxn) error {
+ if ctx.Err() != nil {
+ return ctx.Err()
+ }
+ payload := txn.payload
+ dbClient := payload.client
+ if dbClient == nil {
+ dbClient = vp.activeDBClient()
+ }
+ vp.serialMu.Lock()
+ defer vp.serialMu.Unlock()
+
+ if payload.updatePosOnly {
+ savePos := payload.pos
+ if savePos.IsZero() {
+ savePos = vp.pos
+ }
+ posReached := !savePos.IsZero() && !vp.stopPos.IsZero() && savePos.AtLeast(vp.stopPos)
+ if posReached && vp.saveStop {
+ if err := dbClient.BeginImmediate(); err != nil {
+ return err
+ }
+ if _, err := dbClient.ExecuteFetch(vp.generateUpdatePosQuery(savePos, payload.timestamp), 1); err != nil {
+ return fmt.Errorf("error %v updating position", err)
+ }
+ if err := vp.setStopPositionStateImmediate(dbClient); err != nil {
+ return err
+ }
+ if err := dbClient.Commit(); err != nil {
+ return err
+ }
+ vp.recordPositionSave(savePos, false)
+ updateLag(payload)
+ if err := scheduler.markCommitted(txn); err != nil {
+ return err
+ }
+ return io.EOF
+ }
+
+ queryFn := payload.query
+ if queryFn == nil {
+ queryFn = vp.query
+ }
+
+ posReached, err := vp.updatePosWithoutStop(ctx, savePos, payload.timestamp, queryFn)
+ if err != nil {
+ return err
+ }
+ if payload.timestamp == 0 {
+ if err := vp.vr.updateHeartbeatTime(time.Now().Unix()); err != nil {
+ return err
+ }
+ }
+ vp.recordPositionSave(savePos, false)
+ if posReached {
+ if err := vp.setStopPositionState(dbClient); err != nil {
+ return err
+ }
+ }
+ updateLag(payload)
+ if err := scheduler.markCommitted(txn); err != nil {
+ return err
+ }
+ if posReached {
+ return io.EOF
+ }
+ return nil
+ }
+
+ // Temporarily swap pos for the main connection's updatePos call.
+ prevPos := vp.pos
+ if !payload.pos.IsZero() {
+ vp.pos = payload.pos
+ }
+ defer func() { vp.pos = prevPos }()
+
+ // applyEvent handles position updates internally for DDL, OTHER,
+ // and JOURNAL events, and returns io.EOF when the stop position
+ // is reached or when a JOURNAL forces termination. We therefore
+ // do NOT call updatePos again below — doing so would produce a
+ // redundant _vt.vreplication write and create an awkward
+ // partial-failure window where applyEvent succeeded but a second
+ // position write could fail.
+ event := payload.events[0]
+ ddlExecuted := false
+ publishExecIgnoreDDLBarrier := false
+ var terminalErr error
+ if event.Type == binlogdatapb.VEventType_DDL {
+ var err error
+ ddlExecuted, err = vp.applyDDLEvent(ctx, event, nil)
+ if err != nil {
+ if !errors.Is(err, io.EOF) {
+ return err
+ }
+ terminalErr = err
+ }
+ if !ddlExecuted && vp.vr.source.OnDdl == binlogdatapb.OnDDLAction_EXEC_IGNORE {
+ publishExecIgnoreDDLBarrier, err = shouldPublishExecIgnoreDDLBarrier(ctx, vp, event.Statement)
+ if err != nil {
+ return vterrors.Wrapf(err, "failed to inspect EXEC_IGNORE DDL target schema")
+ }
+ }
+ } else if err := vp.applyEvent(ctx, event, payload.mustSave); err != nil {
+ if !errors.Is(err, io.EOF) {
+ return err
+ }
+ terminalErr = err
+ }
+ // After EXEC DDLs and all EXEC_IGNORE DDLs, refresh FK metadata so
+ // that ADD/DROP FOREIGN KEY changes are reflected in subsequent
+ // writeset conflict detection. EXEC_IGNORE still advances the stream
+ // position after a statement error, and that error can mean the
+ // target is already in the post-DDL FK state (for example, dropping
+ // a foreign key that's already gone). Continuing with the old FK
+ // cache would silently use stale conflict metadata.
+ //
+ // We hold serialMu for the DB round-trip here. DDL is rare, and
+ // the main connection must not be used concurrently by scheduleLoop.
+ // Fail fast on refresh errors: stale FK topology after a schema
+ // change would silently compromise conflict detection.
+ if event.Type == binlogdatapb.VEventType_DDL && (ddlExecuted || vp.vr.source.OnDdl == binlogdatapb.OnDDLAction_EXEC_IGNORE) {
+ newRefs, err := queryFKRefs(vp.vr.dbClient, vp.vr.dbClient.DBName())
+ if err != nil {
+ return vterrors.Wrapf(err, "failed to refresh FK metadata after DDL")
+ }
+ vp.fkRefs = newRefs
+ vp.parentFKRefs = buildParentFKRefs(newRefs)
+ }
+ if event.Type == binlogdatapb.VEventType_DDL && (ddlExecuted || publishExecIgnoreDDLBarrier) {
+ vp.tablePlansMu.RLock()
+ renameTargets := extractDDLRenameTargets(event.Statement, vp.vr.vre.env.Parser())
+ retargetPostDDLStalePlans(vp.postDDLStalePlans, renameTargets, vp.tablePlans)
+ ddlStalePlans, conservative := extractDDLAffectedTables(event.Statement, vp.vr.vre.env.Parser(), vp.tablePlans, vp.postDDLDroppedTables)
+ ddlStalePlans = unresolvedPostDDLStalePlans(vp.tablePlans, vp.postDDLDroppedTables, ddlStalePlans)
+ vp.tablePlansMu.RUnlock()
+ vp.postDDLStalePlans = mergePostDDLStalePlans(vp.postDDLStalePlans, ddlStalePlans)
+ vp.postDDLConservative = vp.postDDLConservative || conservative
+ vp.postDDLDroppedTables = mergeDroppedTables(vp.postDDLDroppedTables, extractDroppedTables(event.Statement, vp.vr.vre.env.Parser()))
+ }
+ updateLag(payload)
+ if err := scheduler.markCommitted(txn); err != nil {
+ return err
+ }
+ return terminalErr
+ }
+
+ // commitTxn dispatches on payload.commitOnly to the matching commit path.
+ commitTxn := func(txn *applyTxn) error {
+ if txn.payload.commitOnly {
+ return commitOnlyTxn(txn)
+ }
+ return commitWorkerTxn(txn)
+ }
+
+ pending := make(map[int64]*applyTxn)
+ nextOrder := int64(1)
+
+ // On error exit, release all remaining pending entries to return pool
+ // objects that would otherwise be leaked. We intentionally do NOT
+ // signal txn.done here: workers unblock via ctx.Done() instead (the
+ // caller cancels the context on commitLoop error). Signaling done
+ // would tell the worker its old connection is safe to reuse, but
+ // the commit may have failed leaving the connection in a dirty state.
+ defer func() {
+ for _, txn := range pending {
+ releaseApplyTxn(txn)
+ }
+ }()
+
+ // drainPending commits consecutive pending txns starting at nextOrder, stopping at the first gap.
+ drainPending := func() error {
+ for {
+ next := pending[nextOrder]
+ if next == nil {
+ break
+ }
+ delete(pending, nextOrder)
+ if err := commitTxn(next); err != nil {
+ // Re-add the failed txn so the deferred cleanup releases it to
+ // the pool (that cleanup deliberately does not signal done).
+ pending[nextOrder] = next
+ return err
+ }
+ releaseApplyTxn(next)
+ nextOrder++
+ }
+ return nil
+ }
+
+ for {
+ select {
+ case txn, ok := <-commitCh:
+ if !ok {
+ // The commit channel has been closed so we cannot add anything else.
+ // We only need to drain any already pending transactions.
+ if err := drainPending(); err != nil {
+ return err
+ }
+ if len(pending) > 0 {
+ return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "parallel apply commit missing order: pending=%d next=%d", len(pending), nextOrder)
+ }
+ return nil
+ }
+ if ctx.Err() != nil {
+ return ctx.Err()
+ }
+ if txn.order == 0 {
+ // All production enqueue paths assign order via
+ // vp.parallelOrder.Add(1), which is monotonic and starts at 1, so
+ // order 0 means a regression introduced an unordered txn. Silently
+ // committing it would bypass strict ordering and break the monotonic
+ // _vt.vreplication.pos invariant; fail fast so the workflow restarts.
+ return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "parallel apply commit txn missing order: payload=%+v", txn.payload)
+ }
+ // Add the new transaction to be committed and then drain all pending ones.
+ pending[txn.order] = txn
+ if err := drainPending(); err != nil {
+ return err
+ }
+ case <-ctx.Done():
+ return ctx.Err()
+ }
+ }
+}
diff --git a/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_scheduler.go b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_scheduler.go
new file mode 100644
index 00000000000..8596ddb8966
--- /dev/null
+++ b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_scheduler.go
@@ -0,0 +1,591 @@
+/*
+Copyright 2026 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package vreplication
+
+import (
+ "context"
+ "errors"
+ "io"
+ "sync"
+)
+
+// errSchedulerAbandonedPendingWork is returned by nextReady when the scheduler is
+// closed with pending work that can never become ready because nothing is inflight
+// to advance the scheduler's state (lastCommittedSequence, writeset, inflight*
+// counters). It is deliberately not io.EOF, which nextReady reserves for a closed
+// scheduler whose queue is empty: workers surface this error so the controller retries
+// the stream from the last saved position instead of treating it as a clean finish.
+var errSchedulerAbandonedPendingWork = errors.New("parallel apply scheduler closed with unreachable pending transactions")
+
+type applyTxn struct {
+ // order is the monotonically increasing sequence number assigned by scheduleLoop;
+ // the commitLoop commits in strict order so that the position saved to _vt.vreplication
+ // only moves forward. Zero means unordered: the order window and lastCommittedOrder skip it.
+ order int64
+ // sequenceNumber is the source MySQL binlog sequence number from the GTID event.
+ // markCommitted raises lastCommittedSequence to it, but only when hasCommitMeta is set.
+ sequenceNumber int64
+ // commitParent is the source MySQL commit parent from the GTID event.
+ // When the writeset is empty, the scheduler falls back to commit-parent
+ // ordering: the transaction is ready only when commitParent <=
+ // lastCommittedSequence.
+ commitParent int64
+ // hasCommitMeta is true when the GTID event carried non-zero
+ // sequenceNumber or commitParent. Transactions with and without
+ // commit metadata are never run concurrently (safety boundary).
+ hasCommitMeta bool
+ // forceGlobal is true for transactions that must serialize with
+ // everything: non-row-only transactions (DDL, FIELD, OTHER, JOURNAL)
+ // and copy-phase transactions.
+ forceGlobal bool
+ // noConflict is true for position-only saves and certain pass-through
+ // events (OTHER, ignored DDL). These bypass all conflict checking and
+ // are always ready, preventing deadlocks where an earlier-order
+ // position save is blocked by later-order inflight data transactions.
+ noConflict bool
+ // writeset holds xxhash digests of PK-based keys (e.g. hash of "table:pk1,pk2").
+ // Using uint64 hashes instead of strings eliminates per-txn heap allocations
+ // in the scheduler hot path, reducing GC pressure at high TPS.
+ writeset []uint64
+ // mergedSequences tracks sequence numbers of source transactions that
+ // were merged into this batched mega-transaction. They must be advanced
+ // in lastCommittedSequence only after this txn actually commits, so that
+ // later empty-writeset transactions whose commitParent references one of
+ // these sequences don't become runnable before the batch commits.
+ mergedSequences []int64
+ // payload carries the transaction's events and DB connection info.
+ // Pooled via applyTxnPayloadPool to reduce allocations.
+ payload *applyTxnPayload
+ // done is a buffered channel (cap 1) used to synchronize the commitLoop
+ // with the worker that applied this transaction. The commitLoop sends on
+ // done after committing, unblocking the worker to reuse its DB connection.
+ // Always freshly allocated by acquireApplyTxn; commitOnly transactions
+ // carry one too but never use it (workers don't wait on them).
+ done chan struct{}
+}
+
+type applyScheduler struct {
+ // ctx is the parent context for the parallel applier. When cancelled, blocked nextReady,
+ // waitForIdle, and enqueue (order-window) calls are woken and return its error.
+ ctx context.Context
+
+ mu sync.Mutex
+ cond *sync.Cond
+ // orderCond is a dedicated condition for the enqueue backpressure wait
+ // (maxOutstandingOrders). The shared cond's Signal in markCommitted can
+ // land on an idle worker that consumes the wakeup without re-signaling,
+ // leaving the scheduleLoop asleep until the pipeline fully drains (the
+ // allDrained Broadcast backstop). A dedicated cond makes the order-window
+ // wakeup deterministic.
+ orderCond *sync.Cond
+
+ // pending is the queue of transactions waiting to be dispatched to
+ // workers. Entries are set to nil when consumed; pendingOff tracks
+ // how far into the slice consumed entries extend, and the slice is
+ // compacted once pendingOff reaches half of the slice's length.
+ pending []*applyTxn
+ pendingOff int // offset into pending slice; entries before this index are consumed
+ pendingCount int // number of live (non-nil) entries in pending
+ // lastCommittedSequence is the highest source MySQL sequence number
+ // that has been committed. Used for commit-parent ordering: a
+ // transaction whose writeset is empty is ready only when its
+ // commitParent <= lastCommittedSequence.
+ lastCommittedSequence int64
+ // lastCommittedOrder is the highest transaction order number that has been
+ // committed; besides diagnostics, enqueue's maxOutstandingOrders window is measured from it.
+ lastCommittedOrder int64
+ // maxOutstandingOrders caps how many ordered transactions may exist ahead
+ // of durable commit progress. Zero disables the cap.
+ maxOutstandingOrders int64
+
+ // inflightWriteset maps writeset key hashes to reference counts.
+ // A transaction is blocked if any of its writeset keys are present
+ // in this map with count > 0.
+ inflightWriteset map[uint64]int
+ // inflightGlobal counts inflight forceGlobal transactions and
+ // no-metadata-no-writeset transactions. When > 0, all non-noConflict
+ // transactions are blocked.
+ inflightGlobal int
+ // inflightMissingMeta counts inflight transactions that lack commit
+ // metadata. When > 0, hasCommitMeta transactions are blocked to
+ // maintain the safety boundary between metadata modes.
+ inflightMissingMeta int
+ // inflightCommitMeta counts inflight (non-forceGlobal) transactions that have
+ // commit metadata. When > 0, no-metadata, forceGlobal, and empty-writeset
+ // commit-metadata transactions must wait (no mixing of metadata modes).
+ inflightCommitMeta int
+ // inflightNoConflict counts dispatched-but-uncommitted noConflict
+ // transactions. They do not participate in conflict checking, but the
+ // abandoned-pending-work check must not fire while one is in flight:
+ // its markCommitted can advance lastCommittedSequence and unblock the
+ // pending head.
+ inflightNoConflict int
+
+ // closed is set by close() to signal that no more transactions will
+ // be enqueued. nextReady checks this to return io.EOF instead of
+ // blocking forever on cond.Wait after the scheduler is shut down.
+ closed bool
+}
+
+// newApplyScheduler builds a scheduler bound to ctx. Both condition variables
+// share the scheduler mutex. A watcher goroutine parks until ctx is cancelled
+// and then broadcasts on cond and orderCond, so goroutines blocked in
+// nextReady, waitForIdle, or enqueue's order-window wait wake up and observe
+// the cancellation. The watcher only exits once ctx is done.
+func newApplyScheduler(ctx context.Context) *applyScheduler {
+	sched := &applyScheduler{ctx: ctx}
+	sched.inflightWriteset = map[uint64]int{}
+	sched.cond = sync.NewCond(&sched.mu)
+	sched.orderCond = sync.NewCond(&sched.mu)
+
+	wakeAllOnCancel := func() {
+		<-ctx.Done()
+		sched.mu.Lock()
+		sched.cond.Broadcast()
+		sched.orderCond.Broadcast()
+		sched.mu.Unlock()
+	}
+	go wakeAllOnCancel()
+	return sched
+}
+
+// enqueue appends txn to the pending queue and wakes a single waiting worker.
+// It returns the scheduler context's error if that context is done, or io.EOF
+// if the scheduler has been closed. While the order window is full
+// (maxOutstandingOrders > 0 and txn.order is more than that many orders ahead
+// of lastCommittedOrder) it blocks on orderCond, re-checking both conditions
+// after every wakeup. When a commit-metadata txn with a positive commitParent
+// arrives at a scheduler with nothing pending, no inflight counters set and
+// lastCommittedSequence still zero, lastCommittedSequence is seeded from its
+// commitParent so later commit-parent checks have a baseline.
+func (s *applyScheduler) enqueue(txn *applyTxn) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// admit reports why new work must be refused, if at all. Cancellation
+	// takes precedence over an orderly close.
+	admit := func() error {
+		if err := s.ctx.Err(); err != nil {
+			return err
+		}
+		if s.closed {
+			return io.EOF
+		}
+		return nil
+	}
+
+	if err := admit(); err != nil {
+		return err
+	}
+	for s.maxOutstandingOrders > 0 && txn.order > 0 && txn.order-s.lastCommittedOrder > s.maxOutstandingOrders {
+		s.orderCond.Wait()
+		if err := admit(); err != nil {
+			return err
+		}
+	}
+
+	nothingTracked := s.pendingCount == 0 && s.inflightGlobal == 0 && s.inflightMissingMeta == 0 && s.inflightCommitMeta == 0
+	if nothingTracked && s.lastCommittedSequence == 0 && txn.hasCommitMeta && txn.commitParent > 0 {
+		s.lastCommittedSequence = txn.commitParent
+	}
+
+	s.pendingCount++
+	s.pending = append(s.pending, txn)
+	// One new transaction can be taken by at most one worker, so Signal is
+	// enough; Broadcast would wake every worker just to rescan the queue.
+	s.cond.Signal()
+	return nil
+}
+
+// nextReady blocks until a pending transaction passes the readiness check, then removes
+// it from the queue, marks it inflight, and returns it to the calling worker. It returns
+// the context error if ctx or the scheduler's ctx is done, io.EOF once the scheduler is closed with
+// nothing pending, and errSchedulerAbandonedPendingWork if closed with unreachable pending work.
+func (s *applyScheduler) nextReady(ctx context.Context) (*applyTxn, error) {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ for {
+ if err := ctx.Err(); err != nil {
+ return nil, err
+ }
+ if err := s.ctx.Err(); err != nil {
+ return nil, err
+ }
+ txn := s.popReadyLocked()
+ if txn != nil {
+ s.markInflightLocked(txn)
+ // Pass the baton: one wakeup (e.g. a markCommitted that released
+ // a multi-key writeset) can make several pending transactions
+ // ready at once, but each waiter pops at most one. Signal the
+ // next waiter while pending work remains so independent ready
+ // transactions dispatch immediately instead of waiting for the
+ // next commit event.
+ if s.pendingCount > 0 {
+ s.cond.Signal()
+ }
+ return txn, nil
+ }
+ // Check closed only after attempting to drain any queued work so
+ // transactions already scheduled before shutdown still commit.
+ if s.closed {
+ if s.pendingCount == 0 {
+ return nil, io.EOF
+ }
+ // A closed scheduler may still have blocked pending work that
+ // becomes ready only after an inflight txn commits — in that
+ // case we keep waiting so the blocked pending txns unblock.
+ // But if nothing is inflight AND no pending txn is ready,
+ // nothing will ever advance lastCommittedSequence or release
+ // writeset/inflight counters, so workers would park forever.
+ // Return a non-EOF error so the controller retries the stream
+ // from the last saved position instead of silently abandoning
+ // the pending work.
+ if s.inflightGlobal == 0 && s.inflightMissingMeta == 0 && s.inflightCommitMeta == 0 && len(s.inflightWriteset) == 0 && s.inflightNoConflict == 0 {
+ return nil, errSchedulerAbandonedPendingWork
+ }
+ }
+ s.cond.Wait()
+ }
+}
+
+// markCommitted advances lastCommittedSequence/lastCommittedOrder and releases txn's inflight
+// state, then wakes waiters: Broadcast when several may unblock (forceGlobal release, a global/
+// missingMeta counter reaching zero, or all drained), else Signal. If the scheduler ctx is done it returns that error and changes nothing.
+func (s *applyScheduler) markCommitted(txn *applyTxn) error {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+ if err := s.ctx.Err(); err != nil {
+ return err
+ }
+ if txn.hasCommitMeta && txn.sequenceNumber > s.lastCommittedSequence {
+ s.lastCommittedSequence = txn.sequenceNumber
+ }
+ // Advance any sequences that were batched (merged away) into this txn.
+ // These represent transactions whose events were merged into this batch
+ // but whose GTID sequence numbers must still become visible in
+ // lastCommittedSequence so that later empty-writeset commit-parent
+ // dependents can unblock. Doing this here (after commit) instead of at
+ // enqueue time preserves the invariant that commit-parent dependencies
+ // are only satisfied after the parent has actually committed.
+ for _, seq := range txn.mergedSequences {
+ if seq > s.lastCommittedSequence {
+ s.lastCommittedSequence = seq
+ }
+ }
+ if txn.order > 0 && txn.order > s.lastCommittedOrder {
+ s.lastCommittedOrder = txn.order
+ // Wake the scheduleLoop if it is blocked on the order window; only
+ // commits advance lastCommittedOrder, so this is the only wake site.
+ s.orderCond.Signal()
+ }
+ // Track pre-release state to decide between Signal and Broadcast.
+ wasForceGlobal := txn.forceGlobal
+ hadInflightGlobal := s.inflightGlobal > 0
+ hadInflightMissingMeta := s.inflightMissingMeta > 0
+ s.releaseInflightLocked(txn)
+ // Use Broadcast when releasing a forceGlobal txn, when a global/
+ // missingMeta counter drops to zero, or when all inflight work has
+ // drained (so waitForIdle waiters are woken). Otherwise use Signal
+ // to avoid thundering-herd wakeup of N workers when only one txn
+ // can proceed.
+ allDrained := s.inflightGlobal == 0 && s.inflightMissingMeta == 0 && s.inflightCommitMeta == 0 && len(s.inflightWriteset) == 0
+ if wasForceGlobal ||
+ (hadInflightGlobal && s.inflightGlobal == 0) ||
+ (hadInflightMissingMeta && s.inflightMissingMeta == 0) ||
+ allDrained {
+ s.cond.Broadcast()
+ } else {
+ s.cond.Signal()
+ }
+ return nil
+}
+
+// popReadyLocked walks the pending queue from pendingOff and removes and
+// returns the first transaction that may be dispatched, or nil if there is
+// none. Must be called with s.mu held.
+//
+// Ordered (non-noConflict) work is strictly head-first: as soon as one ordered
+// transaction is found blocked, every later ordered transaction is skipped
+// without being evaluated. Dispatching a later ordered transaction could
+// deadlock, because its inflight state might keep the earlier one blocked
+// while the commitLoop, which commits in strict order, waits for the earlier
+// one. noConflict transactions (position-only saves, OTHER/IGNORE stops) are
+// still considered behind a blocked head so they can reach the commitLoop.
+func (s *applyScheduler) popReadyLocked() *applyTxn {
+	headBlocked := false
+	for idx := s.pendingOff; idx < len(s.pending); idx++ {
+		candidate := s.pending[idx]
+		switch {
+		case candidate == nil:
+			// Slot already consumed.
+			continue
+		case headBlocked && !candidate.noConflict:
+			// Ordered work behind a blocked ordered head must wait its turn.
+			continue
+		case s.isReadyLocked(candidate):
+			s.removePendingLocked(idx)
+			return candidate
+		case !candidate.noConflict:
+			// First blocked ordered transaction: from here on only
+			// noConflict entries are eligible.
+			headBlocked = true
+		}
+	}
+	return nil
+}
+
+// removePendingLocked consumes the entry at index i: the slot is nil-ed out
+// rather than spliced away, so removal never shifts the rest of the queue.
+// pendingOff is then advanced past any leading nil slots. When pendingOff has
+// reached at least half of the slice length, the remaining entries are moved
+// to the front and the slice is truncated. Finally, if the backing array has
+// more than 64 slots and is more than four times the slice length, the queue
+// is copied into a smaller allocation. Must be called with s.mu held.
+func (s *applyScheduler) removePendingLocked(i int) {
+	s.pendingCount--
+	s.pending[i] = nil
+
+	// Slide the head offset over every consumed slot at the front.
+	for s.pendingOff < len(s.pending) && s.pending[s.pendingOff] == nil {
+		s.pendingOff++
+	}
+
+	// Compact: shift the tail down and zero the vacated slots so the GC can
+	// reclaim whatever they pointed to.
+	if off := s.pendingOff; off > 0 && off >= len(s.pending)/2 {
+		kept := copy(s.pending, s.pending[off:])
+		clear(s.pending[kept:])
+		s.pending = s.pending[:kept]
+		s.pendingOff = 0
+	}
+
+	// Shrink: do not hold on to a burst-sized backing array indefinitely.
+	if length, capacity := len(s.pending), cap(s.pending); capacity > 64 && capacity > 4*length {
+		shrunk := make([]*applyTxn, length, 2*length+1)
+		copy(shrunk, s.pending)
+		s.pending = shrunk
+	}
+}
+
+// isReadyLocked reports whether txn can be dispatched given the current inflight state. Checks run
+// in order: noConflict is always ready; any inflightGlobal blocks everything else; forceGlobal needs
+// every other conflict counter at zero; then commit-metadata and no-metadata txns follow the writeset rules below.
+func (s *applyScheduler) isReadyLocked(txn *applyTxn) bool {
+ // noConflict transactions (e.g., position-only saves) are always ready.
+ // They have no data conflicts and must not block or be blocked by other
+ // transactions. This prevents deadlocks where forceGlobal position saves
+ // (with earlier orders) are blocked by inflight data transactions (with
+ // later orders), while the commitLoop waits for those earlier orders.
+ if txn.noConflict {
+ return true
+ }
+ if s.inflightGlobal > 0 {
+ return false
+ }
+ if txn.forceGlobal {
+ ready := s.inflightMissingMeta == 0 && s.inflightCommitMeta == 0 && len(s.inflightWriteset) == 0
+ return ready
+ }
+ if txn.hasCommitMeta {
+ if s.inflightMissingMeta > 0 {
+ return false
+ }
+ for _, key := range txn.writeset {
+ if s.inflightWriteset[key] > 0 {
+ return false
+ }
+ }
+ // When the transaction has a non-empty writeset, we use writeset-only
+ // conflict detection and skip the commit-parent dependency check. This
+ // is critical because the source MySQL may use COMMIT_ORDER dependency
+ // tracking, which produces a strict serial chain where every
+ // transaction's commitParent equals the immediately prior sequence
+ // number. Under COMMIT_ORDER, the commit-parent check alone would
+ // serialize ALL transactions regardless of whether their writesets
+ // actually conflict. With a valid writeset, the writeset conflict
+ // checks above are sufficient for correctness — the same approach
+ // MySQL uses internally with WRITESET dependency tracking.
+ //
+ // When the writeset is empty, we fall back to commit-parent ordering
+ // as the safety net.
+ if len(txn.writeset) > 0 {
+ return true
+ }
+ if s.inflightCommitMeta > 0 {
+ return false
+ }
+ // NOTE: sequence_number/last_committed reset per binlog FILE on the
+ // source, while lastCommittedSequence only advances (max). After a
+ // binlog rotation the new file's small commitParent values compare
+ // against the old file's high watermark, making this check vacuously
+ // true. That is safe ONLY because of the inflightCommitMeta == 0
+ // gate above: with nothing inflight, every earlier-ordered txn has
+ // already committed, so the parent is durably applied regardless of
+ // what this comparison says. Do not remove that gate without
+ // rethinking rotation.
+ ready := txn.commitParent <= s.lastCommittedSequence
+ return ready
+ }
+ if s.inflightCommitMeta > 0 {
+ return false
+ }
+ if len(txn.writeset) == 0 {
+ return s.inflightMissingMeta == 0 && len(s.inflightWriteset) == 0
+ }
+ for _, key := range txn.writeset {
+ if s.inflightWriteset[key] > 0 {
+ return false
+ }
+ }
+ return true
+}
+
+// markInflightLocked records txn as dispatched. Exactly one class applies, in
+// this precedence: noConflict, forceGlobal, commit-metadata, no-metadata with
+// an empty writeset (counted as both global and missing-meta), and finally
+// no-metadata with a writeset. Only the commit-metadata and writeset-bearing
+// no-metadata classes register their keys in inflightWriteset. Must be called
+// with s.mu held.
+func (s *applyScheduler) markInflightLocked(txn *applyTxn) {
+	registerKeys := false
+	switch {
+	case txn.noConflict:
+		s.inflightNoConflict++
+	case txn.forceGlobal:
+		s.inflightGlobal++
+	case txn.hasCommitMeta:
+		s.inflightCommitMeta++
+		registerKeys = true
+	case len(txn.writeset) == 0:
+		s.inflightGlobal++
+		s.inflightMissingMeta++
+	default:
+		s.inflightMissingMeta++
+		registerKeys = true
+	}
+	if !registerKeys {
+		return
+	}
+	for _, key := range txn.writeset {
+		s.inflightWriteset[key]++
+	}
+}
+
+// releaseInflightLocked undoes markInflightLocked for txn, using the same
+// class precedence (noConflict, forceGlobal, commit-metadata, no-metadata
+// without a writeset, no-metadata with a writeset). Counters are never driven
+// below zero, and a writeset key is deleted from inflightWriteset once its
+// reference count would drop to zero. Must be called with s.mu held.
+func (s *applyScheduler) releaseInflightLocked(txn *applyTxn) {
+	// lower decrements a counter, clamping at zero.
+	lower := func(counter *int) {
+		if *counter > 0 {
+			*counter--
+		}
+	}
+
+	releaseKeys := false
+	switch {
+	case txn.noConflict:
+		lower(&s.inflightNoConflict)
+	case txn.forceGlobal:
+		lower(&s.inflightGlobal)
+	case txn.hasCommitMeta:
+		lower(&s.inflightCommitMeta)
+		releaseKeys = true
+	case len(txn.writeset) == 0:
+		lower(&s.inflightGlobal)
+		lower(&s.inflightMissingMeta)
+	default:
+		lower(&s.inflightMissingMeta)
+		releaseKeys = true
+	}
+	if !releaseKeys {
+		return
+	}
+	for _, key := range txn.writeset {
+		if refs := s.inflightWriteset[key]; refs > 1 {
+			s.inflightWriteset[key] = refs - 1
+		} else {
+			delete(s.inflightWriteset, key)
+		}
+	}
+}
+
+// advanceCommittedSequence raises lastCommittedSequence to seq on behalf of
+// transactions that never pass through the scheduler (e.g. empty transactions
+// handled via unsavedEvent). Without it, a commit-metadata transaction whose
+// commitParent points at such a skipped transaction would stay blocked,
+// because lastCommittedSequence would never reach that value. Non-positive
+// values and values that do not exceed the current watermark are ignored.
+func (s *applyScheduler) advanceCommittedSequence(seq int64) {
+	if seq <= 0 {
+		return
+	}
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	if seq <= s.lastCommittedSequence {
+		return
+	}
+	s.lastCommittedSequence = seq
+
+	// Waking workers is only useful when something is queued. On a filtered
+	// shard this runs for every empty transaction during catch-up, and an
+	// unconditional Broadcast would wake every worker to rescan an empty queue.
+	if s.pendingCount == 0 {
+		return
+	}
+	s.cond.Broadcast()
+}
+
+// waitForIdle blocks until the scheduler has nothing pending and nothing
+// inflight in any class, or until ctx or the scheduler's own context is done,
+// in which case that context's error is returned. scheduleLoop uses it as a
+// barrier after a DDL fetch so that the DDL, its FK-metadata refresh and any
+// FIELD events for DDL-affected tables are fully applied before the next fetch
+// snapshots plans/FK refs. For that reason the predicate covers every inflight
+// counter, including inflightNoConflict and the inflightWriteset map, matching
+// the fully-drained predicate in nextReady's abandoned-work check.
+func (s *applyScheduler) waitForIdle(ctx context.Context) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// busy is true while any queued or dispatched transaction is outstanding.
+	busy := func() bool {
+		return s.pendingCount != 0 || s.inflightGlobal != 0 || s.inflightMissingMeta != 0 ||
+			s.inflightCommitMeta != 0 || len(s.inflightWriteset) != 0 || s.inflightNoConflict != 0
+	}
+
+	for {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		if err := s.ctx.Err(); err != nil {
+			return err
+		}
+		if !busy() {
+			return nil
+		}
+		s.cond.Wait()
+	}
+}
+
+// close flags the scheduler as closed and wakes every goroutine blocked on
+// cond or orderCond. Work that was already enqueued stays in the queue so
+// callers can drain the scheduled prefix before they observe io.EOF. A
+// successful close returns io.EOF rather than nil; if the scheduler context is
+// already done, its error is returned instead and the scheduler is left
+// unmarked.
+func (s *applyScheduler) close() error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	ctxErr := s.ctx.Err()
+	if ctxErr == nil {
+		s.closed = true
+		s.orderCond.Broadcast()
+		s.cond.Broadcast()
+		return io.EOF
+	}
+	return ctxErr
+}
diff --git a/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_scheduler_test.go b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_scheduler_test.go
new file mode 100644
index 00000000000..873028afa74
--- /dev/null
+++ b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_scheduler_test.go
@@ -0,0 +1,1005 @@
+/*
+Copyright 2026 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package vreplication
+
+import (
+ "context"
+ "errors"
+ "io"
+ "math/rand/v2"
+ "runtime"
+ "sync"
+ "sync/atomic"
+ "testing"
+ "time"
+
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
+// requireNoReadyTxn fails the test unless popReadyLocked finds nothing
+// dispatchable. If a transaction is ready, it has already been removed from
+// the pending queue by the time the assertion fails.
+func requireNoReadyTxn(t *testing.T, s *applyScheduler) {
+	t.Helper()
+	s.mu.Lock()
+	ready := s.popReadyLocked()
+	s.mu.Unlock()
+	require.Nil(t, ready)
+}
+
+// requireReadyTxn asserts that the next dispatchable transaction is exactly
+// want (pointer identity). The transaction is popped from the pending queue
+// but not marked inflight, since popReadyLocked is called directly.
+func requireReadyTxn(t *testing.T, s *applyScheduler, want *applyTxn) {
+	t.Helper()
+	s.mu.Lock()
+	got := s.popReadyLocked()
+	s.mu.Unlock()
+	require.Same(t, want, got)
+}
+
+// TestApplySchedulerCommitParentOrder checks FIFO dispatch of two
+// commit-metadata transactions with empty writesets once the first one has
+// seeded lastCommittedSequence.
+func TestApplySchedulerCommitParentOrder(t *testing.T) {
+	ctx := t.Context()
+	s := newApplyScheduler(ctx)
+
+	// txn2 is enqueued first. Since it's the first hasCommitMeta transaction
+	// and the scheduler is idle, enqueue seeds lastCommittedSequence to
+	// txn2.commitParent (1). This makes txn2 immediately ready because
+	// commitParent (1) <= lastCommittedSequence (1). The scheduler dispatches
+	// in FIFO order, so txn2 goes first.
+	txn2 := &applyTxn{sequenceNumber: 2, commitParent: 1, hasCommitMeta: true}
+	txn1 := &applyTxn{sequenceNumber: 1, commitParent: 0, hasCommitMeta: true}
+
+	require.NoError(t, s.enqueue(txn2))
+	require.NoError(t, s.enqueue(txn1))
+
+	// Same checks pointer identity; Equal would accept any structurally
+	// equal transaction.
+	got1, err := s.nextReady(ctx)
+	require.NoError(t, err)
+	require.Same(t, txn2, got1)
+	require.NoError(t, s.markCommitted(got1))
+
+	got2, err := s.nextReady(ctx)
+	require.NoError(t, err)
+	require.Same(t, txn1, got2)
+	require.NoError(t, s.markCommitted(got2))
+}
+
+// TestApplySchedulerAllowsIndependentWritesets checks that two transactions
+// with disjoint writesets can both be dispatched without an intervening
+// commit.
+func TestApplySchedulerAllowsIndependentWritesets(t *testing.T) {
+	ctx := t.Context()
+	s := newApplyScheduler(ctx)
+
+	txn1 := &applyTxn{writeset: []uint64{1}}
+	txn2 := &applyTxn{writeset: []uint64{2}}
+
+	require.NoError(t, s.enqueue(txn1))
+	require.NoError(t, s.enqueue(txn2))
+
+	got1, err := s.nextReady(ctx)
+	require.NoError(t, err)
+	got2, err := s.nextReady(ctx)
+	require.NoError(t, err)
+
+	// The point is that two distinct transactions were handed out, so compare
+	// pointer identity rather than struct contents.
+	require.NotSame(t, got1, got2)
+}
+
+// TestApplySchedulerBlocksConflictingWritesets checks that a transaction
+// sharing a writeset key with an inflight transaction is held back until that
+// transaction is committed.
+func TestApplySchedulerBlocksConflictingWritesets(t *testing.T) {
+	ctx := t.Context()
+	sched := newApplyScheduler(ctx)
+
+	first := &applyTxn{writeset: []uint64{100}}
+	second := &applyTxn{writeset: []uint64{100}}
+	for _, txn := range []*applyTxn{first, second} {
+		require.NoError(t, sched.enqueue(txn))
+	}
+
+	inflight, err := sched.nextReady(ctx)
+	require.NoError(t, err)
+
+	// The second transaction conflicts on key 100 and must wait.
+	requireNoReadyTxn(t, sched)
+
+	require.NoError(t, sched.markCommitted(inflight))
+
+	requireReadyTxn(t, sched, second)
+}
+
+// TestApplySchedulerBlocksCommitMetaDuringMissingMeta checks that a
+// commit-metadata transaction is not dispatched while a no-metadata
+// transaction is inflight, and becomes ready once that one commits.
+func TestApplySchedulerBlocksCommitMetaDuringMissingMeta(t *testing.T) {
+	ctx := t.Context()
+	s := newApplyScheduler(ctx)
+
+	missing := &applyTxn{writeset: []uint64{100}}
+	meta := &applyTxn{sequenceNumber: 2, commitParent: 0, hasCommitMeta: true}
+
+	require.NoError(t, s.enqueue(missing))
+	require.NoError(t, s.enqueue(meta))
+
+	got1, err := s.nextReady(ctx)
+	require.NoError(t, err)
+	// Pointer identity: the scheduler must hand back the enqueued object.
+	require.Same(t, missing, got1)
+
+	requireNoReadyTxn(t, s)
+
+	require.NoError(t, s.markCommitted(got1))
+
+	requireReadyTxn(t, s, meta)
+}
+
+// TestApplySchedulerBlocksCommitMetaConflictingWritesets checks that writeset
+// conflicts serialize commit-metadata transactions as well.
+func TestApplySchedulerBlocksCommitMetaConflictingWritesets(t *testing.T) {
+	ctx := t.Context()
+	s := newApplyScheduler(ctx)
+
+	txn1 := &applyTxn{writeset: []uint64{100}, sequenceNumber: 1, commitParent: 0, hasCommitMeta: true}
+	txn2 := &applyTxn{writeset: []uint64{100}, sequenceNumber: 2, commitParent: 0, hasCommitMeta: true}
+
+	require.NoError(t, s.enqueue(txn1))
+	require.NoError(t, s.enqueue(txn2))
+
+	got1, err := s.nextReady(ctx)
+	require.NoError(t, err)
+	// Pointer identity: txn1 and txn2 differ only in sequenceNumber, so a
+	// deep-equality check is a weak way to tell them apart.
+	require.Same(t, txn1, got1)
+
+	requireNoReadyTxn(t, s)
+
+	require.NoError(t, s.markCommitted(got1))
+
+	requireReadyTxn(t, s, txn2)
+}
+
+// TestApplySchedulerCommitMetaDoesNotAdvanceOnMissingMeta checks that
+// committing a transaction without commit metadata leaves
+// lastCommittedSequence untouched, and that a following commit-metadata
+// transaction advances it to its own sequence number.
+func TestApplySchedulerCommitMetaDoesNotAdvanceOnMissingMeta(t *testing.T) {
+	ctx := t.Context()
+	s := newApplyScheduler(ctx)
+	require.Equal(t, int64(0), s.lastCommittedSequence)
+
+	missing := &applyTxn{writeset: []uint64{100}}
+	meta := &applyTxn{sequenceNumber: 5, commitParent: 0, hasCommitMeta: true}
+
+	require.NoError(t, s.enqueue(missing))
+	require.NoError(t, s.enqueue(meta))
+
+	got1, err := s.nextReady(ctx)
+	require.NoError(t, err)
+	require.Same(t, missing, got1)
+
+	require.NoError(t, s.markCommitted(got1))
+	// The behavior named by this test: a no-metadata commit must not move
+	// the commit-parent watermark.
+	require.Equal(t, int64(0), s.lastCommittedSequence)
+
+	got2, err := s.nextReady(ctx)
+	require.NoError(t, err)
+	require.Same(t, meta, got2)
+
+	require.NoError(t, s.markCommitted(got2))
+	require.Equal(t, int64(5), s.lastCommittedSequence)
+}
+
+// TestApplySchedulerSeedsCommitParentOnFirstMeta checks that enqueueing a
+// commit-metadata transaction into a completely idle scheduler seeds
+// lastCommittedSequence from its commitParent.
+func TestApplySchedulerSeedsCommitParentOnFirstMeta(t *testing.T) {
+	ctx := t.Context()
+	// The scheduler seeds lastCommittedSequence from the first hasCommitMeta
+	// transaction when the scheduler is completely idle (no pending, no inflight).
+	// Enqueue meta as the very first transaction to trigger seeding.
+	meta := &applyTxn{sequenceNumber: 6, commitParent: 5, hasCommitMeta: true}
+
+	s := newApplyScheduler(ctx)
+
+	require.NoError(t, s.enqueue(meta))
+
+	got, err := s.nextReady(ctx)
+	require.NoError(t, err)
+	// Pointer identity: the scheduler must hand back the enqueued object.
+	require.Same(t, meta, got)
+	require.Equal(t, int64(5), s.lastCommittedSequence)
+}
+
+// TestApplySchedulerWritesetBypassesCommitParent checks that commit-metadata
+// transactions with non-conflicting writesets are dispatched in parallel even
+// when their commit parents form a serial chain.
+func TestApplySchedulerWritesetBypassesCommitParent(t *testing.T) {
+	ctx := t.Context()
+	s := newApplyScheduler(ctx)
+
+	// Simulate COMMIT_ORDER dependency tracking: each txn's commitParent is
+	// the immediately prior sequence number, forming a strict serial chain.
+	// With non-conflicting writesets, the scheduler should allow parallelism
+	// by ignoring the commit-parent dependency.
+	txn1 := &applyTxn{order: 1, sequenceNumber: 10, commitParent: 9, hasCommitMeta: true, writeset: []uint64{1}}
+	txn2 := &applyTxn{order: 2, sequenceNumber: 11, commitParent: 10, hasCommitMeta: true, writeset: []uint64{2}}
+	txn3 := &applyTxn{order: 3, sequenceNumber: 12, commitParent: 11, hasCommitMeta: true, writeset: []uint64{3}}
+
+	require.NoError(t, s.enqueue(txn1))
+	require.NoError(t, s.enqueue(txn2))
+	require.NoError(t, s.enqueue(txn3))
+
+	// All three should be immediately ready since their writesets don't
+	// conflict. Same checks pointer identity, which pins the dispatch order.
+	got1, err := s.nextReady(ctx)
+	require.NoError(t, err)
+	require.Same(t, txn1, got1)
+
+	got2, err := s.nextReady(ctx)
+	require.NoError(t, err)
+	require.Same(t, txn2, got2)
+
+	got3, err := s.nextReady(ctx)
+	require.NoError(t, err)
+	require.Same(t, txn3, got3)
+
+	// Commit in order.
+	require.NoError(t, s.markCommitted(got1))
+	require.NoError(t, s.markCommitted(got2))
+	require.NoError(t, s.markCommitted(got3))
+}
+
+// TestApplySchedulerWritesetConflictStillBlocks checks that skipping the
+// commit-parent check for writeset-bearing transactions does not weaken
+// writeset conflict detection.
+func TestApplySchedulerWritesetConflictStillBlocks(t *testing.T) {
+	ctx := t.Context()
+	s := newApplyScheduler(ctx)
+
+	// Even with the commit-parent bypass, conflicting writesets must still
+	// cause serialization.
+	txn1 := &applyTxn{order: 1, sequenceNumber: 10, commitParent: 9, hasCommitMeta: true, writeset: []uint64{100}}
+	txn2 := &applyTxn{order: 2, sequenceNumber: 11, commitParent: 10, hasCommitMeta: true, writeset: []uint64{100}}
+
+	require.NoError(t, s.enqueue(txn1))
+	require.NoError(t, s.enqueue(txn2))
+
+	got1, err := s.nextReady(ctx)
+	require.NoError(t, err)
+	// Pointer identity: the scheduler must hand back the enqueued object.
+	require.Same(t, txn1, got1)
+
+	// txn2 should be blocked because it conflicts with inflight txn1.
+	requireNoReadyTxn(t, s)
+
+	require.NoError(t, s.markCommitted(got1))
+
+	requireReadyTxn(t, s, txn2)
+}
+
+// TestApplySchedulerEmptyWritesetFallsBackToCommitParent checks that
+// commit-metadata transactions without a writeset are ordered by commit
+// parent.
+func TestApplySchedulerEmptyWritesetFallsBackToCommitParent(t *testing.T) {
+	ctx := t.Context()
+	s := newApplyScheduler(ctx)
+
+	// When a hasCommitMeta transaction has an empty writeset (e.g., writeset
+	// build failed), it should fall back to commit-parent ordering.
+	txn1 := &applyTxn{order: 1, sequenceNumber: 10, commitParent: 9, hasCommitMeta: true}
+	txn2 := &applyTxn{order: 2, sequenceNumber: 11, commitParent: 10, hasCommitMeta: true}
+
+	// Enqueueing txn1 into the idle scheduler seeds lastCommittedSequence to
+	// 9, so txn1 is ready.
+	require.NoError(t, s.enqueue(txn1))
+	require.NoError(t, s.enqueue(txn2))
+
+	got1, err := s.nextReady(ctx)
+	require.NoError(t, err)
+	// Pointer identity: the scheduler must hand back the enqueued object.
+	require.Same(t, txn1, got1)
+
+	// txn2 has commitParent=10 but lastCommittedSequence is still 9 (seeded).
+	// txn2's writeset is empty, so it falls back to commit-parent check.
+	requireNoReadyTxn(t, s)
+
+	// After committing txn1, lastCommittedSequence advances to 10,
+	// making txn2 ready (commitParent 10 <= 10).
+	require.NoError(t, s.markCommitted(got1))
+
+	requireReadyTxn(t, s, txn2)
+}
+
+// TestApplySchedulerNoConflictDoesNotBlockPending checks that dispatching and
+// committing a noConflict transaction leaves a following normal transaction
+// dispatchable.
+func TestApplySchedulerNoConflictDoesNotBlockPending(t *testing.T) {
+	ctx := t.Context()
+	s := newApplyScheduler(ctx)
+
+	// Enqueue a noConflict txn first and a normal txn second.
+	nc := &applyTxn{order: 1, noConflict: true}
+	normal := &applyTxn{order: 2, writeset: []uint64{100}}
+
+	require.NoError(t, s.enqueue(nc))
+	require.NoError(t, s.enqueue(normal))
+
+	got1, err := s.nextReady(ctx)
+	require.NoError(t, err)
+	// Pointer identity: the scheduler must hand back the enqueued object.
+	require.Same(t, nc, got1)
+
+	// Commit noConflict should not affect inflight counters for normal txn.
+	require.NoError(t, s.markCommitted(got1))
+
+	got2, err := s.nextReady(ctx)
+	require.NoError(t, err)
+	require.Same(t, normal, got2)
+}
+
+// TestApplySchedulerForceGlobalBlocksWritesets verifies that while a
+// forceGlobal transaction is inflight a queued writeset transaction is not
+// dispatched, and that it becomes ready once the forceGlobal one commits.
+func TestApplySchedulerForceGlobalBlocksWritesets(t *testing.T) {
+ ctx := t.Context()
+ s := newApplyScheduler(ctx)
+
+ global := &applyTxn{order: 1, forceGlobal: true}
+ conflict := &applyTxn{order: 2, writeset: []uint64{100}}
+
+ require.NoError(t, s.enqueue(global))
+ require.NoError(t, s.enqueue(conflict))
+
+ got1, err := s.nextReady(ctx)
+ require.NoError(t, err)
+ require.Equal(t, global, got1)
+
+ // The forceGlobal txn is dispatched but not yet committed: nothing else
+ // may be handed out.
+ requireNoReadyTxn(t, s)
+
+ require.NoError(t, s.markCommitted(got1))
+
+ requireReadyTxn(t, s, conflict)
+}
+
+// TestApplySchedulerAdvanceCommittedSequenceUnblocks verifies that a
+// commit-metadata transaction waiting on commitParent 5 stays blocked until
+// advanceCommittedSequence(5) is called, and is ready right after.
+func TestApplySchedulerAdvanceCommittedSequenceUnblocks(t *testing.T) {
+ ctx := t.Context()
+ // Use a non-empty pending queue to prevent commit-parent seeding.
+ seed := &applyTxn{order: 1, noConflict: true}
+ meta := &applyTxn{order: 2, sequenceNumber: 6, commitParent: 5, hasCommitMeta: true}
+
+ s := newApplyScheduler(ctx)
+
+ require.NoError(t, s.enqueue(seed))
+ require.NoError(t, s.enqueue(meta))
+
+ // Dispatch and commit the seed so that only meta remains pending.
+ got1, err := s.nextReady(ctx)
+ require.NoError(t, err)
+ require.Equal(t, seed, got1)
+ require.NoError(t, s.markCommitted(got1))
+
+ // seed carries no sequence number, so its commit is not expected to
+ // satisfy meta's commitParent.
+ requireNoReadyTxn(t, s)
+
+ s.advanceCommittedSequence(5)
+
+ requireReadyTxn(t, s, meta)
+}
+
+// TestApplySchedulerAdvanceCommittedSequenceDoesNotBypassInflightMetaParent
+// verifies that advancing the committed sequence past a queued transaction's
+// commitParent does not make it ready while an earlier commit-metadata
+// transaction is still inflight; it becomes ready only after that commits.
+func TestApplySchedulerAdvanceCommittedSequenceDoesNotBypassInflightMetaParent(t *testing.T) {
+ ctx := t.Context()
+ s := newApplyScheduler(ctx)
+
+ metaParent := &applyTxn{order: 1, sequenceNumber: 10, commitParent: 9, hasCommitMeta: true, writeset: []uint64{1}}
+ metaChild := &applyTxn{order: 2, sequenceNumber: 12, commitParent: 11, hasCommitMeta: true}
+
+ // Dispatch the parent first and leave it uncommitted (inflight).
+ require.NoError(t, s.enqueue(metaParent))
+ gotParent, err := s.nextReady(ctx)
+ require.NoError(t, err)
+ require.Equal(t, metaParent, gotParent)
+
+ // 11 satisfies metaChild's commitParent numerically, yet the child must
+ // still wait for the inflight parent.
+ require.NoError(t, s.enqueue(metaChild))
+ s.advanceCommittedSequence(11)
+
+ requireNoReadyTxn(t, s)
+
+ require.NoError(t, s.markCommitted(gotParent))
+
+ requireReadyTxn(t, s, metaChild)
+}
+
+// TestApplySchedulerMergedSequencesUnblockCommitParentChild verifies that
+// committing a transaction that has no commit metadata of its own but carries
+// mergedSequences advances lastCommittedSequence to the merged value (10),
+// which unblocks a child whose commitParent is 10.
+func TestApplySchedulerMergedSequencesUnblockCommitParentChild(t *testing.T) {
+ ctx := t.Context()
+ s := newApplyScheduler(ctx)
+
+ batchedParent := &applyTxn{order: 1, writeset: []uint64{1}, mergedSequences: []int64{10}}
+ metaChild := &applyTxn{order: 2, sequenceNumber: 11, commitParent: 10, hasCommitMeta: true}
+
+ require.NoError(t, s.enqueue(batchedParent))
+ require.NoError(t, s.enqueue(metaChild))
+
+ gotParent, err := s.nextReady(ctx)
+ require.NoError(t, err)
+ require.Same(t, batchedParent, gotParent)
+
+ // The child stays queued while the batched parent is uncommitted.
+ requireNoReadyTxn(t, s)
+
+ require.NoError(t, s.markCommitted(gotParent))
+
+ // The scheduler field is read under its mutex.
+ s.mu.Lock()
+ require.Equal(t, int64(10), s.lastCommittedSequence)
+ s.mu.Unlock()
+
+ requireReadyTxn(t, s, metaChild)
+}
+
+// TestApplySchedulerWaitForIdleReturnsWhenIdle checks that waitForIdle on a
+// freshly created scheduler, with nothing enqueued, returns without error.
+func TestApplySchedulerWaitForIdleReturnsWhenIdle(t *testing.T) {
+	ctx := t.Context()
+	scheduler := newApplyScheduler(ctx)
+
+	err := scheduler.waitForIdle(ctx)
+	require.NoError(t, err)
+}
+
+// TestApplySchedulerWaitForIdleReturnsOnSchedulerCancel verifies that
+// waitForIdle returns context.Canceled when the scheduler's own context is
+// cancelled while work is still pending, even though the context passed to
+// waitForIdle itself is still live.
+func TestApplySchedulerWaitForIdleReturnsOnSchedulerCancel(t *testing.T) {
+ ctx := t.Context()
+ // sCtx is the scheduler's lifetime context; ctx (the caller's) stays live.
+ sCtx, cancel := context.WithCancel(ctx)
+ s := newApplyScheduler(sCtx)
+
+ require.NoError(t, s.enqueue(&applyTxn{writeset: []uint64{100}}))
+
+ // Confirm the scheduler is not idle before cancelling.
+ s.mu.Lock()
+ require.NotZero(t, s.pendingCount)
+ s.mu.Unlock()
+
+ cancel()
+
+ require.ErrorIs(t, s.waitForIdle(ctx), context.Canceled)
+}
+
+// TestApplySchedulerClosePreservesPending checks that close reports io.EOF
+// and leaves an already-enqueued transaction in the pending queue untouched.
+func TestApplySchedulerClosePreservesPending(t *testing.T) {
+	scheduler := newApplyScheduler(t.Context())
+
+	queued := &applyTxn{writeset: []uint64{100}, noConflict: true}
+	require.NoError(t, scheduler.enqueue(queued))
+
+	require.ErrorIs(t, scheduler.close(), io.EOF)
+
+	// The queue must still hold exactly that one transaction, at offset zero.
+	require.Equal(t, 1, scheduler.pendingCount)
+	require.Zero(t, scheduler.pendingOff)
+	require.Len(t, scheduler.pending, 1)
+	require.Same(t, queued, scheduler.pending[0])
+}
+
+// TestApplySchedulerNextReadyDrainsPendingAfterClose checks that a closed
+// scheduler still hands out the transaction enqueued before close, and only
+// then reports io.EOF from nextReady.
+func TestApplySchedulerNextReadyDrainsPendingAfterClose(t *testing.T) {
+	ctx := t.Context()
+	scheduler := newApplyScheduler(ctx)
+
+	queued := &applyTxn{order: 1, noConflict: true}
+	require.NoError(t, scheduler.enqueue(queued))
+
+	closeErr := scheduler.close()
+	require.ErrorIs(t, closeErr, io.EOF)
+
+	// The transaction queued before close is still dispatched.
+	dispatched, err := scheduler.nextReady(ctx)
+	require.NoError(t, err)
+	require.Same(t, queued, dispatched)
+
+	// With the queue drained, the closed scheduler signals end of work.
+	_, err = scheduler.nextReady(ctx)
+	require.ErrorIs(t, err, io.EOF)
+}
+
+// TestApplySchedulerNextReadyWaitsForBlockedPendingAfterClose verifies that,
+// after close, nextReady keeps waiting for a pending transaction that is
+// blocked by an inflight conflicting one, dispatches it once the blocker
+// commits, and only reports io.EOF after everything has drained.
+func TestApplySchedulerNextReadyWaitsForBlockedPendingAfterClose(t *testing.T) {
+	ctx := t.Context()
+	s := newApplyScheduler(ctx)
+
+	// Both transactions touch writeset key 100, so they conflict.
+	blocker := &applyTxn{order: 1, writeset: []uint64{100}}
+	blocked := &applyTxn{order: 2, writeset: []uint64{100}}
+
+	require.NoError(t, s.enqueue(blocker))
+	require.NoError(t, s.enqueue(blocked))
+
+	gotBlocker, err := s.nextReady(ctx)
+	require.NoError(t, err)
+	require.Same(t, blocker, gotBlocker)
+
+	require.ErrorIs(t, s.close(), io.EOF)
+
+	type nextReadyResult struct {
+		txn *applyTxn
+		err error
+	}
+	// Buffered so the worker goroutine never blocks on the send.
+	resultCh := make(chan nextReadyResult, 1)
+	go func() {
+		txn, err := s.nextReady(ctx)
+		resultCh <- nextReadyResult{txn: txn, err: err}
+	}()
+
+	// While the blocker is inflight the worker must stay parked.
+	assert.Never(t, func() bool {
+		return len(resultCh) > 0
+	}, 100*time.Millisecond, 5*time.Millisecond)
+
+	require.NoError(t, s.markCommitted(gotBlocker))
+
+	// require (not assert): the receive below would otherwise block until the
+	// global test timeout if the result never arrives. The generous bound
+	// costs nothing on success because Eventually returns as soon as the
+	// condition holds.
+	require.Eventually(t, func() bool {
+		return len(resultCh) > 0
+	}, 30*time.Second, 5*time.Millisecond)
+
+	gotBlocked := <-resultCh
+	require.NoError(t, gotBlocked.err)
+	require.Same(t, blocked, gotBlocked.txn)
+
+	require.NoError(t, s.markCommitted(gotBlocked.txn))
+
+	_, err = s.nextReady(ctx)
+	require.ErrorIs(t, err, io.EOF)
+}
+
+// TestApplySchedulerEnqueueBlocksWhenOutstandingOrdersReachCap verifies that
+// enqueue blocks once maxOutstandingOrders transactions are outstanding and
+// resumes after markCommitted advances the committed order.
+func TestApplySchedulerEnqueueBlocksWhenOutstandingOrdersReachCap(t *testing.T) {
+	ctx, cancel := context.WithCancel(t.Context())
+	defer cancel()
+
+	s := newApplyScheduler(ctx)
+	s.maxOutstandingOrders = 2
+
+	// Fill the order window.
+	require.NoError(t, s.enqueue(&applyTxn{order: 1, noConflict: true}))
+	require.NoError(t, s.enqueue(&applyTxn{order: 2, noConflict: true}))
+
+	// The third enqueue exceeds the cap and must block in the goroutine.
+	errCh := make(chan error, 1)
+	go func() {
+		errCh <- s.enqueue(&applyTxn{order: 3, noConflict: true})
+	}()
+
+	assert.Never(t, func() bool {
+		return len(errCh) > 0
+	}, 100*time.Millisecond, 5*time.Millisecond)
+
+	// Advance durable progress through the real path: markCommitted bumps
+	// lastCommittedOrder and wakes the order-window waiter (orderCond).
+	require.NoError(t, s.markCommitted(&applyTxn{order: 1, noConflict: true}))
+
+	// require (not assert): if the enqueue never completes, the receive from
+	// errCh below would block until the global test timeout instead of
+	// failing here.
+	require.Eventually(t, func() bool {
+		return len(errCh) > 0
+	}, 30*time.Second, 5*time.Millisecond)
+	require.NoError(t, <-errCh)
+
+	// Nothing was dispatched via nextReady, so all three remain pending.
+	s.mu.Lock()
+	require.Equal(t, 3, s.pendingCount)
+	s.mu.Unlock()
+}
+
+// TestApplySchedulerLaterNoConflictBypassesBlockedEarlierTxn verifies that
+// noConflict transactions enqueued behind a blocked writeset transaction are
+// still dispatched, repeatedly, and that the blocked transaction becomes
+// ready once its conflicting inflight transaction commits.
+func TestApplySchedulerLaterNoConflictBypassesBlockedEarlierTxn(t *testing.T) {
+ ctx := t.Context()
+ s := newApplyScheduler(ctx)
+
+ // Dispatch the blocker and keep it inflight; it holds writeset key 100.
+ blocker := &applyTxn{order: 1, writeset: []uint64{100}}
+ require.NoError(t, s.enqueue(blocker))
+
+ gotBlocker, err := s.nextReady(ctx)
+ require.NoError(t, err)
+ require.Same(t, blocker, gotBlocker)
+
+ // blocked conflicts on key 100; stopTxn1 sits behind it in the queue.
+ blocked := &applyTxn{order: 2, writeset: []uint64{100}}
+ stopTxn1 := &applyTxn{order: 3, noConflict: true}
+ require.NoError(t, s.enqueue(blocked))
+ require.NoError(t, s.enqueue(stopTxn1))
+
+ requireReadyTxn(t, s, stopTxn1)
+
+ // The first bypass leaves a nil gap in pending. A second noConflict txn
+ // must still be discoverable while the earlier normal txn remains blocked.
+ stopTxn2 := &applyTxn{order: 4, noConflict: true}
+ require.NoError(t, s.enqueue(stopTxn2))
+ requireReadyTxn(t, s, stopTxn2)
+
+ require.NoError(t, s.markCommitted(gotBlocker))
+ requireReadyTxn(t, s, blocked)
+}
+
+// TestApplySchedulerPendingCompaction enqueues four noConflict transactions,
+// dispatches and commits the first two in order, and then checks the shape of
+// the pending queue: offset zero, two entries, pendingCount of two.
+func TestApplySchedulerPendingCompaction(t *testing.T) {
+	ctx := t.Context()
+	scheduler := newApplyScheduler(ctx)
+
+	for _, order := range []int64{1, 2, 3, 4} {
+		require.NoError(t, scheduler.enqueue(&applyTxn{order: order, noConflict: true}))
+	}
+
+	// Drain the first two transactions; they must come out in enqueue order.
+	for _, wantOrder := range []int64{1, 2} {
+		dispatched, err := scheduler.nextReady(ctx)
+		require.NoError(t, err)
+		require.Equal(t, wantOrder, dispatched.order)
+		require.NoError(t, scheduler.markCommitted(dispatched))
+	}
+
+	require.Zero(t, scheduler.pendingOff)
+	require.Len(t, scheduler.pending, 2)
+	require.Equal(t, 2, scheduler.pendingCount)
+}
+
+// TestApplySchedulerConcurrentEnqueueAndCommitStress exercises the scheduler
+// under concurrent producers and real worker goroutines (nextReady +
+// markCommitted, so inflight state and the writeset-refcount machinery are
+// genuinely engaged) to flush out deadlocks, lost wakeups, counter-balance
+// bugs, and — most importantly — conflicting dispatches.
+//
+// Correctness properties checked:
+//   - No two concurrently-dispatched transactions share a writeset key, and
+//     forceGlobal transactions run exclusively (verified by an external
+//     conflict tracker, independent of the scheduler's own bookkeeping).
+//   - Every enqueued transaction is dispatched exactly once.
+//   - After all work drains, every inflight counter is zero.
+func TestApplySchedulerConcurrentEnqueueAndCommitStress(t *testing.T) {
+	ctx, cancel := context.WithTimeout(t.Context(), 60*time.Second)
+	defer cancel()
+	s := newApplyScheduler(ctx)
+
+	const (
+		numProducers        = 2
+		numWorkers          = 6
+		txnsPerProducer     = 500
+		maxWritesetKeys     = 4
+		writesetKeySpace    = 32
+		maxOutstandingOrder = int64(128)
+	)
+	totalTxns := numProducers * txnsPerProducer
+	s.maxOutstandingOrders = maxOutstandingOrder
+
+	// Atomically assigned order so all producers share one sequence.
+	var nextOrder atomic.Int64
+
+	// Producer goroutines enqueue a mix of writeset-based and forceGlobal
+	// transactions. Writeset keys are drawn from a small space so workers
+	// frequently conflict, exercising the writeset-refcount machinery.
+	// WaitGroup.Go is used for producers and workers alike; each iteration has
+	// its own producerID, so capturing it in the closure is safe.
+	var producers sync.WaitGroup
+	for producerID := range numProducers {
+		producers.Go(func() {
+			// Deterministic per-producer RNG so flakes are reproducible.
+			rng := rand.New(rand.NewPCG(uint64(producerID+1), 0x51ED))
+			for i := range txnsPerProducer {
+				txn := &applyTxn{
+					order: nextOrder.Add(1),
+				}
+				// 5% of transactions force-global, others carry a writeset.
+				if rng.IntN(20) == 0 {
+					txn.forceGlobal = true
+				} else {
+					n := 1 + rng.IntN(maxWritesetKeys)
+					txn.writeset = make([]uint64, 0, n)
+					// Keys are de-duplicated, so a writeset may end up with
+					// fewer than n entries.
+					seen := map[uint64]struct{}{}
+					for range n {
+						k := uint64(rng.IntN(writesetKeySpace))
+						if _, dup := seen[k]; dup {
+							continue
+						}
+						seen[k] = struct{}{}
+						txn.writeset = append(txn.writeset, k)
+					}
+				}
+				if err := s.enqueue(txn); err != nil {
+					t.Errorf("producer %d txn %d enqueue: %v", producerID, i, err)
+					return
+				}
+			}
+		})
+	}
+
+	// External conflict tracker: validates, independently of the scheduler's
+	// own counters, that no two dispatched-and-uncommitted transactions
+	// conflict. Registration is atomic with the check under one mutex.
+	var (
+		trackerMu    sync.Mutex
+		activeKeys   = map[uint64]int64{} // key -> holding txn order
+		activeGlobal int64                // order of the active forceGlobal txn, 0 = none
+		activeCount  int
+	)
+	dispatch := func(txn *applyTxn) {
+		trackerMu.Lock()
+		defer trackerMu.Unlock()
+		if activeGlobal != 0 {
+			t.Errorf("txn %d dispatched while forceGlobal txn %d active", txn.order, activeGlobal)
+		}
+		if txn.forceGlobal {
+			if activeCount != 0 {
+				t.Errorf("forceGlobal txn %d dispatched with %d txns active", txn.order, activeCount)
+			}
+			activeGlobal = txn.order
+		}
+		for _, k := range txn.writeset {
+			if holder, conflict := activeKeys[k]; conflict {
+				t.Errorf("txn %d dispatched with writeset key %d held by active txn %d", txn.order, k, holder)
+			}
+			activeKeys[k] = txn.order
+		}
+		activeCount++
+	}
+	finish := func(txn *applyTxn) {
+		trackerMu.Lock()
+		defer trackerMu.Unlock()
+		if txn.forceGlobal {
+			activeGlobal = 0
+		}
+		for _, k := range txn.writeset {
+			delete(activeKeys, k)
+		}
+		activeCount--
+	}
+
+	// Worker goroutines: the REAL dispatch path. nextReady marks inflight;
+	// markCommitted releases it. The tracker unregisters BEFORE
+	// markCommitted, mirroring the real pipeline where a conflicting txn may
+	// dispatch the instant the commit releases the scheduler state.
+	observed := make([]int64, 0, totalTxns)
+	var observedMu sync.Mutex
+	var workers sync.WaitGroup
+	for range numWorkers {
+		workers.Go(func() {
+			for {
+				txn, err := s.nextReady(ctx)
+				if err != nil {
+					// io.EOF is the clean end-of-work signal; a context error
+					// is reported by the timeout branch below instead.
+					if !errors.Is(err, io.EOF) && ctx.Err() == nil {
+						t.Errorf("nextReady: %v", err)
+					}
+					return
+				}
+				dispatch(txn)
+				if txn.order%7 == 0 {
+					runtime.Gosched() // widen the race window a little
+				}
+				observedMu.Lock()
+				observed = append(observed, txn.order)
+				observedMu.Unlock()
+				finish(txn)
+				if err := s.markCommitted(txn); err != nil {
+					t.Errorf("markCommitted: %v", err)
+					return
+				}
+			}
+		})
+	}
+
+	producers.Wait()
+	// close is expected to report io.EOF, as in the other close tests. assert
+	// (not require) so the timeout diagnostics below still run on failure.
+	assert.ErrorIs(t, s.close(), io.EOF)
+	workersDone := make(chan struct{})
+	go func() { workers.Wait(); close(workersDone) }()
+	select {
+	case <-workersDone:
+	case <-ctx.Done():
+		observedMu.Lock()
+		n := len(observed)
+		observedMu.Unlock()
+		t.Fatalf("stress test timed out: observed %d / %d transactions", n, totalTxns)
+	}
+
+	// Invariants after the scheduler has drained. All workers have exited, so
+	// observed can be read without observedMu.
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	require.Zero(t, s.inflightGlobal, "inflightGlobal leaked")
+	require.Zero(t, s.inflightMissingMeta, "inflightMissingMeta leaked")
+	require.Zero(t, s.inflightCommitMeta, "inflightCommitMeta leaked")
+	require.Zero(t, s.inflightNoConflict, "inflightNoConflict leaked")
+	require.Empty(t, s.inflightWriteset, "inflightWriteset leaked")
+	require.Zero(t, s.pendingCount, "pendingCount not drained")
+	require.Len(t, observed, totalTxns)
+
+	// All order numbers from 1..totalTxns must appear exactly once.
+	seen := make(map[int64]struct{}, totalTxns)
+	for _, o := range observed {
+		if _, dup := seen[o]; dup {
+			t.Fatalf("order %d observed twice", o)
+		}
+		seen[o] = struct{}{}
+	}
+	require.Len(t, seen, totalTxns)
+}
+
+// TestApplySchedulerMultiKeyReleaseWakesAllReadyWaiters verifies that a
+// single commit releasing two writeset keys lets two waiting workers, each
+// blocked on a different one of those keys, both be dispatched without any
+// further commit.
+func TestApplySchedulerMultiKeyReleaseWakesAllReadyWaiters(t *testing.T) {
+ ctx := t.Context()
+ s := newApplyScheduler(ctx)
+
+ // megaTxn holds two writeset keys; blocker keeps a third key inflight for
+ // the whole test so markCommitted(megaTxn) does not take the
+ // all-drained Broadcast path.
+ megaTxn := &applyTxn{order: 1, sequenceNumber: 10, commitParent: 9, hasCommitMeta: true, writeset: []uint64{100, 200}}
+ blocker := &applyTxn{order: 2, sequenceNumber: 11, commitParent: 9, hasCommitMeta: true, writeset: []uint64{900}}
+ require.NoError(t, s.enqueue(megaTxn))
+ require.NoError(t, s.enqueue(blocker))
+
+ // Dispatch both so they are inflight; neither is committed yet.
+ got, err := s.nextReady(ctx)
+ require.NoError(t, err)
+ require.Equal(t, megaTxn, got)
+ got, err = s.nextReady(ctx)
+ require.NoError(t, err)
+ require.Equal(t, blocker, got)
+
+ // Two pending transactions, each conflicting with a different one of
+ // megaTxn's keys.
+ waiterA := &applyTxn{order: 3, sequenceNumber: 12, commitParent: 9, hasCommitMeta: true, writeset: []uint64{100}}
+ waiterB := &applyTxn{order: 4, sequenceNumber: 13, commitParent: 9, hasCommitMeta: true, writeset: []uint64{200}}
+ require.NoError(t, s.enqueue(waiterA))
+ require.NoError(t, s.enqueue(waiterB))
+
+ // Two workers block in nextReady before the commit.
+ // Both channels are buffered for two so the workers never block on send.
+ results := make(chan *applyTxn, 2)
+ errs := make(chan error, 2)
+ for range 2 {
+ go func() {
+ txn, err := s.nextReady(ctx)
+ if err != nil {
+ errs <- err
+ return
+ }
+ results <- txn
+ }()
+ }
+ // Wait until both goroutines are parked in cond.Wait. There is no direct
+ // hook for "waiter count", so poll the scheduler state: both pending txns
+ // are still queued and neither result has arrived.
+ // NOTE(review): pendingCount == 2 appears to hold as soon as both waiters
+ // are enqueued above, so this poll may not by itself prove the workers have
+ // reached nextReady — confirm whether a stronger signal is needed.
+ require.Eventually(t, func() bool {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+ return s.pendingCount == 2
+ }, 30*time.Second, time.Millisecond)
+
+ // One commit releases both keys; both waiters must be dispatched without
+ // any further commit happening (blocker stays inflight throughout).
+ require.NoError(t, s.markCommitted(megaTxn))
+
+ // Collect exactly two dispatches, keyed by order so arrival order does not
+ // matter.
+ dispatched := make(map[int64]bool)
+ for range 2 {
+ select {
+ case txn := <-results:
+ dispatched[txn.order] = true
+ case err := <-errs:
+ t.Fatalf("nextReady returned error: %v", err)
+ case <-time.After(30 * time.Second):
+ t.Fatalf("timed out waiting for both ready transactions to be dispatched; got %v", dispatched)
+ }
+ }
+ require.True(t, dispatched[waiterA.order])
+ require.True(t, dispatched[waiterB.order])
+}
+
+// TestApplySchedulerNoMetaNoWritesetIsGlobal pins ready-check case 7: a
+// transaction without commit metadata and without a writeset must serialize
+// as global, with BOTH inflightGlobal and inflightMissingMeta held and then
+// released in balance. An unbalanced release here would silently wedge the
+// scheduler (counter stuck > 0) or unsafely unblock it (counter goes
+// negative-equivalent via early zero).
+func TestApplySchedulerNoMetaNoWritesetIsGlobal(t *testing.T) {
+ ctx := t.Context()
+ s := newApplyScheduler(ctx)
+
+ opaque := &applyTxn{order: 1} // no meta, no writeset
+ other := &applyTxn{order: 2, sequenceNumber: 10, commitParent: 0, hasCommitMeta: true, writeset: []uint64{100}}
+ require.NoError(t, s.enqueue(opaque))
+ require.NoError(t, s.enqueue(other))
+
+ got, err := s.nextReady(ctx)
+ require.NoError(t, err)
+ require.Equal(t, opaque, got)
+ // Inspect the scheduler's counters under its mutex while opaque is
+ // dispatched but not yet committed.
+ s.mu.Lock()
+ require.Equal(t, 1, s.inflightGlobal, "no-meta/no-writeset must count as global")
+ require.Equal(t, 1, s.inflightMissingMeta, "no-meta/no-writeset must count as missing-meta")
+ s.mu.Unlock()
+
+ // While the opaque txn is inflight, everything else is blocked.
+ requireNoReadyTxn(t, s)
+
+ require.NoError(t, s.markCommitted(opaque))
+ // Both counters must be back to zero after the commit.
+ s.mu.Lock()
+ require.Zero(t, s.inflightGlobal, "release must balance the global count")
+ require.Zero(t, s.inflightMissingMeta, "release must balance the missing-meta count")
+ s.mu.Unlock()
+ requireReadyTxn(t, s, other)
+}
+
+// TestApplySchedulerNoMetaWritesetBlockedByInflightCommitMeta pins the
+// blocked direction of ready-check case 8: a transaction without commit
+// metadata (even with a non-conflicting writeset) must not run alongside an
+// inflight transaction that has metadata — the two metadata modes never mix.
+func TestApplySchedulerNoMetaWritesetBlockedByInflightCommitMeta(t *testing.T) {
+ ctx := t.Context()
+ s := newApplyScheduler(ctx)
+
+ withMeta := &applyTxn{order: 1, sequenceNumber: 10, commitParent: 9, hasCommitMeta: true, writeset: []uint64{100}}
+ noMeta := &applyTxn{order: 2, writeset: []uint64{200}} // disjoint writeset, no metadata
+ require.NoError(t, s.enqueue(withMeta))
+ require.NoError(t, s.enqueue(noMeta))
+
+ got, err := s.nextReady(ctx)
+ require.NoError(t, err)
+ require.Equal(t, withMeta, got)
+
+ // withMeta is inflight: noMeta must wait although keys 100 and 200 do not
+ // overlap.
+ requireNoReadyTxn(t, s)
+
+ // Once the metadata-carrying txn commits, noMeta may run.
+ require.NoError(t, s.markCommitted(withMeta))
+ requireReadyTxn(t, s, noMeta)
+}
+
+// TestApplySchedulerForceGlobalWaitsForInflightAndThenBlocksAll pins the
+// blocked direction of ready-check case 3: a forceGlobal transaction (e.g. a
+// DDL) must wait until ALL inflight work drains, and while it is inflight it
+// must block everything behind it. A regression here would let a DDL execute
+// concurrently with inflight row transactions on other connections.
+func TestApplySchedulerForceGlobalWaitsForInflightAndThenBlocksAll(t *testing.T) {
+ ctx := t.Context()
+ s := newApplyScheduler(ctx)
+
+ // Queue order: row, then the forceGlobal txn, then row2 (disjoint key).
+ row := &applyTxn{order: 1, sequenceNumber: 10, commitParent: 9, hasCommitMeta: true, writeset: []uint64{100}}
+ global := &applyTxn{order: 2, forceGlobal: true}
+ row2 := &applyTxn{order: 3, sequenceNumber: 11, commitParent: 9, hasCommitMeta: true, writeset: []uint64{200}}
+ require.NoError(t, s.enqueue(row))
+ require.NoError(t, s.enqueue(global))
+ require.NoError(t, s.enqueue(row2))
+
+ got, err := s.nextReady(ctx)
+ require.NoError(t, err)
+ require.Equal(t, row, got)
+
+ // The DDL must not be dispatchable while the row txn is inflight, and
+ // head-of-line blocking must also keep row2 queued behind it.
+ requireNoReadyTxn(t, s)
+
+ require.NoError(t, s.markCommitted(row))
+ // Dispatch via nextReady so the DDL is actually marked inflight
+ // (requireReadyTxn only pops, without marking).
+ got, err = s.nextReady(ctx)
+ require.NoError(t, err)
+ require.Equal(t, global, got)
+
+ // While the DDL is inflight, nothing else may start.
+ requireNoReadyTxn(t, s)
+
+ // Committing the DDL finally releases row2.
+ require.NoError(t, s.markCommitted(global))
+ requireReadyTxn(t, s, row2)
+}
+
+// TestApplySchedulerClosedWithUnreachablePendingWorkErrors pins the
+// abandoned-work escape hatch: when the scheduler is closed while pending
+// transactions exist that can never become ready (nothing is inflight to
+// advance the scheduler state), nextReady must return
+// errSchedulerAbandonedPendingWork — not io.EOF (which would silently drop
+// the pending suffix) and not block forever (which would leak the worker).
+func TestApplySchedulerClosedWithUnreachablePendingWorkErrors(t *testing.T) {
+ ctx := t.Context()
+ s := newApplyScheduler(ctx)
+
+ // commitParent 0 skips enqueue's lastCommittedSequence seeding, keeping
+ // the watermark at 0.
+ first := &applyTxn{order: 1, sequenceNumber: 1, commitParent: 0, hasCommitMeta: true, writeset: []uint64{100}}
+ // Empty writeset -> commit-parent fallback; parent 99 is never reached.
+ stuck := &applyTxn{order: 2, sequenceNumber: 100, commitParent: 99, hasCommitMeta: true}
+ require.NoError(t, s.enqueue(first))
+ require.NoError(t, s.enqueue(stuck))
+
+ // Dispatch and commit first so that nothing is left inflight.
+ got, err := s.nextReady(ctx)
+ require.NoError(t, err)
+ require.Equal(t, first, got)
+ require.NoError(t, s.markCommitted(got))
+
+ // Nothing inflight, the pending txn is permanently blocked, and the
+ // scheduler is closed: workers must get the abandoned-work error.
+ require.Equal(t, io.EOF, s.close())
+ _, err = s.nextReady(ctx)
+ require.ErrorIs(t, err, errSchedulerAbandonedPendingWork)
+}
+
+// TestApplySchedulerClosedWaitsForInflightNoConflict pins that the
+// abandoned-pending-work check does NOT fire while a noConflict transaction
+// is dispatched but uncommitted: a noConflict position-save carrying commit
+// metadata advances lastCommittedSequence when it commits, which can unblock
+// the pending head. Erroring early would convert a clean stop-drain into a
+// spurious workflow restart.
+func TestApplySchedulerClosedWaitsForInflightNoConflict(t *testing.T) {
+ ctx := t.Context()
+ s := newApplyScheduler(ctx)
+
+ // Position-only save with metadata: its commit publishes sequence 99.
+ save := &applyTxn{order: 1, sequenceNumber: 99, commitParent: 0, hasCommitMeta: true, noConflict: true}
+ // Blocked on commit-parent 99 (empty writeset fallback).
+ stuck := &applyTxn{order: 2, sequenceNumber: 100, commitParent: 99, hasCommitMeta: true}
+ require.NoError(t, s.enqueue(save))
+ require.NoError(t, s.enqueue(stuck))
+
+ // Dispatch the save and leave it uncommitted across the close below.
+ got, err := s.nextReady(ctx)
+ require.NoError(t, err)
+ require.Equal(t, save, got)
+
+ require.Equal(t, io.EOF, s.close())
+
+ // A worker parks in nextReady. With the save still uncommitted it must
+ // WAIT, not return errSchedulerAbandonedPendingWork.
+ type result struct {
+ txn *applyTxn
+ err error
+ }
+ // Buffered so the worker goroutine never blocks on the send.
+ resCh := make(chan result, 1)
+ go func() {
+ txn, err := s.nextReady(ctx)
+ resCh <- result{txn, err}
+ }()
+ // Any result within the 2s window is a failure; silence is success.
+ select {
+ case r := <-resCh:
+ t.Fatalf("nextReady returned early (txn=%v err=%v); it must wait for the inflight noConflict txn to commit", r.txn, r.err)
+ case <-time.After(2 * time.Second):
+ }
+
+ // Committing the save publishes sequence 99 and unblocks the head.
+ require.NoError(t, s.markCommitted(save))
+ select {
+ case r := <-resCh:
+ require.NoError(t, r.err)
+ require.Equal(t, stuck, r.txn)
+ case <-time.After(30 * time.Second):
+ t.Fatal("timed out waiting for the unblocked transaction to be dispatched")
+ }
+
+ // With nothing inflight and nothing pending, drain ends cleanly.
+ require.NoError(t, s.markCommitted(stuck))
+ _, err = s.nextReady(ctx)
+ require.ErrorIs(t, err, io.EOF)
+}
diff --git a/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_test.go b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_test.go
new file mode 100644
index 00000000000..1ae7e91d777
--- /dev/null
+++ b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_test.go
@@ -0,0 +1,7254 @@
+/*
+Copyright 2026 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package vreplication
+
+import (
+ "context"
+ "errors"
+ "fmt"
+ "io"
+ "math"
+ "strconv"
+ "strings"
+ "sync"
+ "sync/atomic"
+ "testing"
+ "time"
+
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+
+ "vitess.io/vitess/go/mysql/replication"
+ "vitess.io/vitess/go/mysql/sqlerror"
+ "vitess.io/vitess/go/sqltypes"
+ "vitess.io/vitess/go/stats"
+ "vitess.io/vitess/go/timer"
+ "vitess.io/vitess/go/vt/binlog/binlogplayer"
+ vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"
+ "vitess.io/vitess/go/vt/sqlparser"
+ "vitess.io/vitess/go/vt/vtenv"
+ "vitess.io/vitess/go/vt/vterrors"
+ "vitess.io/vitess/go/vt/vttablet/tabletserver/throttle"
+ "vitess.io/vitess/go/vt/vttablet/tabletserver/throttle/base"
+ "vitess.io/vitess/go/vt/vttablet/tabletserver/throttle/throttlerapp"
+
+ binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
+ querypb "vitess.io/vitess/go/vt/proto/query"
+ vttablet "vitess.io/vitess/go/vt/vttablet/common"
+)
+
+// testCtx hands back t.Context(), the per-test context that the testing
+// package cancels when the test finishes. Tests that build an applyScheduler
+// should use it: per the original note, newApplyScheduler starts a goroutine
+// that blocks on ctx.Done(), so a context that is never cancelled would leave
+// that goroutine running.
+func testCtx(t *testing.T) context.Context {
+	t.Helper()
+	ctx := t.Context()
+	return ctx
+}
+
+// ---------- computeLastEventTimestamp tests ----------
+
+// TestComputeLastEventTimestamp_EmptyEvents checks that both a nil slice and
+// an empty non-nil slice yield a zero timestamp and a zero current time.
+func TestComputeLastEventTimestamp_EmptyEvents(t *testing.T) {
+	// First the nil slice, then the empty non-nil slice.
+	for _, events := range [][]*binlogdatapb.VEvent{nil, {}} {
+		gotTimestamp, gotCurrentTime := computeLastEventTimestamp(events)
+		assert.Equal(t, int64(0), gotTimestamp)
+		assert.Equal(t, int64(0), gotCurrentTime)
+	}
+}
+
+// TestComputeLastEventTimestamp_LastEventHasTimestamp checks that when the
+// final event carries a timestamp, that event's values are the ones returned.
+func TestComputeLastEventTimestamp_LastEventHasTimestamp(t *testing.T) {
+	earlier := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, Timestamp: 100, CurrentTime: 200}
+	latest := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, Timestamp: 300, CurrentTime: 400}
+
+	gotTimestamp, gotCurrentTime := computeLastEventTimestamp([]*binlogdatapb.VEvent{earlier, latest})
+
+	assert.Equal(t, int64(300), gotTimestamp)
+	assert.Equal(t, int64(400), gotCurrentTime)
+}
+
+// TestComputeLastEventTimestamp_SkipsZeroTimestamp checks that a trailing
+// event with a zero timestamp is passed over in favour of the preceding
+// event that has one.
+func TestComputeLastEventTimestamp_SkipsZeroTimestamp(t *testing.T) {
+	stamped := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, Timestamp: 100, CurrentTime: 200}
+	unstamped := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT, Timestamp: 0, CurrentTime: 0}
+
+	gotTimestamp, gotCurrentTime := computeLastEventTimestamp([]*binlogdatapb.VEvent{stamped, unstamped})
+
+	assert.Equal(t, int64(100), gotTimestamp)
+	assert.Equal(t, int64(200), gotCurrentTime)
+}
+
+// TestComputeLastEventTimestamp_SkipsThrottledHeartbeat checks that a
+// trailing HEARTBEAT event marked Throttled is ignored, so the values of the
+// preceding ROW event are returned.
+func TestComputeLastEventTimestamp_SkipsThrottledHeartbeat(t *testing.T) {
+	row := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, Timestamp: 100, CurrentTime: 200}
+	throttledHeartbeat := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_HEARTBEAT, Timestamp: 500, CurrentTime: 600, Throttled: true}
+
+	gotTimestamp, gotCurrentTime := computeLastEventTimestamp([]*binlogdatapb.VEvent{row, throttledHeartbeat})
+
+	assert.Equal(t, int64(100), gotTimestamp)
+	assert.Equal(t, int64(200), gotCurrentTime)
+}
+
+// TestComputeLastEventTimestamp_NonThrottledHeartbeatCounts checks that a
+// trailing HEARTBEAT event that is not throttled does supply the returned
+// timestamp and current time.
+func TestComputeLastEventTimestamp_NonThrottledHeartbeatCounts(t *testing.T) {
+	row := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, Timestamp: 100, CurrentTime: 200}
+	heartbeat := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_HEARTBEAT, Timestamp: 500, CurrentTime: 600, Throttled: false}
+
+	gotTimestamp, gotCurrentTime := computeLastEventTimestamp([]*binlogdatapb.VEvent{row, heartbeat})
+
+	assert.Equal(t, int64(500), gotTimestamp)
+	assert.Equal(t, int64(600), gotCurrentTime)
+}
+
+// TestComputeLastEventTimestamp_AllZeroTimestamp checks that when no event
+// carries a timestamp, both returned values are zero.
+func TestComputeLastEventTimestamp_AllZeroTimestamp(t *testing.T) {
+	commit := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT, Timestamp: 0}
+	begin := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_BEGIN, Timestamp: 0}
+
+	gotTimestamp, gotCurrentTime := computeLastEventTimestamp([]*binlogdatapb.VEvent{commit, begin})
+
+	assert.Equal(t, int64(0), gotTimestamp)
+	assert.Equal(t, int64(0), gotCurrentTime)
+}
+
+// ---------- sync.Pool helpers tests ----------
+
+// TestAcquireReleaseApplyTxn checks that an applyTxn obtained after a release
+// comes back with its fields zeroed, whether or not it is the same object
+// that was released.
+func TestAcquireReleaseApplyTxn(t *testing.T) {
+	dirty := acquireApplyTxn()
+	require.NotNil(t, dirty)
+
+	// Dirty a few fields so a missing reset would be visible below.
+	dirty.order, dirty.sequenceNumber, dirty.forceGlobal = 42, 10, true
+	releaseApplyTxn(dirty)
+
+	fresh := acquireApplyTxn()
+	require.NotNil(t, fresh)
+	assert.Equal(t, int64(0), fresh.order)
+	assert.Equal(t, int64(0), fresh.sequenceNumber)
+	assert.False(t, fresh.forceGlobal)
+	releaseApplyTxn(fresh)
+}
+
+// TestAcquireReleaseApplyTxnPayload checks that a payload acquired after its
+// owning applyTxn was released comes back with its fields zeroed.
+func TestAcquireReleaseApplyTxnPayload(t *testing.T) {
+	payload := acquireApplyTxnPayload()
+	require.NotNil(t, payload)
+	payload.timestamp = 999
+	payload.mustSave = true
+
+	// Hand the dirtied payload to a txn and release the txn, not the payload.
+	owner := acquireApplyTxn()
+	owner.payload = payload
+	releaseApplyTxn(owner)
+
+	reused := acquireApplyTxnPayload()
+	require.NotNil(t, reused)
+	assert.Equal(t, int64(0), reused.timestamp)
+	assert.False(t, reused.mustSave)
+	applyTxnPayloadPool.Put(reused)
+}
+
+// TestReleaseApplyTxnNilPayload checks that releasing an applyTxn whose
+// payload is nil does not panic.
+func TestReleaseApplyTxnNilPayload(t *testing.T) {
+	withoutPayload := acquireApplyTxn()
+	withoutPayload.order = 5
+	withoutPayload.payload = nil
+
+	// The test passes as long as this call returns normally.
+	releaseApplyTxn(withoutPayload)
+}
+
+// ---------- scheduler gaps: advanceCommittedSequence, waitForIdle, close ----------
+
+// TestApplySchedulerAdvanceCommittedSequence verifies that
+// advanceCommittedSequence only ever moves lastCommittedSequence forward:
+// larger values are adopted, while smaller, zero, and negative values leave
+// it unchanged.
+func TestApplySchedulerAdvanceCommittedSequence(t *testing.T) {
+	ctx := testCtx(t)
+	s := newApplyScheduler(ctx)
+
+	// Every read of the field goes through the scheduler mutex, including the
+	// initial one, so the test reads it the same way throughout.
+	committed := func() int64 {
+		s.mu.Lock()
+		defer s.mu.Unlock()
+		return s.lastCommittedSequence
+	}
+
+	// Initially zero
+	assert.Equal(t, int64(0), committed())
+
+	steps := []struct {
+		name      string
+		advanceTo int64
+		want      int64
+	}{
+		{name: "advance to 5", advanceTo: 5, want: 5},
+		{name: "advance to 10", advanceTo: 10, want: 10},
+		{name: "lower value does not regress", advanceTo: 3, want: 10},
+		{name: "zero is a no-op", advanceTo: 0, want: 10},
+		{name: "negative is a no-op", advanceTo: -1, want: 10},
+	}
+	// The steps are cumulative and must run in this order.
+	for _, step := range steps {
+		s.advanceCommittedSequence(step.advanceTo)
+		assert.Equal(t, step.want, committed(), step.name)
+	}
+}
+
+// TestApplySchedulerAdvanceUnblocksMeta verifies that a commit-metadata
+// transaction with an empty writeset stays blocked while a non-metadata
+// transaction is inflight and while lastCommittedSequence is below its
+// commitParent, and that advanceCommittedSequence(commitParent) releases it.
+func TestApplySchedulerAdvanceUnblocksMeta(t *testing.T) {
+	ctx := testCtx(t)
+	s := newApplyScheduler(ctx)
+
+	// Enqueue a non-meta txn first AND keep it inflight so that when
+	// meta2 is enqueued, the seeding condition is NOT met (inflightMissingMeta > 0).
+	// This ensures lastCommittedSequence stays 0 and meta2 is blocked.
+	blocker := &applyTxn{order: 1, writeset: []uint64{100}}
+	require.NoError(t, s.enqueue(blocker))
+	gotBlocker, err := s.nextReady(ctx)
+	require.NoError(t, err)
+	require.Equal(t, blocker, gotBlocker)
+	// blocker is still inflight (inflightMissingMeta=1)
+
+	meta2 := &applyTxn{order: 2, sequenceNumber: 5, commitParent: 3, hasCommitMeta: true}
+	require.NoError(t, s.enqueue(meta2))
+
+	// meta2 has empty writeset, commitParent=3, lastCommittedSequence=0.
+	// Also blocked by inflightMissingMeta > 0 from the blocker.
+	readyCh := make(chan *applyTxn, 1)
+	go func() {
+		// Errors are dropped on purpose: presumably the only one that can occur
+		// is context cancellation at teardown, and reporting through t after
+		// the test has finished would panic.
+		txn, err := s.nextReady(ctx)
+		if err == nil {
+			readyCh <- txn
+		}
+	}()
+
+	assert.Never(t, func() bool {
+		return len(readyCh) > 0
+	}, 50*time.Millisecond, 5*time.Millisecond)
+
+	// Commit the blocker to clear inflightMissingMeta, but
+	// lastCommittedSequence is still 0 so meta2 stays blocked.
+	require.NoError(t, s.markCommitted(gotBlocker))
+
+	assert.Never(t, func() bool {
+		return len(readyCh) > 0
+	}, 50*time.Millisecond, 5*time.Millisecond)
+
+	// Now advance committed sequence to 3 — should unblock meta2
+	s.advanceCommittedSequence(3)
+
+	// require, with a generous bound: the receive below must not block
+	// forever, and a short positive timeout is a flake source on loaded CI.
+	// Eventually returns as soon as the condition holds.
+	require.Eventually(t, func() bool {
+		return len(readyCh) > 0
+	}, 30*time.Second, 5*time.Millisecond)
+
+	// Confirm that the dispatched transaction is meta2 itself.
+	require.Same(t, meta2, <-readyCh)
+}
+
+// TestApplySchedulerWaitForIdle verifies that waitForIdle returns immediately
+// on an empty scheduler, blocks while a dispatched transaction is
+// uncommitted (both a writeset transaction and a noConflict one), and
+// returns without error once that transaction is committed.
+func TestApplySchedulerWaitForIdle(t *testing.T) {
+	ctx := testCtx(t)
+	s := newApplyScheduler(ctx)
+
+	// Empty scheduler: waitForIdle returns immediately
+	err := s.waitForIdle(ctx)
+	require.NoError(t, err)
+
+	// Enqueue and dequeue a txn, mark committed, then waitForIdle
+	txn := &applyTxn{order: 1, writeset: []uint64{100}}
+	require.NoError(t, s.enqueue(txn))
+	got, err := s.nextReady(ctx)
+	require.NoError(t, err)
+
+	// With inflight txn, waitForIdle should block
+	doneCh := make(chan error, 1)
+	go func() {
+		doneCh <- s.waitForIdle(ctx)
+	}()
+
+	assert.Never(t, func() bool {
+		return len(doneCh) > 0
+	}, 50*time.Millisecond, 5*time.Millisecond)
+
+	// Mark committed → idle
+	require.NoError(t, s.markCommitted(got))
+
+	// require (not assert): the receive from doneCh below would otherwise
+	// block until the global test timeout if waitForIdle never returns.
+	// Eventually returns as soon as the condition holds, so the generous bound
+	// costs nothing on success.
+	require.Eventually(t, func() bool {
+		return len(doneCh) > 0
+	}, 30*time.Second, 5*time.Millisecond)
+	require.NoError(t, <-doneCh)
+
+	// A dispatched-but-uncommitted noConflict txn (e.g. a position-only save)
+	// must also keep waitForIdle blocked: the DDL barrier relies on it to
+	// guarantee ALL scheduled work has been applied before the next fetch.
+	// noConflict txns bump only inflightNoConflict, so an idle check that
+	// omits that counter would let the barrier return too early.
+	noConflict := &applyTxn{order: 2, noConflict: true}
+	require.NoError(t, s.enqueue(noConflict))
+	gotNoConflict, err := s.nextReady(ctx)
+	require.NoError(t, err)
+	require.Same(t, noConflict, gotNoConflict)
+
+	noConflictDone := make(chan error, 1)
+	go func() {
+		noConflictDone <- s.waitForIdle(ctx)
+	}()
+	assert.Never(t, func() bool {
+		return len(noConflictDone) > 0
+	}, 50*time.Millisecond, 5*time.Millisecond)
+
+	require.NoError(t, s.markCommitted(gotNoConflict))
+	// Same reasoning as above: fail here rather than hang on the receive.
+	require.Eventually(t, func() bool {
+		return len(noConflictDone) > 0
+	}, 30*time.Second, 5*time.Millisecond)
+	require.NoError(t, <-noConflictDone)
+}
+
+// TestApplySchedulerWaitForIdleCancelled checks that waitForIdle, blocked on
+// an inflight forceGlobal txn, returns an error once its context is cancelled.
+func TestApplySchedulerWaitForIdleCancelled(t *testing.T) {
+	ctx, cancel := context.WithCancel(t.Context())
+	// Release the context even if an early require aborts the test before
+	// the explicit cancel below; calling cancel twice is harmless.
+	defer cancel()
+	s := newApplyScheduler(ctx)
+
+	txn := &applyTxn{order: 1, forceGlobal: true}
+	require.NoError(t, s.enqueue(txn))
+	_, err := s.nextReady(ctx)
+	require.NoError(t, err)
+
+	doneCh := make(chan error, 1)
+	go func() {
+		doneCh <- s.waitForIdle(ctx)
+	}()
+
+	cancel()
+
+	// require (not assert): if waitForIdle ignores the cancellation, the
+	// receive below would block and hang the test instead of failing it.
+	require.Eventually(t, func() bool {
+		return len(doneCh) > 0
+	}, 200*time.Millisecond, 5*time.Millisecond)
+	err = <-doneCh
+	require.Error(t, err)
+}
+
+// TestApplySchedulerClose enqueues two txns, closes the scheduler, and checks
+// that close returns an error while the pending bookkeeping (pendingCount,
+// pending, pendingOff) is left untouched.
+func TestApplySchedulerClose(t *testing.T) {
+	ctx := testCtx(t)
+	s := newApplyScheduler(ctx)
+
+	// locked runs check while holding the scheduler mutex.
+	locked := func(check func()) {
+		s.mu.Lock()
+		defer s.mu.Unlock()
+		check()
+	}
+
+	queued := []*applyTxn{
+		{order: 1, writeset: []uint64{100}},
+		{order: 2, writeset: []uint64{200}},
+	}
+	for _, txn := range queued {
+		require.NoError(t, s.enqueue(txn))
+	}
+
+	locked(func() {
+		assert.Equal(t, 2, s.pendingCount)
+	})
+
+	// The original author notes that close returns io.EOF.
+	require.Error(t, s.close())
+
+	locked(func() {
+		assert.Equal(t, 2, s.pendingCount)
+		assert.Len(t, s.pending, 2)
+		assert.Equal(t, 0, s.pendingOff)
+	})
+}
+
+// ---------- noConflict scheduling tests ----------
+
+// TestApplySchedulerNoConflictAlwaysReady checks that a noConflict txn is
+// returned by nextReady even while a forceGlobal txn is still inflight.
+func TestApplySchedulerNoConflictAlwaysReady(t *testing.T) {
+	ctx := testCtx(t)
+	s := newApplyScheduler(ctx)
+
+	// Dispatch a forceGlobal txn and leave it uncommitted.
+	globalTxn := &applyTxn{order: 1, forceGlobal: true}
+	require.NoError(t, s.enqueue(globalTxn))
+	dispatchedGlobal, err := s.nextReady(ctx)
+	require.NoError(t, err)
+	require.Equal(t, globalTxn, dispatchedGlobal)
+
+	// The noConflict txn must be handed out despite the inflight global.
+	noConflictTxn := &applyTxn{order: 2, noConflict: true}
+	require.NoError(t, s.enqueue(noConflictTxn))
+	dispatchedNoConflict, err := s.nextReady(ctx)
+	require.NoError(t, err)
+	require.Equal(t, noConflictTxn, dispatchedNoConflict)
+
+	for _, txn := range []*applyTxn{dispatchedGlobal, dispatchedNoConflict} {
+		require.NoError(t, s.markCommitted(txn))
+	}
+}
+
+// ---------- removePendingLocked compaction tests ----------
+
+func TestApplySchedulerRemovePendingCompaction(t *testing.T) {
+ ctx := testCtx(t)
+ s := newApplyScheduler(ctx)
+
+ // Enqueue 4 transactions with independent writesets
+ for i := int64(1); i <= 4; i++ {
+ require.NoError(t, s.enqueue(&applyTxn{order: i, writeset: []uint64{uint64(i)}}))
+ }
+
+ // Dequeue all 4 — this exercises removePendingLocked compaction
+ for range 4 {
+ got, err := s.nextReady(ctx)
+ require.NoError(t, err)
+ require.NotNil(t, got)
+ require.NoError(t, s.markCommitted(got))
+ }
+
+ s.mu.Lock()
+ assert.Equal(t, 0, s.pendingCount)
+ s.mu.Unlock()
+}
+
+// ---------- snapshotTablePlans tests ----------
+
+// TestSnapshotTablePlans_Nil checks that snapshotTablePlans yields nil when
+// both the source plans and the cached snapshot are nil.
+func TestSnapshotTablePlans_Nil(t *testing.T) {
+	var (
+		mu            sync.RWMutex
+		version       atomic.Int64
+		cachedVersion int64
+	)
+	assert.Nil(t, snapshotTablePlans(&mu, nil, &version, &cachedVersion, nil))
+}
+
+// TestSnapshotTablePlans_CopiesMap checks that the snapshot holds the source
+// entries, records the current version in cachedVersion, and is unaffected by
+// later additions to the source map.
+func TestSnapshotTablePlans_CopiesMap(t *testing.T) {
+	var (
+		mu            sync.RWMutex
+		version       atomic.Int64
+		cachedVersion int64
+	)
+	version.Store(1)
+	source := map[string]*TablePlan{
+		"t1": {TargetName: "t1"},
+		"t2": {TargetName: "t2"},
+	}
+
+	snapshot := snapshotTablePlans(&mu, source, &version, &cachedVersion, nil)
+	require.Len(t, snapshot, 2)
+	for _, name := range []string{"t1", "t2"} {
+		assert.Equal(t, name, snapshot[name].TargetName)
+	}
+	assert.Equal(t, int64(1), cachedVersion)
+
+	// Growing the source afterwards must not leak into the snapshot.
+	source["t3"] = &TablePlan{TargetName: "t3"}
+	assert.Len(t, snapshot, 2)
+}
+
+// TestSnapshotTablePlans_UsesCacheWhenVersionMatches checks that the cached
+// map is returned as-is when cachedVersion equals the current version.
+func TestSnapshotTablePlans_UsesCacheWhenVersionMatches(t *testing.T) {
+	var (
+		mu      sync.RWMutex
+		version atomic.Int64
+	)
+	version.Store(5)
+	cachedVersion := int64(5)
+
+	source := map[string]*TablePlan{"t1": {TargetName: "t1"}}
+	cachedSnapshot := map[string]*TablePlan{"cached": {TargetName: "cached"}}
+
+	result := snapshotTablePlans(&mu, source, &version, &cachedVersion, cachedSnapshot)
+
+	// Versions match, so the cached entry is what comes back.
+	require.Len(t, result, 1)
+	assert.Equal(t, "cached", result["cached"].TargetName)
+}
+
+// TestSnapshotTablePlans_RefreshesCacheWhenVersionChanges checks that a stale
+// cached map is replaced by the source contents and cachedVersion is bumped
+// when the current version differs.
+func TestSnapshotTablePlans_RefreshesCacheWhenVersionChanges(t *testing.T) {
+	var (
+		mu      sync.RWMutex
+		version atomic.Int64
+	)
+	version.Store(6)
+	cachedVersion := int64(5)
+
+	source := map[string]*TablePlan{"t1": {TargetName: "t1"}}
+	staleSnapshot := map[string]*TablePlan{"stale": {TargetName: "stale"}}
+
+	result := snapshotTablePlans(&mu, source, &version, &cachedVersion, staleSnapshot)
+
+	require.Len(t, result, 1)
+	assert.Equal(t, "t1", result["t1"].TargetName)
+	assert.Equal(t, int64(6), cachedVersion)
+}
+
+// ---------- scheduleItems tests ----------
+
+// testVPlayer assembles a bare-bones vplayer for the scheduleItems tests.
+// Its query and commit hooks are no-op stubs, and it is returned together with
+// the MockDBClient that backs its vdbClient.
+func testVPlayer(t *testing.T) (*vplayer, *binlogplayer.MockDBClient) {
+	t.Helper()
+
+	mockDB := binlogplayer.NewMockDBClient(t)
+
+	stats := binlogplayer.NewStats()
+	stats.VReplicationLagGauges.Stop()
+	t.Cleanup(stats.Stop)
+
+	config, err := vttablet.NewVReplicationConfig(nil)
+	require.NoError(t, err)
+
+	dbClient := newVDBClient(mockDB, stats, config.RelayLogMaxItems)
+	noopQuery := func(_ context.Context, _ string) (*sqltypes.Result, error) {
+		return &sqltypes.Result{}, nil
+	}
+	noopCommit := func() error { return nil }
+
+	vr := &vreplicator{
+		id:             1,
+		stats:          stats,
+		dbClient:       dbClient,
+		workflowConfig: config,
+		vre:            &Engine{},
+		source:         &binlogdatapb.BinlogSource{OnDdl: binlogdatapb.OnDDLAction_IGNORE},
+	}
+
+	return &vplayer{
+		vr:                vr,
+		tablePlansMu:      &sync.RWMutex{},
+		tablePlans:        make(map[string]*TablePlan),
+		tablePlansVersion: &atomic.Int64{},
+		serialMu:          &sync.Mutex{},
+		parallelOrder:     &atomic.Int64{},
+		lagSnapshot:       &atomic.Pointer[lagSnapshot]{},
+		timeLastSaved:     time.Now(),
+		idStr:             "1",
+		query:             noopQuery,
+		commit:            noopCommit,
+		dbClient:          dbClient,
+	}, mockDB
+}
+
+func TestTestVPlayerDoesNotMutateDefaultWorkflowConfig(t *testing.T) {
+ defaults := vttablet.InitVReplicationConfigDefaults()
+ savedWorkers := defaults.ParallelReplicationWorkers
+ t.Cleanup(func() {
+ defaults.ParallelReplicationWorkers = savedWorkers
+ })
+ defaults.ParallelReplicationWorkers = 1
+
+ vp, _ := testVPlayer(t)
+ vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+
+ config, err := vttablet.NewVReplicationConfig(nil)
+ require.NoError(t, err)
+ assert.Equal(t, 1, config.ParallelReplicationWorkers)
+}
+
+func TestExtractDDLAffectedTables_MixedCaseDDLMatchesLowercasePlan(t *testing.T) {
+ tracked, conservative := extractDDLAffectedTables(
+ "alter table T1 add column c1 bigint",
+ sqlparser.NewTestParser(),
+ map[string]*TablePlan{
+ "t1": {TargetName: "t1"},
+ },
+ nil,
+ )
+
+ require.False(t, conservative)
+ require.Contains(t, tracked, "t1")
+ require.Contains(t, tracked["t1"].refreshedPlans, "t1")
+}
+
+// TestResolvedPostDDLStalePlans_MixedCaseDroppedNameMatchesLowercaseBarrier
+// checks that a dropped-table entry "T1" resolves the barrier keyed "t1".
+func TestResolvedPostDDLStalePlans_MixedCaseDroppedNameMatchesLowercaseBarrier(t *testing.T) {
+	stalePlan := &TablePlan{TargetName: "t1"}
+	currentPlans := map[string]*TablePlan{"t1": stalePlan}
+	dropped := map[string]struct{}{"T1": {}}
+	barrier := map[string]postDDLStalePlan{
+		"t1": {
+			stalePlan:      stalePlan,
+			refreshedPlans: map[string]*TablePlan{"t1": stalePlan},
+			allowDisappear: true,
+		},
+	}
+
+	resolved := resolvedPostDDLStalePlans(currentPlans, dropped, barrier)
+
+	require.Contains(t, resolved, "t1")
+}
+
+// TestSnapshotPostDDLStalePlans_MixedCaseDroppedNameSkipsLowercasePlan checks
+// that a plan keyed "t1" is skipped when the dropped set contains "T1",
+// leaving a nil result.
+func TestSnapshotPostDDLStalePlans_MixedCaseDroppedNameSkipsLowercasePlan(t *testing.T) {
+	currentPlans := map[string]*TablePlan{"t1": {TargetName: "t1"}}
+	dropped := map[string]struct{}{"T1": {}}
+
+	assert.Nil(t, snapshotPostDDLStalePlans(currentPlans, dropped))
+}
+
+// TestUnresolvedPostDDLStalePlans_MixedCaseRefreshNameMatchesLowercasePlan
+// checks that a refresh target "T1_NEW" is matched by the plan keyed
+// "t1_new", so nothing is reported as unresolved.
+func TestUnresolvedPostDDLStalePlans_MixedCaseRefreshNameMatchesLowercasePlan(t *testing.T) {
+	currentPlans := map[string]*TablePlan{"t1_new": {TargetName: "t1_new"}}
+	barrier := map[string]postDDLStalePlan{
+		"t1": {
+			stalePlan:      &TablePlan{TargetName: "t1"},
+			refreshedPlans: map[string]*TablePlan{"T1_NEW": nil},
+		},
+	}
+
+	assert.Nil(t, unresolvedPostDDLStalePlans(currentPlans, nil, barrier))
+}
+
+// TestTxnTouchesPostDDLBarrier_MixedCaseRefreshTargetMatchesLowercaseRow
+// checks that a ROW event on "t1_new" is reported as touching a barrier whose
+// refresh target is spelled "T1_NEW".
+func TestTxnTouchesPostDDLBarrier_MixedCaseRefreshTargetMatchesLowercaseRow(t *testing.T) {
+	rowEvent := &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_ROW,
+		RowEvent: &binlogdatapb.RowEvent{
+			TableName:  "t1_new",
+			RowChanges: []*binlogdatapb.RowChange{{}},
+		},
+	}
+	barrier := map[string]postDDLStalePlan{
+		"t1": {
+			stalePlan:      &TablePlan{TargetName: "t1"},
+			refreshedPlans: map[string]*TablePlan{"T1_NEW": nil},
+		},
+	}
+
+	touches := txnTouchesPostDDLBarrier([]*binlogdatapb.VEvent{rowEvent}, barrier, false)
+
+	assert.True(t, touches)
+}
+
+// TestPostDDLRefreshTargetMatchesCachedPlan_MixedCaseRefreshNameMatches
+// checks that looking up "t1_new" matches a refresh target stored under
+// "T1_NEW" when both refer to the same cached plan.
+func TestPostDDLRefreshTargetMatchesCachedPlan_MixedCaseRefreshNameMatches(t *testing.T) {
+	cachedPlan := &TablePlan{TargetName: "t1_new"}
+	barrier := map[string]postDDLStalePlan{
+		"t1": {
+			stalePlan:      &TablePlan{TargetName: "t1"},
+			refreshedPlans: map[string]*TablePlan{"T1_NEW": cachedPlan},
+		},
+	}
+
+	assert.True(t, postDDLRefreshTargetMatchesCachedPlan(barrier, "t1_new", cachedPlan))
+}
+
+// TestApplyEvent_FieldClearsMixedCaseDroppedTableEntry applies a FIELD event
+// for "t1" while postDDLDroppedTables holds "T1" and expects the dropped-table
+// set to be empty afterwards.
+func TestApplyEvent_FieldClearsMixedCaseDroppedTableEntry(t *testing.T) {
+	vp, mockDB := testVPlayer(t)
+	ctx := testCtx(t)
+
+	vp.vr.workflowConfig.ParallelReplicationWorkers = 1
+	vp.postDDLDroppedTables = map[string]struct{}{"T1": {}}
+	mockDB.AddInvariant("begin", &sqltypes.Result{})
+
+	t1Plan := &TablePlan{
+		TargetName:       "t1",
+		IdentityColumns:  []string{"id"},
+		Insert:           sqlparser.BuildParsedQuery("insert into t1 values (:a)"),
+		TablePlanBuilder: &tablePlanBuilder{},
+	}
+	vp.replicatorPlan = &ReplicatorPlan{TablePlans: map[string]*TablePlan{"t1": t1Plan}}
+
+	fieldEvent := &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_FIELD,
+		FieldEvent: &binlogdatapb.FieldEvent{
+			TableName: "t1",
+			Fields: []*querypb.Field{
+				{Name: "id", Type: querypb.Type_INT64},
+			},
+		},
+	}
+	err := vp.applyEvent(ctx, fieldEvent, false)
+	require.NoError(t, err)
+
+	assert.Empty(t, vp.postDDLDroppedTables)
+}
+
+// publishExecutedDDLBarrier mirrors commitLoop's post-DDL publication so the
+// scheduler tests can model only DDLs that actually executed on the target.
+//
+// It holds vp.serialMu for the whole call and vp.tablePlansMu (read lock)
+// while it reads vp.tablePlans. It then updates vp.postDDLStalePlans,
+// vp.postDDLConservative and vp.postDDLDroppedTables from the given statement.
+func publishExecutedDDLBarrier(t *testing.T, vp *vplayer, statement string) {
+ t.Helper()
+ vp.serialMu.Lock()
+ defer vp.serialMu.Unlock()
+ // Everything between RLock and RUnlock reads vp.tablePlans.
+ vp.tablePlansMu.RLock()
+ // NOTE(review): dereferences vp.vr.vre.env; callers presumably set env on the Engine first — confirm.
+ renameTargets := extractDDLRenameTargets(statement, vp.vr.vre.env.Parser())
+ retargetPostDDLStalePlans(vp.postDDLStalePlans, renameTargets, vp.tablePlans)
+ ddlStalePlans, conservative := extractDDLAffectedTables(statement, vp.vr.vre.env.Parser(), vp.tablePlans, vp.postDDLDroppedTables)
+ ddlStalePlans = unresolvedPostDDLStalePlans(vp.tablePlans, vp.postDDLDroppedTables, ddlStalePlans)
+ vp.tablePlansMu.RUnlock()
+ // Merge this statement's results into the state accumulated so far; the conservative flag is sticky once set.
+ vp.postDDLStalePlans = mergePostDDLStalePlans(vp.postDDLStalePlans, ddlStalePlans)
+ vp.postDDLConservative = vp.postDDLConservative || conservative
+ vp.postDDLDroppedTables = mergeDroppedTables(vp.postDDLDroppedTables, extractDroppedTables(statement, vp.vr.vre.env.Parser()))
+}
+
+// commitScheduledExecutedDDL stands in for commitLoop committing a DDL: it
+// takes the next ready txn (which must start with a DDL event), publishes the
+// resulting barrier, marks the txn committed, and then runs scheduleItems with
+// no new items so the scheduler state picks the barrier up as the next fetch
+// would.
+func commitScheduledExecutedDDL(t *testing.T, ctx context.Context, scheduler *applyScheduler, state *parallelScheduleState, vp *vplayer) {
+	t.Helper()
+
+	ddlTxn, err := scheduler.nextReady(ctx)
+	require.NoError(t, err)
+
+	firstEvent := ddlTxn.payload.events[0]
+	require.Equal(t, binlogdatapb.VEventType_DDL, firstEvent.Type)
+	publishExecutedDDLBarrier(t, vp, firstEvent.Statement)
+
+	err = scheduler.markCommitted(ddlTxn)
+	require.NoError(t, err)
+	err = vp.scheduleItems(ctx, scheduler, state, nil)
+	require.NoError(t, err)
+}
+
+// TestApplyEventsParallelCanceledContext checks that applyEventsParallel
+// returns context.Canceled when handed a context that is already cancelled.
+func TestApplyEventsParallelCanceledContext(t *testing.T) {
+	vp, _ := testVPlayer(t)
+	vp.vr.workflowConfig.ParallelReplicationWorkers = 1
+
+	// Cancel before anything runs.
+	cancelledCtx, cancel := context.WithCancel(testCtx(t))
+	cancel()
+
+	err := vp.applyEventsParallel(cancelledCtx, newRelayLog(cancelledCtx, 10, 100))
+	require.ErrorIs(t, err, context.Canceled)
+}
+
+// TestApplyEventsParallelReturnsScheduleError sends a single GTID event whose
+// Gtid is the literal "invalid" through the relay log and expects
+// applyEventsParallel to return a non-nil error.
+func TestApplyEventsParallelReturnsScheduleError(t *testing.T) {
+ vp, mockDB := testVPlayer(t)
+ ctx := testCtx(t)
+
+ vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+
+ // Mock DB setup: canned results registered as invariants for the statements below.
+ mockDB.AddInvariant("set @@session.time_zone", &sqltypes.Result{})
+ mockDB.AddInvariant("set session transaction isolation level read committed", &sqltypes.Result{})
+ mockDB.AddInvariant("set names 'binary'", &sqltypes.Result{})
+ mockDB.AddInvariant("set @@session.net_read_timeout", &sqltypes.Result{})
+ mockDB.AddInvariant("set @@session.net_write_timeout", &sqltypes.Result{})
+ mockDB.AddInvariant("set @@session.sql_mode", &sqltypes.Result{})
+ mockDB.AddInvariant("information_schema.key_column_usage", &sqltypes.Result{})
+ mockDB.AddInvariant("select pos, stop_pos, max_tps, max_replication_lag, state, workflow_type, workflow, workflow_sub_type, defer_secondary_keys, options from _vt.vreplication where id=1", sqlModeWorkflowSettingsResult(binlogdatapb.VReplicationWorkflowType_MoveTables))
+ mockDB.AddInvariant("select @@session.sql_mode as sql_mode", sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("sql_mode", "varchar"),
+ "STRICT_TRANS_TABLES",
+ ))
+ mockDB.AddInvariant("select count(distinct table_name) from _vt.copy_state where vrepl_id=1", sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("count(distinct table_name)", "int64"),
+ "0",
+ ))
+ mockDB.AddInvariant("max_allowed_packet", sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("max_allowed_packet", "int64"),
+ "4194304",
+ ))
+ mockDB.AddInvariant("rollback", &sqltypes.Result{})
+
+ // Fill in the engine dependencies only where they are still unset.
+ if vp.vr.vre == nil {
+ vp.vr.vre = &Engine{}
+ }
+ if vp.vr.vre.throttlerClient == nil {
+ vp.vr.vre.throttlerClient = throttle.NewBackgroundClient(nil, throttlerapp.VReplicationName, base.UndefinedScope)
+ }
+ if vp.vr.vre.dbClientFactoryFiltered == nil {
+ vp.vr.vre.dbClientFactoryFiltered = func() binlogplayer.DBClient { return mockDB }
+ }
+
+ // The only relay item is the GTID event with the "invalid" value.
+ relay := newRelayLog(ctx, 10, 100)
+ invalidGTID := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "invalid"}
+ require.NoError(t, relay.Send([]*binlogdatapb.VEvent{invalidGTID}))
+
+ err := vp.applyEventsParallel(ctx, relay)
+ require.Error(t, err)
+}
+
+// TestApplyEventsParallelCommitsScheduledPrefixBeforeScheduleError sends a
+// valid GTID, an OTHER event, and then a GTID whose value is "invalid". It
+// expects applyEventsParallel to return an error, and registers one mock
+// expectation matching `update _vt\.vreplication set pos=` for the events
+// that precede the invalid GTID.
+func TestApplyEventsParallelCommitsScheduledPrefixBeforeScheduleError(t *testing.T) {
+ vp, mockDB := testVPlayer(t)
+ ctx := testCtx(t)
+
+ vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+ vp.vr.workflowConfig.StoreCompressedGTID = false
+
+ // Mock DB setup: invariants plus a single expected position update.
+ mockDB.AddInvariant("set @@session.time_zone", &sqltypes.Result{})
+ mockDB.AddInvariant("set session transaction isolation level read committed", &sqltypes.Result{})
+ mockDB.AddInvariant("set names 'binary'", &sqltypes.Result{})
+ mockDB.AddInvariant("set @@session.net_read_timeout", &sqltypes.Result{})
+ mockDB.AddInvariant("set @@session.net_write_timeout", &sqltypes.Result{})
+ mockDB.AddInvariant("set @@session.sql_mode", &sqltypes.Result{})
+ mockDB.AddInvariant("set @@session.foreign_key_checks", &sqltypes.Result{})
+ mockDB.AddInvariant("information_schema.key_column_usage", &sqltypes.Result{})
+ mockDB.AddInvariant("select pos, stop_pos, max_tps, max_replication_lag, state, workflow_type, workflow, workflow_sub_type, defer_secondary_keys, options from _vt.vreplication where id=1", sqlModeWorkflowSettingsResult(binlogdatapb.VReplicationWorkflowType_MoveTables))
+ mockDB.AddInvariant("select count(distinct table_name) from _vt.copy_state where vrepl_id=1", sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("count(distinct table_name)", "int64"),
+ "0",
+ ))
+ mockDB.AddInvariant("max_allowed_packet", sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("max_allowed_packet", "int64"),
+ "4194304",
+ ))
+ mockDB.ExpectRequestRE("update _vt\\.vreplication set pos=", &sqltypes.Result{}, nil)
+ mockDB.AddInvariant("rollback", &sqltypes.Result{})
+ // Route the vplayer's query and commit hooks through its real dbClient instead of the no-op stubs.
+ vp.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+ return vp.dbClient.Execute(sql)
+ }
+ vp.commit = vp.dbClient.Commit
+
+ // Fill in the engine dependencies only where they are still unset.
+ if vp.vr.vre == nil {
+ vp.vr.vre = &Engine{}
+ }
+ if vp.vr.vre.throttlerClient == nil {
+ vp.vr.vre.throttlerClient = throttle.NewBackgroundClient(nil, throttlerapp.VReplicationName, base.UndefinedScope)
+ }
+ if vp.vr.vre.dbClientFactoryFiltered == nil {
+ vp.vr.vre.dbClientFactoryFiltered = func() binlogplayer.DBClient { return mockDB }
+ }
+
+ relay := newRelayLog(ctx, 10, 100)
+ validGTID := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}
+ otherEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_OTHER, Timestamp: 100}
+ invalidGTID := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "invalid"}
+ require.NoError(t, relay.Send([]*binlogdatapb.VEvent{validGTID, otherEvent, invalidGTID}))
+
+ err := vp.applyEventsParallel(ctx, relay)
+ require.Error(t, err)
+ // NOTE(review): presumably waits until the expectation registered above has been consumed — confirm against MockDBClient.
+ mockDB.Wait()
+}
+
+// TestApplyEventsParallelReturnsNilAfterScheduledStopPosEvenIfLaterScheduleFails
+// sets vp.stopPos to the position carried by the first GTID event, follows it
+// with an OTHER event and a GTID whose value is "invalid", and expects
+// applyEventsParallel to return nil.
+func TestApplyEventsParallelReturnsNilAfterScheduledStopPosEvenIfLaterScheduleFails(t *testing.T) {
+ vp, mockDB := testVPlayer(t)
+ ctx := testCtx(t)
+
+ vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+ vp.vr.workflowConfig.StoreCompressedGTID = false
+
+ // Mock DB setup: invariants plus a single expected position update.
+ mockDB.AddInvariant("set @@session.time_zone", &sqltypes.Result{})
+ mockDB.AddInvariant("set session transaction isolation level read committed", &sqltypes.Result{})
+ mockDB.AddInvariant("set names 'binary'", &sqltypes.Result{})
+ mockDB.AddInvariant("set @@session.net_read_timeout", &sqltypes.Result{})
+ mockDB.AddInvariant("set @@session.net_write_timeout", &sqltypes.Result{})
+ mockDB.AddInvariant("set @@session.sql_mode", &sqltypes.Result{})
+ mockDB.AddInvariant("information_schema.key_column_usage", &sqltypes.Result{})
+ mockDB.AddInvariant("select pos, stop_pos, max_tps, max_replication_lag, state, workflow_type, workflow, workflow_sub_type, defer_secondary_keys, options from _vt.vreplication where id=1", sqlModeWorkflowSettingsResult(binlogdatapb.VReplicationWorkflowType_MoveTables))
+ mockDB.AddInvariant("select @@session.sql_mode as sql_mode", sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("sql_mode", "varchar"),
+ "STRICT_TRANS_TABLES",
+ ))
+ mockDB.AddInvariant("select count(distinct table_name) from _vt.copy_state where vrepl_id=1", sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("count(distinct table_name)", "int64"),
+ "0",
+ ))
+ mockDB.AddInvariant("max_allowed_packet", sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("max_allowed_packet", "int64"),
+ "4194304",
+ ))
+ mockDB.ExpectRequestRE("update _vt\\.vreplication set pos=", &sqltypes.Result{}, nil)
+ mockDB.AddInvariant("rollback", &sqltypes.Result{})
+ // Route the vplayer's query and commit hooks through its real dbClient instead of the no-op stubs.
+ vp.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+ return vp.dbClient.Execute(sql)
+ }
+ vp.commit = vp.dbClient.Commit
+
+ // Fill in the engine dependencies only where they are still unset.
+ if vp.vr.vre == nil {
+ vp.vr.vre = &Engine{}
+ }
+ if vp.vr.vre.throttlerClient == nil {
+ vp.vr.vre.throttlerClient = throttle.NewBackgroundClient(nil, throttlerapp.VReplicationName, base.UndefinedScope)
+ }
+ if vp.vr.vre.dbClientFactoryFiltered == nil {
+ vp.vr.vre.dbClientFactoryFiltered = func() binlogplayer.DBClient { return mockDB }
+ }
+
+ // The stop position equals the position of the first GTID event sent below.
+ stopPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+ require.NoError(t, err)
+ vp.stopPos = stopPos
+
+ relay := newRelayLog(ctx, 10, 100)
+ validGTID := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: replication.EncodePosition(stopPos)}
+ otherEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_OTHER, Timestamp: 100}
+ invalidGTID := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "invalid"}
+ require.NoError(t, relay.Send([]*binlogdatapb.VEvent{validGTID, otherEvent, invalidGTID}))
+
+ err = vp.applyEventsParallel(ctx, relay)
+ require.NoError(t, err)
+ // NOTE(review): presumably waits until the expectation registered above has been consumed — confirm against MockDBClient.
+ mockDB.Wait()
+}
+
+// TestApplyEventsParallelReturnsNilAfterEmptyTxnStopPosEvenIfLaterScheduleFails
+// sets vp.stopPos to the position carried by the first GTID event, follows it
+// directly with a COMMIT event (no row events in between) and a GTID whose
+// value is "invalid", and expects applyEventsParallel to return nil.
+func TestApplyEventsParallelReturnsNilAfterEmptyTxnStopPosEvenIfLaterScheduleFails(t *testing.T) {
+ vp, mockDB := testVPlayer(t)
+ ctx := testCtx(t)
+
+ vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+ vp.vr.workflowConfig.StoreCompressedGTID = false
+
+ // Mock DB setup: invariants plus a single expected position update.
+ mockDB.AddInvariant("set @@session.time_zone", &sqltypes.Result{})
+ mockDB.AddInvariant("set session transaction isolation level read committed", &sqltypes.Result{})
+ mockDB.AddInvariant("set names 'binary'", &sqltypes.Result{})
+ mockDB.AddInvariant("set @@session.net_read_timeout", &sqltypes.Result{})
+ mockDB.AddInvariant("set @@session.net_write_timeout", &sqltypes.Result{})
+ mockDB.AddInvariant("set @@session.sql_mode", &sqltypes.Result{})
+ mockDB.AddInvariant("information_schema.key_column_usage", &sqltypes.Result{})
+ mockDB.AddInvariant("select pos, stop_pos, max_tps, max_replication_lag, state, workflow_type, workflow, workflow_sub_type, defer_secondary_keys, options from _vt.vreplication where id=1", sqlModeWorkflowSettingsResult(binlogdatapb.VReplicationWorkflowType_MoveTables))
+ mockDB.AddInvariant("select @@session.sql_mode as sql_mode", sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("sql_mode", "varchar"),
+ "STRICT_TRANS_TABLES",
+ ))
+ mockDB.AddInvariant("select count(distinct table_name) from _vt.copy_state where vrepl_id=1", sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("count(distinct table_name)", "int64"),
+ "0",
+ ))
+ mockDB.AddInvariant("max_allowed_packet", sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("max_allowed_packet", "int64"),
+ "4194304",
+ ))
+ mockDB.ExpectRequestRE("update _vt\\.vreplication set pos=", &sqltypes.Result{}, nil)
+ mockDB.AddInvariant("rollback", &sqltypes.Result{})
+ // Route the vplayer's query and commit hooks through its real dbClient instead of the no-op stubs.
+ vp.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+ return vp.dbClient.Execute(sql)
+ }
+ vp.commit = vp.dbClient.Commit
+
+ // Fill in the engine dependencies only where they are still unset.
+ if vp.vr.vre == nil {
+ vp.vr.vre = &Engine{}
+ }
+ if vp.vr.vre.throttlerClient == nil {
+ vp.vr.vre.throttlerClient = throttle.NewBackgroundClient(nil, throttlerapp.VReplicationName, base.UndefinedScope)
+ }
+ if vp.vr.vre.dbClientFactoryFiltered == nil {
+ vp.vr.vre.dbClientFactoryFiltered = func() binlogplayer.DBClient { return mockDB }
+ }
+
+ // The stop position equals the position of the first GTID event sent below.
+ stopPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+ require.NoError(t, err)
+ vp.stopPos = stopPos
+
+ relay := newRelayLog(ctx, 10, 100)
+ validGTID := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: replication.EncodePosition(stopPos)}
+ commitEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT, Timestamp: 100}
+ invalidGTID := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "invalid"}
+ require.NoError(t, relay.Send([]*binlogdatapb.VEvent{validGTID, commitEvent, invalidGTID}))
+
+ err = vp.applyEventsParallel(ctx, relay)
+ require.NoError(t, err)
+ // NOTE(review): presumably waits until the expectation registered above has been consumed — confirm against MockDBClient.
+ mockDB.Wait()
+}
+
+// TestApplyEventsParallelReturnsNilAfterScheduledStopDDLEvenIfLaterScheduleFails
+// configures the source with OnDDLAction_STOP, sends a valid GTID, a DDL
+// event and then a GTID whose value is "invalid", and expects
+// applyEventsParallel to return nil.
+func TestApplyEventsParallelReturnsNilAfterScheduledStopDDLEvenIfLaterScheduleFails(t *testing.T) {
+ vp, mockDB := testVPlayer(t)
+ ctx := testCtx(t)
+
+ vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+ vp.vr.workflowConfig.StoreCompressedGTID = false
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_STOP
+
+ // Mock DB setup: invariants plus two expected updates (pos, then state).
+ mockDB.AddInvariant("set @@session.time_zone", &sqltypes.Result{})
+ mockDB.AddInvariant("set session transaction isolation level read committed", &sqltypes.Result{})
+ mockDB.AddInvariant("set names 'binary'", &sqltypes.Result{})
+ mockDB.AddInvariant("set @@session.net_read_timeout", &sqltypes.Result{})
+ mockDB.AddInvariant("set @@session.net_write_timeout", &sqltypes.Result{})
+ mockDB.AddInvariant("set @@session.sql_mode", &sqltypes.Result{})
+ mockDB.AddInvariant("information_schema.key_column_usage", &sqltypes.Result{})
+ mockDB.AddInvariant("select pos, stop_pos, max_tps, max_replication_lag, state, workflow_type, workflow, workflow_sub_type, defer_secondary_keys, options from _vt.vreplication where id=1", sqlModeWorkflowSettingsResult(binlogdatapb.VReplicationWorkflowType_MoveTables))
+ mockDB.AddInvariant("select @@session.sql_mode as sql_mode", sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("sql_mode", "varchar"),
+ "STRICT_TRANS_TABLES",
+ ))
+ mockDB.AddInvariant("select count(distinct table_name) from _vt.copy_state where vrepl_id=1", sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("count(distinct table_name)", "int64"),
+ "0",
+ ))
+ mockDB.AddInvariant("max_allowed_packet", sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("max_allowed_packet", "int64"),
+ "4194304",
+ ))
+ mockDB.ExpectRequestRE("update _vt\\.vreplication set pos=", &sqltypes.Result{}, nil)
+ mockDB.ExpectRequestRE("update _vt\\.vreplication set state=", &sqltypes.Result{}, nil)
+ mockDB.AddInvariant("begin", &sqltypes.Result{})
+ mockDB.AddInvariant("commit", &sqltypes.Result{})
+ mockDB.AddInvariant("rollback", &sqltypes.Result{})
+ // Route the vplayer's query and commit hooks through its real dbClient instead of the no-op stubs.
+ vp.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+ return vp.dbClient.Execute(sql)
+ }
+ vp.commit = vp.dbClient.Commit
+
+ // Fill in the engine dependencies only where they are still unset.
+ if vp.vr.vre == nil {
+ vp.vr.vre = &Engine{}
+ }
+ if vp.vr.vre.throttlerClient == nil {
+ vp.vr.vre.throttlerClient = throttle.NewBackgroundClient(nil, throttlerapp.VReplicationName, base.UndefinedScope)
+ }
+ if vp.vr.vre.dbClientFactoryFiltered == nil {
+ vp.vr.vre.dbClientFactoryFiltered = func() binlogplayer.DBClient { return mockDB }
+ }
+
+ relay := newRelayLog(ctx, 10, 100)
+ validGTID := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}
+ ddlEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_DDL, Statement: "alter table t1 add column c1 int", Timestamp: 100}
+ invalidGTID := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "invalid"}
+ require.NoError(t, relay.Send([]*binlogdatapb.VEvent{validGTID, ddlEvent, invalidGTID}))
+
+ err := vp.applyEventsParallel(ctx, relay)
+ require.NoError(t, err)
+}
+
+// TestApplyEventsParallelReturnsNilAfterScheduledRelevantJournalEvenIfLaterScheduleFails
+// sends a valid GTID, a JOURNAL event for table "t1" (which has a table plan)
+// and then a GTID whose value is "invalid", and expects applyEventsParallel
+// to return nil.
+func TestApplyEventsParallelReturnsNilAfterScheduledRelevantJournalEvenIfLaterScheduleFails(t *testing.T) {
+ vp, mockDB := testVPlayer(t)
+ ctx := testCtx(t)
+
+ vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+ vp.vr.workflowConfig.StoreCompressedGTID = false
+ vp.replicatorPlan = &ReplicatorPlan{TablePlans: map[string]*TablePlan{
+ "t1": {TargetName: "t1"},
+ }}
+
+ // Mock DB setup: invariants only; no explicit expectations are registered.
+ mockDB.AddInvariant("set @@session.time_zone", &sqltypes.Result{})
+ mockDB.AddInvariant("set session transaction isolation level read committed", &sqltypes.Result{})
+ mockDB.AddInvariant("set names 'binary'", &sqltypes.Result{})
+ mockDB.AddInvariant("set @@session.net_read_timeout", &sqltypes.Result{})
+ mockDB.AddInvariant("set @@session.net_write_timeout", &sqltypes.Result{})
+ mockDB.AddInvariant("set @@session.sql_mode", &sqltypes.Result{})
+ mockDB.AddInvariant("information_schema.key_column_usage", &sqltypes.Result{})
+ mockDB.AddInvariant("select pos, stop_pos, max_tps, max_replication_lag, state, workflow_type, workflow, workflow_sub_type, defer_secondary_keys, options from _vt.vreplication where id=1", sqlModeWorkflowSettingsResult(binlogdatapb.VReplicationWorkflowType_MoveTables))
+ mockDB.AddInvariant("select @@session.sql_mode as sql_mode", sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("sql_mode", "varchar"),
+ "STRICT_TRANS_TABLES",
+ ))
+ mockDB.AddInvariant("select count(distinct table_name) from _vt.copy_state where vrepl_id=1", sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("count(distinct table_name)", "int64"),
+ "0",
+ ))
+ mockDB.AddInvariant("max_allowed_packet", sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("max_allowed_packet", "int64"),
+ "4194304",
+ ))
+ mockDB.AddInvariant("rollback", &sqltypes.Result{})
+ // Route the vplayer's query and commit hooks through its real dbClient instead of the no-op stubs.
+ vp.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+ return vp.dbClient.Execute(sql)
+ }
+ vp.commit = vp.dbClient.Commit
+
+ // Fill in the engine dependencies only where they are still unset.
+ if vp.vr.vre == nil {
+ vp.vr.vre = &Engine{}
+ }
+ if vp.vr.vre.throttlerClient == nil {
+ vp.vr.vre.throttlerClient = throttle.NewBackgroundClient(nil, throttlerapp.VReplicationName, base.UndefinedScope)
+ }
+ if vp.vr.vre.dbClientFactoryFiltered == nil {
+ vp.vr.vre.dbClientFactoryFiltered = func() binlogplayer.DBClient { return mockDB }
+ }
+ // Mark the engine open and register two controllers of workflow "wf" on ks/0 and ks/1,
+ // the same keyspace/shard pairs the journal lists as participants.
+ vp.vr.vre.isOpen = true
+ vp.vr.vre.journaler = make(map[string]*journalEvent)
+ vp.vr.vre.controllers = map[int32]*controller{
+ vp.vr.id: {
+ workflow: "wf",
+ source: &binlogdatapb.BinlogSource{
+ Keyspace: "ks",
+ Shard: "0",
+ },
+ },
+ 2: {
+ workflow: "wf",
+ source: &binlogdatapb.BinlogSource{
+ Keyspace: "ks",
+ Shard: "1",
+ },
+ },
+ }
+
+ relay := newRelayLog(ctx, 10, 100)
+ validGTID := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}
+ journalEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_JOURNAL, Timestamp: 100, Journal: &binlogdatapb.Journal{
+ Id: 1,
+ MigrationType: binlogdatapb.MigrationType_TABLES,
+ Participants: []*binlogdatapb.KeyspaceShard{{
+ Keyspace: "ks",
+ Shard: "0",
+ }, {
+ Keyspace: "ks",
+ Shard: "1",
+ }},
+ Tables: []string{"t1"},
+ }}
+ invalidGTID := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "invalid"}
+ require.NoError(t, relay.Send([]*binlogdatapb.VEvent{validGTID, journalEvent, invalidGTID}))
+
+ err := vp.applyEventsParallel(ctx, relay)
+ require.NoError(t, err)
+}
+
+// TestApplyEventsParallelReturnsWorkerErrorEvenIfCancellationLooksLikeEOF
+// gives workers a failingDBClient configured to fail on "insert into t1" and
+// expects the error returned by applyEventsParallel to contain that failure's
+// message.
+func TestApplyEventsParallelReturnsWorkerErrorEvenIfCancellationLooksLikeEOF(t *testing.T) {
+ vp, mockDB := testVPlayer(t)
+ ctx := testCtx(t)
+
+ vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+ vp.vr.workflowConfig.ExperimentalFlags = 0
+ vp.canAcceptStmtEvents = true
+
+ workerApplyErr := errors.New("worker apply failed")
+ mockDB.AddInvariant("information_schema.key_column_usage", &sqltypes.Result{})
+ mockDB.AddInvariant("select count(distinct table_name) from _vt.copy_state where vrepl_id=1", sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields("count(distinct table_name)", "int64"),
+ "0",
+ ))
+ mockDB.AddInvariant("rollback", &sqltypes.Result{})
+
+ if vp.vr.vre == nil {
+ vp.vr.vre = &Engine{}
+ }
+ if vp.vr.vre.throttlerClient == nil {
+ vp.vr.vre.throttlerClient = throttle.NewBackgroundClient(nil, throttlerapp.VReplicationName, base.UndefinedScope)
+ }
+ // Unlike the sibling tests, the factory is overwritten unconditionally and
+ // returns a new failingDBClient on every call.
+ vp.vr.vre.dbClientFactoryFiltered = func() binlogplayer.DBClient {
+ return &failingDBClient{failOnQuery: map[string]error{
+ "insert into t1": workerApplyErr,
+ }}
+ }
+
+ // One transaction: GTID, a statement-based INSERT into t1, COMMIT.
+ relay := newRelayLog(ctx, 10, 100)
+ require.NoError(t, relay.Send([]*binlogdatapb.VEvent{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_INSERT, Dml: "insert into t1(id) values (1)", Timestamp: 100},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }))
+
+ err := vp.applyEventsParallel(ctx, relay)
+ require.ErrorContains(t, err, workerApplyErr.Error())
+}
+
+// blockingBatchDBClient is a test DB client, built on recordingDBClient, whose
+// ExecuteFetchMulti blocks until blockMulti yields or the client is closed.
+type blockingBatchDBClient struct {
+ recordingDBClient
+ // blockMulti releases a blocked ExecuteFetchMulti call once a receive on it succeeds.
+ blockMulti chan struct{}
+ // entered gets a non-blocking signal each time ExecuteFetchMulti is entered.
+ entered chan struct{}
+ // closed is closed by Close; a blocked ExecuteFetchMulti then returns context.Canceled.
+ closed chan struct{}
+}
+
+// ExecuteFetchMulti records the query, signals entry on b.entered without
+// blocking, and then waits. It returns a single empty result once b.blockMulti
+// yields, or context.Canceled once b.closed is closed.
+func (b *blockingBatchDBClient) ExecuteFetchMulti(query string, maxrows int) ([]*sqltypes.Result, error) {
+	b.queries = append(b.queries, query)
+
+	// Best-effort notification: dropped when nobody can take the signal.
+	select {
+	case b.entered <- struct{}{}:
+	default:
+	}
+
+	select {
+	case <-b.closed:
+		return nil, context.Canceled
+	case <-b.blockMulti:
+		return []*sqltypes.Result{{}}, nil
+	}
+}
+
+// Close closes the closed channel unless it has been closed already, so a
+// repeated sequential call is a no-op.
+func (b *blockingBatchDBClient) Close() {
+	select {
+	case <-b.closed:
+		// Already closed; nothing left to do.
+		return
+	default:
+	}
+	close(b.closed)
+}
+
+// TestWorkerLoopCancelDoesNotUnblockBlockedBatchFlush runs workerLoop in batch
+// mode against a DB client whose ExecuteFetchMulti blocks, cancels the context
+// once the client reports that the call was entered, and requires workerLoop to
+// return context.Canceled within 200ms.
+//
+// NOTE(review): despite "DoesNotUnblock" in the name, the assertions require
+// workerLoop itself to return promptly after cancel; presumably the name refers
+// to the blocked ExecuteFetchMulti call, which is only released by the cleanup
+// registered below — confirm.
+func TestWorkerLoopCancelDoesNotUnblockBlockedBatchFlush(t *testing.T) {
+ ctx, cancel := context.WithCancel(testCtx(t))
+ defer cancel()
+
+ vp, _ := testVPlayer(t)
+ vp.vr.storeState(binlogdatapb.VReplicationWorkflowState_Running)
+ vp.batchMode = true
+ // Single-table plan for t1; the same plan is registered in vp.tablePlans below.
+ vp.replicatorPlan = &ReplicatorPlan{TablePlans: map[string]*TablePlan{
+ "t1": {
+ TargetName: "t1",
+ IdentityColumns: []string{"id"},
+ Insert: sqlparser.BuildParsedQuery("insert into t1 values (:a_id)"),
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true},
+ PKReferences: []string{"id"},
+ Stats: vp.vr.stats,
+ TablePlanBuilder: &tablePlanBuilder{},
+ WorkflowConfig: vp.vr.workflowConfig,
+ },
+ }}
+ vp.tablePlans["t1"] = vp.replicatorPlan.TablePlans["t1"]
+ vp.tablePlansVersion.Store(1)
+
+ scheduler := newApplyScheduler(ctx)
+ commitCh := make(chan *applyTxn, 1)
+ // entered is buffered so the client's non-blocking signal is not lost if the
+ // test has not started waiting on it yet.
+ blockingClient := &blockingBatchDBClient{
+ blockMulti: make(chan struct{}),
+ entered: make(chan struct{}, 1),
+ closed: make(chan struct{}),
+ }
+ workerClient := newVDBClient(blockingClient, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+ workerClient.maxBatchSize = 1024
+ // Both connection slots and the active client point at the same blocking client.
+ worker := &applyWorker{
+ ctx: ctx,
+ conns: [2]*vdbClient{workerClient, workerClient},
+ client: workerClient,
+ batchMode: true,
+ }
+ worker.bindFunctions()
+
+ txn := acquireApplyTxn()
+ // Closing blockMulti at the end of the test releases any ExecuteFetchMulti
+ // call that is still parked on it.
+ t.Cleanup(func() {
+ close(blockingClient.blockMulti)
+ })
+ txn.order = 1
+ txn.payload = &applyTxnPayload{events: []*binlogdatapb.VEvent{{
+ Type: binlogdatapb.VEventType_ROW,
+ RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{
+ After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}},
+ }},
+ },
+ }}}
+ require.NoError(t, scheduler.enqueue(txn))
+
+ // errCh is buffered so the goroutine can finish even if the test has already
+ // failed and stopped reading.
+ errCh := make(chan error, 1)
+ go func() {
+ errCh <- vp.workerLoop(ctx, scheduler, commitCh, worker)
+ }()
+
+ // Wait until the worker is inside the blocking ExecuteFetchMulti call.
+ select {
+ case <-blockingClient.entered:
+ case <-time.After(200 * time.Millisecond):
+ t.Fatal("timed out waiting for blocking batch flush")
+ }
+
+ cancel()
+
+ // workerLoop must return with context.Canceled while blockMulti is still open.
+ select {
+ case err := <-errCh:
+ require.ErrorIs(t, err, context.Canceled)
+ case <-time.After(200 * time.Millisecond):
+ t.Fatal("workerLoop remained stuck after cancellation while batch flush was blocked")
+ }
+}
+
+// TestApplyEventsParallelCancelledContext invokes applyEventsParallel with a
+// single worker and a context that is canceled up front, and expects
+// context.Canceled back.
+func TestApplyEventsParallelCancelledContext(t *testing.T) {
+	vp, _ := testVPlayer(t)
+
+	// Cancel immediately: the call under test never sees a live context.
+	canceledCtx, cancelFn := context.WithCancel(testCtx(t))
+	cancelFn()
+
+	vp.vr.workflowConfig.ParallelReplicationWorkers = 1
+
+	emptyRelay := newRelayLog(canceledCtx, 10, 100)
+
+	require.ErrorIs(t, vp.applyEventsParallel(canceledCtx, emptyRelay), context.Canceled)
+}
+
+// TestApplyEventsParallelParallelWorkersFailFastOnCanceledContext calls
+// applyEventsParallel with two workers and an already-canceled context. The
+// worker DB client factory panics when invoked, so the test requires that the
+// call does not panic and that it returns context.Canceled.
+func TestApplyEventsParallelParallelWorkersFailFastOnCanceledContext(t *testing.T) {
+	vp, _ := testVPlayer(t)
+
+	ctx, cancel := context.WithCancel(testCtx(t))
+	cancel()
+
+	vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+	// Ensure an engine exists before installing the factory on it, matching the
+	// guard used by the other applyEventsParallel tests in this file.
+	if vp.vr.vre == nil {
+		vp.vr.vre = &Engine{}
+	}
+	vp.vr.vre.dbClientFactoryFiltered = func() binlogplayer.DBClient {
+		panic("worker factory should not be called for canceled context")
+	}
+
+	relay := newRelayLog(ctx, 10, 100)
+
+	// Capture the error outside the closure so it can be checked after the
+	// no-panic assertion.
+	var err error
+	require.NotPanics(t, func() {
+		err = vp.applyEventsParallel(ctx, relay)
+	})
+	require.ErrorIs(t, err, context.Canceled)
+}
+
+func TestScheduleItems_GTIDAndROWAndCOMMIT(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ // Set up a table plan so writeset can be built
+ vp.tablePlans["t1"] = &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true},
+ }
+ vp.tablePlansVersion.Store(1)
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ }
+ rowEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_ROW,
+ RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{
+ {After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}},
+ },
+ },
+ Timestamp: 100,
+ }
+ commitEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_COMMIT,
+ }
+
+ items := [][]*binlogdatapb.VEvent{{gtidEvent, rowEvent, commitEvent}}
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.NoError(t, err)
+
+ // Should have enqueued exactly one transaction
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ require.NotNil(t, got)
+ assert.Equal(t, int64(1), got.order)
+ assert.NotNil(t, got.payload)
+ assert.Len(t, got.payload.events, 1) // ROW event only
+ assert.Equal(t, binlogdatapb.VEventType_ROW, got.payload.events[0].Type)
+ assert.True(t, got.payload.rowOnly)
+}
+
+func TestScheduleItemsBackpressuresOutstandingOrderedTransactions(t *testing.T) {
+ ctx, cancel := context.WithCancel(testCtx(t))
+ defer cancel()
+
+ vp, _ := testVPlayer(t)
+ scheduler := newApplyScheduler(ctx)
+ scheduler.maxOutstandingOrders = 3
+ state := ¶llelScheduleState{
+ lastFlushTime: time.Now(),
+ lastHeartbeatRefresh: time.Now(),
+ maxBatchedCommits: 1,
+ }
+
+ vp.tablePlans["t1"] = &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true},
+ }
+ vp.tablePlansVersion.Store(1)
+
+ batch := make([]*binlogdatapb.VEvent, 0, 12)
+ for i := 1; i <= 4; i++ {
+ gtid := fmt.Sprintf("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-%d", i)
+ value := strconv.Itoa(i)
+ batch = append(batch,
+ &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: gtid},
+ &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{
+ After: &querypb.Row{Values: []byte(value), Lengths: []int64{int64(len(value))}},
+ }},
+ }, Timestamp: int64(100 + i)},
+ &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT},
+ )
+ }
+
+ errCh := make(chan error, 1)
+ go func() {
+ errCh <- vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{batch})
+ }()
+
+ assert.Eventually(t, func() bool {
+ scheduler.mu.Lock()
+ defer scheduler.mu.Unlock()
+ return int64(scheduler.pendingCount) >= scheduler.maxOutstandingOrders
+ }, 200*time.Millisecond, 5*time.Millisecond)
+
+ assert.Never(t, func() bool {
+ scheduler.mu.Lock()
+ defer scheduler.mu.Unlock()
+ return int64(scheduler.pendingCount) > scheduler.maxOutstandingOrders
+ }, 100*time.Millisecond, 5*time.Millisecond)
+
+ assert.Never(t, func() bool {
+ return len(errCh) > 0
+ }, 100*time.Millisecond, 5*time.Millisecond)
+
+ cancel()
+ require.ErrorIs(t, <-errCh, context.Canceled)
+}
+
+// TestScheduleLoopCanceledContext runs scheduleLoop with a context that was
+// canceled before the scheduler and relay log were created, and expects
+// context.Canceled.
+func TestScheduleLoopCanceledContext(t *testing.T) {
+	vp, _ := testVPlayer(t)
+
+	canceledCtx, cancelFn := context.WithCancel(testCtx(t))
+	cancelFn()
+
+	sched := newApplyScheduler(canceledCtx)
+	emptyRelay := newRelayLog(canceledCtx, 10, 100)
+
+	require.ErrorIs(t, vp.scheduleLoop(canceledCtx, emptyRelay, sched), context.Canceled)
+}
+
+// TestScheduleLoopProcessesItems sends one GTID/ROW/COMMIT transaction into the
+// relay log, runs scheduleLoop in a goroutine, waits for the scheduler to hand
+// out a ready transaction, then cancels the context and requires scheduleLoop to
+// return context.Canceled or io.EOF within 200ms.
+func TestScheduleLoopProcessesItems(t *testing.T) {
+	vp, mockDB := testVPlayer(t)
+
+	ctx, cancel := context.WithCancel(testCtx(t))
+	defer cancel()
+
+	mockDB.AddInvariant("rollback", &sqltypes.Result{})
+
+	// Ensure an engine with a throttler client exists.
+	if vp.vr.vre == nil {
+		vp.vr.vre = &Engine{}
+	}
+	if vp.vr.vre.throttlerClient == nil {
+		vp.vr.vre.throttlerClient = throttle.NewBackgroundClient(nil, throttlerapp.VReplicationName, base.UndefinedScope)
+	}
+
+	scheduler := newApplyScheduler(ctx)
+	relay := newRelayLog(ctx, 10, 100)
+
+	vp.tablePlans["t1"] = &TablePlan{
+		TargetName: "t1",
+		Fields:     []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+		PKIndices:  []bool{true},
+	}
+	vp.tablePlansVersion.Store(1)
+
+	gtidEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}
+	rowEvent := &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_ROW,
+		RowEvent: &binlogdatapb.RowEvent{
+			TableName:  "t1",
+			RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+		},
+		Timestamp: 100,
+	}
+	commitEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT}
+
+	require.NoError(t, relay.Send([]*binlogdatapb.VEvent{gtidEvent, rowEvent, commitEvent}))
+
+	// Buffered so the goroutine can exit even if the test stops reading.
+	errCh := make(chan error, 1)
+	go func() {
+		errCh <- vp.scheduleLoop(ctx, relay, scheduler)
+	}()
+
+	ready, err := scheduler.nextReady(ctx)
+	require.NoError(t, err)
+	require.NotNil(t, ready)
+
+	cancel()
+
+	select {
+	case err := <-errCh:
+		// Either error is accepted; include the actual error so a failure is
+		// diagnosable.
+		require.Truef(t, errors.Is(err, context.Canceled) || errors.Is(err, io.EOF),
+			"scheduleLoop returned unexpected error: %v", err)
+	case <-time.After(200 * time.Millisecond):
+		t.Fatal("timed out waiting for scheduleLoop")
+	}
+}
+
+// TestScheduleLoopThrottledUpdates runs scheduleLoop with the throttler app name
+// set to throttlerapp.TestingAlwaysThrottledName, lets it run for 10ms, cancels
+// the context, and requires scheduleLoop to return context.Canceled or io.EOF
+// within two seconds.
+func TestScheduleLoopThrottledUpdates(t *testing.T) {
+	vp, mockDB := testVPlayer(t)
+
+	ctx, cancel := context.WithCancel(testCtx(t))
+	defer cancel()
+
+	// NOTE(review): this initializes package-level state and does not restore
+	// it; presumably harmless because other tests only need it to be non-nil —
+	// confirm.
+	if globalStats.ThrottledCount == nil {
+		globalStats.ThrottledCount = stats.NewCounter("", "")
+	}
+
+	mockDB.AddInvariant("rollback", &sqltypes.Result{})
+	mockDB.AddInvariant("time_throttled", &sqltypes.Result{})
+
+	// Ensure an engine with a throttler client exists.
+	if vp.vr.vre == nil {
+		vp.vr.vre = &Engine{}
+	}
+	if vp.vr.vre.throttlerClient == nil {
+		vp.vr.vre.throttlerClient = throttle.NewBackgroundClient(nil, throttlerapp.VReplicationName, base.UndefinedScope)
+	}
+	vp.throttlerAppName = throttlerapp.TestingAlwaysThrottledName.String()
+	// Only a rate limiter created here is stopped here.
+	if vp.vr.throttleUpdatesRateLimiter == nil {
+		vp.vr.throttleUpdatesRateLimiter = timer.NewRateLimiter(time.Millisecond)
+		defer vp.vr.throttleUpdatesRateLimiter.Stop()
+	}
+
+	scheduler := newApplyScheduler(ctx)
+	relay := newRelayLog(ctx, 10, 100)
+
+	// Buffered so the goroutine can exit even if the test stops reading.
+	errCh := make(chan error, 1)
+	go func() {
+		errCh <- vp.scheduleLoop(ctx, relay, scheduler)
+	}()
+
+	// Give the loop a moment to run before canceling.
+	time.Sleep(10 * time.Millisecond)
+	cancel()
+
+	select {
+	case err := <-errCh:
+		// Either error is accepted; include the actual error so a failure is
+		// diagnosable.
+		require.Truef(t, errors.Is(err, context.Canceled) || errors.Is(err, io.EOF),
+			"scheduleLoop returned unexpected error: %v", err)
+	case <-time.After(2 * time.Second):
+		t.Fatal("timed out waiting for scheduleLoop")
+	}
+}
+
+// TestScheduleLoopCancelledContext calls scheduleLoop with an already-canceled
+// context and expects context.Canceled.
+//
+// NOTE(review): the body is statement-for-statement identical to
+// TestScheduleLoopCanceledContext in this file (the names differ only in the
+// "Cancelled"/"Canceled" spelling); one of the two looks redundant — confirm and
+// consider removing it.
+func TestScheduleLoopCancelledContext(t *testing.T) {
+ vp, _ := testVPlayer(t)
+
+ // Cancel before anything is constructed from the context.
+ ctx, cancel := context.WithCancel(testCtx(t))
+ cancel()
+
+ scheduler := newApplyScheduler(ctx)
+ relay := newRelayLog(ctx, 10, 100)
+
+ err := vp.scheduleLoop(ctx, relay, scheduler)
+ require.ErrorIs(t, err, context.Canceled)
+}
+
+func TestScheduleItems_EmptyTransaction(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ }
+ commitEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_COMMIT,
+ Timestamp: 100,
+ }
+
+ items := [][]*binlogdatapb.VEvent{{gtidEvent, commitEvent}}
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.NoError(t, err)
+
+ // Empty transaction should NOT be enqueued — it just sets unsavedEvent
+ vp.serialMu.Lock()
+ assert.Equal(t, commitEvent, vp.unsavedEvent)
+ vp.serialMu.Unlock()
+}
+
+func TestScheduleItems_EmptyTxnAfterIdleTimeoutEnqueuesPositionSave(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ vp.serialMu.Lock()
+ vp.timeLastSaved = time.Now().Add(-2 * idleTimeout)
+ vp.serialMu.Unlock()
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ SequenceNumber: 7,
+ CommitParent: 6,
+ }
+ commitEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_COMMIT,
+ Timestamp: 100,
+ }
+
+ err := vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{gtidEvent, commitEvent}})
+ require.NoError(t, err)
+
+ scheduler.mu.Lock()
+ require.Equal(t, 1, scheduler.pendingCount)
+ scheduler.mu.Unlock()
+
+ got, gerr := scheduler.nextReady(ctx)
+ require.NoError(t, gerr)
+ require.NotNil(t, got)
+ assert.True(t, got.payload.commitOnly)
+ assert.True(t, got.payload.updatePosOnly)
+ assert.True(t, got.noConflict)
+ assert.Equal(t, int64(7), got.sequenceNumber)
+ assert.Equal(t, int64(6), got.commitParent)
+ assert.True(t, got.hasCommitMeta)
+
+ vp.serialMu.Lock()
+ assert.Nil(t, vp.unsavedEvent)
+ vp.serialMu.Unlock()
+}
+
+func TestScheduleItems_VERSIONIsIgnoredLikeEmptyTransaction(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ }
+ versionEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_VERSION,
+ Timestamp: 100,
+ }
+ commitEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_COMMIT,
+ Timestamp: 100,
+ }
+
+ err := vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{gtidEvent, versionEvent, commitEvent}})
+ require.NoError(t, err)
+
+ vp.serialMu.Lock()
+ assert.Equal(t, commitEvent, vp.unsavedEvent)
+ vp.serialMu.Unlock()
+}
+
+func TestScheduleItems_ROWSQUERYOnlyTransactionIsEmpty(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ }
+ beginEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_BEGIN,
+ }
+ rowsQueryEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_ROWS_QUERY,
+ Statement: "update t1 set id = id where id = 1",
+ Timestamp: 100,
+ }
+ commitEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_COMMIT,
+ Timestamp: 100,
+ }
+
+ err := vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{gtidEvent, beginEvent, rowsQueryEvent, commitEvent}})
+ require.NoError(t, err)
+
+ vp.serialMu.Lock()
+ assert.Equal(t, commitEvent, vp.unsavedEvent)
+ vp.serialMu.Unlock()
+
+ scheduler.mu.Lock()
+ assert.Equal(t, 0, scheduler.pendingCount)
+ scheduler.mu.Unlock()
+}
+
+func TestScheduleItems_EmptyTxnWithCommitMeta_AdvancesSequence(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ SequenceNumber: 7,
+ CommitParent: 6,
+ }
+ commitEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_COMMIT,
+ }
+
+ items := [][]*binlogdatapb.VEvent{{gtidEvent, commitEvent}}
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.NoError(t, err)
+
+ // Should have advanced the committed sequence to 7
+ scheduler.mu.Lock()
+ assert.Equal(t, int64(7), scheduler.lastCommittedSequence)
+ scheduler.mu.Unlock()
+}
+
+func TestScheduleItems_BEGINIsIgnored(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ vp.tablePlans["t1"] = &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true},
+ }
+ vp.tablePlansVersion.Store(1)
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ }
+ beginEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_BEGIN,
+ }
+ rowEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_ROW,
+ RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{
+ {After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}},
+ },
+ },
+ Timestamp: 100,
+ }
+ commitEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_COMMIT,
+ }
+
+ // BEGIN should not be added to curEvents
+ items := [][]*binlogdatapb.VEvent{{gtidEvent, beginEvent, rowEvent, commitEvent}}
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.NoError(t, err)
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ // Should only have the ROW event, not BEGIN
+ assert.Len(t, got.payload.events, 1)
+ assert.Equal(t, binlogdatapb.VEventType_ROW, got.payload.events[0].Type)
+}
+
+func TestScheduleItems_DDLIsForceGlobal(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ }
+ ddlEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_DDL,
+ Timestamp: 200,
+ }
+
+ items := [][]*binlogdatapb.VEvent{{gtidEvent, ddlEvent}}
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.NoError(t, err)
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.True(t, got.forceGlobal)
+ assert.True(t, got.payload.commitOnly)
+ assert.Len(t, got.payload.events, 1)
+ assert.Equal(t, binlogdatapb.VEventType_DDL, got.payload.events[0].Type)
+}
+
+func TestScheduleItems_PostDDLComplexDDLDoesNotClearOnUnrelatedPlanRefresh(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+ staleT1 := &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ staleT2 := &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ staleT3 := &TablePlan{TargetName: "t3", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t1"] = staleT1
+ vp.tablePlans["t2"] = staleT2
+ vp.tablePlans["t3"] = staleT3
+ vp.tablePlansVersion.Store(1)
+
+ ddlItems := [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t1 to t1_new, t2 to t2_new", Timestamp: 200},
+ }}
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, ddlItems))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.NotNil(t, state.postDDLStalePlans)
+ require.Contains(t, state.postDDLStalePlans, "t1")
+ require.Contains(t, state.postDDLStalePlans, "t2")
+ require.NotContains(t, state.postDDLStalePlans, "t3")
+
+ // Simulate an unrelated plan refresh while plans for DDL-affected tables remain stale.
+ vp.tablePlansMu.Lock()
+ vp.tablePlans["t3"] = &TablePlan{TargetName: "t3", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansMu.Unlock()
+
+ rowItems := [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ }, Timestamp: 300},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, rowItems))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.True(t, got.forceGlobal)
+ assert.NotNil(t, state.postDDLStalePlans)
+}
+
+func TestScheduleItems_PostDDLRenameClearsAfterRenamedTableFieldRefresh(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+ vp.tablePlans["t1"] = &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true},
+ }
+ vp.tablePlansVersion.Store(1)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t1 to t1_new", Timestamp: 200},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.NotNil(t, state.postDDLStalePlans)
+
+ vp.tablePlansMu.Lock()
+ vp.tablePlans["t1_new"] = &TablePlan{
+ TargetName: "t1_new",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true},
+ }
+ vp.tablePlansVersion.Add(1)
+ vp.tablePlansMu.Unlock()
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1_new",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ }, Timestamp: 300},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.False(t, got.forceGlobal)
+ assert.Nil(t, state.postDDLStalePlans)
+}
+
+func TestScheduleItems_PostDDLUnknownDDLRetainsConservativeBarrier(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+ staleT1 := &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ staleT2 := &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t1"] = staleT1
+ vp.tablePlans["t2"] = staleT2
+ vp.tablePlansVersion.Store(1)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "this is not valid ddl", Timestamp: 200},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.NotNil(t, state.postDDLStalePlans)
+
+ vp.tablePlansMu.Lock()
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansMu.Unlock()
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ }, Timestamp: 300},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.True(t, got.forceGlobal)
+ assert.NotNil(t, state.postDDLStalePlans)
+}
+
+func TestScheduleItems_PostDDLDropDoesNotClearOnUnrelatedPlanRefresh(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+ vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansVersion.Store(1)
+ vp.replicatorPlan = &ReplicatorPlan{TablePlans: map[string]*TablePlan{
+ "t1": {TargetName: "t1", Insert: sqlparser.BuildParsedQuery("insert into t1 values (:a)")},
+ }}
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "drop table t1", Timestamp: 200},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ assert.Nil(t, state.postDDLStalePlans)
+
+ vp.tablePlansMu.Lock()
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}}
+ vp.tablePlansVersion.Add(1)
+ vp.tablePlansMu.Unlock()
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t2",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("11"), Lengths: []int64{1, 1}}}},
+ }, Timestamp: 300},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.False(t, got.forceGlobal)
+ assert.Nil(t, state.postDDLStalePlans)
+}
+
+func TestScheduleItems_PostDDLDropClearsAfterDroppedTableSatisfied(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+ vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansVersion.Store(1)
+ vp.replicatorPlan = &ReplicatorPlan{TablePlans: map[string]*TablePlan{
+ "t1": {TargetName: "t1", Insert: sqlparser.BuildParsedQuery("insert into t1 values (:a)")},
+ }}
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "drop table t1", Timestamp: 200},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ assert.Nil(t, state.postDDLStalePlans)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t2",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ }, Timestamp: 300},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.False(t, got.forceGlobal)
+ assert.Nil(t, state.postDDLStalePlans)
+}
+
+func TestScheduleItems_PostDDLDroppedTablesSnapshotDoesNotAliasVPlayer(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.postDDLDroppedTables = map[string]struct{}{"t1": {}}
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, nil))
+ require.Equal(t, map[string]struct{}{"t1": {}}, state.postDDLDroppedTables)
+
+ delete(vp.postDDLDroppedTables, "t1")
+ vp.postDDLDroppedTables["t2"] = struct{}{}
+
+ require.Equal(t, map[string]struct{}{"t1": {}}, state.postDDLDroppedTables)
+}
+
+func TestScheduleItems_PostDDLAlterRenameClearsAfterRenamedTableFieldRefresh(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+ vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansVersion.Store(1)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "alter table t1 rename to t1_new", Timestamp: 200},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.NotNil(t, state.postDDLStalePlans)
+
+ vp.tablePlansMu.Lock()
+ vp.tablePlans["t1_new"] = &TablePlan{TargetName: "t1_new", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansVersion.Add(1)
+ vp.tablePlansMu.Unlock()
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1_new",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ }, Timestamp: 300},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.False(t, got.forceGlobal)
+ assert.Nil(t, state.postDDLStalePlans)
+}
+
+func TestScheduleItems_PostDDLAlterClearsAfterSameTableFieldRefresh(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+ vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansVersion.Store(1)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "alter table t1 add column c1 int", Timestamp: 200},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.NotNil(t, state.postDDLStalePlans)
+ require.Contains(t, state.postDDLStalePlans, "t1")
+
+ vp.tablePlansMu.Lock()
+ vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}}
+ vp.tablePlansVersion.Add(1)
+ vp.tablePlansMu.Unlock()
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("11"), Lengths: []int64{1, 1}}}},
+ }, Timestamp: 300},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.False(t, got.forceGlobal)
+ assert.Nil(t, state.postDDLStalePlans)
+}
+
+func TestScheduleItems_PostDDLSecondDDLDoesNotReplaceEarlierUnresolvedBarrier(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+ vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t3"] = &TablePlan{TargetName: "t3", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansVersion.Store(1)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t1 to t1_new", Timestamp: 200},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.NotNil(t, state.postDDLStalePlans)
+ require.Contains(t, state.postDDLStalePlans, "t1")
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "alter table t2 add column c1 int", Timestamp: 250},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.Contains(t, state.postDDLStalePlans, "t1")
+ require.Contains(t, state.postDDLStalePlans, "t2")
+
+ vp.tablePlansMu.Lock()
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}}
+ vp.tablePlansVersion.Add(1)
+ vp.tablePlansMu.Unlock()
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-7"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t3",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ }, Timestamp: 300},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.False(t, got.forceGlobal)
+ require.NotNil(t, state.postDDLStalePlans)
+ assert.Contains(t, state.postDDLStalePlans, "t1")
+}
+
+func TestScheduleItems_PostDDLUnknownSecondDDLExpandsBarrierConservatively(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+ vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t3"] = &TablePlan{TargetName: "t3", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansVersion.Store(1)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t1 to t1_new", Timestamp: 200},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.Contains(t, state.postDDLStalePlans, "t1")
+ require.NotContains(t, state.postDDLStalePlans, "t2")
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "this is not valid ddl", Timestamp: 250},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.Contains(t, state.postDDLStalePlans, "t1")
+ require.Contains(t, state.postDDLStalePlans, "t2")
+ require.Contains(t, state.postDDLStalePlans, "t3")
+
+ vp.tablePlansMu.Lock()
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}}
+ vp.tablePlans["t3"] = &TablePlan{TargetName: "t3", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}}
+ vp.tablePlansVersion.Add(1)
+ vp.tablePlansMu.Unlock()
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-7"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t2",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("11"), Lengths: []int64{1, 1}}}},
+ }, Timestamp: 300},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.True(t, got.forceGlobal)
+ require.NotNil(t, state.postDDLStalePlans)
+ assert.Contains(t, state.postDDLStalePlans, "t1")
+}
+
+func TestScheduleItems_PostDDLRenameThenUnknownStillBlocksAfterRenamedTableRefresh(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+ vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansVersion.Store(1)
+ vp.replicatorPlan = &ReplicatorPlan{TablePlans: map[string]*TablePlan{
+ "t1": {TargetName: "t1", Insert: sqlparser.BuildParsedQuery("insert into t1 values (:a)")},
+ }}
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t1 to t1_new", Timestamp: 200},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.Contains(t, state.postDDLStalePlans, "t1")
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "this is not valid ddl", Timestamp: 250},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+
+ vp.tablePlansMu.Lock()
+ vp.tablePlans["t1_new"] = &TablePlan{TargetName: "t1_new", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}}
+ vp.tablePlansVersion.Add(1)
+ vp.tablePlansMu.Unlock()
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-7"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1_new",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("11"), Lengths: []int64{1}}}},
+ }, Timestamp: 300},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.True(t, got.forceGlobal)
+ require.NotNil(t, state.postDDLStalePlans)
+ assert.Contains(t, state.postDDLStalePlans, "t1")
+}
+
+func TestScheduleItems_PostDDLRenameRetiresOldNameFromLaterUnknownBarrier(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+ vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansVersion.Store(1)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t1 to t1_new", Timestamp: 200},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.Contains(t, state.postDDLStalePlans, "t1")
+
+ vp.tablePlansMu.Lock()
+ vp.tablePlans["t1_new"] = &TablePlan{TargetName: "t1_new", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansVersion.Add(1)
+ vp.tablePlansMu.Unlock()
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1_new",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ }, Timestamp: 250},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.False(t, got.forceGlobal)
+ require.NoError(t, scheduler.markCommitted(got))
+ assert.Nil(t, state.postDDLStalePlans)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-7"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "this is not valid ddl", Timestamp: 300},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.NotNil(t, state.postDDLStalePlans)
+ assert.NotContains(t, state.postDDLStalePlans, "t1")
+ assert.Contains(t, state.postDDLStalePlans, "t1_new")
+ assert.Contains(t, state.postDDLStalePlans, "t2")
+
+ vp.tablePlansMu.Lock()
+ vp.tablePlans["t1_new"] = &TablePlan{TargetName: "t1_new", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}}
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}}
+ vp.tablePlansVersion.Add(1)
+ vp.tablePlansMu.Unlock()
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-8"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t2",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("11"), Lengths: []int64{1, 1}}}},
+ }, Timestamp: 350},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err = scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.False(t, got.forceGlobal)
+ assert.Nil(t, state.postDDLStalePlans)
+}
+
+func TestScheduleItems_PostDDLRenameRetiresOldNameEvenWhenAnotherBarrierRemains(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+ vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t3"] = &TablePlan{TargetName: "t3", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansVersion.Store(1)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t1 to t1_new", Timestamp: 200},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.Contains(t, state.postDDLStalePlans, "t1")
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "alter table t3 add column c1 int", Timestamp: 250},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.Contains(t, state.postDDLStalePlans, "t1")
+ require.Contains(t, state.postDDLStalePlans, "t3")
+
+ vp.tablePlansMu.Lock()
+ vp.tablePlans["t1_new"] = &TablePlan{TargetName: "t1_new", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansVersion.Add(1)
+ vp.tablePlansMu.Unlock()
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-7"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1_new",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ }, Timestamp: 300},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.False(t, got.forceGlobal)
+ require.NoError(t, scheduler.markCommitted(got))
+ require.NotNil(t, state.postDDLStalePlans)
+ assert.NotContains(t, state.postDDLStalePlans, "t1")
+ assert.Contains(t, state.postDDLStalePlans, "t3")
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-8"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "this is not valid ddl", Timestamp: 350},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.NotNil(t, state.postDDLStalePlans)
+ assert.NotContains(t, state.postDDLStalePlans, "t1")
+ assert.Contains(t, state.postDDLStalePlans, "t1_new")
+ assert.Contains(t, state.postDDLStalePlans, "t3")
+}
+
+func TestScheduleItems_PostDDLRenameSwapRequiresBothTablesToRefresh(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+ vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansVersion.Store(1)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t1 to t2, t2 to t1", Timestamp: 200},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+
+ vp.tablePlansMu.Lock()
+ vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}}
+ vp.tablePlansVersion.Add(1)
+ vp.tablePlansMu.Unlock()
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t2",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ }, Timestamp: 300},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.True(t, got.forceGlobal)
+}
+
+func TestScheduleItems_PostDDLCreateTableDoesNotBlockUnrelatedTable(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+ vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansVersion.Store(1)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "create table t3(id bigint primary key)", Timestamp: 200},
+ }}))
+
+ ddlTxn, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ require.Equal(t, binlogdatapb.VEventType_DDL, ddlTxn.payload.events[0].Type)
+ _, err = vp.applyDDLEvent(ctx, ddlTxn.payload.events[0], nil)
+ require.NoError(t, err)
+ require.NoError(t, scheduler.markCommitted(ddlTxn))
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t2",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ }, Timestamp: 300},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.False(t, got.forceGlobal)
+}
+
+func TestScheduleItems_PostDDLDropTableDoesNotBlockUnrelatedTable(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+ vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansVersion.Store(1)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "drop table t1", Timestamp: 200},
+ }}))
+
+ ddlTxn, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ require.Equal(t, binlogdatapb.VEventType_DDL, ddlTxn.payload.events[0].Type)
+ require.NoError(t, scheduler.markCommitted(ddlTxn))
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t2",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ }, Timestamp: 300},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.False(t, got.forceGlobal)
+}
+
+func TestScheduleItems_PostDDLExecIgnoreFailureDoesNotBlockAffectedTable(t *testing.T) {
+ vp, mockDB := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC_IGNORE
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+ vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansVersion.Store(1)
+
+ mockDB.AddInvariant("update _vt.vreplication set", &sqltypes.Result{})
+ vp.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+ if sql == "alter table t1 add column c1 int" {
+ return nil, errors.New("ddl failed")
+ }
+ return &sqltypes.Result{}, nil
+ }
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "alter table t1 add column c1 int", Timestamp: 200},
+ }}))
+
+ ddlTxn, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ require.Equal(t, binlogdatapb.VEventType_DDL, ddlTxn.payload.events[0].Type)
+ require.NoError(t, vp.applyEvent(ctx, ddlTxn.payload.events[0], ddlTxn.payload.mustSave))
+ require.NoError(t, scheduler.markCommitted(ddlTxn))
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ }, Timestamp: 300},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.False(t, got.forceGlobal)
+}
+
+func TestScheduleItems_PostDDLDropThenUnknownStillClearsAfterDropSatisfaction(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+ vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansVersion.Store(1)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "drop table t1", Timestamp: 200},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ assert.Nil(t, state.postDDLStalePlans)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "this is not valid ddl", Timestamp: 250},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.NotNil(t, state.postDDLStalePlans)
+ assert.NotContains(t, state.postDDLStalePlans, "t1")
+ assert.Contains(t, state.postDDLStalePlans, "t2")
+
+ vp.tablePlansMu.Lock()
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}}
+ vp.tablePlansVersion.Add(1)
+ vp.tablePlansMu.Unlock()
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-7"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t2",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("11"), Lengths: []int64{1, 1}}}},
+ }, Timestamp: 300},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.False(t, got.forceGlobal)
+ assert.Nil(t, state.postDDLStalePlans)
+}
+
+func TestScheduleItems_PostDDLRecreatedDroppedTableIsTrackedAgain(t *testing.T) {
+ vp, mockDB := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+ mockDB.AddInvariant("begin", &sqltypes.Result{})
+
+ vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansVersion.Store(1)
+ vp.replicatorPlan = &ReplicatorPlan{TablePlans: map[string]*TablePlan{
+ "t1": {TargetName: "t1", Insert: sqlparser.BuildParsedQuery("insert into t1 values (:a)")},
+ }}
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "drop table t1", Timestamp: 200},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.Nil(t, state.postDDLStalePlans)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "this is not valid ddl", Timestamp: 250},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.NotNil(t, state.postDDLStalePlans)
+ require.NotContains(t, state.postDDLStalePlans, "t1")
+ require.Contains(t, state.postDDLStalePlans, "t2")
+
+ fieldEvent := &binlogdatapb.FieldEvent{
+ TableName: "t1",
+ Fields: []*querypb.Field{
+ {Name: "id", Type: querypb.Type_INT64},
+ {Name: "c1", Type: querypb.Type_INT64},
+ },
+ }
+ require.NoError(t, vp.applyEvent(ctx, &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_FIELD, FieldEvent: fieldEvent}, false))
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-7"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "this is not valid ddl", Timestamp: 300},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.NotNil(t, state.postDDLStalePlans)
+ assert.Contains(t, state.postDDLStalePlans, "t1")
+ assert.Contains(t, state.postDDLStalePlans, "t2")
+}
+
+func TestScheduleItems_PostDDLDropThenCreateSameTableBlocksUntilFieldRefresh(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+ staleT1 := &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true},
+ IdentityColumns: []string{"id"},
+ HasExtraUniqueSecondary: false,
+ }
+ vp.tablePlans["t1"] = staleT1
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansVersion.Store(1)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "drop table t1", Timestamp: 200},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.Nil(t, state.postDDLStalePlans)
+ require.Contains(t, vp.postDDLDroppedTables, "t1")
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "create table t1(id bigint primary key, email bigint unique)", Timestamp: 250},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.NotNil(t, state.postDDLStalePlans)
+ require.Contains(t, state.postDDLStalePlans, "t1")
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-7"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("11"), Lengths: []int64{1, 1}}}},
+ }, Timestamp: 300},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.True(t, got.forceGlobal)
+ require.NotNil(t, state.postDDLStalePlans)
+ assert.Contains(t, state.postDDLStalePlans, "t1")
+ require.NoError(t, scheduler.markCommitted(got))
+
+ vp.tablePlansMu.Lock()
+ vp.tablePlans["t1"] = &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "email", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true, false},
+ IdentityColumns: []string{"id"},
+ HasExtraUniqueSecondary: true,
+ }
+ vp.tablePlansVersion.Add(1)
+ vp.tablePlansMu.Unlock()
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-8"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t2",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ }, Timestamp: 350},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err = scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.False(t, got.forceGlobal)
+ assert.Nil(t, state.postDDLStalePlans)
+}
+
+func TestScheduleItems_PostDDLDropThenRenameToDroppedNameBlocksUntilFieldRefresh(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+ vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t3"] = &TablePlan{TargetName: "t3", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansVersion.Store(1)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "drop table t1", Timestamp: 200},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.Nil(t, state.postDDLStalePlans)
+ require.Contains(t, vp.postDDLDroppedTables, "t1")
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t2 to t1", Timestamp: 250},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.NotNil(t, state.postDDLStalePlans)
+ require.Contains(t, state.postDDLStalePlans, "t2")
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-7"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ }, Timestamp: 300},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.True(t, got.forceGlobal)
+ require.NoError(t, scheduler.markCommitted(got))
+
+ vp.tablePlansMu.Lock()
+ vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}}
+ vp.tablePlansVersion.Add(1)
+ vp.tablePlansMu.Unlock()
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-8"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t3",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ }, Timestamp: 350},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err = scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.False(t, got.forceGlobal)
+ assert.Nil(t, state.postDDLStalePlans)
+}
+
+func TestScheduleItems_PostDDLRenameThenCreateSameNameRequiresBothFieldRefreshes(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+ vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansVersion.Store(1)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t1 to t2", Timestamp: 200},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.NotNil(t, state.postDDLStalePlans)
+ require.Contains(t, state.postDDLStalePlans, "t1")
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "create table t1(id bigint primary key)", Timestamp: 250},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.NotNil(t, state.postDDLStalePlans)
+ require.Contains(t, state.postDDLStalePlans, "t1")
+
+ vp.tablePlansMu.Lock()
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}}
+ vp.tablePlansVersion.Add(1)
+ vp.tablePlansMu.Unlock()
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-7"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t2",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("11"), Lengths: []int64{1, 1}}}},
+ }, Timestamp: 300},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.True(t, got.forceGlobal)
+ require.NoError(t, scheduler.markCommitted(got))
+ require.NotNil(t, state.postDDLStalePlans)
+ require.Contains(t, state.postDDLStalePlans, "t1")
+
+ vp.tablePlansMu.Lock()
+ vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "email", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}}
+ vp.tablePlansVersion.Add(1)
+ vp.tablePlansMu.Unlock()
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-8"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("11"), Lengths: []int64{1, 1}}}},
+ }, Timestamp: 350},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err = scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.False(t, got.forceGlobal)
+ assert.Nil(t, state.postDDLStalePlans)
+}
+
+func TestScheduleItems_PostDDLDropCreateRenameRetargetsBarrierToFinalName(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+ vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansVersion.Store(1)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "drop table t1", Timestamp: 200},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "create table t1(id bigint primary key)", Timestamp: 250},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.Contains(t, state.postDDLStalePlans, "t1")
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-7"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t1 to t2", Timestamp: 300},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.Contains(t, state.postDDLStalePlans, "t1")
+
+ vp.tablePlansMu.Lock()
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}}
+ vp.tablePlansVersion.Add(1)
+ vp.tablePlansMu.Unlock()
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, nil))
+ assert.Nil(t, state.postDDLStalePlans)
+}
+
+func TestScheduleItems_PostDDLRenameChainRetargetsBarrierToFinalName(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+ vp.tablePlans["t1"] = &TablePlan{TargetName: "t1", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t2"] = &TablePlan{TargetName: "t2", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlans["t3"] = &TablePlan{TargetName: "t3", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, PKIndices: []bool{true}}
+ vp.tablePlansVersion.Store(1)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t1 to t2", Timestamp: 200},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.Contains(t, state.postDDLStalePlans, "t1")
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "rename table t2 to t3", Timestamp: 250},
+ }}))
+ commitScheduledExecutedDDL(t, ctx, scheduler, state, vp)
+ require.Contains(t, state.postDDLStalePlans, "t1")
+ require.Contains(t, state.postDDLStalePlans, "t2")
+
+ vp.tablePlansMu.Lock()
+ vp.tablePlans["t3"] = &TablePlan{TargetName: "t3", Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "c1", Type: querypb.Type_INT64}}, PKIndices: []bool{true, false}}
+ vp.tablePlansVersion.Add(1)
+ vp.tablePlansMu.Unlock()
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, nil))
+ assert.Nil(t, state.postDDLStalePlans)
+}
+
+func TestMergeDroppedTables_DoesNotMutateInput(t *testing.T) {
+ original := map[string]struct{}{"t1": {}}
+ merged := mergeDroppedTables(original, map[string]struct{}{"t2": {}})
+
+ assert.Equal(t, map[string]struct{}{"t1": {}}, original)
+ assert.Equal(t, map[string]struct{}{"t1": {}, "t2": {}}, merged)
+}
+
+// TestRetargetPostDDLStalePlans_RenameSwapUsesOriginalRefreshNames retargets a
+// single "barrier" entry across the swap t1<->t2 and expects its refreshedPlans
+// to end up mapping "t1" and "t2" to the live plans passed as the third argument.
+func TestRetargetPostDDLStalePlans_RenameSwapUsesOriginalRefreshNames(t *testing.T) {
+	staleT1, staleT2 := &TablePlan{TargetName: "t1"}, &TablePlan{TargetName: "t2"}
+	liveT1, liveT2 := &TablePlan{TargetName: "t1"}, &TablePlan{TargetName: "t2"}
+
+	pending := map[string]postDDLStalePlan{
+		"barrier": {
+			stalePlan:      staleT1,
+			refreshedPlans: map[string]*TablePlan{"t1": staleT1, "t2": staleT2},
+		},
+	}
+	swap := map[string]string{"t1": "t2", "t2": "t1"}
+	live := map[string]*TablePlan{"t1": liveT1, "t2": liveT2}
+
+	retargetPostDDLStalePlans(pending, swap, live)
+
+	require.Contains(t, pending, "barrier")
+	// Build the expectation from a fresh literal so it does not alias the map
+	// handed to the function under test.
+	want := map[string]*TablePlan{"t1": liveT1, "t2": liveT2}
+	assert.Equal(t, want, pending["barrier"].refreshedPlans)
+}
+
+// TestRetargetPostDDLStalePlans_MixedCaseRenameTargetsUseMatchingLiveKeys passes
+// an upper-case rename ("T1" -> "T2") against lower-case stale and live keys and
+// expects the "t1" entry's refreshedPlans to become {"t2": <live t2 plan>}.
+func TestRetargetPostDDLStalePlans_MixedCaseRenameTargetsUseMatchingLiveKeys(t *testing.T) {
+	staleT1 := &TablePlan{TargetName: "t1"}
+	liveT2 := &TablePlan{TargetName: "t2"}
+
+	pending := map[string]postDDLStalePlan{
+		"t1": {
+			stalePlan:      staleT1,
+			refreshedPlans: map[string]*TablePlan{"t1": staleT1},
+		},
+	}
+	renames := map[string]string{"T1": "T2"}
+	live := map[string]*TablePlan{"t2": liveT2}
+
+	retargetPostDDLStalePlans(pending, renames, live)
+
+	require.Contains(t, pending, "t1")
+	// Build the expectation from a fresh literal so it does not alias the map
+	// handed to the function under test.
+	want := map[string]*TablePlan{"t2": liveT2}
+	assert.Equal(t, want, pending["t1"].refreshedPlans)
+}
+
+func TestScheduleItems_OTHERIsForceGlobal(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ }
+ otherEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_OTHER,
+ Timestamp: 200,
+ }
+
+ items := [][]*binlogdatapb.VEvent{{gtidEvent, otherEvent}}
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.NoError(t, err)
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.True(t, got.forceGlobal)
+ assert.True(t, got.payload.commitOnly)
+}
+
+func TestScheduleItems_CopyStateForceGlobal(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ // When copyState is non-empty, all transactions should be forceGlobal
+ vp.copyState = map[string]*sqltypes.Result{"t1": {}}
+ vp.tablePlans["t1"] = &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true},
+ }
+ vp.tablePlansVersion.Store(1)
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ }
+ rowEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_ROW,
+ RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{
+ {After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}},
+ },
+ },
+ Timestamp: 100,
+ }
+ commitEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_COMMIT,
+ }
+
+ items := [][]*binlogdatapb.VEvent{{gtidEvent, rowEvent, commitEvent}}
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.NoError(t, err)
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.True(t, got.forceGlobal)
+}
+
+// TestScheduleItems_UniqueSecondaryIndexEmitsWritesetKey pins that a plain
+// unique secondary no longer force-serializes: the scheduled txn carries a
+// writeset that includes the unique-key conflict key (so colliding unique
+// values serialize against each other while non-colliding rows run in
+// parallel), rather than being marked forceGlobal.
+func TestScheduleItems_UniqueSecondaryIndexEmitsWritesetKey(t *testing.T) {
+ ctx := testCtx(t)
+ vp, _ := testVPlayer(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+
+ tableName := "parallel_apply_unique_secondary_idx"
+ qualifiedTableName := vrepldb + "." + tableName
+ execStatements(t, []string{
+ "create table " + qualifiedTableName + " (id int not null, email varchar(128) not null, primary key(id), unique key uk_email(email))",
+ })
+ t.Cleanup(func() {
+ execStatements(t, []string{"drop table if exists " + qualifiedTableName})
+ })
+
+ realDB := &realDBClient{nolog: true}
+ require.NoError(t, realDB.Connect())
+ t.Cleanup(realDB.Close)
+
+ vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+ vp.dbClient = vp.vr.dbClient
+ vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld}
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+ vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}}
+
+ colInfoMap, err := vp.vr.buildColInfoMap(ctx)
+ require.NoError(t, err)
+ vp.vr.colInfoMap = colInfoMap
+
+ vp.replicatorPlan, err = vp.vr.buildReplicatorPlan(
+ vp.vr.source,
+ vp.vr.colInfoMap,
+ nil,
+ vp.vr.stats,
+ vp.vr.vre.env.CollationEnv(),
+ vp.vr.vre.env.Parser(),
+ )
+ require.NoError(t, err)
+
+ require.NoError(t, vp.applyEvent(ctx, &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_FIELD,
+ FieldEvent: &binlogdatapb.FieldEvent{
+ TableName: tableName,
+ Fields: []*querypb.Field{
+ {Name: "id", Type: querypb.Type_INT32},
+ {Name: "email", Type: querypb.Type_VARCHAR},
+ },
+ },
+ }, false))
+ require.NoError(t, vp.dbClient.Rollback())
+
+ // Confirm the FIELD handler classified the plain unique secondary as
+ // hashable (emits a writeset key) rather than force-serializing.
+ require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary)
+ require.Equal(t, [][]string{{"email"}}, vp.tablePlans[tableName].UniqueKeyColumns)
+
+ err = vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: tableName,
+ RowChanges: []*binlogdatapb.RowChange{{
+ After: &querypb.Row{Values: []byte("1a"), Lengths: []int64{1, 1}},
+ }},
+ }, Timestamp: 100},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }})
+ require.NoError(t, err)
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ // The txn runs in parallel (not force-serialized) and carries the
+ // unique-key conflict key for email="a".
+ assert.False(t, got.forceGlobal)
+ expectedUniqueKey := map[uint64]struct{}{}
+ require.NoError(t, writesetKeysForUniqueKey(
+ tableName, 0, []int{1},
+ vp.tablePlans[tableName].Fields,
+ nil,
+ []sqltypes.Value{sqltypes.NewInt32(1), sqltypes.NewVarChar("a")},
+ expectedUniqueKey,
+ ))
+ require.Len(t, expectedUniqueKey, 1)
+ for key := range expectedUniqueKey {
+ assert.Contains(t, got.writeset, key,
+ "scheduled txn writeset must include the unique-key conflict key")
+ }
+}
+
+func TestScheduleItems_UnsupportedWritesetMappingForcesGlobal(t *testing.T) {
+ ctx := testCtx(t)
+ vp, _ := testVPlayer(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ env := vtenv.NewTestEnv()
+ plan, err := (&vreplicator{workflowConfig: vp.vr.workflowConfig}).buildReplicatorPlan(
+ getSource(&binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{
+ Match: "t1",
+ Filter: "select a + b as c1, c as c2 from t1",
+ }}}),
+ map[string][]*ColumnInfo{"t1": {{Name: "c1", IsPK: true}, {Name: "c2"}}},
+ nil,
+ vp.vr.stats,
+ env.CollationEnv(),
+ env.Parser(),
+ )
+ require.NoError(t, err)
+
+ tplan, err := plan.buildExecutionPlan(&binlogdatapb.FieldEvent{
+ TableName: "t1",
+ Fields: []*querypb.Field{
+ {Name: "a", Type: querypb.Type_INT64},
+ {Name: "b", Type: querypb.Type_INT64},
+ {Name: "c", Type: querypb.Type_INT64},
+ },
+ })
+ require.NoError(t, err)
+ require.True(t, tplan.HasUnsupportedWritesetMapping)
+
+ vp.tablePlans["t1"] = tplan
+ vp.tablePlansVersion.Store(1)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("123"), Lengths: []int64{1, 1, 1}}}},
+ }, Timestamp: 100},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.True(t, got.forceGlobal)
+ require.NoError(t, scheduler.markCommitted(got))
+}
+
+// TestApplyEvent_FIELDEmitsWritesetKeyForUniqueSecondaryIndex applies a FIELD
+// event for a table declared with PRIMARY KEY(id) and UNIQUE KEY
+// uk_email(email), then checks the resulting table plan:
+// HasExtraUniqueSecondary is false and UniqueKeyColumns is [["email"]].
+func TestApplyEvent_FIELDEmitsWritesetKeyForUniqueSecondaryIndex(t *testing.T) {
+	ctx := testCtx(t)
+	vp, _ := testVPlayer(t)
+	vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+
+	tableName := "parallel_apply_field_unique_secondary_idx"
+	fqName := vrepldb + "." + tableName
+	createStmt := "create table " + fqName + " (id int not null, email varchar(128) not null, primary key(id), unique key uk_email(email))"
+	execStatements(t, []string{createStmt})
+	t.Cleanup(func() {
+		dropStmt := "drop table if exists " + fqName
+		execStatements(t, []string{dropStmt})
+	})
+
+	conn := &realDBClient{nolog: true}
+	require.NoError(t, conn.Connect())
+	t.Cleanup(conn.Close)
+
+	// Wire the player to the real connection and to the schema source.
+	vp.vr.dbClient = newVDBClient(conn, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+	vp.dbClient = vp.vr.dbClient
+	vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld}
+	vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+	matchRule := &binlogdatapb.Rule{Match: tableName}
+	vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{matchRule}}
+
+	columns, err := vp.vr.buildColInfoMap(ctx)
+	require.NoError(t, err)
+	vp.vr.colInfoMap = columns
+
+	vp.replicatorPlan, err = vp.vr.buildReplicatorPlan(
+		vp.vr.source,
+		vp.vr.colInfoMap,
+		nil,
+		vp.vr.stats,
+		vp.vr.vre.env.CollationEnv(),
+		vp.vr.vre.env.Parser(),
+	)
+	require.NoError(t, err)
+
+	fieldEvent := &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_FIELD,
+		FieldEvent: &binlogdatapb.FieldEvent{
+			TableName: tableName,
+			Fields: []*querypb.Field{
+				{Name: "id", Type: querypb.Type_INT32},
+				{Name: "email", Type: querypb.Type_VARCHAR},
+			},
+		},
+	}
+	err = vp.applyEvent(ctx, fieldEvent, false)
+	require.NoError(t, err)
+	err = vp.dbClient.Rollback()
+	require.NoError(t, err)
+
+	// A plain (non-prefix, non-expression) unique secondary that the identity
+	// does not cover is expected to yield a writeset unique key instead of
+	// forcing serialization.
+	plan := vp.tablePlans[tableName]
+	require.False(t, plan.HasExtraUniqueSecondary)
+	require.Equal(t, [][]string{{"email"}}, plan.UniqueKeyColumns)
+}
+
+// TestApplyEvent_FIELDMarksAlternateIdentityAgainstPrimaryKeyAsUnsafe applies a
+// FIELD event for a table whose filter rule sets TargetUniqueKeyColumns to
+// "email" while the table's primary key is id. It expects the plan's
+// IdentityColumns to be ["email"], HasExtraUniqueSecondary to be true, and
+// UniqueKeyColumns to be nil.
+func TestApplyEvent_FIELDMarksAlternateIdentityAgainstPrimaryKeyAsUnsafe(t *testing.T) {
+	ctx := testCtx(t)
+	vp, _ := testVPlayer(t)
+	vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+
+	tableName := "parallel_apply_field_alt_identity_primary_key_conflict"
+	fqName := vrepldb + "." + tableName
+	createStmt := "create table " + fqName + " (id int not null, email varchar(128) not null, primary key(id), unique key uk_email(email))"
+	execStatements(t, []string{createStmt})
+	t.Cleanup(func() {
+		dropStmt := "drop table if exists " + fqName
+		execStatements(t, []string{dropStmt})
+	})
+
+	conn := &realDBClient{nolog: true}
+	require.NoError(t, conn.Connect())
+	t.Cleanup(conn.Close)
+
+	// Wire the player to the real connection and to the schema source.
+	vp.vr.dbClient = newVDBClient(conn, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+	vp.dbClient = vp.vr.dbClient
+	vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld}
+	vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+	// The rule selects both columns but names email as the target unique key.
+	altIdentityRule := &binlogdatapb.Rule{
+		Match:                  tableName,
+		Filter:                 "select id, email from " + tableName,
+		TargetUniqueKeyColumns: "email",
+	}
+	vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{altIdentityRule}}
+
+	columns, err := vp.vr.buildColInfoMap(ctx)
+	require.NoError(t, err)
+	vp.vr.colInfoMap = columns
+
+	vp.replicatorPlan, err = vp.vr.buildReplicatorPlan(
+		vp.vr.source,
+		vp.vr.colInfoMap,
+		nil,
+		vp.vr.stats,
+		vp.vr.vre.env.CollationEnv(),
+		vp.vr.vre.env.Parser(),
+	)
+	require.NoError(t, err)
+
+	fieldEvent := &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_FIELD,
+		FieldEvent: &binlogdatapb.FieldEvent{
+			TableName: tableName,
+			Fields: []*querypb.Field{
+				{Name: "id", Type: querypb.Type_INT32},
+				{Name: "email", Type: querypb.Type_VARCHAR},
+			},
+		},
+	}
+	err = vp.applyEvent(ctx, fieldEvent, false)
+	require.NoError(t, err)
+	err = vp.dbClient.Rollback()
+	require.NoError(t, err)
+
+	plan := vp.tablePlans[tableName]
+	require.Equal(t, []string{"email"}, plan.IdentityColumns)
+	// The chosen identity (email) differs from PK(id); the writeset hasher
+	// cannot reason about that, so the plan must force serialization and
+	// carry no unique keys.
+	require.True(t, plan.HasExtraUniqueSecondary)
+	require.Nil(t, plan.UniqueKeyColumns)
+}
+
+// TestApplyEvent_FIELDCachesExtraUniqueSecondaryLookup applies the same FIELD
+// event twice for a table with PRIMARY KEY(id) and UNIQUE KEY uk_email(email).
+// vp.vr.mysqld is set to nil between the two applications, and the second
+// application must still succeed and produce a plan with
+// HasExtraUniqueSecondary false and UniqueKeyColumns [["email"]].
+func TestApplyEvent_FIELDCachesExtraUniqueSecondaryLookup(t *testing.T) {
+ ctx := testCtx(t)
+ vp, _ := testVPlayer(t)
+ vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+
+ tableName := "parallel_apply_cached_unique_secondary_idx"
+ qualifiedTableName := vrepldb + "." + tableName
+ execStatements(t, []string{
+ "create table " + qualifiedTableName + " (id int not null, email varchar(128) not null, primary key(id), unique key uk_email(email))",
+ })
+ t.Cleanup(func() {
+ execStatements(t, []string{"drop table if exists " + qualifiedTableName})
+ })
+
+ realDB := &realDBClient{nolog: true}
+ require.NoError(t, realDB.Connect())
+ t.Cleanup(realDB.Close)
+
+ // Wire the player to the real connection and to the schema source.
+ vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+ vp.dbClient = vp.vr.dbClient
+ vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld}
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+ vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}}
+
+ colInfoMap, err := vp.vr.buildColInfoMap(ctx)
+ require.NoError(t, err)
+ vp.vr.colInfoMap = colInfoMap
+
+ vp.replicatorPlan, err = vp.vr.buildReplicatorPlan(
+ vp.vr.source,
+ vp.vr.colInfoMap,
+ nil,
+ vp.vr.stats,
+ vp.vr.vre.env.CollationEnv(),
+ vp.vr.vre.env.Parser(),
+ )
+ require.NoError(t, err)
+
+ // The same event value is reused for both applications below.
+ fieldEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_FIELD,
+ FieldEvent: &binlogdatapb.FieldEvent{
+ TableName: tableName,
+ Fields: []*querypb.Field{
+ {Name: "id", Type: querypb.Type_INT32},
+ {Name: "email", Type: querypb.Type_VARCHAR},
+ },
+ },
+ }
+ // First application runs with vp.vr.mysqld still set.
+ require.NoError(t, vp.applyEvent(ctx, fieldEvent, false))
+ require.NoError(t, vp.dbClient.Rollback())
+
+ // Remove the schema source before the second application.
+ vp.vr.mysqld = nil
+
+ // Second FIELD reuses the cached unique-key analysis (mysqld is nil, so a
+ // fresh schema fetch would fail): a plain unique secondary emits a
+ // writeset unique key, no force-serialization.
+ require.NoError(t, vp.applyEvent(ctx, fieldEvent, false))
+ require.NoError(t, vp.dbClient.Rollback())
+ require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary)
+ require.Equal(t, [][]string{{"email"}}, vp.tablePlans[tableName].UniqueKeyColumns)
+}
+
+// TestApplyEvent_FIELDCachesNoExtraUniqueSecondaryLookup applies the same FIELD
+// event twice for a table with PRIMARY KEY(id) and a non-unique KEY
+// idx_email(email). After each application the plan must report
+// HasExtraUniqueSecondary false and nil UniqueKeyColumns; vp.vr.mysqld is set
+// to nil before the second application, which must still succeed.
+func TestApplyEvent_FIELDCachesNoExtraUniqueSecondaryLookup(t *testing.T) {
+ ctx := testCtx(t)
+ vp, _ := testVPlayer(t)
+ vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+
+ tableName := "parallel_apply_cached_no_unique_secondary_idx"
+ qualifiedTableName := vrepldb + "." + tableName
+ execStatements(t, []string{
+ "create table " + qualifiedTableName + " (id int not null, email varchar(128) not null, primary key(id), key idx_email(email))",
+ })
+ t.Cleanup(func() {
+ execStatements(t, []string{"drop table if exists " + qualifiedTableName})
+ })
+
+ realDB := &realDBClient{nolog: true}
+ require.NoError(t, realDB.Connect())
+ t.Cleanup(realDB.Close)
+
+ // Wire the player to the real connection and to the schema source.
+ vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+ vp.dbClient = vp.vr.dbClient
+ vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld}
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+ vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}}
+
+ colInfoMap, err := vp.vr.buildColInfoMap(ctx)
+ require.NoError(t, err)
+ vp.vr.colInfoMap = colInfoMap
+
+ vp.replicatorPlan, err = vp.vr.buildReplicatorPlan(
+ vp.vr.source,
+ vp.vr.colInfoMap,
+ nil,
+ vp.vr.stats,
+ vp.vr.vre.env.CollationEnv(),
+ vp.vr.vre.env.Parser(),
+ )
+ require.NoError(t, err)
+
+ // The same event value is reused for both applications below.
+ fieldEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_FIELD,
+ FieldEvent: &binlogdatapb.FieldEvent{
+ TableName: tableName,
+ Fields: []*querypb.Field{
+ {Name: "id", Type: querypb.Type_INT32},
+ {Name: "email", Type: querypb.Type_VARCHAR},
+ },
+ },
+ }
+ // First application runs with vp.vr.mysqld still set.
+ require.NoError(t, vp.applyEvent(ctx, fieldEvent, false))
+ require.NoError(t, vp.dbClient.Rollback())
+ require.Contains(t, vp.tablePlans, tableName)
+ require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary)
+ require.Nil(t, vp.tablePlans[tableName].UniqueKeyColumns)
+
+ // Remove the schema source; the second application must succeed without it
+ // (presumably served from a cached lookup, as the test name suggests).
+ vp.vr.mysqld = nil
+
+ require.NoError(t, vp.applyEvent(ctx, fieldEvent, false))
+ require.NoError(t, vp.dbClient.Rollback())
+ require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary)
+ require.Nil(t, vp.tablePlans[tableName].UniqueKeyColumns)
+}
+
+// TestApplyEvent_FIELDEmitsWritesetKeyForNullableUniqueSecondaryIndex applies a
+// FIELD event for a table with PRIMARY KEY(id) and UNIQUE KEY uk_email(email)
+// where email is declared NULL-able. It expects HasExtraUniqueSecondary to be
+// false and UniqueKeyColumns to be [["email"]].
+func TestApplyEvent_FIELDEmitsWritesetKeyForNullableUniqueSecondaryIndex(t *testing.T) {
+	ctx := testCtx(t)
+	vp, _ := testVPlayer(t)
+	vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+
+	tableName := "parallel_apply_field_nullable_unique_secondary_idx"
+	fqName := vrepldb + "." + tableName
+	createStmt := "create table " + fqName + " (id int not null, email varchar(128) null, primary key(id), unique key uk_email(email))"
+	execStatements(t, []string{createStmt})
+	t.Cleanup(func() {
+		dropStmt := "drop table if exists " + fqName
+		execStatements(t, []string{dropStmt})
+	})
+
+	conn := &realDBClient{nolog: true}
+	require.NoError(t, conn.Connect())
+	t.Cleanup(conn.Close)
+
+	// Wire the player to the real connection and to the schema source.
+	vp.vr.dbClient = newVDBClient(conn, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+	vp.dbClient = vp.vr.dbClient
+	vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld}
+	vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+	matchRule := &binlogdatapb.Rule{Match: tableName}
+	vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{matchRule}}
+
+	columns, err := vp.vr.buildColInfoMap(ctx)
+	require.NoError(t, err)
+	vp.vr.colInfoMap = columns
+
+	vp.replicatorPlan, err = vp.vr.buildReplicatorPlan(
+		vp.vr.source,
+		vp.vr.colInfoMap,
+		nil,
+		vp.vr.stats,
+		vp.vr.vre.env.CollationEnv(),
+		vp.vr.vre.env.Parser(),
+	)
+	require.NoError(t, err)
+
+	fieldEvent := &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_FIELD,
+		FieldEvent: &binlogdatapb.FieldEvent{
+			TableName: tableName,
+			Fields: []*querypb.Field{
+				{Name: "id", Type: querypb.Type_INT32},
+				{Name: "email", Type: querypb.Type_VARCHAR},
+			},
+		},
+	}
+	err = vp.applyEvent(ctx, fieldEvent, false)
+	require.NoError(t, err)
+	err = vp.dbClient.Rollback()
+	require.NoError(t, err)
+
+	// Nullability does not disqualify a plain unique secondary from hashing:
+	// NULL key values simply produce no key at write time (MySQL allows many
+	// NULLs), so a writeset unique key is expected instead of serialization.
+	plan := vp.tablePlans[tableName]
+	require.False(t, plan.HasExtraUniqueSecondary)
+	require.Equal(t, [][]string{{"email"}}, plan.UniqueKeyColumns)
+}
+
+// TestApplyEvent_FIELDIgnoresIdentityEquivalentReorderedUniqueSecondaryIndex
+// applies a FIELD event for a table with PRIMARY KEY(a, b) and UNIQUE KEY
+// uk_b_a(b, a). It expects IdentityColumns to be ["a", "b"],
+// HasExtraUniqueSecondary to be false, and UniqueKeyColumns to be nil.
+func TestApplyEvent_FIELDIgnoresIdentityEquivalentReorderedUniqueSecondaryIndex(t *testing.T) {
+	ctx := testCtx(t)
+	vp, _ := testVPlayer(t)
+	vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+
+	tableName := "parallel_apply_field_reordered_identity_equivalent_unique_idx"
+	fqName := vrepldb + "." + tableName
+	createStmt := "create table " + fqName + " (a int not null, b int not null, c varchar(128) not null, primary key(a, b), unique key uk_b_a(b, a))"
+	execStatements(t, []string{createStmt})
+	t.Cleanup(func() {
+		dropStmt := "drop table if exists " + fqName
+		execStatements(t, []string{dropStmt})
+	})
+
+	conn := &realDBClient{nolog: true}
+	require.NoError(t, conn.Connect())
+	t.Cleanup(conn.Close)
+
+	// Wire the player to the real connection and to the schema source.
+	vp.vr.dbClient = newVDBClient(conn, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+	vp.dbClient = vp.vr.dbClient
+	vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld}
+	vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+	matchRule := &binlogdatapb.Rule{Match: tableName}
+	vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{matchRule}}
+
+	columns, err := vp.vr.buildColInfoMap(ctx)
+	require.NoError(t, err)
+	vp.vr.colInfoMap = columns
+
+	vp.replicatorPlan, err = vp.vr.buildReplicatorPlan(
+		vp.vr.source,
+		vp.vr.colInfoMap,
+		nil,
+		vp.vr.stats,
+		vp.vr.vre.env.CollationEnv(),
+		vp.vr.vre.env.Parser(),
+	)
+	require.NoError(t, err)
+
+	fieldEvent := &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_FIELD,
+		FieldEvent: &binlogdatapb.FieldEvent{
+			TableName: tableName,
+			Fields: []*querypb.Field{
+				{Name: "a", Type: querypb.Type_INT32},
+				{Name: "b", Type: querypb.Type_INT32},
+				{Name: "c", Type: querypb.Type_VARCHAR},
+			},
+		},
+	}
+	err = vp.applyEvent(ctx, fieldEvent, false)
+	require.NoError(t, err)
+	err = vp.dbClient.Rollback()
+	require.NoError(t, err)
+
+	plan := vp.tablePlans[tableName]
+	require.Equal(t, []string{"a", "b"}, plan.IdentityColumns)
+	// The unique secondary spans the whole identity, so it cannot introduce
+	// cross-identity conflicts: neither serialization nor an extra unique key
+	// is expected.
+	require.False(t, plan.HasExtraUniqueSecondary)
+	require.Nil(t, plan.UniqueKeyColumns)
+}
+
+// TestApplyEvent_FIELDIgnoresUniqueSecondaryIndexThatContainsIdentity exercises
+// a unique secondary index whose columns form a strict superset of the
+// identity. With id as the PK, UNIQUE(id, name) cannot add conflicts beyond
+// those PK(id) already enforces, so the table must not be reported as having an
+// "extra" unique secondary index. Before the fix, a column-count mismatch
+// short-circuited the check and forced needless global serialization.
+func TestApplyEvent_FIELDIgnoresUniqueSecondaryIndexThatContainsIdentity(t *testing.T) {
+	ctx := testCtx(t)
+	vp, _ := testVPlayer(t)
+	vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+
+	tableName := "parallel_apply_field_unique_secondary_contains_identity"
+	fqName := vrepldb + "." + tableName
+	createStmt := "create table " + fqName + " (id int not null, name varchar(128) not null, primary key(id), unique key uk_id_name(id, name))"
+	execStatements(t, []string{createStmt})
+	t.Cleanup(func() {
+		dropStmt := "drop table if exists " + fqName
+		execStatements(t, []string{dropStmt})
+	})
+
+	conn := &realDBClient{nolog: true}
+	require.NoError(t, conn.Connect())
+	t.Cleanup(conn.Close)
+
+	// Wire the player to the real connection and to the schema source.
+	vp.vr.dbClient = newVDBClient(conn, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+	vp.dbClient = vp.vr.dbClient
+	vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld}
+	vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+	matchRule := &binlogdatapb.Rule{Match: tableName}
+	vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{matchRule}}
+
+	columns, err := vp.vr.buildColInfoMap(ctx)
+	require.NoError(t, err)
+	vp.vr.colInfoMap = columns
+
+	vp.replicatorPlan, err = vp.vr.buildReplicatorPlan(
+		vp.vr.source,
+		vp.vr.colInfoMap,
+		nil,
+		vp.vr.stats,
+		vp.vr.vre.env.CollationEnv(),
+		vp.vr.vre.env.Parser(),
+	)
+	require.NoError(t, err)
+
+	fieldEvent := &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_FIELD,
+		FieldEvent: &binlogdatapb.FieldEvent{
+			TableName: tableName,
+			Fields: []*querypb.Field{
+				{Name: "id", Type: querypb.Type_INT32},
+				{Name: "name", Type: querypb.Type_VARCHAR},
+			},
+		},
+	}
+	err = vp.applyEvent(ctx, fieldEvent, false)
+	require.NoError(t, err)
+	err = vp.dbClient.Rollback()
+	require.NoError(t, err)
+
+	plan := vp.tablePlans[tableName]
+	require.Equal(t, []string{"id"}, plan.IdentityColumns)
+	// UNIQUE(id, name) contains the identity (id), so it contributes nothing
+	// beyond the PK: neither serialization nor an extra unique key is expected.
+	require.False(t, plan.HasExtraUniqueSecondary)
+	require.Nil(t, plan.UniqueKeyColumns)
+}
+
+func TestApplyEvent_FIELDIgnoresIdentityEquivalentReorderedPrimaryKey(t *testing.T) {
+ ctx := testCtx(t)
+ vp, _ := testVPlayer(t)
+ vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+
+ tableName := "parallel_apply_field_reordered_identity_equivalent_primary_key"
+ qualifiedTableName := vrepldb + "." + tableName
+ execStatements(t, []string{
+ "create table " + qualifiedTableName + " (a int not null, b int not null, c varchar(128) not null, primary key(a, b), unique key uk_b_a(b, a))",
+ })
+ t.Cleanup(func() {
+ execStatements(t, []string{"drop table if exists " + qualifiedTableName})
+ })
+
+ realDB := &realDBClient{nolog: true}
+ require.NoError(t, realDB.Connect())
+ t.Cleanup(realDB.Close)
+
+ vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+ vp.dbClient = vp.vr.dbClient
+ vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld}
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+ vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{
+ Match: tableName,
+ Filter: "select a, b, c from " + tableName,
+ TargetUniqueKeyColumns: "b,a",
+ }}}
+
+ colInfoMap, err := vp.vr.buildColInfoMap(ctx)
+ require.NoError(t, err)
+ vp.vr.colInfoMap = colInfoMap
+
+ vp.replicatorPlan, err = vp.vr.buildReplicatorPlan(
+ vp.vr.source,
+ vp.vr.colInfoMap,
+ nil,
+ vp.vr.stats,
+ vp.vr.vre.env.CollationEnv(),
+ vp.vr.vre.env.Parser(),
+ )
+ require.NoError(t, err)
+
+ require.NoError(t, vp.applyEvent(ctx, &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_FIELD,
+ FieldEvent: &binlogdatapb.FieldEvent{
+ TableName: tableName,
+ Fields: []*querypb.Field{
+ {Name: "a", Type: querypb.Type_INT32},
+ {Name: "b", Type: querypb.Type_INT32},
+ {Name: "c", Type: querypb.Type_VARCHAR},
+ },
+ },
+ }, false))
+ require.NoError(t, vp.dbClient.Rollback())
+
+ require.Equal(t, []string{"b", "a"}, vp.tablePlans[tableName].IdentityColumns)
+ // The unique secondary's column set equals the identity (reordered), so
+ // it can't create cross-identity conflicts: no force-serialization, no
+ // extra unique key.
+ require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary)
+ require.Nil(t, vp.tablePlans[tableName].UniqueKeyColumns)
+}
+
+func TestApplyEvent_FIELDMarksPrefixUniqueIndexAsExtraUniqueSecondary(t *testing.T) {
+ ctx := testCtx(t)
+ vp, _ := testVPlayer(t)
+ vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+
+ tableName := "parallel_apply_field_prefix_unique_secondary_idx"
+ qualifiedTableName := vrepldb + "." + tableName
+ execStatements(t, []string{
+ "create table " + qualifiedTableName + " (email varchar(128) not null, payload varchar(128), primary key(email), unique key uk_email_prefix(email(10)))",
+ })
+ t.Cleanup(func() {
+ execStatements(t, []string{"drop table if exists " + qualifiedTableName})
+ })
+
+ realDB := &realDBClient{nolog: true}
+ require.NoError(t, realDB.Connect())
+ t.Cleanup(realDB.Close)
+
+ vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+ vp.dbClient = vp.vr.dbClient
+ vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld}
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+ vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}}
+
+ colInfoMap, err := vp.vr.buildColInfoMap(ctx)
+ require.NoError(t, err)
+ vp.vr.colInfoMap = colInfoMap
+
+ vp.replicatorPlan, err = vp.vr.buildReplicatorPlan(
+ vp.vr.source,
+ vp.vr.colInfoMap,
+ nil,
+ vp.vr.stats,
+ vp.vr.vre.env.CollationEnv(),
+ vp.vr.vre.env.Parser(),
+ )
+ require.NoError(t, err)
+
+ require.NoError(t, vp.applyEvent(ctx, &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_FIELD,
+ FieldEvent: &binlogdatapb.FieldEvent{
+ TableName: tableName,
+ Fields: []*querypb.Field{
+ {Name: "email", Type: querypb.Type_VARCHAR},
+ {Name: "payload", Type: querypb.Type_VARCHAR},
+ },
+ },
+ }, false))
+ require.NoError(t, vp.dbClient.Rollback())
+
+ require.Equal(t, []string{"email"}, vp.tablePlans[tableName].IdentityColumns)
+ // A prefix unique index enforces uniqueness over a derived value the
+ // hasher can't reproduce, so it force-serializes and emits no unique key.
+ require.True(t, vp.tablePlans[tableName].HasExtraUniqueSecondary)
+ require.Nil(t, vp.tablePlans[tableName].UniqueKeyColumns)
+}
+
+func TestApplyEvent_FIELDAfterExecutedDDLRefreshesUniqueSecondaryLookup(t *testing.T) {
+ ctx := testCtx(t)
+ vp, _ := testVPlayer(t)
+ vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+
+ tableName := "parallel_apply_field_refresh_after_unique_ddl"
+ qualifiedTableName := vrepldb + "." + tableName
+ execStatements(t, []string{
+ "create table " + qualifiedTableName + " (id int not null, email varchar(128) not null, primary key(id), key idx_email(email))",
+ })
+ t.Cleanup(func() {
+ execStatements(t, []string{"drop table if exists " + qualifiedTableName})
+ })
+
+ realDB := &realDBClient{nolog: true}
+ require.NoError(t, realDB.Connect())
+ t.Cleanup(realDB.Close)
+
+ vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+ vp.dbClient = vp.vr.dbClient
+ vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld}
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+ vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}}
+
+ colInfoMap, err := vp.vr.buildColInfoMap(ctx)
+ require.NoError(t, err)
+ vp.vr.colInfoMap = colInfoMap
+
+ vp.replicatorPlan, err = vp.vr.buildReplicatorPlan(
+ vp.vr.source,
+ vp.vr.colInfoMap,
+ nil,
+ vp.vr.stats,
+ vp.vr.vre.env.CollationEnv(),
+ vp.vr.vre.env.Parser(),
+ )
+ require.NoError(t, err)
+
+ fieldEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_FIELD,
+ FieldEvent: &binlogdatapb.FieldEvent{
+ TableName: tableName,
+ Fields: []*querypb.Field{
+ {Name: "id", Type: querypb.Type_INT32},
+ {Name: "email", Type: querypb.Type_VARCHAR},
+ },
+ },
+ }
+ require.NoError(t, vp.applyEvent(ctx, fieldEvent, false))
+ require.NoError(t, vp.dbClient.Rollback())
+ // Before the DDL: only a non-unique secondary, so no unique keys.
+ require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary)
+ require.Nil(t, vp.tablePlans[tableName].UniqueKeyColumns)
+
+ ddlEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_DDL,
+ Statement: "alter table " + tableName + " add unique key uk_email(email)",
+ Timestamp: 100,
+ }
+ execStatements(t, []string{"alter table " + qualifiedTableName + " add unique key uk_email(email)"})
+ publishExecutedDDLBarrier(t, vp, ddlEvent.Statement)
+
+ // After the DDL barrier the FIELD handler re-runs the unique-key analysis:
+ // the new plain unique secondary emits a writeset unique key.
+ require.NoError(t, vp.applyEvent(ctx, fieldEvent, false))
+ require.NoError(t, vp.dbClient.Rollback())
+ require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary)
+ require.Equal(t, [][]string{{"email"}}, vp.tablePlans[tableName].UniqueKeyColumns)
+
+ vp.vr.mysqld = nil
+
+ // The refreshed analysis is cached; a later FIELD reuses it (mysqld nil).
+ require.NoError(t, vp.applyEvent(ctx, fieldEvent, false))
+ require.NoError(t, vp.dbClient.Rollback())
+ require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary)
+ require.Equal(t, [][]string{{"email"}}, vp.tablePlans[tableName].UniqueKeyColumns)
+}
+
+func TestWorkerLoop_FIELDRefreshesPublishedDDLBarrierState(t *testing.T) {
+ ctx, cancel := context.WithCancel(testCtx(t))
+ defer cancel()
+
+ vp, _ := testVPlayer(t)
+ vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+
+ tableName := "parallel_apply_worker_field_refresh_after_unique_ddl"
+ qualifiedTableName := vrepldb + "." + tableName
+ execStatements(t, []string{
+ "create table " + qualifiedTableName + " (id int not null, email varchar(128) not null, primary key(id), key idx_email(email))",
+ })
+ t.Cleanup(func() {
+ execStatements(t, []string{"drop table if exists " + qualifiedTableName})
+ })
+
+ realDB := &realDBClient{nolog: true}
+ require.NoError(t, realDB.Connect())
+ t.Cleanup(realDB.Close)
+
+ vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+ vp.dbClient = vp.vr.dbClient
+ vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld}
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+ vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}}
+
+ colInfoMap, err := vp.vr.buildColInfoMap(ctx)
+ require.NoError(t, err)
+ vp.vr.colInfoMap = colInfoMap
+
+ vp.replicatorPlan, err = vp.vr.buildReplicatorPlan(
+ vp.vr.source,
+ vp.vr.colInfoMap,
+ nil,
+ vp.vr.stats,
+ vp.vr.vre.env.CollationEnv(),
+ vp.vr.vre.env.Parser(),
+ )
+ require.NoError(t, err)
+
+ fieldEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_FIELD,
+ FieldEvent: &binlogdatapb.FieldEvent{
+ TableName: tableName,
+ Fields: []*querypb.Field{
+ {Name: "id", Type: querypb.Type_INT32},
+ {Name: "email", Type: querypb.Type_VARCHAR},
+ },
+ },
+ }
+ require.NoError(t, vp.applyEvent(ctx, fieldEvent, false))
+ require.NoError(t, vp.dbClient.Rollback())
+ require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary)
+ require.Nil(t, vp.tablePlans[tableName].UniqueKeyColumns)
+
+ workerDB := &recordingDBClient{}
+ workerClient := newVDBClient(workerDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+ worker := &applyWorker{
+ ctx: ctx,
+ conns: [2]*vdbClient{workerClient, workerClient},
+ client: workerClient,
+ query: func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+ return &sqltypes.Result{}, nil
+ },
+ commit: func() error {
+ return nil
+ },
+ }
+
+ scheduler := newApplyScheduler(ctx)
+ commitCh := make(chan *applyTxn, 2)
+ errCh := make(chan error, 1)
+ go func() {
+ errCh <- vp.workerLoop(ctx, scheduler, commitCh, worker)
+ }()
+
+ commitOnlyTxn := &applyTxn{
+ order: 1,
+ payload: &applyTxnPayload{
+ commitOnly: true,
+ events: []*binlogdatapb.VEvent{{
+ Type: binlogdatapb.VEventType_OTHER,
+ }},
+ },
+ }
+ require.NoError(t, scheduler.enqueue(commitOnlyTxn))
+ require.Same(t, commitOnlyTxn, <-commitCh)
+ require.NoError(t, scheduler.markCommitted(commitOnlyTxn))
+
+ execStatements(t, []string{"alter table " + qualifiedTableName + " add unique key uk_email(email)"})
+ publishExecutedDDLBarrier(t, vp, "alter table "+tableName+" add unique key uk_email(email)")
+
+ fieldTxn := &applyTxn{
+ order: 2,
+ noConflict: true,
+ payload: &applyTxnPayload{
+ events: []*binlogdatapb.VEvent{fieldEvent},
+ },
+ }
+ require.NoError(t, scheduler.enqueue(fieldTxn))
+ require.Same(t, fieldTxn, <-commitCh)
+
+ // The worker-loop FIELD refresh re-ran the unique-key analysis after the
+ // published DDL barrier: the new plain unique secondary emits a writeset
+ // unique key (it does not force-serialize).
+ assert.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary)
+ assert.Equal(t, [][]string{{"email"}}, vp.tablePlans[tableName].UniqueKeyColumns)
+
+ cancel()
+ require.ErrorIs(t, <-errCh, context.Canceled)
+}
+
+func TestWorkerLoop_FIELDRefreshClearsPublishedDroppedTablesAfterCommit(t *testing.T) {
+ ctx := testCtx(t)
+ vp, _ := testVPlayer(t)
+ vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+
+ tableName := "parallel_apply_worker_field_does_not_clear_dropped_state"
+ qualifiedTableName := vrepldb + "." + tableName
+ execStatements(t, []string{
+ "create table " + qualifiedTableName + " (id int not null, primary key(id))",
+ })
+ t.Cleanup(func() {
+ execStatements(t, []string{"drop table if exists " + qualifiedTableName})
+ })
+
+ realDB := &realDBClient{nolog: true}
+ require.NoError(t, realDB.Connect())
+ t.Cleanup(realDB.Close)
+
+ vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+ vp.dbClient = vp.vr.dbClient
+ vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld}
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+ vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}}
+
+ colInfoMap, err := vp.vr.buildColInfoMap(ctx)
+ require.NoError(t, err)
+ vp.vr.colInfoMap = colInfoMap
+
+ vp.replicatorPlan, err = vp.vr.buildReplicatorPlan(
+ vp.vr.source,
+ vp.vr.colInfoMap,
+ nil,
+ vp.vr.stats,
+ vp.vr.vre.env.CollationEnv(),
+ vp.vr.vre.env.Parser(),
+ )
+ require.NoError(t, err)
+
+ fieldEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_FIELD,
+ FieldEvent: &binlogdatapb.FieldEvent{
+ TableName: tableName,
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT32}},
+ },
+ }
+ require.NoError(t, vp.applyEvent(ctx, fieldEvent, false))
+ require.NoError(t, vp.dbClient.Rollback())
+
+ vp.postDDLDroppedTables = map[string]struct{}{tableName: {}}
+ vp2 := *vp
+ vp2.postDDLDroppedTables = cloneDroppedTables(vp.postDDLDroppedTables)
+
+ require.NoError(t, vp2.applyEvent(ctx, fieldEvent, false))
+ require.NoError(t, vp.dbClient.Rollback())
+ require.Contains(t, vp.postDDLDroppedTables, tableName)
+
+ scheduler := newApplyScheduler(ctx)
+ payload := acquireApplyTxnPayload()
+ payload.pos = vp.pos
+ payload.timestamp = 123
+ payload.events = []*binlogdatapb.VEvent{fieldEvent}
+ payload.query = func(context.Context, string) (*sqltypes.Result, error) {
+ return &sqltypes.Result{}, nil
+ }
+ payload.commit = func() error { return nil }
+ payload.client = vp.dbClient
+ txn := acquireApplyTxn()
+ txn.order = 1
+ txn.payload = payload
+ defer releaseApplyTxn(txn)
+
+ commitCh := make(chan *applyTxn, 1)
+ commitCh <- txn
+ close(commitCh)
+ require.NoError(t, vp.commitLoop(ctx, scheduler, commitCh))
+ assert.NotContains(t, vp.postDDLDroppedTables, tableName)
+}
+
+func TestApplyEvent_FIELDRefreshTargetInvalidatesUniqueSecondaryCache(t *testing.T) {
+ ctx := testCtx(t)
+ vp, _ := testVPlayer(t)
+ vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+
+ tableName := "parallel_apply_field_refresh_target_unique_secondary_idx"
+ qualifiedTableName := vrepldb + "." + tableName
+ execStatements(t, []string{
+ "create table " + qualifiedTableName + " (id int not null, email varchar(128) not null, primary key(id), unique key uk_email(email))",
+ })
+ t.Cleanup(func() {
+ execStatements(t, []string{"drop table if exists " + qualifiedTableName})
+ })
+
+ realDB := &realDBClient{nolog: true}
+ require.NoError(t, realDB.Connect())
+ t.Cleanup(realDB.Close)
+
+ vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+ vp.dbClient = vp.vr.dbClient
+ vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld}
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+ vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}}
+
+ colInfoMap, err := vp.vr.buildColInfoMap(ctx)
+ require.NoError(t, err)
+ vp.vr.colInfoMap = colInfoMap
+
+ vp.replicatorPlan, err = vp.vr.buildReplicatorPlan(
+ vp.vr.source,
+ vp.vr.colInfoMap,
+ nil,
+ vp.vr.stats,
+ vp.vr.vre.env.CollationEnv(),
+ vp.vr.vre.env.Parser(),
+ )
+ require.NoError(t, err)
+
+ cachedPlan := &TablePlan{
+ TargetName: tableName,
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT32}, {Name: "email", Type: querypb.Type_VARCHAR}},
+ PKIndices: []bool{true, false},
+ HasExtraUniqueSecondary: false,
+ }
+ vp.tablePlans[tableName] = cachedPlan
+ vp.postDDLStalePlans = map[string]postDDLStalePlan{
+ "old_" + tableName: {
+ stalePlan: &TablePlan{TargetName: "old_" + tableName},
+ refreshedPlans: map[string]*TablePlan{tableName: cachedPlan},
+ },
+ }
+
+ require.NoError(t, vp.applyEvent(ctx, &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_FIELD,
+ FieldEvent: &binlogdatapb.FieldEvent{
+ TableName: tableName,
+ Fields: []*querypb.Field{
+ {Name: "id", Type: querypb.Type_INT32},
+ {Name: "email", Type: querypb.Type_VARCHAR},
+ },
+ },
+ }, false))
+ require.NoError(t, vp.dbClient.Rollback())
+
+ // The refresh-target barrier invalidated the cached plan, so the FIELD
+ // handler re-ran the unique-key analysis: the plain unique secondary
+ // emits a writeset unique key rather than force-serializing.
+ require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary)
+ require.Equal(t, [][]string{{"email"}}, vp.tablePlans[tableName].UniqueKeyColumns)
+}
+
+func TestApplyEvent_FIELDRefreshTargetInvalidatesUniqueSecondaryCacheAcrossMultipleBarriers(t *testing.T) {
+ ctx := testCtx(t)
+ vp, _ := testVPlayer(t)
+ vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+
+ tableName := "pa_field_refresh_multi_barrier_uniq_idx"
+ qualifiedTableName := vrepldb + "." + tableName
+ execStatements(t, []string{
+ "create table " + qualifiedTableName + " (id int not null, email varchar(128) not null, primary key(id), unique key uk_email(email))",
+ })
+ t.Cleanup(func() {
+ execStatements(t, []string{"drop table if exists " + qualifiedTableName})
+ })
+
+ realDB := &realDBClient{nolog: true}
+ require.NoError(t, realDB.Connect())
+ t.Cleanup(realDB.Close)
+
+ vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+ vp.dbClient = vp.vr.dbClient
+ vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld}
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+ vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}}
+
+ colInfoMap, err := vp.vr.buildColInfoMap(ctx)
+ require.NoError(t, err)
+ vp.vr.colInfoMap = colInfoMap
+
+ vp.replicatorPlan, err = vp.vr.buildReplicatorPlan(
+ vp.vr.source,
+ vp.vr.colInfoMap,
+ nil,
+ vp.vr.stats,
+ vp.vr.vre.env.CollationEnv(),
+ vp.vr.vre.env.Parser(),
+ )
+ require.NoError(t, err)
+
+ cachedPlan := &TablePlan{
+ TargetName: tableName,
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT32}, {Name: "email", Type: querypb.Type_VARCHAR}},
+ PKIndices: []bool{true, false},
+ HasExtraUniqueSecondary: false,
+ }
+ vp.tablePlans[tableName] = cachedPlan
+ vp.postDDLStalePlans = map[string]postDDLStalePlan{
+ "old_" + tableName: {
+ stalePlan: &TablePlan{TargetName: "old_" + tableName},
+ refreshedPlans: map[string]*TablePlan{tableName: {TargetName: tableName}},
+ },
+ "other_old_" + tableName: {
+ stalePlan: cachedPlan,
+ refreshedPlans: map[string]*TablePlan{tableName: cachedPlan},
+ },
+ }
+
+ require.NoError(t, vp.applyEvent(ctx, &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_FIELD,
+ FieldEvent: &binlogdatapb.FieldEvent{
+ TableName: tableName,
+ Fields: []*querypb.Field{
+ {Name: "id", Type: querypb.Type_INT32},
+ {Name: "email", Type: querypb.Type_VARCHAR},
+ },
+ },
+ }, false))
+ require.NoError(t, vp.dbClient.Rollback())
+
+ // Across multiple barriers the cache was invalidated and the FIELD handler
+ // re-ran the unique-key analysis: the plain unique secondary emits a
+ // writeset unique key rather than force-serializing.
+ require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary)
+ require.Equal(t, [][]string{{"email"}}, vp.tablePlans[tableName].UniqueKeyColumns)
+}
+
+// TestApplyEvent_FIELDWithoutParallelApplySkipsUniqueSecondaryLookup applies a
+// FIELD event for t1 in two configurations and expects, both times, a
+// registered plan with no unique-secondary information:
+//
+//   - ParallelReplicationWorkers = 1;
+//   - ParallelReplicationWorkers = 4 while vp.copyState has an entry for t1.
+func TestApplyEvent_FIELDWithoutParallelApplySkipsUniqueSecondaryLookup(t *testing.T) {
+	vp, mockDB := testVPlayer(t)
+	ctx := testCtx(t)
+	vp.vr.workflowConfig.ParallelReplicationWorkers = 1
+	mockDB.AddInvariant("begin", &sqltypes.Result{})
+
+	vp.replicatorPlan = &ReplicatorPlan{
+		TablePlans: map[string]*TablePlan{
+			"t1": {
+				TargetName:       "t1",
+				IdentityColumns:  []string{"id"},
+				Insert:           sqlparser.BuildParsedQuery("insert into t1 values (:a)"),
+				TablePlanBuilder: &tablePlanBuilder{},
+			},
+		},
+	}
+
+	// newFieldEvent builds a fresh FIELD event for t1 on every call so each
+	// applyEvent invocation receives its own event value.
+	newFieldEvent := func() *binlogdatapb.VEvent {
+		return &binlogdatapb.VEvent{
+			Type: binlogdatapb.VEventType_FIELD,
+			FieldEvent: &binlogdatapb.FieldEvent{
+				TableName: "t1",
+				Fields:    []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+			},
+		}
+	}
+
+	require.NoError(t, vp.applyEvent(ctx, newFieldEvent(), false))
+
+	require.Contains(t, vp.tablePlans, "t1")
+	require.False(t, vp.tablePlans["t1"].HasExtraUniqueSecondary)
+	require.Nil(t, vp.tablePlans["t1"].UniqueKeyColumns)
+
+	// During the copy phase (catchup/fastforward run the SERIAL applier and
+	// this vplayer's table plans die with it), the lookup must be skipped
+	// even when parallel workers are configured: the schema fetch is a
+	// wasted mysqld round-trip and a needless failure mode. vr.mysqld is
+	// nil here, so reaching the lookup would fail loudly.
+	vp.vr.workflowConfig.ParallelReplicationWorkers = 4
+	vp.copyState = map[string]*sqltypes.Result{"t1": nil}
+	delete(vp.tablePlans, "t1")
+
+	require.NoError(t, vp.applyEvent(ctx, newFieldEvent(), false))
+
+	require.Contains(t, vp.tablePlans, "t1")
+	require.False(t, vp.tablePlans["t1"].HasExtraUniqueSecondary)
+	require.Nil(t, vp.tablePlans["t1"].UniqueKeyColumns)
+}
+
+// TestApplyEvent_VERSIONIsIgnored applies a GTID event followed by a VERSION
+// event and checks that the VERSION event neither opens a transaction on the
+// client nor becomes vp.unsavedEvent. A subsequent COMMIT event must then be
+// the one recorded as vp.unsavedEvent.
+func TestApplyEvent_VERSIONIsIgnored(t *testing.T) {
+	vp, _ := testVPlayer(t)
+	ctx := testCtx(t)
+
+	gtidEvent := &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_GTID,
+		Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+	}
+	require.NoError(t, vp.applyEvent(ctx, gtidEvent, false))
+
+	version := &binlogdatapb.VEvent{
+		Type:      binlogdatapb.VEventType_VERSION,
+		Timestamp: 100,
+	}
+	require.NoError(t, vp.applyEvent(ctx, version, false))
+	require.False(t, vp.dbClient.InTransaction)
+	require.Nil(t, vp.unsavedEvent)
+
+	commit := &binlogdatapb.VEvent{
+		Type:      binlogdatapb.VEventType_COMMIT,
+		Timestamp: 100,
+	}
+	require.NoError(t, vp.applyEvent(ctx, commit, false))
+	require.Equal(t, commit, vp.unsavedEvent)
+}
+
+// TestApplyEvent_JournalDoesNotPersistPositionBeforeTransition guards against
+// a JOURNAL event durably advancing the saved position at registration time.
+// registerJournal returns nil as soon as THIS participant has joined (in this
+// test participant ks:1 has not), and the engine's journaler state lives only
+// in memory. Were the position saved past the journal and the tablet restarted
+// before every participant joined, the stream would resume past the journal,
+// never re-register, and the workflow would hang forever waiting for a
+// transition that can no longer happen.
+func TestApplyEvent_JournalDoesNotPersistPositionBeforeTransition(t *testing.T) {
+	vp, _ := testVPlayer(t)
+	ctx := testCtx(t)
+
+	vp.replicatorPlan = &ReplicatorPlan{
+		TablePlans: map[string]*TablePlan{"t1": {TargetName: "t1"}},
+	}
+
+	beforeJournal, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5540")
+	require.NoError(t, err)
+	atJournal, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5541")
+	require.NoError(t, err)
+	vp.pos = beforeJournal
+	vp.stopPos = atJournal
+
+	// Route every statement through a recording client so persisted queries
+	// can be inspected afterwards.
+	recorder := &recordingDBClient{}
+	client := newVDBClient(recorder, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+	vp.vr.dbClient, vp.dbClient = client, client
+	vp.query = func(_ context.Context, sql string) (*sqltypes.Result, error) {
+		return client.Execute(sql)
+	}
+	vp.commit = client.Commit
+
+	// controllerFor builds a controller of workflow "wf" sourcing the given
+	// shard of keyspace "ks".
+	controllerFor := func(shard string) *controller {
+		return &controller{
+			workflow: "wf",
+			source:   &binlogdatapb.BinlogSource{Keyspace: "ks", Shard: shard},
+		}
+	}
+	vp.vr.vre = &Engine{
+		isOpen:    true,
+		journaler: make(map[string]*journalEvent),
+		controllers: map[int32]*controller{
+			vp.vr.id: controllerFor("0"),
+			2:        controllerFor("1"),
+		},
+	}
+
+	journalEvent := &binlogdatapb.VEvent{
+		Type:      binlogdatapb.VEventType_JOURNAL,
+		Timestamp: 100,
+		EventGtid: "3e11fa47-71ca-11e1-9e33-c80aa9429562:5541",
+		Journal: &binlogdatapb.Journal{
+			Id:            1,
+			MigrationType: binlogdatapb.MigrationType_TABLES,
+			Participants: []*binlogdatapb.KeyspaceShard{
+				{Keyspace: "ks", Shard: "0"},
+				{Keyspace: "ks", Shard: "1"},
+			},
+			Tables: []string{"t1"},
+		},
+	}
+	err = vp.applyEvent(ctx, journalEvent, true)
+	require.ErrorIs(t, err, io.EOF)
+
+	// This participant registered with the journaler...
+	require.Contains(t, vp.vr.vre.journaler, "wf:1")
+	// ...but the in-memory and durable positions must remain BEFORE the
+	// journal event so a restart re-delivers it and re-registers.
+	assert.Equal(t, beforeJournal, vp.pos)
+	for _, q := range recorder.queries {
+		assert.NotContains(t, q, "update _vt.vreplication set pos=", "no position may be persisted at journal registration; queries: %v", recorder.queries)
+	}
+}
+
+// TestVPlayerLagSnapshotIsAtomic guards the invariant that a lag-state read
+// never sees a torn (lastTimestampNs, timeOffsetNs) pair. Both values are
+// written together (commitLoop.updateLag and the heartbeat store path) and the
+// throttled-path lag estimator reads both. With independent atomics a reader
+// could observe (new ts, old offset) and report nonsense lag, so they are
+// packed into a single atomic snapshot.
+//
+// A writer goroutine stores pairs with offset = ts + sentinelOffset while a
+// reader goroutine loads snapshots and counts any pair that breaks that
+// relationship; snapshots whose timestamp is still zero are skipped.
+func TestVPlayerLagSnapshotIsAtomic(t *testing.T) {
+	vp, _ := testVPlayer(t)
+	const (
+		sentinelOffset = int64(0xDEADBEEF)
+		iterations     = 50_000
+	)
+
+	var (
+		writerWG   sync.WaitGroup
+		readerWG   sync.WaitGroup
+		mismatches atomic.Int64
+	)
+	stop := make(chan struct{})
+
+	// Writer: keep storing consistent pairs until told to stop.
+	writerWG.Go(func() {
+		ts := int64(1)
+		for {
+			select {
+			case <-stop:
+				return
+			default:
+				vp.storeLagSnapshot(ts, ts+sentinelOffset)
+				ts++
+			}
+		}
+	})
+
+	// Reader: a fixed number of loads, each checked against the invariant.
+	readerWG.Go(func() {
+		for range iterations {
+			snap := vp.loadLagSnapshot()
+			if snap.timestampNs != 0 && snap.offsetNs != snap.timestampNs+sentinelOffset {
+				mismatches.Add(1)
+			}
+		}
+	})
+
+	readerWG.Wait()
+	close(stop)
+	writerWG.Wait()
+
+	require.Equal(t, int64(0), mismatches.Load(), "lag snapshot reads must always observe a consistent ts/offset pair")
+}
+
+// TestScheduleItems_FIELDIncrementsPendingRefreshBeforeEnqueue pins the
+// ordering invariant: when scheduleItems processes a FIELD event, the
+// vp.pendingFieldRefreshTables increment must happen BEFORE the txn is
+// enqueued into the scheduler. Otherwise a worker can pick up the txn and
+// commitLoop's matching decrement loop can run with an empty map (no-op),
+// leaving the counter permanently stuck at 1 and force-serializing every
+// future ROW txn that touches this table.
+//
+// The test stubs the scheduler-dispatch path by reading the increment via a
+// nextReady call from the same goroutine immediately after scheduleItems
+// returns; the contract is that pendingFieldRefreshTables must be visible
+// before the txn becomes pickable. We additionally assert that on a
+// scheduler-closed enqueue error, the speculative increment is rolled back
+// so a transient teardown does not poison the next workflow restart.
+func TestScheduleItems_FIELDIncrementsPendingRefreshBeforeEnqueue(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ }
+ fieldEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_FIELD,
+ FieldEvent: &binlogdatapb.FieldEvent{
+ TableName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ },
+ Timestamp: 100,
+ }
+ commitEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT}
+
+ err := vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{gtidEvent, fieldEvent, commitEvent}})
+ require.NoError(t, err)
+
+ // As a worker would: pull the txn out of the scheduler. By the time the
+ // scheduler hands a worker any txn that carries a FIELD refresh, the
+ // pendingFieldRefreshTables count for that table MUST already be at
+ // least 1 — otherwise commitLoop's decrement could race past the
+ // increment and leave the counter permanently stuck.
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ require.NotNil(t, got)
+
+ vp.serialMu.Lock()
+ count := vp.pendingFieldRefreshTables["t1"]
+ vp.serialMu.Unlock()
+ require.GreaterOrEqual(t, count, 1, "pendingFieldRefreshTables[t1] must be incremented before the txn is dispatched to a worker")
+}
+
+func TestScheduleItems_FIELDRollsBackPendingRefreshOnEnqueueError(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ // Close the scheduler so any subsequent enqueue returns io.EOF.
+ require.ErrorIs(t, scheduler.close(), io.EOF)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ }
+ fieldEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_FIELD,
+ FieldEvent: &binlogdatapb.FieldEvent{
+ TableName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ },
+ Timestamp: 100,
+ }
+ commitEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT}
+
+ err := vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{gtidEvent, fieldEvent, commitEvent}})
+ require.ErrorIs(t, err, io.EOF)
+
+ vp.serialMu.Lock()
+ count := vp.pendingFieldRefreshTables["t1"]
+ vp.serialMu.Unlock()
+ require.Equal(t, 0, count, "pendingFieldRefreshTables[t1] must roll back to 0 when enqueue fails")
+}
+
+func TestScheduleItems_FIELDEventDoesNotForceGlobal(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ vp.tablePlans["t1"] = &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true},
+ }
+ vp.tablePlansVersion.Store(1)
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ }
+ // FIELD events are metadata (table definitions). They should NOT force
+ // global serialization — they are harmless for conflict detection and
+ // just need to be applied before the ROW events that follow.
+ fieldEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_FIELD,
+ Timestamp: 100,
+ }
+ commitEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_COMMIT,
+ }
+
+ items := [][]*binlogdatapb.VEvent{{gtidEvent, fieldEvent, commitEvent}}
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.NoError(t, err)
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ // FIELD events have an explicit handler that does NOT set curRowOnly=false,
+ // so the transaction is scheduled normally with an empty writeset (noConflict).
+ assert.False(t, got.forceGlobal)
+}
+
+func TestScheduleItems_ROWSQUERYEventDoesNotForceGlobal(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ vp.tablePlans["t1"] = &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true},
+ }
+ vp.tablePlansVersion.Store(1)
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ }
+ rowsQueryEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_ROWS_QUERY,
+ Statement: "insert into t1 values (1)",
+ Timestamp: 100,
+ }
+ rowEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_ROW,
+ RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{
+ {After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}},
+ },
+ },
+ Timestamp: 100,
+ }
+ commitEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT}
+
+ err := vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{gtidEvent, rowsQueryEvent, rowEvent, commitEvent}})
+ require.NoError(t, err)
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.False(t, got.forceGlobal)
+ require.Len(t, got.payload.events, 2)
+ assert.Equal(t, binlogdatapb.VEventType_ROWS_QUERY, got.payload.events[0].Type)
+ assert.Equal(t, binlogdatapb.VEventType_ROW, got.payload.events[1].Type)
+ require.Len(t, got.writeset, 1)
+}
+
+func TestScheduleItems_UnknownVEventTypeFailsFast(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ err := vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType(12345)},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }})
+ require.Error(t, err)
+ require.Contains(t, err.Error(), "unsupported vevent type")
+}
+
+func TestScheduleItems_InsertStatementEventDoesNotFailFast(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ err := vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_INSERT, Dml: "insert into t1(id) values (1)", Timestamp: 100},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }})
+ require.NoError(t, err)
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.True(t, got.forceGlobal)
+ require.Len(t, got.payload.events, 1)
+ assert.Equal(t, binlogdatapb.VEventType_INSERT, got.payload.events[0].Type)
+}
+
+func TestScheduleItems_TimestampTracking(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ vp.tablePlans["t1"] = &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true},
+ }
+ vp.tablePlansVersion.Store(1)
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ Timestamp: 50,
+ }
+ rowEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_ROW,
+ RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{
+ {After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}},
+ },
+ },
+ Timestamp: 100,
+ }
+ commitEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_COMMIT,
+ }
+
+ items := [][]*binlogdatapb.VEvent{{gtidEvent, rowEvent, commitEvent}}
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.NoError(t, err)
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ // Timestamp from the ROW event should be tracked
+ assert.Equal(t, int64(100), got.payload.timestamp)
+}
+
+func TestScheduleItems_WritesetBuild(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ vp.tablePlans["t1"] = &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true},
+ }
+ vp.tablePlansVersion.Store(1)
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ }
+ rowEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_ROW,
+ RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{
+ {After: &querypb.Row{Values: []byte("42"), Lengths: []int64{2}}},
+ },
+ },
+ Timestamp: 100,
+ }
+ commitEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_COMMIT,
+ }
+
+ items := [][]*binlogdatapb.VEvent{{gtidEvent, rowEvent, commitEvent}}
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.NoError(t, err)
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.False(t, got.forceGlobal)
+ assert.Contains(t, got.payload.events[0].RowEvent.TableName, "t1")
+ // Writeset should contain PK-based key
+ require.Len(t, got.writeset, 1)
+ expected := testWritesetHash("t1", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("42")))
+ assert.Equal(t, expected, got.writeset[0])
+}
+
+func TestScheduleItems_MissingTablePlanReturnsWritesetError(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ // No table plan for "t1" — writeset build should fail closed.
+ vp.tablePlansVersion.Store(1)
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ }
+ rowEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_ROW,
+ RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{
+ {After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}},
+ },
+ },
+ Timestamp: 100,
+ }
+ commitEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_COMMIT,
+ }
+
+ items := [][]*binlogdatapb.VEvent{{gtidEvent, rowEvent, commitEvent}}
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.Error(t, err)
+ assert.Equal(t, vtrpcpb.Code_FAILED_PRECONDITION, vterrors.Code(err))
+ assert.Contains(t, err.Error(), "missing table plan for t1")
+}
+
+func TestScheduleItems_FieldThenRowWithoutCachedPlanForcesGlobal(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ vp.tablePlansVersion.Store(1)
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ }
+ fieldEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_FIELD,
+ FieldEvent: &binlogdatapb.FieldEvent{
+ TableName: "t1",
+ Fields: []*querypb.Field{
+ {Name: "id", Type: querypb.Type_INT64},
+ },
+ },
+ Timestamp: 100,
+ }
+ rowEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_ROW,
+ RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{
+ After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}},
+ }},
+ },
+ Timestamp: 100,
+ }
+ commitEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT}
+
+ err := vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{gtidEvent, fieldEvent, rowEvent, commitEvent}})
+ require.NoError(t, err)
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.True(t, got.forceGlobal)
+ require.Len(t, got.payload.events, 2)
+ assert.Equal(t, binlogdatapb.VEventType_FIELD, got.payload.events[0].Type)
+ assert.Equal(t, binlogdatapb.VEventType_ROW, got.payload.events[1].Type)
+ assert.Nil(t, got.writeset)
+}
+
+// TestScheduleItems_RowAfterPendingFieldRefreshForKnownTableForcesGlobal
+// schedules a FIELD-only transaction for "customer" (a table present in the
+// replicator plan), takes it from nextReady without committing it, and then
+// schedules a ROW transaction for the same table. It checks that the ROW
+// transaction sits in the scheduler's pending set as forceGlobal with a nil
+// writeset, and that it becomes ready only after the FIELD transaction is
+// marked committed.
+func TestScheduleItems_RowAfterPendingFieldRefreshForKnownTableForcesGlobal(t *testing.T) {
+	vp, _ := testVPlayer(t)
+	ctx := testCtx(t)
+	scheduler := newApplyScheduler(ctx)
+	state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+	vp.replicatorPlan = &ReplicatorPlan{TablePlans: map[string]*TablePlan{
+		"customer": {
+			TargetName: "customer",
+			SendRule:   &binlogdatapb.Rule{Match: "customer", Filter: "select * from customer"},
+		},
+	}}
+
+	fieldTxn := [][]*binlogdatapb.VEvent{{
+		{
+			Type:           binlogdatapb.VEventType_GTID,
+			Gtid:           "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+			SequenceNumber: 5,
+			CommitParent:   4,
+		},
+		{
+			Type: binlogdatapb.VEventType_FIELD,
+			FieldEvent: &binlogdatapb.FieldEvent{
+				TableName: "customer",
+				Fields: []*querypb.Field{
+					{Name: "cid", Type: querypb.Type_INT64},
+					{Name: "name", Type: querypb.Type_VARCHAR},
+				},
+			},
+			Timestamp: 100,
+		},
+		{Type: binlogdatapb.VEventType_COMMIT},
+	}}
+	require.NoError(t, vp.scheduleItems(ctx, scheduler, state, fieldTxn))
+
+	fieldReady, err := scheduler.nextReady(ctx)
+	require.NoError(t, err)
+	assert.False(t, fieldReady.forceGlobal)
+	require.Len(t, fieldReady.payload.events, 1)
+	assert.Equal(t, binlogdatapb.VEventType_FIELD, fieldReady.payload.events[0].Type)
+
+	rowTxn := [][]*binlogdatapb.VEvent{{
+		{
+			Type:           binlogdatapb.VEventType_GTID,
+			Gtid:           "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6",
+			SequenceNumber: 6,
+			CommitParent:   5,
+		},
+		{
+			Type: binlogdatapb.VEventType_ROW,
+			RowEvent: &binlogdatapb.RowEvent{
+				TableName: "customer",
+				RowChanges: []*binlogdatapb.RowChange{{
+					After: &querypb.Row{Values: []byte("1alice"), Lengths: []int64{1, 5}},
+				}},
+			},
+			Timestamp: 101,
+		},
+		{Type: binlogdatapb.VEventType_COMMIT},
+	}}
+	require.NoError(t, vp.scheduleItems(ctx, scheduler, state, rowTxn))
+
+	// Snapshot scheduler state under the lock and assert only after releasing
+	// it: a failing require calls t.FailNow, which would otherwise exit the
+	// test goroutine with scheduler.mu still held.
+	scheduler.mu.Lock()
+	pendingCount := scheduler.pendingCount
+	var queued *applyTxn
+	for _, pending := range scheduler.pending {
+		if pending != nil {
+			queued = pending
+			break
+		}
+	}
+	scheduler.mu.Unlock()
+	require.Equal(t, 1, pendingCount)
+	require.NotNil(t, queued)
+	assert.True(t, queued.forceGlobal)
+	require.Len(t, queued.payload.events, 1)
+	assert.Equal(t, binlogdatapb.VEventType_ROW, queued.payload.events[0].Type)
+	assert.Nil(t, queued.writeset)
+
+	// Committing the FIELD transaction must release the queued ROW transaction.
+	require.NoError(t, scheduler.markCommitted(fieldReady))
+	ready, err := scheduler.nextReady(ctx)
+	require.NoError(t, err)
+	assert.Same(t, queued, ready)
+	require.NoError(t, scheduler.markCommitted(ready))
+}
+
+func TestScheduleItems_PartialRowImageFallsBackToSerializedApply(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ vp.tablePlans["t1"] = &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{
+ {Name: "a", Type: querypb.Type_INT64},
+ {Name: "id", Type: querypb.Type_INT64},
+ {Name: "b", Type: querypb.Type_INT64},
+ },
+ PKIndices: []bool{false, true, false},
+ }
+ vp.tablePlansVersion.Store(1)
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ }
+ rowEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_ROW,
+ RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{
+ After: &querypb.Row{Values: []byte("23"), Lengths: []int64{1, 1}},
+ DataColumns: &binlogdatapb.RowChange_Bitmap{
+ Count: 3,
+ Cols: []byte{0x06},
+ },
+ }},
+ },
+ Timestamp: 100,
+ }
+ commitEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_COMMIT,
+ }
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{gtidEvent, rowEvent, commitEvent}}))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.True(t, got.forceGlobal)
+ assert.Empty(t, got.writeset)
+
+ scheduler.mu.Lock()
+ defer scheduler.mu.Unlock()
+ assert.Zero(t, scheduler.pendingCount)
+ assert.Equal(t, 1, scheduler.inflightGlobal)
+ assert.Zero(t, scheduler.inflightMissingMeta)
+ assert.Zero(t, scheduler.inflightCommitMeta)
+ assert.Empty(t, scheduler.pending)
+ assert.Empty(t, scheduler.inflightWriteset)
+}
+
+func TestScheduleItems_MissingFKColumnFallsBackToSerializedApply(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ vp.tablePlans["child"] = &TablePlan{
+ TargetName: "child",
+ Fields: []*querypb.Field{
+ {Name: "id", Type: querypb.Type_INT64},
+ },
+ PKIndices: []bool{true},
+ }
+ vp.tablePlansVersion.Store(1)
+ vp.fkRefs = map[string][]fkConstraintRef{
+ "child": {{ParentTable: "parent", ChildColumnNames: []string{"parent_id"}, ReferencedColumnNames: []string{"id"}}},
+ }
+ vp.parentFKRefs = buildParentFKRefs(vp.fkRefs)
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "child",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ }, Timestamp: 100},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.True(t, got.forceGlobal)
+ assert.Empty(t, got.writeset)
+
+ scheduler.mu.Lock()
+ defer scheduler.mu.Unlock()
+ assert.Zero(t, scheduler.pendingCount)
+ assert.Equal(t, 1, scheduler.inflightGlobal)
+ assert.Zero(t, scheduler.inflightMissingMeta)
+ assert.Zero(t, scheduler.inflightCommitMeta)
+ assert.Empty(t, scheduler.pending)
+ assert.Empty(t, scheduler.inflightWriteset)
+}
+
+func TestScheduleItems_CommitMeta(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ vp.tablePlans["t1"] = &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true},
+ }
+ vp.tablePlansVersion.Store(1)
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ SequenceNumber: 10,
+ CommitParent: 9,
+ }
+ rowEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_ROW,
+ RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ },
+ Timestamp: 100,
+ }
+ commitEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_COMMIT,
+ }
+
+ items := [][]*binlogdatapb.VEvent{{gtidEvent, rowEvent, commitEvent}}
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.NoError(t, err)
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.True(t, got.hasCommitMeta)
+ assert.Equal(t, int64(10), got.sequenceNumber)
+ assert.Equal(t, int64(9), got.commitParent)
+}
+
+func TestScheduleItems_DDLCommitOnlyPreservesCommitMetaFromGTID(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ SequenceNumber: 10,
+ CommitParent: 9,
+ }
+ ddlEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_DDL,
+ Timestamp: 200,
+ }
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, [][]*binlogdatapb.VEvent{{gtidEvent, ddlEvent}}))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.True(t, got.payload.commitOnly)
+ assert.True(t, got.hasCommitMeta)
+ assert.Equal(t, int64(10), got.sequenceNumber)
+ assert.Equal(t, int64(9), got.commitParent)
+}
+
+func TestScheduleItems_BatchingMixedCommitMetaStaysMissingMeta(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+ state.maxBatchedCommits = 2
+
+ vp.tablePlans["t1"] = &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true},
+ }
+ vp.tablePlansVersion.Store(1)
+
+ items := [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ }, Timestamp: 100},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6", SequenceNumber: 11, CommitParent: 10},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("2"), Lengths: []int64{1}}}},
+ }, Timestamp: 200},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, items))
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ require.NotNil(t, got)
+ assert.False(t, got.hasCommitMeta)
+ assert.Zero(t, got.sequenceNumber)
+ assert.Zero(t, got.commitParent)
+ assert.Len(t, got.payload.events, 2)
+ assert.NotNil(t, got.writeset)
+ require.NoError(t, scheduler.markCommitted(got))
+ scheduler.mu.Lock()
+ assert.Equal(t, int64(11), scheduler.lastCommittedSequence)
+ scheduler.mu.Unlock()
+}
+
+func TestScheduleItems_HeartbeatSetsMustSave(t *testing.T) {
+ vp, mockDB := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ vp.vr.workflowConfig.HeartbeatUpdateInterval = math.MaxInt
+
+ vp.numAccumulatedHeartbeats = 1
+
+ vp.tablePlans["t1"] = &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true},
+ }
+ vp.tablePlansVersion.Store(1)
+
+ // recordHeartbeat() calls vr.stats.RecordHeartbeat (no DB) then
+ // mustUpdateHeartbeat() → false (numAccumulatedHeartbeats=0), so no DB call.
+ _ = mockDB
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ }
+ rowEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_ROW,
+ RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ },
+ Timestamp: 100,
+ }
+ heartbeatEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_HEARTBEAT,
+ Timestamp: 200,
+ }
+ commitEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_COMMIT,
+ }
+
+ // GTID, ROW, HEARTBEAT, COMMIT — heartbeat should set curMustSave
+ // because there are accumulated events when heartbeat arrives
+ items := [][]*binlogdatapb.VEvent{{gtidEvent, rowEvent, heartbeatEvent, commitEvent}}
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.NoError(t, err)
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ require.NotNil(t, got)
+ // The heartbeat forced curMustSave=true, which means the transaction was flushed
+ // even if batching would otherwise accumulate it
+ assert.Equal(t, int64(1), got.order)
+}
+
+func TestScheduleItems_BatchingSkipsFlushWhenAnotherCommitAhead(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ vp.tablePlans["t1"] = &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true},
+ }
+ vp.tablePlansVersion.Store(1)
+
+ // Two transactions in same batch — first COMMIT should be skipped (batched)
+ // since another COMMIT follows
+ items := [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ }, Timestamp: 100},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ // Second transaction in same batch
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("2"), Lengths: []int64{1}}}},
+ }, Timestamp: 200},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}
+
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.NoError(t, err)
+
+ // With batching, both transactions merge into one — only one enqueue
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ require.NotNil(t, got)
+
+ // The batched transaction should have both ROW events
+ assert.Len(t, got.payload.events, 2)
+ assert.Equal(t, int64(1), got.order)
+}
+
+func TestScheduleItems_FKRefsDisableBatching(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ vp.tablePlans["t1"] = &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{
+ {Name: "id", Type: querypb.Type_INT64},
+ {Name: "parent_id", Type: querypb.Type_INT64},
+ },
+ PKIndices: []bool{true, false},
+ }
+ vp.tablePlansVersion.Store(1)
+
+ // Set FK refs — this should disable batching
+ vp.fkRefs = map[string][]fkConstraintRef{
+ "t1": {{ParentTable: "parent", ChildColumnNames: []string{"parent_id"}}},
+ }
+
+ // Two transactions in same batch
+ items := [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("112"), Lengths: []int64{1, 2}}}},
+ }, Timestamp: 100},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("223"), Lengths: []int64{1, 2}}}},
+ }, Timestamp: 200},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}
+
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.NoError(t, err)
+
+ // With FK refs, batching is disabled — two separate transactions
+ got1, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ require.NoError(t, scheduler.markCommitted(got1))
+ got2, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+
+ assert.Len(t, got1.payload.events, 1)
+ assert.Len(t, got2.payload.events, 1)
+ assert.Equal(t, int64(1), got1.order)
+ assert.Equal(t, int64(2), got2.order)
+}
+
+func TestScheduleItems_FKRefsDisableBatchingForRenamedTable(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ vp.tablePlans["child_src"] = &TablePlan{
+ TargetName: "child",
+ Fields: []*querypb.Field{
+ {Name: "id", Type: querypb.Type_INT64},
+ {Name: "parent_id", Type: querypb.Type_INT64},
+ },
+ PKIndices: []bool{true, false},
+ }
+ vp.tablePlansVersion.Store(1)
+
+ vp.fkRefs = map[string][]fkConstraintRef{
+ "child": {{ParentTable: "parent", ChildColumnNames: []string{"parent_id"}}},
+ }
+ vp.parentFKRefs = buildParentFKRefs(vp.fkRefs)
+
+ items := [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "child_src",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("112"), Lengths: []int64{1, 2}}}},
+ }, Timestamp: 100},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "child_src",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("223"), Lengths: []int64{1, 2}}}},
+ }, Timestamp: 200},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}
+
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.NoError(t, err)
+
+ got1, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ require.Len(t, got1.payload.events, 1)
+ require.Equal(t, int64(1), got1.order)
+ scheduler.mu.Lock()
+ pendingCount := scheduler.pendingCount
+ scheduler.mu.Unlock()
+ require.Equal(t, 1, pendingCount)
+ require.NoError(t, scheduler.markCommitted(got1))
+ got2, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+
+ assert.Len(t, got2.payload.events, 1)
+ assert.Equal(t, int64(2), got2.order)
+}
+
+func TestScheduleItems_FKRefsDisableBatchingForMixedCaseTargetTable(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ vp.tablePlans["child_src"] = &TablePlan{
+ TargetName: "Child",
+ Fields: []*querypb.Field{
+ {Name: "id", Type: querypb.Type_INT64},
+ {Name: "parent_id", Type: querypb.Type_INT64},
+ },
+ PKIndices: []bool{true, false},
+ }
+ vp.tablePlansVersion.Store(1)
+
+ vp.fkRefs = map[string][]fkConstraintRef{
+ "child": {{ParentTable: "parent", ChildColumnNames: []string{"parent_id"}}},
+ }
+ vp.parentFKRefs = buildParentFKRefs(vp.fkRefs)
+
+ items := [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "child_src",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("112"), Lengths: []int64{1, 2}}}},
+ }, Timestamp: 100},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "child_src",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("223"), Lengths: []int64{1, 2}}}},
+ }, Timestamp: 200},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}
+
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.NoError(t, err)
+
+ got1, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ require.Len(t, got1.payload.events, 1)
+ require.Equal(t, int64(1), got1.order)
+ scheduler.mu.Lock()
+ pendingCount := scheduler.pendingCount
+ scheduler.mu.Unlock()
+ require.Equal(t, 1, pendingCount)
+ require.NoError(t, scheduler.markCommitted(got1))
+ got2, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+
+ assert.Len(t, got2.payload.events, 1)
+ assert.Equal(t, int64(2), got2.order)
+}
+
+func TestScheduleItems_BatchingMergedSequenceAdvanced(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ vp.tablePlans["t1"] = &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true},
+ }
+ vp.tablePlansVersion.Store(1)
+
+ // Pre-advance the watermark so enqueue's idle-seeding path (which seeds
+ // lastCommittedSequence from the enqueued txn's commitParent) cannot
+ // mask the mergedSequences behavior this test pins.
+ scheduler.advanceCommittedSequence(9)
+
+ // Two transactions with commit meta — the first gets merged into the
+ // second's batch, so its sequence (10) must ride along in mergedSequences
+ // and publish only when the batch commits.
+ items := [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5", SequenceNumber: 10, CommitParent: 9},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ }, Timestamp: 100},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ // Second txn
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6", SequenceNumber: 11, CommitParent: 10},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("2"), Lengths: []int64{1}}}},
+ }, Timestamp: 200},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}
+
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.NoError(t, err)
+
+ // The merged-away sequence must NOT be visible yet: publishing it at
+ // enqueue time would let an empty-writeset dependent with commitParent=10
+ // run before the batch containing sequence 10 has actually committed.
+ scheduler.mu.Lock()
+ seq := scheduler.lastCommittedSequence
+ scheduler.mu.Unlock()
+ assert.Equal(t, int64(9), seq)
+
+ // Both source transactions were batched into a single txn that carries
+ // the surviving commit meta (sequence 11) plus the merged-away sequence.
+ txn, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ require.Equal(t, int64(1), txn.order)
+ assert.Equal(t, int64(11), txn.sequenceNumber)
+ assert.Len(t, txn.payload.events, 2)
+ require.Equal(t, []int64{10}, txn.mergedSequences)
+
+ // Committing the batch publishes both its own and the merged sequence.
+ require.NoError(t, scheduler.markCommitted(txn))
+ scheduler.mu.Lock()
+ seq = scheduler.lastCommittedSequence
+ scheduler.mu.Unlock()
+ assert.Equal(t, int64(11), seq)
+}
+
+func TestScheduleItems_StopPosSetsMustSave(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ vp.tablePlans["t1"] = &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true},
+ }
+ vp.tablePlansVersion.Store(1)
+
+ // Set a stop position
+ stopPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+ require.NoError(t, err)
+ vp.stopPos = stopPos
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-10", // at or past stopPos
+ }
+ rowEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_ROW,
+ RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ },
+ Timestamp: 100,
+ }
+ commitEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_COMMIT,
+ }
+
+ items := [][]*binlogdatapb.VEvent{{gtidEvent, rowEvent, commitEvent}}
+ err = vp.scheduleItems(ctx, scheduler, state, items)
+ require.ErrorIs(t, err, io.EOF)
+
+ got, gerr := scheduler.nextReady(ctx)
+ require.NoError(t, gerr)
+ require.NotNil(t, got)
+ assert.True(t, got.payload.mustSave)
+}
+
+func TestScheduleItems_StopPosStopsSchedulingLaterTransactionsInSameFetch(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ vp.tablePlans["t1"] = &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true},
+ }
+ vp.tablePlansVersion.Store(1)
+
+ stopPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+ require.NoError(t, err)
+ vp.stopPos = stopPos
+
+ items := [][]*binlogdatapb.VEvent{
+ {
+ {Type: binlogdatapb.VEventType_GTID, Gtid: replication.EncodePosition(stopPos)},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ }, Timestamp: 100},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ },
+ {
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("2"), Lengths: []int64{1}}}},
+ }, Timestamp: 200},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ },
+ }
+
+ err = vp.scheduleItems(ctx, scheduler, state, items)
+ require.ErrorIs(t, err, io.EOF)
+
+ scheduler.mu.Lock()
+ defer scheduler.mu.Unlock()
+ assert.Equal(t, 1, scheduler.pendingCount)
+}
+
+func TestScheduleItems_HeartbeatUpdatesLag(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ vp.vr.workflowConfig.HeartbeatUpdateInterval = math.MaxInt
+
+ vp.numAccumulatedHeartbeats = 1
+
+ hbEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_HEARTBEAT,
+ Timestamp: 100,
+ CurrentTime: time.Now().UnixNano(),
+ }
+
+ items := [][]*binlogdatapb.VEvent{{hbEvent}}
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.NoError(t, err)
+
+ assert.Equal(t, int64(100*1e9), vp.loadLagSnapshot().timestampNs)
+ assert.Equal(t, 2, vp.numAccumulatedHeartbeats)
+}
+
+func TestScheduleItems_ThrottledHeartbeatEstimatesLag(t *testing.T) {
+ vp, mockDB := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ vp.vr.workflowConfig.HeartbeatUpdateInterval = math.MaxInt
+ vp.vr.throttleUpdatesRateLimiter = timer.NewRateLimiter(time.Second)
+ t.Cleanup(vp.vr.throttleUpdatesRateLimiter.Stop)
+
+ vp.numAccumulatedHeartbeats = 1
+
+ // Set last known timestamp so estimateLag works
+ vp.storeLagSnapshot(time.Now().Add(-5*time.Second).UnixNano(), 0)
+
+ // updateTimeThrottled calls dbClient.ExecuteFetch
+ mockDB.AddInvariant("update _vt.vreplication set", &sqltypes.Result{})
+
+ hbEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_HEARTBEAT,
+ Timestamp: 100,
+ CurrentTime: time.Now().UnixNano(),
+ Throttled: true,
+ ThrottledReason: "test",
+ }
+
+ items := [][]*binlogdatapb.VEvent{{hbEvent}}
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.NoError(t, err)
+
+ // Lag should be estimated (non-zero)
+ lag := vp.vr.stats.ReplicationLagSeconds.Load()
+ assert.GreaterOrEqual(t, lag, int64(4))
+}
+
+func BenchmarkScheduleItems_FKBatchingCheckSkipsUnrelatedTables(b *testing.B) {
+ ctx := context.Background()
+ vp, _ := testVPlayer(&testing.T{})
+ vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+
+ vp.tablePlans["hot"] = &TablePlan{
+ TargetName: "hot",
+ Fields: []*querypb.Field{
+ {Name: "id", Type: querypb.Type_INT64},
+ },
+ PKIndices: []bool{true},
+ }
+ vp.tablePlans["child"] = &TablePlan{
+ TargetName: "child",
+ Fields: []*querypb.Field{
+ {Name: "id", Type: querypb.Type_INT64},
+ {Name: "parent_id", Type: querypb.Type_INT64},
+ },
+ PKIndices: []bool{true, false},
+ }
+ vp.tablePlansVersion.Store(1)
+ vp.fkRefs = map[string][]fkConstraintRef{
+ "child": {{ParentTable: "parent", ChildColumnNames: []string{"parent_id"}, ReferencedColumnNames: []string{"id"}}},
+ }
+ vp.parentFKRefs = buildParentFKRefs(vp.fkRefs)
+
+ items := make([][]*binlogdatapb.VEvent, 1)
+ batch := make([]*binlogdatapb.VEvent, 0, 96)
+ for range 32 {
+ batch = append(batch,
+ &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "hot",
+ RowChanges: []*binlogdatapb.RowChange{{
+ After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}},
+ }},
+ }},
+ &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT},
+ )
+ }
+ items[0] = batch
+
+ b.ReportAllocs()
+ b.ResetTimer()
+ for range b.N {
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ if err := vp.scheduleItems(ctx, scheduler, state, items); err != nil {
+ b.Fatal(err)
+ }
+ _ = scheduler.close()
+ }
+}
+
+// BenchmarkScheduleItems_WritesetFKResolutionForRepeatedTable measures
+// scheduleItems when 256 parent/child table pairs with foreign keys are
+// registered and every transaction in the batch writes to the same child
+// table ("child0").
+func BenchmarkScheduleItems_WritesetFKResolutionForRepeatedTable(b *testing.B) {
+	const (
+		tableCount = 256
+		txnCount   = 32
+	)
+
+	ctx := context.Background()
+	// NOTE(review): a zero-value testing.T is passed because testVPlayer is
+	// written for tests; any cleanup it registers on that T presumably never
+	// runs — confirm.
+	vp, _ := testVPlayer(&testing.T{})
+	vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+
+	// Register parentN/childN plans and a childN -> parentN foreign key for
+	// every N in [0, tableCount).
+	fkRefs := make(map[string][]fkConstraintRef, tableCount)
+	for i := range tableCount {
+		parentTable := fmt.Sprintf("parent%d", i)
+		childTable := fmt.Sprintf("child%d", i)
+		vp.tablePlans[parentTable] = &TablePlan{
+			TargetName: parentTable,
+			Fields: []*querypb.Field{
+				{Name: "id", Type: querypb.Type_INT64},
+			},
+			PKIndices: []bool{true},
+		}
+		vp.tablePlans[childTable] = &TablePlan{
+			TargetName: childTable,
+			Fields: []*querypb.Field{
+				{Name: "id", Type: querypb.Type_INT64},
+				{Name: "parent_id", Type: querypb.Type_INT64},
+			},
+			PKIndices: []bool{true, false},
+		}
+		fkRefs[childTable] = []fkConstraintRef{{
+			ParentTable:           parentTable,
+			ChildColumnNames:      []string{"parent_id"},
+			ReferencedColumnNames: []string{"id"},
+		}}
+	}
+	vp.tablePlansVersion.Store(1)
+	vp.fkRefs = fkRefs
+	vp.parentFKRefs = buildParentFKRefs(fkRefs)
+
+	// One relay-log item holding txnCount GTID/ROW/COMMIT triples on child0.
+	items := make([][]*binlogdatapb.VEvent, 1)
+	batch := make([]*binlogdatapb.VEvent, 0, txnCount*3)
+	for range txnCount {
+		batch = append(batch,
+			&binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+			&binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+				TableName: "child0",
+				RowChanges: []*binlogdatapb.RowChange{{
+					After: &querypb.Row{Values: []byte("11"), Lengths: []int64{1, 1}},
+				}},
+			}},
+			&binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT},
+		)
+	}
+	items[0] = batch
+
+	b.ReportAllocs()
+	// b.Loop resets the timer on its first call, so the setup above is
+	// excluded from the measurement without an explicit b.ResetTimer.
+	for b.Loop() {
+		scheduler := newApplyScheduler(ctx)
+		state := &parallelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+		if err := vp.scheduleItems(ctx, scheduler, state, items); err != nil {
+			b.Fatal(err)
+		}
+		// The scheduler is discarded after each iteration; its close error
+		// is irrelevant to the measurement.
+		_ = scheduler.close()
+	}
+}
+
+// ---------- commitLoop tests ----------
+
+// TestCommitLoop_InOrderCommit feeds three position-only transactions to
+// commitLoop in ascending order and expects the loop to drain the channel
+// without error.
+func TestCommitLoop_InOrderCommit(t *testing.T) {
+	vp, mockDB := testVPlayer(t)
+	ctx := testCtx(t)
+	scheduler := newApplyScheduler(ctx)
+
+	// commitLoop calls commitTxn → updatePos → vp.query/commit on each txn.
+	// For commitOnly+updatePosOnly, it calls vp.updatePos which calls
+	// vp.query (binlogplayer.GenerateUpdatePos).
+	// We mock the DB to accept any update/commit.
+	mockDB.AddInvariant("update _vt.vreplication set pos=", &sqltypes.Result{})
+	mockDB.AddInvariant("commit", &sqltypes.Result{})
+	mockDB.AddInvariant("begin", &sqltypes.Result{})
+
+	commitCh := make(chan *applyTxn, 3)
+
+	// Fail fast on a bad fixture instead of committing a zero position.
+	pos1, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+
+	// Send 3 transactions in order
+	for i := int64(1); i <= 3; i++ {
+		txn := &applyTxn{
+			order: i,
+			payload: &applyTxnPayload{
+				pos:                pos1,
+				timestamp:          100 * i,
+				commitOnly:         true,
+				updatePosOnly:      true,
+				lastEventTimestamp: 100 * i,
+			},
+			done: make(chan struct{}),
+		}
+		commitCh <- txn
+	}
+	// Closing the channel lets commitLoop return once it has drained it.
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.NoError(t, err)
+}
+
+// TestCommitLoop_OutOfOrderReordering hands commitLoop three position-only
+// transactions in the order 2, 1, 3 and verifies that their position writes
+// are issued in order 1, 2, 3, each carrying its own position.
+func TestCommitLoop_OutOfOrderReordering(t *testing.T) {
+	vp, _ := testVPlayer(t)
+	ctx := testCtx(t)
+	scheduler := newApplyScheduler(ctx)
+
+	commitCh := make(chan *applyTxn, 3)
+
+	// Every txn gets its own query closure that records which order issued
+	// which SQL. commitLoop runs on a single goroutine, so unsynchronized
+	// slices are fine.
+	var (
+		seenOrders []int64
+		seenSQL    []string
+	)
+	newTxn := func(order int64) *applyTxn {
+		gtid := fmt.Sprintf("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-%d", 4+order)
+		pos, err := replication.DecodePosition(gtid)
+		require.NoError(t, err)
+
+		payload := &applyTxnPayload{
+			pos:                pos,
+			timestamp:          100 * order,
+			commitOnly:         true,
+			updatePosOnly:      true,
+			lastEventTimestamp: 100 * order,
+		}
+		payload.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+			seenOrders = append(seenOrders, order)
+			seenSQL = append(seenSQL, sql)
+			return &sqltypes.Result{}, nil
+		}
+		return &applyTxn{order: order, payload: payload, done: make(chan struct{})}
+	}
+
+	// Arrival order is deliberately scrambled: 2, 1, 3.
+	for _, order := range []int64{2, 1, 3} {
+		commitCh <- newTxn(order)
+	}
+	close(commitCh)
+
+	require.NoError(t, vp.commitLoop(ctx, scheduler, commitCh))
+
+	// Writes must come out strictly ordered, and the i-th write must carry
+	// the position that belongs to order i+1.
+	require.Equal(t, []int64{1, 2, 3}, seenOrders)
+	require.Len(t, seenSQL, 3)
+	for i, wantSuffix := range []string{":1-5", ":1-6", ":1-7"} {
+		assert.Contains(t, seenSQL[i], wantSuffix)
+	}
+
+	scheduler.mu.Lock()
+	gotOrder := scheduler.lastCommittedOrder
+	scheduler.mu.Unlock()
+	assert.Equal(t, int64(3), gotOrder)
+}
+
+// TestCommitLoop_ZeroOrderIsRejected pins the invariant that every txn
+// reaching commitLoop must carry a positive order. All production enqueue
+// paths use parallelOrder.Add(1) (>= 1), so an order==0 txn indicates a
+// regression that would silently bypass strict commit ordering — and silently
+// regress the monotonic position invariant on _vt.vreplication.pos. Fail
+// fast so the workflow restarts cleanly instead of corrupting position state.
+func TestCommitLoop_ZeroOrderIsRejected(t *testing.T) {
+	vp, mockDB := testVPlayer(t)
+	ctx := testCtx(t)
+	scheduler := newApplyScheduler(ctx)
+
+	mockDB.AddInvariant("update _vt.vreplication set pos=", &sqltypes.Result{})
+	mockDB.AddInvariant("commit", &sqltypes.Result{})
+	mockDB.AddInvariant("begin", &sqltypes.Result{})
+
+	commitCh := make(chan *applyTxn, 1)
+
+	// Fail fast on a bad fixture so the error asserted below can only come
+	// from the order check.
+	pos1, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+
+	txn := &applyTxn{
+		order: 0, // deliberately invalid
+		payload: &applyTxnPayload{
+			pos:                pos1,
+			timestamp:          100,
+			commitOnly:         true,
+			updatePosOnly:      true,
+			lastEventTimestamp: 100,
+		},
+		done: make(chan struct{}),
+	}
+	commitCh <- txn
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "parallel apply commit txn missing order")
+}
+
+// TestCommitLoop_PendingLeftover closes the commit channel while a gap in the
+// order sequence is still outstanding (orders 1 and 3 arrive, 2 never does)
+// and expects commitLoop to report the missing order.
+func TestCommitLoop_PendingLeftover(t *testing.T) {
+	vp, mockDB := testVPlayer(t)
+	ctx := testCtx(t)
+	scheduler := newApplyScheduler(ctx)
+
+	mockDB.AddInvariant("update _vt.vreplication set pos=", &sqltypes.Result{})
+	mockDB.AddInvariant("commit", &sqltypes.Result{})
+	mockDB.AddInvariant("begin", &sqltypes.Result{})
+
+	commitCh := make(chan *applyTxn, 2)
+	// Fail fast on a bad fixture so the error asserted below can only come
+	// from the ordering check.
+	pos1, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+
+	// Send order 3 and 1, but no order 2 → should error about missing order
+	for _, order := range []int64{3, 1} {
+		txn := &applyTxn{
+			order: order,
+			payload: &applyTxnPayload{
+				pos:                pos1,
+				timestamp:          100,
+				commitOnly:         true,
+				updatePosOnly:      true,
+				lastEventTimestamp: 100,
+			},
+			done: make(chan struct{}),
+		}
+		commitCh <- txn
+	}
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "parallel apply commit missing order")
+}
+
+// TestCommitLoop_MarksCommittedOnScheduler commits a single txn that carries
+// commit metadata (sequence 7, order 1) and checks that the scheduler's
+// lastCommittedSequence and lastCommittedOrder reflect it afterwards.
+func TestCommitLoop_MarksCommittedOnScheduler(t *testing.T) {
+	vp, mockDB := testVPlayer(t)
+	ctx := testCtx(t)
+	scheduler := newApplyScheduler(ctx)
+
+	mockDB.AddInvariant("update _vt.vreplication set pos=", &sqltypes.Result{})
+	mockDB.AddInvariant("commit", &sqltypes.Result{})
+	mockDB.AddInvariant("begin", &sqltypes.Result{})
+
+	commitCh := make(chan *applyTxn, 1)
+	// Fail fast on a bad fixture instead of committing a zero position.
+	pos1, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+
+	txn := &applyTxn{
+		order:          1,
+		sequenceNumber: 7,
+		hasCommitMeta:  true,
+		payload: &applyTxnPayload{
+			pos:                pos1,
+			timestamp:          100,
+			commitOnly:         true,
+			updatePosOnly:      true,
+			lastEventTimestamp: 100,
+		},
+		done: make(chan struct{}),
+	}
+	commitCh <- txn
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.NoError(t, err)
+
+	// Both fields are guarded by the same mutex, so read them under a single
+	// critical section.
+	scheduler.mu.Lock()
+	defer scheduler.mu.Unlock()
+	// markCommitted should have advanced lastCommittedSequence
+	assert.Equal(t, int64(7), scheduler.lastCommittedSequence)
+	// lastCommittedOrder should be 1
+	assert.Equal(t, int64(1), scheduler.lastCommittedOrder)
+}
+
+// TestCommitLoop_UpdatesLag commits a txn whose last event is stamped three
+// seconds before its source-side current time and checks that the replication
+// lag stat ends up in the neighbourhood of three seconds.
+func TestCommitLoop_UpdatesLag(t *testing.T) {
+	vp, mockDB := testVPlayer(t)
+	ctx := testCtx(t)
+	scheduler := newApplyScheduler(ctx)
+
+	mockDB.AddInvariant("update _vt.vreplication set pos=", &sqltypes.Result{})
+	mockDB.AddInvariant("commit", &sqltypes.Result{})
+	mockDB.AddInvariant("begin", &sqltypes.Result{})
+
+	commitCh := make(chan *applyTxn, 1)
+	// Fail fast on a bad fixture instead of committing a zero position.
+	pos1, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+
+	now := time.Now()
+	txn := &applyTxn{
+		order: 1,
+		payload: &applyTxnPayload{
+			pos:           pos1,
+			timestamp:     100,
+			commitOnly:    true,
+			updatePosOnly: true,
+			// Event time is in seconds, current time in nanoseconds.
+			lastEventTimestamp:   now.Add(-3 * time.Second).Unix(),
+			lastEventCurrentTime: now.UnixNano(),
+		},
+		done: make(chan struct{}),
+	}
+	commitCh <- txn
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.NoError(t, err)
+
+	// Lag should be approximately 3 seconds; the bounds leave room for
+	// second-granularity truncation and scheduling delay.
+	lag := vp.vr.stats.ReplicationLagSeconds.Load()
+	assert.GreaterOrEqual(t, lag, int64(2))
+	assert.LessOrEqual(t, lag, int64(5))
+}
+
+// TestCommitLoop_UpdatePosOnlyKeepsLaterUnsavedEvent seeds the player with an
+// unsaved event, commits one position-only txn, and checks that the very same
+// unsaved event is still installed on the player afterwards.
+func TestCommitLoop_UpdatePosOnlyKeepsLaterUnsavedEvent(t *testing.T) {
+	ctx := testCtx(t)
+	vp, mockDB := testVPlayer(t)
+	scheduler := newApplyScheduler(ctx)
+
+	// Accept the position write and the transaction bracketing statements.
+	for _, stmt := range []string{"update _vt.vreplication set pos=", "commit", "begin"} {
+		mockDB.AddInvariant(stmt, &sqltypes.Result{})
+	}
+
+	txnPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+
+	pendingEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT, Timestamp: 200}
+	vp.serialMu.Lock()
+	vp.unsavedEvent = pendingEvent
+	vp.serialMu.Unlock()
+
+	payload := &applyTxnPayload{
+		pos:           txnPos,
+		timestamp:     100,
+		commitOnly:    true,
+		updatePosOnly: true,
+	}
+	commitCh := make(chan *applyTxn, 1)
+	commitCh <- &applyTxn{order: 1, payload: payload, done: make(chan struct{})}
+	close(commitCh)
+
+	require.NoError(t, vp.commitLoop(ctx, scheduler, commitCh))
+
+	// Snapshot under the lock, assert outside it.
+	vp.serialMu.Lock()
+	gotEvent := vp.unsavedEvent
+	vp.serialMu.Unlock()
+	require.Same(t, pendingEvent, gotEvent)
+}
+
+// TestCommitLoop_UpdatePosOnlyDoesNotRefreshIdleTimerBehindLaterUnsavedEvent
+// commits a position-only txn at an older position while the player already
+// holds a later position, a later unsaved event and a stale timeLastSaved,
+// and checks that commitLoop leaves all three fields exactly as they were.
+func TestCommitLoop_UpdatePosOnlyDoesNotRefreshIdleTimerBehindLaterUnsavedEvent(t *testing.T) {
+	ctx := testCtx(t)
+	vp, mockDB := testVPlayer(t)
+	scheduler := newApplyScheduler(ctx)
+
+	// Accept the position write and the transaction bracketing statements.
+	for _, stmt := range []string{"update _vt.vreplication set pos=", "commit", "begin"} {
+		mockDB.AddInvariant(stmt, &sqltypes.Result{})
+	}
+
+	txnPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+	aheadPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-9")
+	require.NoError(t, err)
+
+	// Player state that is already ahead of the txn being committed.
+	pendingEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT, Timestamp: 200}
+	staleSavedAt := time.Now().Add(-2 * idleTimeout)
+	vp.serialMu.Lock()
+	vp.pos = aheadPos
+	vp.unsavedEvent = pendingEvent
+	vp.timeLastSaved = staleSavedAt
+	vp.serialMu.Unlock()
+
+	payload := &applyTxnPayload{
+		pos:           txnPos,
+		timestamp:     100,
+		commitOnly:    true,
+		updatePosOnly: true,
+	}
+	commitCh := make(chan *applyTxn, 1)
+	commitCh <- &applyTxn{order: 1, payload: payload, done: make(chan struct{})}
+	close(commitCh)
+
+	require.NoError(t, vp.commitLoop(ctx, scheduler, commitCh))
+
+	// Snapshot under the lock, assert outside it.
+	vp.serialMu.Lock()
+	gotEvent, gotPos, gotSavedAt := vp.unsavedEvent, vp.pos, vp.timeLastSaved
+	vp.serialMu.Unlock()
+	require.Same(t, pendingEvent, gotEvent)
+	assert.Equal(t, aheadPos, gotPos)
+	assert.Equal(t, staleSavedAt, gotSavedAt)
+}
+
+// TestCommitLoop_UpdatePosOnlyWithoutTimestampRefreshesHeartbeat commits a
+// position-only txn with a zero timestamp through a recording client and
+// expects exactly two statements: a position update that does not mention
+// transaction_timestamp, followed by a statement that sets time_heartbeat.
+func TestCommitLoop_UpdatePosOnlyWithoutTimestampRefreshesHeartbeat(t *testing.T) {
+	ctx := testCtx(t)
+	vp, _ := testVPlayer(t)
+	scheduler := newApplyScheduler(ctx)
+
+	// Route every statement the player issues through a recording client.
+	rec := &recordingDBClient{}
+	client := newVDBClient(rec, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+	vp.vr.dbClient = client
+	vp.dbClient = client
+	vp.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+		return client.Execute(sql)
+	}
+	vp.commit = client.Commit
+
+	txnPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+
+	payload := &applyTxnPayload{
+		pos:           txnPos,
+		timestamp:     0,
+		commitOnly:    true,
+		updatePosOnly: true,
+	}
+	commitCh := make(chan *applyTxn, 1)
+	commitCh <- &applyTxn{order: 1, payload: payload, done: make(chan struct{})}
+	close(commitCh)
+
+	require.NoError(t, vp.commitLoop(ctx, scheduler, commitCh))
+
+	require.Len(t, rec.queries, 2)
+	posWrite, heartbeatWrite := rec.queries[0], rec.queries[1]
+	assert.Contains(t, posWrite, "update _vt.vreplication set pos=")
+	assert.NotContains(t, posWrite, "transaction_timestamp=")
+	assert.Contains(t, heartbeatWrite, "time_heartbeat=")
+}
+
+// TestCommitLoop_WorkerCommitDoesNotRefreshIdleTimerBehindLaterUnsavedEvent
+// commits a worker-owned txn at an older position while the player already
+// holds a later position, a later unsaved event and a stale timeLastSaved.
+// The worker's done channel must be signaled, and the three player fields
+// must be left exactly as they were.
+func TestCommitLoop_WorkerCommitDoesNotRefreshIdleTimerBehindLaterUnsavedEvent(t *testing.T) {
+	ctx := testCtx(t)
+	vp, mockDB := testVPlayer(t)
+	scheduler := newApplyScheduler(ctx)
+
+	// Accept the position write and the transaction bracketing statements.
+	for _, stmt := range []string{"update _vt.vreplication set pos=", "commit", "begin"} {
+		mockDB.AddInvariant(stmt, &sqltypes.Result{})
+	}
+
+	txnPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+	aheadPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-9")
+	require.NoError(t, err)
+
+	// The worker owns its own client, with a transaction already open.
+	worker := newVDBClient(&recordingDBClient{}, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+	require.NoError(t, worker.Begin())
+	t.Cleanup(func() {
+		_ = worker.Rollback()
+	})
+
+	// Player state that is already ahead of the txn being committed.
+	pendingEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT, Timestamp: 200}
+	staleSavedAt := time.Now().Add(-2 * idleTimeout)
+	vp.serialMu.Lock()
+	vp.pos = aheadPos
+	vp.unsavedEvent = pendingEvent
+	vp.timeLastSaved = staleSavedAt
+	vp.serialMu.Unlock()
+
+	workerDone := make(chan struct{}, 1)
+	payload := &applyTxnPayload{
+		pos:       txnPos,
+		timestamp: 100,
+		query: func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+			return worker.Execute(sql)
+		},
+		commit: worker.Commit,
+		client: worker,
+	}
+	commitCh := make(chan *applyTxn, 1)
+	commitCh <- &applyTxn{order: 1, payload: payload, done: workerDone}
+	close(commitCh)
+
+	require.NoError(t, vp.commitLoop(ctx, scheduler, commitCh))
+
+	// Non-blocking probe: the done channel must already be readable.
+	signaled := false
+	select {
+	case <-workerDone:
+		signaled = true
+	default:
+	}
+	if !signaled {
+		t.Fatal("worker done was not signaled")
+	}
+
+	// Snapshot under the lock, assert outside it.
+	vp.serialMu.Lock()
+	gotEvent, gotPos, gotSavedAt := vp.unsavedEvent, vp.pos, vp.timeLastSaved
+	vp.serialMu.Unlock()
+	require.Same(t, pendingEvent, gotEvent)
+	assert.Equal(t, aheadPos, gotPos)
+	assert.Equal(t, staleSavedAt, gotSavedAt)
+}
+
+// TestCommitLoop_CommitOnlyAppliesEvent commits a commit-only txn that is not
+// position-only and carries a single HEARTBEAT event, and expects commitLoop
+// to process it without error.
+func TestCommitLoop_CommitOnlyAppliesEvent(t *testing.T) {
+	ctx := testCtx(t)
+	vp, mockDB := testVPlayer(t)
+	scheduler := newApplyScheduler(ctx)
+
+	mockDB.AddInvariant("update _vt.vreplication set", &sqltypes.Result{})
+	mockDB.AddInvariant("commit", &sqltypes.Result{})
+	mockDB.AddInvariant("begin", &sqltypes.Result{})
+	mockDB.AddInvariant("insert", &sqltypes.Result{})
+
+	commitCh := make(chan *applyTxn, 1)
+	// Fail fast on a bad fixture instead of committing a zero position.
+	pos1, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+
+	heartbeatEvent := &binlogdatapb.VEvent{
+		Type:      binlogdatapb.VEventType_HEARTBEAT,
+		Timestamp: 100,
+	}
+
+	txn := &applyTxn{
+		order: 1,
+		payload: &applyTxnPayload{
+			pos:                pos1,
+			timestamp:          100,
+			commitOnly:         true,
+			updatePosOnly:      false,
+			mustSave:           true,
+			events:             []*binlogdatapb.VEvent{heartbeatEvent},
+			lastEventTimestamp: 100,
+		},
+		done: make(chan struct{}),
+	}
+	commitCh <- txn
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.NoError(t, err)
+}
+
+// TestCommitLoop_UpdatePosOnlyStopPosReached commits a position-only txn
+// whose position equals the configured stop position and expects commitLoop
+// to return io.EOF.
+func TestCommitLoop_UpdatePosOnlyStopPosReached(t *testing.T) {
+	ctx := testCtx(t)
+	vp, mockDB := testVPlayer(t)
+	scheduler := newApplyScheduler(ctx)
+
+	mockDB.AddInvariant("update _vt.vreplication set", &sqltypes.Result{})
+	mockDB.AddInvariant("commit", &sqltypes.Result{})
+	mockDB.AddInvariant("begin", &sqltypes.Result{})
+
+	// The txn position and the stop position are the same GTID set, so decode
+	// it once, fail fast on a bad fixture, and use it for both.
+	stopPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+	vp.stopPos = stopPos
+
+	commitCh := make(chan *applyTxn, 1)
+
+	txn := &applyTxn{
+		order: 1,
+		payload: &applyTxnPayload{
+			pos:                stopPos,
+			timestamp:          100,
+			commitOnly:         true,
+			updatePosOnly:      true,
+			mustSave:           true,
+			lastEventTimestamp: 100,
+		},
+		done: make(chan struct{}),
+	}
+	commitCh <- txn
+	close(commitCh)
+
+	err = vp.commitLoop(ctx, scheduler, commitCh)
+	require.ErrorIs(t, err, io.EOF)
+}
+
+// TestCommitLoop_UpdatePosOnlyStopPosStateFailureKeepsTransactionOpen pins the
+// failure path where a position-only txn lands on the stop position and the
+// "set state=" update fails: commitLoop must surface that error, the main
+// client must still be inside the transaction it began, and the scheduler's
+// commit bookkeeping must remain at zero.
+func TestCommitLoop_UpdatePosOnlyStopPosStateFailureKeepsTransactionOpen(t *testing.T) {
+ ctx := testCtx(t)
+ vp, _ := testVPlayer(t)
+ scheduler := newApplyScheduler(ctx)
+
+ stopPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+ require.NoError(t, err)
+ vp.stopPos = stopPos
+ vp.saveStop = true
+
+ // Replace the player's main client with one that fails the state update.
+ // NOTE(review): failOnQuery keys are presumably matched as query prefixes —
+ // confirm against failingDBClient.
+ stateErr := errors.New("set state failed")
+ mainClient := newVDBClient(&failingDBClient{failOnQuery: map[string]error{
+ "update _vt.vreplication set state=": stateErr,
+ }}, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+ t.Cleanup(func() {
+ _ = mainClient.Rollback()
+ })
+ vp.vr.dbClient = mainClient
+ vp.dbClient = mainClient
+ vp.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+ return mainClient.Execute(sql)
+ }
+ vp.commit = mainClient.Commit
+
+ // A single position-only txn positioned exactly at the stop position.
+ commitCh := make(chan *applyTxn, 1)
+ commitCh <- &applyTxn{
+ order: 1,
+ payload: &applyTxnPayload{
+ pos: stopPos,
+ timestamp: 100,
+ commitOnly: true,
+ updatePosOnly: true,
+ mustSave: true,
+ lastEventTimestamp: 100,
+ },
+ done: make(chan struct{}),
+ }
+ close(commitCh)
+
+ err = vp.commitLoop(ctx, scheduler, commitCh)
+ require.ErrorContains(t, err, stateErr.Error())
+ // The transaction was begun but must not have been closed.
+ assert.True(t, mainClient.InTransaction)
+ assert.Contains(t, mainClient.queries, "begin")
+
+ // Nothing may have been recorded as committed on the scheduler.
+ scheduler.mu.Lock()
+ defer scheduler.mu.Unlock()
+ assert.Zero(t, scheduler.lastCommittedOrder)
+ assert.Zero(t, scheduler.lastCommittedSequence)
+ assert.Zero(t, scheduler.inflightGlobal)
+ assert.Zero(t, scheduler.inflightMissingMeta)
+ assert.Zero(t, scheduler.inflightCommitMeta)
+}
+
+// TestCommitLoop_WorkerStopPosStateFailureDoesNotCommit pins the failure path
+// where a worker-owned txn lands on the stop position and the "set state="
+// update fails: commitLoop must surface that error, the worker's transaction
+// must stay open with no "commit" issued on it, the worker's done channel
+// must not be signaled, and the scheduler's commit bookkeeping must remain at
+// zero.
+func TestCommitLoop_WorkerStopPosStateFailureDoesNotCommit(t *testing.T) {
+ ctx := testCtx(t)
+ vp, _ := testVPlayer(t)
+ scheduler := newApplyScheduler(ctx)
+
+ stopPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+ require.NoError(t, err)
+ vp.stopPos = stopPos
+ vp.saveStop = true
+
+ // Both the player's main client and the worker's client fail the state
+ // update, so the test does not depend on which connection issues it.
+ stateErr := errors.New("set state failed")
+ vp.vr.dbClient = newVDBClient(&failingDBClient{failOnQuery: map[string]error{
+ "update _vt.vreplication set state=": stateErr,
+ }}, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+
+ workerClient := newVDBClient(&failingDBClient{failOnQuery: map[string]error{
+ "update _vt.vreplication set state=": stateErr,
+ }}, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+ // The worker hands over a client with a transaction already open.
+ require.NoError(t, workerClient.Begin())
+ t.Cleanup(func() {
+ _ = workerClient.Rollback()
+ })
+
+ // Buffered so that a send on done, if one happened, could be observed by
+ // the non-blocking receive below.
+ doneCh := make(chan struct{}, 1)
+ txn := &applyTxn{
+ order: 1,
+ payload: &applyTxnPayload{
+ pos: stopPos,
+ timestamp: 100,
+ query: func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+ return workerClient.Execute(sql)
+ },
+ commit: workerClient.Commit,
+ client: workerClient,
+ lastEventTimestamp: 100,
+ },
+ done: doneCh,
+ }
+
+ commitCh := make(chan *applyTxn, 1)
+ commitCh <- txn
+ close(commitCh)
+
+ err = vp.commitLoop(ctx, scheduler, commitCh)
+ require.ErrorContains(t, err, stateErr.Error())
+ assert.True(t, workerClient.InTransaction)
+ assert.NotContains(t, workerClient.queries, "commit")
+ // Non-blocking receive: any signal on done is a failure.
+ select {
+ case <-doneCh:
+ t.Fatal("worker done signaled before stop-state update succeeded")
+ default:
+ }
+
+ // Nothing may have been recorded as committed on the scheduler.
+ scheduler.mu.Lock()
+ defer scheduler.mu.Unlock()
+ assert.Zero(t, scheduler.lastCommittedOrder)
+ assert.Zero(t, scheduler.lastCommittedSequence)
+ assert.Zero(t, scheduler.inflightGlobal)
+ assert.Zero(t, scheduler.inflightMissingMeta)
+ assert.Zero(t, scheduler.inflightCommitMeta)
+}
+
+// TestCommitLoop_WorkerPosUpdateFailureDoesNotCommit pins the failure path
+// where the position update on the worker's connection fails: the commitLoop
+// must return the error without committing the worker's transaction, without
+// signaling the worker's done channel (the connection is in an unknown state
+// and must not be reused), and without advancing the scheduler.
+func TestCommitLoop_WorkerPosUpdateFailureDoesNotCommit(t *testing.T) {
+ ctx := testCtx(t)
+ vp, _ := testVPlayer(t)
+ scheduler := newApplyScheduler(ctx)
+
+ // The worker's client rejects the position write.
+ posErr := errors.New("pos update failed")
+ workerClient := newVDBClient(&failingDBClient{failOnQuery: map[string]error{
+ "update _vt.vreplication set pos=": posErr,
+ }}, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+ // The worker hands over a client with a transaction already open.
+ require.NoError(t, workerClient.Begin())
+ t.Cleanup(func() {
+ _ = workerClient.Rollback()
+ })
+
+ pos1, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+ require.NoError(t, err)
+
+ // Buffered so that a send on done, if one happened, could be observed by
+ // the non-blocking receive below.
+ doneCh := make(chan struct{}, 1)
+ txn := &applyTxn{
+ order: 1,
+ payload: &applyTxnPayload{
+ pos: pos1,
+ timestamp: 100,
+ query: func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+ return workerClient.Execute(sql)
+ },
+ commit: workerClient.Commit,
+ client: workerClient,
+ lastEventTimestamp: 100,
+ },
+ done: doneCh,
+ }
+
+ commitCh := make(chan *applyTxn, 1)
+ commitCh <- txn
+ close(commitCh)
+
+ err = vp.commitLoop(ctx, scheduler, commitCh)
+ require.ErrorContains(t, err, posErr.Error())
+ assert.True(t, workerClient.InTransaction)
+ assert.NotContains(t, workerClient.queries, "commit")
+ // Non-blocking receive: any signal on done is a failure.
+ select {
+ case <-doneCh:
+ require.Fail(t, "worker done signaled after failed position update")
+ default:
+ }
+
+ // Nothing may have been recorded as committed on the scheduler.
+ scheduler.mu.Lock()
+ defer scheduler.mu.Unlock()
+ assert.Zero(t, scheduler.lastCommittedOrder)
+ assert.Zero(t, scheduler.lastCommittedSequence)
+ assert.Zero(t, scheduler.inflightGlobal)
+ assert.Zero(t, scheduler.inflightMissingMeta)
+ assert.Zero(t, scheduler.inflightCommitMeta)
+}
+
+// TestCommitLoop_WorkerCommitFailureKeepsTransactionOpen pins the failure
+// path where the position update succeeds but the COMMIT itself fails: the
+// commitLoop must return the error, the vdbClient must still consider the
+// transaction open (a failed COMMIT leaves the server-side state unknown),
+// the worker must not be signaled to reuse the connection, and the scheduler
+// must not record the txn as committed.
+func TestCommitLoop_WorkerCommitFailureKeepsTransactionOpen(t *testing.T) {
+ ctx := testCtx(t)
+ vp, _ := testVPlayer(t)
+ scheduler := newApplyScheduler(ctx)
+
+ // The worker's client is configured with an error for its commit.
+ commitErr := errors.New("commit failed")
+ workerClient := newVDBClient(&failingCommitDBClient{commitErr: commitErr}, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+ // The worker hands over a client with a transaction already open.
+ require.NoError(t, workerClient.Begin())
+ t.Cleanup(func() {
+ _ = workerClient.Rollback()
+ })
+
+ pos1, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+ require.NoError(t, err)
+
+ // Buffered so that a send on done, if one happened, could be observed by
+ // the non-blocking receive below.
+ doneCh := make(chan struct{}, 1)
+ txn := &applyTxn{
+ order: 1,
+ payload: &applyTxnPayload{
+ pos: pos1,
+ timestamp: 100,
+ query: func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+ return workerClient.Execute(sql)
+ },
+ commit: workerClient.Commit,
+ client: workerClient,
+ lastEventTimestamp: 100,
+ },
+ done: doneCh,
+ }
+
+ commitCh := make(chan *applyTxn, 1)
+ commitCh <- txn
+ close(commitCh)
+
+ err = vp.commitLoop(ctx, scheduler, commitCh)
+ require.ErrorContains(t, err, commitErr.Error())
+ assert.True(t, workerClient.InTransaction)
+ // Non-blocking receive: any signal on done is a failure.
+ select {
+ case <-doneCh:
+ require.Fail(t, "worker done signaled after failed commit")
+ default:
+ }
+
+ // Nothing may have been recorded as committed on the scheduler.
+ scheduler.mu.Lock()
+ defer scheduler.mu.Unlock()
+ assert.Zero(t, scheduler.lastCommittedOrder)
+ assert.Zero(t, scheduler.lastCommittedSequence)
+ assert.Zero(t, scheduler.inflightGlobal)
+ assert.Zero(t, scheduler.inflightMissingMeta)
+ assert.Zero(t, scheduler.inflightCommitMeta)
+}
+
+// TestCommitLoop_CommitOnlyEOFStillMarksCommitted runs a commit-only DDL txn
+// through the scheduler and commitLoop with OnDdl set to STOP. commitLoop is
+// expected to return io.EOF, and the scheduler must nevertheless record the
+// txn as committed (sequence 7, order 1) with no inflight counters left over.
+func TestCommitLoop_CommitOnlyEOFStillMarksCommitted(t *testing.T) {
+ ctx := testCtx(t)
+ vp, mockDB := testVPlayer(t)
+ scheduler := newApplyScheduler(ctx)
+
+ mockDB.AddInvariant("update _vt.vreplication set state=", &sqltypes.Result{})
+ mockDB.AddInvariant("update _vt.vreplication set pos=", &sqltypes.Result{})
+ mockDB.AddInvariant("commit", &sqltypes.Result{})
+ mockDB.AddInvariant("begin", &sqltypes.Result{})
+
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_STOP
+
+ pos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+ require.NoError(t, err)
+ // The player's current position and stop position are the same as the
+ // position carried by the txn below.
+ vp.pos = pos
+ vp.stopPos = pos
+
+ txn := &applyTxn{
+ order: 1,
+ forceGlobal: true,
+ hasCommitMeta: true,
+ sequenceNumber: 7,
+ payload: &applyTxnPayload{
+ pos: pos,
+ timestamp: 100,
+ commitOnly: true,
+ updatePosOnly: false,
+ events: []*binlogdatapb.VEvent{{
+ Type: binlogdatapb.VEventType_DDL,
+ Statement: "alter table t1 add column c1 int",
+ Timestamp: 100,
+ }},
+ lastEventTimestamp: 100,
+ },
+ done: make(chan struct{}),
+ }
+ // Go through enqueue/nextReady rather than straight to the commit channel,
+ // so the inflight assertions at the end are checked on a txn the scheduler
+ // has actually handed out.
+ require.NoError(t, scheduler.enqueue(txn))
+
+ ready, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ require.Same(t, txn, ready)
+
+ commitCh := make(chan *applyTxn, 1)
+ commitCh <- ready
+ close(commitCh)
+
+ err = vp.commitLoop(ctx, scheduler, commitCh)
+ require.ErrorIs(t, err, io.EOF)
+
+ // Despite the EOF, the txn must be recorded as committed and released.
+ scheduler.mu.Lock()
+ defer scheduler.mu.Unlock()
+ assert.Equal(t, int64(7), scheduler.lastCommittedSequence)
+ assert.Equal(t, int64(1), scheduler.lastCommittedOrder)
+ assert.Zero(t, scheduler.inflightGlobal)
+ assert.Zero(t, scheduler.inflightMissingMeta)
+ assert.Zero(t, scheduler.inflightCommitMeta)
+}
+
+func TestCommitLoop_EXECIGNOREIdempotentDropForeignKeyRefreshesFKMetadata(t *testing.T) {
+ ctx := testCtx(t)
+ vp, mockDB := testVPlayer(t)
+ scheduler := newApplyScheduler(ctx)
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC_IGNORE
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+
+ oldFKRefs := map[string][]fkConstraintRef{
+ "child": {{ParentTable: "parent", ChildColumnNames: []string{"parent_id"}, ReferencedColumnNames: []string{"id"}}},
+ }
+ vp.fkRefs = oldFKRefs
+ vp.parentFKRefs = buildParentFKRefs(oldFKRefs)
+
+ mockDB.RemoveInvariant("information_schema.key_column_usage")
+ mockDB.ExpectRequestRE("update _vt\\.vreplication set pos='MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5', time_updated=.*", &sqltypes.Result{}, nil)
+ mockDB.ExpectRequestRE(
+ "SELECT kcu\\.TABLE_NAME, kcu\\.CONSTRAINT_NAME, kcu\\.COLUMN_NAME, kcu\\.REFERENCED_TABLE_NAME, kcu\\.REFERENCED_COLUMN_NAME, .* FROM information_schema\\.KEY_COLUMN_USAGE kcu JOIN information_schema\\.COLUMNS child_cols .* JOIN information_schema\\.COLUMNS parent_cols .* WHERE kcu\\.TABLE_SCHEMA = 'db' AND kcu\\.REFERENCED_TABLE_NAME IS NOT NULL ORDER BY kcu\\.TABLE_NAME, kcu\\.CONSTRAINT_NAME, kcu\\.ORDINAL_POSITION",
+ &sqltypes.Result{},
+ nil,
+ )
+ vp.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+ if sql == "alter table child drop foreign key fk_child_parent" {
+ return nil, sqlerror.NewSQLErrorf(sqlerror.ERCantDropFieldOrKey, sqlerror.SSBadFieldError, "Can't DROP 'fk_child_parent'; check that column/key exists")
+ }
+ return vp.vr.dbClient.Execute(sql)
+ }
+
+ pos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+ require.NoError(t, err)
+
+ txn := &applyTxn{
+ order: 1,
+ forceGlobal: true,
+ payload: &applyTxnPayload{
+ pos: pos,
+ timestamp: 100,
+ commitOnly: true,
+ updatePosOnly: false,
+ events: []*binlogdatapb.VEvent{{
+ Type: binlogdatapb.VEventType_DDL,
+ Statement: "alter table child drop foreign key fk_child_parent",
+ Timestamp: 100,
+ }},
+ lastEventTimestamp: 100,
+ },
+ done: make(chan struct{}),
+ }
+
+ commitCh := make(chan *applyTxn, 1)
+ commitCh <- txn
+ close(commitCh)
+
+ err = vp.commitLoop(ctx, scheduler, commitCh)
+ require.NoError(t, err)
+ assert.Nil(t, vp.fkRefs)
+ assert.Nil(t, vp.parentFKRefs)
+ mockDB.Wait()
+
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.tablePlans["child"] = &TablePlan{
+ TargetName: "child",
+ Fields: []*querypb.Field{
+ {Name: "id", Type: querypb.Type_INT64},
+ {Name: "parent_id", Type: querypb.Type_INT64},
+ },
+ PKIndices: []bool{true, false},
+ }
+ vp.tablePlansVersion.Store(1)
+
+ items := [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "child",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("112"), Lengths: []int64{1, 2}}}},
+ }, Timestamp: 200},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}
+
+ require.NoError(t, vp.scheduleItems(ctx, scheduler, state, items))
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.Len(t, got.payload.events, 1)
+ assert.False(t, got.forceGlobal)
+}
+
+// TestCommitLoop_EXECIGNOREIdempotentAddUniqueIndexInvalidatesUniqueSecondaryCache
+// checks that an EXEC_IGNORE "add unique key" DDL which fails with
+// ERDupKeyName still marks the table's cached plan as stale. The test creates
+// a real table that already has the unique key, seeds a table plan, runs the
+// DDL through commitLoop, and then applies a FIELD event to confirm the plan
+// is rebuilt with the unique-key columns populated.
+func TestCommitLoop_EXECIGNOREIdempotentAddUniqueIndexInvalidatesUniqueSecondaryCache(t *testing.T) {
+ ctx := testCtx(t)
+ vp, _ := testVPlayer(t)
+ scheduler := newApplyScheduler(ctx)
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_EXEC_IGNORE
+ vp.vr.vre = &Engine{env: vtenv.NewTestEnv()}
+ vp.vr.workflowConfig.ParallelReplicationWorkers = 2
+
+ // Create the target table with uk_email already present, so the later
+ // "add unique key uk_email" is a no-op from the schema's point of view.
+ tableName := "parallel_apply_execignore_idempotent_add_unique_idx"
+ qualifiedTableName := vrepldb + "." + tableName
+ execStatements(t, []string{
+ "create table " + qualifiedTableName + " (id int not null, email varchar(128) not null, primary key(id), unique key uk_email(email))",
+ })
+ t.Cleanup(func() {
+ execStatements(t, []string{"drop table if exists " + qualifiedTableName})
+ })
+
+ // Swap the player's mock client for a real connection; it is closed at
+ // test cleanup.
+ realDB := &realDBClient{nolog: true}
+ require.NoError(t, realDB.Connect())
+ t.Cleanup(realDB.Close)
+
+ vp.vr.dbClient = newVDBClient(realDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+ vp.dbClient = vp.vr.dbClient
+ vp.vr.mysqld = &infoSchemaMysqld{MysqlDaemon: env.Mysqld}
+ vp.vr.source.Filter = &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: tableName}}}
+
+ colInfoMap, err := vp.vr.buildColInfoMap(ctx)
+ require.NoError(t, err)
+ vp.vr.colInfoMap = colInfoMap
+
+ vp.replicatorPlan, err = vp.vr.buildReplicatorPlan(
+ vp.vr.source,
+ vp.vr.colInfoMap,
+ nil,
+ vp.vr.stats,
+ vp.vr.vre.env.CollationEnv(),
+ vp.vr.vre.env.Parser(),
+ )
+ require.NoError(t, err)
+
+ // Seed a cached plan with no UniqueKeyColumns; the final assertions check
+ // that the rebuilt plan carries them.
+ stalePlan := &TablePlan{
+ TargetName: tableName,
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT32}, {Name: "email", Type: querypb.Type_VARCHAR}},
+ PKIndices: []bool{true, false},
+ IdentityColumns: []string{"id"},
+ HasExtraUniqueSecondary: false,
+ }
+ vp.tablePlans[tableName] = stalePlan
+ vp.tablePlansVersion.Store(1)
+
+ // Fail only the ALTER with a duplicate-key-name error; every other
+ // statement is executed through the player's client.
+ vp.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+ if sql == "alter table "+tableName+" add unique key uk_email(email)" {
+ return nil, sqlerror.NewSQLErrorf(sqlerror.ERDupKeyName, sqlerror.SSAccessDeniedError, "Duplicate key name 'uk_email'")
+ }
+ return vp.vr.dbClient.Execute(sql)
+ }
+
+ pos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+ require.NoError(t, err)
+
+ txn := &applyTxn{
+ order: 1,
+ forceGlobal: true,
+ payload: &applyTxnPayload{
+ pos: pos,
+ timestamp: 100,
+ commitOnly: true,
+ updatePosOnly: false,
+ events: []*binlogdatapb.VEvent{{
+ Type: binlogdatapb.VEventType_DDL,
+ Statement: "alter table " + tableName + " add unique key uk_email(email)",
+ Timestamp: 100,
+ }},
+ lastEventTimestamp: 100,
+ },
+ done: make(chan struct{}),
+ }
+ require.NoError(t, scheduler.enqueue(txn))
+
+ ready, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ require.Same(t, txn, ready)
+
+ // A closed, pre-filled channel makes commitLoop process exactly this one
+ // transaction and then return.
+ commitCh := make(chan *applyTxn, 1)
+ commitCh <- ready
+ close(commitCh)
+
+ err = vp.commitLoop(ctx, scheduler, commitCh)
+ require.NoError(t, err)
+ require.NotNil(t, vp.postDDLStalePlans)
+ require.Contains(t, vp.postDDLStalePlans, tableName)
+
+ // Apply a FIELD event for the table, then roll back whatever transaction
+ // the client has open.
+ require.NoError(t, vp.applyEvent(ctx, &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_FIELD,
+ FieldEvent: &binlogdatapb.FieldEvent{
+ TableName: tableName,
+ Fields: []*querypb.Field{
+ {Name: "id", Type: querypb.Type_INT32},
+ {Name: "email", Type: querypb.Type_VARCHAR},
+ },
+ },
+ }, false))
+ require.NoError(t, vp.dbClient.Rollback())
+
+ // The idempotent EXEC_IGNORE add-unique-index barrier invalidated the
+ // cached plan, so the FIELD handler re-ran the unique-key analysis: the
+ // plain unique secondary emits a writeset unique key.
+ require.False(t, vp.tablePlans[tableName].HasExtraUniqueSecondary)
+ require.Equal(t, [][]string{{"email"}}, vp.tablePlans[tableName].UniqueKeyColumns)
+}
+
+// TestCommitLoop_WorkerTxnCommitProtocol pushes one worker-owned transaction
+// through commitLoop and checks the whole commit handshake: the first
+// statement recorded on the worker's connection is the position update, the
+// worker's transaction is no longer open afterwards, the worker's done
+// channel has been signaled, and the scheduler reports order 1 as committed.
+func TestCommitLoop_WorkerTxnCommitProtocol(t *testing.T) {
+	ctx := testCtx(t)
+	vp, _ := testVPlayer(t)
+	sched := newApplyScheduler(ctx)
+
+	recorder := &recordingDBClient{}
+	workerConn := newVDBClient(recorder, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+	require.NoError(t, workerConn.Begin())
+
+	commitPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+
+	// Buffered so a plain send from commitLoop cannot block the test.
+	workerDone := make(chan struct{}, 1)
+	runOnWorker := func(_ context.Context, sql string) (*sqltypes.Result, error) {
+		return workerConn.Execute(sql)
+	}
+
+	commitCh := make(chan *applyTxn, 1)
+	commitCh <- &applyTxn{
+		order: 1,
+		payload: &applyTxnPayload{
+			pos:                commitPos,
+			timestamp:          100,
+			query:              runOnWorker,
+			commit:             workerConn.Commit,
+			client:             workerConn,
+			lastEventTimestamp: 100,
+		},
+		done: workerDone,
+	}
+	close(commitCh)
+
+	require.NoError(t, vp.commitLoop(ctx, sched, commitCh))
+
+	require.NotEmpty(t, recorder.queries)
+	assert.Contains(t, recorder.queries[0], "update _vt.vreplication set pos=")
+	assert.False(t, workerConn.InTransaction, "commitLoop must commit the worker's transaction")
+
+	// Non-blocking probe: the signal must already be there.
+	signaled := func() bool {
+		select {
+		case <-workerDone:
+			return true
+		default:
+			return false
+		}
+	}
+	if !signaled() {
+		t.Fatal("commitLoop must signal the worker's done channel after committing")
+	}
+
+	sched.mu.Lock()
+	defer sched.mu.Unlock()
+	assert.Equal(t, int64(1), sched.lastCommittedOrder)
+}
+
+// TestCommitLoop_WorkerStopPosSetsStateAndStops pins the stop-position path
+// of the worker commit protocol: when the transaction's position reaches
+// stopPos, the Stopped state update is written on the worker's connection
+// (inside the same MySQL transaction as the position update), the
+// transaction commits, the worker is unblocked, and commitLoop returns
+// io.EOF to stop the stream.
+func TestCommitLoop_WorkerStopPosSetsStateAndStops(t *testing.T) {
+ ctx := testCtx(t)
+ vp, _ := testVPlayer(t)
+ scheduler := newApplyScheduler(ctx)
+
+ pos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+ require.NoError(t, err)
+ vp.stopPos = pos
+ vp.saveStop = true
+
+ recording := &recordingDBClient{}
+ workerClient := newVDBClient(recording, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+ require.NoError(t, workerClient.Begin())
+
+ doneCh := make(chan struct{}, 1)
+ txn := &applyTxn{
+ order: 1,
+ payload: &applyTxnPayload{
+ pos: pos,
+ timestamp: 100,
+ query: func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+ return workerClient.Execute(sql)
+ },
+ commit: workerClient.Commit,
+ client: workerClient,
+ lastEventTimestamp: 100,
+ },
+ done: doneCh,
+ }
+
+ commitCh := make(chan *applyTxn, 1)
+ commitCh <- txn
+ close(commitCh)
+
+ err = vp.commitLoop(ctx, scheduler, commitCh)
+ require.ErrorIs(t, err, io.EOF, "reaching the stop position must stop the stream")
+
+ assert.False(t, workerClient.InTransaction, "the worker's transaction must be committed")
+ var sawPosUpdate, sawStateUpdate bool
+ for _, q := range recording.queries {
+ if strings.Contains(q, "update _vt.vreplication set pos=") {
+ sawPosUpdate = true
+ }
+ if strings.Contains(q, "update _vt.vreplication set state=") {
+ sawStateUpdate = true
+ }
+ }
+ assert.True(t, sawPosUpdate, "position update must run on the worker's connection")
+ assert.True(t, sawStateUpdate, "the Stopped state update must run on the worker's connection so it commits atomically with the position")
+ select {
+ case <-doneCh:
+ default:
+ t.Fatal("commitLoop must signal the worker's done channel after committing")
+ }
+}
+
+// TestSetState_BatchedTransactionExecutesImmediatelyWithoutReplay pins the
+// mid-batch setState contract: the pending batch is flushed first (inside
+// the same open MySQL transaction, preserving stop-path atomicity), the
+// state UPDATE and insertLog statements execute immediately, and the batch
+// buffer is marked flushed so the later CommitTrxQueryBatch sends only
+// "commit" — replaying nothing. Deferring the state UPDATE into the batch
+// (the previous design) double-executed insertLog's SELECT/INSERT on
+// replay, duplicating vreplication_log rows.
+func TestSetState_BatchedTransactionExecutesImmediatelyWithoutReplay(t *testing.T) {
+	vp, _ := testVPlayer(t)
+	recording := &recordingDBClient{}
+
+	vp.vr.dbClient = newVDBClient(recording, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+	vp.vr.storeState(binlogdatapb.VReplicationWorkflowState_Stopped)
+	vp.vr.dbClient.maxBatchSize = 1024
+
+	// stateUpdates counts how many recorded statements set the workflow
+	// state to Stopped. It is evaluated both before and after the commit.
+	stateUpdates := func() int {
+		n := 0
+		for _, q := range recording.queries {
+			if strings.Contains(q, "update _vt.vreplication set state='Stopped'") {
+				n++
+			}
+		}
+		return n
+	}
+
+	require.NoError(t, vp.vr.dbClient.Begin())
+	require.NoError(t, vp.vr.setState(binlogdatapb.VReplicationWorkflowState_Stopped, ""))
+	// The state UPDATE executed on the wire immediately (after the batch
+	// flush), within the open transaction.
+	require.NotEmpty(t, recording.queries)
+	require.Equal(t, 1, stateUpdates())
+
+	// The later batch commit must replay nothing: only "commit" goes out.
+	preCommit := len(recording.queries)
+	require.NoError(t, vp.vr.dbClient.CommitTrxQueryBatch())
+	// require.Len prints the recorded statements on failure, which a bare
+	// length comparison would not.
+	require.Len(t, recording.queries, preCommit+1)
+	assert.Equal(t, "commit", recording.queries[len(recording.queries)-1])
+	// Still exactly one state UPDATE — no double execution.
+	require.Equal(t, 1, stateUpdates())
+}
+
+// TestSetStateImmediate_BatchedTransactionDoesNotDuplicateWrites exercises
+// the worker batch-mode stop-position pattern at parallel_apply.go: the
+// caller buffers the position update with AddQueryToTrxBatch, then flushes
+// the batch with ExecuteTrxQueryBatch so the upcoming immediate writes share
+// the same MySQL transaction, runs setStateWithDBClientImmediate to emit
+// the state UPDATE and vreplication_log INSERT via ExecuteFetch, marks
+// those already-executed queries as flushed, and finally calls
+// CommitTrxQueryBatch which must send only ";commit" — not a replay of
+// every prior query. Skipping any step in this dance doubles the
+// vreplication_log row and (in the previous fix) broke atomicity with the
+// position save by implicit-committing the active transaction via a
+// nested BEGIN.
+func TestSetStateImmediate_BatchedTransactionDoesNotDuplicateWrites(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ recording := &recordingDBClient{}
+
+ dbClient := newVDBClient(recording, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+ vp.vr.dbClient = dbClient
+ vp.vr.storeState(binlogdatapb.VReplicationWorkflowState_Running)
+ dbClient.maxBatchSize = 1024
+
+ require.NoError(t, dbClient.Begin())
+ require.NoError(t, dbClient.AddQueryToTrxBatch(
+ "update _vt.vreplication set pos='MySQL56/x:1-5', time_updated=1 where id=1"))
+ require.NoError(t, vp.vr.setStateWithDBClientImmediate(
+ dbClient, binlogdatapb.VReplicationWorkflowState_Stopped, "Stopped at position foo"))
+ require.NoError(t, dbClient.CommitTrxQueryBatch())
+
+ joined := strings.Join(recording.queries, ";")
+ assert.Equal(t, 1, strings.Count(joined, "update _vt.vreplication set state='Stopped'"),
+ "state UPDATE must be sent exactly once. Queries: %v", recording.queries)
+ assert.Equal(t, 1, strings.Count(joined, "insert into _vt.vreplication_log"),
+ "vreplication_log INSERT must be sent exactly once. Queries: %v", recording.queries)
+ assert.Equal(t, 1, strings.Count(joined, "update _vt.vreplication set pos="),
+ "position UPDATE must be sent exactly once. Queries: %v", recording.queries)
+}
+
+// TestSetStateImmediate_FollowedByAddQueryToTrxBatchPreservesNoDuplicate
+// covers the failure mode the immediate path was designed to defeat:
+// after setStateWithDBClientImmediate flushes pre-batched queries and
+// runs the state UPDATE on the wire, queriesPos is advanced. A subsequent
+// AddQueryToTrxBatch (the natural pattern: immediate write, then more
+// batched work, then commit) must NOT replay the immediate queries when
+// CommitTrxQueryBatch sends queries[queriesPos:]. If a future regression
+// removed the markTrxBatchedQueriesFlushed call inside
+// setStateWithDBClient, this test would catch it because the state UPDATE
+// would appear twice in the recorded queries.
+func TestSetStateImmediate_FollowedByAddQueryToTrxBatchPreservesNoDuplicate(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ recording := &recordingDBClient{}
+
+ dbClient := newVDBClient(recording, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+ vp.vr.dbClient = dbClient
+ vp.vr.storeState(binlogdatapb.VReplicationWorkflowState_Running)
+ dbClient.maxBatchSize = 1024
+
+ require.NoError(t, dbClient.Begin())
+ require.NoError(t, dbClient.AddQueryToTrxBatch(
+ "update _vt.vreplication set pos='MySQL56/x:1-5', time_updated=1 where id=1"))
+ require.NoError(t, vp.vr.setStateWithDBClientImmediate(
+ dbClient, binlogdatapb.VReplicationWorkflowState_Stopped, "Stopped at position foo"))
+ // Future-style follow-up batched work after the immediate write.
+ require.NoError(t, dbClient.AddQueryToTrxBatch(
+ "insert into _vt.vreplication_log(vrepl_id, type, message) values (1, 'Note', 'after')"))
+ require.NoError(t, dbClient.CommitTrxQueryBatch())
+
+ joined := strings.Join(recording.queries, ";")
+ assert.Equal(t, 1, strings.Count(joined, "update _vt.vreplication set state='Stopped'"),
+ "state UPDATE must be sent exactly once even with later batched work. Queries: %v", recording.queries)
+ // We expect 2 vreplication_log inserts: one from the immediate setState
+ // (LogStateChange) and one from the follow-up AddQueryToTrxBatch.
+ assert.Equal(t, 2, strings.Count(joined, "insert into _vt.vreplication_log"),
+ "each vreplication_log INSERT must be sent exactly once. Queries: %v", recording.queries)
+}
+
+// TestBeginImmediate_AdvancesQueriesPosPastBeginSeed checks the buffer state
+// BeginImmediate leaves behind: the query buffer holds a single "begin"
+// entry and queriesPos already points past it (value 1). The intent, per the
+// original rationale, is that BEGIN has already gone out on the wire, so a
+// later ExecuteTrxQueryBatch / CommitTrxQueryBatch must not resend it; a
+// nested BEGIN would implicitly commit the open transaction.
+func TestBeginImmediate_AdvancesQueriesPosPastBeginSeed(t *testing.T) {
+	vp, _ := testVPlayer(t)
+	client := newVDBClient(&recordingDBClient{}, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+	client.maxBatchSize = 1024
+
+	require.NoError(t, client.BeginImmediate())
+
+	wantBuffer := []string{"begin"}
+	assert.Equal(t, wantBuffer, client.queries)
+	assert.Equal(t, int64(1), client.queriesPos)
+}
+
+// ---------- enqueueCommitOnly tests ----------
+
+// TestEnqueueCommitOnly enqueues a commit-only transaction for a COMMIT event
+// with both boolean options enabled and no commit metadata, then checks the
+// transaction handed back by the scheduler: it must be forceGlobal and
+// noConflict, its payload must be commitOnly / updatePosOnly / mustSave, and
+// the payload timestamp must be the event's timestamp.
+func TestEnqueueCommitOnly(t *testing.T) {
+	vp, _ := testVPlayer(t)
+	ctx := testCtx(t)
+	scheduler := newApplyScheduler(ctx)
+
+	// Set up a known position. The decode error is checked so a malformed
+	// literal fails here instead of silently producing a zero position.
+	pos1, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+	vp.serialMu.Lock()
+	vp.pos = pos1
+	vp.serialMu.Unlock()
+
+	event := &binlogdatapb.VEvent{
+		Type:      binlogdatapb.VEventType_COMMIT,
+		Timestamp: 200,
+	}
+
+	require.NoError(t, vp.enqueueCommitOnly(ctx, scheduler, event, true, true, 0, 0, false))
+
+	got, err := scheduler.nextReady(ctx)
+	require.NoError(t, err)
+	require.NotNil(t, got)
+	assert.True(t, got.forceGlobal)
+	assert.True(t, got.noConflict)
+	assert.True(t, got.payload.commitOnly)
+	assert.True(t, got.payload.updatePosOnly)
+	assert.True(t, got.payload.mustSave)
+	assert.Equal(t, int64(200), got.payload.timestamp)
+}
+
+// TestEnqueueCommitOnly_NotUpdatePosOnly enqueues a commit-only transaction
+// with both boolean options disabled and explicit commit metadata
+// (sequence 5, parent 4). The resulting transaction must still be
+// forceGlobal, but noConflict, mustSave and updatePosOnly must all be false,
+// and the commit metadata must be carried through.
+func TestEnqueueCommitOnly_NotUpdatePosOnly(t *testing.T) {
+	vp, _ := testVPlayer(t)
+	ctx := testCtx(t)
+	scheduler := newApplyScheduler(ctx)
+
+	// Check the decode error so a bad literal cannot slip through as a zero
+	// position.
+	pos1, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+	vp.serialMu.Lock()
+	vp.pos = pos1
+	vp.serialMu.Unlock()
+
+	event := &binlogdatapb.VEvent{
+		Type:           binlogdatapb.VEventType_COMMIT,
+		Timestamp:      200,
+		SequenceNumber: 5,
+		CommitParent:   4,
+	}
+
+	require.NoError(t, vp.enqueueCommitOnly(ctx, scheduler, event, false, false, event.SequenceNumber, event.CommitParent, true))
+
+	got, err := scheduler.nextReady(ctx)
+	require.NoError(t, err)
+	// Guard the dereferences below: a nil result should fail the test, not
+	// panic it.
+	require.NotNil(t, got)
+	assert.True(t, got.forceGlobal)
+	assert.False(t, got.noConflict) // updatePosOnly=false → noConflict=false
+	assert.False(t, got.payload.mustSave)
+	assert.False(t, got.payload.updatePosOnly)
+	assert.True(t, got.hasCommitMeta) // SequenceNumber=5
+	assert.Equal(t, int64(5), got.sequenceNumber)
+}
+
+// TestEnqueueCommitOnly_IncrementsOrder enqueues the same COMMIT event twice
+// and checks that the scheduler hands the two transactions back with
+// consecutive order values, 1 then 2.
+func TestEnqueueCommitOnly_IncrementsOrder(t *testing.T) {
+	vp, _ := testVPlayer(t)
+	ctx := testCtx(t)
+	sched := newApplyScheduler(ctx)
+
+	commitEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_COMMIT, Timestamp: 100}
+	for range 2 {
+		require.NoError(t, vp.enqueueCommitOnly(ctx, sched, commitEvent, true, true, 0, 0, false))
+	}
+
+	first, err := sched.nextReady(ctx)
+	require.NoError(t, err)
+	second, err := sched.nextReady(ctx)
+	require.NoError(t, err)
+
+	for i, txn := range []*applyTxn{first, second} {
+		assert.Equal(t, int64(i+1), txn.order)
+	}
+}
+
+// ---------- workerLoop tests ----------
+
+// TestWorkerLoop_CommitOnlyBypassesApply checks that workerLoop forwards a
+// commit-only transaction to commitCh, and that the loop exits once the
+// context is cancelled. The worker carries only a context (no client, query
+// or commit function), so the test relies on the commit-only path not
+// touching any of them.
+func TestWorkerLoop_CommitOnlyBypassesApply(t *testing.T) {
+	vp, _ := testVPlayer(t)
+	ctx, cancel := context.WithCancel(t.Context())
+	defer cancel()
+	scheduler := newApplyScheduler(ctx)
+	commitCh := make(chan *applyTxn, 1)
+
+	pos1, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+	require.NoError(t, err)
+
+	// Enqueue a commitOnly transaction
+	payload := &applyTxnPayload{
+		pos:                pos1,
+		commitOnly:         true,
+		timestamp:          100,
+		lastEventTimestamp: 100,
+	}
+	txn := &applyTxn{
+		order:   1,
+		payload: payload,
+	}
+	require.NoError(t, scheduler.enqueue(txn))
+
+	// Minimal worker: only ctx is set, no connection or callbacks.
+	worker := &applyWorker{ctx: ctx}
+
+	// Run workerLoop in background
+	doneCh := make(chan error, 1)
+	go func() {
+		doneCh <- vp.workerLoop(ctx, scheduler, commitCh, worker)
+	}()
+
+	// Should forward to commitCh. A bounded receive is used so that a
+	// workerLoop which never forwards fails the test instead of blocking it
+	// forever on the channel read.
+	select {
+	case got := <-commitCh:
+		assert.Equal(t, txn, got)
+	case <-time.After(200 * time.Millisecond):
+		t.Fatal("timed out waiting for commitCh")
+	}
+
+	// Cancel to stop worker loop
+	cancel()
+
+	select {
+	case <-doneCh:
+	case <-time.After(200 * time.Millisecond):
+		t.Fatal("timed out waiting for workerLoop exit")
+	}
+}
+
+func TestWorkerLoop_AppliesAndDispatches(t *testing.T) {
+ ctx, cancel := context.WithCancel(testCtx(t))
+ defer cancel()
+
+ vp, _ := testVPlayer(t)
+ scheduler := newApplyScheduler(ctx)
+ commitCh := make(chan *applyTxn, 1)
+
+ worker := &applyWorker{
+ ctx: ctx,
+ query: func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+ return &sqltypes.Result{}, nil
+ },
+ commit: func() error {
+ return nil
+ },
+ }
+ activeClient := newVDBClient(&recordingDBClient{}, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)
+ worker.client = activeClient
+
+ event := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}
+ payload := &applyTxnPayload{events: []*binlogdatapb.VEvent{event}}
+ gotTxn := &applyTxn{order: 1, payload: payload}
+
+ require.NoError(t, scheduler.enqueue(gotTxn))
+
+ errCh := make(chan error, 1)
+ go func() {
+ errCh <- vp.workerLoop(ctx, scheduler, commitCh, worker)
+ }()
+
+ select {
+ case txn := <-commitCh:
+ require.NotNil(t, txn)
+ assert.NotNil(t, txn.payload.query)
+ assert.NotNil(t, txn.payload.commit)
+ assert.Same(t, activeClient, txn.payload.client)
+ case <-time.After(200 * time.Millisecond):
+ t.Fatal("timed out waiting for commitCh")
+ }
+
+ cancel()
+
+ select {
+ case err := <-errCh:
+ require.ErrorIs(t, err, context.Canceled)
+ case <-time.After(200 * time.Millisecond):
+ t.Fatal("timed out waiting for workerLoop exit")
+ }
+}
+
+// TestWorkerLoop_ErrorRollsBack gives workerLoop a transaction whose only
+// event is a GTID event with an unparsable GTID ("invalid") and expects the
+// loop to return a non-nil error within the timeout. The worker's mock
+// client accepts "rollback" as an invariant; the test itself asserts only
+// the returned error, not that a rollback was issued.
+func TestWorkerLoop_ErrorRollsBack(t *testing.T) {
+	ctx, cancel := context.WithCancel(testCtx(t))
+	defer cancel()
+
+	vp, _ := testVPlayer(t)
+	sched := newApplyScheduler(ctx)
+	commitCh := make(chan *applyTxn, 1)
+
+	workerDB := binlogplayer.NewMockDBClient(t)
+	workerDB.AddInvariant("rollback", &sqltypes.Result{})
+
+	worker := &applyWorker{
+		ctx:    ctx,
+		client: newVDBClient(workerDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems),
+	}
+
+	failing := &applyTxn{
+		order: 1,
+		payload: &applyTxnPayload{
+			events: []*binlogdatapb.VEvent{
+				{Type: binlogdatapb.VEventType_GTID, Gtid: "invalid"},
+			},
+		},
+	}
+	require.NoError(t, sched.enqueue(failing))
+
+	loopErr := make(chan error, 1)
+	go func() {
+		loopErr <- vp.workerLoop(ctx, sched, commitCh, worker)
+	}()
+
+	select {
+	case err := <-loopErr:
+		require.Error(t, err)
+	case <-time.After(200 * time.Millisecond):
+		t.Fatal("timed out waiting for workerLoop error")
+	}
+}
+
+// ---------- Batch time bound test ----------
+
+func TestScheduleItems_BatchTimeBoundForcesSave(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+
+ // Set lastFlushTime to long ago to trigger the 500ms time bound
+ state := ¶llelScheduleState{
+ lastFlushTime: time.Now().Add(-1 * time.Second),
+ lastHeartbeatRefresh: time.Now(),
+ }
+
+ vp.tablePlans["t1"] = &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true},
+ }
+ vp.tablePlansVersion.Store(1)
+
+ // Two transactions in same batch — but time bound should force flush
+ items := [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ }, Timestamp: 100},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-6"},
+ {Type: binlogdatapb.VEventType_ROW, RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("2"), Lengths: []int64{1}}}},
+ }, Timestamp: 200},
+ {Type: binlogdatapb.VEventType_COMMIT},
+ }}
+
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.NoError(t, err)
+
+ // Time bound forced a flush — should have 2 separate transactions
+ got1, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ got2, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+
+ assert.Equal(t, int64(1), got1.order)
+ assert.Equal(t, int64(2), got2.order)
+}
+
+// ---------- Empty txn with stop position enqueues commitOnly ----------
+
+func TestScheduleItems_EmptyTxnWithStopPos(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ stopPos, err := replication.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5")
+ require.NoError(t, err)
+ vp.stopPos = stopPos
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-10",
+ }
+ commitEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_COMMIT,
+ Timestamp: 300,
+ }
+
+ items := [][]*binlogdatapb.VEvent{{gtidEvent, commitEvent}}
+ err = vp.scheduleItems(ctx, scheduler, state, items)
+ require.ErrorIs(t, err, io.EOF)
+
+ // Empty txn at/past stop pos → enqueueCommitOnly should fire
+ got, gerr := scheduler.nextReady(ctx)
+ require.NoError(t, gerr)
+ require.NotNil(t, got)
+ assert.True(t, got.forceGlobal)
+ assert.True(t, got.payload.commitOnly)
+}
+
+// ---------- JOURNAL is ForceGlobal ----------
+
+func TestScheduleItems_JOURNALIsForceGlobal(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.replicatorPlan = &ReplicatorPlan{TablePlans: map[string]*TablePlan{
+ "t1": {TargetName: "t1"},
+ }}
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ }
+ journalEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_JOURNAL,
+ Timestamp: 200,
+ Journal: &binlogdatapb.Journal{
+ MigrationType: binlogdatapb.MigrationType_TABLES,
+ Tables: []string{"t1"},
+ },
+ }
+
+ items := [][]*binlogdatapb.VEvent{{gtidEvent, journalEvent}}
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.ErrorIs(t, err, io.EOF)
+
+ got, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ assert.True(t, got.forceGlobal)
+ assert.True(t, got.payload.commitOnly)
+}
+
+func TestScheduleItems_RelevantJournalStopsSchedulingLaterEventsInSameFetch(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.replicatorPlan = &ReplicatorPlan{TablePlans: map[string]*TablePlan{
+ "t1": {TargetName: "t1"},
+ }}
+
+ items := [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {
+ Type: binlogdatapb.VEventType_JOURNAL,
+ Timestamp: 200,
+ Journal: &binlogdatapb.Journal{
+ MigrationType: binlogdatapb.MigrationType_TABLES,
+ Tables: []string{"t1"},
+ },
+ },
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "invalid"},
+ }}
+
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.ErrorIs(t, err, io.EOF)
+
+ scheduler.mu.Lock()
+ assert.Equal(t, 1, scheduler.pendingCount)
+ scheduler.mu.Unlock()
+
+ got, gerr := scheduler.nextReady(ctx)
+ require.NoError(t, gerr)
+ require.NotNil(t, got)
+ assert.Equal(t, binlogdatapb.VEventType_JOURNAL, got.payload.events[0].Type)
+ assert.True(t, got.payload.commitOnly)
+}
+
+func TestScheduleItems_StopDDLStopsSchedulingLaterEventsInSameFetch(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+ vp.vr.source.OnDdl = binlogdatapb.OnDDLAction_STOP
+
+ items := [][]*binlogdatapb.VEvent{{
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"},
+ {Type: binlogdatapb.VEventType_DDL, Statement: "alter table t1 add column c1 int", Timestamp: 200},
+ {Type: binlogdatapb.VEventType_GTID, Gtid: "invalid"},
+ }}
+
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.ErrorIs(t, err, io.EOF)
+
+ scheduler.mu.Lock()
+ assert.Equal(t, 1, scheduler.pendingCount)
+ scheduler.mu.Unlock()
+
+ got, gerr := scheduler.nextReady(ctx)
+ require.NoError(t, gerr)
+ require.NotNil(t, got)
+ assert.Equal(t, binlogdatapb.VEventType_DDL, got.payload.events[0].Type)
+ assert.True(t, got.payload.commitOnly)
+}
+
+// ---------- DDL after accumulated ROW events flushes first ----------
+
+func TestScheduleItems_DDLFlushesAccumulatedEvents(t *testing.T) {
+ vp, _ := testVPlayer(t)
+ ctx := testCtx(t)
+ scheduler := newApplyScheduler(ctx)
+ state := ¶llelScheduleState{lastFlushTime: time.Now(), lastHeartbeatRefresh: time.Now()}
+
+ vp.tablePlans["t1"] = &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true},
+ }
+ vp.tablePlansVersion.Store(1)
+
+ gtidEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_GTID,
+ Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
+ }
+ rowEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_ROW,
+ RowEvent: &binlogdatapb.RowEvent{
+ TableName: "t1",
+ RowChanges: []*binlogdatapb.RowChange{{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}},
+ },
+ Timestamp: 100,
+ }
+ ddlEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_DDL,
+ Timestamp: 200,
+ }
+
+ items := [][]*binlogdatapb.VEvent{{gtidEvent, rowEvent, ddlEvent}}
+ err := vp.scheduleItems(ctx, scheduler, state, items)
+ require.NoError(t, err)
+
+ // Should have 2 transactions: the flush of ROW events, then the DDL
+ got1, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+ require.NoError(t, scheduler.markCommitted(got1))
+
+ got2, err := scheduler.nextReady(ctx)
+ require.NoError(t, err)
+
+ // First is the row data flush
+ assert.Len(t, got1.payload.events, 1)
+ assert.Equal(t, binlogdatapb.VEventType_ROW, got1.payload.events[0].Type)
+
+ // Second is the DDL (commitOnly, forceGlobal)
+ assert.True(t, got2.forceGlobal)
+ assert.Equal(t, binlogdatapb.VEventType_DDL, got2.payload.events[0].Type)
+}
+
+// TestRecoverParallelApplyCatchesPanic exercises recoverParallelApply, the
+// deferred panic guard for parallel-applier goroutines. It covers four
+// cases: a panic with no callback, a panic whose error is delivered to the
+// callback, a clean return that must not invoke the callback, and a runtime
+// (index out of range) panic. Per the original rationale, production code
+// uses the callback to report the failure and cancel the context rather than
+// letting a worker panic take down the whole vttablet.
+func TestRecoverParallelApplyCatchesPanic(t *testing.T) {
+	// guarded runs body with recoverParallelApply deferred in the same
+	// frame, mirroring how a goroutine would install the guard.
+	guarded := func(name string, onPanic func(error), body func()) {
+		defer recoverParallelApply(name, onPanic)
+		body()
+	}
+
+	t.Run("nil callback does not panic", func(t *testing.T) {
+		// No callback is supplied; the only requirement is that the panic
+		// does not escape.
+		guarded("testGoroutine", nil, func() { panic("boom") })
+	})
+
+	t.Run("callback receives a wrapped error on panic", func(t *testing.T) {
+		var reported error
+		guarded("worker-1", func(err error) { reported = err }, func() { panic("ouch") })
+
+		require.Error(t, reported)
+		require.ErrorContains(t, reported, "worker-1")
+		require.ErrorContains(t, reported, "panicked")
+	})
+
+	t.Run("no panic means no callback invocation", func(t *testing.T) {
+		called := false
+		guarded("happy", func(err error) { called = true }, func() {})
+
+		require.False(t, called, "callback must not fire without a panic")
+	})
+
+	t.Run("runtime panic types are caught and surfaced", func(t *testing.T) {
+		var reported error
+		guarded("oob", func(err error) { reported = err }, func() {
+			// Trigger a runtime error (slice index out of range) instead
+			// of an explicit panic() call.
+			values := []int{1, 2, 3}
+			beyond := len(values) + 1 // silence staticcheck's constant-OOB check
+			_ = values[beyond]
+		})
+
+		require.Error(t, reported)
+		require.ErrorContains(t, reported, "oob")
+	})
+}
diff --git a/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_worker.go b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_worker.go
new file mode 100644
index 00000000000..d7ad9e1937c
--- /dev/null
+++ b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_worker.go
@@ -0,0 +1,257 @@
+/*
+Copyright 2026 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package vreplication
+
+import (
+ "context"
+ "errors"
+ "log/slog"
+
+ "vitess.io/vitess/go/sqltypes"
+ "vitess.io/vitess/go/vt/binlog/binlogplayer"
+ "vitess.io/vitess/go/vt/log"
+ vttablet "vitess.io/vitess/go/vt/vttablet/common"
+
+ binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
+)
+
+type applyWorker struct {
+ ctx context.Context // context passed to newApplyWorker
+ vr *vreplicator // owning vreplicator; source of workflowConfig and stats
+ // conns holds a pair of MySQL connections for double-buffering. While
+ // one connection is being committed by the commitLoop, the worker can
+ // immediately start applying the next transaction on the other. This
+ // decouples the worker's apply phase from the serial commitLoop,
+ // allowing true pipeline parallelism.
+ conns [2]*vdbClient
+ active int // index into conns of the connection in use; flipped between 0 and 1 by rotate()
+ // client points to conns[active] for convenience. Updated by rotate().
+ client *vdbClient
+ // batchMode indicates whether this worker buffers SQL statements and
+ // flushes them as a single multi-statement request. When true, the
+ // apply phase buffers INSERTs via AddQueryToTrxBatch (near-zero cost),
+ // then flushWorkerBatch sends them all to MySQL in one ExecuteFetchMulti
+ // call. This happens during the parallel apply phase, so all workers
+ // execute their multi-statement batches concurrently. The commitLoop
+ // then just does a quick COMMIT + position update.
+ batchMode bool
+ // query executes a SQL statement on this worker's active connection.
+ // Rebound by rotate() to use the new active connection.
+ query func(ctx context.Context, sql string) (*sqltypes.Result, error)
+ // commit commits the current transaction on this worker's active connection.
+ // Rebound by rotate() alongside query.
+ commit func() error
+}
+
+// createWorkerConn opens one connection from the filtered DB client factory
+// and applies the session setup a worker needs, closing it on any failure.
+func createWorkerConn(ctx context.Context, vr *vreplicator) (*vdbClient, error) {
+	dbClient := vr.vre.dbClientFactoryFiltered()
+	if err := dbClient.Connect(); err != nil {
+		return nil, err
+	}
+	// abort releases the connection before handing the error back.
+	abort := func(err error) (*vdbClient, error) {
+		dbClient.Close()
+		return nil, err
+	}
+	if err := setDBClientSettings(dbClient, vr.workflowConfig); err != nil {
+		return abort(err)
+	}
+	// Worker transactions run concurrently. The writeset scheduler accounts
+	// for PK/unique/FK conflicts but not for InnoDB gap/next-key locks, and
+	// REPEATABLE READ acquires those even for point operations on rows that
+	// are absent (a DELETE of a missing row, or delete-marking in a
+	// non-unique secondary index). Because the commitLoop commits in strict
+	// order, a gap lock taken by a later-ordered transaction can block an
+	// earlier-ordered transaction's INSERT and stay held until that earlier
+	// transaction commits. InnoDB's deadlock detector cannot see the cycle,
+	// since half of it lives in the commitLoop (MySQL's MTA relies on its
+	// Commit_order_manager for this very case). READ COMMITTED takes no gap
+	// locks for row-image application and is what MySQL recommends for
+	// row-based parallel appliers; statement-based events force-serialize,
+	// so their outcome is unaffected.
+	//
+	// The SQL-standard statement is used instead of the transaction_isolation
+	// sysvar because this connection talks straight to the target mysqld
+	// and the sysvar name is flavor-specific (MariaDB: tx_isolation until
+	// 11.1; MySQL: transaction_isolation only since 5.7.20). It is lowercase
+	// to match the other session-setup statements.
+	if _, err := dbClient.ExecuteFetch("set session transaction isolation level read committed", 1); err != nil {
+		return abort(err)
+	}
+	vdbc := newVDBClientWithID(dbClient, vr.stats, vr.workflowConfig.RelayLogMaxItems, vr.id)
+	if _, err := vr.setSQLMode(ctx, vdbc); err != nil {
+		return abort(err)
+	}
+	if err := vr.resetFKCheckAfterCopy(vdbc); err != nil {
+		return abort(err)
+	}
+	if err := vr.resetFKRestrictAfterCopy(vdbc); err != nil {
+		return abort(err)
+	}
+	return vdbc, nil
+}
+
+// newApplyWorker builds a worker backed by two connections from
+// createWorkerConn so applying one txn can overlap with committing another.
+// In batch mode each connection's maxBatchSize is set to max_allowed_packet
+// minus 64; if that value cannot be read or parsed (query error, empty
+// result, non-integer) relay-log-max-size is used and the problem is logged.
+func newApplyWorker(ctx context.Context, vr *vreplicator) (*applyWorker, error) {
+	batchMode := vr.workflowConfig.ExperimentalFlags&vttablet.VReplicationExperimentalFlagVPlayerBatching != 0
+
+	var conns [2]*vdbClient
+	for i := range 2 {
+		vdbc, err := createWorkerConn(ctx, vr)
+		if err != nil {
+			// Close any previously created connections.
+			for j := range i {
+				conns[j].Close()
+			}
+			return nil, err
+		}
+		conns[i] = vdbc
+	}
+
+	if batchMode {
+		maxBatchSize := int64(vr.workflowConfig.RelayLogMaxSize)
+		res, err := conns[0].ExecuteFetch(SqlMaxAllowedPacket, 1)
+		if err == nil && (res == nil || len(res.Rows) == 0 || len(res.Rows[0]) == 0) {
+			err = errors.New("max_allowed_packet query returned no rows")
+		}
+		if err == nil {
+			maxBatchSize, err = res.Rows[0][0].ToInt64()
+		}
+		if err != nil {
+			maxBatchSize = int64(vr.workflowConfig.RelayLogMaxSize)
+			log.Error("Worker: error getting max_allowed_packet, will use relay-log-max-size value", slog.Int64("bytes", maxBatchSize), slog.Any("error", err))
+		}
+		maxBatchSize -= 64
+		for _, c := range conns {
+			c.maxBatchSize = maxBatchSize
+		}
+	}
+
+	worker := &applyWorker{
+		ctx:       ctx,
+		vr:        vr,
+		conns:     conns,
+		client:    conns[0], // conns[active]; active starts at its zero value
+		batchMode: batchMode,
+	}
+	worker.bindFunctions()
+	return worker, nil
+}
+
+// bindFunctions points w.query and w.commit at the currently active connection.
+// In batch mode, statements issued inside a transaction are buffered via
+// AddQueryToTrxBatch; otherwise every statement goes through ExecuteWithRetry.
+func (w *applyWorker) bindFunctions() {
+	conn := w.client
+	// Commit is the same in both modes.
+	w.commit = func() error {
+		return conn.Commit()
+	}
+	if !w.batchMode {
+		w.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+			return conn.ExecuteWithRetry(ctx, sql)
+		}
+		return
+	}
+	w.query = func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+		if conn.InTransaction {
+			return nil, conn.AddQueryToTrxBatch(sql)
+		}
+		return conn.Execute(sql)
+	}
+}
+
+// rotate flips the worker onto its other connection (index 1 - active) and
+// rebinds query/commit to it. Per the double-buffering design, this lets the
+// next transaction be applied on the new connection while the commitLoop is
+// still committing the previous transaction on the old one.
+func (w *applyWorker) rotate() {
+	next := 1 - w.active
+	w.active, w.client = next, w.conns[next]
+	w.bindFunctions()
+}
+
+// flushWorkerBatch executes the statements buffered on the active
+// connection by calling ExecuteTrxQueryBatch and returns its error. It is
+// meant to run once a transaction's events have all been applied, so the
+// MySQL round trip happens in the parallel apply phase rather than in the
+// serial commitLoop. Without batch mode, or without a client, it does nothing.
+func (w *applyWorker) flushWorkerBatch() error {
+	if w.batchMode && w.client != nil {
+		_, err := w.client.ExecuteTrxQueryBatch()
+		return err
+	}
+	return nil
+}
+
+// close shuts down both worker connections, skipping nil slots. A connection
+// still inside a transaction is rolled back first (rollback errors ignored).
+func (w *applyWorker) close() {
+	for _, conn := range w.conns {
+		if conn == nil {
+			continue
+		}
+		if conn.InTransaction {
+			_ = conn.Rollback() // best effort; the connection is closed next
+		}
+		conn.Close()
+	}
+}
+
+// rollback abandons whatever is in progress on the active connection after
+// an apply error; the Rollback error is ignored. No client means no-op.
+func (w *applyWorker) rollback() {
+	if w.client == nil {
+		return
+	}
+	_ = w.client.Rollback()
+}
+
+// applyEvent runs vp.applyEvent with vp.dbClient, vp.query and vp.commit
+// temporarily pointed at this worker's active connection. The previous
+// bindings are restored on return, whether or not the apply succeeded. It
+// fails immediately when the worker has no active client.
+func (w *applyWorker) applyEvent(ctx context.Context, event *binlogdatapb.VEvent, mustSave bool, vp *vplayer) error {
+	if w.client == nil {
+		return errors.New("apply worker has no active client")
+	}
+	// Remember the caller's bindings so they can be put back.
+	origClient, origQuery, origCommit := vp.dbClient, vp.query, vp.commit
+	restore := func() {
+		vp.dbClient = origClient
+		vp.query = origQuery
+		vp.commit = origCommit
+	}
+	defer restore()
+	// Point the vplayer at this worker's active connection.
+	vp.dbClient = w.client
+	vp.query = w.query
+	vp.commit = w.commit
+	return vp.applyEvent(ctx, event, mustSave)
+}
+
+// stats returns the stats object of the vreplicator that owns this worker.
+func (w *applyWorker) stats() *binlogplayer.Stats {
+	owner := w.vr
+	return owner.stats
+}
diff --git a/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_worker_test.go b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_worker_test.go
new file mode 100644
index 00000000000..6b529886f72
--- /dev/null
+++ b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_worker_test.go
@@ -0,0 +1,566 @@
+/*
+Copyright 2026 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package vreplication
+
+import (
+ "context"
+ "errors"
+ "fmt"
+ "strings"
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+
+ "vitess.io/vitess/go/mysql/capabilities"
+ "vitess.io/vitess/go/sqltypes"
+ "vitess.io/vitess/go/vt/binlog/binlogplayer"
+ vttablet "vitess.io/vitess/go/vt/vttablet/common"
+
+ binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
+ querypb "vitess.io/vitess/go/vt/proto/query"
+)
+
+type failingDBClient struct {
+ connectErr error // returned as-is by Connect
+ failOnQuery map[string]error // ExecuteFetch returns the error of a key that is a substring of the query
+ supportsCaps bool // answer given by SupportsCapability for every capability
+}
+
+type recordingDBClient struct {
+ queries []string // queries passed to ExecuteFetch and ExecuteFetchMulti, in call order
+}
+
+func (f *failingDBClient) DBName() string { return "db" }
+func (f *failingDBClient) Connect() error { return f.connectErr } // a nil connectErr means Connect succeeds
+func (f *failingDBClient) Begin() error { return nil } // Begin/Commit/Rollback are no-ops that always succeed
+func (f *failingDBClient) Commit() error { return nil }
+func (f *failingDBClient) Rollback() error { return nil }
+func (f *failingDBClient) Close() {}
+func (f *failingDBClient) IsClosed() bool { return false } // always false; Close does not change it
+// ExecuteFetch is the stub's query dispatcher. Any failOnQuery key that is a
+// substring of query wins and its error is returned. Otherwise canned results
+// are served for the sql_mode, _vt.vreplication and _vt.copy_state queries,
+// and every other query yields an empty result.
+func (f *failingDBClient) ExecuteFetch(query string, maxrows int) (*sqltypes.Result, error) {
+	for fragment, injected := range f.failOnQuery {
+		if strings.Contains(query, fragment) {
+			return nil, injected
+		}
+	}
+	switch {
+	case strings.Contains(query, getSQLModeQuery):
+		fields := sqltypes.MakeTestFields("sql_mode", "varchar")
+		return sqltypes.MakeTestResult(fields, "STRICT_TRANS_TABLES,NO_ZERO_DATE,ANSI_QUOTES"), nil
+	case strings.Contains(query, "from _vt.vreplication where id="):
+		return sqlModeWorkflowSettingsResult(binlogdatapb.VReplicationWorkflowType_MoveTables), nil
+	case strings.Contains(query, "from _vt.copy_state where vrepl_id="):
+		fields := sqltypes.MakeTestFields("count(distinct table_name)", "int64")
+		return sqltypes.MakeTestResult(fields, "0"), nil
+	default:
+		return &sqltypes.Result{}, nil
+	}
+}
+
+func (f *failingDBClient) ExecuteFetchMulti(query string, maxrows int) ([]*sqltypes.Result, error) {
+	single, fetchErr := f.ExecuteFetch(query, maxrows) // reuse the single-statement stub
+	if fetchErr != nil {
+		return nil, fetchErr
+	}
+	return []*sqltypes.Result{single}, nil
+}
+
+func (f *failingDBClient) SupportsCapability(capability capabilities.FlavorCapability) (bool, error) {
+ return f.supportsCaps, nil // same answer for every capability; never errors
+}
+
+func (r *recordingDBClient) DBName() string { return "db" }
+func (r *recordingDBClient) Connect() error { return nil } // connection and transaction calls are no-ops that always succeed
+func (r *recordingDBClient) Begin() error { return nil }
+func (r *recordingDBClient) Commit() error { return nil }
+func (r *recordingDBClient) Rollback() error { return nil }
+func (r *recordingDBClient) Close() {}
+func (r *recordingDBClient) IsClosed() bool { return false }
+func (r *recordingDBClient) ExecuteFetch(query string, maxrows int) (*sqltypes.Result, error) {
+ r.queries = append(r.queries, query) // record the query, then answer with an empty result
+ return &sqltypes.Result{}, nil
+}
+
+func (r *recordingDBClient) ExecuteFetchMulti(query string, maxrows int) ([]*sqltypes.Result, error) {
+ r.queries = append(r.queries, query) // the whole multi-statement string is recorded as one entry
+ return []*sqltypes.Result{{}}, nil
+}
+
+func (r *recordingDBClient) SupportsCapability(capability capabilities.FlavorCapability) (bool, error) {
+ return false, nil // reports every capability as unsupported
+}
+
+// TestApplyWorkerCloseRollsBack only covers the zero-value worker: close must
+// tolerate nil connection slots without panicking.
+func TestApplyWorkerCloseRollsBack(t *testing.T) {
+	zero := &applyWorker{}
+	assert.NotPanics(t, zero.close)
+}
+
+// TestApplyWorkerRollbackNoError covers the zero-value worker: rollback with
+// no active client must be a silent no-op rather than a nil dereference.
+func TestApplyWorkerRollbackNoError(t *testing.T) {
+	zero := &applyWorker{}
+	assert.NotPanics(t, zero.rollback)
+}
+
+// Applying through a worker must leave the vplayer's own bindings untouched.
+func TestApplyWorkerApplyEventRestoresVPlayer(t *testing.T) {
+	vp, _ := testVPlayer(t)
+	ctx := t.Context()
+
+	// Start from a known state: keep the client, clear query/commit.
+	clientBefore := vp.dbClient
+	vp.query, vp.commit = nil, nil
+
+	workerDB := binlogplayer.NewMockDBClient(t)
+	worker := &applyWorker{
+		ctx:    ctx,
+		client: newVDBClient(workerDB, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems),
+		query: func(ctx context.Context, sql string) (*sqltypes.Result, error) {
+			return &sqltypes.Result{}, nil
+		},
+		commit: func() error { return nil },
+	}
+
+	gtid := "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"
+	gtidEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: gtid}
+	require.NoError(t, worker.applyEvent(ctx, gtidEvent, false, vp))
+
+	// The GTID event must have moved the vplayer position.
+	wantPos, err := binlogplayer.DecodePosition(gtid)
+	require.NoError(t, err)
+	assert.Equal(t, wantPos.String(), vp.pos.String())
+
+	// The bindings seen before the call must be back in place.
+	assert.Equal(t, clientBefore, vp.dbClient)
+	assert.Nil(t, vp.query)
+	assert.Nil(t, vp.commit)
+}
+
+func TestApplyWorkerApplyEventNilClientFailsFast(t *testing.T) {
+	vp, _ := testVPlayer(t)
+	ctx := t.Context()
+
+	startPos, err := binlogplayer.DecodePosition("MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-1")
+	require.NoError(t, err)
+	vp.pos = startPos
+
+	// The worker deliberately has no client set.
+	clientless := &applyWorker{ctx: ctx}
+	gtidEvent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_GTID, Gtid: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5"}
+
+	err = clientless.applyEvent(ctx, gtidEvent, false, vp)
+	require.ErrorContains(t, err, "apply worker has no active client")
+	assert.Equal(t, startPos.String(), vp.pos.String())
+}
+
+func TestApplyWorkerApplyEventInsertStatementAcceptsMatchAllFilter(t *testing.T) {
+	vp, _ := testVPlayer(t)
+	ctx := t.Context()
+	vp.canAcceptStmtEvents = true
+
+	recorder := &recordingDBClient{}
+	worker := &applyWorker{
+		ctx:    ctx,
+		client: newVDBClient(recorder, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems),
+	}
+	worker.bindFunctions()
+
+	insertEvent := &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_INSERT,
+		Dml:  "insert into t1(id) values (1)",
+	}
+
+	localVP := workerLocalVPlayer(vp)
+	require.NoError(t, worker.applyEvent(ctx, insertEvent, false, &localVP))
+	// The statement must reach the worker's connection verbatim.
+	assert.Contains(t, recorder.queries, insertEvent.Dml)
+}
+
+func TestApplyWorkerStatsReturnsVReplicatorStats(t *testing.T) {
+	vp, _ := testVPlayer(t)
+	// stats() must hand back the owning vreplicator's stats object.
+	owner := vp.vr
+	assert.Equal(t, owner.stats, (&applyWorker{vr: owner}).stats())
+}
+
+func TestNewApplyWorker(t *testing.T) {
+	stats := binlogplayer.NewStats()
+	stats.VReplicationLagGauges.Stop()
+	t.Cleanup(stats.Stop)
+
+	cfg := vttablet.InitVReplicationConfigDefaults()
+
+	db := binlogplayer.NewMockDBClient(t)
+	db.AddInvariant("set @@session.time_zone", &sqltypes.Result{})
+	db.AddInvariant("set session transaction isolation level read committed", &sqltypes.Result{})
+	db.AddInvariant("set names 'binary'", &sqltypes.Result{})
+	db.AddInvariant("set @@session.net_read_timeout", &sqltypes.Result{})
+	db.AddInvariant("set @@session.net_write_timeout", &sqltypes.Result{})
+	db.AddInvariant("set @@session.sql_mode", &sqltypes.Result{})
+	db.AddInvariant("set @@session.foreign_key_checks", &sqltypes.Result{})
+	db.AddInvariant("select pos, stop_pos, max_tps, max_replication_lag, state, workflow_type, workflow, workflow_sub_type, defer_secondary_keys, options from _vt.vreplication where id=1", sqlModeWorkflowSettingsResult(binlogdatapb.VReplicationWorkflowType_MoveTables))
+	db.AddInvariant("select count(distinct table_name) from _vt.copy_state where vrepl_id=1", sqltypes.MakeTestResult(
+		sqltypes.MakeTestFields("count(distinct table_name)", "int64"),
+		"0",
+	))
+	db.AddInvariant("max_allowed_packet", sqltypes.MakeTestResult(
+		sqltypes.MakeTestFields("max_allowed_packet", "int64"),
+		"4194304",
+	))
+
+	vr := &vreplicator{
+		id:             1,
+		stats:          stats,
+		dbClient:       newVDBClient(db, stats, cfg.RelayLogMaxItems),
+		workflowConfig: cfg,
+		vre:            &Engine{dbClientFactoryFiltered: func() binlogplayer.DBClient { return db }},
+	}
+
+	created, err := newApplyWorker(t.Context(), vr)
+	require.NoError(t, err)
+	require.NotNil(t, created)
+
+	created.close()
+}
+
+func TestCreateWorkerConn_UsesSerialSQLModeContract(t *testing.T) {
+	cases := []struct {
+		name         string
+		workflowType binlogdatapb.VReplicationWorkflowType
+		expectedMode string
+	}{
+		{
+			name:         "non-online-ddl uses exact sql mode",
+			workflowType: binlogdatapb.VReplicationWorkflowType_MoveTables,
+			expectedMode: SQLMode,
+		},
+		{
+			name:         "online-ddl uses exact strict sql mode",
+			workflowType: binlogdatapb.VReplicationWorkflowType_OnlineDDL,
+			expectedMode: StrictSQLMode,
+		},
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.name, func(t *testing.T) {
+			stats := binlogplayer.NewStats()
+			stats.VReplicationLagGauges.Stop()
+			// Stop the stats when the subtest finishes.
+			defer stats.Stop()
+
+			cfg := vttablet.InitVReplicationConfigDefaults()
+			db := binlogplayer.NewMockDBClient(t)
+			db.RemoveInvariants("select @@session.sql_mode", "set @@session.sql_mode", "set @@session.foreign_key_checks")
+			db.AddInvariant("set @@session.time_zone", &sqltypes.Result{})
+			db.AddInvariant("set session transaction isolation level read committed", &sqltypes.Result{})
+			db.AddInvariant("set names 'binary'", &sqltypes.Result{})
+			db.AddInvariant("set @@session.net_read_timeout", &sqltypes.Result{})
+			db.AddInvariant("set @@session.net_write_timeout", &sqltypes.Result{})
+			db.AddInvariant("set @@session.sql_mode = CONCAT(@@session.sql_mode, ',NO_AUTO_VALUE_ON_ZERO')", &sqltypes.Result{})
+			db.AddInvariant("set @@session.sql_mode = REPLACE(REPLACE(REPLACE(@@session.sql_mode, 'NO_ZERO_DATE', ''), 'NO_ZERO_IN_DATE', ''), 'NO_BACKSLASH_ESCAPES', '')", &sqltypes.Result{})
+			db.ExpectRequest(getSQLModeQuery, sqltypes.MakeTestResult(
+				sqltypes.MakeTestFields("sql_mode", "varchar"),
+				"STRICT_TRANS_TABLES,NO_ZERO_DATE,ANSI_QUOTES",
+			), nil)
+			db.ExpectRequest(binlogplayer.TestGetWorkflowQueryId1, sqlModeWorkflowSettingsResult(tt.workflowType), nil)
+			db.ExpectRequest("select count(distinct table_name) from _vt.copy_state where vrepl_id=1", sqltypes.MakeTestResult(
+				sqltypes.MakeTestFields("count(distinct table_name)", "int64"),
+				"0",
+			), nil)
+			db.ExpectRequest(fmt.Sprintf(setSQLModeQueryf, tt.expectedMode), &sqltypes.Result{}, nil)
+			db.ExpectRequest("set @@session.foreign_key_checks=0", &sqltypes.Result{}, nil)
+
+			vr := &vreplicator{
+				id:             1,
+				stats:          stats,
+				dbClient:       newVDBClient(db, stats, cfg.RelayLogMaxItems),
+				workflowConfig: cfg,
+				vre:            &Engine{dbClientFactoryFiltered: func() binlogplayer.DBClient { return db }},
+			}
+
+			workerConn, err := createWorkerConn(t.Context(), vr)
+			require.NoError(t, err)
+			require.NotNil(t, workerConn)
+			db.Wait()
+			workerConn.Close()
+		})
+	}
+}
+
+func TestCreateWorkerConn_UsesRunningFKSessionSettings(t *testing.T) {
+	stats := binlogplayer.NewStats()
+	stats.VReplicationLagGauges.Stop()
+	defer stats.Stop()
+
+	cfg := vttablet.InitVReplicationConfigDefaults()
+	db := binlogplayer.NewMockDBClient(t)
+	db.RemoveInvariants("select @@session.sql_mode", "set @@session.sql_mode", "set @@session.foreign_key_checks")
+	db.AddInvariant("set @@session.time_zone", &sqltypes.Result{})
+	db.AddInvariant("set session transaction isolation level read committed", &sqltypes.Result{})
+	db.AddInvariant("set names 'binary'", &sqltypes.Result{})
+	db.AddInvariant("set @@session.net_read_timeout", &sqltypes.Result{})
+	db.AddInvariant("set @@session.net_write_timeout", &sqltypes.Result{})
+	db.AddInvariant("set @@session.sql_mode = CONCAT(@@session.sql_mode, ',NO_AUTO_VALUE_ON_ZERO')", &sqltypes.Result{})
+	db.AddInvariant("set @@session.sql_mode = REPLACE(REPLACE(REPLACE(@@session.sql_mode, 'NO_ZERO_DATE', ''), 'NO_ZERO_IN_DATE', ''), 'NO_BACKSLASH_ESCAPES', '')", &sqltypes.Result{})
+	db.ExpectRequest(getSQLModeQuery, sqltypes.MakeTestResult(
+		sqltypes.MakeTestFields("sql_mode", "varchar"),
+		"STRICT_TRANS_TABLES,NO_ZERO_DATE,ANSI_QUOTES",
+	), nil)
+	db.ExpectRequest(binlogplayer.TestGetWorkflowQueryId1, sqlModeWorkflowSettingsResult(binlogdatapb.VReplicationWorkflowType_MoveTables), nil)
+	db.ExpectRequest("select count(distinct table_name) from _vt.copy_state where vrepl_id=1", sqltypes.MakeTestResult(
+		sqltypes.MakeTestFields("count(distinct table_name)", "int64"),
+		"0",
+	), nil)
+	db.ExpectRequest(fmt.Sprintf(setSQLModeQueryf, SQLMode), &sqltypes.Result{}, nil)
+	db.ExpectRequest("set @@session.foreign_key_checks=1", &sqltypes.Result{}, nil)
+
+	vr := &vreplicator{
+		id:                     1,
+		stats:                  stats,
+		dbClient:               newVDBClient(db, stats, cfg.RelayLogMaxItems),
+		workflowConfig:         cfg,
+		originalFKCheckSetting: 1,
+		vre:                    &Engine{dbClientFactoryFiltered: func() binlogplayer.DBClient { return db }},
+	}
+
+	workerConn, err := createWorkerConn(t.Context(), vr)
+	require.NoError(t, err)
+	require.NotNil(t, workerConn)
+	db.Wait()
+	workerConn.Close()
+}
+
+func TestNewApplyWorkerConnectError(t *testing.T) {
+	stats := binlogplayer.NewStats()
+	stats.VReplicationLagGauges.Stop()
+	t.Cleanup(stats.Stop)
+
+	cfg := vttablet.InitVReplicationConfigDefaults()
+
+	wantErr := errors.New("connect failed")
+	unreachable := &failingDBClient{connectErr: wantErr}
+	vr := &vreplicator{
+		id:             1,
+		stats:          stats,
+		workflowConfig: cfg,
+		vre:            &Engine{dbClientFactoryFiltered: func() binlogplayer.DBClient { return unreachable }},
+	}
+
+	created, err := newApplyWorker(t.Context(), vr)
+	require.ErrorIs(t, err, wantErr)
+	require.Nil(t, created)
+}
+
+func TestNewApplyWorkerSettingsError(t *testing.T) {
+	stats := binlogplayer.NewStats()
+	stats.VReplicationLagGauges.Stop()
+	t.Cleanup(stats.Stop)
+
+	cfg := vttablet.InitVReplicationConfigDefaults()
+
+	wantErr := errors.New("settings failed")
+	stub := &failingDBClient{failOnQuery: map[string]error{"time_zone": wantErr}}
+	vr := &vreplicator{
+		id:             1,
+		stats:          stats,
+		workflowConfig: cfg,
+		vre:            &Engine{dbClientFactoryFiltered: func() binlogplayer.DBClient { return stub }},
+	}
+
+	created, err := newApplyWorker(t.Context(), vr)
+	require.ErrorIs(t, err, wantErr)
+	require.Nil(t, created)
+}
+
+func TestNewApplyWorkerClearFKCheckError(t *testing.T) {
+	stats := binlogplayer.NewStats()
+	stats.VReplicationLagGauges.Stop()
+	t.Cleanup(stats.Stop)
+
+	cfg := vttablet.InitVReplicationConfigDefaults()
+
+	wantErr := errors.New("fk checks failed")
+	stub := &failingDBClient{failOnQuery: map[string]error{"set @@session.foreign_key_checks=0": wantErr}}
+	vr := &vreplicator{
+		id:             1,
+		stats:          stats,
+		dbClient:       newVDBClient(stub, stats, cfg.RelayLogMaxItems),
+		workflowConfig: cfg,
+		vre:            &Engine{dbClientFactoryFiltered: func() binlogplayer.DBClient { return stub }},
+	}
+
+	created, err := newApplyWorker(t.Context(), vr)
+	require.ErrorIs(t, err, wantErr)
+	require.Nil(t, created)
+}
+
+func TestNewApplyWorkerClearFKRestrictError(t *testing.T) {
+	stats := binlogplayer.NewStats()
+	stats.VReplicationLagGauges.Stop()
+	t.Cleanup(stats.Stop)
+
+	cfg := vttablet.InitVReplicationConfigDefaults()
+
+	wantErr := errors.New("fk restrict failed")
+	factoryClient := &failingDBClient{failOnQuery: map[string]error{"set @@session.restrict_fk_on_non_standard_key=0": wantErr}}
+	vrClient := &failingDBClient{supportsCaps: true}
+
+	vr := &vreplicator{
+		id:             1,
+		stats:          stats,
+		dbClient:       newVDBClient(vrClient, stats, cfg.RelayLogMaxItems),
+		workflowConfig: cfg,
+		vre:            &Engine{dbClientFactoryFiltered: func() binlogplayer.DBClient { return factoryClient }},
+	}
+
+	created, err := newApplyWorker(t.Context(), vr)
+	require.ErrorIs(t, err, wantErr)
+	require.Nil(t, created)
+}
+
+func TestApplyWorkerApplyEventSetsFKChecksAfterRotate(t *testing.T) {
+	vp, _ := testVPlayer(t)
+	ctx := t.Context()
+	vp.tablePlans["t1"] = &TablePlan{TargetName: "t1"}
+	vp.vr.storeState(binlogdatapb.VReplicationWorkflowState_Running)
+
+	first := &recordingDBClient{}
+	second := &recordingDBClient{}
+	worker := &applyWorker{
+		ctx:    ctx,
+		conns:  [2]*vdbClient{newVDBClient(first, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems), newVDBClient(second, vp.vr.stats, vp.vr.workflowConfig.RelayLogMaxItems)},
+		active: 0,
+	}
+	worker.client = worker.conns[0]
+	worker.bindFunctions()
+
+	// syncVPlayer mirrors the worker's current bindings onto the vplayer.
+	syncVPlayer := func() {
+		vp.query = worker.query
+		vp.commit = worker.commit
+		vp.dbClient = worker.client
+	}
+	syncVPlayer()
+	rowEvent := &binlogdatapb.VEvent{
+		Type:     binlogdatapb.VEventType_ROW,
+		RowEvent: &binlogdatapb.RowEvent{TableName: "t1"},
+	}
+
+	require.NoError(t, worker.applyEvent(ctx, rowEvent, false, vp))
+	assert.Contains(t, first.queries, "set @@session.foreign_key_checks=true")
+
+	// After rotating, the same event must set the FK checks on the other connection.
+	worker.rotate()
+	syncVPlayer()
+
+	require.NoError(t, worker.applyEvent(ctx, rowEvent, false, vp))
+	assert.Contains(t, second.queries, "set @@session.foreign_key_checks=true")
+}
+
+// sqlModeWorkflowSettingsResult returns a single-row result shaped like the
+// _vt.vreplication settings query, with workflow_type set to workflowType.
+func sqlModeWorkflowSettingsResult(workflowType binlogdatapb.VReplicationWorkflowType) *sqltypes.Result {
+	fields := []*querypb.Field{
+		{Name: "pos", Type: sqltypes.VarBinary},
+		{Name: "stop_pos", Type: sqltypes.VarBinary},
+		{Name: "max_tps", Type: sqltypes.Int64},
+		{Name: "max_replication_lag", Type: sqltypes.Int64},
+		{Name: "state", Type: sqltypes.VarBinary},
+		{Name: "workflow_type", Type: sqltypes.Int64},
+		{Name: "workflow", Type: sqltypes.VarChar},
+		{Name: "workflow_sub_type", Type: sqltypes.Int64},
+		{Name: "defer_secondary_keys", Type: sqltypes.Int64},
+		{Name: "options", Type: sqltypes.VarBinary},
+	}
+	row := []sqltypes.Value{
+		sqltypes.NewVarBinary("MariaDB/0-1-1083"),
+		sqltypes.NULL,
+		sqltypes.NewInt64(0),
+		sqltypes.NewInt64(0),
+		sqltypes.NewVarBinary(binlogdatapb.VReplicationWorkflowState_Running.String()),
+		sqltypes.NewInt64(int64(workflowType)),
+		sqltypes.NewVarChar("wf"),
+		sqltypes.NewInt64(0),
+		sqltypes.NewInt64(0),
+		sqltypes.NewVarBinary("{}"),
+	}
+	return &sqltypes.Result{Fields: fields, RowsAffected: 1, Rows: [][]sqltypes.Value{row}}
+}
+
+// recordingFailingDBClient records each ExecuteFetch query, then delegates to failingDBClient (which serves the standard setup queries).
+// NOTE(review): ExecuteFetchMulti is the promoted failingDBClient method and calls failingDBClient.ExecuteFetch directly, so multi-statement queries are not recorded.
+type recordingFailingDBClient struct {
+ failingDBClient
+ queries []string
+}
+
+func (c *recordingFailingDBClient) ExecuteFetch(query string, maxrows int) (*sqltypes.Result, error) {
+ c.queries = append(c.queries, query)
+ return c.failingDBClient.ExecuteFetch(query, maxrows)
+}
+
+// failingCommitDBClient embeds failingDBClient and overrides only Commit, for
+// exercising commit-failure paths (failingDBClient.Commit always returns nil).
+type failingCommitDBClient struct {
+ failingDBClient
+ commitErr error // returned by every Commit call
+}
+
+func (c *failingCommitDBClient) Commit() error { return c.commitErr }
+
+// TestCreateWorkerConnSetsReadCommitted pins that createWorkerConn issues
+// the READ COMMITTED isolation statement on every worker connection.
+// Rationale: the writeset scheduler models PK/unique/FK conflicts but not
+// InnoDB gap/next-key locks, which REPEATABLE READ takes even for point
+// operations on absent rows. With the commitLoop's strict ordering, a
+// later-ordered transaction's gap lock can block an earlier-ordered
+// transaction's INSERT until that earlier transaction commits, a deadlock
+// InnoDB's detector cannot see. READ COMMITTED takes no gap locks for
+// row-image application.
+func TestCreateWorkerConnSetsReadCommitted(t *testing.T) {
+	recording := &recordingFailingDBClient{}
+	stats := binlogplayer.NewStats()
+	stats.VReplicationLagGauges.Stop()
+	t.Cleanup(stats.Stop)
+	vr := &vreplicator{
+		id:             1,
+		stats:          stats,
+		dbClient:       newVDBClient(&failingDBClient{}, stats, 0),
+		workflowConfig: vttablet.InitVReplicationConfigDefaults(),
+		vre:            &Engine{dbClientFactoryFiltered: func() binlogplayer.DBClient { return recording }},
+	}
+	conn, err := createWorkerConn(t.Context(), vr)
+	require.NoError(t, err)
+	require.NotNil(t, conn)
+	defer conn.Close()
+
+	// Pin the SQL-standard statement form: the worker conn talks directly
+	// to the target mysqld (no vtgate sysvar rewriting) and the sysvar
+	// spelling is flavor-specific (MariaDB used tx_isolation until 11.1;
+	// MySQL only added transaction_isolation in 5.7.20). Lowercase keeps it
+	// consistent with the other session-setup statements and the framework's
+	// globalDBQueries filter (which skips lowercase "set ..." setup queries).
+	require.Contains(t, recording.queries, "set session transaction isolation level read committed",
+		"worker connections must run at READ COMMITTED to avoid gap-lock deadlocks through the commit order")
+}
diff --git a/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_writeset.go b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_writeset.go
new file mode 100644
index 00000000000..dc884676b50
--- /dev/null
+++ b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_writeset.go
@@ -0,0 +1,1211 @@
+/*
+Copyright 2026 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package vreplication
+
+import (
+ "encoding/binary"
+ "fmt"
+ "maps"
+ "strings"
+ "sync"
+ "sync/atomic"
+
+ "github.com/cespare/xxhash/v2"
+
+ "vitess.io/vitess/go/mysql/collations"
+ "vitess.io/vitess/go/mysql/collations/charset"
+ "vitess.io/vitess/go/mysql/collations/colldata"
+ "vitess.io/vitess/go/sqltypes"
+ "vitess.io/vitess/go/vt/vterrors"
+ "vitess.io/vitess/go/vt/vthash"
+
+ binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
+ querypb "vitess.io/vitess/go/vt/proto/query"
+ vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"
+)
+
+var (
+ writesetTextValueMarker = [2]byte{0xFF, 0x00} // prefix of the collation-hashed text payload built in writesetDigestAddFieldValue
+ // writesetKeySeparator separates the table name from the key values in
+ // the digest. A package-level array (Go has no []byte constants) so
+ // writesetDigestInit never allocates for it.
+ writesetKeySeparator = [1]byte{':'}
+)
+
+// fieldIndexForName resolves a column-name lookup in a field-index map by
+// trying the exact spelling first and falling back to lowercase; it returns
+// the field position and whether either spelling matched. The maps hold both
+// variants to bridge the case gap between sqlparser output and raw binlog field names.
+func fieldIndexForName(fieldIdx map[string]int, colName string) (int, bool) {
+ if idx, ok := fieldIdx[colName]; ok {
+ return idx, true
+ }
+ idx, ok := fieldIdx[strings.ToLower(colName)] // a miss yields (0, false)
+ return idx, ok
+}
+
+// writesetDigestAddPayload writes a length-prefixed payload into the digest:
+// an 8-byte little-endian byte count followed by the payload itself. The
+// length prefix keeps concatenated payloads unambiguous, so the same bytes
+// split at different boundaries feed different input to the hash.
+func writesetDigestAddPayload(d *xxhash.Digest, payload []byte) {
+ var scratch [8]byte
+ binary.LittleEndian.PutUint64(scratch[:], uint64(len(payload)))
+ d.Write(scratch[:])
+ d.Write(payload)
+}
+
+// writesetDigestInit resets d and seeds it with the table name followed by
+// a ':' separator. Callers declare a stack-local xxhash.Digest and pass its
+// address to avoid heap allocation. xxhash provides better throughput than
+// FNV-1a for writeset keys with multiple PK columns.
+//
+// NOTE on collisions: writeset keys are 64-bit xxhash digests. A hash
+// collision between two unrelated keys creates a FALSE conflict (needless
+// serialization) — never a missed one — so collisions degrade throughput,
+// not correctness.
+func writesetDigestInit(d *xxhash.Digest, tableName string) {
+ d.Reset()
+ d.WriteString(tableName) // written raw (no length prefix); only the separator delimits it
+ d.Write(writesetKeySeparator[:])
+}
+
+// writesetDigestAddValue folds a sqltypes.Value into the digest as an 8-byte
+// little-endian length prefix (2+len(raw)), then its type discriminator (2
+// bytes, little-endian), then its raw bytes. querypb.Type is a 16-bit enum
+// and using a 1-byte discriminator would let future types whose low byte
+// collides (e.g. a hypothetical Type=N and Type=N+256) hash to the same key.
+// NOTE(review): equal keys produce a false conflict (needless serialization), not a missed one — confirm intent.
+func writesetDigestAddValue(d *xxhash.Digest, v sqltypes.Value) {
+ var scratch [8]byte
+ raw := v.Raw()
+ binary.LittleEndian.PutUint64(scratch[:], uint64(2+len(raw)))
+ d.Write(scratch[:])
+ binary.LittleEndian.PutUint16(scratch[:2], uint16(v.Type())) // reuses the first two scratch bytes; the length was already written
+ d.Write(scratch[:2])
+ d.Write(raw)
+}
+
+// writesetDigestAddFieldValue folds a column value into the digest using
+// collation-aware hashing for text columns: rows that MySQL considers equal
+// (trailing spaces under PAD SPACE, equivalent forms under *_ci collations)
+// must hash to the same writeset key or a real conflict would go undetected.
+// Returns FAILED_PRECONDITION when field.Charset does not resolve to a known collation.
+func writesetDigestAddFieldValue(d *xxhash.Digest, field *querypb.Field, v sqltypes.Value) error {
+ if field == nil || !sqltypes.IsText(field.Type) || field.Charset == 0 { // no usable collation info: hash type + raw bytes
+ writesetDigestAddValue(d, v)
+ return nil
+ }
+
+ collation := colldata.Lookup(collations.ID(field.Charset))
+ if collation == nil {
+ return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "unknown collation %d for field %s", field.Charset, field.Name)
+ }
+
+ raw := v.Raw()
+ if collationUsesPadSpace(collation) {
+ raw = trimTrailingPadSpaceCodepoints(collation.Charset(), raw)
+ }
+
+ var semanticHash vthash.Hasher
+ semanticHash.Reset()
+ collation.Hash(&semanticHash, raw, 0)
+
+ // Fixed-size stack buffer: text marker followed by the 8-byte little-endian collation hash.
+ var payload [len(writesetTextValueMarker) + 8]byte
+ copy(payload[:], writesetTextValueMarker[:])
+ binary.LittleEndian.PutUint64(payload[len(writesetTextValueMarker):], semanticHash.Sum64())
+ writesetDigestAddPayload(d, payload[:]) // only the collation hash is folded in, never the raw text bytes
+ return nil
+}
+
+// collationUsesPadSpace reports whether the given collation compares strings
+// as if right-padded with spaces. Values under such collations have trailing
+// pad codepoints stripped before hashing so that e.g. 'abc' and 'abc '
+// hash to the same writeset key. Only the two 0900 types below report false.
+func collationUsesPadSpace(collation colldata.Collation) bool {
+ switch collation.(type) {
+ case *colldata.Collation_utf8mb4_uca_0900, *colldata.Collation_utf8mb4_0900_bin:
+ return false
+ default: // every other collation type is treated as PAD SPACE
+ return true
+ }
+}
+
+// trimTrailingPadSpaceCodepoints strips trailing space codepoints from raw
+// bytes using the column's charset decoder, so values that compare equal under
+// PAD SPACE also hash equal. Returns raw untrimmed if the decoder reports a non-positive size.
+func trimTrailingPadSpaceCodepoints(cs charset.Charset, raw []byte) []byte {
+ trimmedEnd := 0 // byte offset just past the last non-space codepoint seen
+ for i := 0; i < len(raw); {
+ r, size := cs.DecodeRune(raw[i:])
+ if size <= 0 {
+ return raw
+ }
+ i += size
+ if r != ' ' {
+ trimmedEnd = i
+ }
+ }
+ return raw[:trimmedEnd] // shares raw's backing array; empty when raw is all spaces
+}
+
+// fkConstraintRef represents one foreign key constraint on a table.
+// It maps one or more child columns to a parent table, allowing the
+// parallel apply writeset to include FK reference keys that conflict
+// with the parent table's writeset keys.
+type fkConstraintRef struct {
+ ParentTable string // referenced parent table name
+ ChildColumnNames []string // child column names, in FK ordinal order
+ ReferencedColumnNames []string // parent column names, in FK ordinal order; entry k pairs with ChildColumnNames[k]
+}
+
+// parentFKRef represents a foreign key constraint from the parent table's
+// perspective. When a parent row changes, we generate writeset keys using the
+// referenced column values so they conflict with child-side FK keys.
+type parentFKRef struct {
+ ParentTable string // the parent table name (same as the table being modified)
+ ReferencedColumnNames []string // parent column names referenced by the FK, in the order taken from the child-side ref
+}
+
+// buildParentFKRefs builds a reverse map from parent table name to the FK
+// constraints that reference it, so parent-side rows can generate writeset
+// keys that match child-side FK keys. Returns nil when fkRefs is empty.
+func buildParentFKRefs(fkRefs map[string][]fkConstraintRef) map[string][]parentFKRef {
+ if len(fkRefs) == 0 {
+ return nil
+ }
+ result := make(map[string][]parentFKRef)
+ for _, refs := range fkRefs {
+ for _, ref := range refs {
+ result[ref.ParentTable] = append(result[ref.ParentTable], parentFKRef{ // keyed by ParentTable verbatim, no case folding
+ ParentTable: ref.ParentTable,
+ ReferencedColumnNames: ref.ReferencedColumnNames, // slice is shared with the input ref, not copied
+ })
+ }
+ }
+ return result
+}
+
+// buildCanonicalTargetTableNames builds a lowercase→original-case map of
+// target table names so canonicalTargetTableName can line up FK-graph lookups
+// with the various case variants that arrive from DDL, binlog events, and
+// replicator plans. Nil plans and empty TargetNames are skipped; a lowercase
+// key shared by two differently-cased target names is dropped rather than
+// silently picking one. Returns nil when no unambiguous entry remains.
+func buildCanonicalTargetTableNames(tablePlans map[string]*TablePlan) map[string]string {
+ if len(tablePlans) == 0 {
+ return nil
+ }
+ canonical := make(map[string]string, len(tablePlans))
+ ambiguous := make(map[string]struct{}) // lowercase keys already seen with conflicting casings
+ for _, plan := range tablePlans {
+ if plan == nil || plan.TargetName == "" {
+ continue
+ }
+ key := strings.ToLower(plan.TargetName)
+ if _, ok := ambiguous[key]; ok {
+ continue
+ }
+ if existing, ok := canonical[key]; ok {
+ if existing != plan.TargetName { // same lowercase key, different casing: drop it for good
+ delete(canonical, key)
+ ambiguous[key] = struct{}{}
+ }
+ continue
+ }
+ canonical[key] = plan.TargetName
+ }
+ if len(canonical) == 0 {
+ return nil
+ }
+ return canonical
+}
+
+// canonicalTargetTableName resolves a possibly case-varying name to the
+// target-table spelling recorded in canonical (keyed by lowercase name).
+// Returns the input unchanged when name is empty, canonical is empty, or no
+// entry matches, so lookups miss cleanly rather than silently hitting a sibling table.
+func canonicalTargetTableName(name string, canonical map[string]string) string {
+ if name == "" || len(canonical) == 0 {
+ return name
+ }
+ if resolved, ok := canonical[strings.ToLower(name)]; ok {
+ return resolved
+ }
+ return name
+}
+
+// resolveFKRefsForTable collects FK constraints whose child table matches the
+// given name (compared canonically). Returned refs have their ParentTable
+// canonicalized so the writeset digest for a child row hashes under the same
+// table-name key as the parent's writeset, which is what makes the two sides
+// conflict. Returns nil for an empty tableName, empty refs, or no match.
+func resolveFKRefsForTable(tableName string, refs map[string][]fkConstraintRef, canonical map[string]string) []fkConstraintRef {
+ if tableName == "" || len(refs) == 0 {
+ return nil
+ }
+ resolvedTableName := canonicalTargetTableName(tableName, canonical)
+ var resolved []fkConstraintRef
+ for name, tableRefs := range refs {
+ if canonicalTargetTableName(name, canonical) != resolvedTableName {
+ continue
+ }
+ start := len(resolved)
+ resolved = append(resolved, tableRefs...) // copies the ref structs, so the rewrite below leaves refs' entries untouched
+ for i := start; i < len(resolved); i++ {
+ resolved[i].ParentTable = canonicalTargetTableName(resolved[i].ParentTable, canonical)
+ }
+ }
+ return resolved
+}
+
+// resolveParentFKRefsForTable is the parent-side counterpart to
+// resolveFKRefsForTable: it collects the parentFKRefs whose map key matches
+// tableName canonically and canonicalizes each copy's ParentTable, so parent
+// row changes hash under the same table name as the child-side FK keys.
+func resolveParentFKRefsForTable(tableName string, refs map[string][]parentFKRef, canonical map[string]string) []parentFKRef {
+ if tableName == "" || len(refs) == 0 {
+ return nil
+ }
+ resolvedTableName := canonicalTargetTableName(tableName, canonical)
+ var resolved []parentFKRef
+ for name, tableRefs := range refs {
+ if canonicalTargetTableName(name, canonical) != resolvedTableName {
+ continue
+ }
+ start := len(resolved)
+ resolved = append(resolved, tableRefs...) // copies the ref structs, so the rewrite below leaves refs' entries untouched
+ for i := start; i < len(resolved); i++ {
+ resolved[i].ParentTable = canonicalTargetTableName(resolved[i].ParentTable, canonical)
+ }
+ }
+ return resolved
+}
+
+// buildResolvedFKRefTableSet returns the set of canonicalized table names
+// that participate in any FK edge, as either child or parent. The scheduler
+// uses this set to decide which tables' touched-row bookkeeping must follow
+// FK-induced conflicts across the txn graph. Returns nil when the set is empty.
+func buildResolvedFKRefTableSet(refs map[string][]fkConstraintRef, parentRefs map[string][]parentFKRef, canonical map[string]string) map[string]struct{} {
+ if len(refs) == 0 && len(parentRefs) == 0 {
+ return nil
+ }
+ resolved := make(map[string]struct{}, len(refs)+len(parentRefs))
+ for name, tableRefs := range refs {
+ if len(tableRefs) == 0 { // a key with no constraints contributes nothing
+ continue
+ }
+ resolved[canonicalTargetTableName(name, canonical)] = struct{}{}
+ }
+ for name, tableRefs := range parentRefs {
+ if len(tableRefs) == 0 {
+ continue
+ }
+ resolved[canonicalTargetTableName(name, canonical)] = struct{}{}
+ }
+ if len(resolved) == 0 {
+ return nil
+ }
+ return resolved
+}
+
+type txnWritesetCache struct { // per-table lookups memoized across buildTxnWritesetWithCache calls; every field is filled lazily
+ fieldIdxCache map[string]map[string]int // source table name → field name (exact and lowercase) → field position
+ canonicalTargetNames map[string]string // lowercase target name → target name, from buildCanonicalTargetTableNames
+ resolvedFKRefs map[string][]fkConstraintRef // target table name → child-side FK refs with canonical ParentTable
+ resolvedParentRefs map[string][]parentFKRef // target table name → parent-side FK refs with canonical ParentTable
+ // identityIdxCache caches, per source table name, the plan's identity
+ // column positions resolved against the streamed fields. The list is
+ // stable per plan, so resolving it per row change would allocate a
+ // slice per change for composite-identity tables.
+ identityIdxCache map[string][]int
+ // planByTarget maps canonical target table names to their plans, for
+ // FK parent lookups (tablePlans itself is keyed by SOURCE table name).
+ planByTarget map[string]*TablePlan
+ // fkStreamedValidated records child source-table names whose FK refs
+ // have been validated against their parent plans' streamed metadata.
+ fkStreamedValidated map[string]struct{}
+ // relevantColsCache caches, per source table name, the set of column
+ // indexes the writeset depends on (PK plus FK-joined columns). Building
+ // it is O(columns + FK refs) with a map allocation, which is too
+ // expensive to repeat for every row change in the hot path.
+ relevantColsCache map[string]map[int]struct{}
+ // uniqueKeyIdxCache caches, per source table name, the resolved field
+ // positions of each hashable unique secondary index (plan.UniqueKeyColumns),
+ // so the column-name lookups happen once per table instead of per change.
+ uniqueKeyIdxCache map[string][][]int
+}
+
+// writesetKeysForParentFKRef adds to keySet at most one key per row image
+// (before, then after) of a parent row, hashing parentTable:referencedColValues
+// so it matches the child-side FK key hash. An empty image or a NULL referenced
+// value emits no key. Returns an error if an FK column is missing from (or out of range
+// of) the streamed fields, so the caller can force serialization instead of silently dropping the edge.
+func writesetKeysForParentFKRef(ref *parentFKRef, fields []*querypb.Field, fieldIdx map[string]int, beforeVals, afterVals []sqltypes.Value, keySet map[uint64]struct{}) error {
+ appendKey := func(vals []sqltypes.Value) error {
+ if len(vals) == 0 {
+ return nil
+ }
+ var d xxhash.Digest
+ writesetDigestInit(&d, ref.ParentTable) // NOTE(review): ref has no nil guard here, unlike writesetKeysForFKRef
+ for _, colName := range ref.ReferencedColumnNames {
+ idx, ok := fieldIndexForName(fieldIdx, colName)
+ if !ok {
+ return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "FK referenced column %q not in streamed fields for parent table %s", colName, ref.ParentTable)
+ }
+ if idx >= len(fields) {
+ return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "FK referenced column %q index %d out of range for parent table fields %s", colName, idx, ref.ParentTable)
+ }
+ if idx >= len(vals) {
+ return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "FK referenced column %q index %d out of range for parent table %s", colName, idx, ref.ParentTable)
+ }
+ val := vals[idx]
+ if val.IsNull() { // mirrors the child side, which emits no key when an FK column is NULL
+ return nil
+ }
+ if err := writesetDigestAddFieldValue(&d, fields[idx], val); err != nil {
+ return err
+ }
+ }
+ keySet[d.Sum64()] = struct{}{}
+ return nil
+ }
+ if err := appendKey(beforeVals); err != nil {
+ return err
+ }
+ return appendKey(afterVals)
+}
+
+// writesetKeysForFKRef generates writeset keys based on a foreign key constraint.
+// For each row image (before and after), it looks up the child column values and
+// produces a hash keyed on the parent table name and FK column values — the same
+// digest shape writesetKeysForParentFKRef emits for the parent row — forcing
+// serialization of dependent txns. A nil ref or an empty image emits nothing.
+// Returns an error if FK columns are missing from the streamed field list.
+func writesetKeysForFKRef(ref *fkConstraintRef, fields []*querypb.Field, fieldIdx map[string]int, beforeVals, afterVals []sqltypes.Value, keySet map[uint64]struct{}) error {
+ if ref == nil {
+ return nil
+ }
+ appendFKKey := func(vals []sqltypes.Value) error {
+ if len(vals) == 0 {
+ return nil
+ }
+ var d xxhash.Digest
+ writesetDigestInit(&d, ref.ParentTable) // keyed on the PARENT table name, not the child's
+ for _, colName := range ref.ChildColumnNames {
+ idx, ok := fieldIndexForName(fieldIdx, colName)
+ if !ok {
+ return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "FK child column %q not in streamed fields for table referencing %s", colName, ref.ParentTable)
+ }
+ if idx >= len(fields) {
+ return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "FK child column %q index %d out of range for table fields referencing %s", colName, idx, ref.ParentTable)
+ }
+ if idx >= len(vals) {
+ return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "FK child column %q index %d out of range for table referencing %s", colName, idx, ref.ParentTable)
+ }
+ val := vals[idx]
+ // In MySQL, if any referencing column in an FK is NULL, the FK
+ // constraint is not enforced for that row. Skip generating a
+ // writeset key in that case to avoid artificial conflicts.
+ if val.IsNull() {
+ return nil
+ }
+ if err := writesetDigestAddFieldValue(&d, fields[idx], val); err != nil {
+ return err
+ }
+ }
+ keySet[d.Sum64()] = struct{}{}
+ return nil
+ }
+ if err := appendFKKey(beforeVals); err != nil {
+ return err
+ }
+ return appendFKKey(afterVals)
+}
+
+// writesetKeysForUniqueKey emits conflict keys for one hashable unique
+// secondary index, for both row images, mirroring MySQL's WRITESET tracking.
+// A NULL in any key column emits no key for that image: MySQL unique indexes
+// permit multiple NULLs, so a NULL-valued key cannot conflict with anything.
+// The index ordinal is folded into the digest so different indexes on the
+// same table produce distinct key spaces (a cross-index hash collision would
+// only over-serialize, but unambiguous inputs are cheap). Errors if a column index exceeds the image.
+func writesetKeysForUniqueKey(tableName string, ordinal int, colIdxs []int, fields []*querypb.Field, beforeVals, afterVals []sqltypes.Value, keySet map[uint64]struct{}) error {
+ appendKey := func(vals []sqltypes.Value) error {
+ if len(vals) == 0 {
+ return nil
+ }
+ var d xxhash.Digest
+ writesetDigestInit(&d, tableName)
+ var ordinalScratch [8]byte
+ binary.LittleEndian.PutUint64(ordinalScratch[:], uint64(ordinal))
+ writesetDigestAddPayload(&d, ordinalScratch[:])
+ for _, idx := range colIdxs {
+ if idx >= len(vals) {
+ return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "unique key index out of range for %s", tableName)
+ }
+ val := vals[idx]
+ // A NULL key column cannot conflict: MySQL unique indexes permit
+ // multiple NULLs. Emit no key for this image.
+ if val.IsNull() {
+ return nil
+ }
+ var field *querypb.Field // stays nil when idx has no field entry; the value is then hashed as type + raw bytes
+ if idx < len(fields) {
+ field = fields[idx]
+ }
+ if err := writesetDigestAddFieldValue(&d, field, val); err != nil {
+ return err
+ }
+ }
+ keySet[d.Sum64()] = struct{}{}
+ return nil
+ }
+ if err := appendKey(beforeVals); err != nil {
+ return err
+ }
+ return appendKey(afterVals)
+}
+
+// writesetFieldsHashCompatible reports whether two streamed fields produce
+// identical digests for logically-equal values. writesetDigestAddFieldValue
+// hashes text fields via their collation (driven by Field.Charset) and
+// everything else as a 2-byte type discriminator plus raw bytes — so the FK
+// child/parent hash equality the scheduler relies on requires matching
+// textness, charset (text), or exact type (non-text). A nil field is never compatible.
+func writesetFieldsHashCompatible(a, b *querypb.Field) bool {
+ if a == nil || b == nil {
+ return false
+ }
+ aText := sqltypes.IsText(a.Type) && a.Charset != 0 // same "text" test writesetDigestAddFieldValue applies
+ bText := sqltypes.IsText(b.Type) && b.Charset != 0
+ if aText != bText {
+ return false
+ }
+ if aText {
+ return a.Charset == b.Charset
+ }
+ return a.Type == b.Type
+}
+
+// validateFKStreamedFieldCompatibility fails closed when a child table's FK
+// columns and the parent's referenced columns have hash-incompatible STREAMED
+// field metadata. queryFKRefs validates the TARGET schema, but the digests
+// are computed from the SOURCE (FIELD-event) metadata, which can diverge for
+// target-only FKs (e.g. source child latin1 vs source parent utf8mb4, or INT
+// vs BIGINT): equal logical values would then hash to different keys and the
+// child/parent transactions could reorder. Parents whose plan has not been
+// streamed yet are skipped — they cannot generate parent-side keys until
+// their FIELD event arrives, and FIELD-bearing transactions serialize.
+func validateFKStreamedFieldCompatibility(childPlan *TablePlan, childFieldIdx map[string]int, refs []fkConstraintRef, cache *txnWritesetCache, tablePlans map[string]*TablePlan) error {
+ if len(refs) == 0 {
+ return nil
+ }
+ var planByTarget map[string]*TablePlan
+ if cache != nil && cache.planByTarget != nil {
+ planByTarget = cache.planByTarget
+ } else {
+ planByTarget = make(map[string]*TablePlan, len(tablePlans)) // keyed by plan.TargetName, exact spelling
+ for _, plan := range tablePlans {
+ if plan != nil && plan.TargetName != "" {
+ planByTarget[plan.TargetName] = plan
+ }
+ }
+ if cache != nil {
+ cache.planByTarget = planByTarget // NOTE(review): reused as-is on later calls; presumably reset elsewhere when tablePlans changes — confirm
+ }
+ }
+ for i := range refs {
+ ref := &refs[i]
+ parentPlan := planByTarget[ref.ParentTable]
+ if parentPlan == nil || len(parentPlan.Fields) == 0 { // parent not streamed yet: nothing to compare against
+ continue
+ }
+ parentFieldIdx := make(map[string]int, len(parentPlan.Fields)) // rebuilt per ref; exact and lowercase names
+ for j, f := range parentPlan.Fields {
+ if f == nil {
+ continue
+ }
+ parentFieldIdx[f.Name] = j
+ parentFieldIdx[strings.ToLower(f.Name)] = j
+ }
+ for k, childCol := range ref.ChildColumnNames {
+ if k >= len(ref.ReferencedColumnNames) { // child/parent column lists pair up by position
+ break
+ }
+ childIdx, ok := fieldIndexForName(childFieldIdx, childCol)
+ if !ok || childIdx >= len(childPlan.Fields) {
+ continue // missing columns are caught by the key emitters
+ }
+ parentIdx, ok := fieldIndexForName(parentFieldIdx, ref.ReferencedColumnNames[k])
+ if !ok || parentIdx >= len(parentPlan.Fields) {
+ continue
+ }
+ if !writesetFieldsHashCompatible(childPlan.Fields[childIdx], parentPlan.Fields[parentIdx]) {
+ return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION,
+ "FK streamed field metadata mismatch between child column %q and parent %s.%q: forcing serialization",
+ childCol, ref.ParentTable, ref.ReferencedColumnNames[k])
+ }
+ }
+ }
+ return nil
+}
+
+// buildTxnWriteset builds writeset keys for the given events.
+// fieldIdxCaches is variadic; only its first element is used, as an optional
+// cache of field-name→index maps shared across transactions on the same
+// scheduler goroutine. Omit it or pass nil to use a local cache (e.g. in tests).
+func buildTxnWriteset(tablePlans map[string]*TablePlan, fkRefs map[string][]fkConstraintRef, parentRefs map[string][]parentFKRef, events []*binlogdatapb.VEvent, fieldIdxCaches ...map[string]map[string]int) ([]uint64, error) {
+ var cache *txnWritesetCache
+ if len(fieldIdxCaches) > 0 && fieldIdxCaches[0] != nil {
+ cache = &txnWritesetCache{fieldIdxCache: fieldIdxCaches[0]} // fresh wrapper per call: only the fieldIdx map outlives it
+ }
+ return buildTxnWritesetWithCache(tablePlans, fkRefs, parentRefs, events, cache)
+}
+
+// buildTxnWritesetWithCache is the cache-aware core of buildTxnWriteset.
+// It returns the deduplicated writeset keys (in unspecified order; nil when
+// none) for the ROW events in events; other event types are ignored. A non-nil
+// cache shares canonical-name, FK-resolution, and per-table index maps across
+// calls. Fails closed (returns an error) on a missing table plan, partial row
+// images, or missing FK columns so the caller can take the serial path instead.
+func buildTxnWritesetWithCache(tablePlans map[string]*TablePlan, fkRefs map[string][]fkConstraintRef, parentRefs map[string][]parentFKRef, events []*binlogdatapb.VEvent, cache *txnWritesetCache) ([]uint64, error) {
+ // Pre-estimate capacity to avoid map rehashing during key insertion.
+ // Each row change can produce ~2 keys (before + after).
+ estimated := 0
+ for _, event := range events {
+ if event.Type == binlogdatapb.VEventType_ROW && event.RowEvent != nil {
+ estimated += 2 * len(event.RowEvent.RowChanges)
+ }
+ }
+ keySet := make(map[uint64]struct{}, estimated) // set semantics: duplicate keys within the txn collapse
+ needResolvedFKRefs := len(fkRefs) > 0 || len(parentRefs) > 0
+ var canonicalTargetNames map[string]string
+ var resolvedFKRefs map[string][]fkConstraintRef
+ var resolvedParentRefs map[string][]parentFKRef
+ if needResolvedFKRefs {
+ if cache != nil {
+ canonicalTargetNames = cache.canonicalTargetNames
+ resolvedFKRefs = cache.resolvedFKRefs
+ resolvedParentRefs = cache.resolvedParentRefs
+ }
+ if canonicalTargetNames == nil {
+ canonicalTargetNames = buildCanonicalTargetTableNames(tablePlans)
+ if cache != nil {
+ cache.canonicalTargetNames = canonicalTargetNames
+ }
+ }
+ if resolvedFKRefs == nil {
+ resolvedFKRefs = make(map[string][]fkConstraintRef)
+ if cache != nil {
+ cache.resolvedFKRefs = resolvedFKRefs
+ }
+ }
+ if resolvedParentRefs == nil {
+ resolvedParentRefs = make(map[string][]parentFKRef)
+ if cache != nil {
+ cache.resolvedParentRefs = resolvedParentRefs
+ }
+ }
+ }
+ var fieldIdxCache map[string]map[string]int
+ if cache != nil && cache.fieldIdxCache != nil {
+ fieldIdxCache = cache.fieldIdxCache
+ } else {
+ fieldIdxCache = map[string]map[string]int{} // local to this call: not stored back on cache
+ }
+ for _, event := range events {
+ if event.Type != binlogdatapb.VEventType_ROW {
+ continue
+ }
+ rowEvent := event.RowEvent
+ if rowEvent == nil {
+ continue
+ }
+ // tablePlans is keyed by the FIELD event's source TableName. We rely
+ // on vstreamer emitting identical-case TableName for both the FIELD
+ // and the subsequent ROW events of the same table — they share a
+ // single per-stream cache. We do NOT canonicalize via
+ // canonicalTargetTableName here because that operates on TARGET
+ // names; the SOURCE-name space is independent and a case-insensitive
+ // fold could conflate distinct source tables on case-sensitive
+ // filesystems (lower_case_table_names=0).
+ plan := tablePlans[rowEvent.TableName]
+ if plan == nil {
+ return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "missing table plan for %s", rowEvent.TableName)
+ }
+ targetTableName := plan.TargetName
+ var refs []fkConstraintRef
+ var pRefs []parentFKRef
+ if needResolvedFKRefs {
+ var ok bool
+ refs, ok = resolvedFKRefs[targetTableName]
+ if !ok {
+ refs = resolveFKRefsForTable(targetTableName, fkRefs, canonicalTargetNames)
+ resolvedFKRefs[targetTableName] = refs // stored even when nil, so a table without FKs is resolved only once
+ }
+ pRefs, ok = resolvedParentRefs[targetTableName]
+ if !ok {
+ pRefs = resolveParentFKRefsForTable(targetTableName, parentRefs, canonicalTargetNames)
+ resolvedParentRefs[targetTableName] = pRefs
+ }
+ }
+ // Build fieldIdx once per table for FK, composite identity, and
+ // unique-key column lookups; it stays nil when none of those apply.
+ var fieldIdx map[string]int
+ if len(refs) > 0 || len(pRefs) > 0 || len(plan.IdentityColumns) > 1 || len(plan.UniqueKeyColumns) > 0 {
+ var ok bool
+ fieldIdx, ok = fieldIdxCache[rowEvent.TableName]
+ if !ok {
+ fieldIdx = make(map[string]int, len(plan.Fields))
+ for i, f := range plan.Fields {
+ fieldIdx[f.Name] = i // NOTE(review): f is not nil-checked here, unlike validateFKStreamedFieldCompatibility
+ fieldIdx[strings.ToLower(f.Name)] = i
+ }
+ fieldIdxCache[rowEvent.TableName] = fieldIdx
+ }
+ }
+ // Fail closed when this child's FK columns and the parent's
+ // referenced columns have hash-incompatible STREAMED metadata
+ // (validated once per child table while its cache entry lives).
+ // NOTE(review): a pass that skipped an unstreamed parent is recorded too, so re-checking it needs fkStreamedValidated cleared elsewhere — confirm.
+ if len(refs) > 0 {
+ validated := false
+ if cache != nil {
+ _, validated = cache.fkStreamedValidated[rowEvent.TableName]
+ }
+ if !validated {
+ if err := validateFKStreamedFieldCompatibility(plan, fieldIdx, refs, cache, tablePlans); err != nil {
+ return nil, err
+ }
+ if cache != nil {
+ if cache.fkStreamedValidated == nil {
+ cache.fkStreamedValidated = make(map[string]struct{})
+ }
+ cache.fkStreamedValidated[rowEvent.TableName] = struct{}{}
+ }
+ }
+ }
+ // Resolve the plan's identity column positions once per table.
+ var identityIndexes []int
+ if cache != nil {
+ if cache.identityIdxCache == nil {
+ cache.identityIdxCache = make(map[string][]int)
+ }
+ var ok bool
+ identityIndexes, ok = cache.identityIdxCache[rowEvent.TableName]
+ if !ok {
+ var err error
+ identityIndexes, err = writesetIdentityFieldIndexes(plan, targetTableName, fieldIdx)
+ if err != nil {
+ return nil, err
+ }
+ cache.identityIdxCache[rowEvent.TableName] = identityIndexes
+ }
+ } else {
+ var err error
+ identityIndexes, err = writesetIdentityFieldIndexes(plan, targetTableName, fieldIdx)
+ if err != nil {
+ return nil, err
+ }
+ }
+ // Resolve the hashable unique secondary indexes' field positions once
+ // per table.
+ var uniqueKeyIndexes [][]int
+ if cache != nil {
+ if cache.uniqueKeyIdxCache == nil {
+ cache.uniqueKeyIdxCache = make(map[string][][]int)
+ }
+ var ok bool
+ uniqueKeyIndexes, ok = cache.uniqueKeyIdxCache[rowEvent.TableName]
+ if !ok {
+ var err error
+ uniqueKeyIndexes, err = writesetUniqueKeyFieldIndexes(plan, targetTableName, fieldIdx)
+ if err != nil {
+ return nil, err
+ }
+ cache.uniqueKeyIdxCache[rowEvent.TableName] = uniqueKeyIndexes
+ }
+ } else {
+ var err error
+ uniqueKeyIndexes, err = writesetUniqueKeyFieldIndexes(plan, targetTableName, fieldIdx)
+ if err != nil {
+ return nil, err
+ }
+ }
+ // Resolve the writeset-relevant column set once per table.
+ var relevantCols map[int]struct{}
+ if cache != nil {
+ if cache.relevantColsCache == nil {
+ cache.relevantColsCache = make(map[string]map[int]struct{})
+ }
+ var ok bool
+ relevantCols, ok = cache.relevantColsCache[rowEvent.TableName]
+ if !ok {
+ relevantCols = writesetRelevantColumns(plan, fieldIdx, refs, pRefs)
+ cache.relevantColsCache[rowEvent.TableName] = relevantCols
+ }
+ } else {
+ relevantCols = writesetRelevantColumns(plan, fieldIdx, refs, pRefs)
+ }
+ for _, change := range rowEvent.RowChanges {
+ // Partial row images (DataColumns/JsonPartialValues) omit columns
+ // from the binlog payload. buildTxnWriteset decodes rows with
+ // sqltypes.MakeRowTrusted(plan.Fields, change.Before/After), which
+ // treats the streamed values as positional and ignores the bitmaps.
+ // That makes both PK and FK hashing unsafe: omitted columns can
+ // shift later values into the wrong field slots. BEFORE images are
+ // ambiguous too: vstreamer can encode omitted columns as -1 lengths,
+ // but it only publishes DataColumns for AFTER rows.
+ // Fail closed until writeset reconstruction becomes bitmap-aware.
+ isPartialRow := change.DataColumns != nil || change.JsonPartialValues != nil
+ if !isPartialRow && plan.Fields != nil {
+ // Use != (not <) so an over-sized row image — which can arise
+ // from a stale plan that's missing a column the source still
+ // streams — also fails closed instead of running MakeRowTrusted
+ // past the end of plan.Fields and nil-derefing.
+ isPartialRow = (change.Before != nil && len(change.Before.Lengths) != len(plan.Fields)) ||
+ (change.After != nil && len(change.After.Lengths) != len(plan.Fields))
+ }
+ if !isPartialRow {
+ isPartialRow = rowHasNegativeRelevantLengths(change.Before, relevantCols) ||
+ rowHasNegativeRelevantLengths(change.After, relevantCols)
+ }
+ if isPartialRow {
+ return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "partial row image on table %s: forcing serialization", rowEvent.TableName)
+ }
+ // Decode Before/After row values once per change; an image stays nil when absent or when plan.Fields is nil.
+ var beforeVals, afterVals []sqltypes.Value
+ if change.Before != nil && plan.Fields != nil {
+ beforeVals = sqltypes.MakeRowTrusted(plan.Fields, change.Before)
+ }
+ if change.After != nil && plan.Fields != nil {
+ afterVals = sqltypes.MakeRowTrusted(plan.Fields, change.After)
+ }
+ if err := writesetKeysForChangeWithFieldIdx(plan, targetTableName, identityIndexes, beforeVals, afterVals, keySet); err != nil {
+ return nil, err
+ }
+ for ord, colIdxs := range uniqueKeyIndexes {
+ if err := writesetKeysForUniqueKey(targetTableName, ord, colIdxs, plan.Fields, beforeVals, afterVals, keySet); err != nil {
+ return nil, err
+ }
+ }
+ for i := range refs {
+ if err := writesetKeysForFKRef(&refs[i], plan.Fields, fieldIdx, beforeVals, afterVals, keySet); err != nil {
+ return nil, err
+ }
+ }
+ // Parent-side: generate FK-aware keys using the referenced columns
+ // so parent row changes conflict with child FK keys.
+ for i := range pRefs {
+ if err := writesetKeysForParentFKRef(&pRefs[i], plan.Fields, fieldIdx, beforeVals, afterVals, keySet); err != nil {
+ return nil, err
+ }
+ }
+ }
+ }
+ if len(keySet) == 0 {
+ return nil, nil
+ }
+ keys := make([]uint64, 0, len(keySet))
+ for key := range keySet { // map iteration: key order is unspecified
+ keys = append(keys, key)
+ }
+ return keys, nil
+}
+
+// writesetRelevantColumns returns the set of field positions whose values feed
+// the writeset for one table plan: every position flagged in plan.PKIndices
+// plus every FK column (child side from refs, referenced side from pRefs)
+// that resolves through fieldIdx; unresolvable names are skipped. Callers
+// cache the result per table (see txnWritesetCache.relevantColsCache) so it
+// is built once per table per fetch rather than once per row change.
+//
+// Hashable unique-secondary columns are deliberately left out. A -1 length on
+// a relevant column trips the partial-image guard and forces serialization,
+// yet a NULL in a full row image is also encoded as -1. Unique-secondary
+// columns are commonly nullable and a NULL unique value cannot conflict
+// (MySQL permits multiple NULLs), so writesetKeysForUniqueKey skips it;
+// including them here would force-serialize every NULL unique value.
+func writesetRelevantColumns(plan *TablePlan, fieldIdx map[string]int, refs []fkConstraintRef, pRefs []parentFKRef) map[int]struct{} {
+	relevant := make(map[int]struct{})
+	// markNamed records the position of every column name that resolves.
+	markNamed := func(names []string) {
+		for _, name := range names {
+			if pos, found := fieldIndexForName(fieldIdx, name); found {
+				relevant[pos] = struct{}{}
+			}
+		}
+	}
+	for pos := range plan.PKIndices {
+		if plan.PKIndices[pos] {
+			relevant[pos] = struct{}{}
+		}
+	}
+	for i := range refs {
+		markNamed(refs[i].ChildColumnNames)
+	}
+	for i := range pRefs {
+		markNamed(pRefs[i].ReferencedColumnNames)
+	}
+	return relevant
+}
+
+// rowHasNegativeRelevantLengths reports whether row has a negative (-1) length
+// at any position listed in relevantColumns (PK or FK-joined columns); a nil
+// row never matches. vstreamer encodes omitted columns as -1 without a
+// DataColumns bitmap on BEFORE rows, and bitmap-less partial-image producers
+// can do the same on AFTER rows, so a hit is treated as a partial image: the
+// caller fails closed and serializes instead of hashing a wrong-slot value.
+func rowHasNegativeRelevantLengths(row *querypb.Row, relevantColumns map[int]struct{}) bool {
+	if row == nil {
+		return false
+	}
+	for pos, n := range row.Lengths {
+		if n >= 0 {
+			continue
+		}
+		if _, relevant := relevantColumns[pos]; relevant {
+			return true
+		}
+	}
+	return false
+}
+
+// snapshotTablePlans returns a shallow copy of tablePlans (the *TablePlan
+// values are shared, not cloned) taken under mu's read lock. When cached is
+// non-nil and *cachedVersion still equals version, cached is returned as-is;
+// otherwise a fresh copy is made and *cachedVersion is updated. The new map
+// is only returned, never stored here. A nil tablePlans yields nil.
+func snapshotTablePlans(mu *sync.RWMutex, tablePlans map[string]*TablePlan, version *atomic.Int64, cachedVersion *int64, cached map[string]*TablePlan) map[string]*TablePlan {
+	if tablePlans == nil {
+		return nil
+	}
+	mu.RLock()
+	defer mu.RUnlock()
+	current := version.Load()
+	if cached != nil && current == *cachedVersion {
+		return cached
+	}
+	*cachedVersion = current
+	return maps.Clone(tablePlans)
+}
+
+// txnTouchesExtraUniqueSecondary reports whether any ROW event in events
+// targets a table whose plan has HasExtraUniqueSecondary set. Non-ROW
+// events, ROW events with a nil RowEvent, and tables absent from tablePlans
+// are ignored. Txns that match have to serialize: writeset keys built from
+// the PK alone can miss conflicts that the secondary unique index would
+// otherwise enforce.
+func txnTouchesExtraUniqueSecondary(events []*binlogdatapb.VEvent, tablePlans map[string]*TablePlan) bool {
+	for _, ev := range events {
+		if ev.Type == binlogdatapb.VEventType_ROW && ev.RowEvent != nil {
+			if p := tablePlans[ev.RowEvent.TableName]; p != nil && p.HasExtraUniqueSecondary {
+				return true
+			}
+		}
+	}
+	return false
+}
+
+// txnTouchesUnsupportedWritesetMapping reports whether any ROW event in events
+// targets a table whose plan has HasUnsupportedWritesetMapping set, i.e. a
+// mapping the writeset builder can't reason about (expressions, generated
+// columns, lossy casts, etc). The scheduler must force serialization so
+// those txns do not slip past conflict detection.
+func txnTouchesUnsupportedWritesetMapping(events []*binlogdatapb.VEvent, tablePlans map[string]*TablePlan) bool {
+	for _, ev := range events {
+		rowEvent := ev.RowEvent
+		if ev.Type != binlogdatapb.VEventType_ROW || rowEvent == nil {
+			continue
+		}
+		if p := tablePlans[rowEvent.TableName]; p != nil && p.HasUnsupportedWritesetMapping {
+			return true
+		}
+	}
+	return false
+}
+
+// writesetKeysForChange resolves the plan's identity column positions from
+// plan.Fields, then hashes the identity of beforeVals and afterVals into keySet.
+func writesetKeysForChange(plan *TablePlan, tableName string, beforeVals, afterVals []sqltypes.Value, keySet map[uint64]struct{}) error {
+	idxs, err := writesetIdentityFieldIndexes(plan, tableName, nil)
+	if err == nil {
+		err = writesetKeysForChangeWithFieldIdx(plan, tableName, idxs, beforeVals, afterVals, keySet)
+	}
+	return err
+}
+
+// writesetIdentityFieldIndexes maps plan.IdentityColumns onto positions in the
+// streamed fields, using fieldIdx when given and otherwise a lookup built from
+// plan.Fields. It returns (nil, nil) for a nil plan or an identity of at most
+// one column, leaving those plans to the PKIndices path. A column missing from
+// the lookup yields FAILED_PRECONDITION so the caller can serialize the txn.
+func writesetIdentityFieldIndexes(plan *TablePlan, tableName string, fieldIdx map[string]int) ([]int, error) {
+	if plan == nil || len(plan.IdentityColumns) < 2 {
+		return nil, nil
+	}
+	if fieldIdx == nil {
+		// No lookup supplied: index plan.Fields by exact and lower-cased name.
+		fieldIdx = make(map[string]int, len(plan.Fields))
+		for pos, field := range plan.Fields {
+			if field != nil {
+				fieldIdx[field.Name] = pos
+				fieldIdx[strings.ToLower(field.Name)] = pos
+			}
+		}
+	}
+	positions := make([]int, len(plan.IdentityColumns))
+	for i, name := range plan.IdentityColumns {
+		pos, found := fieldIndexForName(fieldIdx, name)
+		if !found {
+			return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "writeset identity column %q not in streamed fields for %s", name, tableName)
+		}
+		positions[i] = pos
+	}
+	return positions, nil
+}
+
+// writesetUniqueKeyFieldIndexes maps each entry of plan.UniqueKeyColumns onto
+// positions in the streamed fields (fieldIdx when given, else a lookup built
+// from plan.Fields). A missing column returns a "not in streamed fields"
+// error, which writesetErrorForcesSerialization routes to serialization.
+func writesetUniqueKeyFieldIndexes(plan *TablePlan, tableName string, fieldIdx map[string]int) ([][]int, error) {
+	if plan == nil || len(plan.UniqueKeyColumns) == 0 {
+		return nil, nil
+	}
+	if fieldIdx == nil {
+		// No lookup supplied: index plan.Fields by exact and lower-cased name.
+		fieldIdx = make(map[string]int, len(plan.Fields))
+		for pos, field := range plan.Fields {
+			if field != nil {
+				fieldIdx[field.Name] = pos
+				fieldIdx[strings.ToLower(field.Name)] = pos
+			}
+		}
+	}
+	resolved := make([][]int, len(plan.UniqueKeyColumns))
+	for k, cols := range plan.UniqueKeyColumns {
+		positions := make([]int, len(cols))
+		for i, name := range cols {
+			pos, found := fieldIndexForName(fieldIdx, name)
+			if !found {
+				return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "writeset unique key column %q not in streamed fields for %s", name, tableName)
+			}
+			positions[i] = pos
+		}
+		resolved[k] = positions
+	}
+	return resolved, nil
+}
+
+// writesetKeysForChangeWithFieldIdx is the indexed variant of
+// writesetKeysForChange: the plan's identity column positions arrive
+// pre-resolved (see writesetIdentityFieldIndexes) so multi-row txns do not
+// re-resolve them per change. For each non-empty row image it digests the
+// identity values (after writesetDigestInit with tableName) into one uint64
+// and adds it to keySet; the scheduler compares these keys to decide which
+// concurrent txns conflict.
+// A nil plan is a no-op. FAILED_PRECONDITION is returned when the plan offers
+// no identity to hash, INTERNAL when an identity position lies beyond the row
+// image, and any error from writesetDigestAddFieldValue is passed through.
+func writesetKeysForChangeWithFieldIdx(plan *TablePlan, tableName string, identityIndexes []int, beforeVals, afterVals []sqltypes.Value, keySet map[uint64]struct{}) error {
+	if plan == nil {
+		return nil
+	}
+	if len(plan.PKIndices) == 0 {
+		// Fail closed: a plan with no identity must not silently contribute
+		// zero keys. In a txn that also touches keyed tables the writeset
+		// would be non-empty and this table's rows would race with no
+		// conflict tracking at all. The error routes the txn to the serial
+		// path (see writesetErrorForcesSerialization).
+		return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "no usable writeset identity for %s", tableName)
+	}
+	// Pre-resolved identity positions win; otherwise every position flagged
+	// in plan.PKIndices is hashed.
+	byIdentity := len(identityIndexes) > 0
+	candidates := len(plan.PKIndices)
+	if byIdentity {
+		candidates = len(identityIndexes)
+	}
+	// hashImage digests the identity columns of one row image into a single
+	// key. An absent (empty) image contributes nothing.
+	hashImage := func(vals []sqltypes.Value) error {
+		if len(vals) == 0 {
+			return nil
+		}
+		var digest xxhash.Digest
+		writesetDigestInit(&digest, tableName)
+		hashed := 0
+		for i := range candidates {
+			pos := i
+			if byIdentity {
+				pos = identityIndexes[i]
+			} else if !plan.PKIndices[i] {
+				continue
+			}
+			if pos >= len(vals) {
+				return vterrors.Errorf(vtrpcpb.Code_INTERNAL, "pk index out of range for %s", tableName)
+			}
+			// Without a field descriptor at pos the value is hashed with a
+			// nil field.
+			var field *querypb.Field
+			if pos < len(plan.Fields) {
+				field = plan.Fields[pos]
+			}
+			if err := writesetDigestAddFieldValue(&digest, field, vals[pos]); err != nil {
+				return err
+			}
+			hashed++
+		}
+		if hashed == 0 {
+			// Fail closed, same as the empty-PKIndices case above.
+			return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "no usable writeset identity for %s", tableName)
+		}
+		keySet[digest.Sum64()] = struct{}{}
+		return nil
+	}
+	if err := hashImage(beforeVals); err != nil {
+		return err
+	}
+	return hashImage(afterVals)
+}
+
+// queryFKRefs reads information_schema.KEY_COLUMN_USAGE, joined to
+// information_schema.COLUMNS for the child and referenced columns, to discover
+// the foreign key constraints of tables in dbName. The result maps each child
+// table name to its constraints; every constraint carries the referenced
+// (parent) table plus the child and referenced column names in ordinal order,
+// so writeset keys generated for child rows line up with the parent's keys.
+//
+// Returns (nil, nil) when there are no FK rows, the wrapped query error on
+// failure, INTERNAL for a result row narrower than the SELECT list, and
+// FAILED_PRECONDITION when a child/parent column pair differs in encoding.
+func queryFKRefs(dbClient *vdbClient, dbName string) (map[string][]fkConstraintRef, error) {
+	// fkQueryColumns is the width of the SELECT list below: five
+	// KEY_COLUMN_USAGE columns plus four digest-metadata columns per side.
+	const fkQueryColumns = 13
+	// NOTE(review): parent_cols is joined on kcu.TABLE_SCHEMA, not
+	// kcu.REFERENCED_TABLE_SCHEMA, so a cross-schema FK is not compared with
+	// its real parent column — confirm such FKs are out of scope here.
+	query := fmt.Sprintf(
+		"SELECT kcu.TABLE_NAME, kcu.CONSTRAINT_NAME, kcu.COLUMN_NAME, kcu.REFERENCED_TABLE_NAME, kcu.REFERENCED_COLUMN_NAME, "+
+			"child_cols.DATA_TYPE, COALESCE(child_cols.CHARACTER_SET_NAME, ''), COALESCE(child_cols.COLLATION_NAME, ''), COALESCE(child_cols.COLUMN_TYPE, ''), "+
+			"parent_cols.DATA_TYPE, COALESCE(parent_cols.CHARACTER_SET_NAME, ''), COALESCE(parent_cols.COLLATION_NAME, ''), COALESCE(parent_cols.COLUMN_TYPE, '') "+
+			"FROM information_schema.KEY_COLUMN_USAGE kcu "+
+			"JOIN information_schema.COLUMNS child_cols "+
+			"ON child_cols.TABLE_SCHEMA = kcu.TABLE_SCHEMA AND child_cols.TABLE_NAME = kcu.TABLE_NAME AND child_cols.COLUMN_NAME = kcu.COLUMN_NAME "+
+			"JOIN information_schema.COLUMNS parent_cols "+
+			"ON parent_cols.TABLE_SCHEMA = kcu.TABLE_SCHEMA AND parent_cols.TABLE_NAME = kcu.REFERENCED_TABLE_NAME AND parent_cols.COLUMN_NAME = kcu.REFERENCED_COLUMN_NAME "+
+			"WHERE kcu.TABLE_SCHEMA = %s AND kcu.REFERENCED_TABLE_NAME IS NOT NULL "+
+			"ORDER BY kcu.TABLE_NAME, kcu.CONSTRAINT_NAME, kcu.ORDINAL_POSITION",
+		encodeString(dbName),
+	)
+	qr, err := dbClient.ExecuteFetch(query, -1)
+	if err != nil {
+		return nil, vterrors.Wrapf(err, "queryFKRefs")
+	}
+	if len(qr.Rows) == 0 {
+		return nil, nil
+	}
+
+	// Group by (childTable, constraintName) — each row is one column
+	// of a potentially multi-column FK. We group by constraint name
+	// rather than parent table because a child table can have multiple
+	// FK constraints referencing the same parent table with different
+	// column sets.
+	type constraintKey struct {
+		childTable     string
+		constraintName string
+	}
+	type constraintEntry struct {
+		key            constraintKey
+		parentTable    string
+		cols           []string // child column names in ordinal order
+		referencedCols []string // parent column names in ordinal order
+	}
+
+	// Use a slice to preserve order; there are typically very few FK constraints.
+	var constraints []constraintEntry
+	idx := map[constraintKey]int{}
+
+	type fkColumnDigestMeta struct {
+		dataType   string
+		charset    string
+		collation  string
+		columnType string
+	}
+	parseDigestMeta := func(offset int, row []sqltypes.Value) fkColumnDigestMeta {
+		return fkColumnDigestMeta{
+			dataType:   strings.ToLower(row[offset].ToString()),
+			charset:    row[offset+1].ToString(),
+			collation:  row[offset+2].ToString(),
+			columnType: strings.ToLower(row[offset+3].ToString()),
+		}
+	}
+	usesTextDigest := func(meta fkColumnDigestMeta) bool {
+		return meta.charset != "" || meta.collation != ""
+	}
+	columnsShareWritesetEncoding := func(child, parent fkColumnDigestMeta) bool {
+		if usesTextDigest(child) || usesTextDigest(parent) {
+			return usesTextDigest(child) && usesTextDigest(parent) &&
+				child.charset == parent.charset &&
+				child.collation == parent.collation
+		}
+		return child.columnType == parent.columnType
+	}
+
+	for _, row := range qr.Rows {
+		if len(row) < fkQueryColumns {
+			// A short row would otherwise panic in the positional reads below.
+			return nil, vterrors.Errorf(vtrpcpb.Code_INTERNAL, "queryFKRefs: expected %d columns per row, got %d", fkQueryColumns, len(row))
+		}
+		childTable := row[0].ToString()
+		constraintName := row[1].ToString()
+		colName := row[2].ToString()
+		parentTable := row[3].ToString()
+		referencedColName := row[4].ToString()
+		childMeta := parseDigestMeta(5, row)
+		parentMeta := parseDigestMeta(9, row)
+		if !columnsShareWritesetEncoding(childMeta, parentMeta) {
+			return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION,
+				"incompatible FK column definitions for %s.%s referencing %s.%s: child=%s/%s parent=%s/%s; align the definitions or disable parallel apply for this workflow",
+				childTable, colName, parentTable, referencedColName,
+				childMeta.columnType, childMeta.collation, parentMeta.columnType, parentMeta.collation)
+		}
+		k := constraintKey{childTable: childTable, constraintName: constraintName}
+		i, seen := idx[k]
+		if !seen {
+			i = len(constraints)
+			idx[k] = i
+			constraints = append(constraints, constraintEntry{key: k, parentTable: parentTable})
+		}
+		constraints[i].cols = append(constraints[i].cols, colName)
+		constraints[i].referencedCols = append(constraints[i].referencedCols, referencedColName)
+	}
+
+	result := make(map[string][]fkConstraintRef, len(constraints))
+	for _, c := range constraints {
+		result[c.key.childTable] = append(result[c.key.childTable], fkConstraintRef{
+			ParentTable:           c.parentTable,
+			ChildColumnNames:      c.cols,
+			ReferencedColumnNames: c.referencedCols,
+		})
+	}
+	return result, nil
+}
diff --git a/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_writeset_test.go b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_writeset_test.go
new file mode 100644
index 00000000000..c683356e485
--- /dev/null
+++ b/go/vt/vttablet/tabletmanager/vreplication/parallel_apply_writeset_test.go
@@ -0,0 +1,1590 @@
+/*
+Copyright 2026 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package vreplication
+
+import (
+ "fmt"
+ "testing"
+
+ "github.com/cespare/xxhash/v2"
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+
+ "vitess.io/vitess/go/mysql/capabilities"
+ "vitess.io/vitess/go/mysql/collations"
+ "vitess.io/vitess/go/sqltypes"
+ "vitess.io/vitess/go/vt/binlog/binlogplayer"
+ "vitess.io/vitess/go/vt/sqlparser"
+
+ binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
+ querypb "vitess.io/vitess/go/vt/proto/query"
+ vttablet "vitess.io/vitess/go/vt/vttablet/common"
+)
+
+// testWritesetHash mirrors production hash logic for test assertions.
+func testWritesetHash(tableName string, vals ...sqltypes.Value) uint64 {
+ var d xxhash.Digest
+ writesetDigestInit(&d, tableName)
+ for _, v := range vals {
+ writesetDigestAddValue(&d, v)
+ }
+ return d.Sum64()
+}
+
+// TestBuildTxnWritesetSinglePK checks that a lone AFTER image yields one PK key.
+func TestBuildTxnWritesetSinglePK(t *testing.T) {
+	plans := map[string]*TablePlan{"t1": {
+		TargetName: "t1",
+		Fields:     []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+		PKIndices:  []bool{true},
+	}}
+	inserted := &binlogdatapb.RowChange{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}
+	rows := &binlogdatapb.RowEvent{TableName: "t1", RowChanges: []*binlogdatapb.RowChange{inserted}}
+	events := []*binlogdatapb.VEvent{{Type: binlogdatapb.VEventType_ROW, RowEvent: rows}}
+
+	got, err := buildTxnWriteset(plans, nil, nil, events)
+	require.NoError(t, err)
+	want := testWritesetHash("t1", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("1")))
+	require.Equal(t, []uint64{want}, got)
+}
+
+// TestBuildTxnWritesetUsesBeforeAndAfter checks that both images of an update contribute a key.
+func TestBuildTxnWritesetUsesBeforeAndAfter(t *testing.T) {
+	plans := map[string]*TablePlan{"t1": {
+		TargetName: "t1",
+		Fields:     []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+		PKIndices:  []bool{true},
+	}}
+	before := &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}
+	after := &querypb.Row{Values: []byte("2"), Lengths: []int64{1}}
+	update := &binlogdatapb.RowChange{Before: before, After: after}
+	rows := &binlogdatapb.RowEvent{TableName: "t1", RowChanges: []*binlogdatapb.RowChange{update}}
+	events := []*binlogdatapb.VEvent{{Type: binlogdatapb.VEventType_ROW, RowEvent: rows}}
+	got, err := buildTxnWriteset(plans, nil, nil, events)
+	require.NoError(t, err)
+	require.Len(t, got, 2)
+	wantBefore := testWritesetHash("t1", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("1")))
+	wantAfter := testWritesetHash("t1", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("2")))
+	assert.ElementsMatch(t, []uint64{wantBefore, wantAfter}, got)
+}
+
+// BenchmarkBuildTxnWriteset_NoFKRefsAvoidsPlanWideCanonicalization measures
+// buildTxnWriteset for a txn with a single row change on table "t0" while
+// 256 single-PK table plans are registered and no FK refs are supplied.
+// Allocations are reported; the loop aborts if the call errors or returns
+// anything other than exactly one key.
+func BenchmarkBuildTxnWriteset_NoFKRefsAvoidsPlanWideCanonicalization(b *testing.B) {
+	const tableCount = 256
+	tablePlans := make(map[string]*TablePlan, tableCount)
+	for n := range tableCount {
+		plan := &TablePlan{
+			TargetName: fmt.Sprintf("t%d", n),
+			Fields:     []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+			PKIndices:  []bool{true},
+		}
+		tablePlans[plan.TargetName] = plan
+	}
+	change := &binlogdatapb.RowChange{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}
+	rows := &binlogdatapb.RowEvent{TableName: "t0", RowChanges: []*binlogdatapb.RowChange{change}}
+	events := []*binlogdatapb.VEvent{{Type: binlogdatapb.VEventType_ROW, RowEvent: rows}}
+
+	b.ReportAllocs()
+	for b.Loop() {
+		keys, err := buildTxnWriteset(tablePlans, nil, nil, events)
+		switch {
+		case err != nil:
+			b.Fatal(err)
+		case len(keys) != 1:
+			b.Fatalf("unexpected key count: %d", len(keys))
+		}
+	}
+}
+
+// BenchmarkWritesetDigestAddFieldValue_TextAllocations measures hashing one
+// utf8mb4_general_ci VARCHAR value via writesetDigestAddFieldValue, with allocs.
+func BenchmarkWritesetDigestAddFieldValue_TextAllocations(b *testing.B) {
+	field := &querypb.Field{Name: "email", Type: querypb.Type_VARCHAR, Charset: uint32(collations.MySQL8().LookupByName("utf8mb4_general_ci"))}
+	value := sqltypes.NewVarChar("user@example.com ")
+	b.ReportAllocs()
+	for b.Loop() {
+		var digest xxhash.Digest
+		writesetDigestInit(&digest, "emails")
+		if err := writesetDigestAddFieldValue(&digest, field, value); err != nil {
+			b.Fatal(err)
+		}
+		_ = digest.Sum64()
+	}
+}
+
+// TestBuildTxnWritesetRejectsPartialRowImageWithoutFKRefs checks that a change
+// publishing a DataColumns bitmap is rejected with a "partial row image"
+// error and no keys, even when no FK refs are configured.
+func TestBuildTxnWritesetRejectsPartialRowImageWithoutFKRefs(t *testing.T) {
+	plans := map[string]*TablePlan{"t1": {
+		TargetName: "t1",
+		Fields: []*querypb.Field{
+			{Name: "a", Type: querypb.Type_INT64},
+			{Name: "id", Type: querypb.Type_INT64},
+			{Name: "b", Type: querypb.Type_INT64},
+		},
+		PKIndices: []bool{false, true, false},
+	}}
+	partial := &binlogdatapb.RowChange{
+		After:       &querypb.Row{Values: []byte("23"), Lengths: []int64{1, 1}},
+		DataColumns: &binlogdatapb.RowChange_Bitmap{Count: 3, Cols: []byte{0x06}},
+	}
+	rows := &binlogdatapb.RowEvent{TableName: "t1", RowChanges: []*binlogdatapb.RowChange{partial}}
+	events := []*binlogdatapb.VEvent{{Type: binlogdatapb.VEventType_ROW, RowEvent: rows}}
+
+	got, err := buildTxnWriteset(plans, nil, nil, events)
+	require.Error(t, err)
+	require.Contains(t, err.Error(), "partial row image")
+	require.Nil(t, got)
+	assert.NotEqual(t, []uint64{testWritesetHash("t1", sqltypes.NewInt64(3))}, got)
+}
+
+// TestWritesetDigestAddValueDistinguishesTypesAcrossByteBoundary pins the
+// invariant that the writeset type discriminator tells apart types whose
+// values collide modulo 256. querypb.Type is wider than one byte and the
+// encoding MUST NOT silently drop the high byte — otherwise two rows with
+// equal PK bytes but distinct types would hash to the same key, or distinct
+// rows could be confused, undermining conflict detection downstream.
+func TestWritesetDigestAddValueDistinguishesTypesAcrossByteBoundary(t *testing.T) {
+	// sum hashes the single byte 0x42, tagged with typ, under table "t".
+	sum := func(typ querypb.Type) uint64 {
+		var d xxhash.Digest
+		writesetDigestInit(&d, "t")
+		writesetDigestAddValue(&d, sqltypes.MakeTrusted(typ, []byte{0x42}))
+		return d.Sum64()
+	}
+	// Two synthetic types whose low bytes are identical: 1 and 1+256.
+	// All current named querypb.Type values stay below the collision
+	// threshold, but the encoding must defend against future additions.
+	low, high := sum(querypb.Type(1)), sum(querypb.Type(1+256))
+
+	require.NotEqual(t, low, high, "writeset digest must distinguish types whose low byte collides")
+}
+
+// TestBuildTxnWritesetRejectsSparseAfterImageOnRelevantPKColumn covers an
+// AFTER image that carries a -1 (omitted) length in a PK column without
+// publishing a DataColumns bitmap. Before the fix, only BEFORE images were
+// scanned for negative relevant lengths, so this case fell through to
+// MakeRowTrusted and silently hashed the PK as a NULL/zero value — making
+// the row collide with any other row whose AFTER image was similarly
+// sparse. The builder must instead report a "partial row image" error and
+// return no keys.
+func TestBuildTxnWritesetRejectsSparseAfterImageOnRelevantPKColumn(t *testing.T) {
+	plans := map[string]*TablePlan{"t1": {
+		TargetName: "t1",
+		Fields: []*querypb.Field{
+			{Name: "id", Type: querypb.Type_INT64},
+			{Name: "name", Type: querypb.Type_VARCHAR},
+		},
+		PKIndices: []bool{true, false},
+	}}
+	// AFTER image omits the PK column (length=-1) but does not publish a
+	// DataColumns bitmap — only the "name" value is present.
+	sparse := &binlogdatapb.RowChange{After: &querypb.Row{Values: []byte("john"), Lengths: []int64{-1, 4}}}
+	rows := &binlogdatapb.RowEvent{TableName: "t1", RowChanges: []*binlogdatapb.RowChange{sparse}}
+	events := []*binlogdatapb.VEvent{{Type: binlogdatapb.VEventType_ROW, RowEvent: rows}}
+
+	got, err := buildTxnWriteset(plans, nil, nil, events)
+	require.Error(t, err)
+	require.Contains(t, err.Error(), "partial row image")
+	require.Nil(t, got)
+}
+
+// TestBuildTxnWritesetRejectsRowImageWithExtraLengths covers a row image that
+// carries more length entries than the plan has fields. That shape arises
+// when the table plan is stale and lacks a column the source still streams,
+// so the row is wider than plan.Fields.
+//
+// The writeset builder must fail closed with a "partial row image" error and
+// return no keys, rather than handing the over-sized row to MakeRowTrusted,
+// which would index past the end of plan.Fields. The fixture pairs a
+// one-field plan with a two-length AFTER image.
+func TestBuildTxnWritesetRejectsRowImageWithExtraLengths(t *testing.T) {
+	plans := map[string]*TablePlan{"t1": {
+		TargetName: "t1",
+		Fields:     []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+		PKIndices:  []bool{true},
+	}}
+	// Row has 2 length entries, but plan only knows 1 field.
+	oversized := &binlogdatapb.RowChange{After: &querypb.Row{Values: []byte("12"), Lengths: []int64{1, 1}}}
+	rows := &binlogdatapb.RowEvent{TableName: "t1", RowChanges: []*binlogdatapb.RowChange{oversized}}
+	events := []*binlogdatapb.VEvent{{Type: binlogdatapb.VEventType_ROW, RowEvent: rows}}
+
+	got, err := buildTxnWriteset(plans, nil, nil, events)
+	require.Error(t, err)
+	require.Contains(t, err.Error(), "partial row image")
+	require.Nil(t, got)
+}
+
+// TestBuildTxnWritesetAllowsBeforeImageWithNullValue checks that a -1 length
+// on a non-PK column of a BEFORE image is accepted and the PK is still hashed.
+func TestBuildTxnWritesetAllowsBeforeImageWithNullValue(t *testing.T) {
+	plans := map[string]*TablePlan{"t1": {
+		TargetName: "t1",
+		Fields: []*querypb.Field{
+			{Name: "id", Type: querypb.Type_INT64},
+			{Name: "nullable_col", Type: querypb.Type_VARCHAR},
+		},
+		PKIndices: []bool{true, false},
+	}}
+	deleted := &binlogdatapb.RowChange{Before: &querypb.Row{Values: []byte("1"), Lengths: []int64{1, -1}}}
+	rows := &binlogdatapb.RowEvent{TableName: "t1", RowChanges: []*binlogdatapb.RowChange{deleted}}
+	events := []*binlogdatapb.VEvent{{Type: binlogdatapb.VEventType_ROW, RowEvent: rows}}
+
+	got, err := buildTxnWriteset(plans, nil, nil, events)
+	require.NoError(t, err)
+	want := testWritesetHash("t1", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("1")))
+	require.Equal(t, []uint64{want}, got)
+}
+
+// TestBuildTxnWritesetRejectsSparseBeforeImageOnRelevantFKColumn checks that a
+// BEFORE image with a -1 length on an FK child column (parent_id) is treated
+// as a partial row image: the builder returns a "partial row image" error
+// and no keys. The PK column itself is present in the image, so only the
+// FK column's relevance can trigger the rejection.
+func TestBuildTxnWritesetRejectsSparseBeforeImageOnRelevantFKColumn(t *testing.T) {
+	plans := map[string]*TablePlan{"child": {
+		TargetName: "child",
+		Fields: []*querypb.Field{
+			{Name: "id", Type: querypb.Type_INT64},
+			{Name: "parent_id", Type: querypb.Type_INT64},
+			{Name: "val", Type: querypb.Type_VARCHAR},
+		},
+		PKIndices: []bool{true, false, false},
+	}}
+	fkRefs := map[string][]fkConstraintRef{
+		"child": {{ParentTable: "parent", ChildColumnNames: []string{"parent_id"}, ReferencedColumnNames: []string{"id"}}},
+	}
+	// Lengths mark parent_id (position 1) as -1; Values holds only the id
+	// byte followed by the three val bytes.
+	sparse := &binlogdatapb.RowChange{Before: &querypb.Row{
+		Lengths: []int64{1, -1, 3},
+		Values:  []byte("5aaa"),
+	}}
+	rows := &binlogdatapb.RowEvent{TableName: "child", RowChanges: []*binlogdatapb.RowChange{sparse}}
+	events := []*binlogdatapb.VEvent{{Type: binlogdatapb.VEventType_ROW, RowEvent: rows}}
+
+	got, err := buildTxnWriteset(plans, fkRefs, buildParentFKRefs(fkRefs), events)
+	require.Error(t, err)
+	require.Contains(t, err.Error(), "partial row image")
+	require.Nil(t, got)
+}
+
+// TestBuildTxnWritesetAllowsCaseOnlyFKColumnNameMismatch checks that FK column
+// names given in lower case ("parent_id", "id") still resolve against
+// streamed fields named in upper case ("PARENT_ID", "ID"). The build must
+// succeed and produce two keys for the single AFTER image (presumably the
+// PK key plus the FK key).
+func TestBuildTxnWritesetAllowsCaseOnlyFKColumnNameMismatch(t *testing.T) {
+	plans := map[string]*TablePlan{"child": {
+		TargetName: "child",
+		Fields: []*querypb.Field{
+			{Name: "ID", Type: querypb.Type_INT64},
+			{Name: "PARENT_ID", Type: querypb.Type_INT64},
+		},
+		PKIndices: []bool{true, false},
+	}}
+	fkRefs := map[string][]fkConstraintRef{
+		"child": {{ParentTable: "parent", ChildColumnNames: []string{"parent_id"}, ReferencedColumnNames: []string{"id"}}},
+	}
+	inserted := &binlogdatapb.RowChange{
+		After: &querypb.Row{Values: []byte("12"), Lengths: []int64{1, 1}},
+	}
+	rows := &binlogdatapb.RowEvent{TableName: "child", RowChanges: []*binlogdatapb.RowChange{inserted}}
+	events := []*binlogdatapb.VEvent{{Type: binlogdatapb.VEventType_ROW, RowEvent: rows}}
+
+	got, err := buildTxnWriteset(plans, fkRefs, buildParentFKRefs(fkRefs), events)
+	require.NoError(t, err)
+	require.Len(t, got, 2)
+}
+
+// TestBuildTxnWritesetAllowsMixedCaseFKColumnNameMismatch checks that an FK
+// child column name given in mixed case ("Parent_ID") still resolves against
+// a streamed field named in upper case ("PARENT_ID"). The build must
+// succeed and produce two keys for the single AFTER image (presumably the
+// PK key plus the FK key).
+func TestBuildTxnWritesetAllowsMixedCaseFKColumnNameMismatch(t *testing.T) {
+	plans := map[string]*TablePlan{"child": {
+		TargetName: "child",
+		Fields: []*querypb.Field{
+			{Name: "ID", Type: querypb.Type_INT64},
+			{Name: "PARENT_ID", Type: querypb.Type_INT64},
+		},
+		PKIndices: []bool{true, false},
+	}}
+	fkRefs := map[string][]fkConstraintRef{
+		"child": {{ParentTable: "parent", ChildColumnNames: []string{"Parent_ID"}, ReferencedColumnNames: []string{"ID"}}},
+	}
+	inserted := &binlogdatapb.RowChange{
+		After: &querypb.Row{Values: []byte("12"), Lengths: []int64{1, 1}},
+	}
+	rows := &binlogdatapb.RowEvent{TableName: "child", RowChanges: []*binlogdatapb.RowChange{inserted}}
+	events := []*binlogdatapb.VEvent{{Type: binlogdatapb.VEventType_ROW, RowEvent: rows}}
+
+	got, err := buildTxnWriteset(plans, fkRefs, buildParentFKRefs(fkRefs), events)
+	require.NoError(t, err)
+	require.Len(t, got, 2)
+}
+
+// TestBuildTxnWritesetAllowsFullRowImageWithNullValue checks that a -1 length
+// on a non-PK column of an AFTER image is accepted and the PK is still hashed.
+func TestBuildTxnWritesetAllowsFullRowImageWithNullValue(t *testing.T) {
+	plans := map[string]*TablePlan{"t1": {
+		TargetName: "t1",
+		Fields: []*querypb.Field{
+			{Name: "id", Type: querypb.Type_INT64},
+			{Name: "nullable_col", Type: querypb.Type_VARCHAR},
+		},
+		PKIndices: []bool{true, false},
+	}}
+	inserted := &binlogdatapb.RowChange{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1, -1}}}
+	rows := &binlogdatapb.RowEvent{TableName: "t1", RowChanges: []*binlogdatapb.RowChange{inserted}}
+	events := []*binlogdatapb.VEvent{{Type: binlogdatapb.VEventType_ROW, RowEvent: rows}}
+
+	got, err := buildTxnWriteset(plans, nil, nil, events)
+	require.NoError(t, err)
+	want := testWritesetHash("t1", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("1")))
+	require.Equal(t, []uint64{want}, got)
+}
+
+// TestBuildTxnWritesetNoPK pins that a table plan with no usable identity
+// (no PK columns and no identity columns) fails closed instead of silently
+// contributing zero keys. Silent no-keys would be a correctness hole: in a
+// transaction that also touches keyed tables the writeset would be
+// non-empty, the scheduler would rely on writeset-only conflict detection,
+// and this table's rows would race with no conflict tracking at all.
+// buildColInfoMap's PK -> PK-equivalent -> all-columns fallback should make
+// this unreachable for real tables, but the writeset builder must not rely
+// on that staying true. The error also has to be one that serializes the
+// txn rather than failing the workflow.
+func TestBuildTxnWritesetNoPK(t *testing.T) {
+	plans := map[string]*TablePlan{"t1": {
+		TargetName: "t1",
+		Fields:     []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+		PKIndices:  []bool{false},
+	}}
+	inserted := &binlogdatapb.RowChange{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}
+	rows := &binlogdatapb.RowEvent{TableName: "t1", RowChanges: []*binlogdatapb.RowChange{inserted}}
+	events := []*binlogdatapb.VEvent{{Type: binlogdatapb.VEventType_ROW, RowEvent: rows}}
+
+	got, err := buildTxnWriteset(plans, nil, nil, events)
+	require.Error(t, err)
+	require.Contains(t, err.Error(), "no usable writeset identity")
+	require.Nil(t, got)
+	// The error must route the transaction to the serial path, not fail the
+	// workflow: over-serialization is safe, a bricked workflow is not.
+	require.True(t, writesetErrorForcesSerialization(err))
+}
+
+// TestBuildTxnWritesetFailsClosedWithoutUsableIdentity checks that a plan with
+// IdentityColumns but no PKIndices errors out and forces serialization.
+func TestBuildTxnWritesetFailsClosedWithoutUsableIdentity(t *testing.T) {
+	plans := map[string]*TablePlan{"t1": {
+		TargetName:      "t1",
+		Fields:          []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+		IdentityColumns: []string{"id"},
+	}}
+	inserted := &binlogdatapb.RowChange{After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}}
+	rows := &binlogdatapb.RowEvent{TableName: "t1", RowChanges: []*binlogdatapb.RowChange{inserted}}
+	events := []*binlogdatapb.VEvent{{Type: binlogdatapb.VEventType_ROW, RowEvent: rows}}
+	got, err := buildTxnWriteset(plans, nil, nil, events)
+	require.Error(t, err)
+	require.Contains(t, err.Error(), "no usable writeset identity")
+	require.Nil(t, got)
+	require.True(t, writesetErrorForcesSerialization(err), "missing identity must serialize the txn, not fail the workflow")
+}
+
+// TestWritesetKeysForChangeMissingPlan checks that a nil plan is a no-op.
+func TestWritesetKeysForChangeMissingPlan(t *testing.T) {
+	keys := make(map[uint64]struct{})
+	require.NoError(t, writesetKeysForChange(nil, "t1", nil, nil, keys))
+	require.Empty(t, keys)
+}
+
+// TestWritesetKeysForChangeMultiplePK checks that a two-column PK produces a
+// single key whose hash covers both PK values in field order, matching what
+// testWritesetHash computes for the same table name and values.
+func TestWritesetKeysForChangeMultiplePK(t *testing.T) {
+	plan := &TablePlan{
+		TargetName: "t1",
+		Fields: []*querypb.Field{
+			{Name: "id", Type: querypb.Type_INT64},
+			{Name: "name", Type: querypb.Type_VARCHAR},
+		},
+		PKIndices: []bool{true, true},
+	}
+	afterVals := sqltypes.MakeRowTrusted(plan.Fields, &querypb.Row{Values: []byte("1foo"), Lengths: []int64{1, 3}})
+	keys := make(map[uint64]struct{})
+	require.NoError(t, writesetKeysForChange(plan, "t1", nil, afterVals, keys))
+	require.Len(t, keys, 1)
+	want := testWritesetHash("t1",
+		sqltypes.MakeTrusted(querypb.Type_INT64, []byte("1")),
+		sqltypes.MakeTrusted(querypb.Type_VARCHAR, []byte("foo")),
+	)
+	require.Contains(t, keys, want)
+}
+
+func TestWritesetKeysForChangeCompositeBinaryPKValuesDoNotAlias(t *testing.T) {
+ plan := &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{
+ {Name: "id1", Type: querypb.Type_VARBINARY},
+ {Name: "id2", Type: querypb.Type_VARBINARY},
+ },
+ PKIndices: []bool{true, true},
+ }
+ valueType := querypb.Type_VARBINARY
+ typeByte := byte(valueType)
+ firstTuple := []sqltypes.Value{
+ sqltypes.MakeTrusted(querypb.Type_VARBINARY, []byte{'a'}),
+ sqltypes.MakeTrusted(querypb.Type_VARBINARY, []byte{'x', ',', typeByte, 'y'}),
+ }
+ secondTuple := []sqltypes.Value{
+ sqltypes.MakeTrusted(querypb.Type_VARBINARY, []byte{'a', ',', typeByte, 'x'}),
+ sqltypes.MakeTrusted(querypb.Type_VARBINARY, []byte{'y'}),
+ }
+ keySet := map[uint64]struct{}{}
+
+ require.NoError(t, writesetKeysForChange(plan, "t1", nil, firstTuple, keySet))
+ require.NoError(t, writesetKeysForChange(plan, "t1", nil, secondTuple, keySet))
+ require.Len(t, keySet, 2)
+}
+
+func TestWritesetKeysForChangeUsesMakeRowTrusted(t *testing.T) {
+ plan := &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true},
+ }
+ row := &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}
+ afterVals := sqltypes.MakeRowTrusted(plan.Fields, row)
+ keySet := map[uint64]struct{}{}
+ err := writesetKeysForChange(plan, "t1", nil, afterVals, keySet)
+ require.NoError(t, err)
+ require.Len(t, keySet, 1)
+ expected := testWritesetHash("t1", sqltypes.MakeRowTrusted(plan.Fields, row)[0])
+ _, ok := keySet[expected]
+ require.True(t, ok)
+}
+
+// stubDBClient is a canned DB client for tests: both fetch methods ignore
+// their arguments and return the configured error when it is set, otherwise
+// the configured result.
+type stubDBClient struct {
+	result *sqltypes.Result
+	err    error
+}
+
+func (s *stubDBClient) DBName() string  { return "db" }
+func (s *stubDBClient) Connect() error  { return nil }
+func (s *stubDBClient) Begin() error    { return nil }
+func (s *stubDBClient) Commit() error   { return nil }
+func (s *stubDBClient) Rollback() error { return nil }
+func (s *stubDBClient) Close()          {}
+func (s *stubDBClient) IsClosed() bool  { return false }
+
+// ExecuteFetch returns the canned result, or the canned error when one is set.
+func (s *stubDBClient) ExecuteFetch(_ string, _ int) (*sqltypes.Result, error) {
+	if failure := s.err; failure != nil {
+		return nil, failure
+	}
+	return s.result, nil
+}
+
+// ExecuteFetchMulti wraps the canned result in a one-element slice, or returns
+// the canned error when one is set.
+func (s *stubDBClient) ExecuteFetchMulti(_ string, _ int) ([]*sqltypes.Result, error) {
+	if failure := s.err; failure != nil {
+		return nil, failure
+	}
+	return []*sqltypes.Result{s.result}, nil
+}
+
+// SupportsCapability always reports the capability as unsupported.
+func (s *stubDBClient) SupportsCapability(_ capabilities.FlavorCapability) (bool, error) {
+	return false, nil
+}
+
+func TestWritesetKeysForChangePKOutOfRange(t *testing.T) {
+ plan := &TablePlan{
+ TargetName: "t1",
+ Fields: []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}, {Name: "other", Type: querypb.Type_INT64}},
+ PKIndices: []bool{true, true},
+ }
+ row := &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}
+ afterVals := sqltypes.MakeRowTrusted(plan.Fields[:1], row)
+ keySet := map[uint64]struct{}{}
+ err := writesetKeysForChange(plan, "t1", nil, afterVals, keySet)
+ require.Error(t, err)
+}
+
+func TestQueryFKRefs(t *testing.T) {
+ stats := binlogplayer.NewStats()
+ stats.VReplicationLagGauges.Stop()
+ t.Cleanup(stats.Stop)
+
+ qr := sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields(
+ "TABLE_NAME|CONSTRAINT_NAME|COLUMN_NAME|REFERENCED_TABLE_NAME|REFERENCED_COLUMN_NAME|CHILD_DATA_TYPE|CHILD_CHARACTER_SET_NAME|CHILD_COLLATION_NAME|CHILD_COLUMN_TYPE|PARENT_DATA_TYPE|PARENT_CHARACTER_SET_NAME|PARENT_COLLATION_NAME|PARENT_COLUMN_TYPE",
+ "varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar",
+ ),
+ "child|fk_child_parent|parent_id|parent|id|int|||int|int|||int",
+ "child|fk_child_parent|parent_id2|parent|id2|int|||int|int|||int",
+ "other|fk_other_parent|parent_id|parent|id|int|||int|int|||int",
+ )
+ client := newVDBClient(&stubDBClient{result: qr}, stats, 100)
+ refs, err := queryFKRefs(client, "db")
+ require.NoError(t, err)
+ require.Len(t, refs, 2)
+ require.Len(t, refs["child"], 1)
+ require.Equal(t, "parent", refs["child"][0].ParentTable)
+ require.Equal(t, []string{"parent_id", "parent_id2"}, refs["child"][0].ChildColumnNames)
+ require.Equal(t, []string{"id", "id2"}, refs["child"][0].ReferencedColumnNames)
+}
+
+func TestQueryFKRefsError(t *testing.T) {
+ stats := binlogplayer.NewStats()
+ stats.VReplicationLagGauges.Stop()
+ t.Cleanup(stats.Stop)
+
+ client := newVDBClient(&stubDBClient{err: assert.AnError}, stats, 100)
+ refs, err := queryFKRefs(client, "db")
+ require.Error(t, err)
+ require.Nil(t, refs)
+}
+
+// maxRowsAssertingDBClient is a stub DB client that hands each fetched query
+// and maxrows value to optional hooks before returning the canned result or
+// error.
+type maxRowsAssertingDBClient struct {
+	result      *sqltypes.Result
+	err         error
+	assertQuery func(query string)
+	assertRows  func(maxrows int) error
+}
+
+func (m *maxRowsAssertingDBClient) DBName() string  { return "db" }
+func (m *maxRowsAssertingDBClient) Connect() error  { return nil }
+func (m *maxRowsAssertingDBClient) Begin() error    { return nil }
+func (m *maxRowsAssertingDBClient) Commit() error   { return nil }
+func (m *maxRowsAssertingDBClient) Rollback() error { return nil }
+func (m *maxRowsAssertingDBClient) Close()          {}
+func (m *maxRowsAssertingDBClient) IsClosed() bool  { return false }
+
+// ExecuteFetch runs the query hook, then the maxrows hook (whose error is
+// returned as-is), then returns the canned error or result.
+func (m *maxRowsAssertingDBClient) ExecuteFetch(query string, maxrows int) (*sqltypes.Result, error) {
+	if checkQuery := m.assertQuery; checkQuery != nil {
+		checkQuery(query)
+	}
+	if checkRows := m.assertRows; checkRows != nil {
+		if err := checkRows(maxrows); err != nil {
+			return nil, err
+		}
+	}
+	if m.err != nil {
+		return nil, m.err
+	}
+	return m.result, nil
+}
+
+// ExecuteFetchMulti delegates to ExecuteFetch and wraps its result in a
+// one-element slice.
+func (m *maxRowsAssertingDBClient) ExecuteFetchMulti(query string, maxrows int) ([]*sqltypes.Result, error) {
+	single, err := m.ExecuteFetch(query, maxrows)
+	if err != nil {
+		return nil, err
+	}
+	return []*sqltypes.Result{single}, nil
+}
+
+// SupportsCapability always reports the capability as unsupported.
+func (m *maxRowsAssertingDBClient) SupportsCapability(_ capabilities.FlavorCapability) (bool, error) {
+	return false, nil
+}
+
+func TestQueryFKRefsFetchesAllRows(t *testing.T) {
+ stats := binlogplayer.NewStats()
+ stats.VReplicationLagGauges.Stop()
+ t.Cleanup(stats.Stop)
+
+ qr := sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields(
+ "TABLE_NAME|CONSTRAINT_NAME|COLUMN_NAME|REFERENCED_TABLE_NAME|REFERENCED_COLUMN_NAME|CHILD_DATA_TYPE|CHILD_CHARACTER_SET_NAME|CHILD_COLLATION_NAME|CHILD_COLUMN_TYPE|PARENT_DATA_TYPE|PARENT_CHARACTER_SET_NAME|PARENT_COLLATION_NAME|PARENT_COLUMN_TYPE",
+ "varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar",
+ ),
+ "child|fk_child_parent|parent_id|parent|id|int|||int|int|||int",
+ )
+ client := newVDBClient(&maxRowsAssertingDBClient{
+ result: qr,
+ assertQuery: func(query string) {
+ require.Contains(t, query, "JOIN information_schema.COLUMNS child_cols")
+ require.Contains(t, query, "JOIN information_schema.COLUMNS parent_cols")
+ require.NotContains(t, query, "FROM information_schema.COLUMNS WHERE TABLE_SCHEMA")
+ },
+ assertRows: func(maxrows int) error {
+ if maxrows != -1 {
+ return fmt.Errorf("expected fetch-all maxrows, got %d", maxrows)
+ }
+ return nil
+ },
+ }, stats, 100)
+
+ refs, err := queryFKRefs(client, "db")
+ require.NoError(t, err)
+ require.Len(t, refs["child"], 1)
+ require.Equal(t, "parent", refs["child"][0].ParentTable)
+ require.Equal(t, []string{"parent_id"}, refs["child"][0].ChildColumnNames)
+ require.Equal(t, []string{"id"}, refs["child"][0].ReferencedColumnNames)
+}
+
+func TestQueryFKRefsRejectsHashIncompatibleFKColumnDefinitions(t *testing.T) {
+ stats := binlogplayer.NewStats()
+ stats.VReplicationLagGauges.Stop()
+ t.Cleanup(stats.Stop)
+
+ qr := sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields(
+ "TABLE_NAME|CONSTRAINT_NAME|COLUMN_NAME|REFERENCED_TABLE_NAME|REFERENCED_COLUMN_NAME|CHILD_DATA_TYPE|CHILD_CHARACTER_SET_NAME|CHILD_COLLATION_NAME|CHILD_COLUMN_TYPE|PARENT_DATA_TYPE|PARENT_CHARACTER_SET_NAME|PARENT_COLLATION_NAME|PARENT_COLUMN_TYPE",
+ "varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar",
+ ),
+ "child|fk_child_parent|parent_id|parent|id|int|||int|bigint|||bigint",
+ )
+
+ client := newVDBClient(&stubDBClient{result: qr}, stats, 100)
+ refs, err := queryFKRefs(client, "db")
+ require.Error(t, err)
+ require.ErrorContains(t, err, "incompatible FK column definitions")
+ require.Nil(t, refs)
+}
+
+func TestQueryFKRefsAllowsCompatibleCharacterFKColumns(t *testing.T) {
+ stats := binlogplayer.NewStats()
+ stats.VReplicationLagGauges.Stop()
+ t.Cleanup(stats.Stop)
+
+ qr := sqltypes.MakeTestResult(
+ sqltypes.MakeTestFields(
+ "TABLE_NAME|CONSTRAINT_NAME|COLUMN_NAME|REFERENCED_TABLE_NAME|REFERENCED_COLUMN_NAME|CHILD_DATA_TYPE|CHILD_CHARACTER_SET_NAME|CHILD_COLLATION_NAME|CHILD_COLUMN_TYPE|PARENT_DATA_TYPE|PARENT_CHARACTER_SET_NAME|PARENT_COLLATION_NAME|PARENT_COLUMN_TYPE",
+ "varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar|varchar",
+ ),
+ "child|fk_child_parent|parent_code|parent|code|varchar|utf8mb4|utf8mb4_0900_ai_ci|varchar(64)|char|utf8mb4|utf8mb4_0900_ai_ci|char(32)",
+ )
+
+ client := newVDBClient(&stubDBClient{result: qr}, stats, 100)
+ refs, err := queryFKRefs(client, "db")
+ require.NoError(t, err)
+ require.Len(t, refs["child"], 1)
+ require.Equal(t, []string{"parent_code"}, refs["child"][0].ChildColumnNames)
+ require.Equal(t, []string{"code"}, refs["child"][0].ReferencedColumnNames)
+}
+
+func TestBuildTxnWritesetMissingTablePlan(t *testing.T) {
+ rowEvent := &binlogdatapb.RowEvent{
+ TableName: "missing",
+ RowChanges: []*binlogdatapb.RowChange{{
+ After: &querypb.Row{Values: []byte("1"), Lengths: []int64{1}},
+ }},
+ }
+ vevent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, RowEvent: rowEvent}
+
+ keys, err := buildTxnWriteset(map[string]*TablePlan{}, nil, nil, []*binlogdatapb.VEvent{vevent})
+ require.Error(t, err)
+ require.Nil(t, keys)
+}
+
+func TestBuildTxnWritesetNoRows(t *testing.T) {
+ vevent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_BEGIN}
+ keys, err := buildTxnWriteset(map[string]*TablePlan{}, nil, nil, []*binlogdatapb.VEvent{vevent})
+ require.NoError(t, err)
+ require.Nil(t, keys)
+}
+
+// TestWritesetKeysForFKRefMissingColumn checks that an FK ref naming a child
+// column absent from the field index is rejected with a "not in streamed
+// fields" error and leaves the key set empty.
+func TestWritesetKeysForFKRefMissingColumn(t *testing.T) {
+	ref := &fkConstraintRef{ParentTable: "parent", ChildColumnNames: []string{"missing"}, ReferencedColumnNames: []string{"id"}}
+	fieldIdx := map[string]int{"id": 0}
+	vals := []sqltypes.Value{sqltypes.NewInt64(1)}
+	keySet := map[uint64]struct{}{}
+	// When an FK column is missing from the streamed fields, the function
+	// should return an error (fail closed) instead of silently dropping the edge.
+	err := writesetKeysForFKRef(ref, []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}}, fieldIdx, nil, vals, keySet)
+	require.Error(t, err)
+	require.ErrorContains(t, err, "not in streamed fields")
+	require.Empty(t, keySet)
+}
+
+// TestWritesetKeysForFKRef checks that writesetKeysForFKRef adds exactly one
+// key for a child row: the hash of the parent table name and the row's FK
+// column value.
+func TestWritesetKeysForFKRef(t *testing.T) {
+	// Child table has columns: id (PK), parent_id (FK -> parent.id)
+	childPlan := &TablePlan{
+		TargetName: "child",
+		Fields: []*querypb.Field{
+			{Name: "id", Type: querypb.Type_INT64},
+			{Name: "parent_id", Type: querypb.Type_INT64},
+		},
+		PKIndices: []bool{true, false},
+	}
+	ref := &fkConstraintRef{
+		ParentTable:      "parent",
+		ChildColumnNames: []string{"parent_id"},
+	}
+	// child row: id=5, parent_id=42
+	row := &querypb.Row{Values: []byte("542"), Lengths: []int64{1, 2}}
+	afterVals := sqltypes.MakeRowTrusted(childPlan.Fields, row)
+	// Build fieldIdx once per table, as buildTxnWriteset now does.
+	fieldIdx := make(map[string]int, len(childPlan.Fields))
+	for i, f := range childPlan.Fields {
+		fieldIdx[f.Name] = i
+	}
+	keySet := map[uint64]struct{}{}
+	// Assert the returned error so a hashing failure is reported directly
+	// rather than as a key-count mismatch on the next line.
+	require.NoError(t, writesetKeysForFKRef(ref, childPlan.Fields, fieldIdx, nil, afterVals, keySet))
+	require.Len(t, keySet, 1)
+	expected := testWritesetHash("parent", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("42")))
+	_, ok := keySet[expected]
+	require.True(t, ok)
+}
+
+// TestBuildTxnWritesetWithFKRefs checks that a child insert's writeset holds
+// both the child PK hash and the hash of the referenced parent row, so it
+// intersects the writeset of an insert of that parent row.
+func TestBuildTxnWritesetWithFKRefs(t *testing.T) {
+	// Parent table: parent(id PK)
+	parentPlan := &TablePlan{
+		TargetName: "parent",
+		Fields:     []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+		PKIndices:  []bool{true},
+	}
+	// Child table: child(id PK, parent_id FK -> parent.id)
+	childPlan := &TablePlan{
+		TargetName: "child",
+		Fields: []*querypb.Field{
+			{Name: "id", Type: querypb.Type_INT64},
+			{Name: "parent_id", Type: querypb.Type_INT64},
+		},
+		PKIndices: []bool{true, false},
+	}
+	fkRefs := map[string][]fkConstraintRef{
+		"child": {
+			{ParentTable: "parent", ChildColumnNames: []string{"parent_id"}, ReferencedColumnNames: []string{"id"}},
+		},
+	}
+	parentRefs := buildParentFKRefs(fkRefs)
+	tablePlans := map[string]*TablePlan{
+		"parent": parentPlan,
+		"child":  childPlan,
+	}
+
+	// Parent insert: id=42
+	parentRow := &querypb.Row{Values: []byte("42"), Lengths: []int64{2}}
+	parentChange := &binlogdatapb.RowChange{After: parentRow}
+	parentEvent := &binlogdatapb.VEvent{
+		Type:     binlogdatapb.VEventType_ROW,
+		RowEvent: &binlogdatapb.RowEvent{TableName: "parent", RowChanges: []*binlogdatapb.RowChange{parentChange}},
+	}
+
+	// Child insert: id=5, parent_id=42
+	childRow := &querypb.Row{Values: []byte("542"), Lengths: []int64{1, 2}}
+	childChange := &binlogdatapb.RowChange{After: childRow}
+	childEvent := &binlogdatapb.VEvent{
+		Type:     binlogdatapb.VEventType_ROW,
+		RowEvent: &binlogdatapb.RowEvent{TableName: "child", RowChanges: []*binlogdatapb.RowChange{childChange}},
+	}
+
+	// Build writeset for parent txn
+	parentKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{parentEvent})
+	require.NoError(t, err)
+	parentHash := testWritesetHash("parent", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("42")))
+	require.Equal(t, []uint64{parentHash}, parentKeys)
+
+	// Build writeset for child txn — should have both child PK hash and parent FK ref hash
+	childKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{childEvent})
+	require.NoError(t, err)
+	require.Len(t, childKeys, 2)
+	childPKHash := testWritesetHash("child", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("5")))
+	assert.ElementsMatch(t, []uint64{childPKHash, parentHash}, childKeys)
+
+	// The parent hash appears in both writesets — this creates a conflict
+	// that forces serialization, preventing FK constraint violations.
+	require.True(t, keySetsIntersect(parentKeys, childKeys), "parent and child writesets should conflict on parent hash")
+}
+
+// TestBuildTxnWritesetWithCompositeParentFKRefsUsesIdentityColumnOrder uses a
+// parent whose fields are listed as (b, a) while its IdentityColumns and the
+// FK's referenced columns are (a, b). It expects the parent key to be hashed
+// over the values in (a, b) order and the child writeset to contain that same
+// hash.
+func TestBuildTxnWritesetWithCompositeParentFKRefsUsesIdentityColumnOrder(t *testing.T) {
+	parentPlan := &TablePlan{
+		TargetName:      "parent",
+		Fields:          []*querypb.Field{{Name: "b", Type: querypb.Type_INT64}, {Name: "a", Type: querypb.Type_INT64}},
+		IdentityColumns: []string{"a", "b"},
+		PKIndices:       []bool{true, true},
+	}
+	childPlan := &TablePlan{
+		TargetName: "child",
+		Fields: []*querypb.Field{
+			{Name: "id", Type: querypb.Type_INT64},
+			{Name: "parent_a", Type: querypb.Type_INT64},
+			{Name: "parent_b", Type: querypb.Type_INT64},
+		},
+		PKIndices: []bool{true, false, false},
+	}
+	fkRefs := map[string][]fkConstraintRef{
+		"child": {
+			{ParentTable: "parent", ChildColumnNames: []string{"parent_a", "parent_b"}, ReferencedColumnNames: []string{"a", "b"}},
+		},
+	}
+	parentRefs := buildParentFKRefs(fkRefs)
+	tablePlans := map[string]*TablePlan{
+		"parent": parentPlan,
+		"child":  childPlan,
+	}
+
+	// Parent row in field order (b, a): b=1, a=2.
+	parentEvent := &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_ROW,
+		RowEvent: &binlogdatapb.RowEvent{TableName: "parent", RowChanges: []*binlogdatapb.RowChange{{
+			After: &querypb.Row{Values: []byte("12"), Lengths: []int64{1, 1}},
+		}}},
+	}
+	// Child row: id=9, parent_a=2, parent_b=1.
+	childEvent := &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_ROW,
+		RowEvent: &binlogdatapb.RowEvent{TableName: "child", RowChanges: []*binlogdatapb.RowChange{{
+			After: &querypb.Row{Values: []byte("921"), Lengths: []int64{1, 1, 1}},
+		}}},
+	}
+
+	parentKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{parentEvent})
+	require.NoError(t, err)
+	parentHash := testWritesetHash(
+		"parent",
+		sqltypes.MakeTrusted(querypb.Type_INT64, []byte("2")),
+		sqltypes.MakeTrusted(querypb.Type_INT64, []byte("1")),
+	)
+	require.Equal(t, []uint64{parentHash}, parentKeys)
+
+	childKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{childEvent})
+	require.NoError(t, err)
+	require.Len(t, childKeys, 2)
+	childPKHash := testWritesetHash("child", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("9")))
+	assert.ElementsMatch(t, []uint64{childPKHash, parentHash}, childKeys)
+
+	// parentKeys is exactly {parentHash} (asserted above), so intersecting with
+	// it checks the conflict on the parent identity hash.
+	require.True(t, keySetsIntersect(parentKeys, childKeys), "parent and child writesets should conflict on the parent identity hash")
+}
+
+// TestBuildTxnWritesetWithRenamedTableFKRefsUsesTargetTableNames keys the
+// table plans and row events by source names (parent_src, child_src) while
+// the plans' TargetName and the FK refs use the target names (parent, child).
+// It expects every hash to be computed with the target names and the parent
+// and child writesets to intersect.
+func TestBuildTxnWritesetWithRenamedTableFKRefsUsesTargetTableNames(t *testing.T) {
+	parentPlan := &TablePlan{
+		TargetName: "parent",
+		Fields:     []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+		PKIndices:  []bool{true},
+	}
+	childPlan := &TablePlan{
+		TargetName: "child",
+		Fields: []*querypb.Field{
+			{Name: "id", Type: querypb.Type_INT64},
+			{Name: "parent_id", Type: querypb.Type_INT64},
+		},
+		PKIndices: []bool{true, false},
+	}
+	fkRefs := map[string][]fkConstraintRef{
+		"child": {
+			{ParentTable: "parent", ChildColumnNames: []string{"parent_id"}, ReferencedColumnNames: []string{"id"}},
+		},
+	}
+	parentRefs := buildParentFKRefs(fkRefs)
+	tablePlans := map[string]*TablePlan{
+		"parent_src": parentPlan,
+		"child_src":  childPlan,
+	}
+
+	parentRow := &querypb.Row{Values: []byte("42"), Lengths: []int64{2}}
+	parentEvent := &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_ROW,
+		RowEvent: &binlogdatapb.RowEvent{
+			TableName:  "parent_src",
+			RowChanges: []*binlogdatapb.RowChange{{After: parentRow}},
+		},
+	}
+	childRow := &querypb.Row{Values: []byte("542"), Lengths: []int64{1, 2}}
+	childEvent := &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_ROW,
+		RowEvent: &binlogdatapb.RowEvent{
+			TableName:  "child_src",
+			RowChanges: []*binlogdatapb.RowChange{{After: childRow}},
+		},
+	}
+
+	parentKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{parentEvent})
+	require.NoError(t, err)
+	parentHash := testWritesetHash("parent", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("42")))
+	require.Equal(t, []uint64{parentHash}, parentKeys)
+
+	childKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{childEvent})
+	require.NoError(t, err)
+	require.Len(t, childKeys, 2)
+	childPKHash := testWritesetHash("child", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("5")))
+	assert.ElementsMatch(t, []uint64{childPKHash, parentHash}, childKeys)
+
+	require.True(t, keySetsIntersect(parentKeys, childKeys), "renamed parent and child writesets should still conflict on target parent hash")
+}
+
+// TestBuildTxnWritesetWithMixedCaseFKRefsUsesTargetTableNames pairs plans
+// whose TargetName is mixed case (Parent, Child) with FK refs written in
+// lower case (parent, child). It expects hashes to use the plans' TargetName
+// spelling and the parent and child writesets to intersect.
+func TestBuildTxnWritesetWithMixedCaseFKRefsUsesTargetTableNames(t *testing.T) {
+	parentPlan := &TablePlan{
+		TargetName: "Parent",
+		Fields:     []*querypb.Field{{Name: "id", Type: querypb.Type_INT64}},
+		PKIndices:  []bool{true},
+	}
+	childPlan := &TablePlan{
+		TargetName: "Child",
+		Fields: []*querypb.Field{
+			{Name: "id", Type: querypb.Type_INT64},
+			{Name: "parent_id", Type: querypb.Type_INT64},
+		},
+		PKIndices: []bool{true, false},
+	}
+	fkRefs := map[string][]fkConstraintRef{
+		"child": {
+			{ParentTable: "parent", ChildColumnNames: []string{"parent_id"}, ReferencedColumnNames: []string{"id"}},
+		},
+	}
+	parentRefs := buildParentFKRefs(fkRefs)
+	tablePlans := map[string]*TablePlan{
+		"parent_src": parentPlan,
+		"child_src":  childPlan,
+	}
+
+	parentRow := &querypb.Row{Values: []byte("42"), Lengths: []int64{2}}
+	parentEvent := &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_ROW,
+		RowEvent: &binlogdatapb.RowEvent{
+			TableName:  "parent_src",
+			RowChanges: []*binlogdatapb.RowChange{{After: parentRow}},
+		},
+	}
+	childRow := &querypb.Row{Values: []byte("542"), Lengths: []int64{1, 2}}
+	childEvent := &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_ROW,
+		RowEvent: &binlogdatapb.RowEvent{
+			TableName:  "child_src",
+			RowChanges: []*binlogdatapb.RowChange{{After: childRow}},
+		},
+	}
+
+	parentKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{parentEvent})
+	require.NoError(t, err)
+	parentHash := testWritesetHash("Parent", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("42")))
+	require.Equal(t, []uint64{parentHash}, parentKeys)
+
+	childKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{childEvent})
+	require.NoError(t, err)
+	require.Len(t, childKeys, 2)
+	childPKHash := testWritesetHash("Child", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("5")))
+	assert.ElementsMatch(t, []uint64{childPKHash, parentHash}, childKeys)
+
+	// parentKeys is exactly {parentHash} (asserted above), so intersecting with
+	// it checks the conflict on the target parent hash.
+	require.True(t, keySetsIntersect(parentKeys, childKeys), "mixed-case FK metadata should still conflict on the target parent hash")
+}
+
+func TestBuildTxnWritesetTextPrimaryKeyUsesCollationEquality(t *testing.T) {
+ collationID := uint32(collations.MySQL8().LookupByName("utf8mb4_0900_ai_ci"))
+ require.NotZero(t, collationID)
+
+ plan := &TablePlan{
+ TargetName: "emails",
+ Fields: []*querypb.Field{{
+ Name: "email",
+ Type: querypb.Type_VARCHAR,
+ Charset: collationID,
+ }},
+ PKIndices: []bool{true},
+ }
+
+ upperEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_ROW,
+ RowEvent: &binlogdatapb.RowEvent{TableName: "emails", RowChanges: []*binlogdatapb.RowChange{{
+ After: &querypb.Row{Values: []byte("A"), Lengths: []int64{1}},
+ }}},
+ }
+ lowerEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_ROW,
+ RowEvent: &binlogdatapb.RowEvent{TableName: "emails", RowChanges: []*binlogdatapb.RowChange{{
+ After: &querypb.Row{Values: []byte("a"), Lengths: []int64{1}},
+ }}},
+ }
+
+ upperKeys, err := buildTxnWriteset(map[string]*TablePlan{"emails": plan}, nil, nil, []*binlogdatapb.VEvent{upperEvent})
+ require.NoError(t, err)
+ lowerKeys, err := buildTxnWriteset(map[string]*TablePlan{"emails": plan}, nil, nil, []*binlogdatapb.VEvent{lowerEvent})
+ require.NoError(t, err)
+ require.Equal(t, upperKeys, lowerKeys, "text primary keys that compare equal under MySQL collation rules must hash identically")
+}
+
+func TestBuildTxnWritesetPadSpaceTextPrimaryKeyUsesTrailingSpaceEquality(t *testing.T) {
+ collationID := uint32(collations.MySQL8().LookupByName("utf8mb4_general_ci"))
+ require.NotZero(t, collationID)
+
+ plan := &TablePlan{
+ TargetName: "emails",
+ Fields: []*querypb.Field{{
+ Name: "email",
+ Type: querypb.Type_VARCHAR,
+ Charset: collationID,
+ }},
+ PKIndices: []bool{true},
+ }
+
+ trimmedEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_ROW,
+ RowEvent: &binlogdatapb.RowEvent{TableName: "emails", RowChanges: []*binlogdatapb.RowChange{{
+ After: &querypb.Row{Values: []byte("a"), Lengths: []int64{1}},
+ }}},
+ }
+ spacedEvent := &binlogdatapb.VEvent{
+ Type: binlogdatapb.VEventType_ROW,
+ RowEvent: &binlogdatapb.RowEvent{TableName: "emails", RowChanges: []*binlogdatapb.RowChange{{
+ After: &querypb.Row{Values: []byte("a "), Lengths: []int64{2}},
+ }}},
+ }
+
+ trimmedKeys, err := buildTxnWriteset(map[string]*TablePlan{"emails": plan}, nil, nil, []*binlogdatapb.VEvent{trimmedEvent})
+ require.NoError(t, err)
+ spacedKeys, err := buildTxnWriteset(map[string]*TablePlan{"emails": plan}, nil, nil, []*binlogdatapb.VEvent{spacedEvent})
+ require.NoError(t, err)
+ require.Equal(t, trimmedKeys, spacedKeys, "text primary keys that compare equal under PAD SPACE collation rules must hash identically")
+}
+
+// TestBuildTxnWritesetWithStringFKRefsUsesCollationEqualityAcrossCompatibleTypes
+// uses a CHAR parent key and a VARCHAR child FK column that share the
+// utf8mb4_0900_ai_ci collation. It expects a parent row "A" and a child row
+// referencing "a" to produce intersecting writesets.
+func TestBuildTxnWritesetWithStringFKRefsUsesCollationEqualityAcrossCompatibleTypes(t *testing.T) {
+	collationID := uint32(collations.MySQL8().LookupByName("utf8mb4_0900_ai_ci"))
+	require.NotZero(t, collationID)
+
+	parentPlan := &TablePlan{
+		TargetName: "parent",
+		Fields: []*querypb.Field{{
+			Name:    "email",
+			Type:    querypb.Type_CHAR,
+			Charset: collationID,
+		}},
+		PKIndices: []bool{true},
+	}
+	childPlan := &TablePlan{
+		TargetName: "child",
+		Fields: []*querypb.Field{
+			{Name: "id", Type: querypb.Type_INT64},
+			{Name: "parent_email", Type: querypb.Type_VARCHAR, Charset: collationID},
+		},
+		PKIndices: []bool{true, false},
+	}
+	fkRefs := map[string][]fkConstraintRef{
+		"child": {{ParentTable: "parent", ChildColumnNames: []string{"parent_email"}, ReferencedColumnNames: []string{"email"}}},
+	}
+	parentRefs := buildParentFKRefs(fkRefs)
+	tablePlans := map[string]*TablePlan{
+		"parent": parentPlan,
+		"child":  childPlan,
+	}
+
+	parentEvent := &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_ROW,
+		RowEvent: &binlogdatapb.RowEvent{TableName: "parent", RowChanges: []*binlogdatapb.RowChange{{
+			After: &querypb.Row{Values: []byte("A"), Lengths: []int64{1}},
+		}}},
+	}
+	childEvent := &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_ROW,
+		RowEvent: &binlogdatapb.RowEvent{TableName: "child", RowChanges: []*binlogdatapb.RowChange{{
+			After: &querypb.Row{Values: []byte("1a"), Lengths: []int64{1, 1}},
+		}}},
+	}
+
+	parentKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{parentEvent})
+	require.NoError(t, err)
+	childKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{childEvent})
+	require.NoError(t, err)
+
+	require.True(t, keySetsIntersect(parentKeys, childKeys), "compatible string FK values that compare equal under MySQL collation rules must conflict")
+}
+
+// TestBuildTxnWritesetWithPadSpaceStringFKRefsUsesTrailingSpaceEqualityAcrossCompatibleTypes
+// uses a CHAR parent key and a VARCHAR child FK column that share the
+// utf8mb4_general_ci collation. It expects a parent row "A" and a child row
+// referencing "a " (trailing space) to produce intersecting writesets.
+func TestBuildTxnWritesetWithPadSpaceStringFKRefsUsesTrailingSpaceEqualityAcrossCompatibleTypes(t *testing.T) {
+	collationID := uint32(collations.MySQL8().LookupByName("utf8mb4_general_ci"))
+	require.NotZero(t, collationID)
+
+	parentPlan := &TablePlan{
+		TargetName: "parent",
+		Fields: []*querypb.Field{{
+			Name:    "email",
+			Type:    querypb.Type_CHAR,
+			Charset: collationID,
+		}},
+		PKIndices: []bool{true},
+	}
+	childPlan := &TablePlan{
+		TargetName: "child",
+		Fields: []*querypb.Field{
+			{Name: "id", Type: querypb.Type_INT64},
+			{Name: "parent_email", Type: querypb.Type_VARCHAR, Charset: collationID},
+		},
+		PKIndices: []bool{true, false},
+	}
+	fkRefs := map[string][]fkConstraintRef{
+		"child": {{ParentTable: "parent", ChildColumnNames: []string{"parent_email"}, ReferencedColumnNames: []string{"email"}}},
+	}
+	parentRefs := buildParentFKRefs(fkRefs)
+	tablePlans := map[string]*TablePlan{
+		"parent": parentPlan,
+		"child":  childPlan,
+	}
+
+	parentEvent := &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_ROW,
+		RowEvent: &binlogdatapb.RowEvent{TableName: "parent", RowChanges: []*binlogdatapb.RowChange{{
+			After: &querypb.Row{Values: []byte("A"), Lengths: []int64{1}},
+		}}},
+	}
+	childEvent := &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_ROW,
+		RowEvent: &binlogdatapb.RowEvent{TableName: "child", RowChanges: []*binlogdatapb.RowChange{{
+			After: &querypb.Row{Values: []byte("1a "), Lengths: []int64{1, 2}},
+		}}},
+	}
+
+	parentKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{parentEvent})
+	require.NoError(t, err)
+	childKeys, err := buildTxnWriteset(tablePlans, fkRefs, parentRefs, []*binlogdatapb.VEvent{childEvent})
+	require.NoError(t, err)
+
+	require.True(t, keySetsIntersect(parentKeys, childKeys), "compatible PAD SPACE string FK values that compare equal under MySQL rules must conflict")
+}
+
+func TestBuildTxnWritesetExpressionPlanIsMarkedUnsupported(t *testing.T) {
+ vttablet.InitVReplicationConfigDefaults()
+ vr := &vreplicator{workflowConfig: vttablet.DefaultVReplicationConfig}
+ plan, err := vr.buildReplicatorPlan(
+ getSource(&binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{
+ Match: "t1",
+ Filter: "select a + b as c1, c as c2 from t1",
+ }}}),
+ map[string][]*ColumnInfo{"t1": {{Name: "c1", IsPK: true}, {Name: "c2"}}},
+ nil,
+ binlogplayer.NewStats(),
+ collations.MySQL8(),
+ sqlparser.NewTestParser(),
+ )
+ require.NoError(t, err)
+
+ tplan, err := plan.buildExecutionPlan(&binlogdatapb.FieldEvent{
+ TableName: "t1",
+ Fields: []*querypb.Field{
+ {Name: "a", Type: querypb.Type_INT64},
+ {Name: "b", Type: querypb.Type_INT64},
+ {Name: "c", Type: querypb.Type_INT64},
+ },
+ })
+ require.NoError(t, err)
+ assert.True(t, tplan.HasUnsupportedWritesetMapping)
+}
+
+func TestBuildTxnWritesetAliasedFKColumnPlanIsMarkedUnsupported(t *testing.T) {
+ vttablet.InitVReplicationConfigDefaults()
+ vr := &vreplicator{workflowConfig: vttablet.DefaultVReplicationConfig}
+ plan, err := vr.buildReplicatorPlan(
+ getSource(&binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{
+ Match: "child",
+ Filter: "select id, parent_id as pid from child",
+ }}}),
+ map[string][]*ColumnInfo{"child": {{Name: "id", IsPK: true}, {Name: "pid"}}},
+ nil,
+ binlogplayer.NewStats(),
+ collations.MySQL8(),
+ sqlparser.NewTestParser(),
+ )
+ require.NoError(t, err)
+
+ tplan, err := plan.buildExecutionPlan(&binlogdatapb.FieldEvent{
+ TableName: "child",
+ Fields: []*querypb.Field{
+ {Name: "id", Type: querypb.Type_INT64},
+ {Name: "parent_id", Type: querypb.Type_INT64},
+ },
+ })
+ require.NoError(t, err)
+ assert.True(t, tplan.HasUnsupportedWritesetMapping)
+}
+
+func TestBuildTxnWritesetMatchingAliasExpressionPlanIsMarkedUnsupported(t *testing.T) {
+ vttablet.InitVReplicationConfigDefaults()
+ vr := &vreplicator{workflowConfig: vttablet.DefaultVReplicationConfig}
+ plan, err := vr.buildReplicatorPlan(
+ getSource(&binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{
+ Match: "t1",
+ Filter: "select lower(email) as email from t1",
+ }}}),
+ map[string][]*ColumnInfo{"t1": {{Name: "email", IsPK: true}}},
+ nil,
+ binlogplayer.NewStats(),
+ collations.MySQL8(),
+ sqlparser.NewTestParser(),
+ )
+ require.NoError(t, err)
+
+ tplan, err := plan.buildExecutionPlan(&binlogdatapb.FieldEvent{
+ TableName: "t1",
+ Fields: []*querypb.Field{
+ {Name: "email", Type: querypb.Type_VARCHAR},
+ },
+ })
+ require.NoError(t, err)
+ assert.True(t, tplan.HasUnsupportedWritesetMapping)
+}
+
+// TestBuildTxnWritesetBacktickedDirectColumnPlanStaysSupported streams field
+// names wrapped in backticks for a plain "select id, email" filter and asserts
+// that the execution plan is not flagged as an unsupported writeset mapping
+// and that the stored field names have the backticks removed.
+func TestBuildTxnWritesetBacktickedDirectColumnPlanStaysSupported(t *testing.T) {
+	vttablet.InitVReplicationConfigDefaults()
+
+	rule := &binlogdatapb.Rule{
+		Match:  "t1",
+		Filter: "select id, email from t1",
+	}
+	columnInfo := map[string][]*ColumnInfo{"t1": {{Name: "id", IsPK: true}, {Name: "email"}}}
+	replicator := &vreplicator{workflowConfig: vttablet.DefaultVReplicationConfig}
+
+	replicatorPlan, err := replicator.buildReplicatorPlan(
+		getSource(&binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{rule}}),
+		columnInfo,
+		nil,
+		binlogplayer.NewStats(),
+		collations.MySQL8(),
+		sqlparser.NewTestParser(),
+	)
+	require.NoError(t, err)
+
+	// Field names arrive quoted with backticks.
+	fieldEvent := &binlogdatapb.FieldEvent{
+		TableName: "t1",
+		Fields: []*querypb.Field{
+			{Name: "`id`", Type: querypb.Type_INT64},
+			{Name: "`email`", Type: querypb.Type_VARCHAR},
+		},
+	}
+	execPlan, err := replicatorPlan.buildExecutionPlan(fieldEvent)
+	require.NoError(t, err)
+
+	assert.False(t, execPlan.HasUnsupportedWritesetMapping)
+	require.Len(t, execPlan.Fields, 2)
+	assert.Equal(t, "id", execPlan.Fields[0].Name)
+	assert.Equal(t, "email", execPlan.Fields[1].Name)
+}
+
+// keySetsIntersect returns true when at least one key occurs in both a and b,
+// and false otherwise (including when either slice is empty).
+func keySetsIntersect(a, b []uint64) bool {
+	// Index the keys of a, then probe with each key of b.
+	seen := make(map[uint64]struct{}, len(a))
+	for i := range a {
+		seen[a[i]] = struct{}{}
+	}
+	for i := range b {
+		_, shared := seen[b[i]]
+		if shared {
+			return true
+		}
+	}
+	return false
+}
+
+// uniqueKeyRowEvent returns a ROW event on table "t1" holding one change whose
+// after-image is the two-column row (id, email). The column values are
+// concatenated into Values and their byte lengths recorded in Lengths.
+func uniqueKeyRowEvent(id, email string) *binlogdatapb.VEvent {
+	payload := make([]byte, 0, len(id)+len(email))
+	payload = append(payload, id...)
+	payload = append(payload, email...)
+
+	after := &querypb.Row{
+		Values:  payload,
+		Lengths: []int64{int64(len(id)), int64(len(email))},
+	}
+	change := &binlogdatapb.RowChange{After: after}
+
+	return &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_ROW,
+		RowEvent: &binlogdatapb.RowEvent{
+			TableName:  "t1",
+			RowChanges: []*binlogdatapb.RowChange{change},
+		},
+	}
+}
+
+// uniqueKeyPlan returns a fresh TablePlan for table "t1" with columns
+// (id INT64, email VARCHAR), where id is the primary key / identity column and
+// email is listed as a single-column unique key.
+func uniqueKeyPlan() *TablePlan {
+	columns := []*querypb.Field{
+		{Name: "id", Type: querypb.Type_INT64},
+		{Name: "email", Type: querypb.Type_VARCHAR},
+	}
+
+	plan := &TablePlan{TargetName: "t1", Fields: columns}
+	plan.PKIndices = []bool{true, false}
+	plan.IdentityColumns = []string{"id"}
+	plan.UniqueKeyColumns = [][]string{{"email"}}
+	return plan
+}
+
+// TestBuildTxnWritesetUniqueKeySameValueDifferentIdentityConflicts verifies
+// the MySQL-WRITESET-style unique-key behavior: rows with DIFFERENT identities
+// claiming the SAME unique secondary value yield overlapping writesets (so
+// they serialize), whereas rows with different unique values yield disjoint
+// writesets.
+func TestBuildTxnWritesetUniqueKeySameValueDifferentIdentityConflicts(t *testing.T) {
+	tablePlans := map[string]*TablePlan{"t1": uniqueKeyPlan()}
+
+	// writesetFor hashes a single-row transaction inserting (id, email).
+	writesetFor := func(id, email string) []uint64 {
+		t.Helper()
+		keys, err := buildTxnWriteset(tablePlans, nil, nil, []*binlogdatapb.VEvent{uniqueKeyRowEvent(id, email)})
+		require.NoError(t, err)
+		return keys
+	}
+
+	// Both id=1 and id=2 claim the email "a@x".
+	firstClaim := writesetFor("1", "a@x")
+	secondClaim := writesetFor("2", "a@x")
+	require.True(t, keySetsIntersect(firstClaim, secondClaim),
+		"changes on different identities sharing a unique value must conflict")
+
+	// id=2 with email "b@x" shares neither identity nor unique value with
+	// id=1/"a@x".
+	unrelated := writesetFor("2", "b@x")
+	require.False(t, keySetsIntersect(firstClaim, unrelated),
+		"changes with different unique values must not conflict")
+}
+
+// TestBuildTxnWritesetUniqueKeyUpdateEmitsBothImages verifies that an UPDATE
+// which changes a unique value contributes keys for both its before-image and
+// its after-image: its writeset must overlap with a transaction using the old
+// value and with a transaction using the new value.
+func TestBuildTxnWritesetUniqueKeyUpdateEmitsBothImages(t *testing.T) {
+	tablePlans := map[string]*TablePlan{"t1": uniqueKeyPlan()}
+
+	// writesetFor hashes a transaction consisting of the single given event.
+	writesetFor := func(ev *binlogdatapb.VEvent) []uint64 {
+		t.Helper()
+		keys, err := buildTxnWriteset(tablePlans, nil, nil, []*binlogdatapb.VEvent{ev})
+		require.NoError(t, err)
+		return keys
+	}
+
+	// UPDATE on id=1: email goes from "old@x" to "new@x".
+	moveEmail := &binlogdatapb.RowChange{
+		Before: &querypb.Row{Values: []byte("1old@x"), Lengths: []int64{1, 5}},
+		After:  &querypb.Row{Values: []byte("1new@x"), Lengths: []int64{1, 5}},
+	}
+	updateKeys := writesetFor(&binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_ROW,
+		RowEvent: &binlogdatapb.RowEvent{
+			TableName:  "t1",
+			RowChanges: []*binlogdatapb.RowChange{moveEmail},
+		},
+	})
+
+	// Another identity (id=7) taking the freed "old@x" value.
+	oldValueKeys := writesetFor(uniqueKeyRowEvent("7", "old@x"))
+	// Another identity (id=8) holding the "new@x" value.
+	newValueKeys := writesetFor(uniqueKeyRowEvent("8", "new@x"))
+
+	require.True(t, keySetsIntersect(updateKeys, oldValueKeys),
+		"the UPDATE must conflict with a txn claiming the freed before-image value")
+	require.True(t, keySetsIntersect(updateKeys, newValueKeys),
+		"the UPDATE must conflict with a txn holding the after-image value")
+}
+
+// TestBuildTxnWritesetUniqueKeyNullEmitsNoKey verifies that a NULL unique
+// value contributes no unique-key entry to the writeset (MySQL unique indexes
+// permit multiple NULLs, so two NULL rows must not conflict) while the PK key
+// is still present.
+func TestBuildTxnWritesetUniqueKeyNullEmitsNoKey(t *testing.T) {
+	tablePlans := map[string]*TablePlan{"t1": uniqueKeyPlan()}
+
+	// nullEmailEvent builds an insert of (id, NULL); a length of -1 encodes
+	// NULL for the email column.
+	nullEmailEvent := func(id string) *binlogdatapb.VEvent {
+		after := &querypb.Row{Values: []byte(id), Lengths: []int64{int64(len(id)), -1}}
+		return &binlogdatapb.VEvent{
+			Type: binlogdatapb.VEventType_ROW,
+			RowEvent: &binlogdatapb.RowEvent{
+				TableName:  "t1",
+				RowChanges: []*binlogdatapb.RowChange{{After: after}},
+			},
+		}
+	}
+	firstEvent := nullEmailEvent("1")
+	secondEvent := nullEmailEvent("2")
+
+	firstKeys, err := buildTxnWriteset(tablePlans, nil, nil, []*binlogdatapb.VEvent{firstEvent})
+	require.NoError(t, err)
+	secondKeys, err := buildTxnWriteset(tablePlans, nil, nil, []*binlogdatapb.VEvent{secondEvent})
+	require.NoError(t, err)
+
+	// Each writeset holds exactly one key, and the two rows (different
+	// identities, both NULL email) share none.
+	require.Len(t, firstKeys, 1, "NULL unique value must emit no unique-key key, only the PK key")
+	require.Len(t, secondKeys, 1)
+	require.False(t, keySetsIntersect(firstKeys, secondKeys),
+		"two NULL unique values on different identities must not conflict")
+
+	// The one key emitted for id=1 equals the PK hash.
+	wantPKKey := testWritesetHash("t1", sqltypes.MakeTrusted(querypb.Type_INT64, []byte("1")))
+	require.Equal(t, []uint64{wantPKKey}, firstKeys)
+}
+
+// TestBuildTxnWritesetUniqueKeyCaseInsensitiveCollationConflicts verifies that
+// unique values differing only in letter case, on a column whose collation is
+// utf8mb4_general_ci, produce overlapping writesets.
+func TestBuildTxnWritesetUniqueKeyCaseInsensitiveCollationConflicts(t *testing.T) {
+	generalCI := uint32(collations.MySQL8().LookupByName("utf8mb4_general_ci"))
+	require.NotZero(t, generalCI)
+
+	tablePlans := map[string]*TablePlan{
+		"t1": {
+			TargetName: "t1",
+			Fields: []*querypb.Field{
+				{Name: "id", Type: querypb.Type_INT64},
+				{Name: "email", Type: querypb.Type_VARCHAR, Charset: generalCI},
+			},
+			PKIndices:        []bool{true, false},
+			IdentityColumns:  []string{"id"},
+			UniqueKeyColumns: [][]string{{"email"}},
+		},
+	}
+
+	// id=1 carries "A@X"; id=2 carries "a@x".
+	upperCase, err := buildTxnWriteset(tablePlans, nil, nil, []*binlogdatapb.VEvent{uniqueKeyRowEvent("1", "A@X")})
+	require.NoError(t, err)
+	lowerCase, err := buildTxnWriteset(tablePlans, nil, nil, []*binlogdatapb.VEvent{uniqueKeyRowEvent("2", "a@x")})
+	require.NoError(t, err)
+
+	require.True(t, keySetsIntersect(upperCase, lowerCase),
+		"unique values equal under a case-insensitive collation must hash to the same unique key")
+}
+
+// TestBuildTxnWritesetUniqueKeyColumnMissingForcesSerialization verifies that
+// when a unique key names a column that is not among the plan's streamed
+// fields, buildTxnWriteset fails with a "not in streamed fields" error and
+// writesetErrorForcesSerialization reports true for that error.
+func TestBuildTxnWritesetUniqueKeyColumnMissingForcesSerialization(t *testing.T) {
+	tablePlans := map[string]*TablePlan{
+		"t1": {
+			TargetName: "t1",
+			Fields: []*querypb.Field{
+				{Name: "id", Type: querypb.Type_INT64},
+				{Name: "email", Type: querypb.Type_VARCHAR},
+			},
+			PKIndices:       []bool{true, false},
+			IdentityColumns: []string{"id"},
+			// "missing_col" does not appear in Fields above.
+			UniqueKeyColumns: [][]string{{"missing_col"}},
+		},
+	}
+	events := []*binlogdatapb.VEvent{uniqueKeyRowEvent("1", "a@x")}
+
+	_, err := buildTxnWriteset(tablePlans, nil, nil, events)
+	require.Error(t, err)
+	require.Contains(t, err.Error(), "not in streamed fields")
+	require.True(t, writesetErrorForcesSerialization(err),
+		"a missing unique-key column must route the txn to the serial path")
+}
+
+// TestBuildTxnWritesetUniqueKeyOrdinalDiscriminatesIndexes pins that two
+// different unique indexes with coincidentally equal values produce distinct
+// keys: the index ordinal is folded into the digest, so equal values on
+// different indexes do not over-serialize by colliding.
+func TestBuildTxnWritesetUniqueKeyOrdinalDiscriminatesIndexes(t *testing.T) {
+	// Two single-column unique secondaries (a, b), both INT64. A row with
+	// a == b would, without the ordinal discriminator, hash both unique keys
+	// to the same value.
+	plan := &TablePlan{
+		TargetName: "t1",
+		Fields: []*querypb.Field{
+			{Name: "id", Type: querypb.Type_INT64},
+			{Name: "a", Type: querypb.Type_INT64},
+			{Name: "b", Type: querypb.Type_INT64},
+		},
+		PKIndices:        []bool{true, false, false},
+		IdentityColumns:  []string{"id"},
+		UniqueKeyColumns: [][]string{{"a"}, {"b"}},
+	}
+
+	// id=1, a=7, b=7.
+	row := &querypb.Row{Values: []byte("177"), Lengths: []int64{1, 1, 1}}
+	event := &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_ROW,
+		RowEvent: &binlogdatapb.RowEvent{
+			TableName:  "t1",
+			RowChanges: []*binlogdatapb.RowChange{{After: row}},
+		},
+	}
+
+	keys, err := buildTxnWriteset(map[string]*TablePlan{"t1": plan}, nil, nil, []*binlogdatapb.VEvent{event})
+	require.NoError(t, err)
+	// PK key + two unique-key keys = 3 keys.
+	require.Len(t, keys, 3, "equal values on different unique indexes must produce distinct keys")
+
+	// The length alone does not prove the keys differ from one another, so
+	// check distinctness explicitly rather than relying on the writeset
+	// builder to have de-duplicated its output.
+	distinct := make(map[uint64]struct{}, len(keys))
+	for _, k := range keys {
+		distinct[k] = struct{}{}
+	}
+	require.Len(t, distinct, 3, "PK key and both unique-key keys must be pairwise distinct despite a == b")
+}
+
+// TestBuildTxnWritesetFKStreamedMetadataMismatchSerializes pins the
+// fail-closed path for target-only FKs whose SOURCE column metadata
+// diverges: queryFKRefs validates the TARGET schema, but the digests hash
+// the streamed (FIELD-event) metadata, so a child column streamed as INT64
+// referencing a parent column streamed as VARCHAR would hash equal logical
+// values to different keys and let the child/parent transactions reorder.
+// Such transactions must serialize instead.
+func TestBuildTxnWritesetFKStreamedMetadataMismatchSerializes(t *testing.T) {
+	collationID := uint32(collations.MySQL8().LookupByName("utf8mb4_general_ci"))
+	// Guard against a failed lookup so the parent column really carries the
+	// intended collation rather than a zero ID.
+	require.NotZero(t, collationID)
+
+	childPlan := &TablePlan{
+		TargetName: "child",
+		Fields: []*querypb.Field{
+			{Name: "id", Type: querypb.Type_INT64},
+			{Name: "parent_id", Type: querypb.Type_INT64},
+		},
+		PKIndices: []bool{true, false},
+	}
+	parentPlan := &TablePlan{
+		TargetName: "parent",
+		Fields: []*querypb.Field{
+			// The parent's referenced column streams as text: hash-incompatible
+			// with the child's INT64.
+			{Name: "id", Type: querypb.Type_VARCHAR, Charset: collationID},
+		},
+		PKIndices: []bool{true},
+	}
+	tablePlans := map[string]*TablePlan{"child": childPlan, "parent": parentPlan}
+	fkRefs := map[string][]fkConstraintRef{
+		"child": {{ParentTable: "parent", ChildColumnNames: []string{"parent_id"}, ReferencedColumnNames: []string{"id"}}},
+	}
+
+	// Child row: id=1, parent_id=42.
+	row := &querypb.Row{Values: []byte("142"), Lengths: []int64{1, 2}}
+	change := &binlogdatapb.RowChange{After: row}
+	rowEvent := &binlogdatapb.RowEvent{TableName: "child", RowChanges: []*binlogdatapb.RowChange{change}}
+	vevent := &binlogdatapb.VEvent{Type: binlogdatapb.VEventType_ROW, RowEvent: rowEvent}
+
+	keys, err := buildTxnWriteset(tablePlans, fkRefs, buildParentFKRefs(fkRefs), []*binlogdatapb.VEvent{vevent})
+	require.Error(t, err)
+	require.Contains(t, err.Error(), "streamed field metadata mismatch")
+	require.Nil(t, keys)
+	require.True(t, writesetErrorForcesSerialization(err), "metadata mismatch must serialize the txn, not fail the workflow")
+}
diff --git a/go/vt/vttablet/tabletmanager/vreplication/relaylog_test.go b/go/vt/vttablet/tabletmanager/vreplication/relaylog_test.go
new file mode 100644
index 00000000000..af0aaf0b1af
--- /dev/null
+++ b/go/vt/vttablet/tabletmanager/vreplication/relaylog_test.go
@@ -0,0 +1,132 @@
+/*
+Copyright 2026 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package vreplication
+
+import (
+ "context"
+ "io"
+ "testing"
+ "time"
+
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+
+ binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
+ querypb "vitess.io/vitess/go/vt/proto/query"
+)
+
+// TestRelayLogSendFetch sends one batch containing a single ROW event into a
+// relay log and asserts that Fetch returns exactly that one batch with its one
+// ROW event.
+func TestRelayLogSendFetch(t *testing.T) {
+	rl := newRelayLog(t.Context(), 5, 10)
+
+	after := &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}
+	rowEvent := &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_ROW,
+		RowEvent: &binlogdatapb.RowEvent{
+			TableName:  "t1",
+			RowChanges: []*binlogdatapb.RowChange{{After: after}},
+		},
+	}
+
+	err := rl.Send([]*binlogdatapb.VEvent{rowEvent})
+	require.NoError(t, err)
+
+	batches, err := rl.Fetch()
+	require.NoError(t, err)
+	require.Len(t, batches, 1)
+	require.Len(t, batches[0], 1)
+	assert.Equal(t, binlogdatapb.VEventType_ROW, batches[0][0].Type)
+}
+
+// TestRelayLogSendTimeout shortens vplayerProgressDeadline, sends one batch
+// successfully into a relay log created with limits (1, 1), and then asserts
+// that a second Send (with nothing fetching) returns an error containing
+// relayLogIOStalledMsg within five seconds.
+func TestRelayLogSendTimeout(t *testing.T) {
+	// Override the package-level deadline for this test only.
+	savedDeadline := vplayerProgressDeadline
+	t.Cleanup(func() { vplayerProgressDeadline = savedDeadline })
+	vplayerProgressDeadline = 100 * time.Millisecond
+
+	rl := newRelayLog(t.Context(), 1, 1)
+
+	after := &querypb.Row{Values: []byte("1"), Lengths: []int64{1}}
+	rowEvent := &binlogdatapb.VEvent{
+		Type: binlogdatapb.VEventType_ROW,
+		RowEvent: &binlogdatapb.RowEvent{
+			TableName:  "t1",
+			RowChanges: []*binlogdatapb.RowChange{{After: after}},
+		},
+	}
+
+	require.NoError(t, rl.Send([]*binlogdatapb.VEvent{rowEvent}))
+
+	// Run the second Send in the background; the buffered channel lets the
+	// goroutine finish even if the select below has already timed out.
+	sendResult := make(chan error, 1)
+	go func() {
+		sendResult <- rl.Send([]*binlogdatapb.VEvent{rowEvent})
+	}()
+
+	select {
+	case <-time.After(5 * time.Second):
+		t.Fatal("timed out waiting for send")
+	case sendErr := <-sendResult:
+		require.Error(t, sendErr)
+		assert.ErrorContains(t, sendErr, relayLogIOStalledMsg)
+	}
+}
+
+// TestRelayLogFetchTimeout shortens idleTimeout and asserts that Fetch on a
+// relay log that never received anything returns no items and no error.
+func TestRelayLogFetchTimeout(t *testing.T) {
+	ctx := t.Context()
+	// Override the package-level idle timeout for this test only.
+	oldIdle := idleTimeout
+	idleTimeout = 100 * time.Millisecond
+	t.Cleanup(func() {
+		idleTimeout = oldIdle
+	})
+
+	rl := newRelayLog(ctx, 1, 1)
+
+	items, err := rl.Fetch()
+	require.NoError(t, err)
+	// Empty states the intent directly and prints the slice contents on
+	// failure, unlike a length-zero comparison.
+	assert.Empty(t, items)
+}
+
+// TestRelayLogDoneReturnsEOF creates a relay log on an already-cancelled
+// context and asserts that Fetch returns io.EOF together with nil items.
+func TestRelayLogDoneReturnsEOF(t *testing.T) {
+	cancelledCtx, cancel := context.WithCancel(t.Context())
+	cancel()
+
+	batches, err := newRelayLog(cancelledCtx, 1, 1).Fetch()
+	assert.ErrorIs(t, err, io.EOF)
+	assert.Nil(t, batches)
+}
+
+// TestRelayLogEventsSize asserts that eventsSize reports 5 for a batch made of
+// one ROW event (a 2-byte before-image plus a 3-byte after-image) and one
+// COMMIT event.
+func TestRelayLogEventsSize(t *testing.T) {
+	twoBytes := &querypb.Row{Values: []byte("ab"), Lengths: []int64{2}}
+	threeBytes := &querypb.Row{Values: []byte("cde"), Lengths: []int64{3}}
+
+	batch := []*binlogdatapb.VEvent{
+		{
+			Type: binlogdatapb.VEventType_ROW,
+			RowEvent: &binlogdatapb.RowEvent{
+				TableName: "t1",
+				RowChanges: []*binlogdatapb.RowChange{
+					{Before: twoBytes},
+					{After: threeBytes},
+				},
+			},
+		},
+		{Type: binlogdatapb.VEventType_COMMIT},
+	}
+
+	assert.Equal(t, 5, eventsSize(batch))
+}
diff --git a/go/vt/vttablet/tabletmanager/vreplication/replicator_plan.go b/go/vt/vttablet/tabletmanager/vreplication/replicator_plan.go
index c6d0675b94c..1fe7a3deaac 100644
--- a/go/vt/vttablet/tabletmanager/vreplication/replicator_plan.go
+++ b/go/vt/vttablet/tabletmanager/vreplication/replicator_plan.go
@@ -23,6 +23,7 @@ import (
"slices"
"sort"
"strings"
+ "sync"
"vitess.io/vitess/go/bytes2"
"vitess.io/vitess/go/mysql/collations"
@@ -88,6 +89,7 @@ func (rp *ReplicatorPlan) buildExecutionPlan(fieldEvent *binlogdatapb.FieldEvent
trimmed.Name = strings.Trim(trimmed.Name, "`")
tplanv.Fields = append(tplanv.Fields, trimmed)
}
+ tplanv.HasUnsupportedWritesetMapping = hasUnsupportedWritesetMapping(&tplanv, tplanv.Fields)
return &tplanv, nil
}
// select * construct was used. We need to use the field names.
@@ -99,6 +101,35 @@ func (rp *ReplicatorPlan) buildExecutionPlan(fieldEvent *binlogdatapb.FieldEvent
return tplan, nil
}
+// hasUnsupportedWritesetMapping reports whether the plan's source→target
+// column mapping is something the parallel applier's writeset hasher
+// cannot reason about safely. Plans that rewrite, project, or reorder
+// columns produce hash inputs that do not correspond 1:1 with the row
+// image bytes, so the scheduler falls back to serialization rather
+// than compute a misleading writeset that could miss conflicts.
+//
+// It returns false when plan is nil, when no fields were streamed, or when
+// the plan has no PKIndices. Otherwise it returns true unless every streamed
+// field lines up positionally with a column expression that has the same
+// target name and is a bare, unqualified source column of that same name.
+func hasUnsupportedWritesetMapping(plan *TablePlan, streamedFields []*querypb.Field) bool {
+	if plan == nil || len(streamedFields) == 0 || len(plan.PKIndices) == 0 {
+		return false
+	}
+	if len(streamedFields) != len(plan.PKIndices) {
+		return true
+	}
+	// Fail closed: without the builder's column expressions the mapping
+	// cannot be verified, so treat it as unsupported instead of panicking
+	// on a nil dereference.
+	if plan.TablePlanBuilder == nil {
+		return true
+	}
+	colExprs := plan.TablePlanBuilder.colExprs
+	// Every streamed field needs a column expression at the same position.
+	if len(colExprs) < len(streamedFields) {
+		return true
+	}
+	for i, field := range streamedFields {
+		if field == nil {
+			return true
+		}
+		cexpr := colExprs[i]
+		if cexpr == nil {
+			return true
+		}
+		fieldName := sqlparser.NewIdentifierCI(field.Name)
+		// The target column at this position must carry the streamed name.
+		if !cexpr.colName.Equal(fieldName) {
+			return true
+		}
+		// The source expression must be a plain, unqualified column with the
+		// same name; anything else (function call, alias of another column,
+		// qualified reference) is not a direct 1:1 mapping.
+		sourceCol, ok := cexpr.expr.(*sqlparser.ColName)
+		if !ok || !sourceCol.Name.Equal(fieldName) || !sourceCol.Qualifier.IsEmpty() {
+			return true
+		}
+	}
+	return false
+}
+
// buildFromFields builds a full TablePlan, but uses the field info as the
// full column list. This happens when the query used was a 'select *', which
// requires us to wait for the field info sent by the source.
@@ -210,17 +241,35 @@ type TablePlan struct {
// PKReferences is used to check if an event changed
// a primary key column (row move).
PKReferences []string
+ // IdentityColumns stores the chosen replication identity columns in key order.
+ IdentityColumns []string
// PKIndices is an array, length = #columns, true if column is part of the PK
- PKIndices []bool
- Stats *binlogplayer.Stats
- FieldsToSkip map[string]bool
- ConvertCharset map[string](*binlogdatapb.CharsetConversion)
- HasExtraSourcePkColumns bool
+ PKIndices []bool
+ // HasExtraUniqueSecondary means the table has uniqueness the writeset
+ // hasher cannot reason about (prefix/expression unique indexes, PK/identity
+ // mismatch); transactions touching it force-serialize.
+ HasExtraUniqueSecondary bool
+ // UniqueKeyColumns holds, per hashable unique secondary index, the ordered
+ // column names whose values get extra writeset keys (MySQL-WRITESET-style)
+ // so cross-row unique-value conflicts serialize against each other.
+ UniqueKeyColumns [][]string
+ // HasUnsupportedWritesetMapping means the streamed FIELD layout cannot be
+ // mapped positionally back to target PK/FK columns for safe writeset hashing.
+ HasUnsupportedWritesetMapping bool
+ Stats *binlogplayer.Stats
+ FieldsToSkip map[string]bool
+ ConvertCharset map[string](*binlogdatapb.CharsetConversion)
+ HasExtraSourcePkColumns bool
TablePlanBuilder *tablePlanBuilder
+ // partialMu protects PartialInserts and PartialUpdates from concurrent
+ // access when multiple parallel-apply workers process partial-row-image
+ // events for the same table simultaneously. Pointer to avoid copying
+ // the lock when TablePlan values are cloned in buildExecutionPlan.
+ partialMu *sync.Mutex
// PartialInserts is a dynamically generated cache of insert ParsedQueries, which update only some columns.
// This is when we use a binlog_row_image which is not "full". The key is a serialized bitmap of data columns
// which are sent as part of the RowEvent.
PartialInserts map[string]*sqlparser.ParsedQuery
// PartialUpdates are same as PartialInserts, but for update statements
PartialUpdates map[string]*sqlparser.ParsedQuery
@@ -852,6 +901,11 @@ func (tp *TablePlan) applyBulkDeleteChanges(rowDeletes []*binlogdatapb.RowChange
baseQuerySize := int64(len(tp.MultiDelete.Query))
querySize := baseQuerySize
+ // lastQR captures the most recent successful flush. The oversized-row
+ // edge case below can leave pkVals empty at the end of the loop, and
+ // we must not call execQuery on an empty buffer (it would build an
+ // invalid "IN ()" clause). The final check returns lastQR in that case.
+ var lastQR *sqltypes.Result
execQuery := func(pkVals *[]sqltypes.Value) (*sqltypes.Result, error) {
pksBV, err := sqltypes.BuildBindVariable(*pkVals)
@@ -863,7 +917,12 @@ func (tp *TablePlan) applyBulkDeleteChanges(rowDeletes []*binlogdatapb.RowChange
return nil, err
}
tp.TablePlanBuilder.stats.BulkQueryCount.Add("delete", 1)
- return executor(query)
+ qr, err := executor(query)
+ if err != nil {
+ return nil, err
+ }
+ lastQR = qr
+ return qr, nil
}
pkIndex := -1
@@ -880,6 +939,20 @@ func (tp *TablePlan) applyBulkDeleteChanges(rowDeletes []*binlogdatapb.RowChange
}
addedSize := int64(len(vals[pkIndex].Raw()) + 2) // Plus 2 for the comma and space
if querySize+addedSize > maxQuerySize {
+ // Edge case: a single PK value is large enough to exceed the
+ // query size budget on its own (pkVals is still empty). Flush
+ // it as a one-row query, slightly exceeding maxQuerySize, rather
+ // than flushing an empty pkVals and producing an invalid empty
+ // "IN ()" clause.
+ if len(pkVals) == 0 {
+ pkVals = append(pkVals, vals[pkIndex])
+ if _, err := execQuery(&pkVals); err != nil {
+ return nil, err
+ }
+ pkVals = nil
+ querySize = baseQuerySize
+ continue
+ }
if _, err := execQuery(&pkVals); err != nil {
return nil, err
}
@@ -890,6 +963,16 @@ func (tp *TablePlan) applyBulkDeleteChanges(rowDeletes []*binlogdatapb.RowChange
querySize += addedSize
}
+ // If pkVals is empty here, every row in this batch was flushed solo via
+ // the oversized-row edge case above. Return the last successful result
+ // instead of calling execQuery on an empty buffer (which would produce
+ // an invalid empty "IN ()" clause).
+ if len(pkVals) == 0 {
+ if lastQR != nil {
+ return lastQR, nil
+ }
+ return &sqltypes.Result{}, nil
+ }
return execQuery(&pkVals)
}
@@ -913,12 +996,23 @@ func (tp *TablePlan) applyBulkInsertChanges(rowInserts []*binlogdatapb.RowChange
maxQuerySize -= int64(len(insertPrefix))
values := &strings.Builder{}
+ // lastQR captures the most recent successful flush. The oversized-row
+ // edge case below can leave the values buffer empty at the end of the
+ // loop, and we must not call execQuery on an empty buffer (it would
+ // build an invalid INSERT with no VALUES). The final check returns
+ // lastQR in that case.
+ var lastQR *sqltypes.Result
execQuery := func(vals *strings.Builder) (*sqltypes.Result, error) {
if tp.BulkInsertOnDup != nil {
vals.WriteString(tp.BulkInsertOnDup.Query)
}
tp.TablePlanBuilder.stats.BulkQueryCount.Add("insert", 1)
- return executor(insertPrefix + vals.String())
+ qr, err := executor(insertPrefix + vals.String())
+ if err != nil {
+ return nil, err
+ }
+ lastQR = qr
+ return qr, nil
}
limit := tp.maxRowJSONBytes()
@@ -962,7 +1056,19 @@ func (tp *TablePlan) applyBulkInsertChanges(rowInserts []*binlogdatapb.RowChange
if err := tp.BulkInsertValues.Append(rowValues, bindvars, nil); err != nil {
return nil, err
}
- if !newStmt && int64(values.Len()+2+rowValues.Len()) > maxQuerySize { // Plus 2 for the comma and space
+ if int64(values.Len()+2+rowValues.Len()) > maxQuerySize { // Plus 2 for the comma and space
+ // Edge case: a single row's VALUES clause is large enough to
+ // exceed the query size budget on its own (values buffer is
+ // still empty). Flush it as a one-row INSERT, slightly exceeding
+ // maxQuerySize, rather than flushing an empty VALUES buffer and
+ // producing an invalid INSERT with no VALUES.
+ if values.Len() == 0 {
+ if _, err := execQuery(rowValues); err != nil {
+ return nil, err
+ }
+ newStmt = true
+ continue
+ }
if _, err := execQuery(values); err != nil {
return nil, err
}
@@ -976,6 +1082,16 @@ func (tp *TablePlan) applyBulkInsertChanges(rowInserts []*binlogdatapb.RowChange
newStmt = false
}
+ // If the values buffer is empty here, every row in this batch was flushed
+ // solo via the oversized-row edge case above. Return the last successful
+ // result instead of calling execQuery on an empty buffer (which would
+ // produce an INSERT with no VALUES).
+ if values.Len() == 0 {
+ if lastQR != nil {
+ return lastQR, nil
+ }
+ return &sqltypes.Result{}, nil
+ }
return execQuery(values)
}
diff --git a/go/vt/vttablet/tabletmanager/vreplication/replicator_plan_test.go b/go/vt/vttablet/tabletmanager/vreplication/replicator_plan_test.go
index 609d6851ced..c5823c96190 100644
--- a/go/vt/vttablet/tabletmanager/vreplication/replicator_plan_test.go
+++ b/go/vt/vttablet/tabletmanager/vreplication/replicator_plan_test.go
@@ -807,6 +807,30 @@ func TestBuildPlayerPlanNoDup(t *testing.T) {
assert.ErrorContainsf(t, err, want, "buildReplicatorPlan err: %v, must contain: %v", err, want)
}
+// TestBuildPlayerPlanInsertIgnorePreservesPKIndices builds a plan for a
+// "group by" filter over (id, c2) and asserts that the generated table plan
+// exposes IdentityColumns ["id"], PKIndices [true, false], and non-nil Insert
+// and Update statements.
+func TestBuildPlayerPlanInsertIgnorePreservesPKIndices(t *testing.T) {
+	vttablet.InitVReplicationConfigDefaults()
+
+	rule := &binlogdatapb.Rule{
+		Match:  "t1",
+		Filter: "select id, c2 from t1 group by id, c2",
+	}
+	columnInfo := map[string][]*ColumnInfo{"t1": {{Name: "id", IsPK: true}, {Name: "c2"}}}
+	replicator := &vreplicator{workflowConfig: vttablet.DefaultVReplicationConfig}
+
+	replicatorPlan, err := replicator.buildReplicatorPlan(
+		getSource(&binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{rule}}),
+		columnInfo,
+		nil,
+		binlogplayer.NewStats(),
+		collations.MySQL8(),
+		sqlparser.NewTestParser(),
+	)
+	require.NoError(t, err)
+
+	tablePlan := replicatorPlan.TablePlans["t1"]
+	require.NotNil(t, tablePlan)
+	require.Equal(t, []string{"id"}, tablePlan.IdentityColumns)
+	require.Equal(t, []bool{true, false}, tablePlan.PKIndices)
+	require.NotNil(t, tablePlan.Insert)
+	require.NotNil(t, tablePlan.Update)
+}
+
func TestBuildPlayerPlanExclude(t *testing.T) {
PrimaryKeyInfos := map[string][]*ColumnInfo{
"t1": {&ColumnInfo{Name: "c1"}},
diff --git a/go/vt/vttablet/tabletmanager/vreplication/table_plan_builder.go b/go/vt/vttablet/tabletmanager/vreplication/table_plan_builder.go
index 7ad2177fe5e..24b8857789c 100644
--- a/go/vt/vttablet/tabletmanager/vreplication/table_plan_builder.go
+++ b/go/vt/vttablet/tabletmanager/vreplication/table_plan_builder.go
@@ -22,6 +22,7 @@ import (
"regexp"
"sort"
"strings"
+ "sync"
"vitess.io/vitess/go/mysql/collations"
"vitess.io/vitess/go/sqltypes"
@@ -354,6 +355,16 @@ func (tpb *tablePlanBuilder) generate() *TablePlan {
pkrefs = append(pkrefs, k)
}
sort.Strings(pkrefs)
+ identityCols := make([]string, 0, len(tpb.pkCols))
+ for _, pkCol := range tpb.pkCols {
+ identityCols = append(identityCols, pkCol.colName.Lowered())
+ }
+ tpb.pkIndices = make([]bool, len(tpb.colExprs))
+ for i, cexpr := range tpb.colExprs {
+ if cexpr.isPK {
+ tpb.pkIndices[i] = true
+ }
+ }
bvf := &bindvarFormatter{}
@@ -374,11 +385,13 @@ func (tpb *tablePlanBuilder) generate() *TablePlan {
Delete: tpb.generateDeleteStatement(),
MultiDelete: tpb.generateMultiDeleteStatement(),
PKReferences: pkrefs,
+ IdentityColumns: identityCols,
PKIndices: tpb.pkIndices,
Stats: tpb.stats,
FieldsToSkip: fieldsToSkip,
HasExtraSourcePkColumns: len(tpb.extraSourcePkCols) > 0,
TablePlanBuilder: tpb,
+ partialMu: &sync.Mutex{},
PartialInserts: make(map[string]*sqlparser.ParsedQuery, 0),
PartialUpdates: make(map[string]*sqlparser.ParsedQuery, 0),
CollationEnv: tpb.collationEnv,
@@ -811,11 +824,7 @@ func (tpb *tablePlanBuilder) generateUpdateStatement() *sqlparser.ParsedQuery {
buf := sqlparser.NewTrackedBuffer(bvf.formatter)
buf.Myprintf("update %v set ", tpb.name)
separator := ""
- tpb.pkIndices = make([]bool, len(tpb.colExprs))
- for i, cexpr := range tpb.colExprs {
- if cexpr.isPK {
- tpb.pkIndices[i] = true
- }
+ for _, cexpr := range tpb.colExprs {
if cexpr.isGrouped || cexpr.isPK || cexpr.isGenerated {
continue
}
diff --git a/go/vt/vttablet/tabletmanager/vreplication/table_plan_partial.go b/go/vt/vttablet/tabletmanager/vreplication/table_plan_partial.go
index 3f401192fdf..645b00b63cb 100644
--- a/go/vt/vttablet/tabletmanager/vreplication/table_plan_partial.go
+++ b/go/vt/vttablet/tabletmanager/vreplication/table_plan_partial.go
@@ -178,6 +178,8 @@ func (tpb *tablePlanBuilder) createPartialUpdateQuery(dataColumns *binlogdatapb.
func (tp *TablePlan) getPartialInsertQuery(dataColumns *binlogdatapb.RowChange_Bitmap) (*sqlparser.ParsedQuery, error) {
key := hex.EncodeToString(dataColumns.Cols)
+ tp.partialMu.Lock()
+ defer tp.partialMu.Unlock()
ins, ok := tp.PartialInserts[key]
if ok {
return ins, nil
@@ -193,6 +195,8 @@ func (tp *TablePlan) getPartialInsertQuery(dataColumns *binlogdatapb.RowChange_B
func (tp *TablePlan) getPartialUpdateQuery(dataColumns *binlogdatapb.RowChange_Bitmap) (*sqlparser.ParsedQuery, error) {
key := hex.EncodeToString(dataColumns.Cols)
+ tp.partialMu.Lock()
+ defer tp.partialMu.Unlock()
upd, ok := tp.PartialUpdates[key]
if ok {
return upd, nil
diff --git a/go/vt/vttablet/tabletmanager/vreplication/vcopier.go b/go/vt/vttablet/tabletmanager/vreplication/vcopier.go
index d42fb349c80..2d58de024f5 100644
--- a/go/vt/vttablet/tabletmanager/vreplication/vcopier.go
+++ b/go/vt/vttablet/tabletmanager/vreplication/vcopier.go
@@ -425,7 +425,7 @@ func (vc *vcopier) copyTable(ctx context.Context, tableName string, copyState ma
var prevCh <-chan *vcopierCopyTaskResult
vstreamOptions := &binlogdatapb.VStreamOptions{
- ConfigOverrides: vc.vr.workflowConfig.Overrides,
+ ConfigOverrides: vc.vr.workflowConfig.SourceOverrides(),
}
serr := vc.vr.sourceVStreamer.VStreamRows(ctx, initialPlan.SendRule.Filter, lastpkpb, func(rows *binlogdatapb.VStreamRowsResponse) error {
for {
diff --git a/go/vt/vttablet/tabletmanager/vreplication/vcopier_atomic.go b/go/vt/vttablet/tabletmanager/vreplication/vcopier_atomic.go
index 382dab60b67..16a531af9d0 100644
--- a/go/vt/vttablet/tabletmanager/vreplication/vcopier_atomic.go
+++ b/go/vt/vttablet/tabletmanager/vreplication/vcopier_atomic.go
@@ -30,10 +30,11 @@ import (
"vitess.io/vitess/go/sqltypes"
"vitess.io/vitess/go/vt/binlog/binlogplayer"
"vitess.io/vitess/go/vt/log"
- binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
- querypb "vitess.io/vitess/go/vt/proto/query"
"vitess.io/vitess/go/vt/sqlparser"
"vitess.io/vitess/go/vt/vterrors"
+
+ binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
+ querypb "vitess.io/vitess/go/vt/proto/query"
)
/*
@@ -103,7 +104,7 @@ func (vc *vcopier) copyAll(ctx context.Context, settings binlogplayer.VRSettings
var gtid string
vstreamOptions := &binlogdatapb.VStreamOptions{
- ConfigOverrides: vc.vr.workflowConfig.Overrides,
+ ConfigOverrides: vc.vr.workflowConfig.SourceOverrides(),
}
serr := vc.vr.sourceVStreamer.VStreamTables(ctx, func(resp *binlogdatapb.VStreamTablesResponse) error {
defer vc.vr.stats.PhaseTimings.Record("copy", time.Now())
diff --git a/go/vt/vttablet/tabletmanager/vreplication/vcopier_test.go b/go/vt/vttablet/tabletmanager/vreplication/vcopier_test.go
index a2738686a4c..f6c23e1212b 100644
--- a/go/vt/vttablet/tabletmanager/vreplication/vcopier_test.go
+++ b/go/vt/vttablet/tabletmanager/vreplication/vcopier_test.go
@@ -18,6 +18,7 @@ package vreplication
import (
"context"
+ "encoding/json"
"fmt"
"os"
"regexp"
@@ -36,6 +37,7 @@ import (
"vitess.io/vitess/go/sqltypes"
"vitess.io/vitess/go/vt/binlog/binlogplayer"
binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
+ "vitess.io/vitess/go/vt/proto/vtctldata"
qh "vitess.io/vitess/go/vt/vttablet/tabletmanager/vreplication/queryhistory"
"vitess.io/vitess/go/vt/vttablet/tabletserver/vstreamer"
)
@@ -258,6 +260,31 @@ func testVcopierTestCases(t *testing.T, test func(*testing.T), cases []vcopierTe
}
}
+func copyTestSourceOverrides() map[string]string {
+ return map[string]string{
+ "vstream-dynamic-packet-size": "false",
+ "vstream-packet-size": "1",
+ }
+}
+
+func createVReplicationStateWithSourceOverrides(t *testing.T, workflow string, bls *binlogdatapb.BinlogSource, state binlogdatapb.VReplicationWorkflowState, dbName string, overrides map[string]string) string {
+ t.Helper()
+
+ query := binlogplayer.CreateVReplicationState(workflow, bls, "", state, dbName, 0, 0)
+ if len(overrides) == 0 {
+ return query
+ }
+
+ options, err := json.Marshal(vtctldata.WorkflowOptions{Config: overrides})
+ require.NoError(t, err)
+
+ emptyOptions := sqltypes.EncodeStringSQL("{}")
+ idx := strings.LastIndex(query, emptyOptions)
+ require.NotEqual(t, -1, idx)
+
+ return query[:idx] + sqltypes.EncodeStringSQL(string(options)) + query[idx+len(emptyOptions):]
+}
+
func TestPlayerCopyCharPK(t *testing.T) {
testVcopierTestCases(t, testPlayerCopyCharPK, commonVcopierTestCases())
}
@@ -328,7 +355,7 @@ func testPlayerCopyCharPK(t *testing.T) {
OnDdl: binlogdatapb.OnDDLAction_IGNORE,
}
- query := binlogplayer.CreateVReplicationState("test", bls, "", binlogdatapb.VReplicationWorkflowState_Init, playerEngine.dbName, 0, 0)
+ query := createVReplicationStateWithSourceOverrides(t, "test", bls, binlogdatapb.VReplicationWorkflowState_Init, playerEngine.dbName, copyTestSourceOverrides())
qr, err := playerEngine.Exec(query)
require.NoError(t, err)
defer func() {
@@ -431,7 +458,7 @@ func testPlayerCopyVarcharPKCaseInsensitive(t *testing.T) {
OnDdl: binlogdatapb.OnDDLAction_IGNORE,
}
- query := binlogplayer.CreateVReplicationState("test", bls, "", binlogdatapb.VReplicationWorkflowState_Init, playerEngine.dbName, 0, 0)
+ query := createVReplicationStateWithSourceOverrides(t, "test", bls, binlogdatapb.VReplicationWorkflowState_Init, playerEngine.dbName, copyTestSourceOverrides())
qr, err := playerEngine.Exec(query)
require.NoError(t, err)
defer func() {
@@ -551,7 +578,7 @@ func testPlayerCopyVarcharCompositePKCaseSensitiveCollation(t *testing.T) {
OnDdl: binlogdatapb.OnDDLAction_IGNORE,
}
- query := binlogplayer.CreateVReplicationState("test", bls, "", binlogdatapb.VReplicationWorkflowState_Init, playerEngine.dbName, 0, 0)
+ query := createVReplicationStateWithSourceOverrides(t, "test", bls, binlogdatapb.VReplicationWorkflowState_Init, playerEngine.dbName, copyTestSourceOverrides())
qr, err := playerEngine.Exec(query)
require.NoError(t, err)
defer func() {
@@ -912,7 +939,7 @@ func testPlayerCopyBigTable(t *testing.T) {
OnDdl: binlogdatapb.OnDDLAction_IGNORE,
}
- query := binlogplayer.CreateVReplicationState("test", bls, "", binlogdatapb.VReplicationWorkflowState_Init, playerEngine.dbName, 0, 0)
+ query := createVReplicationStateWithSourceOverrides(t, "test", bls, binlogdatapb.VReplicationWorkflowState_Init, playerEngine.dbName, copyTestSourceOverrides())
qr, err := playerEngine.Exec(query)
require.NoError(t, err)
defer func() {
@@ -1046,7 +1073,7 @@ func testPlayerCopyWildcardRule(t *testing.T) {
Filter: filter,
OnDdl: binlogdatapb.OnDDLAction_IGNORE,
}
- query := binlogplayer.CreateVReplicationState("test", bls, "", binlogdatapb.VReplicationWorkflowState_Init, playerEngine.dbName, 0, 0)
+ query := createVReplicationStateWithSourceOverrides(t, "test", bls, binlogdatapb.VReplicationWorkflowState_Init, playerEngine.dbName, copyTestSourceOverrides())
qr, err := playerEngine.Exec(query)
require.NoError(t, err)
defer func() {
diff --git a/go/vt/vttablet/tabletmanager/vreplication/vdbclient.go b/go/vt/vttablet/tabletmanager/vreplication/vdbclient.go
index bb05fd5897d..92cfd9b882b 100644
--- a/go/vt/vttablet/tabletmanager/vreplication/vdbclient.go
+++ b/go/vt/vttablet/tabletmanager/vreplication/vdbclient.go
@@ -37,14 +37,17 @@ const beginStmtLen = int64(len("begin;"))
// It allows us to retry a failed transactions on lock errors.
type vdbClient struct {
binlogplayer.DBClient
- stats *binlogplayer.Stats
- InTransaction bool
- startTime time.Time
- queries []string
- queriesPos int64
- batchSize int64
- maxBatchSize int64
- relayLogMaxItems int
+ stats *binlogplayer.Stats
+ vreplicationID int32
+ InTransaction bool
+ foreignKeyChecksEnabled bool
+ foreignKeyChecksStateInitialized bool
+ startTime time.Time
+ queries []string
+ queriesPos int64
+ batchSize int64
+ maxBatchSize int64
+ relayLogMaxItems int
}
func newVDBClient(dbclient binlogplayer.DBClient, stats *binlogplayer.Stats, relayLogMaxItems int) *vdbClient {
@@ -55,6 +58,15 @@ func newVDBClient(dbclient binlogplayer.DBClient, stats *binlogplayer.Stats, rel
}
}
+// newVDBClientWithID creates a vdbClient with a pre-set vreplicationID.
+// Used by parallel apply workers so each worker's connection is associated
+// with the correct vreplication stream for relay log batching.
+func newVDBClientWithID(dbclient binlogplayer.DBClient, stats *binlogplayer.Stats, relayLogMaxItems int, vreplicationID int32) *vdbClient {
+ client := newVDBClient(dbclient, stats, relayLogMaxItems)
+ client.vreplicationID = vreplicationID
+ return client
+}
+
func (vc *vdbClient) Begin() error {
if vc.InTransaction {
return nil
@@ -78,6 +90,31 @@ func (vc *vdbClient) Begin() error {
return nil
}
+// BeginImmediate starts a real transaction on the server even when batch mode
+// is enabled. This is needed for commit paths that must execute a couple of
+// statements immediately on one connection and still commit them atomically.
+func (vc *vdbClient) BeginImmediate() error {
+ if vc.InTransaction {
+ return nil
+ }
+ if err := vc.DBClient.Begin(); err != nil {
+ return err
+ }
+ // The "begin" entry is for Retry's replay loop, which calls vc.Begin()
+ // when it sees "begin" in the buffer. BEGIN has already gone down the
+ // wire above, so advance queriesPos past it: any later
+ // ExecuteTrxQueryBatch / CommitTrxQueryBatch must not include this
+ // "begin" in its multi-statement, because a nested BEGIN would
+ // implicit-commit the current transaction and break atomicity with
+ // the immediate writes the caller is about to do.
+ vc.queries = []string{"begin"}
+ vc.queriesPos = 1
+ vc.batchSize = 0
+ vc.InTransaction = true
+ vc.startTime = time.Now()
+ return nil
+}
+
func (vc *vdbClient) Commit() error {
if err := vc.DBClient.Commit(); err != nil {
return err
@@ -96,7 +133,7 @@ func (vc *vdbClient) Commit() error {
func (vc *vdbClient) CommitTrxQueryBatch() error {
vc.queries = append(vc.queries, "commit")
queries := strings.Join(vc.queries[vc.queriesPos:], ";")
- for _, err := vc.ExecuteFetchMulti(queries, -1); err != nil; {
+ if _, err := vc.ExecuteFetchMulti(queries, -1); err != nil {
return err
}
vc.InTransaction = false
@@ -128,7 +165,8 @@ func (vc *vdbClient) ExecuteFetch(query string, maxrows int) (*sqltypes.Result,
} else {
vc.queries = append(vc.queries, query)
}
- return vc.DBClient.ExecuteFetch(query, maxrows)
+ qr, err := vc.DBClient.ExecuteFetch(query, maxrows)
+ return qr, err
}
// AddQueryToTrxBatch adds the query to the current transaction's query
@@ -157,7 +195,8 @@ func (vc *vdbClient) AddQueryToTrxBatch(query string) error {
func (vc *vdbClient) ExecuteTrxQueryBatch() ([]*sqltypes.Result, error) {
defer vc.stats.Timings.Record(binlogplayer.BlplMultiQuery, time.Now())
- qrs, err := vc.ExecuteFetchMulti(strings.Join(vc.queries[vc.queriesPos:], ";"), -1)
+ queries := strings.Join(vc.queries[vc.queriesPos:], ";")
+ qrs, err := vc.ExecuteFetchMulti(queries, -1)
if err != nil {
return nil, err
}
@@ -168,6 +207,19 @@ func (vc *vdbClient) ExecuteTrxQueryBatch() ([]*sqltypes.Result, error) {
return qrs, nil
}
+// markTrxBatchedQueriesFlushed advances the batch position past every
+// query currently buffered. ExecuteFetch appends each query it runs to
+// the trx batch buffer (so Retry can replay them), but in batch-commit
+// mode that buffer is also what CommitTrxQueryBatch sends as a single
+// multi-statement, which double-executes any query that was already
+// run on the wire via ExecuteFetch. Callers that have already executed
+// queries through ExecuteFetch mid-batch use this to keep them out of
+// the upcoming CommitTrxQueryBatch replay.
+func (vc *vdbClient) markTrxBatchedQueriesFlushed() {
+ vc.queriesPos = int64(len(vc.queries))
+ vc.batchSize = 0
+}
+
// Execute is ExecuteFetch without the maxrows.
func (vc *vdbClient) Execute(query string) (*sqltypes.Result, error) {
// Number of rows should never exceed relayLogMaxItems.
diff --git a/go/vt/vttablet/tabletmanager/vreplication/vplayer.go b/go/vt/vttablet/tabletmanager/vreplication/vplayer.go
index f1ecad333b5..76b8459e93c 100644
--- a/go/vt/vttablet/tabletmanager/vreplication/vplayer.go
+++ b/go/vt/vttablet/tabletmanager/vreplication/vplayer.go
@@ -24,6 +24,8 @@ import (
"math"
"strconv"
"strings"
+ "sync"
+ "sync/atomic"
"time"
"vitess.io/vitess/go/mysql/replication"
@@ -35,6 +37,7 @@ import (
"vitess.io/vitess/go/vt/vttablet/tabletserver/throttle/throttlerapp"
binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
+ vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"
)
const failedToRecordHeartbeatMsg = "failed to record heartbeat"
@@ -57,13 +60,16 @@ type vplayer struct {
saveStop bool
copyState map[string]*sqltypes.Result
- replicatorPlan *ReplicatorPlan
- tablePlans map[string]*TablePlan
+ replicatorPlan *ReplicatorPlan
+ tablePlansMu *sync.RWMutex
+ tablePlans map[string]*TablePlan
+ tablePlansVersion *atomic.Int64
// These are set when creating the VPlayer based on whether the VPlayer
// is in batch (stmt and trx) execution mode or not.
- query func(ctx context.Context, sql string) (*sqltypes.Result, error)
- commit func() error
+ query func(ctx context.Context, sql string) (*sqltypes.Result, error)
+ commit func() error
+ dbClient *vdbClient
// If the VPlayer is in batch mode, we accumulate each transaction's statements
// that are then sent as a single multi-statement protocol request to the database.
batchMode bool
@@ -74,12 +80,16 @@ type vplayer struct {
// If nothing else happens for idleTimeout since timeLastSaved,
// the position of the unsavedEvent gets saved.
unsavedEvent *binlogdatapb.VEvent
- // timeLastSaved is set every time a GTID is saved.
+ // timeLastSaved tracks when the latest pending position was durably saved.
+ // Older saves behind a later unsavedEvent must not refresh it.
timeLastSaved time.Time
- // lastTimestampNs is the last timestamp seen so far.
- lastTimestampNs int64
- // timeOffsetNs keeps track of the clock difference with respect to source tablet.
- timeOffsetNs int64
+ // lagSnapshot packs the last timestamp seen and the clock offset to the
+ // source tablet into a single atomic struct. Storing them together (vs
+ // two independent atomic.Int64 fields) prevents the parallel applier's
+ // throttled-path lag estimator from seeing a torn pair (new ts with
+ // stale offset, or vice versa) when the commitLoop's updateLag races
+ // with the scheduleLoop's reader.
+ lagSnapshot *atomic.Pointer[lagSnapshot]
// numAccumulatedHeartbeats keeps track of how many heartbeats have been received since we updated the time_updated column of _vt.vreplication
numAccumulatedHeartbeats int
@@ -90,15 +100,64 @@ type vplayer struct {
throttlerAppName string
- // See updateFKCheck for more details on how the two fields below are used.
+ serialMu *sync.Mutex
+ parallelOrder *atomic.Int64
- // foreignKeyChecksEnabled is the current state of the foreign key checks for the current session.
- // It reflects what we have set the @@session.foreign_key_checks session variable to.
- foreignKeyChecksEnabled bool
+ // fkRefs maps child table name → FK constraints for that table.
+ // Used by the parallel applier to generate writeset keys that
+ // create conflicts between child and parent table transactions.
+ fkRefs map[string][]fkConstraintRef
+ // parentFKRefs is the reverse map: parent table name → FK constraints
+ // that reference it. Used to generate parent-side writeset keys that
+ // match child FK keys, ensuring correct conflict detection even when
+ // FKs reference non-PK unique keys.
+ parentFKRefs map[string][]parentFKRef
+ // postDDLDroppedTables records dropped table names from executed DDLs so the
+ // parallel scheduler can clear post-DDL barriers without mutating tablePlans.
+ postDDLDroppedTables map[string]struct{}
+ // postDDLStalePlans records the still-stale table plans left behind by the
+ // most recently executed EXEC* DDLs. scheduleLoop snapshots this under
+ // serialMu so commitLoop can publish real runtime DDL effects without
+ // racing the scheduler.
+ postDDLStalePlans map[string]postDDLStalePlan
+ // postDDLConservative keeps unknown DDL barriers fail-closed until every
+ // currently tracked plan refreshes.
+ postDDLConservative bool
+ // pendingFieldRefreshTables tracks tables whose FIELD refresh was scheduled
+ // but has not committed yet, so later row transactions do not hash against a
+ // still-cold table-plan cache.
+ pendingFieldRefreshTables map[string]int
- // foreignKeyChecksStateInitialized is set to true once we have initialized the foreignKeyChecksEnabled.
- // The initialization is done on the first row event that this vplayer sees.
- foreignKeyChecksStateInitialized bool
+ // idStr is the decimal string form of vr.id, cached to avoid repeated
+ // strconv.Itoa conversions on every lag gauge update.
+ idStr string
+}
+
+// lagSnapshot pairs the most-recent source-side timestamp seen by the
+// applier with the corresponding clock offset to the source. It is stored
+// behind an atomic.Pointer so readers always see a consistent (ts, offset)
+// pair instead of a torn mix from two concurrent writers.
+type lagSnapshot struct {
+ timestampNs int64
+ offsetNs int64
+}
+
+// loadLagSnapshot returns the latest snapshot, or a zero-value snapshot if
+// nothing has been stored yet. Callers can compare timestampNs against zero
+// to detect "no data yet".
+func (vp *vplayer) loadLagSnapshot() lagSnapshot {
+ snap := vp.lagSnapshot.Load()
+ if snap == nil {
+ return lagSnapshot{}
+ }
+ return *snap
+}
+
+// storeLagSnapshot atomically replaces the lag snapshot with a new (ts, offset)
+// pair. A reader's loadLagSnapshot will either see the entire previous
+// snapshot or the entire new one — never a mix.
+func (vp *vplayer) storeLagSnapshot(timestampNs, offsetNs int64) {
+ vp.lagSnapshot.Store(&lagSnapshot{timestampNs: timestampNs, offsetNs: offsetNs})
}
// NoForeignKeyCheckFlagBitmask is the bitmask for the 2nd bit (least significant) of the flags in a binlog row event.
@@ -151,22 +210,41 @@ func newVPlayer(vr *vreplicator, settings binlogplayer.VRSettings, copyState map
}
return &vplayer{
- vr: vr,
- startPos: settings.StartPos,
- pos: settings.StartPos,
- stopPos: settings.StopPos,
- saveStop: saveStop,
- copyState: copyState,
- timeLastSaved: time.Now(),
- tablePlans: make(map[string]*TablePlan),
- phase: phase,
- throttlerAppName: throttlerapp.VPlayerName.ConcatenateString(vr.throttlerAppName()),
- query: queryFunc,
- commit: commitFunc,
- batchMode: batchMode,
+ vr: vr,
+ startPos: settings.StartPos,
+ pos: settings.StartPos,
+ stopPos: settings.StopPos,
+ saveStop: saveStop,
+ copyState: copyState,
+ timeLastSaved: time.Now(),
+ lagSnapshot: &atomic.Pointer[lagSnapshot]{},
+ tablePlansMu: &sync.RWMutex{},
+ tablePlans: make(map[string]*TablePlan),
+ tablePlansVersion: &atomic.Int64{},
+ serialMu: &sync.Mutex{},
+ parallelOrder: &atomic.Int64{},
+ phase: phase,
+ throttlerAppName: throttlerapp.VPlayerName.ConcatenateString(vr.throttlerAppName()),
+ pendingFieldRefreshTables: make(map[string]int),
+ query: queryFunc,
+ commit: commitFunc,
+ batchMode: batchMode,
+ dbClient: vr.dbClient,
+ idStr: strconv.Itoa(int(vr.id)),
}
}
+// activeDBClient returns the vplayer's current DB connection. In the parallel
+// applier, workers swap vp.dbClient to their own connection before applying
+// events, so this returns whichever connection is currently active. Falls back
+// to vr.dbClient (the main connection) when vp.dbClient is nil.
+func (vp *vplayer) activeDBClient() *vdbClient {
+ if vp.dbClient != nil {
+ return vp.dbClient
+ }
+ return vp.vr.dbClient
+}
+
// play is the entry point for playing binlogs.
func (vp *vplayer) play(ctx context.Context) error {
if !vp.stopPos.IsZero() && vp.startPos.AtLeast(vp.stopPos) {
@@ -197,7 +275,7 @@ func (vp *vplayer) play(ctx context.Context) error {
}
// updateFKCheck updates the @@session.foreign_key_checks variable based on the binlog row event flags.
-// The function only does it if it has changed to avoid redundant updates, using the cached vplayer.foreignKeyChecksEnabled
+// The function only does it if it has changed to avoid redundant updates, using the cached state on the active db session.
// The foreign_key_checks value for a transaction is determined by the 2nd bit (least significant) of the flags:
// - If set (1), foreign key checks are disabled.
// - If unset (0), foreign key checks are enabled.
@@ -208,7 +286,7 @@ func (vp *vplayer) updateFKCheck(ctx context.Context, flags2 uint32) error {
// If this is an atomic copy, we must update the foreign_key_checks state even when the vplayer runs during
// the copy phase, i.e., for catchup and fastforward.
mustUpdate = true
- } else if vp.vr.state == binlogdatapb.VReplicationWorkflowState_Running {
+ } else if vp.vr.getState() == binlogdatapb.VReplicationWorkflowState_Running {
// If the vreplication workflow is in Running state, we must update the foreign_key_checks
// state for all workflow types.
mustUpdate = true
@@ -218,18 +296,19 @@ func (vp *vplayer) updateFKCheck(ctx context.Context, flags2 uint32) error {
}
dbForeignKeyChecksEnabled := flags2&NoForeignKeyCheckFlagBitmask != NoForeignKeyCheckFlagBitmask
- if vp.foreignKeyChecksStateInitialized /* already set earlier */ &&
- dbForeignKeyChecksEnabled == vp.foreignKeyChecksEnabled /* no change in the state, no need to update */ {
+ activeClient := vp.activeDBClient()
+ if activeClient.foreignKeyChecksStateInitialized /* already set earlier */ &&
+ dbForeignKeyChecksEnabled == activeClient.foreignKeyChecksEnabled /* no change in the state, no need to update */ {
return nil
}
log.Info("Setting this session's foreign_key_checks to " + strconv.FormatBool(dbForeignKeyChecksEnabled))
if _, err := vp.query(ctx, "set @@session.foreign_key_checks="+strconv.FormatBool(dbForeignKeyChecksEnabled)); err != nil {
return fmt.Errorf("failed to set session foreign_key_checks: %w", err)
}
- vp.foreignKeyChecksEnabled = dbForeignKeyChecksEnabled
- if !vp.foreignKeyChecksStateInitialized {
+ activeClient.foreignKeyChecksEnabled = dbForeignKeyChecksEnabled
+ if !activeClient.foreignKeyChecksStateInitialized {
log.Info("First foreign_key_checks update to: " + strconv.FormatBool(dbForeignKeyChecksEnabled))
- vp.foreignKeyChecksStateInitialized = true
+ activeClient.foreignKeyChecksStateInitialized = true
}
return nil
}
@@ -255,16 +334,21 @@ func (vp *vplayer) fetchAndApply(ctx context.Context) (err error) {
streamErr := make(chan error, 1)
go func() {
vstreamOptions := &binlogdatapb.VStreamOptions{
- ConfigOverrides: vp.vr.workflowConfig.Overrides,
+ ConfigOverrides: vp.vr.workflowConfig.SourceOverrides(),
}
- streamErr <- vp.vr.sourceVStreamer.VStream(ctx, replication.EncodePosition(vp.startPos), nil,
+ err := vp.vr.sourceVStreamer.VStream(ctx, replication.EncodePosition(vp.startPos), nil,
vp.replicatorPlan.VStreamFilter, func(events []*binlogdatapb.VEvent) error {
return relay.Send(events)
}, vstreamOptions)
+ streamErr <- err
}()
applyErr := make(chan error, 1)
go func() {
+ if vp.vr.workflowConfig.ParallelReplicationWorkers > 1 && len(vp.copyState) == 0 {
+ applyErr <- vp.applyEventsParallel(ctx, relay)
+ return
+ }
applyErr <- vp.applyEvents(ctx, relay)
}()
@@ -296,6 +380,13 @@ func (vp *vplayer) fetchAndApply(ctx context.Context) (err error) {
return nil
default:
}
+ // If the vstream returned a CANCELED error and ctx has itself been
+ // canceled by the time we check it here, the stream presumably ended
+ // because of our own cancellation. Treat this the same as ctx.Done() —
+ // return nil to avoid a spurious retry.
+ if vterrors.Code(err) == vtrpcpb.Code_CANCELED && ctx.Err() != nil {
+ return nil
+ }
// If the stream ends normally we have to return an error indicating
// that the controller has to retry a different vttablet.
if err == nil || err == io.EOF {
@@ -325,7 +416,9 @@ func (vp *vplayer) applyRowEvent(ctx context.Context, rowEvent *binlogdatapb.Row
if err := vp.updateFKCheck(ctx, rowEvent.Flags); err != nil {
return err
}
+ vp.tablePlansMu.RLock()
tplan := vp.tablePlans[rowEvent.TableName]
+ vp.tablePlansMu.RUnlock()
if tplan == nil {
return fmt.Errorf("unexpected event on table %s", rowEvent.TableName)
}
@@ -346,14 +439,14 @@ func (vp *vplayer) applyRowEvent(ctx context.Context, rowEvent *binlogdatapb.Row
// then we can perform a simple bulk DELETE using an IN clause.
if (rowEvent.RowChanges[0].Before != nil && rowEvent.RowChanges[0].After == nil) &&
tplan.MultiDelete != nil {
- _, err := tplan.applyBulkDeleteChanges(rowEvent.RowChanges, applyFunc, vp.vr.dbClient.maxBatchSize)
+ _, err := tplan.applyBulkDeleteChanges(rowEvent.RowChanges, applyFunc, vp.activeDBClient().maxBatchSize)
return err
}
// If we're done with the copy phase then we will be replicating all INSERTS
// regardless of the PK value and can use a single INSERT statment with
// multiple VALUES clauses.
if len(vp.copyState) == 0 && (rowEvent.RowChanges[0].Before == nil && rowEvent.RowChanges[0].After != nil) {
- _, err := tplan.applyBulkInsertChanges(rowEvent.RowChanges, applyFunc, vp.vr.dbClient.maxBatchSize)
+ _, err := tplan.applyBulkInsertChanges(rowEvent.RowChanges, applyFunc, vp.activeDBClient().maxBatchSize)
return err
}
}
@@ -368,22 +461,79 @@ func (vp *vplayer) applyRowEvent(ctx context.Context, rowEvent *binlogdatapb.Row
}
// updatePos should get called at a minimum of vreplicationMinimumHeartbeatUpdateInterval.
-func (vp *vplayer) updatePos(ctx context.Context, ts int64) (posReached bool, err error) {
- update := binlogplayer.GenerateUpdatePos(vp.vr.id, vp.pos, time.Now().Unix(), ts, vp.vr.stats.CopyRowCount.Get(), vp.vr.workflowConfig.StoreCompressedGTID)
- if _, err := vp.query(ctx, update); err != nil {
+func (vp *vplayer) generateUpdatePosQuery(pos replication.Position, ts int64) string {
+ return binlogplayer.GenerateUpdatePos(vp.vr.id, pos, time.Now().Unix(), ts, vp.vr.stats.CopyRowCount.Get(), vp.vr.workflowConfig.StoreCompressedGTID)
+}
+
+// updatePosWithoutStop writes the position update for pos through the supplied
+// query function without applying the stop-position state transition, and
+// reports whether pos has reached a non-zero stopPos. The parallel commitLoop
+// uses this because the position update, COMMIT, and workflow state update must
+// all run on the worker's connection — activeDBClient() would pick the wrong one here.
+func (vp *vplayer) updatePosWithoutStop(ctx context.Context, pos replication.Position, ts int64, query func(context.Context, string) (*sqltypes.Result, error)) (posReached bool, err error) {
+ if _, err := query(ctx, vp.generateUpdatePosQuery(pos, ts)); err != nil {
return false, fmt.Errorf("error %v updating position", err)
}
+ return !vp.stopPos.IsZero() && pos.AtLeast(vp.stopPos), nil
+}
+
+// recordPositionSave updates the in-memory bookkeeping that follows a successful
+// position write: reset the accumulated-heartbeat count, optionally clear the
+// unsaved event, refresh the idle-flush timer unless a later event is still unsaved,
+// and record pos as the last position in stats. Split out of updatePos so the parallel
+// commitLoop can record the save after committing the worker's transaction instead of during apply.
+func (vp *vplayer) recordPositionSave(pos replication.Position, clearUnsavedEvent bool) {
vp.numAccumulatedHeartbeats = 0
- vp.unsavedEvent = nil
- vp.timeLastSaved = time.Now()
- vp.vr.stats.SetLastPosition(vp.pos)
- posReached = !vp.stopPos.IsZero() && vp.pos.AtLeast(vp.stopPos)
+ refreshIdleTimer := clearUnsavedEvent || vp.unsavedEvent == nil || !vp.pos.AtLeast(pos) || vp.pos.Equal(pos)
+ if clearUnsavedEvent {
+ vp.unsavedEvent = nil
+ }
+ if refreshIdleTimer {
+ vp.timeLastSaved = time.Now()
+ }
+ vp.vr.stats.SetLastPosition(pos)
+}
+
+// setStopPositionState logs that the stop position was reached and, when
+// saveStop is set, marks the workflow as Stopped using the given dbClient's
+// batch mode (if any). Used from the serial applier path where the
+// stop-state write can ride along with the rest of the batched flush.
+func (vp *vplayer) setStopPositionState(dbClient *vdbClient) error {
+ log.Info(fmt.Sprintf("Stopped at position: %v", vp.stopPos))
+ if !vp.saveStop {
+ return nil
+ }
+ return vp.vr.setStateWithDBClient(dbClient, binlogdatapb.VReplicationWorkflowState_Stopped, fmt.Sprintf("Stopped at position %v", vp.stopPos))
+}
+
+// setStopPositionStateImmediate logs that the stop position was reached
+// and, when saveStop is set, marks the workflow as Stopped using a direct
+// (non-batched) write. The parallel commitLoop uses this after the worker has
+// flushed its batch and is about to COMMIT, so the state row update has to stay
+// inside the same transaction rather than deferring to a later batch flush.
+func (vp *vplayer) setStopPositionStateImmediate(dbClient *vdbClient) error {
+ log.Info(fmt.Sprintf("Stopped at position: %v", vp.stopPos))
+ if !vp.saveStop {
+ return nil
+ }
+ return vp.vr.setStateWithDBClientImmediate(dbClient, binlogdatapb.VReplicationWorkflowState_Stopped, fmt.Sprintf("Stopped at position %v", vp.stopPos))
+}
+
+// updatePos persists the current position, records the save, and —
+// if the stop position has been reached — transitions the workflow to
+// Stopped on the active DB client. The serial applier uses this
+// end-to-end; the parallel flow calls the constituent helpers
+// (updatePosWithoutStop, recordPositionSave,
+// setStopPositionStateImmediate) on the worker connection instead.
+func (vp *vplayer) updatePos(ctx context.Context, ts int64) (posReached bool, err error) {
+ posReached, err = vp.updatePosWithoutStop(ctx, vp.pos, ts, vp.query)
+ if err != nil {
+ return false, err
+ }
+ vp.recordPositionSave(vp.pos, true)
if posReached {
- log.Info(fmt.Sprintf("Stopped at position: %v", vp.stopPos))
- if vp.saveStop {
- if err := vp.vr.setState(binlogdatapb.VReplicationWorkflowState_Stopped, fmt.Sprintf("Stopped at position %v", vp.stopPos)); err != nil {
- return false, err
- }
+ if err := vp.setStopPositionState(vp.activeDBClient()); err != nil {
+ return false, err
}
}
return posReached, nil
@@ -464,17 +614,18 @@ func (vp *vplayer) applyEvents(ctx context.Context, relay *relayLog) error {
defer vp.vr.dbClient.Rollback()
estimateLag := func() {
- behind := time.Now().UnixNano() - vp.lastTimestampNs - vp.timeOffsetNs
+ snap := vp.loadLagSnapshot()
+ behind := time.Now().UnixNano() - snap.timestampNs - snap.offsetNs
behindSecs := behind / 1e9
vp.vr.stats.ReplicationLagSeconds.Store(behindSecs)
- vp.vr.stats.VReplicationLagGauges.Set(strconv.Itoa(int(vp.vr.id)), behindSecs)
+ vp.vr.stats.VReplicationLagGauges.Set(vp.idStr, behindSecs)
}
// If we're not running, set ReplicationLagSeconds to be very high.
// TODO(sougou): if we also stored the time of the last event, we
// can estimate this value more accurately.
defer vp.vr.stats.ReplicationLagSeconds.Store(math.MaxInt64)
- defer vp.vr.stats.VReplicationLagGauges.Set(strconv.Itoa(int(vp.vr.id)), math.MaxInt64)
+ defer vp.vr.stats.VReplicationLagGauges.Set(vp.idStr, math.MaxInt64)
var lag int64
for {
if ctx.Err() != nil {
@@ -561,10 +712,11 @@ func (vp *vplayer) applyEvents(ctx context.Context, relay *relayLog) error {
// determine the actual lag, as the vstreamer is fully throttled, and we
// will estimate it after processing the batch.
if event.Type != binlogdatapb.VEventType_HEARTBEAT || !event.Throttled {
- vp.lastTimestampNs = event.Timestamp * 1e9
+ tsNs := event.Timestamp * 1e9
now := time.Now().UnixNano()
- vp.timeOffsetNs = now - event.CurrentTime
- lag = now - vp.lastTimestampNs - vp.timeOffsetNs
+ offset := now - event.CurrentTime
+ vp.storeLagSnapshot(tsNs, offset)
+ lag = now - tsNs - offset
}
}
}
@@ -573,7 +725,7 @@ func (vp *vplayer) applyEvents(ctx context.Context, relay *relayLog) error {
if lag >= 0 {
lagSecs := lag / 1e9
vp.vr.stats.ReplicationLagSeconds.Store(lagSecs)
- vp.vr.stats.VReplicationLagGauges.Set(strconv.Itoa(int(vp.vr.id)), lagSecs)
+ vp.vr.stats.VReplicationLagGauges.Set(vp.idStr, lagSecs)
} else { // We couldn't determine the lag, so we need to estimate it
estimateLag()
}
@@ -647,12 +799,12 @@ func (vp *vplayer) applyEvent(ctx context.Context, event *binlogdatapb.VEvent, m
// No-op: begin is called as needed.
case binlogdatapb.VEventType_COMMIT:
if mustSave {
- if err := vp.vr.dbClient.Begin(); err != nil {
+ if err := vp.activeDBClient().Begin(); err != nil {
return err
}
}
- if !vp.vr.dbClient.InTransaction {
+ if !vp.activeDBClient().InTransaction {
// We're skipping an empty transaction. We may have to save the position on inactivity.
vp.unsavedEvent = event
return nil
@@ -668,14 +820,50 @@ func (vp *vplayer) applyEvent(ctx context.Context, event *binlogdatapb.VEvent, m
return io.EOF
}
case binlogdatapb.VEventType_FIELD:
- if err := vp.vr.dbClient.Begin(); err != nil {
+ if err := vp.activeDBClient().Begin(); err != nil {
return err
}
tplan, err := vp.replicatorPlan.buildExecutionPlan(event.FieldEvent)
if err != nil {
return err
}
- vp.tablePlans[event.FieldEvent.TableName] = tplan
+ // HasExtraUniqueSecondary only matters to the parallel applier's
+ // writeset scheduling, which runs only in the replication phase
+ // (fetchAndApply requires len(copyState) == 0). During copy-phase
+ // catchup/fastforward this vplayer is serial and its table plans
+ // die with it, so the schema lookup would be a wasted mysqld
+ // round-trip and a needless failure mode.
+ if vp.vr.workflowConfig.ParallelReplicationWorkers > 1 && len(vp.copyState) == 0 {
+ vp.tablePlansMu.RLock()
+ cachedPlan := vp.tablePlans[event.FieldEvent.TableName]
+ vp.tablePlansMu.RUnlock()
+ vp.serialMu.Lock()
+ staleEntry, hasStaleEntry := vp.postDDLStalePlans[event.FieldEvent.TableName]
+ cacheInvalidatedByRefreshTarget := !hasStaleEntry && postDDLRefreshTargetMatchesCachedPlan(vp.postDDLStalePlans, event.FieldEvent.TableName, cachedPlan)
+ vp.serialMu.Unlock()
+ cacheInvalidatedByDDL := (hasStaleEntry && staleEntry.stalePlan == cachedPlan) || cacheInvalidatedByRefreshTarget
+ if cachedPlan != nil && cachedPlan.TargetName == tplan.TargetName && !cacheInvalidatedByDDL {
+ tplan.HasExtraUniqueSecondary = cachedPlan.HasExtraUniqueSecondary
+ tplan.UniqueKeyColumns = cachedPlan.UniqueKeyColumns
+ } else {
+ uniqueKeys, mustSerialize, err := vp.vr.writesetUniqueKeys(ctx, tplan.TargetName, tplan)
+ if err != nil {
+ return err
+ }
+ tplan.UniqueKeyColumns = uniqueKeys
+ tplan.HasExtraUniqueSecondary = mustSerialize
+ }
+ }
+ fieldTableName := event.FieldEvent.TableName
+ vp.tablePlansMu.Lock()
+ vp.tablePlans[fieldTableName] = tplan
+ vp.tablePlansVersion.Add(1)
+ vp.tablePlansMu.Unlock()
+ vp.serialMu.Lock()
+ // FIELD means this table name is live again, so later DDL barriers must
+ // treat it as tracked instead of as a previously dropped name.
+ delete(vp.postDDLDroppedTables, canonicalPostDDLTableKey(vp.postDDLDroppedTables, fieldTableName))
+ vp.serialMu.Unlock()
if stats != nil {
stats.Send(fmt.Sprintf("%v", event.FieldEvent))
}
@@ -690,7 +878,7 @@ func (vp *vplayer) applyEvent(ctx context.Context, event *binlogdatapb.VEvent, m
// If the event is for one of the AWS RDS "special" or pt-table-checksum tables, we skip
if !strings.Contains(sql, " mysql.rds_") && !strings.Contains(sql, " percona.checksums") {
// This is a player using statement based replication
- if err := vp.vr.dbClient.Begin(); err != nil {
+ if err := vp.activeDBClient().Begin(); err != nil {
return err
}
if err := vp.applyStmtEvent(ctx, event); err != nil {
@@ -702,7 +890,7 @@ func (vp *vplayer) applyEvent(ctx context.Context, event *binlogdatapb.VEvent, m
}
case binlogdatapb.VEventType_ROW:
// This player is configured for row based replication
- if err := vp.vr.dbClient.Begin(); err != nil {
+ if err := vp.activeDBClient().Begin(); err != nil {
return err
}
if err := vp.applyRowEvent(ctx, event.RowEvent); err != nil {
@@ -715,7 +903,7 @@ func (vp *vplayer) applyEvent(ctx context.Context, event *binlogdatapb.VEvent, m
stats.Send(fmt.Sprintf("%v", event.RowEvent))
}
case binlogdatapb.VEventType_OTHER:
- if vp.vr.dbClient.InTransaction {
+ if vp.activeDBClient().InTransaction {
// Unreachable
log.Error(fmt.Sprintf("internal error: vplayer is in a transaction on event: %v", event))
return fmt.Errorf("internal error: vplayer is in a transaction on event: %v", event)
@@ -729,73 +917,20 @@ func (vp *vplayer) applyEvent(ctx context.Context, event *binlogdatapb.VEvent, m
return io.EOF
}
case binlogdatapb.VEventType_DDL:
- if vp.vr.dbClient.InTransaction {
+ if vp.activeDBClient().InTransaction {
// Unreachable
log.Error(fmt.Sprintf("internal error: vplayer is in a transaction on event: %v", event))
return fmt.Errorf("internal error: vplayer is in a transaction on event: %v", event)
}
- vp.vr.stats.DDLEventActions.Add(vp.vr.source.OnDdl.String(), 1) // Record the DDL handling
- switch vp.vr.source.OnDdl {
- case binlogdatapb.OnDDLAction_IGNORE:
- // We still have to update the position.
- posReached, err := vp.updatePos(ctx, event.Timestamp)
- if err != nil {
- return err
- }
- if posReached {
- return io.EOF
- }
- case binlogdatapb.OnDDLAction_STOP:
- if err := vp.vr.dbClient.Begin(); err != nil {
- return err
- }
- if _, err := vp.updatePos(ctx, event.Timestamp); err != nil {
- return err
- }
- if err := vp.vr.setState(binlogdatapb.VReplicationWorkflowState_Stopped, "Stopped at DDL "+event.Statement); err != nil {
- return err
- }
- if err := vp.commit(); err != nil {
- return err
- }
- return io.EOF
- case binlogdatapb.OnDDLAction_EXEC:
- // It's impossible to save the position transactionally with the statement.
- // So, we apply the DDL first, and then save the position.
- // Manual intervention may be needed if there is a partial
- // failure here.
- if _, err := vp.query(ctx, event.Statement); err != nil {
- return err
- }
- if stats != nil {
- stats.Send(event.Statement)
- }
- posReached, err := vp.updatePos(ctx, event.Timestamp)
- if err != nil {
- return err
- }
- if posReached {
- return io.EOF
- }
- case binlogdatapb.OnDDLAction_EXEC_IGNORE:
- if _, err := vp.query(ctx, event.Statement); err != nil {
- log.Info(fmt.Sprintf("Ignoring error: %v for DDL: %s", err, event.Statement))
- }
- if stats != nil {
- stats.Send(event.Statement)
- }
- posReached, err := vp.updatePos(ctx, event.Timestamp)
- if err != nil {
- return err
- }
- if posReached {
- return io.EOF
- }
- }
+ _, err := vp.applyDDLEvent(ctx, event, stats)
+ return err
case binlogdatapb.VEventType_ROWS_QUERY:
// The original SQL query is informational only; VReplication applies row changes directly.
+ case binlogdatapb.VEventType_VERSION:
+ // VERSION only tells downstream consumers that schema_version changed.
+ // vplayer does not apply any data for it.
case binlogdatapb.VEventType_JOURNAL:
- if vp.vr.dbClient.InTransaction {
+ if vp.activeDBClient().InTransaction {
// Unreachable
log.Error(fmt.Sprintf("internal error: vplayer is in a transaction on event: %v", event))
return fmt.Errorf("internal error: vplayer is in a transaction on event: %v", event)
@@ -832,6 +967,19 @@ func (vp *vplayer) applyEvent(ctx context.Context, event *binlogdatapb.VEvent, m
}
// All were found. We must register journal.
}
+ // We must NOT persist the position past the journal event here.
+ // registerJournal returns nil as soon as THIS participant has
+ // registered, even when other participants of the journal have not
+ // joined yet, and the engine's journaler state is in-memory only.
+ // The position is only safe to advance once transitionJournal has
+ // durably rewritten the participating streams. If we saved the
+ // position now and the tablet restarted before all participants
+ // joined, this stream would resume past the journal, never
+ // re-register, and the workflow would hang forever waiting for a
+ // transition that can no longer happen. Keeping the saved position
+ // before the journal means a restart re-delivers the journal event,
+ // and registerJournal is idempotent (per-key lookup,
+ // existing-participant guard), so re-registering is safe.
log.Info(fmt.Sprintf("Binlog event registering journal event %+v", event.Journal))
if err := vp.vr.vre.registerJournal(event.Journal, vp.vr.id); err != nil {
if err := vp.vr.setState(binlogdatapb.VReplicationWorkflowState_Stopped, err.Error()); err != nil {
@@ -849,13 +997,83 @@ func (vp *vplayer) applyEvent(ctx context.Context, event *binlogdatapb.VEvent, m
return err
}
}
- if !vp.vr.dbClient.InTransaction {
+ if !vp.activeDBClient().InTransaction {
vp.numAccumulatedHeartbeats++
if err := vp.recordHeartbeat(); err != nil {
return err
}
}
+ default:
+ return vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "unsupported vevent type: %v", event.Type)
}
return nil
}
+
+// applyDDLEvent applies the workflow's OnDdl policy (vp.vr.source.OnDdl) to a
+// DDL event and bumps the matching DDLEventActions counter.
+//
+// The returned bool reports whether event.Statement was successfully executed
+// against the target; it can only be true for EXEC and EXEC_IGNORE. The flag
+// stays meaningful alongside a non-nil error: once the statement has run, a
+// later failure to save the position does not undo it, so both EXEC policies
+// report true in that case.
+//
+// The returned error is io.EOF when updatePos reports that the position was
+// reached, and always for STOP once the Stopped state has been committed.
+// An unknown policy yields a FAILED_PRECONDITION error.
+func (vp *vplayer) applyDDLEvent(ctx context.Context, event *binlogdatapb.VEvent, stats *VrLogStats) (bool, error) {
+	policy := vp.vr.source.OnDdl
+	vp.vr.stats.DDLEventActions.Add(policy.String(), 1) // Record the DDL handling.
+
+	// sendStats forwards the statement to the optional VrLogStats sink.
+	sendStats := func() {
+		if stats != nil {
+			stats.Send(event.Statement)
+		}
+	}
+	// savePos saves the position for this event and maps "position reached"
+	// to io.EOF. executed is passed through unchanged on every path, so a
+	// failed position save never hides a DDL that already ran.
+	savePos := func(executed bool) (bool, error) {
+		posReached, err := vp.updatePos(ctx, event.Timestamp)
+		if err != nil {
+			return executed, err
+		}
+		if posReached {
+			return executed, io.EOF
+		}
+		return executed, nil
+	}
+
+	switch policy {
+	case binlogdatapb.OnDDLAction_IGNORE:
+		// The statement is skipped, but we still have to update the position.
+		return savePos(false)
+	case binlogdatapb.OnDDLAction_STOP:
+		// Position update and Stopped state are written in one transaction.
+		if err := vp.activeDBClient().Begin(); err != nil {
+			return false, err
+		}
+		if _, err := vp.updatePos(ctx, event.Timestamp); err != nil {
+			return false, err
+		}
+		if err := vp.vr.setState(binlogdatapb.VReplicationWorkflowState_Stopped, "Stopped at DDL "+event.Statement); err != nil {
+			return false, err
+		}
+		if err := vp.commit(); err != nil {
+			return false, err
+		}
+		return false, io.EOF
+	case binlogdatapb.OnDDLAction_EXEC:
+		// It's impossible to save the position transactionally with the
+		// statement, so the DDL is applied first and the position is saved
+		// afterwards. A failure in between leaves the schema changed, which
+		// is why savePos(true) still reports true when it returns an error.
+		if _, err := vp.query(ctx, event.Statement); err != nil {
+			return false, err
+		}
+		sendStats()
+		return savePos(true)
+	case binlogdatapb.OnDDLAction_EXEC_IGNORE:
+		// Same as EXEC, except that a failing statement is logged and
+		// skipped instead of aborting the event.
+		executed := true
+		if _, err := vp.query(ctx, event.Statement); err != nil {
+			executed = false
+			log.Info(fmt.Sprintf("Ignoring error: %v for DDL: %s", err, event.Statement))
+		}
+		sendStats()
+		return savePos(executed)
+	default:
+		return false, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "unsupported ddl action: %v", policy)
+	}
+}
diff --git a/go/vt/vttablet/tabletmanager/vreplication/vplayer_flaky_test.go b/go/vt/vttablet/tabletmanager/vreplication/vplayer_flaky_test.go
index 02ae1c5b9af..869bf7f857e 100644
--- a/go/vt/vttablet/tabletmanager/vreplication/vplayer_flaky_test.go
+++ b/go/vt/vttablet/tabletmanager/vreplication/vplayer_flaky_test.go
@@ -39,10 +39,12 @@ import (
"vitess.io/vitess/go/vt/binlog/binlogplayer"
"vitess.io/vitess/go/vt/log"
"vitess.io/vitess/go/vt/logutil"
+ "vitess.io/vitess/go/vt/vterrors"
vttablet "vitess.io/vitess/go/vt/vttablet/common"
"vitess.io/vitess/go/vt/vttablet/tabletserver/vstreamer/testenv"
binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
+ vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"
qh "vitess.io/vitess/go/vt/vttablet/tabletmanager/vreplication/queryhistory"
)
@@ -218,7 +220,7 @@ func TestHeartbeatFrequencyFlag(t *testing.T) {
stats := binlogplayer.NewStats()
defer stats.Stop()
- vp := &vplayer{vr: &vreplicator{
+ vp := &vplayer{tablePlansMu: &sync.RWMutex{}, serialMu: &sync.Mutex{}, vr: &vreplicator{
dbClient: newVDBClient(realDBClientFactory(), stats, vttablet.DefaultVReplicationConfig.RelayLogMaxItems),
stats: stats,
workflowConfig: vttablet.DefaultVReplicationConfig,
@@ -2941,6 +2943,50 @@ func TestTimestamp(t *testing.T) {
expectData(t, "t1", [][]string{{"1", want, want}})
}
+// TestVPlayerDoesNotTreatRemoteCanceledStreamAsLocalShutdown registers a
+// CANCELED error for the source tablet in vstreamErrorsByTablet and asserts
+// that vplayer.fetchAndApply returns an error whose vterrors code is still
+// CANCELED.
+func TestVPlayerDoesNotTreatRemoteCanceledStreamAsLocalShutdown(t *testing.T) {
+ tablet := addTablet(100)
+ defer deleteTablet(tablet)
+
+ filter := &binlogdatapb.Filter{Rules: []*binlogdatapb.Rule{{Match: "/.*"}}}
+ bls := &binlogdatapb.BinlogSource{Keyspace: env.KeyspaceName, Shard: env.ShardName, Filter: filter}
+ stats := binlogplayer.NewStats()
+ defer stats.Stop()
+ dbClient := playerEngine.dbClientFactoryFiltered()
+ err := dbClient.Connect()
+ require.NoError(t, err)
+ defer dbClient.Close()
+
+ // Seed (or reset) the _vt.vreplication row with id 1 in the Stopped state;
+ // 1 is the id the replicator below is created with.
+ _, err = dbClient.ExecuteFetch(fmt.Sprintf("insert into _vt.vreplication (id, workflow, source, pos, max_tps, max_replication_lag, time_updated, transaction_timestamp, state, db_name, options) values (1, 'test', '', '', 99999, 99999, 0, 0, 'Stopped', '%s', '{}') on duplicate key update workflow='test', source='', pos='', max_tps=99999, max_replication_lag=99999, time_updated=0, transaction_timestamp=0, state='Stopped', db_name='%s'", dbClient.DBName(), dbClient.DBName()), 1)
+ require.NoError(t, err)
+ drainDBQueries()
+ defer func() {
+ _, err := dbClient.ExecuteFetch("delete from _vt.vreplication where id = 1", 1)
+ require.NoError(t, err)
+ drainDBQueries()
+ }()
+
+ // Swap in a per-tablet error map for the duration of the test and restore
+ // the previous one afterwards.
+ // NOTE(review): presumably the fake tablet connector returns this error
+ // from its VStream call — confirm against the test harness.
+ oldErrors := vstreamErrorsByTablet
+ defer func() { vstreamErrorsByTablet = oldErrors }()
+ vstreamErrorsByTablet = map[uint32]error{
+ tablet.Alias.Uid: vterrors.New(vtrpcpb.Code_CANCELED, "remote canceled"),
+ }
+
+ vsClient := newTabletConnector(tablet)
+ require.NoError(t, vsClient.Open(t.Context()))
+ defer func() { _ = vsClient.Close(t.Context()) }()
+
+ vr := newVReplicator(1, bls, vsClient, stats, dbClient, env.Mysqld, playerEngine, vttablet.DefaultVReplicationConfig)
+ settings, _, err := vr.loadSettings(t.Context(), newVDBClient(dbClient, stats, vttablet.DefaultVReplicationConfig.RelayLogMaxItems))
+ require.NoError(t, err)
+
+ vp := newVPlayer(vr, settings, nil, replication.Position{}, "replicate")
+ vp.replicatorPlan = &ReplicatorPlan{VStreamFilter: filter}
+
+ // The test context is not canceled here, so a CANCELED code can only come
+ // from the error registered above.
+ err = vp.fetchAndApply(t.Context())
+ require.Error(t, err)
+ require.Equal(t, vtrpcpb.Code_CANCELED, vterrors.Code(err))
+}
+
// TestPlayerJSONDocs validates more complex and 'large' json docs. It only validates that the data on target matches that on source.
// TestPlayerTypes, above, also verifies the sql queries applied on the target.
func TestPlayerJSONDocs(t *testing.T) {
diff --git a/go/vt/vttablet/tabletmanager/vreplication/vreplicator.go b/go/vt/vttablet/tabletmanager/vreplication/vreplicator.go
index 47686336fb2..b34a14e8298 100644
--- a/go/vt/vttablet/tabletmanager/vreplication/vreplicator.go
+++ b/go/vt/vttablet/tabletmanager/vreplication/vreplicator.go
@@ -25,6 +25,7 @@ import (
"sort"
"strconv"
"strings"
+ "sync/atomic"
"time"
"vitess.io/vitess/go/mysql/capabilities"
@@ -109,8 +110,22 @@ type vreplicator struct {
// source
source *binlogdatapb.BinlogSource
sourceVStreamer VStreamerClient
- state binlogdatapb.VReplicationWorkflowState
- stats *binlogplayer.Stats
+ // state is the workflow state as last written by setState*. It is read
+ // by worker goroutines (updateFKCheck) and the controller while the
+ // parallel commitLoop may be writing it, so access goes through
+ // getState/storeState.
+ state atomic.Int32 // binlogdatapb.VReplicationWorkflowState
+ // inCopyPhase reports whether the workflow still has tables to copy
+ // (_vt.copy_state is non-empty). It is refreshed from the durable row
+ // on every loadSettings call, so — unlike state, which is only updated
+ // by setState calls — it is truthful immediately after a tablet
+ // restart. That matters for AtomicCopy: its copy path (copyAll) never
+ // calls setState(Copying) — only initTablesForCopy does, on first
+ // start — so after a restart the in-memory state stays at zero for the
+ // whole remaining copy. The controller's AtomicCopy terminal-error
+ // guard reads this from another goroutine, hence atomic.
+ inCopyPhase atomic.Bool
+ stats *binlogplayer.Stats
// mysqld is used to fetch the local schema.
mysqld mysqlctl.MysqlDaemon
colInfoMap map[string][]*ColumnInfo
@@ -165,7 +180,7 @@ func newVReplicator(id int32, source *binlogdatapb.BinlogSource, sourceVStreamer
source: source,
sourceVStreamer: sourceVStreamer,
stats: stats,
- dbClient: newVDBClient(dbClient, stats, workflowConfig.RelayLogMaxItems),
+ dbClient: newVDBClientWithID(dbClient, stats, workflowConfig.RelayLogMaxItems, id),
mysqld: mysqld,
workflowConfig: workflowConfig,
}
@@ -314,7 +329,7 @@ func (vr *vreplicator) replicate(ctx context.Context) error {
return err
}
} else {
- if vr.state != binlogdatapb.VReplicationWorkflowState_Copying {
+ if vr.getState() != binlogdatapb.VReplicationWorkflowState_Copying {
if err := vr.setState(binlogdatapb.VReplicationWorkflowState_Copying, ""); err != nil {
vr.stats.ErrorCounts.Add([]string{"Copy"}, 1)
return err
@@ -496,6 +511,7 @@ func (vr *vreplicator) loadSettings(ctx context.Context, dbClient *vdbClient) (s
vr.WorkflowType = int32(settings.WorkflowType)
vr.WorkflowSubType = int32(settings.WorkflowSubType)
vr.WorkflowName = settings.WorkflowName
+ vr.inCopyPhase.Store(numTablesToCopy != 0)
}
return settings, numTablesToCopy, err
}
@@ -533,7 +549,7 @@ func (vr *vreplicator) setMessage(message string) (err error) {
if _, err := vr.dbClient.Execute(query); err != nil {
return fmt.Errorf("could not set message: %v: %v", query, err)
}
- insertLog(vr.dbClient, LogMessage, vr.id, vr.state.String(), message)
+ insertLog(vr.dbClient, LogMessage, vr.id, vr.getState().String(), message)
return nil
}
@@ -555,10 +571,42 @@ func (vr *vreplicator) maxQuerySize(dbc *vdbClient) int64 {
}
+// insertLog forwards to the package-level insertLog using the replicator's
+// own connection and id, tagging the entry with the current in-memory state.
func (vr *vreplicator) insertLog(typ, message string) {
- insertLog(vr.dbClient, typ, vr.id, vr.state.String(), message)
+ insertLog(vr.dbClient, typ, vr.id, vr.getState().String(), message)
+}
+
+// isInCopyPhase returns the inCopyPhase flag, which loadSettings sets to
+// whether the number of tables left to copy was non-zero on its last call.
+func (vr *vreplicator) isInCopyPhase() bool {
+	copying := vr.inCopyPhase.Load()
+	return copying
+}
+
+// getState atomically loads the in-memory workflow state and converts it
+// back to its enum type. The value is whatever storeState last recorded.
+func (vr *vreplicator) getState() binlogdatapb.VReplicationWorkflowState {
+	raw := vr.state.Load()
+	return binlogdatapb.VReplicationWorkflowState(raw)
+}
+
+// storeState atomically records the workflow state in memory only; nothing
+// is written to the database here. Use setState* to also persist it.
+func (vr *vreplicator) storeState(state binlogdatapb.VReplicationWorkflowState) {
+ vr.state.Store(int32(state))
}
+// setState is setStateWithDBClient bound to the replicator's own connection
+// (vr.dbClient).
func (vr *vreplicator) setState(state binlogdatapb.VReplicationWorkflowState, message string) error {
+ return vr.setStateWithDBClient(vr.dbClient, state, message)
+}
+
+// setStateWithDBClientImmediate is a pure alias for setStateWithDBClient: it
+// forwards all three arguments unchanged and adds no behavior of its own.
+// The separate name is kept for call sites (e.g. the parallel commitLoop) to
+// document that the stop-state write executes immediately within the
+// connection's open transaction.
+func (vr *vreplicator) setStateWithDBClientImmediate(dbClient *vdbClient, state binlogdatapb.VReplicationWorkflowState, message string) error {
+ return vr.setStateWithDBClient(dbClient, state, message)
+}
+
+// setStateWithDBClient writes the workflow's state/message row to
+// _vt.vreplication using the supplied connection. Mid-batch, it flushes the
+// pending batch first and executes its own writes immediately (still inside
+// the same open MySQL transaction), marking the buffer flushed so nothing
+// double-executes on the later batch commit.
+func (vr *vreplicator) setStateWithDBClient(dbClient *vdbClient, state binlogdatapb.VReplicationWorkflowState, message string) error {
if message != "" {
vr.stats.History.Add(&binlogplayer.StatsHistoryRecord{
Time: time.Now(),
@@ -567,20 +615,33 @@ func (vr *vreplicator) setState(state binlogdatapb.VReplicationWorkflowState, me
}
vr.stats.State.Store(state.String())
query := fmt.Sprintf("update _vt.vreplication set state=%v, message=left(%v, 1000) where id=%v", encodeString(state.String()), encodeString(binlogplayer.MessageTruncate(message)), vr.id)
- // If we're batching a transaction, then include the state update
- // in the current transaction batch.
- if vr.dbClient.InTransaction && vr.dbClient.maxBatchSize > 0 {
- vr.dbClient.AddQueryToTrxBatch(query)
- } else { // Otherwise, send it down the wire
- if _, err := vr.dbClient.ExecuteFetch(query, 1); err != nil {
- return fmt.Errorf("could not set state: %v: %v", query, err)
+ // In batch-commit mode, queries run via ExecuteFetch execute on the wire
+ // AND get appended to the trx batch buffer (for Retry). A later
+ // CommitTrxQueryBatch would replay them in a fresh MySQL transaction,
+ // doubling the state UPDATE and the vreplication_log SELECT/INSERT that
+ // insertLog below issues, and breaking atomicity with the position
+ // write. So mid-batch we always: flush the pending batch first (the
+ // flush stays inside the same open MySQL transaction, preserving
+ // stop-path atomicity with the position update), run the state write
+ // and insertLog immediately, and mark the buffer flushed on EVERY exit
+ // path so the caller's CommitTrxQueryBatch only sends "commit".
+ // (Deferring the state UPDATE into the batch instead is not an option:
+ // insertLog must read getLastLog and cannot be batched, so its
+ // statements would double-execute on replay.)
+ if dbClient.InTransaction && dbClient.maxBatchSize > 0 {
+ if _, err := dbClient.ExecuteTrxQueryBatch(); err != nil {
+ return fmt.Errorf("could not flush pending batched queries before set state: %v: %v", query, err)
}
+ defer dbClient.markTrxBatchedQueriesFlushed()
}
- if state == vr.state {
+ if _, err := dbClient.ExecuteFetch(query, 1); err != nil {
+ return fmt.Errorf("could not set state: %v: %v", query, err)
+ }
+ if state == vr.getState() {
return nil
}
- insertLog(vr.dbClient, LogStateChange, vr.id, state.String(), message)
- vr.state = state
+ insertLog(dbClient, LogStateChange, vr.id, state.String(), message)
+ vr.storeState(state)
return nil
}
@@ -632,7 +693,16 @@ func (vr *vreplicator) getSettingFKRestrict() error {
+// resetFKCheckAfterCopy sets the session's foreign_key_checks on dbClient to
+// vr.originalFKCheckSetting and, only if that statement succeeds, records the
+// resulting value in dbClient's cached FK session state.
func (vr *vreplicator) resetFKCheckAfterCopy(dbClient *vdbClient) error {
_, err := dbClient.Execute(fmt.Sprintf("set @@session.foreign_key_checks=%d", vr.originalFKCheckSetting))
- return err
+ if err != nil {
+ return err
+ }
+ // Keep the connection's cached FK session state coherent: updateFKCheck
+ // skips its SET when the cache says the session already matches, so a
+ // session mutation here must be reflected in the cache or the applier
+ // will silently run with the wrong foreign_key_checks setting.
+ dbClient.foreignKeyChecksEnabled = vr.originalFKCheckSetting != 0
+ dbClient.foreignKeyChecksStateInitialized = true
+ return nil
}
func (vr *vreplicator) resetFKRestrictAfterCopy(dbClient *vdbClient) error {
@@ -738,7 +808,14 @@ func (vr *vreplicator) updateHeartbeatTime(tm int64) error {
+// clearFKCheck sets the session's foreign_key_checks on dbClient to 0 and,
+// only if that statement succeeds, records "disabled" in dbClient's cached FK
+// session state.
func (vr *vreplicator) clearFKCheck(dbClient *vdbClient) error {
_, err := dbClient.Execute("set @@session.foreign_key_checks=0")
- return err
+ if err != nil {
+ return err
+ }
+ // See resetFKCheckAfterCopy: the cached FK session state must follow
+ // every out-of-band session mutation.
+ dbClient.foreignKeyChecksEnabled = false
+ dbClient.foreignKeyChecksStateInitialized = true
+ return nil
}
func (vr *vreplicator) clearFKRestrict(dbClient *vdbClient) error {
@@ -861,6 +938,57 @@ func (vr *vreplicator) stashSecondaryKeys(ctx context.Context, tableName string)
}
+// getTableSecondaryKeys fetches the target-side table spec for tableName via
+// getTargetTableSpec and returns the indexes extractSecondaryKeys selects
+// from it. A spec lookup error is returned unchanged with a nil slice.
func (vr *vreplicator) getTableSecondaryKeys(ctx context.Context, tableName string) ([]*sqlparser.IndexDefinition, error) {
+ tableSpec, err := vr.getTargetTableSpec(ctx, tableName)
+ if err != nil {
+ return nil, err
+ }
+ return extractSecondaryKeys(tableSpec), nil
+}
+
+// extractSecondaryKeys lists the indexes of a parsed CREATE TABLE that are
+// neither the primary key nor FK-backed. An index counts as FK-backed when
+// its lowercased column list equals, in order, the source column list of one
+// of the table's foreign-key constraints; such indexes are left out because
+// dropping them would break the constraint. A nil spec yields nil.
+func extractSecondaryKeys(tableSpec *sqlparser.TableSpec) []*sqlparser.IndexDefinition {
+	if tableSpec == nil {
+		return nil
+	}
+
+	// Index the source column list of every foreign key as "a,b,c".
+	fkBacked := make(map[string]bool)
+	for _, constraint := range tableSpec.Constraints {
+		fk, isFK := constraint.Details.(*sqlparser.ForeignKeyDefinition)
+		if !isFK {
+			continue
+		}
+		names := make([]string, 0, len(fk.Source))
+		for _, source := range fk.Source {
+			names = append(names, source.Lowered())
+		}
+		fkBacked[strings.Join(names, ",")] = true
+	}
+
+	var kept []*sqlparser.IndexDefinition
+	for _, idx := range tableSpec.Indexes {
+		if idx.Info.Type == sqlparser.IndexTypePrimary {
+			continue
+		}
+		names := make([]string, 0, len(idx.Columns))
+		for _, idxCol := range idx.Columns {
+			names = append(names, idxCol.Column.Lowered())
+		}
+		if fkBacked[strings.Join(names, ",")] {
+			// This index is needed for a FK constraint so we cannot drop it.
+			continue
+		}
+		kept = append(kept, idx)
+	}
+	return kept
+}
+
+// getTargetTableSpec fetches the target-side CREATE TABLE for the
+// named table and returns its parsed TableSpec. Used by helpers that
+// need to reason about target structure after the stream is running —
+// e.g. detecting extra unique secondary indexes that affect the
+// parallel applier's conflict detection.
+func (vr *vreplicator) getTargetTableSpec(ctx context.Context, tableName string) (*sqlparser.TableSpec, error) {
+ if vr.mysqld == nil || vr.vre == nil || vr.vre.env == nil {
+ return nil, vterrors.Errorf(vtrpcpb.Code_FAILED_PRECONDITION, "missing schema lookup dependencies for %s", tableName)
+ }
req := &tabletmanagerdatapb.GetSchemaRequest{Tables: []string{tableName}}
schema, err := vr.mysqld.GetSchema(ctx, vr.dbClient.DBName(), req)
if err != nil {
@@ -872,10 +1000,9 @@ func (vr *vreplicator) getTableSecondaryKeys(ctx context.Context, tableName stri
tableName, len(schema.TableDefinitions))
}
tableSchema := schema.TableDefinitions[0].Schema
- var secondaryKeys []*sqlparser.IndexDefinition
parsedDDL, err := vr.vre.env.Parser().ParseStrictDDL(tableSchema)
if err != nil {
- return secondaryKeys, err
+ return nil, err
}
createTable, ok := parsedDDL.(*sqlparser.CreateTable)
// createTable or createTable.TableSpec should never be nil
@@ -883,32 +1010,149 @@ func (vr *vreplicator) getTableSecondaryKeys(ctx context.Context, tableName stri
if !ok || createTable == nil || createTable.GetTableSpec() == nil {
return nil, fmt.Errorf("could not determine CREATE TABLE statement from table schema %q", tableSchema)
}
+ return createTable.GetTableSpec(), nil
+}
- tableSpec := createTable.GetTableSpec()
- fkIndexCols := make(map[string]bool)
- for _, constraint := range tableSpec.Constraints {
- if fkDef, ok := constraint.Details.(*sqlparser.ForeignKeyDefinition); ok {
- fkCols := make([]string, len(fkDef.Source))
- for i, fkCol := range fkDef.Source {
- fkCols[i] = fkCol.Lowered()
+// writesetUniqueKeys looks up the target table's spec and hands it, together
+// with plan, to writesetUniqueKeysFromSpec for parallel-apply writeset
+// analysis. A nil plan short-circuits to (nil, false, nil) without touching
+// the schema; a spec lookup failure is returned as (nil, false, err).
+func (vr *vreplicator) writesetUniqueKeys(ctx context.Context, tableName string, plan *TablePlan) (uniqueKeys [][]string, mustSerialize bool, err error) {
+	if plan == nil {
+		return nil, false, nil
+	}
+	spec, lookupErr := vr.getTargetTableSpec(ctx, tableName)
+	if lookupErr != nil {
+		return nil, false, lookupErr
+	}
+	keys, serialize := writesetUniqueKeysFromSpec(plan, spec)
+	return keys, serialize, nil
+}
+
+// writesetUniqueKeysFromSpec analyzes the target table's unique secondary
+// indexes for parallel-apply writeset hashing, mirroring MySQL's WRITESET
+// dependency tracking (which hashes every unique key, not just the PK:
+// uniqueness constraints make transactions on DIFFERENT rows order-dependent,
+// e.g. one txn freeing a unique value and another claiming it).
+//
+// Every non-primary index in the spec is considered, including unique indexes
+// whose column list equals a foreign key's source columns. extractSecondaryKeys
+// leaves those out (it answers "which indexes may be dropped"), but they still
+// enforce uniqueness and therefore must take part in conflict detection.
+//
+// It returns:
+//   - uniqueKeys: ordered column-name lists (lowercased, index order) of each
+//     plain-column unique secondary index not covered by the identity. The
+//     writeset builder emits additional conflict keys for these.
+//   - mustSerialize: true when the table carries uniqueness the hasher cannot
+//     reason about — prefix or expression index columns, a PK that does not
+//     match the replication identity, or unique secondaries with no usable
+//     identity — in which case the table's transactions force-serialize.
+//
+// A nil plan or spec, or a spec without any non-primary index, yields
+// (nil, false).
+//
+// NOTE(review): index column names are lowercased before being compared with
+// plan.IdentityColumns; this presumes IdentityColumns are already lowercase —
+// confirm against the plan builder. A mismatch only errs towards serializing.
+func writesetUniqueKeysFromSpec(plan *TablePlan, tableSpec *sqlparser.TableSpec) (uniqueKeys [][]string, mustSerialize bool) {
+	if plan == nil || tableSpec == nil {
+		return nil, false
+	}
+
+	// Collect every non-primary index straight from the spec rather than via
+	// extractSecondaryKeys: that helper filters out FK-backed indexes, which
+	// would hide a UNIQUE index on a foreign-key column from the analysis.
+	// Non-unique FK-backed indexes picked up here are ignored by the
+	// IsUnique checks below; they can only make the result more conservative.
+	var secondaryKeys []*sqlparser.IndexDefinition
+	for _, index := range tableSpec.Indexes {
+		if index == nil || index.Info == nil || index.Info.Type == sqlparser.IndexTypePrimary {
+			continue
+		}
+		secondaryKeys = append(secondaryKeys, index)
+	}
+	if len(secondaryKeys) == 0 {
+		return nil, false
+	}
+
+	identityCols := plan.IdentityColumns
+	if len(identityCols) == 0 {
+		// No usable identity: PK-based writeset keys cannot reason about
+		// any uniqueness the secondary indexes enforce. Force serialization
+		// as soon as one of them is unique, so two parallel inserts cannot
+		// collide at apply time.
+		for _, secondaryKey := range secondaryKeys {
+			if secondaryKey.Info.IsUnique() {
+				return nil, true
+			}
+		}
+		return nil, false
+	}
+
+	identityColSet := make(map[string]struct{}, len(identityCols))
+	for _, col := range identityCols {
+		identityColSet[col] = struct{}{}
+	}
+
+	// Compare the primary key (if any) against the identity, both
+	// positionally and as a set. The set comparison is only meaningful when
+	// the identity has no duplicate columns.
+	primaryKeyMatchesIdentity := true
+	primaryKeyMatchesIdentitySet := len(identityColSet) == len(identityCols)
+	primaryKeyColumnCount := 0
+	for _, index := range tableSpec.Indexes {
+		if index == nil || index.Info == nil || index.Info.Type != sqlparser.IndexTypePrimary {
+			continue
+		}
+		primaryKeyColumnCount = len(index.Columns)
+		if primaryKeyColumnCount != len(identityCols) {
+			return nil, true
+		}
+		for i, idxCol := range index.Columns {
+			if idxCol.Expression != nil || idxCol.Length != nil {
+				// Expression or prefix PK column: it cannot be matched
+				// against a raw identity column.
+				primaryKeyMatchesIdentity = false
+				primaryKeyMatchesIdentitySet = false
+				break
+			}
+			colName := idxCol.Column.Lowered()
+			if colName != identityCols[i] {
+				primaryKeyMatchesIdentity = false
+			}
+			if _, ok := identityColSet[colName]; !ok {
+				primaryKeyMatchesIdentitySet = false
+			}
+		}
+		// Only the first primary index entry is inspected.
+		break
+	}
+	if primaryKeyColumnCount > 0 && !primaryKeyMatchesIdentity && !primaryKeyMatchesIdentitySet {
+		return nil, true
+	}
+
+	for _, secondaryKey := range secondaryKeys {
+		if !secondaryKey.Info.IsUnique() {
+			continue
+		}
+		// A unique secondary index can only enforce conflicts beyond the
+		// identity if its raw column set does not contain the identity. If
+		// the index covers (id, anything-else) and id is the identity, two
+		// rows with different identity values cannot collide on the index.
+		// Functional expressions and prefix lengths break that reasoning
+		// because uniqueness is enforced over a derived value rather than
+		// the raw column, so identity uniqueness no longer implies index
+		// uniqueness, and we cannot hash a faithful writeset key for them.
+		indexColNames := make([]string, 0, len(secondaryKey.Columns))
+		indexColSet := make(map[string]struct{}, len(secondaryKey.Columns))
+		hasDerivedColumn := false
+		for _, idxCol := range secondaryKey.Columns {
+			if idxCol == nil {
+				continue
+			}
+			if idxCol.Expression != nil || idxCol.Length != nil {
+				hasDerivedColumn = true
+				break
+			}
+			colName := idxCol.Column.Lowered()
+			indexColNames = append(indexColNames, colName)
+			indexColSet[colName] = struct{}{}
+		}
+		if hasDerivedColumn {
+			return nil, true
+		}
+		containsIdentity := true
+		for _, col := range identityCols {
+			if _, ok := indexColSet[col]; !ok {
+				containsIdentity = false
+				break
+			}
+		}
+		if containsIdentity {
+			// The index's column set contains all identity columns, so two
+			// rows with different identities cannot collide on it. No extra
+			// writeset key needed.
+			continue
+		}
+		uniqueKeys = append(uniqueKeys, indexColNames)
+	}
+	return uniqueKeys, false
+}
-			}
-			fkIndexCols[strings.Join(fkCols, ",")] = true
-		}
-	}
-	for _, index := range tableSpec.Indexes {
-		if index.Info.Type != sqlparser.IndexTypePrimary {
-			cols := make([]string, len(index.Columns))
-			for i, col := range index.Columns {
-				cols[i] = col.Column.Lowered()
-			}
-			if fkIndexCols[strings.Join(cols, ",")] {
-				// This index is needed for a FK constraint so we cannot drop it.
-				continue
-			}
-			secondaryKeys = append(secondaryKeys, index)
-		}
-	}
-	return secondaryKeys, err
-}
func (vr *vreplicator) execPostCopyActions(ctx context.Context, tableName string) error {
@@ -1177,7 +1421,7 @@ func (vr *vreplicator) newClientConnection(ctx context.Context) (*vdbClient, err
if err := dbc.Connect(); err != nil {
return nil, vterrors.Wrap(err, "can't connect to database")
}
- dbClient := newVDBClient(dbc, vr.stats, vr.workflowConfig.RelayLogMaxItems)
+ dbClient := newVDBClientWithID(dbc, vr.stats, vr.workflowConfig.RelayLogMaxItems, vr.id)
if _, err := vr.setSQLMode(ctx, dbClient); err != nil {
return nil, vterrors.Wrap(err, "failed to set sql_mode")
}
diff --git a/go/vt/vttablet/tabletmanager/vreplication/vreplicator_test.go b/go/vt/vttablet/tabletmanager/vreplication/vreplicator_test.go
index a1d28a2fd67..78d64c80b7e 100644
--- a/go/vt/vttablet/tabletmanager/vreplication/vreplicator_test.go
+++ b/go/vt/vttablet/tabletmanager/vreplication/vreplicator_test.go
@@ -39,12 +39,115 @@ import (
"vitess.io/vitess/go/vt/dbconfigs"
"vitess.io/vitess/go/vt/mysqlctl"
"vitess.io/vitess/go/vt/schemadiff"
+ "vitess.io/vitess/go/vt/sqlparser"
vttablet "vitess.io/vitess/go/vt/vttablet/common"
binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
tabletmanagerdatapb "vitess.io/vitess/go/vt/proto/tabletmanagerdata"
)
+// TestWritesetUniqueKeysFromSpec pins the spec-analysis rules the parallel
+// applier relies on. Each case parses a CREATE TABLE statement, pairs it with
+// a TablePlan carrying the given identity columns, and checks both results
+// of writesetUniqueKeysFromSpec:
+//   - plain-column unique secondaries that aren't covered by the identity
+//     are emitted as writeset unique keys (mustSerialize false);
+//   - uniqueness the hasher cannot reason about (prefix/expression indexes,
+//     PK/identity mismatch, no usable identity) forces serialization.
+func TestWritesetUniqueKeysFromSpec(t *testing.T) {
+ parser := sqlparser.NewTestParser()
+ // specFor parses ddl and returns its TableSpec, failing the test if the
+ // statement is not a CREATE TABLE with a table spec.
+ specFor := func(t *testing.T, ddl string) *sqlparser.TableSpec {
+ t.Helper()
+ parsedDDL, err := parser.ParseStrictDDL(ddl)
+ require.NoError(t, err)
+ createTable, ok := parsedDDL.(*sqlparser.CreateTable)
+ require.True(t, ok)
+ tableSpec := createTable.GetTableSpec()
+ require.NotNil(t, tableSpec)
+ return tableSpec
+ }
+
+ // Omitted want* fields are zero values: nil unique keys, no serialization.
+ tests := []struct {
+ name string
+ ddl string
+ identityCols []string
+ wantUniqueKeys [][]string
+ wantMustSerialize bool
+ }{
+ {
+ // No usable identity but a unique-not-null secondary the
+ // PK-based writeset can't reason about: force serialization.
+ name: "no identity with unique secondary",
+ ddl: "create table t1 (id int, email varchar(64) not null, unique key uk_email(email))",
+ identityCols: nil,
+ wantMustSerialize: true,
+ },
+ {
+ // Plain single-column unique secondary not covered by the
+ // identity: emit a writeset unique key, don't serialize.
+ name: "plain unique secondary emits key",
+ ddl: "create table t1 (id int not null, email varchar(64) not null, primary key(id), unique key uk_email(email))",
+ identityCols: []string{"id"},
+ wantUniqueKeys: [][]string{{"email"}},
+ },
+ {
+ // Multi-column plain unique secondary: ordered column list.
+ name: "composite unique secondary emits ordered key",
+ ddl: "create table t1 (id int not null, a int not null, b int not null, primary key(id), unique key uk_ab(a, b))",
+ identityCols: []string{"id"},
+ wantUniqueKeys: [][]string{{"a", "b"}},
+ },
+ {
+ // Unique secondary whose column set contains the identity can't
+ // create cross-identity conflicts: skip it (no key, no serialize).
+ name: "unique secondary covering identity is skipped",
+ ddl: "create table t1 (id int not null, b int not null, primary key(id), unique key uk_idb(id, b))",
+ identityCols: []string{"id"},
+ },
+ {
+ // Prefix index on the unique secondary: uniqueness is over a
+ // derived value, force serialization.
+ name: "prefix unique secondary serializes",
+ ddl: "create table t1 (id int not null, email varchar(64) not null, primary key(id), unique key uk_email(email(8)))",
+ identityCols: []string{"id"},
+ wantMustSerialize: true,
+ },
+ {
+ // Expression/functional unique index: force serialization.
+ name: "expression unique secondary serializes",
+ ddl: "create table t1 (id int not null, email varchar(64) not null, primary key(id), unique key uk_email((lower(email))))",
+ identityCols: []string{"id"},
+ wantMustSerialize: true,
+ },
+ {
+ // PK does not match the chosen replication identity: the
+ // PK-based writeset key is unreliable, force serialization.
+ name: "pk identity mismatch serializes",
+ ddl: "create table t1 (id int not null, email varchar(64) not null, primary key(id), unique key uk_email(email))",
+ identityCols: []string{"email"},
+ wantMustSerialize: true,
+ },
+ {
+ // A mix: one hashable key plus one covered-by-identity key.
+ name: "mixed hashable and covered keys",
+ ddl: "create table t1 (id int not null, email varchar(64) not null, b int not null, primary key(id), unique key uk_email(email), unique key uk_idb(id, b))",
+ identityCols: []string{"id"},
+ wantUniqueKeys: [][]string{{"email"}},
+ },
+ }
+
+ for _, tc := range tests {
+ t.Run(tc.name, func(t *testing.T) {
+ tableSpec := specFor(t, tc.ddl)
+ plan := &TablePlan{
+ TargetName: "t1",
+ IdentityColumns: tc.identityCols,
+ }
+ uniqueKeys, mustSerialize := writesetUniqueKeysFromSpec(plan, tableSpec)
+ assert.Equal(t, tc.wantMustSerialize, mustSerialize)
+ assert.Equal(t, tc.wantUniqueKeys, uniqueKeys)
+ })
+ }
+}
+
func TestMaxQuerySize(t *testing.T) {
makeVR := func(dbClient binlogplayer.DBClient, relayLogMaxSize int) *vreplicator {
stats := binlogplayer.NewStats()
@@ -984,3 +1087,77 @@ func TestThrottlerAppNames(t *testing.T) {
assert.Contains(t, vc.throttlerAppName, "vcopier")
assert.NotContains(t, vc.throttlerAppName, "vplayer")
}
+
+// TestFKCheckHelpersUpdateSessionCache pins that clearFKCheck and
+// resetFKCheckAfterCopy keep the vdbClient's cached foreign_key_checks
+// session state coherent. updateFKCheck skips the SET when the cache says
+// the session already matches, so any helper that mutates the session
+// out-of-band MUST update the cache — otherwise an atomic-copy workflow's
+// catchup -> clearFKCheck -> copy -> resetFKCheckAfterCopy -> catchup cycle
+// leaves the session FK state out of sync with what the applier believes,
+// silently applying with the wrong foreign_key_checks setting.
+func TestFKCheckHelpersUpdateSessionCache(t *testing.T) {
+ dbc := binlogplayer.NewMockDBClient(t)
+ vdbc := newVDBClient(dbc, binlogplayer.NewStats(), 0)
+ vr := &vreplicator{originalFKCheckSetting: 1}
+
+ // Simulate updateFKCheck having initialized the cache with FK checks ON.
+ vdbc.foreignKeyChecksEnabled = true
+ vdbc.foreignKeyChecksStateInitialized = true
+
+ // The mock treats "set @@session.foreign_key_checks..." as an invariant,
+ // so no per-query expectations are needed.
+ require.NoError(t, vr.clearFKCheck(vdbc))
+ assert.False(t, vdbc.foreignKeyChecksEnabled, "clearFKCheck must record FK checks as disabled in the session cache")
+ assert.True(t, vdbc.foreignKeyChecksStateInitialized)
+
+ require.NoError(t, vr.resetFKCheckAfterCopy(vdbc))
+ assert.True(t, vdbc.foreignKeyChecksEnabled, "resetFKCheckAfterCopy must record the restored FK state in the session cache")
+ assert.True(t, vdbc.foreignKeyChecksStateInitialized)
+}
+
+// copyPhaseDBClient serves loadSettings like failingDBClient but reports a
+// non-empty _vt.copy_state, simulating a workflow restarted mid-copy.
+type copyPhaseDBClient struct {
+ failingDBClient
+}
+
+func (c *copyPhaseDBClient) ExecuteFetch(query string, maxrows int) (*sqltypes.Result, error) {
+ // Everything except the copy_state lookup is delegated to the embedded client.
+ if !strings.Contains(query, "from _vt.copy_state where vrepl_id=") {
+ return c.failingDBClient.ExecuteFetch(query, maxrows)
+ }
+ // Answer the copy_state lookup with a single int64 row holding the value 2.
+ fields := sqltypes.MakeTestFields("count(distinct table_name)", "int64")
+ return sqltypes.MakeTestResult(fields, "2"), nil
+}
+
+// TestLoadSettingsTracksCopyPhase is a restart-style regression test: a fresh
+// vreplicator (in-memory state at its zero value, as after a tablet or
+// controller restart) whose durable _vt.copy_state still has rows must report
+// that it is in the copy phase. The controller's AtomicCopy terminal-error
+// guard needs this durable-evidence signal because the AtomicCopy copy path
+// (copyAll) never calls setState(Copying) — only initTablesForCopy does, on
+// first start — so after a restart the entire remaining copy phase would
+// otherwise run with vr.state at zero and copy-phase errors would be
+// misclassified as retryable.
+func TestLoadSettingsTracksCopyPhase(t *testing.T) {
+ vr := &vreplicator{
+ id: 1,
+ dbClient: newVDBClient(new(copyPhaseDBClient), binlogplayer.NewStats(), 0),
+ }
+ require.False(t, vr.isInCopyPhase(), "zero value must preserve existing (retryable) behavior")
+
+ _, numTablesToCopy, err := vr.loadSettings(t.Context(), vr.dbClient)
+ require.NoError(t, err)
+ require.Equal(t, int64(2), numTablesToCopy)
+ require.True(t, vr.isInCopyPhase(), "loadSettings must record that tables remain to be copied")
+
+ // Once the copy completes (no copy_state rows), the next loadSettings
+ // clears the flag so post-copy errors are classified as before.
+ vr.dbClient = newVDBClient(&failingDBClient{}, binlogplayer.NewStats(), 0)
+ _, numTablesToCopy, err = vr.loadSettings(t.Context(), vr.dbClient)
+ require.NoError(t, err)
+ require.Equal(t, int64(0), numTablesToCopy)
+ require.False(t, vr.isInCopyPhase())
+}
diff --git a/go/vt/vttablet/tabletserver/vstreamer/vstreamer.go b/go/vt/vttablet/tabletserver/vstreamer/vstreamer.go
index 5816afd8091..6cbe1baf391 100644
--- a/go/vt/vttablet/tabletserver/vstreamer/vstreamer.go
+++ b/go/vt/vttablet/tabletserver/vstreamer/vstreamer.go
@@ -24,6 +24,7 @@ import (
"io"
"slices"
"strings"
+ "sync"
"sync/atomic"
"time"
@@ -260,11 +261,27 @@ func (vs *vstreamer) refreshHistorianForStartup(ctx context.Context) error {
// parseEvents parses and sends events.
func (vs *vstreamer) parseEvents(ctx context.Context, events <-chan mysql.BinlogEvent, errs <-chan error) error {
+ ctx, cancel := context.WithCancel(ctx)
// bufferAndTransmit uses bufferedEvents and curSize to buffer events.
var (
bufferedEvents []*binlogdatapb.VEvent
curSize int
)
+ var pendingStreamErr error
+ drainSourceEvents := make(chan struct{})
+ var drainSourceEventsOnce sync.Once
+ signalDrainSourceEvents := func() {
+ drainSourceEventsOnce.Do(func() {
+ close(drainSourceEvents)
+ })
+ }
+ recordSourceStreamErr := func(err error, ok bool) {
+ if ok && err != nil && pendingStreamErr == nil {
+ pendingStreamErr = err
+ signalDrainSourceEvents()
+ }
+ errs = nil
+ }
// Only the following patterns are possible:
// BEGIN->ROWs or Statements->GTID->COMMIT. In the case of large transactions, this can be broken into chunks.
@@ -400,29 +417,56 @@ func (vs *vstreamer) parseEvents(ctx context.Context, events <-chan mysql.Binlog
wfNameLog = " in workflow " + vs.filter.WorkflowName
}
throttlerErrs := make(chan error, 1) // How we share the error when we've been fully throttled too long
- defer close(throttlerErrs)
throttleEvents := func(throttledEvents chan mysql.BinlogEvent) {
+ drainingAfterSourceError := false
throttledTime := atomic.Int64{}
for {
- // Check throttler.
- if checkResult, ok := vs.vse.throttlerClient.ThrottleCheckOKOrWaitAppName(ctx, vs.throttlerApp); !ok {
- // Make sure to leave if context is cancelled.
+ if !drainingAfterSourceError {
select {
- case <-ctx.Done():
- return
+ case <-drainSourceEvents:
+ drainingAfterSourceError = true
default:
- // Do nothing special.
}
- vs.vse.throttledCounts.Add(1)
- curtime := time.Now().Unix()
- if !throttledTime.CompareAndSwap(0, curtime) {
- if curtime-throttledTime.Load() > int64(fullyThrottledTimeout.Seconds()) {
- throttlerErrs <- vterrors.Errorf(vtrpcpb.Code_INTERNAL, "vstreamer has been fully throttled for more than %v, giving up so that we can retry", fullyThrottledTimeout)
+ }
+ // Check throttler.
+ if !drainingAfterSourceError {
+ if checkResult, ok := vs.vse.throttlerClient.ThrottleCheckOKOrWaitAppName(ctx, vs.throttlerApp); !ok {
+ // Make sure to leave if context is cancelled.
+ select {
+ case <-ctx.Done():
return
+ default:
+ // Do nothing special.
+ }
+ select {
+ case <-drainSourceEvents:
+ drainingAfterSourceError = true
+ throttledTime.Store(0)
+ continue
+ default:
+ }
+ // Count only iterations that remain throttled: the drain transition
+ // above is not a throttle wait.
+ vs.vse.throttledCounts.Add(1)
+ curtime := time.Now().Unix()
+ if !throttledTime.CompareAndSwap(0, curtime) {
+ if curtime-throttledTime.Load() > int64(fullyThrottledTimeout.Seconds()) {
+ throttlerErrs <- vterrors.Errorf(vtrpcpb.Code_INTERNAL, "vstreamer has been fully throttled for more than %v, giving up so that we can retry", fullyThrottledTimeout)
+ // Close throttledEvents so the main parseEvents loop's
+ // `case ev, ok := <-throttledEvents` fires with ok=false
+ // and can return the throttler error (or a pending
+ // source error). Without this close, if pendingStreamErr
+ // is already set the main loop's throttlerErrs case
+ // `continue`s and the only remaining live select case
+ // is hbTimer.C, which spins forever swallowing the
+ // pending error until the caller cancels.
+ close(throttledEvents)
+ return
+ }
}
+ logger.Infof("vstreamer throttled%s: %s.", wfNameLog, checkResult.Summary())
+ continue
}
- logger.Infof("vstreamer throttled%s: %s.", wfNameLog, checkResult.Summary())
- continue
}
throttledTime.Store(0) // We are no longer fully throttled
select {
@@ -446,7 +490,32 @@ func (vs *vstreamer) parseEvents(ctx context.Context, events <-chan mysql.Binlog
// throttledEvents pulls data from events, but throttles pulling data,
// which in turn blocks the BinlogConnection from pushing events to the channel
throttledEvents := make(chan mysql.BinlogEvent)
- go throttleEvents(throttledEvents)
+ throttleEventsDone := make(chan struct{})
+ go func() {
+ defer close(throttleEventsDone)
+ throttleEvents(throttledEvents)
+ }()
+ defer func() {
+ cancel()
+ <-throttleEventsDone
+ }()
+ handleThrottledEvent := func(ev mysql.BinlogEvent) error {
+ vevents, err := vs.parseEvent(ev, bufferAndTransmit)
+ if err != nil {
+ vs.vse.errorCounts.Add("ParseEvent", 1)
+ return err
+ }
+ for _, vevent := range vevents {
+ if err := bufferAndTransmit(vevent); err != nil {
+ if err == io.EOF {
+ return err
+ }
+ vs.vse.errorCounts.Add("BufferAndTransmit", 1)
+ return vterrors.Wrapf(err, "error sending event: %+v", vevent)
+ }
+ }
+ return nil
+ }
for {
hbTimer.Reset(HeartbeatTime)
@@ -459,48 +528,79 @@ func (vs *vstreamer) parseEvents(ctx context.Context, events <-chan mysql.Binlog
select {
case ev, ok := <-throttledEvents:
if !ok {
+ if pendingStreamErr != nil {
+ return pendingStreamErr
+ }
+ if errs != nil {
+ select {
+ case err, ok := <-errs:
+ recordSourceStreamErr(err, ok)
+ default:
+ }
+ }
+ if pendingStreamErr != nil {
+ return pendingStreamErr
+ }
+ // throttleEvents closes throttledEvents right after sending its
+ // timeout error to throttlerErrs; both select cases become ready
+ // at once and Go picks randomly, so when the closed-channel case
+ // wins we must surface the real throttler error rather than a
+ // misleading "unexpected server EOF".
+ select {
+ case err := <-throttlerErrs:
+ if err != nil {
+ return err
+ }
+ default:
+ }
select {
- case err := <-errs:
- return err
case <-ctx.Done():
return nil
default:
}
return vterrors.Errorf(vtrpcpb.Code_ABORTED, "unexpected server EOF while parsing events")
}
- vevents, err := vs.parseEvent(ev, bufferAndTransmit)
- if err != nil {
- vs.vse.errorCounts.Add("ParseEvent", 1)
+ if err := func() error {
+ return handleThrottledEvent(ev)
+ }(); err != nil {
+ if err == io.EOF {
+ return nil
+ }
return err
}
- for _, vevent := range vevents {
- if err := bufferAndTransmit(vevent); err != nil {
- if err == io.EOF {
- return nil
+ case vs.vschema = <-vs.vevents:
+ if pendingStreamErr != nil {
+ continue
+ }
+ if errs != nil {
+ select {
+ case err, ok := <-errs:
+ recordSourceStreamErr(err, ok)
+ if pendingStreamErr != nil {
+ continue
}
- vs.vse.errorCounts.Add("BufferAndTransmit", 1)
- return vterrors.Wrapf(err, "error sending event: %+v", vevent)
+ case <-ctx.Done():
+ return nil
+ default:
}
}
- case vs.vschema = <-vs.vevents:
- select {
- case err := <-errs:
- return err
- case <-ctx.Done():
- return nil
- default:
- if err := vs.rebuildPlans(); err != nil {
- return vterrors.Wrap(err, "failed to rebuild replication plans after vschema change notification")
- }
+ if err := vs.rebuildPlans(); err != nil {
+ return vterrors.Wrap(err, "failed to rebuild replication plans after vschema change notification")
}
- case err := <-errs:
- return err
+ case err, ok := <-errs:
+ recordSourceStreamErr(err, ok)
case throttlerErr := <-throttlerErrs:
+ if pendingStreamErr != nil {
+ continue
+ }
vs.vse.errorCounts.Add(fullyThrottledMetricLabel, 1)
return throttlerErr
case <-ctx.Done():
return nil
case <-hbTimer.C:
+ if pendingStreamErr != nil {
+ continue
+ }
checkResult, ok := vs.vse.throttlerClient.ThrottleCheckOK(ctx, vs.throttlerApp)
if err := injectHeartbeat(!ok, checkResult.Summary()); err != nil {
if err == io.EOF {
diff --git a/go/vt/vttablet/tabletserver/vstreamer/vstreamer_test.go b/go/vt/vttablet/tabletserver/vstreamer/vstreamer_test.go
index 7c3e6edda72..c84ea33eff8 100644
--- a/go/vt/vttablet/tabletserver/vstreamer/vstreamer_test.go
+++ b/go/vt/vttablet/tabletserver/vstreamer/vstreamer_test.go
@@ -39,6 +39,7 @@ import (
"vitess.io/vitess/go/mysql/fakesqldb"
"vitess.io/vitess/go/mysql/replication"
"vitess.io/vitess/go/sqltypes"
+ "vitess.io/vitess/go/stats"
"vitess.io/vitess/go/vt/dbconfigs"
"vitess.io/vitess/go/vt/log"
"vitess.io/vitess/go/vt/sqlparser"
@@ -46,6 +47,8 @@ import (
"vitess.io/vitess/go/vt/vterrors"
"vitess.io/vitess/go/vt/vttablet/tabletserver/schema"
"vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv"
+ "vitess.io/vitess/go/vt/vttablet/tabletserver/throttle"
+ throttlebase "vitess.io/vitess/go/vt/vttablet/tabletserver/throttle/base"
"vitess.io/vitess/go/vt/vttablet/tabletserver/throttle/throttlerapp"
"vitess.io/vitess/go/vt/vttablet/tabletserver/vstreamer/testenv"
@@ -58,6 +61,286 @@ type testcase struct {
output [][]string
}
+func TestParseEventsDrainsBufferedEventsBeforeTerminalError(t *testing.T) {
+ f := mysql.NewMySQL56BinlogFormat()
+ s := mysql.NewFakeBinlogStream()
+ s.ServerID = 62344
+
+ input := []mysql.BinlogEvent{
+ mysql.NewRotateEvent(f, s, 0, ""),
+ mysql.NewFormatDescriptionEvent(f, s),
+ mysql.NewMariaDBGTIDEvent(f, s, replication.MariadbGTID{Domain: 0, Sequence: 0xd}, false /* hasBegin */),
+ mysql.NewXIDEvent(f, s),
+ }
+
+ streamErr := errors.New("stream ended after buffered events")
+ cp := dbconfigs.New(&mysql.ConnParams{DbName: testenv.DBName})
+ // A nil throttlerClient is intentional and safe: Client.ThrottleCheckOK
+ // nil-checks its receiver and reports "not throttled".
+ vse := &Engine{keyspace: testenv.DBName, shard: testenv.DefaultShard, throttledCounts: stats.NewCounter("", "")}
+
+ for i := range 64 {
+ events := make(chan mysql.BinlogEvent, len(input))
+ errs := make(chan error, 1)
+ for _, ev := range input {
+ events <- ev
+ }
+ close(events)
+ errs <- streamErr
+ close(errs)
+
+ var got [][]*binlogdatapb.VEvent
+ vs := &vstreamer{
+ ctx: t.Context(),
+ cp: cp,
+ send: func(vevents []*binlogdatapb.VEvent) error {
+ got = append(got, vevents)
+ return nil
+ },
+ vse: vse,
+ }
+
+ err := vs.parseEvents(t.Context(), events, errs)
+ require.ErrorIs(t, err, streamErr, "iteration %d", i)
+ require.Len(t, got, 1, "iteration %d", i)
+ require.Len(t, got[0], 2, "iteration %d", i)
+ require.Equal(t, binlogdatapb.VEventType_GTID, got[0][0].Type, "iteration %d", i)
+ require.Equal(t, binlogdatapb.VEventType_COMMIT, got[0][1].Type, "iteration %d", i)
+ require.Equal(t, testenv.DBName, got[0][0].Keyspace, "iteration %d", i)
+ require.Equal(t, testenv.DefaultShard, got[0][0].Shard, "iteration %d", i)
+ require.Equal(t, testenv.DBName, got[0][1].Keyspace, "iteration %d", i)
+ require.Equal(t, testenv.DefaultShard, got[0][1].Shard, "iteration %d", i)
+ }
+}
+
+func TestParseEventsDrainsBufferedEventsBeforeTerminalErrorWhenThrottled(t *testing.T) {
+ f := mysql.NewMySQL56BinlogFormat()
+ s := mysql.NewFakeBinlogStream()
+ s.ServerID = 62344
+
+ input := []mysql.BinlogEvent{
+ mysql.NewRotateEvent(f, s, 0, ""),
+ mysql.NewFormatDescriptionEvent(f, s),
+ mysql.NewMariaDBGTIDEvent(f, s, replication.MariadbGTID{Domain: 0, Sequence: 0xd}, false /* hasBegin */),
+ mysql.NewXIDEvent(f, s),
+ }
+
+ streamErr := errors.New("stream ended after buffered events")
+ cp := dbconfigs.New(&mysql.ConnParams{DbName: testenv.DBName})
+ vse := &Engine{
+ keyspace: testenv.DBName,
+ shard: testenv.DefaultShard,
+ throttlerClient: throttle.NewBackgroundClient(nil, throttlerapp.VStreamerName, throttlebase.UndefinedScope),
+ // Unpublished counter (empty name skips stats registration): this bare
+ // Engine bypasses NewEngine, so any counter the production code touches
+ // must be non-nil here.
+ throttledCounts: stats.NewCounter("", ""),
+ }
+
+ events := make(chan mysql.BinlogEvent, len(input))
+ for _, ev := range input {
+ events <- ev
+ }
+ close(events)
+ errCh := make(chan error, 1)
+
+ ctx, cancel := context.WithCancel(t.Context())
+ defer cancel()
+ done := make(chan error, 1)
+ var got [][]*binlogdatapb.VEvent
+ vs := &vstreamer{
+ ctx: ctx,
+ cp: cp,
+ throttlerApp: throttlerapp.TestingAlwaysThrottledName,
+ send: func(vevents []*binlogdatapb.VEvent) error {
+ got = append(got, vevents)
+ return nil
+ },
+ vse: vse,
+ }
+
+ go func() {
+ done <- vs.parseEvents(ctx, events, errCh)
+ }()
+ go func() {
+ tmr := time.NewTimer(100 * time.Millisecond)
+ defer tmr.Stop()
+ select {
+ case <-ctx.Done():
+ case <-tmr.C:
+ errCh <- streamErr
+ close(errCh)
+ }
+ }()
+
+ var err error
+ require.Eventually(t, func() bool {
+ select {
+ case err = <-done:
+ return true
+ default:
+ return false
+ }
+ }, 2*time.Second, 50*time.Millisecond)
+ require.ErrorIs(t, err, streamErr)
+ require.Len(t, got, 1)
+ require.Len(t, got[0], 2)
+ require.Equal(t, binlogdatapb.VEventType_GTID, got[0][0].Type)
+ require.Equal(t, binlogdatapb.VEventType_COMMIT, got[0][1].Type)
+}
+
+func TestParseEventsReturnsNilOnClientEOF(t *testing.T) {
+ f := mysql.NewMySQL56BinlogFormat()
+ s := mysql.NewFakeBinlogStream()
+ s.ServerID = 62344
+
+ input := []mysql.BinlogEvent{
+ mysql.NewRotateEvent(f, s, 0, ""),
+ mysql.NewFormatDescriptionEvent(f, s),
+ mysql.NewMariaDBGTIDEvent(f, s, replication.MariadbGTID{Domain: 0, Sequence: 0xd}, false /* hasBegin */),
+ mysql.NewXIDEvent(f, s),
+ }
+
+ events := make(chan mysql.BinlogEvent, len(input))
+ for _, ev := range input {
+ events <- ev
+ }
+ close(events)
+ errCh := make(chan error)
+ close(errCh)
+
+ cp := dbconfigs.New(&mysql.ConnParams{DbName: testenv.DBName})
+ // A nil throttlerClient is intentional and safe: Client.ThrottleCheckOK
+ // nil-checks its receiver and reports "not throttled".
+ vse := &Engine{keyspace: testenv.DBName, shard: testenv.DefaultShard, throttledCounts: stats.NewCounter("", "")}
+
+ sendCalls := 0
+ vs := &vstreamer{
+ ctx: t.Context(),
+ cp: cp,
+ send: func(vevents []*binlogdatapb.VEvent) error {
+ sendCalls++
+ return io.EOF
+ },
+ vse: vse,
+ }
+
+ err := vs.parseEvents(t.Context(), events, errCh)
+ require.NoError(t, err)
+ require.Equal(t, 1, sendCalls)
+}
+
+func TestParseEventsClientEOFDuringThrottleDoesNotPanicAfterReturn(t *testing.T) {
+ origTimeout := fullyThrottledTimeout
+ origHeartbeatTime := HeartbeatTime
+ fullyThrottledTimeout = -time.Second
+ HeartbeatTime = 10 * time.Millisecond
+ t.Cleanup(func() {
+ fullyThrottledTimeout = origTimeout
+ HeartbeatTime = origHeartbeatTime
+ })
+
+ events := make(chan mysql.BinlogEvent)
+ close(events)
+ errCh := make(chan error)
+
+ cp := dbconfigs.New(&mysql.ConnParams{DbName: testenv.DBName})
+ vse := &Engine{
+ keyspace: testenv.DBName,
+ shard: testenv.DefaultShard,
+ throttlerClient: throttle.NewBackgroundClient(nil, throttlerapp.VStreamerName, throttlebase.UndefinedScope),
+ // Unpublished counter: bare Engines bypass NewEngine, so counters the
+ // production code touches must be non-nil.
+ throttledCounts: stats.NewCounter("", ""),
+ }
+
+ vs := &vstreamer{
+ ctx: t.Context(),
+ cp: cp,
+ throttlerApp: throttlerapp.TestingAlwaysThrottledName,
+ send: func(vevents []*binlogdatapb.VEvent) error {
+ require.Len(t, vevents, 1)
+ require.Equal(t, binlogdatapb.VEventType_HEARTBEAT, vevents[0].Type)
+ return io.EOF
+ },
+ vse: vse,
+ }
+
+ err := vs.parseEvents(t.Context(), events, errCh)
+ require.NoError(t, err)
+
+ // Give the throttling goroutine time to hit its fully-throttled timeout path.
+ // The background client sleeps once per throttle check, so the sender needs
+ // two iterations before it reaches the send. Before the fix, parseEvents
+ // closed throttlerErrs on return, so the sender panicked here with
+ // "send on closed channel".
+ time.Sleep(750 * time.Millisecond)
+}
+
+func TestParseEventsReturnsPendingSourceErrorAfterFullyThrottledTimeout(t *testing.T) {
+ origTimeout := fullyThrottledTimeout
+ origHeartbeatTime := HeartbeatTime
+ fullyThrottledTimeout = -time.Second
+ HeartbeatTime = time.Hour
+ t.Cleanup(func() {
+ fullyThrottledTimeout = origTimeout
+ HeartbeatTime = origHeartbeatTime
+ })
+
+ ctx, cancel := context.WithTimeout(t.Context(), time.Second)
+ t.Cleanup(cancel)
+
+ streamErr := errors.New("stream ended while throttler wait was sleeping")
+ events := make(chan mysql.BinlogEvent)
+ errCh := make(chan error, 1)
+ cp := dbconfigs.New(&mysql.ConnParams{DbName: testenv.DBName})
+ vse := &Engine{
+ keyspace: testenv.DBName,
+ shard: testenv.DefaultShard,
+ throttlerClient: throttle.NewBackgroundClient(nil, throttlerapp.VStreamerName, throttlebase.UndefinedScope),
+ // Unpublished counter: bare Engines bypass NewEngine, so counters the
+ // production code touches must be non-nil.
+ throttledCounts: stats.NewCounter("", ""),
+ }
+
+ vs := &vstreamer{
+ ctx: ctx,
+ cp: cp,
+ throttlerApp: throttlerapp.TestingAlwaysThrottledName,
+ send: func(vevents []*binlogdatapb.VEvent) error {
+ return nil
+ },
+ vse: vse,
+ }
+
+ done := make(chan error, 1)
+ go func() {
+ done <- vs.parseEvents(ctx, events, errCh)
+ }()
+ go func() {
+ tmr := time.NewTimer(400 * time.Millisecond)
+ defer tmr.Stop()
+ select {
+ case <-ctx.Done():
+ case <-tmr.C:
+ close(events)
+ errCh <- streamErr
+ close(errCh)
+ }
+ }()
+
+ var err error
+ require.Eventually(t, func() bool {
+ select {
+ case err = <-done:
+ return true
+ default:
+ return false
+ }
+ }, 2*time.Second, 10*time.Millisecond)
+ require.ErrorIs(t, err, streamErr)
+}
+
func checkIfOptionIsSupported(t *testing.T, variable string) bool {
qr, err := env.Mysqld.FetchSuperQuery(t.Context(), fmt.Sprintf("show variables like '%s'", variable))
require.NoError(t, err)
diff --git a/test/config.json b/test/config.json
index 0b6de16beea..fad122d5f63 100644
--- a/test/config.json
+++ b/test/config.json
@@ -243,6 +243,7 @@
"Shard": "xb_backup",
"Tags": [],
"Needs": [
+ "larger-runner",
"xtrabackup"
]
},
@@ -262,6 +263,7 @@
"Shard": "xb_backup",
"Tags": [],
"Needs": [
+ "larger-runner",
"xtrabackup"
]
},
@@ -2108,20 +2110,20 @@
"binlog-compression"
]
},
- "vreplication_copy_parallel": {
+ "vreplication_parallel": {
"File": "unused.go",
"Packages": [
"vitess.io/vitess/go/test/endtoend/vreplication"
],
"Args": [
"-run",
- "TestVreplicationCopyParallel",
+ "TestVreplicationParallel",
"-timeout",
"20m"
],
"Command": [],
"Manual": false,
- "Shard": "vreplication_copy_parallel",
+ "Shard": "vreplication_parallel",
"Tags": [],
"Needs": [
"larger-runner",