|
| 1 | +# SPDX-FileCopyrightText: 2026 Demerzel Solutions Limited |
| 2 | +# SPDX-License-Identifier: LGPL-3.0-only |
| 3 | + |
| 4 | +"""Shared utilities for EVM opcode benchmark comparison. |
| 5 | +
|
| 6 | +Used by the detect-noisy and compare steps in evm-opcode-benchmark-diff.yml. |
| 7 | +""" |
| 8 | + |
| 9 | +import glob |
| 10 | +import os |
| 11 | +import re |
| 12 | +import statistics |
| 13 | + |
# Matches ANSI terminal escape sequences (colors, cursor movement) so they
# can be stripped from BenchmarkDotNet console logs before parsing.
ANSI_RE = re.compile(r"\x1B\[[0-9;]*[A-Za-z]")
# Matches one BenchmarkDotNet timing cell: a number (optional thousands
# commas and decimal part) followed by a unit, e.g. "1,234.56 ns".
VALUE_RE = re.compile(r"^\s*([0-9][0-9,]*(?:\.[0-9]+)?)\s*([a-zA-Zµμ]+)\s*$")
# Conversion factors from BDN time units to nanoseconds.  Both the micro
# sign (U+00B5) and the Greek small mu (U+03BC) are accepted for "µs".
UNIT_TO_NS = {
    "ns": 1.0,
    "us": 1_000.0,
    "µs": 1_000.0,
    "μs": 1_000.0,
    "ms": 1_000_000.0,
    "s": 1_000_000_000.0,
}
| 24 | + |
| 25 | + |
def read_env_config():
    """Read benchmark comparison thresholds from environment variables.

    Each setting falls back to a sensible default when the corresponding
    environment variable is unset.  Values are returned as floats.
    """
    def _float_env(var, fallback):
        # Environment values are always strings; coerce to float here.
        return float(os.environ.get(var, fallback))

    return {
        "default_threshold": _float_env("THRESHOLD_PERCENT", "5"),
        "noise_multiplier": _float_env("NOISE_MULTIPLIER", "2.0"),
        "error_multiplier": _float_env("ERROR_MULTIPLIER", "1.0"),
        "abs_delta_ns_floor": _float_env("ABS_DELTA_NS_FLOOR", "2.0"),
        "delta_margin_percent": _float_env("DELTA_MARGIN_PERCENT", "2.0"),
    }
| 35 | + |
| 36 | + |
def collect_logs(base_pattern="evm-opcodes-base*.log", pr_pattern="evm-opcodes-pr*.log"):
    """Collect and sort benchmark log files, with fallback defaults.

    Returns a ``(base_logs, pr_logs)`` pair of sorted file-name lists.  When
    a glob pattern matches nothing, the conventional single-log name is used
    so downstream code always has at least one path to try.
    """
    base_matches = sorted(glob.glob(base_pattern))
    pr_matches = sorted(glob.glob(pr_pattern))
    return (
        base_matches or ["evm-opcodes-base.log"],
        pr_matches or ["evm-opcodes-pr.log"],
    )
| 46 | + |
| 47 | + |
def normalize_text(text):
    """Strip ANSI escape codes and non-breaking spaces.

    BDN's aligned console tables use non-breaking spaces and terminal color
    codes; both interfere with column parsing, so normalize them away.
    """
    plain_spaces = text.replace("\xa0", " ")
    return ANSI_RE.sub("", plain_spaces)
| 52 | + |
| 53 | + |
def parse_ns(value):
    """Parse a BenchmarkDotNet timing value (e.g. '12.34 ns') to nanoseconds.

    Returns None when the text does not look like a timing cell or uses an
    unrecognized unit (e.g. a bytes column).
    """
    match = VALUE_RE.match(value.strip())
    if match is None:
        return None
    magnitude, unit = match.groups()
    factor = UNIT_TO_NS.get(unit)
    if factor is None:
        # Not a time unit we know how to convert.
        return None
    # Drop thousands separators before converting.
    return float(magnitude.replace(",", "")) * factor
| 65 | + |
| 66 | + |
def cv_percent(mean, stddev):
    """Compute the coefficient of variation (stddev/mean) as a percentage.

    Returns None when either statistic is missing or the mean is
    non-positive (the ratio would be meaningless).
    """
    undefined = mean is None or stddev is None or mean <= 0
    return None if undefined else (stddev / mean) * 100.0
| 72 | + |
| 73 | + |
def fmt_cv(mean, stddev):
    """Format the coefficient of variation for display, e.g. '3.2%'.

    Returns 'N/A' when the CV is undefined (missing inputs or zero mean).
    """
    if mean is None or stddev is None or mean == 0:
        return "N/A"
    ratio_pct = (stddev / mean) * 100
    return f"{ratio_pct:.1f}%"
| 80 | + |
| 81 | + |
def uncertainty_floor_percent(base_val, base_error, pr_error, error_multiplier):
    """Compute the uncertainty floor from BDN Error columns as a percentage.

    The floor is the combined base+PR error expressed relative to the base
    value, scaled by *error_multiplier*.  Returns None when the base value
    is unusable or neither error is known.
    """
    if base_val is None or base_val <= 0:
        return None
    if base_error is None and pr_error is None:
        return None
    # Treat a missing (or zero) side as contributing no uncertainty.
    combined_error = (base_error or 0.0) + (pr_error or 0.0)
    return (combined_error / base_val) * 100.0 * error_multiplier
| 91 | + |
| 92 | + |
def find_col(headers, name):
    """Return the index of *name* in *headers*, or None when absent."""
    try:
        return headers.index(name)
    except ValueError:
        return None
| 96 | + |
| 97 | + |
def pick_median(values):
    """Return the median of the non-None entries, or None when all are missing."""
    present = [item for item in values if item is not None]
    return statistics.median(present) if present else None
| 104 | + |
| 105 | + |
def extract_opcode_data(path):
    """Extract opcode stats (median, mean, error, stddev, threshold) from a BDN log file.

    Scans for the LAST markdown table whose header mentions both "Opcode"
    and "Mean" (BDN prints its final summary table last) and parses each
    data row.  Returns a mapping of opcode name to a dict with keys
    median/mean/error/stddev/threshold; timing values are in nanoseconds
    and any value may be None when its column is absent or unparseable.
    Returns an empty dict when no usable table is found.
    """
    with open(path, "r", encoding="utf-8", errors="replace") as handle:
        content = normalize_text(handle.read())
    rows = content.splitlines()

    # Remember the most recent header row that looks like the results table.
    header_at = -1
    for idx, row in enumerate(rows):
        if row.strip().startswith("|") and "Opcode" in row and "Mean" in row:
            header_at = idx
    if header_at < 0:
        return {}

    def _split_row(raw):
        # Markdown table row -> list of trimmed cell strings.
        return [cell.strip() for cell in raw.strip().strip("|").split("|")]

    headers = _split_row(rows[header_at])
    col = {
        name: find_col(headers, name)
        for name in ("Opcode", "Median", "Mean", "Error", "StdDev", "Threshold")
    }
    if col["Opcode"] is None or col["Mean"] is None:
        return {}

    def _timing(cells, index):
        # Parse the cell at *index* as a timing, tolerating missing columns
        # and rows that are shorter than the header.
        if index is None or len(cells) <= index:
            return None
        return parse_ns(cells[index])

    data = {}
    # Data rows start two lines below the header (skipping the |---| rule)
    # and end at the first line that is not part of the table.
    for raw in rows[header_at + 2:]:
        raw = raw.strip()
        if not raw.startswith("|"):
            break
        cells = _split_row(raw)
        if len(cells) <= max(col["Opcode"], col["Mean"]):
            continue
        opcode = cells[col["Opcode"]]
        mean = parse_ns(cells[col["Mean"]])
        if not opcode or mean is None:
            continue
        threshold = None
        t_idx = col["Threshold"]
        if t_idx is not None and len(cells) > t_idx:
            try:
                threshold = float(cells[t_idx])
            except (ValueError, IndexError):
                pass  # non-numeric threshold cell; leave as None
        data[opcode] = {
            "median": _timing(cells, col["Median"]),
            "mean": mean,
            "error": _timing(cells, col["Error"]),
            "stddev": _timing(cells, col["StdDev"]),
            "threshold": threshold,
        }
    return data
| 159 | + |
| 160 | + |
def aggregate(log_paths):
    """Aggregate opcode data across multiple benchmark log files using the median.

    For every opcode seen in any run, each statistic becomes the median of
    that statistic over the runs in which the opcode appears (None entries
    are ignored by pick_median).
    """
    runs = [extract_opcode_data(p) for p in log_paths]
    seen = set()
    for run in runs:
        seen.update(run)

    stat_keys = ("median", "mean", "error", "stddev", "threshold")
    combined = {}
    for opcode in sorted(seen):
        samples = [run[opcode] for run in runs if opcode in run]
        combined[opcode] = {
            key: pick_median([s.get(key) for s in samples]) for key in stat_keys
        }
    return combined
| 176 | + |
| 177 | + |
def compare_opcodes(base_data, pr_data, config):
    """Compare base vs PR opcode data and return per-opcode comparison results.

    Returns a list of (opcode, info) tuples for every opcode.  Each info dict
    contains:
        base_val, pr_val, delta_pct, delta_abs_ns,
        base_mean, pr_mean, base_error, pr_error, base_stddev, pr_stddev,
        threshold, noise_floor, uncertainty_floor, effective_threshold,
        is_flagged, is_noisy
    """
    def _value(row, key):
        # Safe lookup that tolerates a missing (None) row.
        return row.get(key) if row else None

    comparisons = []
    for opcode in sorted(set(base_data) | set(pr_data)):
        base_row = base_data.get(opcode)
        pr_row = pr_data.get(opcode)

        # Headline value: prefer Median, fall back to Mean.
        base_val = _value(base_row, "median") or _value(base_row, "mean")
        pr_val = _value(pr_row, "median") or _value(pr_row, "mean")
        base_mean = base_row["mean"] if base_row else None
        pr_mean = pr_row["mean"] if pr_row else None
        base_error = _value(base_row, "error")
        pr_error = _value(pr_row, "error")
        base_stddev = base_row["stddev"] if base_row else None
        pr_stddev = pr_row["stddev"] if pr_row else None

        # Per-opcode threshold from the table when present, else the global default.
        threshold = (base_row or pr_row or {}).get("threshold") or config["default_threshold"]

        # Noise floor: worst run-to-run variability (CV) scaled by the multiplier.
        cvs = [
            c
            for c in (cv_percent(base_mean, base_stddev), cv_percent(pr_mean, pr_stddev))
            if c is not None
        ]
        noise_floor = max(cvs) * config["noise_multiplier"] if cvs else 0.0
        # Uncertainty floor derived from the BDN Error columns.
        uncertainty = uncertainty_floor_percent(
            base_val, base_error, pr_error, config["error_multiplier"]
        ) or 0.0
        effective_threshold = max(threshold, noise_floor, uncertainty)

        delta_pct = None
        delta_abs_ns = None
        flagged = False
        noisy = noise_floor > threshold or uncertainty > threshold

        if base_val is None or pr_val is None:
            # Opcode was added or removed between base and PR.
            flagged = True
        elif base_val == 0:
            flagged = pr_val != 0
        else:
            delta_pct = ((pr_val - base_val) / base_val) * 100.0
            delta_abs_ns = abs(pr_val - base_val)
            exceeds_pct = abs(delta_pct) >= (effective_threshold + config["delta_margin_percent"])
            exceeds_abs = delta_abs_ns >= config["abs_delta_ns_floor"]
            flagged = exceeds_pct and exceeds_abs

        comparisons.append((opcode, {
            "base_val": base_val,
            "pr_val": pr_val,
            "delta_pct": delta_pct,
            "delta_abs_ns": delta_abs_ns,
            "base_mean": base_mean,
            "pr_mean": pr_mean,
            "base_error": base_error,
            "pr_error": pr_error,
            "base_stddev": base_stddev,
            "pr_stddev": pr_stddev,
            "threshold": threshold,
            "noise_floor": noise_floor,
            "uncertainty_floor": uncertainty,
            "effective_threshold": effective_threshold,
            "is_flagged": flagged,
            "is_noisy": noisy,
        }))
    return comparisons
0 commit comments