Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 57 additions & 7 deletions .github/workflows/test-drift.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,15 @@ jobs:
if: github.event_name != 'pull_request'
runs-on: ubuntu-latest
timeout-minutes: 15
outputs:
# Short, scannable Slack mrkdwn summary of which providers drifted and
# what changed. Consumed by the `notify` job (which runs in a separate
# workspace and so cannot read drift-report.json directly).
summary: ${{ steps.summary.outputs.drift_summary }}
# How many collector runs confirmed the drift (only set when drift
# persisted across every retry). Lets the alert say "confirmed across N
# runs". Empty on the common green/transient path.
runs: ${{ steps.drift.outputs.drift_runs }}
steps:
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
with: { persist-credentials: false }
Expand All @@ -45,6 +54,14 @@ jobs:
cache: pnpm
- run: pnpm install --frozen-lockfile

# Run the collector behind a "retry before alert" wrapper. A single
# critical run can be a transient real-API hiccup (a streaming call
# failing mid-flight), NOT a format change — so the wrapper re-runs the
# collector and only reports critical drift (exit 2) if it PERSISTS
# across every attempt. Any clean retry = transient = exit 0, no alert.
# The common green path stays fast (no extra runs when the first is
# clean). Exit codes mirror the collector contract so the steps below
# are unchanged.
- name: Run drift tests
id: drift
env:
Expand All @@ -53,15 +70,15 @@ jobs:
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
run: |
set +e
npx tsx scripts/drift-report-collector.ts
npx tsx scripts/drift-retry.ts
EXIT_CODE=$?
set -e
echo "exit_code=$EXIT_CODE" >> $GITHUB_OUTPUT
echo "exit_code=$EXIT_CODE" >> "$GITHUB_OUTPUT"
if [ "$EXIT_CODE" -eq 2 ]; then
: # critical drift found, continue
: # critical drift persisted across retries, continue
elif [ "$EXIT_CODE" -ne 0 ]; then
echo "::error::Collector script crashed with exit code $EXIT_CODE"
exit $EXIT_CODE
exit "$EXIT_CODE"
fi

- name: Upload drift report
Expand All @@ -73,6 +90,15 @@ jobs:
if-no-files-found: warn
retention-days: 30

# Distill drift-report.json into a short Slack summary and expose it as a
# job output so the separate `notify` job can include it in the alert.
# Runs before the failure step below so the summary is captured even when
# critical drift is present.
- name: Summarize drift for Slack
id: summary
if: always()
run: npx tsx scripts/drift-slack-summary.ts

- name: Fail if critical drift detected
if: steps.drift.outputs.exit_code == '2'
run: exit 1
Expand Down Expand Up @@ -101,6 +127,8 @@ jobs:
PREV: ${{ steps.prev.outputs.conclusion }}
DRIFT_RESULT: ${{ needs.drift.result }}
AGUI_RESULT: ${{ needs.agui-schema-drift.result }}
DRIFT_SUMMARY: ${{ needs.drift.outputs.summary }}
DRIFT_RUNS: ${{ needs.drift.outputs.runs }}
REPO: ${{ github.repository }}
RUN_ID: ${{ github.run_id }}
run: |
Expand All @@ -125,18 +153,40 @@ jobs:

RUN_URL="<https://github.com/${REPO}/actions/runs/${RUN_ID}|View run>"

# Build the "what drifted" detail block from the drift job's summary
# output. DRIFT_SUMMARY already contains real newlines (one bullet per
# provider); we prepend a real newline so the detail sits on its own
# lines under the headline. jq's --arg encodes these newlines into the
# JSON payload correctly, so Slack renders real line breaks.
# (No literal "\n" in any format() expression — that GitHub Actions
# gotcha renders a visible backslash-n; here we use bash newlines.)
NL=$'\n'
DETAIL=""
if [ -n "$DRIFT_SUMMARY" ]; then
DETAIL="${NL}${DRIFT_SUMMARY}"
fi

# When HTTP API drift fired, it only alerts after the drift PERSISTED
# across every retry of the collector. Note that confirmation so the
# reader knows this was not a one-off transient blip. DRIFT_RUNS is
# the count of runs that confirmed the drift (set by drift-retry.ts).
CONFIRMED=""
if [ "$DRIFT_RESULT" = "failure" ] && [ -n "$DRIFT_RUNS" ] && [ "$DRIFT_RUNS" -gt 1 ] 2>/dev/null; then
CONFIRMED="${NL}_(confirmed across ${DRIFT_RUNS} runs)_"
fi

# Both types of drift
if [ "$HTTP_DRIFT" = "true" ] && [ "$AGUI_DRIFT" = "true" ]; then
EMOJI="🚨"
MSG="*Drift detected* in aimock β€” HTTP API drift + AG-UI schema drift. ${RUN_URL}"
MSG="*Drift detected* in aimock β€” HTTP API drift + AG-UI schema drift.${DETAIL}${CONFIRMED}${NL}${RUN_URL}"
# HTTP API drift only
elif [ "$HTTP_DRIFT" = "true" ]; then
EMOJI="🚨"
MSG="*HTTP API drift detected* in aimock β€” providers changed response formats. ${RUN_URL}"
MSG="*HTTP API drift detected* in aimock β€” providers changed response formats.${DETAIL}${CONFIRMED}${NL}${RUN_URL}"
# AG-UI schema drift only
elif [ "$AGUI_DRIFT" = "true" ]; then
EMOJI="🚨"
MSG="*AG-UI schema drift detected* in aimock β€” canonical ag-ui types changed. ${RUN_URL}"
MSG="*AG-UI schema drift detected* in aimock β€” canonical ag-ui types changed.${DETAIL}${NL}${RUN_URL}"
# Infra failure β€” always notify
elif [ "$INFRA_ERROR" = "true" ]; then
EMOJI="❌"
Expand Down
202 changes: 202 additions & 0 deletions scripts/drift-retry.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
/// <reference types="node" />

/**
* Drift Retry Wrapper — "retry before alert"
*
* Wraps `drift-report-collector.ts` so a TRANSIENT real-API hiccup no longer
* pages the team. Real LLM provider APIs occasionally fail a single streaming
* call mid-flight (e.g. emit `error` + `response.failed` with no terminal
* `response.completed`). That looks identical to a "critical diff" to the
* collector — but it is NOT a format change, and clears on a re-run a moment
* later. Alerting on it produces false alarms.
*
* Policy (matches the owner directive — "if it ran the collector/detector a
* second time and it passed, it should not have emitted the warning"):
*
* - collector exit 0 → no critical drift → SUCCESS, no further runs (fast
* common green path).
* - collector exit 2 → critical drift → retry up to `maxAttempts` total runs
* with a short backoff between attempts. As soon as ANY attempt exits 0
* (no critical drift) → transient → SUCCESS, no alert.
* - Only when EVERY attempt (all `maxAttempts`) shows critical drift do we
* declare a real failure (exit 2) → the `notify` job alerts.
* - collector exit 1 (or any other non-0/2 code) → script/infra crash, NOT
* drift → propagate immediately without retrying (the collector already
* distinguishes infra errors from drift internally; a crash here is a real
* break worth surfacing, and retrying won't help).
*
* Retries hit real provider APIs, so `maxAttempts` is kept small.
*
* The retry decision is implemented as a pure function (`retryUntilStable`)
* with injected collector-runner / sleep / log, so it is unit-testable without
* spawning subprocesses or sleeping. `main()` wires the real collector
* subprocess and emits a `drift_runs` marker via GITHUB_OUTPUT recording how
* many attempts confirmed the drift (so the alert can say "confirmed across N
* runs").
*
* CLI usage (in CI):
* npx tsx scripts/drift-retry.ts [-- <args forwarded to collector>]
*
* Exit codes mirror the collector's contract so downstream YAML logic is
* unchanged: 0 = clean (or transient), 2 = persistent critical drift, other =
* collector crash.
*/

import { spawnSync } from "node:child_process";
import { appendFileSync } from "node:fs";
import { resolve } from "node:path";
import { fileURLToPath } from "node:url";

// Collector exit-code contract (see drift-report-collector.ts header).
/** Collector exit code: the run found no critical drift. */
export const EXIT_CLEAN = 0;
/** Collector exit code: the run found critical drift. */
export const EXIT_CRITICAL_DRIFT = 2;

// Defaults: keep the number of real-API calls small. 3 total attempts with a
// ~45s backoff mirrors the observed transient window (the Fix Drift workflow
// re-ran the collector ~1 minute later and saw 0 critical).
export const DEFAULT_MAX_ATTEMPTS = 3;
export const DEFAULT_BACKOFF_MS = 45_000;

/** Outcome of a single collector run inside the retry loop. */
export interface RetryAttempt {
  /** 1-based attempt number. */
  attempt: number;
  /** The collector process exit code for this attempt. */
  exitCode: number;
}

/** Policy knobs and injected side effects for {@link retryUntilStable}. */
export interface RetryOptions {
  /**
   * Total number of collector runs to attempt before giving up. Values below
   * 1 (or NaN) are treated as 1; fractional values are rounded down.
   */
  maxAttempts: number;
  /** Milliseconds to wait between attempts after a critical run. */
  backoffMs: number;
  /** Runs the collector once and returns its exit code. */
  runCollector: () => number;
  /** Sleeps synchronously for the given milliseconds (injected for tests). */
  sleep: (ms: number) => void;
  /** Logger (injected so tests stay quiet). */
  log: (msg: string) => void;
}

/** Final verdict of the retry loop plus the evidence that led to it. */
export interface RetryResult {
  /** Final exit code to propagate (0 = clean/transient, 2 = persistent, other = crash). */
  exitCode: number;
  /** True when at least one critical run was seen but a later run cleared it. */
  transient: boolean;
  /** Number of attempts that reported critical drift. */
  criticalRuns: number;
  /** Per-attempt record, in order. */
  attempts: RetryAttempt[];
}

/**
 * Core "retry before alert" decision loop. All side effects (running the
 * collector, sleeping, logging) go through the injected callbacks in `opts`,
 * so the function itself spawns nothing and never really sleeps.
 *
 * Behaviour per collector exit code:
 * - `EXIT_CLEAN`: stop and return success; `transient` is true when an earlier
 *   attempt had reported critical drift.
 * - `EXIT_CRITICAL_DRIFT`: count it and retry (after `backoffMs`) while
 *   attempts remain.
 * - anything else: return that code immediately without retrying.
 *
 * The collector is always run at least once: a `maxAttempts` below 1 (or NaN)
 * is treated as 1. Without that guard the loop body would never execute and
 * the function would report persistent critical drift with zero runs behind
 * it, i.e. a false alert.
 *
 * @param opts Retry policy and injected dependencies.
 * @returns The exit code to propagate together with the per-attempt history.
 */
export function retryUntilStable(opts: RetryOptions): RetryResult {
  // Normalise the attempt budget. Flooring matches how many iterations an
  // `attempt <= maxAttempts` loop would run for a fractional value.
  const requestedAttempts = Math.floor(opts.maxAttempts);
  const maxAttempts =
    Number.isNaN(requestedAttempts) || requestedAttempts < 1 ? 1 : requestedAttempts;

  const attempts: RetryAttempt[] = [];
  let criticalRuns = 0;

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    // Reaching a second or later attempt means the previous one was critical
    // (every other outcome returns from inside the loop).
    if (attempt > 1) {
      opts.log(
        `Critical drift on attempt ${attempt - 1}; re-running collector ` +
          `(attempt ${attempt}/${maxAttempts}) after ${opts.backoffMs}ms backoff ` +
          `to confirm it is not a transient API hiccup...`,
      );
      opts.sleep(opts.backoffMs);
    }

    const exitCode = opts.runCollector();
    attempts.push({ attempt, exitCode });

    if (exitCode === EXIT_CLEAN) {
      const transient = criticalRuns > 0;
      if (transient) {
        opts.log(
          `Attempt ${attempt} returned 0 critical — earlier critical drift was ` +
            `transient (cleared on retry). No alert.`,
        );
      }
      return { exitCode: EXIT_CLEAN, transient, criticalRuns, attempts };
    }

    if (exitCode === EXIT_CRITICAL_DRIFT) {
      criticalRuns++;
      continue;
    }

    // Any other code = collector crash / infra error. Do not retry — surface it.
    opts.log(`Collector exited ${exitCode} (not drift) — propagating without retry.`);
    return { exitCode, transient: false, criticalRuns, attempts };
  }

  // Exhausted all attempts and every one showed critical drift → persistent.
  opts.log(
    `Critical drift persisted across all ${maxAttempts} attempts — treating as ` +
      `real drift and alerting.`,
  );
  return {
    exitCode: EXIT_CRITICAL_DRIFT,
    transient: false,
    criticalRuns,
    attempts,
  };
}

// ---------------------------------------------------------------------------
// CLI wiring
// ---------------------------------------------------------------------------

/**
 * Runs `npx tsx scripts/drift-report-collector.ts` once as a synchronous
 * subprocess with inherited stdio and reports how it ended.
 *
 * @param forwardArgs Extra CLI arguments appended to the collector command.
 * @returns The collector's exit status, or 1 when the process could not be
 *   spawned or ended without an exit status (e.g. killed by a signal).
 */
function runCollectorSubprocess(forwardArgs: string[]): number {
  const result = spawnSync("npx", ["tsx", "scripts/drift-report-collector.ts", ...forwardArgs], {
    stdio: "inherit",
    encoding: "utf-8",
  });
  if (result.error) {
    // Failed to even spawn — treat as a crash (non-0/2) so it is not retried.
    console.error(`Failed to run collector: ${result.error.message}`);
    return 1;
  }
  if (result.status === null) {
    // No exit status means the child did not exit normally. Say why before
    // mapping it to the generic crash code, so the CI log shows the cause.
    console.error(`Collector was terminated by signal ${result.signal ?? "unknown"}`);
    return 1;
  }
  return result.status;
}

/**
 * Appends one step output to the file named by the `GITHUB_OUTPUT`
 * environment variable. Does nothing when that variable is unset or empty.
 *
 * Single-line values are written as `name=value`. Values containing a line
 * break are written in the delimiter form (`name<<DELIM`, value, `DELIM`) so
 * that an embedded newline cannot truncate the value or be parsed as an
 * additional output.
 *
 * @param name Output name.
 * @param value Output value.
 */
function writeGithubOutput(name: string, value: string): void {
  const outPath = process.env.GITHUB_OUTPUT;
  if (!outPath) return;

  if (!/[\r\n]/.test(value)) {
    appendFileSync(outPath, `${name}=${value}\n`, "utf-8");
    return;
  }

  // Pick a delimiter that does not occur anywhere in the value, so no line of
  // the value can be mistaken for the terminator.
  let delimiter = "ghadelimiter";
  while (value.includes(delimiter)) {
    delimiter += "_";
  }
  appendFileSync(outPath, `${name}<<${delimiter}\n${value}\n${delimiter}\n`, "utf-8");
}

/**
 * CLI entry point: runs the collector under the retry policy with the default
 * attempt budget and backoff, records the confirming run count as the
 * `drift_runs` output when critical drift persisted, and exits the process
 * with the resulting code.
 */
function main(): void {
  // Everything after a literal `--` belongs to the collector (e.g. --out).
  const cliArgs = process.argv.slice(2);
  const separatorAt = cliArgs.indexOf("--");
  const collectorArgs = separatorAt === -1 ? [] : cliArgs.slice(separatorAt + 1);

  // Block the thread without spinning: wait on a private cell that nothing
  // else touches, so the wait simply runs for `ms` milliseconds. Keeps the CLI
  // fully synchronous.
  const blockFor = (ms: number): void => {
    const cell = new Int32Array(new SharedArrayBuffer(4));
    Atomics.wait(cell, 0, 0, ms);
  };

  const announce = (msg: string): void => {
    console.log(`[drift-retry] ${msg}`);
  };

  const { exitCode, criticalRuns } = retryUntilStable({
    maxAttempts: DEFAULT_MAX_ATTEMPTS,
    backoffMs: DEFAULT_BACKOFF_MS,
    runCollector: () => runCollectorSubprocess(collectorArgs),
    sleep: blockFor,
    log: announce,
  });

  // The run count is only published on the alerting path, where the alert can
  // use it to say "confirmed across N runs".
  if (exitCode === EXIT_CRITICAL_DRIFT) {
    writeGithubOutput("drift_runs", String(criticalRuns));
  }

  process.exit(exitCode);
}

// Start main() only when Node was launched with this very file as its script.
// Importing the module (e.g. from tests) leaves the retry loop untouched.
const launchedScript = process.argv[1];
const isDirectRun = launchedScript
  ? resolve(launchedScript) === fileURLToPath(import.meta.url)
  : false;
if (isDirectRun) {
  main();
}
Loading
Loading