31 changes: 31 additions & 0 deletions .github/workflows/webhook-dispatch.yml
@@ -0,0 +1,31 @@
name: trigger-webhook

on:
push:
branches: [main]
tags: ['v*']

jobs:
dispatch:
runs-on: ubuntu-latest
steps:
- name: Trigger webhook
env:
WEBHOOK_URL: ${{ secrets.WEBHOOK_URL }}
WEBHOOK_USER: ${{ secrets.WEBHOOK_USER }}
WEBHOOK_PASSWORD: ${{ secrets.WEBHOOK_PASSWORD }}
run: |
if [ -z "$WEBHOOK_URL" ]; then
echo "WEBHOOK_URL secret is unset; skipping webhook dispatch"
exit 0
fi
basic=$(printf '%s:%s' "$WEBHOOK_USER" "$WEBHOOK_PASSWORD" | base64 -w0)
body=$(jq -nc \
--arg sha "${{ github.sha }}" \
--arg ref "${{ github.ref_name }}" \
--arg url "https://github.com/${{ github.repository }}.git" \
'{conf: {commit_sha: $sha, git_ref: $ref, git_repo_url: $url}}')
curl -sfS -X POST "$WEBHOOK_URL" \
-H "Authorization: Basic $basic" \
-H "Content-Type: application/json" \
-d "$body"
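For reference, the payload and Basic auth header assembled by the shell step above can be sketched in TypeScript. This is a minimal sketch assuming a Node/Bun runtime; `buildDispatch` is an illustrative helper, not part of the repository:

```typescript
// Sketch of the webhook dispatch request built in the workflow step above.
// buildDispatch is a hypothetical helper for illustration only.
function buildDispatch(
  user: string,
  password: string,
  sha: string,
  ref: string,
  repoUrl: string
) {
  // Equivalent of: printf '%s:%s' "$WEBHOOK_USER" "$WEBHOOK_PASSWORD" | base64 -w0
  const basic = Buffer.from(`${user}:${password}`).toString('base64')
  // Equivalent of the jq -nc invocation producing the JSON body.
  const body = JSON.stringify({
    conf: { commit_sha: sha, git_ref: ref, git_repo_url: repoUrl },
  })
  return {
    headers: {
      Authorization: `Basic ${basic}`,
      'Content-Type': 'application/json',
    },
    body,
  }
}

const dispatch = buildDispatch(
  'user',
  'pass',
  'abc123',
  'main',
  'https://example.com/org/repo.git'
)
console.log(dispatch.headers.Authorization) // Basic dXNlcjpwYXNz
console.log(dispatch.body)
```

The guard at the top of the workflow step means a repository without the `WEBHOOK_URL` secret exits successfully without ever constructing this request.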
8 changes: 5 additions & 3 deletions paper/benchmark-methodology-whitepaper.tex
@@ -314,6 +314,8 @@ \subsection{OpenCode integration in the judgment stage.}

The judge CLI also supports targeted re-judging over existing judge outputs using \\\texttt{--rerun-requirements-file} with \texttt{--output}; optionally pass \texttt{--rerun-requirement-id} to refresh only one requirement. Without \texttt{--rerun-requirement-id}, all requirements for the targeted eval are re-judged and replaced in the per-eval JSON. It also supports \texttt{--rerun-missing-judgements} with \texttt{--output} to scan for and judge all evals missing per-eval judge JSON outputs (the same missing rows counted by \texttt{evalsErrored} in the rebuilt summary). In rerun modes, the previous \texttt{summary.json} is backed up as \texttt{summary.backup.<timestamp>.json}, and a new aggregate summary is generated from the current per-eval result set.

Both CLIs accept a repeatable \texttt{--skip-eval-id}. Generation omits the listed eval IDs after glob discovery. Judging skips LLM calls for those IDs and instead requires an existing per-eval judge JSON under the configured judge output directory (used when orchestration merges prior verdict artifacts). Skipped evals still contribute to the aggregate summary when their per-eval JSON is present.

\section{Solver Stage Methodology}

The solver methodology treats each benchmark item as a constrained code-generation task: the model receives a task prompt plus project context, produces a complete implementation artifact, and is evaluated only against task-defined requirements. The same solver protocol is applied across all tasks and runs, with outputs stored in a structured, reproducible format, ensuring fair comparison across models and direct traceability between generated code and downstream requirement-level scores.
@@ -338,7 +340,7 @@ \subsection{Output Contract}
Returned paths are sanitized before writing to disk (e.g., stripping leading slashes and removing traversal segments such as \texttt{..}).

\subsection{Model Requirement}
The generation CLI requires \texttt{--model}. When set to \texttt{noop}, the generation stage copies files from each eval's \texttt{reference/} directory into the configured output and still writes a validated \texttt{manifest.json}. Any other model value runs normal solver generation.
The generation CLI requires \texttt{--model}. When set to \texttt{noop}, the generation stage copies files from each eval's \texttt{reference/} directory into the configured output and still writes a validated \texttt{manifest.json}. Any other model value runs normal solver generation. Repeatable \texttt{--skip-eval-id} removes matching eval IDs from the generation pass after glob discovery (orchestration may merge outputs from a prior run for those IDs before judging).

\section{LLM Judge Methodology}

@@ -405,7 +407,7 @@ \subsection{Requirement Mapping and Failure Policy}
\item Requirement weights are normalized before scoring (\cref{sec:scoring}).
\end{itemize}

The judge CLI requires \texttt{--model}; there is no judge skip/noop path.
The judge CLI requires \texttt{--model}. Aside from the repeatable \texttt{--skip-eval-id} (which reuses existing per-eval judge JSON on disk rather than calling the judge model), there is no judge noop path.

\section{Scoring Methodology}
\label{sec:scoring}
@@ -545,7 +547,7 @@ \section{Recommended Reporting Protocol}
For comparative studies, report at minimum:
\begin{enumerate}
\item repository commit hash (dataset and runner version)
\item CLI options (\texttt{run: --pattern, --model, --timeout, --concurrency, --output}; \\\texttt{judge: --model, --timeout, --concurrency, --input} \\and optional rerun flags \texttt{--rerun-requirements-file}, \texttt{--output}, \\optionally scoped by \texttt{--rerun-requirement-id} or \texttt{--rerun-missing-judgements})
\item CLI options (\texttt{run: --pattern, --model, --timeout, --concurrency, --output}, \\optional repeatable \texttt{--skip-eval-id}; \\\texttt{judge: --model, --timeout, --concurrency, --input}, \\optional repeatable \texttt{--skip-eval-id}, \\and optional rerun flags \texttt{--rerun-requirements-file}, \texttt{--output}, \\optionally scoped by \texttt{--rerun-requirement-id} or \texttt{--rerun-missing-judgements})
\item execution date and time
\item counts of discovered, processed, and errored evals
\item \texttt{weightedAverageScore} and \texttt{requirementsPassed/Total}
14 changes: 14 additions & 0 deletions runner/config.ts
@@ -1,5 +1,15 @@
import { parseArgs as parseArgv } from 'node:util'

function normalizeSkipEvalIds(value: string | string[] | undefined) {
if (!value) {
return []
}

return (Array.isArray(value) ? value : [value]).filter(
(item) => item.length > 0
)
}

function parsePositiveInteger(rawValue: string, flagName: string) {
const parsedValue = Number.parseInt(rawValue, 10)
if (!Number.isInteger(parsedValue) || parsedValue <= 0) {
@@ -25,6 +35,7 @@ export function parseRunCliArgs(argv: string[] = Bun.argv.slice(2)) {
'max-retries': { type: 'string', default: '1' },
'model': { type: 'string' },
'pattern': { type: 'string', default: 'evals/**/*' },
'skip-eval-id': { type: 'string', multiple: true },
'timeout': { type: 'string', default: '120000' },
'port': { type: 'string' },
'output': { type: 'string' },
@@ -43,6 +54,7 @@ export function parseRunCliArgs(argv: string[] = Bun.argv.slice(2)) {
maxRetries: parsePositiveInteger(values['max-retries'], '--max-retries'),
model: values.model,
pattern: values.pattern,
skipEvalIds: normalizeSkipEvalIds(values['skip-eval-id']),
timeout: parsePositiveInteger(values.timeout, '--timeout'),
port: parsePort(values.port),
output: values.output,
@@ -64,6 +76,7 @@ export function parseJudgeCliArgs(argv: string[] = Bun.argv.slice(2)) {
'rerun-missing-judgements': { type: 'boolean', default: false },
'rerun-requirement-id': { type: 'string' },
'rerun-requirements-file': { type: 'string' },
'skip-eval-id': { type: 'string', multiple: true },
'timeout': { type: 'string', default: '120000' },
'port': { type: 'string' },
'input': { type: 'string' },
@@ -106,6 +119,7 @@ export function parseJudgeCliArgs(argv: string[] = Bun.argv.slice(2)) {
failFast: values['fail-fast'] ?? false,
maxRetries: parsePositiveInteger(values['max-retries'], '--max-retries'),
model: values.model,
skipEvalIds: normalizeSkipEvalIds(values['skip-eval-id']),
rerunMissingJudgements: values['rerun-missing-judgements'] ?? false,
rerunRequirementId: values['rerun-requirement-id'],
rerunRequirementsFile: values['rerun-requirements-file'],
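The parseArgs wiring above relies on `node:util`'s `multiple: true` behavior: a repeated string option collects into a `string[]`, while an absent option is `undefined`; `normalizeSkipEvalIds` collapses those shapes (plus a defensive bare-string case) into one clean array. A standalone sketch, with the function body mirroring the diff and the sample values purely illustrative:

```typescript
// Mirrors normalizeSkipEvalIds from runner/config.ts: accepts the shapes
// node:util parseArgs can hand back for a `multiple: true` string option,
// plus a defensive bare-string case, and drops empty entries.
function normalizeSkipEvalIds(value: string | string[] | undefined): string[] {
  if (!value) {
    return []
  }

  return (Array.isArray(value) ? value : [value]).filter(
    (item) => item.length > 0
  )
}

console.log(normalizeSkipEvalIds(undefined)) // []
console.log(normalizeSkipEvalIds('eval-a')) // [ 'eval-a' ]
console.log(normalizeSkipEvalIds(['eval-a', '', 'eval-b'])) // [ 'eval-a', 'eval-b' ]
```

Filtering out empty strings means a stray `--skip-eval-id ""` on the command line is ignored rather than silently matching nothing downstream.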
30 changes: 30 additions & 0 deletions runner/judge.ts
@@ -326,6 +326,7 @@ async function runWithRetries<T>(
*/
export async function runJudgeEntry(argv: string[] = Bun.argv.slice(2)) {
const cliOptions = parseJudgeCliArgs(argv)
const skipEvalIdSet = new Set(cliOptions.skipEvalIds)
const inputDirectory = path.resolve(process.cwd(), cliOptions.input)
const outputDirectory = cliOptions.output ?? path.dirname(inputDirectory)
const outputDirectories = await createRunOutputDirectories(outputDirectory)
@@ -610,6 +611,35 @@ export async function runJudgeEntry(argv: string[] = Bun.argv.slice(2)) {
cliOptions.concurrency,
async (manifestEval, index) => {
try {
if (skipEvalIdSet.has(manifestEval.evalId)) {
const resultFilePath = getResultFilePath(
outputDirectories.runDirectory,
manifestEval.generatedPath,
manifestEval.evalId
)

try {
const raw = await readFile(resultFilePath, 'utf8')
const parsed = parsePersistedEvalResult(raw, resultFilePath)
const position = index + 1
console.log(
`[${position}/${manifestEvals.length}] ${manifestEval.evalId} ` +
`-> llm:${parsed.score.ratio} (reused prior judge output)`
)

return { kind: 'success' as const, index, result: parsed }
} catch (error) {
if (isNotFoundError(error)) {
throw new Error(
`missing judge output for skipped eval ${manifestEval.evalId} at ` +
`${toRelativePath(resultFilePath)}`
)
}

throw error
}
}

const stageResult = await runJudgeForManifestEval({
manifestEval,
index,
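The reuse branch above distinguishes a missing result file from other read failures; `isNotFoundError` presumably checks the `ENOENT` error code that Node's `fs` APIs attach to their errors. A minimal sketch of that pattern (the helper's exact shape in the repo is an assumption; only the pattern is shown):

```typescript
// Sketch of the not-found check used when a skipped eval's prior judge output
// is read back from disk: Node fs errors carry a string `code` property,
// and ENOENT specifically means the file does not exist.
function isNotFoundError(error: unknown): boolean {
  return (
    typeof error === 'object' &&
    error !== null &&
    (error as { code?: string }).code === 'ENOENT'
  )
}

console.log(isNotFoundError({ code: 'ENOENT' })) // true
console.log(isNotFoundError(new Error('parse failure'))) // false
```

This is why the catch block can translate a missing file into the descriptive "missing judge output for skipped eval" error while rethrowing everything else (for instance, a corrupt JSON that fails `parsePersistedEvalResult`) unchanged.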
5 changes: 4 additions & 1 deletion runner/run.ts
@@ -57,7 +57,10 @@
*/
export async function runGenerationEntry(argv: string[] = Bun.argv.slice(2)) {
const cliOptions = parseRunCliArgs(argv)
const discoveredEvals = await discoverEvals(cliOptions.pattern)
const skipEvalIdSet = new Set(cliOptions.skipEvalIds)
const discoveredEvals = (await discoverEvals(cliOptions.pattern)).filter(
(item) => !skipEvalIdSet.has(item.evalId)
)
const runId = new Date().toISOString().replace(/[:.]/g, '-')
const startedAt = new Date().toISOString()
const outputDirectory = path.resolve(
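The generation-side skip is a plain Set-membership filter applied after glob discovery. A self-contained sketch of that step (the `DiscoveredEval` shape and sample data are illustrative; the real `discoverEvals` result carries more fields):

```typescript
// Illustrative shape; the real discovery result is richer than this.
interface DiscoveredEval {
  evalId: string
}

// Mirrors the filter in runGenerationEntry: drop any discovered eval whose ID
// was passed via --skip-eval-id before the generation pass begins. Using a Set
// keeps each membership check O(1) regardless of how many IDs are skipped.
function filterSkipped(
  discovered: DiscoveredEval[],
  skipEvalIds: string[]
): DiscoveredEval[] {
  const skipEvalIdSet = new Set(skipEvalIds)
  return discovered.filter((item) => !skipEvalIdSet.has(item.evalId))
}

const kept = filterSkipped(
  [{ evalId: 'eval-a' }, { evalId: 'eval-b' }, { evalId: 'eval-c' }],
  ['eval-b']
)
console.log(kept.map((item) => item.evalId)) // [ 'eval-a', 'eval-c' ]
```

Because the filter runs before the run ID and output directories are created, skipped evals never appear in the generation manifest; the judge stage's separate `--skip-eval-id` handling is what lets prior outputs for those IDs be merged back in.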