diff --git a/skills/bfts-config-prep/SKILL.md b/skills/bfts-config-prep/SKILL.md
new file mode 100644
index 00000000..abbe4a9c
--- /dev/null
+++ b/skills/bfts-config-prep/SKILL.md
@@ -0,0 +1,29 @@
+---
+name: bfts-config-prep
+description: Prepare a run directory and BFTS config for experiments from an idea JSON + idea.md. Use before running experiment-bfts-runner.
+---
+
+# BFTS Config Prep
+
+## Overview
+Create a run folder with a timestamped name, copy a BFTS config template, and fill in required paths (desc_file, data_dir, log_dir, workspace_dir).
+
+## Workflow
+1. **Ensure idea files exist**
+   - `idea.json` follows `references/idea.schema.json`.
+   - `idea.md` generated by idea-to-markdown.
+2. **Prepare run folder**
+   - `UV_CACHE_DIR=/tmp/uv-cache XDG_CACHE_HOME=/tmp uv run --with pyyaml -s scripts/prep_bfts_config.py --idea-json idea.json --idea-md idea.md --out-root runs`
+
+## Outputs
+- `runs/<timestamp>_<idea_name>/`
+  - `idea.json`, `idea.md`, `bfts_config.yaml`
+  - `data/`, `logs/`, `workspaces/`
+
+## Safeguards
+- Does not modify source idea files.
+- Writes only under `--out-root`.
+
+## References
+- Idea schema: `references/idea.schema.json`
+- BFTS template: `references/bfts_config_template.yaml`
diff --git a/skills/bfts-config-prep/agents/openai.yaml b/skills/bfts-config-prep/agents/openai.yaml
new file mode 100644
index 00000000..4f6c9518
--- /dev/null
+++ b/skills/bfts-config-prep/agents/openai.yaml
@@ -0,0 +1,4 @@
+interface:
+  display_name: "BFTS Config Prep"
+  short_description: "Prepare run dirs + BFTS config"
+  default_prompt: "Create a timestamped run directory with idea.json/idea.md and a configured bfts_config.yaml."
diff --git a/skills/bfts-config-prep/references/bfts_config_template.yaml b/skills/bfts-config-prep/references/bfts_config_template.yaml
new file mode 100644
index 00000000..189b45bb
--- /dev/null
+++ b/skills/bfts-config-prep/references/bfts_config_template.yaml
@@ -0,0 +1,87 @@
+# path to the task data directory
+data_dir: "data"
+preprocess_data: False
+
+goal: null
+eval: null
+
+log_dir: logs
+workspace_dir: workspaces
+
+# whether to copy the data to the workspace directory (otherwise it will be symlinked)
+# copying is recommended to prevent the agent from accidentally modifying the original data
+copy_data: True
+
+exp_name: run # a random experiment name will be generated if not provided
+
+# settings for code execution
+exec:
+  timeout: 3600
+  agent_file_name: runfile.py
+  format_tb_ipython: False
+
+generate_report: True
+# LLM settings for final report from journal
+report:
+  model: gpt-4o-2024-11-20
+  temp: 1.0
+
+experiment:
+  num_syn_datasets: 1
+
+debug:
+  stage4: False
+
+# agent hyperparams
+agent:
+  type: parallel
+  num_workers: 4
+  stages:
+    stage1_max_iters: 20
+    stage2_max_iters: 12
+    stage3_max_iters: 12
+    stage4_max_iters: 18
+  # how many improvement iterations to run
+  steps: 5 # if stage-specific max_iters are not provided, the agent will use this value for all stages
+  # whether to instruct the agent to use CV (set to 1 to disable)
+  k_fold_validation: 1
+  multi_seed_eval:
+    num_seeds: 3 # should be the same as num_workers if num_workers < 3. Otherwise, set it to be 3.
+  # whether to instruct the agent to generate a prediction function
+  expose_prediction: False
+  # whether to provide the agent with a preview of the data
+  data_preview: False
+
+  # LLM settings for coding
+  code:
+    model: anthropic.claude-3-5-sonnet-20241022-v2:0
+    temp: 1.0
+    max_tokens: 12000
+
+  # LLM settings for evaluating program output / tracebacks
+  feedback:
+    model: gpt-4o-2024-11-20
+    # gpt-4o
+    temp: 0.5
+    max_tokens: 8192
+
+  vlm_feedback:
+    model: gpt-4o-2024-11-20
+    temp: 0.5
+    max_tokens: null
+
+  search:
+    max_debug_depth: 3
+    debug_prob: 0.5
+    num_drafts: 3
+
+  # Options for summarizing findings and selecting the best node
+  # If not specified, the default behavior will be used.
+
+  # summary:
+  #   model: gpt-4o
+  #   temp: 0.3
+
+  # select_node:
+  #   model: gpt-4o
+  #   temp: 0.3
diff --git a/skills/bfts-config-prep/references/idea.schema.json b/skills/bfts-config-prep/references/idea.schema.json
new file mode 100644
index 00000000..334899aa
--- /dev/null
+++ b/skills/bfts-config-prep/references/idea.schema.json
@@ -0,0 +1,34 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "AI Scientist Idea",
+  "type": "object",
+  "required": [
+    "Name",
+    "Title",
+    "Short Hypothesis",
+    "Related Work",
+    "Abstract",
+    "Experiments",
+    "Risk Factors and Limitations"
+  ],
+  "properties": {
+    "Name": {"type": "string", "pattern": "^[a-z0-9_-]+$"},
+    "Title": {"type": "string"},
+    "Short Hypothesis": {"type": "string"},
+    "Related Work": {"type": "string"},
+    "Abstract": {"type": "string"},
+    "Experiments": {
+      "oneOf": [
+        {"type": "string"},
+        {"type": "array", "items": {"type": ["string", "object"]}}
+      ]
+    },
+    "Risk Factors and Limitations": {
+      "oneOf": [
+        {"type": "string"},
+        {"type": "array", "items": {"type": "string"}}
+      ]
+    }
+  },
+  "additionalProperties": true
+}
diff --git a/skills/bfts-config-prep/scripts/prep_bfts_config.py b/skills/bfts-config-prep/scripts/prep_bfts_config.py
new file mode 100755
index 00000000..08230859
--- /dev/null
+++ b/skills/bfts-config-prep/scripts/prep_bfts_config.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+"""
+Prepare a BFTS run directory and config from idea JSON + idea.md.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import os
+from datetime import datetime
+from pathlib import Path
+
+try:
+    import yaml  # type: ignore
+except Exception:
+    yaml = None
+
+
+def _load_json(path: Path) -> dict:
+    try:
+        return json.loads(path.read_text(encoding="utf-8"))
+    except FileNotFoundError:
+        raise SystemExit(f"[ERROR] File not found: {path}")
+    except json.JSONDecodeError as e:
+        raise SystemExit(f"[ERROR] Invalid JSON: {path}: {e}")
+
+
+def _extract_idea_name(obj: dict) -> str:
+    if "Name" in obj and isinstance(obj["Name"], str):
+        return obj["Name"].strip()
+    if "idea" in obj and isinstance(obj["idea"], dict) and isinstance(obj["idea"].get("Name"), str):
+        return obj["idea"]["Name"].strip()
+    return "idea"
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="Prepare BFTS run directory and config.")
+    ap.add_argument("--idea-json", required=True, help="Path to idea JSON.")
+    ap.add_argument("--idea-md", required=True, help="Path to idea markdown.")
+    ap.add_argument("--out-root", required=True, help="Root directory for runs.")
+    ap.add_argument(
+        "--config-template",
+        default=None,
+        help="BFTS config template YAML (default: references/bfts_config_template.yaml).",
+    )
+    args = ap.parse_args()
+
+    if yaml is None:
+        raise SystemExit("[ERROR] pyyaml is required. Try: uv run --with pyyaml -s scripts/prep_bfts_config.py --help")
+
+    idea_json = Path(args.idea_json).expanduser().resolve()
+    idea_md = Path(args.idea_md).expanduser().resolve()
+    out_root = Path(args.out_root).expanduser().resolve()
+
+    if not idea_json.exists():
+        raise SystemExit(f"[ERROR] idea JSON not found: {idea_json}")
+    if not idea_md.exists():
+        raise SystemExit(f"[ERROR] idea markdown not found: {idea_md}")
+
+    obj = _load_json(idea_json)
+    if isinstance(obj, list) and obj:
+        name = _extract_idea_name(obj[0] if isinstance(obj[0], dict) else {})
+    elif isinstance(obj, dict):
+        name = _extract_idea_name(obj)
+    else:
+        name = "idea"
+
+    ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    run_dir = out_root / f"{ts}_{name}"
+    run_dir.mkdir(parents=True, exist_ok=True)
+
+    data_dir = run_dir / "data"
+    logs_dir = run_dir / "logs"
+    workspaces_dir = run_dir / "workspaces"
+    for d in (data_dir, logs_dir, workspaces_dir):
+        d.mkdir(parents=True, exist_ok=True)
+
+    # Copy idea files
+    (run_dir / "idea.json").write_text(idea_json.read_text(encoding="utf-8"), encoding="utf-8")
+    (run_dir / "idea.md").write_text(idea_md.read_text(encoding="utf-8"), encoding="utf-8")
+
+    # Load template
+    if args.config_template:
+        tpl = Path(args.config_template).expanduser().resolve()
+    else:
+        tpl = Path(__file__).parent.parent / "references" / "bfts_config_template.yaml"
+    if not tpl.exists():
+        raise SystemExit(f"[ERROR] Config template not found: {tpl}")
+
+    config = yaml.safe_load(tpl.read_text(encoding="utf-8"))
+    if not isinstance(config, dict):
+        raise SystemExit("[ERROR] Invalid config template format.")
+
+    config["desc_file"] = str((run_dir / "idea.md").resolve())
+    config["data_dir"] = str(data_dir)
+    config["log_dir"] = str(logs_dir)
+    config["workspace_dir"] = str(workspaces_dir)
+
+    out_cfg = run_dir / "bfts_config.yaml"
+    out_cfg.write_text(yaml.safe_dump(config, sort_keys=False), encoding="utf-8")
+
+    print(f"[OK] Prepared run directory: {run_dir}")
+    print(f"[OK] Wrote config: {out_cfg}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/skills/citation-harvest/SKILL.md b/skills/citation-harvest/SKILL.md
new file mode 100644
index 00000000..31f18982
--- /dev/null
+++ b/skills/citation-harvest/SKILL.md
@@ -0,0 +1,35 @@
+---
+name: citation-harvest
+description: Query Semantic Scholar to collect citations and generate a deduplicated BibTeX file. Offline by default.
+---
+
+# Citation Harvest
+
+## Overview
+Collect citations from Semantic Scholar using query strings and output a JSON bundle plus a BibTeX file.
+
+## Workflow
+1. Prepare queries (one per line)
+2. Run the harvester
+   ~~~bash
+   UV_CACHE_DIR=/tmp/uv-cache XDG_CACHE_HOME=/tmp uv run -s scripts/citation_harvest.py \
+     --online --in queries.txt --out-json citations.json --out-bib citations.bib
+   ~~~
+
+## Inputs
+- --in: text file with one query per line (optional)
+- --query: repeatable query strings
+- --limit: results per query (default 5)
+- --online: enable network calls (required)
+
+## Outputs
+- citations.json
+- citations.bib
+
+## Safeguards
+- Offline by default; --online required.
+- No uploads; only queries sent to Semantic Scholar.
+- API key must be provided via S2_API_KEY env var if needed.
+
+## References
+- Safeguards: references/safeguards.md
diff --git a/skills/citation-harvest/agents/openai.yaml b/skills/citation-harvest/agents/openai.yaml
new file mode 100644
index 00000000..cf511024
--- /dev/null
+++ b/skills/citation-harvest/agents/openai.yaml
@@ -0,0 +1,4 @@
+interface:
+  display_name: "Citation Harvest"
+  short_description: "Query Semantic Scholar and output deduplicated BibTeX"
+  default_prompt: "Gather citations using provided queries, deduplicate, and produce citations.json and citations.bib."
diff --git a/skills/citation-harvest/references/safeguards.md b/skills/citation-harvest/references/safeguards.md
new file mode 100644
index 00000000..be949e83
--- /dev/null
+++ b/skills/citation-harvest/references/safeguards.md
@@ -0,0 +1,3 @@
+- Do not claim novelty based solely on sparse results.
+- Record query strings and the query date in your notes.
+- Do not upload private data; only send keyword queries.
diff --git a/skills/citation-harvest/scripts/citation_harvest.py b/skills/citation-harvest/scripts/citation_harvest.py
new file mode 100644
index 00000000..5c098838
--- /dev/null
+++ b/skills/citation-harvest/scripts/citation_harvest.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import re
+import sys
+import urllib.parse
+import urllib.request
+from pathlib import Path
+
+S2_API = "https://api.semanticscholar.org/graph/v1/paper/search"
+
+
+def _require_online() -> None:
+    if os.getenv("ASV2_ONLINE") != "1":
+        raise SystemExit("[ERROR] Offline mode. Re-run with --online to allow network calls.")
+
+
+def _load_queries(path: Path | None, queries: list[str]) -> list[str]:
+    items = list(queries)
+    if path:
+        text = path.read_text(encoding="utf-8")
+        for line in text.splitlines():
+            if line.strip():
+                items.append(line.strip())
+    return items
+
+
+def _sanitize_key(text: str) -> str:
+    text = re.sub(r"[^a-zA-Z0-9]+", "_", text.strip().lower())
+    return text.strip("_") or "paper"
+
+
+def _bibtex_entry(paper: dict) -> str:
+    title = paper.get("title", "Untitled")
+    year = paper.get("year", "")
+    authors = paper.get("authors", [])
+    author_str = " and ".join([a.get("name", "") for a in authors if a.get("name")])
+    venue = paper.get("venue", "")
+    key = _sanitize_key(f"{title}_{year}")
+    fields = {
+        "title": title,
+        "author": author_str,
+        "year": str(year),
+        "journal": venue,
+    }
+    body = ",\n".join([f"  {k}={{" + v.replace("{", "").replace("}", "") + "}}" for k, v in fields.items() if v])
+    return f"@article{{{key},\n{body}\n}}"
+
+
+def _fetch(query: str, limit: int, fields: str) -> list[dict]:
+    params = {
+        "query": query,
+        "limit": str(limit),
+        "fields": fields,
+    }
+    url = f"{S2_API}?{urllib.parse.urlencode(params)}"
+    req = urllib.request.Request(url)
+    api_key = os.getenv("S2_API_KEY")
+    if api_key:
+        req.add_header("x-api-key", api_key)
+    with urllib.request.urlopen(req, timeout=60) as resp:
+        data = json.loads(resp.read().decode("utf-8"))
+    return data.get("data", [])
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="Harvest citations from Semantic Scholar.")
+    ap.add_argument("--in", dest="in_path", help="Path to text file with one query per line.")
+    ap.add_argument("--query", action="append", default=[], help="Query string (repeatable).")
+    ap.add_argument("--limit", type=int, default=5, help="Results per query.")
+    ap.add_argument("--out-json", required=True, help="Output JSON file.")
+    ap.add_argument("--out-bib", required=True, help="Output BibTeX file.")
+    ap.add_argument("--online", action="store_true", help="Enable network calls.")
+    args = ap.parse_args()
+
+    if args.online:
+        os.environ["ASV2_ONLINE"] = "1"
+    _require_online()
+
+    in_path = Path(args.in_path) if args.in_path else None
+    queries = _load_queries(in_path, args.query)
+    if not queries:
+        print("[ERROR] No queries provided.", file=sys.stderr)
+        return 2
+
+    all_results: list[dict] = []
+    seen = set()
+    fields = "title,authors,venue,year,externalIds,citationCount,url"
+    for q in queries:
+        for paper in _fetch(q, args.limit, fields):
+            ext = paper.get("externalIds") or {}
+            key = ext.get("DOI") or (paper.get("title", "").lower(), paper.get("year"))
+            if key in seen:
+                continue
+            seen.add(key)
+            paper["query"] = q
+            all_results.append(paper)
+
+    out_json = Path(args.out_json)
+    out_json.write_text(json.dumps({"queries": queries, "results": all_results}, indent=2), encoding="utf-8")
+
+    bib_entries = ["% Generated by citation_harvest.py"]
+    for paper in all_results:
+        bib_entries.append(_bibtex_entry(paper))
+    Path(args.out_bib).write_text("\n\n".join(bib_entries), encoding="utf-8")
+
+    print(f"[OK] Wrote {out_json} and {args.out_bib}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/skills/experiment-bfts-runner/SKILL.md b/skills/experiment-bfts-runner/SKILL.md
new file mode 100644
index 00000000..949dbcd2
--- /dev/null
+++ b/skills/experiment-bfts-runner/SKILL.md
@@ -0,0 +1,42 @@
+---
+name: experiment-bfts-runner
+description: Run the standalone BFTS experiment pipeline to execute multi-agent tree-search experiments from a prepared bfts_config.yaml. Use after idea-to-markdown and bfts-config-prep to produce logs, workspaces, and experiment artifacts.
+---
+
+# Experiment BFTS Runner
+
+## Overview
+Execute the full BFTS tree-search experiment workflow from a prepared bfts_config.yaml, producing logs, workspaces, and per-node experiment results.
+
+## Workflow
+1. Prepare a run directory
+   - Use bfts-config-prep to create runs/<timestamp>_<idea_name>/ with bfts_config.yaml and idea.md.
+2. Run the experiment
+   - Offline default:
+     ~~~bash
+     UV_CACHE_DIR=/tmp/uv-cache XDG_CACHE_HOME=/tmp uv run --with pyyaml,omegaconf,openai,anthropic,backoff,rich,humanize -s scripts/run_bfts.py --config runs/<run>/bfts_config.yaml
+     ~~~
+   - Online (required for LLM calls):
+     ~~~bash
+     uv run -s scripts/run_bfts.py --config runs/<run>/bfts_config.yaml --online
+     ~~~
+3. Inspect outputs
+   - Logs and workspaces are placed under the run directory; use experiment-log-summarizer for summaries.
+
+## Inputs
+- --config: path to bfts_config.yaml.
+- --online: enable network calls to LLM providers (default: offline).
+
+## Outputs
+- logs/ and workspaces/ under the run directory.
+- Per-node experiment results (e.g., experiment_results/ containing experiment_data.npy).
+
+## Safeguards
+- Offline by default; --online required for network calls.
+- Reads only from the run directory; writes only within the run directory and its logs/ and workspaces/ subfolders.
+- No file deletion unless you manually clean outputs.
+
+## References
+- Run manifest schema: references/run.manifest.json
+- Idea schema: references/idea.schema.json
+- Summary schema: references/summary.schema.json
diff --git a/skills/experiment-bfts-runner/agents/openai.yaml b/skills/experiment-bfts-runner/agents/openai.yaml
new file mode 100644
index 00000000..af751f09
--- /dev/null
+++ b/skills/experiment-bfts-runner/agents/openai.yaml
@@ -0,0 +1,4 @@
+interface:
+  display_name: "Experiment BFTS Runner"
+  short_description: "Run standalone BFTS experiments from bfts_config.yaml"
+  default_prompt: "Execute a prepared BFTS run using the provided bfts_config.yaml. Keep outputs within the run directory."
diff --git a/skills/experiment-bfts-runner/references/idea.schema.json b/skills/experiment-bfts-runner/references/idea.schema.json
new file mode 100644
index 00000000..334899aa
--- /dev/null
+++ b/skills/experiment-bfts-runner/references/idea.schema.json
@@ -0,0 +1,34 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "AI Scientist Idea",
+  "type": "object",
+  "required": [
+    "Name",
+    "Title",
+    "Short Hypothesis",
+    "Related Work",
+    "Abstract",
+    "Experiments",
+    "Risk Factors and Limitations"
+  ],
+  "properties": {
+    "Name": {"type": "string", "pattern": "^[a-z0-9_-]+$"},
+    "Title": {"type": "string"},
+    "Short Hypothesis": {"type": "string"},
+    "Related Work": {"type": "string"},
+    "Abstract": {"type": "string"},
+    "Experiments": {
+      "oneOf": [
+        {"type": "string"},
+        {"type": "array", "items": {"type": ["string", "object"]}}
+      ]
+    },
+    "Risk Factors and Limitations": {
+      "oneOf": [
+        {"type": "string"},
+        {"type": "array", "items": {"type": "string"}}
+      ]
+    }
+  },
+  "additionalProperties": true
+}
diff --git a/skills/experiment-bfts-runner/references/run.manifest.json b/skills/experiment-bfts-runner/references/run.manifest.json
new file mode 100644
index 00000000..243d32fd
--- /dev/null
+++ b/skills/experiment-bfts-runner/references/run.manifest.json
@@ -0,0 +1,48 @@
+{
+  "title": "RunManifest",
+  "type": "object",
+  "required": [
+    "run_dir",
+    "idea_md",
+    "idea_json",
+    "bfts_config",
+    "logs_dir",
+    "workspaces_dir"
+  ],
+  "properties": {
+    "run_dir": {
+      "type": "string"
+    },
+    "idea_md": {
+      "type": "string"
+    },
+    "idea_json": {
+      "type": "string"
+    },
+    "bfts_config": {
+      "type": "string"
+    },
+    "logs_dir": {
+      "type": "string"
+    },
+    "workspaces_dir": {
+      "type": "string"
+    },
+    "outputs": {
+      "type": "object",
+      "properties": {
+        "experiment_results": {
+          "type": "string"
+        },
+        "summary_json": {
+          "type": "string"
+        },
+        "summary_md": {
+          "type": "string"
+        }
+      },
+      "additionalProperties": true
+    }
+  },
+  "additionalProperties": true
+}
\ No newline at end of file
diff --git a/skills/experiment-bfts-runner/references/summary.schema.json b/skills/experiment-bfts-runner/references/summary.schema.json
new file mode 100644
index 00000000..18a340f6
--- /dev/null
+++ b/skills/experiment-bfts-runner/references/summary.schema.json
@@ -0,0 +1,47 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "ExperimentSummary",
+  "type": "object",
+  "required": [
+    "Experiment_description",
+    "Significance",
+    "Description",
+    "List_of_included_plots",
+    "Key_numerical_results"
+  ],
+  "properties": {
+    "Experiment_description": {
+      "type": "string"
+    },
+    "Significance": {
+      "type": "string"
+    },
+    "Description": {
+      "type": "string"
+    },
+    "List_of_included_plots": {
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    },
+    "Key_numerical_results": {
+      "oneOf": [
+        {
+          "type": "array",
+          "items": {
+            "type": [
+              "string",
+              "object",
+              "number"
+            ]
+          }
+        },
+        {
+          "type": "object"
+        }
+      ]
+    }
+  },
+  "additionalProperties": true
+}
\ No newline at end of file
diff --git a/skills/experiment-bfts-runner/scripts/asv2/__init__.py b/skills/experiment-bfts-runner/scripts/asv2/__init__.py
new file mode 100644
index 00000000..180974ab
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/__init__.py
@@ -0,0 +1 @@
+# asv2 package for experiment-bfts-runner
diff --git a/skills/experiment-bfts-runner/scripts/asv2/llm.py b/skills/experiment-bfts-runner/scripts/asv2/llm.py
new file mode 100644
index 00000000..7ef3a59d
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/llm.py
@@ -0,0 +1,554 @@
+import json
+import os
+import re
+from typing import Any
+from asv2.token_tracker import track_token_usage
+
+import anthropic
+import backoff
+import openai
+
+MAX_NUM_TOKENS = 4096
+ONLINE_ENV_VAR = "ASV2_ONLINE"
+
+AVAILABLE_LLMS = [
+    "claude-3-5-sonnet-20240620",
+    "claude-3-5-sonnet-20241022",
+    # OpenAI models
+    "gpt-4o-mini",
+    "gpt-4o-mini-2024-07-18",
+    "gpt-4o",
+    "gpt-4o-2024-05-13",
+    "gpt-4o-2024-08-06",
+    "gpt-4.1",
+    "gpt-4.1-2025-04-14",
+    "gpt-4.1-mini",
+    "gpt-4.1-mini-2025-04-14",
+    "o1",
+    "o1-2024-12-17",
+    "o1-preview-2024-09-12",
+    "o1-mini",
+    "o1-mini-2024-09-12",
+    "o3-mini",
+    "o3-mini-2025-01-31",
+    # DeepSeek Models
+    "deepseek-coder-v2-0724",
+    "deepcoder-14b",
+    # Llama 3 models
+    "llama3.1-405b",
+    # Anthropic Claude models via Amazon Bedrock
+    "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
+    "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0",
+    "bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0",
+    "bedrock/anthropic.claude-3-haiku-20240307-v1:0",
+    "bedrock/anthropic.claude-3-opus-20240229-v1:0",
+    # Anthropic Claude models Vertex AI
+    "vertex_ai/claude-3-opus@20240229",
+    "vertex_ai/claude-3-5-sonnet@20240620",
+    "vertex_ai/claude-3-5-sonnet@20241022",
+    "vertex_ai/claude-3-sonnet@20240229",
+    "vertex_ai/claude-3-haiku@20240307",
+    # Google Gemini models
+    "gemini-2.0-flash",
+    "gemini-2.5-flash-preview-04-17",
+    "gemini-2.5-pro-preview-03-25",
+    # GPT-OSS models via Ollama
+    "ollama/gpt-oss:20b",
+    "ollama/gpt-oss:120b",
+    # Qwen models via Ollama
+    "ollama/qwen3:8b",
+    "ollama/qwen3:32b",
+    "ollama/qwen3:235b",
+
+    "ollama/qwen2.5vl:8b",
+    "ollama/qwen2.5vl:32b",
+
+    "ollama/qwen3-coder:70b",
+    "ollama/qwen3-coder:480b",
+
+    # Deepseek models via Ollama
+    "ollama/deepseek-r1:8b",
+    "ollama/deepseek-r1:32b",
+    "ollama/deepseek-r1:70b",
+    "ollama/deepseek-r1:671b",
+]
+
+
+def _require_online() -> None:
+    if os.getenv(ONLINE_ENV_VAR) != "1":
+        raise RuntimeError(
+            "Offline mode: set ASV2_ONLINE=1 or pass --online to the runner to allow network calls."
+        )
+
+
+# Get N responses from a single message, used for ensembling.
+@backoff.on_exception(
+    backoff.expo,
+    (
+        openai.RateLimitError,
+        openai.APITimeoutError,
+        openai.InternalServerError,
+        anthropic.RateLimitError,
+    ),
+)
+@track_token_usage
+def get_batch_responses_from_llm(
+    prompt,
+    client,
+    model,
+    system_message,
+    print_debug=False,
+    msg_history=None,
+    temperature=0.7,
+    n_responses=1,
+) -> tuple[list[str], list[list[dict[str, Any]]]]:
+    _require_online()
+    msg = prompt
+    if msg_history is None:
+        msg_history = []
+
+    if model.startswith("ollama/"):
+        new_msg_history = msg_history + [{"role": "user", "content": msg}]
+        response = client.chat.completions.create(
+            model=model.replace("ollama/", ""),
+            messages=[
+                {"role": "system", "content": system_message},
+                *new_msg_history,
+            ],
+            temperature=temperature,
+            max_tokens=MAX_NUM_TOKENS,
+            n=n_responses,
+            stop=None,
+        )
+        content = [r.message.content for r in response.choices]
+        new_msg_history = [
+            new_msg_history + [{"role": "assistant", "content": c}] for c in content
+        ]
+    elif "gpt" in model:
+        new_msg_history = msg_history + [{"role": "user", "content": msg}]
+        response = client.chat.completions.create(
+            model=model,
+            messages=[
+                {"role": "system", "content": system_message},
+                *new_msg_history,
+            ],
+            temperature=temperature,
+            max_tokens=MAX_NUM_TOKENS,
+            n=n_responses,
+            stop=None,
+            seed=0,
+        )
+        content = [r.message.content for r in response.choices]
+        new_msg_history = [
+            new_msg_history + [{"role": "assistant", "content": c}] for c in content
+        ]
+    elif model == "deepseek-coder-v2-0724":
+        new_msg_history = msg_history + [{"role": "user", "content": msg}]
+        response = client.chat.completions.create(
+            model="deepseek-coder",
+            messages=[
+                {"role": "system", "content": system_message},
+                *new_msg_history,
+            ],
+            temperature=temperature,
+            max_tokens=MAX_NUM_TOKENS,
+            n=n_responses,
+            stop=None,
+        )
+        content = [r.message.content for r in response.choices]
+        new_msg_history = [
+            new_msg_history + [{"role": "assistant", "content": c}] for c in content
+        ]
+    elif model == "llama-3-1-405b-instruct":
+        new_msg_history = msg_history + [{"role": "user", "content": msg}]
+        response = client.chat.completions.create(
+            model="meta-llama/llama-3.1-405b-instruct",
+            messages=[
+                {"role": "system", "content": system_message},
+                *new_msg_history,
+            ],
+            temperature=temperature,
+            max_tokens=MAX_NUM_TOKENS,
+            n=n_responses,
+            stop=None,
+        )
+        content = [r.message.content for r in response.choices]
+        new_msg_history = [
+            new_msg_history + [{"role": "assistant", "content": c}] for c in content
+        ]
+    elif 'gemini' in model:
+        new_msg_history = msg_history + [{"role": "user", "content": msg}]
+        response = client.chat.completions.create(
+            model=model,
+            messages=[
+                {"role": "system", "content": system_message},
+                *new_msg_history,
+            ],
+            temperature=temperature,
+            max_tokens=MAX_NUM_TOKENS,
+            n=n_responses,
+            stop=None,
+        )
+        content = [r.message.content for r in response.choices]
+        new_msg_history = [
+            new_msg_history + [{"role": "assistant", "content": c}] for c in content
+        ]
+    else:
+        content, new_msg_history = [], []
+        for _ in range(n_responses):
+            c, hist = get_response_from_llm(
+                msg,
+                client,
+                model,
+                system_message,
+                print_debug=False,
+                msg_history=None,
+                temperature=temperature,
+            )
+            content.append(c)
+            new_msg_history.append(hist)
+
+    if print_debug:
+        # Just print the first one.
+        print()
+        print("*" * 20 + " LLM START " + "*" * 20)
+        for j, msg in enumerate(new_msg_history[0]):
+            print(f'{j}, {msg["role"]}: {msg["content"]}')
+        print(content)
+        print("*" * 21 + " LLM END " + "*" * 21)
+        print()
+
+    return content, new_msg_history
+
+
+@track_token_usage
+def make_llm_call(client, model, temperature, system_message, prompt):
+    if model.startswith("ollama/"):
+        return client.chat.completions.create(
+            model=model.replace("ollama/", ""),
+            messages=[
+                {"role": "system", "content": system_message},
+                *prompt,
+            ],
+            temperature=temperature,
+            max_tokens=MAX_NUM_TOKENS,
+            n=1,
+            stop=None,
+        )
+    elif "gpt" in model:
+        return client.chat.completions.create(
+            model=model,
+            messages=[
+                {"role": "system", "content": system_message},
+                *prompt,
+            ],
+            temperature=temperature,
+            max_tokens=MAX_NUM_TOKENS,
+            n=1,
+            stop=None,
+            seed=0,
+        )
+    elif "o1" in model or "o3" in model:
+        return client.chat.completions.create(
+            model=model,
+            messages=[
+                {"role": "user", "content": system_message},
+                *prompt,
+            ],
+            temperature=1,
+            n=1,
+            seed=0,
+        )
+    
+    else:
+        raise ValueError(f"Model {model} not supported.")
+
+
+@backoff.on_exception(
+    backoff.expo,
+    (
+        openai.RateLimitError,
+        openai.APITimeoutError,
+        openai.InternalServerError,
+        anthropic.RateLimitError,
+    ),
+)
+def get_response_from_llm(
+    prompt,
+    client,
+    model,
+    system_message,
+    print_debug=False,
+    msg_history=None,
+    temperature=0.7,
+) -> tuple[str, list[dict[str, Any]]]:
+    _require_online()
+    msg = prompt
+    if msg_history is None:
+        msg_history = []
+
+    if "claude" in model:
+        new_msg_history = msg_history + [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": msg,
+                    }
+                ],
+            }
+        ]
+        response = client.messages.create(
+            model=model,
+            max_tokens=MAX_NUM_TOKENS,
+            temperature=temperature,
+            system=system_message,
+            messages=new_msg_history,
+        )
+        # response = make_llm_call(client, model, temperature, system_message=system_message, prompt=new_msg_history)
+        content = response.content[0].text
+        new_msg_history = new_msg_history + [
+            {
+                "role": "assistant",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": content,
+                    }
+                ],
+            }
+        ]
+    elif model.startswith("ollama/"):
+        new_msg_history = msg_history + [{"role": "user", "content": msg}]
+        response = client.chat.completions.create(
+            model=model.replace("ollama/", ""),
+            messages=[
+                {"role": "system", "content": system_message},
+                *new_msg_history,
+            ],
+            temperature=temperature,
+            max_tokens=MAX_NUM_TOKENS,
+            n=1,
+            stop=None,
+        )
+        content = response.choices[0].message.content
+        new_msg_history = new_msg_history + [{"role": "assistant", "content": content}]
+    elif "gpt" in model:
+        new_msg_history = msg_history + [{"role": "user", "content": msg}]
+        response = make_llm_call(
+            client,
+            model,
+            temperature,
+            system_message=system_message,
+            prompt=new_msg_history,
+        )
+        content = response.choices[0].message.content
+        new_msg_history = new_msg_history + [{"role": "assistant", "content": content}]
+    elif "o1" in model or "o3" in model:
+        new_msg_history = msg_history + [{"role": "user", "content": msg}]
+        response = make_llm_call(
+            client,
+            model,
+            temperature,
+            system_message=system_message,
+            prompt=new_msg_history,
+        )
+        content = response.choices[0].message.content
+        new_msg_history = new_msg_history + [{"role": "assistant", "content": content}]
+    elif model == "deepseek-coder-v2-0724":
+        new_msg_history = msg_history + [{"role": "user", "content": msg}]
+        response = client.chat.completions.create(
+            model="deepseek-coder",
+            messages=[
+                {"role": "system", "content": system_message},
+                *new_msg_history,
+            ],
+            temperature=temperature,
+            max_tokens=MAX_NUM_TOKENS,
+            n=1,
+            stop=None,
+        )
+        content = response.choices[0].message.content
+        new_msg_history = new_msg_history + [{"role": "assistant", "content": content}]
+    elif model == "deepcoder-14b":
+        new_msg_history = msg_history + [{"role": "user", "content": msg}]
+        try:
+            response = client.chat.completions.create(
+                model="agentica-org/DeepCoder-14B-Preview",
+                messages=[
+                    {"role": "system", "content": system_message},
+                    *new_msg_history,
+                ],
+                temperature=temperature,
+                max_tokens=MAX_NUM_TOKENS,
+                n=1,
+                stop=None,
+            )
+            content = response.choices[0].message.content
+        except Exception as e:
+            # Fallback to direct API call if OpenAI client doesn't work with HuggingFace
+            import requests
+            headers = {
+                "Authorization": f"Bearer {os.environ['HUGGINGFACE_API_KEY']}",
+                "Content-Type": "application/json"
+            }
+            payload = {
+                "inputs": {
+                    "system": system_message,
+                    "messages": [{"role": m["role"], "content": m["content"]} for m in new_msg_history]
+                },
+                "parameters": {
+                    "temperature": temperature,
+                    "max_new_tokens": MAX_NUM_TOKENS,
+                    "return_full_text": False
+                }
+            }
+            response = requests.post(
+                "https://api-inference.huggingface.co/models/agentica-org/DeepCoder-14B-Preview",
+                headers=headers,
+                json=payload
+            )
+            if response.status_code == 200:
+                content = response.json()["generated_text"]
+            else:
+                raise ValueError(f"Error from HuggingFace API: {response.text}")
+
+        new_msg_history = new_msg_history + [{"role": "assistant", "content": content}]
+    elif model in ["meta-llama/llama-3.1-405b-instruct", "llama-3-1-405b-instruct"]:
+        new_msg_history = msg_history + [{"role": "user", "content": msg}]
+        response = client.chat.completions.create(
+            model="meta-llama/llama-3.1-405b-instruct",
+            messages=[
+                {"role": "system", "content": system_message},
+                *new_msg_history,
+            ],
+            temperature=temperature,
+            max_tokens=MAX_NUM_TOKENS,
+            n=1,
+            stop=None,
+        )
+        content = response.choices[0].message.content
+        new_msg_history = new_msg_history + [{"role": "assistant", "content": content}]
+    elif 'gemini' in model:
+        new_msg_history = msg_history + [{"role": "user", "content": msg}]
+        response = client.chat.completions.create(
+            model=model,
+            messages=[
+                {"role": "system", "content": system_message},
+                *new_msg_history,
+            ],
+            temperature=temperature,
+            max_tokens=MAX_NUM_TOKENS,
+            n=1,
+        )
+        content = response.choices[0].message.content
+        new_msg_history = new_msg_history + [{"role": "assistant", "content": content}]
+    else:
+        raise ValueError(f"Model {model} not supported.")
+
+    if print_debug:
+        print()
+        print("*" * 20 + " LLM START " + "*" * 20)
+        for j, msg in enumerate(new_msg_history):
+            print(f'{j}, {msg["role"]}: {msg["content"]}')
+        print(content)
+        print("*" * 21 + " LLM END " + "*" * 21)
+        print()
+
+    return content, new_msg_history
+
+
+def extract_json_between_markers(llm_output: str) -> dict | None: 
+    # Regular expression pattern to find JSON content between ```json and ```
+    json_pattern = r"```json(.*?)```"
+    matches = re.findall(json_pattern, llm_output, re.DOTALL)
+
+    if not matches:
+        # Fallback: Try to find any JSON-like content in the output
+        json_pattern = r"\{.*?\}"
+        matches = re.findall(json_pattern, llm_output, re.DOTALL)
+
+    for json_string in matches:
+        json_string = json_string.strip()
+        try:
+            parsed_json = json.loads(json_string)
+            return parsed_json
+        except json.JSONDecodeError:
+            # Attempt to fix common JSON issues
+            try:
+                # Remove invalid control characters
+                json_string_clean = re.sub(r"[\x00-\x1F\x7F]", "", json_string)
+                parsed_json = json.loads(json_string_clean)
+                return parsed_json
+            except json.JSONDecodeError:
+                continue  # Try next match
+
+    return None  # No valid JSON found
+
+
+def create_client(model) -> tuple[Any, str]:
+    if model.startswith("claude-"):
+        print(f"Using Anthropic API with model {model}.")
+        return anthropic.Anthropic(), model
+    elif model.startswith("bedrock") and "claude" in model:
+        client_model = model.split("/")[-1]
+        print(f"Using Amazon Bedrock with model {client_model}.")
+        return anthropic.AnthropicBedrock(), client_model
+    elif model.startswith("vertex_ai") and "claude" in model:
+        client_model = model.split("/")[-1]
+        print(f"Using Vertex AI with model {client_model}.")
+        return anthropic.AnthropicVertex(), client_model
+    elif model.startswith("ollama/"):
+        print(f"Using Ollama with model {model}.")
+        return openai.OpenAI(
+            api_key=os.environ.get("OLLAMA_API_KEY", ""),
+            base_url="http://localhost:11434/v1",
+        ), model
+    elif "gpt" in model:
+        print(f"Using OpenAI API with model {model}.")
+        return openai.OpenAI(), model
+    elif "o1" in model or "o3" in model:
+        print(f"Using OpenAI API with model {model}.")
+        return openai.OpenAI(), model
+    elif model == "deepseek-coder-v2-0724":
+        print(f"Using OpenAI API with {model}.")
+        return (
+            openai.OpenAI(
+                api_key=os.environ["DEEPSEEK_API_KEY"],
+                base_url="https://api.deepseek.com",
+            ),
+            model,
+        )
+    elif model == "deepcoder-14b":
+        print(f"Using HuggingFace API with {model}.")
+        # Using OpenAI client with HuggingFace API
+        if "HUGGINGFACE_API_KEY" not in os.environ:
+            raise ValueError("HUGGINGFACE_API_KEY environment variable not set")
+        return (
+            openai.OpenAI(
+                api_key=os.environ["HUGGINGFACE_API_KEY"],
+                base_url="https://api-inference.huggingface.co/models/agentica-org/DeepCoder-14B-Preview",
+            ),
+            model,
+        )
+    elif model == "llama3.1-405b":
+        print(f"Using OpenAI API with {model}.")
+        return (
+            openai.OpenAI(
+                api_key=os.environ["OPENROUTER_API_KEY"],
+                base_url="https://openrouter.ai/api/v1",
+            ),
+            "meta-llama/llama-3.1-405b-instruct",
+        )
+    elif 'gemini' in model:
+        print(f"Using OpenAI API with {model}.")
+        return (
+            openai.OpenAI(
+                api_key=os.environ["GEMINI_API_KEY"],
+                base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
+            ),
+            model,
+        )
+    else:
+        raise ValueError(f"Model {model} not supported.")
diff --git a/skills/experiment-bfts-runner/scripts/asv2/token_tracker.py b/skills/experiment-bfts-runner/scripts/asv2/token_tracker.py
new file mode 100644
index 00000000..58050243
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/token_tracker.py
@@ -0,0 +1,222 @@
+from functools import wraps
+from typing import Dict, Optional, List
+import tiktoken
+from collections import defaultdict
+import asyncio
+from datetime import datetime
+import logging
+
+
+class TokenTracker:
+    def __init__(self):
+        """
+        Token counts for prompt, completion, reasoning, and cached.
+        Reasoning tokens are included in completion tokens.
+        Cached tokens are included in prompt tokens.
+        Also tracks prompts, responses, and timestamps.
+        We assume we get these from the LLM response, and we don't count
+        the tokens by ourselves.
+        """
+        self.token_counts = defaultdict(
+            lambda: {"prompt": 0, "completion": 0, "reasoning": 0, "cached": 0}
+        )
+        self.interactions = defaultdict(list)
+
+        self.MODEL_PRICES = {
+            "gpt-4o-2024-11-20": {
+                "prompt": 2.5 / 1000000,  # $2.50 per 1M tokens
+                "cached": 1.25 / 1000000,  # $1.25 per 1M tokens
+                "completion": 10 / 1000000,  # $10.00 per 1M tokens
+            },
+            "gpt-4o-2024-08-06": {
+                "prompt": 2.5 / 1000000,  # $2.50 per 1M tokens
+                "cached": 1.25 / 1000000,  # $1.25 per 1M tokens
+                "completion": 10 / 1000000,  # $10.00 per 1M tokens
+            },
+            "gpt-4o-2024-05-13": {  # this ver does not support cached tokens
+                "prompt": 5.0 / 1000000,  # $5.00 per 1M tokens
+                "completion": 15 / 1000000,  # $15.00 per 1M tokens
+            },
+            "gpt-4o-mini-2024-07-18": {
+                "prompt": 0.15 / 1000000,  # $0.15 per 1M tokens
+                "cached": 0.075 / 1000000,  # $0.075 per 1M tokens
+                "completion": 0.6 / 1000000,  # $0.60 per 1M tokens
+            },
+            "o1-2024-12-17": {
+                "prompt": 15 / 1000000,  # $15.00 per 1M tokens
+                "cached": 7.5 / 1000000,  # $7.50 per 1M tokens
+                "completion": 60 / 1000000,  # $60.00 per 1M tokens
+            },
+            "o1-preview-2024-09-12": {
+                "prompt": 15 / 1000000,  # $15.00 per 1M tokens
+                "cached": 7.5 / 1000000,  # $7.50 per 1M tokens
+                "completion": 60 / 1000000,  # $60.00 per 1M tokens
+            },
+            "o3-mini-2025-01-31": {
+                "prompt": 1.1 / 1000000,  # $1.10 per 1M tokens
+                "cached": 0.55 / 1000000,  # $0.55 per 1M tokens
+                "completion": 4.4 / 1000000,  # $4.40 per 1M tokens
+            },
+        }
+
+    def add_tokens(
+        self,
+        model: str,
+        prompt_tokens: int,
+        completion_tokens: int,
+        reasoning_tokens: int,
+        cached_tokens: int,
+    ):
+        self.token_counts[model]["prompt"] += prompt_tokens
+        self.token_counts[model]["completion"] += completion_tokens
+        self.token_counts[model]["reasoning"] += reasoning_tokens
+        self.token_counts[model]["cached"] += cached_tokens
+
+    def add_interaction(
+        self,
+        model: str,
+        system_message: str,
+        prompt: str,
+        response: str,
+        timestamp: datetime,
+    ):
+        """Record a single interaction with the model."""
+        self.interactions[model].append(
+            {
+                "system_message": system_message,
+                "prompt": prompt,
+                "response": response,
+                "timestamp": timestamp,
+            }
+        )
+
+    def get_interactions(self, model: Optional[str] = None) -> Dict[str, List[Dict]]:
+        """Get all interactions, optionally filtered by model."""
+        if model:
+            return {model: self.interactions[model]}
+        return dict(self.interactions)
+
+    def reset(self):
+        """Reset all token counts and interactions."""
+        self.token_counts = defaultdict(
+            lambda: {"prompt": 0, "completion": 0, "reasoning": 0, "cached": 0}
+        )
+        self.interactions = defaultdict(list)
+        # self._encoders = {}
+
+    def calculate_cost(self, model: str) -> float:
+        """Calculate the cost for a specific model based on token usage."""
+        if model not in self.MODEL_PRICES:
+            logging.warning(f"Price information not available for model {model}")
+            return 0.0
+
+        prices = self.MODEL_PRICES[model]
+        tokens = self.token_counts[model]
+
+        # Calculate cost for prompt and completion tokens
+        if "cached" in prices:
+            prompt_cost = (tokens["prompt"] - tokens["cached"]) * prices["prompt"]
+            cached_cost = tokens["cached"] * prices["cached"]
+        else:
+            prompt_cost = tokens["prompt"] * prices["prompt"]
+            cached_cost = 0
+        completion_cost = tokens["completion"] * prices["completion"]
+
+        return prompt_cost + cached_cost + completion_cost
+
+    def get_summary(self) -> Dict[str, Dict[str, int]]:
+        # return dict(self.token_counts)
+        """Get summary of token usage and costs for all models."""
+        summary = {}
+        for model, tokens in self.token_counts.items():
+            summary[model] = {
+                "tokens": tokens.copy(),
+                "cost (USD)": self.calculate_cost(model),
+            }
+        return summary
+
+
+# Global token tracker instance
+token_tracker = TokenTracker()
+
+
+def track_token_usage(func):
+    @wraps(func)
+    async def async_wrapper(*args, **kwargs):
+        prompt = kwargs.get("prompt")
+        system_message = kwargs.get("system_message")
+        if not prompt and not system_message:
+            raise ValueError(
+                "Either 'prompt' or 'system_message' must be provided for token tracking"
+            )
+
+        logging.info("args: ", args)
+        logging.info("kwargs: ", kwargs)
+
+        result = await func(*args, **kwargs)
+        model = result.model
+        timestamp = result.created
+
+        if hasattr(result, "usage") and result.usage.completion_tokens_details is not None:
+            token_tracker.add_tokens(
+                model,
+                result.usage.prompt_tokens,
+                result.usage.completion_tokens,
+                result.usage.completion_tokens_details.reasoning_tokens,
+                (
+                    result.usage.prompt_tokens_details.cached_tokens
+                    if hasattr(result.usage, "prompt_tokens_details")
+                    else 0
+                ),
+            )
+            # Add interaction details
+            token_tracker.add_interaction(
+                model,
+                system_message,
+                prompt,
+                result.choices[
+                    0
+                ].message.content,  # Assumes response is in content field
+                timestamp,
+            )
+        return result
+
+    @wraps(func)
+    def sync_wrapper(*args, **kwargs):
+        prompt = kwargs.get("prompt")
+        system_message = kwargs.get("system_message")
+        if not prompt and not system_message:
+            raise ValueError(
+                "Either 'prompt' or 'system_message' must be provided for token tracking"
+            )
+        result = func(*args, **kwargs)
+        model = result.model
+        timestamp = result.created
+        logging.info("args: ", args)
+        logging.info("kwargs: ", kwargs)
+
+        if hasattr(result, "usage") and result.usage.completion_tokens_details is not None:
+            token_tracker.add_tokens(
+                model,
+                result.usage.prompt_tokens,
+                result.usage.completion_tokens,
+                result.usage.completion_tokens_details.reasoning_tokens,
+                (
+                    result.usage.prompt_tokens_details.cached_tokens
+                    if hasattr(result.usage, "prompt_tokens_details")
+                    else 0
+                ),
+            )
+            # Add interaction details
+            token_tracker.add_interaction(
+                model,
+                system_message,
+                prompt,
+                result.choices[
+                    0
+                ].message.content,  # Assumes response is in content field
+                timestamp,
+            )
+        return result
+
+    return async_wrapper if asyncio.iscoroutinefunction(func) else sync_wrapper
diff --git a/skills/experiment-bfts-runner/scripts/asv2/treesearch/__init__.py b/skills/experiment-bfts-runner/scripts/asv2/treesearch/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/skills/experiment-bfts-runner/scripts/asv2/treesearch/agent_manager.py b/skills/experiment-bfts-runner/scripts/asv2/treesearch/agent_manager.py
new file mode 100644
index 00000000..a81a31f9
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/treesearch/agent_manager.py
@@ -0,0 +1,1221 @@
+from typing import List, Optional, Dict, Callable, Any, Tuple
+import pickle
+from dataclasses import dataclass
+from enum import Enum, auto
+from pathlib import Path
+import logging
+from .parallel_agent import ParallelAgent
+from .journal import Journal, Node
+import copy
+import re
+from .backend import query, FunctionSpec
+import json
+from rich import print
+from .utils.serialize import parse_markdown_to_dict
+from .utils.metric import WorstMetricValue
+
+
+logger = logging.getLogger(__name__)
+
+
+stage_config_spec = FunctionSpec(
+    name="generate_stage_config",
+    description="Generate configuration for the next experimental stage",
+    json_schema={
+        "type": "object",
+        "properties": {
+            "name": {
+                "type": "string",
+                "description": "Brief, descriptive name for the stage",
+            },
+            "description": {
+                "type": "string",
+                "description": "Detailed description of the stage's purpose",
+            },
+            "goals": {
+                "type": "array",
+                "items": {"type": "string"},
+                "description": "List of specific, measurable goals for this stage",
+            },
+            "max_iterations": {
+                "type": "integer",
+                "description": "Maximum number of iterations to run in this stage",
+            },
+        },
+        "required": ["name", "description", "goals", "max_iterations"],
+    },
+)
+
+stage_progress_eval_spec = FunctionSpec(
+    name="evaluate_stage_progression",
+    description="Evaluate readiness to progress to next experimental stage",
+    json_schema={
+        "type": "object",
+        "properties": {
+            "ready_for_next_stage": {
+                "type": "boolean",
+                "description": "Whether the experiment is ready to progress to next stage",
+            },
+            "reasoning": {
+                "type": "string",
+                "description": "Detailed reasoning for the progression decision",
+            },
+            "recommendations": {
+                "type": "array",
+                "items": {"type": "string"},
+                "description": "Specific recommendations for current or next stage",
+            },
+            "suggested_focus": {
+                "type": "string",
+                "description": "Key areas to focus on in the next iterations",
+            },
+        },
+        "required": ["ready_for_next_stage", "reasoning", "recommendations"],
+    },
+)
+
+
+stage_completion_eval_spec = FunctionSpec(
+    name="evaluate_stage_completion",
+    description="Evaluate if the current stage is complete",
+    json_schema={
+        "type": "object",
+        "properties": {
+            "is_complete": {
+                "type": "boolean",
+                "description": "Whether the current stage is complete",
+            },
+            "reasoning": {
+                "type": "string",
+                "description": "Detailed reasoning for the decision",
+            },
+            "missing_criteria": {
+                "type": "array",
+                "items": {"type": "string"},
+                "description": "List of criteria still needed",
+            },
+        },
+        "required": ["is_complete", "reasoning", "missing_criteria"],
+    },
+)
+
+
+@dataclass
+class Stage:
+    name: str
+    description: str
+    goals: List[str]
+    max_iterations: int
+    num_drafts: int
+    stage_number: int
+
+
+@dataclass
+class StageTransition:
+    """Records transition between stages and the reasoning"""
+
+    from_stage: str
+    to_stage: str
+    reason: str
+    config_adjustments: Dict[str, Any]
+
+
+class AgentManager:
+    def __init__(self, task_desc: str, cfg: Any, workspace_dir: Path):
+        self.task_desc = json.loads(task_desc)
+        for k in [
+            "Title",
+            "Abstract",
+            "Short Hypothesis",
+            "Experiments",
+            "Risk Factors and Limitations",
+        ]:
+            if k not in self.task_desc.keys():
+                raise ValueError(f"Key {k} not found in task_desc")
+        self.cfg = cfg
+        self.workspace_dir = workspace_dir
+        self.current_stage_number = 0
+        self.stages: List[Stage] = []
+        self.current_stage: Optional[Stage] = None
+        self.journals: Dict[str, Journal] = {}
+        self.stage_history: List[StageTransition] = []
+        self.completed_stages: List[str] = []
+        self.main_stage_dict: Dict[int, str] = {
+            1: "initial_implementation",
+            2: "baseline_tuning",
+            3: "creative_research",
+            4: "ablation_studies",
+        }
+        self.main_stage_goals: Dict[int, str] = {
+            1: """
+                - Focus on getting basic working implementation
+                - Use a simple dataset
+                - Aim for basic functional correctness
+                - If you are given \"Code To Use\", you can directly use it as a starting point.""",
+            2: """
+                - Change hyperparameters such as learning rate, number of epochs, batch size, etc. to improve the performance
+                - DO NOT change the model architecture from the previous stage
+                - Introduce TWO more new datasets from HuggingFace test the model. Try very hard to think what Huggingface datasets can be used here for testing.""",
+            3: """
+                - Explore novel improvements
+                - Come up with experiments to reveal new insights
+                - Be creative and think outside the box
+                - MAKE SURE you use THREE HuggingFace dataset in total to test your models""",
+            4: """
+                - Conduct systematic component analysis that reveals the contribution of each part
+                - Use the same datasets you used from the previous stage""",
+        }
+        # Create initial stage
+        self._create_initial_stage()
+
+    def _get_max_iterations(self, stage_number: int) -> int:
+        """Get max iterations for a stage from config or default"""
+        return getattr(
+            self.cfg.agent.stages,
+            f"stage{stage_number}_max_iters",
+            self.cfg.agent.steps,
+        )
+
+    def _get_task_desc_str(self):
+        task_desc = """You are an ambitious AI researcher who is looking to publish a paper that will contribute significantly to the field.
+You have an idea and you want to conduct creative experiments to gain scientific insights.
+Your aim is to run experiments to gather sufficient results for a top conference paper.
+Your research idea:\n\n
+"""
+        task_desc += (
+            "Title:\n"
+            + self.task_desc["Title"]
+            + "\n"
+            + "Abstract:\n"
+            + self.task_desc["Abstract"]
+            + "\n"
+            + "Short Hypothesis:\n"
+            + self.task_desc["Short Hypothesis"]
+            + "\n"
+        )
+        if "Code" in self.task_desc:
+            task_desc += "Code To Use:\n" + self.task_desc["Code"] + "\n"
+        return task_desc
+
+    def _create_initial_stage(self):
+        """Create the initial stage configuration"""
+        self.current_stage_number += 1
+        initial_stage = Stage(
+            name="1_initial_implementation_1_preliminary",
+            description="preliminary",
+            goals=self.main_stage_goals[1],
+            max_iterations=self._get_max_iterations(self.current_stage_number),
+            num_drafts=self.cfg.agent.search.num_drafts,
+            stage_number=self.current_stage_number,
+        )
+
+        self.stages.append(initial_stage)
+        self.current_stage = initial_stage
+        self.journals[initial_stage.name] = Journal()
+
+    def _curate_task_desc(self, stage: Stage) -> str:
+        task_desc = self._get_task_desc_str()
+
+        if stage.name.startswith("3_"):
+            if isinstance(self.task_desc["Experiments"], list):
+                if isinstance(self.task_desc["Experiments"][0], str):
+                    experiment_str = "\n".join(self.task_desc["Experiments"])
+                elif isinstance(self.task_desc["Experiments"][0], dict):
+                    experiment_str = "\n".join(
+                        [
+                            f"{k}: {v}"
+                            for d in self.task_desc["Experiments"]
+                            for k, v in d.items()
+                        ]
+                    )
+            elif isinstance(self.task_desc["Experiments"], str):
+                experiment_str = self.task_desc["Experiments"]
+            else:
+                raise ValueError(
+                    f"Experiments is not a list or string: {self.task_desc['Experiments']}"
+                )
+            task_desc += "Experiment Plan: " + experiment_str + "\n"
+        elif stage.name.startswith("4_"):
+            if isinstance(self.task_desc["Risk Factors and Limitations"], list):
+                risk_factors_str = "\n".join(
+                    self.task_desc["Risk Factors and Limitations"]
+                )
+            else:
+                risk_factors_str = self.task_desc["Risk Factors and Limitations"]
+            task_desc += "Risk Factors and Limitations: " + risk_factors_str + "\n"
+
+        return task_desc
+
+    def _save_checkpoint(self):
+        """Save the current state of the experiment"""
+        if self.current_stage is None:
+            logger.warning("Cannot save checkpoint: current_stage is None")
+            return
+        stage_name = "stage_" + self.current_stage.name
+        save_path = (
+            Path(self.workspace_dir).parent
+            / "logs"
+            / Path(self.workspace_dir).name
+            / stage_name
+            / "checkpoint.pkl"
+        )
+        checkpoint = {
+            "journals": self.journals,
+            "stage_history": self.stage_history,
+            "task_desc": self.task_desc,
+            "cfg": self.cfg,
+            "workspace_dir": self.workspace_dir,
+            "current_stage": self.current_stage,
+        }
+        print("Saving checkpoint to ", save_path)
+        with open(save_path, "wb") as f:
+            pickle.dump(checkpoint, f)
+
+    def _create_agent_for_stage(self, stage: Stage) -> ParallelAgent:
+        """Create a ParallelAgent configured for the given stage"""
+        stage_cfg = self.cfg.copy()
+        stage_cfg.agent.search.num_drafts = stage.num_drafts
+        task_desc = self._curate_task_desc(stage)
+
+        (
+            main_stage,
+            main_stage_name,
+            sub_stage_num,
+            sub_stage_name,
+        ) = self.parse_stage_names(stage.name)
+        task_desc = f"{task_desc}\n\nCurrent Main Stage: {main_stage_name}\n"
+        task_desc += f"Sub-stage: {sub_stage_num} - {sub_stage_name}\n"
+        task_desc += f"Sub-stage goals: {stage.goals}"
+        print("Checking task_desc inside _create_agent_for_stage")
+        print(task_desc)
+
+        if main_stage == 2:
+            stage1_substages = [s for s in self.stages if s.name.startswith("1_")]
+            if not stage1_substages:
+                raise ValueError(f"No stage 1 substages found in {self.stages}")
+            best_stage1_node = self._get_best_implementation(stage1_substages[-1].name)
+            best_stage2_node = None
+            best_stage3_node = None
+        elif main_stage == 3:
+            stage2_substages = [s for s in self.stages if s.name.startswith("2_")]
+            if not stage2_substages:
+                raise ValueError(f"No stage 2 substages found in {self.stages}")
+            best_stage2_node = self._get_best_implementation(stage2_substages[-1].name)
+            best_stage1_node = None
+            best_stage3_node = None
+        elif main_stage == 4:
+            # Use the last (sub-)stage's best node
+            stage3_substages = [s for s in self.stages if s.name.startswith("3_")]
+            if stage3_substages:
+                last_substage = stage3_substages[-1]
+                best_stage3_node = self._get_best_implementation(last_substage.name)
+                best_stage2_node = None
+                best_stage1_node = None
+            else:
+                raise ValueError(f"No stage 3 substages found in {self.stages}")
+        else:
+            best_stage3_node = None
+            best_stage2_node = None
+            best_stage1_node = None
+
+        return ParallelAgent(
+            task_desc=task_desc,
+            cfg=stage_cfg,
+            journal=self.journals[stage.name],
+            stage_name=stage.name,
+            best_stage3_node=best_stage3_node,
+            best_stage2_node=best_stage2_node,
+            best_stage1_node=best_stage1_node,
+        )
+
+    def _parse_vlm_feedback(self, node: Node) -> str:
+        """Parse the feedback from the VLM"""
+        if len(node.plot_analyses) > 0:
+            feedback = f"Plot analyses: {node.plot_analyses[0]['analysis']}\n"
+        else:
+            feedback = "No plot analyses found\n"
+            logger.warning(
+                f"No plot analyses found for node {node.id} during stage {self.current_stage.name}"
+            )
+        feedback += f"VLM Feedback Summary: {node.vlm_feedback_summary}\n"
+        return feedback
+
+    def _check_substage_completion(
+        self, current_substage: Stage, journal: Journal
+    ) -> bool:
+        """Check if the current sub-stage is complete"""
+        best_node = journal.get_best_node(cfg=self.cfg)
+        if not best_node:
+            return False, "No best node found"
+
+        vlm_feedback = self._parse_vlm_feedback(best_node)
+        eval_prompt = f"""
+        Evaluate if the current sub-stage is complete based on the following evidence:
+        1. Figure Analysis:
+        {vlm_feedback}
+
+        Requirements for completion:
+        - {current_substage.goals}
+
+        Provide a detailed evaluation of completion status.
+        """
+
+        try:
+            evaluation = query(
+                system_message=eval_prompt,
+                user_message=None,
+                func_spec=stage_completion_eval_spec,
+                model=self.cfg.agent.feedback.model,
+                temperature=self.cfg.agent.feedback.temp,
+            )
+            if evaluation["is_complete"]:
+                logger.info(
+                    f"Stage {current_substage.name} completed: {evaluation['reasoning']}"
+                )
+                print(
+                    f"[green]Stage {current_substage.name} completed: {evaluation['reasoning']}[/green]"
+                )
+                return True, "Found working implementation"
+            else:
+                missing = ", ".join(evaluation["missing_criteria"])
+                logger.info(
+                    f"Stage {current_substage.name} not complete. Missing: {missing}"
+                )
+                print(
+                    f"[yellow]Stage {current_substage.name} not complete. Missing: {missing}[/yellow]"
+                )
+                return False, "Missing criteria: " + missing
+        except Exception as e:
+            logger.error(
+                f"Error in sub-stage {current_substage.name} completion evaluation: {e}"
+            )
+            return (
+                False,
+                f"Error in sub-stage {current_substage.name} completion evaluation",
+            )
+
+        # Terminate if max iterations reached
+        if len(journal.nodes) >= current_substage.max_iterations:
+            logger.info(
+                f"Stage {current_substage.name} completed: reached max iterations"
+            )
+            print(
+                f"[green]Stage {current_substage.name} completed: reached max iterations[/green]"
+            )
+            return True, "Reached max iterations"
+
+        print(f"[green]Stage {current_substage.name} not completed[/green]")
+        return False
+
+    def _check_stage_completion(self, stage: Stage) -> bool:
+        """Check if current stage is complete based on criteria"""
+        journal = self.journals[stage.name]
+        # Terminate if max iterations reached
+        if len(journal.nodes) >= stage.max_iterations:
+            logger.info(f"Stage {stage.name} completed: reached max iterations")
+            print(
+                f"[green]Stage {stage.name} completed: reached max iterations[/green]"
+            )
+            if stage.stage_number == 1:
+                # For initial stage, if it didn't even find a working implementation until max iterations,
+                # end gracefully and stop the experiment.
+                logger.error(
+                    f"Initial stage {stage.name} did not find a working implementation after {stage.max_iterations} iterations. Consider increasing the max iterations or reducing the complexity of the research idea."
+                )
+                print(
+                    f"[red]Experiment ended: Could not find working implementation in initial stage after {stage.max_iterations} iterations[/red]"
+                )
+                self.current_stage = None  # This will cause the run loop to exit
+                return True, "Failed to find working implementation"
+            else:
+                return True, "Reached max iterations"
+
+        # For initial stage, complete when we have at least one working implementation
+        if stage.stage_number == 1:
+            if len(journal.good_nodes) > 0:
+                logger.info(
+                    f"Stage {stage.name} completed: found working implementation"
+                )
+                print(
+                    f"[green]Stage {stage.name} completed: found working implementation[/green]"
+                )
+                return True, "Found working implementation"
+
+        if stage.stage_number == 2:
+            best_node = journal.get_best_node(cfg=self.cfg)
+            if not best_node:
+                return False, "No best node found"
+            if best_node == journal.nodes[0]:
+                return (
+                    False,
+                    "No improvement found from the base node (which is the best node from the previous stage)",
+                )
+
+            # Normal stage 2 completion check
+            vlm_feedback = self._parse_vlm_feedback(best_node)
+            eval_prompt = f"""
+            Evaluate if stage 2 (baseline tuning) is complete based on the following evidence:
+
+            1. Figure Analysis:
+            {vlm_feedback}
+
+            2. Datasets Tested: {best_node.datasets_successfully_tested}
+
+            Requirements for completion:
+            1. Training curves should show stable convergence
+            2. Results should be tested on at least two datasets
+            3. No major instabilities or issues in the plots
+
+            Provide a detailed evaluation of completion status.
+            """
+
+            try:
+                evaluation = query(
+                    system_message=eval_prompt,
+                    user_message=None,
+                    func_spec=stage_completion_eval_spec,
+                    model=self.cfg.agent.feedback.model,
+                    temperature=self.cfg.agent.feedback.temp,
+                )
+
+                if evaluation["is_complete"]:
+                    logger.info(
+                        f"Stage {stage.name} completed: {evaluation['reasoning']}"
+                    )
+                    print(
+                        f"[green]Stage {stage.name} completed: {evaluation['reasoning']}[/green]"
+                    )
+                    return True, "Found working implementation"
+                else:
+                    missing = ", ".join(evaluation["missing_criteria"])
+                    logger.info(f"Stage {stage.name} not complete. Missing: {missing}")
+                    print(
+                        f"[yellow]Stage {stage.name} not complete. Missing: {missing}[/yellow]"
+                    )
+                    return False, "Missing criteria: " + missing
+            except Exception as e:
+                logger.error(f"Error in stage 2 completion evaluation: {e}")
+                return False, "Error in stage 2 completion evaluation"
+
+        if stage.stage_number == 3:
+            best_node = journal.get_best_node(cfg=self.cfg)
+            if not best_node:
+                return False, "No best node found"
+            if best_node == journal.nodes[0]:
+                return (
+                    False,
+                    "No improvement found from the base node (which is the best node from the previous stage)",
+                )
+            # Check if there are enough research results
+            # Or, we could just let the agent run until max iterations is reached
+            # Check if the experiment execution time is too short
+            exec_time_minutes = best_node.exec_time / 60
+            print(f"[cyan]exec_time_minutes: {exec_time_minutes}[/cyan]")
+            if len(self.journals[stage.name].nodes) > (
+                self.cfg.agent.stages.stage3_max_iters / 2
+            ):
+                if exec_time_minutes < self.cfg.exec.timeout / 60 / 2:
+                    exec_time_feedback = (
+                        f"Implementation works but runs too quickly ({exec_time_minutes:.2f} minutes)."
+                        "We have up to 60 minutes available for each experiment."
+                        "Make sure to scale up the experiment "
+                        "by increasing the number of epochs, using a larger model, or working with bigger datasets."
+                        "Given that the current execution time is {exec_time_minutes:.2f} minutes, think about how changing the number of epochs to run, or using a larger model, or working with bigger datasets to run"
+                        "will affect the execution time, and make sure to scale up the experiment accordingly."
+                    )
+                    print(f"[cyan]exec_time_feedback: {exec_time_feedback}[/cyan]")
+                    self.journals[stage.name].nodes[
+                        -1
+                    ].exec_time_feedback = exec_time_feedback
+                    return False, exec_time_feedback
+        if stage.stage_number == 4:
+            # Just let the agent run until max iterations is reached
+            pass
+
+        print(f"[green]Stage {stage.name} not completed[/green]")
+        return False, "stage not completed"
+
+    def _get_best_implementation(self, stage_name: str) -> Optional[Node]:
+        """Get the best implementation from a completed stage"""
+        if stage_name not in self.journals:
+            return None
+        best_node = self.journals[stage_name].get_best_node(cfg=self.cfg)
+        if best_node:
+            # Create a clean copy of the node for the next stage
+            copied_node = copy.deepcopy(best_node)
+            # Reset parent relationship and children
+            copied_node.parent = None
+            copied_node.children = set()
+            return copied_node
+        return None
+
+    def _generate_substage_goal(self, main_stage_goal: str, journal: Journal) -> str:
+        """Generate the next sub-stage goal based on what has been done so far.
+
+        Args:
+            main_stage_goal: The overall goal for the current main stage
+            journal: Journal containing the results and progress so far
+
+        Returns:
+            str: Specific goals for the next sub-stage
+        """
+        # Gather current progress metrics
+        metrics = self._gather_stage_metrics(journal)
+        issues = self._identify_issues(journal)
+        progress = self._analyze_progress(journal)
+
+        # Create prompt for the LLM
+        prompt = f"""
+        Based on the current experimental progress, generate focused goals for the next sub-stage.
+
+        Main Stage Goals:
+        {main_stage_goal}
+
+        Current Progress:
+        - Total attempts: {metrics['total_nodes']}
+        - Successful implementations: {metrics['good_nodes']}
+        - Best performance: {metrics['best_metric']['value'] if metrics['best_metric'] else 'N/A'}
+        - Convergence status: {progress['convergence_status']}
+
+        Current Issues:
+        {json.dumps(issues, indent=2)}
+
+        Recent Changes:
+        {json.dumps(progress['recent_changes'], indent=2)}
+
+        Generate specific, actionable sub-stage goals that:
+        1. Address current issues and limitations
+        2. Build on recent progress
+        3. Move towards main stage goals
+        4. Are concrete and measurable
+        """
+
+        # Define the function specification for the LLM
+        substage_goal_spec = FunctionSpec(
+            name="generate_substage_goals",
+            description="Generate specific goals for the next experimental sub-stage",
+            json_schema={
+                "type": "object",
+                "properties": {
+                    "goals": {
+                        "type": "string",
+                        "description": "Detailed, specific goals for the next sub-stage",
+                    },
+                    "sub_stage_name": {
+                        "type": "string",
+                        "description": "The name of the next sub-stage",
+                    },
+                },
+                "required": ["goals", "sub_stage_name"],
+            },
+        )
+
+        try:
+            # Get response from LLM
+            response = query(
+                system_message=prompt,
+                user_message=None,
+                func_spec=substage_goal_spec,
+                model=self.cfg.agent.feedback.model,
+                temperature=self.cfg.agent.feedback.temp,
+            )
+
+            # Format the response into a structured goal string
+            goal_str = f"""
+            {response['goals']}
+            """
+
+            return goal_str.strip(), response["sub_stage_name"]
+
+        except Exception as e:
+            logger.error(f"Error generating sub-stage goals: {e}")
+            # Provide fallback goals if LLM fails
+            return f"""
+            Sub-stage Goals:
+            Continue progress on main stage objectives while addressing current issues.
+            """
+
+    def _create_next_substage(
+        self, current_substage: Stage, journal: Journal, substage_feedback: str
+    ) -> Optional[Stage]:
+        """Create the next sub-stage. Ask LLM to come up with the next sub-stage name and goals
+        based on what has been done so far.
+        """
+        main_stage_num, main_stage_name, sub_stage_num, _ = self.parse_stage_names(
+            current_substage.name
+        )
+        main_stage_goal = self.main_stage_goals[main_stage_num]
+        sub_stage_goal, sub_stage_name = self._generate_substage_goal(
+            main_stage_goal, journal
+        )
+
+        return Stage(
+            name=f"{main_stage_num}_{main_stage_name}_{sub_stage_num + 1}_{sub_stage_name}",
+            description=sub_stage_name,
+            goals="Main stage goals:\n"
+            + main_stage_goal
+            + "\n\nSub-stage goals:\n"
+            + sub_stage_goal,
+            max_iterations=self._get_max_iterations(main_stage_num),
+            num_drafts=0,
+            stage_number=current_substage.stage_number + 1,
+        )
+
+    def _create_next_main_stage(
+        self, current_substage: Stage, journal: Journal
+    ) -> Optional[Stage]:
+        (
+            main_stage_num,
+            main_stage_name,
+            sub_stage_num,
+            sub_stage_name,
+        ) = self.parse_stage_names(current_substage.name)
+        if main_stage_num == 4:
+            return None
+        next_main_stage_name = self.main_stage_dict[main_stage_num + 1]
+        sub_stage_num = 1
+        sub_stage_name = "first_attempt"
+        num_drafts = 0
+        stage_number = current_substage.stage_number + 1
+        description = f"first_attempt"
+        main_stage_goal = self.main_stage_goals[main_stage_num + 1]
+
+        return Stage(
+            name=f"{main_stage_num + 1}_{next_main_stage_name}_{sub_stage_num}_{sub_stage_name}",
+            description=description,
+            goals=main_stage_goal,
+            max_iterations=self._get_max_iterations(main_stage_num + 1),
+            num_drafts=num_drafts,
+            stage_number=stage_number,
+        )
+
+    def run(self, exec_callback, step_callback=None):
+        """Run the experiment through generated stages"""
+        while self.current_stage:  # Main stage loop
+            main_stage = self.parse_stage_names(self.current_stage.name)[0]
+            print(f"[green]Starting main stage: {main_stage}[/green]")
+            print(f"[cyan]Goals: {self.current_stage.goals}[/cyan]")
+
+            current_substage = self.current_stage
+            while current_substage:  # Sub-stage loop
+                print(f"[green]Starting sub-stage: {current_substage.name}[/green]")
+
+                with self._create_agent_for_stage(current_substage) as agent:
+                    # Initialize with best result from previous sub-stage if available
+                    if self.stage_history:
+                        prev_stage = self.stage_history[-1].from_stage
+                        print(f"[cyan]prev_stage: {prev_stage}[/cyan]")
+                        print(f"[cyan]self.stage_history: {self.stage_history}[/cyan]")
+                        prev_best = self._get_best_implementation(prev_stage)
+                        if prev_best:
+                            self.journals[self.current_stage.name].append(prev_best)
+                        else:
+                            print(
+                                f"[red]No previous best implementation found for {self.current_stage.name}. Something went wrong so finishing the experiment...[/red]"
+                            )
+                            self.current_stage = None
+                            current_substage = None
+                            break
+
+                    # Run until sub-stage completion
+                    while True:
+                        agent.step(exec_callback)
+                        if step_callback:
+                            step_callback(
+                                current_substage, self.journals[current_substage.name]
+                            )
+
+                        # First check if main stage is complete
+                        (
+                            main_stage_complete,
+                            main_stage_feedback,
+                        ) = self._check_stage_completion(current_substage)
+                        print(
+                            f"[cyan]Feedback from _check_stage_completion: {main_stage_feedback}[/cyan]"
+                        )
+                        if main_stage_complete:
+                            # After main stage completion, run multi-seed eval on the best node
+                            if current_substage.stage_number in [1, 2, 3, 4]:
+                                best_node = self._get_best_implementation(
+                                    current_substage.name
+                                )
+                                if best_node:
+                                    seed_nodes = agent._run_multi_seed_evaluation(
+                                        best_node
+                                    )
+                                    if step_callback:
+                                        step_callback(
+                                            current_substage,
+                                            self.journals[current_substage.name],
+                                        )
+                                    agent._run_plot_aggregation(best_node, seed_nodes)
+                                    if step_callback:
+                                        step_callback(
+                                            current_substage,
+                                            self.journals[current_substage.name],
+                                        )
+                                    print(
+                                        f"Stage {current_substage.name} multi-seed eval done."
+                                    )
+                                else:
+                                    logger.error(
+                                        f"No best node found for {current_substage.name} during multi-seed eval, something went wrong so finishing the experiment..."
+                                    )
+                                    self.current_stage = None
+                                    current_substage = None
+                                    break
+
+                            # Exit the loop to move to next main stage
+                            current_substage = None
+                            break
+
+                        (
+                            substage_complete,
+                            substage_feedback,
+                        ) = self._check_substage_completion(
+                            current_substage, self.journals[current_substage.name]
+                        )
+
+                        if substage_complete:
+                            # Create next sub-stage
+                            next_substage = self._create_next_substage(
+                                current_substage,
+                                self.journals[current_substage.name],
+                                substage_feedback,
+                            )
+                            if next_substage:
+                                # Record sub-stage transition
+                                self.stage_history.append(
+                                    StageTransition(
+                                        from_stage=current_substage.name,
+                                        to_stage=next_substage.name,
+                                        reason=substage_feedback,
+                                        config_adjustments={},
+                                    )
+                                )
+
+                                # Setup new sub-stage
+                                self.stages.append(next_substage)
+                                self.journals[next_substage.name] = Journal()
+                                current_substage = next_substage
+                            else:
+                                # If no next sub-stage could be created, end this main stage
+                                current_substage = None
+                            break
+            self._save_checkpoint()
+            # Main stage complete - create next main stage
+            if self.current_stage:
+                next_main_stage = self._create_next_main_stage(
+                    self.stages[-1], self.journals[self.stages[-1].name]
+                )
+                if next_main_stage:
+                    # Record main stage transition
+                    self.stage_history.append(
+                        StageTransition(
+                            from_stage=self.stages[-1].name,
+                            to_stage=next_main_stage.name,
+                            reason=f"Moving to {next_main_stage.description}",
+                            config_adjustments={},
+                        )
+                    )
+
+                    self.stages.append(next_main_stage)
+                    self.journals[next_main_stage.name] = Journal()
+                    self.current_stage = next_main_stage
+                else:
+                    # Exit the outer loop if no more main stages
+                    logger.info(f"Completed stage: {self.current_stage.name}")
+                    logger.info("No more stages to run -- exiting the loop...")
+                    self.current_stage = None
+
+    def _create_stage_analysis_prompt(
+        self,
+        previous_stages: List[Stage],
+        previous_results: Optional[Dict[str, Any]],
+        is_initial_stage: bool,
+    ) -> str:
+        """Create detailed prompt to determine next stage configuration"""
+        prompt_parts = [
+            f"Task Description: {self._curate_task_desc(previous_stages[-1])}",
+            f"Current Stage Number: {previous_stages[-1].stage_number}",
+        ]
+
+        if previous_stages:
+            stage_history = "\n".join(
+                f"Stage {i+1}: {stage.name} - {stage.description}"
+                for i, stage in enumerate(previous_stages)
+            )
+            prompt_parts.append(f"Previous Stages:\n{stage_history}")
+
+        if previous_results:
+            # Format node summaries
+            if "node_summaries" in previous_results["metrics"]:
+                summaries = "\n".join(
+                    f"Node {i}: {summary}"
+                    for i, summary in enumerate(
+                        previous_results["metrics"]["node_summaries"]
+                    )
+                )
+                prompt_parts.append(f"Node Analysis:\n{summaries}")
+
+            # Format VLM feedback and plot analysis
+            if "plot_insights" in previous_results:
+                plot_insights = previous_results["plot_insights"]
+                prompt_parts.append("Visual Analysis Findings:")
+                for analysis in plot_insights["analyses"]:
+                    prompt_parts.append(f"- {analysis['analysis']}")
+
+            # Format other metrics and findings
+            metrics_summary = (
+                f"Progress Summary:\n"
+                f"- Total attempts: {previous_results['metrics']['total_nodes']}\n"
+                f"- Successful implementations: {previous_results['metrics']['good_nodes']}\n"
+                f"- Failed attempts: {previous_results['metrics']['buggy_nodes']}\n"
+                f"- Best performance: {previous_results['metrics']['best_metric']['value'] if previous_results['metrics']['best_metric'] else 'N/A'}\n"
+                f"- Issues identified: {', '.join(previous_results['issues'])}\n"
+                f"- Progress status: {previous_results['progress']['convergence_status']}"
+            )
+            prompt_parts.append(metrics_summary)
+
+            # Save stage transition analysis to notes directory
+            base_dir = Path(self.workspace_dir).parent.parent
+            run_name = Path(self.workspace_dir).name
+            notes_dir = (
+                base_dir
+                / "logs"
+                / run_name
+                / "notes"
+                / f"stage_{stage_number-1}_to_{stage_number}"
+            )
+            notes_dir.mkdir(parents=True, exist_ok=True)
+
+            analysis_data = {
+                "stage_transition": {
+                    "from_stage": stage_number - 1,
+                    "to_stage": stage_number,
+                    "is_initial_stage": is_initial_stage,  # Add flag for initial stage
+                    "metrics_summary": metrics_summary,
+                    "node_summaries": previous_results["metrics"].get(
+                        "node_summaries", []
+                    ),
+                    "plot_insights": previous_results.get("plot_insights", {}),
+                    "issues": previous_results["issues"],
+                    "progress": previous_results["progress"],
+                }
+            }
+
+            with open(notes_dir / "stage_transition_analysis.json", "w") as f:
+                json.dump(analysis_data, f, indent=2)
+
+        prompt_parts.append(
+            "Based on the above comprehensive analysis, determine the appropriate "
+            "configuration for the next experimental stage. Consider:\n"
+            "1. Visual analysis insights from plots\n"
+            "2. Individual node performance and patterns\n"
+            "3. Overall progress and convergence status\n"
+            "4. Identified issues and challenges\n\n"
+            "Include:\n"
+            "1. Stage name (brief, descriptive)\n"
+            "2. Detailed description of the stage's purpose\n"
+            "3. Specific, measurable goals\n"
+            "4. Maximum iterations needed\n"
+            "5. Success metric threshold (if applicable)"
+        )
+
+        return "\n\n".join(prompt_parts)
+
+    def parse_stage_names(self, stage_name: str) -> Tuple[int, str, int, str]:
+        """Parse stage name into main stage number, main stage name,
+        sub-stage number, and sub-stage name"""
+        # Find the two numbers in the current stage name
+        numbers = [int(n) for n in re.findall(r"\d+", stage_name)]
+
+        main_stage = numbers[0]
+        sub_stage_num = numbers[1]
+        # Extract main_stage_name (everything between the two numbers)
+        parts = re.split(r"\d+", stage_name)[1:-1]
+        main_stage_name = "_".join(p.strip("_") for p in parts if p.strip("_"))
+        # Extract sub_stage_name (everything after the second number)
+        sub_stage_name = re.split(r"\d+", stage_name)[-1].strip("_")
+
+        return main_stage, main_stage_name, sub_stage_num, sub_stage_name
+
+    def _save_stage_summary(
+        self, current_results: Dict[str, Any], evaluation: Dict[str, Any]
+    ):
+        """Save comprehensive stage completion summary"""
+        base_dir = Path(self.workspace_dir).parent.parent
+        run_name = Path(self.workspace_dir).name
+        notes_dir = (
+            base_dir
+            / "logs"
+            / run_name
+            / "notes"
+            / f"stage_{self.current_stage.stage_number}_complete"
+        )
+        notes_dir.mkdir(parents=True, exist_ok=True)
+
+        completion_data = {
+            "stage_completion": {
+                "stage_number": self.current_stage.stage_number,
+                "stage_name": self.current_stage.name,
+                "final_metrics": current_results["metrics"],
+                "identified_issues": current_results["issues"],
+                "progress_analysis": current_results["progress"],
+                "plot_insights": current_results.get("plot_insights", {}),
+                "progression_evaluation": {
+                    "ready_for_next_stage": evaluation["ready_for_next_stage"],
+                    "reasoning": evaluation["reasoning"],
+                    "recommendations": evaluation["recommendations"],
+                    "suggested_focus": evaluation["suggested_focus"],
+                },
+            }
+        }
+
+        with open(notes_dir / "stage_completion_summary.json", "w") as f:
+            json.dump(completion_data, f, indent=2)
+
+    def _get_response(self, prompt: str) -> Dict[str, Any]:
+        """Get structured response from LLM for stage configuration.
+
+        Args:
+            prompt: The analysis prompt to send to the LLM
+
+        Returns:
+            Dictionary containing stage configuration with keys:
+            - name: str
+            - description: str
+            - goals: List[str]
+            - max_iterations: int
+            - success_metric_threshold: Optional[float]
+        """
+        stage_config_spec = {
+            "name": "generate_stage_config",
+            "json_schema": {
+                "type": "object",
+                "properties": {
+                    "name": {
+                        "type": "string",
+                        "description": "Brief, descriptive name for the stage",
+                    },
+                    "description": {
+                        "type": "string",
+                        "description": "Detailed description of the stage's purpose",
+                    },
+                    "goals": {
+                        "type": "array",
+                        "items": {"type": "string"},
+                        "description": "List of specific, measurable goals for this stage",
+                    },
+                    "max_iterations": {
+                        "type": "integer",
+                        "description": "Maximum number of iterations to run in this stage",
+                    },
+                },
+                "required": ["name", "description", "goals", "max_iterations"],
+            },
+            "description": "Generate configuration for the next experimental stage",
+        }
+
+        try:
+            response = query(
+                system_message=prompt,
+                user_message=None,
+                func_spec=stage_config_spec,
+                model=self.cfg.agent.feedback.model,
+                temperature=self.cfg.agent.feedback.temp,
+            )
+            return response
+
+        except Exception as e:
+            logger.error(f"Error getting LLM response: {e}")
+            # Provide a fallback configuration in case of errors
+            return {
+                "name": "fallback_stage",
+                "description": "Fallback stage due to LLM error",
+                "goals": ["Recover from error and continue execution"],
+                "max_iterations": 3,
+                "success_metric_threshold": None,
+            }
+
+    def _gather_stage_metrics(self, journal: Journal) -> Dict[str, Any]:
+        """Gather detailed metrics and analysis from the stage's nodes"""
+        metrics = {
+            "total_nodes": len(journal.nodes),
+            "good_nodes": len(journal.good_nodes),
+            "buggy_nodes": len(journal.buggy_nodes),
+            "best_metric": None,
+            "node_summaries": [],
+            "vlm_feedback": [],
+        }
+
+        # Gather individual node summaries
+        for node in journal.nodes:
+            if hasattr(node, "_agent"):
+                node_summary = node._agent._generate_node_summary(node)
+                metrics["node_summaries"].append(node_summary)
+
+        # Get VLM feedback from plot analysis
+        for node in journal.good_nodes:
+            if hasattr(node, "_vlm_feedback"):
+                metrics["vlm_feedback"].append(node._vlm_feedback)
+
+        best_node = journal.get_best_node(cfg=self.cfg)
+        if best_node:
+            metrics["best_metric"] = {
+                "value": best_node.metric.value,
+                "name": (
+                    best_node.metric.name
+                    if hasattr(best_node.metric, "name")
+                    else "validation_metric"
+                ),
+                "maximize": (
+                    best_node.metric.maximize
+                    if hasattr(best_node.metric, "maximize")
+                    else False
+                ),
+                "analysis": (
+                    best_node.analysis if hasattr(best_node, "analysis") else None
+                ),
+            }
+
+        return metrics
+
+    def _identify_issues(self, journal: Journal) -> List[str]:
+        """Identify systemic issues and challenges from the current stage's results"""
+        issues = []
+
+        # Look for patterns in leaf nodes (endpoints of improvement attempts)
+        leaf_nodes = [n for n in journal.nodes if n.is_leaf]
+        buggy_leaves = [n for n in leaf_nodes if n.is_buggy]
+
+        # If we have buggy leaf nodes, it means we couldn't fix some issues
+        if buggy_leaves:
+            # Group similar issues
+            error_patterns = {}
+            for node in buggy_leaves:
+                if hasattr(node, "analysis"):
+                    # Use the error message as key to group similar issues
+                    error_patterns.setdefault(node.analysis, []).append(node.id)
+
+            # Report persistent issues
+            for error_msg, node_ids in error_patterns.items():
+                if len(node_ids) >= 2:  # If same error occurs multiple times
+                    issues.append(f"Persistent issue in nodes {node_ids}: {error_msg}")
+
+        # Include VLM-identified systemic issues
+        vlm_issues = set()  # Use set to avoid duplicate issues
+        for node in journal.good_nodes:
+            if hasattr(node, "_vlm_feedback"):
+                vlm_feedback = node._vlm_feedback
+                if isinstance(vlm_feedback, dict):
+                    # Look for systemic issues identified by VLM
+                    if "systemic_issues" in vlm_feedback:
+                        vlm_issues.update(vlm_feedback["systemic_issues"])
+                    # Look for recurring patterns in plot analysis
+                    if "plot_analyses" in vlm_feedback:
+                        for analysis in vlm_feedback["plot_analyses"]:
+                            if "limitation" in analysis.get("type", "").lower():
+                                vlm_issues.add(
+                                    f"VLM (Node {node.id}): {analysis['analysis']}"
+                                )
+
+        issues.extend(list(vlm_issues))
+
+        return issues
+
+    def _analyze_progress(self, journal: Journal) -> Dict[str, Any]:
+        """Analyze progress and convergence in the current stage"""
+        progress = {
+            "iterations_completed": len(journal.nodes),
+            "improvements_found": 0,
+            "convergence_status": "not_converged",
+            "improvement_trend": [],
+            "recent_changes": [],
+        }
+
+        # Analyze recent changes
+        recent_nodes = journal.nodes[-3:] if len(journal.nodes) >= 3 else journal.nodes
+        for node in recent_nodes:
+            if not node.is_buggy:
+                change = {
+                    "node_id": node.id,
+                    "metric": node.metric.value,
+                    "parent_id": node.parent.id if node.parent else None,
+                    "analysis": node.analysis if hasattr(node, "analysis") else None,
+                }
+                progress["recent_changes"].append(change)
+
+        return progress
+
+    def _evaluate_stage_progression(
+        self, current_stage: Stage, previous_results: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        """Evaluate whether experiment is ready for next stage"""
+
+        eval_prompt = f"""
+        Evaluate whether the current experimental stage should progress to the next stage.
+        Consider all available evidence holistically:
+
+        Current Stage Information:
+        - Name: {current_stage.name}
+        - Description: {current_stage.description}
+        - Goals: {', '.join(current_stage.goals) if isinstance(current_stage.goals, list) else current_stage.goals}
+
+        Performance Metrics:
+        {json.dumps(previous_results.get('metrics', {}), indent=2)}
+
+        Identified Issues:
+        {json.dumps(previous_results.get('issues', []), indent=2)}
+
+        Progress Analysis:
+        {json.dumps(previous_results.get('progress', {}), indent=2)}
+
+        Expected Stage Progression:
+        1. Initial Implementation: Focus on basic working implementation
+        2. Baseline Tuning: Systematic optimization of core parameters
+        3. Creative Research: Novel improvements and approaches
+        4. Ablation Studies: Systematic component analysis
+
+        Consider factors like:
+        - Progress toward stage goals
+        - Performance trends and stability
+        - Quality and reliability of results
+        - Understanding of the problem
+        - Presence of systematic issues
+        - Convergence indicators
+        - Readiness for next stage challenges
+
+        Provide a holistic evaluation of whether the experiment should:
+        1. Progress to next stage
+        2. Continue current stage with specific focus
+        3. Extend current stage with modifications
+        """
+
+        try:
+            evaluation = query(
+                system_message=eval_prompt,
+                user_message=None,
+                func_spec=stage_progress_eval_spec,
+                model=self.cfg.agent.feedback.model,
+                temperature=self.cfg.agent.feedback.temp,
+            )
+
+            # Log the evaluation for transparency
+            logger.info(
+                f"Stage progression evaluation:\n{json.dumps(evaluation, indent=2)}"
+            )
+
+            return evaluation
+
+        except Exception as e:
+            logger.error(f"Error in stage progression evaluation: {e}")
+            return {
+                "ready_for_next_stage": False,
+                "reasoning": "Error in evaluation process - continuing current stage",
+                "recommendations": [
+                    "Address evaluation error",
+                    "Continue current approach",
+                ],
+                "suggested_focus": "Maintain current direction while resolving evaluation issues",
+            }
diff --git a/skills/experiment-bfts-runner/scripts/asv2/treesearch/backend/__init__.py b/skills/experiment-bfts-runner/scripts/asv2/treesearch/backend/__init__.py
new file mode 100644
index 00000000..93eb9aa3
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/treesearch/backend/__init__.py
@@ -0,0 +1,68 @@
+from . import backend_anthropic, backend_openai
+from .utils import FunctionSpec, OutputType, PromptType, compile_prompt_to_md
+
+
+def _normalize_model(model: str) -> tuple[str, str]:
+    """
+    Normalize model identifier and choose backend.
+    Returns (backend, model_name).
+    backend in {"anthropic", "openai"}.
+    """
+    if model.startswith("bedrock/") and "claude" in model:
+        return "anthropic", model.split("/", 1)[1]
+    if model.startswith("vertex_ai/") and "claude" in model:
+        return "anthropic", model.split("/", 1)[1]
+    if model.startswith("claude-"):
+        return "anthropic", model
+    return "openai", model
+
+
+def get_ai_client(model: str, **model_kwargs):
+    backend, model_name = _normalize_model(model)
+    if backend == "anthropic":
+        return backend_anthropic.get_ai_client(model=model_name, **model_kwargs)
+    return backend_openai.get_ai_client(model=model_name, **model_kwargs)
+
+
+def query(
+    system_message: PromptType | None,
+    user_message: PromptType | None,
+    model: str,
+    temperature: float | None = None,
+    max_tokens: int | None = None,
+    func_spec: FunctionSpec | None = None,
+    **model_kwargs,
+) -> OutputType:
+    backend, model_name = _normalize_model(model)
+
+    model_kwargs = model_kwargs | {
+        "model": model_name,
+        "temperature": temperature,
+    }
+
+    # Handle models with beta limitations
+    if model_name.startswith("o1"):
+        if system_message and user_message is None:
+            user_message = system_message
+        elif system_message is None and user_message:
+            pass
+        elif system_message and user_message:
+            system_message["Main Instructions"] = {}
+            system_message["Main Instructions"] |= user_message
+            user_message = system_message
+        system_message = None
+        model_kwargs["reasoning_effort"] = "high"
+        model_kwargs["max_completion_tokens"] = 100000
+        model_kwargs.pop("temperature", None)
+    else:
+        model_kwargs["max_tokens"] = max_tokens
+
+    query_func = backend_anthropic.query if backend == "anthropic" else backend_openai.query
+    output, req_time, in_tok_count, out_tok_count, info = query_func(
+        system_message=compile_prompt_to_md(system_message) if system_message else None,
+        user_message=compile_prompt_to_md(user_message) if user_message else None,
+        func_spec=func_spec,
+        **model_kwargs,
+    )
+
+    return output
diff --git a/skills/experiment-bfts-runner/scripts/asv2/treesearch/backend/backend_anthropic.py b/skills/experiment-bfts-runner/scripts/asv2/treesearch/backend/backend_anthropic.py
new file mode 100644
index 00000000..e532d72d
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/treesearch/backend/backend_anthropic.py
@@ -0,0 +1,75 @@
+import time
+
+from .utils import FunctionSpec, OutputType, opt_messages_to_list, backoff_create
+from funcy import notnone, select_values
+import anthropic
+
+
+ANTHROPIC_TIMEOUT_EXCEPTIONS = (
+    anthropic.RateLimitError,
+    anthropic.APIConnectionError,
+    anthropic.APITimeoutError,
+    anthropic.InternalServerError,
+    anthropic.APIStatusError,
+)
+
+
+def get_ai_client(model: str, max_retries=2):
+    if model.startswith("bedrock/"):
+        return anthropic.AnthropicBedrock(max_retries=max_retries)
+    if model.startswith("vertex_ai/"):
+        return anthropic.AnthropicVertex(max_retries=max_retries)
+    return anthropic.Anthropic(max_retries=max_retries)
+
+
+def query(
+    system_message: str | None,
+    user_message: str | None,
+    func_spec: FunctionSpec | None = None,
+    **model_kwargs,
+) -> tuple[OutputType, float, int, int, dict]:
+    client = get_ai_client(model_kwargs.get("model", ""), max_retries=0)
+
+    filtered_kwargs: dict = select_values(notnone, model_kwargs)  # type: ignore
+    if "max_tokens" not in filtered_kwargs:
+        filtered_kwargs["max_tokens"] = 8192
+
+    if func_spec is not None:
+        raise NotImplementedError("Anthropic function calling not supported in this runner.")
+
+    if system_message is not None and user_message is None:
+        system_message, user_message = user_message, system_message
+
+    if system_message is not None:
+        filtered_kwargs["system"] = system_message
+
+    messages = opt_messages_to_list(None, user_message)
+
+    t0 = time.time()
+    message = backoff_create(
+        client.messages.create,
+        ANTHROPIC_TIMEOUT_EXCEPTIONS,
+        messages=messages,
+        **filtered_kwargs,
+    )
+    req_time = time.time() - t0
+
+    if "thinking" in filtered_kwargs:
+        assert (
+            len(message.content) == 2
+            and message.content[0].type == "thinking"
+            and message.content[1].type == "text"
+        )
+        output: str = message.content[1].text
+    else:
+        assert len(message.content) == 1 and message.content[0].type == "text"
+        output: str = message.content[0].text
+
+    in_tokens = message.usage.input_tokens
+    out_tokens = message.usage.output_tokens
+
+    info = {
+        "stop_reason": message.stop_reason,
+    }
+
+    return output, req_time, in_tokens, out_tokens, info
diff --git a/skills/experiment-bfts-runner/scripts/asv2/treesearch/backend/backend_openai.py b/skills/experiment-bfts-runner/scripts/asv2/treesearch/backend/backend_openai.py
new file mode 100644
index 00000000..736eb756
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/treesearch/backend/backend_openai.py
@@ -0,0 +1,120 @@
+import json
+import logging
+import time
+import os
+
+from .utils import FunctionSpec, OutputType, opt_messages_to_list, backoff_create
+from funcy import notnone, select_values
+import openai
+from rich import print
+
+logger = logging.getLogger("ai-scientist")
+
+
+OPENAI_TIMEOUT_EXCEPTIONS = (
+    openai.RateLimitError,
+    openai.APIConnectionError,
+    openai.APITimeoutError,
+    openai.InternalServerError,
+)
+
+
+def get_ai_client(model: str, max_retries=2) -> openai.OpenAI:
+    if model.startswith("ollama/"):
+        base_url = os.environ.get("OLLAMA_HOST", "http://localhost:11434/v1")
+        return openai.OpenAI(
+            api_key=os.environ.get("OLLAMA_API_KEY", ""),
+            base_url=base_url,
+            max_retries=max_retries,
+        )
+    if model == "deepseek-coder-v2-0724":
+        return openai.OpenAI(
+            api_key=os.environ.get("DEEPSEEK_API_KEY", ""),
+            base_url="https://api.deepseek.com",
+            max_retries=max_retries,
+        )
+    if model == "deepcoder-14b":
+        return openai.OpenAI(
+            api_key=os.environ.get("HUGGINGFACE_API_KEY", ""),
+            base_url="https://api-inference.huggingface.co/models/agentica-org/DeepCoder-14B-Preview",
+            max_retries=max_retries,
+        )
+    if model == "llama3.1-405b":
+        return openai.OpenAI(
+            api_key=os.environ.get("OPENROUTER_API_KEY", ""),
+            base_url="https://openrouter.ai/api/v1",
+            max_retries=max_retries,
+        )
+    if "gemini" in model:
+        return openai.OpenAI(
+            api_key=os.environ.get("GEMINI_API_KEY", ""),
+            base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
+            max_retries=max_retries,
+        )
+    base_url = os.environ.get("OPENAI_BASE_URL")
+    if base_url:
+        return openai.OpenAI(
+            api_key=os.environ.get("OPENAI_API_KEY", ""),
+            base_url=base_url,
+            max_retries=max_retries,
+        )
+    return openai.OpenAI(max_retries=max_retries)
+
+
+def query(
+    system_message: str | None,
+    user_message: str | None,
+    func_spec: FunctionSpec | None = None,
+    **model_kwargs,
+) -> tuple[OutputType, float, int, int, dict]:
+    client = get_ai_client(model_kwargs.get("model"), max_retries=0)
+    filtered_kwargs: dict = select_values(notnone, model_kwargs)  # type: ignore
+
+    messages = opt_messages_to_list(system_message, user_message)
+
+    if func_spec is not None:
+        filtered_kwargs["tools"] = [func_spec.as_openai_tool_dict]
+        filtered_kwargs["tool_choice"] = func_spec.openai_tool_choice_dict
+
+    if filtered_kwargs.get("model", "").startswith("ollama/"):
+        filtered_kwargs["model"] = filtered_kwargs["model"].replace("ollama/", "")
+
+    t0 = time.time()
+    completion = backoff_create(
+        client.chat.completions.create,
+        OPENAI_TIMEOUT_EXCEPTIONS,
+        messages=messages,
+        **filtered_kwargs,
+    )
+    req_time = time.time() - t0
+
+    choice = completion.choices[0]
+
+    if func_spec is None:
+        output = choice.message.content
+    else:
+        assert (
+            choice.message.tool_calls
+        ), f"function_call is empty, it is not a function call: {choice.message}"
+        assert (
+            choice.message.tool_calls[0].function.name == func_spec.name
+        ), "Function name mismatch"
+        try:
+            print(f"[cyan]Raw func call response: {choice}[/cyan]")
+            output = json.loads(choice.message.tool_calls[0].function.arguments)
+        except json.JSONDecodeError as e:
+            logger.error(
+                f"Error decoding the function arguments: {choice.message.tool_calls[0].function.arguments}"
+            )
+            raise e
+
+    in_tokens = completion.usage.prompt_tokens
+    out_tokens = completion.usage.completion_tokens
+
+    info = {
+        "system_fingerprint": completion.system_fingerprint,
+        "model": completion.model,
+        "created": completion.created,
+    }
+
+    return output, req_time, in_tokens, out_tokens, info
diff --git a/skills/experiment-bfts-runner/scripts/asv2/treesearch/backend/utils.py b/skills/experiment-bfts-runner/scripts/asv2/treesearch/backend/utils.py
new file mode 100644
index 00000000..60419c7c
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/treesearch/backend/utils.py
@@ -0,0 +1,131 @@
+from dataclasses import dataclass
+
+import jsonschema
+from dataclasses_json import DataClassJsonMixin
+
+PromptType = str | dict | list
+FunctionCallType = dict
+OutputType = str | FunctionCallType
+
+
+import backoff
+import logging
+from typing import Callable
+
+logger = logging.getLogger("ai-scientist")
+
+
+@backoff.on_predicate(
+    wait_gen=backoff.expo,
+    max_value=60,
+    factor=1.5,
+)
+def backoff_create(
+    create_fn: Callable, retry_exceptions: list[Exception], *args, **kwargs
+):
+    try:
+        return create_fn(*args, **kwargs)
+    except retry_exceptions as e:
+        logger.info(f"Backoff exception: {e}")
+        return False
+
+
+def opt_messages_to_list(
+    system_message: str | None, user_message: str | None
+) -> list[dict[str, str]]:
+    messages = []
+    if system_message:
+        messages.append({"role": "system", "content": system_message})
+    if user_message:
+        messages.append({"role": "user", "content": user_message})
+    return messages
+
+
+def compile_prompt_to_md(prompt: PromptType, _header_depth: int = 1) -> str:
+    """Convert a prompt into markdown format"""
+    try:
+        logger.debug(f"compile_prompt_to_md input: type={type(prompt)}")
+        if isinstance(prompt, (list, dict)):
+            logger.debug(f"prompt content: {prompt}")
+
+        if prompt is None:
+            return ""
+
+        if isinstance(prompt, str):
+            return prompt.strip() + "\n"
+
+        if isinstance(prompt, list):
+            # Handle empty list case
+            if not prompt:
+                return ""
+            # Special handling for multi-modal messages
+            if all(isinstance(item, dict) and "type" in item for item in prompt):
+                # For multi-modal messages, just pass through without modification
+                return prompt
+
+            try:
+                result = "\n".join([f"- {s.strip()}" for s in prompt] + ["\n"])
+                return result
+            except Exception as e:
+                logger.error(f"Error processing list items: {e}")
+                logger.error("List contents:")
+                for i, item in enumerate(prompt):
+                    logger.error(f"  Item {i}: type={type(item)}, value={item}")
+                raise
+
+        if isinstance(prompt, dict):
+            # Check if this is a single multi-modal message
+            if "type" in prompt:
+                return prompt
+
+            # Regular dict processing
+            try:
+                out = []
+                header_prefix = "#" * _header_depth
+                for k, v in prompt.items():
+                    logger.debug(f"Processing dict key: {k}")
+                    out.append(f"{header_prefix} {k}\n")
+                    out.append(compile_prompt_to_md(v, _header_depth=_header_depth + 1))
+                return "\n".join(out)
+            except Exception as e:
+                logger.error(f"Error processing dict: {e}")
+                logger.error(f"Dict contents: {prompt}")
+                raise
+
+        raise ValueError(f"Unsupported prompt type: {type(prompt)}")
+
+    except Exception as e:
+        logger.error("Error in compile_prompt_to_md:")
+        logger.error(f"Input type: {type(prompt)}")
+        logger.error(f"Input content: {prompt}")
+        logger.error(f"Error: {str(e)}")
+        raise
+
+
+@dataclass
+class FunctionSpec(DataClassJsonMixin):
+    name: str
+    json_schema: dict  # JSON schema
+    description: str
+
+    def __post_init__(self):
+        # validate the schema
+        jsonschema.Draft7Validator.check_schema(self.json_schema)
+
+    @property
+    def as_openai_tool_dict(self):
+        return {
+            "type": "function",
+            "function": {
+                "name": self.name,
+                "description": self.description,
+                "parameters": self.json_schema,
+            },
+        }
+
+    @property
+    def openai_tool_choice_dict(self):
+        return {
+            "type": "function",
+            "function": {"name": self.name},
+        }
diff --git a/skills/experiment-bfts-runner/scripts/asv2/treesearch/bfts_utils.py b/skills/experiment-bfts-runner/scripts/asv2/treesearch/bfts_utils.py
new file mode 100644
index 00000000..dd44d908
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/treesearch/bfts_utils.py
@@ -0,0 +1,76 @@
+import os
+import os.path as osp
+import shutil
+import yaml
+
+
+def idea_to_markdown(data: dict, output_path: str, load_code: str) -> None:
+    """
+    Convert a dictionary into a markdown file.
+
+    Args:
+        data: Dictionary containing the data to convert
+        output_path: Path where the markdown file will be saved
+        load_code: Path to a code file to include in the markdown
+    """
+    with open(output_path, "w", encoding="utf-8") as f:
+        for key, value in data.items():
+            # Convert key to title format and make it a header
+            header = key.replace("_", " ").title()
+            f.write(f"## {header}\n\n")
+
+            # Handle different value types
+            if isinstance(value, (list, tuple)):
+                for item in value:
+                    f.write(f"- {item}\n")
+                f.write("\n")
+            elif isinstance(value, dict):
+                for sub_key, sub_value in value.items():
+                    f.write(f"### {sub_key}\n")
+                    f.write(f"{sub_value}\n\n")
+            else:
+                f.write(f"{value}\n\n")
+
+        # Add the code to the markdown file
+        if load_code:
+            # Assert that the code file exists before trying to open it
+            assert os.path.exists(load_code), f"Code path at {load_code} must exist if using the 'load_code' flag. This is an optional code prompt that you may choose to include; if not, please do not set 'load_code'."
+            f.write(f"## Code To Potentially Use\n\n")
+            f.write(f"Use the following code as context for your experiments:\n\n")
+            with open(load_code, "r") as code_file:
+                code = code_file.read()
+                f.write(f"```python\n{code}\n```\n\n")
+
+
+def edit_bfts_config_file(config_path: str, idea_dir: str, idea_path: str) -> str:
+    """
+    Edit the bfts_config.yaml file to point to the idea.md file
+
+    Args:
+        config_path: Path to the bfts_config.yaml file
+        idea_dir: Directory where the idea.md file is located
+        idea_path: Path to the idea.md file
+
+    Returns:
+        Path to the edited bfts_config.yaml file
+    """
+    run_config_path = osp.join(idea_dir, "bfts_config.yaml")
+    shutil.copy(config_path, run_config_path)
+    with open(run_config_path, "r") as f:
+        config = yaml.load(f, Loader=yaml.FullLoader)
+    config["desc_file"] = idea_path
+    config["workspace_dir"] = idea_dir
+
+    # make an empty data directory
+    data_dir = osp.join(idea_dir, "data")
+    os.makedirs(data_dir, exist_ok=True)
+    config["data_dir"] = data_dir
+
+    # make an empty log directory
+    log_dir = osp.join(idea_dir, "logs")
+    os.makedirs(log_dir, exist_ok=True)
+    config["log_dir"] = log_dir
+
+    with open(run_config_path, "w") as f:
+        yaml.dump(config, f)
+    return run_config_path
diff --git a/skills/experiment-bfts-runner/scripts/asv2/treesearch/interpreter.py b/skills/experiment-bfts-runner/scripts/asv2/treesearch/interpreter.py
new file mode 100644
index 00000000..0ad57cfa
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/treesearch/interpreter.py
@@ -0,0 +1,313 @@
+"""
+Python interpreter for executing code snippets and capturing their output.
+Supports:
+- captures stdout and stderr
+- captures exceptions and stack traces
+- limits execution time
+"""
+
+import logging
+import os
+import queue
+import signal
+import sys
+import time
+import traceback
+from dataclasses import dataclass
+from multiprocessing import Process, Queue
+from pathlib import Path
+
+import humanize
+from dataclasses_json import DataClassJsonMixin
+
+logger = logging.getLogger("ai-scientist")
+
+
+@dataclass
+class ExecutionResult(DataClassJsonMixin):
+    """
+    Result of executing a code snippet in the interpreter.
+    Contains the output, execution time, and exception information.
+    """
+
+    term_out: list[str]
+    exec_time: float
+    exc_type: str | None
+    exc_info: dict | None = None
+    exc_stack: list[tuple] | None = None
+
+
+def exception_summary(e, working_dir, exec_file_name, format_tb_ipython):
+    """Generates a string that summarizes an exception and its stack trace (either in standard python repl or in IPython format)."""
+    if format_tb_ipython:
+        import IPython.core.ultratb
+
+        tb = IPython.core.ultratb.VerboseTB(tb_offset=1, color_scheme="NoColor")
+        tb_str = str(tb.text(*sys.exc_info()))
+    else:
+        tb_lines = traceback.format_exception(e)
+        # skip parts of stack trace in weflow code
+        tb_str = "".join(
+            [l for l in tb_lines if "treesearch/" not in l and "importlib" not in l]
+        )
+
+    # replace whole path to file with just filename (to remove agent workspace dir)
+    tb_str = tb_str.replace(str(working_dir / exec_file_name), exec_file_name)
+
+    exc_info = {}
+    if hasattr(e, "args"):
+        exc_info["args"] = [str(i) for i in e.args]
+    for att in ["name", "msg", "obj"]:
+        if hasattr(e, att):
+            exc_info[att] = str(getattr(e, att))
+
+    tb = traceback.extract_tb(e.__traceback__)
+    exc_stack = [(t.filename, t.lineno, t.name, t.line) for t in tb]
+
+    return tb_str, e.__class__.__name__, exc_info, exc_stack
+
+
+class RedirectQueue:
+    def __init__(self, queue):
+        self.queue = queue
+
+    def write(self, msg):
+        self.queue.put(msg)
+
+    def flush(self):
+        pass
+
+
+class Interpreter:
+    def __init__(
+        self,
+        working_dir: Path | str,
+        timeout: int = 3600,
+        format_tb_ipython: bool = False,
+        agent_file_name: str = "runfile.py",
+        env_vars: dict[str, str] = {},
+    ):
+        """
+        Simulates a standalone Python REPL with an execution time limit.
+
+        Args:
+            working_dir (Path | str): working directory of the agent
+            timeout (int, optional): Timeout for each code execution step. Defaults to 3600.
+            format_tb_ipython (bool, optional): Whether to use IPython or default python REPL formatting for exceptions. Defaults to False.
+            agent_file_name (str, optional): The name for the agent's code file. Defaults to "runfile.py".
+            env_vars (dict[str, str], optional): Environment variables to set in the child process. Defaults to {}.
+        """
+        # this really needs to be a path, otherwise causes issues that don't raise exc
+        self.working_dir = Path(working_dir).resolve()
+        assert (
+            self.working_dir.exists()
+        ), f"Working directory {self.working_dir} does not exist"
+        self.timeout = timeout
+        self.format_tb_ipython = format_tb_ipython
+        self.agent_file_name = agent_file_name
+        self.process: Process = None  # type: ignore
+        self.env_vars = env_vars
+
+    def child_proc_setup(self, result_outq: Queue) -> None:
+        # disable all warnings (before importing anything)
+        import shutup
+
+        shutup.mute_warnings()
+
+        for key, value in self.env_vars.items():
+            os.environ[key] = value
+
+        os.chdir(str(self.working_dir))
+
+        # this seems to only  benecessary because we're exec'ing code from a string,
+        # a .py file should be able to import modules from the cwd anyway
+        sys.path.append(str(self.working_dir))
+
+        # capture stdout and stderr
+        # trunk-ignore(mypy/assignment)
+        sys.stdout = sys.stderr = RedirectQueue(result_outq)
+
+    def _run_session(
+        self, code_inq: Queue, result_outq: Queue, event_outq: Queue
+    ) -> None:
+        self.child_proc_setup(result_outq)
+
+        global_scope: dict = {}
+        while True:
+            code = code_inq.get()
+            os.chdir(str(self.working_dir))
+            with open(self.agent_file_name, "w") as f:
+                f.write(code)
+
+            event_outq.put(("state:ready",))
+            try:
+                exec(compile(code, self.agent_file_name, "exec"), global_scope)
+            except BaseException as e:
+                tb_str, e_cls_name, exc_info, exc_stack = exception_summary(
+                    e,
+                    self.working_dir,
+                    self.agent_file_name,
+                    self.format_tb_ipython,
+                )
+                result_outq.put(tb_str)
+                if e_cls_name == "KeyboardInterrupt":
+                    e_cls_name = "TimeoutError"
+
+                event_outq.put(("state:finished", e_cls_name, exc_info, exc_stack))
+            else:
+                event_outq.put(("state:finished", None, None, None))
+
+            # put EOF marker to indicate that we're done
+            result_outq.put("<|EOF|>")
+
+    def create_process(self) -> None:
+        # we use three queues to communicate with the child process:
+        # - code_inq: send code to child to execute
+        # - result_outq: receive stdout/stderr from child
+        # - event_outq: receive events from child (e.g. state:ready, state:finished)
+        # trunk-ignore(mypy/var-annotated)
+        self.code_inq, self.result_outq, self.event_outq = Queue(), Queue(), Queue()
+        self.process = Process(
+            target=self._run_session,
+            args=(self.code_inq, self.result_outq, self.event_outq),
+        )
+        self.process.start()
+
+    def _drain_queues(self):
+        """Quickly drain all in-flight messages to prevent blocking."""
+        while not self.result_outq.empty():
+            try:
+                self.result_outq.get_nowait()
+            except Exception:
+                break
+
+        while not self.event_outq.empty():
+            try:
+                self.event_outq.get_nowait()
+            except Exception:
+                break
+
+        while not self.code_inq.empty():
+            try:
+                self.code_inq.get_nowait()
+            except Exception:
+                break
+
+    def cleanup_session(self):
+        if self.process is None:
+            return
+        # give the child process a chance to terminate gracefully
+        self.process.terminate()
+        self._drain_queues()
+        self.process.join(timeout=2)
+        # kill the child process if it's still alive
+        if self.process.exitcode is None:
+            logger.warning("Child process failed to terminate gracefully, killing it..")
+            self.process.kill()
+            self._drain_queues()
+            self.process.join(timeout=2)
+        # don't wait for gc, clean up immediately
+        self.process.close()
+        self.process = None  # type: ignore
+
+    def run(self, code: str, reset_session=True) -> ExecutionResult:
+        """
+        Execute the provided Python command in a separate process and return its output.
+
+        Parameters:
+            code (str): Python code to execute.
+            reset_session (bool, optional): Whether to reset the interpreter session before executing the code. Defaults to True.
+
+        Returns:
+            ExecutionResult: Object containing the output and metadata of the code execution.
+
+        """
+
+        logger.debug(f"REPL is executing code (reset_session={reset_session})")
+
+        if reset_session:
+            if self.process is not None:
+                # terminate and clean up previous process
+                self.cleanup_session()
+            self.create_process()
+        else:
+            # reset_session needs to be True on first exec
+            assert self.process is not None
+
+        assert self.process.is_alive()
+
+        self.code_inq.put(code)
+
+        # wait for child to actually start execution (we don't want interrupt child setup)
+        try:
+            state = self.event_outq.get(timeout=10)
+        except queue.Empty:
+            msg = "REPL child process failed to start execution"
+            logger.critical(msg)
+            while not self.result_outq.empty():
+                logger.error(f"REPL output queue dump: {self.result_outq.get()}")
+            raise RuntimeError(msg) from None
+        assert state[0] == "state:ready", state
+        start_time = time.time()
+
+        # this flag indicates that the child ahs exceeded the time limit and an interrupt was sent
+        # if the child process dies without this flag being set, it's an unexpected termination
+        child_in_overtime = False
+
+        while True:
+            try:
+                # check if the child is done
+                state = self.event_outq.get(timeout=1)  # wait for state:finished
+                assert state[0] == "state:finished", state
+                exec_time = time.time() - start_time
+                break
+            except queue.Empty:
+                # we haven't heard back from the child -> check if it's still alive (assuming overtime interrupt wasn't sent yet)
+                if not child_in_overtime and not self.process.is_alive():
+                    msg = "REPL child process died unexpectedly"
+                    logger.critical(msg)
+                    while not self.result_outq.empty():
+                        logger.error(
+                            f"REPL output queue dump: {self.result_outq.get()}"
+                        )
+                    raise RuntimeError(msg) from None
+
+                # child is alive and still executing -> check if we should sigint..
+                if self.timeout is None:
+                    continue
+                running_time = time.time() - start_time
+                if running_time > self.timeout:
+                    # [TODO] handle this in a better way
+                    assert reset_session, "Timeout ocurred in interactive session"
+
+                    # send interrupt to child
+                    os.kill(self.process.pid, signal.SIGINT)  # type: ignore
+                    child_in_overtime = True
+                    # terminate if we're overtime by more than a minute
+                    if running_time > self.timeout + 60:
+                        logger.warning("Child failed to terminate, killing it..")
+                        self.cleanup_session()
+
+                        state = (None, "TimeoutError", {}, [])
+                        exec_time = self.timeout
+                        break
+
+        output: list[str] = []
+        # read all stdout/stderr from child up to the EOF marker
+        # waiting until the queue is empty is not enough since
+        # the feeder thread in child might still be adding to the queue
+        while not self.result_outq.empty() or not output or output[-1] != "<|EOF|>":
+            output.append(self.result_outq.get())
+        output.pop()  # remove the EOF marker
+
+        e_cls_name, exc_info, exc_stack = state[1:]
+
+        if e_cls_name == "TimeoutError":
+            output.append(
+                f"TimeoutError: Execution exceeded the time limit of {humanize.naturaldelta(self.timeout)}"
+            )
+        else:
+            output.append(
+                f"Execution time: {humanize.naturaldelta(exec_time)} seconds (time limit is {humanize.naturaldelta(self.timeout)})."
+            )
+        return ExecutionResult(output, exec_time, e_cls_name, exc_info, exc_stack)
diff --git a/skills/experiment-bfts-runner/scripts/asv2/treesearch/journal.py b/skills/experiment-bfts-runner/scripts/asv2/treesearch/journal.py
new file mode 100644
index 00000000..a2a338a7
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/treesearch/journal.py
@@ -0,0 +1,612 @@
+from __future__ import annotations
+import time
+import uuid
+from dataclasses import dataclass, field
+from typing import Literal, Optional, Any
+import copy
+import os
+import json
+
+from dataclasses_json import DataClassJsonMixin
+from .interpreter import ExecutionResult
+from .utils.metric import MetricValue, WorstMetricValue
+from .utils.response import trim_long_string
+from .backend import FunctionSpec, query
+
+from rich import print
+
+import logging
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+node_selection_spec = FunctionSpec(
+    name="select_best_implementation",
+    description="Select the best implementation based on comprehensive analysis",
+    json_schema={
+        "type": "object",
+        "properties": {
+            "selected_id": {
+                "type": "string",
+                "description": "ID of the selected best implementation",
+            },
+            "reasoning": {
+                "type": "string",
+                "description": "Detailed explanation of why this implementation was chosen",
+            },
+        },
+        "required": ["selected_id", "reasoning"],
+    },
+)
+
+
+@dataclass(eq=False)
+class Node(DataClassJsonMixin):
+    """A single node in the solution tree. Contains code, execution results, and evaluation information."""
+
+    # ---- code & plan ----
+    plan: str = field(default="", kw_only=True)  # type: ignore
+    overall_plan: str = field(default="", kw_only=True)  # type: ignore
+    code: str = field(default="", kw_only=True)  # type: ignore
+    plot_code: str = field(default=None, kw_only=True)  # type: ignore
+    plot_plan: str = field(default=None, kw_only=True)  # type: ignore
+
+    # ---- general attrs ----
+    step: int = field(default=None, kw_only=True)  # type: ignore
+    id: str = field(default_factory=lambda: uuid.uuid4().hex, kw_only=True)
+    ctime: float = field(default_factory=lambda: time.time(), kw_only=True)
+    parent: Optional["Node"] = field(default=None, kw_only=True)
+    children: set["Node"] = field(default_factory=set, kw_only=True)
+    exp_results_dir: str = field(default=None, kw_only=True)  # type: ignore
+
+    # ---- execution info ----
+    _term_out: list[str] = field(default=None, kw_only=True)  # type: ignore
+    exec_time: float = field(default=None, kw_only=True)  # type: ignore
+    exc_type: str | None = field(default=None, kw_only=True)
+    exc_info: dict | None = field(default=None, kw_only=True)
+    exc_stack: list[tuple] | None = field(default=None, kw_only=True)
+
+    # ---- parsing info ----
+    parse_metrics_plan: str = field(default="", kw_only=True)
+    parse_metrics_code: str = field(default="", kw_only=True)
+    # parse_exec_result: ExecutionResult = field(default=None, kw_only=True)
+    parse_term_out: list[str] = field(default=None, kw_only=True)
+    parse_exc_type: str | None = field(default=None, kw_only=True)
+    parse_exc_info: dict | None = field(default=None, kw_only=True)
+    parse_exc_stack: list[tuple] | None = field(default=None, kw_only=True)
+
+    # ---- plot execution info ----
+    plot_term_out: list[str] = field(default=None, kw_only=True)  # type: ignore
+    plot_exec_time: float = field(default=None, kw_only=True)  # type: ignore
+    plot_exc_type: str | None = field(default=None, kw_only=True)
+    plot_exc_info: dict | None = field(default=None, kw_only=True)
+    plot_exc_stack: list[tuple] | None = field(default=None, kw_only=True)
+
+    # ---- evaluation ----
+    # post-execution result analysis (findings/feedback)
+    analysis: str = field(default=None, kw_only=True)  # type: ignore
+    metric: MetricValue = field(default=None, kw_only=True)  # type: ignore
+    # whether the agent decided that the code is buggy
+    # -> always True if exc_type is not None or no valid metric
+    is_buggy: bool = field(default=None, kw_only=True)  # type: ignore
+    is_buggy_plots: bool = field(default=None, kw_only=True)
+
+    # ---- plotting ----
+    plot_data: dict = field(default_factory=dict, kw_only=True)
+    plots_generated: bool = field(default=False, kw_only=True)
+    plots: List[str] = field(default_factory=list)  # Relative paths for visualization
+    plot_paths: List[str] = field(
+        default_factory=list
+    )  # Absolute paths for programmatic access
+
+    # ---- VLM feedback ----
+    plot_analyses: List[str] = field(default_factory=list)
+    vlm_feedback_summary: List[str] = field(default_factory=list)
+    datasets_successfully_tested: List[str] = field(default_factory=list)
+
+    # ---- execution time feedback ----
+    exec_time_feedback: str = field(default="", kw_only=True)
+
+    # ---- ablation study ----
+    ablation_name: str = field(default=None, kw_only=True)
+
+    # ---- hyperparam tuning ----
+    hyperparam_name: str = field(default=None, kw_only=True)
+
+    # ---- seed node ----
+    is_seed_node: bool = field(default=False, kw_only=True)
+    is_seed_agg_node: bool = field(default=False, kw_only=True)
+
+    def __post_init__(self) -> None:
+        # Ensure children is a set even if initialized with a list
+        if isinstance(self.children, list):
+            self.children = set(self.children)
+        # Only try to add to parent's children if parent is a Node object
+        if self.parent is not None and not isinstance(self.parent, str):
+            self.parent.children.add(self)
+
+    def __deepcopy__(self, memo):
+        # Create a new instance with copied attributes
+        cls = self.__class__
+        result = cls.__new__(cls)
+        memo[id(self)] = result
+
+        # Copy all attributes except parent and children to avoid circular references
+        for k, v in self.__dict__.items():
+            if k not in ("parent", "children"):
+                setattr(result, k, copy.deepcopy(v, memo))
+
+        # Handle parent and children separately
+        result.parent = self.parent  # Keep the same parent reference
+        result.children = set()  # Start with empty children set
+
+        return result
+
+    def __getstate__(self):
+        """Return state for pickling"""
+        state = self.__dict__.copy()
+        # Ensure id is included in the state
+        if hasattr(self, "id"):
+            state["id"] = self.id
+        return state
+
+    def __setstate__(self, state):
+        """Set state during unpickling"""
+        # Ensure all required attributes are present
+        self.__dict__.update(state)
+
+    @property
+    def stage_name(self) -> Literal["draft", "debug", "improve"]:
+        """
+        Return the stage of the node:
+        - "stage" if the node is an initial solution draft
+        - "debug" if the node is the result of a debugging step
+        - "improve" if the node is the result of an improvement step
+        """
+        if self.parent is None:
+            return "draft"
+        return "debug" if self.parent.is_buggy else "improve"
+
+    def absorb_exec_result(self, exec_result: ExecutionResult):
+        """Absorb the result of executing the code from this node."""
+        self._term_out = exec_result.term_out
+        self.exec_time = exec_result.exec_time
+        self.exc_type = exec_result.exc_type
+        self.exc_info = exec_result.exc_info
+        self.exc_stack = exec_result.exc_stack
+
+    def absorb_plot_exec_result(self, plot_exec_result: ExecutionResult):
+        """Absorb the result of executing the plotting code from this node."""
+        self.plot_term_out = plot_exec_result.term_out
+        self.plot_exec_time = plot_exec_result.exec_time
+        self.plot_exc_type = plot_exec_result.exc_type
+        self.plot_exc_info = plot_exec_result.exc_info
+        self.plot_exc_stack = plot_exec_result.exc_stack
+
+    @property
+    def term_out(self) -> str:
+        """Get the terminal output of the code execution (after truncating it)."""
+        return trim_long_string("".join(self._term_out))
+
+    @property
+    def is_leaf(self) -> bool:
+        """Check if the node is a leaf node in the solution tree."""
+        return not self.children
+
+    def __eq__(self, other):
+        return isinstance(other, Node) and self.id == other.id
+
+    def __hash__(self):
+        return hash(self.id)
+
+    @property
+    def debug_depth(self) -> int:
+        """
+        Length of the current debug path
+        - 0 if the node is not a debug node (parent is not buggy)
+        - 1 if the parent is buggy but the skip parent isn't
+        - n if there were n consecutive debugging steps
+        """
+        if self.stage_name != "debug":
+            return 0
+        return self.parent.debug_depth + 1  # type: ignore
+
+    def to_dict(self) -> Dict:
+        """Convert node to dictionary for serialization"""
+        return {
+            "code": self.code,
+            "plan": self.plan,
+            "overall_plan": (
+                self.overall_plan if hasattr(self, "overall_plan") else None
+            ),
+            "plot_code": self.plot_code,
+            "plot_plan": self.plot_plan,
+            "step": self.step,
+            "id": self.id,
+            "ctime": self.ctime,
+            "_term_out": self._term_out,
+            "parse_metrics_plan": self.parse_metrics_plan,
+            "parse_metrics_code": self.parse_metrics_code,
+            "parse_term_out": self.parse_term_out,
+            "parse_exc_type": self.parse_exc_type,
+            "parse_exc_info": self.parse_exc_info,
+            "parse_exc_stack": self.parse_exc_stack,
+            "exec_time": self.exec_time,
+            "exc_type": self.exc_type,
+            "exc_info": self.exc_info,
+            "exc_stack": self.exc_stack,
+            "analysis": self.analysis,
+            "exp_results_dir": (
+                str(Path(self.exp_results_dir).resolve().relative_to(os.getcwd()))
+                if self.exp_results_dir
+                else None
+            ),
+            "metric": {
+                "value": self.metric.value if self.metric else None,
+                "maximize": self.metric.maximize if self.metric else None,
+                "name": self.metric.name if hasattr(self.metric, "name") else None,
+                "description": (
+                    self.metric.description
+                    if hasattr(self.metric, "description")
+                    else None
+                ),
+            },
+            "is_buggy": self.is_buggy,
+            "is_buggy_plots": self.is_buggy_plots,
+            "parent_id": None if self.parent is None else self.parent.id,
+            "children": [child.id for child in self.children] if self.children else [],
+            "plot_data": self.plot_data,
+            "plots_generated": self.plots_generated,
+            "plots": self.plots,
+            "plot_paths": (
+                [
+                    str(Path(p).resolve().relative_to(os.getcwd()))
+                    for p in self.plot_paths
+                ]
+                if self.plot_paths
+                else []
+            ),
+            "plot_analyses": [
+                {
+                    **analysis,
+                    "plot_path": (
+                        str(
+                            Path(analysis["plot_path"])
+                            .resolve()
+                            .relative_to(os.getcwd())
+                        )
+                        if analysis.get("plot_path")
+                        else None
+                    ),
+                }
+                for analysis in self.plot_analyses
+            ],
+            "vlm_feedback_summary": self.vlm_feedback_summary,
+            "datasets_successfully_tested": self.datasets_successfully_tested,
+            "ablation_name": self.ablation_name,
+            "hyperparam_name": self.hyperparam_name,
+            "is_seed_node": self.is_seed_node,
+            "is_seed_agg_node": self.is_seed_agg_node,
+            "exec_time_feedback": self.exec_time_feedback,
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict, journal: Optional[Journal] = None) -> "Node":
+        """Create a Node from a dictionary, optionally linking to journal for relationships"""
+        # Remove relationship IDs from constructor data
+        parent_id = data.pop("parent_id", None)
+        children = data.pop("children", [])
+
+        # Handle metric conversion
+        metric_data = data.pop("metric", None)
+        if metric_data:
+            if isinstance(metric_data, dict):
+                data["metric"] = MetricValue(
+                    value=metric_data["value"],
+                    maximize=metric_data["maximize"],
+                    name=metric_data["name"],
+                    description=metric_data["description"],
+                )
+            else:
+                # Handle legacy format or None
+                data["metric"] = (
+                    WorstMetricValue()
+                    if data.get("is_buggy")
+                    else MetricValue(metric_data)
+                )
+
+        # Create node instance
+        node = cls(**data)
+
+        # If journal is provided, restore relationships
+        if journal is not None and parent_id:
+            parent = journal.get_node_by_id(parent_id)
+            if parent:
+                node.parent = parent
+                parent.children.add(node)
+
+        return node
+
+
+@dataclass
+class InteractiveSession(DataClassJsonMixin):
+    """
+    A collection of nodes for an interaction session
+    (when the agent interacts with a Jupyter notebook-like interface).
+    """
+
+    nodes: list[Node] = field(default_factory=list)
+    completed: bool = False
+
+    def append(self, node: Node) -> None:
+        node.step = len(self.nodes)
+        self.nodes.append(node)
+
+    def generate_nb_trace(self, include_prompt, comment_headers=True) -> str:
+        """Generate a trace of the interactive session in IPython format."""
+        trace = []
+        header_prefix = "## " if comment_headers else ""
+        for n in self.nodes:
+            trace.append(f"\n{header_prefix}In [{n.step+1}]:\n")
+            trace.append(n.code)
+            trace.append(f"\n{header_prefix}Out [{n.step+1}]:\n")
+            trace.append(n.term_out)
+
+        if include_prompt and self.nodes:
+            trace.append(f"\n{header_prefix}In [{self.nodes[-1].step+2}]:\n")
+
+        return "\n".join(trace).strip()
+
+
+@dataclass
+class Journal:
+    """A collection of nodes representing the solution tree."""
+
+    nodes: list[Node] = field(default_factory=list)
+
+    def __getitem__(self, idx: int) -> Node:
+        return self.nodes[idx]
+
+    def __len__(self) -> int:
+        """Return the number of nodes in the journal."""
+        return len(self.nodes)
+
+    def append(self, node: Node) -> None:
+        """Append a new node to the journal."""
+        node.step = len(self.nodes)
+        self.nodes.append(node)
+
+    @property
+    def draft_nodes(self) -> list[Node]:
+        """Return a list of nodes representing intial coding drafts"""
+        return [n for n in self.nodes if n.parent is None]
+
+    @property
+    def buggy_nodes(self) -> list[Node]:
+        """Return a list of nodes that are considered buggy by the agent."""
+        return [n for n in self.nodes if n.is_buggy]
+
+    @property
+    def good_nodes(self) -> list[Node]:
+        """Return a list of nodes that are not considered buggy by the agent."""
+        list_of_nodes = [
+            [
+                n.step,
+                n.parent.step if n.parent else None,
+                n.id,
+                n.is_buggy,
+                n.is_buggy_plots,
+            ]
+            for n in self.nodes
+        ]
+        print(
+            f"[purple]all nodes ID and is_buggy/is_buggy_plots flags: {list_of_nodes}[/purple]"
+        )
+        return [
+            n for n in self.nodes if n.is_buggy is False and n.is_buggy_plots is False
+        ]
+
+    def get_node_by_id(self, node_id: str) -> Optional[Node]:
+        """Get a node by its ID."""
+        for node in self.nodes:
+            if node.id == node_id:
+                return node
+        return None
+
+    def get_metric_history(self) -> list[MetricValue]:
+        """Return a list of all metric values in the journal."""
+        return [n.metric for n in self.nodes]
+
+    def get_best_node(self, only_good=True, use_val_metric_only=False, cfg=None) -> None | Node:
+        """Return the best solution found so far."""
+        if only_good:
+            nodes = self.good_nodes
+            if not nodes:
+                return None
+        else:
+            nodes = self.nodes
+
+        if use_val_metric_only:
+            return max(nodes, key=lambda n: n.metric)
+
+        if len(nodes) == 1:
+            return nodes[0]
+
+        # Create evaluation prompt for LLM
+        prompt = {
+            "Introduction": (
+                "You are an experienced AI researcher evaluating different implementations "
+                "of an experiment to select the best one. You should consider all aspects "
+                "including performance metrics, training dynamics, generated plots quality."
+            ),
+            "Task": (
+                "Select the best implementation from the candidates below, considering all available evidence."
+                "Avoid relying too heavily on the validation loss alone, because "
+                "it may not be directly comparable across different objective functions or training details. "
+                "If there are multiple validation losses (e.g., when evaluating multiple datasets), "
+                "consider all of them and select the implementation that performs best overall."
+            ),
+            "Candidates": "",
+        }
+        # Gather info about each node
+        for node in nodes:
+            if not node.is_seed_node:
+                candidate_info = (
+                    f"ID: {node.id}\n" f"Metric: {str(node.metric)}\n"
+                    if node.metric
+                    else (
+                        "N/A\n" f"Training Analysis: {node.analysis}\n"
+                        if hasattr(node, "analysis")
+                        else (
+                            "N/A\n" f"VLM Feedback: {node.vlm_feedback_summary}\n"
+                            if hasattr(node, "vlm_feedback_summary")
+                            else "N/A\n"
+                        )
+                    )
+                )
+                prompt["Candidates"] += candidate_info
+
+        try:
+            if cfg is None or cfg.agent.get("select_node", None) is None:
+                model = "gpt-4o"
+                temperature = 0.3
+            else:
+                model = cfg.agent.select_node.model
+                temperature = cfg.agent.select_node.temp
+            selection = query(
+                system_message=prompt,
+                user_message=None,
+                func_spec=node_selection_spec,
+                model=model,
+                temperature=temperature
+            )
+
+            # Find and return the selected node
+            selected_node = next(
+                (node for node in nodes if str(node.id) == selection["selected_id"]),
+                None,
+            )
+            if selected_node:
+                logger.warning(
+                    f"Selected node {selected_node.id} as best implementation"
+                )
+                logger.warning(f"Reasoning: {selection['reasoning']}")
+                return selected_node
+            else:
+                logger.warning("Falling back to metric-based selection")
+                return max(nodes, key=lambda n: n.metric)
+
+        except Exception as e:
+            logger.error(f"Error in LLM selection process: {e}")
+            logger.warning("Falling back to metric-based selection")
+            return max(nodes, key=lambda n: n.metric)
+
+    def generate_summary(self, include_code: bool = False, **model_kwargs) -> str:
+        """Generate a summary of the research progress using LLM, including both successes and failures."""
+        if not self.nodes:
+            return "No experiments conducted yet."
+
+        prompt = {
+            "Introduction": (
+                "You are an AI researcher summarizing experimental progress. "
+                "Please analyze both successful and failed experiments to provide insights "
+                "for future improvements."
+            ),
+            "Successful Experiments": "",
+            "Failed Experiments": "",
+        }
+
+        for node in self.good_nodes:
+            exp_info = f"Design: {node.plan}\n  "
+            exp_info += f"Results: {node.analysis}\n"
+            exp_info += f"Metric: {str(node.metric)}\n"
+            if include_code:
+                exp_info += f"Code: {node.code}\n"
+            prompt["Successful Experiments"] += exp_info
+
+        for node in self.buggy_nodes:
+            failure_info = f"Design: {node.plan}\n  "
+            failure_info += f"Error Analysis: {node.analysis}\n"
+            failure_info += f"Error Type: {node.exc_type if hasattr(node, 'exc_type') else 'Unknown'}\n"
+            failure_info += f"Debug Depth: {node.debug_depth}\n"
+            if include_code:
+                failure_info += f"Code: {node.code}\n"
+            prompt["Failed Experiments"] += failure_info
+
+        summary = query(
+            system_message=prompt,
+            user_message=(
+                "Please provide a comprehensive summary of the experimental progress that includes:\n"
+                "1. Key patterns of success across working experiments\n"
+                "2. Common failure patterns and pitfalls to avoid\n"
+                "3. Specific recommendations for future experiments based on both successes and failures"
+            ),
+            model=model_kwargs.get("model", "gpt-4o"),
+            temperature=model_kwargs.get("temp", 0.3)
+        )
+
+        return summary
+
+    def generate_summary_old(self, include_code: bool = False) -> str:
+        summary = []
+        for n in self.good_nodes:
+            summary_part = f"Design: {n.plan}\n"
+            if include_code:
+                summary_part += f"Code: {n.code}\n"
+            summary_part += f"Results: {n.analysis}\n"
+            summary_part += f"Validation Metric: {n.metric.value}\n"
+            summary.append(summary_part)
+        return "\n-------------------------------\n".join(summary)
+
+    def to_dict(self):
+        """Convert journal to a JSON-serializable dictionary"""
+        return {"nodes": [node.to_dict() for node in self.nodes]}
+
+    def save_experiment_notes(self, workspace_dir: str, stage_name: str, cfg: Any) -> None:
+        """Save experimental notes and summaries to files"""
+        notes_dir = os.path.join(workspace_dir, "experiment_notes")
+        os.makedirs(notes_dir, exist_ok=True)
+
+        # Get all node summaries once
+        node_summaries = []
+        for node in self.nodes:
+            if hasattr(node, "_agent"):
+                summary = node._agent._generate_node_summary(node)
+                node_summaries.append(
+                    {
+                        "node_id": node.id,
+                        "metric": str(node.metric) if node.metric else "Failed",
+                        "summary": summary,
+                    }
+                )
+                # Save individual node summary
+                with open(
+                    os.path.join(
+                        notes_dir, f"{stage_name}_node_{node.id}_summary.json"
+                    ),
+                    "w",
+                ) as f:
+                    json.dump(summary, f, indent=2)
+
+        summary_prompt = {
+            "Introduction": "Synthesize the experimental findings from this stage",
+            "Node Summaries": node_summaries,
+            "Best Node": (
+                {
+                    "id": self.get_best_node().id,
+                    "metric": str(self.get_best_node(cfg=cfg).metric),
+                }
+                if self.get_best_node(cfg=cfg)
+                else None
+            ),
+        }
+
+        stage_summary = query(
+            system_message=summary_prompt,
+            user_message="Generate a comprehensive summary of the experimental findings in this stage",
+            model=cfg.agent.summary.model if cfg.agent.get("summary", None) else "gpt-4o",
+            temperature=cfg.agent.summary.temp if cfg.agent.get("summary", None) else 0.3
+        )
+
+        with open(os.path.join(notes_dir, f"{stage_name}_summary.txt"), "w") as f:
+            f.write(stage_summary)
diff --git a/skills/experiment-bfts-runner/scripts/asv2/treesearch/journal2report.py b/skills/experiment-bfts-runner/scripts/asv2/treesearch/journal2report.py
new file mode 100644
index 00000000..3dd47e9a
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/treesearch/journal2report.py
@@ -0,0 +1,31 @@
+from .backend import query
+from .journal import Journal
+from .utils.config import StageConfig
+
+
+def journal2report(journal: Journal, task_desc: dict, rcfg: StageConfig):
+    """
+    Generate a report from a journal, the report will be in markdown format.
+    """
+    report_input = journal.generate_summary(include_code=True)
+    system_prompt_dict = {
+        "Role": "You are a research assistant that always uses concise language.",
+        "Goal": "The goal is to write a technical report summarising the empirical findings and technical decisions.",
+        "Input": "You are given a raw research journal with list of design attempts and their outcomes, and a research idea description.",
+        "Output": [
+            "Your output should be a single markdown document.",
+            "Your report should have the following sections: Introduction, Preprocessing, Methods, Results Discussion, Future Work",
+            "You can include subsections if needed.",
+        ],
+    }
+    context_prompt = (
+        f"Here is the research journal of the agent: <journal>{report_input}<\\journal>, "
+        f"and the research idea description is: <research_proposal>{task_desc}<\\research_proposal>."
+    )
+    return query(
+        system_message=system_prompt_dict,
+        user_message=context_prompt,
+        model=rcfg.model,
+        temperature=rcfg.temp,
+        max_tokens=4096,
+    )
diff --git a/skills/experiment-bfts-runner/scripts/asv2/treesearch/log_summarization.py b/skills/experiment-bfts-runner/scripts/asv2/treesearch/log_summarization.py
new file mode 100644
index 00000000..77febe30
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/treesearch/log_summarization.py
@@ -0,0 +1,452 @@
+import json
+import os
+import sys
+
+from .journal import Node, Journal
+
+parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
+sys.path.insert(0, parent_dir)
+from asv2.llm import get_response_from_llm, extract_json_between_markers
+from asv2.treesearch.backend import get_ai_client
+
+
+report_summarizer_sys_msg = """You are an expert machine learning researcher.
+You are given multiple experiment logs, each representing a node in a stage of exploring scientific ideas and implementations.
+Your task is to aggregate these logs and provide scientifically insightful information.
+
+Important instructions:
+- Do NOT hallucinate or fabricate information that is not present in the logs.
+- Do NOT introduce errors when repeating information from the logs.
+- Identify notable insights or differences across the nodes without repeating the same information.
+"""
+
+output_format_control = """Respond in the following format:
+
+THOUGHT:
+<THOUGHT>
+
+JSON:
+```json
+<JSON>
+```
+
+In <THOUGHT>, thoroughly reason as an expert researcher. First, reason about each node, and then reason carefully by combining all the information. It is okay to be very detailed.
+
+In <JSON>, provide the review in JSON format with the following fields in exactly this order:
+- "Experiment_description": a string describing the conducted experiments
+- "Significance": a string explaining why these experiments are important and what impact their findings might have
+- "Description": a string describing the methods, steps taken, and any pertinent context needed to understand the experiments
+- "List_of_included_plots": a list of plots that should be included. Each entry should include:
+  • "path" (the plot path)
+  • "description" (its original description)
+  • "analysis" (your analysis of its scientific insights)
+- "Key_numerical_results": a list of all important numerical results. Be selective about results that contribute to scientific insights. Each entry should include:
+  • "result" (float number)
+  • "description" (your short description of the result)
+  • "analysis" (your analysis of its scientific insights)
+
+Ensure the JSON is valid and properly formatted, as it will be automatically parsed."""
+
+report_summarizer_prompt = (
+    """You are given multiple experiment logs from different "nodes". Each node represents attempts and experiments exploring various scientific ideas.
+
+One key point is that these nodes collectively illustrate a stage of testing different methods or approaches. The crucial task is to identify the scientific insights gleaned from this stage. For example, if one node tries method A and another node tries method B, you should compare any observed differences in performance or outcomes. Summarize both experiments in "Experiment_description", explain the processes in "Description", and place any key numerical findings (such as accuracy metrics, loss values, or runtime comparisons) in "Key_numerical_results."
+
+Be concise and avoid repeating the same information from different nodes. You are encouraged to be thorough, but you do not need to include information from every node. Reason carefully about which results from which nodes are scientifically insightful.
+
+The name of this stage of the experiment: {stage_name}
+
+Here are the experiment logs of the nodes:
+
+{node_infos}
+"""
+    + output_format_control
+)
+
+stage_aggregate_prompt = """You are given:
+
+1) The summary of all previous experiment stages:
+{prev_summary}
+
+2) The name of the current experiment stage:
+{stage_name}
+
+3) The summary of the current stage:
+{current_summary}
+
+
+Your task is to produce an **updated comprehensive summary** of all experiment stages, including the newly introduced results from the current stage.
+
+**Key Requirements:**
+1. **No Loss of Critical Information**
+   - Preserve valuable insights from the summary of all previous experiment stages. Do not remove or alter crucial texts.
+   - Absolutely no hallucinations: if something does not appear in the logs or summaries, do not invent it. If something appears in the previous summary, do not make any mistakes when repeating it.
+2. **Merge New Stage Data**
+   - Integrate relevant results from the current stage into the existing summary.
+   - Identify any overlap or repetition between new and old content, and remove only that which is clearly redundant or no longer scientifically insightful.
+   - Be very careful if you want to remove or shorten the old content. By default, you can keep most of it and append new text.
+   - Highlight how new findings connect to or differ from previous findings.
+3. **Numerical Results and Visuals**
+   - Carefully maintain the most insightful plots, figures, and numerical results.
+   - Do not delete crucial quantitative findings or meaningful visual references.
+4. **Length and Format**
+   - The final summary will likely be **very long**. That is acceptable.
+   - Present the updated summary in a format consistent with the style of the previous summaries (e.g., same section headings or structure).
+
+Respond in the following format:
+
+THOUGHT:
+<THOUGHT>
+
+JSON:
+```json
+<JSON>
+```
+Ensure the JSON is valid and properly formatted, as it will be automatically parsed.
+"""
+
+
+def get_nodes_infos(nodes):
+    node_infos = ""
+    for n in nodes:
+        node_info = f"Node ID: {n.id}\n"
+        node_info += (
+            f"Plan: {n.overall_plan}\n"
+            if hasattr(n, "overall_plan")
+            else "Plan: Not available\n"
+        )
+        node_info += (
+            f"Analysis: {n.analysis}\n"
+            if hasattr(n, "analysis")
+            else "Analysis: Not available\n"
+        )
+        node_info += (
+            f"Numerical Results: {n.metric}\n"
+            if hasattr(n, "metric")
+            else "Numerical Results: Not available\n"
+        )
+        node_info += "Plot Analyses:\n"
+        if hasattr(n, "plot_analyses") and n.plot_analyses:
+            for plot in n.plot_analyses:
+                node_info += f"- Plot Path: {plot.get('plot_path', 'Not available')}, Description: {plot.get('analysis', 'Not available')}\n"
+        else:
+            node_info += "No plot analyses available\n"
+        node_infos += node_info + "\n"
+    return node_infos
+
+
+def get_summarizer_prompt(journal, stage_name):
+    good_leaf_nodes = [n for n in journal.good_nodes if n.is_leaf]
+    if not good_leaf_nodes:
+        print("NO GOOD LEAF NODES!!!")
+        good_leaf_nodes = [n for n in journal.good_nodes]
+    node_infos = get_nodes_infos(good_leaf_nodes)
+    return report_summarizer_sys_msg, report_summarizer_prompt.format(
+        node_infos=node_infos, stage_name=stage_name
+    )
+
+
+def get_stage_summary(journal, stage_name, model, client):
+    sys_msg, prompt = get_summarizer_prompt(journal, stage_name)
+    response = get_response_from_llm(prompt, client, model, sys_msg)
+    summary_json = extract_json_between_markers(response[0])
+    return summary_json
+
+
+def get_node_log(node):
+    node_dict = node.to_dict()
+    # Only include keys that are relevant for logging/analysis
+    keys_to_include = [
+        "overall_plan",
+        "analysis",
+        "metric",
+        "code",
+        "plot_code",
+        "plot_plan",
+        "plot_analyses",
+        "plot_paths",
+        "vlm_feedback_summary",
+        "exp_results_dir",
+        "ablation_name",
+    ]
+    ret = {
+        key: node_dict[key]
+        for key in keys_to_include
+        if key in node_dict and node_dict[key] is not None
+    }
+    if "exp_results_dir" in ret:
+        original_dir_path = ret["exp_results_dir"]
+        # Remove leading path segments before "experiment_results"
+        idx = original_dir_path.find("experiment_results")
+        short_dir_path = original_dir_path
+        if idx != -1:
+            short_dir_path = original_dir_path[idx:]
+
+        ret["exp_results_dir"] = short_dir_path
+
+        if os.path.isdir(original_dir_path):
+            npy_files = [f for f in os.listdir(original_dir_path) if f.endswith(".npy")]
+            # Prepend the shortened path to each .npy filename
+            ret["exp_results_npy_files"] = [
+                os.path.join(short_dir_path, f) for f in npy_files
+            ]
+        else:
+            ret["exp_results_npy_files"] = []
+    return ret
+
+
+def update_summary(
+    prev_summary, cur_stage_name, cur_journal, cur_summary, model, client, max_retry=5
+):
+    good_leaf_nodes = [n for n in cur_journal.good_nodes if n.is_leaf]
+    node_infos = get_nodes_infos(good_leaf_nodes)
+    prompt = stage_aggregate_prompt.format(
+        prev_summary=prev_summary,
+        stage_name=cur_stage_name,
+        current_summary=cur_summary,
+    )
+    try:
+        response = get_response_from_llm(
+            prompt, client, model, "You are an expert machine learning researcher."
+        )
+        summary_json = extract_json_between_markers(response[0])
+        assert summary_json
+    except Exception as e:
+        if max_retry > 0:
+            print(f"Error occurred: {e}. Retrying... ({max_retry} attempts left)")
+            return update_summary(
+                prev_summary,
+                cur_stage_name,
+                cur_journal,
+                cur_summary,
+                model,
+                client,
+                max_retry - 1,
+            )
+        else:
+            print(f"Failed to update summary after multiple attempts. Error: {e}")
+            raise
+    return summary_json
+
+
+overall_plan_summarizer_prompt = """You have been provided with the plans for both the parent node and the current node. Your task is to synthesize a comprehensive summary of the overall plan by integrating details from both the parent and current node plans.
+The summary should be thorough and clearly articulate the underlying motivations.
+For example, if in your previous overall plan you were experimenting with a new idea, and now your current plan is to fix certain bugs in the previous implementation, your returned overall plan should focus on your previous overall plan, and briefly mention that the current plan includes bug fixes. If your current plan is more about implementing new ideas, then you should summarize that thoroughly along with the previous overall plan.
+The goal is to create a comprehensive summary of all historical plans, focusing on the main scientific planning and objectives.
+
+Previous overall plan:
+{prev_overall_plan}
+
+Current plan:
+{current_plan}
+
+Respond in the following format:
+
+THOUGHT:
+<THOUGHT>
+
+JSON:
+```json
+<JSON>
+```
+
+In <THOUGHT>, thoroughly reason as an expert researcher. First, reason over each node, and then carefully combine all information. It is okay to be very detailed.
+
+In <JSON>, provide the review in JSON format with the following field in exactly this order:
+- "overall_plan": a string that describes the overall plan based on the current and previous overall plans
+
+Ensure the JSON is valid and properly formatted, as it will be automatically parsed.
+"""
+
+
+def annotate_history(journal, cfg=None):
+    for node in journal.nodes:
+        if node.parent:
+            max_retries = 3
+            retry_count = 0
+            while retry_count < max_retries:
+                try:
+                    if cfg.agent.get("summary", None) is not None:
+                        model = cfg.agent.summary.model
+                    else:
+                        model = "gpt-4o-2024-08-06"
+                    client = get_ai_client(model)
+                    response = get_response_from_llm(
+                        overall_plan_summarizer_prompt.format(
+                            prev_overall_plan=node.parent.overall_plan,
+                            current_plan=node.plan,
+                        ),
+                        client,
+                        model,
+                        report_summarizer_sys_msg,
+                    )
+                    node.overall_plan = extract_json_between_markers(response[0])[
+                        "overall_plan"
+                    ]
+                    break
+                except Exception as e:
+                    retry_count += 1
+                    if retry_count == max_retries:
+                        print(f"Failed after {max_retries} attempts. Error: {e}")
+                        raise
+                    print(
+                        f"Error occurred: {e}. Retrying... ({max_retries - retry_count} attempts left)"
+                    )
+        else:
+            node.overall_plan = node.plan
+
+
+def overall_summarize(journals, cfg=None):
+    from concurrent.futures import ThreadPoolExecutor
+
+    def process_stage(idx, stage_tuple):
+        stage_name, journal = stage_tuple
+        annotate_history(journal, cfg=cfg)
+        if idx in [1, 2]:
+            best_node = journal.get_best_node(cfg=cfg)
+            # get multi-seed results and aggregater node
+            child_nodes = best_node.children
+            multi_seed_nodes = [
+                n for n in child_nodes if n.is_seed_node and not n.is_seed_agg_node
+            ]
+            agg_node = None
+            for n in child_nodes:
+                if n.is_seed_node and n.is_seed_agg_node:
+                    agg_node = n
+                    break
+            if agg_node is None:
+                # skip agg node
+                return {
+                    "best node": get_node_log(best_node),
+                    "best node with different seeds": [
+                        get_node_log(n) for n in multi_seed_nodes
+                    ],
+                }
+            else:
+                return {
+                    "best node": get_node_log(best_node),
+                    "best node with different seeds": [
+                        get_node_log(n) for n in multi_seed_nodes
+                    ],
+                    "aggregated results of nodes with different seeds": get_node_log(
+                        agg_node
+                    ),
+                }
+        elif idx == 3:
+            good_leaf_nodes = [
+                n for n in journal.good_nodes if n.is_leaf and n.ablation_name
+            ]
+            return [get_node_log(n) for n in good_leaf_nodes]
+        elif idx == 0:
+            if cfg.agent.get("summary", None) is not None:
+                model = cfg.agent.summary.get("model", "")
+            else:
+                model = "gpt-4o-2024-08-06"
+            client = get_ai_client(model)
+            summary_json = get_stage_summary(journal, stage_name, model, client)
+            return summary_json
+
+    from tqdm import tqdm
+
+    with ThreadPoolExecutor() as executor:
+        results = list(
+            tqdm(
+                executor.map(process_stage, range(len(list(journals))), journals),
+                desc="Processing stages",
+                total=len(list(journals)),
+            )
+        )
+        draft_summary, baseline_summary, research_summary, ablation_summary = results
+
+    return draft_summary, baseline_summary, research_summary, ablation_summary
+
+
+if __name__ == "__main__":
+    # Test
+    example_path = "logs/247-run"
+
+    def load_stage_folders(base_path):
+        """
+        Load the folders that start with 'stage_' followed by a number.
+
+        Args:
+            base_path (str): The base directory path where stage folders are located.
+
+        Returns:
+            list: A sorted list of stage folder paths.
+        """
+        stage_folders = []
+        for folder_name in os.listdir(base_path):
+            if folder_name.startswith("stage_"):
+                stage_folders.append(os.path.join(base_path, folder_name))
+        return sorted(stage_folders, key=lambda x: int(x.split("_")[1]))
+
+    def reconstruct_journal(journal_data):
+        # Create a mapping of node IDs to Node instances
+        id_to_node = {}
+        for node_data in journal_data["nodes"]:
+            # Remove unused or invalid keys if needed
+            if "actionable_insights_from_plots" in node_data:
+                del node_data["actionable_insights_from_plots"]
+            node = Node.from_dict(node_data)
+            id_to_node[node.id] = node
+
+        # Set up parent-child relationships using node2parent
+        for node_id, parent_id in journal_data["node2parent"].items():
+            child_node = id_to_node[node_id]
+            parent_node = id_to_node[parent_id]
+            child_node.parent = parent_node
+            parent_node.children.add(child_node)
+
+        # Create a Journal and add all nodes
+        journal = Journal()
+        journal.nodes.extend(id_to_node.values())
+
+        return journal
+
+    # Example usage
+    stage_folders = load_stage_folders(example_path)
+    journals = []
+    for index, folder in enumerate(stage_folders, start=1):
+        print(f"Stage {index}: {folder}")
+        stage_name = os.path.basename(folder)
+        journal_path = os.path.join(folder, "journal.json")
+        if os.path.exists(journal_path):
+            with open(journal_path, "r") as file:
+                journal_data = json.load(file)
+                print(f"Loaded journal.json for Stage {index}")
+        else:
+            print(f"No journal.json found for Stage {index}")
+        journal = reconstruct_journal(journal_data)
+        journals.append((stage_name, journal))
+
+    # Convert manager journals to list of (stage_name, journal) tuples
+    (
+        draft_summary,
+        baseline_summary,
+        research_summary,
+        ablation_summary,
+    ) = overall_summarize(journals)
+    log_dir = "logs/247-run"
+    draft_summary_path = log_dir + "/draft_summary.json"
+    baseline_summary_path = log_dir + "/baseline_summary.json"
+    research_summary_path = log_dir + "/research_summary.json"
+    ablation_summary_path = log_dir + "/ablation_summary.json"
+
+    with open(draft_summary_path, "w") as draft_file:
+        json.dump(draft_summary, draft_file, indent=2)
+
+    with open(baseline_summary_path, "w") as baseline_file:
+        json.dump(baseline_summary, baseline_file, indent=2)
+
+    with open(research_summary_path, "w") as research_file:
+        json.dump(research_summary, research_file, indent=2)
+
+    with open(ablation_summary_path, "w") as ablation_file:
+        json.dump(ablation_summary, ablation_file, indent=2)
+
+    print(f"Summary reports written to files:")
+    print(f"- Draft summary: {draft_summary_path}")
+    print(f"- Baseline summary: {baseline_summary_path}")
+    print(f"- Research summary: {research_summary_path}")
+    print(f"- Ablation summary: {ablation_summary_path}")
diff --git a/skills/experiment-bfts-runner/scripts/asv2/treesearch/parallel_agent.py b/skills/experiment-bfts-runner/scripts/asv2/treesearch/parallel_agent.py
new file mode 100644
index 00000000..5ca224a9
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/treesearch/parallel_agent.py
@@ -0,0 +1,2372 @@
+from concurrent.futures import ProcessPoolExecutor
+from typing import List, Optional, Set, Any, Callable, cast, Dict, Tuple
+import random
+import subprocess
+import os
+from queue import Queue
+import logging
+import humanize
+from .backend import FunctionSpec, compile_prompt_to_md, query
+from .interpreter import ExecutionResult
+from .journal import Journal, Node
+from .utils import data_preview
+from .utils.config import Config
+from .utils.metric import MetricValue, WorstMetricValue
+from .utils.response import extract_code, extract_text_up_to_code, wrap_code
+import copy
+import pickle
+from dataclasses import asdict
+from omegaconf import OmegaConf
+
+from rich import print
+from pathlib import Path
+import base64
+import sys
+
+logger = logging.getLogger("ai-scientist")
+
+AI_SCIENTIST_ROOT = "AI_SCIENTIST_ROOT"
+AI_SCIENTIST_SKILLS_ROOT = "AI_SCIENTIST_SKILLS_ROOT"
+
+ExecCallbackType = Callable[[str, bool], ExecutionResult]
+
+
+def _safe_pickle_test(obj, name="object"):
+    """Test if an object can be pickled"""
+    try:
+        pickle.dumps(obj)
+        return True
+    except Exception as e:
+        logger.error(f"Cannot pickle {name}: {str(e)}")
+        return False
+
+
+def _parse_keyword_prefix_response(
+    response: str, keyword_prefix1: str, keyword_prefix2: str
+) -> Tuple[Optional[str], Optional[str]]:
+    """Parse the response into name and description based on keyword prefix"""
+    try:
+        # Split response into lines and clean up
+        lines = [line.strip() for line in response.split("\n") if line.strip()]
+
+        # Find the idea and description
+        name = None
+        description = None
+
+        for line in lines:
+            if line.startswith(keyword_prefix1):
+                name = line.replace(keyword_prefix1, "").strip()
+            elif line.startswith(keyword_prefix2):
+                description = line.replace(keyword_prefix2, "").strip()
+                # Combine any following lines that don't start with a marker
+                desc_lines = []
+                for next_line in lines[lines.index(line) + 1 :]:
+                    if not next_line.startswith((keyword_prefix1, keyword_prefix2)):
+                        desc_lines.append(next_line)
+                    else:
+                        break
+                if desc_lines:
+                    description = " ".join([description] + desc_lines)
+
+        if name is None or description is None:
+            raise ValueError(
+                f"Missing required keywords in response: {keyword_prefix1} and/or {keyword_prefix2}"
+            )
+
+        return name, description
+
+    except Exception as e:
+        logger.error(f"Error parsing response: {str(e)}")
+        logger.debug(f"Raw response: {response}")
+        return None, None
+
+
+review_func_spec = FunctionSpec(
+    name="submit_review",
+    json_schema={
+        "type": "object",
+        "properties": {
+            "is_bug": {
+                "type": "boolean",
+                "description": "true if the output log shows that the execution failed or has some bug, otherwise false.",
+            },
+            "summary": {
+                "type": "string",
+                "description": "if there is a bug, summarize the bug and propose a fix. Otherwise, leave it empty.",
+            },
+        },
+        "required": [
+            "is_bug",
+            "summary",
+        ],
+    },
+    description="Submit a review evaluating the output of the training script.",
+)
+
+vlm_feedback_spec = FunctionSpec(
+    name="analyze_experiment_plots",
+    json_schema={
+        "type": "object",
+        "properties": {
+            "plot_analyses": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "analysis": {
+                            "type": "string",
+                            "description": "Detailed analysis of the plot's results and implications",
+                        },
+                    },
+                    "required": ["analysis"],
+                },
+            },
+            "valid_plots_received": {
+                "type": "boolean",
+                "description": "True if valid plots were received, False otherwise. For example, if the plots are empty or not meaningful, this should be False.",
+            },
+            "vlm_feedback_summary": {
+                "type": "string",
+                "description": "Summarize the feedback from the VLM. If the task involves generative modeling, make sure to focus on the generated samples.",
+            },
+        },
+        "required": ["plot_analyses", "valid_plots_received", "vlm_feedback_summary"],
+    },
+    description="Analyze experimental plots and provide detailed feedback on the results.",
+)
+
+metric_parse_spec = FunctionSpec(
+    name="parse_metrics",
+    json_schema={
+        "type": "object",
+        "strict": True,
+        "properties": {
+            "valid_metrics_received": {
+                "type": "boolean",
+                "description": "True if the metrics were successfully received, False otherwise. For example if the execution output does not contain any metrics, set this to False.",
+            },
+            "metric_names": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "metric_name": {
+                            "type": "string",
+                            "description": "Specify the metric name clearly. Avoid vague terms like 'train,' 'val,' or 'test.' Instead, use precise labels such as 'train accuracy,' 'validation loss,' or 'test F1 score,' etc.",
+                        },
+                        "lower_is_better": {
+                            "type": "boolean",
+                            "description": "Whether lower values are better for this metric",
+                        },
+                        "description": {
+                            "type": "string",
+                            "description": "Description of the metric",
+                        },
+                        "data": {
+                            "type": "array",
+                            "items": {
+                                "type": "object",
+                                "properties": {
+                                    "dataset_name": {
+                                        "type": "string",
+                                        "description": "The name of the dataset. Never include 'train', 'val', or 'test' in the dataset name.",
+                                    },
+                                    "final_value": {
+                                        "type": "number",
+                                        "description": "The final value of the metric for this dataset",
+                                    },
+                                    "best_value": {
+                                        "type": "number",
+                                        "description": "The best value of the metric for this dataset",
+                                    },
+                                },
+                                "required": [
+                                    "dataset_name",
+                                    "final_value",
+                                    "best_value",
+                                ],
+                            },
+                        },
+                    },
+                    "required": [
+                        "data",
+                        "metric_name",
+                        "lower_is_better",
+                        "description",
+                    ],
+                },
+                "additionalProperties": False,
+            },
+        },
+        "required": ["valid_metrics_received", "metric_names"],
+        "additionalProperties": False,
+    },
+    description="Parse metrics from execution output",
+)
+
+
+plot_selection_spec = FunctionSpec(
+    name="select_plots",
+    json_schema={
+        "type": "object",
+        "properties": {
+            "selected_plots": {
+                "type": "array",
+                "description": "List of selected plot file paths",
+                "items": {"type": "string", "description": "Full path to a plot file"},
+                "maxItems": 10,
+            }
+        },
+        "required": ["selected_plots"],
+    },
+    description="Select the 10 most relevant plots for analysis",
+)
+
+
+class AblationConfig:
+    """Track state of ablation experiments"""
+
+    def __init__(self, name: str, description: str, code: str, base_node: Node):
+        self.name = name
+        self.description = description
+        self.code = code
+        self.base_node = base_node
+        self.attempts = 0
+        self.max_attempts = 3  # Maximum number of retry attempts
+        self.last_error = None
+        self.completed = False
+        self.current_node = None
+
+
+class AblationIdea:
+    """Ablation idea"""
+
+    def __init__(self, name: str, description: str):
+        self.name = name
+        self.description = description
+
+
+class HyperparamTuningIdea:
+    """Hyperparameter tuning idea"""
+
+    def __init__(self, name: str, description: str):
+        self.name = name
+        self.description = description
+
+
+class MinimalAgent:
+    """A minimal agent class that only contains what's needed for processing nodes"""
+
+    def __init__(
+        self,
+        task_desc,
+        cfg,
+        memory_summary=None,
+        evaluation_metrics=None,
+        stage=None,
+        stage_name=None,
+    ):
+        self.task_desc = task_desc
+        self.memory_summary = memory_summary
+        self.cfg = cfg
+        self.evaluation_metrics = evaluation_metrics
+        self.stage_name = stage_name
+        self.data_preview = None
+
+    @property
+    def _prompt_environment(self):
+        pkgs = [
+            "numpy",
+            "pandas",
+            "scikit-learn",
+            "statsmodels",
+            "xgboost",
+            "lightGBM",
+            "torch",
+            "torchvision",
+            "torch-geometric",
+            "bayesian-optimization",
+            "timm",
+            "albumentations",
+        ]
+        random.shuffle(pkgs)
+        pkg_str = ", ".join([f"`{p}`" for p in pkgs])
+
+        env_prompt = {
+            "Installed Packages": f"Your solution can use any relevant machine learning packages such as: {pkg_str}. Feel free to use any other packages too (all packages are already installed!). For neural networks we suggest using PyTorch rather than TensorFlow."
+        }
+        return env_prompt
+
+    @property
+    def _prompt_impl_guideline(self):
+        impl_guideline = [
+            "CRITICAL GPU REQUIREMENTS - Your code MUST include ALL of these:",
+            "  - At the start of your code, add these lines to handle GPU/CPU:",
+            "    ```python",
+            "    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')",
+            "    print(f'Using device: {device}')",
+            "    ```",
+            "  - ALWAYS move models to device using the `.to(device)` method",
+            "  - ALWAYS move input tensors to device using the `.to(device)` method",
+            "  - ALWAYS move model related tensors to device using the `.to(device)` method",
+            "  - For optimizers, create them AFTER moving model to device",
+            "  - When using DataLoader, move batch tensors to device in training loop: `batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}`",
+            "CRITICAL MODEL INPUT GUIDELINES:",
+            "  - Always pay extra attention to the input to the model being properly normalized",
+            "  - This is extremely important because the input to the model's forward pass directly affects the output, and the loss function is computed based on the output",
+        ]
+        if hasattr(self.cfg.experiment, "num_syn_datasets"):
+            num_syn_datasets = self.cfg.experiment.num_syn_datasets
+            if num_syn_datasets > 1:
+                impl_guideline.extend(
+                    [
+                        f"You MUST evaluate your solution on at least {num_syn_datasets} different synthetic datasets to ensure robustness:",
+                        "  - Use standard benchmark datasets when available",
+                        f"  - If using synthetic data, generate at least {num_syn_datasets} variants with different characteristics",
+                        "  - Report metrics separately for each dataset",
+                        "  - Compute and report the average metric across all datasets",
+                    ]
+                )
+        impl_guideline.extend(
+            [
+                "For generative modeling tasks, you must:",
+                "  - Generate a set of samples from your model",
+                "  - Compare these samples with ground truth data using appropriate visualizations",
+                "  - When saving plots, always use the 'working_dir' variable that will be defined at the start of the script",
+                "  - Make sure to give each figure a unique and appropriate name based on the dataset it represents, rather than reusing the same filename.",
+                "Important code structure requirements:",
+                "  - Do NOT put any execution code inside 'if __name__ == \"__main__\":' block",
+                "  - All code should be at the global scope or in functions that are called from the global scope",
+                "  - The script should execute immediately when run, without requiring any special entry point",
+                "The code should start with:",
+                "  import os",
+                "  working_dir = os.path.join(os.getcwd(), 'working')",
+                "  os.makedirs(working_dir, exist_ok=True)",
+                "The code should be a single-file python program that is self-contained and can be executed as-is.",
+                "No parts of the code should be skipped, don't terminate the code execution before finishing the script.",
+                "Your response should only contain a single code block.",
+                f"Be aware of the running time of the code, it should complete within {humanize.naturaldelta(self.cfg.exec.timeout)}.",
+                'You can also use the "./working" directory to store any temporary files that your code needs to create.',
+                "Data saving requirements:",
+                "- Save all plottable data (metrics, losses, predictions, etc.) as numpy arrays using np.save()",
+                "- Use the following naming convention for saved files:",
+                "  ```python",
+                "  # At the start of your code",
+                "  experiment_data = {",
+                "      'dataset_name_1': {",
+                "          'metrics': {'train': [], 'val': []},",
+                "          'losses': {'train': [], 'val': []},",
+                "          'predictions': [],",
+                "          'ground_truth': [],",
+                "          # Add other relevant data",
+                "      },",
+                "      # Add additional datasets as needed:",
+                "      'dataset_name_2': {",
+                "          'metrics': {'train': [], 'val': []},",
+                "          'losses': {'train': [], 'val': []},",
+                "          'predictions': [],",
+                "          'ground_truth': [],",
+                "          # Add other relevant data",
+                "      },",
+                "  }",
+                "  # During training/evaluation:",
+                "  experiment_data['dataset_name_1']['metrics']['train'].append(train_metric)",
+                "  ```",
+                "- Include timestamps or epochs with the saved metrics",
+                "- For large datasets, consider saving in chunks or using np.savez_compressed()",
+                "CRITICAL EVALUATION REQUIREMENTS - Your code MUST include ALL of these:",
+                "  1. Track and print validation loss at each epoch or at suitable intervals:",
+                "     ```python",
+                "     print(f'Epoch {{epoch}}: validation_loss = {{val_loss:.4f}}')",
+                "     ```",
+                "  2. Track and update ALL these additional metrics: "
+                + str(self.evaluation_metrics),
+                "  3. Update metrics at EACH epoch:",
+                "  4. Save ALL metrics at the end:",
+                "     ```python",
+                "     np.save(os.path.join(working_dir, 'experiment_data.npy'), experiment_data)",
+                "     ```",
+            ]
+        )
+
+        if self.cfg.agent.k_fold_validation > 1:
+            impl_guideline.append(
+                f"The evaluation should be based on {self.cfg.agent.k_fold_validation}-fold cross-validation but only if that's an appropriate evaluation for the task at hand."
+            )
+
+        return {"Implementation guideline": impl_guideline}
+
+    @property
+    def _prompt_resp_fmt(self):
+        return {
+            "Response format": (
+                "Your response should be a brief outline/sketch of your proposed solution in natural language (7-10 sentences), "
+                "followed by a single markdown code block (using the format ```python ... ```) which implements this solution and prints out the evaluation metric(s) if applicable. "
+                "There should be no additional headings or text in your response. Just natural language text followed by a newline and then the markdown code block. "
+                "Make sure to write concise code."
+            )
+        }
+
+    def _prompt_metricparse_resp_fmt(self):
+        return {
+            "Response format": (
+                "Your response should be a brief outline/sketch of your proposed solution in natural language (3-5 sentences), "
+                "followed by a single markdown code block (using the format ```python ... ```) which implements the full code for the metric parsing. "
+                "There should be no additional headings or text in your response. Just natural language text followed by a newline and then the markdown code block. "
+                "Your generated code should be complete and executable. "
+            )
+        }
+
+    @property
+    def _prompt_debug_resp_fmt(self):
+        return {
+            "Response format": (
+                "Your response should be a brief outline/sketch of your proposed solution in natural language (3-5 sentences), "
+                "followed by a single markdown code block (using the format ```python ... ```) which implements the full code including the bugfix/solution. "
+                "There should be no additional headings or text in your response. Just natural language text followed by a newline and then the markdown code block. "
+                "Your generated code should be complete and executable. Do not omit any part of the code, even if it was part of a previous implementation."
+                "Make sure to write concise code."
+            )
+        }
+
+    @property
+    def _prompt_hyperparam_tuning_resp_fmt(self):
+        return {
+            "Response format": (
+                "Your response should be a brief outline/sketch of your proposed solution in natural language (3-5 sentences), "
+                "followed by a single markdown code block (using the format ```python ... ```) which implements the full code including hyperparameter tuning. "
+                "There should be no additional headings or text in your response. Do not omit any part of the code, "
+                "Your generated code should be complete and executable."
+                "Make sure to write concise code."
+            )
+        }
+
+    @property
+    def _prompt_ablation_resp_fmt(self):
+        return {
+            "Response format": (
+                "Your response should be a brief outline/sketch of your proposed solution in natural language (3-5 sentences), "
+                "followed by a single markdown code block (using the format ```python ... ```) which implements the full code including the ablation study. "
+                "There should be no additional headings or text in your response. Do not omit any part of the code, "
+                "Your generated code should be complete and executable."
+                "Make sure to write concise code."
+            )
+        }
+
+    def _draft(self) -> Node:
+        prompt: Any = {
+            "Introduction": (
+                "You are an AI researcher who is looking to publish a paper that will contribute significantly to the field."
+                "Your first task is to write a python code to implement a solid baseline based on your research idea provided below, "
+                "from data preparation to model training, as well as evaluation and visualization. "
+                "Focus on getting a simple but working implementation first, before any sophisticated improvements. "
+                "We will explore more advanced variations in later stages."
+            ),
+            "Research idea": self.task_desc,
+            "Memory": self.memory_summary if self.memory_summary else "",
+            "Instructions": {},
+        }
+        prompt["Instructions"] |= self._prompt_resp_fmt
+        prompt["Instructions"] |= {
+            "Experiment design sketch guideline": [
+                "This first experiment design should be relatively simple, without extensive hyper-parameter optimization.",
+                "Take the Memory section into consideration when proposing the design. ",
+                "The solution sketch should be 6-10 sentences. ",
+                "Don't suggest to do EDA.",
+                "Make sure to create synthetic data if needed.",
+                "",
+            ],
+            "Evaluation Metric(s)": self.evaluation_metrics,
+        }
+        prompt["Instructions"] |= self._prompt_impl_guideline
+        prompt["Instructions"] |= self._prompt_environment
+
+        if self.cfg.agent.data_preview:
+            prompt["Data Overview"] = self.data_preview
+
+        print("[cyan]--------------------------------[/cyan]")
+        print("[cyan]self.task_desc[/cyan]")
+        print("[cyan]" + self.task_desc + "[/cyan]")
+        print("[cyan]--------------------------------[/cyan]")
+
+        print("MinimalAgent: Getting plan and code")
+        plan, code = self.plan_and_code_query(prompt)
+        print("MinimalAgent: Draft complete")
+        return Node(plan=plan, code=code)
+
+    def _debug(self, parent_node: Node) -> Node:
+        prompt: Any = {
+            "Introduction": (
+                "You are an experienced AI researcher. Your previous code for research experiment had a bug, so based on the information below, you should revise it in order to fix this bug. "
+                "Your response should be an implementation outline in natural language,"
+                " followed by a single markdown code block which implements the bugfix/solution."
+            ),
+            "Research idea": self.task_desc,
+            "Previous (buggy) implementation": wrap_code(parent_node.code),
+            "Execution output": wrap_code(parent_node.term_out, lang=""),
+            "Feedback based on generated plots": parent_node.vlm_feedback_summary,
+            "Feedback about execution time": parent_node.exec_time_feedback,
+            "Instructions": {},
+        }
+        prompt["Instructions"] |= self._prompt_debug_resp_fmt
+        prompt["Instructions"] |= {
+            "Bugfix improvement sketch guideline": [
+                "You should write a brief natural language description (3-5 sentences) of how the issue in the previous implementation can be fixed.",
+                "Don't suggest to do EDA.",
+            ],
+        }
+        prompt["Instructions"] |= self._prompt_impl_guideline
+
+        if self.cfg.agent.data_preview:
+            prompt["Data Overview"] = self.data_preview
+
+        plan, code = self.plan_and_code_query(prompt)
+        return Node(plan=plan, code=code, parent=parent_node)
+
+    def _improve(self, parent_node: Node) -> Node:
+        prompt: Any = {
+            "Introduction": (
+                "You are an experienced AI researcher. You are provided with a previously developed "
+                "implementation. Your task is to improve it based on the current experimental stage."
+            ),
+            "Research idea": self.task_desc,
+            "Memory": self.memory_summary if self.memory_summary else "",
+            "Feedback based on generated plots": parent_node.vlm_feedback_summary,
+            "Feedback about execution time": parent_node.exec_time_feedback,
+            "Instructions": {},
+        }
+        prompt["Previous solution"] = {
+            "Code": wrap_code(parent_node.code),
+        }
+
+        prompt["Instructions"] |= self._prompt_resp_fmt
+        prompt["Instructions"] |= self._prompt_impl_guideline
+
+        plan, code = self.plan_and_code_query(prompt)
+        return Node(
+            plan=plan,
+            code=code,
+            parent=parent_node,
+        )
+
+    def _generate_seed_node(self, parent_node: Node):
+        return Node(
+            plan="Seed node",
+            code=parent_node.code,
+            parent=parent_node,
+            is_seed_node=True,
+        )
+
+    def _generate_hyperparam_tuning_node(
+        self, parent_node: Node, hyperparam_idea: HyperparamTuningIdea
+    ):
+        prompt: Any = {
+            "Introduction": (
+                "You are an experienced AI researcher. You are provided with a previously developed "
+                "baseline implementation. Your task is to implement hyperparameter tuning for the following idea: "
+                + hyperparam_idea.name
+                + ". "
+                + hyperparam_idea.description
+            ),
+            "Base code you are working on": wrap_code(parent_node.code),
+            "Instructions": {},
+        }
+        prompt["Instructions"] |= {
+            "Implementation guideline": [
+                "The code should be a single-file python program that is self-contained and can be executed as-is.",
+                "No parts of the code should be skipped, don't terminate the code execution before finishing the script.",
+                "Data saving requirements:",
+                "- Save all plottable data (metrics, losses, predictions, etc.) as numpy arrays using np.save()",
+                "- Use the following naming convention for saved files:",
+                "  ```python",
+                "  # At the start of your code",
+                "  experiment_data = {",
+                "      'hyperparam_tuning_type_1': {",
+                "          'dataset_name_1': {",
+                "              'metrics': {'train': [], 'val': []},",
+                "              'losses': {'train': [], 'val': []},",
+                "              'predictions': [],",
+                "              'ground_truth': [],",
+                "              # Add other relevant data",
+                "          },",
+                "          # Add additional datasets as needed:",
+                "      },",
+                "      # Add additional hyperparam tuning types as needed",
+                "  }",
+                "Make sure to use a filename 'experiment_data.npy' to save the data. Do not use any other filename.",
+            ]
+        }
+        prompt["Instructions"] |= self._prompt_hyperparam_tuning_resp_fmt
+        plan, code = self.plan_and_code_query(prompt)
+        return Node(
+            plan="Hyperparam tuning name: " + hyperparam_idea.name + ".\n" + plan,
+            code=code,
+            parent=parent_node,
+            hyperparam_name=hyperparam_idea.name,
+        )
+
+    def _generate_ablation_node(self, parent_node: Node, ablation_idea: AblationIdea):
+        prompt: Any = {
+            "Introduction": (
+                "You are an experienced AI researcher. You are provided with a previously developed "
+                "baseline implementation. Your task is to implement the ablation study for the following idea: "
+                + ablation_idea.name
+                + ". "
+                + ablation_idea.description
+            ),
+            "Base code you are working on": wrap_code(parent_node.code),
+            "Instructions": {},
+        }
+        prompt["Instructions"] |= {
+            "Implementation guideline": [
+                "The code should be a single-file python program that is self-contained and can be executed as-is.",
+                "No parts of the code should be skipped, don't terminate the code execution before finishing the script.",
+                "Data saving requirements:",
+                "- Save all plottable data (metrics, losses, predictions, etc.) as numpy arrays using np.save()",
+                "- Use the following naming convention for saved files:",
+                "  ```python",
+                "  # At the start of your code",
+                "  experiment_data = {",
+                "      'ablation_type_1': {",
+                "          'dataset_name_1': {",
+                "              'metrics': {'train': [], 'val': []},",
+                "              'losses': {'train': [], 'val': []},",
+                "              'predictions': [],",
+                "              'ground_truth': [],",
+                "              # Add other relevant data",
+                "          },",
+                "          # Add additional datasets as needed:",
+                "          'dataset_name_2': {",
+                "              'metrics': {'train': [], 'val': []},",
+                "              'losses': {'train': [], 'val': []},",
+                "              'predictions': [],",
+                "              'ground_truth': [],",
+                "              # Add other relevant data",
+                "          },",
+                "      },",
+                "      # Add additional ablation types as needed",
+                "  }",
+                "Make sure to use a filename 'experiment_data.npy' to save the data. Do not use any other filename.",
+            ]
+        }
+        prompt["Instructions"] |= self._prompt_ablation_resp_fmt
+        plan, code = self.plan_and_code_query(prompt)
+        return Node(
+            plan="Ablation name: " + ablation_idea.name + ".\n" + plan,
+            code=code,
+            parent=parent_node,
+            ablation_name=ablation_idea.name,
+        )
+
+    def plan_and_code_query(self, prompt, retries=3) -> tuple[str, str]:
+        """Generate a natural language plan + code in the same LLM call and split them apart."""
+        completion_text = None
+        for _ in range(retries):
+            completion_text = query(
+                system_message=prompt,
+                user_message=None,
+                model=self.cfg.agent.code.model,
+                temperature=self.cfg.agent.code.temp,
+            )
+
+            code = extract_code(completion_text)
+            nl_text = extract_text_up_to_code(completion_text)
+
+            if code and nl_text:
+                # merge all code blocks into a single string
+                return nl_text, code
+
+            print("Plan + code extraction failed, retrying...")
+            prompt["Parsing Feedback"] = (
+                "The code extraction failed. Make sure to use the format ```python ... ``` for the code blocks."
+            )
+        print("Final plan + code extraction attempt failed, giving up...")
+        return "", completion_text  # type: ignore
+
+    def parse_exec_result(
+        self, node: Node, exec_result: ExecutionResult, workspace: str
+    ):
+        logger.info(f"Agent is parsing execution results for node {node.id}")
+
+        node.absorb_exec_result(exec_result)
+
+        prompt = {
+            "Introduction": (
+                "You are an experienced AI researcher. "
+                "You have written code for your research experiment and now need to evaluate the output of the code execution. "
+                "Analyze the execution output, determine if there were any bugs, and provide a summary of the findings. "
+            ),
+            "Research idea": self.task_desc,
+            "Implementation": wrap_code(node.code),
+            "Execution output": wrap_code(node.term_out, lang=""),
+        }
+
+        response = cast(
+            dict,
+            query(
+                system_message=prompt,
+                user_message=None,
+                func_spec=review_func_spec,
+                model=self.cfg.agent.feedback.model,
+                temperature=self.cfg.agent.feedback.temp,
+            ),
+        )
+
+        node.analysis = response["summary"]
+        node.is_buggy = response["is_bug"] or node.exc_type is not None
+        print(
+            "[red]Checking if response contains metric name and description[/red]",
+            flush=True,
+        )
+        print(response)
+
+    def _generate_plotting_code(
+        self, node: Node, working_dir: str, plot_code_from_prev_stage: str = None
+    ) -> str:
+        """Generate code for plotting experiment results"""
+        prompt_guideline = [
+            "AVAILABLE DATA: ",
+            "Experiment Data: experiment_data.npy",
+        ]
+        prompt_guideline += [
+            "REQUIREMENTS: ",
+            "The code should start with:",
+            "  import matplotlib.pyplot as plt",
+            "  import numpy as np",
+            "  import os",
+            "  working_dir = os.path.join(os.getcwd(), 'working')",
+            "Create standard visualizations of experiment results",
+            "Save all plots to working_dir",
+            "Include training/validation curves if available",
+            "ONLY plot data that exists in experiment_data.npy - DO NOT make up or simulate any values",
+            "Use basic matplotlib without custom styles",
+            "Each plot should be in a separate try-except block",
+            "Always close figures after saving",
+            "Always include a title for each plot, and be sure to use clear subtitles—such as 'Left: Ground Truth, Right: Generated Samples'—while also specifying the type of dataset being used.",
+            "Make sure to use descriptive names for figures when saving e.g. always include the dataset name and the type of plot in the name",
+            "When there are many similar figures to plot (e.g. generated samples at each epoch), make sure to plot only at a suitable interval of epochs so that you only plot at most 5 figures.",
+            "Use the following experiment code to infer the data to plot: " + node.code,
+            "Example to extract data from experiment_data: experiment_data['dataset_name_1']['metrics']['train']",
+        ]
+        prompt_guideline += [
+            "Example data loading and plot saving code: ",
+            """
+                try:
+                    experiment_data = np.load(os.path.join(working_dir, 'experiment_data.npy'), allow_pickle=True).item()
+                except Exception as e:
+                    print(f'Error loading experiment data: {{e}}')
+
+                try:
+                    # First plot
+                    plt.figure()
+                    # ... plotting code ...
+                    plt.savefig('working_dir/[plot_name_1].png')
+                    plt.close()
+                except Exception as e:
+                    print(f"Error creating plot1: {{e}}")
+                    plt.close()  # Always close figure even if error occurs
+
+                try:
+                    # Second plot
+                    plt.figure()
+                    # ... plotting code ...
+                    plt.savefig('working_dir/[plot_name_2].png')
+                    plt.close()
+                except Exception as e:
+                    print(f"Error creating plot2: {{e}}")
+                    plt.close()
+            """,
+        ]
+        # add instruction for format
+        plotting_prompt = {
+            "Instructions": {},
+        }
+        plotting_prompt["Instructions"] |= self._prompt_resp_fmt
+        plotting_prompt["Instructions"] |= {
+            "Plotting code guideline": prompt_guideline,
+        }
+
+        # For stage 3, initialize with stage 2's plotting code
+        if (
+            self.stage_name
+            and self.stage_name.startswith("3_")
+            and plot_code_from_prev_stage
+        ):
+            prompt_guideline.extend(
+                [
+                    "IMPORTANT: Use the following base plotting code as a starting point:",
+                    "Base plotting code: " + plot_code_from_prev_stage,
+                    "Modify the base plotting code to:",
+                    "1. Keep the same numpy data structure and plotting style",
+                    "2. Add comparison plots between different datasets",
+                    "3. Add dataset-specific visualizations if needed",
+                    "4. Include clear labels indicating which plots are from which dataset",
+                    "5. Use consistent naming conventions for saved files",
+                ]
+            )
+        # For stage 4, initialize with stage 3's plotting code
+        elif (
+            self.stage_name
+            and self.stage_name.startswith("4_")
+            and plot_code_from_prev_stage
+        ):
+            prompt_guideline.extend(
+                [
+                    "IMPORTANT: This is an ablation study. Use the following base plotting code as a starting point:",
+                    "Base plotting code: \n" + plot_code_from_prev_stage,
+                    "Modify the base plotting code to:",
+                    "1. Keep the same numpy data structure and plotting style",
+                    "2. Add comparison plots between ablation and baseline results",
+                    "3. Add ablation-specific visualizations if needed",
+                    "4. Include clear labels indicating which plots are from ablation vs baseline",
+                    "5. Use consistent naming conventions for saved files",
+                ]
+            )
+
+        # Get plotting code from LLM
+        plan, code = self.plan_and_code_query(plotting_prompt)
+
+        # Ensure the code starts with imports
+        if not code.strip().startswith("import"):
+            code = "import matplotlib.pyplot as plt\nimport numpy as np\n\n" + code
+
+        node.plot_code = code
+        node.plot_plan = plan
+
+        return code
+
+    def _determine_datasets_successfully_tested(self, node: Node) -> List[str]:
+        """Determine which datasets are successfully tested based on VLM feedback"""
+        plot_analyses = ""
+        for i, plot_analysis in enumerate(node.plot_analyses):
+            plot_analyses += f"plot {i+1}: {plot_analysis['analysis']}\n"
+
+        determine_prompt = {
+            "Introduction": "You are an AI researcher analyzing experiment results. Based on the plot analyses and feedback, determine which datasets are successfully tested. Return reasoning and the dataset names that are successfully executed, or an empty string if no datasets are successfully executed.",
+            "Plot analyses": plot_analyses,
+            "VLM feedback summary": node.vlm_feedback_summary,
+            "Original plotting code": node.plot_code,
+            "Response format": (
+                "Your response should start with 'REASONING: <reasoning>' to think about the plot analysis and feedback in the first line."
+                "In the second line, you should have a list of dataset names that are successfully executed, starting with 'SUCCESSFULLY_TESTED_DATASETS: <list_datasets_successfully_tested>', "
+            ),
+        }
+
+        retry_count = 0
+        retry_limit = 5
+        while retry_count < retry_limit:
+            response = query(
+                system_message=determine_prompt,
+                user_message=None,
+                model=self.cfg.agent.feedback.model,
+                temperature=self.cfg.agent.feedback.temp,
+            )
+
+            (
+                reasoning,
+                datasets_successfully_tested_str,
+            ) = _parse_keyword_prefix_response(
+                response, "REASONING:", "SUCCESSFULLY_TESTED_DATASETS:"
+            )
+            print(f"[green]Reasoning:[/green] {reasoning}")
+            print(
+                f"[green]Datasets successfully tested:[/green] {datasets_successfully_tested_str}"
+            )
+            if reasoning is not None and datasets_successfully_tested_str is not None:
+                if datasets_successfully_tested_str == "":
+                    return [""]
+                # Split by comma and clean each dataset name
+                datasets = [
+                    ds.strip() for ds in datasets_successfully_tested_str.split(",")
+                ]
+                # Filter out empty strings and ensure all elements are strings
+                datasets = [ds for ds in datasets if isinstance(ds, str) and ds]
+                logger.info(f"Successfully parsed datasets: {datasets}")
+                return datasets
+
+            retry_count += 1
+            logger.warning(
+                f"Failed to parse successfully tested datasets response (attempt {retry_count}/{retry_limit})"
+            )
+
+        logger.error(
+            f"Failed to parse successfully tested datasets response after {retry_limit} retries. Falling back to an empty list."
+        )
+        return [""]
+
+    def _analyze_plots_with_vlm(self, node: Node) -> None:
+        """Analyze experimental plots using VLM"""
+        if not node.plot_paths:
+            return
+
+        # for debugging
+        print(f"[cyan]Plot paths:[/cyan] {node.plot_paths}")
+
+        def encode_image_to_base64(image_path):
+            with open(image_path, "rb") as image_file:
+                try:
+                    return base64.b64encode(image_file.read()).decode("utf-8")
+                except Exception as e:
+                    print(f"[red]Error encoding image {image_path}: {e}[/red]")
+                    return None
+
+        if not len(node.plot_paths) > 10:
+            selected_plots = node.plot_paths
+        else:
+            print(
+                f"[red]Warning: {len(node.plot_paths)} plots received, this may be too many to analyze effectively. Calling LLM to select the most relevant plots to analyze.[/red]"
+            )
+            # select 10 plots to analyze
+            prompt_select_plots = {
+                "Introduction": (
+                    "You are an experienced AI researcher analyzing experimental results. "
+                    "You have been provided with plots from a machine learning experiment. "
+                    "Please select 10 most relevant plots to analyze. "
+                    "For similar plots (e.g. generated samples at each epoch), select only at most 5 plots at a suitable interval of epochs."
+                    "Format your response as a list of plot paths, where each plot path includes the full path to the plot file."
+                ),
+                "Plot paths": node.plot_paths,
+            }
+
+            try:
+                response_select_plots = cast(
+                    dict,
+                    query(
+                        system_message=prompt_select_plots,
+                        user_message=None,
+                        func_spec=plot_selection_spec,
+                        model=self.cfg.agent.feedback.model,
+                        temperature=self.cfg.agent.feedback.temp,
+                    ),
+                )
+
+                print(f"[cyan]Plot selection response:[/cyan] {response_select_plots}")
+                # Extract the plot paths list
+                selected_plots = response_select_plots.get("selected_plots", [])
+
+                # Validate that all paths exist and are image files
+                valid_plots = []
+                for plot_path in selected_plots:
+                    if (
+                        isinstance(plot_path, str)
+                        and os.path.exists(plot_path)
+                        and plot_path.lower().endswith((".png", ".jpg", ".jpeg"))
+                    ):
+                        valid_plots.append(plot_path)
+                    else:
+                        logger.warning(f"Invalid plot path received: {plot_path}")
+
+                # Use the validated list
+                if valid_plots:
+                    print(f"[cyan]Selected valid plots:[/cyan] {valid_plots}")
+                    selected_plots = valid_plots
+                else:
+                    logger.warning(
+                        "No valid plot paths found in response, falling back to first 10 plots"
+                    )
+                    # fallback to first 10 plots
+                    # validate node.plot_paths
+                    selected_plots = []
+                    for plot_path in node.plot_paths[:10]:
+                        if os.path.exists(plot_path) and plot_path.lower().endswith(
+                            (".png", ".jpg", ".jpeg")
+                        ):
+                            selected_plots.append(plot_path)
+                        else:
+                            logger.warning(f"Invalid plot path received: {plot_path}")
+
+            except Exception as e:
+                logger.error(
+                    f"Error in plot selection: {str(e)}; falling back to first 10 plots"
+                )
+                # Fallback to using first 10 plots
+                selected_plots = node.plot_paths[:10]
+
+        print("[cyan]Before encoding images[/cyan]")
+        user_message = [
+            {
+                "type": "text",
+                "text": (
+                    "You are an experienced AI researcher analyzing experimental results. "
+                    "You have been provided with plots from a machine learning experiment. "
+                    f"This experiment is based on the following research idea: {self.task_desc}"
+                    "Please analyze these plots and provide detailed insights about the results. "
+                    "If you don't receive any plots, say 'No plots received'. "
+                    "Never make up plot analysis. "
+                    "Please return the analyzes with strict order of uploaded images, but DO NOT include any word "
+                    "like 'the first plot'."
+                ),
+            }
+        ] + [
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": f"data:image/jpeg;base64,{encode_image_to_base64(plot_path)}"
+                },
+            }
+            for plot_path in selected_plots
+        ]
+
+        response = cast(
+            dict,
+            query(
+                system_message=None,
+                user_message=user_message,
+                func_spec=vlm_feedback_spec,
+                model=self.cfg.agent.vlm_feedback.model,
+                temperature=self.cfg.agent.vlm_feedback.temp,
+            ),
+        )
+        print(
+            f"[cyan]VLM response from {self.cfg.agent.vlm_feedback.model}:[/cyan] {response}"
+        )
+        if response["valid_plots_received"]:
+            node.is_buggy_plots = False
+        else:
+            node.is_buggy_plots = True
+
+        for index, analysis in enumerate(response["plot_analyses"]):
+            analysis["plot_path"] = node.plot_paths[index]
+
+        node.plot_analyses = response["plot_analyses"]
+        node.vlm_feedback_summary = response["vlm_feedback_summary"]
+
+        node.datasets_successfully_tested = (
+            self._determine_datasets_successfully_tested(node)
+        )
+
+    def _generate_node_summary(self, node: Node) -> dict:
+        """Generate a summary of the node's experimental findings"""
+        summary_prompt = {
+            "Introduction": (
+                "You are an AI researcher analyzing experimental results. "
+                "Please summarize the findings from this experiment iteration."
+            ),
+            "Research idea": self.task_desc,
+            "Implementation": wrap_code(node.code),
+            "Plan": node.plan,
+            "Execution output": wrap_code(node.term_out, lang=""),
+            "Analysis": node.analysis,
+            "Metric": str(node.metric) if node.metric else "Failed",
+            "Plot Analyses": (
+                node.plot_analyses if hasattr(node, "plot_analyses") else []
+            ),
+            "VLM Feedback": (
+                node.vlm_feedback_summary
+                if hasattr(node, "vlm_feedback_summary")
+                else ""
+            ),
+        }
+
+        return cast(
+            dict,
+            query(
+                system_message=summary_prompt,
+                user_message=None,
+                func_spec={
+                    "name": "summarize_experiment",
+                    "description": "Summarize experimental findings",
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "findings": {
+                                "type": "string",
+                                "description": "Key findings and results",
+                            },
+                            "significance": {
+                                "type": "string",
+                                "description": "Why these results matter",
+                            },
+                            "next_steps": {
+                                "type": "string",
+                                "description": "Suggested improvements or next experiments",
+                            },
+                        },
+                        "required": ["findings", "significance"],
+                    },
+                },
+                model=self.cfg.agent.feedback.model,
+                temperature=self.cfg.agent.feedback.temp,
+            ),
+        )
+
+
+class GPUManager:
+    """Manages GPU allocation across processes"""
+
+    def __init__(self, num_gpus: int):
+        self.num_gpus = num_gpus
+        self.available_gpus: Set[int] = set(range(num_gpus))
+        self.gpu_assignments: Dict[str, int] = {}  # process_id -> gpu_id
+
+    def acquire_gpu(self, process_id: str) -> int:
+        """Assigns a GPU to a process"""
+        if not self.available_gpus:
+            raise RuntimeError("No GPUs available")
+        print(f"Available GPUs: {self.available_gpus}")
+        print(f"Process ID: {process_id}")
+        gpu_id = min(self.available_gpus)
+        print(f"Acquiring GPU {gpu_id} for process {process_id}")
+        self.available_gpus.remove(gpu_id)
+        self.gpu_assignments[process_id] = gpu_id
+        print(f"GPU assignments: {self.gpu_assignments}")
+        return gpu_id
+
+    def release_gpu(self, process_id: str):
+        """Releases GPU assigned to a process"""
+        if process_id in self.gpu_assignments:
+            gpu_id = self.gpu_assignments[process_id]
+            self.available_gpus.add(gpu_id)
+            del self.gpu_assignments[process_id]
+
+
+def get_gpu_count() -> int:
+    """Get number of available NVIDIA GPUs without using torch"""
+    try:
+        # First try using nvidia-smi
+        nvidia_smi = subprocess.run(
+            ["nvidia-smi", "--query-gpu=gpu_name", "--format=csv,noheader"],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+        gpus = nvidia_smi.stdout.strip().split("\n")
+        return len(gpus)
+    except (subprocess.SubprocessError, FileNotFoundError):
+        # If nvidia-smi fails, try environment variable
+        cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
+        if cuda_visible_devices:
+            # Filter out empty strings and -1 values
+            devices = [d for d in cuda_visible_devices.split(",") if d and d != "-1"]
+            return len(devices)
+        return 0
+
+
+class ParallelAgent:
+    def __init__(
+        self,
+        task_desc: str,
+        cfg: Config,
+        journal: Journal,
+        stage_name=None,
+        best_stage3_node=None,
+        best_stage2_node=None,
+        best_stage1_node=None,
+    ):
+        super().__init__()
+        self.task_desc = task_desc
+        self.cfg = cfg
+        self.journal = journal
+        self.stage_name = stage_name
+        self.best_stage3_node = (
+            best_stage3_node  # to initialize ablation stuides (stage 4)
+        )
+        self.best_stage1_node = (
+            best_stage1_node  # to initialize hyperparam tuning (stage 2)
+        )
+        self.best_stage2_node = (
+            best_stage2_node  # to initialize plotting code (stage 3)
+        )
+        self.data_preview = None
+        self.num_workers = cfg.agent.num_workers
+        self.num_gpus = get_gpu_count()
+        print(f"num_gpus: {self.num_gpus}")
+        if self.num_gpus == 0:
+            print("No GPUs detected, falling back to CPU-only mode")
+        else:
+            print(f"Detected {self.num_gpus} GPUs")
+
+        self.gpu_manager = GPUManager(self.num_gpus) if self.num_gpus > 0 else None
+
+        if self.num_gpus > 0:
+            self.num_workers = min(self.num_workers, self.num_gpus)
+            logger.info(f"Limiting workers to {self.num_workers} to match GPU count")
+
+        self.timeout = self.cfg.exec.timeout
+        self.executor = ProcessPoolExecutor(max_workers=self.num_workers)
+        self._is_shutdown = False
+        # Define the metric once at initialization
+        self.evaluation_metrics = self._define_global_metrics()
+        self._ablation_state = {  # store ablation names
+            "completed_ablations": set(),
+        }
+        self._hyperparam_tuning_state = {  # store hyperparam tuning ideas
+            "tried_hyperparams": set(),
+        }
+
+    def _define_global_metrics(self) -> str:
+        """Define eval metric to be used across all experiments"""
+        prompt = {
+            "Introduction": (
+                "You are an AI researcher setting up experiments. "
+                "Please propose meaningful evaluation metrics that will help analyze "
+                "the performance and characteristics of solutions for this research task."
+            ),
+            "Research idea": self.task_desc,
+            "Instructions": [
+                "Propose a single evaluation metric that would be useful for analyzing the performance of solutions for this research task.",
+                "Note: Validation loss will be tracked separately so you don't need to include it in your response.",
+                "Format your response as a list containing:",
+                "- name: The name of the metric",
+                "- maximize: Whether higher values are better (true/false)",
+                "- description: A brief explanation of what the metric measures"
+                "Your list should contain only one metric.",
+            ],
+        }
+
+        response = query(
+            system_message=prompt,
+            user_message=None,
+            model=self.cfg.agent.code.model,
+            temperature=self.cfg.agent.code.temp,
+        )
+
+        print(f"[green]Defined eval metrics:[/green] {response}")
+        return response
+
+    def plan_and_code_query(self, prompt, retries=3) -> tuple[str, str]:
+        """Generate a natural language plan + code in the same LLM call and split them apart."""
+        completion_text = None
+        for _ in range(retries):
+            completion_text = query(
+                system_message=prompt,
+                user_message=None,
+                model=self.cfg.agent.code.model,
+                temperature=self.cfg.agent.code.temp,
+            )
+
+            code = extract_code(completion_text)
+            nl_text = extract_text_up_to_code(completion_text)
+
+            if code and nl_text:
+                # merge all code blocks into a single string
+                return nl_text, code
+            print("Plan + code extraction failed, retrying...")
+            prompt["Parsing Feedback"] = (
+                "The code extraction failed. Make sure to use the format ```python ... ``` for the code blocks."
+            )
+        print("Final plan + code extraction attempt failed, giving up...")
+        return "", completion_text
+
+    def _generate_seed_eval_aggregation_node(
+        self, node: Node, agg_plotting_code: str
+    ) -> Node:
+        """Generate a special aggregation node for seed evaluation results"""
+        return Node(
+            plan="Aggregate results from multiple seeds",
+            code="# plotting aggregation code",
+            plot_code=agg_plotting_code,
+            parent=node,
+            is_seed_node=True,
+            is_seed_agg_node=True,
+        )
+
+    def _run_multi_seed_evaluation(self, node: Node) -> List[Node]:
+        """Run multiple seeds of the same node to get statistical metrics.
+        Returns a list of nodes with different random seeds."""
+
+        # Convert node to dict for parallel processing
+        node_data = node.to_dict()
+        node_code = node.code
+
+        # Submit parallel jobs for different seeds
+        seed_nodes = []
+        futures = []
+        for seed in range(self.cfg.agent.multi_seed_eval.num_seeds):
+            gpu_id = None
+            if self.gpu_manager is not None:
+                try:
+                    process_id = f"seed_{seed}_worker"
+                    gpu_id = self.gpu_manager.acquire_gpu(process_id)
+                    logger.info(f"Assigned GPU {gpu_id} to seed {seed}")
+                except RuntimeError as e:
+                    logger.warning(
+                        f"Could not acquire GPU for seed {seed}: {e}. Running on CPU"
+                    )
+
+            # Add seed to node code
+            node_data["code"] = (
+                f"# Set random seed\nimport random\nimport numpy as np\nimport torch\n\nseed = {seed}\nrandom.seed(seed)\nnp.random.seed(seed)\ntorch.manual_seed(seed)\nif torch.cuda.is_available():\n    torch.cuda.manual_seed(seed)\n\n"
+                + node_code
+            )
+
+            new_ablation_idea = None
+            new_hyperparam_idea = None
+            best_stage1_plot_code = None
+            best_stage2_plot_code = None
+            best_stage3_plot_code = None
+            seed_eval = True
+            memory_summary = ""
+            print("[yellow]Starting multi-seed eval...[/yellow]")
+            futures.append(
+                self.executor.submit(
+                    self._process_node_wrapper,
+                    node_data,
+                    self.task_desc,
+                    self.cfg,
+                    gpu_id,
+                    memory_summary,
+                    self.evaluation_metrics,
+                    self.stage_name,
+                    new_ablation_idea,
+                    new_hyperparam_idea,
+                    best_stage1_plot_code,
+                    best_stage2_plot_code,
+                    best_stage3_plot_code,
+                    seed_eval,
+                )
+            )
+
+        for future in futures:
+            try:
+                result_data = future.result(timeout=self.timeout)
+                result_node = Node.from_dict(result_data, self.journal)
+                print(f"Parent node id: {result_node.parent.id}")
+                print(f"Sanity check: actual parent node id: {node.id}")
+                # Add node to journal's list and assign its step number
+                self.journal.append(result_node)
+                seed_nodes.append(self.journal.get_node_by_id(result_node.id))
+                print("Added result node to journal")
+            except Exception as e:
+                logger.error(f"Error in multi-seed evaluation: {str(e)}")
+
+        return seed_nodes
+
+    def _run_plot_aggregation(self, node: Node, seed_nodes: List[Node]) -> Node:
+        """Generate an aggregation node for seed evaluation results"""
+        if seed_nodes:
+            try:
+                from .interpreter import Interpreter
+
+                # Create aggregation plotting code
+                agg_plotting_code = self._aggregate_seed_eval_results(seed_nodes, node)
+
+                # Create a special aggregation node
+                agg_node = self._generate_seed_eval_aggregation_node(
+                    node, agg_plotting_code
+                )
+                agg_node.parent = node
+
+                # Execute aggregation plotting code
+                print("[blue]Creating Interpreter for seed node aggregation[/blue]")
+                process_interpreter = Interpreter(
+                    working_dir=self.cfg.workspace_dir,
+                    timeout=self.cfg.exec.timeout,
+                    format_tb_ipython=self.cfg.exec.format_tb_ipython,
+                    agent_file_name=self.cfg.exec.agent_file_name,
+                    env_vars={AI_SCIENTIST_ROOT: os.getenv(AI_SCIENTIST_ROOT), AI_SCIENTIST_SKILLS_ROOT: os.getenv(AI_SCIENTIST_SKILLS_ROOT)},
+                )
+
+                try:
+                    working_dir = process_interpreter.working_dir
+                    plot_exec_result = process_interpreter.run(agg_plotting_code, True)
+                    print(plot_exec_result)
+                    process_interpreter.cleanup_session()
+                    # Save aggregated plots
+                    plots_dir = Path(working_dir) / "working"
+                    print("[red]plots_dir[/red]", plots_dir)
+                    if plots_dir.exists():
+                        base_dir = Path(self.cfg.workspace_dir).parent  # .parent
+                        run_name = Path(self.cfg.workspace_dir).name
+                        exp_results_dir = (
+                            base_dir
+                            / "logs"
+                            / run_name
+                            / "experiment_results"
+                            / f"seed_aggregation_{agg_node.id}"
+                        )
+                        print("[red]exp_results_dir[/red]", exp_results_dir)
+                        exp_results_dir.mkdir(parents=True, exist_ok=True)
+
+                        # Save plotting code
+                        with open(
+                            exp_results_dir / "aggregation_plotting_code.py", "w"
+                        ) as f:
+                            f.write(agg_plotting_code)
+
+                        # Move generated plots
+                        for plot_file in plots_dir.glob("*.png"):
+                            final_path = exp_results_dir / plot_file.name
+                            print("mv_from:plot_file.resolve(): ", plot_file.resolve())
+                            print("mv_to:final_path: ", final_path)
+                            plot_file.resolve().rename(final_path)
+                            web_path = f"../../logs/{Path(self.cfg.workspace_dir).name}/experiment_results/seed_aggregation_{agg_node.id}/{plot_file.name}"
+                            agg_node.plots.append(web_path)
+                            agg_node.plot_paths.append(str(final_path.absolute()))
+
+                    agg_node.is_buggy = False
+                    agg_node.exp_results_dir = exp_results_dir
+                    agg_node_dict = agg_node.to_dict()
+                    agg_node_new = Node.from_dict(
+                        agg_node_dict, self.journal
+                    )  # to update the parent-child relationship in the journal
+                    # Add aggregation node to journal
+                    self.journal.append(agg_node_new)
+                finally:
+                    if process_interpreter:
+                        process_interpreter.cleanup_session()
+
+            except Exception as e:
+                print(f"Error in seed result aggregation: {str(e)}")
+
+    @staticmethod
+    def _process_node_wrapper(
+        node_data,
+        task_desc,
+        cfg,
+        gpu_id: int = None,
+        memory_summary: str = None,
+        evaluation_metrics=None,
+        stage_name=None,
+        new_ablation_idea=None,
+        new_hyperparam_idea=None,
+        best_stage3_plot_code=None,
+        best_stage2_plot_code=None,
+        best_stage1_plot_code=None,
+        seed_eval=False,
+    ):
+        """Wrapper function that creates a fresh environment for each process"""
+        from .interpreter import Interpreter
+        from .journal import Node, Journal
+        from copy import deepcopy
+        import os
+        import multiprocessing
+
+        print("Starting _process_node_wrapper")
+
+        # Create process-specific workspace
+        process_id = multiprocessing.current_process().name
+        workspace = os.path.join(cfg.workspace_dir, f"process_{process_id}")
+        os.makedirs(workspace, exist_ok=True)
+        print(f"Process {process_id} using workspace: {workspace}")
+        # Create process-specific working directory
+        working_dir = os.path.join(workspace, "working")
+        os.makedirs(working_dir, exist_ok=True)
+
+        if gpu_id is not None:
+            os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
+            logger.info(f"Process {process_id} assigned to GPU {gpu_id}")
+        else:
+            os.environ["CUDA_VISIBLE_DEVICES"] = ""
+            logger.info(f"Process {process_id} running on CPU")
+
+        # Create minimal agent for worker process with the global metric definition
+        worker_agent = MinimalAgent(
+            task_desc=task_desc,
+            cfg=cfg,
+            memory_summary=memory_summary,
+            evaluation_metrics=evaluation_metrics,
+            stage_name=stage_name,
+        )
+
+        # Create interpreter instance for worker process
+        print("Creating Interpreter")
+        process_interpreter = Interpreter(
+            working_dir=workspace,
+            timeout=cfg.exec.timeout,
+            format_tb_ipython=cfg.exec.format_tb_ipython,
+            agent_file_name=cfg.exec.agent_file_name,
+        )
+
+        try:
+            print(f"stage_name: {stage_name}")
+            # Recreate node object from node_data, which becomes a parent node.
+            if node_data:
+                parent_node = Node.from_dict(node_data, journal=None)
+                print(f"Recreated parent node: {parent_node.id}")
+            else:
+                parent_node = None
+                print("No parent node to recreate")
+
+            # Process the node using worker agent
+            print("Starting node processing")
+            if seed_eval:
+                # Use the parent node's code to run the same code again
+                child_node = worker_agent._generate_seed_node(parent_node)
+                child_node.parent = parent_node
+                # Plot code should also be the same as the parent node
+                child_node.plot_code = parent_node.plot_code
+            else:
+                if parent_node is None:
+                    print("Drafting new node")
+                    child_node = worker_agent._draft()
+                elif parent_node.is_buggy:
+                    print("Debugging node with id: ", parent_node.id)
+                    child_node = worker_agent._debug(parent_node)
+                    child_node.parent = parent_node
+                else:
+                    if (
+                        new_hyperparam_idea is not None and new_ablation_idea is None
+                    ):  # stage 2
+                        child_node = worker_agent._generate_hyperparam_tuning_node(
+                            parent_node, new_hyperparam_idea
+                        )
+                        child_node.parent = parent_node
+                        logger.info(
+                            f"Processing hyperparam tuning: {child_node.hyperparam_name}"
+                        )
+                        print(
+                            f"[cyan]Running hyperparam tuning: {child_node.hyperparam_name}[/cyan]"
+                        )
+                    elif (
+                        new_ablation_idea is not None and new_hyperparam_idea is None
+                    ):  # stage 4
+                        child_node = worker_agent._generate_ablation_node(
+                            parent_node, new_ablation_idea
+                        )
+                        child_node.parent = parent_node
+                        logger.info(f"Processing ablation: {child_node.ablation_name}")
+                        print(
+                            f"[cyan]Running ablation study: {child_node.ablation_name}[/cyan]"
+                        )
+                    else:
+                        print("Improving node with id: ", parent_node.id)
+                        child_node = worker_agent._improve(parent_node)
+                        child_node.parent = parent_node
+
+            # Execute and parse results
+            print("Running code")
+            exec_result = process_interpreter.run(child_node.code, True)
+            process_interpreter.cleanup_session()
+
+            print("Parsing execution results")
+            worker_agent.parse_exec_result(
+                node=child_node, exec_result=exec_result, workspace=working_dir
+            )
+
+            # Add check for saved data files
+            data_files = [f for f in os.listdir(working_dir) if f.endswith(".npy")]
+            if not data_files:
+                logger.warning(
+                    "No .npy files found in working directory. Data may not have been saved properly."
+                )
+            else:
+                if seed_eval:
+                    # Use the parent node's parse code to parse the same data files again
+                    parse_metrics_code = parent_node.parse_metrics_code
+                    parse_metrics_plan = parent_node.parse_metrics_plan
+                    print(
+                        f"[blue]SEED EVAL: Parse metrics plan:[/blue] {parse_metrics_plan}"
+                    )
+                    print(
+                        f"[blue]SEED EVAL: Parse metrics code:[/blue] {parse_metrics_code}"
+                    )
+                    child_node.parse_metrics_code = parse_metrics_code
+                    child_node.parse_metrics_plan = parse_metrics_plan
+                else:
+                    # Call LLM to parse data files and extract metrics
+                    parse_metrics_prompt = {
+                        "Introduction": (
+                            "You are an AI researcher analyzing experimental results stored in numpy files. "
+                            "Write code to load and analyze the metrics from experiment_data.npy."
+                        ),
+                        "Context": [
+                            "Original Code: " + child_node.code,
+                        ],
+                        "Instructions": [
+                            "0. Make sure to get the working directory from os.path.join(os.getcwd(), 'working')",
+                            "1. Load the experiment_data.npy file, which is located in the working directory",
+                            "2. Extract metrics for each dataset. Make sure to refer to the original code to understand the structure of the data.",
+                            "3. Always print the name of the dataset before printing the metrics",
+                            "4. Always print the name of the metric before printing the value by specifying the metric name clearly. Avoid vague terms like 'train,' 'val,' or 'test.' Instead, use precise labels such as 'train accuracy,' 'validation loss,' or 'test F1 score,' etc.",
+                            "5. You only need to print the best or final value for each metric for each dataset",
+                            "6. DO NOT CREATE ANY PLOTS",
+                            "Important code structure requirements:",
+                            "  - Do NOT put any execution code inside 'if __name__ == \"__main__\":' block. Do not use 'if __name__ == \"__main__\":' at all.",
+                            "  - All code should be at the global scope or in functions that are called from the global scope",
+                            "  - The script should execute immediately when run, without requiring any special entry point",
+                        ],
+                        "Example data loading code": [
+                            """
+                            import matplotlib.pyplot as plt
+import numpy as np
+
+                            experiment_data = np.load(os.path.join(os.getcwd(), 'experiment_data.npy'), allow_pickle=True).item()
+                            """
+                        ],
+                        "Response format": worker_agent._prompt_metricparse_resp_fmt(),
+                    }
+
+                    (
+                        parse_metrics_plan,
+                        parse_metrics_code,
+                    ) = worker_agent.plan_and_code_query(parse_metrics_prompt)
+                    print(f"[blue]Parse metrics plan:[/blue] {parse_metrics_plan}")
+                    print(f"[blue]Parse metrics code:[/blue] {parse_metrics_code}")
+                    child_node.parse_metrics_plan = parse_metrics_plan
+                    child_node.parse_metrics_code = parse_metrics_code
+                try:
+                    # Execute the parsing code
+                    metrics_exec_result = process_interpreter.run(
+                        parse_metrics_code, True
+                    )
+                    process_interpreter.cleanup_session()
+                    child_node.parse_term_out = metrics_exec_result.term_out
+                    child_node.parse_exc_type = metrics_exec_result.exc_type
+                    child_node.parse_exc_info = metrics_exec_result.exc_info
+                    child_node.parse_exc_stack = metrics_exec_result.exc_stack
+
+                    if metrics_exec_result.exc_type is None:
+                        # Extract metrics from the execution output
+                        metrics_prompt = {
+                            "Introduction": "Parse the metrics from the execution output. You only need the final or best value of a metric for each dataset, not the entire list during training.",
+                            "Execution Output": metrics_exec_result.term_out,
+                        }
+                        print(
+                            f"[blue]Metrics_exec_result.term_out: {metrics_exec_result.term_out}[/blue]"
+                        )
+                        print(
+                            f"[blue]Metrics Parsing Execution Result:\n[/blue] {metrics_exec_result}"
+                        )
+
+                        metrics_response = cast(
+                            dict,
+                            query(
+                                system_message=metrics_prompt,
+                                user_message=None,
+                                func_spec=metric_parse_spec,
+                                model=cfg.agent.feedback.model,
+                                temperature=cfg.agent.feedback.temp,
+                            ),
+                        )
+                        # If there is any None value, child_node.metric should be set to WorstMetricValue.
+                        # This is achieved by raising an error in the MetricValue class,
+                        # which sets child_node.is_buggy to True, thereby
+                        # causing child_node.metric to be assigned WorstMetricValue.
+                        print(f"[blue]Metrics:[/blue] {metrics_response}")
+                        if metrics_response["valid_metrics_received"]:
+                            child_node.metric = MetricValue(
+                                value={"metric_names": metrics_response["metric_names"]}
+                            )
+                            logger.info(
+                                f"Successfully extracted metrics for node {child_node.id}"
+                            )
+                        else:
+                            child_node.metric = WorstMetricValue()
+                            child_node.is_buggy = True
+                            logger.error(
+                                f"No valid metrics received for node {child_node.id}"
+                            )
+                    else:
+                        logger.error(
+                            f"Error executing metrics parsing code: {metrics_exec_result.exc_info}"
+                        )
+                        child_node.metric = WorstMetricValue()
+                        child_node.is_buggy = True
+
+                except Exception as e:
+                    logger.error(
+                        f"Error parsing metrics for node {child_node.id}: {str(e)}"
+                    )
+                    child_node.metric = WorstMetricValue()
+                    child_node.is_buggy = True
+                    child_node.parse_exc_type = str(e)
+                    child_node.parse_exc_info = None
+                    child_node.parse_exc_stack = None
+                    child_node.parse_term_out = (
+                        "Error parsing metrics. There was an error in the parsing code: "
+                        + str(e)
+                    )
+
+            # if experiment was successful, generate and run plotting code
+            if not child_node.is_buggy:
+                try:
+                    retry_count = 0
+                    while True:
+                        if seed_eval:
+                            # Use the parent node's plotting code instead of generating new one
+                            plotting_code = parent_node.plot_code
+                        else:
+                            if (
+                                worker_agent.stage_name
+                                and worker_agent.stage_name.startswith("3_")
+                                and best_stage2_plot_code
+                            ):
+                                plot_code_from_prev_stage = best_stage2_plot_code
+                            elif (
+                                worker_agent.stage_name
+                                and worker_agent.stage_name.startswith("4_")
+                                and best_stage3_plot_code
+                            ):
+                                plot_code_from_prev_stage = best_stage3_plot_code
+                            else:
+                                plot_code_from_prev_stage = None
+
+                            plotting_code = worker_agent._generate_plotting_code(
+                                child_node, working_dir, plot_code_from_prev_stage
+                            )
+                        plot_exec_result = process_interpreter.run(plotting_code, True)
+                        process_interpreter.cleanup_session()
+                        child_node.plot_exec_result = plot_exec_result
+                        if child_node.plot_exc_type and retry_count < 3:
+                            print(
+                                f"[red]Plotting code failed with exception: {child_node.plot_exc_type}[/red]"
+                            )
+                            print(
+                                f"[red]Plotting code term out:[/red] {child_node.plot_term_out}"
+                            )
+                            print(
+                                f"[red]Plotting code code:[/red] {child_node.plot_code}"
+                            )
+                            retry_count += 1
+                            continue
+                        else:
+                            break
+
+                    print("[blue]Plotting result:[/blue] ", plot_exec_result)
+                    # Track generated plots
+                    plots_dir = Path(working_dir)
+                    if plots_dir.exists():
+                        print("Plots directory exists, saving plots to node")
+                        # Save the plotting code first
+                        base_dir = Path(cfg.workspace_dir).parent
+                        run_name = Path(cfg.workspace_dir).name
+                        exp_results_dir = (
+                            base_dir
+                            / "logs"
+                            / run_name
+                            / "experiment_results"
+                            / f"experiment_{child_node.id}_proc_{os.getpid()}"
+                        )
+                        child_node.exp_results_dir = exp_results_dir
+                        exp_results_dir.mkdir(parents=True, exist_ok=True)
+                        plot_code_path = exp_results_dir / "plotting_code.py"
+                        with open(plot_code_path, "w") as f:
+                            f.write(plotting_code)
+                        logger.info(f"Saved plotting code to {plot_code_path}")
+                        # Save experiment code to experiment_results directory
+                        exp_code_path = exp_results_dir / "experiment_code.py"
+                        with open(exp_code_path, "w") as f:
+                            f.write(child_node.code)
+                        logger.info(f"Saved experiment code to {exp_code_path}")
+                        # Move experiment data files to experiment_results directory
+                        for exp_data_file in plots_dir.glob("*.npy"):
+                            exp_data_path = exp_results_dir / exp_data_file.name
+                            exp_data_file.resolve().rename(exp_data_path)
+                            logger.info(f"Saved experiment data to {exp_data_path}")
+
+                        for plot_file in plots_dir.glob("*.png"):
+                            # Get the base directory (parent of workspaces/logs)
+                            base_dir = Path(cfg.workspace_dir).parent.parent
+                            run_name = Path(cfg.workspace_dir).name
+
+                            # Create the final path in logs directory
+                            final_path = exp_results_dir / plot_file.name
+                            plot_file.resolve().rename(final_path)
+
+                            # Create a web-friendly relative path starting from logs directory
+                            web_path = f"../../logs/{Path(cfg.workspace_dir).name}/experiment_results/experiment_{child_node.id}_proc_{os.getpid()}/{plot_file.name}"
+
+                            child_node.plots.append(web_path)  # For visualization
+                            child_node.plot_paths.append(
+                                str(final_path.absolute())
+                            )  # For programmatic access
+
+                            logger.info(
+                                f"[green]Generated plot: {plot_file.stem}[/green]"
+                            )
+                            logger.debug(f"Plot absolute path: {final_path.absolute()}")
+                            logger.debug(f"Plot web path: {web_path}")
+                except Exception as e:
+                    logger.error(
+                        f"Error generating plots for node {child_node.id}: {str(e)}"
+                    )
+
+                if child_node.plots:
+                    try:
+                        worker_agent._analyze_plots_with_vlm(child_node)
+                        logger.info(
+                            f"Generated VLM analysis for plots in node {child_node.id}"
+                        )
+                    except Exception as e:
+                        logger.error(
+                            f"Error analyzing plots for node {child_node.id}: {str(e)}"
+                        )
+
+            # Convert result node to dict
+            print("Converting result to dict")
+            result_data = child_node.to_dict()
+            print(f"Result data keys: {result_data.keys()}")
+            print(f"Result data size: {len(str(result_data))} chars")
+            print("Returning result")
+            return result_data
+
+        except Exception as e:
+            print(f"Worker process error: {str(e)}")
+            import traceback
+
+            traceback.print_exc()
+            raise
+
+    def _generate_hyperparam_tuning_idea(self) -> Optional[HyperparamTuningIdea]:
+        """Generate the next hyperparam tuning idea based on what's been done.
+        This is minaly for Stage 2 (baseline tuning).
+        """
+        tried = list(self._hyperparam_tuning_state["tried_hyperparams"])
+
+        hyperparam_tuning_prompt = {
+            "Introduction": (
+                "You are an AI researcher conducting hyperparameter tuning for baseline experiments. "
+                "Based on the current implementation and previous hyperparameter tuning attempts (if any), "
+                "propose ONE new hyperparameter tuning idea to see if it improves the performance."
+                "You should first check if simply training longer (more epochs) improves the performance."
+                "Then try tuning common hyperparameters such as learning rate, batch size, etc."
+                "Only propose algorithm-specific and/or model-specific hyperparameters after you have tried the above."
+            ),
+            "Base code you are working on": wrap_code(self.best_stage1_node.code),
+            "Previous Hyperparam Tuning Attempts": {
+                "Has been tried": tried if tried else "Nothing has been tried yet.",
+            },
+            "Instructions": {
+                "Requirements": [
+                    "1. Identify ONE specific hyperparameter to tune",
+                    "2. Ensure the hyperparameter is different from previous attempts",
+                ]
+            },
+            "Response format": (
+                "Your response should start with 'HYPERPARAM NAME: <hyperparam name>' on the first line to represent the name of the hyperparameter."
+                "The second line should start with 'DESCRIPTION: <description>', a brief description of what hyperparameter is being tuned and why (3-5 sentences). "
+            ),
+        }
+
+        retry_count = 0
+        retry_limit = 5
+        while retry_count < retry_limit:
+            response = query(
+                system_message=hyperparam_tuning_prompt,
+                user_message=None,
+                model=self.cfg.agent.code.model,
+                temperature=self.cfg.agent.code.temp,
+            )
+
+            # Parse the response
+            hyperparam_name, hyperparam_description = _parse_keyword_prefix_response(
+                response, "HYPERPARAM NAME:", "DESCRIPTION:"
+            )
+            if hyperparam_name and hyperparam_description:
+                return HyperparamTuningIdea(
+                    name=hyperparam_name, description=hyperparam_description
+                )
+
+            retry_count += 1
+            logger.warning(
+                f"Failed to parse hyperparam tuning response (attempt {retry_count}/{retry_limit})"
+            )
+
+        logger.error(
+            f"Failed to parse hyperparam tuning response after {retry_limit} retries. Falling back to default idea of increasing learning rate."
+        )
+        return HyperparamTuningIdea(
+            name="increase learning rate", description="increase learning rate"
+        )
+
+    def _generate_ablation_idea(self) -> Optional[AblationIdea]:
+        """Generate the next ablation idea based on what's been done"""
+
+        # Prepare context of what's been tried
+        completed = list(self._ablation_state["completed_ablations"])
+
+        ablation_prompt = {
+            "Introduction": (
+                "You are an AI researcher conducting ablation studies. "
+                "Based on the current implementation and previous ablations (if any), "
+                "propose ONE new ablation study that tests a different aspect of the model."
+            ),
+            "Base code you are working on": wrap_code(self.best_stage3_node.code),
+            "Previous Ablations": {
+                "Has been tried": (
+                    completed if completed else "Nothing has been tried yet."
+                ),
+            },
+            "Instructions": {
+                "Requirements": [
+                    "1. Identify ONE specific component/feature to ablate",
+                    "2. Ensure the ablation is different from previous completed or running attempts",
+                    "3. The ablation should be a new idea, not a variation of previous ideas",
+                    "4. If you have only used a single synthetic dataset throughout the experiment, one of your ablations should be to use multiple synthetic datasets (at least 3 different datasets)",
+                ]
+            },
+            "Response format": (
+                "Your response should start with 'ABLATION NAME: <ablation name>' on the first line to represent the name of the ablation."
+                "The second line should start with 'ABLATION DESCRIPTION: <description>', a brief description of what component is being ablated and why (3-5 sentences), "
+            ),
+        }
+
+        retry_count = 0
+        retry_limit = 5
+        while retry_count < retry_limit:
+            response = query(
+                system_message=ablation_prompt,
+                user_message=None,
+                model=self.cfg.agent.code.model,
+                temperature=self.cfg.agent.code.temp,
+            )
+
+            # Parse the response
+            ablation_name, ablation_description = _parse_keyword_prefix_response(
+                response, "ABLATION NAME:", "ABLATION DESCRIPTION:"
+            )
+            if ablation_name and ablation_description:
+                return AblationIdea(
+                    name=ablation_name, description=ablation_description
+                )
+
+            retry_count += 1
+            logger.warning(
+                f"Failed to parse ablation response (attempt {retry_count}/{retry_limit})"
+            )
+
+        logger.error(
+            f"Failed to parse ablation response after {retry_limit} retries. Falling back to default idea of removing dropout."
+        )
+        return AblationIdea(name="add one more layer", description="add one more layer")
+
+    def _get_leaves(self, node: Node) -> List[Node]:
+        """Get all leaf nodes in the subtree rooted at node."""
+        if not node.children:
+            return [node]
+
+        leaves = []
+        for child in node.children:
+            leaves.extend(self._get_leaves(child))
+        return leaves
+
+    def _select_parallel_nodes(self) -> List[Optional[Node]]:
+        """Select N nodes to process in parallel,
+        balancing between tree exploration and exploitation.
+        Note:
+        - This function runs in the main process.
+        Some design considerations:
+        - For Stage 2 and 4, we generate nodes in the main process and
+        send them to worker processes.
+        This is to make sure we don't run duplicate ideas in parallel.
+        - For Stage 1 and 3, we generate nodes in worker processes.
+        """
+        nodes_to_process = []
+        processed_trees = set()
+        search_cfg = self.cfg.agent.search
+        print(f"[cyan]self.num_workers: {self.num_workers}, [/cyan]")
+
+        while len(nodes_to_process) < self.num_workers:
+            # Initial drafting phase, creating root nodes
+            print(
+                f"Checking draft nodes... num of journal.draft_nodes: {len(self.journal.draft_nodes)}, search_cfg.num_drafts: {search_cfg.num_drafts}"
+            )
+            if len(self.journal.draft_nodes) < search_cfg.num_drafts:
+                nodes_to_process.append(None)
+                continue
+
+            # Get viable trees
+            viable_trees = [
+                root
+                for root in self.journal.draft_nodes
+                if not all(leaf.is_buggy for leaf in self._get_leaves(root))
+            ]
+
+            # Debugging phase (with some probability)
+            if random.random() < search_cfg.debug_prob:
+                print("Checking debuggable nodes")
+                # print(f"Buggy nodes: {self.journal.buggy_nodes}")
+                try:
+                    debuggable_nodes = None
+                    print("Checking buggy nodes...")
+                    buggy_nodes = self.journal.buggy_nodes
+                    print(f"Type of buggy_nodes: {type(buggy_nodes)}")
+                    print(f"Length of buggy_nodes: {len(buggy_nodes)}")
+
+                    for i, n in enumerate(buggy_nodes):
+                        if not isinstance(n, Node):
+                            print(f"Found non-Node object in journal.buggy_nodes: {n}")
+                            raise ValueError(
+                                "Found non-Node object in journal.buggy_nodes"
+                            )
+                    debuggable_nodes = [
+                        n
+                        for n in self.journal.buggy_nodes
+                        if (
+                            isinstance(n, Node)
+                            and n.is_leaf
+                            and n.debug_depth <= search_cfg.max_debug_depth
+                        )
+                    ]
+                except Exception as e:
+                    print(f"Error getting debuggable nodes: {e}")
+                if debuggable_nodes:
+                    print("Found debuggable nodes")
+                    node = random.choice(debuggable_nodes)
+                    tree_root = node
+                    while tree_root.parent:
+                        tree_root = tree_root.parent
+
+                    tree_id = id(tree_root)
+                    if tree_id not in processed_trees or len(processed_trees) >= len(
+                        viable_trees
+                    ):
+                        nodes_to_process.append(node)
+                        processed_trees.add(tree_id)
+                        continue
+
+            # Special handling for Stage 4 (Ablation Studies)
+            print(f"[red]self.stage_name: {self.stage_name}[/red]")
+            # print(f"[red]self.best_stage3_node: {self.best_stage3_node}[/red]")
+            if self.stage_name and self.stage_name.startswith("4_"):
+                nodes_to_process.append(self.best_stage3_node)
+                continue
+            # Special handling for Stage 2 (Hyperparam tuning for baseline)
+            elif self.stage_name and self.stage_name.startswith("2_"):
+                nodes_to_process.append(self.best_stage1_node)
+                continue
+            else:  # Stage 1, 3 (normal best-first search)
+                # Improvement phase
+                print("Checking good nodes..")
+                good_nodes = self.journal.good_nodes
+                if not good_nodes:
+                    nodes_to_process.append(None)  # Back to drafting
+                    continue
+
+                # Get best node from unprocessed tree if possible
+                best_node = self.journal.get_best_node(cfg=self.cfg)
+                tree_root = best_node
+                while tree_root.parent:
+                    tree_root = tree_root.parent
+
+                tree_id = id(tree_root)
+                if tree_id not in processed_trees or len(processed_trees) >= len(
+                    viable_trees
+                ):
+                    nodes_to_process.append(best_node)
+                    processed_trees.add(tree_id)
+                    continue
+
+                # If we can't use best node (tree already processed), try next best nodes
+                for node in sorted(good_nodes, key=lambda n: n.metric, reverse=True):
+                    tree_root = node
+                    while tree_root.parent:
+                        tree_root = tree_root.parent
+                    tree_id = id(tree_root)
+                    if tree_id not in processed_trees or len(processed_trees) >= len(
+                        viable_trees
+                    ):
+                        nodes_to_process.append(node)
+                        processed_trees.add(tree_id)
+                        break
+
+        return nodes_to_process
+
+    def step(self, exec_callback: ExecCallbackType):
+        print("Selecting nodes to process")
+        nodes_to_process = self._select_parallel_nodes()
+        print(f"Selected nodes: {[n.id if n else None for n in nodes_to_process]}")
+
+        # Convert nodes to dicts
+        node_data_list = []
+        for node in nodes_to_process:
+            if node:
+                try:
+                    node_data = node.to_dict()
+                    _safe_pickle_test(node_data, f"node {node.id} data")
+                    node_data_list.append(node_data)
+                except Exception as e:
+                    logger.error(f"Error preparing node {node.id}: {str(e)}")
+                    raise
+            else:
+                node_data_list.append(None)  # None means new draft
+
+        if self.cfg.agent.get("summary", None) is not None:
+            memory_summary = self.journal.generate_summary(
+                include_code=False, 
+                **{
+                    "model": self.cfg.agent.summary.model, 
+                    "temp": self.cfg.agent.summary.temp
+                }
+            )
+        else:
+            memory_summary = self.journal.generate_summary(include_code=False)
+
+        print("Submitting tasks to process pool")
+        futures = []
+        for node_data in node_data_list:
+            gpu_id = None
+            if self.gpu_manager is not None:
+                try:
+                    # Get current process ID for GPU assignment
+                    process_id = f"worker_{len(futures)}"
+                    gpu_id = self.gpu_manager.acquire_gpu(process_id)
+                    logger.info(f"Assigned GPU {gpu_id} to process {process_id}")
+                except RuntimeError as e:
+                    logger.warning(f"Could not acquire GPU: {e}. Running on CPU")
+
+            if (
+                self.stage_name
+                and self.stage_name.startswith("2_")
+                and node_data["is_buggy"] is False
+            ):
+                new_hyperparam_idea = self._generate_hyperparam_tuning_idea()
+                self._hyperparam_tuning_state["tried_hyperparams"].add(
+                    new_hyperparam_idea.name
+                )
+                new_ablation_idea = None
+            elif (
+                self.stage_name
+                and self.stage_name.startswith("4_")
+                and node_data["is_buggy"] is False
+            ):
+                new_ablation_idea = self._generate_ablation_idea()
+                self._ablation_state["completed_ablations"].add(new_ablation_idea.name)
+                new_hyperparam_idea = None
+            else:
+                new_ablation_idea = None
+                new_hyperparam_idea = None
+
+            best_stage1_plot_code = (
+                self.best_stage1_node.plot_code if self.best_stage1_node else None
+            )
+            best_stage2_plot_code = (
+                self.best_stage2_node.plot_code if self.best_stage2_node else None
+            )
+            best_stage3_plot_code = (
+                self.best_stage3_node.plot_code if self.best_stage3_node else None
+            )
+            seed_eval = False
+            futures.append(
+                self.executor.submit(
+                    self._process_node_wrapper,
+                    node_data,
+                    self.task_desc,
+                    self.cfg,
+                    gpu_id,
+                    memory_summary,
+                    self.evaluation_metrics,
+                    self.stage_name,
+                    new_ablation_idea,
+                    new_hyperparam_idea,
+                    best_stage1_plot_code,
+                    best_stage2_plot_code,
+                    best_stage3_plot_code,
+                    seed_eval,
+                )
+            )
+
+        # Add results to journal
+        print("Waiting for results")
+        for i, future in enumerate(futures):
+            try:
+                print("About to get result from future")
+                result_data = future.result(timeout=self.timeout)
+                if "metric" in result_data:
+                    print(f"metric type: {type(result_data['metric'])}")
+                    print(f"metric contents: {result_data['metric']}")
+
+                # Create node and restore relationships using journal.
+                # Journal acts as a database to look up a parent node,
+                # and add the result node as a child.
+                result_node = Node.from_dict(result_data, self.journal)
+                print("[red]Investigating if result node has metric[/red]", flush=True)
+                print(result_node.metric)
+                # Update hyperparam tuning state if in Stage 2
+                self._update_hyperparam_tuning_state(result_node)
+                # Update ablation state if in Stage 4
+                self._update_ablation_state(result_node)
+
+                # Add node to journal's list and assign its step number
+                self.journal.append(result_node)
+                print("Added result node to journal")
+
+            except TimeoutError:
+                print("Worker process timed out, couldn't get the result")
+                logger.error(f"Worker process timed out, couldn't get the result")
+            except Exception as e:
+                print(f"Error processing node: {str(e)}")
+                logger.error(f"Error processing node: {str(e)}")
+                import traceback
+
+                traceback.print_exc()
+                raise
+            finally:
+                # Release GPU for this process if it was using one
+                process_id = f"worker_{i}"
+                if (
+                    self.gpu_manager is not None
+                    and process_id in self.gpu_manager.gpu_assignments
+                ):
+                    self.gpu_manager.release_gpu(process_id)
+                    logger.info(f"Released GPU for process {process_id}")
+
+    def _update_hyperparam_tuning_state(self, result_node: Node):
+        """Update hyperparam tuning tracking state based on execution results."""
+        if not self.stage_name or not self.stage_name.startswith("2_"):
+            return
+
+        hyperparam_name = result_node.hyperparam_name
+        if hyperparam_name is None:
+            print(
+                f"[red]hyperparam_name is None for result_node: {result_node.id}[/red]"
+            )
+            return
+
+        if not result_node.is_buggy:
+            self._hyperparam_tuning_state["tried_hyperparams"].add(hyperparam_name)
+            logger.info(f"Hyperparam tuning {hyperparam_name} ran successfully")
+        else:
+            logger.warning(f"Hyperparam tuning {hyperparam_name} failed")
+
+    def _update_ablation_state(self, result_node: Node):
+        """Update ablation tracking state based on execution results.
+
+        Args:
+            result_node: Node containing ablation execution results
+        """
+        if not self.stage_name or not self.stage_name.startswith("4_"):
+            return
+
+        ablation_name = result_node.ablation_name
+        if ablation_name is None:
+            print(f"[red]ablation_name is None for result_node: {result_node.id}[/red]")
+            return
+
+        if not result_node.is_buggy:
+            self._ablation_state["completed_ablations"].add(ablation_name)
+            logger.info(f"Ablation {ablation_name} completed successfully")
+
+    def _aggregate_seed_eval_results(
+        self, seed_nodes: List[Node], parent_node: Node
+    ) -> str:
+        """Generate aggregated plots from multi-seed evaluation results.
+
+        Args:
+            seed_nodes: List of nodes from seed evaluation
+            parent_node: The original node that was evaluated
+
+        Returns:
+            str: The plotting code for aggregated results
+        """
+        prompt_guideline = []
+        prompt_guideline += [
+            "REQUIREMENTS: ",
+            "The code should start with:",
+            "  import matplotlib.pyplot as plt",
+            "  import numpy as np",
+            "  import os",
+            "  working_dir = os.path.join(os.getcwd(), 'working')",
+            "Create standard visualizations of experiment results",
+            "Save all plots to working_dir",
+            "Include training/validation curves if available",
+            "ONLY plot data that exists in experiment_data.npy - DO NOT make up or simulate any values",
+            "Use basic matplotlib without custom styles",
+            "Each plot should be in a separate try-except block",
+            "Always close figures after saving",
+            "Always include a title for each plot, and be sure to use clear subtitles—such as 'Left: Ground Truth, Right: Generated Samples'—while also specifying the type of dataset being used.",
+            "Make sure to use descriptive names for figures when saving e.g. always include the dataset name and the type of plot in the name",
+            "When there are many similar figures to plot (e.g. generated samples at each epoch), make sure to plot only at a suitable interval of epochs so that you only plot at most 5 figures.",
+            "Example to extract data from experiment_data: experiment_data['dataset_name_1']['metrics']['train']",
+            "Make sure to add legend for standard error bars and means if applicable",
+        ]
+        prompt_guideline += [
+            "Example data loading and plot saving code: ",
+            """
+                try:
+                    experiment_data_path_list = # Make sure to use the correct experiment data path that's provided in the Experiment Data Path section
+                    all_experiment_data = []
+                    for experiment_data_path in experiment_data_path_list:
+                        root = os.getenv("AI_SCIENTIST_SKILLS_ROOT") or os.getenv("AI_SCIENTIST_ROOT") or ""
+                        experiment_data = np.load(os.path.join(root, experiment_data_path), allow_pickle=True).item()
+                        all_experiment_data.append(experiment_data)
+                except Exception as e:
+                    print(f'Error loading experiment data: {{e}}')
+
+                try:
+                    # First plot
+                    plt.figure()
+                    # ... plotting code ...
+                    plt.savefig('working_dir/[plot_name_1].png')
+                    plt.close()
+                except Exception as e:
+                    print(f"Error creating plot1: {{e}}")
+                    plt.close()  # Always close figure even if error occurs
+
+                try:
+                    # Second plot
+                    plt.figure()
+                    # ... plotting code ...
+                    plt.savefig('working_dir/[plot_name_2].png')
+                    plt.close()
+                except Exception as e:
+                    print(f"Error creating plot2: {{e}}")
+                    plt.close()
+            """,
+        ]
+        # add instruction for format
+        plotting_prompt = {
+            "Introduction": (
+                "You are an expert in data visualization and plotting. "
+                "You are given a set of evaluation results and the code that was used to plot them. "
+                "Your task is to write a new plotting code that aggregate the results "
+                "e.g. for example, by adding mean values and standard error bars to the plots."
+            ),
+            "Instructions": {},
+        }
+        plotting_prompt["Instructions"] |= {
+            "Response format": (
+                "Your response should be a brief outline/sketch of your proposed solution in natural language (7-10 sentences), "
+                "followed by a single markdown code block (wrapped in ```) which implements this solution and prints out the evaluation metric(s) if applicable. "
+                "There should be no additional headings or text in your response. Just natural language text followed by a newline and then the markdown code block. "
+            )
+        }
+        plotting_prompt["Instructions"] |= {
+            "Plotting code guideline": prompt_guideline,
+        }
+        plotting_prompt["Instructions"] |= {
+            "Plotting code reference": (
+                "plotting code 1:\n" + seed_nodes[0].plot_code + "\n\n"
+                "plotting code 2:\n" + seed_nodes[1].plot_code + "\n\n"
+                "plotting code 3:\n" + seed_nodes[2].plot_code + "\n\n"
+            ),
+            "Experiment Data Path": (
+                f"{seed_nodes[0].exp_results_dir}/experiment_data.npy\n"
+                f"{seed_nodes[1].exp_results_dir}/experiment_data.npy\n"
+                f"{seed_nodes[2].exp_results_dir}/experiment_data.npy\n"
+            ),
+        }
+        plan, code = self.plan_and_code_query(plotting_prompt)
+
+        print("[green]Plan:[/green]\n", plan)
+        print(f"[green]Generated aggregated plotting code:[/green]\n{code}")
+
+        return code
+
+    def __enter__(self):
+        return self
+
+    def cleanup(self):
+        """Cleanup parallel workers and resources"""
+        if not self._is_shutdown:
+            print("Shutting down parallel executor...")
+            try:
+                # Release all GPUs
+                if self.gpu_manager is not None:
+                    for process_id in list(self.gpu_manager.gpu_assignments.keys()):
+                        self.gpu_manager.release_gpu(process_id)
+
+                # Shutdown executor first
+                self.executor.shutdown(wait=False, cancel_futures=True)
+
+                # Force terminate all worker processes
+                if self.executor._processes:
+                    ## Get copy of processes
+                    processes = list(self.executor._processes.values())
+
+                    # Then terminate processes if they're still alive
+                    for process in processes:
+                        if process.is_alive():
+                            process.terminate()
+                            process.join(timeout=1)
+
+                print("Executor shutdown complete")
+
+            except Exception as e:
+                print(f"Error during executor shutdown: {e}")
+            finally:
+                self._is_shutdown = True
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.cleanup()
diff --git a/skills/experiment-bfts-runner/scripts/asv2/treesearch/perform_experiments_bfts_with_agentmanager.py b/skills/experiment-bfts-runner/scripts/asv2/treesearch/perform_experiments_bfts_with_agentmanager.py
new file mode 100644
index 00000000..7779630b
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/treesearch/perform_experiments_bfts_with_agentmanager.py
@@ -0,0 +1,262 @@
+import atexit
+import logging
+import shutil
+import json
+import pickle
+from . import backend
+from .journal import Journal, Node
+from .journal2report import journal2report
+from rich.columns import Columns
+from rich.console import Group
+from rich.live import Live
+from rich.padding import Padding
+from rich.panel import Panel
+from rich.progress import (
+    BarColumn,
+    MofNCompleteColumn,
+    Progress,
+    TextColumn,
+    TimeRemainingColumn,
+)
+from rich.text import Text
+from rich.status import Status
+from rich.tree import Tree
+from .utils.config import load_task_desc, prep_agent_workspace, save_run, load_cfg
+from .agent_manager import AgentManager
+from pathlib import Path
+from .agent_manager import Stage
+from .log_summarization import overall_summarize
+
+
+logger = logging.getLogger("ai-scientist")
+
+
+def journal_to_rich_tree(journal: Journal, cfg):
+    best_node = journal.get_best_node(cfg=cfg)
+
+    def append_rec(node: Node, tree):
+        if node.is_buggy:
+            s = "[red]◍ bug"
+        else:
+            style = "bold " if node is best_node else ""
+
+            if node is best_node:
+                s = f"[{style}green]● {node.metric.value:.3f} (best)"
+            else:
+                s = f"[{style}green]● {node.metric.value:.3f}"
+
+        subtree = tree.add(s)
+        for child in node.children:
+            append_rec(child, subtree)
+
+    tree = Tree("[bold blue]Solution tree")
+    for n in journal.draft_nodes:
+        append_rec(n, tree)
+    return tree
+
+
+def perform_experiments_bfts(config_path: str):
+    # turn config path string into a path object
+    config_path = Path(config_path)
+    cfg = load_cfg(config_path)
+    logger.info(f'Starting run "{cfg.exp_name}"')
+
+    task_desc = load_task_desc(cfg)
+    print(task_desc)
+    task_desc_str = backend.compile_prompt_to_md(task_desc)
+
+    global_step = 0
+
+    with Status("Preparing agent workspace (copying and extracting files) ..."):
+        prep_agent_workspace(cfg)
+
+    def cleanup():
+        if global_step == 0:
+            shutil.rmtree(cfg.workspace_dir)
+
+    atexit.register(cleanup)
+
+    manager = AgentManager(
+        task_desc=task_desc,
+        cfg=cfg,
+        workspace_dir=Path(cfg.workspace_dir),
+    )
+
+    prog = Progress(
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(bar_width=20),
+        MofNCompleteColumn(),
+        TimeRemainingColumn(),
+    )
+    status = Status("[green]Running experiments...")
+    prog.add_task("Progress:", total=cfg.agent.steps, completed=global_step)
+
+    def create_exec_callback(status_obj):
+        def exec_callback(*args, **kwargs):
+            status_obj.update("[magenta]Executing code...")
+            res = interpreter.run(*args, **kwargs)
+            status_obj.update("[green]Generating code...")
+            return res
+
+        return exec_callback
+
+    def step_callback(stage, journal):
+        print("Step complete")
+        try:
+            # Generate and save notes for this step
+            notes_dir = cfg.log_dir / f"stage_{stage.name}" / "notes"
+            notes_dir.mkdir(parents=True, exist_ok=True)
+
+            # Save latest node summary
+            if journal.nodes:
+                latest_node = journal.nodes[-1]
+                if hasattr(latest_node, "_agent"):
+                    summary = latest_node._agent._generate_node_summary(latest_node)
+                    with open(
+                        notes_dir / f"node_{latest_node.id}_summary.json", "w"
+                    ) as f:
+                        json.dump(summary, f, indent=2)
+
+
+            if cfg.agent.get("summary", None) is not None:
+                current_findings = journal.generate_summary(
+                    include_code=False, 
+                    **{
+                        "model": cfg.agent.summary.model, 
+                        "temp": cfg.agent.summary.temp
+                    }
+                )
+            else:
+                current_findings = journal.generate_summary(include_code=False)
+
+            best_metric = journal.get_best_node(cfg=cfg)
+
+            # Generate and save stage progress summary
+            stage_summary = {
+                "stage": stage.name,
+                "total_nodes": len(journal.nodes),
+                "buggy_nodes": len(journal.buggy_nodes),
+                "good_nodes": len(journal.good_nodes),
+                "best_metric": (
+                    str(best_metric.metric)
+                    if best_metric
+                    else "None"
+                ),
+                "current_findings": current_findings,
+            }
+
+            with open(notes_dir / "stage_progress.json", "w") as f:
+                json.dump(stage_summary, f, indent=2)
+
+            # Save the run as before
+            save_run(cfg, journal, stage_name=f"stage_{stage.name}")
+
+        except Exception as e:
+            print(f"Error in step callback: {e}")
+
+        print(f"Run saved at {cfg.log_dir / f'stage_{stage.name}'}")
+        print(f"Step {len(journal)}/{stage.max_iterations} at stage_{stage.name}")
+        print(f"Run saved at {cfg.log_dir / f'stage_{stage.name}'}")
+
+    def generate_live(manager):
+        current_stage = manager.current_stage
+        current_journal = manager.journals.get(
+            current_stage.name if current_stage else None, None
+        )
+
+        if current_journal:
+            tree = journal_to_rich_tree(current_journal, cfg)
+        else:
+            tree = Tree("[bold blue]No results yet")
+
+        file_paths = [
+            f"Result visualization:\n[yellow]▶ {str((cfg.log_dir / 'tree_plot.html'))}",
+            f"Agent workspace directory:\n[yellow]▶ {str(cfg.workspace_dir)}",
+            f"Experiment log directory:\n[yellow]▶ {str(cfg.log_dir)}",
+        ]
+
+        stage_info = [
+            "[bold]Experiment Progress:",
+            f"Current Stage: [cyan]{current_stage.name if current_stage else 'None'}[/cyan]",
+            f"Completed Stages: [green]{', '.join(manager.completed_stages)}[/green]",
+        ]
+
+        left = Group(
+            Panel(Text(task_desc_str.strip()), title="Task description"),
+            Panel(Text("\n".join(stage_info)), title="Stage Progress"),
+            prog,
+            status,
+        )
+        right = tree
+        wide = Group(*file_paths)
+
+        return Panel(
+            Group(
+                Padding(wide, (1, 1, 1, 1)),
+                Columns(
+                    [Padding(left, (1, 2, 1, 1)), Padding(right, (1, 1, 1, 2))],
+                    equal=True,
+                ),
+            ),
+            title=f'[b]AIDE is working on experiment: [bold green]"{cfg.exp_name}[/b]"',
+            subtitle="Press [b]Ctrl+C[/b] to stop the run",
+        )
+
+    live = Live(
+        generate_live(manager),
+        refresh_per_second=16,
+        screen=True,
+    )
+
+    manager.run(exec_callback=create_exec_callback(status), step_callback=step_callback)
+
+    manager_pickle_path = cfg.log_dir / "manager.pkl"
+    try:
+        with open(manager_pickle_path, "wb") as f:
+            pickle.dump(manager, f)
+        logger.info(f"Saved manager state to: {manager_pickle_path}")
+    except Exception as e:
+        logger.warning(f"Failed to save full manager state: {e}")
+        try:
+            with open(manager_pickle_path, "wb") as f:
+                pickle.dump(manager.journals.items(), f)
+            logger.info(f"Saved manager journals to: {manager_pickle_path}")
+        except Exception as e:
+            logger.error(f"Failed to save manager journals: {e}")
+
+    if cfg.generate_report:
+        print("Generating final report from all stages...")
+        (
+            draft_summary,
+            baseline_summary,
+            research_summary,
+            ablation_summary,
+        ) = overall_summarize(manager.journals.items(), cfg)
+        draft_summary_path = cfg.log_dir / "draft_summary.json"
+        baseline_summary_path = cfg.log_dir / "baseline_summary.json"
+        research_summary_path = cfg.log_dir / "research_summary.json"
+        ablation_summary_path = cfg.log_dir / "ablation_summary.json"
+
+        with open(draft_summary_path, "w") as draft_file:
+            json.dump(draft_summary, draft_file, indent=2)
+
+        with open(baseline_summary_path, "w") as baseline_file:
+            json.dump(baseline_summary, baseline_file, indent=2)
+
+        with open(research_summary_path, "w") as research_file:
+            json.dump(research_summary, research_file, indent=2)
+
+        with open(ablation_summary_path, "w") as ablation_file:
+            json.dump(ablation_summary, ablation_file, indent=2)
+
+        print(f"Summary reports written to files:")
+        print(f"- Draft summary: {draft_summary_path}")
+        print(f"- Baseline summary: {baseline_summary_path}")
+        print(f"- Research summary: {research_summary_path}")
+        print(f"- Ablation summary: {ablation_summary_path}")
+
+
+if __name__ == "__main__":
+    cfg_path = "treesearch/utils/config.yaml"
+    cfg = load_cfg(cfg_path)
+    perform_experiments_bfts(cfg_path)
diff --git a/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/__init__.py b/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/__init__.py
new file mode 100644
index 00000000..edd72d1e
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/__init__.py
@@ -0,0 +1,100 @@
+import logging
+import shutil
+import zipfile
+from pathlib import Path
+
+logger = logging.getLogger("ai-scientist")
+
+
+def copytree(src: Path, dst: Path, use_symlinks=True):
+    """
+    Copy contents of `src` to `dst`. Unlike shutil.copytree, the dst dir can exist and will be merged.
+    If src is a file, only that file will be copied. Optionally uses symlinks instead of copying.
+
+    Args:
+        src (Path): source directory
+        dst (Path): destination directory
+    """
+    assert dst.is_dir()
+
+    if src.is_file():
+        dest_f = dst / src.name
+        assert not dest_f.exists(), dest_f
+        if use_symlinks:
+            (dest_f).symlink_to(src)
+        else:
+            shutil.copyfile(src, dest_f)
+        return
+
+    for f in src.iterdir():
+        dest_f = dst / f.name
+        assert not dest_f.exists(), dest_f
+        if use_symlinks:
+            (dest_f).symlink_to(f)
+        elif f.is_dir():
+            shutil.copytree(f, dest_f)
+        else:
+            shutil.copyfile(f, dest_f)
+
+
+def clean_up_dataset(path: Path):
+    for item in path.rglob("__MACOSX"):
+        if item.is_dir():
+            shutil.rmtree(item)
+    for item in path.rglob(".DS_Store"):
+        if item.is_file():
+            item.unlink()
+
+
+def extract_archives(path: Path):
+    """
+    unzips all .zip files within `path` and cleans up task dir
+
+    [TODO] handle nested zips
+    """
+    for zip_f in path.rglob("*.zip"):
+        f_out_dir = zip_f.with_suffix("")
+
+        # special case: the intended output path already exists (maybe data has already been extracted by user)
+        if f_out_dir.exists():
+            logger.debug(
+                f"Skipping {zip_f} as an item with the same name already exists."
+            )
+            # if it's a file, it's probably exactly the same as in the zip -> remove the zip
+            # [TODO] maybe add an extra check to see if zip file content matches the colliding file
+            if f_out_dir.is_file() and f_out_dir.suffix != "":
+                zip_f.unlink()
+            continue
+
+        logger.debug(f"Extracting: {zip_f}")
+        f_out_dir.mkdir(exist_ok=True)
+        with zipfile.ZipFile(zip_f, "r") as zip_ref:
+            zip_ref.extractall(f_out_dir)
+
+        # remove any unwanted files
+        clean_up_dataset(f_out_dir)
+
+        contents = list(f_out_dir.iterdir())
+
+        # special case: the zip contains a single dir/file with the same name as the zip
+        if len(contents) == 1 and contents[0].name == f_out_dir.name:
+            sub_item = contents[0]
+            # if it's a dir, move its contents to the parent and remove it
+            if sub_item.is_dir():
+                logger.debug(f"Special handling (child is dir) enabled for: {zip_f}")
+                for f in sub_item.rglob("*"):
+                    shutil.move(f, f_out_dir)
+                sub_item.rmdir()
+            # if it's a file, rename it to the parent and remove the parent
+            elif sub_item.is_file():
+                logger.debug(f"Special handling (child is file) enabled for: {zip_f}")
+                sub_item_tmp = sub_item.rename(f_out_dir.with_suffix(".__tmp_rename"))
+                f_out_dir.rmdir()
+                sub_item_tmp.rename(f_out_dir)
+
+        zip_f.unlink()
+
+
+def preproc_data(path: Path):
+    extract_archives(path)
+    clean_up_dataset(path)
diff --git a/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/config.py b/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/config.py
new file mode 100644
index 00000000..aaeba678
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/config.py
@@ -0,0 +1,259 @@
+"""configuration and setup utils"""
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Hashable, cast, Literal, Optional
+
+import coolname
+import rich
+from omegaconf import OmegaConf
+from rich.syntax import Syntax
+import shutup
+from rich.logging import RichHandler
+import logging
+
+from . import tree_export
+from . import copytree, preproc_data, serialize
+
+shutup.mute_warnings()
+logging.basicConfig(
+    level="WARNING", format="%(message)s", datefmt="[%X]", handlers=[RichHandler()]
+)
+logger = logging.getLogger("ai-scientist")
+logger.setLevel(logging.WARNING)
+
+
+""" these dataclasses are just for type hinting, the actual config is in config.yaml """
+
+
+@dataclass
+class ThinkingConfig:
+    type: str
+    budget_tokens: Optional[int] = None
+
+
+@dataclass
+class StageConfig:
+    model: str
+    temp: float
+    thinking: ThinkingConfig
+    betas: str
+    max_tokens: Optional[int] = None
+
+
+@dataclass
+class SearchConfig:
+    max_debug_depth: int
+    debug_prob: float
+    num_drafts: int
+
+
+@dataclass
+class DebugConfig:
+    stage4: bool
+
+
+@dataclass
+class AgentConfig:
+    steps: int
+    stages: dict[str, int]
+    k_fold_validation: int
+    expose_prediction: bool
+    data_preview: bool
+
+    code: StageConfig
+    feedback: StageConfig
+    vlm_feedback: StageConfig
+
+    search: SearchConfig
+    num_workers: int
+    type: str
+    multi_seed_eval: dict[str, int]
+
+    summary: Optional[StageConfig] = None
+    select_node: Optional[StageConfig] = None
+
+@dataclass
+class ExecConfig:
+    timeout: int
+    agent_file_name: str
+    format_tb_ipython: bool
+
+
+@dataclass
+class ExperimentConfig:
+    num_syn_datasets: int
+
+
+@dataclass
+class Config(Hashable):
+    data_dir: Path
+    desc_file: Path | None
+
+    goal: str | None
+    eval: str | None
+
+    log_dir: Path
+    workspace_dir: Path
+
+    preprocess_data: bool
+    copy_data: bool
+
+    exp_name: str
+
+    exec: ExecConfig
+    generate_report: bool
+    report: StageConfig
+    agent: AgentConfig
+    experiment: ExperimentConfig
+    debug: DebugConfig
+
+
+def _get_next_logindex(dir: Path) -> int:
+    """Get the next available index for a log directory."""
+    max_index = -1
+    for p in dir.iterdir():
+        try:
+            if (current_index := int(p.name.split("-")[0])) > max_index:
+                max_index = current_index
+        except ValueError:
+            pass
+    print("max_index: ", max_index)
+    return max_index + 1
+
+
+def _load_cfg(
+    path: Path = Path(__file__).parent / "config.yaml", use_cli_args=False
+) -> Config:
+    cfg = OmegaConf.load(path)
+    if use_cli_args:
+        cfg = OmegaConf.merge(cfg, OmegaConf.from_cli())
+    return cfg
+
+
+def load_cfg(path: Path = Path(__file__).parent / "config.yaml") -> Config:
+    """Load config from .yaml file and CLI args, and set up logging directory."""
+    return prep_cfg(_load_cfg(path))
+
+
+def prep_cfg(cfg: Config):
+    if cfg.data_dir is None:
+        raise ValueError("`data_dir` must be provided.")
+
+    if cfg.desc_file is None and cfg.goal is None:
+        raise ValueError(
+            "You must provide either a description of the task goal (`goal=...`) or a path to a plaintext file containing the description (`desc_file=...`)."
+        )
+
+    if cfg.data_dir.startswith("example_tasks/"):
+        cfg.data_dir = Path(__file__).parent.parent / cfg.data_dir
+    cfg.data_dir = Path(cfg.data_dir).resolve()
+
+    if cfg.desc_file is not None:
+        cfg.desc_file = Path(cfg.desc_file).resolve()
+
+    top_log_dir = Path(cfg.log_dir).resolve()
+    top_log_dir.mkdir(parents=True, exist_ok=True)
+
+    top_workspace_dir = Path(cfg.workspace_dir).resolve()
+    top_workspace_dir.mkdir(parents=True, exist_ok=True)
+
+    # generate experiment name and prefix with consecutive index
+    ind = max(_get_next_logindex(top_log_dir), _get_next_logindex(top_workspace_dir))
+    cfg.exp_name = cfg.exp_name or coolname.generate_slug(3)
+    cfg.exp_name = f"{ind}-{cfg.exp_name}"
+
+    cfg.log_dir = (top_log_dir / cfg.exp_name).resolve()
+    cfg.workspace_dir = (top_workspace_dir / cfg.exp_name).resolve()
+
+    # validate the config
+    cfg_schema: Config = OmegaConf.structured(Config)
+    cfg = OmegaConf.merge(cfg_schema, cfg)
+
+    if cfg.agent.type not in ["parallel", "sequential"]:
+        raise ValueError("agent.type must be either 'parallel' or 'sequential'")
+
+    return cast(Config, cfg)
+
+
+def print_cfg(cfg: Config) -> None:
+    rich.print(Syntax(OmegaConf.to_yaml(cfg), "yaml", theme="paraiso-dark"))
+
+
+def load_task_desc(cfg: Config):
+    """Load task description from markdown file or config str."""
+
+    # either load the task description from a file
+    if cfg.desc_file is not None:
+        if not (cfg.goal is None and cfg.eval is None):
+            logger.warning(
+                "Ignoring goal and eval args because task description file is provided."
+            )
+
+        with open(cfg.desc_file) as f:
+            return f.read()
+
+    # or generate it from the goal and eval args
+    if cfg.goal is None:
+        raise ValueError(
+            "`goal` (and optionally `eval`) must be provided if a task description file is not provided."
+        )
+
+    task_desc = {"Task goal": cfg.goal}
+    if cfg.eval is not None:
+        task_desc["Task evaluation"] = cfg.eval
+    print(task_desc)
+    return task_desc
+
+
+def prep_agent_workspace(cfg: Config):
+    """Setup the agent's workspace and preprocess data if necessary."""
+    (cfg.workspace_dir / "input").mkdir(parents=True, exist_ok=True)
+    (cfg.workspace_dir / "working").mkdir(parents=True, exist_ok=True)
+
+    copytree(cfg.data_dir, cfg.workspace_dir / "input", use_symlinks=not cfg.copy_data)
+    if cfg.preprocess_data:
+        preproc_data(cfg.workspace_dir / "input")
+
+
+def save_run(cfg: Config, journal, stage_name: str = None):
+    if stage_name is None:
+        stage_name = "NoStageRun"
+    save_dir = cfg.log_dir / stage_name
+    save_dir.mkdir(parents=True, exist_ok=True)
+
+    # save journal
+    try:
+        serialize.dump_json(journal, save_dir / "journal.json")
+    except Exception as e:
+        print(f"Error saving journal: {e}")
+        raise
+    # save config
+    try:
+        OmegaConf.save(config=cfg, f=save_dir / "config.yaml")
+    except Exception as e:
+        print(f"Error saving config: {e}")
+        raise
+    # create the tree + code visualization
+    try:
+        tree_export.generate(cfg, journal, save_dir / "tree_plot.html")
+    except Exception as e:
+        print(f"Error generating tree: {e}")
+        raise
+    # save the best found solution
+    try:
+        best_node = journal.get_best_node(only_good=False, cfg=cfg)
+        if best_node is not None:
+            for existing_file in save_dir.glob("best_solution_*.py"):
+                existing_file.unlink()
+            # Create new best solution file
+            filename = f"best_solution_{best_node.id}.py"
+            with open(save_dir / filename, "w") as f:
+                f.write(best_node.code)
+            # save best_node.id to a text file
+            with open(save_dir / "best_node_id.txt", "w") as f:
+                f.write(str(best_node.id))
+        else:
+            print("No best node found yet")
+    except Exception as e:
+        print(f"Error saving best solution: {e}")
diff --git a/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/data_preview.py b/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/data_preview.py
new file mode 100644
index 00000000..cfdf6a90
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/data_preview.py
@@ -0,0 +1,153 @@
+"""
+Contains functions to manually generate a textual preview of some common file types (.csv, .json,..) for the agent.
+"""
+
+import json
+from pathlib import Path
+
+import humanize
+import pandas as pd
+from genson import SchemaBuilder
+from pandas.api.types import is_numeric_dtype
+
+# these files are treated as code (e.g. markdown wrapped)
+code_files = {".py", ".sh", ".yaml", ".yml", ".md", ".html", ".xml", ".log", ".rst"}
+# we treat these files as text (rather than binary) files
+plaintext_files = {".txt", ".csv", ".json", ".tsv"} | code_files
+
+
+def get_file_len_size(f: Path) -> tuple[int, str]:
+    """
+    Calculate the size of a file (#lines for plaintext files, otherwise #bytes)
+    Also returns a human-readable string representation of the size.
+    """
+    if f.suffix in plaintext_files:
+        num_lines = sum(1 for _ in open(f))
+        return num_lines, f"{num_lines} lines"
+    else:
+        s = f.stat().st_size
+        return s, humanize.naturalsize(s)
+
+
+def file_tree(path: Path, depth=0) -> str:
+    """Generate a tree structure of files in a directory"""
+    result = []
+    files = [p for p in Path(path).iterdir() if not p.is_dir()]
+    dirs = [p for p in Path(path).iterdir() if p.is_dir()]
+    max_n = 4 if len(files) > 30 else 8
+    for p in sorted(files)[:max_n]:
+        result.append(f"{' '*depth*4}{p.name} ({get_file_len_size(p)[1]})")
+    if len(files) > max_n:
+        result.append(f"{' '*depth*4}... and {len(files)-max_n} other files")
+
+    for p in sorted(dirs):
+        result.append(f"{' '*depth*4}{p.name}/")
+        result.append(file_tree(p, depth + 1))
+
+    return "\n".join(result)
+
+
+def _walk(path: Path):
+    """Recursively walk a directory (analogous to os.walk but for pathlib.Path)"""
+    for p in sorted(Path(path).iterdir()):
+        if p.is_dir():
+            yield from _walk(p)
+            continue
+        yield p
+
+
+def preview_csv(p: Path, file_name: str, simple=True) -> str:
+    """Generate a textual preview of a csv file
+
+    Args:
+        p (Path): the path to the csv file
+        file_name (str): the file name to use in the preview
+        simple (bool, optional): whether to use a simplified version of the preview. Defaults to True.
+
+    Returns:
+        str: the textual preview
+    """
+    df = pd.read_csv(p)
+
+    out = []
+
+    out.append(f"-> {file_name} has {df.shape[0]} rows and {df.shape[1]} columns.")
+
+    if simple:
+        cols = df.columns.tolist()
+        sel_cols = 15
+        cols_str = ", ".join(cols[:sel_cols])
+        res = f"The columns are: {cols_str}"
+        if len(cols) > sel_cols:
+            res += f"... and {len(cols)-sel_cols} more columns"
+        out.append(res)
+    else:
+        out.append("Here is some information about the columns:")
+        for col in sorted(df.columns):
+            dtype = df[col].dtype
+            name = f"{col} ({dtype})"
+
+            nan_count = df[col].isnull().sum()
+
+            if dtype == "bool":
+                v = df[col][df[col].notnull()].mean()
+                out.append(f"{name} is {v*100:.2f}% True, {100-v*100:.2f}% False")
+            elif df[col].nunique() < 10:
+                out.append(
+                    f"{name} has {df[col].nunique()} unique values: {df[col].unique().tolist()}"
+                )
+            elif is_numeric_dtype(df[col]):
+                out.append(
+                    f"{name} has range: {df[col].min():.2f} - {df[col].max():.2f}, {nan_count} nan values"
+                )
+            elif dtype == "object":
+                out.append(
+                    f"{name} has {df[col].nunique()} unique values. Some example values: {df[col].value_counts().head(4).index.tolist()}"
+                )
+
+    return "\n".join(out)
+
+
+def preview_json(p: Path, file_name: str):
+    """Generate a textual preview of a json file using a generated json schema"""
+    builder = SchemaBuilder()
+    with open(p) as f:
+        builder.add_object(json.load(f))
+    return f"-> {file_name} has auto-generated json schema:\n" + builder.to_json(
+        indent=2
+    )
+
+
+def generate(base_path, include_file_details=True, simple=False):
+    """
+    Generate a textual preview of a directory, including an overview of the directory
+    structure and previews of individual files
+    """
+    tree = f"```\n{file_tree(base_path)}```"
+    out = [tree]
+
+    if include_file_details:
+        for fn in _walk(base_path):
+            file_name = str(fn.relative_to(base_path))
+
+            if fn.suffix == ".csv":
+                out.append(preview_csv(fn, file_name, simple=simple))
+            elif fn.suffix == ".json":
+                out.append(preview_json(fn, file_name))
+            elif fn.suffix in plaintext_files:
+                if get_file_len_size(fn)[0] < 30:
+                    with open(fn) as f:
+                        content = f.read()
+                        if fn.suffix in code_files:
+                            content = f"```\n{content}\n```"
+                        out.append(f"-> {file_name} has content:\n\n{content}")
+
+    result = "\n\n".join(out)
+
+    # if the result is very long we generate a simpler version
+    if len(result) > 6_000 and not simple:
+        return generate(
+            base_path, include_file_details=include_file_details, simple=True
+        )
+
+    return result
diff --git a/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/metric.py b/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/metric.py
new file mode 100644
index 00000000..e73f0c5f
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/metric.py
@@ -0,0 +1,340 @@
+from dataclasses import dataclass, field
+from functools import total_ordering
+from typing import Any
+
+import numpy as np
+from dataclasses_json import DataClassJsonMixin
+
+
+@dataclass
+@total_ordering
+class MetricValue_old(DataClassJsonMixin):
+    """
+    Represents the value of a metric to be optimized, which can be compared to other metric values.
+    Comparisons (and max, min) are based on which value is better, not which is larger.
+    """
+
+    value: float | int | np.number | np.floating | np.ndarray | dict | None
+    maximize: bool | None = field(default=None, kw_only=True)
+    name: str | None = field(
+        default=None, kw_only=True
+    )  # e.g., "accuracy", "loss", "f1_score"
+    description: str | None = field(
+        default=None, kw_only=True
+    )  # e.g., "Classification accuracy on validation set"
+
+    def __post_init__(self):
+        if self.value is not None:
+            if isinstance(self.value, dict):
+                self.value = {k: float(v) for k, v in self.value.items()}
+            else:
+                assert isinstance(self.value, (float, int, np.number, np.floating))
+                self.value = float(self.value)
+
+    def __gt__(self, other) -> bool:
+        """True if self is a _better_ (not necessarily larger) metric value than other"""
+        if self.value is None:
+            return False
+        if other.value is None:
+            return True
+
+        assert type(self) is type(other) and (self.maximize == other.maximize)
+
+        # For multi-dataset metrics, use mean for comparison
+        self_val = (
+            np.mean(list(self.value.values()))
+            if isinstance(self.value, dict)
+            else self.value
+        )
+        other_val = (
+            np.mean(list(other.value.values()))
+            if isinstance(other.value, dict)
+            else other.value
+        )
+
+        if self_val == other_val:
+            return False
+
+        comp = self_val > other_val
+        return comp if self.maximize else not comp  # type: ignore
+
+    def __eq__(self, other: Any) -> bool:
+        return self.value == other.value
+
+    def __repr__(self) -> str:
+        return str(self)
+
+    def __str__(self) -> str:
+        if self.maximize is None:
+            opt_dir = "?"
+        elif self.maximize:
+            opt_dir = "↑"
+        else:
+            opt_dir = "↓"
+        metric_name = f"({self.name})" if self.name else ""
+        if isinstance(self.value_npsafe, dict):
+            values_str = ", ".join(f"{k}:{v:.4f}" for k, v in self.value_npsafe.items())
+            mean_val = np.mean(list(self.value_npsafe.values()))
+            return f"Metric{opt_dir}{metric_name}[{values_str}](mean={mean_val:.4f})"
+        else:
+            return f"Metric{opt_dir}{metric_name}({self.value_npsafe:.4f})"
+
+    @property
+    def is_worst(self):
+        """True if the metric value is the worst possible value."""
+        return self.value is None
+
+    @property
+    def value_npsafe(self):
+        if self.value is None:
+            return float("nan")
+        if isinstance(self.value, dict):
+            return {
+                k: v if v is not None else float("nan") for k, v in self.value.items()
+            }
+        return self.value
+
+    def get_dataset_value(self, dataset_name: str) -> float | None:
+        """Get the metric value for a specific dataset"""
+        if isinstance(self.value, dict):
+            return self.value.get(dataset_name)
+        return None
+
+    def get_mean_value(self) -> float:
+        """Get the mean value across all datasets (or single value if not multi-dataset)"""
+        if self.value is None:
+            return float("nan")
+        if isinstance(self.value, dict):
+            return float(np.mean(list(self.value.values())))
+        return float(self.value)
+
+
+@dataclass
+@total_ordering
+class MetricValue(DataClassJsonMixin):
+    """
+    Represents one or more metric values to be optimized, which can be compared to other metric values.
+    Comparisons (and max, min) are based on which value is better, not which is larger.
+
+    The value can be:
+    - A single number (float/int)
+    - A dictionary in the format:
+      {
+        "metric_names": [
+          {
+            "metric_name": str,
+            "lower_is_better": bool,
+            "description": str,
+            "data": [
+                {"dataset_name": str, "final_value": float, "best_value": float},
+                {"dataset_name": str, "final_value": float, "best_value": float},
+                ...
+            ]
+          },
+          ...
+        ]
+      }
+    """
+
+    value: float | int | np.number | np.floating | dict | None
+    maximize: bool | None = field(default=None, kw_only=True)
+    name: str | None = field(default=None, kw_only=True)
+    description: str | None = field(default=None, kw_only=True)
+
+    def __post_init__(self):
+        if self.value is not None:
+            if isinstance(self.value, dict):
+                # Check if it's the new format with metric_names list
+                if "metric_names" in self.value:
+                    # New format - validate and convert values to float
+                    for metric in self.value["metric_names"]:
+                        for data_point in metric["data"]:
+                            if data_point["final_value"] is not None:
+                                data_point["final_value"] = float(
+                                    data_point["final_value"]
+                                )
+                            if data_point["best_value"] is not None:
+                                data_point["best_value"] = float(
+                                    data_point["best_value"]
+                                )
+                else:
+                    # Old format - convert to float
+                    self.value = {
+                        k: float(v) if v is not None else None
+                        for k, v in self.value.items()
+                    }
+            else:
+                # Single value case
+                assert isinstance(self.value, (float, int, np.number, np.floating))
+                self.value = float(self.value)
+
+    def __gt__(self, other) -> bool:
+        if self.value is None:
+            return False
+        if other.value is None:
+            return True
+
+        assert type(self) is type(other)
+
+        # Get mean values for comparison
+        self_val = self.get_mean_value()
+        other_val = other.get_mean_value()
+
+        if self_val == other_val:
+            return False
+
+        # Determine if we should maximize or minimize
+        should_maximize = self._should_maximize()
+        comp = self_val > other_val
+        return comp if should_maximize else not comp
+
+    def _should_maximize(self) -> bool:
+        """Determine if we should maximize based on the metric format"""
+        if isinstance(self.value, dict):
+            # New format
+            if "metric_names" in self.value:
+                # Use the first metric's lower_is_better value
+                try:
+                    return not self.value["metric_names"][0]["lower_is_better"]
+                except Exception as e:
+                    print(f"error during metric value: {e}")
+            # Old format
+            return bool(self.maximize)
+        # Single value case
+        return bool(self.maximize)
+
+    def __str__(self) -> str:
+        if isinstance(self.value, dict):
+            # New format with metric_names list
+            if "metric_names" in self.value:
+                parts = []
+                for metric in self.value["metric_names"]:
+                    opt_dir = (
+                        "↓"
+                        if "lower_is_better" in metric and metric["lower_is_better"]
+                        else "↑"
+                    )
+                    try:
+                        values_str = ", ".join(
+                            f"{d['dataset_name']}:(final={d['final_value']:.4f}, best={d['best_value']:.4f})"
+                            for d in metric["data"]
+                        )
+                    except Exception as e:
+                        print(f"error during metric value: {e}")
+                        values_str = "None"
+                    parts.append(f"{metric['metric_name']}{opt_dir}[{values_str}]")
+                return "Metrics(" + "; ".join(parts) + ")"
+            # Old format
+            opt_dir = "↓" if not self.maximize else "↑"
+            values_str = ", ".join(f"{k}:{v:.4f}" for k, v in self.value.items())
+            mean_val = np.mean([v for v in self.value.values() if v is not None])
+            return f"Metric{opt_dir}({self.name})[{values_str}](mean={mean_val:.4f})"
+        # Single value case
+        opt_dir = "?" if self.maximize is None else ("↑" if self.maximize else "↓")
+        metric_name = f"({self.name})" if self.name else ""
+        return f"Metric{opt_dir}{metric_name}({self.value_npsafe:.4f})"
+
+    def __eq__(self, other: Any) -> bool:
+        """Compare equality of metric values"""
+        if not isinstance(other, MetricValue):
+            raise NotImplementedError
+        if self.value is None and other.value is None:
+            return True
+        if self.value is None or other.value is None:
+            return False
+
+        # For new format, compare entire dictionaries
+        if isinstance(self.value, dict) and isinstance(other.value, dict):
+            # If both are new format with metric_names
+            if "metric_names" in self.value and "metric_names" in other.value:
+                return self.value == other.value
+            # If both are old format (no metric_names)
+            elif "metric_names" not in self.value and "metric_names" not in other.value:
+                return self.value == other.value
+            # Mixed formats should not be equal
+            return False
+        # Single values
+        return self.value == other.value
+
+    def __repr__(self) -> str:
+        """Return string representation"""
+        return str(self)
+
+    @property
+    def value_npsafe(self):
+        """Return a NaN-safe version of the value"""
+        if self.value is None:
+            return float("nan")
+        if isinstance(self.value, dict):
+            # New format with metric_names list
+            if "metric_names" in self.value:
+                return {
+                    "metric_names": [
+                        {
+                            **metric,
+                            "data": [
+                                {
+                                    **data_point,
+                                    "final_value": (
+                                        data_point["final_value"]
+                                        if data_point["final_value"] is not None
+                                        else float("nan")
+                                    ),
+                                    "best_value": (
+                                        data_point["best_value"]
+                                        if data_point["best_value"] is not None
+                                        else float("nan")
+                                    ),
+                                }
+                                for data_point in metric["data"]
+                            ],
+                        }
+                        for metric in self.value["metric_names"]
+                    ]
+                }
+            # Old format
+            return {
+                k: v if v is not None else float("nan") for k, v in self.value.items()
+            }
+        # Single value case
+        return self.value if self.value is not None else float("nan")
+
+    def get_mean_value(self) -> float:
+        """Get the mean value across all metrics and datasets"""
+        if self.value is None:
+            return float("nan")
+        if isinstance(self.value, dict):
+            # New format
+            if "metric_names" in self.value:
+                all_values = []
+                for metric in self.value["metric_names"]:
+                    # Use final_value for comparison
+                    values = [
+                        d["final_value"]
+                        for d in metric["data"]
+                        if d["final_value"] is not None
+                    ]
+                    if values:
+                        all_values.extend(values)
+                return float(np.mean(all_values)) if all_values else float("nan")
+            # Old format
+            values = [v for v in self.value.values() if v is not None]
+            return float(np.mean(values)) if values else float("nan")
+        # Single value case
+        return float(self.value)
+
+
+@dataclass
+class WorstMetricValue(MetricValue):
+    """
+    Represents an invalid metric value, e.g. when the agent creates a buggy solution.
+    Always compares worse than any valid metric value.
+    """
+
+    value: None = None
+
+    def __repr__(self):
+        return super().__repr__()
+
+    def __str__(self):
+        return super().__str__()
diff --git a/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/response.py b/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/response.py
new file mode 100644
index 00000000..5c5ab110
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/response.py
@@ -0,0 +1,91 @@
+import json
+import re
+
+import black
+
+
+def wrap_code(code: str, lang="python") -> str:
+    """Wraps code with three backticks."""
+    return f"```{lang}\n{code}\n```"
+
+
+def is_valid_python_script(script):
+    """Check if a script is a valid Python script."""
+    try:
+        compile(script, "<string>", "exec")
+        return True
+    except SyntaxError:
+        return False
+
+
+def extract_jsons(text):
+    """Extract all JSON objects from the text. Caveat: This function cannot handle nested JSON objects."""
+    json_objects = []
+    matches = re.findall(r"\{.*?\}", text, re.DOTALL)
+    for match in matches:
+        try:
+            json_obj = json.loads(match)
+            json_objects.append(json_obj)
+        except json.JSONDecodeError:
+            pass
+
+    # Sometimes chatgpt-turbo forget the last curly bracket, so we try to add it back when no json is found
+    if len(json_objects) == 0 and not text.endswith("}"):
+        json_objects = extract_jsons(text + "}")
+        if len(json_objects) > 0:
+            return json_objects
+
+    return json_objects
+
+
+def trim_long_string(string, threshold=5100, k=2500):
+    # Check if the length of the string is longer than the threshold
+    if len(string) > threshold:
+        # Output the first k and last k characters
+        first_k_chars = string[:k]
+        last_k_chars = string[-k:]
+
+        truncated_len = len(string) - 2 * k
+
+        return f"{first_k_chars}\n ... [{truncated_len} characters truncated] ... \n{last_k_chars}"
+    else:
+        return string
+
+
+def extract_code(text):
+    """Extract python code blocks from the text."""
+    parsed_codes = []
+
+    # When code is in a text or python block
+    matches = re.findall(r"```(python)?\n*(.*?)\n*```", text, re.DOTALL)
+    for match in matches:
+        code_block = match[1]
+        parsed_codes.append(code_block)
+
+    # When the entire text is code or backticks of the code block is missing
+    if len(parsed_codes) == 0:
+        matches = re.findall(r"^(```(python)?)?\n?(.*?)\n?(```)?$", text, re.DOTALL)
+        if matches:
+            code_block = matches[0][2]
+            parsed_codes.append(code_block)
+
+    # validate the parsed codes
+    valid_code_blocks = [
+        format_code(c) for c in parsed_codes if is_valid_python_script(c)
+    ]
+    return format_code("\n\n".join(valid_code_blocks))
+
+
+def extract_text_up_to_code(s):
+    """Extract (presumed) natural language text up to the start of the first code block."""
+    if "```" not in s:
+        return ""
+    return s[: s.find("```")].strip()
+
+
+def format_code(code) -> str:
+    """Format Python code using Black."""
+    try:
+        return black.format_str(code, mode=black.FileMode())
+    except black.parsing.InvalidInput:  # type: ignore
+        return code
diff --git a/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/serialize.py b/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/serialize.py
new file mode 100644
index 00000000..73c9c5a1
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/serialize.py
@@ -0,0 +1,79 @@
+import copy
+import json
+from pathlib import Path
+from typing import Type, TypeVar
+import re
+
+import dataclasses_json
+from ..journal import Journal, Node
+
+
+def dumps_json(obj: dataclasses_json.DataClassJsonMixin):
+    """Serialize dataclasses (such as Journals) to JSON."""
+    if isinstance(obj, Journal):
+        obj = copy.deepcopy(obj)
+        node2parent = {}
+        for n in obj.nodes:
+            if n.parent is not None:
+                # Handle both Node objects and string IDs
+                parent_id = n.parent.id if isinstance(n.parent, Node) else n.parent
+                node2parent[n.id] = parent_id
+        for n in obj.nodes:
+            n.parent = None
+            n.children = set()
+
+    obj_dict = obj.to_dict()
+
+    if isinstance(obj, Journal):
+        obj_dict["node2parent"] = node2parent
+        obj_dict["__version"] = "2"
+
+    return json.dumps(obj_dict, separators=(",", ":"))
+
+
+def dump_json(obj: dataclasses_json.DataClassJsonMixin, path: Path):
+    with open(path, "w") as f:
+        f.write(dumps_json(obj))
+
+
+G = TypeVar("G", bound=dataclasses_json.DataClassJsonMixin)
+
+
+def loads_json(s: str, cls: Type[G]) -> G:
+    """Deserialize JSON to AIDE dataclasses."""
+    obj_dict = json.loads(s)
+    obj = cls.from_dict(obj_dict)
+
+    if isinstance(obj, Journal):
+        id2nodes = {n.id: n for n in obj.nodes}
+        for child_id, parent_id in obj_dict["node2parent"].items():
+            id2nodes[child_id].parent = id2nodes[parent_id]
+            id2nodes[child_id].__post_init__()
+    return obj
+
+
+def load_json(path: Path, cls: Type[G]) -> G:
+    with open(path, "r") as f:
+        return loads_json(f.read(), cls)
+
+
+def parse_markdown_to_dict(content: str):
+    """
+    Reads a file that contains lines of the form:
+
+        "Key": "Value",
+        "Another Key": "Another Value",
+        ...
+
+    including possible multi-line values, and returns a Python dictionary.
+    """
+
+    pattern = r'"([^"]+)"\s*:\s*"([^"]*?)"(?:,\s*|\s*$)'
+
+    matches = re.findall(pattern, content, flags=re.DOTALL)
+
+    data_dict = {}
+    for key, value in matches:
+        data_dict[key] = value
+
+    return data_dict
diff --git a/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/tree_export.py b/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/tree_export.py
new file mode 100644
index 00000000..5baa4dc0
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/tree_export.py
@@ -0,0 +1,484 @@
+"""Export journal to HTML visualization of tree + code."""
+
+import json
+import textwrap
+from pathlib import Path
+
+import numpy as np
+from igraph import Graph
+from ..journal import Journal
+
+from rich import print
+
+
+def get_edges(journal: Journal):
+    for node in journal:
+        for c in node.children:
+            yield (node.step, c.step)
+
+
+def generate_layout(n_nodes, edges, layout_type="rt"):
+    """Generate visual layout of graph"""
+    layout = Graph(
+        n_nodes,
+        edges=edges,
+        directed=True,
+    ).layout(layout_type)
+    y_max = max(layout[k][1] for k in range(n_nodes))
+    layout_coords = []
+    for n in range(n_nodes):
+        layout_coords.append((layout[n][0], 2 * y_max - layout[n][1]))
+    return np.array(layout_coords)
+
+
+def normalize_layout(layout: np.ndarray):
+    """Normalize layout to [0, 1]"""
+    layout = (layout - layout.min(axis=0)) / (layout.max(axis=0) - layout.min(axis=0))
+    layout[:, 1] = 1 - layout[:, 1]
+    layout[:, 1] = np.nan_to_num(layout[:, 1], nan=0)
+    layout[:, 0] = np.nan_to_num(layout[:, 0], nan=0.5)
+    return layout
+
+
+def get_completed_stages(log_dir):
+    """
+    Determine completed stages by checking for the existence of stage directories
+    that contain evidence of completion (tree_data.json, tree_plot.html, or journal.json).
+
+    Returns:
+        list: A list of stage names (e.g., ["Stage_1", "Stage_2"])
+    """
+    completed_stages = []
+
+    # Check for each stage (1-4)
+    for stage_num in range(1, 5):
+        prefix = f"stage_{stage_num}"
+
+        # Find all directories that match this stage number
+        matching_dirs = [
+            d for d in log_dir.iterdir() if d.is_dir() and d.name.startswith(prefix)
+        ]
+
+        # Check if any of these directories have completion evidence
+        for stage_dir in matching_dirs:
+            has_tree_data = (stage_dir / "tree_data.json").exists()
+            has_tree_plot = (stage_dir / "tree_plot.html").exists()
+            has_journal = (stage_dir / "journal.json").exists()
+
+            if has_tree_data or has_tree_plot or has_journal:
+                # Found evidence this stage was completed
+                completed_stages.append(f"Stage_{stage_num}")
+                break  # No need to check other directories for this stage
+
+    return completed_stages
+
+
+def cfg_to_tree_struct(cfg, jou: Journal, out_path: Path = None):
+    edges = list(get_edges(jou))
+    print(f"[red]Edges: {edges}[/red]")
+    try:
+        gen_layout = generate_layout(len(jou), edges)
+    except Exception as e:
+        print(f"Error in generate_layout: {e}")
+        raise
+    try:
+        layout = normalize_layout(gen_layout)
+    except Exception as e:
+        print(f"Error in normalize_layout: {e}")
+        raise
+
+    best_node = jou.get_best_node(cfg=cfg)
+    metrics = []
+    is_best_node = []
+
+    for n in jou:
+        # print(f"Node {n.id} exc_stack: {type(n.exc_stack)} = {n.exc_stack}")
+        if n.metric:
+            # Pass the entire metric structure for the new format
+            if isinstance(n.metric.value, dict) and "metric_names" in n.metric.value:
+                metrics.append(n.metric.value)
+            else:
+                # Handle legacy format by wrapping it in the new structure
+                metrics.append(
+                    {
+                        "metric_names": [
+                            {
+                                "metric_name": n.metric.name or "value",
+                                "lower_is_better": not n.metric.maximize,
+                                "description": n.metric.description or "",
+                                "data": [
+                                    {
+                                        "dataset_name": "default",
+                                        "final_value": n.metric.value,
+                                        "best_value": n.metric.value,
+                                    }
+                                ],
+                            }
+                        ]
+                    }
+                )
+        else:
+            metrics.append(None)
+
+        # Track whether this is the best node
+        is_best_node.append(n is best_node)
+
+    tmp = {}
+
+    # Add each item individually with error handling
+    try:
+        tmp["edges"] = edges
+    except Exception as e:
+        print(f"Error setting edges: {e}")
+        raise
+
+    try:
+        tmp["layout"] = layout.tolist()
+    except Exception as e:
+        print(f"Error setting layout: {e}")
+        raise
+
+    try:
+        tmp["plan"] = [
+            textwrap.fill(str(n.plan) if n.plan is not None else "", width=80)
+            for n in jou.nodes
+        ]
+    except Exception as e:
+        print(f"Error setting plan: {e}")
+        raise
+
+    try:
+        tmp["code"] = [n.code for n in jou]
+    except Exception as e:
+        print(f"Error setting code: {e}")
+        raise
+
+    try:
+        tmp["term_out"] = [
+            textwrap.fill(str(n._term_out) if n._term_out is not None else "", width=80)
+            for n in jou
+        ]
+    except Exception as e:
+        print(f"Error setting term_out: {e}")
+        print(f"n.term_out: {n._term_out}")
+        raise
+
+    try:
+        tmp["analysis"] = [
+            textwrap.fill(str(n.analysis) if n.analysis is not None else "", width=80)
+            for n in jou
+        ]
+    except Exception as e:
+        print(f"Error setting analysis: {e}")
+        raise
+
+    try:
+        tmp["exc_type"] = [n.exc_type for n in jou]
+    except Exception as e:
+        print(f"Error setting exc_type: {e}")
+        raise
+
+    try:
+        tmp["exc_info"] = [n.exc_info for n in jou]
+    except Exception as e:
+        print(f"Error setting exc_info: {e}")
+        raise
+
+    try:
+        tmp["exc_stack"] = [n.exc_stack for n in jou]
+    except Exception as e:
+        print(f"Error setting exc_stack: {e}")
+        raise
+
+    try:
+        tmp["exp_name"] = cfg.exp_name
+    except Exception as e:
+        print(f"Error setting exp_name: {e}")
+        raise
+
+    try:
+        tmp["metrics"] = metrics
+    except Exception as e:
+        print(f"Error setting metrics: {e}")
+        raise
+
+    try:
+        tmp["is_best_node"] = is_best_node
+    except Exception as e:
+        print(f"Error setting is_best_node: {e}")
+        raise
+
+    try:
+        tmp["plots"] = [n.plots for n in jou]
+    except Exception as e:
+        print(f"Error setting plots: {e}")
+        raise
+
+    try:
+        tmp["plot_paths"] = [n.plot_paths for n in jou]
+    except Exception as e:
+        print(f"Error setting plot_paths: {e}")
+        raise
+
+    try:
+        tmp["plot_analyses"] = [n.plot_analyses for n in jou]
+    except Exception as e:
+        print(f"Error setting plot_analyses: {e}")
+        raise
+
+    try:
+        tmp["vlm_feedback_summary"] = [
+            textwrap.fill(
+                (
+                    str(n.vlm_feedback_summary)
+                    if n.vlm_feedback_summary is not None
+                    else ""
+                ),
+                width=80,
+            )
+            for n in jou
+        ]
+    except Exception as e:
+        print(f"Error setting vlm_feedback_summary: {e}")
+        raise
+
+    try:
+        tmp["exec_time"] = [n.exec_time for n in jou]
+    except Exception as e:
+        print(f"Error setting exec_time: {e}")
+        raise
+
+    try:
+        tmp["exec_time_feedback"] = [
+            textwrap.fill(
+                str(n.exec_time_feedback) if n.exec_time_feedback is not None else "",
+                width=80,
+            )
+            for n in jou
+        ]
+    except Exception as e:
+        print(f"Error setting exec_time_feedback: {e}")
+        raise
+
+    try:
+        tmp["datasets_successfully_tested"] = [
+            n.datasets_successfully_tested for n in jou
+        ]
+    except Exception as e:
+        print(f"Error setting datasets_successfully_tested: {e}")
+        raise
+
+    try:
+        tmp["plot_code"] = [n.plot_code for n in jou]
+    except Exception as e:
+        print(f"Error setting plot_code: {e}")
+        raise
+
+    try:
+        tmp["plot_plan"] = [n.plot_plan for n in jou]
+    except Exception as e:
+        print(f"Error setting plot_plan: {e}")
+        raise
+
+    try:
+        tmp["ablation_name"] = [n.ablation_name for n in jou]
+    except Exception as e:
+        print(f"Error setting ablation_name: {e}")
+        raise
+
+    try:
+        tmp["hyperparam_name"] = [n.hyperparam_name for n in jou]
+    except Exception as e:
+        print(f"Error setting hyperparam_name: {e}")
+        raise
+
+    try:
+        tmp["is_seed_node"] = [n.is_seed_node for n in jou]
+    except Exception as e:
+        print(f"Error setting is_seed_node: {e}")
+        raise
+
+    try:
+        tmp["is_seed_agg_node"] = [n.is_seed_agg_node for n in jou]
+    except Exception as e:
+        print(f"Error setting is_seed_agg_node: {e}")
+        raise
+
+    try:
+        tmp["parse_metrics_plan"] = [
+            textwrap.fill(
+                str(n.parse_metrics_plan) if n.parse_metrics_plan is not None else "",
+                width=80,
+            )
+            for n in jou
+        ]
+    except Exception as e:
+        print(f"Error setting parse_metrics_plan: {e}")
+        raise
+
+    try:
+        tmp["parse_metrics_code"] = [n.parse_metrics_code for n in jou]
+    except Exception as e:
+        print(f"Error setting parse_metrics_code: {e}")
+        raise
+
+    try:
+        tmp["parse_term_out"] = [
+            textwrap.fill(
+                str(n.parse_term_out) if n.parse_term_out is not None else "", width=80
+            )
+            for n in jou
+        ]
+    except Exception as e:
+        print(f"Error setting parse_term_out: {e}")
+        raise
+
+    try:
+        tmp["parse_exc_type"] = [n.parse_exc_type for n in jou]
+    except Exception as e:
+        print(f"Error setting parse_exc_type: {e}")
+        raise
+
+    try:
+        tmp["parse_exc_info"] = [n.parse_exc_info for n in jou]
+    except Exception as e:
+        print(f"Error setting parse_exc_info: {e}")
+        raise
+
+    try:
+        tmp["parse_exc_stack"] = [n.parse_exc_stack for n in jou]
+    except Exception as e:
+        print(f"Error setting parse_exc_stack: {e}")
+        raise
+
+    # Add the list of completed stages by checking directories
+    if out_path:
+        log_dir = out_path.parent.parent
+        tmp["completed_stages"] = get_completed_stages(log_dir)
+
+    return tmp
+
+
+def generate_html(tree_graph_str: str):
+    template_dir = Path(__file__).parent / "viz_templates"
+
+    with open(template_dir / "template.js") as f:
+        js = f.read()
+        js = js.replace('"PLACEHOLDER_TREE_DATA"', tree_graph_str)
+
+    with open(template_dir / "template.html") as f:
+        html = f.read()
+        html = html.replace("<!-- placeholder -->", js)
+
+        return html
+
+
+def generate(cfg, jou: Journal, out_path: Path):
+    print("[red]Checking Journal[/red]")
+    try:
+        tree_struct = cfg_to_tree_struct(cfg, jou, out_path)
+    except Exception as e:
+        print(f"Error in cfg_to_tree_struct: {e}")
+        raise
+
+    # Save tree data as JSON for loading by the tabbed visualization
+    try:
+        # Save the tree data as a JSON file in the same directory
+        data_path = out_path.parent / "tree_data.json"
+        with open(data_path, "w") as f:
+            json.dump(tree_struct, f)
+    except Exception as e:
+        print(f"Error saving tree data JSON: {e}")
+
+    try:
+        tree_graph_str = json.dumps(tree_struct)
+    except Exception as e:
+        print(f"Error in json.dumps: {e}")
+        raise
+    try:
+        html = generate_html(tree_graph_str)
+    except Exception as e:
+        print(f"Error in generate_html: {e}")
+        raise
+    with open(out_path, "w") as f:
+        f.write(html)
+
+    # Create a unified tree visualization that shows all stages
+    try:
+        create_unified_viz(cfg, out_path)
+    except Exception as e:
+        print(f"Error creating unified visualization: {e}")
+        # Continue even if unified viz creation fails
+
+
+def create_unified_viz(cfg, current_stage_viz_path):
+    """
+    Create a unified visualization that shows all completed stages in a tabbed interface.
+    This will be placed in the main log directory.
+    """
+    # The main log directory is two levels up from the stage-specific visualization
+    log_dir = current_stage_viz_path.parent.parent
+
+    # Get the current stage name from the path
+    current_stage = current_stage_viz_path.parent.name
+    if current_stage.startswith("stage_"):
+        # Extract the stage number from the directory name
+        parts = current_stage.split("_")
+        if len(parts) >= 2 and parts[1].isdigit():
+            stage_num = parts[1]
+            current_stage = f"Stage_{stage_num}"
+
+    # Create a combined visualization at the top level
+    unified_viz_path = log_dir / "unified_tree_viz.html"
+
+    # Copy the template files
+    template_dir = Path(__file__).parent / "viz_templates"
+
+    with open(template_dir / "template.html") as f:
+        html = f.read()
+
+    with open(template_dir / "template.js") as f:
+        js = f.read()
+
+    # Get completed stages by checking directories
+    completed_stages = get_completed_stages(log_dir)
+
+    # Try to load the current stage's tree data to use as a basis
+    try:
+        current_stage_data_path = current_stage_viz_path.parent / "tree_data.json"
+        if current_stage_data_path.exists():
+            with open(current_stage_data_path, "r") as f:
+                base_data = json.load(f)
+                # Add the necessary metadata
+                base_data["current_stage"] = current_stage
+                base_data["completed_stages"] = completed_stages
+        else:
+            # If we can't load the tree data, create a minimal structure
+            base_data = {
+                "current_stage": current_stage,
+                "completed_stages": completed_stages,
+                # Add empty layout and edges to prevent errors
+                "layout": [],
+                "edges": [],
+            }
+    except Exception as e:
+        print(f"Error loading stage data: {e}")
+        # Create a minimal data structure that won't cause JS errors
+        base_data = {
+            "current_stage": current_stage,
+            "completed_stages": completed_stages,
+            "layout": [],
+            "edges": [],
+        }
+
+    # Replace the placeholder in the JS with our data
+    js = js.replace('"PLACEHOLDER_TREE_DATA"', json.dumps(base_data))
+
+    # Replace the placeholder in the HTML with our JS
+    html = html.replace("<!-- placeholder -->", js)
+
+    # Write the unified visualization
+    with open(unified_viz_path, "w") as f:
+        f.write(html)
+
+    print(f"[green]Created unified visualization at {unified_viz_path}[/green]")
diff --git a/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/viz_templates/template.html b/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/viz_templates/template.html
new file mode 100644
index 00000000..d7ac761a
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/viz_templates/template.html
@@ -0,0 +1,298 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <script>
+      // Check if we're running under Live Server
+      if (window.location.hostname === '127.0.0.1' || window.location.hostname === 'localhost') {
+          let lastModified = '';
+
+          // Check for file changes every second
+          setInterval(async () => {
+              try {
+                  const response = await fetch(window.location.href, { method: 'HEAD' });
+                  // get a timestamp that shows when the file was last changed
+                  const currentModified = response.headers.get('last-modified');
+
+                  if (lastModified && lastModified !== currentModified) {
+                      window.location.reload();
+                  }
+
+                  lastModified = currentModified;
+              } catch (e) {
+                  console.error('Error checking for updates:', e);
+              }
+          }, 1000);
+      }
+  </script>
+    <script
+      id="p5scripttag"
+      src="https://cdnjs.cloudflare.com/ajax/libs/p5.js/1.9.0/p5.min.js"
+      integrity="sha512-uaz5GpnQoE6t5echKlX8P52czvsIGgLPcvlzfvRubLZ1Hp8JemUDnbUiAahbVtPb+jUVrNETuXvAhDDF/N3M4w=="
+      crossorigin="anonymous"
+      referrerpolicy="no-referrer"
+    ></script>
+
+    <link
+      rel="stylesheet"
+      href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/atom-one-dark.min.css"
+    />
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/languages/python.min.js"></script>
+
+    <script>
+      <!-- placeholder -->
+    </script>
+    <title>AI Scientist-v2 Visualization</title>
+    <style>
+      body,
+      * {
+        margin: 0;
+        padding: 0;
+        box-sizing: border-box;
+      }
+      body {
+        background-color: #ffffff;
+        font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
+      }
+      #canvas-container {
+        position: absolute;
+        left: 0;
+        top: 0;
+        width: 40vw;
+        height: 100vh;
+        background-color: inherit;
+        padding-top: 40px;
+      }
+      canvas {
+        float: left;
+        height: 100vh;
+        width: 100vw;
+      }
+      #text-container {
+        float: right;
+        height: 100vh;
+        width: 50vw;
+        background-color: #282c34;
+        overflow: auto;
+      }
+      #plan {
+        /* border-left: 2px solid #282c34; */
+        background-color: #282c34;
+        color: #f2f0e7;
+        min-height: 5rem;
+        padding: 1em 0 1em 1em;
+      }
+      #plot_plan {
+        background-color: #282c34;
+        color: #f2f0e7;
+        min-height: 5rem;
+        padding: 1em 0 1em 1em;
+        white-space: pre-wrap;
+      }
+      #exec_time_feedback {
+        margin-top: 20px;
+        padding: 10px;
+        background-color: #282c34;
+        border-left: 3px solid #ff5555;
+        color: #f2f0e7;
+      }
+      #exec_time {
+        margin-top: 20px;
+        padding: 10px;
+        background-color: #282c34;
+        border-left: 3px solid #ff5555;
+        color: #f2f0e7;
+      }
+      #exc_info {
+        margin-top: 20px;
+        padding: 10px;
+        background-color: #2c1f1f;
+        border-left: 3px solid #ff5555;
+        color: #f2f0e7;
+      }
+      #metrics {
+        margin-top: 20px;
+        padding: 10px;
+        background-color: #282c34;
+        color: #f2f0e7;
+      }
+      #vlm_feedback {
+        margin-top: 20px;
+        padding: 10px;
+        background-color: #1f2c2f;
+        color: #f2f0e7;
+        border-left: 3px solid #55ff55;
+      }
+      #vlm_feedback p {
+        margin: 0.5em 0;
+        white-space: pre-wrap;
+      }
+      .datasets_successfully_tested {
+        margin-top: 20px;
+        padding: 10px;
+        background-color: #282c34;
+        color: #f2f0e7;
+        border-left: 3px solid #55ff55;
+      }
+      .plots-container {
+        float: right;
+        width: 50vw;
+        padding: 1rem;
+        background-color: #282c34;
+        margin-top: 1rem;
+      }
+
+      .plot-item {
+        flex: 1 1 300px;
+        max-width: 100%;
+        margin-bottom: 1rem;
+        white-space: pre-wrap;
+      }
+
+      .plot-item img {
+        width: 100%;
+        height: auto;
+        border-radius: 4px;
+        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+        display: block;
+      }
+
+      .metric-group {
+        margin-bottom: 20px;
+        padding: 10px;
+        border: 1px solid #ddd;
+        border-radius: 4px;
+      }
+
+      .metric-table {
+        width: 100%;
+        border-collapse: collapse;
+        margin-top: 10px;
+      }
+
+      .metric-table th,
+      .metric-table td {
+        padding: 8px;
+        text-align: left;
+        border: 1px solid #ddd;
+      }
+
+      .metric-table th {
+        background-color: #363b44;
+      }
+
+      /* Styles for tabs */
+      .tabs-container {
+        position: fixed;
+        top: 0;
+        left: 0;
+        width: 49vw;
+        background-color: #000000;
+        z-index: 10;
+        display: flex;
+        padding: 0;
+      }
+
+      .tab {
+        cursor: pointer;
+        padding: 10px 15px;
+        background-color: #333;
+        color: #f2f0e7;
+        border: none;
+        outline: none;
+        transition: background-color 0.3s;
+        flex: 1;
+        text-align: center;
+      }
+
+      .tab:hover {
+        background-color: #444;
+      }
+
+      .tab.active {
+        background-color: #4c76af;
+        font-weight: bold;
+      }
+
+      .tab.disabled {
+        opacity: 0.5;
+        cursor: not-allowed;
+        background-color: #282c34;
+      }
+
+      .tab-content {
+        display: none;
+        padding-top: 40px; /* Space for tabs */
+      }
+
+      .tab-content.active {
+        display: block;
+      }
+
+      .stage-info {
+        padding: 10px;
+        background-color: #282c34;
+        color: #f2f0e7;
+        margin-bottom: 10px;
+        font-size: 0.9em;
+      }
+
+      .stage-status {
+        display: inline-block;
+        padding: 3px 6px;
+        border-radius: 3px;
+        margin-left: 8px;
+        font-size: 0.8em;
+      }
+
+      .stage-status.completed {
+        background-color: #4caf50;
+      }
+
+      .stage-status.in-progress {
+        background-color: #2196f3;
+      }
+
+      .stage-status.not-started {
+        background-color: #9e9e9e;
+      }
+    </style>
+  </head>
+  <body>
+    <div class="tabs-container" id="stage-tabs">
+      <button class="tab" data-stage="Stage_1" onclick="selectStage('Stage_1')">Stage 1</button>
+      <button class="tab" data-stage="Stage_2" onclick="selectStage('Stage_2')">Stage 2</button>
+      <button class="tab" data-stage="Stage_3" onclick="selectStage('Stage_3')">Stage 3</button>
+      <button class="tab" data-stage="Stage_4" onclick="selectStage('Stage_4')">Stage 4</button>
+    </div>
+
+    <div id="canvas-container"></div>
+
+    <pre id="text-container">
+        <div id="stage-info" class="stage-info"></div>
+        <div id="plan"></div>
+        <hr>
+        <div id="exc_info"></div>
+        <hr>
+        <div id="exec_time"></div>
+        <hr>
+        <div id="exec_time_feedback"></div>
+        <hr>
+        <div id="metrics"></div>
+        <hr>
+        <div id="plot_plan"></div>
+        <hr>
+        <div class="plots-container" id="plots"></div>
+        <hr>
+        <div id="vlm_feedback"></div>
+        <hr>
+        <div id="datasets_successfully_tested"></div>
+        <hr>
+        <code id="code" class="language-python"></code>
+        <hr>
+        <code id="plot_code" class="language-python"></code>
+    </pre>
+  </body>
+</html>
diff --git a/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/viz_templates/template.js b/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/viz_templates/template.js
new file mode 100644
index 00000000..f2e6a174
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/asv2/treesearch/utils/viz_templates/template.js
@@ -0,0 +1,695 @@
+const bgCol = "#FFFFFF";
+const accentCol = "#1a439e";
+
+hljs.initHighlightingOnLoad();
+
+// Function to update background color globally
+function updateBackgroundColor(color) {
+  // Update the JS variable
+  window.bgColCurrent = color;
+
+  // Update body background
+  document.body.style.backgroundColor = color;
+
+  // Update canvas container background
+  const canvasContainer = document.getElementById('canvas-container');
+  if (canvasContainer) {
+    canvasContainer.style.backgroundColor = color;
+  }
+}
+
+// Store tree data for each stage
+const stageData = {
+  Stage_1: null,
+  Stage_2: null,
+  Stage_3: null,
+  Stage_4: null
+};
+
+// Keep track of current selected stage
+let currentStage = null;
+let currentSketch = null;
+let availableStages = [];
+
+// Class definitions for nodes and edges
+class Node {
+  constructor(x, y, id, isRoot = false) {
+    this.x = x;
+    this.y = y;
+    this.id = id;
+    this.visible = isRoot; // Only root nodes are visible initially
+    this.appearProgress = 0;
+    this.popEffect = 0;
+    this.selected = false;
+    this.isRootNode = isRoot;
+  }
+
+  update() {
+    if (this.visible) {
+      // Handle the main appearance animation
+      if (this.appearProgress < 1) {
+        this.appearProgress += 0.06;
+
+        // When we reach full size, trigger the pop effect
+        if (this.appearProgress >= 1) {
+          this.appearProgress = 1; // Cap at 1
+          this.popEffect = 1; // Start the pop effect
+        }
+      }
+
+      // Handle the pop effect animation
+      if (this.popEffect > 0) {
+        this.popEffect -= 0.15; // Control how quickly it shrinks back
+        if (this.popEffect < 0) this.popEffect = 0; // Don't go negative
+      }
+    }
+  }
+
+  startAnimation() {
+    this.visible = true;
+  }
+
+  color() {
+    if (this.selected) {
+      return accentCol; // Use the global accent color variable for selected node
+    }
+    return '#4263eb'; // Default blue color
+  }
+
+  render(p5) {
+    if (this.visible) {
+      const popBonus = this.popEffect * 0.1;
+      const nodeScale = p5.map(this.appearProgress, 0, 1, 0, 1) + popBonus;
+      const alpha = p5.map(this.appearProgress, 0, 1, 0, 255);
+
+      p5.push();
+      p5.translate(this.x, this.y);
+
+      // Shadow effect
+      p5.noStroke();
+      p5.rectMode(p5.CENTER);
+
+      for (let i = 1; i <= 4; i++) {
+        p5.fill(0, 0, 0, alpha * 0.06);
+        p5.rect(i, i, 30 * nodeScale, 30 * nodeScale, 10);
+      }
+
+      // Main square - use node's color with alpha
+      let nodeColor = p5.color(this.color());
+      nodeColor.setAlpha(alpha);
+      p5.fill(nodeColor);
+      p5.rect(0, 0, 30 * nodeScale, 30 * nodeScale, 10);
+
+      // Draw checkmark icon if the node is selected
+      if (this.selected && this.appearProgress >= 1) {
+        p5.stroke(255);
+        p5.strokeWeight(2 * nodeScale);
+        p5.noFill();
+        // Draw checkmark
+        p5.beginShape();
+        p5.vertex(-8, 0);
+        p5.vertex(-3, 5);
+        p5.vertex(8, -6);
+        p5.endShape();
+      }
+
+      p5.pop();
+    }
+  }
+
+  isMouseOver(p5) {
+    return this.visible &&
+           p5.mouseX > this.x - 15 &&
+           p5.mouseX < this.x + 15 &&
+           p5.mouseY > this.y - 15 &&
+           p5.mouseY < this.y + 15;
+  }
+
+  // Connect this node to a child node
+  child(childNode) {
+    // Create an edge from this node to the child
+    let isLeft = childNode.x < this.x;
+    let isRight = childNode.x > this.x;
+    let edge = new Edge(this, childNode, isLeft, isRight);
+    return edge;
+  }
+}
+
+class Edge {
+  constructor(parent, child, isLeft, isRight) {
+    this.parent = parent;
+    this.child = child;
+    this.isLeft = isLeft;
+    this.isRight = isRight;
+    this.progress = 0;
+
+    // Calculate the midpoint where branching occurs
+    this.midY = parent.y + (child.y - parent.y) * 0.6;
+
+    // Use the actual child x-coordinate
+    // This ensures the edge will connect directly to the child node
+    this.branchX = child.x;
+  }
+
+  update() {
+    if (this.parent.visible && this.progress < 1) {
+      this.progress += 0.01; // Adjust animation speed
+    }
+    if (this.progress >= 1) {
+      this.child.visible = true;
+    }
+  }
+
+  color() {
+    return this.child.color();
+  }
+
+  render(p5) {
+    if (!this.parent.visible) return;
+
+    // Calculate path lengths
+    const verticalDist1 = this.midY - this.parent.y;
+    const horizontalDist = Math.abs(this.branchX - this.parent.x);
+    const verticalDist2 = this.child.y - this.midY;
+    const totalLength = verticalDist1 + horizontalDist + verticalDist2;
+
+    // Calculate how much of each segment to draw
+    const currentLength = totalLength * this.progress;
+
+    p5.stroke(180, 190, 205);
+    p5.strokeWeight(1.5);
+    p5.noFill();
+
+    // Always draw the first vertical segment from parent
+    if (currentLength > 0) {
+      const firstSegmentLength = Math.min(currentLength, verticalDist1);
+      const currentMidY = p5.lerp(this.parent.y, this.midY, firstSegmentLength / verticalDist1);
+      p5.line(this.parent.x, this.parent.y, this.parent.x, currentMidY);
+    }
+
+    if (currentLength > verticalDist1) {
+      // Draw second segment (horizontal)
+      const secondSegmentLength = Math.min(currentLength - verticalDist1, horizontalDist);
+      const currentBranchX = p5.lerp(this.parent.x, this.branchX, secondSegmentLength / horizontalDist);
+      p5.line(this.parent.x, this.midY, currentBranchX, this.midY);
+
+      if (currentLength > verticalDist1 + horizontalDist) {
+        // Draw third segment (vertical to child)
+        const thirdSegmentLength = currentLength - verticalDist1 - horizontalDist;
+        const currentChildY = p5.lerp(this.midY, this.child.y, thirdSegmentLength / verticalDist2);
+        p5.line(this.branchX, this.midY, this.branchX, currentChildY);
+      }
+    }
+  }
+}
+
+// Create a modified sketch for each stage
+function createTreeSketch(stageId) {
+  return function(p5) {
+    let nodes = [];
+    let edges = [];
+    let treeData = stageData[stageId];
+
+    p5.setup = function() {
+      const canvas = p5.createCanvas(p5.windowWidth * 0.4, p5.windowHeight);
+      canvas.parent('canvas-container');
+      p5.smooth();
+      p5.frameRate(60);
+
+      if (treeData) {
+        createTreeFromData(treeData);
+      }
+    };
+
+    p5.windowResized = function() {
+      p5.resizeCanvas(p5.windowWidth * 0.4, p5.windowHeight);
+    };
+
+    function createTreeFromData(data) {
+      // Clear existing nodes and edges
+      nodes = [];
+      edges = [];
+
+      // Add defensive checks to prevent errors
+      if (!data || !data.layout || !Array.isArray(data.layout) || !data.edges || !Array.isArray(data.edges)) {
+        console.error("Invalid tree data format:", data);
+        return; // Exit if data structure is invalid
+      }
+
+      // Find all parent nodes in edges
+      const parentNodes = new Set();
+      for (const [parentId, childId] of data.edges) {
+        parentNodes.add(parentId);
+      }
+
+      // Create nodes
+      for (let i = 0; i < data.layout.length; i++) {
+        const [nx, ny] = data.layout[i];
+        // A node is a root if it's a parent and not a child in any edge
+        const isRoot = parentNodes.has(i) && data.edges.every(edge => edge[1] !== i);
+
+        const node = new Node(
+          nx * p5.width * 0.8 + p5.width * 0.1,
+          ny * p5.height * 0.8 + p5.height * 0.1,
+          i,
+          isRoot
+        );
+        nodes.push(node);
+      }
+
+      // If no root was found, make the first parent node visible
+      if (!nodes.some(node => node.visible) && parentNodes.size > 0) {
+        // Get the first parent node
+        const firstParentId = [...parentNodes][0];
+        if (nodes[firstParentId]) {
+          nodes[firstParentId].visible = true;
+        }
+      }
+
+      // Create edges
+      for (const [parentId, childId] of data.edges) {
+        const parent = nodes[parentId];
+        const child = nodes[childId];
+        if (parent && child) { // Verify both nodes exist
+          const isLeft = child.x < parent.x;
+          const isRight = child.x > parent.x;
+          edges.push(new Edge(parent, child, isLeft, isRight));
+        }
+      }
+
+      // Select the first node by default
+      if (nodes.length > 0) {
+        nodes[0].selected = true;
+        updateNodeInfo(0);
+      }
+    }
+
+    p5.draw = function() {
+      // Use the global background color if available, otherwise use the default bgCol
+      const currentBgColor = window.bgColCurrent || bgCol;
+      p5.background(currentBgColor);
+
+      // Update and render edges
+      for (const edge of edges) {
+        edge.update();
+        edge.render(p5);
+      }
+
+      // Update and render nodes
+      for (const node of nodes) {
+        node.update();
+        node.render(p5);
+      }
+
+      // Handle mouse hover
+      p5.cursor(p5.ARROW);
+      for (const node of nodes) {
+        if (node.isMouseOver(p5)) {
+          p5.cursor(p5.HAND);
+        }
+      }
+    };
+
+    p5.mousePressed = function() {
+      // Check if any node was clicked
+      for (let i = 0; i < nodes.length; i++) {
+        if (nodes[i].visible && nodes[i].isMouseOver(p5)) {
+          // Deselect all nodes
+          nodes.forEach(n => n.selected = false);
+          // Select the clicked node
+          nodes[i].selected = true;
+          // Update the right panel with node info
+          updateNodeInfo(i);
+          break;
+        }
+      }
+    };
+
+    function updateNodeInfo(nodeIndex) {
+      if (treeData) {
+        setNodeInfo(
+          treeData.code[nodeIndex],
+          treeData.plan[nodeIndex],
+          treeData.plot_code?.[nodeIndex],
+          treeData.plot_plan?.[nodeIndex],
+          treeData.metrics?.[nodeIndex],
+          treeData.exc_type?.[nodeIndex] || '',
+          treeData.exc_info?.[nodeIndex]?.args?.[0] || '',
+          treeData.exc_stack?.[nodeIndex] || [],
+          treeData.plots?.[nodeIndex] || [],
+          treeData.plot_analyses?.[nodeIndex] || [],
+          treeData.vlm_feedback_summary?.[nodeIndex] || '',
+          treeData.datasets_successfully_tested?.[nodeIndex] || [],
+          treeData.exec_time_feedback?.[nodeIndex] || '',
+          treeData.exec_time?.[nodeIndex] || ''
+        );
+      }
+    }
+  };
+}
+
+// Start a new p5 sketch for the given stage
+function startSketch(stageId) {
+  if (currentSketch) {
+    currentSketch.remove();
+  }
+
+  if (stageData[stageId]) {
+    currentSketch = new p5(createTreeSketch(stageId));
+
+    // Update stage info
+    const stageNumber = stageId.split('_')[1];
+    let stageDesc = '';
+    switch(stageId) {
+      case 'Stage_1': stageDesc = 'Preliminary Investigation'; break;
+      case 'Stage_2': stageDesc = 'Baseline Tuning'; break;
+      case 'Stage_3': stageDesc = 'Research Agenda Execution'; break;
+      case 'Stage_4': stageDesc = 'Ablation Studies'; break;
+    }
+
+    document.getElementById('stage-info').innerHTML =
+      `<strong>Current Stage: ${stageNumber} - ${stageDesc}</strong>`;
+  }
+}
+
+// Handle tab selection
+function selectStage(stageId) {
+  if (!stageData[stageId] || !availableStages.includes(stageId)) {
+    return; // Don't allow selection of unavailable stages
+  }
+
+  // Update active tab styles
+  document.querySelectorAll('.tab').forEach(tab => {
+    tab.classList.remove('active');
+  });
+  document.querySelector(`.tab[data-stage="${stageId}"]`).classList.add('active');
+
+  // Start the new sketch
+  currentStage = stageId;
+  startSketch(stageId);
+}
+
+// Function to load the tree data for all stages
+async function loadAllStageData(baseTreeData) {
+  console.log("Loading stage data with base data:", baseTreeData);
+
+  // The base tree data is for the current stage
+  const currentStageId = baseTreeData.current_stage || 'Stage_1';
+
+  // Ensure base tree data is valid and has required properties
+  if (baseTreeData && baseTreeData.layout && baseTreeData.edges) {
+    stageData[currentStageId] = baseTreeData;
+    availableStages.push(currentStageId);
+    console.log(`Added current stage ${currentStageId} to available stages`);
+  } else {
+    console.warn(`Current stage ${currentStageId} data is invalid:`, baseTreeData);
+  }
+
+  // Use relative path to load other stage trees
+  const logDirPath = baseTreeData.log_dir_path || '.';
+  console.log("Log directory path:", logDirPath);
+
+  // Load data for each stage if available
+  const stageNames = ['Stage_1', 'Stage_2', 'Stage_3', 'Stage_4'];
+  const stageNames2actualNames = {
+    'Stage_1': 'stage_1_initial_implementation_1_preliminary',
+    'Stage_2': 'stage_2_baseline_tuning_1_first_attempt',
+    'Stage_3': 'stage_3_creative_research_1_first_attempt',
+    'Stage_4': 'stage_4_ablation_studies_1_first_attempt'
+    }
+
+  for (const stage of stageNames) {
+
+    if (baseTreeData.completed_stages && baseTreeData.completed_stages.includes(stage)) {
+      try {
+        console.log(`Attempting to load data for ${stage} from ${logDirPath}/${stageNames2actualNames[stage]}/tree_data.json`);
+        const response = await fetch(`${logDirPath}/${stageNames2actualNames[stage]}/tree_data.json`);
+
+        if (response.ok) {
+          const data = await response.json();
+
+          // Validate the loaded data
+          if (data && data.layout && data.edges) {
+            stageData[stage] = data;
+            availableStages.push(stage);
+            console.log(`Successfully loaded and validated data for ${stage}`);
+          } else {
+            console.warn(`Loaded data for ${stage} is invalid:`, data);
+          }
+        } else {
+          console.warn(`Failed to load data for ${stage} - HTTP status ${response.status}`);
+        }
+      } catch (error) {
+        console.error(`Error loading data for ${stage}:`, error);
+      }
+    } else {
+      console.log(`Skipping stage ${stage} - not in completed stages list:`, baseTreeData.completed_stages);
+    }
+  }
+
+  // Update tab visibility based on available stages
+  updateTabVisibility();
+
+  // Start with the first available stage
+  if (availableStages.length > 0) {
+    selectStage(availableStages[0]);
+  } else {
+    console.warn("No stages available to display");
+    // Display a message in the canvas area
+    document.getElementById('canvas-container').innerHTML =
+      '<div style="padding: 20px; color: #333; text-align: center;"><h3>No valid tree data available to display</h3></div>';
+  }
+}
+
+// Update tab visibility based on available stages
+function updateTabVisibility() {
+  const tabs = document.querySelectorAll('.tab');
+  tabs.forEach(tab => {
+    const stageId = tab.getAttribute('data-stage');
+    if (availableStages.includes(stageId)) {
+      tab.classList.remove('disabled');
+    } else {
+      tab.classList.add('disabled');
+    }
+  });
+}
+
+// Utility function to set the node info in the right panel
+const setNodeInfo = (code, plan, plot_code, plot_plan, metrics = null, exc_type = '', exc_info = '',
+    exc_stack = [], plots = [], plot_analyses = [], vlm_feedback_summary = '',
+    datasets_successfully_tested = [], exec_time_feedback = '', exec_time = '') => {
+  const codeElm = document.getElementById("code");
+  if (codeElm) {
+    if (code) {
+      codeElm.innerHTML = hljs.highlight(code, { language: "python" }).value;
+    } else {
+      codeElm.innerHTML = '<p>No code available</p>';
+    }
+  }
+
+  const planElm = document.getElementById("plan");
+  if (planElm) {
+    if (plan) {
+      planElm.innerHTML = hljs.highlight(plan, { language: "plaintext" }).value;
+    } else {
+      planElm.innerHTML = '<p>No plan available</p>';
+    }
+  }
+
+  const plot_codeElm = document.getElementById("plot_code");
+  if (plot_codeElm) {
+    if (plot_code) {
+      plot_codeElm.innerHTML = hljs.highlight(plot_code, { language: "python" }).value;
+    } else {
+      plot_codeElm.innerHTML = '<p>No plot code available</p>';
+    }
+  }
+
+  const plot_planElm = document.getElementById("plot_plan");
+  if (plot_planElm) {
+    if (plot_plan) {
+      plot_planElm.innerHTML = hljs.highlight(plot_plan, { language: "plaintext" }).value;
+    } else {
+      plot_planElm.innerHTML = '<p>No plot plan available</p>';
+    }
+  }
+
+  const metricsElm = document.getElementById("metrics");
+  if (metricsElm) {
+      let metricsContent = `<h3>Metrics:</h3>`;
+      if (metrics && metrics.metric_names) {
+          for (const metric of metrics.metric_names) {
+              metricsContent += `<div class="metric-group">`;
+              metricsContent += `<h4>${metric.metric_name}</h4>`;
+              metricsContent += `<p><strong>Description:</strong> ${metric.description || 'N/A'}</p>`;
+              metricsContent += `<p><strong>Optimization:</strong> ${metric.lower_is_better ? 'Minimize' : 'Maximize'}</p>`;
+
+              // Create table for dataset values
+              metricsContent += `<table class="metric-table">
+                  <tr>
+                      <th>Dataset</th>
+                      <th>Final Value</th>
+                      <th>Best Value</th>
+                  </tr>`;
+
+              for (const dataPoint of metric.data) {
+                  metricsContent += `<tr>
+                      <td>${dataPoint.dataset_name}</td>
+                      <td>${dataPoint.final_value?.toFixed(4) || 'N/A'}</td>
+                      <td>${dataPoint.best_value?.toFixed(4) || 'N/A'}</td>
+                  </tr>`;
+              }
+
+              metricsContent += `</table></div>`;
+          }
+      } else if (metrics === null) {
+          metricsContent += `<p>No metrics available</p>`;
+      }
+      metricsElm.innerHTML = metricsContent;
+  }
+
+  // Add plots display
+  const plotsElm = document.getElementById("plots");
+  if (plotsElm) {
+      if (plots && plots.length > 0) {
+          let plotsContent = '';
+          plots.forEach(plotPath => {
+              plotsContent += `
+                  <div class="plot-item">
+                      <img src="${plotPath}" alt="Experiment Plot" onerror="console.error('Failed to load plot:', this.src)"/>
+                  </div>`;
+          });
+          plotsElm.innerHTML = plotsContent;
+      } else {
+          plotsElm.innerHTML = '';
+      }
+  }
+
+  // Add error info display
+  const errorElm = document.getElementById("exc_info");
+  if (errorElm) {
+    if (exc_type) {
+      let errorContent = `<h3 style="color: #ff5555">Exception Information:</h3>
+                          <p><strong>Type:</strong> ${exc_type}</p>`;
+
+      if (exc_info) {
+        errorContent += `<p><strong>Details:</strong> <pre>${JSON.stringify(exc_info, null, 2)}</pre></p>`;
+      }
+
+      if (exc_stack) {
+        errorContent += `<p><strong>Stack Trace:</strong> <pre>${exc_stack.join('\n')}</pre></p>`;
+      }
+
+      errorElm.innerHTML = errorContent;
+    } else {
+      errorElm.innerHTML = "No exception info available";
+    }
+  }
+
+  const exec_timeElm = document.getElementById("exec_time");
+  if (exec_timeElm) {
+    let exec_timeContent = '<div id="exec_time"><h3>Execution Time (in seconds):</h3><p>' + exec_time + '</p></div>';
+    exec_timeElm.innerHTML = exec_timeContent;
+  }
+
+  const exec_time_feedbackElm = document.getElementById("exec_time_feedback");
+  if (exec_time_feedbackElm) {
+    let exec_time_feedbackContent = '<div id="exec_time_feedback_content">'
+    exec_time_feedbackContent += '<h3>Execution Time Feedback:</h3>'
+    exec_time_feedbackContent += '<p>' + exec_time_feedback + '</p>'
+    exec_time_feedbackContent += '</div>';
+    exec_time_feedbackElm.innerHTML = exec_time_feedbackContent;
+  }
+
+  const vlm_feedbackElm = document.getElementById("vlm_feedback");
+  if (vlm_feedbackElm) {
+      let vlm_feedbackContent = '';
+
+      if (plot_analyses && plot_analyses.length > 0) {
+          vlm_feedbackContent += `<h3>Plot Analysis:</h3>`;
+          plot_analyses.forEach(analysis => {
+              if (analysis && analysis.plot_path) {  // Add null check
+                  vlm_feedbackContent += `
+                      <div class="plot-analysis">
+                          <h4>Analysis for ${analysis.plot_path.split('/').pop()}</h4>
+                          <p>${analysis.analysis || 'No analysis available'}</p>
+                          <ul class="key-findings">
+                              ${(analysis.key_findings || []).map(finding => `<li>${finding}</li>`).join('')}
+                          </ul>
+                      </div>`;
+              } else {
+                  console.warn('Received invalid plot analysis:', analysis);
+                  vlm_feedbackContent += `
+                      <div class="plot-analysis">
+                          <p>Invalid plot analysis data received</p>
+                      </div>`;
+              }
+          });
+      }
+
+      // Add actionable insights if available
+      if (vlm_feedback_summary && typeof vlm_feedback_summary === 'string') {
+          vlm_feedbackContent += `
+              <div class="vlm_feedback">
+                  <h3>VLM Feedback Summary:</h3>
+                  <p>${vlm_feedback_summary}</p>
+              </div>`;
+      }
+
+      console.log("Datasets successfully tested:", datasets_successfully_tested);
+      if (datasets_successfully_tested && datasets_successfully_tested.length > 0) {
+          vlm_feedbackContent += `
+              <div id="datasets_successfully_tested">
+                  <h3>Datasets Successfully Tested:</h3>
+                  <p>${datasets_successfully_tested.join(', ')}</p>
+              </div>`;
+      }
+
+      if (!vlm_feedbackContent) {
+          vlm_feedbackContent = '<p>No insights available for this experiment.</p>';
+      }
+
+      vlm_feedbackElm.innerHTML = vlm_feedbackContent;
+  }
+
+  const datasets_successfully_testedElm = document.getElementById("datasets_successfully_tested");
+  if (datasets_successfully_testedElm) {
+      let datasets_successfully_testedContent = '';
+      if (datasets_successfully_tested && datasets_successfully_tested.length > 0) {
+          datasets_successfully_testedContent = `<h3>Datasets Successfully Tested:</h3><ul>`;
+          datasets_successfully_tested.forEach(dataset => {
+              datasets_successfully_testedContent += `<li>${dataset}</li>`;
+          });
+          datasets_successfully_testedContent += `</ul>`;
+      } else {
+          datasets_successfully_testedContent = '<p>No datasets tested yet</p>';
+      }
+      datasets_successfully_testedElm.innerHTML = datasets_successfully_testedContent;
+  }
+};
+
+// Initialize with the provided tree data
+const treeStructData = "PLACEHOLDER_TREE_DATA";
+
+// Add log directory path and stage info to the tree data
+treeStructData.log_dir_path = window.location.pathname.split('/').slice(0, -1).join('/');
+treeStructData.current_stage = window.location.pathname.includes('stage_')
+  ? window.location.pathname.split('stage_')[1].split('/')[0]
+  : 'Stage_1';
+
+// Initialize background color
+window.bgColCurrent = bgCol;
+
+// Function to set background color that can be called from the console
+function setBackgroundColor(color) {
+  // Update the global color
+  updateBackgroundColor(color);
+
+  // Refresh the current sketch to apply the new background color
+  if (currentStage) {
+    startSketch(currentStage);
+  }
+}
+
+// Load all stage data and initialize the visualization
+loadAllStageData(treeStructData);
diff --git a/skills/experiment-bfts-runner/scripts/run_bfts.py b/skills/experiment-bfts-runner/scripts/run_bfts.py
new file mode 100755
index 00000000..9d9ec53e
--- /dev/null
+++ b/skills/experiment-bfts-runner/scripts/run_bfts.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+"""Run BFTS experiments using the standalone runner package."""
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+from pathlib import Path
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="Run BFTS experiments (standalone).")
+    ap.add_argument("--config", required=True, help="Path to bfts_config.yaml")
+    ap.add_argument(
+        "--online",
+        action="store_true",
+        help="Allow network calls to LLM providers (default: offline).",
+    )
+    args = ap.parse_args()
+
+    config_path = Path(args.config).expanduser().resolve()
+    if not config_path.exists():
+        print(f"[ERROR] Config not found: {config_path}")
+        return 2
+
+    # Ensure package import
+    scripts_dir = Path(__file__).parent.resolve()
+    if str(scripts_dir) not in sys.path:
+        sys.path.insert(0, str(scripts_dir))
+
+    # Default root for experiment data
+    os.environ.setdefault("AI_SCIENTIST_SKILLS_ROOT", str(config_path.parent))
+    os.environ.setdefault("AI_SCIENTIST_ROOT", str(config_path.parent))
+    if args.online:
+        os.environ["ASV2_ONLINE"] = "1"
+
+    from asv2.treesearch.perform_experiments_bfts_with_agentmanager import (
+        perform_experiments_bfts,
+    )
+
+    perform_experiments_bfts(str(config_path))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/skills/experiment-log-summarizer/SKILL.md b/skills/experiment-log-summarizer/SKILL.md
new file mode 100644
index 00000000..0b96ec82
--- /dev/null
+++ b/skills/experiment-log-summarizer/SKILL.md
@@ -0,0 +1,38 @@
+---
+name: experiment-log-summarizer
+description: Summarize an experiment run directory into a draft report (Markdown + JSON) by scanning logs, summary JSONs, figures, and artifacts without executing code. Use when you have a run folder (AI-Scientist, ML experiments, benchmarks) and need a grounded, source-linked summary for writeups or debugging.
+---
+
+# Experiment Log Summarizer
+
+## Overview
+Produce a grounded summary from a run directory by reading files only. The output is suitable as input for a paper writeup, lab notes, or an internal report.
+
+This skill does not run experiments and does not call external services. It inventories artifacts and extracts key fields from common summary JSON formats.
+
+## Workflow
+1. Pick the run directory
+   - Examples are described in references/run-layout.md.
+2. Run the summarizer
+   ~~~bash
+   UV_CACHE_DIR=/tmp/uv-cache XDG_CACHE_HOME=/tmp uv run -s scripts/summarize_dir.py --dir /path/to/run --out summary.md --json-out summary.json
+   ~~~
+3. Use the summary
+   - Paste summary.md into your writeup workflow.
+   - If you see missing artifacts, use the Missing section to locate or regenerate them.
+
+## Output
+- summary.md: human-readable report, with explicit source file paths.
+- summary.json: structured extraction of key numbers, plots, and artifact inventory.
+
+## Guardrails
+Follow references/safeguards.md:
+- Do not execute code.
+- Do not infer missing results.
+- Do not include secrets from logs.
+
+## References
+- Run layouts to expect: references/run-layout.md
+- Suggested report structure: references/summary-template.md
+- Safeguards: references/safeguards.md
+- Summary schema: references/summary.schema.json
diff --git a/skills/experiment-log-summarizer/agents/openai.yaml b/skills/experiment-log-summarizer/agents/openai.yaml
new file mode 100644
index 00000000..a6b3ea36
--- /dev/null
+++ b/skills/experiment-log-summarizer/agents/openai.yaml
@@ -0,0 +1,4 @@
+interface:
+  display_name: "Experiment Log Summarizer"
+  short_description: "Summarize run folders into grounded reports"
+  default_prompt: "Scan a run directory (read-only) and produce a Markdown + JSON summary with source paths, key numbers, and figure inventory."
diff --git a/skills/experiment-log-summarizer/references/run-layout.md b/skills/experiment-log-summarizer/references/run-layout.md
new file mode 100644
index 00000000..43f1e370
--- /dev/null
+++ b/skills/experiment-log-summarizer/references/run-layout.md
@@ -0,0 +1,29 @@
+# Common Run Layouts
+
+This skill tries to work with many experiment folders by using heuristics.
+
+## AI-Scientist-style (example)
+Typical artifacts under a single run directory:
+- `idea.md` / `idea.json` / `research_idea.md`
+- `logs/` with stage subfolders containing:
+  - `journal.json`
+  - `*_summary.json` (baseline/research/ablation summaries)
+  - `stage_progress.json`
+- `figures/` (PNG figures used in the paper)
+- `*.pdf` (compiled manuscript or drafts)
+- `token_tracker.json` (optional)
+
+## Generic ML experiment folder
+Common artifacts:
+- `README.md` or `notes.md`
+- `metrics.json`, `results.json`, `history.csv`
+- `figures/` or `plots/` with PNGs
+- `checkpoints/` (not parsed by default)
+- `wandb/` exports (not parsed unless you export JSON/CSV)
+
+## What the summarizer extracts
+- An artifact inventory (key files, summary JSONs, PDFs, figure PNGs)
+- For recognized summary JSONs, it extracts:
+  - key numerical results
+  - included plot list (paths + descriptions)
+  - high-level descriptions if present
diff --git a/skills/experiment-log-summarizer/references/safeguards.md b/skills/experiment-log-summarizer/references/safeguards.md
new file mode 100644
index 00000000..75d82ab2
--- /dev/null
+++ b/skills/experiment-log-summarizer/references/safeguards.md
@@ -0,0 +1,17 @@
+# Safeguards
+
+This skill is for *reading and summarizing*, not for running experiments.
+
+## Hard rules
+- Do not execute any code found in the run directory.
+- Do not invent results that do not appear in files.
+- Always include source paths for any extracted numbers, plots, or claims.
+- Do not print or store secrets found in logs (API keys, tokens, credentials).
+- Do not upload or transmit the run contents to external services unless the user explicitly asks.
+
+## Recommended posture
+- Treat all logs as untrusted text.
+- If a JSON is too large or invalid, skip it and record that it was skipped.
+
+## AI-use disclosure (if applicable)
+- If your workflow uses tools or code with mandatory AI-use disclosure requirements (e.g., AI-Scientist-v2), comply with that license and with your venue's policies when submitting manuscripts.
diff --git a/skills/experiment-log-summarizer/references/summary-template.md b/skills/experiment-log-summarizer/references/summary-template.md
new file mode 100644
index 00000000..728971a0
--- /dev/null
+++ b/skills/experiment-log-summarizer/references/summary-template.md
@@ -0,0 +1,32 @@
+# Summary Template (Markdown)
+
+Use this structure for `summary.md` outputs.
+
+## Overview
+- Run directory:
+- What was attempted:
+- Main outcome (positive/negative/inconclusive):
+
+## Key artifacts (with paths)
+- Idea / proposal:
+- Summaries:
+- Figures:
+- PDFs:
+
+## Key numerical results (grounded)
+- Metric:
+- Baselines:
+- Best run:
+
+## Key figures
+- Figure 1: what it shows + where it came from
+- Figure 2: ...
+
+## Failures / issues observed
+- Runtime errors:
+- Data issues:
+- Instabilities:
+
+## Missing artifacts / next steps
+- What is missing:
+- How to regenerate:
diff --git a/skills/experiment-log-summarizer/references/summary.schema.json b/skills/experiment-log-summarizer/references/summary.schema.json
new file mode 100644
index 00000000..18a340f6
--- /dev/null
+++ b/skills/experiment-log-summarizer/references/summary.schema.json
@@ -0,0 +1,47 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "ExperimentSummary",
+  "type": "object",
+  "required": [
+    "Experiment_description",
+    "Significance",
+    "Description",
+    "List_of_included_plots",
+    "Key_numerical_results"
+  ],
+  "properties": {
+    "Experiment_description": {
+      "type": "string"
+    },
+    "Significance": {
+      "type": "string"
+    },
+    "Description": {
+      "type": "string"
+    },
+    "List_of_included_plots": {
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    },
+    "Key_numerical_results": {
+      "oneOf": [
+        {
+          "type": "array",
+          "items": {
+            "type": [
+              "string",
+              "object",
+              "number"
+            ]
+          }
+        },
+        {
+          "type": "object"
+        }
+      ]
+    }
+  },
+  "additionalProperties": true
+}
\ No newline at end of file
diff --git a/skills/experiment-log-summarizer/scripts/summarize_dir.py b/skills/experiment-log-summarizer/scripts/summarize_dir.py
new file mode 100644
index 00000000..d9c51be1
--- /dev/null
+++ b/skills/experiment-log-summarizer/scripts/summarize_dir.py
@@ -0,0 +1,275 @@
+#!/usr/bin/env python3
+"""
+Summarize a run directory into Markdown + JSON without executing code.
+
+The script uses heuristics to discover common artifacts and extracts key fields
+from summary JSONs when present.
+"""
+
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import json
+import os
+from pathlib import Path
+from typing import Any
+
+
+DEFAULT_MAX_FILES = 4000
+DEFAULT_MAX_BYTES = 2_000_000  # 2 MB per file
+
+
+def _safe_rel(path: Path, base: Path) -> str:
+    try:
+        return str(path.relative_to(base))
+    except Exception:
+        return str(path)
+
+
+def _read_text(path: Path, max_bytes: int) -> str | None:
+    try:
+        data = path.read_bytes()
+    except Exception:
+        return None
+    if len(data) > max_bytes:
+        return None
+    try:
+        return data.decode("utf-8", errors="replace")
+    except Exception:
+        return None
+
+
+def _read_json(path: Path, max_bytes: int) -> Any | None:
+    raw = _read_text(path, max_bytes=max_bytes)
+    if raw is None:
+        return None
+    try:
+        return json.loads(raw)
+    except Exception:
+        return None
+
+
+def _walk_files(base: Path, max_files: int) -> list[Path]:
+    paths: list[Path] = []
+    count = 0
+    for root, dirs, files in os.walk(base):
+        # avoid descending into very large/irrelevant dirs
+        dirs[:] = [d for d in dirs if d not in {".git", "__pycache__", ".venv", "venv"}]
+        for fn in files:
+            paths.append(Path(root) / fn)
+            count += 1
+            if count >= max_files:
+                return paths
+    return paths
+
+
+def _extract_summary_fields(obj: Any) -> dict[str, Any]:
+    if not isinstance(obj, dict):
+        return {}
+    out: dict[str, Any] = {}
+    for k in [
+        "Experiment_description",
+        "Significance",
+        "Description",
+        "current_findings",
+        "stage",
+        "best_metric",
+        "total_nodes",
+        "good_nodes",
+        "buggy_nodes",
+    ]:
+        if k in obj:
+            out[k] = obj[k]
+
+    # AI-Scientist-style keys
+    if "Key_numerical_results" in obj and isinstance(obj["Key_numerical_results"], list):
+        out["Key_numerical_results"] = obj["Key_numerical_results"]
+    if "List_of_included_plots" in obj and isinstance(obj["List_of_included_plots"], list):
+        out["List_of_included_plots"] = obj["List_of_included_plots"]
+    return out
+
+
+def _render_md(report: dict[str, Any]) -> str:
+    base_dir = report["base_dir"]
+    scanned_at = report["scanned_at"]
+    inv = report["inventory"]
+    extracted = report["extracted"]
+
+    lines: list[str] = []
+    lines.append("# Run Summary")
+    lines.append("")
+    lines.append(f"- Base directory: `{base_dir}`")
+    lines.append(f"- Scanned at: `{scanned_at}`")
+    lines.append("")
+
+    lines.append("## Key Artifacts")
+    lines.append("")
+    for section in ["key_files", "summary_jsons", "pdfs", "figures"]:
+        items = inv.get(section, [])
+        lines.append(f"### {section}")
+        if not items:
+            lines.append("_None found._")
+        else:
+            for it in items:
+                lines.append(f"- `{it}`")
+        lines.append("")
+
+    lines.append("## Extracted Summaries (Grounded)")
+    lines.append("")
+    summaries = extracted.get("summaries", [])
+    if not summaries:
+        lines.append("_No parseable summary JSONs found._")
+    else:
+        for s in summaries:
+            lines.append(f"### `{s.get('path','')}`")
+            for k in ["Experiment_description", "Significance", "Description", "current_findings"]:
+                if k in s:
+                    val = s[k]
+                    if isinstance(val, str):
+                        val = val.strip()
+                    lines.append(f"**{k}:** {val}")
+                    lines.append("")
+
+            knr = s.get("Key_numerical_results")
+            if isinstance(knr, list) and knr:
+                lines.append("**Key numerical results:**")
+                for r in knr[:30]:
+                    if isinstance(r, dict):
+                        res = r.get("result")
+                        desc = r.get("description") or r.get("desc") or ""
+                        lines.append(f"- {res}: {desc}")
+                    else:
+                        lines.append(f"- {r}")
+                lines.append("")
+
+            lop = s.get("List_of_included_plots")
+            if isinstance(lop, list) and lop:
+                lines.append("**Plots mentioned:**")
+                for p in lop[:50]:
+                    if isinstance(p, dict):
+                        lines.append(f"- {p.get('path')}: {p.get('description','')}")
+                    else:
+                        lines.append(f"- {p}")
+                lines.append("")
+
+    lines.append("")
+    lines.append("## Missing / Next Steps")
+    lines.append("")
+    missing = report.get("missing_recommended", [])
+    if not missing:
+        lines.append("_No obvious missing recommended artifacts detected._")
+    else:
+        for m in missing:
+            lines.append(f"- {m}")
+    lines.append("")
+
+    lines.append("## Notes")
+    lines.append("")
+    lines.append(
+        "- This report is generated by file scanning only; it does not execute any code."
+    )
+    lines.append(
+        "- If a file was too large or invalid JSON, it is skipped and recorded in the JSON output."
+    )
+    lines.append("")
+    return "\n".join(lines)
+
+
+def main(argv: list[str] | None = None) -> int:
+    ap = argparse.ArgumentParser(description="Summarize a run directory (read-only).")
+    ap.add_argument("--dir", required=True, help="Run directory to scan.")
+    ap.add_argument("--out", required=True, help="Output Markdown path.")
+    ap.add_argument("--json-out", required=True, help="Output JSON path.")
+    ap.add_argument(
+        "--max-files",
+        type=int,
+        default=DEFAULT_MAX_FILES,
+        help=f"Max files to scan (default: {DEFAULT_MAX_FILES}).",
+    )
+    ap.add_argument(
+        "--max-bytes",
+        type=int,
+        default=DEFAULT_MAX_BYTES,
+        help=f"Max bytes to read per file (default: {DEFAULT_MAX_BYTES}).",
+    )
+    args = ap.parse_args(argv)
+
+    base = Path(args.dir).expanduser().resolve()
+    if not base.exists() or not base.is_dir():
+        print(f"[ERROR] Not a directory: {base}")
+        return 2
+
+    all_files = _walk_files(base, max_files=max(1, args.max_files))
+    rels = [_safe_rel(p, base) for p in all_files]
+
+    key_candidates = [
+        "idea.md",
+        "idea.json",
+        "research_idea.md",
+        "token_tracker.json",
+        "review_text.txt",
+        "review_img_cap_ref.json",
+    ]
+    key_files: list[str] = []
+    for k in key_candidates:
+        p = base / k
+        if p.exists():
+            key_files.append(_safe_rel(p, base))
+
+    summary_jsons = sorted(
+        [r for r in rels if r.endswith(".json") and ("summary" in Path(r).name.lower())]
+    )
+    pdfs = sorted([r for r in rels if r.lower().endswith(".pdf")])
+    figures = sorted([r for r in rels if "/figures/" in ("/" + r).replace("\\", "/") and r.lower().endswith(".png")])
+
+    extracted_summaries: list[dict[str, Any]] = []
+    skipped: list[dict[str, Any]] = []
+
+    for rel in summary_jsons:
+        p = base / rel
+        obj = _read_json(p, max_bytes=args.max_bytes)
+        if obj is None:
+            skipped.append({"path": rel, "reason": "invalid_or_too_large"})
+            continue
+        fields = _extract_summary_fields(obj)
+        fields["path"] = rel
+        extracted_summaries.append(fields)
+
+    missing_recommended: list[str] = []
+    if not summary_jsons:
+        missing_recommended.append("No *summary*.json files found (e.g., baseline_summary.json).")
+    if not figures and not (base / "figures").exists():
+        missing_recommended.append("No figures/ directory with PNGs found.")
+
+    report: dict[str, Any] = {
+        "base_dir": str(base),
+        "scanned_at": _dt.datetime.now().isoformat(timespec="seconds"),
+        "params": {
+            "max_files": args.max_files,
+            "max_bytes": args.max_bytes,
+        },
+        "inventory": {
+            "key_files": key_files,
+            "summary_jsons": summary_jsons,
+            "pdfs": pdfs,
+            "figures": figures,
+        },
+        "extracted": {
+            "summaries": extracted_summaries,
+            "skipped": skipped,
+        },
+        "missing_recommended": missing_recommended,
+    }
+
+    out_md = _render_md(report)
+    Path(args.out).write_text(out_md, encoding="utf-8")
+    Path(args.json_out).write_text(json.dumps(report, indent=2), encoding="utf-8")
+    print(f"[OK] Wrote: {args.out}")
+    print(f"[OK] Wrote: {args.json_out}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
+
diff --git a/skills/figure-caption-ref-audit/SKILL.md b/skills/figure-caption-ref-audit/SKILL.md
new file mode 100644
index 00000000..0d92b34b
--- /dev/null
+++ b/skills/figure-caption-ref-audit/SKILL.md
@@ -0,0 +1,28 @@
+---
+name: figure-caption-ref-audit
+description: Extract figure screenshots, captions, and in-text figure references (figrefs) from a PDF into an audit bundle (images + figures.json). Use when checking whether figures match captions and main text, deciding what to move to appendix, or auditing figure redundancy and clarity.
+---
+
+# Figure / Caption / Figref Audit
+
+## Overview
+Create an auditable bundle for figure review: extracted figure-region images, caption text, and main-text figref snippets that mention each figure.
+
+This is designed for *human/LLM-assisted auditing* (including vision models) and does not attempt to "understand" results beyond what is visible and referenced.
+
+## Workflow
+1. **Extract an audit bundle**
+   - `UV_CACHE_DIR=/tmp/uv-cache XDG_CACHE_HOME=/tmp uv run --with PyMuPDF -s scripts/extract_figures_and_refs.py --pdf paper.pdf --out-dir audit_out --max-pages 50 --dpi 150`
+   - Output:
+     - `audit_out/figures.json`
+     - `audit_out/images/*.png`
+2. **Audit with the checklist**
+   - Use `references/audit-checklist.md` to review alignment and information density.
+3. **Make edits to the paper**
+   - Fix caption mismatches, missing figrefs, confusing axes/legends.
+   - Decide what to move to appendix.
+
+## References
+- Output schema: `references/extraction-output-schema.md`
+- Audit checklist: `references/audit-checklist.md`
+- Safeguards: `references/safeguards.md`
diff --git a/skills/figure-caption-ref-audit/agents/openai.yaml b/skills/figure-caption-ref-audit/agents/openai.yaml
new file mode 100644
index 00000000..602d61ce
--- /dev/null
+++ b/skills/figure-caption-ref-audit/agents/openai.yaml
@@ -0,0 +1,4 @@
+interface:
+  display_name: "Figure/Caption/Figref Audit"
+  short_description: "Extract figures and audit caption/text alignment"
+  default_prompt: "Extract figure screenshots, captions, and figrefs from a PDF into an audit bundle, then identify caption mismatches, missing figrefs, redundancy, and which figures to move to appendix."
diff --git a/skills/figure-caption-ref-audit/references/audit-checklist.md b/skills/figure-caption-ref-audit/references/audit-checklist.md
new file mode 100644
index 00000000..556ad24a
--- /dev/null
+++ b/skills/figure-caption-ref-audit/references/audit-checklist.md
@@ -0,0 +1,26 @@
+# Audit Checklist
+
+Use this checklist after extraction to review each figure.
+
+## Caption alignment
+- Does the caption accurately describe what the figure shows?
+- Does the caption contain a concise takeaway, not just setup?
+- Are axis labels, legend entries, and units consistent with the caption?
+
+## Main-text integration (figrefs)
+- Does the main text reference the figure where needed?
+- Do figrefs explain why the figure matters (not just "see Figure X")?
+- Are there missing references for important figures?
+
+## Information density and redundancy
+- Is the figure too sparse to justify space in the main text?
+- Could it be combined with another figure as subplots?
+- Does it duplicate another figure (same story, different rendering)?
+
+## Figure quality
+- Readability at paper scale (font sizes, line widths, legend placement).
+- Clear baselines and comparisons.
+- Avoid misleading scales or cropped axes unless justified.
+
+## Decision
+- Keep in main text / move to appendix / merge / remove.
diff --git a/skills/figure-caption-ref-audit/references/extraction-output-schema.md b/skills/figure-caption-ref-audit/references/extraction-output-schema.md
new file mode 100644
index 00000000..d738deb8
--- /dev/null
+++ b/skills/figure-caption-ref-audit/references/extraction-output-schema.md
@@ -0,0 +1,31 @@
+# Extraction Output Schema
+
+The extractor writes:
+- `figures.json`
+- `images/*.png`
+
+## figures.json format
+```json
+{
+  "pdf": "/abs/path/to/paper.pdf",
+  "generated_at": "2026-02-04T12:34:56",
+  "images_dir": "images",
+  "figures": [
+    {
+      "label": "1",
+      "page": 3,
+      "caption": "Figure 1: ...",
+      "caption_bbox": [x0, y0, x1, y1],
+      "figure_bbox": [x0, y0, x1, y1],
+      "image_path": "images/figure_1_p3.png",
+      "figrefs": [
+        {"page": 2, "text": "As shown in Figure 1, ..."}
+      ],
+      "notes": "any extraction warnings"
+    }
+  ]
+}
+```
+
+## Coordinate system
+- Bounding boxes are in PDF page coordinates from PyMuPDF (float values).
diff --git a/skills/figure-caption-ref-audit/references/safeguards.md b/skills/figure-caption-ref-audit/references/safeguards.md
new file mode 100644
index 00000000..038ccac9
--- /dev/null
+++ b/skills/figure-caption-ref-audit/references/safeguards.md
@@ -0,0 +1,15 @@
+# Safeguards
+
+## Privacy
+- Do not upload or share the PDF, extracted images, or figref text unless the user explicitly asks.
+
+## No hallucinated results
+- Do not infer experimental outcomes beyond what is visible in figures and written in the text.
+- If a figure is ambiguous, note uncertainty and recommend clarifying edits.
+
+## Safety of extraction
+- The extractor is read-only and writes only under `--out-dir`.
+- By default it will not overwrite existing outputs unless `--overwrite` is passed.
+
+## AI-use disclosure (if applicable)
+- If your workflow uses tools or code with mandatory AI-use disclosure requirements (e.g., AI-Scientist-v2), comply with that license and with your venue's policies when submitting manuscripts.
diff --git a/skills/figure-caption-ref-audit/scripts/extract_figures_and_refs.py b/skills/figure-caption-ref-audit/scripts/extract_figures_and_refs.py
new file mode 100644
index 00000000..76d355dd
--- /dev/null
+++ b/skills/figure-caption-ref-audit/scripts/extract_figures_and_refs.py
@@ -0,0 +1,248 @@
+#!/usr/bin/env python3
+"""
+Extract figure-region screenshots, captions, and figref snippets from a PDF.
+
+Requires: PyMuPDF (pip package: PyMuPDF; import name: fitz)
+
+Safety:
+- Read-only on the input PDF.
+- Writes only under --out-dir.
+- Does not overwrite existing outputs unless --overwrite is provided.
+"""
+
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import json
+import re
+import shutil
+from dataclasses import dataclass
+from pathlib import Path
+
+
+CAPTION_RE = re.compile(
+    r"^(?:Figure)\s+(?P<label>(?:\d+|[A-Za-z]+\.\d+|\(\s*[A-Za-z]+\s*\)\.\d+))(?:\.|:)",
+    re.IGNORECASE,
+)
+
+
+@dataclass
+class TextBlock:
+    page: int
+    x0: float
+    y0: float
+    x1: float
+    y1: float
+    text: str
+
+
+def _safe_label(label: str) -> str:
+    # For filenames
+    s = label.strip()
+    s = s.replace(" ", "")
+    s = s.replace("(", "").replace(")", "")
+    s = s.replace("/", "-")
+    return s
+
+
+def _collect_blocks(doc, max_pages: int | None) -> list[TextBlock]:
+    blocks: list[TextBlock] = []
+    limit = min(doc.page_count, max_pages) if max_pages is not None else doc.page_count
+    for i in range(limit):
+        page = doc.load_page(i)
+        try:
+            raw = page.get_text("blocks")
+        except Exception:
+            continue
+        for b in raw:
+            # [x0, y0, x1, y1, text, block_no, ...]
+            txt = (b[4] or "").strip()
+            if not txt:
+                continue
+            blocks.append(TextBlock(i, float(b[0]), float(b[1]), float(b[2]), float(b[3]), txt))
+    return blocks
+
+
+def _find_caption_blocks(blocks: list[TextBlock]) -> list[TextBlock]:
+    caps: list[TextBlock] = []
+    for blk in blocks:
+        if CAPTION_RE.match(blk.text):
+            caps.append(blk)
+    # Sort by page then vertical position
+    caps.sort(key=lambda b: (b.page, b.y0))
+    return caps
+
+
+def _horiz_overlap_ratio(a: TextBlock, b: TextBlock) -> float:
+    overlap = min(a.x1, b.x1) - max(a.x0, b.x0)
+    if overlap <= 0:
+        return 0.0
+    width_min = min(a.x1 - a.x0, b.x1 - b.x0)
+    return float(overlap) / float(width_min) if width_min > 0 else 0.0
+
+
+def _pick_figure_bbox(page_rect, caption: TextBlock, page_blocks: list[TextBlock], margin: float, min_text_len: int, min_vertical_gap: float):
+    """
+    Heuristic:
+    - Find the closest substantial text block above the caption that overlaps horizontally.
+    - Use the region between that block's bottom and caption's top as the figure bbox.
+    """
+    above = [
+        b
+        for b in page_blocks
+        if b.y1 < caption.y0 and (caption.y0 - b.y1) >= min_vertical_gap and len(b.text) >= min_text_len and _horiz_overlap_ratio(caption, b) >= 0.3
+    ]
+    if above:
+        anchor = max(above, key=lambda b: b.y1)
+        top = anchor.y1 + margin
+    else:
+        top = float(page_rect.y0) + margin
+
+    bottom = caption.y0 - margin
+    left = max(float(page_rect.x0), caption.x0 - margin)
+    right = min(float(page_rect.x1), caption.x1 + margin)
+
+    if bottom - top < 20:
+        # Fallback: use most of page width.
+        left = float(page_rect.x0) + margin
+        right = float(page_rect.x1) - margin
+        top = float(page_rect.y0) + margin
+        bottom = caption.y0 - margin
+
+    if bottom - top < 20:
+        return None
+    return (left, top, right, bottom)
+
+
+def _build_figref_pattern(label: str) -> re.Pattern:
+    label = label.strip()
+    if label.isdigit():
+        # Avoid matching 11 when label is 1.
+        return re.compile(
+            rf"\b(?:Figure|Fig\.?)(?:\s|\xA0)*{re.escape(label)}(?!\d)",
+            re.IGNORECASE,
+        )
+    return re.compile(
+        rf"\b(?:Figure|Fig\.?)(?:\s|\xA0)*{re.escape(label)}\b",
+        re.IGNORECASE,
+    )
+
+
+def main(argv: list[str] | None = None) -> int:
+    ap = argparse.ArgumentParser(description="Extract figure images + captions + figrefs from a PDF.")
+    ap.add_argument("--pdf", required=True, help="Input PDF path.")
+    ap.add_argument("--out-dir", required=True, help="Output directory.")
+    ap.add_argument("--max-pages", type=int, default=None, help="Max pages to scan.")
+    ap.add_argument("--dpi", type=int, default=150, help="Render DPI for images.")
+    ap.add_argument("--margin", type=float, default=6.0, help="Margin in PDF coordinates.")
+    ap.add_argument("--min-text-length", type=int, default=50, help="Min chars for anchor text blocks.")
+    ap.add_argument("--min-vertical-gap", type=float, default=30.0, help="Min vertical gap (points).")
+    ap.add_argument("--overwrite", action="store_true", help="Overwrite outputs in out-dir.")
+    args = ap.parse_args(argv)
+
+    pdf_path = Path(args.pdf).expanduser().resolve()
+    if not pdf_path.exists():
+        print(f"[ERROR] PDF not found: {pdf_path}")
+        return 2
+
+    out_dir = Path(args.out_dir).expanduser().resolve()
+    images_dir = out_dir / "images"
+    figures_json = out_dir / "figures.json"
+
+    if out_dir.exists():
+        if not args.overwrite:
+            print(f"[ERROR] out-dir exists (use --overwrite): {out_dir}")
+            return 2
+        # Safer than rmtree(out_dir): delete known outputs only.
+        if images_dir.exists():
+            shutil.rmtree(images_dir)
+        if figures_json.exists():
+            figures_json.unlink()
+    out_dir.mkdir(parents=True, exist_ok=True)
+    images_dir.mkdir(parents=True, exist_ok=True)
+
+    try:
+        import fitz  # type: ignore
+    except Exception as e:
+        print("[ERROR] PyMuPDF is required. Try: uv run --with PyMuPDF -s scripts/extract_figures_and_refs.py --help")
+        print(f"Details: {e}")
+        return 3
+
+    doc = fitz.open(str(pdf_path))
+    blocks = _collect_blocks(doc, max_pages=args.max_pages)
+    captions = _find_caption_blocks(blocks)
+
+    # Pre-group blocks by page for faster lookup
+    blocks_by_page: dict[int, list[TextBlock]] = {}
+    for b in blocks:
+        blocks_by_page.setdefault(b.page, []).append(b)
+    for page_blocks in blocks_by_page.values():
+        page_blocks.sort(key=lambda b: b.y0)
+
+    figures_out = []
+    for cap in captions:
+        m = CAPTION_RE.match(cap.text)
+        if not m:
+            continue
+        label = m.group("label").strip()
+        page = doc.load_page(cap.page)
+        page_rect = page.rect
+        bbox = _pick_figure_bbox(
+            page_rect,
+            caption=cap,
+            page_blocks=blocks_by_page.get(cap.page, []),
+            margin=args.margin,
+            min_text_len=args.min_text_length,
+            min_vertical_gap=args.min_vertical_gap,
+        )
+        notes = []
+        if bbox is None:
+            notes.append("failed_to_compute_figure_bbox")
+            continue
+
+        # Render bbox to image
+        rect = fitz.Rect(*bbox)
+        mat = fitz.Matrix(args.dpi / 72.0, args.dpi / 72.0)
+        pix = page.get_pixmap(matrix=mat, clip=rect, alpha=False)
+        img_name = f"figure_{_safe_label(label)}_p{cap.page+1}.png"
+        img_path = images_dir / img_name
+        pix.save(str(img_path))
+
+        # Collect figrefs across scanned pages
+        pat = _build_figref_pattern(label)
+        figrefs = []
+        for blk in blocks:
+            # figrefs are meant to be in-text mentions; skip the caption block itself.
+            if blk.page == cap.page and blk.text == cap.text:
+                continue
+            if pat.search(blk.text):
+                figrefs.append({"page": int(blk.page + 1), "text": blk.text})
+
+        figures_out.append(
+            {
+                "label": label,
+                "page": int(cap.page + 1),
+                "caption": cap.text,
+                "caption_bbox": [cap.x0, cap.y0, cap.x1, cap.y1],
+                "figure_bbox": [bbox[0], bbox[1], bbox[2], bbox[3]],
+                "image_path": str(Path("images") / img_name),
+                "figrefs": figrefs,
+                "notes": ";".join(notes) if notes else "",
+            }
+        )
+
+    out_obj = {
+        "pdf": str(pdf_path),
+        "generated_at": _dt.datetime.now().isoformat(timespec="seconds"),
+        "images_dir": "images",
+        "figures": figures_out,
+    }
+    figures_json.write_text(json.dumps(out_obj, indent=2), encoding="utf-8")
+    print(f"[OK] Wrote: {figures_json}")
+    print(f"[OK] Images: {images_dir} ({len(figures_out)})")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/skills/idea-to-markdown/SKILL.md b/skills/idea-to-markdown/SKILL.md
new file mode 100644
index 00000000..f12abed6
--- /dev/null
+++ b/skills/idea-to-markdown/SKILL.md
@@ -0,0 +1,32 @@
+---
+name: idea-to-markdown
+description: Convert idea JSON into a structured idea.md (optionally embedding a code snippet). Use when preparing idea inputs for BFTS experiments or writeup workflows.
+---
+
+# Idea -> Markdown
+
+## Overview
+Convert a single idea JSON (or list) into `idea.md` with section headers. Optionally include a code snippet block.
+
+## Workflow
+1. **Prepare idea JSON**
+   - Must follow `references/idea.schema.json`.
+2. **Generate idea.md**
+   - `UV_CACHE_DIR=/tmp/uv-cache XDG_CACHE_HOME=/tmp uv run -s scripts/idea_to_markdown.py --in idea.json --out idea.md`
+3. **(Optional) Include code**
+   - `... --code-path baseline.py`
+
+## Inputs
+- `--in`: idea JSON file
+- `--out`: output markdown file
+- `--code-path` (optional): path to a code file to embed
+
+## Outputs
+- `idea.md`
+
+## Safeguards
+- Read-only on inputs.
+- Writes only to `--out`.
+
+## References
+- Idea schema: `references/idea.schema.json`
diff --git a/skills/idea-to-markdown/agents/openai.yaml b/skills/idea-to-markdown/agents/openai.yaml
new file mode 100644
index 00000000..ffd744e2
--- /dev/null
+++ b/skills/idea-to-markdown/agents/openai.yaml
@@ -0,0 +1,4 @@
+interface:
+  display_name: "Idea to Markdown"
+  short_description: "Convert idea JSON to idea.md"
+  default_prompt: "Convert idea JSON into a structured idea.md with section headers, optionally embedding a code snippet." 
diff --git a/skills/idea-to-markdown/references/idea.schema.json b/skills/idea-to-markdown/references/idea.schema.json
new file mode 100644
index 00000000..334899aa
--- /dev/null
+++ b/skills/idea-to-markdown/references/idea.schema.json
@@ -0,0 +1,34 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "AI Scientist Idea",
+  "type": "object",
+  "required": [
+    "Name",
+    "Title",
+    "Short Hypothesis",
+    "Related Work",
+    "Abstract",
+    "Experiments",
+    "Risk Factors and Limitations"
+  ],
+  "properties": {
+    "Name": {"type": "string", "pattern": "^[a-z0-9_-]+$"},
+    "Title": {"type": "string"},
+    "Short Hypothesis": {"type": "string"},
+    "Related Work": {"type": "string"},
+    "Abstract": {"type": "string"},
+    "Experiments": {
+      "oneOf": [
+        {"type": "string"},
+        {"type": "array", "items": {"type": ["string", "object"]}}
+      ]
+    },
+    "Risk Factors and Limitations": {
+      "oneOf": [
+        {"type": "string"},
+        {"type": "array", "items": {"type": "string"}}
+      ]
+    }
+  },
+  "additionalProperties": true
+}
diff --git a/skills/idea-to-markdown/scripts/idea_to_markdown.py b/skills/idea-to-markdown/scripts/idea_to_markdown.py
new file mode 100755
index 00000000..d0fc52a3
--- /dev/null
+++ b/skills/idea-to-markdown/scripts/idea_to_markdown.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+"""
+Convert idea JSON to a structured Markdown file.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Any
+
+
+def _load_json(path: Path) -> Any:
+    try:
+        return json.loads(path.read_text(encoding="utf-8"))
+    except FileNotFoundError:
+        raise SystemExit(f"[ERROR] File not found: {path}")
+    except json.JSONDecodeError as e:
+        raise SystemExit(f"[ERROR] Invalid JSON: {path}: {e}")
+
+
+def _normalize_ideas(obj: Any) -> list[dict]:
+    if isinstance(obj, dict) and "ideas" in obj and isinstance(obj["ideas"], list):
+        return obj["ideas"]
+    if isinstance(obj, dict) and "idea" in obj and isinstance(obj["idea"], dict):
+        return [obj["idea"]]
+    if isinstance(obj, list):
+        return obj
+    if isinstance(obj, dict):
+        return [obj]
+    return []
+
+
+def _render_one(idea: dict, code_text: str | None) -> str:
+    lines: list[str] = []
+    for key, value in idea.items():
+        header = str(key).replace("_", " ").title()
+        lines.append(f"## {header}\n")
+        if isinstance(value, (list, tuple)):
+            for item in value:
+                lines.append(f"- {item}")
+            lines.append("")
+        elif isinstance(value, dict):
+            for k, v in value.items():
+                lines.append(f"### {k}")
+                lines.append(f"{v}\n")
+        else:
+            lines.append(f"{value}\n")
+    if code_text:
+        lines.append("## Code To Potentially Use\n")
+        lines.append("Use the following code as context for your experiments:\n")
+        lines.append("```python")
+        lines.append(code_text.rstrip())
+        lines.append("```\n")
+    return "\n".join(lines).strip() + "\n"
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="Convert idea JSON to Markdown.")
+    ap.add_argument("--in", dest="in_path", required=True, help="Input idea JSON path.")
+    ap.add_argument("--out", dest="out_path", required=True, help="Output markdown path.")
+    ap.add_argument("--code-path", help="Optional code file to embed.")
+    args = ap.parse_args()
+
+    in_path = Path(args.in_path).expanduser().resolve()
+    out_path = Path(args.out_path).expanduser().resolve()
+
+    obj = _load_json(in_path)
+    ideas = _normalize_ideas(obj)
+    if not ideas:
+        raise SystemExit("[ERROR] No ideas found in input JSON.")
+
+    code_text = None
+    if args.code_path:
+        code_path = Path(args.code_path).expanduser().resolve()
+        if not code_path.exists():
+            raise SystemExit(f"[ERROR] Code file not found: {code_path}")
+        code_text = code_path.read_text(encoding="utf-8")
+
+    # If multiple ideas, write all separated by ---
+    blocks = []
+    for idea in ideas:
+        if not isinstance(idea, dict):
+            raise SystemExit("[ERROR] Each idea must be a JSON object.")
+        blocks.append(_render_one(idea, code_text))
+    out_path.write_text("\n---\n\n".join(blocks), encoding="utf-8")
+    print(f"[OK] Wrote: {out_path}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/skills/paper-plot-aggregator/SKILL.md b/skills/paper-plot-aggregator/SKILL.md
new file mode 100644
index 00000000..f082e98a
--- /dev/null
+++ b/skills/paper-plot-aggregator/SKILL.md
@@ -0,0 +1,40 @@
+---
+name: paper-plot-aggregator
+description: Create publication-quality plots from experiment artifacts (.npy, JSON, CSV). Use when preparing final figures for a paper, consolidating plots across runs, or turning raw metrics into a clean figures/ folder.
+---
+
+# Paper Plot Aggregator
+
+## Overview
+Inventory metric files, generate a plotting skeleton, and produce publication-ready figures. This skill is read-only on inputs and writes outputs to a figures directory.
+
+## Workflow
+1. Inventory NPY files
+   ~~~bash
+   UV_CACHE_DIR=/tmp/uv-cache XDG_CACHE_HOME=/tmp uv run --with numpy -s scripts/npy_inventory.py --dir /path/to/run --out inventory.json
+   ~~~
+2. Generate aggregator skeleton
+   ~~~bash
+   UV_CACHE_DIR=/tmp/uv-cache XDG_CACHE_HOME=/tmp uv run --with numpy,matplotlib -s scripts/gen_aggregator_skeleton.py --inventory inventory.json --out auto_plot_aggregator.py --figures-dir figures
+   ~~~
+3. Run the generated script and inspect figures
+
+## Inputs
+- --dir: input directory
+- --inventory: inventory.json from npy_inventory.py
+- --figures-dir: output directory for plots
+
+## Outputs
+- figures/ directory
+- auto_plot_aggregator.py
+- plot_manifest.json (optional)
+
+## Safeguards
+- Read-only on inputs; writes only to provided output locations.
+- Use --clean only if you explicitly want to overwrite existing figures.
+
+## References
+- Plot guidelines: references/plot-guidelines.md
+- Plot manifest format: references/plot-manifest.md
+- Plot manifest schema: references/plot_manifest.schema.json
+- Safeguards: references/safeguards.md
diff --git a/skills/paper-plot-aggregator/agents/openai.yaml b/skills/paper-plot-aggregator/agents/openai.yaml
new file mode 100644
index 00000000..e4c56b0a
--- /dev/null
+++ b/skills/paper-plot-aggregator/agents/openai.yaml
@@ -0,0 +1,4 @@
+interface:
+  display_name: "Paper Plot Aggregator"
+  short_description: "Inventory results and generate plotting scripts"
+  default_prompt: "Inventory experiment outputs (e.g., .npy) and generate a safe, runnable plot aggregator skeleton plus guidelines for paper-ready figures."
diff --git a/skills/paper-plot-aggregator/references/plot-guidelines.md b/skills/paper-plot-aggregator/references/plot-guidelines.md
new file mode 100644
index 00000000..0cab3861
--- /dev/null
+++ b/skills/paper-plot-aggregator/references/plot-guidelines.md
@@ -0,0 +1,24 @@
+# Plot Guidelines (Paper-Ready)
+
+Use these guidelines when creating the final `figures/` set.
+
+## Grounding and correctness
+- Load real data from files (`.npy`, JSON, CSV). Do not fabricate numbers.
+- If you need a single number, copy it from an existing summary file and cite its path.
+
+## Figure budgeting
+- Prefer ~6-12 figures total for a full paper (adjust to venue).
+- Combine related plots into subplots (e.g., 1x2 or 1x3).
+- Keep only the most informative figures in the main text; move extras to appendix.
+
+## Style and readability
+- Use `dpi=300` for saved PNGs.
+- Include axis labels, titles, and a legend where needed.
+- Avoid underscores in labels; replace with spaces.
+- Keep legends visible and non-overlapping.
+- Consider removing top/right spines for a cleaner look (matplotlib style).
+
+## Robust scripting
+- Put each figure in its own `try/except` block so one failure doesn't block others.
+- Validate file existence before loading.
+- Use safe defaults and deterministic filenames.
diff --git a/skills/paper-plot-aggregator/references/plot-manifest.md b/skills/paper-plot-aggregator/references/plot-manifest.md
new file mode 100644
index 00000000..43c2ff87
--- /dev/null
+++ b/skills/paper-plot-aggregator/references/plot-manifest.md
@@ -0,0 +1,27 @@
+# Plot Manifest Schema
+
+Use a manifest to track provenance and suggested captions for each saved figure.
+
+## Suggested JSON format
+```json
+{
+  "generated_at": "2026-02-04T12:34:56",
+  "figures_dir": "figures",
+  "figures": [
+    {
+      "filename": "training loss.png",
+      "title": "Training loss vs epoch",
+      "caption_suggestion": "Loss decreases monotonically and plateaus after ...",
+      "data_sources": [
+        "experiment_results/foo.npy",
+        "results/metrics.json"
+      ],
+      "notes": "Any caveats, preprocessing, or filtering used."
+    }
+  ]
+}
+```
+
+## Guardrails
+- `data_sources` should point to real files.
+- Captions must not claim results not supported by the data.
diff --git a/skills/paper-plot-aggregator/references/plot_manifest.schema.json b/skills/paper-plot-aggregator/references/plot_manifest.schema.json
new file mode 100644
index 00000000..70466c65
--- /dev/null
+++ b/skills/paper-plot-aggregator/references/plot_manifest.schema.json
@@ -0,0 +1,36 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "PlotManifest",
+  "type": "object",
+  "required": [
+    "figures"
+  ],
+  "properties": {
+    "figures": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "required": [
+          "filename",
+          "caption"
+        ],
+        "properties": {
+          "filename": {
+            "type": "string"
+          },
+          "caption": {
+            "type": "string"
+          },
+          "sources": {
+            "type": "array",
+            "items": {
+              "type": "string"
+            }
+          }
+        },
+        "additionalProperties": true
+      }
+    }
+  },
+  "additionalProperties": true
+}
\ No newline at end of file
diff --git a/skills/paper-plot-aggregator/references/safeguards.md b/skills/paper-plot-aggregator/references/safeguards.md
new file mode 100644
index 00000000..a3692eb7
--- /dev/null
+++ b/skills/paper-plot-aggregator/references/safeguards.md
@@ -0,0 +1,14 @@
+# Safeguards
+
+## Safety posture
+- Do not delete anything by default.
+- Avoid out-of-memory by using `mmap_mode='r'` and element limits for stats.
+- Do not reference files that do not exist.
+- Do not use network access in plot scripts.
+
+## Cleaning behavior
+- Only clean output folders when an explicit `--clean` flag is provided.
+- If cleaning is enabled, only delete the figures directory inside the current working directory.
+
+## AI-use disclosure (if applicable)
+- If your workflow uses tools or code with mandatory AI-use disclosure requirements (e.g., AI-Scientist-v2), comply with that license and with your venue's policies when submitting manuscripts.
diff --git a/skills/paper-plot-aggregator/scripts/gen_aggregator_skeleton.py b/skills/paper-plot-aggregator/scripts/gen_aggregator_skeleton.py
new file mode 100644
index 00000000..715e7c2d
--- /dev/null
+++ b/skills/paper-plot-aggregator/scripts/gen_aggregator_skeleton.py
@@ -0,0 +1,227 @@
+#!/usr/bin/env python3
+"""
+Generate a runnable plot aggregator skeleton from an inventory.json file.
+
+This generator is intentionally dependency-light. The generated script will
+require numpy + matplotlib to actually plot.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+
+TEMPLATE = '''#!/usr/bin/env python3
+"""
+Auto-generated plot aggregator skeleton.
+
+This script is intended as a starting point. It will:
+- Load existing .npy data only (no hallucinated numbers).
+- Write figures into a figures directory.
+- Generate a few \"quicklook\" plots for the first N arrays.
+
+To run (recommended):
+  uv run --with numpy --with matplotlib -s auto_plot_aggregator.py --figures-dir "__FIGURES_DIR__"
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import shutil
+from pathlib import Path
+
+import numpy as np
+
+
+DEFAULT_CLEAN = __DEFAULT_CLEAN__
+DEFAULT_MAX_PLOTS = __MAX_PLOTS__
+
+INVENTORY_JSON = r"""__INVENTORY_JSON__"""
+INVENTORY = json.loads(INVENTORY_JSON)
+
+
+def _safe_label(text: str) -> str:
+    # Avoid underscores in labels (better for papers).
+    return text.replace("_", " ")
+
+
+def _resolve_path(entry: dict) -> Path:
+    # Prefer absolute path when available, otherwise resolve relative to base_dir.
+    if entry.get("abs_path"):
+        p = Path(entry["abs_path"])
+        if p.exists():
+            return p
+    base_dir = INVENTORY.get("base_dir") or os.getcwd()
+    return Path(base_dir) / entry.get("rel_path", "")
+
+
+def _ensure_figures_dir(figures_dir: Path, clean: bool) -> Path:
+    # Resolve against CWD if relative.
+    if not figures_dir.is_absolute():
+        figures_dir = (Path(os.getcwd()) / figures_dir).resolve()
+    else:
+        figures_dir = figures_dir.resolve()
+
+    # Safety: only allow cleaning inside CWD.
+    if clean:
+        cwd = Path(os.getcwd()).resolve()
+        if cwd not in figures_dir.parents:
+            raise RuntimeError(f\"Refusing to clean figures outside CWD: {figures_dir}\")
+        if figures_dir.exists():
+            shutil.rmtree(figures_dir)
+
+    figures_dir.mkdir(parents=True, exist_ok=True)
+    return figures_dir
+
+
+def _quicklook_plot(entry: dict, figures_dir: Path, idx: int) -> None:
+    try:
+        import matplotlib.pyplot as plt
+    except Exception as e:
+        raise RuntimeError(
+            "matplotlib is required to plot. Install with: uv run --with matplotlib ..."
+        ) from e
+
+    path = _resolve_path(entry)
+    if not path.exists():
+        raise FileNotFoundError(str(path))
+
+    arr = np.load(path, mmap_mode="r", allow_pickle=False)
+
+    name = Path(entry.get("rel_path") or path.name).stem
+    name = _safe_label(name)
+
+    fig = plt.figure(figsize=(6, 4))
+    if arr.ndim == 1:
+        plt.plot(arr)
+        plt.xlabel("index")
+        plt.ylabel("value")
+        plt.title(f\"{name} (1D)\")
+    elif arr.ndim == 2:
+        # Quicklook: show a heatmap for 2D arrays (may not be paper-ready).
+        plt.imshow(arr, aspect="auto")
+        plt.colorbar()
+        plt.title(f\"{name} (2D heatmap)\")
+    else:
+        plt.text(0.1, 0.5, f\"ndim={arr.ndim} not plotted\", fontsize=12)
+        plt.axis("off")
+        plt.title(f\"{name} (skipped)\")
+
+    out_name = f\"{idx:02d} - {name}.png\"
+    out_path = figures_dir / out_name
+    plt.tight_layout()
+    plt.savefig(out_path, dpi=300)
+    plt.close(fig)
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="Run the auto plot aggregator.")
+    ap.add_argument("--figures-dir", default="__FIGURES_DIR__", help="Output figures directory.")
+    ap.add_argument(
+        "--clean",
+        dest=\"clean\",
+        action=\"store_true\",
+        default=DEFAULT_CLEAN,
+        help="Clean figures-dir before plotting.",
+    )
+    ap.add_argument(
+        "--no-clean",
+        dest=\"clean\",
+        action=\"store_false\",
+        help="Do not clean figures-dir before plotting.",
+    )
+    ap.add_argument(
+        "--max-plots",
+        type=int,
+        default=DEFAULT_MAX_PLOTS,
+        help="Max quicklook plots to attempt.",
+    )
+    args = ap.parse_args()
+
+    figures_dir = _ensure_figures_dir(Path(args.figures_dir), clean=bool(args.clean))
+
+    entries = INVENTORY.get("entries", [])
+    if not isinstance(entries, list):
+        raise RuntimeError("INVENTORY.entries must be a list")
+
+    n = max(0, min(int(args.max_plots), len(entries)))
+    for i in range(n):
+        try:
+            _quicklook_plot(entries[i], figures_dir, i + 1)
+        except Exception as e:
+            print(f\"[WARN] Plot {i+1} failed: {e}\")
+
+    print(f\"[OK] Wrote figures to: {figures_dir}\")
+    return 0
+
+
+if __name__ == \"__main__\":
+    raise SystemExit(main())
+'''
+
+
+def main(argv: list[str] | None = None) -> int:
+    ap = argparse.ArgumentParser(description="Generate auto_plot_aggregator.py skeleton.")
+    ap.add_argument("--inventory", required=True, help="Input inventory.json from npy_inventory.py.")
+    ap.add_argument("--out", required=True, help="Output path for auto_plot_aggregator.py.")
+    ap.add_argument("--figures-dir", default="figures", help="Figures directory (default: figures).")
+    ap.add_argument(
+        "--clean",
+        action="store_true",
+        help="Set generated script default to clean the figures dir before plotting (default: false).",
+    )
+    ap.add_argument(
+        "--max-plots",
+        type=int,
+        default=12,
+        help="Max quicklook plots the generated script will attempt (default: 12).",
+    )
+    args = ap.parse_args(argv)
+
+    inv_path = Path(args.inventory).expanduser().resolve()
+    try:
+        inv = json.loads(inv_path.read_text(encoding="utf-8"))
+    except Exception as e:
+        print(f"[ERROR] Failed to read inventory JSON: {inv_path}: {e}")
+        return 2
+
+    entries = inv.get("entries", [])
+    if not isinstance(entries, list):
+        print("[ERROR] inventory.entries must be a list")
+        return 2
+
+    embedded = {
+        "base_dir": inv.get("base_dir"),
+        "generated_at": inv.get("generated_at"),
+        "entries": [
+            {
+                "rel_path": e.get("rel_path"),
+                "abs_path": e.get("abs_path"),
+                "shape": e.get("shape"),
+                "dtype": e.get("dtype"),
+            }
+            for e in entries
+            if isinstance(e, dict) and (e.get("rel_path") or e.get("abs_path"))
+        ],
+    }
+
+    inventory_json = json.dumps(embedded, indent=2)
+    script = (
+        TEMPLATE.replace("__FIGURES_DIR__", args.figures_dir)
+        .replace("__DEFAULT_CLEAN__", "True" if args.clean else "False")
+        .replace("__MAX_PLOTS__", str(int(args.max_plots)))
+        .replace("__INVENTORY_JSON__", inventory_json)
+    )
+
+    out_path = Path(args.out).expanduser().resolve()
+    out_path.write_text(script, encoding="utf-8")
+    print(f"[OK] Wrote skeleton: {out_path}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/skills/paper-plot-aggregator/scripts/npy_inventory.py b/skills/paper-plot-aggregator/scripts/npy_inventory.py
new file mode 100644
index 00000000..a15f69aa
--- /dev/null
+++ b/skills/paper-plot-aggregator/scripts/npy_inventory.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+"""
+Inventory .npy files under a directory.
+
+Requires: numpy
+Safe by default: uses mmap and avoids reading full arrays unless stats are requested.
+"""
+
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import json
+import os
+from pathlib import Path
+from typing import Any
+
+
+def _walk(base: Path) -> list[Path]:
+    out: list[Path] = []
+    for root, dirs, files in os.walk(base):
+        dirs[:] = [d for d in dirs if d not in {".git", "__pycache__", ".venv", "venv"}]
+        for fn in files:
+            if fn.lower().endswith(".npy"):
+                out.append(Path(root) / fn)
+    return out
+
+
+def main(argv: list[str] | None = None) -> int:
+    ap = argparse.ArgumentParser(description="Inventory .npy files (safe, read-only).")
+    ap.add_argument("--dir", required=True, help="Directory to scan for .npy files.")
+    ap.add_argument("--out", required=True, help="Output inventory JSON path.")
+    ap.add_argument(
+        "--max-bytes",
+        type=int,
+        default=None,
+        help="If set, skip .npy files larger than this many bytes.",
+    )
+    ap.add_argument(
+        "--mmap",
+        dest="mmap",
+        action="store_true",
+        help="Use mmap_mode='r' for numpy.load (default).",
+    )
+    ap.add_argument(
+        "--no-mmap",
+        dest="mmap",
+        action="store_false",
+        help="Disable mmap (may OOM on large arrays).",
+    )
+    ap.add_argument(
+        "--stats",
+        action="store_true",
+        help="Compute lightweight stats for small arrays (bounded by --max-elements).",
+    )
+    ap.add_argument(
+        "--max-elements",
+        type=int,
+        default=1_000_000,
+        help="Max elements allowed for stats (default: 1,000,000).",
+    )
+    ap.set_defaults(mmap=True)
+    args = ap.parse_args(argv)
+
+    base = Path(args.dir).expanduser().resolve()
+    if not base.exists() or not base.is_dir():
+        print(f"[ERROR] Not a directory: {base}")
+        return 2
+
+    try:
+        import numpy as np  # type: ignore
+    except Exception as e:
+        print("[ERROR] numpy is required. Try: uv run --with numpy -s scripts/npy_inventory.py --help")
+        print(f"Details: {e}")
+        return 3
+
+    mmap_mode = "r" if args.mmap else None
+    entries: list[dict[str, Any]] = []
+    for path in sorted(_walk(base)):
+        rel = str(path.relative_to(base))
+        size_bytes = path.stat().st_size if path.exists() else None
+        info: dict[str, Any] = {
+            "rel_path": rel,
+            "abs_path": str(path),
+            "size_bytes": size_bytes,
+        }
+        if args.max_bytes is not None and size_bytes is not None and size_bytes > args.max_bytes:
+            info["skipped"] = f"size_bytes {size_bytes} > max_bytes {args.max_bytes}"
+            entries.append(info)
+            continue
+        try:
+            arr = np.load(path, mmap_mode=mmap_mode, allow_pickle=False)
+            info["dtype"] = str(arr.dtype)
+            info["shape"] = list(arr.shape)
+            info["ndim"] = int(arr.ndim)
+            if args.stats:
+                n = int(arr.size)
+                if n <= max(1, args.max_elements):
+                    try:
+                        # Avoid materializing if possible; numpy may still scan the array.
+                        finite = arr[np.isfinite(arr)]
+                        info["stats"] = {
+                            "min": float(np.min(finite)),
+                            "max": float(np.max(finite)),
+                            "mean": float(np.mean(finite)),
+                        }
+                    except Exception as e:
+                        info["stats"] = {"error": str(e)}
+                else:
+                    info["stats"] = {"skipped": f"size {n} > max_elements"}
+        except Exception as e:
+            info["error"] = str(e)
+        entries.append(info)
+
+    out_obj = {
+        "base_dir": str(base),
+        "generated_at": _dt.datetime.now().isoformat(timespec="seconds"),
+        "entries": entries,
+    }
+    Path(args.out).write_text(json.dumps(out_obj, indent=2), encoding="utf-8")
+    print(f"[OK] Wrote inventory: {args.out} ({len(entries)} entries)")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/skills/paper-reviewer-json/SKILL.md b/skills/paper-reviewer-json/SKILL.md
new file mode 100644
index 00000000..f5d1111b
--- /dev/null
+++ b/skills/paper-reviewer-json/SKILL.md
@@ -0,0 +1,29 @@
+---
+name: paper-reviewer-json
+description: Produce structured peer-review JSON (NeurIPS-style fields) for a paper given PDF/LaTeX/Markdown text.
+---
+
+# Paper Reviewer JSON
+
+## Overview
+Extract paper text and produce a structured review JSON for automated or human review workflows.
+
+## Workflow
+1. Extract text from PDF
+   ~~~bash
+   UV_CACHE_DIR=/tmp/uv-cache XDG_CACHE_HOME=/tmp uv run --with pypdf -s scripts/extract_pdf_text.py --pdf paper.pdf --out paper.txt
+   ~~~
+2. Generate review JSON with your LLM of choice using references/review-prompt.md and references/review.schema.json
+
+## Outputs
+- paper.txt
+- review.json
+
+## Safeguards
+- Do not fabricate citations or claims not present in the input.
+- Keep quotes short and avoid reproducing large sections of the paper.
+
+## References
+- Review JSON schema: references/review.schema.json
+- Review prompt: references/review-prompt.md
+- Safeguards: references/safeguards.md
diff --git a/skills/paper-reviewer-json/agents/openai.yaml b/skills/paper-reviewer-json/agents/openai.yaml
new file mode 100644
index 00000000..17aa631a
--- /dev/null
+++ b/skills/paper-reviewer-json/agents/openai.yaml
@@ -0,0 +1,4 @@
+interface:
+  display_name: "Paper Reviewer JSON"
+  short_description: "Generate structured peer reviews from paper text"
+  default_prompt: "Extract paper text (PDF/LaTeX/Markdown) and produce a NeurIPS-style structured review JSON with scores, strengths, weaknesses, questions, and an accept/reject decision."
diff --git a/skills/paper-reviewer-json/references/review-json-schema.md b/skills/paper-reviewer-json/references/review-json-schema.md
new file mode 100644
index 00000000..51beaa44
--- /dev/null
+++ b/skills/paper-reviewer-json/references/review-json-schema.md
@@ -0,0 +1,29 @@
+# Review JSON Schema (NeurIPS-style)
+
+Produce valid JSON with the following fields **in this order**:
+- `Summary` (string)
+- `Strengths` (list of strings)
+- `Weaknesses` (list of strings)
+- `Originality` (int 1-4)
+- `Quality` (int 1-4)
+- `Clarity` (int 1-4)
+- `Significance` (int 1-4)
+- `Questions` (list of strings)
+- `Limitations` (list of strings)
+- `Ethical Concerns` (boolean)
+- `Soundness` (int 1-4)
+- `Presentation` (int 1-4)
+- `Contribution` (int 1-4)
+- `Overall` (int 1-10)
+- `Confidence` (int 1-5)
+- `Decision` (string; must be `Accept` or `Reject`)
+
+## Rating scales
+- 1-4: low/poor to very high/excellent (use integers only)
+- Overall 1-10: strong reject to award quality
+- Confidence 1-5: low to absolute
+
+## Guardrails
+- Keep the review specific to the provided paper text.
+- Do not invent results, claims, or citations not present in the paper.
+- Avoid long verbatim quotes.
diff --git a/skills/paper-reviewer-json/references/review-prompt.md b/skills/paper-reviewer-json/references/review-prompt.md
new file mode 100644
index 00000000..ed507127
--- /dev/null
+++ b/skills/paper-reviewer-json/references/review-prompt.md
@@ -0,0 +1,35 @@
+# Review Prompt Template
+
+Use this template to generate a structured review.
+
+## System instruction (recommended)
+```text
+You are an AI researcher reviewing a paper submitted to a top ML venue. Be critical and specific. If you are unsure, lean skeptical. Do not hallucinate.
+```
+
+## User prompt
+```text
+Respond in the following format:
+
+THOUGHT:
+<brief reasoning notes; be specific, not generic>
+
+REVIEW JSON:
+```json
+<JSON>
+```
+
+In <JSON>, follow the schema in references/review-json-schema.md exactly (field names, order, integer ranges).
+Decision must be only Accept or Reject (no borderline variants).
+
+Here is the paper text:
+```
+{PASTE_PAPER_TEXT_HERE}
+```
+```
+
+## Optional: ensemble
+Generate 3-5 independent reviews (different seeds/temps), then aggregate:
+- average numeric fields (round to nearest int)
+- union strengths/weaknesses/questions (deduplicate)
+- write a short meta rationale in THOUGHT
diff --git a/skills/paper-reviewer-json/references/review.schema.json b/skills/paper-reviewer-json/references/review.schema.json
new file mode 100644
index 00000000..4cf69d7a
--- /dev/null
+++ b/skills/paper-reviewer-json/references/review.schema.json
@@ -0,0 +1,42 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "PaperReview",
+  "type": "object",
+  "required": [
+    "summary",
+    "strengths",
+    "weaknesses",
+    "questions",
+    "score"
+  ],
+  "properties": {
+    "summary": {
+      "type": "string"
+    },
+    "strengths": {
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    },
+    "weaknesses": {
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    },
+    "questions": {
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    },
+    "score": {
+      "type": "number"
+    },
+    "confidence": {
+      "type": "number"
+    }
+  },
+  "additionalProperties": true
+}
\ No newline at end of file
diff --git a/skills/paper-reviewer-json/references/safeguards.md b/skills/paper-reviewer-json/references/safeguards.md
new file mode 100644
index 00000000..6ce6560c
--- /dev/null
+++ b/skills/paper-reviewer-json/references/safeguards.md
@@ -0,0 +1,16 @@
+# Safeguards
+
+## Confidentiality
+- Treat the paper as private unless the user says it is public.
+- Do not upload the PDF or extracted text to external services without explicit user request.
+
+## Factuality
+- Keep all critiques grounded in the provided text.
+- If the paper omits details, ask questions instead of assuming.
+
+## Plagiarism / quoting
+- Avoid long verbatim excerpts.
+- Prefer paraphrases and short quotes only when necessary.
+
+## AI-use disclosure (if applicable)
+- If your workflow uses tools or code with mandatory AI-use disclosure requirements (e.g., AI-Scientist-v2), comply with that license and with your venue's policies when submitting manuscripts.
diff --git a/skills/paper-reviewer-json/scripts/extract_pdf_text.py b/skills/paper-reviewer-json/scripts/extract_pdf_text.py
new file mode 100644
index 00000000..0c595971
--- /dev/null
+++ b/skills/paper-reviewer-json/scripts/extract_pdf_text.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+"""
+Extract text from a PDF for downstream review.
+
+Preferred backend: pypdf (pure Python).
+Optional backend: PyMuPDF (fitz) for PDFs where pypdf extraction is poor.
+
+This script is read-only and writes only to --out.
+"""
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+
+def extract_with_pypdf(pdf_path: Path, max_pages: int | None, keep_page_breaks: bool) -> str:
+    try:
+        from pypdf import PdfReader  # type: ignore
+    except Exception as e:
+        raise RuntimeError(
+            "pypdf is required. Try: uv run --with pypdf -s scripts/extract_pdf_text.py --help"
+        ) from e
+
+    reader = PdfReader(str(pdf_path))
+    out: list[str] = []
+    n = len(reader.pages)
+    limit = min(n, max_pages) if max_pages is not None else n
+    for i in range(limit):
+        txt = reader.pages[i].extract_text() or ""
+        out.append(txt)
+        if keep_page_breaks:
+            out.append(f"\n\n----- PAGE {i+1} / {limit} -----\n\n")
+    return "".join(out)
+
+
+def extract_with_pymupdf(pdf_path: Path, max_pages: int | None, keep_page_breaks: bool) -> str:
+    try:
+        import fitz  # type: ignore
+    except Exception as e:
+        raise RuntimeError(
+            "PyMuPDF is required. Try: uv run --with PyMuPDF -s scripts/extract_pdf_text.py --backend pymupdf --help"
+        ) from e
+
+    doc = fitz.open(str(pdf_path))
+    out: list[str] = []
+    n = doc.page_count
+    limit = min(n, max_pages) if max_pages is not None else n
+    for i in range(limit):
+        page = doc.load_page(i)
+        out.append(page.get_text("text") or "")
+        if keep_page_breaks:
+            out.append(f"\n\n----- PAGE {i+1} / {limit} -----\n\n")
+    return "".join(out)
+
+
+def main(argv: list[str] | None = None) -> int:
+    ap = argparse.ArgumentParser(description="Extract PDF text (read-only).")
+    ap.add_argument("--pdf", required=True, help="Input PDF path.")
+    ap.add_argument("--out", required=True, help="Output text path.")
+    ap.add_argument("--max-pages", type=int, default=None, help="Max pages to extract.")
+    ap.add_argument(
+        "--keep-page-breaks",
+        action="store_true",
+        help="Insert page boundary markers in the output.",
+    )
+    ap.add_argument(
+        "--backend",
+        choices=("pypdf", "pymupdf"),
+        default="pypdf",
+        help="Extraction backend (default: pypdf).",
+    )
+    args = ap.parse_args(argv)
+
+    pdf_path = Path(args.pdf).expanduser().resolve()
+    if not pdf_path.exists():
+        print(f"[ERROR] PDF not found: {pdf_path}")
+        return 2
+
+    try:
+        if args.backend == "pymupdf":
+            text = extract_with_pymupdf(pdf_path, args.max_pages, args.keep_page_breaks)
+        else:
+            text = extract_with_pypdf(pdf_path, args.max_pages, args.keep_page_breaks)
+    except Exception as e:
+        print(f"[ERROR] Extraction failed: {e}")
+        return 3
+
+    out_path = Path(args.out).expanduser().resolve()
+    out_path.write_text(text, encoding="utf-8")
+    print(f"[OK] Wrote: {out_path}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/skills/research-ideation-novelty-check/SKILL.md b/skills/research-ideation-novelty-check/SKILL.md
new file mode 100644
index 00000000..e0aa71a4
--- /dev/null
+++ b/skills/research-ideation-novelty-check/SKILL.md
@@ -0,0 +1,45 @@
+---
+name: research-ideation-novelty-check
+description: Generate and refine research ideas and run a lightweight novelty sanity-check using Semantic Scholar. Use when drafting proposal-style ideas from a topic brief, validating idea JSON outputs, or checking whether an idea seems close to existing work (without over-claiming novelty).
+---
+
+# Research Ideation + Novelty Check
+
+## Overview
+Turn a short topic brief into structured research ideas, then run targeted Semantic Scholar searches to sanity-check novelty and record what you searched.
+
+This skill is intentionally tool-agnostic and does not depend on this repo. Use the scripts for Semantic Scholar search and idea JSON schema validation.
+
+## Workflow
+1. Draft a topic brief
+   - Use the template in references/prompt-templates.md (keep it concrete: task, constraints, evaluation, compute budget).
+2. Produce idea JSON
+   - Output one or more ideas that match references/idea.schema.json.
+3. Validate the schema
+   - Run scripts/idea_schema_validate.py on the generated JSON.
+4. Novelty sanity-check
+   - For each idea, run 3-6 focused searches via scripts/s2_search.py.
+   - Record the exact query strings and the search date/time in your notes.
+   - Do not claim novelty just because search results look sparse; follow references/novelty-check.md.
+
+## Scripts
+### Semantic Scholar search
+- Offline (default-safe; no network):
+  ~~~bash
+  UV_CACHE_DIR=/tmp/uv-cache XDG_CACHE_HOME=/tmp uv run -s scripts/s2_search.py --query "your query"
+  ~~~
+- Online (explicit opt-in; optional S2_API_KEY in env):
+  ~~~bash
+  UV_CACHE_DIR=/tmp/uv-cache XDG_CACHE_HOME=/tmp uv run -s scripts/s2_search.py --online --query "attention is all you need" --limit 10
+  ~~~
+
+### Idea schema validation
+- ~~~bash
+  UV_CACHE_DIR=/tmp/uv-cache XDG_CACHE_HOME=/tmp uv run -s scripts/idea_schema_validate.py --in ideas.json
+  ~~~
+
+## References
+- Idea schema: references/idea.schema.json
+- Novelty check guidance: references/novelty-check.md
+- Prompt templates: references/prompt-templates.md
+- Safeguards: references/safeguards.md
diff --git a/skills/research-ideation-novelty-check/agents/openai.yaml b/skills/research-ideation-novelty-check/agents/openai.yaml
new file mode 100644
index 00000000..c10a0b82
--- /dev/null
+++ b/skills/research-ideation-novelty-check/agents/openai.yaml
@@ -0,0 +1,4 @@
+interface:
+  display_name: "Research Ideation + Novelty Check"
+  short_description: "Generate ideas and sanity-check novelty"
+  default_prompt: "Draft proposal-style research ideas from a topic brief, validate idea JSON, and propose Semantic Scholar queries with cautious novelty wording."
diff --git a/skills/research-ideation-novelty-check/references/idea-schema.md b/skills/research-ideation-novelty-check/references/idea-schema.md
new file mode 100644
index 00000000..627726e4
--- /dev/null
+++ b/skills/research-ideation-novelty-check/references/idea-schema.md
@@ -0,0 +1,37 @@
+# Idea JSON Schema
+
+This schema is designed for proposal-style research ideas that can be executed with realistic academic resources.
+
+## Accepted file shapes
+Your ideas file may be:
+- A list of idea objects: `[ { ... }, { ... } ]`
+- A single idea object: `{ ... }`
+- A wrapper object: `{ "idea": { ... } }` or `{ "ideas": [ ... ] }`
+
+## Required fields (per idea)
+- `Name` (string)
+  - Short identifier; lowercase; no spaces.
+  - Recommended pattern: `^[a-z0-9_-]+$`
+- `Title` (string)
+  - Human-readable title.
+- `Short Hypothesis` (string)
+  - One concise hypothesis or research question.
+- `Related Work` (string)
+  - Brief discussion of closest prior work and why this is not a trivial extension.
+- `Abstract` (string)
+  - ~200-300 words; motivation, method, expected outcome.
+- `Experiments` (list or string)
+  - Prefer a list of experiments (each as string or dict).
+  - If dict, recommended keys: `Goal`, `Method`, `Metrics`, `Baselines`, `Datasets`.
+- `Risk Factors and Limitations` (list or string)
+  - Prefer a list of short bullets.
+
+## Optional fields
+- `Code` (string): starter code or constraints.
+- `Datasets` (list): suggested datasets.
+- `Metrics` (list): target metrics.
+
+## Quality checklist
+- Experiments are feasible, specific, and tied to the hypothesis.
+- Related Work names at least 2-5 close baselines/lines of work.
+- Risks include at least one technical risk and one evaluation risk.
diff --git a/skills/research-ideation-novelty-check/references/idea.schema.json b/skills/research-ideation-novelty-check/references/idea.schema.json
new file mode 100644
index 00000000..334899aa
--- /dev/null
+++ b/skills/research-ideation-novelty-check/references/idea.schema.json
@@ -0,0 +1,34 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "AI Scientist Idea",
+  "type": "object",
+  "required": [
+    "Name",
+    "Title",
+    "Short Hypothesis",
+    "Related Work",
+    "Abstract",
+    "Experiments",
+    "Risk Factors and Limitations"
+  ],
+  "properties": {
+    "Name": {"type": "string", "pattern": "^[a-z0-9_-]+$"},
+    "Title": {"type": "string"},
+    "Short Hypothesis": {"type": "string"},
+    "Related Work": {"type": "string"},
+    "Abstract": {"type": "string"},
+    "Experiments": {
+      "oneOf": [
+        {"type": "string"},
+        {"type": "array", "items": {"type": ["string", "object"]}}
+      ]
+    },
+    "Risk Factors and Limitations": {
+      "oneOf": [
+        {"type": "string"},
+        {"type": "array", "items": {"type": "string"}}
+      ]
+    }
+  },
+  "additionalProperties": true
+}
diff --git a/skills/research-ideation-novelty-check/references/novelty-check.md b/skills/research-ideation-novelty-check/references/novelty-check.md
new file mode 100644
index 00000000..79347d5d
--- /dev/null
+++ b/skills/research-ideation-novelty-check/references/novelty-check.md
@@ -0,0 +1,42 @@
+# Novelty Sanity-Check (Semantic Scholar)
+
+This is a *sanity-check*, not a proof of novelty.
+
+## Goals
+- Find the closest prior work quickly.
+- Avoid over-claiming novelty.
+- Record what you searched (queries + date/time) so the process is auditable.
+
+## Workflow
+1. **Decompose the idea**
+   - Write 3-6 key phrases: task + method + twist + evaluation setting.
+2. **Write several queries**
+   - One broad: `"<task>" + "<method family>"`
+   - One specific: `"<core mechanism>" + "<task>"`
+   - One baseline-centric: `"<closest baseline name>" + "<setting>"`
+3. **Search and record**
+   - For each query, record:
+     - the exact query string
+     - the time (local) you ran it
+     - the top 5-10 results you considered most similar
+4. **Interpret cautiously**
+   - If you find a close match:
+     - adjust the idea, or
+     - narrow the claim, or
+     - position as a replication / stress-test / negative result.
+   - If you do *not* find a match:
+     - write: "We did not find direct matches for [queries] as of [date]; we position this as ..."
+     - do not write: "This is the first work to ..."
+
+## Writing novelty claims safely
+Prefer:
+- "To our knowledge, based on searches for [queries] on [date], we did not find ..."
+- "We study [X] in the under-explored setting of [Y] ..."
+Avoid:
+- "We are the first to ..."
+- "No prior work has studied ..."
+
+## Common failure modes
+- Query is too narrow (misses synonyms).
+- Query is too broad (returns irrelevant classics).
+- You conflate "not found" with "non-existent".
diff --git a/skills/research-ideation-novelty-check/references/prompt-templates.md b/skills/research-ideation-novelty-check/references/prompt-templates.md
new file mode 100644
index 00000000..5235782f
--- /dev/null
+++ b/skills/research-ideation-novelty-check/references/prompt-templates.md
@@ -0,0 +1,61 @@
+# Prompt Templates
+
+Use these templates to keep ideation structured and easy to validate.
+
+## Topic brief template (user input)
+```markdown
+# Title
+
+# Keywords
+
+# TL;DR
+
+# Abstract / Background
+
+# Constraints
+- compute budget:
+- data availability:
+- timeline:
+
+# Evaluation preferences
+- primary metric:
+- baselines to include:
+- datasets to use:
+```
+
+## Ideation prompt (generate 1 idea)
+Paste the topic brief, then:
+```text
+Generate ONE research proposal as JSON that matches the schema in references/idea-schema.md.
+Requirements:
+- Be specific about experiments and evaluation.
+- Include a brief Related Work section that distinguishes this idea from close baselines.
+- Keep the idea feasible for an academic lab.
+- Do not claim novelty without evidence.
+Output only JSON (no markdown fences).
+```
+
+## Reflection prompt (improve an idea)
+```text
+Review the idea for:
+1) Clarity and falsifiability of the hypothesis
+2) Feasibility of experiments
+3) Whether Related Work is specific and non-trivial
+4) Missing baselines/datasets/metrics
+5) Obvious novelty risks
+
+Rewrite the JSON to improve weak points, keeping the core idea intact unless it is flawed.
+Output only JSON.
+```
+
+## Novelty check note template (what to write down)
+```text
+Novelty sanity-check (Semantic Scholar)
+Date/time:
+Queries:
+- ...
+Closest papers considered:
+- [title] (year, venue) - why close
+Conclusion:
+- Positioning statement (no over-claiming)
+```
diff --git a/skills/research-ideation-novelty-check/references/safeguards.md b/skills/research-ideation-novelty-check/references/safeguards.md
new file mode 100644
index 00000000..f33745d6
--- /dev/null
+++ b/skills/research-ideation-novelty-check/references/safeguards.md
@@ -0,0 +1,12 @@
+# Safeguards
+
+## Network / privacy
+- `scripts/s2_search.py` is **offline by default**. Only use `--online` after user confirmation.
+- Never paste API keys into chat logs. If needed, read from an environment variable (e.g., `S2_API_KEY`).
+
+## Novelty claims
+- Semantic Scholar search is a *sanity-check*, not a proof of novelty.
+- Record your query strings and the date/time of the search in your notes (see `references/novelty-check.md`).
+
+## AI-use disclosure (if applicable)
+- If your workflow uses tools or code with mandatory AI-use disclosure requirements (e.g., AI-Scientist-v2), comply with that license and with your venue's policies when submitting manuscripts.
diff --git a/skills/research-ideation-novelty-check/scripts/idea_schema_validate.py b/skills/research-ideation-novelty-check/scripts/idea_schema_validate.py
new file mode 100644
index 00000000..c00371c8
--- /dev/null
+++ b/skills/research-ideation-novelty-check/scripts/idea_schema_validate.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+"""
+Validate idea JSON files against a lightweight schema.
+
+This is intentionally strict about required keys, but flexible about types
+for some fields (e.g., Experiments may be a list or a string).
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+from pathlib import Path
+
+
+RE_NAME = re.compile(r"^[a-z0-9_-]+$")
+REQUIRED_KEYS = [
+    "Name",
+    "Title",
+    "Short Hypothesis",
+    "Related Work",
+    "Abstract",
+    "Experiments",
+    "Risk Factors and Limitations",
+]
+
+
+def _load(path: Path):
+    try:
+        return json.loads(path.read_text(encoding="utf-8"))
+    except FileNotFoundError:
+        raise SystemExit(f"[ERROR] File not found: {path}")
+    except json.JSONDecodeError as e:
+        raise SystemExit(f"[ERROR] Invalid JSON: {path}: {e}")
+
+
+def _unwrap(obj):
+    if isinstance(obj, dict) and "ideas" in obj and isinstance(obj["ideas"], list):
+        return obj["ideas"]
+    if isinstance(obj, dict) and "idea" in obj and isinstance(obj["idea"], dict):
+        return [obj["idea"]]
+    if isinstance(obj, list):
+        return obj
+    if isinstance(obj, dict):
+        return [obj]
+    return []
+
+
+def _validate_one(idea: dict, idx: int) -> list[str]:
+    errs: list[str] = []
+    if not isinstance(idea, dict):
+        return [f"idea[{idx}]: not an object"]
+
+    for k in REQUIRED_KEYS:
+        if k not in idea:
+            errs.append(f"idea[{idx}]: missing key: {k}")
+
+    name = idea.get("Name")
+    if not isinstance(name, str) or not name.strip():
+        errs.append(f"idea[{idx}]: Name must be a non-empty string")
+    else:
+        if " " in name:
+            errs.append(f"idea[{idx}]: Name must not contain spaces: {name!r}")
+        if not RE_NAME.match(name):
+            errs.append(
+                f"idea[{idx}]: Name should match {RE_NAME.pattern}: {name!r}"
+            )
+
+    # Basic type checks (flexible where common)
+    for k in ["Title", "Short Hypothesis", "Related Work", "Abstract"]:
+        v = idea.get(k)
+        if v is not None and not isinstance(v, str):
+            errs.append(f"idea[{idx}]: {k} must be a string")
+
+    exps = idea.get("Experiments")
+    if exps is not None and not isinstance(exps, (list, str)):
+        errs.append(f"idea[{idx}]: Experiments must be a list or a string")
+    if isinstance(exps, list):
+        for j, e in enumerate(exps):
+            if not isinstance(e, (str, dict)):
+                errs.append(f"idea[{idx}]: Experiments[{j}] must be str or dict")
+
+    risks = idea.get("Risk Factors and Limitations")
+    if risks is not None and not isinstance(risks, (list, str)):
+        errs.append(
+            f"idea[{idx}]: Risk Factors and Limitations must be a list or a string"
+        )
+    if isinstance(risks, list):
+        for j, r in enumerate(risks):
+            if not isinstance(r, str):
+                errs.append(
+                    f"idea[{idx}]: Risk Factors and Limitations[{j}] must be a string"
+                )
+
+    return errs
+
+
+def main(argv: list[str] | None = None) -> int:
+    ap = argparse.ArgumentParser(description="Validate idea JSON schema.")
+    ap.add_argument("--in", dest="in_path", required=True, help="Input JSON file path.")
+    args = ap.parse_args(argv)
+
+    path = Path(args.in_path)
+    obj = _load(path)
+    ideas = _unwrap(obj)
+
+    if not ideas:
+        print("[ERROR] No ideas found. Expected list or object with idea fields.", file=sys.stderr)
+        return 2
+
+    all_errs: list[str] = []
+    for i, idea in enumerate(ideas):
+        all_errs.extend(_validate_one(idea, i))
+
+    if all_errs:
+        print("[FAIL] Schema validation failed:", file=sys.stderr)
+        for e in all_errs:
+            print(f"- {e}", file=sys.stderr)
+        return 1
+
+    print(f"[OK] {len(ideas)} idea(s) validated successfully.")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
+
diff --git a/skills/research-ideation-novelty-check/scripts/s2_search.py b/skills/research-ideation-novelty-check/scripts/s2_search.py
new file mode 100644
index 00000000..1148095c
--- /dev/null
+++ b/skills/research-ideation-novelty-check/scripts/s2_search.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+"""
+Semantic Scholar search helper (Graph API).
+
+Default behavior is safe/offline unless you explicitly pass --online.
+Reads API key from env var: S2_API_KEY (optional; improves rate limits).
+"""
+
+from __future__ import annotations
+
+import argparse
+import datetime as _dt
+import json
+import os
+import sys
+import urllib.error
+import urllib.parse
+import urllib.request
+
+
+API_URL = "https://api.semanticscholar.org/graph/v1/paper/search"
+DEFAULT_FIELDS = "title,authors,venue,year,abstract,citationCount,url"
+
+
+def _format_text(papers: list[dict]) -> str:
+    lines: list[str] = []
+    for i, p in enumerate(papers, start=1):
+        authors = ", ".join(a.get("name", "Unknown") for a in p.get("authors", []) or [])
+        lines.append(
+            f"{i}: {p.get('title','')}\n"
+            f"   Authors: {authors}\n"
+            f"   Venue: {p.get('venue','')} ({p.get('year','')})\n"
+            f"   Citations: {p.get('citationCount','')}\n"
+            f"   URL: {p.get('url','')}\n"
+            f"   Abstract: {p.get('abstract','')}\n"
+        )
+    return "\n".join(lines).strip() + "\n"
+
+
+def _http_get_json(url: str, api_key: str | None, timeout: float) -> dict:
+    req = urllib.request.Request(url, method="GET")
+    req.add_header("User-Agent", "codex-s2-search/0.1")
+    if api_key:
+        req.add_header("X-API-KEY", api_key)
+    try:
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            raw = resp.read()
+    except urllib.error.HTTPError as e:
+        body = ""
+        try:
+            body = e.read().decode("utf-8", errors="replace")
+        except Exception:
+            body = ""
+        raise RuntimeError(f"HTTP {e.code}: {body[:500]}") from e
+    except urllib.error.URLError as e:
+        raise RuntimeError(f"Network error: {e}") from e
+    try:
+        return json.loads(raw.decode("utf-8"))
+    except Exception as e:
+        raise RuntimeError("Failed to parse JSON response") from e
+
+
+def main(argv: list[str] | None = None) -> int:
+    p = argparse.ArgumentParser(description="Search Semantic Scholar (Graph API).")
+    p.add_argument("--query", required=True, help="Search query string.")
+    p.add_argument("--limit", type=int, default=10, help="Max results (default: 10).")
+    p.add_argument(
+        "--format",
+        choices=("text", "json"),
+        default="text",
+        help="Output format (default: text).",
+    )
+    p.add_argument("--out", help="Write output to this file (default: stdout).")
+    mode = p.add_mutually_exclusive_group()
+    mode.add_argument(
+        "--online",
+        action="store_true",
+        help="Make a network request to Semantic Scholar (explicit opt-in).",
+    )
+    mode.add_argument(
+        "--offline",
+        action="store_true",
+        help="Do not make network requests (default).",
+    )
+    p.add_argument("--timeout", type=float, default=30.0, help="HTTP timeout seconds.")
+    args = p.parse_args(argv)
+
+    if not args.online:
+        msg = (
+            "Offline mode (default): no request sent.\n"
+            f"Query: {args.query!r}\n"
+            "Re-run with --online to query Semantic Scholar.\n"
+        )
+        if args.out:
+            with open(args.out, "w", encoding="utf-8") as f:
+                f.write(msg)
+        else:
+            sys.stdout.write(msg)
+        return 0
+
+    params = {
+        "query": args.query,
+        "limit": str(max(1, min(args.limit, 100))),
+        "fields": DEFAULT_FIELDS,
+    }
+    url = API_URL + "?" + urllib.parse.urlencode(params)
+    api_key = os.environ.get("S2_API_KEY")
+    payload = _http_get_json(url, api_key=api_key, timeout=args.timeout)
+
+    out_obj = {
+        "query": args.query,
+        "retrieved_at": _dt.datetime.now().isoformat(timespec="seconds"),
+        "total": payload.get("total"),
+        "papers": payload.get("data", []),
+    }
+
+    if args.format == "json":
+        out_text = json.dumps(out_obj, indent=2, ensure_ascii=False) + "\n"
+    else:
+        out_text = _format_text(out_obj["papers"])
+
+    if args.out:
+        with open(args.out, "w", encoding="utf-8") as f:
+            f.write(out_text)
+    else:
+        sys.stdout.write(out_text)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/skills/token-cost-tracker/SKILL.md b/skills/token-cost-tracker/SKILL.md
new file mode 100644
index 00000000..89668c90
--- /dev/null
+++ b/skills/token-cost-tracker/SKILL.md
@@ -0,0 +1,30 @@
+---
+name: token-cost-tracker
+description: Summarize token usage and cost records from JSON or JSONL logs.
+---
+
+# Token Cost Tracker
+
+## Overview
+Aggregate token usage and costs by model from recorded LLM call logs.
+
+## Workflow
+1. Collect JSON or JSONL records with prompt_tokens, completion_tokens, total_tokens, and optional cost.
+2. Run the summarizer
+   ~~~bash
+   UV_CACHE_DIR=/tmp/uv-cache XDG_CACHE_HOME=/tmp uv run -s scripts/token_cost_tracker.py \
+     --in token_records.jsonl --out token_summary.json
+   ~~~
+
+## Inputs
+- --in: JSON or JSONL records
+- --out: output summary JSON
+
+## Outputs
+- token_summary.json
+
+## Safeguards
+- Read-only on inputs; writes only to --out.
+
+## References
+- Schema: references/token_record_schema.md
diff --git a/skills/token-cost-tracker/agents/openai.yaml b/skills/token-cost-tracker/agents/openai.yaml
new file mode 100644
index 00000000..5a6b4a69
--- /dev/null
+++ b/skills/token-cost-tracker/agents/openai.yaml
@@ -0,0 +1,4 @@
+interface:
+  display_name: "Token Cost Tracker"
+  short_description: "Summarize token and cost usage from JSON logs"
+  default_prompt: "Aggregate token usage and cost by model from provided JSON or JSONL records."
diff --git a/skills/token-cost-tracker/references/token_record_schema.md b/skills/token-cost-tracker/references/token_record_schema.md
new file mode 100644
index 00000000..50f12d25
--- /dev/null
+++ b/skills/token-cost-tracker/references/token_record_schema.md
@@ -0,0 +1,7 @@
+Token record fields (JSON or JSONL per line):
+- model: string
+- prompt_tokens: integer
+- completion_tokens: integer
+- total_tokens: integer
+- cost: float (optional)
+- timestamp: string (optional)
diff --git a/skills/token-cost-tracker/scripts/token_cost_tracker.py b/skills/token-cost-tracker/scripts/token_cost_tracker.py
new file mode 100644
index 00000000..d4111a06
--- /dev/null
+++ b/skills/token-cost-tracker/scripts/token_cost_tracker.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+
+def _load_records(path: Path):
+    text = path.read_text(encoding="utf-8")
+    text = text.strip()
+    if not text:
+        return []
+    if text.startswith("{") or text.startswith("["):
+        data = json.loads(text)
+        if isinstance(data, dict) and "records" in data:
+            return data["records"]
+        if isinstance(data, list):
+            return data
+        return [data]
+    # JSONL fallback
+    records = []
+    for line in text.splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        records.append(json.loads(line))
+    return records
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="Summarize token and cost usage records.")
+    ap.add_argument("--in", dest="in_path", required=True, help="Input JSON or JSONL file.")
+    ap.add_argument("--out", required=True, help="Output JSON summary.")
+    args = ap.parse_args()
+
+    records = _load_records(Path(args.in_path))
+    totals = {
+        "prompt_tokens": 0,
+        "completion_tokens": 0,
+        "total_tokens": 0,
+        "cost": 0.0,
+        "by_model": {},
+        "records": len(records),
+    }
+
+    for rec in records:
+        if not isinstance(rec, dict):
+            continue
+        model = rec.get("model", "unknown")
+        pt = int(rec.get("prompt_tokens", 0) or 0)
+        ct = int(rec.get("completion_tokens", 0) or 0)
+        tt = int(rec.get("total_tokens", pt + ct) or 0)
+        cost = float(rec.get("cost", 0.0) or 0.0)
+        totals["prompt_tokens"] += pt
+        totals["completion_tokens"] += ct
+        totals["total_tokens"] += tt
+        totals["cost"] += cost
+        by = totals["by_model"].setdefault(model, {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0, "cost": 0.0})
+        by["prompt_tokens"] += pt
+        by["completion_tokens"] += ct
+        by["total_tokens"] += tt
+        by["cost"] += cost
+
+    Path(args.out).write_text(json.dumps(totals, indent=2), encoding="utf-8")
+    print(f"[OK] Wrote {args.out}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/skills/writeup-icbinb/SKILL.md b/skills/writeup-icbinb/SKILL.md
new file mode 100644
index 00000000..648ebd59
--- /dev/null
+++ b/skills/writeup-icbinb/SKILL.md
@@ -0,0 +1,48 @@
+---
+name: writeup-icbinb
+description: Draft a 4-page ICBINB-style LaTeX paper from idea JSON, summary artifacts, and plot manifest. Offline by default.
+---
+
+# ICBINB Writeup
+
+## Overview
+Generate a compact LaTeX paper skeleton for the ICBINB format using idea and summary inputs, plus optional plot manifests.
+
+## Workflow
+1. Prepare inputs
+   - idea JSON (references/idea.schema.json)
+   - summary JSON (references/summary.schema.json)
+   - plot manifest JSON (references/plot_manifest.schema.json)
+2. Generate LaTeX
+   ~~~bash
+   UV_CACHE_DIR=/tmp/uv-cache XDG_CACHE_HOME=/tmp uv run --with openai -s scripts/writeup_icbinb.py \
+     --idea-json idea.json --summary-json summary.json --plots-manifest plot_manifest.json \
+     --out-dir writeup_icbinb
+   ~~~
+3. Optional LLM drafting (requires --online)
+   ~~~bash
+   uv run -s scripts/writeup_icbinb.py --idea-json idea.json --summary-json summary.json \
+     --out-dir writeup_icbinb --online --model gpt-4o-mini
+   ~~~
+
+## Inputs
+- --idea-json: path to idea JSON (optional)
+- --summary-json: path to summary JSON (optional)
+- --summary-md: path to summary markdown (optional)
+- --plots-manifest: plot manifest JSON (optional)
+- --out-dir: output directory (required)
+- --online: enable network calls to LLMs (default offline)
+
+## Outputs
+- writeup_icbinb/paper.tex
+
+## Safeguards
+- Offline by default; --online required for network calls.
+- Writes only within --out-dir.
+- No file deletion unless --overwrite is set.
+
+## References
+- Template: references/icbinb_template.tex
+- Idea schema: references/idea.schema.json
+- Summary schema: references/summary.schema.json
+- Plot manifest schema: references/plot_manifest.schema.json
diff --git a/skills/writeup-icbinb/agents/openai.yaml b/skills/writeup-icbinb/agents/openai.yaml
new file mode 100644
index 00000000..734a1a0e
--- /dev/null
+++ b/skills/writeup-icbinb/agents/openai.yaml
@@ -0,0 +1,4 @@
+interface:
+  display_name: "ICBINB Writeup"
+  short_description: "Generate ICBINB-style LaTeX from idea/summary artifacts"
+  default_prompt: "Create a compact ICBINB paper skeleton using the provided idea JSON, summary JSON, and plot manifest. Keep it concise and cite sources from inputs only."
diff --git a/skills/writeup-icbinb/references/icbinb_template.tex b/skills/writeup-icbinb/references/icbinb_template.tex
new file mode 100644
index 00000000..96b0961a
--- /dev/null
+++ b/skills/writeup-icbinb/references/icbinb_template.tex
@@ -0,0 +1,39 @@
+\\documentclass[10pt]{article}
+\\usepackage{graphicx}
+\\usepackage{times}
+\\usepackage{url}
+\\usepackage{amsmath}
+\\usepackage{amssymb}
+\\usepackage[margin=1in]{geometry}
+
+\\title{{{TITLE}}}
+\\author{{{AUTHORS}}}
+
+\\begin{document}
+\\maketitle
+
+\\begin{abstract}
+{ABSTRACT}
+\\end{abstract}
+
+\\section{Introduction}
+{INTRO}
+
+\\section{Methods}
+{METHODS}
+
+\\section{Results}
+{RESULTS}
+
+{FIGURES}
+
+\\section{Discussion}
+{DISCUSSION}
+
+\\section*{Ethics and Disclosure}
+This manuscript may have used AI assistance. Please ensure compliance with any disclosure requirements.
+
+\\bibliographystyle{plain}
+\\bibliography{references}
+
+\\end{document}
diff --git a/skills/writeup-icbinb/references/idea.schema.json b/skills/writeup-icbinb/references/idea.schema.json
new file mode 100644
index 00000000..334899aa
--- /dev/null
+++ b/skills/writeup-icbinb/references/idea.schema.json
@@ -0,0 +1,34 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "AI Scientist Idea",
+  "type": "object",
+  "required": [
+    "Name",
+    "Title",
+    "Short Hypothesis",
+    "Related Work",
+    "Abstract",
+    "Experiments",
+    "Risk Factors and Limitations"
+  ],
+  "properties": {
+    "Name": {"type": "string", "pattern": "^[a-z0-9_-]+$"},
+    "Title": {"type": "string"},
+    "Short Hypothesis": {"type": "string"},
+    "Related Work": {"type": "string"},
+    "Abstract": {"type": "string"},
+    "Experiments": {
+      "oneOf": [
+        {"type": "string"},
+        {"type": "array", "items": {"type": ["string", "object"]}}
+      ]
+    },
+    "Risk Factors and Limitations": {
+      "oneOf": [
+        {"type": "string"},
+        {"type": "array", "items": {"type": "string"}}
+      ]
+    }
+  },
+  "additionalProperties": true
+}
diff --git a/skills/writeup-icbinb/references/plot_manifest.schema.json b/skills/writeup-icbinb/references/plot_manifest.schema.json
new file mode 100644
index 00000000..70466c65
--- /dev/null
+++ b/skills/writeup-icbinb/references/plot_manifest.schema.json
@@ -0,0 +1,36 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "PlotManifest",
+  "type": "object",
+  "required": [
+    "figures"
+  ],
+  "properties": {
+    "figures": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "required": [
+          "filename",
+          "caption"
+        ],
+        "properties": {
+          "filename": {
+            "type": "string"
+          },
+          "caption": {
+            "type": "string"
+          },
+          "sources": {
+            "type": "array",
+            "items": {
+              "type": "string"
+            }
+          }
+        },
+        "additionalProperties": true
+      }
+    }
+  },
+  "additionalProperties": true
+}
\ No newline at end of file
diff --git a/skills/writeup-icbinb/references/safeguards.md b/skills/writeup-icbinb/references/safeguards.md
new file mode 100644
index 00000000..a745b636
--- /dev/null
+++ b/skills/writeup-icbinb/references/safeguards.md
@@ -0,0 +1,3 @@
+- Do not fabricate results; use provided summaries and plots only.
+- Keep citations consistent with input artifacts.
+- Offline by default; require --online for any LLM calls.
diff --git a/skills/writeup-icbinb/references/summary.schema.json b/skills/writeup-icbinb/references/summary.schema.json
new file mode 100644
index 00000000..18a340f6
--- /dev/null
+++ b/skills/writeup-icbinb/references/summary.schema.json
@@ -0,0 +1,47 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "ExperimentSummary",
+  "type": "object",
+  "required": [
+    "Experiment_description",
+    "Significance",
+    "Description",
+    "List_of_included_plots",
+    "Key_numerical_results"
+  ],
+  "properties": {
+    "Experiment_description": {
+      "type": "string"
+    },
+    "Significance": {
+      "type": "string"
+    },
+    "Description": {
+      "type": "string"
+    },
+    "List_of_included_plots": {
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    },
+    "Key_numerical_results": {
+      "oneOf": [
+        {
+          "type": "array",
+          "items": {
+            "type": [
+              "string",
+              "object",
+              "number"
+            ]
+          }
+        },
+        {
+          "type": "object"
+        }
+      ]
+    }
+  },
+  "additionalProperties": true
+}
\ No newline at end of file
diff --git a/skills/writeup-icbinb/scripts/llm_adapter.py b/skills/writeup-icbinb/scripts/llm_adapter.py
new file mode 100644
index 00000000..44596f15
--- /dev/null
+++ b/skills/writeup-icbinb/scripts/llm_adapter.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+# Lightweight LLM adapter with offline-by-default safeguards.
+from __future__ import annotations
+
+import json
+import os
+import urllib.request
+
+ONLINE_ENV_VAR = "ASV2_ONLINE"
+
+
+def _require_online() -> None:
+    if os.getenv(ONLINE_ENV_VAR) != "1":
+        raise RuntimeError(
+            "Offline mode: set ASV2_ONLINE=1 or pass --online to allow network calls."
+        )
+
+
+def _ollama_chat(prompt: str, system: str, model: str, temperature: float) -> str:
+    _require_online()
+    host = os.getenv("OLLAMA_HOST", "http://localhost:11434")
+    payload = {
+        "model": model.replace("ollama/", ""),
+        "messages": [
+            {"role": "system", "content": system},
+            {"role": "user", "content": prompt},
+        ],
+        "options": {"temperature": temperature},
+        "stream": False,
+    }
+    req = urllib.request.Request(
+        f"{host}/api/chat",
+        data=json.dumps(payload).encode("utf-8"),
+        headers={"Content-Type": "application/json"},
+    )
+    with urllib.request.urlopen(req, timeout=120) as resp:
+        data = json.loads(resp.read().decode("utf-8"))
+    return data.get("message", {}).get("content", "")
+
+
+def chat(
+    prompt: str,
+    system: str,
+    model: str,
+    temperature: float = 0.4,
+    max_tokens: int = 1200,
+) -> str:
+    _require_online()
+    if model.startswith("ollama/"):
+        return _ollama_chat(prompt, system, model, temperature)
+    if model.startswith("bedrock/"):
+        raise RuntimeError("Bedrock adapter not implemented in this lightweight skill.")
+    if model.startswith("vertex_ai/"):
+        raise RuntimeError("Vertex AI adapter not implemented in this lightweight skill.")
+    if model.startswith("gemini-"):
+        raise RuntimeError("Gemini adapter not implemented in this lightweight skill.")
+
+    # Default: OpenAI-compatible
+    try:
+        import openai
+    except Exception as e:
+        raise RuntimeError(
+            "OpenAI adapter requires the openai package. Install with --with openai."
+        ) from e
+
+    base_url = os.getenv("OPENAI_BASE_URL")
+    client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"), base_url=base_url)
+    resp = client.chat.completions.create(
+        model=model,
+        messages=[
+            {"role": "system", "content": system},
+            {"role": "user", "content": prompt},
+        ],
+        temperature=temperature,
+        max_tokens=max_tokens,
+    )
+    return resp.choices[0].message.content or ""
diff --git a/skills/writeup-icbinb/scripts/writeup_icbinb.py b/skills/writeup-icbinb/scripts/writeup_icbinb.py
new file mode 100644
index 00000000..4351606b
--- /dev/null
+++ b/skills/writeup-icbinb/scripts/writeup_icbinb.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+import os
+from pathlib import Path
+
+from llm_adapter import chat
+
+
+def _load_json(path: Path):
+    try:
+        return json.loads(path.read_text(encoding="utf-8"))
+    except FileNotFoundError:
+        raise SystemExit(f"[ERROR] File not found: {path}")
+    except json.JSONDecodeError as e:
+        raise SystemExit(f"[ERROR] Invalid JSON: {path}: {e}")
+
+
+def _first_idea(obj):
+    if isinstance(obj, dict) and "ideas" in obj and isinstance(obj["ideas"], list):
+        return obj["ideas"][0] if obj["ideas"] else None
+    if isinstance(obj, dict) and "idea" in obj and isinstance(obj["idea"], dict):
+        return obj["idea"]
+    if isinstance(obj, list):
+        return obj[0] if obj else None
+    if isinstance(obj, dict):
+        return obj
+    return None
+
+
+def _load_template(path: Path) -> str:
+    return path.read_text(encoding="utf-8")
+
+
+def _render_figures(manifest: dict) -> str:
+    figures = manifest.get("figures", []) if isinstance(manifest, dict) else []
+    blocks = []
+    for fig in figures:
+        filename = fig.get("filename")
+        caption = fig.get("caption", "")
+        if not filename:
+            continue
+        blocks.append(
+            "\\begin{figure}[t]\n"
+            "\\centering\n"
+            f"\\includegraphics[width=0.9\\linewidth]{{figures/{filename}}}\n"
+            f"\\caption{{{caption}}}\n"
+            "\\end{figure}\n"
+        )
+    return "\n".join(blocks) if blocks else ""
+
+
+def _safe_section(text: str, fallback: str) -> str:
+    return text.strip() if text and text.strip() else fallback
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="Generate ICBINB-style LaTeX writeup.")
+    ap.add_argument("--idea-json", help="Path to idea JSON.")
+    ap.add_argument("--summary-json", help="Path to summary JSON.")
+    ap.add_argument("--summary-md", help="Path to summary markdown.")
+    ap.add_argument("--plots-manifest", help="Path to plot manifest JSON.")
+    ap.add_argument("--out-dir", required=True, help="Output directory.")
+    ap.add_argument("--title", help="Override title.")
+    ap.add_argument("--authors", default="Anonymous", help="Author list.")
+    ap.add_argument("--online", action="store_true", help="Enable LLM generation.")
+    ap.add_argument("--model", default="gpt-4o-mini", help="Model name.")
+    ap.add_argument("--overwrite", action="store_true", help="Overwrite existing output.")
+    args = ap.parse_args()
+
+    out_dir = Path(args.out_dir).expanduser().resolve()
+    out_dir.mkdir(parents=True, exist_ok=True)
+    out_tex = out_dir / "paper.tex"
+    if out_tex.exists() and not args.overwrite:
+        print(f"[ERROR] Output exists: {out_tex} (use --overwrite)")
+        return 2
+
+    idea = None
+    if args.idea_json:
+        idea = _first_idea(_load_json(Path(args.idea_json)))
+
+    summary = None
+    if args.summary_json:
+        summary = _load_json(Path(args.summary_json))
+
+    summary_md = ""
+    if args.summary_md:
+        summary_md = Path(args.summary_md).read_text(encoding="utf-8")
+
+    manifest = None
+    if args.plots_manifest:
+        manifest = _load_json(Path(args.plots_manifest))
+
+    title = args.title or (idea.get("Title") if isinstance(idea, dict) else None) or "Untitled"
+    abstract = ""
+    if isinstance(idea, dict):
+        abstract = idea.get("Abstract", "") or ""
+
+    intro = ""
+    methods = ""
+    results = ""
+    discussion = ""
+
+    if summary:
+        intro = summary.get("Description", "")
+        results = summary.get("Experiment_description", "")
+        discussion = summary.get("Significance", "")
+
+    if not intro and summary_md:
+        intro = summary_md
+
+    if args.online:
+        os.environ["ASV2_ONLINE"] = "1"
+        prompt = (
+            "Use the provided idea and summary to draft brief sections for a 4-page paper. "
+            "Return JSON with keys: abstract, introduction, methods, results, discussion.\n\n"
+            f"IDEA: {json.dumps(idea, ensure_ascii=False)}\n\nSUMMARY: {json.dumps(summary, ensure_ascii=False)}"
+        )
+        system = "You are a concise scientific writer."
+        response = chat(prompt=prompt, system=system, model=args.model)
+        try:
+            data = json.loads(response)
+            abstract = data.get("abstract", abstract)
+            intro = data.get("introduction", intro)
+            methods = data.get("methods", methods)
+            results = data.get("results", results)
+            discussion = data.get("discussion", discussion)
+        except Exception:
+            pass
+
+    template_path = Path(__file__).parent.parent / "references" / "icbinb_template.tex"
+    template = _load_template(template_path)
+
+    figures_block = _render_figures(manifest) if manifest else ""
+
+    filled = template
+    filled = filled.replace("{{{TITLE}}}", title)
+    filled = filled.replace("{{{AUTHORS}}}", args.authors)
+    filled = filled.replace("{ABSTRACT}", _safe_section(abstract, "TBD."))
+    filled = filled.replace("{INTRO}", _safe_section(intro, "TBD."))
+    filled = filled.replace("{METHODS}", _safe_section(methods, "TBD."))
+    filled = filled.replace("{RESULTS}", _safe_section(results, "TBD."))
+    filled = filled.replace("{DISCUSSION}", _safe_section(discussion, "TBD."))
+    filled = filled.replace("{FIGURES}", figures_block)
+
+    out_tex.write_text(filled, encoding="utf-8")
+    print(f"[OK] Wrote {out_tex}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/skills/writeup-normal/SKILL.md b/skills/writeup-normal/SKILL.md
new file mode 100644
index 00000000..6d382d6f
--- /dev/null
+++ b/skills/writeup-normal/SKILL.md
@@ -0,0 +1,48 @@
+---
+name: writeup-normal
+description: Draft a normal-length LaTeX paper from idea JSON, summary artifacts, and plot manifest. Offline by default.
+---
+
+# Normal Writeup
+
+## Overview
+Generate a longer LaTeX paper skeleton using idea and summary inputs, plus optional plot manifests.
+
+## Workflow
+1. Prepare inputs
+   - idea JSON (references/idea.schema.json)
+   - summary JSON (references/summary.schema.json)
+   - plot manifest JSON (references/plot_manifest.schema.json)
+2. Generate LaTeX
+   ~~~bash
+   UV_CACHE_DIR=/tmp/uv-cache XDG_CACHE_HOME=/tmp uv run --with openai -s scripts/writeup_normal.py \
+     --idea-json idea.json --summary-json summary.json --plots-manifest plot_manifest.json \
+     --out-dir writeup_normal
+   ~~~
+3. Optional LLM drafting (requires --online)
+   ~~~bash
+   uv run -s scripts/writeup_normal.py --idea-json idea.json --summary-json summary.json \
+     --out-dir writeup_normal --online --model gpt-4o-mini
+   ~~~
+
+## Inputs
+- --idea-json: path to idea JSON (optional)
+- --summary-json: path to summary JSON (optional)
+- --summary-md: path to summary markdown (optional)
+- --plots-manifest: plot manifest JSON (optional)
+- --out-dir: output directory (required)
+- --online: enable network calls to LLMs (default offline)
+
+## Outputs
+- writeup_normal/paper.tex
+
+## Safeguards
+- Offline by default; --online required for network calls.
+- Writes only within --out-dir.
+- No file deletion unless --overwrite is set.
+
+## References
+- Template: references/normal_template.tex
+- Idea schema: references/idea.schema.json
+- Summary schema: references/summary.schema.json
+- Plot manifest schema: references/plot_manifest.schema.json
diff --git a/skills/writeup-normal/agents/openai.yaml b/skills/writeup-normal/agents/openai.yaml
new file mode 100644
index 00000000..f29c398e
--- /dev/null
+++ b/skills/writeup-normal/agents/openai.yaml
@@ -0,0 +1,4 @@
+interface:
+  display_name: "Normal Writeup"
+  short_description: "Generate a full-length LaTeX paper skeleton from idea/summary artifacts"
+  default_prompt: "Create a normal-length paper skeleton using idea JSON, summary JSON, and plot manifest. Keep sections concise and grounded in inputs."
diff --git a/skills/writeup-normal/references/idea.schema.json b/skills/writeup-normal/references/idea.schema.json
new file mode 100644
index 00000000..334899aa
--- /dev/null
+++ b/skills/writeup-normal/references/idea.schema.json
@@ -0,0 +1,34 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "AI Scientist Idea",
+  "type": "object",
+  "required": [
+    "Name",
+    "Title",
+    "Short Hypothesis",
+    "Related Work",
+    "Abstract",
+    "Experiments",
+    "Risk Factors and Limitations"
+  ],
+  "properties": {
+    "Name": {"type": "string", "pattern": "^[a-z0-9_-]+$"},
+    "Title": {"type": "string"},
+    "Short Hypothesis": {"type": "string"},
+    "Related Work": {"type": "string"},
+    "Abstract": {"type": "string"},
+    "Experiments": {
+      "oneOf": [
+        {"type": "string"},
+        {"type": "array", "items": {"type": ["string", "object"]}}
+      ]
+    },
+    "Risk Factors and Limitations": {
+      "oneOf": [
+        {"type": "string"},
+        {"type": "array", "items": {"type": "string"}}
+      ]
+    }
+  },
+  "additionalProperties": true
+}
diff --git a/skills/writeup-normal/references/normal_template.tex b/skills/writeup-normal/references/normal_template.tex
new file mode 100644
index 00000000..0fc279d7
--- /dev/null
+++ b/skills/writeup-normal/references/normal_template.tex
@@ -0,0 +1,45 @@
+\\documentclass[11pt]{article}
+\\usepackage{graphicx}
+\\usepackage{times}
+\\usepackage{url}
+\\usepackage{amsmath}
+\\usepackage{amssymb}
+\\usepackage[margin=1in]{geometry}
+
+\\title{{{TITLE}}}
+\\author{{{AUTHORS}}}
+
+\\begin{document}
+\\maketitle
+
+\\begin{abstract}
+{ABSTRACT}
+\\end{abstract}
+
+\\section{Introduction}
+{INTRO}
+
+\\section{Related Work}
+{RELATED}
+
+\\section{Methods}
+{METHODS}
+
+\\section{Experiments}
+{EXPERIMENTS}
+
+\\section{Results}
+{RESULTS}
+
+{FIGURES}
+
+\\section{Discussion}
+{DISCUSSION}
+
+\\section*{Ethics and Disclosure}
+This manuscript may have used AI assistance. Please ensure compliance with any disclosure requirements.
+
+\\bibliographystyle{plain}
+\\bibliography{references}
+
+\\end{document}
diff --git a/skills/writeup-normal/references/plot_manifest.schema.json b/skills/writeup-normal/references/plot_manifest.schema.json
new file mode 100644
index 00000000..70466c65
--- /dev/null
+++ b/skills/writeup-normal/references/plot_manifest.schema.json
@@ -0,0 +1,36 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "PlotManifest",
+  "type": "object",
+  "required": [
+    "figures"
+  ],
+  "properties": {
+    "figures": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "required": [
+          "filename",
+          "caption"
+        ],
+        "properties": {
+          "filename": {
+            "type": "string"
+          },
+          "caption": {
+            "type": "string"
+          },
+          "sources": {
+            "type": "array",
+            "items": {
+              "type": "string"
+            }
+          }
+        },
+        "additionalProperties": true
+      }
+    }
+  },
+  "additionalProperties": true
+}
\ No newline at end of file
diff --git a/skills/writeup-normal/references/safeguards.md b/skills/writeup-normal/references/safeguards.md
new file mode 100644
index 00000000..a745b636
--- /dev/null
+++ b/skills/writeup-normal/references/safeguards.md
@@ -0,0 +1,3 @@
+- Do not fabricate results; use provided summaries and plots only.
+- Keep citations consistent with input artifacts.
+- Offline by default; require --online for any LLM calls.
diff --git a/skills/writeup-normal/references/summary.schema.json b/skills/writeup-normal/references/summary.schema.json
new file mode 100644
index 00000000..18a340f6
--- /dev/null
+++ b/skills/writeup-normal/references/summary.schema.json
@@ -0,0 +1,47 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "ExperimentSummary",
+  "type": "object",
+  "required": [
+    "Experiment_description",
+    "Significance",
+    "Description",
+    "List_of_included_plots",
+    "Key_numerical_results"
+  ],
+  "properties": {
+    "Experiment_description": {
+      "type": "string"
+    },
+    "Significance": {
+      "type": "string"
+    },
+    "Description": {
+      "type": "string"
+    },
+    "List_of_included_plots": {
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    },
+    "Key_numerical_results": {
+      "oneOf": [
+        {
+          "type": "array",
+          "items": {
+            "type": [
+              "string",
+              "object",
+              "number"
+            ]
+          }
+        },
+        {
+          "type": "object"
+        }
+      ]
+    }
+  },
+  "additionalProperties": true
+}
\ No newline at end of file
diff --git a/skills/writeup-normal/scripts/llm_adapter.py b/skills/writeup-normal/scripts/llm_adapter.py
new file mode 100644
index 00000000..44596f15
--- /dev/null
+++ b/skills/writeup-normal/scripts/llm_adapter.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+# Lightweight LLM adapter with offline-by-default safeguards.
+from __future__ import annotations
+
+import json
+import os
+import urllib.request
+
+ONLINE_ENV_VAR = "ASV2_ONLINE"
+
+
+def _require_online() -> None:
+    if os.getenv(ONLINE_ENV_VAR) != "1":
+        raise RuntimeError(
+            "Offline mode: set ASV2_ONLINE=1 or pass --online to allow network calls."
+        )
+
+
+def _ollama_chat(prompt: str, system: str, model: str, temperature: float) -> str:
+    _require_online()
+    host = os.getenv("OLLAMA_HOST", "http://localhost:11434")
+    payload = {
+        "model": model.replace("ollama/", ""),
+        "messages": [
+            {"role": "system", "content": system},
+            {"role": "user", "content": prompt},
+        ],
+        "options": {"temperature": temperature},
+        "stream": False,
+    }
+    req = urllib.request.Request(
+        f"{host}/api/chat",
+        data=json.dumps(payload).encode("utf-8"),
+        headers={"Content-Type": "application/json"},
+    )
+    with urllib.request.urlopen(req, timeout=120) as resp:
+        data = json.loads(resp.read().decode("utf-8"))
+    return data.get("message", {}).get("content", "")
+
+
+def chat(
+    prompt: str,
+    system: str,
+    model: str,
+    temperature: float = 0.4,
+    max_tokens: int = 1200,
+) -> str:
+    _require_online()
+    if model.startswith("ollama/"):
+        return _ollama_chat(prompt, system, model, temperature)
+    if model.startswith("bedrock/"):
+        raise RuntimeError("Bedrock adapter not implemented in this lightweight skill.")
+    if model.startswith("vertex_ai/"):
+        raise RuntimeError("Vertex AI adapter not implemented in this lightweight skill.")
+    if model.startswith("gemini-"):
+        raise RuntimeError("Gemini adapter not implemented in this lightweight skill.")
+
+    # Default: OpenAI-compatible
+    try:
+        import openai
+    except Exception as e:
+        raise RuntimeError(
+            "OpenAI adapter requires the openai package. Install with --with openai."
+        ) from e
+
+    base_url = os.getenv("OPENAI_BASE_URL")
+    client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"), base_url=base_url)
+    resp = client.chat.completions.create(
+        model=model,
+        messages=[
+            {"role": "system", "content": system},
+            {"role": "user", "content": prompt},
+        ],
+        temperature=temperature,
+        max_tokens=max_tokens,
+    )
+    return resp.choices[0].message.content or ""
diff --git a/skills/writeup-normal/scripts/writeup_normal.py b/skills/writeup-normal/scripts/writeup_normal.py
new file mode 100644
index 00000000..32b6361b
--- /dev/null
+++ b/skills/writeup-normal/scripts/writeup_normal.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+import os
+from pathlib import Path
+
+from llm_adapter import chat
+
+
+def _load_json(path: Path):
+    try:
+        return json.loads(path.read_text(encoding="utf-8"))
+    except FileNotFoundError:
+        raise SystemExit(f"[ERROR] File not found: {path}")
+    except json.JSONDecodeError as e:
+        raise SystemExit(f"[ERROR] Invalid JSON: {path}: {e}")
+
+
+def _first_idea(obj):
+    if isinstance(obj, dict) and "ideas" in obj and isinstance(obj["ideas"], list):
+        return obj["ideas"][0] if obj["ideas"] else None
+    if isinstance(obj, dict) and "idea" in obj and isinstance(obj["idea"], dict):
+        return obj["idea"]
+    if isinstance(obj, list):
+        return obj[0] if obj else None
+    if isinstance(obj, dict):
+        return obj
+    return None
+
+
+def _load_template(path: Path) -> str:
+    return path.read_text(encoding="utf-8")
+
+
+def _render_figures(manifest: dict) -> str:
+    figures = manifest.get("figures", []) if isinstance(manifest, dict) else []
+    blocks = []
+    for fig in figures:
+        filename = fig.get("filename")
+        caption = fig.get("caption", "")
+        if not filename:
+            continue
+        blocks.append(
+            "\\begin{figure}[t]\n"
+            "\\centering\n"
+            f"\\includegraphics[width=0.9\\linewidth]{{figures/{filename}}}\n"
+            f"\\caption{{{caption}}}\n"
+            "\\end{figure}\n"
+        )
+    return "\n".join(blocks) if blocks else ""
+
+
+def _safe_section(text: str, fallback: str) -> str:
+    return text.strip() if text and text.strip() else fallback
+
+
+def _format_experiments(value) -> str:
+    if isinstance(value, list):
+        return "\n".join([f"- {v}" for v in value])
+    if isinstance(value, str):
+        return value
+    return ""
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="Generate normal-length LaTeX writeup.")
+    ap.add_argument("--idea-json", help="Path to idea JSON.")
+    ap.add_argument("--summary-json", help="Path to summary JSON.")
+    ap.add_argument("--summary-md", help="Path to summary markdown.")
+    ap.add_argument("--plots-manifest", help="Path to plot manifest JSON.")
+    ap.add_argument("--out-dir", required=True, help="Output directory.")
+    ap.add_argument("--title", help="Override title.")
+    ap.add_argument("--authors", default="Anonymous", help="Author list.")
+    ap.add_argument("--online", action="store_true", help="Enable LLM generation.")
+    ap.add_argument("--model", default="gpt-4o-mini", help="Model name.")
+    ap.add_argument("--overwrite", action="store_true", help="Overwrite existing output.")
+    args = ap.parse_args()
+
+    out_dir = Path(args.out_dir).expanduser().resolve()
+    out_dir.mkdir(parents=True, exist_ok=True)
+    out_tex = out_dir / "paper.tex"
+    if out_tex.exists() and not args.overwrite:
+        print(f"[ERROR] Output exists: {out_tex} (use --overwrite)")
+        return 2
+
+    idea = None
+    if args.idea_json:
+        idea = _first_idea(_load_json(Path(args.idea_json)))
+
+    summary = None
+    if args.summary_json:
+        summary = _load_json(Path(args.summary_json))
+
+    summary_md = ""
+    if args.summary_md:
+        summary_md = Path(args.summary_md).read_text(encoding="utf-8")
+
+    manifest = None
+    if args.plots_manifest:
+        manifest = _load_json(Path(args.plots_manifest))
+
+    title = args.title or (idea.get("Title") if isinstance(idea, dict) else None) or "Untitled"
+    abstract = ""
+    related = ""
+    experiments = ""
+    if isinstance(idea, dict):
+        abstract = idea.get("Abstract", "") or ""
+        related = idea.get("Related Work", "") or ""
+        experiments = _format_experiments(idea.get("Experiments"))
+
+    intro = ""
+    methods = ""
+    results = ""
+    discussion = ""
+
+    if summary:
+        intro = summary.get("Description", "")
+        results = summary.get("Experiment_description", "")
+        discussion = summary.get("Significance", "")
+
+    if not intro and summary_md:
+        intro = summary_md
+
+    if args.online:
+        os.environ["ASV2_ONLINE"] = "1"
+        prompt = (
+            "Use the provided idea and summary to draft sections for an 8-page paper. "
+            "Return JSON with keys: abstract, introduction, related, methods, experiments, results, discussion.\n\n"
+            f"IDEA: {json.dumps(idea, ensure_ascii=False)}\n\nSUMMARY: {json.dumps(summary, ensure_ascii=False)}"
+        )
+        system = "You are a concise scientific writer."
+        response = chat(prompt=prompt, system=system, model=args.model)
+        try:
+            data = json.loads(response)
+            abstract = data.get("abstract", abstract)
+            intro = data.get("introduction", intro)
+            related = data.get("related", related)
+            methods = data.get("methods", methods)
+            experiments = data.get("experiments", experiments)
+            results = data.get("results", results)
+            discussion = data.get("discussion", discussion)
+        except Exception:
+            pass
+
+    template_path = Path(__file__).parent.parent / "references" / "normal_template.tex"
+    template = _load_template(template_path)
+
+    figures_block = _render_figures(manifest) if manifest else ""
+
+    filled = template
+    filled = filled.replace("{{{TITLE}}}", title)
+    filled = filled.replace("{{{AUTHORS}}}", args.authors)
+    filled = filled.replace("{ABSTRACT}", _safe_section(abstract, "TBD."))
+    filled = filled.replace("{INTRO}", _safe_section(intro, "TBD."))
+    filled = filled.replace("{RELATED}", _safe_section(related, "TBD."))
+    filled = filled.replace("{METHODS}", _safe_section(methods, "TBD."))
+    filled = filled.replace("{EXPERIMENTS}", _safe_section(experiments, "TBD."))
+    filled = filled.replace("{RESULTS}", _safe_section(results, "TBD."))
+    filled = filled.replace("{DISCUSSION}", _safe_section(discussion, "TBD."))
+    filled = filled.replace("{FIGURES}", figures_block)
+
+    out_tex.write_text(filled, encoding="utf-8")
+    print(f"[OK] Wrote {out_tex}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())